]> git.lizzy.rs Git - plan9front.git/blob - sys/src/9/port/proc.c
c101b9988caa60008c2b6d816857d47a91cbdd4c
[plan9front.git] / sys / src / 9 / port / proc.c
1 #include        <u.h>
2 #include        "../port/lib.h"
3 #include        "mem.h"
4 #include        "dat.h"
5 #include        "fns.h"
6 #include        "../port/error.h"
7 #include        "edf.h"
8 #include        <trace.h>
9
/*
 * Scheduler tunables and counters shared across this file.
 */
10 int     schedgain = 30; /* units in seconds */
11 int     nrdy;           /* procs currently on the run queues (see queueproc/dequeueproc) */
12
/* defined below; declared here because sched() and schedinit() use them first */
13 void updatecpu(Proc*);
14 int reprioritize(Proc*);
15
16 ulong   delayedscheds;  /* statistics */
17 long skipscheds;        /* times runproc() took the cooperative (m->readied) shortcut */
18 long preempts;          /* times runproc() had to scan the run queues instead */
19 ulong load;             /* NOTE(review): not referenced in the visible part of this file; reprioritize() reads MACHP(0)->load */
/*
 * Allocator state for Proc structures.  The embedded (anonymous) Lock
 * is what lock(&procalloc)/unlock(&procalloc) operate on.
 */
21 static struct Procalloc
22 {
23         Lock;
24         Proc*   ht[128];        /* presumably a pid hash table; not used in the visible code -- TODO confirm */
25         Proc*   arena;          /* start of the array allocated by procinit0() */
26         Proc*   free;           /* free list, linked through Proc.qnext */
27 } procalloc;
28
29 enum
30 {
31         Q=10,           /* NOTE(review): not referenced in the visible code */
32         DQ=4,           /* NOTE(review): not referenced in the visible code */
33         Scaling=2,      /* updatecpu() keeps time in units of 1/Scaling clock tick */
34 };
35
/*
 * One run queue per priority level.  The Lock in runq[0] (lock(runq))
 * protects all of the queues, nrdy and runvec.
 */
36 Schedq  runq[Nrq];
37 ulong   runvec;         /* bit i is set while runq[i] is non-empty */
38
/*
 * Printable names of process states, indexed by Proc.state
 * (see the print in dequeueproc).  The order presumably has to match
 * the state enum declared in a header -- verify when adding a state.
 */
39 char *statename[] =
40 {       /* BUG: generate automatically */
41         "Dead",
42         "Moribund",
43         "Ready",
44         "Scheding",
45         "Running",
46         "Queueing",
47         "QueueingR",
48         "QueueingW",
49         "Wakeme",
50         "Broken",
51         "Stopped",
52         "Rendez",
53         "Waitrelease",
54 };
55
/* file-local helpers defined later in this file */
56 static void pidfree(Proc*);
57 static void rebalance(void);
58
59 /*
60  * Always splhi()'ed.
 *
 * Scheduler entry point for this processor; never returns.
 * setlabel(&m->sched) records the context that sched() and sleep()
 * re-enter with gotolabel(&m->sched) when a process gives up the
 * processor, so the code after the setlabel runs once per switch:
 * it finishes detaching the outgoing process (up) and then calls
 * sched() with up == nil to select the next one.
61  */
62 void
63 schedinit(void)         /* never returns */
64 {
65         Edf *e;
66
67         setlabel(&m->sched);
68         if(up) {
                /* let EDF account for an admitted real-time proc leaving the cpu */
69                 if((e = up->edf) && (e->flags & Admitted))
70                         edfrecord(up);
71                 m->proc = 0;
72                 switch(up->state) {
73                 case Running:
                        /* still runnable: put it back on a run queue */
74                         ready(up);
75                         break;
76                 case Moribund:
                        /* the proc called sched() to die: release what is left of it */
77                         up->state = Dead;
78                         edfstop(up);
79                         if (up->edf)
80                                 free(up->edf);
81                         up->edf = nil;
82
83                         /*
84                          * Holding locks from pexit:
85                          *      procalloc
86                          *      palloc
87                          */
88                         mmurelease(up);
89                         unlock(&palloc);
90
91                         up->mach = nil;
92                         updatecpu(up);
93
                        /* push the Proc onto the free list (procalloc is still locked) */
94                         up->qnext = procalloc.free;
95                         procalloc.free = up;
96
97                         /* proc is free now, make sure unlock() wont touch it */
98                         up = procalloc.p = nil;
99                         unlock(&procalloc);
                        /*
                         * No break needed: with up == nil, sched() ends in
                         * gotolabel() and does not come back here.
                         */
100                         sched();
101                 }
                /*
                 * Every other state: mark the proc's context as saved
                 * (mach == nil) and bring its cpu average up to date
                 * before up is cleared.
                 */
102                 up->mach = nil;
103                 updatecpu(up);
104                 up = nil;
105         }
106         sched();
107 }
108
109 /*
110  *  If changing this routine, look also at sleep().  It
111  *  contains a copy of the guts of sched().
 *
 *  Give up the processor.  Called with up != nil by a process that
 *  wants to be rescheduled: its context is saved in up->sched and
 *  control moves to the context recorded by schedinit().  Called with
 *  up == nil from schedinit(): picks the next process with runproc()
 *  and resumes it.
112  */
113 void
114 sched(void)
115 {
116         Proc *p;
117
        /* switching with an ilock held is fatal */
118         if(m->ilockdepth)
119                 panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
120                         m->machno,
121                         m->ilockdepth,
122                         up? up->lastilock: nil,
123                         (up && up->lastilock)? up->lastilock->pc: 0,
124                         getcallerpc(&p+2));
125         if(up){
126                 /*
127                  * Delay the sched until the process gives up the locks
128                  * it is holding.  This avoids dumb lock loops.
129                  * Don't delay if the process is Moribund.
130                  * It called sched to die.
131                  * But do sched eventually.  This avoids a missing unlock
132                  * from hanging the entire kernel. 
133                  * But don't reschedule procs holding palloc or procalloc.
134                  * Those are far too important to be holding while asleep.
135                  *
136                  * This test is not exact.  There can still be a few instructions
137                  * in the middle of taslock when a process holds a lock
138                  * but Lock.p has not yet been initialized.
139                  */
                /* the three nested ifs below act as a single && condition */
140                 if(up->nlocks.ref)
141                 if(up->state != Moribund)
142                 if(up->delaysched < 20
143                 || palloc.Lock.p == up
144                 || procalloc.Lock.p == up){
145                         up->delaysched++;
146                         delayedscheds++;
147                         return;
148                 }
149                 up->delaysched = 0;
150
151                 splhi();
152
153                 /* statistics */
154                 m->cs++;
155
156                 procsave(up);
                /*
                 * setlabel() returns nonzero when this process is resumed
                 * later through the gotolabel(&up->sched) at the end of
                 * this function; it returns zero now, when saving.
                 */
157                 if(setlabel(&up->sched)){
158                         procrestore(up);
159                         spllo();
160                         return;
161                 }
                /* continue in schedinit(), right after its setlabel() */
162                 gotolabel(&m->sched);
163         }
        /* only reached with up == nil, i.e. when called from schedinit() */
164         p = runproc();
165         if(!p->edf){
166                 updatecpu(p);
167                 p->priority = reprioritize(p);
168         }
        /* a proc other than the one this cpu just readied gets a fresh HZ/10 tick quantum */
169         if(p != m->readied)
170                 m->schedticks = m->ticks + HZ/10;
171         m->readied = 0;
172         up = p;
173         up->state = Running;
174         up->mach = MACHP(m->machno);
175         m->proc = up;
176         mmuswitch(up);
177         gotolabel(&up->sched);
178 }
179
/*
 *  Nonzero if any run queue is non-empty.  Returns the runvec bit
 *  mask itself (converted to int), not a normalised 0/1.
 */
180 int
181 anyready(void)
182 {
183         return runvec;
184 }
185
/*
 *  Nonzero if a run queue of strictly higher priority than the current
 *  process is non-empty: masks off runvec bits 0..up->priority and
 *  returns what is left.  Dereferences up, so up must not be nil.
 */
186 int
187 anyhigher(void)
188 {
189         return runvec & ~((1<<(up->priority+1))-1);
190 }
191
192 /*
193  *  here once per clock tick to see if we should resched
194  */
195 void
196 hzsched(void)
197 {
198         /* once a second, rebalance will reprioritize ready procs */
199         if(m->machno == 0)
200                 rebalance();
201
202         /* unless preempted, get to run for at least 100ms */
203         if(anyhigher()
204         || (!up->fixedpri && m->ticks > m->schedticks && anyready())){
205                 m->readied = nil;       /* avoid cooperative scheduling */
206                 up->delaysched++;
207         }
208 }
209
210 /*
211  *  here at the end of non-clock interrupts to see if we should preempt the
212  *  current process.  Returns 1 if preempted, 0 otherwise.
213  */
214 int
215 preempted(void)
216 {
217         if(up && up->state == Running)
218         if(up->preempted == 0)
219         if(anyhigher())
220         if(!active.exiting){
221                 m->readied = nil;       /* avoid cooperative scheduling */
222                 up->preempted = 1;
223                 sched();
224                 splhi();
225                 up->preempted = 0;
226                 return 1;
227         }
228         return 0;
229 }
230
231 /*
232  * Update the cpu time average for this particular process,
233  * which is about to change from up -> not up or vice versa.
234  * p->lastupdate is the last time an updatecpu happened.
235  *
236  * The cpu time average is a decaying average that lasts
237  * about D clock ticks.  D is chosen to be approximately
238  * the cpu time of a cpu-intensive "quick job".  A job has to run
239  * for approximately D clock ticks before we home in on its 
240  * actual cpu usage.  Thus if you manage to get in and get out
241  * quickly, you won't be penalized during your burst.  Once you
242  * start using your share of the cpu for more than about D
243  * clock ticks though, your p->cpu hits 1000 (1.0) and you end up 
244  * below all the other quick jobs.  Interactive tasks, because
245  * they basically always use less than their fair share of cpu,
246  * will be rewarded.
247  *
248  * If the process has not been running, then we want to
249  * apply the filter
250  *
251  *      cpu = cpu * (D-1)/D
252  *
253  * n times, yielding 
254  * 
255  *      cpu = cpu * ((D-1)/D)^n
256  *
257  * but D is big enough that this is approximately 
258  *
259  *      cpu = cpu * (D-n)/D
260  *
261  * so we use that instead.
262  * 
263  * If the process has been running, we apply the filter to
264  * 1 - cpu, yielding a similar equation.  Note that cpu is 
265  * stored in fixed point (* 1000).
266  *
267  * Updatecpu must be called before changing up, in order
268  * to maintain accurate cpu usage statistics.  It can be called
269  * at any time to bring the stats for a given proc up-to-date.
270  */
271 void
272 updatecpu(Proc *p)
273 {
274         int n, t, ocpu;
275         int D = schedgain*HZ*Scaling;
276
277         if(p->edf)
278                 return;
279
280         t = MACHP(0)->ticks*Scaling + Scaling/2;
281         n = t - p->lastupdate;
282         p->lastupdate = t;
283
284         if(n == 0)
285                 return;
286         if(n > D)
287                 n = D;
288
289         ocpu = p->cpu;
290         if(p != up)
291                 p->cpu = (ocpu*(D-n))/D;
292         else{
293                 t = 1000 - ocpu;
294                 t = (t*(D-n))/D;
295                 p->cpu = 1000 - t;
296         }
297
298 //iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
299 }
300
301 /*
302  * On average, p has used p->cpu of a cpu recently.
303  * Its fair share is conf.nmach/m->load of a cpu.  If it has been getting
304  * too much, penalize it.  If it has been getting not enough, reward it.
305  * I don't think you can get much more than your fair share that 
306  * often, so most of the queues are for using less.  Having a priority
307  * of 3 means you're just right.  Having a higher priority (up to p->basepri) 
308  * means you're not using as much as you could.
309  */
310 int
311 reprioritize(Proc *p)
312 {
313         int fairshare, n, load, ratio;
314
315         load = MACHP(0)->load;
316         if(load == 0)
317                 return p->basepri;
318
319         /*
320          *  fairshare = 1.000 * conf.nproc * 1.000/load,
321          * except the decimal point is moved three places
322          * on both load and fairshare.
323          */
324         fairshare = (conf.nmach*1000*1000)/load;
325         n = p->cpu;
326         if(n == 0)
327                 n = 1;
328         ratio = (fairshare+n/2) / n;
329         if(ratio > p->basepri)
330                 ratio = p->basepri;
331         if(ratio < 0)
332                 panic("reprioritize");
333 //iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
334         return ratio;
335 }
336
337 /*
338  * add a process to a scheduling queue
339  */
340 void
341 queueproc(Schedq *rq, Proc *p)
342 {
343         int pri;
344
345         pri = rq - runq;
346         lock(runq);
347         p->priority = pri;
348         p->rnext = 0;
349         if(rq->tail)
350                 rq->tail->rnext = p;
351         else
352                 rq->head = p;
353         rq->tail = p;
354         rq->n++;
355         nrdy++;
356         runvec |= 1<<pri;
357         unlock(runq);
358 }
359
360 /*
361  *  try to remove a process from a scheduling queue (called splhi)
362  */
363 Proc*
364 dequeueproc(Schedq *rq, Proc *tp)
365 {
366         Proc *l, *p;
367
368         if(!canlock(runq))
369                 return nil;
370
371         /*
372          *  the queue may have changed before we locked runq,
373          *  refind the target process.
374          */
375         l = 0;
376         for(p = rq->head; p; p = p->rnext){
377                 if(p == tp)
378                         break;
379                 l = p;
380         }
381
382         /*
383          *  p->mach==0 only when process state is saved
384          */
385         if(p == 0 || p->mach){
386                 unlock(runq);
387                 return nil;
388         }
389         if(p->rnext == 0)
390                 rq->tail = l;
391         if(l)
392                 l->rnext = p->rnext;
393         else
394                 rq->head = p->rnext;
395         if(rq->head == nil)
396                 runvec &= ~(1<<(rq-runq));
397         rq->n--;
398         nrdy--;
399         if(p->state != Ready)
400                 print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);
401
402         unlock(runq);
403         return p;
404 }
405
406 /*
407  *  ready(p) picks a new priority for a process and sticks it in the
408  *  runq for that priority.
409  */
410 void
411 ready(Proc *p)
412 {
413         int s, pri;
414         Schedq *rq;
415         void (*pt)(Proc*, int, vlong);
416
417         if(p->state == Ready){
418                 print("double ready %s %lud pc %p\n", p->text, p->pid, getcallerpc(&p));
419                 return;
420         }
421                 
422         s = splhi();
423         if(edfready(p)){
424                 splx(s);
425                 return;
426         }
427
428         if(up != p)
429                 m->readied = p; /* group scheduling */
430
431         updatecpu(p);
432         pri = reprioritize(p);
433         p->priority = pri;
434         rq = &runq[pri];
435         p->state = Ready;
436         queueproc(rq, p);
437         pt = proctrace;
438         if(pt)
439                 pt(p, SReady, 0);
440         splx(s);
441 }
442
443 /*
444  *  yield the processor and drop our priority
445  */
446 void
447 yield(void)
448 {
449         if(anyready()){
450                 /* pretend we just used 1/2 tick */
451                 up->lastupdate -= Scaling/2;  
452                 sched();
453         }
454 }
455
456 /*
457  *  recalculate priorities once a second.  We need to do this
458  *  since priorities will otherwise only be recalculated when
459  *  the running process blocks.
460  */
461 ulong balancetime;
462
463 static void
464 rebalance(void)
465 {
466         int pri, npri, t, x;
467         Schedq *rq;
468         Proc *p;
469
470         t = m->ticks;
471         if(t - balancetime < HZ)
472                 return;
473         balancetime = t;
474
475         for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
476 another:
477                 p = rq->head;
478                 if(p == nil)
479                         continue;
480                 if(p->mp != MACHP(m->machno))
481                         continue;
482                 if(pri == p->basepri)
483                         continue;
484                 updatecpu(p);
485                 npri = reprioritize(p);
486                 if(npri != pri){
487                         x = splhi();
488                         p = dequeueproc(rq, p);
489                         if(p)
490                                 queueproc(&runq[npri], p);
491                         splx(x);
492                         goto another;
493                 }
494         }
495 }
496         
497
498 /*
499  *  pick a process to run
 *
 *  Returns a process removed from a run queue, with state Scheding
 *  and mp set to this processor.  Does not return until one is found;
 *  while nothing is runnable it calls idlehands() and charges the
 *  time to m->perf.inidle.
500  */
501 Proc*
502 runproc(void)
503 {
504         Schedq *rq;
505         Proc *p;
506         ulong start, now;
507         int i;
508         void (*pt)(Proc*, int, vlong);
509
510         start = perfticks();
511
512         /* cooperative scheduling until the clock ticks */
        /*
         * shortcut: run the proc this cpu readied last, provided its
         * state is saved, it is still Ready, and the two highest
         * priority queues are empty.
         */
513         if((p=m->readied) && p->mach==0 && p->state==Ready
514         && runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
515                 skipscheds++;
516                 rq = &runq[p->priority];
517                 goto found;
518         }
519
520         preempts++;
521
522 loop:
523         /*
524          *  find a process that last ran on this processor (affinity),
525          *  or one that hasn't moved in a while (load balancing).  Every
526          *  time around the loop affinity goes down.
527          */
528         spllo();
        /*
         * i counts passes over the queues: on pass 0 only procs with no
         * previous processor or whose previous processor is this one
         * qualify; on later passes any proc that is not wired does.
         */
529         for(i = 0;; i++){
530                 /*
531                  *  find the highest priority target process that this
532                  *  processor can run given affinity constraints.
533                  *
534                  */
                /* the queues are read here without the runq lock; dequeueproc() rechecks under it */
535                 for(rq = &runq[Nrq-1]; rq >= runq; rq--){
536                         for(p = rq->head; p; p = p->rnext){
537                                 if(p->mp == nil || p->mp == MACHP(m->machno)
538                                 || (!p->wired && i > 0))
539                                         goto found;
540                         }
541                 }
542
543                 /* waste time or halt the CPU */
544                 idlehands();
545
546                 /* remember how much time we're here */
547                 now = perfticks();
548                 m->perf.inidle += now-start;
549                 start = now;
550         }
551
552 found:
553         splhi();
        /* nil means the lock was busy or the candidate is gone or not yet saved: rescan */
554         p = dequeueproc(rq, p);
555         if(p == nil)
556                 goto loop;
557
558         p->state = Scheding;
559         p->mp = MACHP(m->machno);
560
561         if(edflock(p)){
562                 edfrun(p, rq == &runq[PriEdf]); /* start deadline timer and do admin */
563                 edfunlock();
564         }
        /* read the trace hook once so the test and the call agree */
565         pt = proctrace;
566         if(pt)
567                 pt(p, SRun, 0);
568         return p;
569 }
570
571 int
572 canpage(Proc *p)
573 {
574         int ok = 0;
575
576         splhi();
577         lock(runq);
578         /* Only reliable way to see if we are Running */
579         if(p->mach == 0) {
580                 p->newtlb = 1;
581                 ok = 1;
582         }
583         unlock(runq);
584         spllo();
585
586         return ok;
587 }
588
589 Proc*
590 newproc(void)
591 {
592         char msg[64];
593         Proc *p;
594
595         lock(&procalloc);
596         for(;;) {
597                 if(p = procalloc.free)
598                         break;
599
600                 snprint(msg, sizeof msg, "no procs; %s forking",
601                         up? up->text: "kernel");
602                 unlock(&procalloc);
603                 resrcwait(msg);
604                 lock(&procalloc);
605         }
606         procalloc.free = p->qnext;
607         unlock(&procalloc);
608
609         p->state = Scheding;
610         p->psstate = "New";
611         p->mach = 0;
612         p->eql = 0;
613         p->qnext = 0;
614         p->nchild = 0;
615         p->nwait = 0;
616         p->waitq = 0;
617         p->parent = 0;
618         p->pgrp = 0;
619         p->egrp = 0;
620         p->fgrp = 0;
621         p->rgrp = 0;
622         p->pdbg = 0;
623         p->fpstate = FPinit;
624         p->kp = 0;
625         if(up && up->procctl == Proc_tracesyscall)
626                 p->procctl = Proc_tracesyscall;
627         else
628                 p->procctl = 0;
629         p->syscalltrace = 0;    
630         p->notepending = 0;
631         p->ureg = 0;
632         p->privatemem = 0;
633         p->noswap = 0;
634         p->errstr = p->errbuf0;
635         p->syserrstr = p->errbuf1;
636         p->errbuf0[0] = '\0';
637         p->errbuf1[0] = '\0';
638         p->nlocks.ref = 0;
639         p->delaysched = 0;
640         p->trace = 0;
641         kstrdup(&p->user, "*nouser");
642         kstrdup(&p->text, "*notext");
643         kstrdup(&p->args, "");
644         p->nargs = 0;
645         p->setargs = 0;
646         memset(p->seg, 0, sizeof p->seg);
647         p->noteid = pidalloc(p);
648         if(p->kstack == 0)
649                 p->kstack = smalloc(KSTACK);
650
651         /* sched params */
652         p->mp = 0;
653         p->wired = 0;
654         procpriority(p, PriNormal, 0);
655         p->cpu = 0;
656         p->lastupdate = MACHP(0)->ticks*Scaling;
657         p->edf = nil;
658
659         return p;
660 }
661
662 /*
663  * wire this proc to a machine
664  */
665 void
666 procwired(Proc *p, int bm)
667 {
668         Proc *pp;
669         int i;
670         char nwired[MAXMACH];
671         Mach *wm;
672
673         if(bm < 0){
674                 /* pick a machine to wire to */
675                 memset(nwired, 0, sizeof(nwired));
676                 p->wired = 0;
677                 pp = proctab(0);
678                 for(i=0; i<conf.nproc; i++, pp++){
679                         wm = pp->wired;
680                         if(wm && pp->pid)
681                                 nwired[wm->machno]++;
682                 }
683                 bm = 0;
684                 for(i=0; i<conf.nmach; i++)
685                         if(nwired[i] < nwired[bm])
686                                 bm = i;
687         } else {
688                 /* use the virtual machine requested */
689                 bm = bm % conf.nmach;
690         }
691
692         p->wired = MACHP(bm);
693         p->mp = p->wired;
694 }
695
696 void
697 procpriority(Proc *p, int pri, int fixed)
698 {
699         if(pri >= Npriq)
700                 pri = Npriq - 1;
701         else if(pri < 0)
702                 pri = 0;
703         p->basepri = pri;
704         p->priority = pri;
705         if(fixed){
706                 p->fixedpri = 1;
707         } else {
708                 p->fixedpri = 0;
709         }
710 }
711
712 void
713 procinit0(void)         /* bad planning - clashes with devproc.c */
714 {
715         Proc *p;
716         int i;
717
718         procalloc.free = xalloc(conf.nproc*sizeof(Proc));
719         if(procalloc.free == nil){
720                 xsummary();
721                 panic("cannot allocate %lud procs (%ludMB)\n", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024));
722         }
723         procalloc.arena = procalloc.free;
724
725         p = procalloc.free;
726         for(i=0; i<conf.nproc-1; i++,p++)
727                 p->qnext = p+1;
728         p->qnext = 0;
729 }
730
731 /*
732  *  sleep if a condition is not true.  Another process will
733  *  awaken us after it sets the condition.  When we awaken
734  *  the condition may no longer be true.
735  *
736  *  we lock both the process and the rendezvous to keep r->p
737  *  and p->r synchronized.
 *
 *  r is the rendezvous to sleep on; f(arg) is the condition, evaluated
 *  with both locks held.  If a note is pending on entry or on wakeup,
 *  notepending is cleared and error(Eintr) is raised instead of
 *  returning normally.
738  */
739 void
740 sleep(Rendez *r, int (*f)(void*), void *arg)
741 {
742         int s;
743         void (*pt)(Proc*, int, vlong);
744
745         s = splhi();
746
        /*
         * NOTE(review): the print dereferences up->lastlock; this
         * presumes it is non-nil whenever nlocks.ref is nonzero.
         */
747         if(up->nlocks.ref)
748                 print("process %lud sleeps with %lud locks held, last lock %#p locked at pc %#lux, sleep called from %#p\n",
749                         up->pid, up->nlocks.ref, up->lastlock, up->lastlock->pc, getcallerpc(&r));
        /* lock order: rendezvous first, then the process (same as wakeup) */
750         lock(r);
751         lock(&up->rlock);
752         if(r->p){
753                 print("double sleep called from %#p, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
754                 dumpstack();
755         }
756
757         /*
758          *  Wakeup only knows there may be something to do by testing
759          *  r->p in order to get something to lock on.
760          *  Flush that information out to memory in case the sleep is
761          *  committed.
762          */
763         r->p = up;
764
765         if((*f)(arg) || up->notepending){
766                 /*
767                  *  if condition happened or a note is pending
768                  *  never mind
769                  */
770                 r->p = nil;
771                 unlock(&up->rlock);
772                 unlock(r);
773         } else {
774                 /*
775                  *  now we are committed to
776                  *  change state and call scheduler
777                  */
778                 pt = proctrace;
779                 if(pt)
780                         pt(up, SSleep, 0);
781                 up->state = Wakeme;
782                 up->r = r;
783
784                 /* statistics */
785                 m->cs++;
786
                /* from here on this is the copy of the guts of sched() */
787                 procsave(up);
788                 if(setlabel(&up->sched)) {
789                         /*
790                          *  here when the process is awakened
791                          */
792                         procrestore(up);
793                         spllo();
794                 } else {
795                         /*
796                          *  here to go to sleep (i.e. stop Running)
797                          */
                        /* the locks are dropped only after the context has been saved */
798                         unlock(&up->rlock);
799                         unlock(r);
800                         gotolabel(&m->sched);
801                 }
802         }
803
804         if(up->notepending) {
805                 up->notepending = 0;
806                 splx(s);
807                 if(up->procctl == Proc_exitme && up->closingfgrp)
808                         forceclosefgrp();
                /* presumably does not return (kernel error unwinding) -- the splx below is for the normal path */
809                 error(Eintr);
810         }
811
812         splx(s);
813 }
814
815 static int
816 tfn(void *arg)
817 {
818         return up->trend == nil || up->tfn(arg);
819 }
820
821 void
822 twakeup(Ureg*, Timer *t)
823 {
824         Proc *p;
825         Rendez *trend;
826
827         p = t->ta;
828         trend = p->trend;
829         p->trend = 0;
830         if(trend)
831                 wakeup(trend);
832 }
833
/*
 *  sleep() with a timeout: sleep on r until fn(arg) is true or ms
 *  milliseconds have passed.  The current process's own timer fields
 *  (up->tns, tf, tmode, ta) are used, so only one timed sleep can be
 *  active per process; a timer found still active on entry is
 *  reported and removed.
 */
834 void
835 tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
836 {
837         if (up->tt){
838                 print("tsleep: timer active: mode %d, tf %#p\n", up->tmode, up->tf);
839                 timerdel(up);
840         }
        /* arm a relative timer that calls twakeup(up) after ms milliseconds */
841         up->tns = MS2NS(ms);
842         up->tf = twakeup;
843         up->tmode = Trelative;
844         up->ta = up;
845         up->trend = r;
846         up->tfn = fn;
847         timeradd(up);
848
        /* if sleep() raises an error (e.g. Eintr), remove the timer before passing it on */
849         if(waserror()){
850                 timerdel(up);
851                 nexterror();
852         }
        /* tfn wraps fn so that the sleep also ends once the timer has fired */
853         sleep(r, tfn, arg);
854         if (up->tt)
855                 timerdel(up);
856         up->twhen = 0;
857         poperror();
858 }
859
860 /*
861  *  Expects that only one process can call wakeup for any given Rendez.
862  *  We hold both locks to ensure that r->p and p->r remain consistent.
863  *  Richard Miller has a better solution that doesn't require both to
864  *  be held simultaneously, but I'm a paranoid - presotto.
865  */
866 Proc*
867 wakeup(Rendez *r)
868 {
869         Proc *p;
870         int s;
871
872         s = splhi();
873
874         lock(r);
875         p = r->p;
876
877         if(p != nil){
878                 lock(&p->rlock);
879                 if(p->state != Wakeme || p->r != r){
880                         iprint("%p %p %d\n", p->r, r, p->state);
881                         panic("wakeup: state");
882                 }
883                 r->p = nil;
884                 p->r = nil;
885                 ready(p);
886                 unlock(&p->rlock);
887         }
888         unlock(r);
889
890         splx(s);
891
892         return p;
893 }
894
895 /*
896  *  if waking a sleeping process, this routine must hold both
897  *  p->rlock and r->lock.  However, it can't know them in
898  *  the same order as wakeup causing a possible lock ordering
899  *  deadlock.  We break the deadlock by giving up the p->rlock
900  *  lock if we can't get the r->lock and retrying.
 *
 *  Post note n with the given flag to process p and pull p out of
 *  whatever it is blocked in (sleep, eqlock queue or rendezvous).
 *  dolock says whether to take p->debug around the note-queue update.
 *  Returns 1 if the note was queued, 0 if n is nil or the queue is
 *  full; p->notepending is set and p is woken in either case.
901  */
902 int
903 postnote(Proc *p, int dolock, char *n, int flag)
904 {
905         int s, ret;
906         QLock *q;
907
908         if(dolock)
909                 qlock(&p->debug);
910
        /*
         * a non-user note for a process that has no handler (or is
         * already inside one) discards any notes queued so far
         */
911         if(n != nil && flag != NUser && (p->notify == 0 || p->notified))
912                 p->nnote = 0;
913
914         ret = 0;
915         if(p->nnote < NNOTE && n != nil) {
                /* NOTE(review): strcpy is unbounded; relies on callers keeping n within the size of msg */
916                 strcpy(p->note[p->nnote].msg, n);
917                 p->note[p->nnote++].flag = flag;
918                 ret = 1;
919         }
920         p->notepending = 1;
921         if(dolock)
922                 qunlock(&p->debug);
923
924         /* this loop is to avoid lock ordering problems. */
        /* both break paths leave the loop with p->rlock held and s valid */
925         for(;;){
926                 Rendez *r;
927
928                 s = splhi();
929                 lock(&p->rlock);
930                 r = p->r;
931
932                 /* waiting for a wakeup? */
933                 if(r == nil)
934                         break;  /* no */
935
936                 /* try for the second lock */
937                 if(canlock(r)){
938                         if(p->state != Wakeme || r->p != p)
939                                 panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
940                         p->r = nil;
941                         r->p = nil;
942                         ready(p);
943                         unlock(r);
944                         break;
945                 }
946
947                 /* give other process time to get out of critical section and try again */
948                 unlock(&p->rlock);
949                 splx(s);
950                 sched();
951         }
952         unlock(&p->rlock);
953         splx(s);
954
        /* p->state is read here without a lock; each case rechecks it under its own lock */
955         switch(p->state){
956         case Queueing:
957                 /* Try and pull out of a eqlock */
958                 if(q = p->eql){
959                         lock(&q->use);
960                         if(p->state == Queueing && p->eql == q){
961                                 Proc *d, *l;
962
                                /* unlink p from the qlock's wait list; l trails d by one */
963                                 for(l = nil, d = q->head; d; l = d, d = d->qnext){
964                                         if(d == p){
965                                                 if(l)
966                                                         l->qnext = p->qnext;
967                                                 else
968                                                         q->head = p->qnext;
969                                                 if(p->qnext == 0)
970                                                         q->tail = l;
971                                                 p->qnext = 0;
972                                                 p->eql = 0;     /* not taken */
973                                                 ready(p);
974                                                 break;
975                                         }
976                                 }
977                         }
978                         unlock(&q->use);
979                 }
980                 break;
981         case Rendezvous:
982                 /* Try and pull out of a rendezvous */
983                 lock(p->rgrp);
984                 if(p->state == Rendezvous) {
985                         Proc *d, **l;
986
                        /* ~0 is stored as the rendezvous value of the interrupted proc */
987                         p->rendval = ~0;
                        /* remove p from its rendezvous hash chain; l is the link that points at d */
988                         l = &REND(p->rgrp, p->rendtag);
989                         for(d = *l; d; d = d->rendhash) {
990                                 if(d == p) {
991                                         *l = p->rendhash;
992                                         break;
993                                 }
994                                 l = &d->rendhash;
995                         }
996                         ready(p);
997                 }
998                 unlock(p->rgrp);
999                 break;
1000         }
1001         return ret;
1002 }
1003
1004 /*
1005  * weird thing: keep at most NBROKEN around
 *
 * Processes in state Broken are remembered here, oldest first.
 * The embedded QLock (qlock(&broken)) protects n and p[].
1006  */
1007 #define NBROKEN 4
1008 struct
1009 {
1010         QLock;
1011         int     n;              /* number of valid entries in p[] */
1012         Proc    *p[NBROKEN];    /* broken procs; p[0] is the oldest */
1013 }broken;
1014
1015 void
1016 addbroken(Proc *p)
1017 {
1018         qlock(&broken);
1019         if(broken.n == NBROKEN) {
1020                 ready(broken.p[0]);
1021                 memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
1022                 --broken.n;
1023         }
1024         broken.p[broken.n++] = p;
1025         qunlock(&broken);
1026
1027         edfstop(up);
1028         p->state = Broken;
1029         p->psstate = 0;
1030         sched();
1031 }
1032
1033 void
1034 unbreak(Proc *p)
1035 {
1036         int b;
1037
1038         qlock(&broken);
1039         for(b=0; b < broken.n; b++)
1040                 if(broken.p[b] == p) {
1041                         broken.n--;
1042                         memmove(&broken.p[b], &broken.p[b+1],
1043                                         sizeof(Proc*)*(NBROKEN-(b+1)));
1044                         ready(p);
1045                         break;
1046                 }
1047         qunlock(&broken);
1048 }
1049
1050 int
1051 freebroken(void)
1052 {
1053         int i, n;
1054
1055         qlock(&broken);
1056         n = broken.n;
1057         for(i=0; i<n; i++) {
1058                 ready(broken.p[i]);
1059                 broken.p[i] = 0;
1060         }
1061         broken.n = 0;
1062         qunlock(&broken);
1063         return n;
1064 }
1065
1066 void
1067 pexit(char *exitstr, int freemem)
1068 {
1069         Proc *p;
1070         Segment **s, **es;
1071         long utime, stime;
1072         Waitq *wq, *f, *next;
1073         Fgrp *fgrp;
1074         Egrp *egrp;
1075         Rgrp *rgrp;
1076         Pgrp *pgrp;
1077         Chan *dot;
1078         void (*pt)(Proc*, int, vlong);
1079
1080         if(up->syscalltrace)
1081                 free(up->syscalltrace);
1082         up->alarm = 0;
1083         if (up->tt)
1084                 timerdel(up);
1085         pt = proctrace;
1086         if(pt)
1087                 pt(up, SDead, 0);
1088
1089         /* nil out all the resources under lock (free later) */
1090         qlock(&up->debug);
1091         fgrp = up->fgrp;
1092         up->fgrp = nil;
1093         egrp = up->egrp;
1094         up->egrp = nil;
1095         rgrp = up->rgrp;
1096         up->rgrp = nil;
1097         pgrp = up->pgrp;
1098         up->pgrp = nil;
1099         dot = up->dot;
1100         up->dot = nil;
1101         qunlock(&up->debug);
1102
1103         if(fgrp)
1104                 closefgrp(fgrp);
1105         if(egrp)
1106                 closeegrp(egrp);
1107         if(rgrp)
1108                 closergrp(rgrp);
1109         if(dot)
1110                 cclose(dot);
1111         if(pgrp)
1112                 closepgrp(pgrp);
1113
1114         /*
1115          * if not a kernel process and have a parent,
1116          * do some housekeeping.
1117          */
1118         if(up->kp == 0) {
1119                 p = up->parent;
1120                 if(p == 0) {
1121                         if(exitstr == 0)
1122                                 exitstr = "unknown";
1123                         panic("boot process died: %s", exitstr);
1124                 }
1125
1126                 while(waserror())
1127                         ;
1128
1129                 wq = smalloc(sizeof(Waitq));
1130                 poperror();
1131
1132                 wq->w.pid = up->pid;
1133                 utime = up->time[TUser] + up->time[TCUser];
1134                 stime = up->time[TSys] + up->time[TCSys];
1135                 wq->w.time[TUser] = tk2ms(utime);
1136                 wq->w.time[TSys] = tk2ms(stime);
1137                 wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
1138                 if(exitstr && exitstr[0])
1139                         snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
1140                 else
1141                         wq->w.msg[0] = '\0';
1142
1143                 lock(&p->exl);
1144                 /*
1145                  * Check that parent is still alive.
1146                  */
1147                 if(p->pid == up->parentpid && p->state != Broken) {
1148                         p->nchild--;
1149                         p->time[TCUser] += utime;
1150                         p->time[TCSys] += stime;
1151                         /*
1152                          * If there would be more than 128 wait records
1153                          * processes for my parent, then don't leave a wait
1154                          * record behind.  This helps prevent badly written
1155                          * daemon processes from accumulating lots of wait
1156                          * records.
1157                          */
1158                         if(p->nwait < 128) {
1159                                 wq->next = p->waitq;
1160                                 p->waitq = wq;
1161                                 p->nwait++;
1162                                 wq = nil;
1163                                 wakeup(&p->waitr);
1164                         }
1165                 }
1166                 unlock(&p->exl);
1167                 if(wq)
1168                         free(wq);
1169         }
1170
1171         if(!freemem)
1172                 addbroken(up);
1173
1174         qlock(&up->seglock);
1175         es = &up->seg[NSEG];
1176         for(s = up->seg; s < es; s++) {
1177                 if(*s) {
1178                         putseg(*s);
1179                         *s = 0;
1180                 }
1181         }
1182         qunlock(&up->seglock);
1183
1184         lock(&up->exl);         /* Prevent my children from leaving waits */
1185         pidfree(up);
1186         up->pid = 0;
1187         wakeup(&up->waitr);
1188         unlock(&up->exl);
1189
1190         for(f = up->waitq; f; f = next) {
1191                 next = f->next;
1192                 free(f);
1193         }
1194
1195         /* release debuggers */
1196         qlock(&up->debug);
1197         if(up->pdbg) {
1198                 wakeup(&up->pdbg->sleep);
1199                 up->pdbg = 0;
1200         }
1201         qunlock(&up->debug);
1202
1203         /* Sched must not loop for these locks */
1204         lock(&procalloc);
1205         lock(&palloc);
1206
1207         edfstop(up);
1208         up->state = Moribund;
1209         sched();
1210         panic("pexit");
1211 }
1212
1213 int
1214 haswaitq(void *x)
1215 {
1216         Proc *p;
1217
1218         p = (Proc *)x;
1219         return p->waitq != 0;
1220 }
1221
1222 ulong
1223 pwait(Waitmsg *w)
1224 {
1225         ulong cpid;
1226         Waitq *wq;
1227
1228         if(!canqlock(&up->qwaitr))
1229                 error(Einuse);
1230
1231         if(waserror()) {
1232                 qunlock(&up->qwaitr);
1233                 nexterror();
1234         }
1235
1236         lock(&up->exl);
1237         if(up->nchild == 0 && up->waitq == 0) {
1238                 unlock(&up->exl);
1239                 error(Enochild);
1240         }
1241         unlock(&up->exl);
1242
1243         sleep(&up->waitr, haswaitq, up);
1244
1245         lock(&up->exl);
1246         wq = up->waitq;
1247         up->waitq = wq->next;
1248         up->nwait--;
1249         unlock(&up->exl);
1250
1251         qunlock(&up->qwaitr);
1252         poperror();
1253
1254         if(w)
1255                 memmove(w, &wq->w, sizeof(Waitmsg));
1256         cpid = wq->w.pid;
1257         free(wq);
1258         return cpid;
1259 }
1260
1261 Proc*
1262 proctab(int i)
1263 {
1264         return &procalloc.arena[i];
1265 }
1266
1267 void
1268 dumpaproc(Proc *p)
1269 {
1270         ulong bss;
1271         char *s;
1272
1273         if(p == 0)
1274                 return;
1275
1276         bss = 0;
1277         if(p->seg[BSEG])
1278                 bss = p->seg[BSEG]->top;
1279
1280         s = p->psstate;
1281         if(s == 0)
1282                 s = statename[p->state];
1283         print("%3lud:%10s pc %8lux dbgpc %8lux  %8s (%s) ut %ld st %ld bss %lux qpc %lux nl %lud nd %lud lpc %lux pri %lud\n",
1284                 p->pid, p->text, p->pc, dbgpc(p),  s, statename[p->state],
1285                 p->time[0], p->time[1], bss, p->qpc, p->nlocks.ref, p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority);
1286 }
1287
1288 void
1289 procdump(void)
1290 {
1291         int i;
1292         Proc *p;
1293
1294         if(up)
1295                 print("up %lud\n", up->pid);
1296         else
1297                 print("no current process\n");
1298         for(i=0; i<conf.nproc; i++) {
1299                 p = &procalloc.arena[i];
1300                 if(p->state == Dead)
1301                         continue;
1302
1303                 dumpaproc(p);
1304         }
1305 }
1306
1307 /*
1308  *  wait till all processes have flushed their mmu
1309  *  state about segement s
1310  */
1311 void
1312 procflushseg(Segment *s)
1313 {
1314         int i, ns, nm, nwait;
1315         Proc *p;
1316
1317         /*
1318          *  tell all processes with this
1319          *  segment to flush their mmu's
1320          */
1321         nwait = 0;
1322         for(i=0; i<conf.nproc; i++) {
1323                 p = &procalloc.arena[i];
1324                 if(p->state == Dead)
1325                         continue;
1326                 for(ns = 0; ns < NSEG; ns++)
1327                         if(p->seg[ns] == s){
1328                                 p->newtlb = 1;
1329                                 for(nm = 0; nm < conf.nmach; nm++){
1330                                         if(MACHP(nm)->proc == p){
1331                                                 MACHP(nm)->flushmmu = 1;
1332                                                 nwait++;
1333                                         }
1334                                 }
1335                                 break;
1336                         }
1337         }
1338
1339         if(nwait == 0)
1340                 return;
1341
1342         /*
1343          *  wait for all processors to take a clock interrupt
1344          *  and flush their mmu's
1345          */
1346         for(nm = 0; nm < conf.nmach; nm++)
1347                 if(MACHP(nm) != m)
1348                         while(MACHP(nm)->flushmmu)
1349                                 sched();
1350 }
1351
1352 void
1353 scheddump(void)
1354 {
1355         Proc *p;
1356         Schedq *rq;
1357
1358         for(rq = &runq[Nrq-1]; rq >= runq; rq--){
1359                 if(rq->head == 0)
1360                         continue;
1361                 print("rq%ld:", rq-runq);
1362                 for(p = rq->head; p; p = p->rnext)
1363                         print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
1364                 print("\n");
1365                 delay(150);
1366         }
1367         print("nrdy %d\n", nrdy);
1368 }
1369
1370 void
1371 kproc(char *name, void (*func)(void *), void *arg)
1372 {
1373         Proc *p;
1374         static Pgrp *kpgrp;
1375
1376         p = newproc();
1377         p->psstate = 0;
1378         p->procmode = 0640;
1379         p->kp = 1;
1380         p->noswap = 1;
1381
1382         p->fpsave = up->fpsave;
1383         p->scallnr = up->scallnr;
1384         p->s = up->s;
1385         p->nerrlab = 0;
1386         p->slash = up->slash;
1387         p->dot = up->dot;
1388         if(p->dot)
1389                 incref(p->dot);
1390
1391         memmove(p->note, up->note, sizeof(p->note));
1392         p->nnote = up->nnote;
1393         p->notified = 0;
1394         p->lastnote = up->lastnote;
1395         p->notify = up->notify;
1396         p->ureg = 0;
1397         p->dbgreg = 0;
1398
1399         procpriority(p, PriKproc, 0);
1400
1401         kprocchild(p, func, arg);
1402
1403         kstrdup(&p->user, eve);
1404         kstrdup(&p->text, name);
1405         if(kpgrp == 0)
1406                 kpgrp = newpgrp();
1407         p->pgrp = kpgrp;
1408         incref(kpgrp);
1409
1410         memset(p->time, 0, sizeof(p->time));
1411         p->time[TReal] = MACHP(0)->ticks;
1412         ready(p);
1413 }
1414
1415 /*
1416  *  called splhi() by notify().  See comment in notify for the
1417  *  reasoning.
1418  */
1419 void
1420 procctl(Proc *p)
1421 {
1422         char *state;
1423         ulong s;
1424
1425         switch(p->procctl) {
1426         case Proc_exitbig:
1427                 spllo();
1428                 pprint("Killed: Insufficient physical memory\n");
1429                 pexit("Killed: Insufficient physical memory", 1);
1430
1431         case Proc_exitme:
1432                 spllo();                /* pexit has locks in it */
1433                 pexit("Killed", 1);
1434
1435         case Proc_traceme:
1436                 if(p->nnote == 0)
1437                         return;
1438                 /* No break */
1439
1440         case Proc_stopme:
1441                 p->procctl = 0;
1442                 state = p->psstate;
1443                 p->psstate = "Stopped";
1444                 /* free a waiting debugger */
1445                 s = spllo();
1446                 qlock(&p->debug);
1447                 if(p->pdbg) {
1448                         wakeup(&p->pdbg->sleep);
1449                         p->pdbg = 0;
1450                 }
1451                 qunlock(&p->debug);
1452                 splhi();
1453                 p->state = Stopped;
1454                 sched();
1455                 p->psstate = state;
1456                 splx(s);
1457                 return;
1458         }
1459 }
1460
1461 #include "errstr.h"
1462
1463 void
1464 error(char *err)
1465 {
1466         spllo();
1467
1468         assert(up->nerrlab < NERR);
1469         kstrcpy(up->errstr, err, ERRMAX);
1470         setlabel(&up->errlab[NERR-1]);
1471         nexterror();
1472 }
1473
1474 void
1475 nexterror(void)
1476 {
1477         gotolabel(&up->errlab[--up->nerrlab]);
1478 }
1479
1480 void
1481 exhausted(char *resource)
1482 {
1483         char buf[ERRMAX];
1484
1485         snprint(buf, sizeof buf, "no free %s", resource);
1486         iprint("%s\n", buf);
1487         error(buf);
1488 }
1489
1490 void
1491 killbig(char *why)
1492 {
1493         int i;
1494         Segment *s;
1495         ulong l, max;
1496         Proc *p, *ep, *kp;
1497
1498         max = 0;
1499         kp = 0;
1500         ep = procalloc.arena+conf.nproc;
1501         for(p = procalloc.arena; p < ep; p++) {
1502                 if(p->state == Dead || p->kp)
1503                         continue;
1504                 l = 0;
1505                 for(i=1; i<NSEG; i++) {
1506                         s = p->seg[i];
1507                         if(s == 0 || !canqlock(&s->lk))
1508                                 continue;
1509                         l += (ulong)mcountseg(s);
1510                         qunlock(&s->lk);
1511                 }
1512                 if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) {
1513                         kp = p;
1514                         max = l;
1515                 }
1516         }
1517         if(kp == 0)
1518                 return;
1519         print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
1520         for(p = procalloc.arena; p < ep; p++) {
1521                 if(p->state == Dead || p->kp)
1522                         continue;
1523                 if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG])
1524                         p->procctl = Proc_exitbig;
1525         }
1526         kp->procctl = Proc_exitbig;
1527         for(i = 0; i < NSEG; i++) {
1528                 s = kp->seg[i];
1529                 if(s != 0 && canqlock(&s->lk)) {
1530                         mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
1531                         qunlock(&s->lk);
1532                 }
1533         }
1534 }
1535
1536 /*
1537  *  change ownership to 'new' of all processes owned by 'old'.  Used when
1538  *  eve changes.
1539  */
1540 void
1541 renameuser(char *old, char *new)
1542 {
1543         Proc *p, *ep;
1544
1545         ep = procalloc.arena+conf.nproc;
1546         for(p = procalloc.arena; p < ep; p++)
1547                 if(p->user!=nil && strcmp(old, p->user)==0)
1548                         kstrdup(&p->user, new);
1549 }
1550
1551 /*
1552  *  time accounting called by clock() splhi'd
1553  */
1554 void
1555 accounttime(void)
1556 {
1557         Proc *p;
1558         ulong n, per;
1559         static ulong nrun;
1560
1561         p = m->proc;
1562         if(p) {
1563                 nrun++;
1564                 p->time[p->insyscall]++;
1565         }
1566
1567         /* calculate decaying duty cycles */
1568         n = perfticks();
1569         per = n - m->perf.last;
1570         m->perf.last = n;
1571         per = (m->perf.period*(HZ-1) + per)/HZ;
1572         if(per != 0)
1573                 m->perf.period = per;
1574
1575         m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
1576         m->perf.inidle = 0;
1577
1578         m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
1579         m->perf.inintr = 0;
1580
1581         /* only one processor gets to compute system load averages */
1582         if(m->machno != 0)
1583                 return;
1584
1585         /*
1586          * calculate decaying load average.
1587          * if we decay by (n-1)/n then it takes
1588          * n clock ticks to go from load L to .36 L once
1589          * things quiet down.  it takes about 5 n clock
1590          * ticks to go to zero.  so using HZ means this is
1591          * approximately the load over the last second,
1592          * with a tail lasting about 5 seconds.
1593          */
1594         n = nrun;
1595         nrun = 0;
1596         n = (nrdy+n)*1000;
1597         m->load = (m->load*(HZ-1)+n)/HZ;
1598 }
1599
1600 int
1601 pidalloc(Proc *p)
1602 {
1603         static int gen, wrapped;
1604         int pid, h;
1605         Proc *x;
1606
1607         lock(&procalloc);
1608 Retry:
1609         pid = ++gen & 0x7FFFFFFF;
1610         if(pid == 0){
1611                 wrapped = 1;
1612                 goto Retry;
1613         }
1614         h = pid % nelem(procalloc.ht);
1615         if(wrapped)
1616                 for(x = procalloc.ht[h]; x != nil; x = x->pidhash)
1617                         if(x->pid == pid)
1618                                 goto Retry;
1619         if(p){
1620                 p->pid = pid;
1621                 p->pidhash = procalloc.ht[h];
1622                 procalloc.ht[h] = p;
1623         }
1624         unlock(&procalloc);
1625         return pid;
1626 }
1627
1628 static void
1629 pidfree(Proc *p)
1630 {
1631         int h;
1632         Proc **l;
1633
1634         h = p->pid % nelem(procalloc.ht);
1635         lock(&procalloc);
1636         for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
1637                 if(*l == p){
1638                         *l = p->pidhash;
1639                         break;
1640                 }
1641         unlock(&procalloc);
1642 }
1643
1644 int
1645 procindex(ulong pid)
1646 {
1647         Proc *p;
1648         int h;
1649         int s;
1650
1651         s = -1;
1652         h = pid % nelem(procalloc.ht);
1653         lock(&procalloc);
1654         for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
1655                 if(p->pid == pid){
1656                         s = p - procalloc.arena;
1657                         break;
1658                 }
1659         unlock(&procalloc);
1660         return s;
1661 }