[plan9front.git] / sys / src / 9 / port / proc.c
1 #include        <u.h>
2 #include        "../port/lib.h"
3 #include        "mem.h"
4 #include        "dat.h"
5 #include        "fns.h"
6 #include        "../port/error.h"
7 #include        "edf.h"
8 #include        <trace.h>
9
10 int     schedgain = 30; /* units in seconds */
11 int     nrdy;
12
13 void updatecpu(Proc*);
14 int reprioritize(Proc*);
15
16 ulong   delayedscheds;  /* statistics */
17 long skipscheds;
18 long preempts;
19 ulong load;
20
21 static struct Procalloc
22 {
23         Lock;
24         Proc*   ht[128];
25         Proc*   arena;
26         Proc*   free;
27 } procalloc;
28
29 enum
30 {
31         Q=10,
32         DQ=4,
33         Scaling=2,
34 };
35
36 Schedq  runq[Nrq];
37 ulong   runvec;
38
39 char *statename[] =
40 {       /* BUG: generate automatically */
41         "Dead",
42         "Moribund",
43         "Ready",
44         "Scheding",
45         "Running",
46         "Queueing",
47         "QueueingR",
48         "QueueingW",
49         "Wakeme",
50         "Broken",
51         "Stopped",
52         "Rendez",
53         "Waitrelease",
54 };
55
56 static void pidfree(Proc*);
57 static void rebalance(void);
58
59 /*
60  * Always splhi()'ed.
61  */
62 void
63 schedinit(void)         /* never returns */
64 {
65         Edf *e;
66
67         setlabel(&m->sched);
68         if(up != nil) {
69                 if((e = up->edf) != nil && (e->flags & Admitted))
70                         edfrecord(up);
71                 m->proc = nil;
72                 switch(up->state) {
73                 case Running:
74                         ready(up);
75                         break;
76                 case Moribund:
77                         up->state = Dead;
78                         edfstop(up);
79                         if(up->edf != nil)
80                                 free(up->edf);
81                         up->edf = nil;
82
83                         /*
84                          * Holding locks from pexit:
85                          *      procalloc
86                          *      palloc
87                          */
88                         mmurelease(up);
89                         unlock(&palloc);
90
91                         up->mach = nil;
92                         updatecpu(up);
93
94                         up->qnext = procalloc.free;
95                         procalloc.free = up;
96
97                         /* proc is free now, make sure unlock() won't touch it */
98                         up = procalloc.Lock.p = nil;
99                         unlock(&procalloc);
100                         sched();
101                 }
102                 up->mach = nil;
103                 updatecpu(up);
104                 up = nil;
105         }
106         sched();
107 }
108
109 /*
110  *  If changing this routine, look also at sleep().  It
111  *  contains a copy of the guts of sched().
112  */
113 void
114 sched(void)
115 {
116         Proc *p;
117
118         if(m->ilockdepth)
119                 panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
120                         m->machno,
121                         m->ilockdepth,
122                         up != nil ? up->lastilock: nil,
123                         (up != nil && up->lastilock != nil) ? up->lastilock->pc: 0,
124                         getcallerpc(&p+2));
125         if(up != nil) {
126                 /*
127                  * Delay the sched until the process gives up the locks
128                  * it is holding.  This avoids dumb lock loops.
129                  * Don't delay if the process is Moribund.
130                  * It called sched to die.
131                  * But do sched eventually.  This avoids a missing unlock
132                  * from hanging the entire kernel. 
133                  * But don't reschedule procs holding palloc or procalloc.
134                  * Those are far too important to be holding while asleep.
135                  *
136                  * This test is not exact.  There can still be a few instructions
137                  * in the middle of taslock when a process holds a lock
138                  * but Lock.p has not yet been initialized.
139                  */
140                 if(up->nlocks)
141                 if(up->state != Moribund)
142                 if(up->delaysched < 20
143                 || palloc.Lock.p == up
144                 || procalloc.Lock.p == up){
145                         up->delaysched++;
146                         delayedscheds++;
147                         return;
148                 }
149                 up->delaysched = 0;
150
151                 splhi();
152
153                 /* statistics */
154                 m->cs++;
155
156                 procsave(up);
157                 if(setlabel(&up->sched)){
158                         procrestore(up);
159                         spllo();
160                         return;
161                 }
162                 gotolabel(&m->sched);
163         }
164         p = runproc();
165         if(p->edf == nil){
166                 updatecpu(p);
167                 p->priority = reprioritize(p);
168         }
169         if(p != m->readied)
170                 m->schedticks = m->ticks + HZ/10;
171         m->readied = nil;
172         up = p;
173         up->state = Running;
174         up->mach = MACHP(m->machno);
175         m->proc = up;
176         //print("poolcheck sched %s\n", p->text);
177         //#include <pool.h>
178         //poolcheck(mainmem);
179         mmuswitch(up);
180         gotolabel(&up->sched);
181 }
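
/*
 * setlabel() saves the current context and returns 0; a later
 * gotolabel() on the same Label resumes execution there, with
 * setlabel() now appearing to return nonzero.  That is how sched()
 * parks the outgoing process in up->sched, jumps to the scheduler
 * context saved in m->sched, and how a process picked by runproc()
 * is resumed at the gotolabel(&up->sched) above.
 */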
182
183 int
184 anyready(void)
185 {
186         return runvec;
187 }
188
189 int
190 anyhigher(void)
191 {
192         return runvec & ~((1<<(up->priority+1))-1);
193 }
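
/*
 * Illustrative example: runvec keeps bit i set while runq[i] is
 * nonempty, so with up->priority == 3 the mask (1<<(3+1))-1 == 0xF
 * covers bits 0-3 and anyhigher() is nonzero precisely when some
 * queue of priority 4 or above holds a ready process, i.e. when a
 * strictly higher-priority process is waiting to run.
 */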
194
195 /*
196  *  here once per clock tick to see if we should resched
197  */
198 void
199 hzsched(void)
200 {
201         /* once a second, rebalance will reprioritize ready procs */
202         if(m->machno == 0)
203                 rebalance();
204
205         /* unless preempted, get to run for at least 100ms */
206         if(anyhigher()
207         || (!up->fixedpri && m->ticks > m->schedticks && anyready())){
208                 m->readied = nil;       /* avoid cooperative scheduling */
209                 up->delaysched++;
210         }
211 }
212
213 /*
214  *  here at the end of non-clock interrupts to see if we should preempt the
215  *  current process.  Returns 1 if preempted, 0 otherwise.
216  */
217 int
218 preempted(void)
219 {
220         if(up != nil && up->state == Running)
221         if(up->preempted == 0)
222         if(anyhigher())
223         if(!active.exiting){
224                 m->readied = nil;       /* avoid cooperative scheduling */
225                 up->preempted = 1;
226                 sched();
227                 splhi();
228                 up->preempted = 0;
229                 return 1;
230         }
231         return 0;
232 }
233
234 /*
235  * Update the cpu time average for this particular process,
236  * which is about to change from up -> not up or vice versa.
237  * p->lastupdate is the last time an updatecpu happened.
238  *
239  * The cpu time average is a decaying average that lasts
240  * about D clock ticks.  D is chosen to be approximately
241  * the cpu time of a cpu-intensive "quick job".  A job has to run
242  * for approximately D clock ticks before we home in on its 
243  * actual cpu usage.  Thus if you manage to get in and get out
244  * quickly, you won't be penalized during your burst.  Once you
245  * start using your share of the cpu for more than about D
246  * clock ticks though, your p->cpu hits 1000 (1.0) and you end up 
247  * below all the other quick jobs.  Interactive tasks, because
248  * they basically always use less than their fair share of cpu,
249  * will be rewarded.
250  *
251  * If the process has not been running, then we want to
252  * apply the filter
253  *
254  *      cpu = cpu * (D-1)/D
255  *
256  * n times, yielding 
257  * 
258  *      cpu = cpu * ((D-1)/D)^n
259  *
260  * but D is big enough that this is approximately 
261  *
262  *      cpu = cpu * (D-n)/D
263  *
264  * so we use that instead.
265  * 
266  * If the process has been running, we apply the filter to
267  * 1 - cpu, yielding a similar equation.  Note that cpu is 
268  * stored in fixed point (* 1000).
269  *
270  * Updatecpu must be called before changing up, in order
271  * to maintain accurate cpu usage statistics.  It can be called
272  * at any time to bring the stats for a given proc up-to-date.
273  */
274 void
275 updatecpu(Proc *p)
276 {
277         int n, t, ocpu;
278         int D = schedgain*HZ*Scaling;
279
280         if(p->edf != nil)
281                 return;
282
283         t = MACHP(0)->ticks*Scaling + Scaling/2;
284         n = t - p->lastupdate;
285         p->lastupdate = t;
286
287         if(n == 0)
288                 return;
289         if(n > D)
290                 n = D;
291
292         ocpu = p->cpu;
293         if(p != up)
294                 p->cpu = (ocpu*(D-n))/D;
295         else{
296                 t = 1000 - ocpu;
297                 t = (t*(D-n))/D;
298                 p->cpu = 1000 - t;
299         }
300
301 //iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
302 }
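
/*
 * Illustrative numbers (HZ is machine dependent; assume HZ == 100 for
 * this sketch): D = schedgain*HZ*Scaling = 30*100*2 = 6000, and time
 * is measured in ticks*Scaling, i.e. half-tick units.  A process that
 * has been off the cpu for n == 600 of those units (3 seconds) decays
 * from cpu == 1000 to 1000*(6000-600)/6000 == 900.  One that has been
 * running for the same stretch moves the other way: with cpu == 500,
 * the gap to 1000 shrinks to (1000-500)*5400/6000 == 450, so cpu
 * becomes 550.
 */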
303
304 /*
305  * On average, p has used p->cpu of a cpu recently.
306  * Its fair share is conf.nmach/m->load of a cpu.  If it has been getting
307  * too much, penalize it.  If it has not been getting enough, reward it.
308  * I don't think you can get much more than your fair share that 
309  * often, so most of the queues are for using less.  Having a priority
310  * of 3 means you're just right.  Having a higher priority (up to p->basepri) 
311  * means you're not using as much as you could.
312  */
313 int
314 reprioritize(Proc *p)
315 {
316         int fairshare, n, load, ratio;
317
318         load = MACHP(0)->load;
319         if(load == 0)
320                 return p->basepri;
321
322         /*
323          * fairshare = 1.000 * conf.nmach * 1.000/load,
324          * except the decimal point is moved three places
325          * on both load and fairshare.
326          */
327         fairshare = (conf.nmach*1000*1000)/load;
328         n = p->cpu;
329         if(n == 0)
330                 n = 1;
331         ratio = (fairshare+n/2) / n;
332         if(ratio > p->basepri)
333                 ratio = p->basepri;
334         if(ratio < 0)
335                 panic("reprioritize");
336 //iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
337         return ratio;
338 }
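
/*
 * A numeric sketch of the calculation above: with one cpu
 * (conf.nmach == 1) and load == 2000 (roughly two runnable processes,
 * fixed point *1000), fairshare == 1000000/2000 == 500.  A process
 * with p->cpu == 50 gets ratio == (500+25)/50 == 10, clamped to
 * p->basepri; one with p->cpu == 250 gets (500+125)/250 == 2; a hog
 * with p->cpu == 1000 gets (500+500)/1000 == 1.  Heavier recent usage
 * therefore lands a process in a lower run queue.
 */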
339
340 /*
341  * add a process to a scheduling queue
342  */
343 void
344 queueproc(Schedq *rq, Proc *p)
345 {
346         int pri;
347
348         pri = rq - runq;
349         lock(runq);
350         p->priority = pri;
351         p->rnext = nil;
352         if(rq->tail != nil)
353                 rq->tail->rnext = p;
354         else
355                 rq->head = p;
356         rq->tail = p;
357         rq->n++;
358         nrdy++;
359         runvec |= 1<<pri;
360         unlock(runq);
361 }
362
363 /*
364  *  try to remove a process from a scheduling queue (called splhi)
365  */
366 Proc*
367 dequeueproc(Schedq *rq, Proc *tp)
368 {
369         Proc *l, *p;
370
371         if(!canlock(runq))
372                 return nil;
373
374         /*
375          *  the queue may have changed before we locked runq,
376          *  refind the target process.
377          */
378         l = nil;
379         for(p = rq->head; p != nil; p = p->rnext){
380                 if(p == tp)
381                         break;
382                 l = p;
383         }
384
385         /*
386          *  p->mach==0 only when process state is saved
387          */
388         if(p == nil || p->mach != nil){
389                 unlock(runq);
390                 return nil;
391         }
392         if(p->rnext == nil)
393                 rq->tail = l;
394         if(l != nil)
395                 l->rnext = p->rnext;
396         else
397                 rq->head = p->rnext;
398         if(rq->head == nil)
399                 runvec &= ~(1<<(rq-runq));
400         rq->n--;
401         nrdy--;
402         if(p->state != Ready)
403                 print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);
404
405         unlock(runq);
406         return p;
407 }
408
409 /*
410  *  ready(p) picks a new priority for a process and sticks it in the
411  *  runq for that priority.
412  */
413 void
414 ready(Proc *p)
415 {
416         int s, pri;
417         Schedq *rq;
418         void (*pt)(Proc*, int, vlong);
419
420         if(p->state == Ready){
421                 print("double ready %s %lud pc %p\n", p->text, p->pid, getcallerpc(&p));
422                 return;
423         }
424
425         s = splhi();
426         if(edfready(p)){
427                 splx(s);
428                 return;
429         }
430
431         if(up != p && (p->wired == nil || p->wired == MACHP(m->machno)))
432                 m->readied = p; /* group scheduling */
433
434         updatecpu(p);
435         pri = reprioritize(p);
436         p->priority = pri;
437         rq = &runq[pri];
438         p->state = Ready;
439         queueproc(rq, p);
440         pt = proctrace;
441         if(pt != nil)
442                 pt(p, SReady, 0);
443         splx(s);
444 }
445
446 /*
447  *  yield the processor and drop our priority
448  */
449 void
450 yield(void)
451 {
452         if(anyready()){
453                 /* pretend we just used 1/2 tick */
454                 up->lastupdate -= Scaling/2;  
455                 sched();
456         }
457 }
458
459 /*
460  *  recalculate priorities once a second.  We need to do this
461  *  since priorities will otherwise only be recalculated when
462  *  the running process blocks.
463  */
464 ulong balancetime;
465
466 static void
467 rebalance(void)
468 {
469         int pri, npri, t, x;
470         Schedq *rq;
471         Proc *p;
472
473         t = m->ticks;
474         if(t - balancetime < HZ)
475                 return;
476         balancetime = t;
477
478         for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
479 another:
480                 p = rq->head;
481                 if(p == nil)
482                         continue;
483                 if(p->mp != MACHP(m->machno))
484                         continue;
485                 if(pri == p->basepri)
486                         continue;
487                 updatecpu(p);
488                 npri = reprioritize(p);
489                 if(npri != pri){
490                         x = splhi();
491                         p = dequeueproc(rq, p);
492                         if(p != nil)
493                                 queueproc(&runq[npri], p);
494                         splx(x);
495                         goto another;
496                 }
497         }
498 }
499         
500
501 /*
502  *  pick a process to run
503  */
504 Proc*
505 runproc(void)
506 {
507         Schedq *rq;
508         Proc *p;
509         ulong start, now;
510         int i;
511         void (*pt)(Proc*, int, vlong);
512
513         start = perfticks();
514
515         /* cooperative scheduling until the clock ticks */
516         if((p = m->readied) != nil && p->mach == nil && p->state == Ready
517         && (p->wired == nil || p->wired == MACHP(m->machno))
518         && runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
519                 skipscheds++;
520                 rq = &runq[p->priority];
521                 goto found;
522         }
523
524         preempts++;
525
526 loop:
527         /*
528          *  find a process that last ran on this processor (affinity),
529          *  or one that hasn't moved in a while (load balancing).  Every
530          *  time around the loop affinity goes down.
531          */
532         spllo();
533         for(i = 0;; i++){
534                 /*
535                  *  find the highest priority target process that this
536                  *  processor can run given affinity constraints.
537                  *
538                  */
539                 for(rq = &runq[Nrq-1]; rq >= runq; rq--){
540                         for(p = rq->head; p != nil; p = p->rnext){
541                                 if(p->mp == nil || p->mp == MACHP(m->machno)
542                                 || (p->wired == nil && i > 0))
543                                         goto found;
544                         }
545                 }
546
547                 /* waste time or halt the CPU */
548                 idlehands();
549
550                 /* remember how much time we spent here */
551                 now = perfticks();
552                 m->perf.inidle += now-start;
553                 start = now;
554         }
555
556 found:
557         splhi();
558         p = dequeueproc(rq, p);
559         if(p == nil)
560                 goto loop;
561
562         p->state = Scheding;
563         p->mp = MACHP(m->machno);
564
565         if(edflock(p)){
566                 edfrun(p, rq == &runq[PriEdf]); /* start deadline timer and do admin */
567                 edfunlock();
568         }
569         pt = proctrace;
570         if(pt != nil)
571                 pt(p, SRun, 0);
572         return p;
573 }
574
575 int
576 canpage(Proc *p)
577 {
578         int ok = 0;
579
580         splhi();
581         lock(runq);
582         /* Only reliable way to see if we are Running */
583         if(p->mach == nil) {
584                 p->newtlb = 1;
585                 ok = 1;
586         }
587         unlock(runq);
588         spllo();
589
590         return ok;
591 }
592
593 Proc*
594 newproc(void)
595 {
596         char msg[64];
597         Proc *p;
598
599         lock(&procalloc);
600         for(;;) {
601                 if((p = procalloc.free) != nil)
602                         break;
603
604                 snprint(msg, sizeof msg, "no procs; %s forking",
605                         up != nil ? up->text: "kernel");
606                 unlock(&procalloc);
607                 resrcwait(msg);
608                 lock(&procalloc);
609         }
610         procalloc.free = p->qnext;
611         unlock(&procalloc);
612
613         p->state = Scheding;
614         p->psstate = "New";
615         p->mach = nil;
616         p->eql = nil;
617         p->qnext = nil;
618         p->nchild = 0;
619         p->nwait = 0;
620         p->waitq = nil;
621         p->parent = nil;
622         p->pgrp = nil;
623         p->egrp = nil;
624         p->fgrp = nil;
625         p->rgrp = nil;
626         p->pdbg = nil;
627         p->fpstate = FPinit;
628         p->kp = 0;
629         p->procctl = 0;
630         p->syscalltrace = nil;  
631         p->notepending = 0;
632         p->ureg = nil;
633         p->privatemem = 0;
634         p->noswap = 0;
635         p->errstr = p->errbuf0;
636         p->syserrstr = p->errbuf1;
637         p->errbuf0[0] = '\0';
638         p->errbuf1[0] = '\0';
639         p->nlocks = 0;
640         p->delaysched = 0;
641         p->trace = 0;
642         kstrdup(&p->user, "*nouser");
643         kstrdup(&p->text, "*notext");
644         kstrdup(&p->args, "");
645         p->nargs = 0;
646         p->setargs = 0;
647         memset(p->seg, 0, sizeof p->seg);
648         p->parentpid = 0;
649         p->noteid = pidalloc(p);
650         if(p->kstack == nil)
651                 p->kstack = smalloc(KSTACK);
652
653         /* sched params */
654         p->mp = nil;
655         p->wired = nil;
656         procpriority(p, PriNormal, 0);
657         p->cpu = 0;
658         p->lastupdate = MACHP(0)->ticks*Scaling;
659         p->edf = nil;
660
661         return p;
662 }
663
664 /*
665  * wire this proc to a machine
666  */
667 void
668 procwired(Proc *p, int bm)
669 {
670         Proc *pp;
671         int i;
672         char nwired[MAXMACH];
673         Mach *wm;
674
675         if(bm < 0){
676                 /* pick a machine to wire to */
677                 memset(nwired, 0, sizeof(nwired));
678                 p->wired = nil;
679                 pp = proctab(0);
680                 for(i=0; i<conf.nproc; i++, pp++){
681                         wm = pp->wired;
682                         if(wm != nil && pp->pid)
683                                 nwired[wm->machno]++;
684                 }
685                 bm = 0;
686                 for(i=0; i<conf.nmach; i++)
687                         if(nwired[i] < nwired[bm])
688                                 bm = i;
689         } else {
690                 /* use the virtual machine requested */
691                 bm = bm % conf.nmach;
692         }
693
694         p->wired = MACHP(bm);
695         p->mp = p->wired;
696 }
697
698 void
699 procpriority(Proc *p, int pri, int fixed)
700 {
701         if(pri >= Npriq)
702                 pri = Npriq - 1;
703         else if(pri < 0)
704                 pri = 0;
705         p->basepri = pri;
706         p->priority = pri;
707         if(fixed){
708                 p->fixedpri = 1;
709         } else {
710                 p->fixedpri = 0;
711         }
712 }
713
714 void
715 procinit0(void)         /* bad planning - clashes with devproc.c */
716 {
717         Proc *p;
718         int i;
719
720         p = xalloc(conf.nproc*sizeof(Proc));
721         if(p == nil){
722                 xsummary();
723                 panic("cannot allocate %lud procs (%ludMB)", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024));
724         }
725         procalloc.arena = p;
726         procalloc.free = p;
727         for(i=0; i<conf.nproc-1; i++, p++)
728                 p->qnext = p+1;
729         p->qnext = nil;
730 }
731
732 /*
733  *  sleep if a condition is not true.  Another process will
734  *  awaken us after it sets the condition.  When we awaken
735  *  the condition may no longer be true.
736  *
737  *  we lock both the process and the rendezvous to keep r->p
738  *  and p->r synchronized.
739  */
740 void
741 sleep(Rendez *r, int (*f)(void*), void *arg)
742 {
743         int s;
744         void (*pt)(Proc*, int, vlong);
745
746         s = splhi();
747
748         if(up->nlocks)
749                 print("process %lud sleeps with %d locks held, last lock %#p locked at pc %#p, sleep called from %#p\n",
750                         up->pid, up->nlocks, up->lastlock, up->lastlock->pc, getcallerpc(&r));
751         lock(r);
752         lock(&up->rlock);
753         if(r->p != nil){
754                 print("double sleep called from %#p, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
755                 dumpstack();
756         }
757
758         /*
759          *  Wakeup only knows there may be something to do by testing
760          *  r->p in order to get something to lock on.
761          *  Flush that information out to memory in case the sleep is
762          *  committed.
763          */
764         r->p = up;
765
766         if((*f)(arg) || up->notepending){
767                 /*
768                  *  if condition happened or a note is pending
769                  *  never mind
770                  */
771                 r->p = nil;
772                 unlock(&up->rlock);
773                 unlock(r);
774         } else {
775                 /*
776                  *  now we are committed to
777                  *  change state and call scheduler
778                  */
779                 pt = proctrace;
780                 if(pt != nil)
781                         pt(up, SSleep, 0);
782                 up->state = Wakeme;
783                 up->r = r;
784
785                 /* statistics */
786                 m->cs++;
787
788                 procsave(up);
789                 if(setlabel(&up->sched)) {
790                         /*
791                          *  here when the process is awakened
792                          */
793                         procrestore(up);
794                         spllo();
795                 } else {
796                         /*
797                          *  here to go to sleep (i.e. stop Running)
798                          */
799                         unlock(&up->rlock);
800                         unlock(r);
801                         gotolabel(&m->sched);
802                 }
803         }
804
805         if(up->notepending) {
806                 up->notepending = 0;
807                 splx(s);
808                 interrupted();
809         }
810
811         splx(s);
812 }
813
814 void
815 interrupted(void)
816 {
817         if(up->procctl == Proc_exitme && up->closingfgrp != nil)
818                 forceclosefgrp();
819         error(Eintr);
820 }
821
822 static int
823 tfn(void *arg)
824 {
825         return up->trend == nil || up->tfn(arg);
826 }
827
828 void
829 twakeup(Ureg*, Timer *t)
830 {
831         Proc *p;
832         Rendez *trend;
833
834         p = t->ta;
835         trend = p->trend;
836         p->trend = nil;
837         if(trend != nil)
838                 wakeup(trend);
839 }
840
841 void
842 tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
843 {
844         if(up->tt != nil){
845                 print("tsleep: timer active: mode %d, tf %#p\n", up->tmode, up->tf);
846                 timerdel(up);
847         }
848         up->tns = MS2NS(ms);
849         up->tf = twakeup;
850         up->tmode = Trelative;
851         up->ta = up;
852         up->trend = r;
853         up->tfn = fn;
854         timeradd(up);
855
856         if(waserror()){
857                 timerdel(up);
858                 nexterror();
859         }
860         sleep(r, tfn, arg);
861         if(up->tt != nil)
862                 timerdel(up);
863         up->twhen = 0;
864         poperror();
865 }
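
/*
 * A common idiom built on tsleep (a sketch, not part of the original
 * file; neverset and kpause are hypothetical names): sleep for a
 * bounded time on the per-process sleep Rendez with a condition that
 * never becomes true, so only the timer set up above ends the sleep.
 */
static int
neverset(void *)
{
        return 0;
}

static void
kpause(ulong ms)
{
        tsleep(&up->sleep, neverset, nil, ms);
}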
866
867 /*
868  *  Expects that only one process can call wakeup for any given Rendez.
869  *  We hold both locks to ensure that r->p and p->r remain consistent.
870  *  Richard Miller has a better solution that doesn't require both to
871  *  be held simultaneously, but I'm a paranoid - presotto.
872  */
873 Proc*
874 wakeup(Rendez *r)
875 {
876         Proc *p;
877         int s;
878
879         s = splhi();
880
881         lock(r);
882         p = r->p;
883
884         if(p != nil){
885                 lock(&p->rlock);
886                 if(p->state != Wakeme || p->r != r){
887                         iprint("%p %p %d\n", p->r, r, p->state);
888                         panic("wakeup: state");
889                 }
890                 r->p = nil;
891                 p->r = nil;
892                 ready(p);
893                 unlock(&p->rlock);
894         }
895         unlock(r);
896
897         splx(s);
898
899         return p;
900 }
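
/*
 * Sketch of the usual pattern around sleep/wakeup (illustrative only;
 * Fakedev, fakedone, fakeintr and fakewait are hypothetical names,
 * not part of this file).  The interrupt side records the event and
 * calls wakeup; the process side sleeps until the condition function
 * sees it.  Because sleep re-evaluates the condition after setting
 * r->p while holding the Rendez lock, a wakeup that races with the
 * call to sleep is not lost.
 */
typedef struct Fakedev Fakedev;
struct Fakedev
{
        Rendez  r;
        int     done;
};

static int
fakedone(void *a)
{
        return ((Fakedev*)a)->done;
}

/* interrupt side */
static void
fakeintr(Fakedev *d)
{
        d->done = 1;
        wakeup(&d->r);
}

/* process side */
static void
fakewait(Fakedev *d)
{
        while(!fakedone(d))
                sleep(&d->r, fakedone, d);
        d->done = 0;
}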
901
902 /*
903  *  if waking a sleeping process, this routine must hold both
904  *  p->rlock and r->lock.  However, it cannot take them in
905  *  the same order as wakeup does, causing a possible lock ordering
906  *  deadlock.  We break the deadlock by giving up p->rlock
907  *  if we can't get r->lock and retrying.
908  */
909 int
910 postnote(Proc *p, int dolock, char *n, int flag)
911 {
912         int s, ret;
913         QLock *q;
914
915         if(p == nil)
916                 return 0;
917
918         if(dolock)
919                 qlock(&p->debug);
920
921         if(p->pid == 0){
922                 if(dolock)
923                         qunlock(&p->debug);
924                 return 0;
925         }
926
927         if(n != nil && flag != NUser && (p->notify == 0 || p->notified))
928                 p->nnote = 0;
929
930         ret = 0;
931         if(p->nnote < NNOTE && n != nil) {
932                 kstrcpy(p->note[p->nnote].msg, n, ERRMAX);
933                 p->note[p->nnote++].flag = flag;
934                 ret = 1;
935         }
936         p->notepending = 1;
937         if(dolock)
938                 qunlock(&p->debug);
939
940         /* this loop is to avoid lock ordering problems. */
941         for(;;){
942                 Rendez *r;
943
944                 s = splhi();
945                 lock(&p->rlock);
946                 r = p->r;
947
948                 /* waiting for a wakeup? */
949                 if(r == nil)
950                         break;  /* no */
951
952                 /* try for the second lock */
953                 if(canlock(r)){
954                         if(p->state != Wakeme || r->p != p)
955                                 panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
956                         p->r = nil;
957                         r->p = nil;
958                         ready(p);
959                         unlock(r);
960                         break;
961                 }
962
963                 /* give other process time to get out of critical section and try again */
964                 unlock(&p->rlock);
965                 splx(s);
966                 sched();
967         }
968         unlock(&p->rlock);
969         splx(s);
970
971         switch(p->state){
972         case Queueing:
973                 /* Try and pull out of an eqlock */
974                 if((q = p->eql) != nil){
975                         lock(&q->use);
976                         if(p->state == Queueing && p->eql == q){
977                                 Proc *d, *l;
978
979                                 for(l = nil, d = q->head; d != nil; l = d, d = d->qnext){
980                                         if(d == p){
981                                                 if(l != nil)
982                                                         l->qnext = p->qnext;
983                                                 else
984                                                         q->head = p->qnext;
985                                                 if(p->qnext == nil)
986                                                         q->tail = l;
987                                                 p->qnext = nil;
988                                                 p->eql = nil;   /* not taken */
989                                                 ready(p);
990                                                 break;
991                                         }
992                                 }
993                         }
994                         unlock(&q->use);
995                 }
996                 break;
997         case Rendezvous:
998                 /* Try and pull out of a rendezvous */
999                 lock(p->rgrp);
1000                 if(p->state == Rendezvous) {
1001                         Proc *d, **l;
1002
1003                         l = &REND(p->rgrp, p->rendtag);
1004                         for(d = *l; d != nil; d = d->rendhash) {
1005                                 if(d == p) {
1006                                         *l = p->rendhash;
1007                                         p->rendval = ~0;
1008                                         ready(p);
1009                                         break;
1010                                 }
1011                                 l = &d->rendhash;
1012                         }
1013                 }
1014                 unlock(p->rgrp);
1015                 break;
1016         }
1017         return ret;
1018 }
1019
1020 /*
1021  * weird thing: keep at most NBROKEN around
1022  */
1023 #define NBROKEN 4
1024 struct
1025 {
1026         QLock;
1027         int     n;
1028         Proc    *p[NBROKEN];
1029 }broken;
1030
1031 void
1032 addbroken(Proc *p)
1033 {
1034         qlock(&broken);
1035         if(broken.n == NBROKEN) {
1036                 ready(broken.p[0]);
1037                 memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
1038                 --broken.n;
1039         }
1040         broken.p[broken.n++] = p;
1041         qunlock(&broken);
1042
1043         edfstop(up);
1044         p->state = Broken;
1045         p->psstate = nil;
1046         sched();
1047 }
1048
1049 void
1050 unbreak(Proc *p)
1051 {
1052         int b;
1053
1054         qlock(&broken);
1055         for(b=0; b < broken.n; b++)
1056                 if(broken.p[b] == p) {
1057                         broken.n--;
1058                         memmove(&broken.p[b], &broken.p[b+1],
1059                                         sizeof(Proc*)*(NBROKEN-(b+1)));
1060                         ready(p);
1061                         break;
1062                 }
1063         qunlock(&broken);
1064 }
1065
1066 int
1067 freebroken(void)
1068 {
1069         int i, n;
1070
1071         qlock(&broken);
1072         n = broken.n;
1073         for(i=0; i<n; i++) {
1074                 ready(broken.p[i]);
1075                 broken.p[i] = nil;
1076         }
1077         broken.n = 0;
1078         qunlock(&broken);
1079         return n;
1080 }
1081
1082 void
1083 pexit(char *exitstr, int freemem)
1084 {
1085         Proc *p;
1086         Segment **s, **es;
1087         long utime, stime;
1088         Waitq *wq;
1089         Fgrp *fgrp;
1090         Egrp *egrp;
1091         Rgrp *rgrp;
1092         Pgrp *pgrp;
1093         Chan *dot;
1094         void (*pt)(Proc*, int, vlong);
1095
1096         up->alarm = 0;
1097         if(up->tt != nil)
1098                 timerdel(up);
1099         pt = proctrace;
1100         if(pt != nil)
1101                 pt(up, SDead, 0);
1102
1103         /* nil out all the resources under lock (free later) */
1104         qlock(&up->debug);
1105         fgrp = up->fgrp;
1106         up->fgrp = nil;
1107         egrp = up->egrp;
1108         up->egrp = nil;
1109         rgrp = up->rgrp;
1110         up->rgrp = nil;
1111         pgrp = up->pgrp;
1112         up->pgrp = nil;
1113         dot = up->dot;
1114         up->dot = nil;
1115         qunlock(&up->debug);
1116
1117         if(fgrp != nil)
1118                 closefgrp(fgrp);
1119         if(egrp != nil)
1120                 closeegrp(egrp);
1121         if(rgrp != nil)
1122                 closergrp(rgrp);
1123         if(dot != nil)
1124                 cclose(dot);
1125         if(pgrp != nil)
1126                 closepgrp(pgrp);
1127
1128         /*
1129          * if not a kernel process and we have a parent,
1130          * do some housekeeping.
1131          */
1132         if(up->kp == 0 && up->parentpid != 0) {
1133                 wq = smalloc(sizeof(Waitq));
1134                 wq->w.pid = up->pid;
1135                 utime = up->time[TUser] + up->time[TCUser];
1136                 stime = up->time[TSys] + up->time[TCSys];
1137                 wq->w.time[TUser] = tk2ms(utime);
1138                 wq->w.time[TSys] = tk2ms(stime);
1139                 wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
1140                 if(exitstr != nil && exitstr[0])
1141                         snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
1142                 else
1143                         wq->w.msg[0] = '\0';
1144
1145                 p = up->parent;
1146                 lock(&p->exl);
1147                 /*
1148                  * Check that parent is still alive.
1149                  */
1150                 if(p->pid == up->parentpid && p->state != Broken) {
1151                         p->nchild--;
1152                         p->time[TCUser] += utime;
1153                         p->time[TCSys] += stime;
1154                         /*
1155                          * If there would be more than 128 wait records
1156                          * pending for my parent, then don't leave a wait
1157                          * record behind.  This helps prevent badly written
1158                          * daemon processes from accumulating lots of wait
1159                          * records.
1160                          */
1161                         if(p->nwait < 128) {
1162                                 wq->next = p->waitq;
1163                                 p->waitq = wq;
1164                                 p->nwait++;
1165                                 wq = nil;
1166                                 wakeup(&p->waitr);
1167                         }
1168                 }
1169                 unlock(&p->exl);
1170                 if(wq != nil)
1171                         free(wq);
1172         }
1173         else if(up->kp == 0 && up->parent == nil){
1174                 if(exitstr == nil)
1175                         exitstr = "unknown";
1176                 panic("boot process died: %s", exitstr);
1177         }
1178
1179         if(!freemem)
1180                 addbroken(up);
1181
1182         qlock(&up->seglock);
1183         es = &up->seg[NSEG];
1184         for(s = up->seg; s < es; s++) {
1185                 if(*s != nil) {
1186                         putseg(*s);
1187                         *s = nil;
1188                 }
1189         }
1190         qunlock(&up->seglock);
1191
1192         lock(&up->exl);         /* Prevent my children from leaving waits */
1193         pidfree(up);
1194         up->pid = 0;
1195         wakeup(&up->waitr);
1196         unlock(&up->exl);
1197
1198         while((wq = up->waitq) != nil){
1199                 up->waitq = wq->next;
1200                 free(wq);
1201         }
1202
1203         /* release debuggers */
1204         qlock(&up->debug);
1205         if(up->pdbg != nil) {
1206                 wakeup(&up->pdbg->sleep);
1207                 up->pdbg = nil;
1208         }
1209         if(up->syscalltrace != nil) {
1210                 free(up->syscalltrace);
1211                 up->syscalltrace = nil;
1212         }
1213         qunlock(&up->debug);
1214
1215         /* Sched must not loop for these locks */
1216         lock(&procalloc);
1217         lock(&palloc);
1218
1219         edfstop(up);
1220         up->state = Moribund;
1221         sched();
1222         panic("pexit");
1223 }
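
/*
 * The Proc itself is freed on the far side of that final sched():
 * schedinit() sees the Moribund state, stops any edf timer, releases
 * the mmu, drops the palloc and procalloc locks acquired above, and
 * chains the Proc back onto procalloc.free.
 */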
1224
1225 static int
1226 haswaitq(void *x)
1227 {
1228         return ((Proc*)x)->waitq != nil;
1229 }
1230
1231 ulong
1232 pwait(Waitmsg *w)
1233 {
1234         ulong cpid;
1235         Waitq *wq;
1236
1237         if(!canqlock(&up->qwaitr))
1238                 error(Einuse);
1239
1240         if(waserror()) {
1241                 qunlock(&up->qwaitr);
1242                 nexterror();
1243         }
1244
1245         lock(&up->exl);
1246         while(up->waitq == nil) {
1247                 if(up->nchild == 0) {
1248                         unlock(&up->exl);
1249                         error(Enochild);
1250                 }
1251                 unlock(&up->exl);
1252                 sleep(&up->waitr, haswaitq, up);
1253                 lock(&up->exl);
1254         }
1255         wq = up->waitq;
1256         up->waitq = wq->next;
1257         up->nwait--;
1258         unlock(&up->exl);
1259
1260         qunlock(&up->qwaitr);
1261         poperror();
1262
1263         if(w != nil)
1264                 memmove(w, &wq->w, sizeof(Waitmsg));
1265         cpid = wq->w.pid;
1266         free(wq);
1267         return cpid;
1268 }
1269
1270 Proc*
1271 proctab(int i)
1272 {
1273         return &procalloc.arena[i];
1274 }
1275
1276 void
1277 dumpaproc(Proc *p)
1278 {
1279         ulong bss;
1280         char *s;
1281
1282         if(p == nil)
1283                 return;
1284
1285         bss = 0;
1286         if(p->seg[BSEG] != nil)
1287                 bss = p->seg[BSEG]->top;
1288
1289         s = p->psstate;
1290         if(s == nil)
1291                 s = statename[p->state];
1292         print("%3lud:%10s pc %#p dbgpc %#p  %8s (%s) ut %ld st %ld bss %lux qpc %#p nl %d nd %lud lpc %#p pri %lud\n",
1293                 p->pid, p->text, p->pc, dbgpc(p),  s, statename[p->state],
1294                 p->time[0], p->time[1], bss, p->qpc, p->nlocks, p->delaysched,
1295                 p->lastlock ? p->lastlock->pc : 0, p->priority);
1296 }
1297
1298 void
1299 procdump(void)
1300 {
1301         int i;
1302         Proc *p;
1303
1304         if(up != nil)
1305                 print("up %lud\n", up->pid);
1306         else
1307                 print("no current process\n");
1308         for(i=0; i<conf.nproc; i++) {
1309                 p = &procalloc.arena[i];
1310                 if(p->state != Dead)
1311                         dumpaproc(p);
1312         }
1313 }
1314
1315 /*
1316  *  wait till all processes have flushed their mmu
1317  *  state about segment s
1318  */
1319 void
1320 procflushseg(Segment *s)
1321 {
1322         int i, ns, nm, nwait;
1323         Proc *p;
1324
1325         /*
1326          *  tell all processes with this
1327          *  segment to flush their mmu's
1328          */
1329         nwait = 0;
1330         for(i=0; i<conf.nproc; i++) {
1331                 p = &procalloc.arena[i];
1332                 if(p->state == Dead)
1333                         continue;
1334                 for(ns = 0; ns < NSEG; ns++)
1335                         if(p->seg[ns] == s){
1336                                 p->newtlb = 1;
1337                                 for(nm = 0; nm < conf.nmach; nm++){
1338                                         if(MACHP(nm)->proc == p){
1339                                                 MACHP(nm)->flushmmu = 1;
1340                                                 nwait++;
1341                                         }
1342                                 }
1343                                 break;
1344                         }
1345         }
1346
1347         if(nwait == 0)
1348                 return;
1349
1350         /*
1351          *  wait for all other processors to take a clock interrupt
1352          *  and flush their mmu's
1353          */
1354         for(nm = 0; nm < conf.nmach; nm++)
1355                 while(m->machno != nm && MACHP(nm)->flushmmu)
1356                         sched();
1357 }
1358
1359 void
1360 scheddump(void)
1361 {
1362         Proc *p;
1363         Schedq *rq;
1364
1365         for(rq = &runq[Nrq-1]; rq >= runq; rq--){
1366                 if(rq->head == nil)
1367                         continue;
1368                 print("rq%ld:", rq-runq);
1369                 for(p = rq->head; p != nil; p = p->rnext)
1370                         print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
1371                 print("\n");
1372                 delay(150);
1373         }
1374         print("nrdy %d\n", nrdy);
1375 }
1376
1377 void
1378 kproc(char *name, void (*func)(void *), void *arg)
1379 {
1380         Proc *p;
1381         static Pgrp *kpgrp;
1382
1383         p = newproc();
1384         p->psstate = nil;
1385         p->procmode = 0640;
1386         p->kp = 1;
1387         p->noswap = 1;
1388
1389         p->scallnr = up->scallnr;
1390         p->s = up->s;
1391         p->nerrlab = 0;
1392         p->slash = up->slash;
1393         p->dot = up->slash;     /* unlike fork, do not inherit the dot for kprocs */
1394         if(p->dot != nil)
1395                 incref(p->dot);
1396
1397         memmove(p->note, up->note, sizeof(p->note));
1398         p->nnote = up->nnote;
1399         p->notified = 0;
1400         p->lastnote = up->lastnote;
1401         p->notify = up->notify;
1402         p->ureg = nil;
1403         p->dbgreg = nil;
1404
1405         procpriority(p, PriKproc, 0);
1406
1407         kprocchild(p, func, arg);
1408
1409         kstrdup(&p->user, eve);
1410         kstrdup(&p->text, name);
1411         if(kpgrp == nil)
1412                 kpgrp = newpgrp();
1413         p->pgrp = kpgrp;
1414         incref(kpgrp);
1415
1416         memset(p->time, 0, sizeof(p->time));
1417         p->time[TReal] = MACHP(0)->ticks;
1418         ready(p);
1419 }
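
/*
 * Sketch of a typical caller (illustrative; examplekproc and
 * startexample are hypothetical names, and kpause is the sketch added
 * after tsleep above): the new process runs examplekproc entirely in
 * kernel mode with no user segments, waking once a second to do its
 * work.
 */
static void
examplekproc(void *)
{
        for(;;){
                /* periodic housekeeping would go here */
                kpause(1000);
        }
}

static void
startexample(void)
{
        kproc("example", examplekproc, nil);
}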
1420
1421 /*
1422  *  called splhi() by notify().  See comment in notify for the
1423  *  reasoning.
1424  */
1425 void
1426 procctl(void)
1427 {
1428         char *state;
1429         ulong s;
1430
1431         switch(up->procctl) {
1432         case Proc_exitbig:
1433                 spllo();
1434                 pprint("Killed: Insufficient physical memory\n");
1435                 pexit("Killed: Insufficient physical memory", 1);
1436
1437         case Proc_exitme:
1438                 spllo();                /* pexit has locks in it */
1439                 pexit("Killed", 1);
1440
1441         case Proc_traceme:
1442                 if(up->nnote == 0)
1443                         return;
1444                 /* No break */
1445
1446         case Proc_stopme:
1447                 up->procctl = 0;
1448                 state = up->psstate;
1449                 up->psstate = "Stopped";
1450                 /* free a waiting debugger */
1451                 s = spllo();
1452                 qlock(&up->debug);
1453                 if(up->pdbg != nil) {
1454                         wakeup(&up->pdbg->sleep);
1455                         up->pdbg = nil;
1456                 }
1457                 qunlock(&up->debug);
1458                 splhi();
1459                 up->state = Stopped;
1460                 sched();
1461                 up->psstate = state;
1462                 splx(s);
1463                 return;
1464         }
1465 }
1466
1467 #include "errstr.h"
1468
1469 void
1470 error(char *err)
1471 {
1472         spllo();
1473
1474         assert(up->nerrlab < NERR);
1475         kstrcpy(up->errstr, err, ERRMAX);
1476         setlabel(&up->errlab[NERR-1]);
1477         nexterror();
1478 }
1479
1480 void
1481 nexterror(void)
1482 {
1483         assert(up->nerrlab > 0);
1484         gotolabel(&up->errlab[--up->nerrlab]);
1485 }
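
/*
 * These pair with the waserror()/poperror() macros used throughout
 * this file: waserror() pushes an error label onto up->errlab and
 * returns 0, error() records the error string and unwinds to the most
 * recent label via nexterror(), which makes that waserror() appear to
 * return nonzero, and poperror() discards the label once the
 * protected code has completed normally.
 */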
1486
1487 void
1488 exhausted(char *resource)
1489 {
1490         char buf[ERRMAX];
1491
1492         snprint(buf, sizeof buf, "no free %s", resource);
1493         iprint("%s\n", buf);
1494         error(buf);
1495 }
1496
1497 ulong
1498 procpagecount(Proc *p)
1499 {
1500         Segment *s;
1501         ulong pages;
1502         int i;
1503
1504         eqlock(&p->seglock);
1505         if(waserror()){
1506                 qunlock(&p->seglock);
1507                 nexterror();
1508         }
1509         pages = 0;
1510         for(i=0; i<NSEG; i++){
1511                 if((s = p->seg[i]) != nil){
1512                         eqlock(s);
1513                         pages += mcountseg(s);
1514                         qunlock(s);
1515                 }
1516         }
1517         qunlock(&p->seglock);
1518         poperror();
1519
1520         return pages;
1521 }
1522
1523 void
1524 killbig(char *why)
1525 {
1526         int i;
1527         Segment *s;
1528         ulong l, max;
1529         Proc *p, *ep, *kp;
1530
1531         max = 0;
1532         kp = nil;
1533         ep = procalloc.arena+conf.nproc;
1534         for(p = procalloc.arena; p < ep; p++) {
1535                 if(p->state == Dead || p->kp)
1536                         continue;
1537                 if((p->noswap || (p->procmode & 0222) == 0) && strcmp(eve, p->user) == 0)
1538                         continue;
1539                 l = procpagecount(p);
1540                 if(l > max){
1541                         kp = p;
1542                         max = l;
1543                 }
1544         }
1545         if(kp == nil)
1546                 return;
1547         print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
1548         qlock(&kp->seglock);
1549         for(p = procalloc.arena; p < ep; p++) {
1550                 if(p->state == Dead || p->kp)
1551                         continue;
1552                 if(p != kp && p->seg[BSEG] != nil && p->seg[BSEG] == kp->seg[BSEG])
1553                         p->procctl = Proc_exitbig;
1554         }
1555         kp->procctl = Proc_exitbig;
1556         for(i = 0; i < NSEG; i++) {
1557                 s = kp->seg[i];
1558                 if(s == nil)
1559                         continue;
1560                 switch(s->type & SG_TYPE){
1561                 case SG_SHARED:
1562                 case SG_PHYSICAL:
1563                 case SG_FIXED:
1564                         continue;
1565                 }
1566                 qlock(s);
1567                 mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
1568                 qunlock(s);
1569         }
1570         qunlock(&kp->seglock);
1571 }
1572
1573 /*
1574  *  change ownership to 'new' of all processes owned by 'old'.  Used when
1575  *  eve changes.
1576  */
1577 void
1578 renameuser(char *old, char *new)
1579 {
1580         Proc *p, *ep;
1581
1582         ep = procalloc.arena+conf.nproc;
1583         for(p = procalloc.arena; p < ep; p++)
1584                 if(p->user != nil && strcmp(old, p->user) == 0)
1585                         kstrdup(&p->user, new);
1586 }
1587
1588 /*
1589  *  time accounting called by clock() splhi'd
1590  */
1591 void
1592 accounttime(void)
1593 {
1594         Proc *p;
1595         ulong n, per;
1596         static ulong nrun;
1597
1598         p = m->proc;
1599         if(p != nil) {
1600                 nrun++;
1601                 p->time[p->insyscall]++;
1602         }
1603
1604         /* calculate decaying duty cycles */
1605         n = perfticks();
1606         per = n - m->perf.last;
1607         m->perf.last = n;
1608         per = ((uvlong)m->perf.period*(HZ-1) + per)/HZ;
1609         if(per != 0)
1610                 m->perf.period = per;
1611
1612         m->perf.avg_inidle = ((uvlong)m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
1613         m->perf.inidle = 0;
1614
1615         m->perf.avg_inintr = ((uvlong)m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
1616         m->perf.inintr = 0;
1617
1618         /* only one processor gets to compute system load averages */
1619         if(m->machno != 0)
1620                 return;
1621
1622         /*
1623          * calculate decaying load average.
1624          * if we decay by (n-1)/n then it takes
1625          * n clock ticks to go from load L to .36 L once
1626          * things quiet down.  it takes about 5 n clock
1627          * ticks to go to zero.  so using HZ means this is
1628          * approximately the load over the last second,
1629          * with a tail lasting about 5 seconds.
1630          */
1631         n = nrun;
1632         nrun = 0;
1633         n = (nrdy+n)*1000*100;
1634         load = ((uvlong)load*(HZ-1)+n)/HZ;
1635         m->load = load/100;
1636 }
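
/*
 * Illustrative numbers (assuming HZ == 100): with exactly one process
 * running or ready at every tick, n == (nrdy+nrun)*1000*100 == 100000
 * each time, so load converges to 100000 and m->load to 1000 - a load
 * of 1.000 in the same fixed point reprioritize uses.  Once the
 * machine goes idle, load decays by (HZ-1)/HZ per tick, dropping to
 * roughly .36 of its value after one second, as described above.
 */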
1637
1638 int
1639 pidalloc(Proc *p)
1640 {
1641         static int gen, wrapped;
1642         int pid, h;
1643         Proc *x;
1644
1645         lock(&procalloc);
1646 Retry:
1647         pid = ++gen & 0x7FFFFFFF;
1648         if(pid == 0){
1649                 wrapped = 1;
1650                 goto Retry;
1651         }
1652         h = pid % nelem(procalloc.ht);
1653         if(wrapped)
1654                 for(x = procalloc.ht[h]; x != nil; x = x->pidhash)
1655                         if(x->pid == pid)
1656                                 goto Retry;
1657         if(p != nil){
1658                 p->pid = pid;
1659                 p->pidhash = procalloc.ht[h];
1660                 procalloc.ht[h] = p;
1661         }
1662         unlock(&procalloc);
1663         return pid;
1664 }
1665
1666 static void
1667 pidfree(Proc *p)
1668 {
1669         int h;
1670         Proc **l;
1671
1672         h = p->pid % nelem(procalloc.ht);
1673         lock(&procalloc);
1674         for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
1675                 if(*l == p){
1676                         *l = p->pidhash;
1677                         break;
1678                 }
1679         unlock(&procalloc);
1680 }
1681
1682 int
1683 procindex(ulong pid)
1684 {
1685         Proc *p;
1686         int h;
1687         int s;
1688
1689         s = -1;
1690         h = pid % nelem(procalloc.ht);
1691         lock(&procalloc);
1692         for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
1693                 if(p->pid == pid){
1694                         s = p - procalloc.arena;
1695                         break;
1696                 }
1697         unlock(&procalloc);
1698         return s;
1699 }