#include	<u.h>
#include	"../port/lib.h"
#include	"mem.h"
#include	"dat.h"
#include	"fns.h"
#include	"../port/error.h"
#include	"edf.h"
#include	<trace.h>

int	schedgain = 30;	/* units in seconds */
int	nrdy;
Ref	noteidalloc;

void updatecpu(Proc*);
int reprioritize(Proc*);

ulong	delayedscheds;	/* statistics */
long skipscheds;
long preempts;
ulong	load;

static Ref	pidalloc;

static struct Procalloc
{
	Lock;
	Proc*	ht[128];
	Proc*	arena;
	Proc*	free;
} procalloc;

enum
{
	Q=10,
	DQ=4,
	Scaling=2,
};
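
/*
 * Scaling is the fixed-point multiplier applied to MACHP(0)->ticks in
 * updatecpu() and newproc() below; yield() charges half a tick by
 * moving lastupdate back by Scaling/2.  Q and DQ appear to be unused
 * in this file, probably left over from an earlier scheduler.
 */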

Schedq	runq[Nrq];
ulong	runvec;

char *statename[] =
{	/* BUG: generate automatically */
	"Dead",
	"Moribund",
	"Ready",
	"Scheding",
	"Running",
	"Queueing",
	"QueueingR",
	"QueueingW",
	"Wakeme",
	"Broken",
	"Stopped",
	"Rendez",
	"Waitrelease",
};

static void pidhash(Proc*);
static void pidunhash(Proc*);
static void rebalance(void);

/*
 * Always splhi()'ed.
 */
void
schedinit(void)		/* never returns */
{
	Edf *e;

	setlabel(&m->sched);
	if(up) {
		if((e = up->edf) && (e->flags & Admitted))
			edfrecord(up);
		m->proc = 0;
		switch(up->state) {
		case Running:
			ready(up);
			break;
		case Moribund:
			up->state = Dead;
			edfstop(up);
			if (up->edf)
				free(up->edf);
			up->edf = nil;

			/*
			 * Holding locks from pexit:
			 *	procalloc
			 *	palloc
			 */
			mmurelease(up);

			up->qnext = procalloc.free;
			procalloc.free = up;

			unlock(&palloc);
			unlock(&procalloc);
			break;
		}
		up->mach = nil;
		updatecpu(up);
		up = nil;
	}
	sched();
}

/*
 *  If changing this routine, look also at sleep().  It
 *  contains a copy of the guts of sched().
 */
void
sched(void)
{
	Proc *p;

	if(m->ilockdepth)
		panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
			m->machno,
			m->ilockdepth,
			up? up->lastilock: nil,
			(up && up->lastilock)? up->lastilock->pc: 0,
			getcallerpc(&p+2));
	if(up){
		/*
		 * Delay the sched until the process gives up the locks
		 * it is holding.  This avoids dumb lock loops.
		 * Don't delay if the process is Moribund.
		 * It called sched to die.
		 * But do sched eventually.  This avoids a missing unlock
		 * from hanging the entire kernel.
		 * But don't reschedule procs holding palloc or procalloc.
		 * Those are far too important to be holding while asleep.
		 *
		 * This test is not exact.  There can still be a few instructions
		 * in the middle of taslock when a process holds a lock
		 * but Lock.p has not yet been initialized.
		 */
		if(up->nlocks.ref)
		if(up->state != Moribund)
		if(up->delaysched < 20
		|| palloc.Lock.p == up
		|| procalloc.Lock.p == up){
			up->delaysched++;
			delayedscheds++;
			return;
		}
		up->delaysched = 0;

		splhi();

		/* statistics */
		m->cs++;

		procsave(up);
		if(setlabel(&up->sched)){
			procrestore(up);
			spllo();
			return;
		}
		gotolabel(&m->sched);
	}
	p = runproc();
	if(!p->edf){
		updatecpu(p);
		p->priority = reprioritize(p);
	}
	if(p != m->readied)
		m->schedticks = m->ticks + HZ/10;
	m->readied = 0;
	up = p;
	up->state = Running;
	up->mach = MACHP(m->machno);
	m->proc = up;
	mmuswitch(up);
	gotolabel(&up->sched);
}

int
anyready(void)
{
	return runvec;
}

int
anyhigher(void)
{
	return runvec & ~((1<<(up->priority+1))-1);
}
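
/*
 * runvec has bit n set whenever runq[n] is non-empty (see queueproc
 * and dequeueproc below), so anyhigher() masks off bits 0..up->priority
 * and returns non-zero only if some ready process has a strictly
 * higher priority than up.  E.g. with up->priority == 3,
 * ~((1<<4)-1) clears bits 0-3 and keeps bits 4 and above.
 */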

/*
 *  here once per clock tick to see if we should resched
 */
void
hzsched(void)
{
	/* once a second, rebalance will reprioritize ready procs */
	if(m->machno == 0)
		rebalance();

	/* unless preempted, get to run for at least 100ms */
	if(anyhigher()
	|| (!up->fixedpri && m->ticks > m->schedticks && anyready())){
		m->readied = nil;	/* avoid cooperative scheduling */
		up->delaysched++;
	}
}

/*
 *  here at the end of non-clock interrupts to see if we should preempt the
 *  current process.  Returns 1 if preempted, 0 otherwise.
 */
int
preempted(void)
{
	if(up && up->state == Running)
	if(up->preempted == 0)
	if(anyhigher())
	if(!active.exiting){
		m->readied = nil;	/* avoid cooperative scheduling */
		up->preempted = 1;
		sched();
		splhi();
		up->preempted = 0;
		return 1;
	}
	return 0;
}

/*
 * Update the cpu time average for this particular process,
 * which is about to change from up -> not up or vice versa.
 * p->lastupdate is the last time an updatecpu happened.
 *
 * The cpu time average is a decaying average that lasts
 * about D clock ticks.  D is chosen to be approximately
 * the cpu time of a cpu-intensive "quick job".  A job has to run
 * for approximately D clock ticks before we home in on its
 * actual cpu usage.  Thus if you manage to get in and get out
 * quickly, you won't be penalized during your burst.  Once you
 * start using your share of the cpu for more than about D
 * clock ticks though, your p->cpu hits 1000 (1.0) and you end up
 * below all the other quick jobs.  Interactive tasks, because
 * they basically always use less than their fair share of cpu,
 * will be rewarded.
 *
 * If the process has not been running, then we want to
 * apply the filter
 *
 *	cpu = cpu * (D-1)/D
 *
 * n times, yielding
 *
 *	cpu = cpu * ((D-1)/D)^n
 *
 * but D is big enough that this is approximately
 *
 *	cpu = cpu * (D-n)/D
 *
 * so we use that instead.
 *
 * If the process has been running, we apply the filter to
 * 1 - cpu, yielding a similar equation.  Note that cpu is
 * stored in fixed point (* 1000).
 *
 * Updatecpu must be called before changing up, in order
 * to maintain accurate cpu usage statistics.  It can be called
 * at any time to bring the stats for a given proc up-to-date.
 */
void
updatecpu(Proc *p)
{
	int n, t, ocpu;
	int D = schedgain*HZ*Scaling;

	if(p->edf)
		return;

	t = MACHP(0)->ticks*Scaling + Scaling/2;
	n = t - p->lastupdate;
	p->lastupdate = t;

	if(n == 0)
		return;
	if(n > D)
		n = D;

	ocpu = p->cpu;
	if(p != up)
		p->cpu = (ocpu*(D-n))/D;
	else{
		t = 1000 - ocpu;
		t = (t*(D-n))/D;
		p->cpu = 1000 - t;
	}

//iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
}
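
/*
 * A rough worked example (HZ is architecture dependent; assume HZ = 100):
 * D = schedgain*HZ*Scaling = 30*100*2 = 6000.  For a process that has
 * been idle for n = 600 of updatecpu's scaled units (300 clock ticks,
 * i.e. 3 seconds), the factor (D-n)/D = 5400/6000 = 0.9, so a p->cpu
 * of 500 decays to 450.  A process that was running for the same
 * stretch has 1000-cpu scaled instead, so 500 climbs to 1000-450 = 550.
 */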

/*
 * On average, p has used p->cpu of a cpu recently.
 * Its fair share is conf.nmach/m->load of a cpu.  If it has been getting
 * too much, penalize it.  If it has been getting not enough, reward it.
 * I don't think you can get much more than your fair share that
 * often, so most of the queues are for using less.  Having a priority
 * of 3 means you're just right.  Having a higher priority (up to p->basepri)
 * means you're not using as much as you could.
 */
int
reprioritize(Proc *p)
{
	int fairshare, n, load, ratio;

	load = MACHP(0)->load;
	if(load == 0)
		return p->basepri;

	/*
	 *  fairshare = 1.000 * conf.nmach * 1.000/load,
	 * except the decimal point is moved three places
	 * on both load and fairshare.
	 */
	fairshare = (conf.nmach*1000*1000)/load;
	n = p->cpu;
	if(n == 0)
		n = 1;
	ratio = (fairshare+n/2) / n;
	if(ratio > p->basepri)
		ratio = p->basepri;
	if(ratio < 0)
		panic("reprioritize");
//iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
	return ratio;
}
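
/*
 * Illustrative numbers only: on a 2-cpu machine with load = 3000
 * (about three runnable processes on average), fairshare =
 * 2*1000*1000/3000 = 666.  A process with p->cpu = 666 gets
 * ratio = (666+333)/666 = 1, near the bottom of the queues; one with
 * p->cpu = 100 gets (666+50)/100 = 7, capped at p->basepri if that
 * is lower.
 */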

/*
 * add a process to a scheduling queue
 */
void
queueproc(Schedq *rq, Proc *p)
{
	int pri;

	pri = rq - runq;
	lock(runq);
	p->priority = pri;
	p->rnext = 0;
	if(rq->tail)
		rq->tail->rnext = p;
	else
		rq->head = p;
	rq->tail = p;
	rq->n++;
	nrdy++;
	runvec |= 1<<pri;
	unlock(runq);
}

/*
 *  try to remove a process from a scheduling queue (called splhi)
 */
Proc*
dequeueproc(Schedq *rq, Proc *tp)
{
	Proc *l, *p;

	if(!canlock(runq))
		return nil;

	/*
	 *  the queue may have changed before we locked runq,
	 *  refind the target process.
	 */
	l = 0;
	for(p = rq->head; p; p = p->rnext){
		if(p == tp)
			break;
		l = p;
	}

	/*
	 *  p->mach==0 only when process state is saved
	 */
	if(p == 0 || p->mach){
		unlock(runq);
		return nil;
	}
	if(p->rnext == 0)
		rq->tail = l;
	if(l)
		l->rnext = p->rnext;
	else
		rq->head = p->rnext;
	if(rq->head == nil)
		runvec &= ~(1<<(rq-runq));
	rq->n--;
	nrdy--;
	if(p->state != Ready)
		print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);

	unlock(runq);
	return p;
}

/*
 *  ready(p) picks a new priority for a process and sticks it in the
 *  runq for that priority.
 */
void
ready(Proc *p)
{
	int s, pri;
	Schedq *rq;
	void (*pt)(Proc*, int, vlong);

	s = splhi();
	if(edfready(p)){
		splx(s);
		return;
	}

	if(up != p)
		m->readied = p;	/* group scheduling */

	updatecpu(p);
	pri = reprioritize(p);
	p->priority = pri;
	rq = &runq[pri];
	p->state = Ready;
	queueproc(rq, p);
	pt = proctrace;
	if(pt)
		pt(p, SReady, 0);
	splx(s);
}

/*
 *  yield the processor and drop our priority
 */
void
yield(void)
{
	if(anyready()){
		/* pretend we just used 1/2 tick */
		up->lastupdate -= Scaling/2;
		sched();
	}
}
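
/*
 * Moving lastupdate back by Scaling/2 makes the next updatecpu() see an
 * extra half tick of elapsed time charged to this still-running process,
 * so its p->cpu creeps up and reprioritize() drops it slightly below
 * peers that did not yield.
 */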

/*
 *  recalculate priorities once a second.  We need to do this
 *  since priorities will otherwise only be recalculated when
 *  the running process blocks.
 */
ulong balancetime;

static void
rebalance(void)
{
	int pri, npri, t, x;
	Schedq *rq;
	Proc *p;

	t = m->ticks;
	if(t - balancetime < HZ)
		return;
	balancetime = t;

	for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
another:
		p = rq->head;
		if(p == nil)
			continue;
		if(p->mp != MACHP(m->machno))
			continue;
		if(pri == p->basepri)
			continue;
		updatecpu(p);
		npri = reprioritize(p);
		if(npri != pri){
			x = splhi();
			p = dequeueproc(rq, p);
			if(p)
				queueproc(&runq[npri], p);
			splx(x);
			goto another;
		}
	}
}

/*
 *  pick a process to run
 */
Proc*
runproc(void)
{
	Schedq *rq;
	Proc *p;
	ulong start, now;
	int i;
	void (*pt)(Proc*, int, vlong);

	start = perfticks();

	/* cooperative scheduling until the clock ticks */
	if((p=m->readied) && p->mach==0 && p->state==Ready
	&& runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
		skipscheds++;
		rq = &runq[p->priority];
		goto found;
	}

	preempts++;

loop:
	/*
	 *  find a process that last ran on this processor (affinity),
	 *  or one that hasn't moved in a while (load balancing).  Every
	 *  time around the loop affinity goes down.
	 */
	spllo();
	for(i = 0;; i++){
		/*
		 *  find the highest priority target process that this
		 *  processor can run given affinity constraints.
		 *
		 */
		for(rq = &runq[Nrq-1]; rq >= runq; rq--){
			for(p = rq->head; p; p = p->rnext){
				if(p->mp == nil || p->mp == MACHP(m->machno)
				|| (!p->wired && i > 0))
					goto found;
			}
		}

		/* waste time or halt the CPU */
		idlehands();

		/* remember how much time we spent here */
		now = perfticks();
		m->perf.inidle += now-start;
		start = now;
	}

found:
	splhi();
	p = dequeueproc(rq, p);
	if(p == nil)
		goto loop;

	p->state = Scheding;
	p->mp = MACHP(m->machno);

	if(edflock(p)){
		edfrun(p, rq == &runq[PriEdf]);	/* start deadline timer and do admin */
		edfunlock();
	}
	pt = proctrace;
	if(pt)
		pt(p, SRun, 0);
	return p;
}

int
canpage(Proc *p)
{
	int ok = 0;

	splhi();
	lock(runq);
	/* Only reliable way to see if we are Running */
	if(p->mach == 0) {
		p->newtlb = 1;
		ok = 1;
	}
	unlock(runq);
	spllo();

	return ok;
}

Proc*
newproc(void)
{
	char msg[64];
	Proc *p;

	lock(&procalloc);
	for(;;) {
		if(p = procalloc.free)
			break;

		snprint(msg, sizeof msg, "no procs; %s forking",
			up? up->text: "kernel");
		unlock(&procalloc);
		resrcwait(msg);
		lock(&procalloc);
	}
	procalloc.free = p->qnext;
	unlock(&procalloc);

	p->state = Scheding;
	p->psstate = "New";
	p->mach = 0;
	p->qnext = 0;
	p->nchild = 0;
	p->nwait = 0;
	p->waitq = 0;
	p->parent = 0;
	p->pgrp = 0;
	p->egrp = 0;
	p->fgrp = 0;
	p->rgrp = 0;
	p->pdbg = 0;
	p->fpstate = FPinit;
	p->kp = 0;
	if(up && up->procctl == Proc_tracesyscall)
		p->procctl = Proc_tracesyscall;
	else
		p->procctl = 0;
	p->syscalltrace = 0;
	p->notepending = 0;
	p->ureg = 0;
	p->privatemem = 0;
	p->noswap = 0;
	p->errstr = p->errbuf0;
	p->syserrstr = p->errbuf1;
	p->errbuf0[0] = '\0';
	p->errbuf1[0] = '\0';
	p->nlocks.ref = 0;
	p->delaysched = 0;
	p->trace = 0;
	kstrdup(&p->user, "*nouser");
	kstrdup(&p->text, "*notext");
	kstrdup(&p->args, "");
	p->nargs = 0;
	p->setargs = 0;
	memset(p->seg, 0, sizeof p->seg);
	p->pid = incref(&pidalloc);
	pidhash(p);
	p->noteid = incref(&noteidalloc);
	if(p->pid==0 || p->noteid==0)
		panic("pidalloc");
	if(p->kstack == 0)
		p->kstack = smalloc(KSTACK);

	/* sched params */
	p->mp = 0;
	p->wired = 0;
	procpriority(p, PriNormal, 0);
	p->cpu = 0;
	p->lastupdate = MACHP(0)->ticks*Scaling;
	p->edf = nil;

	return p;
}

/*
 * wire this proc to a machine
 */
void
procwired(Proc *p, int bm)
{
	Proc *pp;
	int i;
	char nwired[MAXMACH];
	Mach *wm;

	if(bm < 0){
		/* pick a machine to wire to */
		memset(nwired, 0, sizeof(nwired));
		p->wired = 0;
		pp = proctab(0);
		for(i=0; i<conf.nproc; i++, pp++){
			wm = pp->wired;
			if(wm && pp->pid)
				nwired[wm->machno]++;
		}
		bm = 0;
		for(i=0; i<conf.nmach; i++)
			if(nwired[i] < nwired[bm])
				bm = i;
	} else {
		/* use the virtual machine requested */
		bm = bm % conf.nmach;
	}

	p->wired = MACHP(bm);
	p->mp = p->wired;
}

void
procpriority(Proc *p, int pri, int fixed)
{
	if(pri >= Npriq)
		pri = Npriq - 1;
	else if(pri < 0)
		pri = 0;
	p->basepri = pri;
	p->priority = pri;
	if(fixed){
		p->fixedpri = 1;
	} else {
		p->fixedpri = 0;
	}
}

void
procinit0(void)		/* bad planning - clashes with devproc.c */
{
	Proc *p;
	int i;

	procalloc.free = xalloc(conf.nproc*sizeof(Proc));
	if(procalloc.free == nil){
		xsummary();
		panic("cannot allocate %lud procs (%ludMB)\n", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024));
	}
	procalloc.arena = procalloc.free;

	p = procalloc.free;
	for(i=0; i<conf.nproc-1; i++,p++)
		p->qnext = p+1;
	p->qnext = 0;
}

/*
 *  sleep if a condition is not true.  Another process will
 *  awaken us after it sets the condition.  When we awaken
 *  the condition may no longer be true.
 *
 *  we lock both the process and the rendezvous to keep r->p
 *  and p->r synchronized.
 */
void
sleep(Rendez *r, int (*f)(void*), void *arg)
{
	int s;
	void (*pt)(Proc*, int, vlong);

	s = splhi();

	if(up->nlocks.ref)
		print("process %lud sleeps with %lud locks held, last lock %#p locked at pc %#lux, sleep called from %#p\n",
			up->pid, up->nlocks.ref, up->lastlock, up->lastlock->pc, getcallerpc(&r));
	lock(r);
	lock(&up->rlock);
	if(r->p){
		print("double sleep called from %#p, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
		dumpstack();
	}

	/*
	 *  Wakeup only knows there may be something to do by testing
	 *  r->p in order to get something to lock on.
	 *  Flush that information out to memory in case the sleep is
	 *  committed.
	 */
	r->p = up;

	if((*f)(arg) || up->notepending){
		/*
		 *  if condition happened or a note is pending
		 *  never mind
		 */
		r->p = nil;
		unlock(&up->rlock);
		unlock(r);
	} else {
		/*
		 *  now we are committed to
		 *  change state and call scheduler
		 */
		pt = proctrace;
		if(pt)
			pt(up, SSleep, 0);
		up->state = Wakeme;
		up->r = r;

		/* statistics */
		m->cs++;

		procsave(up);
		if(setlabel(&up->sched)) {
			/*
			 *  here when the process is awakened
			 */
			procrestore(up);
			spllo();
		} else {
			/*
			 *  here to go to sleep (i.e. stop Running)
			 */
			unlock(&up->rlock);
			unlock(r);
			gotolabel(&m->sched);
		}
	}

	if(up->notepending) {
		up->notepending = 0;
		splx(s);
		if(up->procctl == Proc_exitme && up->closingfgrp)
			forceclosefgrp();
		error(Eintr);
	}

	splx(s);
}

static int
tfn(void *arg)
{
	return up->trend == nil || up->tfn(arg);
}

void
twakeup(Ureg*, Timer *t)
{
	Proc *p;
	Rendez *trend;

	p = t->ta;
	trend = p->trend;
	p->trend = 0;
	if(trend)
		wakeup(trend);
}

void
tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
{
	if (up->tt){
		print("tsleep: timer active: mode %d, tf %#p\n", up->tmode, up->tf);
		timerdel(up);
	}
	up->tns = MS2NS(ms);
	up->tf = twakeup;
	up->tmode = Trelative;
	up->ta = up;
	up->trend = r;
	up->tfn = fn;
	timeradd(up);

	if(waserror()){
		timerdel(up);
		nexterror();
	}
	sleep(r, tfn, arg);
	if (up->tt)
		timerdel(up);
	up->twhen = 0;
	poperror();
}

/*
 *  Expects that only one process can call wakeup for any given Rendez.
 *  We hold both locks to ensure that r->p and p->r remain consistent.
 *  Richard Miller has a better solution that doesn't require both to
 *  be held simultaneously, but I'm a paranoid - presotto.
 */
Proc*
wakeup(Rendez *r)
{
	Proc *p;
	int s;

	s = splhi();

	lock(r);
	p = r->p;

	if(p != nil){
		lock(&p->rlock);
		if(p->state != Wakeme || p->r != r){
			iprint("%p %p %d\n", p->r, r, p->state);
			panic("wakeup: state");
		}
		r->p = nil;
		p->r = nil;
		ready(p);
		unlock(&p->rlock);
	}
	unlock(r);

	splx(s);

	return p;
}

/*
 *  if waking a sleeping process, this routine must hold both
 *  p->rlock and r->lock.  However, it can't acquire them in
 *  the same order as wakeup does, causing a possible lock ordering
 *  deadlock.  We break the deadlock by giving up the p->rlock
 *  lock if we can't get the r->lock and retrying.
 */
int
postnote(Proc *p, int dolock, char *n, int flag)
{
	int s, ret;
	Rendez *r;
	Proc *d, **l;

	if(dolock)
		qlock(&p->debug);

	if(flag != NUser && (p->notify == 0 || p->notified))
		p->nnote = 0;

	ret = 0;
	if(p->nnote < NNOTE) {
		strcpy(p->note[p->nnote].msg, n);
		p->note[p->nnote++].flag = flag;
		ret = 1;
	}
	p->notepending = 1;
	if(dolock)
		qunlock(&p->debug);

	/* this loop is to avoid lock ordering problems. */
	for(;;){
		s = splhi();
		lock(&p->rlock);
		r = p->r;

		/* waiting for a wakeup? */
		if(r == nil)
			break;	/* no */

		/* try for the second lock */
		if(canlock(r)){
			if(p->state != Wakeme || r->p != p)
				panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
			p->r = nil;
			r->p = nil;
			ready(p);
			unlock(r);
			break;
		}

		/* give other process time to get out of critical section and try again */
		unlock(&p->rlock);
		splx(s);
		sched();
	}
	unlock(&p->rlock);
	splx(s);

	if(p->state != Rendezvous)
		return ret;

	/* Try and pull out of a rendezvous */
	lock(p->rgrp);
	if(p->state == Rendezvous) {
		p->rendval = ~0;
		l = &REND(p->rgrp, p->rendtag);
		for(d = *l; d; d = d->rendhash) {
			if(d == p) {
				*l = p->rendhash;
				break;
			}
			l = &d->rendhash;
		}
		ready(p);
	}
	unlock(p->rgrp);
	return ret;
}

/*
 * weird thing: keep at most NBROKEN around
 */
#define NBROKEN 4
struct
{
	QLock;
	int	n;
	Proc	*p[NBROKEN];
}broken;

void
addbroken(Proc *p)
{
	qlock(&broken);
	if(broken.n == NBROKEN) {
		ready(broken.p[0]);
		memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
		--broken.n;
	}
	broken.p[broken.n++] = p;
	qunlock(&broken);

	edfstop(up);
	p->state = Broken;
	p->psstate = 0;
	sched();
}

void
unbreak(Proc *p)
{
	int b;

	qlock(&broken);
	for(b=0; b < broken.n; b++)
		if(broken.p[b] == p) {
			broken.n--;
			memmove(&broken.p[b], &broken.p[b+1],
					sizeof(Proc*)*(NBROKEN-(b+1)));
			ready(p);
			break;
		}
	qunlock(&broken);
}

int
freebroken(void)
{
	int i, n;

	qlock(&broken);
	n = broken.n;
	for(i=0; i<n; i++) {
		ready(broken.p[i]);
		broken.p[i] = 0;
	}
	broken.n = 0;
	qunlock(&broken);
	return n;
}

void
pexit(char *exitstr, int freemem)
{
	Proc *p;
	Segment **s, **es;
	long utime, stime;
	Waitq *wq, *f, *next;
	Fgrp *fgrp;
	Egrp *egrp;
	Rgrp *rgrp;
	Pgrp *pgrp;
	Chan *dot;
	void (*pt)(Proc*, int, vlong);

	if(up->syscalltrace)
		free(up->syscalltrace);
	up->alarm = 0;
	if (up->tt)
		timerdel(up);
	pt = proctrace;
	if(pt)
		pt(up, SDead, 0);

	/* nil out all the resources under lock (free later) */
	qlock(&up->debug);
	fgrp = up->fgrp;
	up->fgrp = nil;
	egrp = up->egrp;
	up->egrp = nil;
	rgrp = up->rgrp;
	up->rgrp = nil;
	pgrp = up->pgrp;
	up->pgrp = nil;
	dot = up->dot;
	up->dot = nil;
	qunlock(&up->debug);

	if(fgrp)
		closefgrp(fgrp);
	if(egrp)
		closeegrp(egrp);
	if(rgrp)
		closergrp(rgrp);
	if(dot)
		cclose(dot);
	if(pgrp)
		closepgrp(pgrp);

	/*
	 * if not a kernel process and have a parent,
	 * do some housekeeping.
	 */
	if(up->kp == 0) {
		p = up->parent;
		if(p == 0) {
			if(exitstr == 0)
				exitstr = "unknown";
			panic("boot process died: %s", exitstr);
		}

		while(waserror())
			;

		wq = smalloc(sizeof(Waitq));
		poperror();

		wq->w.pid = up->pid;
		utime = up->time[TUser] + up->time[TCUser];
		stime = up->time[TSys] + up->time[TCSys];
		wq->w.time[TUser] = tk2ms(utime);
		wq->w.time[TSys] = tk2ms(stime);
		wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
		if(exitstr && exitstr[0])
			snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
		else
			wq->w.msg[0] = '\0';

		lock(&p->exl);
		/*
		 * Check that parent is still alive.
		 */
		if(p->pid == up->parentpid && p->state != Broken) {
			p->nchild--;
			p->time[TCUser] += utime;
			p->time[TCSys] += stime;
			/*
			 * If there would be more than 128 wait records
			 * queued for my parent, then don't leave a wait
			 * record behind.  This helps prevent badly written
			 * daemon processes from accumulating lots of wait
			 * records.
			 */
			if(p->nwait < 128) {
				wq->next = p->waitq;
				p->waitq = wq;
				p->nwait++;
				wq = nil;
				wakeup(&p->waitr);
			}
		}
		unlock(&p->exl);
		if(wq)
			free(wq);
	}

	if(!freemem)
		addbroken(up);

	qlock(&up->seglock);
	es = &up->seg[NSEG];
	for(s = up->seg; s < es; s++) {
		if(*s) {
			putseg(*s);
			*s = 0;
		}
	}
	qunlock(&up->seglock);

	lock(&up->exl);		/* Prevent my children from leaving waits */
	pidunhash(up);
	up->pid = 0;
	wakeup(&up->waitr);
	unlock(&up->exl);

	for(f = up->waitq; f; f = next) {
		next = f->next;
		free(f);
	}

	/* release debuggers */
	qlock(&up->debug);
	if(up->pdbg) {
		wakeup(&up->pdbg->sleep);
		up->pdbg = 0;
	}
	qunlock(&up->debug);

	/* Sched must not loop for these locks */
	lock(&procalloc);
	lock(&palloc);

	edfstop(up);
	up->state = Moribund;
	sched();
	panic("pexit");
}

int
haswaitq(void *x)
{
	Proc *p;

	p = (Proc *)x;
	return p->waitq != 0;
}

ulong
pwait(Waitmsg *w)
{
	ulong cpid;
	Waitq *wq;

	if(!canqlock(&up->qwaitr))
		error(Einuse);

	if(waserror()) {
		qunlock(&up->qwaitr);
		nexterror();
	}

	lock(&up->exl);
	if(up->nchild == 0 && up->waitq == 0) {
		unlock(&up->exl);
		error(Enochild);
	}
	unlock(&up->exl);

	sleep(&up->waitr, haswaitq, up);

	lock(&up->exl);
	wq = up->waitq;
	up->waitq = wq->next;
	up->nwait--;
	unlock(&up->exl);

	qunlock(&up->qwaitr);
	poperror();

	if(w)
		memmove(w, &wq->w, sizeof(Waitmsg));
	cpid = wq->w.pid;
	free(wq);
	return cpid;
}

Proc*
proctab(int i)
{
	return &procalloc.arena[i];
}

void
dumpaproc(Proc *p)
{
	ulong bss;
	char *s;

	if(p == 0)
		return;

	bss = 0;
	if(p->seg[BSEG])
		bss = p->seg[BSEG]->top;

	s = p->psstate;
	if(s == 0)
		s = statename[p->state];
	print("%3lud:%10s pc %8lux dbgpc %8lux  %8s (%s) ut %ld st %ld bss %lux qpc %lux nl %lud nd %lud lpc %lux pri %lud\n",
		p->pid, p->text, p->pc, dbgpc(p),  s, statename[p->state],
		p->time[0], p->time[1], bss, p->qpc, p->nlocks.ref, p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority);
}

void
procdump(void)
{
	int i;
	Proc *p;

	if(up)
		print("up %lud\n", up->pid);
	else
		print("no current process\n");
	for(i=0; i<conf.nproc; i++) {
		p = &procalloc.arena[i];
		if(p->state == Dead)
			continue;

		dumpaproc(p);
	}
}

/*
 *  wait till all processes have flushed their mmu
 *  state about segment s
 */
void
procflushseg(Segment *s)
{
	int i, ns, nm, nwait;
	Proc *p;

	/*
	 *  tell all processes with this
	 *  segment to flush their mmu's
	 */
	nwait = 0;
	for(i=0; i<conf.nproc; i++) {
		p = &procalloc.arena[i];
		if(p->state == Dead)
			continue;
		for(ns = 0; ns < NSEG; ns++)
			if(p->seg[ns] == s){
				p->newtlb = 1;
				for(nm = 0; nm < conf.nmach; nm++){
					if(MACHP(nm)->proc == p){
						MACHP(nm)->flushmmu = 1;
						nwait++;
					}
				}
				break;
			}
	}

	if(nwait == 0)
		return;

	/*
	 *  wait for all processors to take a clock interrupt
	 *  and flush their mmu's
	 */
	for(nm = 0; nm < conf.nmach; nm++)
		if(MACHP(nm) != m)
			while(MACHP(nm)->flushmmu)
				sched();
}

void
scheddump(void)
{
	Proc *p;
	Schedq *rq;

	for(rq = &runq[Nrq-1]; rq >= runq; rq--){
		if(rq->head == 0)
			continue;
		print("rq%ld:", rq-runq);
		for(p = rq->head; p; p = p->rnext)
			print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
		print("\n");
		delay(150);
	}
	print("nrdy %d\n", nrdy);
}

void
kproc(char *name, void (*func)(void *), void *arg)
{
	Proc *p;
	static Pgrp *kpgrp;

	p = newproc();
	p->psstate = 0;
	p->procmode = 0640;
	p->kp = 1;
	p->noswap = 1;

	p->fpsave = up->fpsave;
	p->scallnr = up->scallnr;
	p->s = up->s;
	p->nerrlab = 0;
	p->slash = up->slash;
	p->dot = up->dot;
	if(p->dot)
		incref(p->dot);

	memmove(p->note, up->note, sizeof(p->note));
	p->nnote = up->nnote;
	p->notified = 0;
	p->lastnote = up->lastnote;
	p->notify = up->notify;
	p->ureg = 0;
	p->dbgreg = 0;

	procpriority(p, PriKproc, 0);

	kprocchild(p, func, arg);

	kstrdup(&p->user, eve);
	kstrdup(&p->text, name);
	if(kpgrp == 0)
		kpgrp = newpgrp();
	p->pgrp = kpgrp;
	incref(kpgrp);

	memset(p->time, 0, sizeof(p->time));
	p->time[TReal] = MACHP(0)->ticks;
	ready(p);

	/*
	 *  since the bss/data segments are now shareable,
	 *  any mmu info about this process is now stale
	 *  and has to be discarded.
	p->newtlb = 1;
	flushmmu();
	 */
}

/*
 *  called splhi() by notify().  See comment in notify for the
 *  reasoning.
 */
void
procctl(Proc *p)
{
	char *state;
	ulong s;

	switch(p->procctl) {
	case Proc_exitbig:
		spllo();
		pexit("Killed: Insufficient physical memory", 1);

	case Proc_exitme:
		spllo();		/* pexit has locks in it */
		pexit("Killed", 1);

	case Proc_traceme:
		if(p->nnote == 0)
			return;
		/* No break */

	case Proc_stopme:
		p->procctl = 0;
		state = p->psstate;
		p->psstate = "Stopped";
		/* free a waiting debugger */
		s = spllo();
		qlock(&p->debug);
		if(p->pdbg) {
			wakeup(&p->pdbg->sleep);
			p->pdbg = 0;
		}
		qunlock(&p->debug);
		splhi();
		p->state = Stopped;
		sched();
		p->psstate = state;
		splx(s);
		return;
	}
}

#include "errstr.h"

void
error(char *err)
{
	spllo();

	assert(up->nerrlab < NERR);
	kstrcpy(up->errstr, err, ERRMAX);
	setlabel(&up->errlab[NERR-1]);
	nexterror();
}

void
nexterror(void)
{
	gotolabel(&up->errlab[--up->nerrlab]);
}

void
exhausted(char *resource)
{
	char buf[ERRMAX];

	snprint(buf, sizeof buf, "no free %s", resource);
	iprint("%s\n", buf);
	error(buf);
}

void
killbig(char *why)
{
	int i;
	Segment *s;
	ulong l, max;
	Proc *p, *ep, *kp;

	max = 0;
	kp = 0;
	ep = procalloc.arena+conf.nproc;
	for(p = procalloc.arena; p < ep; p++) {
		if(p->state == Dead || p->kp)
			continue;
		l = 0;
		for(i=1; i<NSEG; i++) {
			s = p->seg[i];
			if(s != 0)
				l += s->top - s->base;
		}
		if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) {
			kp = p;
			max = l;
		}
	}

	if(kp == nil)
		return;
	print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
	for(p = procalloc.arena; p < ep; p++) {
		if(p->state == Dead || p->kp)
			continue;
		if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG])
			p->procctl = Proc_exitbig;
	}
	kp->procctl = Proc_exitbig;
	for(i = 0; i < NSEG; i++) {
		s = kp->seg[i];
		if(s != 0 && canqlock(&s->lk)) {
			mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
			qunlock(&s->lk);
		}
	}
}

/*
 *  change ownership to 'new' of all processes owned by 'old'.  Used when
 *  eve changes.
 */
void
renameuser(char *old, char *new)
{
	Proc *p, *ep;

	ep = procalloc.arena+conf.nproc;
	for(p = procalloc.arena; p < ep; p++)
		if(p->user!=nil && strcmp(old, p->user)==0)
			kstrdup(&p->user, new);
}

/*
 *  time accounting called by clock() splhi'd
 */
void
accounttime(void)
{
	Proc *p;
	ulong n, per;
	static ulong nrun;

	p = m->proc;
	if(p) {
		nrun++;
		p->time[p->insyscall]++;
	}

	/* calculate decaying duty cycles */
	n = perfticks();
	per = n - m->perf.last;
	m->perf.last = n;
	per = (m->perf.period*(HZ-1) + per)/HZ;
	if(per != 0)
		m->perf.period = per;

	m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
	m->perf.inidle = 0;

	m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
	m->perf.inintr = 0;

	/* only one processor gets to compute system load averages */
	if(m->machno != 0)
		return;

	/*
	 * calculate decaying load average.
	 * if we decay by (n-1)/n then it takes
	 * n clock ticks to go from load L to .36 L once
	 * things quiet down.  it takes about 5 n clock
	 * ticks to go to zero.  so using HZ means this is
	 * approximately the load over the last second,
	 * with a tail lasting about 5 seconds.
	 */
	n = nrun;
	nrun = 0;
	n = (nrdy+n)*1000;
	m->load = (m->load*(HZ-1)+n)/HZ;
}
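
/*
 * A small numeric sketch of that filter, assuming HZ = 100: if n jumps
 * from 0 to 2000 (roughly two runnable processes) and stays there, each
 * tick computes load = (load*99 + 2000)/100, so after 100 ticks (one
 * second) the load has closed about 1 - (99/100)^100, i.e. ~63%, of the
 * gap, matching the "load over the last second" description above.
 */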

static void
pidhash(Proc *p)
{
	int h;

	h = p->pid % nelem(procalloc.ht);
	lock(&procalloc);
	p->pidhash = procalloc.ht[h];
	procalloc.ht[h] = p;
	unlock(&procalloc);
}

static void
pidunhash(Proc *p)
{
	int h;
	Proc **l;

	h = p->pid % nelem(procalloc.ht);
	lock(&procalloc);
	for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
		if(*l == p){
			*l = p->pidhash;
			break;
		}
	unlock(&procalloc);
}

int
procindex(ulong pid)
{
	Proc *p;
	int h;
	int s;

	s = -1;
	h = pid % nelem(procalloc.ht);
	lock(&procalloc);
	for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
		if(p->pid == pid){
			s = p - procalloc.arena;
			break;
		}
	unlock(&procalloc);
	return s;
}