3 #include "../port/lib.h"
7 #include "../port/error.h"
12 extern void checkpages(void);
13 extern void checkpagerefs(void);
/*
 * Fragment of the fork-abort helper (enclosing definition not visible in
 * this chunk): terminates a child whose creation failed partway through.
 */
27 pexit("fork aborted", 1);
/*
 * sysrfork - rfork(2) system call (fragmentary view: many lines of this
 * function are missing from this chunk; comments describe only what the
 * visible lines show).
 *
 * flag is a bitmask selecting which resources to fork, share, or start
 * clean: RFPROC makes a new process; RFFDG/RFCFDG copy/clear the file
 * descriptor group; RFNAMEG/RFCNAMEG the name space; RFENVG/RFCENVG the
 * environment group; RFNOTEG detaches the note group; RFMEM shares
 * memory; RFNOWAIT detaches the child from the parent's wait queue.
 */
31 sysrfork(va_list list)
42 flag = va_arg(list, ulong);
43 /* Check flags before we commit */
/* "copy" and "clean" variants of the same resource are mutually exclusive */
44 if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
46 if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
48 if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
/*
 * No RFPROC: modify the resources of the current process in place
 * rather than creating a child.  RFMEM/RFNOWAIT only make sense with
 * a new process, hence the rejection just below.
 */
51 if((flag&RFPROC) == 0) {
52 if(flag & (RFMEM|RFNOWAIT))
54 if(flag & (RFFDG|RFCFDG)) {
/* RFFDG duplicates the old fd group; RFCFDG (dupfgrp(nil)) starts empty */
57 up->fgrp = dupfgrp(ofg);
59 up->fgrp = dupfgrp(nil);
62 if(flag & (RFNAMEG|RFCNAMEG)) {
66 pgrpcpy(up->pgrp, opg);
67 /* inherit noattach */
68 up->pgrp->noattach = opg->noattach;
/* NOTE(review): presumably RFNOMNT handling — flag test not visible here */
72 up->pgrp->noattach = 1;
78 if(flag & (RFENVG|RFCENVG)) {
80 up->egrp = smalloc(sizeof(Egrp));
83 envcpy(up->egrp, oeg);
87 up->noteid = pidalloc(0);
/*
 * RFPROC path: create child process p and copy the parent's state
 * into it.  (Allocation of p is among the lines not visible here.)
 */
93 p->scallnr = up->scallnr;
100 memmove(p->note, up->note, sizeof(p->note));
101 p->privatemem = up->privatemem;
102 p->noswap = up->noswap;
103 p->nnote = up->nnote;
105 p->lastnote = up->lastnote;
106 p->notify = up->notify;
110 /* Abort the child process on error */
/* arm the error path: if setup fails below, the child runs abortion() */
113 kprocchild(p, abortion, 0);
118 /* Make a new set of memory segments */
122 qunlock(&p->seglock);
/* duplicate (or share, under RFMEM — see dupseg) each live segment */
125 for(i = 0; i < NSEG; i++)
126 if(up->seg[i] != nil)
127 p->seg[i] = dupseg(up->seg, i, n);
128 qunlock(&p->seglock);
131 /* File descriptors */
132 if(flag & (RFFDG|RFCFDG)) {
134 p->fgrp = dupfgrp(up->fgrp);
136 p->fgrp = dupfgrp(nil);
/* Name space: copy or start fresh, mirroring the in-place path above */
144 if(flag & (RFNAMEG|RFCNAMEG)) {
147 pgrpcpy(p->pgrp, up->pgrp);
148 /* inherit noattach */
149 p->pgrp->noattach = up->pgrp->noattach;
156 p->pgrp->noattach = 1;
165 /* Environment group */
166 if(flag & (RFENVG|RFCENVG)) {
167 p->egrp = smalloc(sizeof(Egrp));
170 envcpy(p->egrp, up->egrp);
177 p->procmode = up->procmode;
/* child inherits syscall tracing state */
178 if(up->procctl == Proc_tracesyscall)
179 p->procctl = Proc_tracesyscall;
181 poperror(); /* abortion */
183 /* Craft a return frame which will cause the child to pop out of
184 * the scheduler in user mode with the return register zero
186 forkchild(p, up->dbgreg);
/* unless RFNOWAIT, remember the parent so wait(2) works */
189 if((flag&RFNOWAIT) == 0){
190 p->parentpid = up->pid;
/* unless RFNOTEG, the child stays in the parent's note group */
195 if((flag&RFNOTEG) == 0)
196 p->noteid = up->noteid;
/* child's accounting clock starts now */
199 memset(p->time, 0, sizeof(p->time));
200 p->time[TReal] = MACHP(0)->ticks;
202 kstrdup(&p->text, up->text);
203 kstrdup(&p->user, up->user);
208 * since the bss/data segments are now shareable,
209 * any mmu info about this process is now stale
210 * (i.e. has bad properties) and has to be discarded.
/* child starts at the parent's base scheduling priority */
213 p->basepri = up->basepri;
214 p->priority = up->basepri;
215 p->fixedpri = up->fixedpri;
/* a wired parent wires the child to the same cpu */
219 procwired(p, wm->machno);
/*
 * shargs - parse the interpreter line of a "#!" script.
 * s holds the first n bytes of the file; pointers to the whitespace-
 * separated words after "#!" are stored into ap.  (Return value and
 * word storage are among the lines not visible in this chunk.)
 */
226 shargs(char *s, int n, char **ap)
231 n -= 2; /* skip #! */
/* skip leading blanks, then scan each word to its terminating blank */
242 while(*s==' ' || *s=='\t')
247 while(*s && *s!=' ' && *s!='\t')
/* fragment of l2be: assemble a big-endian 32-bit value from 4 bytes */
263 return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
/*
 * sysexec - exec(2) system call (fragmentary view: many lines are
 * missing from this chunk; comments describe only the visible lines).
 *
 * Validates the file name and argument vector, reads the a.out Exec
 * header (handling "#!" scripts by recursion through the interpreter),
 * lays out text/data/bss/stack, copies the argument strings onto the
 * new stack, replaces the process segments, and finally jumps to user
 * mode via execregs().  Does not return on success via the normal path.
 */
267 sysexec(va_list list)
272 char **argv, **argp, **argp0;
273 char *a, *e, *charp, *args, *file, *file0;
274 char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
275 ulong magic, ssize, nargs, nbytes, n;
276 uintptr t, d, b, entry, bssend, text, data, bss, tstk, align;
279 char line[sizeof(Exec)];
/* fetch and validate the two user arguments: file name and argv */
285 file0 = va_arg(list, char*);
286 validaddr((uintptr)file0, 1, 0);
287 argp0 = va_arg(list, char**);
288 evenaddr((uintptr)argp0);
289 validaddr((uintptr)argp0, 2*BY2WD, 0);
292 file0 = validnamedup(file0, 1);
297 /* Disaster after commit */
/* past the point of no return an error can only kill the process */
298 if(up->seg[SSEG] == nil)
299 pexit(up->errstr, 1);
/* open the binary for execution and remember its final path element */
311 tc = namec(file, Aopen, OEXEC, 0);
317 kstrdup(&elem, up->genbuf);
319 n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
/* header fields are stored big-endian on disk */
322 magic = l2be(exec.magic);
323 if(n == sizeof(Exec) && (magic == AOUT_MAGIC)){
324 entry = l2be(exec.entry);
325 text = l2be(exec.text);
326 if(magic & HDR_MAGIC)
/* per-architecture segment alignment chosen by magic number */
329 case S_MAGIC: /* 2MB segment alignment for amd64 */
332 case V_MAGIC: /* 16K segment alignment for mips */
/* sanity: text must fit below the stack, entry must lie inside text */
336 if(text >= (USTKTOP-USTKSIZE)-(UTZERO+sizeof(Exec))
337 || entry < UTZERO+sizeof(Exec)
338 || entry >= UTZERO+sizeof(Exec)+text)
340 break; /* for binary */
344 * Process #! /bin/sh args ...
346 memmove(line, &exec, n);
/* only one level of interpreter indirection is allowed (indir) */
347 if(indir || line[0]!='#' || line[1]!='!')
349 n = shargs(line, n, progarg);
354 * First arg becomes complete file name
360 if(strlen(elem) >= sizeof progelem)
362 strcpy(progelem, elem);
363 progarg[0] = progelem;
/* compute page/segment-aligned boundaries of text (t), data (d), bss (b) */
368 data = l2be(exec.data);
369 bss = l2be(exec.bss);
371 t = (UTZERO+sizeof(Exec)+text+align) & ~align;
373 d = (t + data + align) & ~align;
374 bssend = t + data + bss;
375 b = (bssend + align) & ~align;
376 if(t >= (USTKTOP-USTKSIZE) || d >= (USTKTOP-USTKSIZE) || b >= (USTKTOP-USTKSIZE))
380 * Args: pass 1: count
382 nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */
388 nbytes += strlen(a) + 1;
/* re-validate each time the argv pointer may cross a page boundary */
395 if(((uintptr)argp&(BY2PG-1)) < BY2WD)
396 validaddr((uintptr)argp, BY2WD, 0);
397 validaddr((uintptr)a, 1, 0);
398 e = vmemchr(a, 0, USTKSIZE);
401 nbytes += (e - a) + 1;
402 if(nbytes >= USTKSIZE)
/* stack needs argv[] (nargs+1 words) plus word-aligned string bytes */
406 ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
409 * 8-byte align SP for those (e.g. sparc) that need it.
410 * execregs() will subtract another 4 bytes for argc.
412 if(BY2WD == 4 && (ssize+4) & 7)
415 if(PGROUND(ssize) >= USTKSIZE)
419 * Build the stack segment, putting it in kernel virtual for the moment
423 qunlock(&up->seglock);
/* pick a temporary stack address not overlapping any existing segment */
432 } while((s = isoverlap(up, tstk-USTKSIZE, USTKSIZE)) != nil);
433 up->seg[ESEG] = newseg(SG_STACK, tstk-USTKSIZE, USTKSIZE/BY2PG);
436 * Args: pass 2: assemble; the pages will be faulted in
438 tos = (Tos*)(tstk - sizeof(Tos));
439 tos->cyclefreq = m->cyclefreq;
444 argv = (char**)(tstk - ssize);
445 charp = (char*)(tstk - nbytes);
451 for(i=0; i<nargs; i++){
/* for a script, synthesized progarg[] entries precede the user argv */
452 if(indir && *argp==nil) {
/* store the pointer as it will appear after relocation to USTKTOP */
456 *argv++ = charp + (USTKTOP-tstk);
461 validaddr((uintptr)a, 1, 0);
462 e = vmemchr(a, 0, (char*)tstk - charp);
467 memmove(charp, a, n);
471 /* copy args; easiest from new process's stack */
472 a = (char*)(tstk - nbytes);
474 if(n > 128) /* don't waste too much space on huge arg lists */
478 if(n>0 && args[n-1]!='\0'){
479 /* make sure last arg is NUL-terminated */
480 /* put NUL at UTF-8 character boundary */
482 if(fullrune(args+i, n-i))
491 * Special segments are maintained across exec
493 for(i = SSEG; i <= BSEG; i++) {
495 /* prevent a second free if we have an error */
/* drop all other segments, and any marked close-on-exec */
498 for(i = ESEG+1; i < NSEG; i++) {
500 if(s != nil && (s->type&SG_CEXEC) != 0) {
/* close file descriptors marked close-on-exec (loop body not visible) */
509 if((f = up->fgrp) != nil) {
510 for(i=0; i<=f->maxfd; i++)
514 /* Text. Shared. Attaches to cache image if possible */
515 /* attachimage returns a locked cache image */
516 img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
521 ts->flen = sizeof(Exec)+text;
/* Data: demand-loaded from the file, right after the text image */
525 s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
528 /* Attached by hand */
531 s->fstart = ts->fstart+ts->flen;
534 /* BSS. Zero fill on demand */
535 up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
/* move the assembled stack from its temporary address up to USTKTOP */
542 s->base = USTKTOP-USTKSIZE;
544 relocateseg(s, USTKTOP-tstk);
546 qunlock(&up->seglock);
547 poperror(); /* seglock */
550 * '/' processes are higher priority (hack to make /ip more responsive).
552 if(devtab[tc->type]->dc == L'/')
553 up->basepri = PriRoot;
554 up->priority = up->basepri;
557 poperror(); /* file0 */
577 * At this point, the mmu contains info about the old address
578 * space and needs to be flushed
/* allow a debugger waiting in procctl to stop us before user mode */
583 up->procctl = Proc_stopme;
584 return execregs(entry, ssize, nargs);
/*
 * syssleep - sleep(2): suspend the process for ms milliseconds.
 * An EDF-admitted real-time process is handled specially (the branch
 * body is not visible in this chunk).
 */
594 syssleep(va_list list)
598 ms = va_arg(list, long);
600 if (up->edf != nil && (up->edf->flags & Admitted))
/* return0 never becomes true, so this waits for the full timeout */
607 tsleep(&up->sleep, return0, 0, ms);
/* sysalarm - alarm(2): set/clear the process alarm; procalarm does the work */
613 sysalarm(va_list list)
615 return procalarm(va_arg(list, ulong));
/*
 * sysexits - exits(2): terminate the process with an exit status string.
 * A nil status means normal exit; otherwise the string is copied out of
 * user space (replaced by "invalid exit string" if unterminated within
 * ERRMAX — the substitution line is not visible in this chunk).
 */
620 sysexits(va_list list)
623 char *inval = "invalid exit string";
626 status = va_arg(list, char*);
631 validaddr((uintptr)status, 1, 0);
/* unterminated user string: copy at most ERRMAX bytes into kernel buf */
632 if(vmemchr(status, 0, ERRMAX) == nil){
633 memmove(buf, status, ERRMAX);
642 return 0; /* not reached */
/*
 * sys_wait - old wait(2) interface for binary compatibility: waits for
 * a child and converts the Waitmsg into the fixed-width numeric OWaitmsg
 * layout expected by old binaries.
 */
646 sys_wait(va_list list)
652 ow = va_arg(list, OWaitmsg*);
/* ow may be nil (discard status) elsewhere; validate writable user memory */
656 validaddr((uintptr)ow, sizeof(OWaitmsg), 1);
657 evenaddr((uintptr)ow);
/* format pid and times as NUMSIZE-wide decimal text fields */
661 readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
662 readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
663 readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
664 readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
/* bounded copy plus explicit NUL keeps msg always terminated */
665 strncpy(ow->msg, w.msg, sizeof(ow->msg)-1);
666 ow->msg[sizeof(ow->msg)-1] = '\0';
/*
 * sysawait - await(2): wait for a child and format the result into the
 * user buffer p (length n) as "pid user sys real msg" text; returns the
 * formatted length.
 */
672 sysawait(va_list list)
678 p = va_arg(list, char*);
679 n = va_arg(list, uint);
680 validaddr((uintptr)p, n, 1);
/* %q quotes the exit message so the caller can tokenize the line */
682 return (uintptr)snprint(p, n, "%d %lud %lud %lud %q",
684 w.time[TUser], w.time[TSys], w.time[TReal],
/*
 * werrstr - kernel-side formatted write into the current process's
 * saved error string (up->syserrstr, ERRMAX bytes).
 */
689 werrstr(char *fmt, ...)
697 vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
/*
 * generrstr - common implementation of errstr(2): atomically *swap* the
 * user's buffer with the process error string.  The user's old contents
 * are staged in tmp so the exchange works even though both copies go
 * through the same kernel buffer.
 */
702 generrstr(char *buf, uint nbuf)
708 validaddr((uintptr)buf, nbuf, 1);
/* clamp to the staging buffer size (presumably ERRMAX — declaration not visible) */
709 if(nbuf > sizeof tmp)
711 memmove(tmp, buf, nbuf);
713 /* make sure it's NUL-terminated */
/* hand the current error string to the user ... */
715 memmove(buf, up->syserrstr, nbuf);
/* ... and install the user's old string as the new error string */
717 memmove(up->syserrstr, tmp, nbuf);
/* syserrstr - errstr(2) entry point: delegate to generrstr with user args */
722 syserrstr(va_list list)
727 buf = va_arg(list, char*);
728 len = va_arg(list, uint);
729 return (uintptr)generrstr(buf, len);
732 /* compatibility for old binaries */
/* old errstr(2) had a fixed 64-byte buffer */
734 sys_errstr(va_list list)
736 return (uintptr)generrstr(va_arg(list, char*), 64);
/*
 * sysnotify - notify(2): register the user-space note handler f
 * (nil presumably clears it — the nil branch is not visible here).
 */
740 sysnotify(va_list list)
742 int (*f)(void*, char*);
743 f = va_arg(list, void*);
/* handler must point at readable user memory */
745 validaddr((uintptr)f, sizeof(void*), 0);
/*
 * sysnoted - noted(2): return from a note handler.  Calling it with a
 * mode other than NRSTR while not actually in a handler is an error.
 */
751 sysnoted(va_list list)
753 if(va_arg(list, int) != NRSTR && !up->notified)
/*
 * syssegbrk - segbrk(2): grow/shrink the segment containing addr by
 * delegating to ibrk on the matching segment index.  Only certain
 * segment types are eligible (the accepted/rejected cases of the
 * switch are not visible in this chunk).
 */
759 syssegbrk(va_list list)
765 addr = va_arg(list, uintptr);
/* find the segment whose [base,top) range contains addr */
766 for(i = 0; i < NSEG; i++) {
768 if(s == nil || addr < s->base || addr >= s->top)
770 switch(s->type&SG_TYPE) {
778 return ibrk(va_arg(list, uintptr), i);
782 return 0; /* not reached */
/*
 * syssegattach - segattach(2): attach a new segment of class `name`
 * with attributes attr at (suggested) address va, length len bytes.
 * Returns the actual base address chosen by segattach().
 */
786 syssegattach(va_list list)
793 attr = va_arg(list, ulong);
794 name = va_arg(list, char*);
795 va = va_arg(list, uintptr);
796 len = va_arg(list, ulong);
797 validaddr((uintptr)name, 1, 0);
/* copy the class name into kernel memory before using it */
798 name = validnamedup(name, 1);
803 va = segattach(up, attr, name, va, len);
/*
 * syssegdetach - segdetach(2): detach the segment containing addr from
 * the current process.  The initial stack segment may not be detached.
 */
810 syssegdetach(va_list list)
816 addr = va_arg(list, uintptr);
820 qunlock(&up->seglock);
/* locate the segment owning addr; a zero-length segment matches its base */
825 for(i = 0; i < NSEG; i++)
826 if((s = up->seg[i]) != nil) {
828 if((addr >= s->base && addr < s->top) ||
829 (s->top == s->base && addr == s->base))
838 * Check we are not detaching the initial stack segment.
840 if(s == up->seg[SSEG]){
847 qunlock(&up->seglock);
850 /* Ensure we flush any entries from the lost segment */
/*
 * syssegfree - segfree(2): release the physical pages backing the page-
 * rounded range [from, to) of the segment containing `from`.  The pages
 * are freed but the address range stays mapped into the segment.
 */
856 syssegfree(va_list list)
861 from = va_arg(list, uintptr);
862 to = va_arg(list, ulong);
866 s = seg(up, from, 1);
/* round the start up to a page boundary; only whole pages are freed */
870 from = PGROUND(from);
879 mfreeseg(s, from, (to - from) / BY2PG);
885 /* For binary compatibility */
/* old brk(2): grow the bss segment to the requested address */
887 sysbrk_(va_list list)
889 return ibrk(va_arg(list, uintptr), BSEG);
/*
 * sysrendezvous - rendezvous(2): meet another process on `tag` within
 * the rendezvous group, exchanging values.  If a partner with the same
 * tag is already waiting on the hash chain, values are swapped (partner
 * wakeup lines not visible here); otherwise this process queues itself
 * and sleeps in state Rendezvous.
 */
893 sysrendezvous(va_list list)
895 uintptr tag, val, new;
898 tag = va_arg(list, uintptr);
899 new = va_arg(list, uintptr);
/* hash the tag into the rendezvous group's bucket list */
900 l = &REND(up->rgrp, tag);
903 for(p = *l; p != nil; p = p->rendhash) {
904 if(p->rendtag == tag) {
917 /* Going to sleep here */
922 up->state = Rendezvous;
931 * The implementation of semaphores is complicated by needing
932 * to avoid rescheduling in syssemrelease, so that it is safe
933 * to call from real-time processes. This means syssemrelease
934 * cannot acquire any qlocks, only spin locks.
936 * Semacquire and semrelease must both manipulate the semaphore
937 * wait list. Lock-free linked lists only exist in theory, not
938 * in practice, so the wait list is protected by a spin lock.
940 * The semaphore value *addr is stored in user memory, so it
941 * cannot be read or written while holding spin locks.
943 * Thus, we can access the list only when holding the lock, and
944 * we can access the semaphore only when not holding the lock.
945 * This makes things interesting. Note that sleep's condition function
946 * is called while holding two locks - r and up->rlock - so it cannot
947 * access the semaphore value either.
949 * An acquirer announces its intention to try for the semaphore
950 * by putting a Sema structure onto the wait list and then
951 * setting Sema.waiting. After one last check of semaphore,
952 * the acquirer sleeps until Sema.waiting==0. A releaser of n
953 * must wake up n acquirers who have Sema.waiting set. It does
954 * this by clearing Sema.waiting and then calling wakeup.
956 * There are three interesting races here.
958 * The first is that in this particular sleep/wakeup usage, a single
959 * wakeup can rouse a process from two consecutive sleeps!
962 * (a) set Sema.waiting = 1
964 * (b) set Sema.waiting = 0
965 * (a) check Sema.waiting inside sleep, return w/o sleeping
966 * (a) try for semaphore, fail
967 * (a) set Sema.waiting = 1
972 * This is okay - semacquire will just go around the loop
973 * again. It does mean that at the top of the for(;;) loop in
974 * semacquire, phore.waiting might already be set to 1.
976 * The second is that a releaser might wake an acquirer who is
977 * interrupted before he can acquire the lock. Since
978 * release(n) issues only n wakeup calls -- only n can be used
979 * anyway -- if the interrupted process is not going to use his
980 * wakeup call he must pass it on to another acquirer.
982 * The third race is similar to the second but more subtle. An
983 * acquirer sets waiting=1 and then does a final canacquire()
984 * before going to sleep. The opposite order would result in
985 * missing wakeups that happen between canacquire and
986 * waiting=1. (In fact, the whole point of Sema.waiting is to
987 * avoid missing wakeups between canacquire() and sleep().) But
988 * there can be spurious wakeups between a successful
989 * canacquire() and the following semdequeue(). This wakeup is
990 * not useful to the acquirer, since he has already acquired
991 * the semaphore. Like in the previous case, though, the
992 * acquirer must pass the wakeup call along.
994 * This is all rather subtle. The code below has been verified
995 * with the spin model /sys/src/9/port/semaphore.p. The
996 * original code anticipated the second race but not the first
997 * or third, which were caught only with spin. The first race
998 * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
999 * It was lucky that my abstract model of sleep/wakeup still managed
1000 * to preserve that behavior.
1002 * I remain slightly concerned about memory coherence
1003 * outside of locks. The spin model does not take
1004 * queued processor writes into account so we have to
1005 * think hard. The only variables accessed outside locks
1006 * are the semaphore value itself and the boolean flag
1007 * Sema.waiting. The value is only accessed with cmpswap,
1008 * whose job description includes doing the right thing as
1009 * far as memory coherence across processors. That leaves
1010 * Sema.waiting. To handle it, we call coherence() before each
1011 * read and after each write. - rsc
1014 /* Add semaphore p with addr a to list in seg. */
/* Sema is caller-allocated (on the stack in semacquire); start it zeroed */
1016 semqueue(Segment *s, long *a, Sema *p)
1018 memset(p, 0, sizeof *p);
1020 lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */
/* insert at the tail of the segment's doubly-linked wait list */
1022 p->prev = s->sema.prev;
1028 /* Remove semaphore p from list in seg. */
1030 semdequeue(Segment *s, Sema *p)
/* unlink from the doubly-linked wait list (under s->sema lock —
 * lock/unlock lines not visible in this chunk) */
1033 p->next->prev = p->prev;
1034 p->prev->next = p->next;
1038 /* Wake up n waiters with addr a on list in seg. */
1040 semwakeup(Segment *s, long *a, long n)
/* walk the wait list, waking at most n entries registered on addr a;
 * per the comment block above, waking means clearing Sema.waiting and
 * then calling wakeup (those lines not visible here) */
1045 for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
1046 if(p->addr == a && p->waiting){
1056 /* Add delta to semaphore and wake up waiters as appropriate. */
1058 semrelease(Segment *s, long *addr, long delta)
/* lock-free add: retry the compare-and-swap until it sticks */
1064 while(!cmpswap(addr, value, value+delta));
1065 semwakeup(s, addr, delta);
1069 /* Try to acquire semaphore using compare-and-swap */
1071 canacquire(long *addr)
/* succeed only by atomically decrementing a positive value; give up
 * once the observed value is <= 0 */
1075 while((value=*addr) > 0)
1076 if(cmpswap(addr, value, value-1))
1081 /* Should we wake up? */
/* sleep() condition (fragment; signature not visible): true once a
 * releaser has cleared our Sema.waiting flag */
1086 return !((Sema*)p)->waiting;
1089 /* Acquire semaphore (subtract 1). */
1091 semacquire(Segment *s, long *addr, int block)
/* fast path: uncontended decrement needs no queueing */
1096 if(canacquire(addr))
/* slow path: announce intent on the wait list, then re-check before
 * sleeping (see the race discussion in the comment block above) */
1102 semqueue(s, addr, &phore);
1106 if(canacquire(addr)){
1112 sleep(&phore, semawoke, &phore);
1115 semdequeue(s, &phore);
1116 coherence(); /* not strictly necessary due to lock in semdequeue */
/* if we consumed a wakeup we didn't need, pass it on to another waiter */
1118 semwakeup(s, addr, 1);
1124 /* Acquire semaphore or time-out */
/* same structure as semacquire, but each sleep is bounded by ms and the
 * elapsed time is tracked so repeated wakeups still honor the deadline */
1126 tsemacquire(Segment *s, long *addr, ulong ms)
1128 int acquired, timedout;
1132 if(canacquire(addr))
1136 acquired = timedout = 0;
1137 semqueue(s, addr, &phore);
1141 if(canacquire(addr)){
1148 tsleep(&phore, semawoke, &phore, ms);
/* how long did that sleep actually take? */
1149 elms = TK2MS(m->ticks - t);
1157 semdequeue(s, &phore);
1158 coherence(); /* not strictly necessary due to lock in semdequeue */
/* unconsumed wakeup must be passed along, as in semacquire */
1160 semwakeup(s, addr, 1);
/*
 * syssemacquire - semacquire(2): validate the user semaphore address
 * (word-aligned, writable, inside a non-read-only segment) and acquire.
 */
1169 syssemacquire(va_list list)
1175 addr = va_arg(list, long*);
1176 block = va_arg(list, int);
1177 evenaddr((uintptr)addr);
1178 s = seg(up, (uintptr)addr, 0);
/* on any validation failure, fault with a proper error via validaddr */
1179 if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
1180 validaddr((uintptr)addr, sizeof(long), 1);
1185 return (uintptr)semacquire(s, addr, block);
/*
 * systsemacquire - tsemacquire(2): like syssemacquire but with a
 * millisecond timeout.
 */
1189 systsemacquire(va_list list)
1195 addr = va_arg(list, long*);
1196 ms = va_arg(list, ulong);
1197 evenaddr((uintptr)addr);
1198 s = seg(up, (uintptr)addr, 0);
/* same validation pattern as syssemacquire */
1199 if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
1200 validaddr((uintptr)addr, sizeof(long), 1);
1205 return (uintptr)tsemacquire(s, addr, ms);
/*
 * syssemrelease - semrelease(2): validate the user semaphore address
 * and add delta to it, waking up to delta waiters.  Safe to call from
 * real-time processes (semrelease takes no qlocks — see the comment
 * block above).
 */
1209 syssemrelease(va_list list)
1214 addr = va_arg(list, long*);
1215 delta = va_arg(list, long);
1216 evenaddr((uintptr)addr);
1217 s = seg(up, (uintptr)addr, 0);
1218 if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
1219 validaddr((uintptr)addr, sizeof(long), 1);
1222 /* delta == 0 is a no-op, not a release */
/* negative delta or an already-negative semaphore value is rejected */
1223 if(delta < 0 || *addr < 0)
1225 return (uintptr)semrelease(s, addr, delta);
1228 /* For binary compatibility */
1230 sys_nsec(va_list list)
1234 /* return in register on 64bit machine */
1235 if(sizeof(uintptr) == sizeof(vlong)){
1237 return (uintptr)todget(nil);
1240 v = va_arg(list, vlong*);
1241 evenaddr((uintptr)v);
1242 validaddr((uintptr)v, sizeof(vlong), 1);