]> git.lizzy.rs Git - plan9front.git/blob - sys/src/9/pc/devvmx.c
pc: add vmx device
[plan9front.git] / sys / src / 9 / pc / devvmx.c
1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7 #include "ureg.h"
8
9 extern int vmxon(u64int);
10 extern int vmxoff(void);
11 extern int vmclear(u64int);
12 extern int vmptrld(u64int);
13 extern int vmlaunch(Ureg *, int, FPsave *);
14 extern int vmread(u32int, uintptr *);
15 extern int vmwrite(u32int, uintptr);
16 extern int invept(u32int, uvlong, uvlong);
17 extern int invvpid(u32int, uvlong, uvlong);
18
19 static vlong procb_ctls, pinb_ctls;
20
/*
 * Constants used by the VMX driver: the numbers of the VMX
 * capability MSRs, the encodings of the VMCS fields accessed
 * through vmcsread/vmcswrite, and the individual control bits
 * that this file sets or tests.
 */
enum {
	/* VMX capability MSRs (read with rdmsr) */
	VMX_BASIC_MSR		= 0x480,
	VMX_PINB_CTLS_MSR	= 0x481,
	VMX_PROCB_CTLS_MSR	= 0x482,
	VMX_VMEXIT_CTLS_MSR	= 0x483,
	VMX_VMENTRY_CTLS_MSR	= 0x484,
	VMX_MISC_MSR		= 0x485,
	VMX_CR0_FIXED0		= 0x486,
	VMX_CR0_FIXED1		= 0x487,
	VMX_CR4_FIXED0		= 0x488,
	VMX_CR4_FIXED1		= 0x489,
	VMX_VMCS_ENUM		= 0x48A,
	VMX_PROCB_CTLS2_MSR	= 0x48B,
	VMX_TRUE_PINB_CTLS_MSR	= 0x48D,
	VMX_TRUE_PROCB_CTLS_MSR	= 0x48E,
	VMX_TRUE_EXIT_CTLS_MSR	= 0x48F,
	VMX_TRUE_ENTRY_CTLS_MSR	= 0x490,
	VMX_VMFUNC_MSR		= 0x491,

	/* pin-based controls: VMCS field and bits */
	PINB_CTLS	= 0x4000,
	PINB_EXITIRQ	= 1<<0,
	PINB_EXITNMI	= 1<<3,

	/* primary processor-based controls: VMCS field and bits */
	PROCB_CTLS		= 0x4002,
	PROCB_IRQWIN		= 1<<2,
	PROCB_EXITHLT		= 1<<7,
	PROCB_EXITINVLPG	= 1<<9,
	PROCB_EXITMWAIT		= 1<<10,
	PROCB_EXITRDPMC		= 1<<11,
	PROCB_EXITRDTSC		= 1<<12,
	PROCB_EXITCR3LD		= 1<<15,
	PROCB_EXITCR3ST		= 1<<16,
	PROCB_EXITCR8LD		= 1<<19,
	PROCB_EXITCR8ST		= 1<<20,
	PROCB_EXITMOVDR		= 1<<23,
	PROCB_EXITIO		= 1<<24,
	PROCB_MONTRAP		= 1<<27,
	PROCB_EXITMONITOR	= 1<<29,
	PROCB_EXITPAUSE		= 1<<30,
	PROCB_USECTLS2		= 1<<31,

	/* secondary processor-based controls: VMCS field and bits */
	PROCB_CTLS2	= 0x401E,
	PROCB_EPT	= 1<<1,
	PROCB_EXITGDT	= 1<<2,
	PROCB_VPID	= 1<<5,
	PROCB_UNRESTR	= 1<<7,

	EXC_BITMAP	= 0x4004,
	PFAULT_MASK	= 0x4006,
	PFAULT_MATCH	= 0x4008,
	CR3_TARGCNT	= 0x400a,

	/* VM-exit controls */
	VMEXIT_CTLS	= 0x400c,
	VMEXIT_HOST64	= 1<<9,

	VMEXIT_MSRSTCNT	= 0x400e,
	VMEXIT_MSRLDCNT	= 0x4010,

	/* VM-entry controls */
	VMENTRY_CTLS	= 0x4012,
	VMENTRY_GUEST64	= 1<<9,

	VMENTRY_MSRLDCNT	= 0x4014,
	VMENTRY_INTRINFO	= 0x4016,
	VMENTRY_INTRCODE	= 0x4018,
	VMENTRY_INTRILEN	= 0x401a,

	VMCS_LINK	= 0x2800,

	/* guest state: selectors, control registers, limits, access rights, bases */
	GUEST_ES	= 0x800,
	GUEST_CS	= 0x802,
	GUEST_SS	= 0x804,
	GUEST_DS	= 0x806,
	GUEST_FS	= 0x808,
	GUEST_GS	= 0x80A,
	GUEST_LDTR	= 0x80C,
	GUEST_TR	= 0x80E,
	GUEST_CR0	= 0x6800,
	GUEST_CR3	= 0x6802,
	GUEST_CR4	= 0x6804,
	GUEST_ESLIMIT	= 0x4800,
	GUEST_CSLIMIT	= 0x4802,
	GUEST_SSLIMIT	= 0x4804,
	GUEST_DSLIMIT	= 0x4806,
	GUEST_FSLIMIT	= 0x4808,
	GUEST_GSLIMIT	= 0x480A,
	GUEST_LDTRLIMIT	= 0x480C,
	GUEST_TRLIMIT	= 0x480E,
	GUEST_GDTRLIMIT	= 0x4810,
	GUEST_IDTRLIMIT	= 0x4812,
	GUEST_ESPERM	= 0x4814,
	GUEST_CSPERM	= 0x4816,
	GUEST_SSPERM	= 0x4818,
	GUEST_DSPERM	= 0x481A,
	GUEST_FSPERM	= 0x481C,
	GUEST_GSPERM	= 0x481E,
	GUEST_LDTRPERM	= 0x4820,
	GUEST_TRPERM	= 0x4822,
	GUEST_CR0MASK	= 0x6000,
	GUEST_CR4MASK	= 0x6002,
	GUEST_CR0SHADOW	= 0x6004,
	GUEST_CR4SHADOW	= 0x6006,
	GUEST_ESBASE	= 0x6806,
	GUEST_CSBASE	= 0x6808,
	GUEST_SSBASE	= 0x680A,
	GUEST_DSBASE	= 0x680C,
	GUEST_FSBASE	= 0x680E,
	GUEST_GSBASE	= 0x6810,
	GUEST_LDTRBASE	= 0x6812,
	GUEST_TRBASE	= 0x6814,
	GUEST_GDTRBASE	= 0x6816,
	GUEST_IDTRBASE	= 0x6818,
	GUEST_DR7	= 0x681A,
	GUEST_RSP	= 0x681C,
	GUEST_RIP	= 0x681E,
	GUEST_RFLAGS	= 0x6820,

	/* host state restored on VM exit */
	HOST_ES		= 0xC00,
	HOST_CS		= 0xC02,
	HOST_SS		= 0xC04,
	HOST_DS		= 0xC06,
	HOST_FS		= 0xC08,
	HOST_GS		= 0xC0A,
	HOST_TR		= 0xC0C,
	HOST_CR0	= 0x6C00,
	HOST_CR3	= 0x6C02,
	HOST_CR4	= 0x6C04,
	HOST_FSBASE	= 0x6C06,
	HOST_GSBASE	= 0x6C08,
	HOST_TRBASE	= 0x6C0A,
	HOST_GDTR	= 0x6C0C,
	HOST_IDTR	= 0x6C0E,
	HOST_RSP	= 0x6C14,
	HOST_RIP	= 0x6C16,

	GUEST_CANINTR	= 0x4824,

	/* VM-exit information fields (read-only in the register table) */
	VM_INSTRERR	= 0x4400,
	VM_EXREASON	= 0x4402,
	VM_EXINTRINFO	= 0x4404,
	VM_EXINTRCODE	= 0x4406,
	VM_IDTVECINFO	= 0x4408,
	VM_IDTVECCODE	= 0x440A,
	VM_EXINSTRLEN	= 0x440C,
	VM_EXINSTRINFO	= 0x440E,
	VM_EXQUALIF	= 0x6400,
	VM_IORCX	= 0x6402,
	VM_IORSI	= 0x6404,
	VM_IORDI	= 0x6406,
	VM_IORIP	= 0x6408,
	VM_GUESTVA	= 0x640A,
	VM_GUESTPA	= 0x2400,

	VM_VPID		= 0x000,
	VM_EPTPIDX	= 0x0004,

	VM_EPTP		= 0x201A,
	VM_EPTPLA	= 0x2024,

	INVLOCAL	= 1,
};
181
182 typedef struct Vmx Vmx;
183 typedef struct VmCmd VmCmd;
184 typedef struct VmMem VmMem;
185 typedef struct VmIntr VmIntr;
186
187 struct VmMem {
188         uvlong lo, hi;
189         Segment *seg;
190         uintptr off;
191         VmMem *next, *prev;
192         u16int attr;
193 };
194
195 struct VmIntr {
196         u32int info, code, ilen;
197 };
198
199 struct Vmx {
200         enum {
201                 NOVMX,
202                 VMXINACTIVE,
203                 VMXINIT,
204                 VMXREADY,
205                 VMXRUNNING,
206                 VMXDEAD,
207                 VMXENDING,
208         } state;
209         char errstr[ERRMAX];
210         Ureg ureg;
211         FPsave *fp;
212         u8int launched;
213         u8int vpid;
214         enum {
215                 FLUSHVPID = 1,
216                 FLUSHEPT = 2,
217                 STEP = 4,
218                 POSTEX = 8,
219                 POSTIRQ = 16,
220         } onentry;
221         
222         Rendez cmdwait;
223         Lock cmdlock;
224         VmCmd *firstcmd, **lastcmd;
225         VmCmd *postponed;
226         uvlong *pml4;
227         VmMem mem;
228         
229         enum {
230                 GOTEXIT = 1,
231                 GOTIRQACK = 2,
232                 GOTSTEP = 4,
233                 GOTSTEPERR = 8,
234         } got;
235         VmMem *stepmap;
236         VmIntr exc, irq, irqack;
237 };
238
239 struct VmCmd {
240         enum {
241                 CMDFDONE = 1,
242                 CMDFFAIL = 2,
243                 CMDFPOSTP = 4,
244         } flags;
245         u8int scratched;
246         Rendez;
247         Lock;
248         int (*cmd)(VmCmd *, va_list);
249         int retval;
250         char *errstr;
251         va_list va;
252         VmCmd *next;
253 };
254
255 static char Equit[] = "vmx: ending";
256
257 static char *statenames[] = {
258         [NOVMX] "novmx",
259         [VMXINACTIVE] "inactive",
260         [VMXINIT] "init",
261         [VMXREADY] "ready",
262         [VMXRUNNING] "running",
263         [VMXDEAD] "dead",
264         [VMXENDING]"ending"
265 };
266
267 static Vmx vmx;
268
269 static u64int
270 vmcsread(u32int addr)
271 {
272         int rc;
273         u64int val;
274
275         val = 0;
276         rc = vmread(addr, (uintptr *) &val);
277         if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000)
278                 rc = vmread(addr | 1, (uintptr *) &val + 1);
279         if(rc < 0){
280                 char errbuf[128];
281                 snprint(errbuf, sizeof(errbuf), "vmcsread failed (%#.4ux)", addr);
282                 error(errbuf);
283         }
284         return val;
285 }
286
287 static void
288 vmcswrite(u32int addr, u64int val)
289 {
290         int rc;
291         
292         rc = vmwrite(addr, val);
293         if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000)
294                 rc = vmwrite(addr | 1, val >> 32);
295         if(rc < 0){
296                 char errbuf[128];
297                 snprint(errbuf, sizeof(errbuf), "vmcswrite failed (%#.4ux = %#.16ullx)", addr, val);
298                 error(errbuf);
299         }
300 }
301
302 static char *
303 cr0read(char *p, char *e)
304 {
305         uvlong guest, mask, shadow;
306         
307         guest = vmcsread(GUEST_CR0);
308         mask = vmcsread(GUEST_CR0MASK);
309         shadow = vmcsread(GUEST_CR0SHADOW);
310         return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & mask | shadow & ~mask);
311 }
312
313 static char *
314 cr4read(char *p, char *e)
315 {
316         uvlong guest, mask, shadow;
317         
318         guest = vmcsread(GUEST_CR4);
319         mask = vmcsread(GUEST_CR4MASK);
320         shadow = vmcsread(GUEST_CR4SHADOW);
321         return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & mask | shadow & ~mask);
322 }
323
/*
 * Write hook for registers that cannot be set: ignores the
 * value and always reports failure (-1).
 */
static int
readonly(char *)
{
	return -1;
}
329
330 typedef struct GuestReg GuestReg;
331 struct GuestReg {
332         int offset;
333         u8int size; /* in bytes; 0 means == uintptr */
334         char *name;
335         char *(*read)(char *, char *);
336         int (*write)(char *);
337 };
338 #define UREG(x) ~(ulong)&((Ureg*)0)->x
339 static GuestReg guestregs[] = {
340         {GUEST_RIP, 0, "pc"},
341         {GUEST_RSP, 0, "sp"},
342         {GUEST_RFLAGS, 0, "flags"},
343         {UREG(ax), 0, "ax"},
344         {UREG(bx), 0, "bx"},
345         {UREG(cx), 0, "cx"},
346         {UREG(dx), 0, "dx"},
347         {UREG(bp), 0, "bp"},
348         {UREG(si), 0, "si"},
349         {UREG(di), 0, "di"},
350         {GUEST_GDTRBASE, 0, "gdtrbase"},
351         {GUEST_GDTRLIMIT, 4, "gdtrlimit"},
352         {GUEST_IDTRBASE, 0, "idtrbase"},
353         {GUEST_IDTRLIMIT, 4, "idtrlimit"},
354         {GUEST_CS, 2, "cs"},
355         {GUEST_CSBASE, 0, "csbase"},
356         {GUEST_CSLIMIT, 4, "cslimit"},
357         {GUEST_CSPERM, 4, "csperm"},
358         {GUEST_DS, 2, "ds"},
359         {GUEST_DSBASE, 0, "dsbase"},
360         {GUEST_DSLIMIT, 4, "dslimit"},
361         {GUEST_DSPERM, 4, "dsperm"},
362         {GUEST_ES, 2, "es"},
363         {GUEST_ESBASE, 0, "esbase"},
364         {GUEST_ESLIMIT, 4, "eslimit"},
365         {GUEST_ESPERM, 4, "esperm"},
366         {GUEST_FS, 2, "fs"},
367         {GUEST_FSBASE, 0, "fsbase"},
368         {GUEST_FSLIMIT, 4, "fslimit"},
369         {GUEST_FSPERM, 4, "fsperm"},
370         {GUEST_GS, 2, "gs"},
371         {GUEST_GSBASE, 0, "gsbase"},
372         {GUEST_GSLIMIT, 4, "gslimit"},
373         {GUEST_GSPERM, 4, "gsperm"},
374         {GUEST_SS, 2, "ss"},
375         {GUEST_SSBASE, 0, "ssbase"},
376         {GUEST_SSLIMIT, 4, "sslimit"},
377         {GUEST_SSPERM, 4, "ssperm"},
378         {GUEST_TR, 2, "tr"},
379         {GUEST_TRBASE, 0, "trbase"},
380         {GUEST_TRLIMIT, 4, "trlimit"},
381         {GUEST_TRPERM, 4, "trperm"},
382         {GUEST_LDTR, 2, "ldtr"},
383         {GUEST_LDTRBASE, 0, "ldtrbase"},
384         {GUEST_LDTRLIMIT, 4, "ldtrlimit"},
385         {GUEST_LDTRPERM, 4, "ldtrperm"},
386         {GUEST_CR0, 0, "cr0", cr0read, readonly},
387         {UREG(trap), 0, "cr2"},
388         {GUEST_CR3, 0, "cr3"},
389         {GUEST_CR4, 0, "cr4", cr4read, readonly},
390         {VM_INSTRERR, 4, "instructionerror", nil, readonly},
391         {VM_EXREASON, 4, "exitreason", nil, readonly},
392         {VM_EXQUALIF, 0, "exitqualification", nil, readonly},
393         {VM_EXINTRINFO, 4, "exitinterruptinfo", nil, readonly},
394         {VM_EXINTRCODE, 4, "exitinterruptcode", nil, readonly},
395         {VM_EXINSTRLEN, 4, "exitinstructionlen", nil, readonly},
396         {VM_EXINSTRINFO, 4, "exitinstructioninfo", nil, readonly},
397         {VM_GUESTVA, 0, "exitva", nil, readonly},
398         {VM_GUESTPA, 0, "exitpa", nil, readonly},
399         {VM_IDTVECINFO, 4, "idtinterruptinfo", nil, readonly},
400         {VM_IDTVECCODE, 4, "idtinterruptcode", nil, readonly},
401 };
402
403 static int
404 vmokpage(u64int addr)
405 {
406         return (addr & 0xfff) == 0 && addr >> 48 == 0;
407 }
408
409 static uvlong *
410 eptwalk(uvlong addr)
411 {
412         uvlong *tab, *nt;
413         uvlong v;
414         int i;
415         
416         tab = vmx.pml4;
417         for(i = 3; i >= 1; i--){
418                 tab += addr >> 12 + 9 * i & 0x1ff;
419                 v = *tab;
420                 if((v & 3) == 0){
421                         nt = mallocalign(BY2PG, BY2PG, 0, 0);
422                         if(nt == nil) error(Enomem);
423                         memset(nt, 0, BY2PG);
424                         v = PADDR(nt) | 0x407;
425                         *tab = v;
426                 }
427                 tab = KADDR(v & ~0xfff);
428         }
429         return tab + (addr >> 12 & 0x1ff);
430 }
431
432 static void
433 eptfree(uvlong *tab, int level)
434 {
435         int i;
436         uvlong v, *t;
437         
438         if(level < 3){
439                 for(i = 0; i < 512; i++){
440                         v = tab[i];
441                         if((v & 3) == 0) continue;
442                         t = KADDR(v & ~0xfff);
443                         eptfree(t, level + 1);
444                         tab[i] = 0;
445                 }
446         }
447         if(level > 0)
448                 free(tab);              
449 }
450
451 static void
452 epttranslate(VmMem *mp)
453 {
454         uvlong p, hpa;
455
456         if(mp->seg != nil && (mp->seg->type & SG_TYPE) != SG_FIXED || (mp->lo & 0xfff) != 0 || (mp->hi & 0xfff) != 0 || (uint)mp->attr >= 0x1000)
457                 error(Egreg);
458         if(mp->seg != nil){
459                 if(mp->seg->base + mp->off + (mp->hi - mp->lo) > mp->seg->top)
460                         error(Egreg);
461                 hpa = mp->seg->map[0]->pages[0]->pa + mp->off;
462         }else
463                 hpa = 0;
464         for(p = mp->lo; p < mp->hi; p += BY2PG)
465                 *eptwalk(p) = hpa + (p - mp->lo) + mp->attr;
466         vmx.onentry |= FLUSHEPT;
467 }
468
469 static char *mtype[] = {"uc", "wc", "02", "03", "wt", "wp", "wb", "07"};
470
471 static int
472 cmdgetmeminfo(VmCmd *, va_list va)
473 {
474         VmMem *mp;
475         char *p0, *e, *p;
476         char attr[4];
477         char mt[4];
478         
479         p0 = va_arg(va, char *);
480         e = va_arg(va, char *);
481         p = p0;
482         for(mp = vmx.mem.next; mp != &vmx.mem; mp = mp->next){
483                 attr[0] = (mp->attr & 1) != 0 ? 'r' : '-';
484                 attr[1] = (mp->attr & 2) != 0 ? 'w' : '-';
485                 attr[2] = (mp->attr & 4) != 0 ? 'x' : '-';
486                 attr[3] = 0;
487                 *(ushort*)mt = *(u16int*)mtype[mp->attr >> 3 & 7];
488                 mt[2] = (mp->attr & 0x40) != 0 ? '!' : 0;
489                 mt[3] = 0;
490                 p = seprint(p, e, "%s %s %#llux %#llux %p %#llux\n", attr, mt, mp->lo, mp->hi, mp->seg, (uvlong)mp->off);
491         }
492         return p - p0;
493 }
494
495 static int
496 cmdclearmeminfo(VmCmd *, va_list)
497 {
498         VmMem *mp, *mn;
499         
500         eptfree(vmx.pml4, 0);
501         for(mp = vmx.mem.next; mp != &vmx.mem; mp = mn){
502                 mn = mp->next;
503                 free(mp);
504         }
505         vmx.mem.prev = &vmx.mem;
506         vmx.mem.next = &vmx.mem;
507         vmx.onentry |= FLUSHEPT;
508         return 0;
509 }
510
511 extern Segment* (*_globalsegattach)(char*);
512
513 static int
514 cmdsetmeminfo(VmCmd *, va_list va)
515 {
516         char *p0, *p, *q, *r;
517         int j;
518         char *f[10];
519         VmMem *mp;
520         int rc;
521         
522         p0 = va_arg(va, char *);
523         p = p0;
524         mp = nil;
525         for(;;){
526                 q = strchr(p, '\n');
527                 if(q == 0) break;
528                 *q = 0;
529                 if(mp == nil)
530                         mp = malloc(sizeof(VmMem));
531                 if(waserror()){
532                         free(mp);
533                         nexterror();
534                 }
535                 rc = tokenize(p, f, nelem(f));
536                 p = q + 1;
537                 if(rc == 0) goto next;
538                 if(rc != 4 && rc != 6) error("number of fields wrong");
539                 memset(mp, 0, sizeof(VmMem));
540                 for(q = f[0]; *q != 0; q++)
541                         switch(*q){
542                         case 'r': if((mp->attr & 1) != 0) goto tinval; mp->attr |= 1; break;
543                         case 'w': if((mp->attr & 2) != 0) goto tinval; mp->attr |= 2; break;
544                         case 'x': if((mp->attr & 4) != 0) goto tinval; mp->attr |= 0x404; break;
545                         case '-': break;
546                         default: tinval: error("invalid access field");
547                         }
548                 for(j = 0; j < 8; j++)
549                         if(strncmp(mtype[j], f[1], 2) == 0){
550                                 mp->attr |= j << 3;
551                                 break;
552                         }
553                 if(j == 8 || strlen(f[1]) > 3) error("invalid memory type");
554                 if(f[1][2] == '!') mp->attr |= 0x40;
555                 else if(f[1][2] != 0) error("invalid memory type");
556                 mp->lo = strtoull(f[2], &r, 0);
557                 if(*r != 0 || !vmokpage(mp->lo)) error("invalid low guest physical address");
558                 mp->hi = strtoull(f[3], &r, 0);
559                 if(*r != 0 || !vmokpage(mp->hi) || mp->hi <= mp->lo) error("invalid high guest physical address");
560                 mp->off = strtoull(f[5], &r, 0);
561                 if(*r != 0 || !vmokpage(mp->off)) error("invalid offset");
562                 if((mp->attr & 7) != 0){
563                         if(rc != 6) error("number of fields wrong");
564                         mp->seg = _globalsegattach(f[4]);
565                         if(mp->seg == nil) error("no such segment");
566                         if(mp->seg->base + mp->off + (mp->hi - mp->lo) > mp->seg->top) error("out of bounds");
567                 }
568                 epttranslate(mp);
569                 mp->prev = vmx.mem.prev;
570                 mp->next = &vmx.mem;
571                 mp->prev->next = mp;
572                 mp->next->prev = mp;
573                 mp = nil;
574         next:
575                 poperror();
576         }
577         free(mp);
578         return p - p0;
579 }
580
581 static void
582 vmxreset(void)
583 {
584         ulong regs[4];
585         vlong msr;
586
587         cpuid(1, regs);
588         if((regs[2] & 1<<5) == 0) return;
589         /* check if disabled by BIOS */
590         if(rdmsr(0x3a, &msr) < 0) return;
591         if((msr & 5) != 5){
592                 if((msr & 1) == 0){ /* msr still unlocked */
593                         wrmsr(0x3a, msr | 5);
594                         if(rdmsr(0x3a, &msr) < 0)
595                                 return;
596                 }
597                 if((msr & 5) != 5)
598                         return;
599         }
600         if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) return;
601         if((vlong)msr >= 0) return;
602         if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) return;
603         if((msr >> 32 & PROCB_EPT) == 0 || (msr >> 32 & PROCB_VPID) == 0) return;
604         vmx.state = VMXINACTIVE;
605         vmx.lastcmd = &vmx.firstcmd;
606         vmx.mem.next = &vmx.mem;
607         vmx.mem.prev = &vmx.mem;
608 }
609
610 static void
611 vmxshutdown(void)
612 {
613         if(vmx.state != NOVMX && vmx.state != VMXINACTIVE)
614                 vmxoff();
615 }
616
617 static void
618 vmcsinit(void)
619 {
620         vlong msr;
621         u32int x;
622         
623         memset(&vmx.ureg, 0, sizeof(vmx.ureg));
624         vmx.launched = 0;
625         vmx.onentry = 0;
626         
627         if(rdmsr(VMX_BASIC_MSR, &msr) < 0) error("rdmsr(VMX_BASIC_MSR) failed");
628         if((msr & 1ULL<<55) != 0){
629                 if(rdmsr(VMX_TRUE_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_TRUE_PROCB_CTLS_MSR) failed");
630                 if(rdmsr(VMX_TRUE_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_TRUE_PINB_CTLS_MSR) failed");
631         }else{
632                 if(rdmsr(VMX_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR) failed");
633                 if(rdmsr(VMX_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_PINB_CTLS_MSR) failed");
634         }
635
636         if(rdmsr(VMX_PINB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PINB_CTLS_MSR failed");
637         x = (u32int)pinb_ctls | 1<<1 | 1<<2 | 1<<4; /* currently reserved default1 bits */
638         x |= PINB_EXITIRQ | PINB_EXITNMI;
639         x &= pinb_ctls >> 32;
640         vmcswrite(PINB_CTLS, x);
641         
642         if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR failed");
643         x = (u32int)procb_ctls | 1<<1 | 7<<4 | 1<<8 | 1<<13 | 1<<14 | 1<<26; /* currently reserved default1 bits */
644         x |= PROCB_EXITHLT | PROCB_EXITMWAIT;
645         x |= PROCB_EXITMOVDR | PROCB_EXITIO | PROCB_EXITMONITOR | PROCB_EXITPAUSE;
646         x |= PROCB_USECTLS2;
647         x &= msr >> 32;
648         vmcswrite(PROCB_CTLS, x);
649         
650         if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS2_MSR failed");
651         x = PROCB_EPT | PROCB_VPID | PROCB_UNRESTR;
652         x &= msr >> 32;
653         vmcswrite(PROCB_CTLS2, x);
654         
655         if(rdmsr(VMX_VMEXIT_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMEXIT_CTLS_MSR failed");
656         x = (u32int)msr;
657         if(sizeof(uintptr) == 8) x |= VMEXIT_HOST64;
658         x &= msr >> 32;
659         vmcswrite(VMEXIT_CTLS, x);
660         
661         if(rdmsr(VMX_VMENTRY_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMENTRY_CTLS_MSR failed");
662         x = (u32int)msr;
663         if(sizeof(uintptr) == 8) x |= VMENTRY_GUEST64;
664         x &= msr >> 32;
665         vmcswrite(VMENTRY_CTLS, x);
666         
667         vmcswrite(CR3_TARGCNT, 0);
668         vmcswrite(VMEXIT_MSRLDCNT, 0);
669         vmcswrite(VMEXIT_MSRSTCNT, 0);
670         vmcswrite(VMENTRY_MSRLDCNT, 0);
671         vmcswrite(VMENTRY_INTRINFO, 0);
672         vmcswrite(VMCS_LINK, -1);
673         
674         vmcswrite(HOST_CS, KESEL);
675         vmcswrite(HOST_DS, KDSEL);
676         vmcswrite(HOST_ES, KDSEL);
677         vmcswrite(HOST_FS, KDSEL);
678         vmcswrite(HOST_GS, KDSEL);
679         vmcswrite(HOST_SS, KDSEL);
680         vmcswrite(HOST_TR, TSSSEL);
681         vmcswrite(HOST_CR0, getcr0() & ~0xe);
682         vmcswrite(HOST_CR3, getcr3());
683         vmcswrite(HOST_CR4, getcr4());
684         rdmsr(0xc0000100, &msr);
685         vmcswrite(HOST_FSBASE, msr);
686         rdmsr(0xc0000101, &msr);
687         vmcswrite(HOST_GSBASE, msr);
688         vmcswrite(HOST_TRBASE, (uintptr) m->tss);
689         vmcswrite(HOST_GDTR, (uintptr) m->gdt);
690         vmcswrite(HOST_IDTR, IDTADDR);
691         
692         vmcswrite(EXC_BITMAP, 1<<18);
693         vmcswrite(PFAULT_MASK, 0);
694         vmcswrite(PFAULT_MATCH, 0);
695         
696         vmcswrite(GUEST_CSBASE, 0);
697         vmcswrite(GUEST_DSBASE, 0);
698         vmcswrite(GUEST_ESBASE, 0);
699         vmcswrite(GUEST_FSBASE, 0);
700         vmcswrite(GUEST_GSBASE, 0);
701         vmcswrite(GUEST_SSBASE, 0);
702         vmcswrite(GUEST_CSLIMIT, -1);
703         vmcswrite(GUEST_DSLIMIT, -1);
704         vmcswrite(GUEST_ESLIMIT, -1);
705         vmcswrite(GUEST_FSLIMIT, -1);
706         vmcswrite(GUEST_GSLIMIT, -1);
707         vmcswrite(GUEST_SSLIMIT, -1);
708         vmcswrite(GUEST_CSPERM, (SEGG|SEGD|SEGP|SEGPL(0)|SEGEXEC|SEGR) >> 8 | 1);
709         vmcswrite(GUEST_DSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
710         vmcswrite(GUEST_ESPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
711         vmcswrite(GUEST_FSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
712         vmcswrite(GUEST_GSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
713         vmcswrite(GUEST_SSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
714         vmcswrite(GUEST_LDTRPERM, 1<<16);
715
716         enum {
717                 CR0RSVD = 0x1ffaffc0,
718                 CR4RSVD = 0xff889000,
719                 CR4VMXE = 1<<13,
720                 CR4SMXE = 1<<14,
721         };
722         vmcswrite(GUEST_CR0MASK, CR0RSVD | (uintptr)0xFFFFFFFF00000000ULL);
723         vmcswrite(GUEST_CR4MASK, CR4RSVD | CR4VMXE | CR4SMXE | (uintptr)0xFFFFFFFF00000000ULL);
724         vmcswrite(GUEST_CR0, getcr0() & ~(1<<31));
725         vmcswrite(GUEST_CR3, 0);
726         vmcswrite(GUEST_CR4, getcr4());
727         vmcswrite(GUEST_CR0SHADOW, getcr0());
728         vmcswrite(GUEST_CR4SHADOW, getcr4() & ~CR4VMXE);
729         
730         vmcswrite(GUEST_TRBASE, (uintptr) m->tss);
731         vmcswrite(GUEST_TRLIMIT, 0xffff);
732         vmcswrite(GUEST_TRPERM, (SEGTSS|SEGPL(0)|SEGP) >> 8 | 2);
733         
734         vmx.pml4 = mallocalign(BY2PG, BY2PG, 0, 0);
735         memset(vmx.pml4, 0, BY2PG);
736         vmcswrite(VM_EPTP, PADDR(vmx.pml4) | 3<<3);
737         vmx.vpid = 1;
738         vmcswrite(VM_VPID, vmx.vpid);
739         
740         vmcswrite(GUEST_RFLAGS, 2);
741         
742         vmx.onentry = FLUSHVPID | FLUSHEPT;
743         
744         vmx.fp = mallocalign(512, 512, 0, 0);
745         if(vmx.fp == nil)
746                 error(Enomem);
747         fpinit();
748         fpsave(vmx.fp);
749 }
750
751 static void
752 vmxstart(void)
753 {
754         static uchar *vmcs; /* also vmxon region */
755         vlong x;
756
757         putcr4(getcr4() | 0x2000);
758
759         if(vmcs == nil){
760                 vmcs = mallocalign(8192, 4096, 0, 0);
761                 if(vmcs == nil)
762                         error(Enomem);
763         }
764         memset(vmcs, 0, 8192);
765         rdmsr(VMX_BASIC_MSR, &x);
766         *(ulong*)vmcs = x;
767         *(ulong*)&vmcs[4096] = x;
768         if(vmxon(PADDR(vmcs + 4096)) < 0)
769                 error("vmxon failed");
770         if(vmclear(PADDR(vmcs)) < 0)
771                 error("vmclear failed");
772         if(vmptrld(PADDR(vmcs)) < 0)
773                 error("vmptrld failed");
774         vmcsinit();
775 }
776
777 static void
778 cmdrelease(VmCmd *p, int f)
779 {
780         lock(p);
781         p->flags |= CMDFDONE | f;
782         wakeup(p);
783         unlock(p);
784 }
785
786 static void
787 killcmds(VmCmd *notme)
788 {
789         VmCmd *p, *pn;
790         
791         for(p = vmx.postponed; p != nil; p = pn){
792                 pn = p->next;
793                 p->next = nil;
794                 if(p == notme) continue;
795                 kstrcpy(p->errstr, Equit, ERRMAX);
796                 cmdrelease(p, CMDFFAIL);
797         }
798         vmx.postponed = nil;
799         ilock(&vmx.cmdlock);
800         for(p = vmx.firstcmd; p != nil; p = pn){
801                 pn = p->next;
802                 p->next = nil;
803                 if(p == notme) continue;
804                 kstrcpy(p->errstr, Equit, ERRMAX);
805                 cmdrelease(p, CMDFFAIL);
806         }
807         vmx.firstcmd = nil;
808         vmx.lastcmd = &vmx.firstcmd;
809         iunlock(&vmx.cmdlock);
810 }
811
812 static int
813 cmdquit(VmCmd *p, va_list va)
814 {
815         vmx.state = VMXENDING;
816         cmdclearmeminfo(p, va);
817         killcmds(p);
818
819         free(vmx.pml4);
820         vmx.pml4 = nil;
821         vmx.got = 0;
822         vmx.onentry = 0;
823         vmx.stepmap = nil;
824
825         vmxoff();
826         vmx.state = VMXINACTIVE;
827         cmdrelease(p, 0);
828         pexit(Equit, 1);
829         return 0;
830 }
831
832 static void
833 processexit(void)
834 {
835         u32int reason;
836         
837         reason = vmcsread(VM_EXREASON);
838         if((reason & 1<<31) == 0)
839                 switch(reason & 0xffff){
840                 case 1: /* external interrupt */
841                 case 3: /* INIT */
842                 case 4: /* SIPI */
843                 case 5: /* IO SMI */
844                 case 6: /* SMI */
845                 case 7: /* IRQ window */
846                 case 8: /* NMI window */
847                         return;
848                 case 37:
849                         if((vmx.onentry & STEP) != 0){
850                                 vmx.state = VMXREADY;
851                                 vmx.got |= GOTSTEP;
852                                 vmx.onentry &= ~STEP;
853                                 return;
854                         }
855                         break;
856                 }
857         if((vmx.onentry & STEP) != 0){
858                 iprint("VMX: exit reason %#x when expected step...\n", reason & 0xffff);
859                 vmx.onentry &= ~STEP;
860                 vmx.got |= GOTSTEP|GOTSTEPERR;
861         }
862         vmx.state = VMXREADY;
863         vmx.got |= GOTEXIT;
864 }
865
866 static int
867 cmdgetregs(VmCmd *, va_list va)
868 {
869         char *p0, *e;
870         GuestReg *r;
871         uvlong val;
872         int s;
873         char *p;
874         
875         p0 = va_arg(va, char *);
876         e = va_arg(va, char *);
877         p = p0;
878         for(r = guestregs; r < guestregs + nelem(guestregs); r++){
879                 if(r->offset >= 0)
880                         val = vmcsread(r->offset);
881                 else
882                         val = *(uintptr*)((uchar*)&vmx.ureg + ~r->offset);
883                 s = r->size;
884                 if(s == 0) s = sizeof(uintptr);
885                 p = seprint(p, e, "%s %#.*llux\n", r->name, s * 2, val);
886         }
887         return p - p0;
888 }
889
890 static int
891 setregs(char *p0, char rs, char *fs)
892 {
893         char *p, *q, *rp;
894         char *f[10];
895         GuestReg *r;
896         uvlong val;
897         int sz;
898         int rc;
899
900         p = p0;
901         for(;;){
902                 q = strchr(p, rs);
903                 if(q == 0) break;
904                 *q = 0;
905                 rc = getfields(p, f, nelem(f), 1, fs);
906                 p = q + 1;
907                 if(rc == 0) continue;
908                 if(rc != 2) error("number of fields wrong");
909                 
910                 for(r = guestregs; r < guestregs + nelem(guestregs); r++)
911                         if(strcmp(r->name, f[0]) == 0)
912                                 break;
913                 if(r == guestregs + nelem(guestregs))
914                         error("unknown register");
915                 if(r->write != nil){
916                         r->write(f[1]);
917                         continue;
918                 }
919                 val = strtoull(f[1], &rp, 0);
920                 sz = r->size;
921                 if(sz == 0) sz = sizeof(uintptr);
922                 if(*rp != 0 || val >> 8 * sz != 0) error("invalid value");
923                 if(r->offset >= 0)
924                         vmcswrite(r->offset, val);
925                 else{
926                         assert((u32int)~r->offset + sz <= sizeof(Ureg)); 
927                         switch(sz){
928                         case 1: *(u8int*)((u8int*)&vmx.ureg + (u32int)~r->offset) = val; break;
929                         case 2: *(u16int*)((u8int*)&vmx.ureg + (u32int)~r->offset) = val; break;
930                         case 4: *(u32int*)((u8int*)&vmx.ureg + (u32int)~r->offset) = val; break;
931                         case 8: *(u64int*)((u8int*)&vmx.ureg + (u32int)~r->offset) = val; break;
932                         default: error(Egreg);
933                         }
934                 }
935         }
936         return p - p0;
937 }
938
939 static int
940 cmdsetregs(VmCmd *, va_list va)
941 {
942         return setregs(va_arg(va, char *), '\n', " \t");
943 }
944
945 static int
946 cmdgetfpregs(VmCmd *, va_list va)
947 {
948         uchar *p;
949         
950         p = va_arg(va, uchar *);
951         memmove(p, vmx.fp, sizeof(FPsave));
952         return sizeof(FPsave);
953 }
954
955 static int
956 cmdsetfpregs(VmCmd *, va_list va)
957 {
958         uchar *p;
959         ulong n;
960         vlong off;
961         
962         p = va_arg(va, uchar *);
963         n = va_arg(va, ulong);
964         off = va_arg(va, vlong);
965         if(off < 0 || off >= sizeof(FPsave)) n = 0;
966         else if(off + n > sizeof(FPsave)) n = sizeof(FPsave) - n;
967         memmove((uchar*)vmx.fp + off, p, n);
968         return n;
969 }
970
971 static int
972 cmdgo(VmCmd *, va_list va)
973 {
974         char *r;
975
976         if(vmx.state != VMXREADY)
977                 error("VM not ready");
978         r = va_arg(va, char *);
979         if(r != nil) setregs(r, ';', "=");
980         vmx.state = VMXRUNNING;
981         return 0;
982 }
983
984 static int
985 cmdstop(VmCmd *, va_list)
986 {
987         if(vmx.state != VMXREADY && vmx.state != VMXRUNNING)
988                 error("VM not ready or running");
989         vmx.state = VMXREADY;
990         return 0;
991 }
992
993 static int
994 cmdstatus(VmCmd *, va_list va)
995 {       
996         kstrcpy(va_arg(va, char *), vmx.errstr, ERRMAX);
997         return vmx.state;
998 }
999
/*
 * Names for VM exit reasons, indexed by the exit reason number.
 * Numbers without an entry are left nil; cmdwait prints those as "?N".
 */
static char *exitreasons[] = {
	[0] "exc",
	[1] "extirq",
	[2] "triplef",
	[3] "initsig",
	[4] "sipi",
	[5] "smiio",
	[6] "smiother",
	[7] "irqwin",
	[8] "nmiwin",
	[9] "taskswitch",
	[10] ".cpuid",
	[11] ".getsec",
	[12] ".hlt",
	[13] ".invd",
	[14] ".invlpg",
	[15] ".rdpmc",
	[16] ".rdtsc",
	[17] ".rsm",
	[18] ".vmcall",
	[19] ".vmclear",
	[20] ".vmlaunch",
	[21] ".vmptrld",
	[22] ".vmptrst",
	[23] ".vmread",
	[24] ".vmresume",
	[25] ".vmwrite",
	[26] ".vmxoff",
	[27] ".vmxon",
	[28] "movcr",
	[29] ".movdr",
	[30] "io",
	[31] ".rdmsr",
	[32] ".wrmsr",
	[33] "entrystate",
	[34] "entrymsr",
	[36] ".mwait",
	[37] "monitortrap",
	[39] ".monitor",
	[40] ".pause",
	[41] "mcheck",
	[43] "tpr",
	[44] "apicacc",
	[45] "eoi",
	[46] "gdtr_idtr",
	[47] "ldtr_tr",
	[48] "eptfault",
	[49] "eptinval",
	[50] ".invept",
	[51] ".rdtscp",
	[52] "preempt",
	[53] ".invvpid",
	[54] ".wbinvd",
	[55] ".xsetbv",
	[56] "apicwrite",
	[57] ".rdrand",
	[58] ".invpcid",
	[59] ".vmfunc",
	[60] ".encls",
	[61] ".rdseed",
	[62] "pmlfull",
	[63] ".xsaves",
	[64] ".xrstors",
};
1011
/*
 * Mnemonics for exception vectors, indexed by vector number.
 * Vectors without an entry are left nil.
 */
static char *except[] = {
	[0] "#de",
	[1] "#db",
	[3] "#bp",
	[4] "#of",
	[5] "#br",
	[6] "#ud",
	[7] "#nm",
	[8] "#df",
	[10] "#ts",
	[11] "#np",
	[12] "#ss",
	[13] "#gp",
	[14] "#pf",
	[16] "#mf",
	[17] "#ac",
	[18] "#mc",
	[19] "#xm",
	[20] "#ve",
};
1017
1018 static int
1019 cmdwait(VmCmd *cp, va_list va)
1020 {
1021         char *p, *p0, *e;
1022         u32int reason, intr;
1023         uvlong qual;
1024         u16int rno;
1025
1026         if(cp->scratched)
1027                 error(Eintr);
1028         p0 = p = va_arg(va, char *);
1029         e = va_arg(va, char *);
1030         if((vmx.got & GOTIRQACK) != 0){
1031                 p = seprint(p, e, "*ack %d\n", vmx.irqack.info & 0xff);
1032                 vmx.got &= ~GOTIRQACK;
1033                 return p - p0;
1034         }
1035         if((vmx.got & GOTEXIT) == 0){
1036                 cp->flags |= CMDFPOSTP;
1037                 return -1;
1038         }
1039         vmx.got &= ~GOTEXIT;
1040         reason = vmcsread(VM_EXREASON);
1041         qual = vmcsread(VM_EXQUALIF);
1042         rno = reason;
1043         intr = vmcsread(VM_EXINTRINFO);
1044         if((reason & 1<<31) != 0)
1045                 p = seprint(p, e, "!");
1046         if(rno == 0 && (intr & 1<<31) != 0){
1047                 if((intr & 0xff) >= nelem(except) || except[intr & 0xff] == nil)
1048                         p = seprint(p, e, "#%d ", intr & 0xff);
1049                 else
1050                         p = seprint(p, e, "%s ", except[intr & 0xff]);
1051         }else if(rno >= nelem(exitreasons) || exitreasons[rno] == nil)
1052                 p = seprint(p, e, "?%d ", rno);
1053         else
1054                 p = seprint(p, e, "%s ", exitreasons[rno]);
1055         p = seprint(p, e, "%#ullx pc %#ullx sp %#ullx ilen %#ullx iinfo %#ullx", qual, vmcsread(GUEST_RIP), vmcsread(GUEST_RSP), vmcsread(VM_EXINSTRLEN), vmcsread(VM_EXINSTRINFO));
1056         if((intr & 1<<11) != 0) p = seprint(p, e, " excode %#ullx", vmcsread(VM_EXINTRCODE));
1057         if(rno == 48 && (qual & 0x80) != 0) p = seprint(p, e, " va %#ullx", vmcsread(VM_GUESTVA));
1058         if(rno == 48 || rno == 49) p = seprint(p, e, " pa %#ullx", vmcsread(VM_GUESTPA));
1059         if(rno == 30) p = seprint(p, e, " ax %#ullx", (uvlong)vmx.ureg.ax);
1060         p = seprint(p, e, "\n");
1061         return p - p0;
1062 }
1063
1064 static int
1065 cmdstep(VmCmd *cp, va_list va)
1066 {
1067         switch(cp->retval){
1068         case 0:
1069                 if((vmx.got & GOTSTEP) != 0 || (vmx.onentry & STEP) != 0)
1070                         error(Einuse);
1071                 if(vmx.state != VMXREADY){
1072                         iprint("pre-step in state %s\n", statenames[vmx.state]);
1073                         error("not ready");
1074                 }
1075                 vmx.stepmap = va_arg(va, VmMem *);
1076                 vmx.onentry |= STEP;
1077                 vmx.state = VMXRUNNING;
1078                 cp->flags |= CMDFPOSTP;
1079                 return 1;
1080         case 1:
1081                 if(vmx.state != VMXREADY){
1082                         iprint("post-step in state %s\n", statenames[vmx.state]);
1083                         vmx.onentry &= ~STEP;
1084                         vmx.got &= ~(GOTSTEP|GOTSTEPERR);
1085                         error("not ready");
1086                 }
1087                 if((vmx.got & GOTSTEP) == 0){
1088                         cp->flags |= CMDFPOSTP;
1089                         return 1;
1090                 }
1091                 if((vmx.got & GOTSTEPERR) != 0){
1092                         vmx.got &= ~(GOTSTEP|GOTSTEPERR);
1093                         error("step failed");
1094                 }
1095                 vmx.got &= ~(GOTSTEP|GOTSTEPERR);
1096                 return 1;
1097         }
1098         return 0;
1099 }
1100
1101 static void
1102 eventparse(char *p, VmIntr *vi)
1103 {
1104         char *q, *r;
1105         int i;
1106         
1107         memset(vi, 0, sizeof(VmIntr));
1108         q = nil;
1109         kstrdup(&q, p);
1110         if(waserror()){
1111                 free(q);
1112                 memset(vi, 0, sizeof(VmIntr));
1113                 nexterror();
1114         }
1115         vi->info = 1<<31;
1116         r = strchr(q, ',');
1117         if(r != nil) *r++ = 0;
1118         for(i = 0; i < nelem(except); i++)
1119                 if(except[i] != nil && strcmp(except[i], q) == 0)
1120                         break;
1121         if(*q == '#'){
1122                 q++;
1123                 vi->info |= 3 << 8;
1124         }
1125         if(i == nelem(except)){
1126                 i = strtoul(q, &q, 10);
1127                 if(*q != 0 || i > 255) error(Ebadctl);
1128         }
1129         vi->info |= i;
1130         if((vi->info & 0x7ff) == 3 || (vi->info & 0x7ff) == 4)
1131                 vi->info += 3 << 8;
1132         if(r == nil) goto out;
1133         if(*r != ','){
1134                 vi->code = strtoul(r, &r, 0);
1135                 vi->info |= 1<<11;
1136         }else r++;
1137         if(*r == ',')
1138                 vi->ilen = strtoul(r + 1, &r, 0);
1139         if(*r != 0) error(Ebadctl);
1140 out:
1141         poperror();
1142         free(q);
1143 }
1144
1145 static int
1146 cmdexcept(VmCmd *cp, va_list va)
1147 {
1148         if(cp->scratched) error(Eintr);
1149         if((vmx.onentry & POSTEX) != 0){
1150                 cp->flags |= CMDFPOSTP;
1151                 return 0;
1152         }
1153         eventparse(va_arg(va, char *), &vmx.exc);
1154         vmx.onentry |= POSTEX;
1155         return 0;
1156 }
1157
1158 static int
1159 cmdirq(VmCmd *, va_list va)
1160 {
1161         char *p;
1162         VmIntr vi;
1163         
1164         p = va_arg(va, char *);
1165         if(p == nil)
1166                 vmx.onentry &= ~POSTIRQ;
1167         else{
1168                 eventparse(p, &vi);
1169                 vmx.irq = vi;
1170                 vmx.onentry |= POSTIRQ;
1171         }
1172         return 0;
1173 }
1174
1175
1176 static int
1177 gotcmd(void *)
1178 {
1179         int rc;
1180
1181         ilock(&vmx.cmdlock);
1182         rc = vmx.firstcmd != nil;
1183         iunlock(&vmx.cmdlock);
1184         return rc;
1185 }
1186
1187 static void
1188 markcmddone(VmCmd *p, VmCmd ***pp)
1189 {
1190         if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP){
1191                 **pp = p;
1192                 *pp = &p->next;
1193         }else{
1194                 p->flags = p->flags & ~CMDFPOSTP;
1195                 cmdrelease(p, 0);
1196         }
1197 }
1198
1199 static VmCmd **
1200 markppcmddone(VmCmd **pp)
1201 {
1202         VmCmd *p;
1203         
1204         p = *pp;
1205         if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP)
1206                 return &p->next;
1207         *pp = p->next;
1208         p->next = nil;
1209         p->flags = p->flags & ~CMDFPOSTP;
1210         cmdrelease(p, 0);
1211         return pp;
1212 }
1213
1214
1215 static void
1216 runcmd(void)
1217 {
1218         VmCmd *p, **pp;
1219         
1220         for(pp = &vmx.postponed; p = *pp, p != nil; ){
1221                 if(waserror()){
1222                         kstrcpy(p->errstr, up->errstr, ERRMAX);
1223                         p->flags |= CMDFFAIL;
1224                         pp = markppcmddone(pp);
1225                         continue;
1226                 }
1227                 p->flags &= ~CMDFPOSTP;
1228                 p->retval = p->cmd(p, p->va);
1229                 poperror();
1230                 pp = markppcmddone(pp);
1231         }
1232         for(;;){
1233                 ilock(&vmx.cmdlock);
1234                 p = vmx.firstcmd;
1235                 if(p == nil){
1236                         iunlock(&vmx.cmdlock);
1237                         break;
1238                 }
1239                 vmx.firstcmd = p->next;
1240                 if(vmx.lastcmd == &p->next)
1241                         vmx.lastcmd = &vmx.firstcmd;
1242                 iunlock(&vmx.cmdlock);
1243                 p->next = nil;
1244                 if(waserror()){
1245                         kstrcpy(p->errstr, up->errstr, ERRMAX);
1246                         p->flags |= CMDFFAIL;
1247                         markcmddone(p, &pp);
1248                         continue;
1249                 }
1250                 if(p->scratched) error(Eintr);
1251                 p->retval = p->cmd(p, p->va);
1252                 poperror();
1253                 markcmddone(p, &pp);
1254         }
1255 }
1256
1257 static void
1258 dostep(int setup)
1259 {
1260         static uvlong oldmap;
1261         static uvlong *mapptr;
1262
1263         if(setup){
1264                 if(vmx.stepmap != nil){
1265                         mapptr = eptwalk(vmx.stepmap->lo);
1266                         oldmap = *mapptr;
1267                         epttranslate(vmx.stepmap);
1268                 }
1269         }else{
1270                 vmcswrite(PROCB_CTLS, vmcsread(PROCB_CTLS) & ~(uvlong)PROCB_MONTRAP);
1271                 if(vmx.stepmap != nil){
1272                         *mapptr = oldmap;
1273                         vmx.stepmap = nil;
1274                         vmx.onentry |= FLUSHEPT;
1275                 }
1276         }
1277 }
1278
1279 static void
1280 vmxproc(void *)
1281 {
1282         int init;
1283         u32int procbctls, defprocbctls;
1284
1285         procwired(up, 0);
1286         sched();
1287         init = 0;
1288         defprocbctls = 0;
1289         while(waserror()){
1290                 kstrcpy(vmx.errstr, up->errstr, ERRMAX);
1291                 vmx.state = VMXDEAD;
1292         }
1293         for(;;){
1294                 if(!init){
1295                         init = 1;
1296                         vmxstart();
1297                         vmx.state = VMXREADY;
1298                         defprocbctls = vmcsread(PROCB_CTLS);
1299                 }
1300                 runcmd();
1301                 if(vmx.state == VMXRUNNING){
1302                         procbctls = defprocbctls;
1303                         if((vmx.onentry & STEP) != 0){
1304                                 procbctls |= PROCB_MONTRAP;
1305                                 dostep(1);
1306                                 if(waserror()){
1307                                         dostep(0);
1308                                         nexterror();
1309                                 }
1310                         }
1311                         if((vmx.onentry & POSTEX) != 0){
1312                                 vmcswrite(VMENTRY_INTRINFO, vmx.exc.info);
1313                                 vmcswrite(VMENTRY_INTRCODE, vmx.exc.code);
1314                                 vmcswrite(VMENTRY_INTRILEN, vmx.exc.ilen);
1315                                 vmx.onentry &= ~POSTEX;
1316                         }
1317                         if((vmx.onentry & POSTIRQ) != 0 && (vmx.onentry & STEP) == 0){
1318                                 if((vmx.onentry & POSTEX) == 0 && (vmcsread(GUEST_RFLAGS) & 1<<9) != 0 && (vmcsread(GUEST_CANINTR) & 3) == 0){
1319                                         vmcswrite(VMENTRY_INTRINFO, vmx.irq.info);
1320                                         vmcswrite(VMENTRY_INTRCODE, vmx.irq.code);
1321                                         vmcswrite(VMENTRY_INTRILEN, vmx.irq.ilen);
1322                                         vmx.onentry &= ~POSTIRQ;
1323                                         vmx.got |= GOTIRQACK;
1324                                         vmx.irqack = vmx.irq;
1325                                 }else
1326                                         procbctls |= PROCB_IRQWIN;
1327                         }
1328                         if((vmx.onentry & FLUSHVPID) != 0){
1329                                 if(invvpid(INVLOCAL, vmx.vpid, 0) < 0)
1330                                         error("invvpid failed");
1331                                 vmx.onentry &= ~FLUSHVPID;
1332                         }
1333                         if((vmx.onentry & FLUSHEPT) != 0){
1334                                 if(invept(INVLOCAL, PADDR(vmx.pml4) | 3<<3, 0) < 0)
1335                                         error("invept failed");
1336                                 vmx.onentry &= ~FLUSHEPT;
1337                         }
1338                         vmcswrite(PROCB_CTLS, procbctls);
1339                         vmx.got &= ~GOTEXIT;
1340                         if(vmlaunch(&vmx.ureg, vmx.launched, vmx.fp) < 0)
1341                                 error("vmlaunch failed");
1342                         vmx.launched = 1;
1343                         if((vmx.onentry & STEP) != 0){
1344                                 dostep(0);
1345                                 poperror();
1346                         }
1347                         processexit();
1348                 }else{
1349                         up->psstate = "Idle";
1350                         sleep(&vmx.cmdwait, gotcmd, nil);
1351                         up->psstate = nil;
1352                 }
1353         }
1354 }
1355
/* qid paths of the files served by the device, in vmxdir order */
enum {
	Qdir = 0,	/* the directory itself */
	Qctl,
	Qregs,
	Qstatus,
	Qmap,
	Qwait,
	Qfpregs,
};
1365
1366 static Dirtab vmxdir[] = {
1367         ".",            { Qdir, 0, QTDIR },     0,              0550,
1368         "ctl",          { Qctl, 0, 0 },         0,              0660,
1369         "regs",         { Qregs, 0, 0 },        0,              0660,
1370         "status",       { Qstatus, 0, 0 },      0,              0440,
1371         "map",          { Qmap, 0, 0 },         0,              0660,
1372         "wait",         { Qwait, 0, 0 },        0,              0440,
1373         "fpregs",       { Qfpregs, 0, 0 },      0,              0660,
1374 };
1375
/* indices of the ctl file commands, matching the entries of vmxctlmsg */
enum {
	CMinit = 0,
	CMquit,
	CMgo,
	CMstop,
	CMstep,
	CMexc,
	CMirq,
};
1385
1386 static Cmdtab vmxctlmsg[] = {
1387         CMinit,         "init",         1,
1388         CMquit,         "quit",         1,
1389         CMgo,           "go",           0,
1390         CMstop,         "stop",         1,
1391         CMstep,         "step",         0,
1392         CMexc,          "exc",          2,
1393         CMirq,          "irq",          0,
1394 };
1395
1396 static int
1397 iscmddone(void *cp)
1398 {
1399         return (((VmCmd*)cp)->flags & CMDFDONE) != 0;
1400 }
1401
1402 static int
1403 vmxcmd(int (*f)(VmCmd *, va_list), ...)
1404 {
1405         VmCmd cmd;
1406         
1407         if(vmx.state == VMXINACTIVE)
1408                 error("no VM");
1409         if(vmx.state == VMXENDING)
1410         ending:
1411                 error(Equit);
1412         memset(&cmd, 0, sizeof(VmCmd));
1413         cmd.errstr = up->errstr;
1414         cmd.cmd = f;
1415         va_start(cmd.va, f);
1416          
1417         ilock(&vmx.cmdlock);
1418         if(vmx.state == VMXENDING){
1419                 iunlock(&vmx.cmdlock);
1420                 goto ending;
1421         }
1422         *vmx.lastcmd = &cmd;
1423         vmx.lastcmd = &cmd.next;
1424         iunlock(&vmx.cmdlock);
1425         
1426         while(waserror())
1427                 cmd.scratched = 1;
1428         wakeup(&vmx.cmdwait);
1429         do
1430                 sleep(&cmd, iscmddone, &cmd);
1431         while(!iscmddone(&cmd));
1432         poperror();
1433         lock(&cmd);
1434         unlock(&cmd);
1435         if((cmd.flags & CMDFFAIL) != 0)
1436                 error(up->errstr);
1437         return cmd.retval;
1438 }
1439
1440 static Chan *
1441 vmxattach(char *spec)
1442 {
1443         if(vmx.state == NOVMX) error(Enodev);
1444         return devattach('X', spec);
1445 }
1446
1447 static Walkqid*
1448 vmxwalk(Chan *c, Chan *nc, char **name, int nname)
1449 {
1450         return devwalk(c, nc, name, nname, vmxdir, nelem(vmxdir), devgen);
1451 }
1452
1453 static int
1454 vmxstat(Chan *c, uchar *dp, int n)
1455 {
1456         return devstat(c, dp, n, vmxdir, nelem(vmxdir), devgen);
1457 }
1458
1459 static Chan*
1460 vmxopen(Chan* c, int omode)
1461 {
1462         Chan *ch;
1463
1464         if(c->qid.path != Qdir && !iseve()) error(Eperm);
1465         ch = devopen(c, omode, vmxdir, nelem(vmxdir), devgen);
1466         if(ch->qid.path == Qmap){
1467                 if((omode & OTRUNC) != 0)
1468                         vmxcmd(cmdclearmeminfo);
1469         }
1470         return ch;
1471 }
1472
1473 static void
1474 vmxclose(Chan*)
1475 {
1476 }
1477
1478 static long
1479 vmxread(Chan* c, void* a, long n, vlong off)
1480 {
1481         static char regbuf[4096];
1482         static char membuf[4096];
1483         int rc;
1484
1485         switch((ulong)c->qid.path){
1486         case Qdir:
1487                 return devdirread(c, a, n, vmxdir, nelem(vmxdir), devgen);
1488         case Qregs:
1489                 if(off == 0)
1490                         vmxcmd(cmdgetregs, regbuf, regbuf + sizeof(regbuf));
1491                 return readstr(off, a, n, regbuf);
1492         case Qmap:
1493                 if(off == 0)
1494                         vmxcmd(cmdgetmeminfo, membuf, membuf + sizeof(membuf));
1495                 return readstr(off, a, n, membuf);
1496         case Qstatus:
1497                 {
1498                         char buf[ERRMAX+128];
1499                         char errbuf[ERRMAX];
1500                         int status;
1501                         
1502                         status = vmx.state;
1503                         if(status == VMXDEAD){
1504                                 vmxcmd(cmdstatus, errbuf);
1505                                 snprint(buf, sizeof(buf), "%s %#q\n", statenames[status], errbuf);
1506                         }else if(status >= 0 && status < nelem(statenames))
1507                                 snprint(buf, sizeof(buf), "%s\n", statenames[status]);
1508                         else
1509                                 snprint(buf, sizeof(buf), "%d\n", status);
1510                         return readstr(off, a, n, buf);
1511                 }
1512         case Qwait:
1513                 {
1514                         char buf[512];
1515                         
1516                         rc = vmxcmd(cmdwait, buf, buf + sizeof(buf));
1517                         if(rc > n) rc = n;
1518                         if(rc > 0) memmove(a, buf, rc);
1519                         return rc;
1520                 }
1521         case Qfpregs:
1522                 {
1523                         char buf[sizeof(FPsave)];
1524                         
1525                         vmxcmd(cmdgetfpregs, buf);
1526                         if(n < 0 || off < 0 || off >= sizeof(buf)) n = 0;
1527                         else if(off + n > sizeof(buf)) n = sizeof(buf) - off;
1528                         if(n != 0) memmove(a, buf + off, n);
1529                         return n;
1530                 }
1531         default:
1532                 error(Egreg);
1533                 break;
1534         }
1535         return 0;
1536 }
1537
1538 static long
1539 vmxwrite(Chan* c, void* a, long n, vlong off)
1540 {
1541         static QLock initlock;
1542         Cmdbuf *cb;
1543         Cmdtab *ct;
1544         char *s;
1545         int rc;
1546         int i;
1547         VmMem tmpmem;
1548
1549         switch((ulong)c->qid.path){
1550         case Qdir:
1551                 error(Eperm);
1552         case Qctl:
1553                 cb = parsecmd(a, n);
1554                 if(waserror()){
1555                         free(cb);
1556                         nexterror();
1557                 }
1558                 ct = lookupcmd(cb, vmxctlmsg, nelem(vmxctlmsg));
1559                 switch(ct->index){
1560                 case CMinit:
1561                         qlock(&initlock);
1562                         if(waserror()){
1563                                 qunlock(&initlock);
1564                                 nexterror();
1565                         }
1566                         if(vmx.state != VMXINACTIVE)
1567                                 error("vmx already active");
1568                         vmx.state = VMXINIT;
1569                         kproc("kvmx", vmxproc, nil);
1570                         poperror();
1571                         qunlock(&initlock);
1572                         if(vmxcmd(cmdstatus, up->errstr) == VMXDEAD)
1573                                 error(up->errstr);
1574                         break;
1575                 case CMquit:
1576                         vmxcmd(cmdquit);
1577                         break;
1578                 case CMgo:
1579                         s = nil;
1580                         if(cb->nf == 2) kstrdup(&s, cb->f[1]);
1581                         else if(cb->nf != 1) error(Ebadarg);
1582                         if(waserror()){
1583                                 free(s);
1584                                 nexterror();
1585                         }
1586                         vmxcmd(cmdgo, s);
1587                         poperror();
1588                         free(s);
1589                         break;
1590                 case CMstop:
1591                         vmxcmd(cmdstop);
1592                         break;
1593                 case CMstep:
1594                         rc = 0;
1595                         for(i = 1; i < cb->nf; i++)
1596                                 if(strcmp(cb->f[i], "-map") == 0){
1597                                         rc = 1;
1598                                         if(i+4 > cb->nf) error("missing argument");
1599                                         memset(&tmpmem, 0, sizeof(tmpmem));
1600                                         tmpmem.lo = strtoull(cb->f[i+1], &s, 0);
1601                                         if(*s != 0 || !vmokpage(tmpmem.lo)) error("invalid address");
1602                                         tmpmem.hi = tmpmem.lo + BY2PG;
1603                                         tmpmem.attr = 0x407;
1604                                         tmpmem.seg = _globalsegattach(cb->f[i+2]);
1605                                         if(tmpmem.seg == nil) error("unknown segment");
1606                                         tmpmem.off = strtoull(cb->f[i+3], &s, 0);
1607                                         if(*s != 0 || !vmokpage(tmpmem.off)) error("invalid offset");
1608                                         i += 3;
1609                                 }else
1610                                         error(Ebadctl);
1611                         vmxcmd(cmdstep, rc ? &tmpmem : nil);
1612                         break;
1613                 case CMexc:
1614                         s = nil;
1615                         kstrdup(&s, cb->f[1]);
1616                         if(waserror()){
1617                                 free(s);
1618                                 nexterror();
1619                         }
1620                         vmxcmd(cmdexcept, s);
1621                         poperror();
1622                         free(s);
1623                         break;
1624                 case CMirq:
1625                         s = nil;
1626                         if(cb->nf == 2)
1627                                 kstrdup(&s, cb->f[1]);
1628                         if(waserror()){
1629                                 free(s);
1630                                 nexterror();
1631                         }
1632                         vmxcmd(cmdirq, s);
1633                         poperror();
1634                         free(s);
1635                         break;
1636                 default:
1637                         error(Egreg);
1638                 }
1639                 poperror();
1640                 free(cb);
1641                 break;
1642         case Qmap:
1643         case Qregs:
1644                 s = malloc(n+1);
1645                 if(s == nil) error(Enomem);
1646                 if(waserror()){
1647                         free(s);
1648                         nexterror();
1649                 }
1650                 memmove(s, a, n);
1651                 s[n] = 0;
1652                 rc = vmxcmd((ulong)c->qid.path == Qregs ? cmdsetregs : cmdsetmeminfo, s);
1653                 poperror();
1654                 free(s);
1655                 return rc;
1656         case Qfpregs:
1657                 {
1658                         char buf[sizeof(FPsave)];
1659                         
1660                         if(n > sizeof(FPsave)) n = sizeof(FPsave);
1661                         memmove(buf, a, n);
1662                         return vmxcmd(cmdsetfpregs, buf, n, off);
1663                 }
1664         default:
1665                 error(Egreg);
1666                 break;
1667         }
1668         return n;
1669 }
1670
1671 Dev vmxdevtab = {
1672         'X',
1673         "vmx",
1674         
1675         vmxreset,
1676         devinit,
1677         vmxshutdown,
1678         vmxattach,
1679         vmxwalk,
1680         vmxstat,
1681         vmxopen,
1682         devcreate,
1683         vmxclose,
1684         vmxread,
1685         devbread,
1686         vmxwrite,
1687         devbwrite,
1688         devremove,
1689         devwstat,
1690 };