]> git.lizzy.rs Git - plan9front.git/blob - sys/src/9/pc/devvmx.c
devvmx: support pat and efer registers
[plan9front.git] / sys / src / 9 / pc / devvmx.c
1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7 #include "ureg.h"
8
/*
 * VMX primitives defined outside this file.  The callers below treat a
 * negative return value as failure (see vmcsread, vmcswrite, vmxstart).
 * NOTE(review): presumably thin wrappers around the corresponding CPU
 * instructions -- confirm against their definitions.
 */
9 extern int vmxon(u64int);
10 extern int vmxoff(void);
11 extern int vmclear(u64int);
12 extern int vmptrld(u64int);
13 extern int vmlaunch(Ureg *, int, FPsave *);
14 extern int vmread(u32int, uintptr *);
15 extern int vmwrite(u32int, uintptr);
16 extern int invept(u32int, uvlong, uvlong);
17 extern int invvpid(u32int, uvlong, uvlong);
18
/*
 * Values of the processor-based and pin-based control capability MSRs,
 * filled in by vmcsinit (from the TRUE_ variants when VMX_BASIC_MSR bit 55
 * is set, otherwise from the plain ones).
 */
19 static vlong procb_ctls, pinb_ctls;
20
/*
 * Constants used throughout this file:
 *  - VMX_*_MSR / VMX_CR*_FIXED*: MSR numbers (the ones used here are read with rdmsr);
 *  - everything else with a 0x.... value: field encodings handed to
 *    vmcsread/vmcswrite as the "addr" argument;
 *  - the 1<<n values: bit masks OR-ed into the control field named just above them.
 */
21 enum {
        /* MSR numbers */
22         VMX_BASIC_MSR = 0x480,
23         VMX_PINB_CTLS_MSR = 0x481,
24         VMX_PROCB_CTLS_MSR = 0x482,
25         VMX_VMEXIT_CTLS_MSR = 0x483,
26         VMX_VMENTRY_CTLS_MSR = 0x484,
27         VMX_MISC_MSR = 0x485,
28         VMX_CR0_FIXED0 = 0x486,
29         VMX_CR0_FIXED1 = 0x487,
30         VMX_CR4_FIXED0 = 0x488,
31         VMX_CR4_FIXED1 = 0x489,
32         VMX_VMCS_ENUM = 0x48A,
33         VMX_PROCB_CTLS2_MSR = 0x48B,
34         VMX_TRUE_PINB_CTLS_MSR = 0x48D,
35         VMX_TRUE_PROCB_CTLS_MSR = 0x48E,
36         VMX_TRUE_EXIT_CTLS_MSR = 0x48F,
37         VMX_TRUE_ENTRY_CTLS_MSR = 0x490,
38         VMX_VMFUNC_MSR = 0x491,
39         
        /* pin-based controls field and its bits (written in vmcsinit) */
40         PINB_CTLS = 0x4000,
41         PINB_EXITIRQ = 1<<0,
42         PINB_EXITNMI = 1<<3,
43         
        /* primary processor-based controls field and its bits (written in vmcsinit) */
44         PROCB_CTLS = 0x4002,
45         PROCB_IRQWIN = 1<<2,
46         PROCB_EXITHLT = 1<<7,
47         PROCB_EXITINVLPG = 1<<9,
48         PROCB_EXITMWAIT = 1<<10,
49         PROCB_EXITRDPMC = 1<<11,
50         PROCB_EXITRDTSC = 1<<12,
51         PROCB_EXITCR3LD = 1<<15,
52         PROCB_EXITCR3ST = 1<<16,
53         PROCB_EXITCR8LD = 1<<19,
54         PROCB_EXITCR8ST = 1<<20,
55         PROCB_EXITMOVDR = 1<<23,
56         PROCB_EXITIO = 1<<24,
57         PROCB_MONTRAP = 1<<27,
58         PROCB_EXITMONITOR = 1<<29,
59         PROCB_EXITPAUSE = 1<<30,
60         PROCB_USECTLS2 = 1<<31,
61         
        /* secondary processor-based controls field and its bits (written in vmcsinit) */
62         PROCB_CTLS2 = 0x401E,
63         PROCB_EPT = 1<<1,
64         PROCB_EXITGDT = 1<<2,
65         PROCB_VPID = 1<<5,
66         PROCB_UNRESTR = 1<<7,
67
68         EXC_BITMAP = 0x4004,
69         PFAULT_MASK = 0x4006,
70         PFAULT_MATCH = 0x4008,
71         CR3_TARGCNT = 0x400a,
72         
        /* VM-exit controls field and its bits (written in vmcsinit) */
73         VMEXIT_CTLS = 0x400c,
74         VMEXIT_HOST64 = 1<<9,
75         VMEXIT_LD_IA32_PERF_GLOBAL_CTRL = 1<<12,
76         VMEXIT_ST_IA32_PAT = 1<<18,
77         VMEXIT_LD_IA32_PAT = 1<<19,
78         VMEXIT_ST_IA32_EFER = 1<<20,
79         VMEXIT_LD_IA32_EFER = 1<<21,    
80         
81         VMEXIT_MSRSTCNT = 0x400e,
82         VMEXIT_MSRLDCNT = 0x4010,
83         
        /* VM-entry controls field and its bits (written in vmcsinit) */
84         VMENTRY_CTLS = 0x4012,
85         VMENTRY_GUEST64 = 1<<9,
86         VMENTRY_LD_IA32_PERF_GLOBAL_CTRL = 1<<13,
87         VMENTRY_LD_IA32_PAT = 1<<14,
88         VMENTRY_LD_IA32_EFER = 1<<15,
89         
90         VMENTRY_MSRLDCNT = 0x4014,
91         VMENTRY_INTRINFO = 0x4016,
92         VMENTRY_INTRCODE = 0x4018,
93         VMENTRY_INTRILEN = 0x401a,
94         
95         VMCS_LINK = 0x2800,
96         
        /* guest state fields; most are exported by name through guestregs[] */
97         GUEST_ES = 0x800,
98         GUEST_CS = 0x802,
99         GUEST_SS = 0x804,
100         GUEST_DS = 0x806,
101         GUEST_FS = 0x808,
102         GUEST_GS = 0x80A,
103         GUEST_LDTR = 0x80C,
104         GUEST_TR = 0x80E,
105         GUEST_CR0 = 0x6800,
106         GUEST_CR3 = 0x6802,
107         GUEST_CR4 = 0x6804,
108         GUEST_ESLIMIT = 0x4800,
109         GUEST_CSLIMIT = 0x4802,
110         GUEST_SSLIMIT = 0x4804,
111         GUEST_DSLIMIT = 0x4806,
112         GUEST_FSLIMIT = 0x4808,
113         GUEST_GSLIMIT = 0x480A,
114         GUEST_LDTRLIMIT = 0x480C,
115         GUEST_TRLIMIT = 0x480E,
116         GUEST_GDTRLIMIT = 0x4810,
117         GUEST_IDTRLIMIT = 0x4812,
118         GUEST_ESPERM = 0x4814,
119         GUEST_CSPERM = 0x4816,
120         GUEST_SSPERM = 0x4818,
121         GUEST_DSPERM = 0x481A,
122         GUEST_FSPERM = 0x481C,
123         GUEST_GSPERM = 0x481E,
124         GUEST_LDTRPERM = 0x4820,
125         GUEST_TRPERM = 0x4822,
126         GUEST_CR0MASK = 0x6000,
127         GUEST_CR4MASK = 0x6002,
128         GUEST_CR0SHADOW = 0x6004,
129         GUEST_CR4SHADOW = 0x6006,
130         GUEST_ESBASE = 0x6806,
131         GUEST_CSBASE = 0x6808,
132         GUEST_SSBASE = 0x680A,
133         GUEST_DSBASE = 0x680C,
134         GUEST_FSBASE = 0x680E,
135         GUEST_GSBASE = 0x6810,
136         GUEST_LDTRBASE = 0x6812,
137         GUEST_TRBASE = 0x6814,
138         GUEST_GDTRBASE = 0x6816,
139         GUEST_IDTRBASE = 0x6818,
140         GUEST_DR7 = 0x681A,
141         GUEST_RSP = 0x681C,
142         GUEST_RIP = 0x681E,
143         GUEST_RFLAGS = 0x6820,
144         GUEST_IA32_DEBUGCTL = 0x2802,
145         GUEST_IA32_PAT = 0x2804,
146         GUEST_IA32_EFER = 0x2806,
147         GUEST_IA32_PERF_GLOBAL_CTRL = 0x2808,
148         
        /* host state fields; the ones used here are loaded once in vmcsinit */
149         HOST_ES = 0xC00,
150         HOST_CS = 0xC02,
151         HOST_SS = 0xC04,
152         HOST_DS = 0xC06,
153         HOST_FS = 0xC08,
154         HOST_GS = 0xC0A,
155         HOST_TR = 0xC0C,
156         HOST_CR0 = 0x6C00,
157         HOST_CR3 = 0x6C02,
158         HOST_CR4 = 0x6C04,
159         HOST_FSBASE = 0x6C06,
160         HOST_GSBASE = 0x6C08,
161         HOST_TRBASE = 0x6C0A,
162         HOST_GDTR = 0x6C0C,
163         HOST_IDTR = 0x6C0E,
164         HOST_RSP = 0x6C14,
165         HOST_RIP = 0x6C16,
166         HOST_IA32_PAT = 0x2C00,
167         HOST_IA32_EFER = 0x2C02,
168         HOST_IA32_PERF_GLOBAL_CTRL = 0x2C04,
169         
170         GUEST_CANINTR = 0x4824,
171         
        /* exit information fields; exported read-only through guestregs[] */
172         VM_INSTRERR = 0x4400,
173         VM_EXREASON = 0x4402,
174         VM_EXINTRINFO = 0x4404,
175         VM_EXINTRCODE = 0x4406,
176         VM_IDTVECINFO = 0x4408,
177         VM_IDTVECCODE = 0x440A,
178         VM_EXINSTRLEN = 0x440C,
179         VM_EXINSTRINFO = 0x440E,
180         VM_EXQUALIF = 0x6400,
181         VM_IORCX = 0x6402,
182         VM_IORSI = 0x6404,
183         VM_IORDI = 0x6406,
184         VM_IORIP = 0x6408,
185         VM_GUESTVA = 0x640A,
186         VM_GUESTPA = 0x2400,
187         
188         VM_VPID = 0x000,
189         VM_EPTPIDX = 0x0004,
190         
191         VM_EPTP = 0x201A,
192         VM_EPTPLA = 0x2024,
193         
        /* NOTE(review): not used in the visible part of the file; presumably an invept/invvpid type -- confirm */
194         INVLOCAL = 1,
195 };
196
/*
 * Bit sets for the guest's CR0 and CR4.
 * CR0KERNEL and CR4KERNEL are the bits that stay under kernel control:
 * vmcsinit loads them into GUEST_CR0MASK/GUEST_CR4MASK, and
 * cr0realwrite/cr4realwrite keep these bits from the current VMCS value
 * and take only the remaining bits from the user-supplied value.
 */
197 enum {
198         CR0RSVD = 0x1ffaffc0,
199         CR4RSVD = 0xff889000,
200         CR4MCE = 1<<6,
201         CR4VMXE = 1<<13,
202         CR4SMXE = 1<<14,
203         CR4PKE = 1<<22,
204         
        /* the upper 32 bits are always kernel-owned */
205         CR0KERNEL = CR0RSVD | (uintptr)0xFFFFFFFF00000000ULL,
206         CR4KERNEL = CR4RSVD | CR4VMXE | CR4SMXE | CR4MCE | CR4PKE | (uintptr)0xFFFFFFFF00000000ULL
207 };
208
209 typedef struct Vmx Vmx;
210 typedef struct VmCmd VmCmd;
211 typedef struct VmMem VmMem;
212 typedef struct VmIntr VmIntr;
213
/*
 * One guest-physical memory region, as configured by cmdsetmeminfo and
 * reported by cmdgetmeminfo.  Regions are kept on a circular doubly
 * linked list whose sentinel is vmx.mem.
 */
214 struct VmMem {
        /* guest-physical range [lo, hi); both must be page aligned (checked in epttranslate) */
215         uvlong lo, hi;
        /* backing segment, or nil when the region has no access bits set */
216         Segment *seg;
        /* offset into seg where the region starts */
217         uintptr off;
218         VmMem *next, *prev;
        /*
         * value added to every page table entry of the region:
         * bit 0 'r', bit 1 'w', bit 2 (together with 0x400) 'x',
         * bits 3-5 index into mtype[], bit 6 the '!' suffix.
         */
219         u16int attr;
220 };
221
/*
 * Description of one interrupt or exception; Vmx keeps three of them
 * (exc, irq, irqack).
 * NOTE(review): the fields presumably mirror VMENTRY_INTRINFO,
 * VMENTRY_INTRCODE and VMENTRY_INTRILEN -- confirm against the code that uses them.
 */
222 struct VmIntr {
223         u32int info, code, ilen;
224 };
225
/*
 * State of the single virtual machine managed by this driver
 * (one static instance, "vmx").
 */
226 struct Vmx {
        /* life-cycle state; the values index statenames[] */
227         enum {
228                 NOVMX,
229                 VMXINACTIVE,
230                 VMXINIT,
231                 VMXREADY,
232                 VMXRUNNING,
233                 VMXDEAD,
234                 VMXENDING,
235         } state;
236         char errstr[ERRMAX];
        /* guest registers that are not VMCS fields; addressed through UREG() offsets in guestregs[] */
237         Ureg ureg;
        /* guest FPU save area, allocated 512-byte aligned in vmcsinit */
238         FPsave *fp;
        /* cleared in vmcsinit */
239         u8int launched;
        /* value written to the VM_VPID field (set to 1 in vmcsinit) */
240         u8int vpid;
        /* work flags; FLUSHEPT is set whenever the EPT tables change, STEP is consumed by processexit */
241         enum {
242                 FLUSHVPID = 1,
243                 FLUSHEPT = 2,
244                 STEP = 4,
245                 POSTEX = 8,
246                 POSTIRQ = 16,
247         } onentry;
248         
249         Rendez cmdwait;
        /* taken with ilock around the firstcmd/lastcmd queue (see killcmds) */
250         Lock cmdlock;
251         VmCmd *firstcmd, **lastcmd;
        /* list of commands linked through VmCmd.next; emptied by killcmds */
252         VmCmd *postponed;
        /* top-level EPT table walked by eptwalk */
253         uvlong *pml4;
        /* sentinel of the circular VmMem list */
254         VmMem mem;
255         
        /* event flags set by processexit */
256         enum {
257                 GOTEXIT = 1,
258                 GOTIRQACK = 2,
259                 GOTSTEP = 4,
260                 GOTSTEPERR = 8,
261         } got;
262         VmMem *stepmap;
263         VmIntr exc, irq, irqack;
264 };
265
/*
 * One queued command.  The embedded (anonymous) Rendez and Lock let a
 * VmCmd pointer be passed directly to wakeup() and lock()/unlock(),
 * as cmdrelease does.
 */
266 struct VmCmd {
        /* completion flags; cmdrelease always sets CMDFDONE plus whatever its caller passes */
267         enum {
268                 CMDFDONE = 1,
269                 CMDFFAIL = 2,
270                 CMDFPOSTP = 4,
271         } flags;
272         u8int scratched;
273         Rendez;
274         Lock;
        /* handler and its argument list */
275         int (*cmd)(VmCmd *, va_list);
276         int retval;
        /* buffer of at least ERRMAX bytes; killcmds copies Equit into it */
277         char *errstr;
278         va_list va;
        /* link in the queue or in the postponed list */
279         VmCmd *next;
280 };
281
/* error string given to commands cancelled by killcmds and to pexit in cmdquit */
282 static char Equit[] = "vmx: ending";
283
/* printable names for the Vmx.state values, indexed by state */
284 static char *statenames[] = {
285         [NOVMX] "novmx",
286         [VMXINACTIVE] "inactive",
287         [VMXINIT] "init",
288         [VMXREADY] "ready",
289         [VMXRUNNING] "running",
290         [VMXDEAD] "dead",
291         [VMXENDING]"ending"
292 };
293
/* the one and only virtual machine */
294 static Vmx vmx;
295
296 static u64int
297 vmcsread(u32int addr)
298 {
299         int rc;
300         u64int val;
301
302         val = 0;
303         rc = vmread(addr, (uintptr *) &val);
304         if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000)
305                 rc = vmread(addr | 1, (uintptr *) &val + 1);
306         if(rc < 0){
307                 char errbuf[128];
308                 snprint(errbuf, sizeof(errbuf), "vmcsread failed (%#.4ux)", addr);
309                 error(errbuf);
310         }
311         return val;
312 }
313
314 static void
315 vmcswrite(u32int addr, u64int val)
316 {
317         int rc;
318         
319         rc = vmwrite(addr, val);
320         if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000)
321                 rc = vmwrite(addr | 1, val >> 32);
322         if(rc < 0){
323                 char errbuf[128];
324                 snprint(errbuf, sizeof(errbuf), "vmcswrite failed (%#.4ux = %#.16ullx)", addr, val);
325                 error(errbuf);
326         }
327 }
328
329 static uvlong
330 parseval(char *s, int sz)
331 {
332         uvlong v;
333         char *p;
334         
335         if(sz == 0) sz = sizeof(uintptr);
336         v = strtoull(s, &p, 0);
337         if(p == s || *p != 0 || v >> sz * 8 != 0) error("invalid value");
338         return v;
339 }
340
341 static char *
342 cr0fakeread(char *p, char *e)
343 {
344         uvlong guest, mask, shadow;
345         
346         guest = vmcsread(GUEST_CR0);
347         mask = vmcsread(GUEST_CR0MASK);
348         shadow = vmcsread(GUEST_CR0SHADOW);
349         return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & mask | shadow & ~mask);
350 }
351
352 static char *
353 cr4fakeread(char *p, char *e)
354 {
355         uvlong guest, mask, shadow;
356         
357         guest = vmcsread(GUEST_CR4);
358         mask = vmcsread(GUEST_CR4MASK);
359         shadow = vmcsread(GUEST_CR4SHADOW);
360         return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & mask | shadow & ~mask);
361 }
362
363 static int
364 cr0realwrite(char *s)
365 {
366         uvlong v;
367         
368         v = parseval(s, 8);
369         vmcswrite(GUEST_CR0, vmcsread(GUEST_CR0) & CR0KERNEL | v & ~CR0KERNEL);
370         return 0;
371 }
372
373 static int
374 cr0maskwrite(char *s)
375 {
376         uvlong v;
377         
378         v = parseval(s, 8);
379         vmcswrite(GUEST_CR0MASK, vmcsread(GUEST_CR0MASK) | CR0KERNEL);
380         return 0;
381 }
382
383 static int
384 cr4realwrite(char *s)
385 {
386         uvlong v;
387         
388         v = parseval(s, 8);
389         vmcswrite(GUEST_CR4, vmcsread(GUEST_CR4) & CR4KERNEL | v & ~CR4KERNEL);
390         return 0;
391 }
392
393 static int
394 cr4maskwrite(char *s)
395 {
396         uvlong v;
397         
398         v = parseval(s, 8);
399         vmcswrite(GUEST_CR4MASK, vmcsread(GUEST_CR4MASK) | CR4KERNEL);
400         return 0;
401 }
402
/*
 * Write handler installed in guestregs[] for registers that must not be
 * written: ignores its argument and always returns -1.
 * NOTE(review): the caller presumably turns the -1 into an error -- the
 * code that does so is outside the visible part of the file.
 */
403 static int
404 readonly(char *)
405 {
406         return -1;
407 }
408
/* Description of one named guest register, as listed in guestregs[]. */
409 typedef struct GuestReg GuestReg;
410 struct GuestReg {
        /*
         * >= 0: VMCS field encoding, accessed with vmcsread/vmcswrite;
         * <  0: complement of a byte offset into vmx.ureg (see UREG and cmdgetregs).
         */
411         int offset;
412         u8int size; /* in bytes; 0 means == uintptr */
        /* name used in the textual register file */
413         char *name;
        /* optional formatter: writes the value into [p, e) and returns the new end */
414         char *(*read)(char *, char *);
        /* optional handler that takes the value as text */
415         int (*write)(char *);
416 };
/*
 * Encode the byte offset of Ureg member x as a negative number (its
 * complement) so it can share GuestReg.offset with the non-negative
 * VMCS field encodings; cmdgetregs undoes this with ~offset.
 */
417 #define UREG(x) ~(ulong)&((Ureg*)0)->x
/*
 * All registers visible through the register file, in the order in which
 * cmdgetregs prints them.  Columns: offset, size in bytes (0 = uintptr),
 * name, optional read formatter, optional write handler.
 */
418 static GuestReg guestregs[] = {
419         {GUEST_RIP, 0, "pc"},
420         {GUEST_RSP, 0, "sp"},
421         {GUEST_RFLAGS, 0, "flags"},
        /* general registers live in vmx.ureg, not in the VMCS */
422         {UREG(ax), 0, "ax"},
423         {UREG(bx), 0, "bx"},
424         {UREG(cx), 0, "cx"},
425         {UREG(dx), 0, "dx"},
426         {UREG(bp), 0, "bp"},
427         {UREG(si), 0, "si"},
428         {UREG(di), 0, "di"},
429         {GUEST_GDTRBASE, 0, "gdtrbase"},
430         {GUEST_GDTRLIMIT, 4, "gdtrlimit"},
431         {GUEST_IDTRBASE, 0, "idtrbase"},
432         {GUEST_IDTRLIMIT, 4, "idtrlimit"},
433         {GUEST_CS, 2, "cs"},
434         {GUEST_CSBASE, 0, "csbase"},
435         {GUEST_CSLIMIT, 4, "cslimit"},
436         {GUEST_CSPERM, 4, "csperm"},
437         {GUEST_DS, 2, "ds"},
438         {GUEST_DSBASE, 0, "dsbase"},
439         {GUEST_DSLIMIT, 4, "dslimit"},
440         {GUEST_DSPERM, 4, "dsperm"},
441         {GUEST_ES, 2, "es"},
442         {GUEST_ESBASE, 0, "esbase"},
443         {GUEST_ESLIMIT, 4, "eslimit"},
444         {GUEST_ESPERM, 4, "esperm"},
445         {GUEST_FS, 2, "fs"},
446         {GUEST_FSBASE, 0, "fsbase"},
447         {GUEST_FSLIMIT, 4, "fslimit"},
448         {GUEST_FSPERM, 4, "fsperm"},
449         {GUEST_GS, 2, "gs"},
450         {GUEST_GSBASE, 0, "gsbase"},
451         {GUEST_GSLIMIT, 4, "gslimit"},
452         {GUEST_GSPERM, 4, "gsperm"},
453         {GUEST_SS, 2, "ss"},
454         {GUEST_SSBASE, 0, "ssbase"},
455         {GUEST_SSLIMIT, 4, "sslimit"},
456         {GUEST_SSPERM, 4, "ssperm"},
457         {GUEST_TR, 2, "tr"},
458         {GUEST_TRBASE, 0, "trbase"},
459         {GUEST_TRLIMIT, 4, "trlimit"},
460         {GUEST_TRPERM, 4, "trperm"},
461         {GUEST_LDTR, 2, "ldtr"},
462         {GUEST_LDTRBASE, 0, "ldtrbase"},
463         {GUEST_LDTRLIMIT, 4, "ldtrlimit"},
464         {GUEST_LDTRPERM, 4, "ldtrperm"},
        /* control registers: "real" is the VMCS value, "fake" the shadow-based view, "mask" the ownership mask */
465         {GUEST_CR0, 0, "cr0real", nil, cr0realwrite},
466         {GUEST_CR0SHADOW, 0, "cr0fake", cr0fakeread},
467         {GUEST_CR0MASK, 0, "cr0mask", nil, cr0maskwrite},
        /* cr2 is kept in the trap member of vmx.ureg */
468         {UREG(trap), 0, "cr2"},
469         {GUEST_CR3, 0, "cr3"},
470         {GUEST_CR4, 0, "cr4real", nil, cr4realwrite},
471         {GUEST_CR4SHADOW, 0, "cr4fake", cr4fakeread},
472         {GUEST_CR4MASK, 0, "cr4mask", nil, cr4maskwrite},
473         {GUEST_IA32_PAT, 8, "pat"},
474         {GUEST_IA32_EFER, 8, "efer"},
        /* exit information: every entry below uses the readonly write handler */
475         {VM_INSTRERR, 4, "instructionerror", nil, readonly},
476         {VM_EXREASON, 4, "exitreason", nil, readonly},
477         {VM_EXQUALIF, 0, "exitqualification", nil, readonly},
478         {VM_EXINTRINFO, 4, "exitinterruptinfo", nil, readonly},
479         {VM_EXINTRCODE, 4, "exitinterruptcode", nil, readonly},
480         {VM_EXINSTRLEN, 4, "exitinstructionlen", nil, readonly},
481         {VM_EXINSTRINFO, 4, "exitinstructioninfo", nil, readonly},
482         {VM_GUESTVA, 0, "exitva", nil, readonly},
483         {VM_GUESTPA, 0, "exitpa", nil, readonly},
484         {VM_IDTVECINFO, 4, "idtinterruptinfo", nil, readonly},
485         {VM_IDTVECCODE, 4, "idtinterruptcode", nil, readonly},
486 };
487
488 static int
489 vmokpage(u64int addr)
490 {
491         return (addr & 0xfff) == 0 && addr >> 48 == 0;
492 }
493
494 static uvlong *
495 eptwalk(uvlong addr)
496 {
497         uvlong *tab, *nt;
498         uvlong v;
499         int i;
500         
501         tab = vmx.pml4;
502         for(i = 3; i >= 1; i--){
503                 tab += addr >> 12 + 9 * i & 0x1ff;
504                 v = *tab;
505                 if((v & 3) == 0){
506                         nt = mallocalign(BY2PG, BY2PG, 0, 0);
507                         if(nt == nil) error(Enomem);
508                         memset(nt, 0, BY2PG);
509                         v = PADDR(nt) | 0x407;
510                         *tab = v;
511                 }
512                 tab = KADDR(v & ~0xfff);
513         }
514         return tab + (addr >> 12 & 0x1ff);
515 }
516
517 static void
518 eptfree(uvlong *tab, int level)
519 {
520         int i;
521         uvlong v, *t;
522         
523         if(level < 3){
524                 for(i = 0; i < 512; i++){
525                         v = tab[i];
526                         if((v & 3) == 0) continue;
527                         t = KADDR(v & ~0xfff);
528                         eptfree(t, level + 1);
529                         tab[i] = 0;
530                 }
531         }
532         if(level > 0)
533                 free(tab);              
534 }
535
536 static void
537 epttranslate(VmMem *mp)
538 {
539         uvlong p, hpa;
540
541         if(mp->seg != nil && (mp->seg->type & SG_TYPE) != SG_FIXED || (mp->lo & 0xfff) != 0 || (mp->hi & 0xfff) != 0 || (uint)mp->attr >= 0x1000)
542                 error(Egreg);
543         if(mp->seg != nil){
544                 if(mp->seg->base + mp->off + (mp->hi - mp->lo) > mp->seg->top)
545                         error(Egreg);
546                 hpa = mp->seg->map[0]->pages[0]->pa + mp->off;
547         }else
548                 hpa = 0;
549         for(p = mp->lo; p < mp->hi; p += BY2PG)
550                 *eptwalk(p) = hpa + (p - mp->lo) + mp->attr;
551         vmx.onentry |= FLUSHEPT;
552 }
553
/*
 * Two-letter names of the memory types, indexed by bits 3-5 of
 * VmMem.attr; used by cmdgetmeminfo to print and by cmdsetmeminfo to parse.
 */
554 static char *mtype[] = {"uc", "wc", "02", "03", "wt", "wp", "wb", "07"};
555
556 static int
557 cmdgetmeminfo(VmCmd *, va_list va)
558 {
559         VmMem *mp;
560         char *p0, *e, *p;
561         char attr[4];
562         char mt[4];
563         
564         p0 = va_arg(va, char *);
565         e = va_arg(va, char *);
566         p = p0;
567         for(mp = vmx.mem.next; mp != &vmx.mem; mp = mp->next){
568                 attr[0] = (mp->attr & 1) != 0 ? 'r' : '-';
569                 attr[1] = (mp->attr & 2) != 0 ? 'w' : '-';
570                 attr[2] = (mp->attr & 4) != 0 ? 'x' : '-';
571                 attr[3] = 0;
572                 *(ushort*)mt = *(u16int*)mtype[mp->attr >> 3 & 7];
573                 mt[2] = (mp->attr & 0x40) != 0 ? '!' : 0;
574                 mt[3] = 0;
575                 p = seprint(p, e, "%s %s %#llux %#llux %p %#llux\n", attr, mt, mp->lo, mp->hi, mp->seg, (uvlong)mp->off);
576         }
577         return p - p0;
578 }
579
580 static int
581 cmdclearmeminfo(VmCmd *, va_list)
582 {
583         VmMem *mp, *mn;
584         
585         eptfree(vmx.pml4, 0);
586         for(mp = vmx.mem.next; mp != &vmx.mem; mp = mn){
587                 mn = mp->next;
588                 free(mp);
589         }
590         vmx.mem.prev = &vmx.mem;
591         vmx.mem.next = &vmx.mem;
592         vmx.onentry |= FLUSHEPT;
593         return 0;
594 }
595
596 extern Segment* (*_globalsegattach)(char*);
597
598 static int
599 cmdsetmeminfo(VmCmd *, va_list va)
600 {
601         char *p0, *p, *q, *r;
602         int j;
603         char *f[10];
604         VmMem *mp;
605         int rc;
606         
607         p0 = va_arg(va, char *);
608         p = p0;
609         mp = nil;
610         for(;;){
611                 q = strchr(p, '\n');
612                 if(q == 0) break;
613                 *q = 0;
614                 if(mp == nil)
615                         mp = malloc(sizeof(VmMem));
616                 if(waserror()){
617                         free(mp);
618                         nexterror();
619                 }
620                 rc = tokenize(p, f, nelem(f));
621                 p = q + 1;
622                 if(rc == 0) goto next;
623                 if(rc != 4 && rc != 6) error("number of fields wrong");
624                 memset(mp, 0, sizeof(VmMem));
625                 for(q = f[0]; *q != 0; q++)
626                         switch(*q){
627                         case 'r': if((mp->attr & 1) != 0) goto tinval; mp->attr |= 1; break;
628                         case 'w': if((mp->attr & 2) != 0) goto tinval; mp->attr |= 2; break;
629                         case 'x': if((mp->attr & 4) != 0) goto tinval; mp->attr |= 0x404; break;
630                         case '-': break;
631                         default: tinval: error("invalid access field");
632                         }
633                 for(j = 0; j < 8; j++)
634                         if(strncmp(mtype[j], f[1], 2) == 0){
635                                 mp->attr |= j << 3;
636                                 break;
637                         }
638                 if(j == 8 || strlen(f[1]) > 3) error("invalid memory type");
639                 if(f[1][2] == '!') mp->attr |= 0x40;
640                 else if(f[1][2] != 0) error("invalid memory type");
641                 mp->lo = strtoull(f[2], &r, 0);
642                 if(*r != 0 || !vmokpage(mp->lo)) error("invalid low guest physical address");
643                 mp->hi = strtoull(f[3], &r, 0);
644                 if(*r != 0 || !vmokpage(mp->hi) || mp->hi <= mp->lo) error("invalid high guest physical address");
645                 mp->off = strtoull(f[5], &r, 0);
646                 if(*r != 0 || !vmokpage(mp->off)) error("invalid offset");
647                 if((mp->attr & 7) != 0){
648                         if(rc != 6) error("number of fields wrong");
649                         mp->seg = _globalsegattach(f[4]);
650                         if(mp->seg == nil) error("no such segment");
651                         if(mp->seg->base + mp->off + (mp->hi - mp->lo) > mp->seg->top) error("out of bounds");
652                 }
653                 epttranslate(mp);
654                 mp->prev = vmx.mem.prev;
655                 mp->next = &vmx.mem;
656                 mp->prev->next = mp;
657                 mp->next->prev = mp;
658                 mp = nil;
659         next:
660                 poperror();
661         }
662         free(mp);
663         return p - p0;
664 }
665
666 static void
667 vmxreset(void)
668 {
669         ulong regs[4];
670         vlong msr;
671
672         cpuid(1, regs);
673         if((regs[2] & 1<<5) == 0) return;
674         /* check if disabled by BIOS */
675         if(rdmsr(0x3a, &msr) < 0) return;
676         if((msr & 5) != 5){
677                 if((msr & 1) == 0){ /* msr still unlocked */
678                         wrmsr(0x3a, msr | 5);
679                         if(rdmsr(0x3a, &msr) < 0)
680                                 return;
681                 }
682                 if((msr & 5) != 5)
683                         return;
684         }
685         if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) return;
686         if((vlong)msr >= 0) return;
687         if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) return;
688         if((msr >> 32 & PROCB_EPT) == 0 || (msr >> 32 & PROCB_VPID) == 0) return;
689         vmx.state = VMXINACTIVE;
690         vmx.lastcmd = &vmx.firstcmd;
691         vmx.mem.next = &vmx.mem;
692         vmx.mem.prev = &vmx.mem;
693 }
694
695 static void
696 vmxshutdown(void)
697 {
698         if(vmx.state != NOVMX && vmx.state != VMXINACTIVE)
699                 vmxoff();
700 }
701
702 static void
703 vmcsinit(void)
704 {
705         vlong msr;
706         u32int x;
707         
708         memset(&vmx.ureg, 0, sizeof(vmx.ureg));
709         vmx.launched = 0;
710         vmx.onentry = 0;
711         
712         if(rdmsr(VMX_BASIC_MSR, &msr) < 0) error("rdmsr(VMX_BASIC_MSR) failed");
713         if((msr & 1ULL<<55) != 0){
714                 if(rdmsr(VMX_TRUE_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_TRUE_PROCB_CTLS_MSR) failed");
715                 if(rdmsr(VMX_TRUE_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_TRUE_PINB_CTLS_MSR) failed");
716         }else{
717                 if(rdmsr(VMX_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR) failed");
718                 if(rdmsr(VMX_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_PINB_CTLS_MSR) failed");
719         }
720
721         if(rdmsr(VMX_PINB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PINB_CTLS_MSR failed");
722         x = (u32int)pinb_ctls | 1<<1 | 1<<2 | 1<<4; /* currently reserved default1 bits */
723         x |= PINB_EXITIRQ | PINB_EXITNMI;
724         x &= pinb_ctls >> 32;
725         vmcswrite(PINB_CTLS, x);
726         
727         if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR failed");
728         x = (u32int)procb_ctls | 1<<1 | 7<<4 | 1<<8 | 1<<13 | 1<<14 | 1<<26; /* currently reserved default1 bits */
729         x |= PROCB_EXITHLT | PROCB_EXITMWAIT;
730         x |= PROCB_EXITMOVDR | PROCB_EXITIO | PROCB_EXITMONITOR | PROCB_EXITPAUSE;
731         x |= PROCB_USECTLS2;
732         x &= msr >> 32;
733         vmcswrite(PROCB_CTLS, x);
734         
735         if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS2_MSR failed");
736         x = PROCB_EPT | PROCB_VPID | PROCB_UNRESTR;
737         x &= msr >> 32;
738         vmcswrite(PROCB_CTLS2, x);
739         
740         if(rdmsr(VMX_VMEXIT_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMEXIT_CTLS_MSR failed");
741         x = (u32int)msr;
742         if(sizeof(uintptr) == 8) x |= VMEXIT_HOST64;
743         x |= VMEXIT_LD_IA32_PAT | VMEXIT_LD_IA32_EFER;
744         x &= msr >> 32;
745         vmcswrite(VMEXIT_CTLS, x);
746         
747         if(rdmsr(VMX_VMENTRY_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMENTRY_CTLS_MSR failed");
748         x = (u32int)msr;
749         if(sizeof(uintptr) == 8) x |= VMENTRY_GUEST64;
750         x |= VMENTRY_LD_IA32_PAT | VMENTRY_LD_IA32_EFER;
751         x &= msr >> 32;
752         vmcswrite(VMENTRY_CTLS, x);
753         
754         vmcswrite(CR3_TARGCNT, 0);
755         vmcswrite(VMEXIT_MSRLDCNT, 0);
756         vmcswrite(VMEXIT_MSRSTCNT, 0);
757         vmcswrite(VMENTRY_MSRLDCNT, 0);
758         vmcswrite(VMENTRY_INTRINFO, 0);
759         vmcswrite(VMCS_LINK, -1);
760         
761         vmcswrite(HOST_CS, KESEL);
762         vmcswrite(HOST_DS, KDSEL);
763         vmcswrite(HOST_ES, KDSEL);
764         vmcswrite(HOST_FS, KDSEL);
765         vmcswrite(HOST_GS, KDSEL);
766         vmcswrite(HOST_SS, KDSEL);
767         vmcswrite(HOST_TR, TSSSEL);
768         vmcswrite(HOST_CR0, getcr0() & ~0xe);
769         vmcswrite(HOST_CR3, getcr3());
770         vmcswrite(HOST_CR4, getcr4());
771         rdmsr(0xc0000100, &msr);
772         vmcswrite(HOST_FSBASE, msr);
773         rdmsr(0xc0000101, &msr);
774         vmcswrite(HOST_GSBASE, msr);
775         vmcswrite(HOST_TRBASE, (uintptr) m->tss);
776         vmcswrite(HOST_GDTR, (uintptr) m->gdt);
777         vmcswrite(HOST_IDTR, IDTADDR);
778         if(rdmsr(0x277, &msr) < 0) error("rdmsr(IA32_PAT) failed");
779         vmcswrite(HOST_IA32_PAT, msr);
780         if(rdmsr(0xc0000080, &msr) < 0) error("rdmsr(IA32_EFER) failed");
781         vmcswrite(HOST_IA32_EFER, msr);
782         
783         vmcswrite(EXC_BITMAP, 1<<18);
784         vmcswrite(PFAULT_MASK, 0);
785         vmcswrite(PFAULT_MATCH, 0);
786         
787         vmcswrite(GUEST_CSBASE, 0);
788         vmcswrite(GUEST_DSBASE, 0);
789         vmcswrite(GUEST_ESBASE, 0);
790         vmcswrite(GUEST_FSBASE, 0);
791         vmcswrite(GUEST_GSBASE, 0);
792         vmcswrite(GUEST_SSBASE, 0);
793         vmcswrite(GUEST_CSLIMIT, -1);
794         vmcswrite(GUEST_DSLIMIT, -1);
795         vmcswrite(GUEST_ESLIMIT, -1);
796         vmcswrite(GUEST_FSLIMIT, -1);
797         vmcswrite(GUEST_GSLIMIT, -1);
798         vmcswrite(GUEST_SSLIMIT, -1);
799         vmcswrite(GUEST_CSPERM, (SEGG|SEGD|SEGP|SEGPL(0)|SEGEXEC|SEGR) >> 8 | 1);
800         vmcswrite(GUEST_DSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
801         vmcswrite(GUEST_ESPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
802         vmcswrite(GUEST_FSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
803         vmcswrite(GUEST_GSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
804         vmcswrite(GUEST_SSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
805         vmcswrite(GUEST_LDTRPERM, 1<<16);
806
807         vmcswrite(GUEST_CR0MASK, CR0KERNEL);
808         vmcswrite(GUEST_CR4MASK, CR4KERNEL);
809         vmcswrite(GUEST_CR0, getcr0() & ~(1<<31));
810         vmcswrite(GUEST_CR3, 0);
811         vmcswrite(GUEST_CR4, getcr4());
812         vmcswrite(GUEST_CR0SHADOW, getcr0());
813         vmcswrite(GUEST_CR4SHADOW, getcr4() & ~CR4VMXE);
814         
815         vmcswrite(GUEST_IA32_PAT, 0x0007040600070406ULL);
816         vmcswrite(GUEST_IA32_EFER, 0);
817         
818         vmcswrite(GUEST_TRBASE, (uintptr) m->tss);
819         vmcswrite(GUEST_TRLIMIT, 0xffff);
820         vmcswrite(GUEST_TRPERM, (SEGTSS|SEGPL(0)|SEGP) >> 8 | 2);
821         
822         vmx.pml4 = mallocalign(BY2PG, BY2PG, 0, 0);
823         memset(vmx.pml4, 0, BY2PG);
824         vmcswrite(VM_EPTP, PADDR(vmx.pml4) | 3<<3);
825         vmx.vpid = 1;
826         vmcswrite(VM_VPID, vmx.vpid);
827         
828         vmcswrite(GUEST_RFLAGS, 2);
829         
830         vmx.onentry = FLUSHVPID | FLUSHEPT;
831         
832         vmx.fp = mallocalign(512, 512, 0, 0);
833         if(vmx.fp == nil)
834                 error(Enomem);
835         fpinit();
836         fpsave(vmx.fp);
837 }
838
839 static void
840 vmxstart(void)
841 {
842         static uchar *vmcs; /* also vmxon region */
843         vlong x;
844
845         putcr4(getcr4() | 0x2000);
846
847         if(vmcs == nil){
848                 vmcs = mallocalign(8192, 4096, 0, 0);
849                 if(vmcs == nil)
850                         error(Enomem);
851         }
852         memset(vmcs, 0, 8192);
853         rdmsr(VMX_BASIC_MSR, &x);
854         *(ulong*)vmcs = x;
855         *(ulong*)&vmcs[4096] = x;
856         if(vmxon(PADDR(vmcs + 4096)) < 0)
857                 error("vmxon failed");
858         if(vmclear(PADDR(vmcs)) < 0)
859                 error("vmclear failed");
860         if(vmptrld(PADDR(vmcs)) < 0)
861                 error("vmptrld failed");
862         vmcsinit();
863 }
864
/*
 * Mark the command p as finished and wake whoever sleeps on it.
 * CMDFDONE is always set; f supplies additional flags (e.g. CMDFFAIL).
 * Flag update and wakeup both happen while holding the command's own lock.
 */
865 static void
866 cmdrelease(VmCmd *p, int f)
867 {
868         lock(p);
869         p->flags |= CMDFDONE | f;
870         wakeup(p);
871         unlock(p);
872 }
873
874 static void
875 killcmds(VmCmd *notme)
876 {
877         VmCmd *p, *pn;
878         
879         for(p = vmx.postponed; p != nil; p = pn){
880                 pn = p->next;
881                 p->next = nil;
882                 if(p == notme) continue;
883                 kstrcpy(p->errstr, Equit, ERRMAX);
884                 cmdrelease(p, CMDFFAIL);
885         }
886         vmx.postponed = nil;
887         ilock(&vmx.cmdlock);
888         for(p = vmx.firstcmd; p != nil; p = pn){
889                 pn = p->next;
890                 p->next = nil;
891                 if(p == notme) continue;
892                 kstrcpy(p->errstr, Equit, ERRMAX);
893                 cmdrelease(p, CMDFFAIL);
894         }
895         vmx.firstcmd = nil;
896         vmx.lastcmd = &vmx.firstcmd;
897         iunlock(&vmx.cmdlock);
898 }
899
/*
 * Command handler: shut the virtual machine down.
 * The order matters: the state is set to VMXENDING first, the memory
 * regions and EPT tables are dropped, all other commands are cancelled
 * (p itself is spared), the top-level EPT table is freed and the
 * bookkeeping cleared; only then is VMX switched off and the state set
 * back to VMXINACTIVE.  p is released before the calling process exits
 * through pexit().
 * NOTE(review): pexit presumably does not return, so the final
 * "return 0" would only be there to satisfy the signature -- confirm.
 */
900 static int
901 cmdquit(VmCmd *p, va_list va)
902 {
903         vmx.state = VMXENDING;
904         cmdclearmeminfo(p, va);
905         killcmds(p);
906
        /* cmdclearmeminfo freed only the lower-level tables */
907         free(vmx.pml4);
908         vmx.pml4 = nil;
909         vmx.got = 0;
910         vmx.onentry = 0;
911         vmx.stepmap = nil;
912
913         vmxoff();
914         vmx.state = VMXINACTIVE;
915         cmdrelease(p, 0);
916         pexit(Equit, 1);
917         return 0;
918 }
919
920 static void
921 processexit(void)
922 {
923         u32int reason;
924         
925         reason = vmcsread(VM_EXREASON);
926         if((reason & 1<<31) == 0)
927                 switch(reason & 0xffff){
928                 case 1: /* external interrupt */
929                 case 3: /* INIT */
930                 case 4: /* SIPI */
931                 case 5: /* IO SMI */
932                 case 6: /* SMI */
933                 case 7: /* IRQ window */
934                 case 8: /* NMI window */
935                         return;
936                 case 37:
937                         if((vmx.onentry & STEP) != 0){
938                                 vmx.state = VMXREADY;
939                                 vmx.got |= GOTSTEP;
940                                 vmx.onentry &= ~STEP;
941                                 return;
942                         }
943                         break;
944                 }
945         if((vmx.onentry & STEP) != 0){
946                 iprint("VMX: exit reason %#x when expected step...\n", reason & 0xffff);
947                 vmx.onentry &= ~STEP;
948                 vmx.got |= GOTSTEP|GOTSTEPERR;
949         }
950         vmx.state = VMXREADY;
951         vmx.got |= GOTEXIT;
952 }
953
954 static int
955 cmdgetregs(VmCmd *, va_list va)
956 {
957         char *p0, *e;
958         GuestReg *r;
959         uvlong val;
960         int s;
961         char *p;
962         
963         p0 = va_arg(va, char *);
964         e = va_arg(va, char *);
965         p = p0;
966         for(r = guestregs; r < guestregs + nelem(guestregs); r++){
967                 if(r->offset >= 0)
968                         val = vmcsread(r->offset);
969                 else
970                         val = *(uintptr*)((uchar*)&vmx.ureg + ~r->offset);
971                 s = r->size;
972                 if(s == 0) s = sizeof(uintptr);
973                 p = seprint(p, e, "%s %#.*llux\n", r->name, s * 2, val);
974         }
975         return p - p0;
976 }
977
978 static int
979 setregs(char *p0, char rs, char *fs)
980 {
981         char *p, *q, *rp;
982         char *f[10];
983         GuestReg *r;
984         uvlong val;
985         int sz;
986         int rc;
987
988         p = p0;
989         for(;;){
990                 q = strchr(p, rs);
991                 if(q == 0) break;
992                 *q = 0;
993                 rc = getfields(p, f, nelem(f), 1, fs);
994                 p = q + 1;
995                 if(rc == 0) continue;
996                 if(rc != 2) error("number of fields wrong");
997                 
998                 for(r = guestregs; r < guestregs + nelem(guestregs); r++)
999                         if(strcmp(r->name, f[0]) == 0)
1000                                 break;
1001                 if(r == guestregs + nelem(guestregs))
1002                         error("unknown register");
1003                 if(r->write != nil){
1004                         r->write(f[1]);
1005                         continue;
1006                 }
1007                 val = strtoull(f[1], &rp, 0);
1008                 sz = r->size;
1009                 if(sz == 0) sz = sizeof(uintptr);
1010                 if(rp == f[1] || *rp != 0 || val >> 8 * sz != 0) error("invalid value");
1011                 if(r->offset >= 0)
1012                         vmcswrite(r->offset, val);
1013                 else{
1014                         assert((u32int)~r->offset + sz <= sizeof(Ureg)); 
1015                         switch(sz){
1016                         case 1: *(u8int*)((u8int*)&vmx.ureg + (u32int)~r->offset) = val; break;
1017                         case 2: *(u16int*)((u8int*)&vmx.ureg + (u32int)~r->offset) = val; break;
1018                         case 4: *(u32int*)((u8int*)&vmx.ureg + (u32int)~r->offset) = val; break;
1019                         case 8: *(u64int*)((u8int*)&vmx.ureg + (u32int)~r->offset) = val; break;
1020                         default: error(Egreg);
1021                         }
1022                 }
1023         }
1024         return p - p0;
1025 }
1026
1027 static int
1028 cmdsetregs(VmCmd *, va_list va)
1029 {
1030         return setregs(va_arg(va, char *), '\n', " \t");
1031 }
1032
1033 static int
1034 cmdgetfpregs(VmCmd *, va_list va)
1035 {
1036         uchar *p;
1037         
1038         p = va_arg(va, uchar *);
1039         memmove(p, vmx.fp, sizeof(FPsave));
1040         return sizeof(FPsave);
1041 }
1042
1043 static int
1044 cmdsetfpregs(VmCmd *, va_list va)
1045 {
1046         uchar *p;
1047         ulong n;
1048         vlong off;
1049         
1050         p = va_arg(va, uchar *);
1051         n = va_arg(va, ulong);
1052         off = va_arg(va, vlong);
1053         if(off < 0 || off >= sizeof(FPsave)) n = 0;
1054         else if(off + n > sizeof(FPsave)) n = sizeof(FPsave) - n;
1055         memmove((uchar*)vmx.fp + off, p, n);
1056         return n;
1057 }
1058
1059 static int
1060 cmdgo(VmCmd *, va_list va)
1061 {
1062         char *r;
1063
1064         if(vmx.state != VMXREADY)
1065                 error("VM not ready");
1066         r = va_arg(va, char *);
1067         if(r != nil) setregs(r, ';', "=");
1068         vmx.state = VMXRUNNING;
1069         return 0;
1070 }
1071
1072 static int
1073 cmdstop(VmCmd *, va_list)
1074 {
1075         if(vmx.state != VMXREADY && vmx.state != VMXRUNNING)
1076                 error("VM not ready or running");
1077         vmx.state = VMXREADY;
1078         return 0;
1079 }
1080
1081 static int
1082 cmdstatus(VmCmd *, va_list va)
1083 {       
1084         kstrcpy(va_arg(va, char *), vmx.errstr, ERRMAX);
1085         return vmx.state;
1086 }
1087
/*
 * Names printed by cmdwait for the low 16 bits of the VM exit reason.
 * Unlisted indices stay nil and are printed numerically.
 * Names with a leading '.' appear to denote exits caused by the guest
 * executing that instruction.
 */
static char *exitreasons[] = {
	[0] "exc",		[1] "extirq",		[2] "triplef",		[3] "initsig",
	[4] "sipi",		[5] "smiio",		[6] "smiother",		[7] "irqwin",
	[8] "nmiwin",		[9] "taskswitch",	[10] ".cpuid",		[11] ".getsec",
	[12] ".hlt",		[13] ".invd",		[14] ".invlpg",		[15] ".rdpmc",
	[16] ".rdtsc",		[17] ".rsm",		[18] ".vmcall",		[19] ".vmclear",
	[20] ".vmlaunch",	[21] ".vmptrld",	[22] ".vmptrst",	[23] ".vmread",
	[24] ".vmresume",	[25] ".vmwrite",	[26] ".vmxoff",		[27] ".vmxon",
	[28] "movcr",		[29] ".movdr",		[30] "io",		[31] ".rdmsr",
	[32] ".wrmsr",		[33] "entrystate",	[34] "entrymsr",
	[36] ".mwait",		[37] "monitortrap",
	[39] ".monitor",	[40] ".pause",		[41] "mcheck",
	[43] "tpr",		[44] "apicacc",		[45] "eoi",
	[46] "gdtr_idtr",	[47] "ldtr_tr",
	[48] "eptfault",	[49] "eptinval",	[50] ".invept",		[51] ".rdtscp",
	[52] "preempt",		[53] ".invvpid",	[54] ".wbinvd",		[55] ".xsetbv",
	[56] "apicwrite",	[57] ".rdrand",		[58] ".invpcid",	[59] ".vmfunc",
	[60] ".encls",		[61] ".rdseed",		[62] "pmlfull",		[63] ".xsaves",
	[64] ".xrstors",
};
1099
/*
 * Mnemonics for x86 exception vectors, indexed by vector number.
 * Used by cmdwait to name exception exits and by eventparse to
 * accept symbolic event names.  Unlisted vectors stay nil.
 */
static char *except[] = {
	[0] "#de",	/* divide error */
	[1] "#db",	/* debug */
	[3] "#bp",	/* breakpoint */
	[4] "#of",	/* overflow */
	[5] "#br",	/* bound range */
	[6] "#ud",	/* invalid opcode */
	[7] "#nm",	/* device not available */
	[8] "#df",	/* double fault */
	[10] "#ts",	/* invalid TSS */
	[11] "#np",	/* segment not present */
	[12] "#ss",	/* stack fault */
	[13] "#gp",	/* general protection */
	[14] "#pf",	/* page fault */
	[16] "#mf",	/* x87 floating point */
	[17] "#ac",	/* alignment check */
	[18] "#mc",	/* machine check */
	[19] "#xm",	/* SIMD floating point */
	[20] "#ve",	/* virtualization exception */
};
1105
1106 static int
1107 cmdwait(VmCmd *cp, va_list va)
1108 {
1109         char *p, *p0, *e;
1110         u32int reason, intr;
1111         uvlong qual;
1112         u16int rno;
1113
1114         if(cp->scratched)
1115                 error(Eintr);
1116         p0 = p = va_arg(va, char *);
1117         e = va_arg(va, char *);
1118         if((vmx.got & GOTIRQACK) != 0){
1119                 p = seprint(p, e, "*ack %d\n", vmx.irqack.info & 0xff);
1120                 vmx.got &= ~GOTIRQACK;
1121                 return p - p0;
1122         }
1123         if((vmx.got & GOTEXIT) == 0){
1124                 cp->flags |= CMDFPOSTP;
1125                 return -1;
1126         }
1127         vmx.got &= ~GOTEXIT;
1128         reason = vmcsread(VM_EXREASON);
1129         qual = vmcsread(VM_EXQUALIF);
1130         rno = reason;
1131         intr = vmcsread(VM_EXINTRINFO);
1132         if((reason & 1<<31) != 0)
1133                 p = seprint(p, e, "!");
1134         if(rno == 0 && (intr & 1<<31) != 0){
1135                 if((intr & 0xff) >= nelem(except) || except[intr & 0xff] == nil)
1136                         p = seprint(p, e, "#%d ", intr & 0xff);
1137                 else
1138                         p = seprint(p, e, "%s ", except[intr & 0xff]);
1139         }else if(rno >= nelem(exitreasons) || exitreasons[rno] == nil)
1140                 p = seprint(p, e, "?%d ", rno);
1141         else
1142                 p = seprint(p, e, "%s ", exitreasons[rno]);
1143         p = seprint(p, e, "%#ullx pc %#ullx sp %#ullx ilen %#ullx iinfo %#ullx", qual, vmcsread(GUEST_RIP), vmcsread(GUEST_RSP), vmcsread(VM_EXINSTRLEN), vmcsread(VM_EXINSTRINFO));
1144         if((intr & 1<<11) != 0) p = seprint(p, e, " excode %#ullx", vmcsread(VM_EXINTRCODE));
1145         if(rno == 48 && (qual & 0x80) != 0) p = seprint(p, e, " va %#ullx", vmcsread(VM_GUESTVA));
1146         if(rno == 48 || rno == 49) p = seprint(p, e, " pa %#ullx", vmcsread(VM_GUESTPA));
1147         if(rno == 30) p = seprint(p, e, " ax %#ullx", (uvlong)vmx.ureg.ax);
1148         p = seprint(p, e, "\n");
1149         return p - p0;
1150 }
1151
1152 static int
1153 cmdstep(VmCmd *cp, va_list va)
1154 {
1155         switch(cp->retval){
1156         case 0:
1157                 if((vmx.got & GOTSTEP) != 0 || (vmx.onentry & STEP) != 0)
1158                         error(Einuse);
1159                 if(vmx.state != VMXREADY){
1160                         iprint("pre-step in state %s\n", statenames[vmx.state]);
1161                         error("not ready");
1162                 }
1163                 vmx.stepmap = va_arg(va, VmMem *);
1164                 vmx.onentry |= STEP;
1165                 vmx.state = VMXRUNNING;
1166                 cp->flags |= CMDFPOSTP;
1167                 return 1;
1168         case 1:
1169                 if(vmx.state != VMXREADY){
1170                         iprint("post-step in state %s\n", statenames[vmx.state]);
1171                         vmx.onentry &= ~STEP;
1172                         vmx.got &= ~(GOTSTEP|GOTSTEPERR);
1173                         error("not ready");
1174                 }
1175                 if((vmx.got & GOTSTEP) == 0){
1176                         cp->flags |= CMDFPOSTP;
1177                         return 1;
1178                 }
1179                 if((vmx.got & GOTSTEPERR) != 0){
1180                         vmx.got &= ~(GOTSTEP|GOTSTEPERR);
1181                         error("step failed");
1182                 }
1183                 vmx.got &= ~(GOTSTEP|GOTSTEPERR);
1184                 return 1;
1185         }
1186         return 0;
1187 }
1188
1189 static void
1190 eventparse(char *p, VmIntr *vi)
1191 {
1192         char *q, *r;
1193         int i;
1194         
1195         memset(vi, 0, sizeof(VmIntr));
1196         q = nil;
1197         kstrdup(&q, p);
1198         if(waserror()){
1199                 free(q);
1200                 memset(vi, 0, sizeof(VmIntr));
1201                 nexterror();
1202         }
1203         vi->info = 1<<31;
1204         r = strchr(q, ',');
1205         if(r != nil) *r++ = 0;
1206         for(i = 0; i < nelem(except); i++)
1207                 if(except[i] != nil && strcmp(except[i], q) == 0)
1208                         break;
1209         if(*q == '#'){
1210                 q++;
1211                 vi->info |= 3 << 8;
1212         }
1213         if(i == nelem(except)){
1214                 i = strtoul(q, &q, 10);
1215                 if(*q != 0 || i > 255) error(Ebadctl);
1216         }
1217         vi->info |= i;
1218         if((vi->info & 0x7ff) == 3 || (vi->info & 0x7ff) == 4)
1219                 vi->info += 3 << 8;
1220         if(r == nil) goto out;
1221         if(*r != ','){
1222                 vi->code = strtoul(r, &r, 0);
1223                 vi->info |= 1<<11;
1224         }else r++;
1225         if(*r == ',')
1226                 vi->ilen = strtoul(r + 1, &r, 0);
1227         if(*r != 0) error(Ebadctl);
1228 out:
1229         poperror();
1230         free(q);
1231 }
1232
1233 static int
1234 cmdexcept(VmCmd *cp, va_list va)
1235 {
1236         if(cp->scratched) error(Eintr);
1237         if((vmx.onentry & POSTEX) != 0){
1238                 cp->flags |= CMDFPOSTP;
1239                 return 0;
1240         }
1241         eventparse(va_arg(va, char *), &vmx.exc);
1242         vmx.onentry |= POSTEX;
1243         return 0;
1244 }
1245
1246 static int
1247 cmdirq(VmCmd *, va_list va)
1248 {
1249         char *p;
1250         VmIntr vi;
1251         
1252         p = va_arg(va, char *);
1253         if(p == nil)
1254                 vmx.onentry &= ~POSTIRQ;
1255         else{
1256                 eventparse(p, &vi);
1257                 vmx.irq = vi;
1258                 vmx.onentry |= POSTIRQ;
1259         }
1260         return 0;
1261 }
1262
1263
1264 static int
1265 gotcmd(void *)
1266 {
1267         int rc;
1268
1269         ilock(&vmx.cmdlock);
1270         rc = vmx.firstcmd != nil;
1271         iunlock(&vmx.cmdlock);
1272         return rc;
1273 }
1274
1275 static void
1276 markcmddone(VmCmd *p, VmCmd ***pp)
1277 {
1278         if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP){
1279                 **pp = p;
1280                 *pp = &p->next;
1281         }else{
1282                 p->flags = p->flags & ~CMDFPOSTP;
1283                 cmdrelease(p, 0);
1284         }
1285 }
1286
1287 static VmCmd **
1288 markppcmddone(VmCmd **pp)
1289 {
1290         VmCmd *p;
1291         
1292         p = *pp;
1293         if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP)
1294                 return &p->next;
1295         *pp = p->next;
1296         p->next = nil;
1297         p->flags = p->flags & ~CMDFPOSTP;
1298         cmdrelease(p, 0);
1299         return pp;
1300 }
1301
1302
1303 static void
1304 runcmd(void)
1305 {
1306         VmCmd *p, **pp;
1307         
1308         for(pp = &vmx.postponed; p = *pp, p != nil; ){
1309                 if(waserror()){
1310                         kstrcpy(p->errstr, up->errstr, ERRMAX);
1311                         p->flags |= CMDFFAIL;
1312                         pp = markppcmddone(pp);
1313                         continue;
1314                 }
1315                 p->flags &= ~CMDFPOSTP;
1316                 p->retval = p->cmd(p, p->va);
1317                 poperror();
1318                 pp = markppcmddone(pp);
1319         }
1320         for(;;){
1321                 ilock(&vmx.cmdlock);
1322                 p = vmx.firstcmd;
1323                 if(p == nil){
1324                         iunlock(&vmx.cmdlock);
1325                         break;
1326                 }
1327                 vmx.firstcmd = p->next;
1328                 if(vmx.lastcmd == &p->next)
1329                         vmx.lastcmd = &vmx.firstcmd;
1330                 iunlock(&vmx.cmdlock);
1331                 p->next = nil;
1332                 if(waserror()){
1333                         kstrcpy(p->errstr, up->errstr, ERRMAX);
1334                         p->flags |= CMDFFAIL;
1335                         markcmddone(p, &pp);
1336                         continue;
1337                 }
1338                 if(p->scratched) error(Eintr);
1339                 p->retval = p->cmd(p, p->va);
1340                 poperror();
1341                 markcmddone(p, &pp);
1342         }
1343 }
1344
1345 static void
1346 dostep(int setup)
1347 {
1348         static uvlong oldmap;
1349         static uvlong *mapptr;
1350
1351         if(setup){
1352                 if(vmx.stepmap != nil){
1353                         mapptr = eptwalk(vmx.stepmap->lo);
1354                         oldmap = *mapptr;
1355                         epttranslate(vmx.stepmap);
1356                 }
1357         }else{
1358                 vmcswrite(PROCB_CTLS, vmcsread(PROCB_CTLS) & ~(uvlong)PROCB_MONTRAP);
1359                 if(vmx.stepmap != nil){
1360                         *mapptr = oldmap;
1361                         vmx.stepmap = nil;
1362                         vmx.onentry |= FLUSHEPT;
1363                 }
1364         }
1365 }
1366
1367 static void
1368 vmxproc(void *)
1369 {
1370         int init;
1371         u32int procbctls, defprocbctls;
1372
1373         procwired(up, 0);
1374         sched();
1375         init = 0;
1376         defprocbctls = 0;
1377         while(waserror()){
1378                 kstrcpy(vmx.errstr, up->errstr, ERRMAX);
1379                 vmx.state = VMXDEAD;
1380         }
1381         for(;;){
1382                 if(!init){
1383                         init = 1;
1384                         vmxstart();
1385                         vmx.state = VMXREADY;
1386                         defprocbctls = vmcsread(PROCB_CTLS);
1387                 }
1388                 runcmd();
1389                 if(vmx.state == VMXRUNNING){
1390                         procbctls = defprocbctls;
1391                         if((vmx.onentry & STEP) != 0){
1392                                 procbctls |= PROCB_MONTRAP;
1393                                 dostep(1);
1394                                 if(waserror()){
1395                                         dostep(0);
1396                                         nexterror();
1397                                 }
1398                         }
1399                         if((vmx.onentry & POSTEX) != 0){
1400                                 vmcswrite(VMENTRY_INTRINFO, vmx.exc.info);
1401                                 vmcswrite(VMENTRY_INTRCODE, vmx.exc.code);
1402                                 vmcswrite(VMENTRY_INTRILEN, vmx.exc.ilen);
1403                                 vmx.onentry &= ~POSTEX;
1404                         }
1405                         if((vmx.onentry & POSTIRQ) != 0 && (vmx.onentry & STEP) == 0){
1406                                 if((vmx.onentry & POSTEX) == 0 && (vmcsread(GUEST_RFLAGS) & 1<<9) != 0 && (vmcsread(GUEST_CANINTR) & 3) == 0){
1407                                         vmcswrite(VMENTRY_INTRINFO, vmx.irq.info);
1408                                         vmcswrite(VMENTRY_INTRCODE, vmx.irq.code);
1409                                         vmcswrite(VMENTRY_INTRILEN, vmx.irq.ilen);
1410                                         vmx.onentry &= ~POSTIRQ;
1411                                         vmx.got |= GOTIRQACK;
1412                                         vmx.irqack = vmx.irq;
1413                                 }else
1414                                         procbctls |= PROCB_IRQWIN;
1415                         }
1416                         if((vmx.onentry & FLUSHVPID) != 0){
1417                                 if(invvpid(INVLOCAL, vmx.vpid, 0) < 0)
1418                                         error("invvpid failed");
1419                                 vmx.onentry &= ~FLUSHVPID;
1420                         }
1421                         if((vmx.onentry & FLUSHEPT) != 0){
1422                                 if(invept(INVLOCAL, PADDR(vmx.pml4) | 3<<3, 0) < 0)
1423                                         error("invept failed");
1424                                 vmx.onentry &= ~FLUSHEPT;
1425                         }
1426                         vmcswrite(PROCB_CTLS, procbctls);
1427                         vmx.got &= ~GOTEXIT;
1428                         if(vmlaunch(&vmx.ureg, vmx.launched, vmx.fp) < 0)
1429                                 error("vmlaunch failed");
1430                         vmx.launched = 1;
1431                         if((vmx.onentry & STEP) != 0){
1432                                 dostep(0);
1433                                 poperror();
1434                         }
1435                         processexit();
1436                 }else{
1437                         up->psstate = "Idle";
1438                         sleep(&vmx.cmdwait, gotcmd, nil);
1439                         up->psstate = nil;
1440                 }
1441         }
1442 }
1443
/* qid paths of the files served by the device; see vmxdir */
enum {
	Qdir	= 0,
	Qctl	= 1,
	Qregs	= 2,
	Qstatus	= 3,
	Qmap	= 4,
	Qwait	= 5,
	Qfpregs	= 6,
};
1453
/*
 * Directory table handed to devwalk/devstat/devopen/devdirread.
 * Each row lists a file name, its qid (the first member is the
 * Q* constant used to dispatch in vmxread/vmxwrite), a length of 0
 * and the permission bits.
 */
1454 static Dirtab vmxdir[] = {
1455         ".",            { Qdir, 0, QTDIR },     0,              0550,
1456         "ctl",          { Qctl, 0, 0 },         0,              0660,
1457         "regs",         { Qregs, 0, 0 },        0,              0660,
1458         "status",       { Qstatus, 0, 0 },      0,              0440,
1459         "map",          { Qmap, 0, 0 },         0,              0660,
1460         "wait",         { Qwait, 0, 0 },        0,              0440,
1461         "fpregs",       { Qfpregs, 0, 0 },      0,              0660,
1462 };
1463
/* indices of the ctl messages; see vmxctlmsg and vmxwrite */
enum {
	CMinit	= 0,
	CMquit	= 1,
	CMgo	= 2,
	CMstop	= 3,
	CMstep	= 4,
	CMexc	= 5,
	CMirq	= 6,
};
1473
/*
 * Messages accepted on the ctl file, looked up with lookupcmd in
 * vmxwrite.  Columns: CM* index, message word, field count.
 * NOTE(review): the third column is presumably the exact number of
 * fields lookupcmd requires, with 0 meaning "not checked" (vmxwrite
 * validates the count itself for go, step and irq) - confirm against
 * the Cmdtab definition.
 */
1474 static Cmdtab vmxctlmsg[] = {
1475         CMinit,         "init",         1,
1476         CMquit,         "quit",         1,
1477         CMgo,           "go",           0,
1478         CMstop,         "stop",         1,
1479         CMstep,         "step",         0,
1480         CMexc,          "exc",          2,
1481         CMirq,          "irq",          0,
1482 };
1483
1484 static int
1485 iscmddone(void *cp)
1486 {
1487         return (((VmCmd*)cp)->flags & CMDFDONE) != 0;
1488 }
1489
1490 static int
1491 vmxcmd(int (*f)(VmCmd *, va_list), ...)
1492 {
1493         VmCmd cmd;
1494         
1495         if(vmx.state == VMXINACTIVE)
1496                 error("no VM");
1497         if(vmx.state == VMXENDING)
1498         ending:
1499                 error(Equit);
1500         memset(&cmd, 0, sizeof(VmCmd));
1501         cmd.errstr = up->errstr;
1502         cmd.cmd = f;
1503         va_start(cmd.va, f);
1504          
1505         ilock(&vmx.cmdlock);
1506         if(vmx.state == VMXENDING){
1507                 iunlock(&vmx.cmdlock);
1508                 goto ending;
1509         }
1510         *vmx.lastcmd = &cmd;
1511         vmx.lastcmd = &cmd.next;
1512         iunlock(&vmx.cmdlock);
1513         
1514         while(waserror())
1515                 cmd.scratched = 1;
1516         wakeup(&vmx.cmdwait);
1517         do
1518                 sleep(&cmd, iscmddone, &cmd);
1519         while(!iscmddone(&cmd));
1520         poperror();
1521         lock(&cmd);
1522         unlock(&cmd);
1523         if((cmd.flags & CMDFFAIL) != 0)
1524                 error(up->errstr);
1525         return cmd.retval;
1526 }
1527
1528 static Chan *
1529 vmxattach(char *spec)
1530 {
1531         if(vmx.state == NOVMX) error(Enodev);
1532         return devattach('X', spec);
1533 }
1534
1535 static Walkqid*
1536 vmxwalk(Chan *c, Chan *nc, char **name, int nname)
1537 {
1538         return devwalk(c, nc, name, nname, vmxdir, nelem(vmxdir), devgen);
1539 }
1540
1541 static int
1542 vmxstat(Chan *c, uchar *dp, int n)
1543 {
1544         return devstat(c, dp, n, vmxdir, nelem(vmxdir), devgen);
1545 }
1546
1547 static Chan*
1548 vmxopen(Chan* c, int omode)
1549 {
1550         Chan *ch;
1551
1552         if(c->qid.path != Qdir && !iseve()) error(Eperm);
1553         ch = devopen(c, omode, vmxdir, nelem(vmxdir), devgen);
1554         if(ch->qid.path == Qmap){
1555                 if((omode & OTRUNC) != 0)
1556                         vmxcmd(cmdclearmeminfo);
1557         }
1558         return ch;
1559 }
1560
/*
 * Close hook of the device.  Intentionally empty: this function
 * releases nothing when a channel is closed.
 */
1561 static void
1562 vmxclose(Chan*)
1563 {
1564 }
1565
1566 static long
1567 vmxread(Chan* c, void* a, long n, vlong off)
1568 {
1569         static char regbuf[4096];
1570         static char membuf[4096];
1571         int rc;
1572
1573         switch((ulong)c->qid.path){
1574         case Qdir:
1575                 return devdirread(c, a, n, vmxdir, nelem(vmxdir), devgen);
1576         case Qregs:
1577                 if(off == 0)
1578                         vmxcmd(cmdgetregs, regbuf, regbuf + sizeof(regbuf));
1579                 return readstr(off, a, n, regbuf);
1580         case Qmap:
1581                 if(off == 0)
1582                         vmxcmd(cmdgetmeminfo, membuf, membuf + sizeof(membuf));
1583                 return readstr(off, a, n, membuf);
1584         case Qstatus:
1585                 {
1586                         char buf[ERRMAX+128];
1587                         char errbuf[ERRMAX];
1588                         int status;
1589                         
1590                         status = vmx.state;
1591                         if(status == VMXDEAD){
1592                                 vmxcmd(cmdstatus, errbuf);
1593                                 snprint(buf, sizeof(buf), "%s %#q\n", statenames[status], errbuf);
1594                         }else if(status >= 0 && status < nelem(statenames))
1595                                 snprint(buf, sizeof(buf), "%s\n", statenames[status]);
1596                         else
1597                                 snprint(buf, sizeof(buf), "%d\n", status);
1598                         return readstr(off, a, n, buf);
1599                 }
1600         case Qwait:
1601                 {
1602                         char buf[512];
1603                         
1604                         rc = vmxcmd(cmdwait, buf, buf + sizeof(buf));
1605                         if(rc > n) rc = n;
1606                         if(rc > 0) memmove(a, buf, rc);
1607                         return rc;
1608                 }
1609         case Qfpregs:
1610                 {
1611                         char buf[sizeof(FPsave)];
1612                         
1613                         vmxcmd(cmdgetfpregs, buf);
1614                         if(n < 0 || off < 0 || off >= sizeof(buf)) n = 0;
1615                         else if(off + n > sizeof(buf)) n = sizeof(buf) - off;
1616                         if(n != 0) memmove(a, buf + off, n);
1617                         return n;
1618                 }
1619         default:
1620                 error(Egreg);
1621                 break;
1622         }
1623         return 0;
1624 }
1625
/*
 * Write to one of the device files.
 *
 *	dir	always Eperm
 *	ctl	one control message (see vmxctlmsg); dispatched below
 *	map	text handed to cmdsetmeminfo
 *	regs	text handed to cmdsetregs
 *	fpregs	raw bytes stored into the guest FPsave area at off
 *
 * For map, regs and fpregs the value returned is the command's
 * result; for ctl it is n.  Errors are raised with error().
 */
1626 static long
1627 vmxwrite(Chan* c, void* a, long n, vlong off)
1628 {
1629         static QLock initlock;
1630         Cmdbuf *cb;
1631         Cmdtab *ct;
1632         char *s;
1633         int rc;
1634         int i;
1635         VmMem tmpmem;
1636
1637         switch((ulong)c->qid.path){
1638         case Qdir:
1639                 error(Eperm);
1640         case Qctl:
1641                 cb = parsecmd(a, n);
                /* cb is freed on every error raised while handling the message */
1642                 if(waserror()){
1643                         free(cb);
1644                         nexterror();
1645                 }
1646                 ct = lookupcmd(cb, vmxctlmsg, nelem(vmxctlmsg));
1647                 switch(ct->index){
1648                 case CMinit:
                        /*
                         * initlock serializes the state check and the
                         * creation of the VM process.
                         */
1649                         qlock(&initlock);
1650                         if(waserror()){
1651                                 qunlock(&initlock);
1652                                 nexterror();
1653                         }
1654                         if(vmx.state != VMXINACTIVE)
1655                                 error("vmx already active");
1656                         vmx.state = VMXINIT;
1657                         kproc("kvmx", vmxproc, nil);
1658                         poperror();
1659                         qunlock(&initlock);
                        /*
                         * The first command is only answered once the new
                         * process runs its command loop; a VMXDEAD state is
                         * reported with the error string it recorded.
                         */
1660                         if(vmxcmd(cmdstatus, up->errstr) == VMXDEAD)
1661                                 error(up->errstr);
1662                         break;
1663                 case CMquit:
1664                         vmxcmd(cmdquit);
1665                         break;
1666                 case CMgo:
                        /* optional second field: register assignments for setregs */
1667                         s = nil;
1668                         if(cb->nf == 2) kstrdup(&s, cb->f[1]);
1669                         else if(cb->nf != 1) error(Ebadarg);
1670                         if(waserror()){
1671                                 free(s);
1672                                 nexterror();
1673                         }
1674                         vmxcmd(cmdgo, s);
1675                         poperror();
1676                         free(s);
1677                         break;
1678                 case CMstop:
1679                         vmxcmd(cmdstop);
1680                         break;
1681                 case CMstep:
                        /*
                         * Only "-map addr segment offset" is accepted as an
                         * option; it describes a one-page temporary mapping.
                         * tmpmem lives on this stack frame and is referenced
                         * through vmx.stepmap until the step command completes.
                         * NOTE(review): no release of the reference obtained
                         * from _globalsegattach is visible in this function,
                         * including on the error paths below - confirm.
                         */
1682                         rc = 0;
1683                         for(i = 1; i < cb->nf; i++)
1684                                 if(strcmp(cb->f[i], "-map") == 0){
1685                                         rc = 1;
1686                                         if(i+4 > cb->nf) error("missing argument");
1687                                         memset(&tmpmem, 0, sizeof(tmpmem));
1688                                         tmpmem.lo = strtoull(cb->f[i+1], &s, 0);
1689                                         if(*s != 0 || !vmokpage(tmpmem.lo)) error("invalid address");
1690                                         tmpmem.hi = tmpmem.lo + BY2PG;
1691                                         tmpmem.attr = 0x407;
1692                                         tmpmem.seg = _globalsegattach(cb->f[i+2]);
1693                                         if(tmpmem.seg == nil) error("unknown segment");
1694                                         tmpmem.off = strtoull(cb->f[i+3], &s, 0);
1695                                         if(*s != 0 || !vmokpage(tmpmem.off)) error("invalid offset");
1696                                         i += 3;
1697                                 }else
1698                                         error(Ebadctl);
1699                         vmxcmd(cmdstep, rc ? &tmpmem : nil);
1700                         break;
1701                 case CMexc:
                        /* the table entry for "exc" asks for 2 fields, so f[1] is used directly */
1702                         s = nil;
1703                         kstrdup(&s, cb->f[1]);
1704                         if(waserror()){
1705                                 free(s);
1706                                 nexterror();
1707                         }
1708                         vmxcmd(cmdexcept, s);
1709                         poperror();
1710                         free(s);
1711                         break;
1712                 case CMirq:
                        /* without an argument s stays nil, which cancels a pending irq */
1713                         s = nil;
1714                         if(cb->nf == 2)
1715                                 kstrdup(&s, cb->f[1]);
1716                         if(waserror()){
1717                                 free(s);
1718                                 nexterror();
1719                         }
1720                         vmxcmd(cmdirq, s);
1721                         poperror();
1722                         free(s);
1723                         break;
1724                 default:
1725                         error(Egreg);
1726                 }
1727                 poperror();
1728                 free(cb);
1729                 break;
1730         case Qmap:
1731         case Qregs:
                /* the parsers modify their input, so work on a terminated copy */
1732                 s = malloc(n+1);
1733                 if(s == nil) error(Enomem);
1734                 if(waserror()){
1735                         free(s);
1736                         nexterror();
1737                 }
1738                 memmove(s, a, n);
1739                 s[n] = 0;
1740                 rc = vmxcmd((ulong)c->qid.path == Qregs ? cmdsetregs : cmdsetmeminfo, s);
1741                 poperror();
1742                 free(s);
1743                 return rc;
1744         case Qfpregs:
1745                 {
1746                         char buf[sizeof(FPsave)];
1747                         
                        /* at most one FPsave worth of data is taken from the caller */
1748                         if(n > sizeof(FPsave)) n = sizeof(FPsave);
1749                         memmove(buf, a, n);
1750                         return vmxcmd(cmdsetfpregs, buf, n, off);
1751                 }
1752         default:
1753                 error(Egreg);
1754                 break;
1755         }
1756         return n;
1757 }
1758
/*
 * Device switch entry for the vmx device: device character 'X',
 * name "vmx", followed by the operation hooks in the positional
 * order of the Dev structure.  Hooks named dev* are the generic
 * defaults; the vmx* ones are defined in this file.
 */
1759 Dev vmxdevtab = {
1760         'X',
1761         "vmx",
1762         
1763         vmxreset,
1764         devinit,
1765         vmxshutdown,
1766         vmxattach,
1767         vmxwalk,
1768         vmxstat,
1769         vmxopen,
1770         devcreate,
1771         vmxclose,
1772         vmxread,
1773         devbread,
1774         vmxwrite,
1775         devbwrite,
1776         devremove,
1777         devwstat,
1778 };