git.lizzy.rs Git - plan9front.git/blob - sys/src/9/pc/devvmx.c
devvmx: fix CR0/CR4 readout; also don't exit on PAUSE instruction
[plan9front.git] / sys / src / 9 / pc / devvmx.c
1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7 #include "ureg.h"
8
9 extern int vmxon(u64int);
10 extern int vmxoff(void);
11 extern int vmclear(u64int);
12 extern int vmptrld(u64int);
13 extern int vmlaunch(Ureg *, int);
14 extern int vmread(u32int, uintptr *);
15 extern int vmwrite(u32int, uintptr);
16 extern int invept(u32int, uvlong, uvlong);
17 extern int invvpid(u32int, uvlong, uvlong);
18
19 static vlong procb_ctls, pinb_ctls;
20
/*
 * VMX constants: capability MSR numbers (used with rdmsr), VMCS field
 * encodings (used with vmcsread/vmcswrite) and the bits stored in the
 * control fields.
 */
enum {
	/* capability MSRs */
	VMX_BASIC_MSR = 0x480,
	VMX_PINB_CTLS_MSR = 0x481,
	VMX_PROCB_CTLS_MSR = 0x482,
	VMX_VMEXIT_CTLS_MSR = 0x483,
	VMX_VMENTRY_CTLS_MSR = 0x484,
	VMX_MISC_MSR = 0x485,
	VMX_CR0_FIXED0 = 0x486,
	VMX_CR0_FIXED1 = 0x487,
	VMX_CR4_FIXED0 = 0x488,
	VMX_CR4_FIXED1 = 0x489,
	VMX_VMCS_ENUM = 0x48A,
	VMX_PROCB_CTLS2_MSR = 0x48B,
	VMX_TRUE_PINB_CTLS_MSR = 0x48D,
	VMX_TRUE_PROCB_CTLS_MSR = 0x48E,
	VMX_TRUE_EXIT_CTLS_MSR = 0x48F,
	VMX_TRUE_ENTRY_CTLS_MSR = 0x490,
	VMX_VMFUNC_MSR = 0x491,

	/* pin-based controls: field and bits */
	PINB_CTLS = 0x4000,
	PINB_EXITIRQ = 1<<0,
	PINB_EXITNMI = 1<<3,

	/* primary processor-based controls: field and bits */
	PROCB_CTLS = 0x4002,
	PROCB_IRQWIN = 1<<2,
	PROCB_EXITHLT = 1<<7,
	PROCB_EXITINVLPG = 1<<9,
	PROCB_EXITMWAIT = 1<<10,
	PROCB_EXITRDPMC = 1<<11,
	PROCB_EXITRDTSC = 1<<12,
	PROCB_EXITCR3LD = 1<<15,
	PROCB_EXITCR3ST = 1<<16,
	PROCB_EXITCR8LD = 1<<19,
	PROCB_EXITCR8ST = 1<<20,
	PROCB_EXITMOVDR = 1<<23,
	PROCB_EXITIO = 1<<24,
	PROCB_MONTRAP = 1<<27,
	PROCB_EXITMONITOR = 1<<29,
	PROCB_EXITPAUSE = 1<<30,
	PROCB_USECTLS2 = 1<<31,

	/* secondary processor-based controls: field and bits */
	PROCB_CTLS2 = 0x401E,
	PROCB_EPT = 1<<1,
	PROCB_EXITGDT = 1<<2,
	PROCB_VPID = 1<<5,
	PROCB_UNRESTR = 1<<7,

	EXC_BITMAP = 0x4004,
	PFAULT_MASK = 0x4006,
	PFAULT_MATCH = 0x4008,
	CR3_TARGCNT = 0x400a,

	/* VM-exit controls: field and bits */
	VMEXIT_CTLS = 0x400c,
	VMEXIT_ST_DEBUG = 1<<2,
	VMEXIT_HOST64 = 1<<9,
	VMEXIT_LD_IA32_PERF_GLOBAL_CTRL = 1<<12,
	VMEXIT_ST_IA32_PAT = 1<<18,
	VMEXIT_LD_IA32_PAT = 1<<19,
	VMEXIT_ST_IA32_EFER = 1<<20,
	VMEXIT_LD_IA32_EFER = 1<<21,

	VMEXIT_MSRSTCNT = 0x400e,
	VMEXIT_MSRLDCNT = 0x4010,

	/* VM-entry controls: field and bits */
	VMENTRY_CTLS = 0x4012,
	VMENTRY_LD_DEBUG = 1<<2,
	VMENTRY_GUEST64 = 1<<9,
	VMENTRY_LD_IA32_PERF_GLOBAL_CTRL = 1<<13,
	VMENTRY_LD_IA32_PAT = 1<<14,
	VMENTRY_LD_IA32_EFER = 1<<15,

	VMENTRY_MSRLDCNT = 0x4014,
	VMENTRY_INTRINFO = 0x4016,
	VMENTRY_INTRCODE = 0x4018,
	VMENTRY_INTRILEN = 0x401a,

	VMCS_LINK = 0x2800,

	/* guest state fields */
	GUEST_ES = 0x800,
	GUEST_CS = 0x802,
	GUEST_SS = 0x804,
	GUEST_DS = 0x806,
	GUEST_FS = 0x808,
	GUEST_GS = 0x80A,
	GUEST_LDTR = 0x80C,
	GUEST_TR = 0x80E,
	GUEST_CR0 = 0x6800,
	GUEST_CR3 = 0x6802,
	GUEST_CR4 = 0x6804,
	GUEST_ESLIMIT = 0x4800,
	GUEST_CSLIMIT = 0x4802,
	GUEST_SSLIMIT = 0x4804,
	GUEST_DSLIMIT = 0x4806,
	GUEST_FSLIMIT = 0x4808,
	GUEST_GSLIMIT = 0x480A,
	GUEST_LDTRLIMIT = 0x480C,
	GUEST_TRLIMIT = 0x480E,
	GUEST_GDTRLIMIT = 0x4810,
	GUEST_IDTRLIMIT = 0x4812,
	GUEST_ESPERM = 0x4814,
	GUEST_CSPERM = 0x4816,
	GUEST_SSPERM = 0x4818,
	GUEST_DSPERM = 0x481A,
	GUEST_FSPERM = 0x481C,
	GUEST_GSPERM = 0x481E,
	GUEST_LDTRPERM = 0x4820,
	GUEST_TRPERM = 0x4822,
	GUEST_CR0MASK = 0x6000,
	GUEST_CR4MASK = 0x6002,
	GUEST_CR0SHADOW = 0x6004,
	GUEST_CR4SHADOW = 0x6006,
	GUEST_ESBASE = 0x6806,
	GUEST_CSBASE = 0x6808,
	GUEST_SSBASE = 0x680A,
	GUEST_DSBASE = 0x680C,
	GUEST_FSBASE = 0x680E,
	GUEST_GSBASE = 0x6810,
	GUEST_LDTRBASE = 0x6812,
	GUEST_TRBASE = 0x6814,
	GUEST_GDTRBASE = 0x6816,
	GUEST_IDTRBASE = 0x6818,
	GUEST_DR7 = 0x681A,
	GUEST_RSP = 0x681C,
	GUEST_RIP = 0x681E,
	GUEST_RFLAGS = 0x6820,
	GUEST_IA32_DEBUGCTL = 0x2802,
	GUEST_IA32_PAT = 0x2804,
	GUEST_IA32_EFER = 0x2806,
	GUEST_IA32_PERF_GLOBAL_CTRL = 0x2808,

	/* host state fields */
	HOST_ES = 0xC00,
	HOST_CS = 0xC02,
	HOST_SS = 0xC04,
	HOST_DS = 0xC06,
	HOST_FS = 0xC08,
	HOST_GS = 0xC0A,
	HOST_TR = 0xC0C,
	HOST_CR0 = 0x6C00,
	HOST_CR3 = 0x6C02,
	HOST_CR4 = 0x6C04,
	HOST_FSBASE = 0x6C06,
	HOST_GSBASE = 0x6C08,
	HOST_TRBASE = 0x6C0A,
	HOST_GDTR = 0x6C0C,
	HOST_IDTR = 0x6C0E,
	HOST_RSP = 0x6C14,
	HOST_RIP = 0x6C16,
	HOST_IA32_PAT = 0x2C00,
	HOST_IA32_EFER = 0x2C02,
	HOST_IA32_PERF_GLOBAL_CTRL = 0x2C04,

	GUEST_CANINTR = 0x4824,

	/* exit information fields */
	VM_INSTRERR = 0x4400,
	VM_EXREASON = 0x4402,
	VM_EXINTRINFO = 0x4404,
	VM_EXINTRCODE = 0x4406,
	VM_IDTVECINFO = 0x4408,
	VM_IDTVECCODE = 0x440A,
	VM_EXINSTRLEN = 0x440C,
	VM_EXINSTRINFO = 0x440E,
	VM_EXQUALIF = 0x6400,
	VM_IORCX = 0x6402,
	VM_IORSI = 0x6404,
	VM_IORDI = 0x6406,
	VM_IORIP = 0x6408,
	VM_GUESTVA = 0x640A,
	VM_GUESTPA = 0x2400,

	VM_VPID = 0x000,
	VM_EPTPIDX = 0x0004,

	VM_EPTP = 0x201A,
	VM_EPTPLA = 0x2024,

	INVLOCAL = 1,
};
198
199 enum {
200         CR0RSVD = 0x1ffaffc0,
201         CR4RSVD = 0xff889000,
202         CR4MCE = 1<<6,
203         CR4VMXE = 1<<13,
204         CR4SMXE = 1<<14,
205         CR4PKE = 1<<22,
206         
207         CR0KERNEL = CR0RSVD | (uintptr)0xFFFFFFFF00000000ULL,
208         CR4KERNEL = CR4RSVD | CR4VMXE | CR4SMXE | CR4MCE | CR4PKE | (uintptr)0xFFFFFFFF00000000ULL
209 };
210
211 typedef struct Vmx Vmx;
212 typedef struct VmCmd VmCmd;
213 typedef struct VmMem VmMem;
214 typedef struct VmIntr VmIntr;
215
216 struct VmMem {
217         uvlong lo, hi;
218         Segment *seg;
219         uintptr off;
220         VmMem *next, *prev;
221         u16int attr;
222 };
223
224 struct VmIntr {
225         u32int info, code, ilen;
226 };
227
228 struct Vmx {
229         enum {
230                 NOVMX,
231                 VMXINACTIVE,
232                 VMXINIT,
233                 VMXREADY,
234                 VMXRUNNING,
235                 VMXDEAD,
236                 VMXENDING,
237         } state;
238         char errstr[ERRMAX];
239         Ureg ureg;
240         uintptr dr[8]; /* DR7 is also kept in VMCS */
241         FPsave *fp;
242         u8int launched;
243         u8int vpid;
244         enum {
245                 FLUSHVPID = 1,
246                 FLUSHEPT = 2,
247                 STEP = 4,
248                 POSTEX = 8,
249                 POSTIRQ = 16,
250         } onentry;
251         
252         Rendez cmdwait;
253         Lock cmdlock;
254         VmCmd *firstcmd, **lastcmd;
255         VmCmd *postponed;
256         uvlong *pml4;
257         VmMem mem;
258         
259         enum {
260                 GOTEXIT = 1,
261                 GOTIRQACK = 2,
262                 GOTSTEP = 4,
263                 GOTSTEPERR = 8,
264         } got;
265         VmMem *stepmap;
266         VmIntr exc, irq, irqack;
267 };
268
269 struct VmCmd {
270         enum {
271                 CMDFDONE = 1,
272                 CMDFFAIL = 2,
273                 CMDFPOSTP = 4,
274         } flags;
275         u8int scratched;
276         Rendez;
277         Lock;
278         int (*cmd)(VmCmd *, va_list);
279         int retval;
280         char *errstr;
281         va_list va;
282         VmCmd *next;
283 };
284
285 static char Equit[] = "vmx: ending";
286
287 static char *statenames[] = {
288         [NOVMX] "novmx",
289         [VMXINACTIVE] "inactive",
290         [VMXINIT] "init",
291         [VMXREADY] "ready",
292         [VMXRUNNING] "running",
293         [VMXDEAD] "dead",
294         [VMXENDING]"ending"
295 };
296
297 static Vmx vmx;
298
299 static u64int
300 vmcsread(u32int addr)
301 {
302         int rc;
303         u64int val;
304
305         val = 0;
306         rc = vmread(addr, (uintptr *) &val);
307         if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000)
308                 rc = vmread(addr | 1, (uintptr *) &val + 1);
309         if(rc < 0){
310                 char errbuf[128];
311                 snprint(errbuf, sizeof(errbuf), "vmcsread failed (%#.4ux)", addr);
312                 error(errbuf);
313         }
314         return val;
315 }
316
317 static void
318 vmcswrite(u32int addr, u64int val)
319 {
320         int rc;
321         
322         rc = vmwrite(addr, val);
323         if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000)
324                 rc = vmwrite(addr | 1, val >> 32);
325         if(rc < 0){
326                 char errbuf[128];
327                 snprint(errbuf, sizeof(errbuf), "vmcswrite failed (%#.4ux = %#.16ullx)", addr, val);
328                 error(errbuf);
329         }
330 }
331
332 static uvlong
333 parseval(char *s, int sz)
334 {
335         uvlong v;
336         char *p;
337         
338         if(sz == 0) sz = sizeof(uintptr);
339         v = strtoull(s, &p, 0);
340         if(p == s || *p != 0 || v >> sz * 8 != 0) error("invalid value");
341         return v;
342 }
343
344 static char *
345 cr0fakeread(char *p, char *e)
346 {
347         uvlong guest, mask, shadow;
348         
349         guest = vmcsread(GUEST_CR0);
350         mask = vmcsread(GUEST_CR0MASK);
351         shadow = vmcsread(GUEST_CR0SHADOW);
352         return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & ~mask | shadow & mask);
353 }
354
355 static char *
356 cr4fakeread(char *p, char *e)
357 {
358         uvlong guest, mask, shadow;
359         
360         guest = vmcsread(GUEST_CR4);
361         mask = vmcsread(GUEST_CR4MASK);
362         shadow = vmcsread(GUEST_CR4SHADOW);
363         return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & ~mask | shadow & mask);
364 }
365
366 static int
367 cr0realwrite(char *s)
368 {
369         uvlong v;
370         
371         v = parseval(s, 8);
372         vmcswrite(GUEST_CR0, vmcsread(GUEST_CR0) & CR0KERNEL | v & ~CR0KERNEL);
373         return 0;
374 }
375
376 static int
377 cr0maskwrite(char *s)
378 {
379         uvlong v;
380         
381         v = parseval(s, 8);
382         vmcswrite(GUEST_CR0MASK, vmcsread(GUEST_CR0MASK) | CR0KERNEL);
383         return 0;
384 }
385
386 static int
387 cr4realwrite(char *s)
388 {
389         uvlong v;
390         
391         v = parseval(s, 8);
392         vmcswrite(GUEST_CR4, vmcsread(GUEST_CR4) & CR4KERNEL | v & ~CR4KERNEL);
393         return 0;
394 }
395
396 static int
397 cr4maskwrite(char *s)
398 {
399         uvlong v;
400         
401         v = parseval(s, 8);
402         vmcswrite(GUEST_CR4MASK, vmcsread(GUEST_CR4MASK) | CR4KERNEL);
403         return 0;
404 }
405
406 static int
407 dr7write(char *s)
408 {
409         uvlong v;
410         
411         v = (u32int) parseval(s, 8);
412         vmcswrite(GUEST_DR7, vmx.dr[7] = (u32int) v);
413         return 0;
414 }
415
/* Write handler for registers that may not be written: ignores its argument and returns -1. */
static int
readonly(char *)
{
	return -1;
}
421
422 static int
423 dr6write(char *s)
424 {
425         uvlong v;
426         
427         v = parseval(s, 8);
428         vmx.dr[6] = (u32int) v;
429         return 0;
430 }
431
432 typedef struct GuestReg GuestReg;
433 struct GuestReg {
434         int offset;
435         u8int size; /* in bytes; 0 means == uintptr */
436         char *name;
437         char *(*read)(char *, char *);
438         int (*write)(char *);
439 };
/*
 * VMXVAR(x) is the bitwise complement of the byte offset of field x
 * inside Vmx; UREG(x) does the same for a field of vmx.ureg.
 * NOTE(review): the complement presumably lets table consumers tell
 * these entries from VMCS field encodings; confirm against the code
 * that decodes GuestReg.offset.
 */
#define VMXVAR(x) (~(ulong)&(((Vmx*)0)->x))
#define UREG(x) VMXVAR(ureg.x)
442 static GuestReg guestregs[] = {
443         {GUEST_RIP, 0, "pc"},
444         {GUEST_RSP, 0, "sp"},
445         {GUEST_RFLAGS, 0, "flags"},
446         {UREG(ax), 0, "ax"},
447         {UREG(bx), 0, "bx"},
448         {UREG(cx), 0, "cx"},
449         {UREG(dx), 0, "dx"},
450         {UREG(bp), 0, "bp"},
451         {UREG(si), 0, "si"},
452         {UREG(di), 0, "di"},
453         {GUEST_GDTRBASE, 0, "gdtrbase"},
454         {GUEST_GDTRLIMIT, 4, "gdtrlimit"},
455         {GUEST_IDTRBASE, 0, "idtrbase"},
456         {GUEST_IDTRLIMIT, 4, "idtrlimit"},
457         {GUEST_CS, 2, "cs"},
458         {GUEST_CSBASE, 0, "csbase"},
459         {GUEST_CSLIMIT, 4, "cslimit"},
460         {GUEST_CSPERM, 4, "csperm"},
461         {GUEST_DS, 2, "ds"},
462         {GUEST_DSBASE, 0, "dsbase"},
463         {GUEST_DSLIMIT, 4, "dslimit"},
464         {GUEST_DSPERM, 4, "dsperm"},
465         {GUEST_ES, 2, "es"},
466         {GUEST_ESBASE, 0, "esbase"},
467         {GUEST_ESLIMIT, 4, "eslimit"},
468         {GUEST_ESPERM, 4, "esperm"},
469         {GUEST_FS, 2, "fs"},
470         {GUEST_FSBASE, 0, "fsbase"},
471         {GUEST_FSLIMIT, 4, "fslimit"},
472         {GUEST_FSPERM, 4, "fsperm"},
473         {GUEST_GS, 2, "gs"},
474         {GUEST_GSBASE, 0, "gsbase"},
475         {GUEST_GSLIMIT, 4, "gslimit"},
476         {GUEST_GSPERM, 4, "gsperm"},
477         {GUEST_SS, 2, "ss"},
478         {GUEST_SSBASE, 0, "ssbase"},
479         {GUEST_SSLIMIT, 4, "sslimit"},
480         {GUEST_SSPERM, 4, "ssperm"},
481         {GUEST_TR, 2, "tr"},
482         {GUEST_TRBASE, 0, "trbase"},
483         {GUEST_TRLIMIT, 4, "trlimit"},
484         {GUEST_TRPERM, 4, "trperm"},
485         {GUEST_LDTR, 2, "ldtr"},
486         {GUEST_LDTRBASE, 0, "ldtrbase"},
487         {GUEST_LDTRLIMIT, 4, "ldtrlimit"},
488         {GUEST_LDTRPERM, 4, "ldtrperm"},
489         {GUEST_CR0, 0, "cr0real", nil, cr0realwrite},
490         {GUEST_CR0SHADOW, 0, "cr0fake", cr0fakeread},
491         {GUEST_CR0MASK, 0, "cr0mask", nil, cr0maskwrite},
492         {UREG(trap), 0, "cr2"},
493         {GUEST_CR3, 0, "cr3"},
494         {GUEST_CR4, 0, "cr4real", nil, cr4realwrite},
495         {GUEST_CR4SHADOW, 0, "cr4fake", cr4fakeread},
496         {GUEST_CR4MASK, 0, "cr4mask", nil, cr4maskwrite},
497         {GUEST_IA32_PAT, 8, "pat"},
498         {GUEST_IA32_EFER, 8, "efer"},
499         {VMXVAR(dr[0]), 0, "dr0"},
500         {VMXVAR(dr[1]), 0, "dr1"},
501         {VMXVAR(dr[2]), 0, "dr2"},
502         {VMXVAR(dr[3]), 0, "dr3"},
503         {VMXVAR(dr[6]), 0, "dr6", nil, dr6write},
504         {GUEST_DR7, 0, "dr7", nil, dr7write},
505         {VM_INSTRERR, 4, "instructionerror", nil, readonly},
506         {VM_EXREASON, 4, "exitreason", nil, readonly},
507         {VM_EXQUALIF, 0, "exitqualification", nil, readonly},
508         {VM_EXINTRINFO, 4, "exitinterruptinfo", nil, readonly},
509         {VM_EXINTRCODE, 4, "exitinterruptcode", nil, readonly},
510         {VM_EXINSTRLEN, 4, "exitinstructionlen", nil, readonly},
511         {VM_EXINSTRINFO, 4, "exitinstructioninfo", nil, readonly},
512         {VM_GUESTVA, 0, "exitva", nil, readonly},
513         {VM_GUESTPA, 0, "exitpa", nil, readonly},
514         {VM_IDTVECINFO, 4, "idtinterruptinfo", nil, readonly},
515         {VM_IDTVECCODE, 4, "idtinterruptcode", nil, readonly},
516 };
517
518 static int
519 vmokpage(u64int addr)
520 {
521         return (addr & 0xfff) == 0 && addr >> 48 == 0;
522 }
523
524 static uvlong *
525 eptwalk(uvlong addr)
526 {
527         uvlong *tab, *nt;
528         uvlong v;
529         int i;
530         
531         tab = vmx.pml4;
532         for(i = 3; i >= 1; i--){
533                 tab += addr >> 12 + 9 * i & 0x1ff;
534                 v = *tab;
535                 if((v & 3) == 0){
536                         nt = mallocalign(BY2PG, BY2PG, 0, 0);
537                         if(nt == nil) error(Enomem);
538                         memset(nt, 0, BY2PG);
539                         v = PADDR(nt) | 0x407;
540                         *tab = v;
541                 }
542                 tab = KADDR(v & ~0xfff);
543         }
544         return tab + (addr >> 12 & 0x1ff);
545 }
546
547 static void
548 eptfree(uvlong *tab, int level)
549 {
550         int i;
551         uvlong v, *t;
552         
553         if(level < 3){
554                 for(i = 0; i < 512; i++){
555                         v = tab[i];
556                         if((v & 3) == 0) continue;
557                         t = KADDR(v & ~0xfff);
558                         eptfree(t, level + 1);
559                         tab[i] = 0;
560                 }
561         }
562         if(level > 0)
563                 free(tab);              
564 }
565
566 static void
567 epttranslate(VmMem *mp)
568 {
569         uvlong p, hpa;
570
571         if(mp->seg != nil && (mp->seg->type & SG_TYPE) != SG_FIXED || (mp->lo & 0xfff) != 0 || (mp->hi & 0xfff) != 0 || (uint)mp->attr >= 0x1000)
572                 error(Egreg);
573         if(mp->seg != nil){
574                 if(mp->seg->base + mp->off + (mp->hi - mp->lo) > mp->seg->top)
575                         error(Egreg);
576                 hpa = mp->seg->map[0]->pages[0]->pa + mp->off;
577         }else
578                 hpa = 0;
579         for(p = mp->lo; p < mp->hi; p += BY2PG)
580                 *eptwalk(p) = hpa + (p - mp->lo) + mp->attr;
581         vmx.onentry |= FLUSHEPT;
582 }
583
584 static char *mtype[] = {"uc", "wc", "02", "03", "wt", "wp", "wb", "07"};
585
586 static int
587 cmdgetmeminfo(VmCmd *, va_list va)
588 {
589         VmMem *mp;
590         char *p0, *e, *p;
591         char attr[4];
592         char mt[4];
593         
594         p0 = va_arg(va, char *);
595         e = va_arg(va, char *);
596         p = p0;
597         for(mp = vmx.mem.next; mp != &vmx.mem; mp = mp->next){
598                 attr[0] = (mp->attr & 1) != 0 ? 'r' : '-';
599                 attr[1] = (mp->attr & 2) != 0 ? 'w' : '-';
600                 attr[2] = (mp->attr & 4) != 0 ? 'x' : '-';
601                 attr[3] = 0;
602                 *(ushort*)mt = *(u16int*)mtype[mp->attr >> 3 & 7];
603                 mt[2] = (mp->attr & 0x40) != 0 ? '!' : 0;
604                 mt[3] = 0;
605                 p = seprint(p, e, "%s %s %#llux %#llux %p %#llux\n", attr, mt, mp->lo, mp->hi, mp->seg, (uvlong)mp->off);
606         }
607         return p - p0;
608 }
609
610 static int
611 cmdclearmeminfo(VmCmd *, va_list)
612 {
613         VmMem *mp, *mn;
614         
615         eptfree(vmx.pml4, 0);
616         for(mp = vmx.mem.next; mp != &vmx.mem; mp = mn){
617                 mn = mp->next;
618                 free(mp);
619         }
620         vmx.mem.prev = &vmx.mem;
621         vmx.mem.next = &vmx.mem;
622         vmx.onentry |= FLUSHEPT;
623         return 0;
624 }
625
626 extern Segment* (*_globalsegattach)(char*);
627
628 static int
629 cmdsetmeminfo(VmCmd *, va_list va)
630 {
631         char *p0, *p, *q, *r;
632         int j;
633         char *f[10];
634         VmMem *mp;
635         int rc;
636         
637         p0 = va_arg(va, char *);
638         p = p0;
639         mp = nil;
640         for(;;){
641                 q = strchr(p, '\n');
642                 if(q == 0) break;
643                 *q = 0;
644                 if(mp == nil)
645                         mp = malloc(sizeof(VmMem));
646                 if(waserror()){
647                         free(mp);
648                         nexterror();
649                 }
650                 rc = tokenize(p, f, nelem(f));
651                 p = q + 1;
652                 if(rc == 0) goto next;
653                 if(rc != 4 && rc != 6) error("number of fields wrong");
654                 memset(mp, 0, sizeof(VmMem));
655                 for(q = f[0]; *q != 0; q++)
656                         switch(*q){
657                         case 'r': if((mp->attr & 1) != 0) goto tinval; mp->attr |= 1; break;
658                         case 'w': if((mp->attr & 2) != 0) goto tinval; mp->attr |= 2; break;
659                         case 'x': if((mp->attr & 4) != 0) goto tinval; mp->attr |= 0x404; break;
660                         case '-': break;
661                         default: tinval: error("invalid access field");
662                         }
663                 for(j = 0; j < 8; j++)
664                         if(strncmp(mtype[j], f[1], 2) == 0){
665                                 mp->attr |= j << 3;
666                                 break;
667                         }
668                 if(j == 8 || strlen(f[1]) > 3) error("invalid memory type");
669                 if(f[1][2] == '!') mp->attr |= 0x40;
670                 else if(f[1][2] != 0) error("invalid memory type");
671                 mp->lo = strtoull(f[2], &r, 0);
672                 if(*r != 0 || !vmokpage(mp->lo)) error("invalid low guest physical address");
673                 mp->hi = strtoull(f[3], &r, 0);
674                 if(*r != 0 || !vmokpage(mp->hi) || mp->hi <= mp->lo) error("invalid high guest physical address");
675                 mp->off = strtoull(f[5], &r, 0);
676                 if(*r != 0 || !vmokpage(mp->off)) error("invalid offset");
677                 if((mp->attr & 7) != 0){
678                         if(rc != 6) error("number of fields wrong");
679                         mp->seg = _globalsegattach(f[4]);
680                         if(mp->seg == nil) error("no such segment");
681                         if(mp->seg->base + mp->off + (mp->hi - mp->lo) > mp->seg->top) error("out of bounds");
682                 }
683                 epttranslate(mp);
684                 mp->prev = vmx.mem.prev;
685                 mp->next = &vmx.mem;
686                 mp->prev->next = mp;
687                 mp->next->prev = mp;
688                 mp = nil;
689         next:
690                 poperror();
691         }
692         free(mp);
693         return p - p0;
694 }
695
696 static void
697 vmxreset(void)
698 {
699         ulong regs[4];
700         vlong msr;
701
702         cpuid(1, regs);
703         if((regs[2] & 1<<5) == 0) return;
704         /* check if disabled by BIOS */
705         if(rdmsr(0x3a, &msr) < 0) return;
706         if((msr & 5) != 5){
707                 if((msr & 1) == 0){ /* msr still unlocked */
708                         wrmsr(0x3a, msr | 5);
709                         if(rdmsr(0x3a, &msr) < 0)
710                                 return;
711                 }
712                 if((msr & 5) != 5)
713                         return;
714         }
715         if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) return;
716         if((vlong)msr >= 0) return;
717         if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) return;
718         if((msr >> 32 & PROCB_EPT) == 0 || (msr >> 32 & PROCB_VPID) == 0) return;
719         vmx.state = VMXINACTIVE;
720         vmx.lastcmd = &vmx.firstcmd;
721         vmx.mem.next = &vmx.mem;
722         vmx.mem.prev = &vmx.mem;
723 }
724
725 static void
726 vmxshutdown(void)
727 {
728         if(vmx.state != NOVMX && vmx.state != VMXINACTIVE)
729                 vmxoff();
730 }
731
732 static void
733 vmcsinit(void)
734 {
735         vlong msr;
736         u32int x;
737         
738         memset(&vmx.ureg, 0, sizeof(vmx.ureg));
739         vmx.launched = 0;
740         vmx.onentry = 0;
741         
742         if(rdmsr(VMX_BASIC_MSR, &msr) < 0) error("rdmsr(VMX_BASIC_MSR) failed");
743         if((msr & 1ULL<<55) != 0){
744                 if(rdmsr(VMX_TRUE_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_TRUE_PROCB_CTLS_MSR) failed");
745                 if(rdmsr(VMX_TRUE_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_TRUE_PINB_CTLS_MSR) failed");
746         }else{
747                 if(rdmsr(VMX_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR) failed");
748                 if(rdmsr(VMX_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_PINB_CTLS_MSR) failed");
749         }
750
751         if(rdmsr(VMX_PINB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PINB_CTLS_MSR failed");
752         x = (u32int)pinb_ctls | 1<<1 | 1<<2 | 1<<4; /* currently reserved default1 bits */
753         x |= PINB_EXITIRQ | PINB_EXITNMI;
754         x &= pinb_ctls >> 32;
755         vmcswrite(PINB_CTLS, x);
756         
757         if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR failed");
758         x = (u32int)procb_ctls | 1<<1 | 7<<4 | 1<<8 | 1<<13 | 1<<14 | 1<<26; /* currently reserved default1 bits */
759         x |= PROCB_EXITHLT | PROCB_EXITMWAIT;
760         x |= PROCB_EXITMOVDR | PROCB_EXITIO | PROCB_EXITMONITOR;
761         x |= PROCB_USECTLS2;
762         x &= msr >> 32;
763         vmcswrite(PROCB_CTLS, x);
764         
765         if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS2_MSR failed");
766         x = PROCB_EPT | PROCB_VPID | PROCB_UNRESTR;
767         x &= msr >> 32;
768         vmcswrite(PROCB_CTLS2, x);
769         
770         if(rdmsr(VMX_VMEXIT_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMEXIT_CTLS_MSR failed");
771         x = (u32int)msr;
772         if(sizeof(uintptr) == 8) x |= VMEXIT_HOST64;
773         x |= VMEXIT_LD_IA32_PAT | VMEXIT_LD_IA32_EFER | VMEXIT_ST_DEBUG;
774         x &= msr >> 32;
775         vmcswrite(VMEXIT_CTLS, x);
776         
777         if(rdmsr(VMX_VMENTRY_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMENTRY_CTLS_MSR failed");
778         x = (u32int)msr;
779         if(sizeof(uintptr) == 8) x |= VMENTRY_GUEST64;
780         x |= VMENTRY_LD_IA32_PAT | VMENTRY_LD_IA32_EFER | VMENTRY_LD_DEBUG;
781         x &= msr >> 32;
782         vmcswrite(VMENTRY_CTLS, x);
783         
784         vmcswrite(CR3_TARGCNT, 0);
785         vmcswrite(VMEXIT_MSRLDCNT, 0);
786         vmcswrite(VMEXIT_MSRSTCNT, 0);
787         vmcswrite(VMENTRY_MSRLDCNT, 0);
788         vmcswrite(VMENTRY_INTRINFO, 0);
789         vmcswrite(VMCS_LINK, -1);
790         
791         vmcswrite(HOST_CS, KESEL);
792         vmcswrite(HOST_DS, KDSEL);
793         vmcswrite(HOST_ES, KDSEL);
794         vmcswrite(HOST_FS, KDSEL);
795         vmcswrite(HOST_GS, KDSEL);
796         vmcswrite(HOST_SS, KDSEL);
797         vmcswrite(HOST_TR, TSSSEL);
798         vmcswrite(HOST_CR0, getcr0() & ~0xe);
799         vmcswrite(HOST_CR3, getcr3());
800         vmcswrite(HOST_CR4, getcr4());
801         rdmsr(0xc0000100, &msr);
802         vmcswrite(HOST_FSBASE, msr);
803         rdmsr(0xc0000101, &msr);
804         vmcswrite(HOST_GSBASE, msr);
805         vmcswrite(HOST_TRBASE, (uintptr) m->tss);
806         vmcswrite(HOST_GDTR, (uintptr) m->gdt);
807         vmcswrite(HOST_IDTR, IDTADDR);
808         if(rdmsr(0x277, &msr) < 0) error("rdmsr(IA32_PAT) failed");
809         vmcswrite(HOST_IA32_PAT, msr);
810         if(rdmsr(0xc0000080, &msr) < 0) error("rdmsr(IA32_EFER) failed");
811         vmcswrite(HOST_IA32_EFER, msr);
812         
813         vmcswrite(EXC_BITMAP, 1<<18|1<<1);
814         vmcswrite(PFAULT_MASK, 0);
815         vmcswrite(PFAULT_MATCH, 0);
816         
817         vmcswrite(GUEST_CSBASE, 0);
818         vmcswrite(GUEST_DSBASE, 0);
819         vmcswrite(GUEST_ESBASE, 0);
820         vmcswrite(GUEST_FSBASE, 0);
821         vmcswrite(GUEST_GSBASE, 0);
822         vmcswrite(GUEST_SSBASE, 0);
823         vmcswrite(GUEST_CSLIMIT, -1);
824         vmcswrite(GUEST_DSLIMIT, -1);
825         vmcswrite(GUEST_ESLIMIT, -1);
826         vmcswrite(GUEST_FSLIMIT, -1);
827         vmcswrite(GUEST_GSLIMIT, -1);
828         vmcswrite(GUEST_SSLIMIT, -1);
829         vmcswrite(GUEST_CSPERM, (SEGG|SEGD|SEGP|SEGPL(0)|SEGEXEC|SEGR) >> 8 | 1);
830         vmcswrite(GUEST_DSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
831         vmcswrite(GUEST_ESPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
832         vmcswrite(GUEST_FSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
833         vmcswrite(GUEST_GSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
834         vmcswrite(GUEST_SSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
835         vmcswrite(GUEST_LDTRPERM, 1<<16);
836
837         vmcswrite(GUEST_CR0MASK, CR0KERNEL);
838         vmcswrite(GUEST_CR4MASK, CR4KERNEL);
839         vmcswrite(GUEST_CR0, getcr0() & ~(1<<31));
840         vmcswrite(GUEST_CR3, 0);
841         vmcswrite(GUEST_CR4, getcr4());
842         vmcswrite(GUEST_CR0SHADOW, getcr0());
843         vmcswrite(GUEST_CR4SHADOW, getcr4() & ~CR4VMXE);
844         
845         vmcswrite(GUEST_IA32_PAT, 0x0007040600070406ULL);
846         vmcswrite(GUEST_IA32_EFER, 0);
847         
848         vmcswrite(GUEST_TRBASE, (uintptr) m->tss);
849         vmcswrite(GUEST_TRLIMIT, 0xffff);
850         vmcswrite(GUEST_TRPERM, (SEGTSS|SEGPL(0)|SEGP) >> 8 | 2);
851         
852         vmx.pml4 = mallocalign(BY2PG, BY2PG, 0, 0);
853         memset(vmx.pml4, 0, BY2PG);
854         vmcswrite(VM_EPTP, PADDR(vmx.pml4) | 3<<3);
855         vmx.vpid = 1;
856         vmcswrite(VM_VPID, vmx.vpid);
857         
858         vmcswrite(GUEST_RFLAGS, 2);
859         
860         vmx.onentry = FLUSHVPID | FLUSHEPT;
861         
862         vmx.fp = mallocalign(512, 512, 0, 0);
863         if(vmx.fp == nil)
864                 error(Enomem);
865         fpinit();
866         fpsave(vmx.fp);
867 }
868
869 static void
870 vmxstart(void)
871 {
872         static uchar *vmcs; /* also vmxon region */
873         vlong x;
874
875         putcr4(getcr4() | 0x2000);
876
877         if(vmcs == nil){
878                 vmcs = mallocalign(8192, 4096, 0, 0);
879                 if(vmcs == nil)
880                         error(Enomem);
881         }
882         memset(vmcs, 0, 8192);
883         rdmsr(VMX_BASIC_MSR, &x);
884         *(ulong*)vmcs = x;
885         *(ulong*)&vmcs[4096] = x;
886         if(vmxon(PADDR(vmcs + 4096)) < 0)
887                 error("vmxon failed");
888         if(vmclear(PADDR(vmcs)) < 0)
889                 error("vmclear failed");
890         if(vmptrld(PADDR(vmcs)) < 0)
891                 error("vmptrld failed");
892         vmcsinit();
893 }
894
895 static void
896 cmdrelease(VmCmd *p, int f)
897 {
898         lock(p);
899         p->flags |= CMDFDONE | f;
900         wakeup(p);
901         unlock(p);
902 }
903
904 static void
905 killcmds(VmCmd *notme)
906 {
907         VmCmd *p, *pn;
908         
909         for(p = vmx.postponed; p != nil; p = pn){
910                 pn = p->next;
911                 p->next = nil;
912                 if(p == notme) continue;
913                 kstrcpy(p->errstr, Equit, ERRMAX);
914                 cmdrelease(p, CMDFFAIL);
915         }
916         vmx.postponed = nil;
917         ilock(&vmx.cmdlock);
918         for(p = vmx.firstcmd; p != nil; p = pn){
919                 pn = p->next;
920                 p->next = nil;
921                 if(p == notme) continue;
922                 kstrcpy(p->errstr, Equit, ERRMAX);
923                 cmdrelease(p, CMDFFAIL);
924         }
925         vmx.firstcmd = nil;
926         vmx.lastcmd = &vmx.firstcmd;
927         iunlock(&vmx.cmdlock);
928 }
929
930 static int
931 cmdquit(VmCmd *p, va_list va)
932 {
933         vmx.state = VMXENDING;
934         cmdclearmeminfo(p, va);
935         killcmds(p);
936
937         free(vmx.pml4);
938         vmx.pml4 = nil;
939         vmx.got = 0;
940         vmx.onentry = 0;
941         vmx.stepmap = nil;
942
943         vmxoff();
944         vmx.state = VMXINACTIVE;
945         cmdrelease(p, 0);
946         pexit(Equit, 1);
947         return 0;
948 }
949
950 static void
951 processexit(void)
952 {
953         u32int reason;
954         
955         reason = vmcsread(VM_EXREASON);
956         if((reason & 1<<31) == 0)
957                 switch(reason & 0xffff){
958                 case 1: /* external interrupt */
959                 case 3: /* INIT */
960                 case 4: /* SIPI */
961                 case 5: /* IO SMI */
962                 case 6: /* SMI */
963                 case 7: /* IRQ window */
964                 case 8: /* NMI window */
965                         return;
966                 case 37:
967                         if((vmx.onentry & STEP) != 0){
968                                 vmx.state = VMXREADY;
969                                 vmx.got |= GOTSTEP;
970                                 vmx.onentry &= ~STEP;
971                                 return;
972                         }
973                         break;
974                 }
975         if((vmx.onentry & STEP) != 0){
976                 iprint("VMX: exit reason %#x when expected step...\n", reason & 0xffff);
977                 vmx.onentry &= ~STEP;
978                 vmx.got |= GOTSTEP|GOTSTEPERR;
979         }
980         vmx.state = VMXREADY;
981         vmx.got |= GOTEXIT;
982 }
983
984 static int
985 cmdgetregs(VmCmd *, va_list va)
986 {
987         char *p0, *e;
988         GuestReg *r;
989         uvlong val;
990         int s;
991         char *p;
992         
993         p0 = va_arg(va, char *);
994         e = va_arg(va, char *);
995         p = p0;
996         for(r = guestregs; r < guestregs + nelem(guestregs); r++)
997                 if(r->read != nil){
998                         p = seprint(p, e, "%s ", r->name);
999                         p = r->read(p, e);
1000                         p = strecpy(p, e, "\n");
1001                 }else{
1002                         if(r->offset >= 0)
1003                                 val = vmcsread(r->offset);
1004                         else
1005                                 val = *(uintptr*)((uchar*)&vmx + ~r->offset);
1006                         s = r->size;
1007                         if(s == 0) s = sizeof(uintptr);
1008                         p = seprint(p, e, "%s %#.*llux\n", r->name, s * 2, val);
1009                 }
1010         return p - p0;
1011 }
1012
1013 static int
1014 setregs(char *p0, char rs, char *fs)
1015 {
1016         char *p, *q, *rp;
1017         char *f[10];
1018         GuestReg *r;
1019         uvlong val;
1020         int sz;
1021         int rc;
1022
1023         p = p0;
1024         for(;;){
1025                 q = strchr(p, rs);
1026                 if(q == 0) break;
1027                 *q = 0;
1028                 rc = getfields(p, f, nelem(f), 1, fs);
1029                 p = q + 1;
1030                 if(rc == 0) continue;
1031                 if(rc != 2) error("number of fields wrong");
1032                 
1033                 for(r = guestregs; r < guestregs + nelem(guestregs); r++)
1034                         if(strcmp(r->name, f[0]) == 0)
1035                                 break;
1036                 if(r == guestregs + nelem(guestregs))
1037                         error("unknown register");
1038                 if(r->write != nil){
1039                         r->write(f[1]);
1040                         continue;
1041                 }
1042                 val = strtoull(f[1], &rp, 0);
1043                 sz = r->size;
1044                 if(sz == 0) sz = sizeof(uintptr);
1045                 if(rp == f[1] || *rp != 0 || val >> 8 * sz != 0) error("invalid value");
1046                 if(r->offset >= 0)
1047                         vmcswrite(r->offset, val);
1048                 else{
1049                         assert((u32int)~r->offset + sz <= sizeof(Vmx)); 
1050                         switch(sz){
1051                         case 1: *(u8int*)((u8int*)&vmx + (u32int)~r->offset) = val; break;
1052                         case 2: *(u16int*)((u8int*)&vmx + (u32int)~r->offset) = val; break;
1053                         case 4: *(u32int*)((u8int*)&vmx + (u32int)~r->offset) = val; break;
1054                         case 8: *(u64int*)((u8int*)&vmx + (u32int)~r->offset) = val; break;
1055                         default: error(Egreg);
1056                         }
1057                 }
1058         }
1059         return p - p0;
1060 }
1061
1062 static int
1063 cmdsetregs(VmCmd *, va_list va)
1064 {
1065         return setregs(va_arg(va, char *), '\n', " \t");
1066 }
1067
1068 static int
1069 cmdgetfpregs(VmCmd *, va_list va)
1070 {
1071         uchar *p;
1072         
1073         p = va_arg(va, uchar *);
1074         memmove(p, vmx.fp, sizeof(FPsave));
1075         return sizeof(FPsave);
1076 }
1077
1078 static int
1079 cmdsetfpregs(VmCmd *, va_list va)
1080 {
1081         uchar *p;
1082         ulong n;
1083         vlong off;
1084         
1085         p = va_arg(va, uchar *);
1086         n = va_arg(va, ulong);
1087         off = va_arg(va, vlong);
1088         if(off < 0 || off >= sizeof(FPsave)) n = 0;
1089         else if(off + n > sizeof(FPsave)) n = sizeof(FPsave) - n;
1090         memmove((uchar*)vmx.fp + off, p, n);
1091         return n;
1092 }
1093
1094 static int
1095 cmdgo(VmCmd *, va_list va)
1096 {
1097         char *r;
1098
1099         if(vmx.state != VMXREADY)
1100                 error("VM not ready");
1101         r = va_arg(va, char *);
1102         if(r != nil) setregs(r, ';', "=");
1103         vmx.state = VMXRUNNING;
1104         return 0;
1105 }
1106
1107 static int
1108 cmdstop(VmCmd *, va_list)
1109 {
1110         if(vmx.state != VMXREADY && vmx.state != VMXRUNNING)
1111                 error("VM not ready or running");
1112         vmx.state = VMXREADY;
1113         return 0;
1114 }
1115
1116 static int
1117 cmdstatus(VmCmd *, va_list va)
1118 {       
1119         kstrcpy(va_arg(va, char *), vmx.errstr, ERRMAX);
1120         return vmx.state;
1121 }
1122
/*
 * Names printed by the wait file for each basic exit reason, indexed by
 * reason number.  Numbers without an entry stay nil and are printed
 * numerically by cmdwait.
 */
static char *exitreasons[] = {
	[0] "exc",		[1] "extirq",		[2] "triplef",		[3] "initsig",
	[4] "sipi",		[5] "smiio",		[6] "smiother",		[7] "irqwin",
	[8] "nmiwin",		[9] "taskswitch",	[10] ".cpuid",		[11] ".getsec",
	[12] ".hlt",		[13] ".invd",		[14] ".invlpg",		[15] ".rdpmc",
	[16] ".rdtsc",		[17] ".rsm",		[18] ".vmcall",		[19] ".vmclear",
	[20] ".vmlaunch",	[21] ".vmptrld",	[22] ".vmptrst",	[23] ".vmread",
	[24] ".vmresume",	[25] ".vmwrite",	[26] ".vmxoff",		[27] ".vmxon",
	[28] "movcr",		[29] ".movdr",		[30] "io",		[31] ".rdmsr",
	[32] ".wrmsr",		[33] "entrystate",	[34] "entrymsr",
	[36] ".mwait",		[37] "monitortrap",	[39] ".monitor",
	[40] ".pause",		[41] "mcheck",		[43] "tpr",
	[44] "apicacc",		[45] "eoi",		[46] "gdtr_idtr",	[47] "ldtr_tr",
	[48] "eptfault",	[49] "eptinval",	[50] ".invept",		[51] ".rdtscp",
	[52] "preempt",		[53] ".invvpid",	[54] ".wbinvd",		[55] ".xsetbv",
	[56] "apicwrite",	[57] ".rdrand",		[58] ".invpcid",	[59] ".vmfunc",
	[60] ".encls",		[61] ".rdseed",		[62] "pmlfull",		[63] ".xsaves",
	[64] ".xrstors",
};
1134
/*
 * Exception mnemonics indexed by vector number; used both when printing
 * exits (cmdwait) and when parsing event names (eventparse).  Vectors
 * without an entry stay nil.
 */
static char *except[] = {
	[0] "#de",
	[1] "#db",
	[3] "#bp",
	[4] "#of",
	[5] "#br",
	[6] "#ud",
	[7] "#nm",
	[8] "#df",
	[10] "#ts",
	[11] "#np",
	[12] "#ss",
	[13] "#gp",
	[14] "#pf",
	[16] "#mf",
	[17] "#ac",
	[18] "#mc",
	[19] "#xm",
	[20] "#ve",
};
1140
1141 static int
1142 cmdwait(VmCmd *cp, va_list va)
1143 {
1144         char *p, *p0, *e;
1145         u32int reason, intr;
1146         uvlong qual;
1147         u16int rno;
1148
1149         if(cp->scratched)
1150                 error(Eintr);
1151         p0 = p = va_arg(va, char *);
1152         e = va_arg(va, char *);
1153         if((vmx.got & GOTIRQACK) != 0){
1154                 p = seprint(p, e, "*ack %d\n", vmx.irqack.info & 0xff);
1155                 vmx.got &= ~GOTIRQACK;
1156                 return p - p0;
1157         }
1158         if((vmx.got & GOTEXIT) == 0){
1159                 cp->flags |= CMDFPOSTP;
1160                 return -1;
1161         }
1162         vmx.got &= ~GOTEXIT;
1163         reason = vmcsread(VM_EXREASON);
1164         qual = vmcsread(VM_EXQUALIF);
1165         rno = reason;
1166         intr = vmcsread(VM_EXINTRINFO);
1167         if((reason & 1<<31) != 0)
1168                 p = seprint(p, e, "!");
1169         if(rno == 0 && (intr & 1<<31) != 0){
1170                 if((intr & 0xff) >= nelem(except) || except[intr & 0xff] == nil)
1171                         p = seprint(p, e, "#%d ", intr & 0xff);
1172                 else
1173                         p = seprint(p, e, "%s ", except[intr & 0xff]);
1174         }else if(rno >= nelem(exitreasons) || exitreasons[rno] == nil)
1175                 p = seprint(p, e, "?%d ", rno);
1176         else
1177                 p = seprint(p, e, "%s ", exitreasons[rno]);
1178         p = seprint(p, e, "%#ullx pc %#ullx sp %#ullx ilen %#ullx iinfo %#ullx", qual, vmcsread(GUEST_RIP), vmcsread(GUEST_RSP), vmcsread(VM_EXINSTRLEN), vmcsread(VM_EXINSTRINFO));
1179         if((intr & 1<<11) != 0) p = seprint(p, e, " excode %#ullx", vmcsread(VM_EXINTRCODE));
1180         if(rno == 48 && (qual & 0x80) != 0) p = seprint(p, e, " va %#ullx", vmcsread(VM_GUESTVA));
1181         if(rno == 48 || rno == 49) p = seprint(p, e, " pa %#ullx", vmcsread(VM_GUESTPA));
1182         if(rno == 30) p = seprint(p, e, " ax %#ullx", (uvlong)vmx.ureg.ax);
1183         p = seprint(p, e, "\n");
1184         return p - p0;
1185 }
1186
1187 static int
1188 cmdstep(VmCmd *cp, va_list va)
1189 {
1190         switch(cp->retval){
1191         case 0:
1192                 if((vmx.got & GOTSTEP) != 0 || (vmx.onentry & STEP) != 0)
1193                         error(Einuse);
1194                 if(vmx.state != VMXREADY){
1195                         iprint("pre-step in state %s\n", statenames[vmx.state]);
1196                         error("not ready");
1197                 }
1198                 vmx.stepmap = va_arg(va, VmMem *);
1199                 vmx.onentry |= STEP;
1200                 vmx.state = VMXRUNNING;
1201                 cp->flags |= CMDFPOSTP;
1202                 return 1;
1203         case 1:
1204                 if(vmx.state != VMXREADY){
1205                         iprint("post-step in state %s\n", statenames[vmx.state]);
1206                         vmx.onentry &= ~STEP;
1207                         vmx.got &= ~(GOTSTEP|GOTSTEPERR);
1208                         error("not ready");
1209                 }
1210                 if((vmx.got & GOTSTEP) == 0){
1211                         cp->flags |= CMDFPOSTP;
1212                         return 1;
1213                 }
1214                 if((vmx.got & GOTSTEPERR) != 0){
1215                         vmx.got &= ~(GOTSTEP|GOTSTEPERR);
1216                         error("step failed");
1217                 }
1218                 vmx.got &= ~(GOTSTEP|GOTSTEPERR);
1219                 return 1;
1220         }
1221         return 0;
1222 }
1223
1224 static void
1225 eventparse(char *p, VmIntr *vi)
1226 {
1227         char *q, *r;
1228         int i;
1229         
1230         memset(vi, 0, sizeof(VmIntr));
1231         q = nil;
1232         kstrdup(&q, p);
1233         if(waserror()){
1234                 free(q);
1235                 memset(vi, 0, sizeof(VmIntr));
1236                 nexterror();
1237         }
1238         vi->info = 1<<31;
1239         r = strchr(q, ',');
1240         if(r != nil) *r++ = 0;
1241         for(i = 0; i < nelem(except); i++)
1242                 if(except[i] != nil && strcmp(except[i], q) == 0)
1243                         break;
1244         if(*q == '#'){
1245                 q++;
1246                 vi->info |= 3 << 8;
1247         }
1248         if(i == nelem(except)){
1249                 i = strtoul(q, &q, 10);
1250                 if(*q != 0 || i > 255) error(Ebadctl);
1251         }
1252         vi->info |= i;
1253         if((vi->info & 0x7ff) == 3 || (vi->info & 0x7ff) == 4)
1254                 vi->info += 3 << 8;
1255         if(r == nil) goto out;
1256         if(*r != ','){
1257                 vi->code = strtoul(r, &r, 0);
1258                 vi->info |= 1<<11;
1259         }else r++;
1260         if(*r == ',')
1261                 vi->ilen = strtoul(r + 1, &r, 0);
1262         if(*r != 0) error(Ebadctl);
1263 out:
1264         poperror();
1265         free(q);
1266 }
1267
1268 static int
1269 cmdexcept(VmCmd *cp, va_list va)
1270 {
1271         if(cp->scratched) error(Eintr);
1272         if((vmx.onentry & POSTEX) != 0){
1273                 cp->flags |= CMDFPOSTP;
1274                 return 0;
1275         }
1276         eventparse(va_arg(va, char *), &vmx.exc);
1277         vmx.onentry |= POSTEX;
1278         return 0;
1279 }
1280
1281 static int
1282 cmdirq(VmCmd *, va_list va)
1283 {
1284         char *p;
1285         VmIntr vi;
1286         
1287         p = va_arg(va, char *);
1288         if(p == nil)
1289                 vmx.onentry &= ~POSTIRQ;
1290         else{
1291                 eventparse(p, &vi);
1292                 vmx.irq = vi;
1293                 vmx.onentry |= POSTIRQ;
1294         }
1295         return 0;
1296 }
1297
1298
1299 static int
1300 gotcmd(void *)
1301 {
1302         int rc;
1303
1304         ilock(&vmx.cmdlock);
1305         rc = vmx.firstcmd != nil;
1306         iunlock(&vmx.cmdlock);
1307         return rc;
1308 }
1309
1310 static void
1311 markcmddone(VmCmd *p, VmCmd ***pp)
1312 {
1313         if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP){
1314                 **pp = p;
1315                 *pp = &p->next;
1316         }else{
1317                 p->flags = p->flags & ~CMDFPOSTP;
1318                 cmdrelease(p, 0);
1319         }
1320 }
1321
1322 static VmCmd **
1323 markppcmddone(VmCmd **pp)
1324 {
1325         VmCmd *p;
1326         
1327         p = *pp;
1328         if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP)
1329                 return &p->next;
1330         *pp = p->next;
1331         p->next = nil;
1332         p->flags = p->flags & ~CMDFPOSTP;
1333         cmdrelease(p, 0);
1334         return pp;
1335 }
1336
1337
1338 static void
1339 runcmd(void)
1340 {
1341         VmCmd *p, **pp;
1342         
1343         for(pp = &vmx.postponed; p = *pp, p != nil; ){
1344                 if(waserror()){
1345                         kstrcpy(p->errstr, up->errstr, ERRMAX);
1346                         p->flags |= CMDFFAIL;
1347                         pp = markppcmddone(pp);
1348                         continue;
1349                 }
1350                 p->flags &= ~CMDFPOSTP;
1351                 p->retval = p->cmd(p, p->va);
1352                 poperror();
1353                 pp = markppcmddone(pp);
1354         }
1355         for(;;){
1356                 ilock(&vmx.cmdlock);
1357                 p = vmx.firstcmd;
1358                 if(p == nil){
1359                         iunlock(&vmx.cmdlock);
1360                         break;
1361                 }
1362                 vmx.firstcmd = p->next;
1363                 if(vmx.lastcmd == &p->next)
1364                         vmx.lastcmd = &vmx.firstcmd;
1365                 iunlock(&vmx.cmdlock);
1366                 p->next = nil;
1367                 if(waserror()){
1368                         kstrcpy(p->errstr, up->errstr, ERRMAX);
1369                         p->flags |= CMDFFAIL;
1370                         markcmddone(p, &pp);
1371                         continue;
1372                 }
1373                 if(p->scratched) error(Eintr);
1374                 p->retval = p->cmd(p, p->va);
1375                 poperror();
1376                 markcmddone(p, &pp);
1377         }
1378 }
1379
1380 static void
1381 dostep(int setup)
1382 {
1383         static uvlong oldmap;
1384         static uvlong *mapptr;
1385
1386         if(setup){
1387                 if(vmx.stepmap != nil){
1388                         mapptr = eptwalk(vmx.stepmap->lo);
1389                         oldmap = *mapptr;
1390                         epttranslate(vmx.stepmap);
1391                 }
1392         }else{
1393                 vmcswrite(PROCB_CTLS, vmcsread(PROCB_CTLS) & ~(uvlong)PROCB_MONTRAP);
1394                 if(vmx.stepmap != nil){
1395                         *mapptr = oldmap;
1396                         vmx.stepmap = nil;
1397                         vmx.onentry |= FLUSHEPT;
1398                 }
1399         }
1400 }
1401
1402 static void
1403 vmxproc(void *)
1404 {
1405         int init, rc, x;
1406         u32int procbctls, defprocbctls;
1407
1408         procwired(up, 0);
1409         sched();
1410         init = 0;
1411         defprocbctls = 0;
1412         while(waserror()){
1413                 kstrcpy(vmx.errstr, up->errstr, ERRMAX);
1414                 vmx.state = VMXDEAD;
1415         }
1416         for(;;){
1417                 if(!init){
1418                         init = 1;
1419                         vmxstart();
1420                         vmx.state = VMXREADY;
1421                         defprocbctls = vmcsread(PROCB_CTLS);
1422                 }
1423                 runcmd();
1424                 if(vmx.state == VMXRUNNING){
1425                         procbctls = defprocbctls;
1426                         if((vmx.onentry & STEP) != 0){
1427                                 procbctls |= PROCB_MONTRAP;
1428                                 dostep(1);
1429                                 if(waserror()){
1430                                         dostep(0);
1431                                         nexterror();
1432                                 }
1433                         }
1434                         if((vmx.onentry & POSTEX) != 0){
1435                                 vmcswrite(VMENTRY_INTRINFO, vmx.exc.info);
1436                                 vmcswrite(VMENTRY_INTRCODE, vmx.exc.code);
1437                                 vmcswrite(VMENTRY_INTRILEN, vmx.exc.ilen);
1438                                 vmx.onentry &= ~POSTEX;
1439                         }
1440                         if((vmx.onentry & POSTIRQ) != 0 && (vmx.onentry & STEP) == 0){
1441                                 if((vmx.onentry & POSTEX) == 0 && (vmcsread(GUEST_RFLAGS) & 1<<9) != 0 && (vmcsread(GUEST_CANINTR) & 3) == 0){
1442                                         vmcswrite(VMENTRY_INTRINFO, vmx.irq.info);
1443                                         vmcswrite(VMENTRY_INTRCODE, vmx.irq.code);
1444                                         vmcswrite(VMENTRY_INTRILEN, vmx.irq.ilen);
1445                                         vmx.onentry &= ~POSTIRQ;
1446                                         vmx.got |= GOTIRQACK;
1447                                         vmx.irqack = vmx.irq;
1448                                 }else
1449                                         procbctls |= PROCB_IRQWIN;
1450                         }
1451                         if((vmx.onentry & FLUSHVPID) != 0){
1452                                 if(invvpid(INVLOCAL, vmx.vpid, 0) < 0)
1453                                         error("invvpid failed");
1454                                 vmx.onentry &= ~FLUSHVPID;
1455                         }
1456                         if((vmx.onentry & FLUSHEPT) != 0){
1457                                 if(invept(INVLOCAL, PADDR(vmx.pml4) | 3<<3, 0) < 0)
1458                                         error("invept failed");
1459                                 vmx.onentry &= ~FLUSHEPT;
1460                         }
1461                         vmcswrite(PROCB_CTLS, procbctls);
1462                         vmx.got &= ~GOTEXIT;
1463                         
1464                         x = splhi();
1465                         if((vmx.dr[7] & ~0xd400) != 0)
1466                                 putdr01236(vmx.dr);
1467                         fpsserestore0(vmx.fp);
1468                         rc = vmlaunch(&vmx.ureg, vmx.launched);
1469                         fpssesave0(vmx.fp);
1470                         splx(x);
1471                         if(rc < 0)
1472                                 error("vmlaunch failed");
1473                         vmx.launched = 1;
1474                         if((vmx.onentry & STEP) != 0){
1475                                 dostep(0);
1476                                 poperror();
1477                         }
1478                         processexit();
1479                 }else{
1480                         up->psstate = "Idle";
1481                         sleep(&vmx.cmdwait, gotcmd, nil);
1482                         up->psstate = nil;
1483                 }
1484         }
1485 }
1486
/* qid paths of the files served by this device, in vmxdir order */
enum {
	Qdir = 0,
	Qctl = 1,
	Qregs = 2,
	Qstatus = 3,
	Qmap = 4,
	Qwait = 5,
	Qfpregs = 6,
};
1496
1497 static Dirtab vmxdir[] = {
1498         ".",            { Qdir, 0, QTDIR },     0,              0550,
1499         "ctl",          { Qctl, 0, 0 },         0,              0660,
1500         "regs",         { Qregs, 0, 0 },        0,              0660,
1501         "status",       { Qstatus, 0, 0 },      0,              0440,
1502         "map",          { Qmap, 0, 0 },         0,              0660,
1503         "wait",         { Qwait, 0, 0 },        0,              0440,
1504         "fpregs",       { Qfpregs, 0, 0 },      0,              0660,
1505 };
1506
/* indices of the ctl file messages, matching vmxctlmsg */
enum {
	CMinit = 0,
	CMquit = 1,
	CMgo = 2,
	CMstop = 3,
	CMstep = 4,
	CMexc = 5,
	CMirq = 6,
};
1516
1517 static Cmdtab vmxctlmsg[] = {
1518         CMinit,         "init",         1,
1519         CMquit,         "quit",         1,
1520         CMgo,           "go",           0,
1521         CMstop,         "stop",         1,
1522         CMstep,         "step",         0,
1523         CMexc,          "exc",          2,
1524         CMirq,          "irq",          0,
1525 };
1526
1527 static int
1528 iscmddone(void *cp)
1529 {
1530         return (((VmCmd*)cp)->flags & CMDFDONE) != 0;
1531 }
1532
1533 static int
1534 vmxcmd(int (*f)(VmCmd *, va_list), ...)
1535 {
1536         VmCmd cmd;
1537         
1538         if(vmx.state == VMXINACTIVE)
1539                 error("no VM");
1540         if(vmx.state == VMXENDING)
1541         ending:
1542                 error(Equit);
1543         memset(&cmd, 0, sizeof(VmCmd));
1544         cmd.errstr = up->errstr;
1545         cmd.cmd = f;
1546         va_start(cmd.va, f);
1547          
1548         ilock(&vmx.cmdlock);
1549         if(vmx.state == VMXENDING){
1550                 iunlock(&vmx.cmdlock);
1551                 goto ending;
1552         }
1553         *vmx.lastcmd = &cmd;
1554         vmx.lastcmd = &cmd.next;
1555         iunlock(&vmx.cmdlock);
1556         
1557         while(waserror())
1558                 cmd.scratched = 1;
1559         wakeup(&vmx.cmdwait);
1560         do
1561                 sleep(&cmd, iscmddone, &cmd);
1562         while(!iscmddone(&cmd));
1563         poperror();
1564         lock(&cmd);
1565         unlock(&cmd);
1566         if((cmd.flags & CMDFFAIL) != 0)
1567                 error(up->errstr);
1568         return cmd.retval;
1569 }
1570
1571 static Chan *
1572 vmxattach(char *spec)
1573 {
1574         if(vmx.state == NOVMX) error(Enodev);
1575         return devattach('X', spec);
1576 }
1577
1578 static Walkqid*
1579 vmxwalk(Chan *c, Chan *nc, char **name, int nname)
1580 {
1581         return devwalk(c, nc, name, nname, vmxdir, nelem(vmxdir), devgen);
1582 }
1583
1584 static int
1585 vmxstat(Chan *c, uchar *dp, int n)
1586 {
1587         return devstat(c, dp, n, vmxdir, nelem(vmxdir), devgen);
1588 }
1589
1590 static Chan*
1591 vmxopen(Chan* c, int omode)
1592 {
1593         Chan *ch;
1594
1595         if(c->qid.path != Qdir && !iseve()) error(Eperm);
1596         ch = devopen(c, omode, vmxdir, nelem(vmxdir), devgen);
1597         if(ch->qid.path == Qmap){
1598                 if((omode & OTRUNC) != 0)
1599                         vmxcmd(cmdclearmeminfo);
1600         }
1601         return ch;
1602 }
1603
1604 static void
1605 vmxclose(Chan*)
1606 {
1607 }
1608
1609 static long
1610 vmxread(Chan* c, void* a, long n, vlong off)
1611 {
1612         static char regbuf[4096];
1613         static char membuf[4096];
1614         int rc;
1615
1616         switch((ulong)c->qid.path){
1617         case Qdir:
1618                 return devdirread(c, a, n, vmxdir, nelem(vmxdir), devgen);
1619         case Qregs:
1620                 if(off == 0)
1621                         vmxcmd(cmdgetregs, regbuf, regbuf + sizeof(regbuf));
1622                 return readstr(off, a, n, regbuf);
1623         case Qmap:
1624                 if(off == 0)
1625                         vmxcmd(cmdgetmeminfo, membuf, membuf + sizeof(membuf));
1626                 return readstr(off, a, n, membuf);
1627         case Qstatus:
1628                 {
1629                         char buf[ERRMAX+128];
1630                         char errbuf[ERRMAX];
1631                         int status;
1632                         
1633                         status = vmx.state;
1634                         if(status == VMXDEAD){
1635                                 vmxcmd(cmdstatus, errbuf);
1636                                 snprint(buf, sizeof(buf), "%s %#q\n", statenames[status], errbuf);
1637                         }else if(status >= 0 && status < nelem(statenames))
1638                                 snprint(buf, sizeof(buf), "%s\n", statenames[status]);
1639                         else
1640                                 snprint(buf, sizeof(buf), "%d\n", status);
1641                         return readstr(off, a, n, buf);
1642                 }
1643         case Qwait:
1644                 {
1645                         char buf[512];
1646                         
1647                         rc = vmxcmd(cmdwait, buf, buf + sizeof(buf));
1648                         if(rc > n) rc = n;
1649                         if(rc > 0) memmove(a, buf, rc);
1650                         return rc;
1651                 }
1652         case Qfpregs:
1653                 {
1654                         char buf[sizeof(FPsave)];
1655                         
1656                         vmxcmd(cmdgetfpregs, buf);
1657                         if(n < 0 || off < 0 || off >= sizeof(buf)) n = 0;
1658                         else if(off + n > sizeof(buf)) n = sizeof(buf) - off;
1659                         if(n != 0) memmove(a, buf + off, n);
1660                         return n;
1661                 }
1662         default:
1663                 error(Egreg);
1664                 break;
1665         }
1666         return 0;
1667 }
1668
1669 static long
1670 vmxwrite(Chan* c, void* a, long n, vlong off)
1671 {
1672         static QLock initlock;
1673         Cmdbuf *cb;
1674         Cmdtab *ct;
1675         char *s;
1676         int rc;
1677         int i;
1678         VmMem tmpmem;
1679
1680         switch((ulong)c->qid.path){
1681         case Qdir:
1682                 error(Eperm);
1683         case Qctl:
1684                 cb = parsecmd(a, n);
1685                 if(waserror()){
1686                         free(cb);
1687                         nexterror();
1688                 }
1689                 ct = lookupcmd(cb, vmxctlmsg, nelem(vmxctlmsg));
1690                 switch(ct->index){
1691                 case CMinit:
1692                         qlock(&initlock);
1693                         if(waserror()){
1694                                 qunlock(&initlock);
1695                                 nexterror();
1696                         }
1697                         if(vmx.state != VMXINACTIVE)
1698                                 error("vmx already active");
1699                         vmx.state = VMXINIT;
1700                         kproc("kvmx", vmxproc, nil);
1701                         poperror();
1702                         qunlock(&initlock);
1703                         if(vmxcmd(cmdstatus, up->errstr) == VMXDEAD)
1704                                 error(up->errstr);
1705                         break;
1706                 case CMquit:
1707                         vmxcmd(cmdquit);
1708                         break;
1709                 case CMgo:
1710                         s = nil;
1711                         if(cb->nf == 2) kstrdup(&s, cb->f[1]);
1712                         else if(cb->nf != 1) error(Ebadarg);
1713                         if(waserror()){
1714                                 free(s);
1715                                 nexterror();
1716                         }
1717                         vmxcmd(cmdgo, s);
1718                         poperror();
1719                         free(s);
1720                         break;
1721                 case CMstop:
1722                         vmxcmd(cmdstop);
1723                         break;
1724                 case CMstep:
1725                         rc = 0;
1726                         for(i = 1; i < cb->nf; i++)
1727                                 if(strcmp(cb->f[i], "-map") == 0){
1728                                         rc = 1;
1729                                         if(i+4 > cb->nf) error("missing argument");
1730                                         memset(&tmpmem, 0, sizeof(tmpmem));
1731                                         tmpmem.lo = strtoull(cb->f[i+1], &s, 0);
1732                                         if(*s != 0 || !vmokpage(tmpmem.lo)) error("invalid address");
1733                                         tmpmem.hi = tmpmem.lo + BY2PG;
1734                                         tmpmem.attr = 0x407;
1735                                         tmpmem.seg = _globalsegattach(cb->f[i+2]);
1736                                         if(tmpmem.seg == nil) error("unknown segment");
1737                                         tmpmem.off = strtoull(cb->f[i+3], &s, 0);
1738                                         if(*s != 0 || !vmokpage(tmpmem.off)) error("invalid offset");
1739                                         i += 3;
1740                                 }else
1741                                         error(Ebadctl);
1742                         vmxcmd(cmdstep, rc ? &tmpmem : nil);
1743                         break;
1744                 case CMexc:
1745                         s = nil;
1746                         kstrdup(&s, cb->f[1]);
1747                         if(waserror()){
1748                                 free(s);
1749                                 nexterror();
1750                         }
1751                         vmxcmd(cmdexcept, s);
1752                         poperror();
1753                         free(s);
1754                         break;
1755                 case CMirq:
1756                         s = nil;
1757                         if(cb->nf == 2)
1758                                 kstrdup(&s, cb->f[1]);
1759                         if(waserror()){
1760                                 free(s);
1761                                 nexterror();
1762                         }
1763                         vmxcmd(cmdirq, s);
1764                         poperror();
1765                         free(s);
1766                         break;
1767                 default:
1768                         error(Egreg);
1769                 }
1770                 poperror();
1771                 free(cb);
1772                 break;
1773         case Qmap:
1774         case Qregs:
1775                 s = malloc(n+1);
1776                 if(s == nil) error(Enomem);
1777                 if(waserror()){
1778                         free(s);
1779                         nexterror();
1780                 }
1781                 memmove(s, a, n);
1782                 s[n] = 0;
1783                 rc = vmxcmd((ulong)c->qid.path == Qregs ? cmdsetregs : cmdsetmeminfo, s);
1784                 poperror();
1785                 free(s);
1786                 return rc;
1787         case Qfpregs:
1788                 {
1789                         char buf[sizeof(FPsave)];
1790                         
1791                         if(n > sizeof(FPsave)) n = sizeof(FPsave);
1792                         memmove(buf, a, n);
1793                         return vmxcmd(cmdsetfpregs, buf, n, off);
1794                 }
1795         default:
1796                 error(Egreg);
1797                 break;
1798         }
1799         return n;
1800 }
1801
1802 Dev vmxdevtab = {
1803         'X',
1804         "vmx",
1805         
1806         vmxreset,
1807         devinit,
1808         vmxshutdown,
1809         vmxattach,
1810         vmxwalk,
1811         vmxstat,
1812         vmxopen,
1813         devcreate,
1814         vmxclose,
1815         vmxread,
1816         devbread,
1817         vmxwrite,
1818         devbwrite,
1819         devremove,
1820         devwstat,
1821 };