]> git.lizzy.rs Git - plan9front.git/blob - sys/src/9/pc/devvmx.c
devvmx, vmx: lilu dallas multivm
[plan9front.git] / sys / src / 9 / pc / devvmx.c
1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7 #include "ureg.h"
8
/*
 * Wrappers around the VMX instructions, defined outside this file.
 * Callers in this file treat a negative return from vmptrld, vmread
 * and vmwrite as failure.
 */
9 extern int vmxon(u64int);
10 extern int vmxoff(void);
11 extern int vmclear(u64int);
12 extern int vmptrld(u64int);
13 extern int vmlaunch(Ureg *, int);
14 extern int vmread(u32int, uintptr *);
15 extern int vmwrite(u32int, uintptr);
16 extern int invept(u32int, uvlong, uvlong);
17 extern int invvpid(u32int, uvlong, uvlong);
18
/* set to 1 by vmxreset once every processor has its VmxMach allocated */
19 static int gotvmx;
/* raw values of the processor-based / pin-based control MSRs, read by vmcsinit */
20 static vlong procb_ctls, pinb_ctls;
21
/*
 * Numeric constants used throughout this file: MSR numbers handed to
 * rdmsr(), field identifiers handed to vmcsread()/vmcswrite(), and the
 * bit masks that go into some of those fields.  The names follow Intel
 * VMX terminology; the values are not cross-checked against the
 * manual here.
 */
22 enum {
	/* MSR numbers (see the rdmsr calls in vmxreset and vmcsinit) */
23         VMX_BASIC_MSR = 0x480,
24         VMX_PINB_CTLS_MSR = 0x481,
25         VMX_PROCB_CTLS_MSR = 0x482,
26         VMX_VMEXIT_CTLS_MSR = 0x483,
27         VMX_VMENTRY_CTLS_MSR = 0x484,
28         VMX_MISC_MSR = 0x485,
29         VMX_CR0_FIXED0 = 0x486,
30         VMX_CR0_FIXED1 = 0x487,
31         VMX_CR4_FIXED0 = 0x488,
32         VMX_CR4_FIXED1 = 0x489,
33         VMX_VMCS_ENUM = 0x48A,
34         VMX_PROCB_CTLS2_MSR = 0x48B,
35         VMX_TRUE_PINB_CTLS_MSR = 0x48D,
36         VMX_TRUE_PROCB_CTLS_MSR = 0x48E,
37         VMX_TRUE_EXIT_CTLS_MSR = 0x48F,
38         VMX_TRUE_ENTRY_CTLS_MSR = 0x490,
39         VMX_VMFUNC_MSR = 0x491,
40         
	/* PINB_CTLS is a VMCS field; the PINB_* names below it are bits stored in it */
41         PINB_CTLS = 0x4000,
42         PINB_EXITIRQ = 1<<0,
43         PINB_EXITNMI = 1<<3,
44         
	/* PROCB_CTLS is a VMCS field; the PROCB_* names up to PROCB_USECTLS2 are bits stored in it */
45         PROCB_CTLS = 0x4002,
46         PROCB_IRQWIN = 1<<2,
47         PROCB_EXITHLT = 1<<7,
48         PROCB_EXITINVLPG = 1<<9,
49         PROCB_EXITMWAIT = 1<<10,
50         PROCB_EXITRDPMC = 1<<11,
51         PROCB_EXITRDTSC = 1<<12,
52         PROCB_EXITCR3LD = 1<<15,
53         PROCB_EXITCR3ST = 1<<16,
54         PROCB_EXITCR8LD = 1<<19,
55         PROCB_EXITCR8ST = 1<<20,
56         PROCB_EXITMOVDR = 1<<23,
57         PROCB_EXITIO = 1<<24,
58         PROCB_MONTRAP = 1<<27,
59         PROCB_MSRBITMAP = 1<<28,
60         PROCB_EXITMONITOR = 1<<29,
61         PROCB_EXITPAUSE = 1<<30,
62         PROCB_USECTLS2 = 1<<31,
63         
	/* PROCB_CTLS2 is a VMCS field; vmcsinit stores PROCB_EPT, PROCB_VPID and PROCB_UNRESTR in it */
64         PROCB_CTLS2 = 0x401E,
65         PROCB_EPT = 1<<1,
66         PROCB_EXITGDT = 1<<2,
67         PROCB_VPID = 1<<5,
68         PROCB_UNRESTR = 1<<7,
69
	/* further VMCS control fields */
70         EXC_BITMAP = 0x4004,
71         PFAULT_MASK = 0x4006,
72         PFAULT_MATCH = 0x4008,
73         CR3_TARGCNT = 0x400a,
74         MSR_BITMAP = 0x2004,
75         
	/* VMEXIT_CTLS is a VMCS field; the VMEXIT_ST_ and VMEXIT_LD_ names and VMEXIT_HOST64 are bits stored in it */
76         VMEXIT_CTLS = 0x400c,
77         VMEXIT_ST_DEBUG = 1<<2,
78         VMEXIT_HOST64 = 1<<9,
79         VMEXIT_LD_IA32_PERF_GLOBAL_CTRL = 1<<12,
80         VMEXIT_ST_IA32_PAT = 1<<18,
81         VMEXIT_LD_IA32_PAT = 1<<19,
82         VMEXIT_ST_IA32_EFER = 1<<20,
83         VMEXIT_LD_IA32_EFER = 1<<21,    
84         
	/* VMCS fields for the MSR lists; vmxaddmsr writes the three count fields */
85         VMEXIT_MSRSTCNT = 0x400e,
86         VMEXIT_MSRLDCNT = 0x4010,
87         VMEXIT_MSRSTADDR = 0x2006,
88         VMEXIT_MSRLDADDR = 0x2008,
89         VMENTRY_MSRLDADDR = 0x200A,
90         
	/* VMENTRY_CTLS is a VMCS field; the VMENTRY_LD_ names and VMENTRY_GUEST64 are bits stored in it */
91         VMENTRY_CTLS = 0x4012,
92         VMENTRY_LD_DEBUG = 1<<2,
93         VMENTRY_GUEST64 = 1<<9,
94         VMENTRY_LD_IA32_PERF_GLOBAL_CTRL = 1<<13,
95         VMENTRY_LD_IA32_PAT = 1<<14,
96         VMENTRY_LD_IA32_EFER = 1<<15,
97         
98         VMENTRY_MSRLDCNT = 0x4014,
99         VMENTRY_INTRINFO = 0x4016,
100         VMENTRY_INTRCODE = 0x4018,
101         VMENTRY_INTRILEN = 0x401a,
102         
103         VMCS_LINK = 0x2800,
104         
	/* guest-state VMCS fields; most are exposed by name through the guestregs table */
105         GUEST_ES = 0x800,
106         GUEST_CS = 0x802,
107         GUEST_SS = 0x804,
108         GUEST_DS = 0x806,
109         GUEST_FS = 0x808,
110         GUEST_GS = 0x80A,
111         GUEST_LDTR = 0x80C,
112         GUEST_TR = 0x80E,
113         GUEST_CR0 = 0x6800,
114         GUEST_CR3 = 0x6802,
115         GUEST_CR4 = 0x6804,
116         GUEST_ESLIMIT = 0x4800,
117         GUEST_CSLIMIT = 0x4802,
118         GUEST_SSLIMIT = 0x4804,
119         GUEST_DSLIMIT = 0x4806,
120         GUEST_FSLIMIT = 0x4808,
121         GUEST_GSLIMIT = 0x480A,
122         GUEST_LDTRLIMIT = 0x480C,
123         GUEST_TRLIMIT = 0x480E,
124         GUEST_GDTRLIMIT = 0x4810,
125         GUEST_IDTRLIMIT = 0x4812,
126         GUEST_ESPERM = 0x4814,
127         GUEST_CSPERM = 0x4816,
128         GUEST_SSPERM = 0x4818,
129         GUEST_DSPERM = 0x481A,
130         GUEST_FSPERM = 0x481C,
131         GUEST_GSPERM = 0x481E,
132         GUEST_LDTRPERM = 0x4820,
133         GUEST_TRPERM = 0x4822,
134         GUEST_CR0MASK = 0x6000,
135         GUEST_CR4MASK = 0x6002,
136         GUEST_CR0SHADOW = 0x6004,
137         GUEST_CR4SHADOW = 0x6006,
138         GUEST_ESBASE = 0x6806,
139         GUEST_CSBASE = 0x6808,
140         GUEST_SSBASE = 0x680A,
141         GUEST_DSBASE = 0x680C,
142         GUEST_FSBASE = 0x680E,
143         GUEST_GSBASE = 0x6810,
144         GUEST_LDTRBASE = 0x6812,
145         GUEST_TRBASE = 0x6814,
146         GUEST_GDTRBASE = 0x6816,
147         GUEST_IDTRBASE = 0x6818,
148         GUEST_DR7 = 0x681A,
149         GUEST_RSP = 0x681C,
150         GUEST_RIP = 0x681E,
151         GUEST_RFLAGS = 0x6820,
152         GUEST_IA32_DEBUGCTL = 0x2802,
153         GUEST_IA32_PAT = 0x2804,
154         GUEST_IA32_EFER = 0x2806,
155         GUEST_IA32_PERF_GLOBAL_CTRL = 0x2808,
156         
	/* host-state VMCS fields; vmcsinit fills most of them from the running kernel */
157         HOST_ES = 0xC00,
158         HOST_CS = 0xC02,
159         HOST_SS = 0xC04,
160         HOST_DS = 0xC06,
161         HOST_FS = 0xC08,
162         HOST_GS = 0xC0A,
163         HOST_TR = 0xC0C,
164         HOST_CR0 = 0x6C00,
165         HOST_CR3 = 0x6C02,
166         HOST_CR4 = 0x6C04,
167         HOST_FSBASE = 0x6C06,
168         HOST_GSBASE = 0x6C08,
169         HOST_TRBASE = 0x6C0A,
170         HOST_GDTR = 0x6C0C,
171         HOST_IDTR = 0x6C0E,
172         HOST_RSP = 0x6C14,
173         HOST_RIP = 0x6C16,
174         HOST_IA32_PAT = 0x2C00,
175         HOST_IA32_EFER = 0x2C02,
176         HOST_IA32_PERF_GLOBAL_CTRL = 0x2C04,
177         
178         GUEST_CANINTR = 0x4824,
179         
	/* exit-information VMCS fields; guestregs exposes several of them with the readonly write handler */
180         VM_INSTRERR = 0x4400,
181         VM_EXREASON = 0x4402,
182         VM_EXINTRINFO = 0x4404,
183         VM_EXINTRCODE = 0x4406,
184         VM_IDTVECINFO = 0x4408,
185         VM_IDTVECCODE = 0x440A,
186         VM_EXINSTRLEN = 0x440C,
187         VM_EXINSTRINFO = 0x440E,
188         VM_EXQUALIF = 0x6400,
189         VM_IORCX = 0x6402,
190         VM_IORSI = 0x6404,
191         VM_IORDI = 0x6406,
192         VM_IORIP = 0x6408,
193         VM_GUESTVA = 0x640A,
194         VM_GUESTPA = 0x2400,
195         
196         VM_VPID = 0x000,
197         VM_EPTPIDX = 0x0004,
198         
199         VM_EPTP = 0x201A,
200         VM_EPTPLA = 0x2024,
201         
	/* NOTE(review): presumably an invalidation type for invept/invvpid; not used in the code visible here */
202         INVLOCAL = 1,
203 };
204
/*
 * Control-register bit masks.
 * CR0KERNEL and CR4KERNEL are the bits the guest-register writers keep
 * under kernel control: cr0realwrite/cr4realwrite preserve these bits
 * from the current VMCS value and take only the remaining bits from the
 * caller, and cr0maskwrite/cr4maskwrite always OR them into the mask.
 * Both also include the upper 32 bits of a 64-bit register.
 */
205 enum {
206         CR0RSVD = 0x1ffaffc0,
207         CR4RSVD = 0xff889000,
208         CR4MCE = 1<<6,
209         CR4VMXE = 1<<13,
210         CR4SMXE = 1<<14,
211         CR4PKE = 1<<22,
212         
213         CR0KERNEL = CR0RSVD | 0x30 | (uintptr)0xFFFFFFFF00000000ULL,
214         CR4KERNEL = CR4RSVD | CR4VMXE | CR4SMXE | CR4MCE | CR4PKE | (uintptr)0xFFFFFFFF00000000ULL
215 };
216
/* capacity, in (msr, value) pairs, of Vmx.msrhost and Vmx.msrguest; enforced by vmxaddmsr */
217 enum {
218         MAXMSR = 512,
219 };
220
/* forward declarations; the structures are defined below */
221 typedef struct VmxMach VmxMach;
222 typedef struct Vmx Vmx;
223 typedef struct VmCmd VmCmd;
224 typedef struct VmMem VmMem;
225 typedef struct VmIntr VmIntr;
226
/*
 * One entry of a VM's guest-physical memory map, kept on a circular
 * doubly linked list whose head is Vmx.mem.  An entry covers guest
 * addresses from addr up to the addr of the next entry (that is how
 * cmdgetmeminfo prints it).
 */
227 struct VmMem {
228         uvlong addr;	/* guest-physical start address */
229         Segment *seg;	/* backing segment; nil means nothing is mapped here */
230         uintptr off;	/* offset into seg that corresponds to addr */
231         char *name;	/* name the segment was attached by; freed with the entry */
232         VmMem *next, *prev;
233         u16int attr;	/* bits 0-2 r/w/x, bits 3-5 index into mtype, 0x40 printed as '!'; ORed into the EPT entry */
234 };
235
/*
 * Per-processor VMX state, allocated 4096-byte aligned by vmxreset and
 * hung off Mach.vmx.
 */
236 struct VmxMach {
237         char vmxon[4096]; /* has to be at the start for alignment */
238         QLock;
239         int vms;
240         Vmx *active;	/* Vmx whose VMCS was last loaded with vmptrld on this processor (see vmxprocrestore) */
241 };
/* VmxMach of the current processor (m) and of processor n */
242 #define vmxmach ((VmxMach*)((m)->vmx))
243 #define vmxmachp(n) ((VmxMach*)(MACHP(n)->vmx))
244
/*
 * Three 32-bit words describing an event.
 * NOTE(review): presumably they match the VMENTRY_INTRINFO, VMENTRY_INTRCODE
 * and VMENTRY_INTRILEN fields; the code that uses them is not visible here.
 */
245 struct VmIntr {
246         u32int info, code, ilen;
247 };
248
/*
 * State of one virtual machine.  The leading arrays carry alignment
 * requirements (noted per field), so their order must not change.
 */
249 struct Vmx {
250         uchar vmcs[4096]; /* page aligned */
251         uvlong pml4[512]; /* page aligned */
252         u32int msrbits[1024]; /* page aligned */
253         FPsave fp; /* page aligned */
254         u64int msrhost[MAXMSR*2]; /* 16 byte aligned */
255         u64int msrguest[MAXMSR*2]; /* 16 byte aligned */
256
	/* lifecycle state; printable names are in statenames[] */
257         enum {
258                 VMXINIT,
259                 VMXREADY,
260                 VMXRUNNING,
261                 VMXDEAD,
262                 VMXENDING,
263         } state;
264         int index, machno;
265         char errstr[ERRMAX];
266         Ureg ureg;
267         uintptr cr2;
268         uintptr dr[8]; /* DR7 is also kept in VMCS */
269         u8int launched;
270         u8int vpid;
	/* flag bits; FLUSHEPT is set whenever the EPT tables are changed, vmcsinit clears all of them */
271         enum {
272                 FLUSHVPID = 1,
273                 FLUSHEPT = 2,
274                 STEP = 4,
275                 POSTEX = 8,
276                 POSTIRQ = 16,
277         } onentry;
278         
279         Rendez cmdwait;
280         Lock cmdlock;
281         VmCmd *firstcmd, **lastcmd;
282         VmCmd *postponed;
	/* sentinel head of the circular VmMem list describing guest memory */
283         VmMem mem;
284         
285         enum {
286                 GOTEXIT = 1,
287                 GOTIRQACK = 2,
288         } got;
289         VmIntr exc, irq, irqack;
290
	/* number of (msr, value) pairs in use in msrhost and msrguest */
291         int nmsr;
292 };
293
/*
 * A command addressed to a VM.  The handler has the signature
 * int (*)(VmCmd *, va_list), which is the shape of cmdgetmeminfo,
 * cmdsetmeminfo and cmdclearmeminfo in this file.
 * NOTE(review): how commands are queued and completed is not visible
 * in this part of the file; the field comments below are limited to
 * what the declarations show.
 */
294 struct VmCmd {
295         enum {
296                 CMDFDONE = 1,
297                 CMDFFAIL = 2,
298                 CMDFPOSTP = 4,
299         } flags;
300         u8int scratched;
301         Rendez;
302         Lock;
303         int (*cmd)(VmCmd *, va_list);	/* handler */
304         int retval;
305         char *errstr;
306         va_list va;	/* handler arguments */
307         VmCmd *next;
308         Vmx *vmx;	/* VM the handler operates on */
309 };
310
/* error string for a VM that is ending */
311 static char Equit[] = "vmx: ending";
312
/* printable names of the Vmx.state values, indexed by state */
313 static char *statenames[] = {
314         [VMXINIT] "init",
315         [VMXREADY] "ready",
316         [VMXRUNNING] "running",
317         [VMXDEAD] "dead",
318         [VMXENDING]"ending"
319 };
320
/* NOTE(review): moribund is not used in the visible code; purpose to be confirmed */
321 static Vmx *moribund;
/* table of VMs: vmxtab has nvmxtab slots; presumably guarded by vmxtablock */
322 static QLock vmxtablock;
323 static Vmx **vmxtab;
324 static int nvmxtab;
325
326 void
327 vmxprocrestore(Proc *p)
328 {
329         int s;
330         Vmx *vmx;
331         
332         s = splhi();
333         vmx = p->vmx;
334         if(vmxmach->active != vmx){
335                 if(vmx != nil && vmptrld(PADDR(vmx->vmcs)) < 0)
336                         panic("VMPTRLD(%p) failed", vmx->vmcs);
337                 vmxmach->active = vmx;
338         }
339         splx(s);
340 }
341
342 static u64int
343 vmcsread(u32int addr)
344 {
345         int rc;
346         u64int val;
347
348         val = 0;
349         rc = vmread(addr, (uintptr *) &val);
350         if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000)
351                 rc = vmread(addr | 1, (uintptr *) &val + 1);
352         if(rc < 0){
353                 char errbuf[128];
354                 snprint(errbuf, sizeof(errbuf), "vmcsread failed (%#.4ux)", addr);
355                 error(errbuf);
356         }
357         return val;
358 }
359
360 static void
361 vmcswrite(u32int addr, u64int val)
362 {
363         int rc;
364         
365         rc = vmwrite(addr, val);
366         if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000)
367                 rc = vmwrite(addr | 1, val >> 32);
368         if(rc < 0){
369                 char errbuf[128];
370                 snprint(errbuf, sizeof(errbuf), "vmcswrite failed (%#.4ux = %#.16ullx)", addr, val);
371                 error(errbuf);
372         }
373 }
374
375 static uvlong
376 parseval(char *s)
377 {
378         uvlong v;
379         char *p;
380
381         v = strtoull(s, &p, 0);
382         if(p == s || *p != 0) error("invalid value");
383         return v;
384 }
385
386 static char *
387 cr0fakeread(Vmx *, char *p, char *e)
388 {
389         uvlong guest, mask, shadow;
390         
391         guest = vmcsread(GUEST_CR0);
392         mask = vmcsread(GUEST_CR0MASK);
393         shadow = vmcsread(GUEST_CR0SHADOW);
394         return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & ~mask | shadow & mask);
395 }
396
397 static char *
398 cr4fakeread(Vmx *, char *p, char *e)
399 {
400         uvlong guest, mask, shadow;
401         
402         guest = vmcsread(GUEST_CR4);
403         mask = vmcsread(GUEST_CR4MASK);
404         shadow = vmcsread(GUEST_CR4SHADOW);
405         return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & ~mask | shadow & mask);
406 }
407
408 static void
409 updatelma(void)
410 {
411         uvlong cr0, efer, nefer, ectrl;
412
413         if(sizeof(uintptr) != 8) return;
414         cr0 = vmcsread(GUEST_CR0);
415         efer = vmcsread(GUEST_IA32_EFER);
416         nefer = efer & ~0x400 | efer << 2 & cr0 >> 21 & 0x400;
417         if(efer == nefer) return;
418         vmcswrite(GUEST_IA32_EFER, nefer);
419         ectrl = vmcsread(VMENTRY_CTLS);
420         ectrl = ectrl & ~0x200 | nefer >> 1 & 0x200;
421         vmcswrite(VMENTRY_CTLS, ectrl);
422 }
423
424 static int
425 cr0realwrite(Vmx *, char *s)
426 {
427         uvlong v;
428         
429         v = parseval(s);
430         vmcswrite(GUEST_CR0, vmcsread(GUEST_CR0) & CR0KERNEL | v & ~CR0KERNEL);
431         updatelma();
432         return 0;
433 }
434
435 static int
436 cr0maskwrite(Vmx *, char *s)
437 {
438         uvlong v;
439         
440         v = parseval(s);
441         vmcswrite(GUEST_CR0MASK, v | CR0KERNEL);
442         return 0;
443 }
444
445 static int
446 eferwrite(Vmx *, char *s)
447 {
448         uvlong v;
449         
450         v = parseval(s);
451         vmcswrite(GUEST_IA32_EFER, v);
452         updatelma();
453         return 0;
454 }
455
456 static int
457 cr4realwrite(Vmx *, char *s)
458 {
459         uvlong v;
460         
461         v = parseval(s);
462         vmcswrite(GUEST_CR4, vmcsread(GUEST_CR4) & CR4KERNEL | v & ~CR4KERNEL);
463         return 0;
464 }
465
466 static int
467 cr4maskwrite(Vmx *, char *s)
468 {
469         uvlong v;
470         
471         v = parseval(s);
472         vmcswrite(GUEST_CR4MASK, v | CR4KERNEL);
473         return 0;
474 }
475
476 static int
477 dr7write(Vmx *vmx, char *s)
478 {
479         uvlong v;
480         
481         v = (u32int) parseval(s);
482         vmcswrite(GUEST_DR7, vmx->dr[7] = (u32int) v);
483         return 0;
484 }
485
486 static int
487 readonly(Vmx *, char *)
488 {
489         return -1;
490 }
491
492 static int
493 dr6write(Vmx *vmx, char *s)
494 {
495         uvlong v;
496         
497         v = parseval(s);
498         vmx->dr[6] = (u32int) v;
499         return 0;
500 }
501
/*
 * Description of one named guest register.
 * offset is either a VMCS field identifier or, for values that live
 * in the Vmx structure itself, the bitwise complement of the member's
 * byte offset (see VMXVAR).
 * NOTE(review): the code that interprets offset and calls read/write
 * is not visible here; presumably a nil read or write selects a
 * default accessor.
 */
502 typedef struct GuestReg GuestReg;
503 struct GuestReg {
504         int offset;
505         u8int size; /* in bytes; 0 means == uintptr */
506         char *name;
507         char *(*read)(Vmx *, char *, char *);	/* formats the value into [p, e) and returns the new end, like cr0fakeread */
508         int (*write)(Vmx *, char *);	/* parses a string and stores it, like cr0realwrite */
509 };
/* complement of the byte offset of member x within Vmx */
510 #define VMXVAR(x) ~(ulong)&(((Vmx*)0)->x)
/* the same, for a member of Vmx.ureg */
511 #define UREG(x) VMXVAR(ureg.x)
/*
 * Table of guest registers by name.  Columns: offset (VMCS field, or
 * VMXVAR/UREG for values held in Vmx), size in bytes (0 = uintptr),
 * name, optional read handler, optional write handler.  Entries
 * describing exit information use the readonly write handler.
 */
512 static GuestReg guestregs[] = {
513         {GUEST_RIP, 0, "pc"},
514         {GUEST_RSP, 0, "sp"},
515         {GUEST_RFLAGS, 0, "flags"},
516         {UREG(ax), 0, "ax"},
517         {UREG(bx), 0, "bx"},
518         {UREG(cx), 0, "cx"},
519         {UREG(dx), 0, "dx"},
520         {UREG(bp), 0, "bp"},
521         {UREG(si), 0, "si"},
522         {UREG(di), 0, "di"},
/* r8-r15 are only listed when RMACH is defined */
523 #ifdef RMACH
524         {UREG(r8), 0, "r8"},
525         {UREG(r9), 0, "r9"},
526         {UREG(r10), 0, "r10"},
527         {UREG(r11), 0, "r11"},
528         {UREG(r12), 0, "r12"},
529         {UREG(r13), 0, "r13"},
530         {UREG(r14), 0, "r14"},
531         {UREG(r15), 0, "r15"},
532 #endif
533         {GUEST_GDTRBASE, 0, "gdtrbase"},
534         {GUEST_GDTRLIMIT, 4, "gdtrlimit"},
535         {GUEST_IDTRBASE, 0, "idtrbase"},
536         {GUEST_IDTRLIMIT, 4, "idtrlimit"},
537         {GUEST_CS, 2, "cs"},
538         {GUEST_CSBASE, 0, "csbase"},
539         {GUEST_CSLIMIT, 4, "cslimit"},
540         {GUEST_CSPERM, 4, "csperm"},
541         {GUEST_DS, 2, "ds"},
542         {GUEST_DSBASE, 0, "dsbase"},
543         {GUEST_DSLIMIT, 4, "dslimit"},
544         {GUEST_DSPERM, 4, "dsperm"},
545         {GUEST_ES, 2, "es"},
546         {GUEST_ESBASE, 0, "esbase"},
547         {GUEST_ESLIMIT, 4, "eslimit"},
548         {GUEST_ESPERM, 4, "esperm"},
549         {GUEST_FS, 2, "fs"},
550         {GUEST_FSBASE, 0, "fsbase"},
551         {GUEST_FSLIMIT, 4, "fslimit"},
552         {GUEST_FSPERM, 4, "fsperm"},
553         {GUEST_GS, 2, "gs"},
554         {GUEST_GSBASE, 0, "gsbase"},
555         {GUEST_GSLIMIT, 4, "gslimit"},
556         {GUEST_GSPERM, 4, "gsperm"},
557         {GUEST_SS, 2, "ss"},
558         {GUEST_SSBASE, 0, "ssbase"},
559         {GUEST_SSLIMIT, 4, "sslimit"},
560         {GUEST_SSPERM, 4, "ssperm"},
561         {GUEST_TR, 2, "tr"},
562         {GUEST_TRBASE, 0, "trbase"},
563         {GUEST_TRLIMIT, 4, "trlimit"},
564         {GUEST_TRPERM, 4, "trperm"},
565         {GUEST_LDTR, 2, "ldtr"},
566         {GUEST_LDTRBASE, 0, "ldtrbase"},
567         {GUEST_LDTRLIMIT, 4, "ldtrlimit"},
568         {GUEST_LDTRPERM, 4, "ldtrperm"},
	/* control registers: "real" is the VMCS value, "fake" is the shadow/real blend, "mask" selects between them */
569         {GUEST_CR0, 0, "cr0real", nil, cr0realwrite},
570         {GUEST_CR0SHADOW, 0, "cr0fake", cr0fakeread},
571         {GUEST_CR0MASK, 0, "cr0mask", nil, cr0maskwrite},
572         {VMXVAR(cr2), 0, "cr2"},
573         {GUEST_CR3, 0, "cr3"},
574         {GUEST_CR4, 0, "cr4real", nil, cr4realwrite},
575         {GUEST_CR4SHADOW, 0, "cr4fake", cr4fakeread},
576         {GUEST_CR4MASK, 0, "cr4mask", nil, cr4maskwrite},
577         {GUEST_IA32_PAT, 8, "pat"},
578         {GUEST_IA32_EFER, 8, "efer", nil, eferwrite},
579         {VMXVAR(dr[0]), 0, "dr0"},
580         {VMXVAR(dr[1]), 0, "dr1"},
581         {VMXVAR(dr[2]), 0, "dr2"},
582         {VMXVAR(dr[3]), 0, "dr3"},
583         {VMXVAR(dr[6]), 0, "dr6", nil, dr6write},
584         {GUEST_DR7, 0, "dr7", nil, dr7write},
	/* exit information: not writable */
585         {VM_INSTRERR, 4, "instructionerror", nil, readonly},
586         {VM_EXREASON, 4, "exitreason", nil, readonly},
587         {VM_EXQUALIF, 0, "exitqualification", nil, readonly},
588         {VM_EXINTRINFO, 4, "exitinterruptinfo", nil, readonly},
589         {VM_EXINTRCODE, 4, "exitinterruptcode", nil, readonly},
590         {VM_EXINSTRLEN, 4, "exitinstructionlen", nil, readonly},
591         {VM_EXINSTRINFO, 4, "exitinstructioninfo", nil, readonly},
592         {VM_GUESTVA, 0, "exitva", nil, readonly},
593         {VM_GUESTPA, 0, "exitpa", nil, readonly},
594         {VM_IDTVECINFO, 4, "idtinterruptinfo", nil, readonly},
595         {VM_IDTVECCODE, 4, "idtinterruptcode", nil, readonly},
596 };
597
598 static int
599 vmokpage(u64int addr)
600 {
601         return (addr & 0xfff) == 0 && addr >> 48 == 0;
602 }
603
604 static uvlong *
605 eptwalk(Vmx *vmx, uvlong addr)
606 {
607         uvlong *tab, *nt;
608         uvlong v;
609         int i;
610         
611         tab = vmx->pml4;
612         if(tab == nil) error(Egreg);
613         for(i = 3; i >= 1; i--){
614                 tab += addr >> 12 + 9 * i & 0x1ff;
615                 v = *tab;
616                 if((v & 3) == 0){
617                         nt = mallocalign(BY2PG, BY2PG, 0, 0);
618                         if(nt == nil) error(Enomem);
619                         memset(nt, 0, BY2PG);
620                         v = PADDR(nt) | 0x407;
621                         *tab = v;
622                 }
623                 tab = KADDR(v & ~0xfff);
624         }
625         return tab + (addr >> 12 & 0x1ff);
626 }
627
628 static void
629 eptfree(uvlong *tab, int level)
630 {
631         int i;
632         uvlong v, *t;
633         
634         if(tab == nil) error(Egreg);
635         if(level < 3){
636                 for(i = 0; i < 512; i++){
637                         v = tab[i];
638                         if((v & 3) == 0) continue;
639                         t = KADDR(v & ~0xfff);
640                         eptfree(t, level + 1);
641                         tab[i] = 0;
642                 }
643         }
644         if(level > 0)
645                 free(tab);              
646 }
647
648 static void
649 epttranslate(Vmx *vmx, VmMem *mp, uvlong end)
650 {
651         uvlong p, v;
652
653         if((mp->addr & 0xfff) != 0 || (end & 0xfff) != 0 || (uint)mp->attr >= 0x1000)
654                 error(Egreg);
655         if(mp->seg != nil){
656                 switch(mp->seg->type & SG_TYPE){
657                 default:
658                         error(Egreg);
659                 case SG_FIXED:
660                 case SG_STICKY:
661                         break;
662                 }
663                 if(mp->seg->base + mp->off + (end - mp->addr) > mp->seg->top)
664                         error(Egreg);
665                 for(p = mp->addr, v = mp->off; p != end; p += BY2PG, v += BY2PG)
666                         *eptwalk(vmx, p) = mp->seg->map[v/PTEMAPMEM]->pages[(v & PTEMAPMEM-1)/BY2PG]->pa | mp->attr;
667         }else {
668                 for(p = mp->addr; p != end; p += BY2PG)
669                         *eptwalk(vmx, p) = 0;
670         }
671         vmx->onentry |= FLUSHEPT;
672 }
673
/*
 * Two-letter memory type names, indexed by bits 3-5 of VmMem.attr
 * (cmdgetmeminfo prints them, cmdsetmeminfo looks them up).
 */
674 static char *mtype[] = {"uc", "wc", "02", "03", "wt", "wp", "wb", "07"};
675
676 static int
677 cmdgetmeminfo(VmCmd *cmd, va_list va)
678 {
679         VmMem *mp;
680         char *p0, *e, *p;
681         char attr[4];
682         char mt[4];
683         
684         p0 = va_arg(va, char *);
685         e = va_arg(va, char *);
686         p = p0;
687         if(p < e) *p = 0;
688         for(mp = cmd->vmx->mem.next; mp != &cmd->vmx->mem; mp = mp->next){
689                 if(mp->seg == nil)
690                         continue;
691                 attr[0] = (mp->attr & 1) != 0 ? 'r' : '-';
692                 attr[1] = (mp->attr & 2) != 0 ? 'w' : '-';
693                 attr[2] = (mp->attr & 4) != 0 ? 'x' : '-';
694                 attr[3] = 0;
695                 *(ushort*)mt = *(u16int*)mtype[mp->attr >> 3 & 7];
696                 mt[2] = (mp->attr & 0x40) != 0 ? '!' : 0;
697                 mt[3] = 0;
698                 p = seprint(p, e, "%s %s %#llux %#llux %s %#llux\n", attr, mt, mp->addr, mp->next->addr, mp->name, (uvlong)mp->off);
699         }
700         return p - p0;
701 }
702
703 static void
704 vmmeminsert(VmMem *l, VmMem *p)
705 {
706         p->prev = l->prev;
707         p->next = l;
708         p->prev->next = p;
709         p->next->prev = p;
710 }
711
712 static VmMem *
713 vmmemremove(VmMem *p)
714 {
715         VmMem *r;
716         
717         r = p->next;
718         p->next->prev = p->prev;
719         p->prev->next = p->next;
720         free(p->name);
721         putseg(p->seg);
722         free(p);
723         return r;
724 }
725
726 static int
727 cmdclearmeminfo(VmCmd *cmd, va_list)
728 {
729         VmMem *mp;
730         Vmx *vmx;
731
732         vmx = cmd->vmx;
733         eptfree(cmd->vmx->pml4, 0);
734         for(mp = vmx->mem.next; mp != &vmx->mem; )
735                 mp = vmmemremove(mp);
736         vmx->mem.prev = &vmx->mem;
737         vmx->mem.next = &vmx->mem;
738         vmx->onentry |= FLUSHEPT;
739         return 0;
740 }
741
742
/*
 * Insert mp, describing guest addresses [mp->addr, end), into the
 * VM's memory list and trim or delete whatever it overlaps.
 * The list is kept sorted by addr; each entry extends up to the addr
 * of its successor.  Ownership of mp passes to the list.
 * NOTE(review): the final merge pass compares segment and offset but
 * not attr, so adjacent entries that differ only in attributes are
 * merged too; confirm that this is intended.
 */
743 static void
744 vmmemupdate(Vmx *vmx, VmMem *mp, uvlong end)
745 {
746         VmMem *p, *q;
747         
	/* scanning from the tail, find the last entry that starts at or before end (end == 0 stops at the tail) */
748         for(p = vmx->mem.prev; p != &vmx->mem; p = p->prev)
749                 if(p->addr <= end || end == 0)
750                         break;
	/*
	 * q becomes the entry that describes memory from end onwards.
	 * If the entry found starts before the new range (or only the
	 * list head was found), it is duplicated so that the original
	 * can keep describing what lies in front of mp.
	 */
751         if(p == &vmx->mem || p->addr < mp->addr){
752                 q = smalloc(sizeof(VmMem));
753                 *q = *p;
754                 if(p->seg != nil){
755                         incref(q->seg);
756                         kstrdup(&q->name, p->name);
757                 }
758                 vmmeminsert(p->next, q);
759         }else
760                 q = p;
	/* move q's start up to end, keeping its segment offset in step */
761         if(q->seg != nil)
762                 q->off += end - q->addr;
763         q->addr = end;
	/* insert mp in front of the first entry at or above mp->addr ... */
764         for(p = vmx->mem.next; p != &vmx->mem; p = p->next)
765                 if(p->addr >= mp->addr)
766                         break;
767         vmmeminsert(p, mp);
	/* ... and delete everything between mp and q */
768         while(p != q)
769                 p = vmmemremove(p);
	/* merge each entry into its predecessor when both have no segment, or share one with contiguous offsets */
770         for(p = vmx->mem.next; p != &vmx->mem; )
771                 if(p->seg == p->prev->seg && (p->seg == nil || p->addr - p->prev->addr == p->off - p->prev->off))
772                         p = vmmemremove(p);
773                 else
774                         p = p->next;
775 }
776
777 extern Segment* (*_globalsegattach)(char*);
778
779 static int
780 cmdsetmeminfo(VmCmd *cmd, va_list va)
781 {
782         char *p0, *p, *q, *r;
783         int j;
784         char *f[10];
785         VmMem *mp;
786         int rc;
787         uvlong end;
788         
789         p0 = va_arg(va, char *);
790         p = p0;
791         mp = nil;
792         for(;;){
793                 q = strchr(p, '\n');
794                 if(q == 0) break;
795                 *q = 0;
796                 if(mp == nil){
797                         mp = malloc(sizeof(VmMem));
798                         if(mp == nil)
799                                 error(Enomem);
800                 }
801                 memset(mp, 0, sizeof(VmMem));
802                 if(waserror()){
803                         putseg(mp->seg);
804                         free(mp->name);
805                         free(mp);
806                         nexterror();
807                 }
808                 rc = tokenize(p, f, nelem(f));
809                 p = q + 1;
810                 if(rc == 0){
811                         poperror();
812                         continue;
813                 }
814                 if(rc != 4 && rc != 6) error("number of fields wrong");
815                 for(q = f[0]; *q != 0; q++)
816                         switch(*q){
817                         case 'r': if((mp->attr & 1) != 0) goto tinval; mp->attr |= 1; break;
818                         case 'w': if((mp->attr & 2) != 0) goto tinval; mp->attr |= 2; break;
819                         case 'x': if((mp->attr & 4) != 0) goto tinval; mp->attr |= 0x404; break;
820                         case '-': break;
821                         default: tinval: error("invalid access field");
822                         }
823                 for(j = 0; j < 8; j++)
824                         if(strncmp(mtype[j], f[1], 2) == 0){
825                                 mp->attr |= j << 3;
826                                 break;
827                         }
828                 if(j == 8 || strlen(f[1]) > 3) error("invalid memory type");
829                 if(f[1][2] == '!') mp->attr |= 0x40;
830                 else if(f[1][2] != 0) error("invalid memory type");
831                 mp->addr = strtoull(f[2], &r, 0);
832                 if(*r != 0 || !vmokpage(mp->addr)) error("invalid low guest physical address");
833                 end = strtoull(f[3], &r, 0);
834                 if(*r != 0 || !vmokpage(end) || end <= mp->addr) error("invalid high guest physical address");
835                 if((mp->attr & 7) != 0){
836                         if(rc != 6) error("number of fields wrong");
837                         mp->seg = _globalsegattach(f[4]);
838                         if(mp->seg == nil) error("no such segment");
839                         if(mp->seg->base + mp->off + (end - mp->addr) > mp->seg->top) error("out of bounds");
840                         kstrdup(&mp->name, f[4]);
841                         mp->off = strtoull(f[5], &r, 0);
842                         if(*r != 0 || !vmokpage(mp->off)) error("invalid offset");
843                 }
844                 poperror();
845                 epttranslate(cmd->vmx, mp, end);
846                 vmmemupdate(cmd->vmx, mp, end);
847                 mp = nil;
848         }
849         free(mp);
850         return p - p0;
851 }
852
853 static void
854 vmxreset(void)
855 {
856         ulong regs[4];
857         vlong msr;
858         int i;
859
860         cpuid(1, regs);
861         if((regs[2] & 1<<5) == 0) return;
862         /* check if disabled by BIOS */
863         if(rdmsr(0x3a, &msr) < 0) return;
864         if((msr & 5) != 5){
865                 if((msr & 1) == 0){ /* msr still unlocked */
866                         wrmsr(0x3a, msr | 5);
867                         if(rdmsr(0x3a, &msr) < 0)
868                                 return;
869                 }
870                 if((msr & 5) != 5)
871                         return;
872         }
873         if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) return;
874         if((vlong)msr >= 0) return;
875         if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) return;
876         if((msr >> 32 & PROCB_EPT) == 0 || (msr >> 32 & PROCB_VPID) == 0) return;
877         for(i = 0; i < conf.nmach; i++){
878                 MACHP(i)->vmx = mallocalign(sizeof(VmxMach), 4096, 0, 0);
879                 if(vmxmachp(i) == nil)
880                         error(Enomem);
881         }
882         gotvmx = 1;
883 }
884
885 static void
886 vmxaddmsr(Vmx *vmx, u32int msr, u64int gval)
887 {
888         int i;
889
890         if(vmx->nmsr >= MAXMSR)
891                 error("too many MSRs");
892         i = 2 * vmx->nmsr++;
893         vmx->msrhost[i] = msr;
894         rdmsr(msr, (vlong *) &vmx->msrhost[i+1]);
895         vmx->msrguest[i] = msr;
896         vmx->msrguest[i+1] = gval;
897         vmcswrite(VMENTRY_MSRLDCNT, vmx->nmsr);
898         vmcswrite(VMEXIT_MSRSTCNT, vmx->nmsr);
899         vmcswrite(VMEXIT_MSRLDCNT, vmx->nmsr);
900 }
901
902 static void
903 vmxtrapmsr(Vmx *vmx, u32int msr, enum { TRAPRD = 1, TRAPWR = 2 } state)
904 {
905         u32int m;
906         
907         if(msr >= 0x2000 && (u32int)(msr - 0xc0000000) >= 0x2000)
908                 return;
909         msr = msr & 0x1fff | msr >> 18 & 0x2000;
910         m = 1<<(msr & 31);
911         if((state & TRAPRD) != 0)
912                 vmx->msrbits[msr / 32] |= m;
913         else
914                 vmx->msrbits[msr / 32] &= ~m;
915         if((state & TRAPWR) != 0)
916                 vmx->msrbits[msr / 32 + 512] |= m;
917         else
918                 vmx->msrbits[msr / 32 + 512] &= ~m;
919 }
920
921 static void
922 vmcsinit(Vmx *vmx)
923 {
924         vlong msr;
925         u32int x;
926         
927         memset(&vmx->ureg, 0, sizeof(vmx->ureg));
928         vmx->launched = 0;
929         vmx->onentry = 0;       
930         
931         if(rdmsr(VMX_BASIC_MSR, &msr) < 0) error("rdmsr(VMX_BASIC_MSR) failed");
932         if((msr & 1ULL<<55) != 0){
933                 if(rdmsr(VMX_TRUE_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_TRUE_PROCB_CTLS_MSR) failed");
934                 if(rdmsr(VMX_TRUE_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_TRUE_PINB_CTLS_MSR) failed");
935         }else{
936                 if(rdmsr(VMX_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR) failed");
937                 if(rdmsr(VMX_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_PINB_CTLS_MSR) failed");
938         }
939
940         if(rdmsr(VMX_PINB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PINB_CTLS_MSR failed");
941         x = (u32int)pinb_ctls | 1<<1 | 1<<2 | 1<<4; /* currently reserved default1 bits */
942         x |= PINB_EXITIRQ | PINB_EXITNMI;
943         x &= pinb_ctls >> 32;
944         vmcswrite(PINB_CTLS, x);
945         
946         if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR failed");
947         x = (u32int)procb_ctls | 1<<1 | 7<<4 | 1<<8 | 1<<13 | 1<<14 | 1<<26; /* currently reserved default1 bits */
948         x |= PROCB_EXITHLT | PROCB_EXITMWAIT;
949         x |= PROCB_EXITMOVDR | PROCB_EXITIO | PROCB_EXITMONITOR | PROCB_MSRBITMAP;
950         x |= PROCB_USECTLS2;
951         x &= msr >> 32;
952         vmcswrite(PROCB_CTLS, x);
953         
954         if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS2_MSR failed");
955         x = PROCB_EPT | PROCB_VPID | PROCB_UNRESTR;
956         x &= msr >> 32;
957         vmcswrite(PROCB_CTLS2, x);
958         
959         if(rdmsr(VMX_VMEXIT_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMEXIT_CTLS_MSR failed");
960         x = (u32int)msr;
961         if(sizeof(uintptr) == 8) x |= VMEXIT_HOST64;
962         x |= VMEXIT_LD_IA32_PAT | VMEXIT_LD_IA32_EFER | VMEXIT_ST_DEBUG | VMEXIT_ST_IA32_EFER;
963         x &= msr >> 32;
964         vmcswrite(VMEXIT_CTLS, x);
965         
966         if(rdmsr(VMX_VMENTRY_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMENTRY_CTLS_MSR failed");
967         x = (u32int)msr;
968         x |= VMENTRY_LD_IA32_PAT | VMENTRY_LD_IA32_EFER | VMENTRY_LD_DEBUG;
969         x &= msr >> 32;
970         vmcswrite(VMENTRY_CTLS, x);
971         
972         vmcswrite(CR3_TARGCNT, 0);
973         vmcswrite(VMENTRY_INTRINFO, 0);
974         vmcswrite(VMCS_LINK, -1);
975         
976         vmcswrite(HOST_CS, KESEL);
977         vmcswrite(HOST_DS, KDSEL);
978         vmcswrite(HOST_ES, KDSEL);
979         vmcswrite(HOST_FS, KDSEL);
980         vmcswrite(HOST_GS, KDSEL);
981         vmcswrite(HOST_SS, KDSEL);
982         vmcswrite(HOST_TR, TSSSEL);
983         vmcswrite(HOST_CR0, getcr0() & ~0xe);
984         vmcswrite(HOST_CR3, getcr3());
985         vmcswrite(HOST_CR4, getcr4());
986         rdmsr(FSbase, &msr);
987         vmcswrite(HOST_FSBASE, msr);
988         rdmsr(GSbase, &msr);
989         vmcswrite(HOST_GSBASE, msr);
990         vmcswrite(HOST_TRBASE, (uintptr) m->tss);
991         vmcswrite(HOST_GDTR, (uintptr) m->gdt);
992         vmcswrite(HOST_IDTR, IDTADDR);
993         if(rdmsr(0x277, &msr) < 0) error("rdmsr(IA32_PAT) failed");
994         vmcswrite(HOST_IA32_PAT, msr);
995         if(rdmsr(Efer, &msr) < 0) error("rdmsr(IA32_EFER) failed");
996         vmcswrite(HOST_IA32_EFER, msr);
997         
998         vmcswrite(EXC_BITMAP, 1<<18|1<<1);
999         vmcswrite(PFAULT_MASK, 0);
1000         vmcswrite(PFAULT_MATCH, 0);
1001         
1002         vmcswrite(GUEST_CSBASE, 0);
1003         vmcswrite(GUEST_DSBASE, 0);
1004         vmcswrite(GUEST_ESBASE, 0);
1005         vmcswrite(GUEST_FSBASE, 0);
1006         vmcswrite(GUEST_GSBASE, 0);
1007         vmcswrite(GUEST_SSBASE, 0);
1008         vmcswrite(GUEST_CSLIMIT, -1);
1009         vmcswrite(GUEST_DSLIMIT, -1);
1010         vmcswrite(GUEST_ESLIMIT, -1);
1011         vmcswrite(GUEST_FSLIMIT, -1);
1012         vmcswrite(GUEST_GSLIMIT, -1);
1013         vmcswrite(GUEST_SSLIMIT, -1);
1014         vmcswrite(GUEST_CSPERM, (SEGG|SEGD|SEGP|SEGPL(0)|SEGEXEC|SEGR) >> 8 | 1);
1015         vmcswrite(GUEST_DSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
1016         vmcswrite(GUEST_ESPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
1017         vmcswrite(GUEST_FSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
1018         vmcswrite(GUEST_GSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
1019         vmcswrite(GUEST_SSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1);
1020         vmcswrite(GUEST_LDTRPERM, 1<<16);
1021
1022         vmcswrite(GUEST_CR0MASK, CR0KERNEL);
1023         vmcswrite(GUEST_CR4MASK, CR4KERNEL);
1024         vmcswrite(GUEST_CR0, getcr0() & CR0KERNEL | 0x31);
1025         vmcswrite(GUEST_CR3, 0);
1026         vmcswrite(GUEST_CR4, getcr4() & CR4KERNEL);
1027         vmcswrite(GUEST_CR0SHADOW, getcr0() & CR0KERNEL | 0x31);
1028         vmcswrite(GUEST_CR4SHADOW, getcr4() & ~CR4VMXE & CR4KERNEL);
1029         
1030         vmcswrite(GUEST_IA32_PAT, 0x0007040600070406ULL);
1031         vmcswrite(GUEST_IA32_EFER, 0);
1032         
1033         vmcswrite(GUEST_TRBASE, 0);
1034         vmcswrite(GUEST_TRLIMIT, 0xffff);
1035         vmcswrite(GUEST_TRPERM, (SEGTSS|SEGPL(0)|SEGP) >> 8 | 2);
1036
1037         vmcswrite(VM_EPTP, PADDR(vmx->pml4) | 3<<3);
1038         vmx->vpid = 1;
1039         vmcswrite(VM_VPID, vmx->vpid);
1040         
1041         vmcswrite(GUEST_RFLAGS, 2);
1042         
1043         vmx->onentry = FLUSHVPID | FLUSHEPT;
1044         fpinit();
1045         fpsave(&vmx->fp);
1046         
1047         memset(vmx->msrbits, -1, 4096);
1048         vmxtrapmsr(vmx, Efer, 0);
1049         vmcswrite(VMENTRY_MSRLDADDR, PADDR(vmx->msrguest));
1050         vmcswrite(VMEXIT_MSRSTADDR, PADDR(vmx->msrguest));
1051         vmcswrite(VMEXIT_MSRLDADDR, PADDR(vmx->msrhost));
1052         vmcswrite(MSR_BITMAP, PADDR(vmx->msrbits));
1053         
1054         if(sizeof(uintptr) == 8){
1055                 vmxaddmsr(vmx, Star, 0);
1056                 vmxaddmsr(vmx, Lstar, 0);
1057                 vmxaddmsr(vmx, Cstar, 0);
1058                 vmxaddmsr(vmx, Sfmask, 0);
1059                 vmxaddmsr(vmx, KernelGSbase, 0);
1060                 vmxtrapmsr(vmx, Star, 0);
1061                 vmxtrapmsr(vmx, Lstar, 0);
1062                 vmxtrapmsr(vmx, Cstar, 0);
1063                 vmxtrapmsr(vmx, Sfmask, 0);
1064                 vmxtrapmsr(vmx, FSbase, 0);
1065                 vmxtrapmsr(vmx, GSbase, 0);
1066                 vmxtrapmsr(vmx, KernelGSbase, 0);
1067         }
1068 }
1069
1070 static void
1071 vmxstart(Vmx *vmx)
1072 {
1073         vlong msr, msr2;
1074         uintptr cr;
1075         vlong x;
1076
1077         putcr4(getcr4() | 0x2000); /* set VMXE */
1078         putcr0(getcr0() | 0x20); /* set NE */
1079         cr = getcr0();
1080         if(rdmsr(VMX_CR0_FIXED0, &msr) < 0) error("rdmsr(VMX_CR0_FIXED0) failed");
1081         if(rdmsr(VMX_CR0_FIXED1, &msr2) < 0) error("rdmsr(VMX_CR0_FIXED1) failed");
1082         if((cr & ~msr & ~msr2 | ~cr & msr & msr2) != 0) error("invalid CR0 value");
1083         cr = getcr4();
1084         if(rdmsr(VMX_CR4_FIXED0, &msr) < 0) error("rdmsr(VMX_CR4_FIXED0) failed");
1085         if(rdmsr(VMX_CR4_FIXED1, &msr2) < 0) error("rdmsr(VMX_CR4_FIXED1) failed");
1086         if((cr & ~msr & ~msr2 | ~cr & msr & msr2) != 0) error("invalid CR4 value");
1087         
1088         rdmsr(VMX_BASIC_MSR, &x);
1089         qlock(vmxmach);
1090         if(waserror()){
1091                 qunlock(vmxmach);
1092                 nexterror();
1093         }
1094         if(vmxmach->vms == 0){
1095                 memset(vmxmach->vmxon, 0, sizeof(vmxmach->vmxon));
1096                 *(ulong*)vmxmach->vmxon = x;
1097                 if(vmxon(PADDR(vmxmach->vmxon)) < 0)
1098                         error("vmxon failed");
1099         }
1100         vmxmach->vms++;
1101         qunlock(vmxmach);
1102         poperror();
1103
1104         memset(vmx->vmcs, 0, sizeof(vmx->vmcs));
1105         *(ulong*)vmx->vmcs = x;
1106         if(vmclear(PADDR(vmx->vmcs)) < 0)
1107                 error("vmclear failed");
1108         up->vmx = vmx;
1109         vmxprocrestore(up);
1110         vmcsinit(vmx);
1111 }
1112
1113 static void
1114 cmdrelease(VmCmd *p, int f)
1115 {
1116         lock(p);
1117         p->flags |= CMDFDONE | f;
1118         wakeup(p);
1119         unlock(p);
1120 }
1121
1122 static void
1123 killcmds(Vmx *vmx, VmCmd *notme)
1124 {
1125         VmCmd *p, *pn;
1126         
1127         for(p = vmx->postponed; p != nil; p = pn){
1128                 pn = p->next;
1129                 p->next = nil;
1130                 if(p == notme) continue;
1131                 kstrcpy(p->errstr, Equit, ERRMAX);
1132                 cmdrelease(p, CMDFFAIL);
1133         }
1134         vmx->postponed = nil;
1135         ilock(&vmx->cmdlock);
1136         for(p = vmx->firstcmd; p != nil; p = pn){
1137                 pn = p->next;
1138                 p->next = nil;
1139                 if(p == notme) continue;
1140                 kstrcpy(p->errstr, Equit, ERRMAX);
1141                 cmdrelease(p, CMDFFAIL);
1142         }
1143         vmx->firstcmd = nil;
1144         vmx->lastcmd = &vmx->firstcmd;
1145         iunlock(&vmx->cmdlock);
1146 }
1147
/*
 * "quit" command: tear the VM down and terminate the calling process
 * with pexit().
 *
 * The VM is marked VMXENDING, every other pending command is failed
 * with Equit, the VMCS is detached from up and cleared, VMX operation
 * is left on this machine if this was its last VM, and the Vmx is
 * removed from vmxtab and freed.  Only then is the quit command itself
 * released to its issuer.
 */
1148 static int
1149 cmdquit(VmCmd *p, va_list va)
1150 {
1151         Vmx *vmx;
1152         
1153         vmx = p->vmx;
1154         vmx->state = VMXENDING;
             /* fail everything queued or postponed, except this command */
1155         killcmds(vmx, p);
1156
             /* NOTE(review): presumably drops the guest memory map; defined outside this section */
1157         cmdclearmeminfo(p, va);
1158         
             /* detach the VM from this process; the vmclear result is ignored */
1159         up->vmx = nil;
1160         vmxprocrestore(up);
1161         vmclear(PADDR(vmx->vmcs));
1162         
             /* last VM on this machine: execute vmxoff */
1163         qlock(vmxmach);
1164         if(--vmxmach->vms == 0)
1165                 vmxoff();
1166         qunlock(vmxmach);
1167         
1168         qlock(&vmxtablock);
1169         if(moribund == vmx)
1170                 moribund = nil;
1171         vmxtab[vmx->index] = nil;
1172         qunlock(&vmxtablock);
1173         free(vmx);
1174         
1175         cmdrelease(p, 0);
1176         pexit(Equit, 1);
             /* NOTE(review): presumably not reached, pexit() is not expected to return */
1177         return 0;
1178 }
1179
1180 static void
1181 processexit(Vmx *vmx)
1182 {
1183         u32int reason;
1184         
1185         reason = vmcsread(VM_EXREASON);
1186         if((reason & 1<<31) == 0)
1187                 switch(reason & 0xffff){
1188                 case 1: /* external interrupt */
1189                 case 3: /* INIT */
1190                 case 4: /* SIPI */
1191                 case 5: /* IO SMI */
1192                 case 6: /* SMI */
1193                 case 7: /* IRQ window */
1194                 case 8: /* NMI window */
1195                         return;
1196                 }
1197         vmx->state = VMXREADY;
1198         vmx->got |= GOTEXIT;
1199         vmx->onentry &= ~STEP;
1200 }
1201
1202 static int
1203 cmdgetregs(VmCmd *cmd, va_list va)
1204 {
1205         char *p0, *e;
1206         GuestReg *r;
1207         uvlong val;
1208         int s;
1209         char *p;
1210         
1211         p0 = va_arg(va, char *);
1212         e = va_arg(va, char *);
1213         p = p0;
1214         for(r = guestregs; r < guestregs + nelem(guestregs); r++)
1215                 if(r->read != nil){
1216                         p = seprint(p, e, "%s ", r->name);
1217                         p = r->read(cmd->vmx, p, e);
1218                         p = strecpy(p, e, "\n");
1219                 }else{
1220                         if(r->offset >= 0)
1221                                 val = vmcsread(r->offset);
1222                         else
1223                                 val = *(uintptr*)((uchar*)cmd->vmx + ~r->offset);
1224                         s = r->size;
1225                         if(s == 0) s = sizeof(uintptr);
1226                         p = seprint(p, e, "%s %#.*llux\n", r->name, s * 2, val);
1227                 }
1228         return p - p0;
1229 }
1230
1231 static int
1232 setregs(Vmx *vmx, char *p0, char rs, char *fs)
1233 {
1234         char *p, *q, *rp;
1235         char *f[10];
1236         GuestReg *r;
1237         uvlong val;
1238         int sz;
1239         int rc;
1240
1241         p = p0;
1242         for(;;){
1243                 q = strchr(p, rs);
1244                 if(q == 0) break;
1245                 *q = 0;
1246                 rc = getfields(p, f, nelem(f), 1, fs);
1247                 p = q + 1;
1248                 if(rc == 0) continue;
1249                 if(rc != 2) error("number of fields wrong");
1250                 
1251                 for(r = guestregs; r < guestregs + nelem(guestregs); r++)
1252                         if(strcmp(r->name, f[0]) == 0)
1253                                 break;
1254                 if(r == guestregs + nelem(guestregs))
1255                         error("unknown register");
1256                 if(r->write != nil){
1257                         r->write(vmx, f[1]);
1258                         continue;
1259                 }
1260                 val = strtoull(f[1], &rp, 0);
1261                 sz = r->size;
1262                 if(sz == 0) sz = sizeof(uintptr);
1263                 if(rp == f[1] || *rp != 0) error("invalid value");
1264                 if(r->offset >= 0)
1265                         vmcswrite(r->offset, val);
1266                 else{
1267                         assert((u32int)~r->offset + sz <= sizeof(Vmx)); 
1268                         switch(sz){
1269                         case 1: *(u8int*)((u8int*)vmx + (u32int)~r->offset) = val; break;
1270                         case 2: *(u16int*)((u8int*)vmx + (u32int)~r->offset) = val; break;
1271                         case 4: *(u32int*)((u8int*)vmx + (u32int)~r->offset) = val; break;
1272                         case 8: *(u64int*)((u8int*)vmx + (u32int)~r->offset) = val; break;
1273                         default: error(Egreg);
1274                         }
1275                 }
1276         }
1277         return p - p0;
1278 }
1279
1280 static int
1281 cmdsetregs(VmCmd *cmd, va_list va)
1282 {
1283         return setregs(cmd->vmx, va_arg(va, char *), '\n', " \t");
1284 }
1285
1286 static int
1287 cmdgetfpregs(VmCmd *cmd, va_list va)
1288 {
1289         uchar *p;
1290         
1291         p = va_arg(va, uchar *);
1292         memmove(p, &cmd->vmx->fp, sizeof(FPsave));
1293         return sizeof(FPsave);
1294 }
1295
1296 static int
1297 cmdsetfpregs(VmCmd *cmd, va_list va)
1298 {
1299         uchar *p;
1300         ulong n;
1301         vlong off;
1302         
1303         p = va_arg(va, uchar *);
1304         n = va_arg(va, ulong);
1305         off = va_arg(va, vlong);
1306         if(off < 0 || off >= sizeof(FPsave)) n = 0;
1307         else if(off + n > sizeof(FPsave)) n = sizeof(FPsave) - n;
1308         memmove((uchar*)&cmd->vmx->fp + off, p, n);
1309         return n;
1310 }
1311
1312 static int
1313 cmdgo(VmCmd *cmd, va_list va)
1314 {
1315         int step;
1316         char *r;
1317         Vmx *vmx;
1318         
1319         vmx = cmd->vmx;
1320         if(vmx->state != VMXREADY)
1321                 error("VM not ready");
1322         step = va_arg(va, int);
1323         r = va_arg(va, char *);
1324         if(r != nil) setregs(vmx, r, ';', "=");
1325         if(step) vmx->onentry |= STEP;
1326         vmx->state = VMXRUNNING;
1327         return 0;
1328 }
1329
1330 static int
1331 cmdstop(VmCmd *cmd, va_list)
1332 {
1333         Vmx *vmx;
1334         
1335         vmx = cmd->vmx;
1336         if(vmx->state != VMXREADY && vmx->state != VMXRUNNING)
1337                 error("VM not ready or running");
1338         vmx->state = VMXREADY;
1339         return 0;
1340 }
1341
1342 static int
1343 cmdstatus(VmCmd *cmd, va_list va)
1344 {       
1345         kstrcpy(va_arg(va, char *), cmd->vmx->errstr, ERRMAX);
1346         return cmd->vmx->state;
1347 }
1348
/*
 * Names for VM exit reasons, indexed by the low 16 bits of VM_EXREASON
 * (see cmdwait).  Indices without an entry are nil and are printed
 * numerically instead.
 */
1349 static char *exitreasons[] = {
1350         [0] "exc", [1] "extirq", [2] "triplef", [3] "initsig", [4] "sipi", [5] "smiio", [6] "smiother", [7] "irqwin",
1351         [8] "nmiwin", [9] "taskswitch", [10] ".cpuid", [11] ".getsec", [12] ".hlt", [13] ".invd", [14] ".invlpg", [15] ".rdpmc",
1352         [16] ".rdtsc", [17] ".rsm", [18] ".vmcall", [19] ".vmclear", [20] ".vmlaunch", [21] ".vmptrld", [22] ".vmptrst", [23] ".vmread",
1353         [24] ".vmresume", [25] ".vmwrite", [26] ".vmxoff", [27] ".vmxon", [28] "movcr", [29] ".movdr", [30] "io", [31] ".rdmsr",
1354         [32] ".wrmsr", [33] "entrystate", [34] "entrymsr", [36] ".mwait", [37] "monitortrap", [39] ".monitor",
1355         [40] ".pause", [41] "mcheck", [43] "tpr", [44] "apicacc", [45] "eoi", [46] "gdtr_idtr", [47] "ldtr_tr",
1356         [48] "eptfault", [49] "eptinval", [50] ".invept", [51] ".rdtscp", [52] "preempt", [53] ".invvpid", [54] ".wbinvd", [55] ".xsetbv",
1357         [56] "apicwrite", [57] ".rdrand", [58] ".invpcid", [59] ".vmfunc", [60] ".encls", [61] ".rdseed", [62] "pmlfull", [63] ".xsaves",
1358         [64] ".xrstors", 
1359 };
1360
/*
 * Names for exception vectors, indexed by vector number.  cmdwait uses
 * the table to print an exception exit, eventparse to turn a name back
 * into a vector.  Vectors without an entry are nil.
 */
1361 static char *except[] = {
1362         [0] "#de", [1] "#db", [3] "#bp", [4] "#of", [5] "#br", [6] "#ud", [7] "#nm",
1363         [8] "#df", [10] "#ts", [11] "#np", [12] "#ss", [13] "#gp", [14] "#pf",
1364         [16] "#mf", [17] "#ac", [18] "#mc", [19] "#xm", [20] "#ve",
1365 };
1366
1367 static int
1368 cmdwait(VmCmd *cp, va_list va)
1369 {
1370         char *p, *p0, *e;
1371         u32int reason, intr;
1372         uvlong qual;
1373         u16int rno;
1374         Vmx *vmx;
1375
1376         if(cp->scratched)
1377                 error(Eintr);
1378         vmx = cp->vmx;
1379         p0 = p = va_arg(va, char *);
1380         e = va_arg(va, char *);
1381         if((vmx->got & GOTIRQACK) != 0){
1382                 p = seprint(p, e, "*ack %d\n", vmx->irqack.info & 0xff);
1383                 vmx->got &= ~GOTIRQACK;
1384                 return p - p0;
1385         }
1386         if((vmx->got & GOTEXIT) == 0){
1387                 cp->flags |= CMDFPOSTP;
1388                 return -1;
1389         }
1390         vmx->got &= ~GOTEXIT;
1391         reason = vmcsread(VM_EXREASON);
1392         qual = vmcsread(VM_EXQUALIF);
1393         rno = reason;
1394         intr = vmcsread(VM_EXINTRINFO);
1395         if((reason & 1<<31) != 0)
1396                 p = seprint(p, e, "!");
1397         if(rno == 0 && (intr & 1<<31) != 0){
1398                 if((intr & 0xff) >= nelem(except) || except[intr & 0xff] == nil)
1399                         p = seprint(p, e, "#%d ", intr & 0xff);
1400                 else
1401                         p = seprint(p, e, "%s ", except[intr & 0xff]);
1402         }else if(rno >= nelem(exitreasons) || exitreasons[rno] == nil)
1403                 p = seprint(p, e, "?%d ", rno);
1404         else
1405                 p = seprint(p, e, "%s ", exitreasons[rno]);
1406         p = seprint(p, e, "%#ullx pc %#ullx sp %#ullx ilen %#ullx iinfo %#ullx", qual, vmcsread(GUEST_RIP), vmcsread(GUEST_RSP), vmcsread(VM_EXINSTRLEN), vmcsread(VM_EXINSTRINFO));
1407         if((intr & 1<<11) != 0) p = seprint(p, e, " excode %#ullx", vmcsread(VM_EXINTRCODE));
1408         if(rno == 48 && (qual & 0x80) != 0) p = seprint(p, e, " va %#ullx", vmcsread(VM_GUESTVA));
1409         if(rno == 48 || rno == 49) p = seprint(p, e, " pa %#ullx", vmcsread(VM_GUESTPA));
1410         if(rno == 30) p = seprint(p, e, " ax %#ullx", (uvlong)vmx->ureg.ax);
1411         p = seprint(p, e, "\n");
1412         return p - p0;
1413 }
1414
1415 static void
1416 eventparse(char *p, VmIntr *vi)
1417 {
1418         char *q, *r;
1419         int i;
1420         
1421         memset(vi, 0, sizeof(VmIntr));
1422         q = nil;
1423         kstrdup(&q, p);
1424         if(waserror()){
1425                 free(q);
1426                 memset(vi, 0, sizeof(VmIntr));
1427                 nexterror();
1428         }
1429         vi->info = 1<<31;
1430         r = strchr(q, ',');
1431         if(r != nil) *r++ = 0;
1432         for(i = 0; i < nelem(except); i++)
1433                 if(except[i] != nil && strcmp(except[i], q) == 0)
1434                         break;
1435         if(*q == '#'){
1436                 q++;
1437                 vi->info |= 3 << 8;
1438         }
1439         if(i == nelem(except)){
1440                 i = strtoul(q, &q, 10);
1441                 if(*q != 0 || i > 255) error(Ebadctl);
1442         }
1443         vi->info |= i;
1444         if((vi->info & 0x7ff) == 3 || (vi->info & 0x7ff) == 4)
1445                 vi->info += 3 << 8;
1446         if(r == nil) goto out;
1447         if(*r != ','){
1448                 vi->code = strtoul(r, &r, 0);
1449                 vi->info |= 1<<11;
1450         }else r++;
1451         if(*r == ',')
1452                 vi->ilen = strtoul(r + 1, &r, 0);
1453         if(*r != 0) error(Ebadctl);
1454 out:
1455         poperror();
1456         free(q);
1457 }
1458
1459 static int
1460 cmdexcept(VmCmd *cp, va_list va)
1461 {
1462         Vmx *vmx;
1463         
1464         vmx = cp->vmx;
1465         if(cp->scratched) error(Eintr);
1466         if((vmx->onentry & POSTEX) != 0){
1467                 cp->flags |= CMDFPOSTP;
1468                 return 0;
1469         }
1470         eventparse(va_arg(va, char *), &vmx->exc);
1471         vmx->onentry |= POSTEX;
1472         return 0;
1473 }
1474
1475 static int
1476 cmdirq(VmCmd *cmd, va_list va)
1477 {
1478         char *p;
1479         VmIntr vi;
1480         Vmx *vmx;
1481         
1482         vmx = cmd->vmx;
1483         p = va_arg(va, char *);
1484         if(p == nil)
1485                 vmx->onentry &= ~POSTIRQ;
1486         else{
1487                 eventparse(p, &vi);
1488                 vmx->irq = vi;
1489                 vmx->onentry |= POSTIRQ;
1490         }
1491         return 0;
1492 }
1493
1494 static int
1495 cmdextrap(VmCmd *, va_list va)
1496 {
1497         char *p, *q;
1498         u32int v;
1499         
1500         p = va_arg(va, char *);
1501         v = strtoul(p, &q, 0);
1502         if(q == p || *q != 0) error(Ebadarg);
1503         vmcswrite(EXC_BITMAP, v);
1504         return 0;
1505 }
1506
1507 static int
1508 gotcmd(void *vmxp)
1509 {
1510         int rc;
1511         Vmx *vmx;
1512
1513         vmx = vmxp;
1514         ilock(&vmx->cmdlock);
1515         rc = vmx->firstcmd != nil;
1516         iunlock(&vmx->cmdlock);
1517         return rc;
1518 }
1519
1520 static void
1521 markcmddone(VmCmd *p, VmCmd ***pp)
1522 {
1523         if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP){
1524                 **pp = p;
1525                 *pp = &p->next;
1526         }else{
1527                 p->flags = p->flags & ~CMDFPOSTP;
1528                 cmdrelease(p, 0);
1529         }
1530 }
1531
1532 static VmCmd **
1533 markppcmddone(VmCmd **pp)
1534 {
1535         VmCmd *p;
1536         
1537         p = *pp;
1538         if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP)
1539                 return &p->next;
1540         *pp = p->next;
1541         p->next = nil;
1542         p->flags = p->flags & ~CMDFPOSTP;
1543         cmdrelease(p, 0);
1544         return pp;
1545 }
1546
1547
/*
 * Run the commands pending for vmx, on the VM's own process.
 *
 * First every command on the postponed list is tried again, then the
 * incoming queue is drained.  A command that raises an error gets the
 * error string copied into its errstr and CMDFFAIL set.  Commands that
 * set CMDFPOSTP without failing stay on (or are appended to) the
 * postponed list; all others are released to their issuers.
 */
1548 static void
1549 runcmd(Vmx *vmx)
1550 {
1551         VmCmd *p, **pp;
1552         
             /*
              * Retry postponed commands.  pp is the link that refers to the
              * current command; markppcmddone() either advances it or unlinks
              * the command.  When the loop ends pp is the tail link of the list.
              */
1553         for(pp = &vmx->postponed; p = *pp, p != nil; ){
1554                 if(waserror()){
1555                         kstrcpy(p->errstr, up->errstr, ERRMAX);
1556                         p->flags |= CMDFFAIL;
1557                         pp = markppcmddone(pp);
1558                         continue;
1559                 }
                     /* the command sets CMDFPOSTP again if it still cannot complete */
1560                 p->flags &= ~CMDFPOSTP;
1561                 p->retval = p->cmd(p, p->va);
1562                 poperror();
1563                 pp = markppcmddone(pp);
1564         }
             /*
              * Drain the incoming queue.  Each command is taken off the head
              * under cmdlock and run with the lock released.
              */
1565         for(;;){
1566                 ilock(&vmx->cmdlock);
1567                 p = vmx->firstcmd;
1568                 if(p == nil){
1569                         iunlock(&vmx->cmdlock);
1570                         break;
1571                 }
1572                 vmx->firstcmd = p->next;
                     /* removed the last entry: point the tail link back at the head */
1573                 if(vmx->lastcmd == &p->next)
1574                         vmx->lastcmd = &vmx->firstcmd;
1575                 iunlock(&vmx->cmdlock);
1576                 p->next = nil;
1577                 if(waserror()){
1578                         kstrcpy(p->errstr, up->errstr, ERRMAX);
1579                         p->flags |= CMDFFAIL;
1580                         markcmddone(p, &pp);
1581                         continue;
1582                 }
                     /* scratched is set by vmxcmd() when the issuer's sleep raised an error */
1583                 if(p->scratched) error(Eintr);
1584                 p->retval = p->cmd(p, p->va);
1585                 poperror();
                     /* appends to the postponed list at pp, or releases the command */
1586                 markcmddone(p, &pp);
1587         }
1588 }
1589
1590 static void
1591 vmxproc(void *vmxp)
1592 {
1593         int init, rc, x;
1594         u32int procbctls, defprocbctls;
1595         vlong v;
1596         Vmx *vmx;
1597
1598         vmx = vmxp;
1599         procwired(up, vmx->machno);
1600         sched();
1601         init = 0;
1602         defprocbctls = 0;
1603         while(waserror()){
1604                 kstrcpy(vmx->errstr, up->errstr, ERRMAX);
1605                 vmx->state = VMXDEAD;
1606         }
1607         for(;;){
1608                 if(!init){
1609                         init = 1;
1610                         vmxstart(vmx);
1611                         vmx->state = VMXREADY;
1612                         defprocbctls = vmcsread(PROCB_CTLS);
1613                 }
1614                 runcmd(vmx);
1615                 if(vmx->state == VMXRUNNING){
1616                         procbctls = defprocbctls;
1617                         if((vmx->onentry & STEP) != 0)
1618                                 defprocbctls |= PROCB_MONTRAP;
1619                         if((vmx->onentry & POSTEX) != 0){
1620                                 vmcswrite(VMENTRY_INTRINFO, vmx->exc.info);
1621                                 vmcswrite(VMENTRY_INTRCODE, vmx->exc.code);
1622                                 vmcswrite(VMENTRY_INTRILEN, vmx->exc.ilen);
1623                                 vmx->onentry &= ~POSTEX;
1624                         }
1625                         if((vmx->onentry & POSTIRQ) != 0 && (vmx->onentry & STEP) == 0){
1626                                 if((vmx->onentry & POSTEX) == 0 && (vmcsread(GUEST_RFLAGS) & 1<<9) != 0 && (vmcsread(GUEST_CANINTR) & 3) == 0){
1627                                         vmcswrite(VMENTRY_INTRINFO, vmx->irq.info);
1628                                         vmcswrite(VMENTRY_INTRCODE, vmx->irq.code);
1629                                         vmcswrite(VMENTRY_INTRILEN, vmx->irq.ilen);
1630                                         vmx->onentry &= ~POSTIRQ;
1631                                         vmx->got |= GOTIRQACK;
1632                                         vmx->irqack = vmx->irq;
1633                                 }else
1634                                         procbctls |= PROCB_IRQWIN;
1635                         }
1636                         if((vmx->onentry & FLUSHVPID) != 0){
1637                                 if(invvpid(INVLOCAL, vmx->vpid, 0) < 0)
1638                                         error("invvpid failed");
1639                                 vmx->onentry &= ~FLUSHVPID;
1640                         }
1641                         if((vmx->onentry & FLUSHEPT) != 0){
1642                                 if(invept(INVLOCAL, PADDR(vmx->pml4) | 3<<3, 0) < 0)
1643                                         error("invept failed");
1644                                 vmx->onentry &= ~FLUSHEPT;
1645                         }
1646                         vmcswrite(PROCB_CTLS, procbctls);
1647                         vmx->got &= ~GOTEXIT;
1648                         
1649                         x = splhi();
1650                         if(sizeof(uintptr) == 8){
1651                                 rdmsr(FSbase, &v);
1652                                 vmwrite(HOST_FSBASE, v);
1653                         }
1654                         if((vmx->dr[7] & ~0xd400) != 0)
1655                                 putdr01236(vmx->dr);
1656                         fpsserestore(&vmx->fp);
1657                         putcr2(vmx->cr2);
1658                         rc = vmlaunch(&vmx->ureg, vmx->launched);
1659                         vmx->cr2 = getcr2();
1660                         fpssesave(&vmx->fp);
1661                         splx(x);
1662                         if(rc < 0)
1663                                 error("vmlaunch failed");
1664                         vmx->launched = 1;
1665                         processexit(vmx);
1666                 }else{
1667                         up->psstate = "Idle";
1668                         sleep(&vmx->cmdwait, gotcmd, vmx);
1669                         up->psstate = nil;
1670                 }
1671         }
1672 }
1673
/*
 * Qid path of the "clone" file in the top-level directory.
 * The top-level directory itself has path Qdir.
 */
1674 enum {
1675         /* Qdir */
1676         Qclone = 1,
1677 };
1678
/*
 * File numbers inside a VM's directory; they occupy the low 8 bits
 * of a qid path (see the FILE macro).
 */
1679 enum {
1680         Qdir,
1681         Qctl,
1682         Qregs,
1683         Qstatus,
1684         Qmap,
1685         Qwait,
1686         Qfpregs,
1687 };
1688
/*
 * Contents of a VM's directory: name, qid, length, permissions.
 * vmxgen() indexes this table directly and ORs the VM's slot bits
 * into the qid path of the entry it returns.
 */
1689 static Dirtab vmxdir[] = {
1690         ".",            { Qdir, 0, QTDIR },     0,              0550,
1691         "ctl",          { Qctl, 0, 0 },         0,              0660,
1692         "regs",         { Qregs, 0, 0 },        0,              0660,
1693         "status",       { Qstatus, 0, 0 },      0,              0440,
1694         "map",          { Qmap, 0, 0 },         0,              0660,
1695         "wait",         { Qwait, 0, 0 },        0,              0440,
1696         "fpregs",       { Qfpregs, 0, 0 },      0,              0660,
1697 };
1698
/* Indices of the control messages listed in vmxctlmsg[]. */
1699 enum {
1700         CMquit,
1701         CMgo,
1702         CMstop,
1703         CMstep,
1704         CMexc,
1705         CMirq,
1706         CMextrap,
1707 };
1708
/*
 * Control message table: index, message text, field count.
 * NOTE(review): presumably parsed with lookupcmd(), where a count of 0
 * means a variable number of fields; the consumer is outside this
 * section, confirm there.
 */
1709 static Cmdtab vmxctlmsg[] = {
1710         CMquit,         "quit",         1,
1711         CMgo,           "go",           0,
1712         CMstop,         "stop",         1,
1713         CMstep,         "step",         0,
1714         CMexc,          "exc",          2,
1715         CMirq,          "irq",          0,
1716         CMextrap,       "extrap",       2,
1717 };
1718
/* NOTE(review): buffer size constant; its users are outside this section */
1719 enum { AUXSIZE = 4096 };
1720
1721 static Vmx *
1722 vmxlook(vlong n)
1723 {
1724         if(n < 0) return nil;
1725         if(n >= nvmxtab) return nil;
1726         return vmxtab[n];
1727 }
/*
 * Qid path layout: the bits from 8 upwards hold the vmxtab slot plus
 * one, the low 8 bits the file number.  QIDPATH builds a path from a
 * slot and a file number; SLOT and FILE take a Qid apart again.
 */
#define QIDPATH(q,e) ((((q) + 1) << 8) | (e))
#define SLOT(q) (((vlong)((q).path >> 8)) - 1)
#define FILE(q) (((int)(q).path) & 0xff)
1731 static Vmx *
1732 vmxent(Qid q)
1733 {
1734         Vmx *vmx;
1735
1736         eqlock(&vmxtablock);
1737         if(waserror()){
1738                 qunlock(&vmxtablock);
1739                 nexterror();
1740         }
1741         vmx = vmxlook(SLOT(q));
1742         qunlock(&vmxtablock);
1743         poperror();
1744         return vmx;
1745 }
1746
1747 static int
1748 iscmddone(void *cp)
1749 {
1750         return (((VmCmd*)cp)->flags & CMDFDONE) != 0;
1751 }
1752
1753 static int
1754 vmxcmd(Vmx *vmx, int (*f)(VmCmd *, va_list), ...)
1755 {
1756         VmCmd cmd;
1757
1758         if(vmx->state == VMXENDING)
1759         ending:
1760                 error(Equit);
1761         memset(&cmd, 0, sizeof(VmCmd));
1762         cmd.vmx = vmx;
1763         cmd.errstr = up->errstr;
1764         cmd.cmd = f;
1765         va_start(cmd.va, f);
1766          
1767         ilock(&vmx->cmdlock);
1768         if(vmx->state == VMXENDING){
1769                 iunlock(&vmx->cmdlock);
1770                 goto ending;
1771         }
1772         *vmx->lastcmd = &cmd;
1773         vmx->lastcmd = &cmd.next;
1774         iunlock(&vmx->cmdlock);
1775         
1776         while(waserror())
1777                 cmd.scratched = 1;
1778         wakeup(&vmx->cmdwait);
1779         do
1780                 sleep(&cmd, iscmddone, &cmd);
1781         while(!iscmddone(&cmd));
1782         poperror();
1783         lock(&cmd);
1784         unlock(&cmd);
1785         if((cmd.flags & CMDFFAIL) != 0)
1786                 error(up->errstr);
1787         return cmd.retval;
1788 }
1789
1790 static Vmx *
1791 vmxnew(void)
1792 {
1793         Vmx *vmx;
1794         Vmx **newtab;
1795         int i, mi, mv;
1796         
1797         vmx = mallocalign(sizeof(Vmx), 4096, 0, 0);
1798         if(waserror()){
1799
1800                 free(vmx);
1801                 nexterror();
1802         }
1803         vmx->state = VMXINIT;
1804         vmx->lastcmd = &vmx->firstcmd;
1805         vmx->mem.next = &vmx->mem;
1806         vmx->mem.prev = &vmx->mem;
1807         vmx->index = -1;
1808         
1809         eqlock(&vmxtablock);
1810         if(waserror()){
1811                 if(vmx->index >= 0)
1812                         vmxtab[vmx->index] = 0;
1813                 qunlock(&vmxtablock);
1814                 nexterror();
1815         }
1816         for(i = 0; i < nvmxtab; i++)
1817                 if(vmxtab[i] == nil){
1818                         vmxtab[i] = vmx;
1819                         vmx->index = i;
1820                         break;
1821                 }
1822         if(i == nvmxtab){
1823                 newtab = realloc(vmxtab, (nvmxtab + 1) * sizeof(Vmx *));
1824                 if(newtab == nil)
1825                         error(Enomem);
1826                 vmxtab = newtab;
1827                 vmxtab[nvmxtab] = vmx;
1828                 vmx->index = nvmxtab++;
1829         }
1830         kproc("kvmx", vmxproc, vmx);
1831         qunlock(&vmxtablock);
1832         poperror();
1833         poperror();
1834         mi = 0;
1835         mv = 0x7fffffff;
1836         for(i = 0; i < conf.nmach; i++)
1837                 if(vmxmachp(i)->vms < mv){
1838                         mi = i;
1839                         mv = vmxmachp(i)->vms;
1840                 }
1841         vmx->machno = mi;
1842         if(vmxcmd(vmx, cmdstatus, up->errstr) == VMXDEAD)
1843                 error(up->errstr);
1844         return vmx;
1845 }
1846
1847 static void
1848 vmxshutdown(void)
1849 {
1850         int i;
1851         
1852         for(i = 0; i < nvmxtab; i++)
1853                 if(vmxtab[i] != nil)
1854                         vmxcmd(vmxtab[i], cmdquit);
1855 }
1856
1857 static Chan *
1858 vmxattach(char *spec)
1859 {
1860         if(!gotvmx) error(Enodev);
1861         return devattach('X', spec);
1862 }
1863
1864 static int
1865 vmxgen(Chan *c, char *, Dirtab *, int, int s, Dir *dp)
1866 {
1867         Dirtab *tab;
1868         int path;
1869
1870         if(s == DEVDOTDOT){
1871                 devdir(c, (Qid){Qdir, 0, QTDIR}, "#X", 0, eve, 0555, dp);
1872                 return 1;
1873         }
1874         if(c->qid.path == Qdir){
1875                 if(s-- == 0) goto clone;
1876                 if(s >= nvmxtab)
1877                         return -1;
1878                 if(vmxlook(s) == nil)
1879                         return 0;
1880                 sprint(up->genbuf, "%d", s);
1881                 devdir(c, (Qid){QIDPATH(s, Qdir), 0, QTDIR}, up->genbuf, 0, eve, DMDIR|0555, dp);
1882                 return 1;
1883         }
1884         if(c->qid.path == Qclone){
1885         clone:
1886                 strcpy(up->genbuf, "clone");
1887                 devdir(c, (Qid){Qclone, 0, QTFILE}, up->genbuf, 0, eve, 0444, dp);
1888                 return 1;
1889         }
1890         if(s >= nelem(vmxdir))
1891                 return -1;
1892         tab = &vmxdir[s];
1893         path = QIDPATH(SLOT(c->qid), 0);
1894         devdir(c, (Qid){tab->qid.path|path, tab->qid.vers, tab->qid.type}, tab->name, tab->length, eve, tab->perm, dp);
1895         return 1;
1896 }
1897
1898 static Walkqid*
1899 vmxwalk(Chan *c, Chan *nc, char **name, int nname)
1900 {
1901         Walkqid *rc;
1902
1903         eqlock(&vmxtablock);
1904         if(waserror()){
1905                 qunlock(&vmxtablock);
1906                 nexterror();
1907         }
1908         rc = devwalk(c, nc, name, nname, nil, 0, vmxgen);
1909         qunlock(&vmxtablock);
1910         poperror();
1911         return rc;
1912 }
1913
1914 static int
1915 vmxstat(Chan *c, uchar *dp, int n)
1916 {
1917         int rc;
1918         
1919         eqlock(&vmxtablock);
1920         if(waserror()){
1921                 qunlock(&vmxtablock);
1922                 nexterror();
1923         }
1924         rc = devstat(c, dp, n, nil, 0, vmxgen);
1925         qunlock(&vmxtablock);
1926         poperror();
1927         return rc;
1928 }
1929
1930 static Chan*
1931 vmxopen(Chan* c, int omode)
1932 {
1933         Chan *ch;
1934         Vmx *vmx;
1935
1936         if(c->qid.path == Qclone){
1937                 if(!iseve()) error(Eperm);
1938                 vmx = vmxnew();
1939                 c->qid.path = QIDPATH(vmx->index, Qctl);
1940         }
1941         eqlock(&vmxtablock);
1942         if(waserror()){
1943                 qunlock(&vmxtablock);
1944                 nexterror();
1945         }
1946         vmx = vmxlook(SLOT(c->qid));
1947         if(SLOT(c->qid) >= 0 && vmx == nil) error(Enonexist);
1948         if(FILE(c->qid) != Qdir && !iseve()) error(Eperm);
1949         ch = devopen(c, omode, nil, 0, vmxgen);
1950         qunlock(&vmxtablock);
1951         poperror();
1952         ch->aux = smalloc(AUXSIZE);
1953         if(SLOT(ch->qid) >= 0 && FILE(ch->qid) == Qmap){
1954                 if((omode & OTRUNC) != 0)
1955                         vmxcmd(vmx, cmdclearmeminfo);
1956         }
1957         return ch;
1958 }
1959
1960 static void
1961 vmxclunk(Chan *ch)
1962 {
1963         free(ch->aux);
1964         ch->aux = nil;
1965 }
1966
1967 static void
1968 vmxremove(Chan *ch)
1969 {
1970         Vmx *vmx, *old;
1971
1972         vmxclunk(ch);
1973         if(SLOT(ch->qid) == -1 || FILE(ch->qid) != Qctl)
1974                 error(Eperm);
1975         vmx = vmxent(ch->qid);
1976         if(vmx == nil)
1977                 error(Enonexist);
1978         qlock(&vmxtablock);
1979         old = moribund;
1980         moribund = vmx;
1981         qunlock(&vmxtablock);
1982         if(old != nil)
1983                 vmxcmd(old, cmdquit);
1984 }
1985
1986 static void
1987 vmxclose(Chan *ch)
1988 {
1989         if((ch->flag & CRCLOSE) != 0)
1990                 vmxremove(ch);
1991         else
1992                 vmxclunk(ch);
1993 }
1994
1995
/*
 * Read from a file in #X.
 *
 * c   - open channel; its qid selects the VM slot and the file
 * a   - destination buffer
 * n   - maximum number of bytes to return
 * off - file offset
 *
 * Returns the number of bytes placed in a.  Raises Enonexist if the
 * VM named by the qid no longer exists and Egreg for files that have
 * no read handler.
 */
static long
vmxread(Chan* c, void* a, long n, vlong off)
{
	int rc;
	Vmx *vmx;

	/* slot -1: a file outside any VM directory; only the root directory is readable */
	if(SLOT(c->qid) == -1){
		switch((int)c->qid.path){
		case Qdir:
		dirread:
			/* also reached by goto for per-VM directories below */
			eqlock(&vmxtablock);
			if(waserror()){
				qunlock(&vmxtablock);
				nexterror();
			}
			rc = devdirread(c, a, n, nil, 0, vmxgen);
			qunlock(&vmxtablock);
			poperror();
			return rc;
		default:
			error(Egreg);
		}
	}
	vmx = vmxent(c->qid);
	if(vmx == nil) error(Enonexist);
	switch(FILE(c->qid)){
	case Qdir:
		goto dirread;
	case Qctl:
		/* ctl reads back the VM's table index in decimal */
		{
			char buf[20];
			
			sprint(buf, "%d", vmx->index);
			return readstr(off, a, n, buf);
		}
	case Qregs:
		/* refresh the text in c->aux only at offset 0; later offsets read the same snapshot */
		if(off == 0)
			vmxcmd(vmx, cmdgetregs, c->aux, (char *) c->aux + AUXSIZE);
		return readstr(off, a, n, c->aux);
	case Qmap:
		/* same snapshot scheme as Qregs */
		if(off == 0)
			vmxcmd(vmx, cmdgetmeminfo, c->aux, (char *) c->aux + AUXSIZE);
		return readstr(off, a, n, c->aux);
	case Qstatus:
		{
			char buf[ERRMAX+128];
			char errbuf[ERRMAX];
			int status;
			
			status = vmx->state;
			if(status == VMXDEAD){
				/* a dead VM also reports the string cmdstatus stores in errbuf, quoted */
				vmxcmd(vmx, cmdstatus, errbuf);
				snprint(buf, sizeof(buf), "%s %#q\n", statenames[status], errbuf);
			}else if(status >= 0 && status < nelem(statenames))
				snprint(buf, sizeof(buf), "%s\n", statenames[status]);
			else
				/* state without a name: print the raw number */
				snprint(buf, sizeof(buf), "%d\n", status);
			return readstr(off, a, n, buf);
		}
	case Qwait:
		{
			char buf[512];
			
			/* cmdwait fills buf and returns a byte count; off is ignored and the result is truncated to n */
			rc = vmxcmd(vmx, cmdwait, buf, buf + sizeof(buf));
			if(rc > n) rc = n;
			if(rc > 0) memmove(a, buf, rc);
			return rc;
		}
	case Qfpregs:
		{
			char buf[sizeof(FPsave)];
			
			vmxcmd(vmx, cmdgetfpregs, buf);
			/* clamp the request to the window [off, sizeof(buf)) of the FPsave-sized buffer */
			if(n < 0 || off < 0 || off >= sizeof(buf)) n = 0;
			else if(off + n > sizeof(buf)) n = sizeof(buf) - off;
			if(n != 0) memmove(a, buf + off, n);
			return n;
		}
	default:
		error(Egreg);
		break;
	}
	return 0;
}
2080
/*
 * Write to a file in #X.
 *
 * c   - open channel; its qid selects the VM slot and the file
 * a   - source buffer
 * n   - number of bytes in a
 * off - file offset (only passed on for fpregs)
 *
 * ctl writes return n; map, regs and fpregs writes return the value
 * of the corresponding vmxcmd call.  Raises Eperm for directories,
 * Enonexist if the VM no longer exists, Egreg for files without a
 * write handler, and whatever the issued command raises.
 */
static long
vmxwrite(Chan* c, void* a, long n, vlong off)
{
	Cmdbuf *cb;
	Cmdtab *ct;
	char *s;
	int rc;
	Vmx *vmx;

	/* slot -1: nothing outside a VM directory is writable */
	if(SLOT(c->qid) == -1){
		switch((int)c->qid.path){
		case Qdir:
			error(Eperm);
		default:
			error(Egreg);
		}
	}
	vmx = vmxent(c->qid);
	if(vmx == nil) error(Enonexist);
	switch(FILE(c->qid)){
	case Qdir:
		error(Eperm);
	case Qctl:
		cb = parsecmd(a, n);
		/* outer error frame: frees cb if anything below raises */
		if(waserror()){
			free(cb);
			nexterror();
		}
		ct = lookupcmd(cb, vmxctlmsg, nelem(vmxctlmsg));
		switch(ct->index){
		case CMquit:
			vmxcmd(vmx, cmdquit);
			break;
		case CMgo:
		case CMstep:
			/* optional single argument, duplicated so it can be handed to the command */
			s = nil;
			if(cb->nf == 2) kstrdup(&s, cb->f[1]);
			else if(cb->nf != 1) error(Ebadarg);
			if(waserror()){
				free(s);
				nexterror();
			}
			/* second argument is 1 for "step", 0 for "go" */
			vmxcmd(vmx, cmdgo, ct->index == CMstep, s);
			poperror();
			free(s);
			break;
		case CMstop:
			vmxcmd(vmx, cmdstop);
			break;
		case CMexc:
			/*
			 * NOTE(review): cb->f[1] is used without checking cb->nf here;
			 * presumably the vmxctlmsg entry makes lookupcmd enforce the
			 * argument count -- confirm.
			 */
			s = nil;
			kstrdup(&s, cb->f[1]);
			if(waserror()){
				free(s);
				nexterror();
			}
			vmxcmd(vmx, cmdexcept, s);
			poperror();
			free(s);
			break;
		case CMirq:
			/* argument is optional; s stays nil when it is absent */
			s = nil;
			if(cb->nf == 2)
				kstrdup(&s, cb->f[1]);
			if(waserror()){
				free(s);
				nexterror();
			}
			vmxcmd(vmx, cmdirq, s);
			poperror();
			free(s);
			break;
		case CMextrap:
			/* NOTE(review): same unchecked cb->f[1] as CMexc -- confirm the table enforces it */
			s = nil;
			kstrdup(&s, cb->f[1]);
			if(waserror()){
				free(s);
				nexterror();
			}
			vmxcmd(vmx, cmdextrap, s);
			poperror();
			free(s);
			break;

		default:
			error(Egreg);
		}
		poperror();
		free(cb);
		break;
	case Qmap:
	case Qregs:
		/* hand the command a NUL-terminated private copy of the written text */
		s = malloc(n+1);
		if(s == nil) error(Enomem);
		if(waserror()){
			free(s);
			nexterror();
		}
		memmove(s, a, n);
		s[n] = 0;
		rc = vmxcmd(vmx, FILE(c->qid) == Qregs ? cmdsetregs : cmdsetmeminfo, s);
		poperror();
		free(s);
		return rc;
	case Qfpregs:
		{
			char buf[sizeof(FPsave)];
			
			/* at most sizeof(FPsave) bytes are staged; n and off are passed on to the command */
			if(n > sizeof(FPsave)) n = sizeof(FPsave);
			memmove(buf, a, n);
			return vmxcmd(vmx, cmdsetfpregs, buf, n, off);
		}
	default:
		error(Egreg);
		break;
	}
	return n;
}
2199
/*
 * Device table entry for the vmx driver: device character 'X', name "vmx".
 * The initializer is positional.  Operations without a vmx-specific
 * routine use the dev* routines (devinit, devcreate, devbread, devbwrite,
 * devwstat).
 * NOTE(review): slot order presumably follows the Dev structure declared
 * in the port headers -- confirm when adding or reordering entries.
 */
Dev vmxdevtab = {
	'X',
	"vmx",
	
	vmxreset,
	devinit,
	vmxshutdown,
	vmxattach,
	vmxwalk,
	vmxstat,
	vmxopen,
	devcreate,
	vmxclose,
	vmxread,
	devbread,
	vmxwrite,
	devbwrite,
	vmxremove,
	devwstat,
};