KVM: Portability: Move x86 instruction emulation code to x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  *
8  * Authors:
9  *   Avi Kivity   <avi@qumranet.com>
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *
12  * This work is licensed under the terms of the GNU GPL, version 2.  See
13  * the COPYING file in the top-level directory.
14  *
15  */
16
17 #include "kvm.h"
18 #include "x86.h"
19 #include "segment_descriptor.h"
20 #include "irq.h"
21
22 #include <linux/kvm.h>
23 #include <linux/fs.h>
24 #include <linux/vmalloc.h>
25 #include <linux/module.h>
26
27 #include <asm/uaccess.h>
28
29 #define MAX_IO_MSRS 256
30 #define CR0_RESERVED_BITS                                               \
31         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
32                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
33                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
34 #define CR4_RESERVED_BITS                                               \
35         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
36                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
37                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
38                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
39
40 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
41 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
42
43 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
44
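/*
 * Maps each debugfs statistics file name to the offset of its counter
 * inside struct kvm_vcpu; the list is terminated by a NULL name.
 */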
45 struct kvm_stats_debugfs_item debugfs_entries[] = {
46         { "pf_fixed", STAT_OFFSET(pf_fixed) },
47         { "pf_guest", STAT_OFFSET(pf_guest) },
48         { "tlb_flush", STAT_OFFSET(tlb_flush) },
49         { "invlpg", STAT_OFFSET(invlpg) },
50         { "exits", STAT_OFFSET(exits) },
51         { "io_exits", STAT_OFFSET(io_exits) },
52         { "mmio_exits", STAT_OFFSET(mmio_exits) },
53         { "signal_exits", STAT_OFFSET(signal_exits) },
54         { "irq_window", STAT_OFFSET(irq_window_exits) },
55         { "halt_exits", STAT_OFFSET(halt_exits) },
56         { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
57         { "request_irq", STAT_OFFSET(request_irq_exits) },
58         { "irq_exits", STAT_OFFSET(irq_exits) },
59         { "light_exits", STAT_OFFSET(light_exits) },
60         { "efer_reload", STAT_OFFSET(efer_reload) },
61         { NULL }
62 };
63
64
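/*
 * Return the linear base address of a host segment selector by reading
 * its descriptor from the GDT, or from the LDT when the selector's
 * table-indicator bit (bit 2) is set.
 */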
65 unsigned long segment_base(u16 selector)
66 {
67         struct descriptor_table gdt;
68         struct segment_descriptor *d;
69         unsigned long table_base;
70         unsigned long v;
71
72         if (selector == 0)
73                 return 0;
74
75         asm("sgdt %0" : "=m"(gdt));
76         table_base = gdt.base;
77
78         if (selector & 4) {           /* from ldt */
79                 u16 ldt_selector;
80
81                 asm("sldt %0" : "=g"(ldt_selector));
82                 table_base = segment_base(ldt_selector);
83         }
84         d = (struct segment_descriptor *)(table_base + (selector & ~7));
85         v = d->base_low | ((unsigned long)d->base_mid << 16) |
86                 ((unsigned long)d->base_high << 24);
87 #ifdef CONFIG_X86_64
88         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
89                 v |= ((unsigned long) \
90                       ((struct segment_descriptor_64 *)d)->base_higher) << 32;
91 #endif
92         return v;
93 }
94 EXPORT_SYMBOL_GPL(segment_base);
95
96 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
97 {
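        /*
         * With an in-kernel irqchip the LAPIC code keeps vcpu->apic_base
         * up to date whenever the base changes, so the cached field should
         * be valid in both cases.
         */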
98         if (irqchip_in_kernel(vcpu->kvm))
99                 return vcpu->apic_base;
100         else
101                 return vcpu->apic_base;
102 }
103 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
104
105 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
106 {
107         /* TODO: reserve bits check */
108         if (irqchip_in_kernel(vcpu->kvm))
109                 kvm_lapic_set_base(vcpu, data);
110         else
111                 vcpu->apic_base = data;
112 }
113 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
114
115 static void inject_gp(struct kvm_vcpu *vcpu)
116 {
117         kvm_x86_ops->inject_gp(vcpu, 0);
118 }
119
120 /*
121  * Load the pae pdptrs.  Return true if they are all valid.
122  */
123 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
124 {
125         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
126         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
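        /*
         * In PAE mode CR3 bits 31:5 hold the 32-byte-aligned address of the
         * PDPT; 'offset' is its position within the page counted in 64-bit
         * entries (each 32-byte block holds four of them).
         */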
127         int i;
128         int ret;
129         u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
130
131         mutex_lock(&vcpu->kvm->lock);
132         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
133                                   offset * sizeof(u64), sizeof(pdpte));
134         if (ret < 0) {
135                 ret = 0;
136                 goto out;
137         }
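        /*
         * Reject a PDPTE that is present (bit 0) but has reserved bits set:
         * bits 1-2 and 5-8, plus address bits 36 and above, as covered by
         * the mask below.
         */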
138         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
139                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
140                         ret = 0;
141                         goto out;
142                 }
143         }
144         ret = 1;
145
146         memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
147 out:
148         mutex_unlock(&vcpu->kvm->lock);
149
150         return ret;
151 }
152
153 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
154 {
155         if (cr0 & CR0_RESERVED_BITS) {
156                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
157                        cr0, vcpu->cr0);
158                 inject_gp(vcpu);
159                 return;
160         }
161
162         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
163                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
164                 inject_gp(vcpu);
165                 return;
166         }
167
168         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
169                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
170                        "and a clear PE flag\n");
171                 inject_gp(vcpu);
172                 return;
173         }
174
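        /*
         * Enabling paging: with EFER.LME set this is the switch into long
         * mode, which requires PAE and a non-64-bit code segment; otherwise,
         * if PAE is already on, the PDPTRs must load cleanly from CR3.
         */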
175         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
176 #ifdef CONFIG_X86_64
177                 if ((vcpu->shadow_efer & EFER_LME)) {
178                         int cs_db, cs_l;
179
180                         if (!is_pae(vcpu)) {
181                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
182                                        "in long mode while PAE is disabled\n");
183                                 inject_gp(vcpu);
184                                 return;
185                         }
186                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
187                         if (cs_l) {
188                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
189                                        "in long mode while CS.L == 1\n");
190                                 inject_gp(vcpu);
191                                 return;
192
193                         }
194                 } else
195 #endif
196                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
197                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
198                                "reserved bits\n");
199                         inject_gp(vcpu);
200                         return;
201                 }
202
203         }
204
205         kvm_x86_ops->set_cr0(vcpu, cr0);
206         vcpu->cr0 = cr0;
207
208         mutex_lock(&vcpu->kvm->lock);
209         kvm_mmu_reset_context(vcpu);
210         mutex_unlock(&vcpu->kvm->lock);
211         return;
212 }
213 EXPORT_SYMBOL_GPL(set_cr0);
214
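/* LMSW can only touch the low four CR0 bits (PE, MP, EM and TS). */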
215 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
216 {
217         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
218 }
219 EXPORT_SYMBOL_GPL(lmsw);
220
221 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
222 {
223         if (cr4 & CR4_RESERVED_BITS) {
224                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
225                 inject_gp(vcpu);
226                 return;
227         }
228
229         if (is_long_mode(vcpu)) {
230                 if (!(cr4 & X86_CR4_PAE)) {
231                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
232                                "in long mode\n");
233                         inject_gp(vcpu);
234                         return;
235                 }
236         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
237                    && !load_pdptrs(vcpu, vcpu->cr3)) {
238                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
239                 inject_gp(vcpu);
240                 return;
241         }
242
243         if (cr4 & X86_CR4_VMXE) {
244                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
245                 inject_gp(vcpu);
246                 return;
247         }
248         kvm_x86_ops->set_cr4(vcpu, cr4);
249         vcpu->cr4 = cr4;
250         mutex_lock(&vcpu->kvm->lock);
251         kvm_mmu_reset_context(vcpu);
252         mutex_unlock(&vcpu->kvm->lock);
253 }
254 EXPORT_SYMBOL_GPL(set_cr4);
255
256 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
257 {
258         if (is_long_mode(vcpu)) {
259                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
260                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
261                         inject_gp(vcpu);
262                         return;
263                 }
264         } else {
265                 if (is_pae(vcpu)) {
266                         if (cr3 & CR3_PAE_RESERVED_BITS) {
267                                 printk(KERN_DEBUG
268                                        "set_cr3: #GP, reserved bits\n");
269                                 inject_gp(vcpu);
270                                 return;
271                         }
272                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
273                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
274                                        "reserved bits\n");
275                                 inject_gp(vcpu);
276                                 return;
277                         }
278                 }
279                 /*
280                  * We don't check reserved bits in nonpae mode, because
281                  * this isn't enforced, and VMware depends on this.
282                  */
283         }
284
285         mutex_lock(&vcpu->kvm->lock);
286         /*
287          * Does the new cr3 value map to physical memory? (Note, we
288          * catch an invalid cr3 even in real-mode, because it would
289          * cause trouble later on when we turn on paging anyway.)
290          *
291          * A real CPU would silently accept an invalid cr3 and would
292          * attempt to use it - with largely undefined (and often hard
293          * to debug) behavior on the guest side.
294          */
295         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
296                 inject_gp(vcpu);
297         else {
298                 vcpu->cr3 = cr3;
299                 vcpu->mmu.new_cr3(vcpu);
300         }
301         mutex_unlock(&vcpu->kvm->lock);
302 }
303 EXPORT_SYMBOL_GPL(set_cr3);
304
305 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
306 {
307         if (cr8 & CR8_RESERVED_BITS) {
308                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
309                 inject_gp(vcpu);
310                 return;
311         }
312         if (irqchip_in_kernel(vcpu->kvm))
313                 kvm_lapic_set_tpr(vcpu, cr8);
314         else
315                 vcpu->cr8 = cr8;
316 }
317 EXPORT_SYMBOL_GPL(set_cr8);
318
319 unsigned long get_cr8(struct kvm_vcpu *vcpu)
320 {
321         if (irqchip_in_kernel(vcpu->kvm))
322                 return kvm_lapic_get_cr8(vcpu);
323         else
324                 return vcpu->cr8;
325 }
326 EXPORT_SYMBOL_GPL(get_cr8);
327
328 /*
329  * List of msr numbers which we expose to userspace through KVM_GET_MSRS,
330  * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
331  *
332  * This list is modified at module load time to reflect the
333  * capabilities of the host cpu.
334  */
335 static u32 msrs_to_save[] = {
336         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
337         MSR_K6_STAR,
338 #ifdef CONFIG_X86_64
339         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
340 #endif
341         MSR_IA32_TIME_STAMP_COUNTER,
342 };
343
344 static unsigned num_msrs_to_save;
345
346 static u32 emulated_msrs[] = {
347         MSR_IA32_MISC_ENABLE,
348 };
349
350 #ifdef CONFIG_X86_64
351
352 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
353 {
354         if (efer & EFER_RESERVED_BITS) {
355                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
356                        efer);
357                 inject_gp(vcpu);
358                 return;
359         }
360
361         if (is_paging(vcpu)
362             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
363                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
364                 inject_gp(vcpu);
365                 return;
366         }
367
368         kvm_x86_ops->set_efer(vcpu, efer);
369
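        /*
         * The guest cannot flip EFER.LMA directly; preserve whatever LMA
         * value the shadow EFER already carries.
         */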
370         efer &= ~EFER_LMA;
371         efer |= vcpu->shadow_efer & EFER_LMA;
372
373         vcpu->shadow_efer = efer;
374 }
375
376 #endif
377
378 /*
379  * Writes msr value into the appropriate "register".
380  * Returns 0 on success, non-0 otherwise.
381  * Assumes vcpu_load() was already called.
382  */
383 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
384 {
385         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
386 }
387
388 /*
389  * Adapt set_msr() to msr_io()'s calling convention
390  */
391 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
392 {
393         return kvm_set_msr(vcpu, index, *data);
394 }
395
396
397 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
398 {
399         switch (msr) {
400 #ifdef CONFIG_X86_64
401         case MSR_EFER:
402                 set_efer(vcpu, data);
403                 break;
404 #endif
405         case MSR_IA32_MC0_STATUS:
406                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
407                        __FUNCTION__, data);
408                 break;
409         case MSR_IA32_MCG_STATUS:
410                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
411                         __FUNCTION__, data);
412                 break;
413         case MSR_IA32_UCODE_REV:
414         case MSR_IA32_UCODE_WRITE:
415         case 0x200 ... 0x2ff: /* MTRRs */
416                 break;
417         case MSR_IA32_APICBASE:
418                 kvm_set_apic_base(vcpu, data);
419                 break;
420         case MSR_IA32_MISC_ENABLE:
421                 vcpu->ia32_misc_enable_msr = data;
422                 break;
423         default:
424                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
425                 return 1;
426         }
427         return 0;
428 }
429 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
430
431
432 /*
433  * Reads an msr value (of 'msr_index') into 'pdata'.
434  * Returns 0 on success, non-0 otherwise.
435  * Assumes vcpu_load() was already called.
436  */
437 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
438 {
439         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
440 }
441
442 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
443 {
444         u64 data;
445
446         switch (msr) {
447         case 0xc0010010: /* SYSCFG */
448         case 0xc0010015: /* HWCR */
449         case MSR_IA32_PLATFORM_ID:
450         case MSR_IA32_P5_MC_ADDR:
451         case MSR_IA32_P5_MC_TYPE:
452         case MSR_IA32_MC0_CTL:
453         case MSR_IA32_MCG_STATUS:
454         case MSR_IA32_MCG_CAP:
455         case MSR_IA32_MC0_MISC:
456         case MSR_IA32_MC0_MISC+4:
457         case MSR_IA32_MC0_MISC+8:
458         case MSR_IA32_MC0_MISC+12:
459         case MSR_IA32_MC0_MISC+16:
460         case MSR_IA32_UCODE_REV:
461         case MSR_IA32_PERF_STATUS:
462         case MSR_IA32_EBL_CR_POWERON:
463                 /* MTRR registers */
464         case 0xfe:
465         case 0x200 ... 0x2ff:
466                 data = 0;
467                 break;
468         case 0xcd: /* fsb frequency */
469                 data = 3;
470                 break;
471         case MSR_IA32_APICBASE:
472                 data = kvm_get_apic_base(vcpu);
473                 break;
474         case MSR_IA32_MISC_ENABLE:
475                 data = vcpu->ia32_misc_enable_msr;
476                 break;
477 #ifdef CONFIG_X86_64
478         case MSR_EFER:
479                 data = vcpu->shadow_efer;
480                 break;
481 #endif
482         default:
483                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
484                 return 1;
485         }
486         *pdata = data;
487         return 0;
488 }
489 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
490
491 /*
492  * Read or write a bunch of msrs. All parameters are kernel addresses.
493  *
494  * @return number of msrs set successfully.
495  */
496 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
497                     struct kvm_msr_entry *entries,
498                     int (*do_msr)(struct kvm_vcpu *vcpu,
499                                   unsigned index, u64 *data))
500 {
501         int i;
502
503         vcpu_load(vcpu);
504
505         for (i = 0; i < msrs->nmsrs; ++i)
506                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
507                         break;
508
509         vcpu_put(vcpu);
510
511         return i;
512 }
513
514 /*
515  * Read or write a bunch of msrs. Parameters are user addresses.
516  *
517  * @return number of msrs set successfully.
518  */
519 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
520                   int (*do_msr)(struct kvm_vcpu *vcpu,
521                                 unsigned index, u64 *data),
522                   int writeback)
523 {
524         struct kvm_msrs msrs;
525         struct kvm_msr_entry *entries;
526         int r, n;
527         unsigned size;
528
529         r = -EFAULT;
530         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
531                 goto out;
532
533         r = -E2BIG;
534         if (msrs.nmsrs >= MAX_IO_MSRS)
535                 goto out;
536
537         r = -ENOMEM;
538         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
539         entries = vmalloc(size);
540         if (!entries)
541                 goto out;
542
543         r = -EFAULT;
544         if (copy_from_user(entries, user_msrs->entries, size))
545                 goto out_free;
546
547         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
548         if (r < 0)
549                 goto out_free;
550
551         r = -EFAULT;
552         if (writeback && copy_to_user(user_msrs->entries, entries, size))
553                 goto out_free;
554
555         r = n;
556
557 out_free:
558         vfree(entries);
559 out:
560         return r;
561 }
562
563 long kvm_arch_dev_ioctl(struct file *filp,
564                         unsigned int ioctl, unsigned long arg)
565 {
566         void __user *argp = (void __user *)arg;
567         long r;
568
569         switch (ioctl) {
570         case KVM_GET_MSR_INDEX_LIST: {
571                 struct kvm_msr_list __user *user_msr_list = argp;
572                 struct kvm_msr_list msr_list;
573                 unsigned n;
574
575                 r = -EFAULT;
576                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
577                         goto out;
578                 n = msr_list.nmsrs;
579                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
580                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
581                         goto out;
582                 r = -E2BIG;
583                 if (n < num_msrs_to_save)
584                         goto out;
585                 r = -EFAULT;
586                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
587                                  num_msrs_to_save * sizeof(u32)))
588                         goto out;
589                 if (copy_to_user(user_msr_list->indices
590                                  + num_msrs_to_save * sizeof(u32),
591                                  &emulated_msrs,
592                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
593                         goto out;
594                 r = 0;
595                 break;
596         }
597         default:
598                 r = -EINVAL;
599         }
600 out:
601         return r;
602 }
603
604 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
605 {
606         kvm_x86_ops->vcpu_load(vcpu, cpu);
607 }
608
609 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
610 {
611         kvm_x86_ops->vcpu_put(vcpu);
612 }
613
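/*
 * If the host runs with EFER.NX clear, hide the NX capability
 * (CPUID 0x80000001 EDX bit 20) from the guest, since it cannot
 * be provided.
 */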
614 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
615 {
616         u64 efer;
617         int i;
618         struct kvm_cpuid_entry *e, *entry;
619
620         rdmsrl(MSR_EFER, efer);
621         entry = NULL;
622         for (i = 0; i < vcpu->cpuid_nent; ++i) {
623                 e = &vcpu->cpuid_entries[i];
624                 if (e->function == 0x80000001) {
625                         entry = e;
626                         break;
627                 }
628         }
629         if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
630                 entry->edx &= ~(1 << 20);
631                 printk(KERN_INFO "kvm: guest NX capability removed\n");
632         }
633 }
634
635 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
636                                     struct kvm_cpuid *cpuid,
637                                     struct kvm_cpuid_entry __user *entries)
638 {
639         int r;
640
641         r = -E2BIG;
642         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
643                 goto out;
644         r = -EFAULT;
645         if (copy_from_user(&vcpu->cpuid_entries, entries,
646                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
647                 goto out;
648         vcpu->cpuid_nent = cpuid->nent;
649         cpuid_fix_nx_cap(vcpu);
650         return 0;
651
652 out:
653         return r;
654 }
655
656 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
657                                     struct kvm_lapic_state *s)
658 {
659         vcpu_load(vcpu);
660         memcpy(s->regs, vcpu->apic->regs, sizeof *s);
661         vcpu_put(vcpu);
662
663         return 0;
664 }
665
666 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
667                                     struct kvm_lapic_state *s)
668 {
669         vcpu_load(vcpu);
670         memcpy(vcpu->apic->regs, s->regs, sizeof *s);
671         kvm_apic_post_state_restore(vcpu);
672         vcpu_put(vcpu);
673
674         return 0;
675 }
676
677 long kvm_arch_vcpu_ioctl(struct file *filp,
678                          unsigned int ioctl, unsigned long arg)
679 {
680         struct kvm_vcpu *vcpu = filp->private_data;
681         void __user *argp = (void __user *)arg;
682         int r;
683
684         switch (ioctl) {
685         case KVM_GET_LAPIC: {
686                 struct kvm_lapic_state lapic;
687
688                 memset(&lapic, 0, sizeof lapic);
689                 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
690                 if (r)
691                         goto out;
692                 r = -EFAULT;
693                 if (copy_to_user(argp, &lapic, sizeof lapic))
694                         goto out;
695                 r = 0;
696                 break;
697         }
698         case KVM_SET_LAPIC: {
699                 struct kvm_lapic_state lapic;
700
701                 r = -EFAULT;
702                 if (copy_from_user(&lapic, argp, sizeof lapic))
703                         goto out;
704                 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
705                 if (r)
706                         goto out;
707                 r = 0;
708                 break;
709         }
710         case KVM_SET_CPUID: {
711                 struct kvm_cpuid __user *cpuid_arg = argp;
712                 struct kvm_cpuid cpuid;
713
714                 r = -EFAULT;
715                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
716                         goto out;
717                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
718                 if (r)
719                         goto out;
720                 break;
721         }
722         case KVM_GET_MSRS:
723                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
724                 break;
725         case KVM_SET_MSRS:
726                 r = msr_io(vcpu, argp, do_set_msr, 0);
727                 break;
728         default:
729                 r = -EINVAL;
730         }
731 out:
732         return r;
733 }
734
735 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
736 {
737         int ret;
738
739         if (addr > (unsigned int)(-3 * PAGE_SIZE))
740                 return -1;
741         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
742         return ret;
743 }
744
745 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
746                                           u32 kvm_nr_mmu_pages)
747 {
748         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
749                 return -EINVAL;
750
751         mutex_lock(&kvm->lock);
752
753         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
754         kvm->n_requested_mmu_pages = kvm_nr_mmu_pages;
755
756         mutex_unlock(&kvm->lock);
757         return 0;
758 }
759
760 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
761 {
762         return kvm->n_alloc_mmu_pages;
763 }
764
765 /*
766  * Set a new alias region.  Aliases map a portion of physical memory into
767  * another portion.  This is useful for memory windows, for example the PC
768  * VGA region.
769  */
770 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
771                                          struct kvm_memory_alias *alias)
772 {
773         int r, n;
774         struct kvm_mem_alias *p;
775
776         r = -EINVAL;
777         /* General sanity checks */
778         if (alias->memory_size & (PAGE_SIZE - 1))
779                 goto out;
780         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
781                 goto out;
782         if (alias->slot >= KVM_ALIAS_SLOTS)
783                 goto out;
784         if (alias->guest_phys_addr + alias->memory_size
785             < alias->guest_phys_addr)
786                 goto out;
787         if (alias->target_phys_addr + alias->memory_size
788             < alias->target_phys_addr)
789                 goto out;
790
791         mutex_lock(&kvm->lock);
792
793         p = &kvm->aliases[alias->slot];
794         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
795         p->npages = alias->memory_size >> PAGE_SHIFT;
796         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
797
798         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
799                 if (kvm->aliases[n - 1].npages)
800                         break;
801         kvm->naliases = n;
802
803         kvm_mmu_zap_all(kvm);
804
805         mutex_unlock(&kvm->lock);
806
807         return 0;
808
809 out:
810         return r;
811 }
812
813 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
814 {
815         int r;
816
817         r = 0;
818         switch (chip->chip_id) {
819         case KVM_IRQCHIP_PIC_MASTER:
820                 memcpy(&chip->chip.pic,
821                         &pic_irqchip(kvm)->pics[0],
822                         sizeof(struct kvm_pic_state));
823                 break;
824         case KVM_IRQCHIP_PIC_SLAVE:
825                 memcpy(&chip->chip.pic,
826                         &pic_irqchip(kvm)->pics[1],
827                         sizeof(struct kvm_pic_state));
828                 break;
829         case KVM_IRQCHIP_IOAPIC:
830                 memcpy(&chip->chip.ioapic,
831                         ioapic_irqchip(kvm),
832                         sizeof(struct kvm_ioapic_state));
833                 break;
834         default:
835                 r = -EINVAL;
836                 break;
837         }
838         return r;
839 }
840
841 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
842 {
843         int r;
844
845         r = 0;
846         switch (chip->chip_id) {
847         case KVM_IRQCHIP_PIC_MASTER:
848                 memcpy(&pic_irqchip(kvm)->pics[0],
849                         &chip->chip.pic,
850                         sizeof(struct kvm_pic_state));
851                 break;
852         case KVM_IRQCHIP_PIC_SLAVE:
853                 memcpy(&pic_irqchip(kvm)->pics[1],
854                         &chip->chip.pic,
855                         sizeof(struct kvm_pic_state));
856                 break;
857         case KVM_IRQCHIP_IOAPIC:
858                 memcpy(ioapic_irqchip(kvm),
859                         &chip->chip.ioapic,
860                         sizeof(struct kvm_ioapic_state));
861                 break;
862         default:
863                 r = -EINVAL;
864                 break;
865         }
866         kvm_pic_update_irq(pic_irqchip(kvm));
867         return r;
868 }
869
870 long kvm_arch_vm_ioctl(struct file *filp,
871                        unsigned int ioctl, unsigned long arg)
872 {
873         struct kvm *kvm = filp->private_data;
874         void __user *argp = (void __user *)arg;
875         int r = -EINVAL;
876
877         switch (ioctl) {
878         case KVM_SET_TSS_ADDR:
879                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
880                 if (r < 0)
881                         goto out;
882                 break;
883         case KVM_SET_MEMORY_REGION: {
884                 struct kvm_memory_region kvm_mem;
885                 struct kvm_userspace_memory_region kvm_userspace_mem;
886
887                 r = -EFAULT;
888                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
889                         goto out;
890                 kvm_userspace_mem.slot = kvm_mem.slot;
891                 kvm_userspace_mem.flags = kvm_mem.flags;
892                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
893                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
894                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
895                 if (r)
896                         goto out;
897                 break;
898         }
899         case KVM_SET_NR_MMU_PAGES:
900                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
901                 if (r)
902                         goto out;
903                 break;
904         case KVM_GET_NR_MMU_PAGES:
905                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
906                 break;
907         case KVM_SET_MEMORY_ALIAS: {
908                 struct kvm_memory_alias alias;
909
910                 r = -EFAULT;
911                 if (copy_from_user(&alias, argp, sizeof alias))
912                         goto out;
913                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
914                 if (r)
915                         goto out;
916                 break;
917         }
918         case KVM_CREATE_IRQCHIP:
919                 r = -ENOMEM;
920                 kvm->vpic = kvm_create_pic(kvm);
921                 if (kvm->vpic) {
922                         r = kvm_ioapic_init(kvm);
923                         if (r) {
924                                 kfree(kvm->vpic);
925                                 kvm->vpic = NULL;
926                                 goto out;
927                         }
928                 } else
929                         goto out;
930                 break;
931         case KVM_IRQ_LINE: {
932                 struct kvm_irq_level irq_event;
933
934                 r = -EFAULT;
935                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
936                         goto out;
937                 if (irqchip_in_kernel(kvm)) {
938                         mutex_lock(&kvm->lock);
939                         if (irq_event.irq < 16)
940                                 kvm_pic_set_irq(pic_irqchip(kvm),
941                                         irq_event.irq,
942                                         irq_event.level);
943                         kvm_ioapic_set_irq(kvm->vioapic,
944                                         irq_event.irq,
945                                         irq_event.level);
946                         mutex_unlock(&kvm->lock);
947                         r = 0;
948                 }
949                 break;
950         }
951         case KVM_GET_IRQCHIP: {
952                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
953                 struct kvm_irqchip chip;
954
955                 r = -EFAULT;
956                 if (copy_from_user(&chip, argp, sizeof chip))
957                         goto out;
958                 r = -ENXIO;
959                 if (!irqchip_in_kernel(kvm))
960                         goto out;
961                 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
962                 if (r)
963                         goto out;
964                 r = -EFAULT;
965                 if (copy_to_user(argp, &chip, sizeof chip))
966                         goto out;
967                 r = 0;
968                 break;
969         }
970         case KVM_SET_IRQCHIP: {
971                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
972                 struct kvm_irqchip chip;
973
974                 r = -EFAULT;
975                 if (copy_from_user(&chip, argp, sizeof chip))
976                         goto out;
977                 r = -ENXIO;
978                 if (!irqchip_in_kernel(kvm))
979                         goto out;
980                 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
981                 if (r)
982                         goto out;
983                 r = 0;
984                 break;
985         }
986         default:
987                 ;
988         }
989 out:
990         return r;
991 }
992
993 static __init void kvm_init_msr_list(void)
994 {
995         u32 dummy[2];
996         unsigned i, j;
997
998         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
999                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1000                         continue;
1001                 if (j < i)
1002                         msrs_to_save[j] = msrs_to_save[i];
1003                 j++;
1004         }
1005         num_msrs_to_save = j;
1006 }
1007
1008 /*
1009  * Only the local APIC needs an MMIO device hook, so take the shortcut here.
1010  */
1011 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1012                                                 gpa_t addr)
1013 {
1014         struct kvm_io_device *dev;
1015
1016         if (vcpu->apic) {
1017                 dev = &vcpu->apic->dev;
1018                 if (dev->in_range(dev, addr))
1019                         return dev;
1020         }
1021         return NULL;
1022 }
1023
1024
1025 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1026                                                 gpa_t addr)
1027 {
1028         struct kvm_io_device *dev;
1029
1030         dev = vcpu_find_pervcpu_dev(vcpu, addr);
1031         if (dev == NULL)
1032                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1033         return dev;
1034 }
1035
1036 int emulator_read_std(unsigned long addr,
1037                              void *val,
1038                              unsigned int bytes,
1039                              struct kvm_vcpu *vcpu)
1040 {
1041         void *data = val;
1042
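        /*
         * Copy page by page: translate each guest virtual address through
         * the guest MMU and stop at the first unmapped page.
         */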
1043         while (bytes) {
1044                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1045                 unsigned offset = addr & (PAGE_SIZE-1);
1046                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1047                 int ret;
1048
1049                 if (gpa == UNMAPPED_GVA)
1050                         return X86EMUL_PROPAGATE_FAULT;
1051                 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1052                 if (ret < 0)
1053                         return X86EMUL_UNHANDLEABLE;
1054
1055                 bytes -= tocopy;
1056                 data += tocopy;
1057                 addr += tocopy;
1058         }
1059
1060         return X86EMUL_CONTINUE;
1061 }
1062 EXPORT_SYMBOL_GPL(emulator_read_std);
1063
1064 static int emulator_write_std(unsigned long addr,
1065                               const void *val,
1066                               unsigned int bytes,
1067                               struct kvm_vcpu *vcpu)
1068 {
1069         pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
1070         return X86EMUL_UNHANDLEABLE;
1071 }
1072
1073 static int emulator_read_emulated(unsigned long addr,
1074                                   void *val,
1075                                   unsigned int bytes,
1076                                   struct kvm_vcpu *vcpu)
1077 {
1078         struct kvm_io_device *mmio_dev;
1079         gpa_t                 gpa;
1080
1081         if (vcpu->mmio_read_completed) {
1082                 memcpy(val, vcpu->mmio_data, bytes);
1083                 vcpu->mmio_read_completed = 0;
1084                 return X86EMUL_CONTINUE;
1085         }
1086
1087         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1088
1089         /* For APIC access vmexit */
1090         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1091                 goto mmio;
1092
1093         if (emulator_read_std(addr, val, bytes, vcpu)
1094                         == X86EMUL_CONTINUE)
1095                 return X86EMUL_CONTINUE;
1096         if (gpa == UNMAPPED_GVA)
1097                 return X86EMUL_PROPAGATE_FAULT;
1098
1099 mmio:
1100         /*
1101          * Is this MMIO handled locally?
1102          */
1103         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1104         if (mmio_dev) {
1105                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1106                 return X86EMUL_CONTINUE;
1107         }
1108
1109         vcpu->mmio_needed = 1;
1110         vcpu->mmio_phys_addr = gpa;
1111         vcpu->mmio_size = bytes;
1112         vcpu->mmio_is_write = 0;
1113
1114         return X86EMUL_UNHANDLEABLE;
1115 }
1116
1117 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1118                                const void *val, int bytes)
1119 {
1120         int ret;
1121
1122         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1123         if (ret < 0)
1124                 return 0;
1125         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1126         return 1;
1127 }
1128
1129 static int emulator_write_emulated_onepage(unsigned long addr,
1130                                            const void *val,
1131                                            unsigned int bytes,
1132                                            struct kvm_vcpu *vcpu)
1133 {
1134         struct kvm_io_device *mmio_dev;
1135         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1136
1137         if (gpa == UNMAPPED_GVA) {
1138                 kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
1139                 return X86EMUL_PROPAGATE_FAULT;
1140         }
1141
1142         /* For APIC access vmexit */
1143         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1144                 goto mmio;
1145
1146         if (emulator_write_phys(vcpu, gpa, val, bytes))
1147                 return X86EMUL_CONTINUE;
1148
1149 mmio:
1150         /*
1151          * Is this MMIO handled locally?
1152          */
1153         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1154         if (mmio_dev) {
1155                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1156                 return X86EMUL_CONTINUE;
1157         }
1158
1159         vcpu->mmio_needed = 1;
1160         vcpu->mmio_phys_addr = gpa;
1161         vcpu->mmio_size = bytes;
1162         vcpu->mmio_is_write = 1;
1163         memcpy(vcpu->mmio_data, val, bytes);
1164
1165         return X86EMUL_CONTINUE;
1166 }
1167
1168 int emulator_write_emulated(unsigned long addr,
1169                                    const void *val,
1170                                    unsigned int bytes,
1171                                    struct kvm_vcpu *vcpu)
1172 {
1173         /* Crossing a page boundary? */
1174         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1175                 int rc, now;
1176
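                /*
                 * Write the bytes that fit in the first page, then fall
                 * through for the remainder on the next page.
                 */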
1177                 now = -addr & ~PAGE_MASK;
1178                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1179                 if (rc != X86EMUL_CONTINUE)
1180                         return rc;
1181                 addr += now;
1182                 val += now;
1183                 bytes -= now;
1184         }
1185         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1186 }
1187 EXPORT_SYMBOL_GPL(emulator_write_emulated);
1188
1189 static int emulator_cmpxchg_emulated(unsigned long addr,
1190                                      const void *old,
1191                                      const void *new,
1192                                      unsigned int bytes,
1193                                      struct kvm_vcpu *vcpu)
1194 {
1195         static int reported;
1196
1197         if (!reported) {
1198                 reported = 1;
1199                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1200         }
1201         return emulator_write_emulated(addr, new, bytes, vcpu);
1202 }
1203
1204 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1205 {
1206         return kvm_x86_ops->get_segment_base(vcpu, seg);
1207 }
1208
1209 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1210 {
1211         return X86EMUL_CONTINUE;
1212 }
1213
1214 int emulate_clts(struct kvm_vcpu *vcpu)
1215 {
1216         kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
1217         return X86EMUL_CONTINUE;
1218 }
1219
1220 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1221 {
1222         struct kvm_vcpu *vcpu = ctxt->vcpu;
1223
1224         switch (dr) {
1225         case 0 ... 3:
1226                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1227                 return X86EMUL_CONTINUE;
1228         default:
1229                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1230                 return X86EMUL_UNHANDLEABLE;
1231         }
1232 }
1233
1234 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1235 {
1236         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1237         int exception;
1238
1239         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1240         if (exception) {
1241                 /* FIXME: better handling */
1242                 return X86EMUL_UNHANDLEABLE;
1243         }
1244         return X86EMUL_CONTINUE;
1245 }
1246
1247 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1248 {
1249         static int reported;
1250         u8 opcodes[4];
1251         unsigned long rip = vcpu->rip;
1252         unsigned long rip_linear;
1253
1254         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1255
1256         if (reported)
1257                 return;
1258
1259         emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1260
1261         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1262                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1263         reported = 1;
1264 }
1265 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1266
1267 struct x86_emulate_ops emulate_ops = {
1268         .read_std            = emulator_read_std,
1269         .write_std           = emulator_write_std,
1270         .read_emulated       = emulator_read_emulated,
1271         .write_emulated      = emulator_write_emulated,
1272         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1273 };
1274
1275 int emulate_instruction(struct kvm_vcpu *vcpu,
1276                         struct kvm_run *run,
1277                         unsigned long cr2,
1278                         u16 error_code,
1279                         int no_decode)
1280 {
1281         int r;
1282
1283         vcpu->mmio_fault_cr2 = cr2;
1284         kvm_x86_ops->cache_regs(vcpu);
1285
1286         vcpu->mmio_is_write = 0;
1287         vcpu->pio.string = 0;
1288
1289         if (!no_decode) {
1290                 int cs_db, cs_l;
1291                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1292
1293                 vcpu->emulate_ctxt.vcpu = vcpu;
1294                 vcpu->emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1295                 vcpu->emulate_ctxt.cr2 = cr2;
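                /*
                 * Pick the decode mode from current CPU state: virtual-8086
                 * gets real-mode semantics, CS.L selects 64-bit, CS.D 32-bit,
                 * otherwise 16-bit.
                 */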
1296                 vcpu->emulate_ctxt.mode =
1297                         (vcpu->emulate_ctxt.eflags & X86_EFLAGS_VM)
1298                         ? X86EMUL_MODE_REAL : cs_l
1299                         ? X86EMUL_MODE_PROT64 : cs_db
1300                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1301
1302                 if (vcpu->emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1303                         vcpu->emulate_ctxt.cs_base = 0;
1304                         vcpu->emulate_ctxt.ds_base = 0;
1305                         vcpu->emulate_ctxt.es_base = 0;
1306                         vcpu->emulate_ctxt.ss_base = 0;
1307                 } else {
1308                         vcpu->emulate_ctxt.cs_base =
1309                                         get_segment_base(vcpu, VCPU_SREG_CS);
1310                         vcpu->emulate_ctxt.ds_base =
1311                                         get_segment_base(vcpu, VCPU_SREG_DS);
1312                         vcpu->emulate_ctxt.es_base =
1313                                         get_segment_base(vcpu, VCPU_SREG_ES);
1314                         vcpu->emulate_ctxt.ss_base =
1315                                         get_segment_base(vcpu, VCPU_SREG_SS);
1316                 }
1317
1318                 vcpu->emulate_ctxt.gs_base =
1319                                         get_segment_base(vcpu, VCPU_SREG_GS);
1320                 vcpu->emulate_ctxt.fs_base =
1321                                         get_segment_base(vcpu, VCPU_SREG_FS);
1322
1323                 r = x86_decode_insn(&vcpu->emulate_ctxt, &emulate_ops);
1324                 if (r)  {
1325                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1326                                 return EMULATE_DONE;
1327                         return EMULATE_FAIL;
1328                 }
1329         }
1330
1331         r = x86_emulate_insn(&vcpu->emulate_ctxt, &emulate_ops);
1332
1333         if (vcpu->pio.string)
1334                 return EMULATE_DO_MMIO;
1335
1336         if ((r || vcpu->mmio_is_write) && run) {
1337                 run->exit_reason = KVM_EXIT_MMIO;
1338                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1339                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1340                 run->mmio.len = vcpu->mmio_size;
1341                 run->mmio.is_write = vcpu->mmio_is_write;
1342         }
1343
1344         if (r) {
1345                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1346                         return EMULATE_DONE;
1347                 if (!vcpu->mmio_needed) {
1348                         kvm_report_emulation_failure(vcpu, "mmio");
1349                         return EMULATE_FAIL;
1350                 }
1351                 return EMULATE_DO_MMIO;
1352         }
1353
1354         kvm_x86_ops->decache_regs(vcpu);
1355         kvm_x86_ops->set_rflags(vcpu, vcpu->emulate_ctxt.eflags);
1356
1357         if (vcpu->mmio_is_write) {
1358                 vcpu->mmio_needed = 0;
1359                 return EMULATE_DO_MMIO;
1360         }
1361
1362         return EMULATE_DONE;
1363 }
1364 EXPORT_SYMBOL_GPL(emulate_instruction);
1365
1366 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1367 {
1368         int i;
1369
1370         for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
1371                 if (vcpu->pio.guest_pages[i]) {
1372                         kvm_release_page(vcpu->pio.guest_pages[i]);
1373                         vcpu->pio.guest_pages[i] = NULL;
1374                 }
1375 }
1376
1377 static int pio_copy_data(struct kvm_vcpu *vcpu)
1378 {
1379         void *p = vcpu->pio_data;
1380         void *q;
1381         unsigned bytes;
1382         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1383
1384         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1385                  PAGE_KERNEL);
1386         if (!q) {
1387                 free_pio_guest_pages(vcpu);
1388                 return -ENOMEM;
1389         }
1390         q += vcpu->pio.guest_page_offset;
1391         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1392         if (vcpu->pio.in)
1393                 memcpy(q, p, bytes);
1394         else
1395                 memcpy(p, q, bytes);
1396         q -= vcpu->pio.guest_page_offset;
1397         vunmap(q);
1398         free_pio_guest_pages(vcpu);
1399         return 0;
1400 }
1401
1402 int complete_pio(struct kvm_vcpu *vcpu)
1403 {
1404         struct kvm_pio_request *io = &vcpu->pio;
1405         long delta;
1406         int r;
1407
1408         kvm_x86_ops->cache_regs(vcpu);
1409
1410         if (!io->string) {
1411                 if (io->in)
1412                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1413                                io->size);
1414         } else {
1415                 if (io->in) {
1416                         r = pio_copy_data(vcpu);
1417                         if (r) {
1418                                 kvm_x86_ops->cache_regs(vcpu);
1419                                 return r;
1420                         }
1421                 }
1422
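                /*
                 * String I/O: advance RSI/RDI by the bytes just transferred
                 * and, for REP, drop RCX by the element count; move backwards
                 * when the direction flag was set.
                 */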
1423                 delta = 1;
1424                 if (io->rep) {
1425                         delta *= io->cur_count;
1426                         /*
1427                          * The size of the register should really depend on
1428                          * current address size.
1429                          */
1430                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1431                 }
1432                 if (io->down)
1433                         delta = -delta;
1434                 delta *= io->size;
1435                 if (io->in)
1436                         vcpu->regs[VCPU_REGS_RDI] += delta;
1437                 else
1438                         vcpu->regs[VCPU_REGS_RSI] += delta;
1439         }
1440
1441         kvm_x86_ops->decache_regs(vcpu);
1442
1443         io->count -= io->cur_count;
1444         io->cur_count = 0;
1445
1446         return 0;
1447 }
1448
1449 static void kernel_pio(struct kvm_io_device *pio_dev,
1450                        struct kvm_vcpu *vcpu,
1451                        void *pd)
1452 {
1453         /* TODO: String I/O for in kernel device */
1454
1455         mutex_lock(&vcpu->kvm->lock);
1456         if (vcpu->pio.in)
1457                 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1458                                   vcpu->pio.size,
1459                                   pd);
1460         else
1461                 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1462                                    vcpu->pio.size,
1463                                    pd);
1464         mutex_unlock(&vcpu->kvm->lock);
1465 }
1466
1467 static void pio_string_write(struct kvm_io_device *pio_dev,
1468                              struct kvm_vcpu *vcpu)
1469 {
1470         struct kvm_pio_request *io = &vcpu->pio;
1471         void *pd = vcpu->pio_data;
1472         int i;
1473
1474         mutex_lock(&vcpu->kvm->lock);
1475         for (i = 0; i < io->cur_count; i++) {
1476                 kvm_iodevice_write(pio_dev, io->port,
1477                                    io->size,
1478                                    pd);
1479                 pd += io->size;
1480         }
1481         mutex_unlock(&vcpu->kvm->lock);
1482 }
1483
1484 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1485                                                gpa_t addr)
1486 {
1487         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1488 }
1489
1490 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1491                   int size, unsigned port)
1492 {
1493         struct kvm_io_device *pio_dev;
1494
1495         vcpu->run->exit_reason = KVM_EXIT_IO;
1496         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1497         vcpu->run->io.size = vcpu->pio.size = size;
1498         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1499         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
1500         vcpu->run->io.port = vcpu->pio.port = port;
1501         vcpu->pio.in = in;
1502         vcpu->pio.string = 0;
1503         vcpu->pio.down = 0;
1504         vcpu->pio.guest_page_offset = 0;
1505         vcpu->pio.rep = 0;
1506
1507         kvm_x86_ops->cache_regs(vcpu);
1508         memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1509         kvm_x86_ops->decache_regs(vcpu);
1510
1511         kvm_x86_ops->skip_emulated_instruction(vcpu);
1512
1513         pio_dev = vcpu_find_pio_dev(vcpu, port);
1514         if (pio_dev) {
1515                 kernel_pio(pio_dev, vcpu, vcpu->pio_data);
1516                 complete_pio(vcpu);
1517                 return 1;
1518         }
1519         return 0;
1520 }
1521 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
1522
1523 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1524                   int size, unsigned long count, int down,
1525                   gva_t address, int rep, unsigned port)
1526 {
1527         unsigned now, in_page;
1528         int i, ret = 0;
1529         int nr_pages = 1;
1530         struct page *page;
1531         struct kvm_io_device *pio_dev;
1532
1533         vcpu->run->exit_reason = KVM_EXIT_IO;
1534         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1535         vcpu->run->io.size = vcpu->pio.size = size;
1536         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1537         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
1538         vcpu->run->io.port = vcpu->pio.port = port;
1539         vcpu->pio.in = in;
1540         vcpu->pio.string = 1;
1541         vcpu->pio.down = down;
1542         vcpu->pio.guest_page_offset = offset_in_page(address);
1543         vcpu->pio.rep = rep;
1544
1545         if (!count) {
1546                 kvm_x86_ops->skip_emulated_instruction(vcpu);
1547                 return 1;
1548         }
1549
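        /*
         * in_page is how many bytes of the current page are usable in the
         * transfer direction; 'now' is the number of whole elements that fit.
         */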
1550         if (!down)
1551                 in_page = PAGE_SIZE - offset_in_page(address);
1552         else
1553                 in_page = offset_in_page(address) + size;
1554         now = min(count, (unsigned long)in_page / size);
1555         if (!now) {
1556                 /*
1557                  * String I/O straddles page boundary.  Pin two guest pages
1558                  * so that we satisfy atomicity constraints.  Do just one
1559                  * transaction to avoid complexity.
1560                  */
1561                 nr_pages = 2;
1562                 now = 1;
1563         }
1564         if (down) {
1565                 /*
1566                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1567                  */
1568                 pr_unimpl(vcpu, "guest string pio down\n");
1569                 inject_gp(vcpu);
1570                 return 1;
1571         }
1572         vcpu->run->io.count = now;
1573         vcpu->pio.cur_count = now;
1574
1575         if (vcpu->pio.cur_count == vcpu->pio.count)
1576                 kvm_x86_ops->skip_emulated_instruction(vcpu);
1577
1578         for (i = 0; i < nr_pages; ++i) {
1579                 mutex_lock(&vcpu->kvm->lock);
1580                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1581                 vcpu->pio.guest_pages[i] = page;
1582                 mutex_unlock(&vcpu->kvm->lock);
1583                 if (!page) {
1584                         inject_gp(vcpu);
1585                         free_pio_guest_pages(vcpu);
1586                         return 1;
1587                 }
1588         }
1589
1590         pio_dev = vcpu_find_pio_dev(vcpu, port);
1591         if (!vcpu->pio.in) {
1592                 /* string PIO write */
1593                 ret = pio_copy_data(vcpu);
1594                 if (ret >= 0 && pio_dev) {
1595                         pio_string_write(pio_dev, vcpu);
1596                         complete_pio(vcpu);
1597                         if (vcpu->pio.count == 0)
1598                                 ret = 1;
1599                 }
1600         } else if (pio_dev)
1601                 pr_unimpl(vcpu, "no string pio read support yet, "
1602                        "port %x size %d count %ld\n",
1603                         port, size, count);
1604
1605         return ret;
1606 }
1607 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1608
1609 __init void kvm_arch_init(void)
1610 {
1611         kvm_init_msr_list();
1612 }
1613
1614 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1615 {
1616         ++vcpu->stat.halt_exits;
1617         if (irqchip_in_kernel(vcpu->kvm)) {
1618                 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1619                 kvm_vcpu_block(vcpu);
1620                 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1621                         return -EINTR;
1622                 return 1;
1623         } else {
1624                 vcpu->run->exit_reason = KVM_EXIT_HLT;
1625                 return 0;
1626         }
1627 }
1628 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1629
1630 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
1631 {
1632         unsigned long nr, a0, a1, a2, a3, ret;
1633
1634         kvm_x86_ops->cache_regs(vcpu);
1635
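        /*
         * Hypercall ABI used here: the call number is in RAX and up to four
         * arguments in RBX, RCX, RDX and RSI; outside long mode everything
         * is truncated to 32 bits.
         */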
1636         nr = vcpu->regs[VCPU_REGS_RAX];
1637         a0 = vcpu->regs[VCPU_REGS_RBX];
1638         a1 = vcpu->regs[VCPU_REGS_RCX];
1639         a2 = vcpu->regs[VCPU_REGS_RDX];
1640         a3 = vcpu->regs[VCPU_REGS_RSI];
1641
1642         if (!is_long_mode(vcpu)) {
1643                 nr &= 0xFFFFFFFF;
1644                 a0 &= 0xFFFFFFFF;
1645                 a1 &= 0xFFFFFFFF;
1646                 a2 &= 0xFFFFFFFF;
1647                 a3 &= 0xFFFFFFFF;
1648         }
1649
1650         switch (nr) {
1651         default:
1652                 ret = -KVM_ENOSYS;
1653                 break;
1654         }
1655         vcpu->regs[VCPU_REGS_RAX] = ret;
1656         kvm_x86_ops->decache_regs(vcpu);
1657         return 0;
1658 }
1659 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
1660
1661 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
1662 {
1663         char instruction[3];
1664         int ret = 0;
1665
1666         mutex_lock(&vcpu->kvm->lock);
1667
1668         /*
1669          * Blow out the MMU to ensure that no other VCPU has an active mapping
1670          * to ensure that the updated hypercall appears atomically across all
1671          * VCPUs.
1672          */
1673         kvm_mmu_zap_all(vcpu->kvm);
1674
1675         kvm_x86_ops->cache_regs(vcpu);
1676         kvm_x86_ops->patch_hypercall(vcpu, instruction);
1677         if (emulator_write_emulated(vcpu->rip, instruction, 3, vcpu)
1678             != X86EMUL_CONTINUE)
1679                 ret = -EFAULT;
1680
1681         mutex_unlock(&vcpu->kvm->lock);
1682
1683         return ret;
1684 }
1685
1686 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1687 {
1688         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1689 }
1690
1691 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1692 {
1693         struct descriptor_table dt = { limit, base };
1694
1695         kvm_x86_ops->set_gdt(vcpu, &dt);
1696 }
1697
1698 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1699 {
1700         struct descriptor_table dt = { limit, base };
1701
1702         kvm_x86_ops->set_idt(vcpu, &dt);
1703 }
1704
1705 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1706                    unsigned long *rflags)
1707 {
1708         lmsw(vcpu, msw);
1709         *rflags = kvm_x86_ops->get_rflags(vcpu);
1710 }
1711
1712 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1713 {
1714         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1715         switch (cr) {
1716         case 0:
1717                 return vcpu->cr0;
1718         case 2:
1719                 return vcpu->cr2;
1720         case 3:
1721                 return vcpu->cr3;
1722         case 4:
1723                 return vcpu->cr4;
1724         default:
1725                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1726                 return 0;
1727         }
1728 }
1729
1730 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1731                      unsigned long *rflags)
1732 {
1733         switch (cr) {
1734         case 0:
1735                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1736                 *rflags = kvm_x86_ops->get_rflags(vcpu);
1737                 break;
1738         case 2:
1739                 vcpu->cr2 = val;
1740                 break;
1741         case 3:
1742                 set_cr3(vcpu, val);
1743                 break;
1744         case 4:
1745                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1746                 break;
1747         default:
1748                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1749         }
1750 }
1751
1752 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1753 {
1754         int i;
1755         u32 function;
1756         struct kvm_cpuid_entry *e, *best;
1757
1758         kvm_x86_ops->cache_regs(vcpu);
1759         function = vcpu->regs[VCPU_REGS_RAX];
1760         vcpu->regs[VCPU_REGS_RAX] = 0;
1761         vcpu->regs[VCPU_REGS_RBX] = 0;
1762         vcpu->regs[VCPU_REGS_RCX] = 0;
1763         vcpu->regs[VCPU_REGS_RDX] = 0;
1764         best = NULL;
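        /*
         * Prefer an exact leaf match; otherwise fall back to the highest
         * entry in the same range (basic vs. extended), roughly matching
         * how hardware treats out-of-range leaves.
         */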
1765         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1766                 e = &vcpu->cpuid_entries[i];
1767                 if (e->function == function) {
1768                         best = e;
1769                         break;
1770                 }
1771                 /*
1772                  * Both basic or both extended?
1773                  */
1774                 if (((e->function ^ function) & 0x80000000) == 0)
1775                         if (!best || e->function > best->function)
1776                                 best = e;
1777         }
1778         if (best) {
1779                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1780                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1781                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1782                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1783         }
1784         kvm_x86_ops->decache_regs(vcpu);
1785         kvm_x86_ops->skip_emulated_instruction(vcpu);
1786 }
1787 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);