linux-2.6-omap-h63xx.git: drivers/kvm/kvm_main.c (blob at commit "KVM: Add kernel-internal memory slots")
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19 #include "x86.h"
20 #include "x86_emulate.h"
21 #include "segment_descriptor.h"
22 #include "irq.h"
23
24 #include <linux/kvm.h>
25 #include <linux/module.h>
26 #include <linux/errno.h>
27 #include <linux/percpu.h>
28 #include <linux/gfp.h>
29 #include <linux/mm.h>
30 #include <linux/miscdevice.h>
31 #include <linux/vmalloc.h>
32 #include <linux/reboot.h>
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/file.h>
36 #include <linux/sysdev.h>
37 #include <linux/cpu.h>
38 #include <linux/sched.h>
39 #include <linux/cpumask.h>
40 #include <linux/smp.h>
41 #include <linux/anon_inodes.h>
42 #include <linux/profile.h>
43 #include <linux/kvm_para.h>
44 #include <linux/pagemap.h>
45 #include <linux/mman.h>
46
47 #include <asm/processor.h>
48 #include <asm/msr.h>
49 #include <asm/io.h>
50 #include <asm/uaccess.h>
51 #include <asm/desc.h>
52
53 MODULE_AUTHOR("Qumranet");
54 MODULE_LICENSE("GPL");
55
56 static DEFINE_SPINLOCK(kvm_lock);
57 static LIST_HEAD(vm_list);
58
59 static cpumask_t cpus_hardware_enabled;
60
61 struct kvm_x86_ops *kvm_x86_ops;
62 struct kmem_cache *kvm_vcpu_cache;
63 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
64
65 static __read_mostly struct preempt_ops kvm_preempt_ops;
66
67 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
68
69 static struct kvm_stats_debugfs_item {
70         const char *name;
71         int offset;
72         struct dentry *dentry;
73 } debugfs_entries[] = {
74         { "pf_fixed", STAT_OFFSET(pf_fixed) },
75         { "pf_guest", STAT_OFFSET(pf_guest) },
76         { "tlb_flush", STAT_OFFSET(tlb_flush) },
77         { "invlpg", STAT_OFFSET(invlpg) },
78         { "exits", STAT_OFFSET(exits) },
79         { "io_exits", STAT_OFFSET(io_exits) },
80         { "mmio_exits", STAT_OFFSET(mmio_exits) },
81         { "signal_exits", STAT_OFFSET(signal_exits) },
82         { "irq_window", STAT_OFFSET(irq_window_exits) },
83         { "halt_exits", STAT_OFFSET(halt_exits) },
84         { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
85         { "request_irq", STAT_OFFSET(request_irq_exits) },
86         { "irq_exits", STAT_OFFSET(irq_exits) },
87         { "light_exits", STAT_OFFSET(light_exits) },
88         { "efer_reload", STAT_OFFSET(efer_reload) },
89         { NULL }
90 };
91
92 static struct dentry *debugfs_dir;
93
94 #define CR0_RESERVED_BITS                                               \
95         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
96                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
97                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
98 #define CR4_RESERVED_BITS                                               \
99         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
100                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
101                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
102                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
103
104 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
105 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
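/*
 * In each of the *_RESERVED_BITS masks a set bit marks a position the guest
 * may not set; set_cr0(), set_cr4() and set_cr8() below inject #GP when a
 * new value intersects the corresponding mask.
 */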
106
107 #ifdef CONFIG_X86_64
108 /* LDT or TSS descriptor in the GDT. 16 bytes. */
109 struct segment_descriptor_64 {
110         struct segment_descriptor s;
111         u32 base_higher;
112         u32 pad_zero;
113 };
114
115 #endif
116
117 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
118                            unsigned long arg);
119
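/*
 * Walk the host GDT (or, for selectors with the TI bit set, the current LDT)
 * and return the linear base address packed into the selected descriptor.
 * On x86_64 the upper 32 bits of LDT/TSS descriptors come from the second
 * half of the 16-byte entry (struct segment_descriptor_64 above).
 */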
120 unsigned long segment_base(u16 selector)
121 {
122         struct descriptor_table gdt;
123         struct segment_descriptor *d;
124         unsigned long table_base;
125         unsigned long v;
126
127         if (selector == 0)
128                 return 0;
129
130         asm("sgdt %0" : "=m"(gdt));
131         table_base = gdt.base;
132
133         if (selector & 4) {           /* from ldt */
134                 u16 ldt_selector;
135
136                 asm("sldt %0" : "=g"(ldt_selector));
137                 table_base = segment_base(ldt_selector);
138         }
139         d = (struct segment_descriptor *)(table_base + (selector & ~7));
140         v = d->base_low | ((unsigned long)d->base_mid << 16) |
141                 ((unsigned long)d->base_high << 24);
142 #ifdef CONFIG_X86_64
143         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
144                 v |= ((unsigned long) \
145                       ((struct segment_descriptor_64 *)d)->base_higher) << 32;
146 #endif
147         return v;
148 }
149 EXPORT_SYMBOL_GPL(segment_base);
150
151 static inline int valid_vcpu(int n)
152 {
153         return likely(n >= 0 && n < KVM_MAX_VCPUS);
154 }
155
156 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
157 {
158         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
159                 return;
160
161         vcpu->guest_fpu_loaded = 1;
162         fx_save(&vcpu->host_fx_image);
163         fx_restore(&vcpu->guest_fx_image);
164 }
165 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
166
167 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
168 {
169         if (!vcpu->guest_fpu_loaded)
170                 return;
171
172         vcpu->guest_fpu_loaded = 0;
173         fx_save(&vcpu->guest_fx_image);
174         fx_restore(&vcpu->host_fx_image);
175 }
176 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
177
178 /*
179  * Switches to the specified vcpu, until a matching vcpu_put()
180  */
181 void vcpu_load(struct kvm_vcpu *vcpu)
182 {
183         int cpu;
184
185         mutex_lock(&vcpu->mutex);
186         cpu = get_cpu();
187         preempt_notifier_register(&vcpu->preempt_notifier);
188         kvm_arch_vcpu_load(vcpu, cpu);
189         put_cpu();
190 }
191
192 void vcpu_put(struct kvm_vcpu *vcpu)
193 {
194         preempt_disable();
195         kvm_arch_vcpu_put(vcpu);
196         preempt_notifier_unregister(&vcpu->preempt_notifier);
197         preempt_enable();
198         mutex_unlock(&vcpu->mutex);
199 }
200
201 static void ack_flush(void *_completed)
202 {
203 }
204
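/*
 * Ask every vcpu to flush its TLB: mark KVM_REQ_TLB_FLUSH in each vcpu's
 * request word, then IPI the physical CPUs currently running a vcpu.
 * ack_flush() above is deliberately empty; the synchronous IPI only serves
 * to force those CPUs out of guest mode so the pending request is noticed.
 */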
205 void kvm_flush_remote_tlbs(struct kvm *kvm)
206 {
207         int i, cpu;
208         cpumask_t cpus;
209         struct kvm_vcpu *vcpu;
210
211         cpus_clear(cpus);
212         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
213                 vcpu = kvm->vcpus[i];
214                 if (!vcpu)
215                         continue;
216                 if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
217                         continue;
218                 cpu = vcpu->cpu;
219                 if (cpu != -1 && cpu != raw_smp_processor_id())
220                         cpu_set(cpu, cpus);
221         }
222         smp_call_function_mask(cpus, ack_flush, NULL, 1);
223 }
224
225 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
226 {
227         struct page *page;
228         int r;
229
230         mutex_init(&vcpu->mutex);
231         vcpu->cpu = -1;
232         vcpu->mmu.root_hpa = INVALID_PAGE;
233         vcpu->kvm = kvm;
234         vcpu->vcpu_id = id;
235         if (!irqchip_in_kernel(kvm) || id == 0)
236                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
237         else
238                 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
239         init_waitqueue_head(&vcpu->wq);
240
241         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
242         if (!page) {
243                 r = -ENOMEM;
244                 goto fail;
245         }
246         vcpu->run = page_address(page);
247
248         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
249         if (!page) {
250                 r = -ENOMEM;
251                 goto fail_free_run;
252         }
253         vcpu->pio_data = page_address(page);
254
255         r = kvm_mmu_create(vcpu);
256         if (r < 0)
257                 goto fail_free_pio_data;
258
259         if (irqchip_in_kernel(kvm)) {
260                 r = kvm_create_lapic(vcpu);
261                 if (r < 0)
262                         goto fail_mmu_destroy;
263         }
264
265         return 0;
266
267 fail_mmu_destroy:
268         kvm_mmu_destroy(vcpu);
269 fail_free_pio_data:
270         free_page((unsigned long)vcpu->pio_data);
271 fail_free_run:
272         free_page((unsigned long)vcpu->run);
273 fail:
274         return r;
275 }
276 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
277
278 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
279 {
280         kvm_free_lapic(vcpu);
281         kvm_mmu_destroy(vcpu);
282         free_page((unsigned long)vcpu->pio_data);
283         free_page((unsigned long)vcpu->run);
284 }
285 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
286
287 static struct kvm *kvm_create_vm(void)
288 {
289         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
290
291         if (!kvm)
292                 return ERR_PTR(-ENOMEM);
293
294         kvm_io_bus_init(&kvm->pio_bus);
295         mutex_init(&kvm->lock);
296         INIT_LIST_HEAD(&kvm->active_mmu_pages);
297         kvm_io_bus_init(&kvm->mmio_bus);
298         spin_lock(&kvm_lock);
299         list_add(&kvm->vm_list, &vm_list);
300         spin_unlock(&kvm_lock);
301         return kvm;
302 }
303
304 /*
305  * Free any memory in @free but not in @dont.
306  */
307 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
308                                   struct kvm_memory_slot *dont)
309 {
310         if (!dont || free->rmap != dont->rmap)
311                 vfree(free->rmap);
312
313         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
314                 vfree(free->dirty_bitmap);
315
316         free->npages = 0;
317         free->dirty_bitmap = NULL;
318         free->rmap = NULL;
319 }
320
321 static void kvm_free_physmem(struct kvm *kvm)
322 {
323         int i;
324
325         for (i = 0; i < kvm->nmemslots; ++i)
326                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
327 }
328
329 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
330 {
331         int i;
332
333         for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
334                 if (vcpu->pio.guest_pages[i]) {
335                         kvm_release_page(vcpu->pio.guest_pages[i]);
336                         vcpu->pio.guest_pages[i] = NULL;
337                 }
338 }
339
340 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
341 {
342         vcpu_load(vcpu);
343         kvm_mmu_unload(vcpu);
344         vcpu_put(vcpu);
345 }
346
347 static void kvm_free_vcpus(struct kvm *kvm)
348 {
349         unsigned int i;
350
351         /*
352          * Unpin any mmu pages first.
353          */
354         for (i = 0; i < KVM_MAX_VCPUS; ++i)
355                 if (kvm->vcpus[i])
356                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
357         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
358                 if (kvm->vcpus[i]) {
359                         kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
360                         kvm->vcpus[i] = NULL;
361                 }
362         }
363
364 }
365
366 static void kvm_destroy_vm(struct kvm *kvm)
367 {
368         spin_lock(&kvm_lock);
369         list_del(&kvm->vm_list);
370         spin_unlock(&kvm_lock);
371         kvm_io_bus_destroy(&kvm->pio_bus);
372         kvm_io_bus_destroy(&kvm->mmio_bus);
373         kfree(kvm->vpic);
374         kfree(kvm->vioapic);
375         kvm_free_vcpus(kvm);
376         kvm_free_physmem(kvm);
377         kfree(kvm);
378 }
379
380 static int kvm_vm_release(struct inode *inode, struct file *filp)
381 {
382         struct kvm *kvm = filp->private_data;
383
384         kvm_destroy_vm(kvm);
385         return 0;
386 }
387
388 static void inject_gp(struct kvm_vcpu *vcpu)
389 {
390         kvm_x86_ops->inject_gp(vcpu, 0);
391 }
392
393 /*
394  * Load the PAE pdptrs.  Return true if they are all valid.
395  */
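/*
 * In PAE mode cr3 points at a 32-byte aligned block of four 64-bit PDPTEs
 * within a page: "(cr3 & (PAGE_SIZE-1)) >> 5" is the index of that block in
 * the page and "<< 2" turns it into the index of its first entry, which
 * kvm_read_guest_page() below scales by sizeof(u64).
 */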
396 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
397 {
398         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
399         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
400         int i;
401         int ret;
402         u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
403
404         mutex_lock(&vcpu->kvm->lock);
405         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
406                                   offset * sizeof(u64), sizeof(pdpte));
407         if (ret < 0) {
408                 ret = 0;
409                 goto out;
410         }
411         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
412                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
413                         ret = 0;
414                         goto out;
415                 }
416         }
417         ret = 1;
418
419         memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
420 out:
421         mutex_unlock(&vcpu->kvm->lock);
422
423         return ret;
424 }
425
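/*
 * Emulate a guest write to CR0: reject reserved bits, the NW-without-CD and
 * PG-without-PE combinations, and illegal transitions into paging (entering
 * long mode without PAE or with CS.L set, or PAE with invalid PDPTEs) by
 * injecting #GP; otherwise hand the value to the vendor module and rebuild
 * the shadow MMU context.
 */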
426 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
427 {
428         if (cr0 & CR0_RESERVED_BITS) {
429                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
430                        cr0, vcpu->cr0);
431                 inject_gp(vcpu);
432                 return;
433         }
434
435         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
436                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
437                 inject_gp(vcpu);
438                 return;
439         }
440
441         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
442                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
443                        "and a clear PE flag\n");
444                 inject_gp(vcpu);
445                 return;
446         }
447
448         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
449 #ifdef CONFIG_X86_64
450                 if ((vcpu->shadow_efer & EFER_LME)) {
451                         int cs_db, cs_l;
452
453                         if (!is_pae(vcpu)) {
454                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
455                                        "in long mode while PAE is disabled\n");
456                                 inject_gp(vcpu);
457                                 return;
458                         }
459                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
460                         if (cs_l) {
461                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
462                                        "in long mode while CS.L == 1\n");
463                                 inject_gp(vcpu);
464                                 return;
465
466                         }
467                 } else
468 #endif
469                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
470                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
471                                "reserved bits\n");
472                         inject_gp(vcpu);
473                         return;
474                 }
475
476         }
477
478         kvm_x86_ops->set_cr0(vcpu, cr0);
479         vcpu->cr0 = cr0;
480
481         mutex_lock(&vcpu->kvm->lock);
482         kvm_mmu_reset_context(vcpu);
483         mutex_unlock(&vcpu->kvm->lock);
484         return;
485 }
486 EXPORT_SYMBOL_GPL(set_cr0);
487
488 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
489 {
490         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
491 }
492 EXPORT_SYMBOL_GPL(lmsw);
493
494 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
495 {
496         if (cr4 & CR4_RESERVED_BITS) {
497                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
498                 inject_gp(vcpu);
499                 return;
500         }
501
502         if (is_long_mode(vcpu)) {
503                 if (!(cr4 & X86_CR4_PAE)) {
504                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
505                                "in long mode\n");
506                         inject_gp(vcpu);
507                         return;
508                 }
509         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
510                    && !load_pdptrs(vcpu, vcpu->cr3)) {
511                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
512                 inject_gp(vcpu);
513                 return;
514         }
515
516         if (cr4 & X86_CR4_VMXE) {
517                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
518                 inject_gp(vcpu);
519                 return;
520         }
521         kvm_x86_ops->set_cr4(vcpu, cr4);
522         vcpu->cr4 = cr4;
523         mutex_lock(&vcpu->kvm->lock);
524         kvm_mmu_reset_context(vcpu);
525         mutex_unlock(&vcpu->kvm->lock);
526 }
527 EXPORT_SYMBOL_GPL(set_cr4);
528
529 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
530 {
531         if (is_long_mode(vcpu)) {
532                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
533                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
534                         inject_gp(vcpu);
535                         return;
536                 }
537         } else {
538                 if (is_pae(vcpu)) {
539                         if (cr3 & CR3_PAE_RESERVED_BITS) {
540                                 printk(KERN_DEBUG
541                                        "set_cr3: #GP, reserved bits\n");
542                                 inject_gp(vcpu);
543                                 return;
544                         }
545                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
546                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
547                                        "reserved bits\n");
548                                 inject_gp(vcpu);
549                                 return;
550                         }
551                 }
552                 /*
553                  * We don't check reserved bits in non-PAE mode, because
554                  * the hardware doesn't enforce them there, and VMware depends on this.
555                  */
556         }
557
558         mutex_lock(&vcpu->kvm->lock);
559         /*
560          * Does the new cr3 value map to physical memory? (Note, we
561          * catch an invalid cr3 even in real-mode, because it would
562          * cause trouble later on when we turn on paging anyway.)
563          *
564          * A real CPU would silently accept an invalid cr3 and would
565          * attempt to use it - with largely undefined (and often hard
566          * to debug) behavior on the guest side.
567          */
568         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
569                 inject_gp(vcpu);
570         else {
571                 vcpu->cr3 = cr3;
572                 vcpu->mmu.new_cr3(vcpu);
573         }
574         mutex_unlock(&vcpu->kvm->lock);
575 }
576 EXPORT_SYMBOL_GPL(set_cr3);
577
578 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
579 {
580         if (cr8 & CR8_RESERVED_BITS) {
581                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
582                 inject_gp(vcpu);
583                 return;
584         }
585         if (irqchip_in_kernel(vcpu->kvm))
586                 kvm_lapic_set_tpr(vcpu, cr8);
587         else
588                 vcpu->cr8 = cr8;
589 }
590 EXPORT_SYMBOL_GPL(set_cr8);
591
592 unsigned long get_cr8(struct kvm_vcpu *vcpu)
593 {
594         if (irqchip_in_kernel(vcpu->kvm))
595                 return kvm_lapic_get_cr8(vcpu);
596         else
597                 return vcpu->cr8;
598 }
599 EXPORT_SYMBOL_GPL(get_cr8);
600
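/* Note: both branches below currently return the same cached apic_base field. */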
601 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
602 {
603         if (irqchip_in_kernel(vcpu->kvm))
604                 return vcpu->apic_base;
605         else
606                 return vcpu->apic_base;
607 }
608 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
609
610 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
611 {
612         /* TODO: reserve bits check */
613         if (irqchip_in_kernel(vcpu->kvm))
614                 kvm_lapic_set_base(vcpu, data);
615         else
616                 vcpu->apic_base = data;
617 }
618 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
619
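/*
 * Build the initial guest FXSAVE image: briefly reset the real FPU to
 * capture pristine state into guest_fx_image, restore the host state, then
 * clear everything after the MXCSR mask field (the ST/XMM register area)
 * and set MXCSR to its reset value of 0x1f80.
 */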
620 void fx_init(struct kvm_vcpu *vcpu)
621 {
622         unsigned after_mxcsr_mask;
623
624         /* Initialize guest FPU by resetting ours and saving into guest's */
625         preempt_disable();
626         fx_save(&vcpu->host_fx_image);
627         fpu_init();
628         fx_save(&vcpu->guest_fx_image);
629         fx_restore(&vcpu->host_fx_image);
630         preempt_enable();
631
632         vcpu->cr0 |= X86_CR0_ET;
633         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
634         vcpu->guest_fx_image.mxcsr = 0x1f80;
635         memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
636                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
637 }
638 EXPORT_SYMBOL_GPL(fx_init);
639
640 /*
641  * Allocate some memory and give it an address in the guest physical address
642  * space.
643  *
644  * Discontiguous memory is allowed, mostly for framebuffers.
645  */
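/*
 * A minimal sketch of how userspace typically reaches this path, via the
 * KVM_SET_USER_MEMORY_REGION ioctl on a VM fd (names and values below are
 * illustrative only):
 *
 *	struct kvm_userspace_memory_region mem = {
 *		.slot            = 0,
 *		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0,
 *		.memory_size     = 64 << 20,
 *		.userspace_addr  = (unsigned long)guest_ram,  /* from mmap() */
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
 */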
646 int kvm_set_memory_region(struct kvm *kvm,
647                           struct kvm_userspace_memory_region *mem,
648                           int user_alloc)
649 {
650         int r;
651         gfn_t base_gfn;
652         unsigned long npages;
653         unsigned long i;
654         struct kvm_memory_slot *memslot;
655         struct kvm_memory_slot old, new;
656
657         r = -EINVAL;
658         /* General sanity checks */
659         if (mem->memory_size & (PAGE_SIZE - 1))
660                 goto out;
661         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
662                 goto out;
663         if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
664                 goto out;
665         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
666                 goto out;
667
668         memslot = &kvm->memslots[mem->slot];
669         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
670         npages = mem->memory_size >> PAGE_SHIFT;
671
672         if (!npages)
673                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
674
675         mutex_lock(&kvm->lock);
676
677         new = old = *memslot;
678
679         new.base_gfn = base_gfn;
680         new.npages = npages;
681         new.flags = mem->flags;
682
683         /* Disallow changing a memory slot's size. */
684         r = -EINVAL;
685         if (npages && old.npages && npages != old.npages)
686                 goto out_unlock;
687
688         /* Check for overlaps */
689         r = -EEXIST;
690         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
691                 struct kvm_memory_slot *s = &kvm->memslots[i];
692
693                 if (s == memslot)
694                         continue;
695                 if (!((base_gfn + npages <= s->base_gfn) ||
696                       (base_gfn >= s->base_gfn + s->npages)))
697                         goto out_unlock;
698         }
699
700         /* Free page dirty bitmap if unneeded */
701         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
702                 new.dirty_bitmap = NULL;
703
704         r = -ENOMEM;
705
706         /* Allocate if a slot is being created */
707         if (npages && !new.rmap) {
708                 new.rmap = vmalloc(npages * sizeof(struct page *));
709
710                 if (!new.rmap)
711                         goto out_unlock;
712
713                 memset(new.rmap, 0, npages * sizeof(*new.rmap));
714
715                 new.user_alloc = user_alloc;
716                 if (user_alloc)
717                         new.userspace_addr = mem->userspace_addr;
718                 else {
719                         down_write(&current->mm->mmap_sem);
720                         new.userspace_addr = do_mmap(NULL, 0,
721                                                      npages * PAGE_SIZE,
722                                                      PROT_READ | PROT_WRITE,
723                                                      MAP_SHARED | MAP_ANONYMOUS,
724                                                      0);
725                         up_write(&current->mm->mmap_sem);
726
727                         if (IS_ERR((void *)new.userspace_addr))
728                                 goto out_unlock;
729                 }
730         } else {
731                 if (!old.user_alloc && old.rmap) {
732                         int ret;
733
734                         down_write(&current->mm->mmap_sem);
735                         ret = do_munmap(current->mm, old.userspace_addr,
736                                         old.npages * PAGE_SIZE);
737                         up_write(&current->mm->mmap_sem);
738                         if (ret < 0)
739                                 printk(KERN_WARNING
740                                        "kvm_vm_ioctl_set_memory_region: "
741                                        "failed to munmap memory\n");
742                 }
743         }
744
745         /* Allocate page dirty bitmap if needed */
746         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
747                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
748
749                 new.dirty_bitmap = vmalloc(dirty_bytes);
750                 if (!new.dirty_bitmap)
751                         goto out_unlock;
752                 memset(new.dirty_bitmap, 0, dirty_bytes);
753         }
754
755         if (mem->slot >= kvm->nmemslots)
756                 kvm->nmemslots = mem->slot + 1;
757
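	/*
	 * Unless userspace pinned an explicit shadow page count, scale the
	 * MMU page allocation with guest memory: KVM_PERMILLE_MMU_PAGES
	 * shadow pages per 1000 guest pages, never dropping below
	 * KVM_MIN_ALLOC_MMU_PAGES when a slot is removed.
	 */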
758         if (!kvm->n_requested_mmu_pages) {
759                 unsigned int n_pages;
760
761                 if (npages) {
762                         n_pages = npages * KVM_PERMILLE_MMU_PAGES / 1000;
763                         kvm_mmu_change_mmu_pages(kvm, kvm->n_alloc_mmu_pages +
764                                                  n_pages);
765                 } else {
766                         unsigned int nr_mmu_pages;
767
768                         n_pages = old.npages * KVM_PERMILLE_MMU_PAGES / 1000;
769                         nr_mmu_pages = kvm->n_alloc_mmu_pages - n_pages;
770                         nr_mmu_pages = max(nr_mmu_pages,
771                                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
772                         kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
773                 }
774         }
775
776         *memslot = new;
777
778         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
779         kvm_flush_remote_tlbs(kvm);
780
781         mutex_unlock(&kvm->lock);
782
783         kvm_free_physmem_slot(&old, &new);
784         return 0;
785
786 out_unlock:
787         mutex_unlock(&kvm->lock);
788         kvm_free_physmem_slot(&new, &old);
789 out:
790         return r;
791
792 }
793 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
794
795 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
796                                           struct
797                                           kvm_userspace_memory_region *mem,
798                                           int user_alloc)
799 {
800         if (mem->slot >= KVM_MEMORY_SLOTS)
801                 return -EINVAL;
802         return kvm_set_memory_region(kvm, mem, user_alloc);
803 }
804
805 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
806                                           u32 kvm_nr_mmu_pages)
807 {
808         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
809                 return -EINVAL;
810
811         mutex_lock(&kvm->lock);
812
813         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
814         kvm->n_requested_mmu_pages = kvm_nr_mmu_pages;
815
816         mutex_unlock(&kvm->lock);
817         return 0;
818 }
819
820 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
821 {
822         return kvm->n_alloc_mmu_pages;
823 }
824
825 /*
826  * Get (and clear) the dirty memory log for a memory slot.
827  */
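/*
 * The protocol: copy the slot's dirty bitmap out to userspace and, if any
 * bit was set, write-protect the slot and flush guest TLBs so subsequent
 * writes fault and re-dirty their pages, then clear the bitmap.
 */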
828 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
829                                       struct kvm_dirty_log *log)
830 {
831         struct kvm_memory_slot *memslot;
832         int r, i;
833         int n;
834         unsigned long any = 0;
835
836         mutex_lock(&kvm->lock);
837
838         r = -EINVAL;
839         if (log->slot >= KVM_MEMORY_SLOTS)
840                 goto out;
841
842         memslot = &kvm->memslots[log->slot];
843         r = -ENOENT;
844         if (!memslot->dirty_bitmap)
845                 goto out;
846
847         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
848
849         for (i = 0; !any && i < n/sizeof(long); ++i)
850                 any = memslot->dirty_bitmap[i];
851
852         r = -EFAULT;
853         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
854                 goto out;
855
856         /* If nothing is dirty, don't bother messing with page tables. */
857         if (any) {
858                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
859                 kvm_flush_remote_tlbs(kvm);
860                 memset(memslot->dirty_bitmap, 0, n);
861         }
862
863         r = 0;
864
865 out:
866         mutex_unlock(&kvm->lock);
867         return r;
868 }
869
870 /*
871  * Set a new alias region.  Aliases map a portion of physical memory into
872  * another portion.  This is useful for memory windows, for example the PC
873  * VGA region.
874  */
875 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
876                                          struct kvm_memory_alias *alias)
877 {
878         int r, n;
879         struct kvm_mem_alias *p;
880
881         r = -EINVAL;
882         /* General sanity checks */
883         if (alias->memory_size & (PAGE_SIZE - 1))
884                 goto out;
885         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
886                 goto out;
887         if (alias->slot >= KVM_ALIAS_SLOTS)
888                 goto out;
889         if (alias->guest_phys_addr + alias->memory_size
890             < alias->guest_phys_addr)
891                 goto out;
892         if (alias->target_phys_addr + alias->memory_size
893             < alias->target_phys_addr)
894                 goto out;
895
896         mutex_lock(&kvm->lock);
897
898         p = &kvm->aliases[alias->slot];
899         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
900         p->npages = alias->memory_size >> PAGE_SHIFT;
901         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
902
903         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
904                 if (kvm->aliases[n - 1].npages)
905                         break;
906         kvm->naliases = n;
907
908         kvm_mmu_zap_all(kvm);
909
910         mutex_unlock(&kvm->lock);
911
912         return 0;
913
914 out:
915         return r;
916 }
917
918 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
919 {
920         int r;
921
922         r = 0;
923         switch (chip->chip_id) {
924         case KVM_IRQCHIP_PIC_MASTER:
925                 memcpy(&chip->chip.pic,
926                         &pic_irqchip(kvm)->pics[0],
927                         sizeof(struct kvm_pic_state));
928                 break;
929         case KVM_IRQCHIP_PIC_SLAVE:
930                 memcpy(&chip->chip.pic,
931                         &pic_irqchip(kvm)->pics[1],
932                         sizeof(struct kvm_pic_state));
933                 break;
934         case KVM_IRQCHIP_IOAPIC:
935                 memcpy(&chip->chip.ioapic,
936                         ioapic_irqchip(kvm),
937                         sizeof(struct kvm_ioapic_state));
938                 break;
939         default:
940                 r = -EINVAL;
941                 break;
942         }
943         return r;
944 }
945
946 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
947 {
948         int r;
949
950         r = 0;
951         switch (chip->chip_id) {
952         case KVM_IRQCHIP_PIC_MASTER:
953                 memcpy(&pic_irqchip(kvm)->pics[0],
954                         &chip->chip.pic,
955                         sizeof(struct kvm_pic_state));
956                 break;
957         case KVM_IRQCHIP_PIC_SLAVE:
958                 memcpy(&pic_irqchip(kvm)->pics[1],
959                         &chip->chip.pic,
960                         sizeof(struct kvm_pic_state));
961                 break;
962         case KVM_IRQCHIP_IOAPIC:
963                 memcpy(ioapic_irqchip(kvm),
964                         &chip->chip.ioapic,
965                         sizeof(struct kvm_ioapic_state));
966                 break;
967         default:
968                 r = -EINVAL;
969                 break;
970         }
971         kvm_pic_update_irq(pic_irqchip(kvm));
972         return r;
973 }
974
975 int is_error_page(struct page *page)
976 {
977         return page == bad_page;
978 }
979 EXPORT_SYMBOL_GPL(is_error_page);
980
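/* Resolve a gfn through the alias table; non-aliased gfns map to themselves. */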
981 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
982 {
983         int i;
984         struct kvm_mem_alias *alias;
985
986         for (i = 0; i < kvm->naliases; ++i) {
987                 alias = &kvm->aliases[i];
988                 if (gfn >= alias->base_gfn
989                     && gfn < alias->base_gfn + alias->npages)
990                         return alias->target_gfn + gfn - alias->base_gfn;
991         }
992         return gfn;
993 }
994
995 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
996 {
997         int i;
998
999         for (i = 0; i < kvm->nmemslots; ++i) {
1000                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
1001
1002                 if (gfn >= memslot->base_gfn
1003                     && gfn < memslot->base_gfn + memslot->npages)
1004                         return memslot;
1005         }
1006         return NULL;
1007 }
1008
1009 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1010 {
1011         gfn = unalias_gfn(kvm, gfn);
1012         return __gfn_to_memslot(kvm, gfn);
1013 }
1014
1015 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
1016 {
1017         int i;
1018
1019         gfn = unalias_gfn(kvm, gfn);
1020         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1021                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
1022
1023                 if (gfn >= memslot->base_gfn
1024                     && gfn < memslot->base_gfn + memslot->npages)
1025                         return 1;
1026         }
1027         return 0;
1028 }
1029 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
1030
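/*
 * Translate a guest frame number into a struct page by resolving its memory
 * slot and pinning the backing page with get_user_pages().  On failure,
 * bad_page is returned with an extra reference so that callers can uniformly
 * test is_error_page() and call kvm_release_page().
 */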
1031 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1032 {
1033         struct kvm_memory_slot *slot;
1034         struct page *page[1];
1035         int npages;
1036
1037         might_sleep();
1038
1039         gfn = unalias_gfn(kvm, gfn);
1040         slot = __gfn_to_memslot(kvm, gfn);
1041         if (!slot) {
1042                 get_page(bad_page);
1043                 return bad_page;
1044         }
1045
1046         down_read(&current->mm->mmap_sem);
1047         npages = get_user_pages(current, current->mm,
1048                                 slot->userspace_addr
1049                                 + (gfn - slot->base_gfn) * PAGE_SIZE, 1,
1050                                 1, 1, page, NULL);
1051         up_read(&current->mm->mmap_sem);
1052         if (npages != 1) {
1053                 get_page(bad_page);
1054                 return bad_page;
1055         }
1056
1057         return page[0];
1058 }
1059 EXPORT_SYMBOL_GPL(gfn_to_page);
1060
1061 void kvm_release_page(struct page *page)
1062 {
1063         if (!PageReserved(page))
1064                 SetPageDirty(page);
1065         put_page(page);
1066 }
1067 EXPORT_SYMBOL_GPL(kvm_release_page);
1068
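/*
 * Helper for the guest read/write/clear loops below: how many of the @len
 * remaining bytes can be touched before crossing the next page boundary,
 * given the current @offset within the page.
 */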
1069 static int next_segment(unsigned long len, int offset)
1070 {
1071         if (len > PAGE_SIZE - offset)
1072                 return PAGE_SIZE - offset;
1073         else
1074                 return len;
1075 }
1076
1077 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1078                         int len)
1079 {
1080         void *page_virt;
1081         struct page *page;
1082
1083         page = gfn_to_page(kvm, gfn);
1084         if (is_error_page(page)) {
1085                 kvm_release_page(page);
1086                 return -EFAULT;
1087         }
1088         page_virt = kmap_atomic(page, KM_USER0);
1089
1090         memcpy(data, page_virt + offset, len);
1091
1092         kunmap_atomic(page_virt, KM_USER0);
1093         kvm_release_page(page);
1094         return 0;
1095 }
1096 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1097
1098 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1099 {
1100         gfn_t gfn = gpa >> PAGE_SHIFT;
1101         int seg;
1102         int offset = offset_in_page(gpa);
1103         int ret;
1104
1105         while ((seg = next_segment(len, offset)) != 0) {
1106                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1107                 if (ret < 0)
1108                         return ret;
1109                 offset = 0;
1110                 len -= seg;
1111                 data += seg;
1112                 ++gfn;
1113         }
1114         return 0;
1115 }
1116 EXPORT_SYMBOL_GPL(kvm_read_guest);
1117
1118 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1119                          int offset, int len)
1120 {
1121         void *page_virt;
1122         struct page *page;
1123
1124         page = gfn_to_page(kvm, gfn);
1125         if (is_error_page(page)) {
1126                 kvm_release_page(page);
1127                 return -EFAULT;
1128         }
1129         page_virt = kmap_atomic(page, KM_USER0);
1130
1131         memcpy(page_virt + offset, data, len);
1132
1133         kunmap_atomic(page_virt, KM_USER0);
1134         mark_page_dirty(kvm, gfn);
1135         kvm_release_page(page);
1136         return 0;
1137 }
1138 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1139
1140 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1141                     unsigned long len)
1142 {
1143         gfn_t gfn = gpa >> PAGE_SHIFT;
1144         int seg;
1145         int offset = offset_in_page(gpa);
1146         int ret;
1147
1148         while ((seg = next_segment(len, offset)) != 0) {
1149                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1150                 if (ret < 0)
1151                         return ret;
1152                 offset = 0;
1153                 len -= seg;
1154                 data += seg;
1155                 ++gfn;
1156         }
1157         return 0;
1158 }
1159
1160 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1161 {
1162         void *page_virt;
1163         struct page *page;
1164
1165         page = gfn_to_page(kvm, gfn);
1166         if (is_error_page(page)) {
1167                 kvm_release_page(page);
1168                 return -EFAULT;
1169         }
1170         page_virt = kmap_atomic(page, KM_USER0);
1171
1172         memset(page_virt + offset, 0, len);
1173
1174         kunmap_atomic(page_virt, KM_USER0);
1175         kvm_release_page(page);
1176         return 0;
1177 }
1178 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1179
1180 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1181 {
1182         gfn_t gfn = gpa >> PAGE_SHIFT;
1183         int seg;
1184         int offset = offset_in_page(gpa);
1185         int ret;
1186
1187         while ((seg = next_segment(len, offset)) != 0) {
1188                 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1189                 if (ret < 0)
1190                         return ret;
1191                 offset = 0;
1192                 len -= seg;
1193                 ++gfn;
1194         }
1195         return 0;
1196 }
1197 EXPORT_SYMBOL_GPL(kvm_clear_guest);
1198
1199 /* WARNING: Does not work on aliased pages. */
1200 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1201 {
1202         struct kvm_memory_slot *memslot;
1203
1204         memslot = __gfn_to_memslot(kvm, gfn);
1205         if (memslot && memslot->dirty_bitmap) {
1206                 unsigned long rel_gfn = gfn - memslot->base_gfn;
1207
1208                 /* avoid RMW */
1209                 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
1210                         set_bit(rel_gfn, memslot->dirty_bitmap);
1211         }
1212 }
1213
1214 int emulator_read_std(unsigned long addr,
1215                              void *val,
1216                              unsigned int bytes,
1217                              struct kvm_vcpu *vcpu)
1218 {
1219         void *data = val;
1220
1221         while (bytes) {
1222                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1223                 unsigned offset = addr & (PAGE_SIZE-1);
1224                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1225                 int ret;
1226
1227                 if (gpa == UNMAPPED_GVA)
1228                         return X86EMUL_PROPAGATE_FAULT;
1229                 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1230                 if (ret < 0)
1231                         return X86EMUL_UNHANDLEABLE;
1232
1233                 bytes -= tocopy;
1234                 data += tocopy;
1235                 addr += tocopy;
1236         }
1237
1238         return X86EMUL_CONTINUE;
1239 }
1240 EXPORT_SYMBOL_GPL(emulator_read_std);
1241
1242 static int emulator_write_std(unsigned long addr,
1243                               const void *val,
1244                               unsigned int bytes,
1245                               struct kvm_vcpu *vcpu)
1246 {
1247         pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
1248         return X86EMUL_UNHANDLEABLE;
1249 }
1250
1251 /*
1252  * Only the local APIC needs a per-vcpu MMIO device hook, so shortcut the lookup here.
1253  */
1254 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1255                                                 gpa_t addr)
1256 {
1257         struct kvm_io_device *dev;
1258
1259         if (vcpu->apic) {
1260                 dev = &vcpu->apic->dev;
1261                 if (dev->in_range(dev, addr))
1262                         return dev;
1263         }
1264         return NULL;
1265 }
1266
1267 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1268                                                 gpa_t addr)
1269 {
1270         struct kvm_io_device *dev;
1271
1272         dev = vcpu_find_pervcpu_dev(vcpu, addr);
1273         if (dev == NULL)
1274                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1275         return dev;
1276 }
1277
1278 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1279                                                gpa_t addr)
1280 {
1281         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1282 }
1283
1284 static int emulator_read_emulated(unsigned long addr,
1285                                   void *val,
1286                                   unsigned int bytes,
1287                                   struct kvm_vcpu *vcpu)
1288 {
1289         struct kvm_io_device *mmio_dev;
1290         gpa_t                 gpa;
1291
1292         if (vcpu->mmio_read_completed) {
1293                 memcpy(val, vcpu->mmio_data, bytes);
1294                 vcpu->mmio_read_completed = 0;
1295                 return X86EMUL_CONTINUE;
1296         } else if (emulator_read_std(addr, val, bytes, vcpu)
1297                    == X86EMUL_CONTINUE)
1298                 return X86EMUL_CONTINUE;
1299
1300         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1301         if (gpa == UNMAPPED_GVA)
1302                 return X86EMUL_PROPAGATE_FAULT;
1303
1304         /*
1305          * Is this MMIO handled locally?
1306          */
1307         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1308         if (mmio_dev) {
1309                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1310                 return X86EMUL_CONTINUE;
1311         }
1312
1313         vcpu->mmio_needed = 1;
1314         vcpu->mmio_phys_addr = gpa;
1315         vcpu->mmio_size = bytes;
1316         vcpu->mmio_is_write = 0;
1317
1318         return X86EMUL_UNHANDLEABLE;
1319 }
1320
1321 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1322                                const void *val, int bytes)
1323 {
1324         int ret;
1325
1326         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1327         if (ret < 0)
1328                 return 0;
1329         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1330         return 1;
1331 }
1332
1333 static int emulator_write_emulated_onepage(unsigned long addr,
1334                                            const void *val,
1335                                            unsigned int bytes,
1336                                            struct kvm_vcpu *vcpu)
1337 {
1338         struct kvm_io_device *mmio_dev;
1339         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1340
1341         if (gpa == UNMAPPED_GVA) {
1342                 kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
1343                 return X86EMUL_PROPAGATE_FAULT;
1344         }
1345
1346         if (emulator_write_phys(vcpu, gpa, val, bytes))
1347                 return X86EMUL_CONTINUE;
1348
1349         /*
1350          * Is this MMIO handled locally?
1351          */
1352         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1353         if (mmio_dev) {
1354                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1355                 return X86EMUL_CONTINUE;
1356         }
1357
1358         vcpu->mmio_needed = 1;
1359         vcpu->mmio_phys_addr = gpa;
1360         vcpu->mmio_size = bytes;
1361         vcpu->mmio_is_write = 1;
1362         memcpy(vcpu->mmio_data, val, bytes);
1363
1364         return X86EMUL_CONTINUE;
1365 }
1366
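/*
 * Split a write that crosses a page boundary into two single-page writes;
 * "-addr & ~PAGE_MASK" is the number of bytes left before the next page
 * boundary.
 */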
1367 int emulator_write_emulated(unsigned long addr,
1368                                    const void *val,
1369                                    unsigned int bytes,
1370                                    struct kvm_vcpu *vcpu)
1371 {
1372         /* Crossing a page boundary? */
1373         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1374                 int rc, now;
1375
1376                 now = -addr & ~PAGE_MASK;
1377                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1378                 if (rc != X86EMUL_CONTINUE)
1379                         return rc;
1380                 addr += now;
1381                 val += now;
1382                 bytes -= now;
1383         }
1384         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1385 }
1386 EXPORT_SYMBOL_GPL(emulator_write_emulated);
1387
1388 static int emulator_cmpxchg_emulated(unsigned long addr,
1389                                      const void *old,
1390                                      const void *new,
1391                                      unsigned int bytes,
1392                                      struct kvm_vcpu *vcpu)
1393 {
1394         static int reported;
1395
1396         if (!reported) {
1397                 reported = 1;
1398                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1399         }
1400         return emulator_write_emulated(addr, new, bytes, vcpu);
1401 }
1402
1403 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1404 {
1405         return kvm_x86_ops->get_segment_base(vcpu, seg);
1406 }
1407
1408 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1409 {
1410         return X86EMUL_CONTINUE;
1411 }
1412
1413 int emulate_clts(struct kvm_vcpu *vcpu)
1414 {
1415         kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
1416         return X86EMUL_CONTINUE;
1417 }
1418
1419 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1420 {
1421         struct kvm_vcpu *vcpu = ctxt->vcpu;
1422
1423         switch (dr) {
1424         case 0 ... 3:
1425                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1426                 return X86EMUL_CONTINUE;
1427         default:
1428                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1429                 return X86EMUL_UNHANDLEABLE;
1430         }
1431 }
1432
1433 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1434 {
1435         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1436         int exception;
1437
1438         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1439         if (exception) {
1440                 /* FIXME: better handling */
1441                 return X86EMUL_UNHANDLEABLE;
1442         }
1443         return X86EMUL_CONTINUE;
1444 }
1445
1446 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1447 {
1448         static int reported;
1449         u8 opcodes[4];
1450         unsigned long rip = vcpu->rip;
1451         unsigned long rip_linear;
1452
1453         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1454
1455         if (reported)
1456                 return;
1457
1458         emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1459
1460         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1461                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1462         reported = 1;
1463 }
1464 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1465
1466 struct x86_emulate_ops emulate_ops = {
1467         .read_std            = emulator_read_std,
1468         .write_std           = emulator_write_std,
1469         .read_emulated       = emulator_read_emulated,
1470         .write_emulated      = emulator_write_emulated,
1471         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1472 };
1473
1474 int emulate_instruction(struct kvm_vcpu *vcpu,
1475                         struct kvm_run *run,
1476                         unsigned long cr2,
1477                         u16 error_code,
1478                         int no_decode)
1479 {
1480         int r;
1481
1482         vcpu->mmio_fault_cr2 = cr2;
1483         kvm_x86_ops->cache_regs(vcpu);
1484
1485         vcpu->mmio_is_write = 0;
1486         vcpu->pio.string = 0;
1487
1488         if (!no_decode) {
1489                 int cs_db, cs_l;
1490                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1491
1492                 vcpu->emulate_ctxt.vcpu = vcpu;
1493                 vcpu->emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1494                 vcpu->emulate_ctxt.cr2 = cr2;
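		/*
		 * Pick the emulation mode: vm86 if EFLAGS.VM is set,
		 * otherwise 64-bit when CS.L is set, 32-bit when CS.D is
		 * set, and 16-bit protected mode otherwise.
		 */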
1495                 vcpu->emulate_ctxt.mode =
1496                         (vcpu->emulate_ctxt.eflags & X86_EFLAGS_VM)
1497                         ? X86EMUL_MODE_REAL : cs_l
1498                         ? X86EMUL_MODE_PROT64 : cs_db
1499                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1500
1501                 if (vcpu->emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1502                         vcpu->emulate_ctxt.cs_base = 0;
1503                         vcpu->emulate_ctxt.ds_base = 0;
1504                         vcpu->emulate_ctxt.es_base = 0;
1505                         vcpu->emulate_ctxt.ss_base = 0;
1506                 } else {
1507                         vcpu->emulate_ctxt.cs_base =
1508                                         get_segment_base(vcpu, VCPU_SREG_CS);
1509                         vcpu->emulate_ctxt.ds_base =
1510                                         get_segment_base(vcpu, VCPU_SREG_DS);
1511                         vcpu->emulate_ctxt.es_base =
1512                                         get_segment_base(vcpu, VCPU_SREG_ES);
1513                         vcpu->emulate_ctxt.ss_base =
1514                                         get_segment_base(vcpu, VCPU_SREG_SS);
1515                 }
1516
1517                 vcpu->emulate_ctxt.gs_base =
1518                                         get_segment_base(vcpu, VCPU_SREG_GS);
1519                 vcpu->emulate_ctxt.fs_base =
1520                                         get_segment_base(vcpu, VCPU_SREG_FS);
1521
1522                 r = x86_decode_insn(&vcpu->emulate_ctxt, &emulate_ops);
1523                 if (r)  {
1524                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1525                                 return EMULATE_DONE;
1526                         return EMULATE_FAIL;
1527                 }
1528         }
1529
1530         r = x86_emulate_insn(&vcpu->emulate_ctxt, &emulate_ops);
1531
1532         if (vcpu->pio.string)
1533                 return EMULATE_DO_MMIO;
1534
1535         if ((r || vcpu->mmio_is_write) && run) {
1536                 run->exit_reason = KVM_EXIT_MMIO;
1537                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1538                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1539                 run->mmio.len = vcpu->mmio_size;
1540                 run->mmio.is_write = vcpu->mmio_is_write;
1541         }
1542
1543         if (r) {
1544                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1545                         return EMULATE_DONE;
1546                 if (!vcpu->mmio_needed) {
1547                         kvm_report_emulation_failure(vcpu, "mmio");
1548                         return EMULATE_FAIL;
1549                 }
1550                 return EMULATE_DO_MMIO;
1551         }
1552
1553         kvm_x86_ops->decache_regs(vcpu);
1554         kvm_x86_ops->set_rflags(vcpu, vcpu->emulate_ctxt.eflags);
1555
1556         if (vcpu->mmio_is_write) {
1557                 vcpu->mmio_needed = 0;
1558                 return EMULATE_DO_MMIO;
1559         }
1560
1561         return EMULATE_DONE;
1562 }
1563 EXPORT_SYMBOL_GPL(emulate_instruction);
1564
1565 /*
1566  * The vCPU has executed a HLT instruction with the in-kernel irqchip enabled.
1567  */
1568 static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1569 {
1570         DECLARE_WAITQUEUE(wait, current);
1571
1572         add_wait_queue(&vcpu->wq, &wait);
1573
1574         /*
1575          * We will block until either an interrupt or a signal wakes us up
1576          */
1577         while (!kvm_cpu_has_interrupt(vcpu)
1578                && !signal_pending(current)
1579                && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
1580                && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
1581                 set_current_state(TASK_INTERRUPTIBLE);
1582                 vcpu_put(vcpu);
1583                 schedule();
1584                 vcpu_load(vcpu);
1585         }
1586
1587         __set_current_state(TASK_RUNNING);
1588         remove_wait_queue(&vcpu->wq, &wait);
1589 }
1590
1591 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1592 {
1593         ++vcpu->stat.halt_exits;
1594         if (irqchip_in_kernel(vcpu->kvm)) {
1595                 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1596                 kvm_vcpu_block(vcpu);
1597                 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1598                         return -EINTR;
1599                 return 1;
1600         } else {
1601                 vcpu->run->exit_reason = KVM_EXIT_HLT;
1602                 return 0;
1603         }
1604 }
1605 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1606
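/*
 * Hypercall ABI as handled here: number in RAX, arguments in RBX, RCX, RDX
 * and RSI (all truncated to 32 bits outside long mode), return value placed
 * back in RAX.  No hypercall numbers are implemented yet, so every request
 * returns -KVM_ENOSYS.
 */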
1607 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
1608 {
1609         unsigned long nr, a0, a1, a2, a3, ret;
1610
1611         kvm_x86_ops->cache_regs(vcpu);
1612
1613         nr = vcpu->regs[VCPU_REGS_RAX];
1614         a0 = vcpu->regs[VCPU_REGS_RBX];
1615         a1 = vcpu->regs[VCPU_REGS_RCX];
1616         a2 = vcpu->regs[VCPU_REGS_RDX];
1617         a3 = vcpu->regs[VCPU_REGS_RSI];
1618
1619         if (!is_long_mode(vcpu)) {
1620                 nr &= 0xFFFFFFFF;
1621                 a0 &= 0xFFFFFFFF;
1622                 a1 &= 0xFFFFFFFF;
1623                 a2 &= 0xFFFFFFFF;
1624                 a3 &= 0xFFFFFFFF;
1625         }
1626
1627         switch (nr) {
1628         default:
1629                 ret = -KVM_ENOSYS;
1630                 break;
1631         }
1632         vcpu->regs[VCPU_REGS_RAX] = ret;
1633         kvm_x86_ops->decache_regs(vcpu);
1634         return 0;
1635 }
1636 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
1637
1638 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
1639 {
1640         char instruction[3];
1641         int ret = 0;
1642
1643         mutex_lock(&vcpu->kvm->lock);
1644
1645         /*
1646          * Blow out the MMU so that no other VCPU has an active mapping,
1647          * ensuring that the updated hypercall appears atomically across all
1648          * VCPUs.
1649          */
1650         kvm_mmu_zap_all(vcpu->kvm);
1651
1652         kvm_x86_ops->cache_regs(vcpu);
1653         kvm_x86_ops->patch_hypercall(vcpu, instruction);
1654         if (emulator_write_emulated(vcpu->rip, instruction, 3, vcpu)
1655             != X86EMUL_CONTINUE)
1656                 ret = -EFAULT;
1657
1658         mutex_unlock(&vcpu->kvm->lock);
1659
1660         return ret;
1661 }
1662
1663 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1664 {
1665         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1666 }
1667
1668 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1669 {
1670         struct descriptor_table dt = { limit, base };
1671
1672         kvm_x86_ops->set_gdt(vcpu, &dt);
1673 }
1674
1675 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1676 {
1677         struct descriptor_table dt = { limit, base };
1678
1679         kvm_x86_ops->set_idt(vcpu, &dt);
1680 }
1681
1682 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1683                    unsigned long *rflags)
1684 {
1685         lmsw(vcpu, msw);
1686         *rflags = kvm_x86_ops->get_rflags(vcpu);
1687 }
1688
1689 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1690 {
1691         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1692         switch (cr) {
1693         case 0:
1694                 return vcpu->cr0;
1695         case 2:
1696                 return vcpu->cr2;
1697         case 3:
1698                 return vcpu->cr3;
1699         case 4:
1700                 return vcpu->cr4;
1701         default:
1702                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1703                 return 0;
1704         }
1705 }
1706
1707 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1708                      unsigned long *rflags)
1709 {
1710         switch (cr) {
1711         case 0:
1712                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1713                 *rflags = kvm_x86_ops->get_rflags(vcpu);
1714                 break;
1715         case 2:
1716                 vcpu->cr2 = val;
1717                 break;
1718         case 3:
1719                 set_cr3(vcpu, val);
1720                 break;
1721         case 4:
1722                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1723                 break;
1724         default:
1725                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1726         }
1727 }
1728
1729 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1730 {
1731         u64 data;
1732
1733         switch (msr) {
1734         case 0xc0010010: /* SYSCFG */
1735         case 0xc0010015: /* HWCR */
1736         case MSR_IA32_PLATFORM_ID:
1737         case MSR_IA32_P5_MC_ADDR:
1738         case MSR_IA32_P5_MC_TYPE:
1739         case MSR_IA32_MC0_CTL:
1740         case MSR_IA32_MCG_STATUS:
1741         case MSR_IA32_MCG_CAP:
1742         case MSR_IA32_MC0_MISC:
1743         case MSR_IA32_MC0_MISC+4:
1744         case MSR_IA32_MC0_MISC+8:
1745         case MSR_IA32_MC0_MISC+12:
1746         case MSR_IA32_MC0_MISC+16:
1747         case MSR_IA32_UCODE_REV:
1748         case MSR_IA32_PERF_STATUS:
1749         case MSR_IA32_EBL_CR_POWERON:
1750                 /* MTRR registers */
1751         case 0xfe:
1752         case 0x200 ... 0x2ff:
1753                 data = 0;
1754                 break;
1755         case 0xcd: /* fsb frequency */
1756                 data = 3;
1757                 break;
1758         case MSR_IA32_APICBASE:
1759                 data = kvm_get_apic_base(vcpu);
1760                 break;
1761         case MSR_IA32_MISC_ENABLE:
1762                 data = vcpu->ia32_misc_enable_msr;
1763                 break;
1764 #ifdef CONFIG_X86_64
1765         case MSR_EFER:
1766                 data = vcpu->shadow_efer;
1767                 break;
1768 #endif
1769         default:
1770                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1771                 return 1;
1772         }
1773         *pdata = data;
1774         return 0;
1775 }
1776 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1777
1778 /*
1779  * Reads an msr value (of 'msr_index') into 'pdata'.
1780  * Returns 0 on success, non-0 otherwise.
1781  * Assumes vcpu_load() was already called.
1782  */
1783 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1784 {
1785         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1786 }
1787
1788 #ifdef CONFIG_X86_64
1789
1790 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1791 {
1792         if (efer & EFER_RESERVED_BITS) {
1793                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1794                        efer);
1795                 inject_gp(vcpu);
1796                 return;
1797         }
1798
1799         if (is_paging(vcpu)
1800             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1801                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1802                 inject_gp(vcpu);
1803                 return;
1804         }
1805
1806         kvm_x86_ops->set_efer(vcpu, efer);
1807
1808         efer &= ~EFER_LMA;
1809         efer |= vcpu->shadow_efer & EFER_LMA;
1810
1811         vcpu->shadow_efer = efer;
1812 }
1813
1814 #endif
1815
1816 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1817 {
1818         switch (msr) {
1819 #ifdef CONFIG_X86_64
1820         case MSR_EFER:
1821                 set_efer(vcpu, data);
1822                 break;
1823 #endif
1824         case MSR_IA32_MC0_STATUS:
1825                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1826                        __FUNCTION__, data);
1827                 break;
1828         case MSR_IA32_MCG_STATUS:
1829                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1830                         __FUNCTION__, data);
1831                 break;
1832         case MSR_IA32_UCODE_REV:
1833         case MSR_IA32_UCODE_WRITE:
1834         case 0x200 ... 0x2ff: /* MTRRs */
1835                 break;
1836         case MSR_IA32_APICBASE:
1837                 kvm_set_apic_base(vcpu, data);
1838                 break;
1839         case MSR_IA32_MISC_ENABLE:
1840                 vcpu->ia32_misc_enable_msr = data;
1841                 break;
1842         default:
1843                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1844                 return 1;
1845         }
1846         return 0;
1847 }
1848 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1849
1850 /*
1851  * Writes the msr value into the appropriate "register".
1852  * Returns 0 on success, non-0 otherwise.
1853  * Assumes vcpu_load() was already called.
1854  */
1855 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1856 {
1857         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
1858 }
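/*
 * Illustrative sketch (not part of this file): both wrappers assume the vcpu
 * is loaded and report failure with a non-zero return, which callers usually
 * turn into a #GP for the guest.  A hypothetical rdmsr exit handler (the real
 * ones live in the vendor modules) might do:
 *
 *	if (kvm_get_msr(vcpu, ecx, &data)) {
 *		inject_gp(vcpu);
 *		return 1;
 *	}
 */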
1859
1860 void kvm_resched(struct kvm_vcpu *vcpu)
1861 {
1862         if (!need_resched())
1863                 return;
1864         cond_resched();
1865 }
1866 EXPORT_SYMBOL_GPL(kvm_resched);
1867
1868 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1869 {
1870         int i;
1871         u32 function;
1872         struct kvm_cpuid_entry *e, *best;
1873
1874         kvm_x86_ops->cache_regs(vcpu);
1875         function = vcpu->regs[VCPU_REGS_RAX];
1876         vcpu->regs[VCPU_REGS_RAX] = 0;
1877         vcpu->regs[VCPU_REGS_RBX] = 0;
1878         vcpu->regs[VCPU_REGS_RCX] = 0;
1879         vcpu->regs[VCPU_REGS_RDX] = 0;
1880         best = NULL;
1881         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1882                 e = &vcpu->cpuid_entries[i];
1883                 if (e->function == function) {
1884                         best = e;
1885                         break;
1886                 }
1887                 /*
1888                  * Is this entry in the same class as the request,
1889                  * i.e. both basic or both extended?
1889                  */
1890                 if (((e->function ^ function) & 0x80000000) == 0)
1891                         if (!best || e->function > best->function)
1892                                 best = e;
1893         }
1894         if (best) {
1895                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1896                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1897                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1898                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1899         }
1900         kvm_x86_ops->decache_regs(vcpu);
1901         kvm_x86_ops->skip_emulated_instruction(vcpu);
1902 }
1903 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
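/*
 * Worked example of the lookup above (illustrative): if the table holds
 * entries 0x0-0x2 and 0x80000000-0x80000004 and CPUID is executed with
 * EAX = 0x80000008, there is no exact match, so the highest entry of the same
 * class (bit 31 set) is used, i.e. 0x80000004; basic leaves are never used to
 * answer an extended query, and vice versa.
 */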
1904
1905 static int pio_copy_data(struct kvm_vcpu *vcpu)
1906 {
1907         void *p = vcpu->pio_data;
1908         void *q;
1909         unsigned bytes;
1910         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1911
1912         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1913                  PAGE_KERNEL);
1914         if (!q) {
1915                 free_pio_guest_pages(vcpu);
1916                 return -ENOMEM;
1917         }
1918         q += vcpu->pio.guest_page_offset;
1919         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1920         if (vcpu->pio.in)
1921                 memcpy(q, p, bytes);
1922         else
1923                 memcpy(p, q, bytes);
1924         q -= vcpu->pio.guest_page_offset;
1925         vunmap(q);
1926         free_pio_guest_pages(vcpu);
1927         return 0;
1928 }
1929
1930 static int complete_pio(struct kvm_vcpu *vcpu)
1931 {
1932         struct kvm_pio_request *io = &vcpu->pio;
1933         long delta;
1934         int r;
1935
1936         kvm_x86_ops->cache_regs(vcpu);
1937
1938         if (!io->string) {
1939                 if (io->in)
1940                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1941                                io->size);
1942         } else {
1943                 if (io->in) {
1944                         r = pio_copy_data(vcpu);
1945                         if (r) {
1946                                 kvm_x86_ops->cache_regs(vcpu);
1947                                 return r;
1948                         }
1949                 }
1950
1951                 delta = 1;
1952                 if (io->rep) {
1953                         delta *= io->cur_count;
1954                         /*
1955                          * The size of the register should really depend on
1956                          * the current address size.
1957                          */
1958                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1959                 }
1960                 if (io->down)
1961                         delta = -delta;
1962                 delta *= io->size;
1963                 if (io->in)
1964                         vcpu->regs[VCPU_REGS_RDI] += delta;
1965                 else
1966                         vcpu->regs[VCPU_REGS_RSI] += delta;
1967         }
1968
1969         kvm_x86_ops->decache_regs(vcpu);
1970
1971         io->count -= io->cur_count;
1972         io->cur_count = 0;
1973
1974         return 0;
1975 }
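/*
 * Worked example of the register adjustment above (illustrative): after a
 * "rep insb" that moved cur_count = 16 bytes with DF clear, RCX is reduced by
 * 16 and RDI advanced by 16 * 1 = 16; a "rep outsw" of 8 words reduces RCX by
 * 8 and advances RSI by 8 * 2 = 16.  With DF set (io->down), the index
 * register moves backwards by the same amount.
 */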
1976
1977 static void kernel_pio(struct kvm_io_device *pio_dev,
1978                        struct kvm_vcpu *vcpu,
1979                        void *pd)
1980 {
1981         /* TODO: String I/O for in-kernel devices */
1982
1983         mutex_lock(&vcpu->kvm->lock);
1984         if (vcpu->pio.in)
1985                 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1986                                   vcpu->pio.size,
1987                                   pd);
1988         else
1989                 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1990                                    vcpu->pio.size,
1991                                    pd);
1992         mutex_unlock(&vcpu->kvm->lock);
1993 }
1994
1995 static void pio_string_write(struct kvm_io_device *pio_dev,
1996                              struct kvm_vcpu *vcpu)
1997 {
1998         struct kvm_pio_request *io = &vcpu->pio;
1999         void *pd = vcpu->pio_data;
2000         int i;
2001
2002         mutex_lock(&vcpu->kvm->lock);
2003         for (i = 0; i < io->cur_count; i++) {
2004                 kvm_iodevice_write(pio_dev, io->port,
2005                                    io->size,
2006                                    pd);
2007                 pd += io->size;
2008         }
2009         mutex_unlock(&vcpu->kvm->lock);
2010 }
2011
2012 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2013                   int size, unsigned port)
2014 {
2015         struct kvm_io_device *pio_dev;
2016
2017         vcpu->run->exit_reason = KVM_EXIT_IO;
2018         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2019         vcpu->run->io.size = vcpu->pio.size = size;
2020         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2021         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
2022         vcpu->run->io.port = vcpu->pio.port = port;
2023         vcpu->pio.in = in;
2024         vcpu->pio.string = 0;
2025         vcpu->pio.down = 0;
2026         vcpu->pio.guest_page_offset = 0;
2027         vcpu->pio.rep = 0;
2028
2029         kvm_x86_ops->cache_regs(vcpu);
2030         memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
2031         kvm_x86_ops->decache_regs(vcpu);
2032
2033         kvm_x86_ops->skip_emulated_instruction(vcpu);
2034
2035         pio_dev = vcpu_find_pio_dev(vcpu, port);
2036         if (pio_dev) {
2037                 kernel_pio(pio_dev, vcpu, vcpu->pio_data);
2038                 complete_pio(vcpu);
2039                 return 1;
2040         }
2041         return 0;
2042 }
2043 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
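/*
 * Illustrative sketch (not part of this file): userspace sees the exit set up
 * above as KVM_EXIT_IO; the bytes to consume or supply live inside the
 * mmap()ed vcpu region at run->io.data_offset.  Assuming "run" points at that
 * mapping:
 *
 *	__u8 *data = (__u8 *)run + run->io.data_offset;
 *
 *	if (run->io.direction == KVM_EXIT_IO_OUT)
 *		handle_port_write(run->io.port, data, run->io.size);	// hypothetical
 *	else
 *		handle_port_read(run->io.port, data, run->io.size);	// hypothetical, fills data
 */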
2044
2045 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2046                   int size, unsigned long count, int down,
2047                   gva_t address, int rep, unsigned port)
2048 {
2049         unsigned now, in_page;
2050         int i, ret = 0;
2051         int nr_pages = 1;
2052         struct page *page;
2053         struct kvm_io_device *pio_dev;
2054
2055         vcpu->run->exit_reason = KVM_EXIT_IO;
2056         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2057         vcpu->run->io.size = vcpu->pio.size = size;
2058         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2059         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
2060         vcpu->run->io.port = vcpu->pio.port = port;
2061         vcpu->pio.in = in;
2062         vcpu->pio.string = 1;
2063         vcpu->pio.down = down;
2064         vcpu->pio.guest_page_offset = offset_in_page(address);
2065         vcpu->pio.rep = rep;
2066
2067         if (!count) {
2068                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2069                 return 1;
2070         }
2071
2072         if (!down)
2073                 in_page = PAGE_SIZE - offset_in_page(address);
2074         else
2075                 in_page = offset_in_page(address) + size;
2076         now = min(count, (unsigned long)in_page / size);
2077         if (!now) {
2078                 /*
2079                  * String I/O straddles page boundary.  Pin two guest pages
2080                  * so that we satisfy atomicity constraints.  Do just one
2081                  * transaction to avoid complexity.
2082                  */
2083                 nr_pages = 2;
2084                 now = 1;
2085         }
2086         if (down) {
2087                 /*
2088                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2089                  */
2090                 pr_unimpl(vcpu, "guest string pio down\n");
2091                 inject_gp(vcpu);
2092                 return 1;
2093         }
2094         vcpu->run->io.count = now;
2095         vcpu->pio.cur_count = now;
2096
2097         if (vcpu->pio.cur_count == vcpu->pio.count)
2098                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2099
2100         for (i = 0; i < nr_pages; ++i) {
2101                 mutex_lock(&vcpu->kvm->lock);
2102                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2103                 vcpu->pio.guest_pages[i] = page;
2104                 mutex_unlock(&vcpu->kvm->lock);
2105                 if (!page) {
2106                         inject_gp(vcpu);
2107                         free_pio_guest_pages(vcpu);
2108                         return 1;
2109                 }
2110         }
2111
2112         pio_dev = vcpu_find_pio_dev(vcpu, port);
2113         if (!vcpu->pio.in) {
2114                 /* string PIO write */
2115                 ret = pio_copy_data(vcpu);
2116                 if (ret >= 0 && pio_dev) {
2117                         pio_string_write(pio_dev, vcpu);
2118                         complete_pio(vcpu);
2119                         if (vcpu->pio.count == 0)
2120                                 ret = 1;
2121                 }
2122         } else if (pio_dev)
2123                 pr_unimpl(vcpu, "no string pio read support yet, "
2124                        "port %x size %d count %ld\n",
2125                         port, size, count);
2126
2127         return ret;
2128 }
2129 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
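/*
 * Worked example of the "now" computation above (illustrative): with 4 KiB
 * pages, a 4-byte element starting at page offset 4094 gives in_page = 2 and
 * now = min(count, 2 / 4) = 0, so the element straddles a page boundary; two
 * guest pages are pinned and only one element is transferred this round.  An
 * aligned transfer (offset 0, size 2, count 10) instead completes all ten
 * elements at once, since min(10, 4096 / 2) = 10.
 */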
2130
2131 /*
2132  * Check whether userspace requested an interrupt window and whether the
2133  * interrupt window is currently open.
2134  *
2135  * No need to exit to userspace if we already have an interrupt queued.
2136  */
2137 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2138                                           struct kvm_run *kvm_run)
2139 {
2140         return (!vcpu->irq_summary &&
2141                 kvm_run->request_interrupt_window &&
2142                 vcpu->interrupt_window_open &&
2143                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2144 }
2145
2146 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2147                               struct kvm_run *kvm_run)
2148 {
2149         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2150         kvm_run->cr8 = get_cr8(vcpu);
2151         kvm_run->apic_base = kvm_get_apic_base(vcpu);
2152         if (irqchip_in_kernel(vcpu->kvm))
2153                 kvm_run->ready_for_interrupt_injection = 1;
2154         else
2155                 kvm_run->ready_for_interrupt_injection =
2156                                         (vcpu->interrupt_window_open &&
2157                                          vcpu->irq_summary == 0);
2158 }
2159
2160 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2161 {
2162         int r;
2163
2164         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2165                 pr_debug("vcpu %d received sipi with vector # %x\n",
2166                        vcpu->vcpu_id, vcpu->sipi_vector);
2167                 kvm_lapic_reset(vcpu);
2168                 r = kvm_x86_ops->vcpu_reset(vcpu);
2169                 if (r)
2170                         return r;
2171                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
2172         }
2173
2174 preempted:
2175         if (vcpu->guest_debug.enabled)
2176                 kvm_x86_ops->guest_debug_pre(vcpu);
2177
2178 again:
2179         r = kvm_mmu_reload(vcpu);
2180         if (unlikely(r))
2181                 goto out;
2182
2183         kvm_inject_pending_timer_irqs(vcpu);
2184
2185         preempt_disable();
2186
2187         kvm_x86_ops->prepare_guest_switch(vcpu);
2188         kvm_load_guest_fpu(vcpu);
2189
2190         local_irq_disable();
2191
2192         if (signal_pending(current)) {
2193                 local_irq_enable();
2194                 preempt_enable();
2195                 r = -EINTR;
2196                 kvm_run->exit_reason = KVM_EXIT_INTR;
2197                 ++vcpu->stat.signal_exits;
2198                 goto out;
2199         }
2200
2201         if (irqchip_in_kernel(vcpu->kvm))
2202                 kvm_x86_ops->inject_pending_irq(vcpu);
2203         else if (!vcpu->mmio_read_completed)
2204                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2205
2206         vcpu->guest_mode = 1;
2207         kvm_guest_enter();
2208
2209         if (vcpu->requests)
2210                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2211                         kvm_x86_ops->tlb_flush(vcpu);
2212
2213         kvm_x86_ops->run(vcpu, kvm_run);
2214
2215         vcpu->guest_mode = 0;
2216         local_irq_enable();
2217
2218         ++vcpu->stat.exits;
2219
2220         /*
2221          * We must have an instruction between local_irq_enable() and
2222          * kvm_guest_exit(), so the timer interrupt isn't delayed by
2223          * the interrupt shadow.  The stat.exits increment will do nicely.
2224          * But we need to prevent reordering, hence this barrier():
2225          */
2226         barrier();
2227
2228         kvm_guest_exit();
2229
2230         preempt_enable();
2231
2232         /*
2233          * Profile KVM exit RIPs:
2234          */
2235         if (unlikely(prof_on == KVM_PROFILING)) {
2236                 kvm_x86_ops->cache_regs(vcpu);
2237                 profile_hit(KVM_PROFILING, (void *)vcpu->rip);
2238         }
2239
2240         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2241
2242         if (r > 0) {
2243                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2244                         r = -EINTR;
2245                         kvm_run->exit_reason = KVM_EXIT_INTR;
2246                         ++vcpu->stat.request_irq_exits;
2247                         goto out;
2248                 }
2249                 if (!need_resched()) {
2250                         ++vcpu->stat.light_exits;
2251                         goto again;
2252                 }
2253         }
2254
2255 out:
2256         if (r > 0) {
2257                 kvm_resched(vcpu);
2258                 goto preempted;
2259         }
2260
2261         post_kvm_run_save(vcpu, kvm_run);
2262
2263         return r;
2264 }
2265
2266
2267 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2268 {
2269         int r;
2270         sigset_t sigsaved;
2271
2272         vcpu_load(vcpu);
2273
2274         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2275                 kvm_vcpu_block(vcpu);
2276                 vcpu_put(vcpu);
2277                 return -EAGAIN;
2278         }
2279
2280         if (vcpu->sigset_active)
2281                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2282
2283         /* re-sync apic's tpr */
2284         if (!irqchip_in_kernel(vcpu->kvm))
2285                 set_cr8(vcpu, kvm_run->cr8);
2286
2287         if (vcpu->pio.cur_count) {
2288                 r = complete_pio(vcpu);
2289                 if (r)
2290                         goto out;
2291         }
2292 #ifdef CONFIG_HAS_IOMEM
2293         if (vcpu->mmio_needed) {
2294                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2295                 vcpu->mmio_read_completed = 1;
2296                 vcpu->mmio_needed = 0;
2297                 r = emulate_instruction(vcpu, kvm_run,
2298                                         vcpu->mmio_fault_cr2, 0, 1);
2299                 if (r == EMULATE_DO_MMIO) {
2300                         /*
2301                          * Read-modify-write.  Back to userspace.
2302                          */
2303                         r = 0;
2304                         goto out;
2305                 }
2306         }
2307 #endif
2308         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2309                 kvm_x86_ops->cache_regs(vcpu);
2310                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2311                 kvm_x86_ops->decache_regs(vcpu);
2312         }
2313
2314         r = __vcpu_run(vcpu, kvm_run);
2315
2316 out:
2317         if (vcpu->sigset_active)
2318                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2319
2320         vcpu_put(vcpu);
2321         return r;
2322 }
2323
2324 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2325                                    struct kvm_regs *regs)
2326 {
2327         vcpu_load(vcpu);
2328
2329         kvm_x86_ops->cache_regs(vcpu);
2330
2331         regs->rax = vcpu->regs[VCPU_REGS_RAX];
2332         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2333         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2334         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2335         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2336         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2337         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
2338         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2339 #ifdef CONFIG_X86_64
2340         regs->r8 = vcpu->regs[VCPU_REGS_R8];
2341         regs->r9 = vcpu->regs[VCPU_REGS_R9];
2342         regs->r10 = vcpu->regs[VCPU_REGS_R10];
2343         regs->r11 = vcpu->regs[VCPU_REGS_R11];
2344         regs->r12 = vcpu->regs[VCPU_REGS_R12];
2345         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2346         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2347         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2348 #endif
2349
2350         regs->rip = vcpu->rip;
2351         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2352
2353         /*
2354          * Don't leak debug flags in case they were set for guest debugging
2355          */
2356         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2357                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2358
2359         vcpu_put(vcpu);
2360
2361         return 0;
2362 }
2363
2364 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
2365                                    struct kvm_regs *regs)
2366 {
2367         vcpu_load(vcpu);
2368
2369         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2370         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2371         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2372         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2373         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2374         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2375         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2376         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2377 #ifdef CONFIG_X86_64
2378         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2379         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2380         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2381         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2382         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2383         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2384         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2385         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2386 #endif
2387
2388         vcpu->rip = regs->rip;
2389         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2390
2391         kvm_x86_ops->decache_regs(vcpu);
2392
2393         vcpu_put(vcpu);
2394
2395         return 0;
2396 }
2397
2398 static void get_segment(struct kvm_vcpu *vcpu,
2399                         struct kvm_segment *var, int seg)
2400 {
2401         kvm_x86_ops->get_segment(vcpu, var, seg);
2402 }
2403
2404 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2405                                     struct kvm_sregs *sregs)
2406 {
2407         struct descriptor_table dt;
2408         int pending_vec;
2409
2410         vcpu_load(vcpu);
2411
2412         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2413         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2414         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2415         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2416         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2417         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2418
2419         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2420         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2421
2422         kvm_x86_ops->get_idt(vcpu, &dt);
2423         sregs->idt.limit = dt.limit;
2424         sregs->idt.base = dt.base;
2425         kvm_x86_ops->get_gdt(vcpu, &dt);
2426         sregs->gdt.limit = dt.limit;
2427         sregs->gdt.base = dt.base;
2428
2429         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2430         sregs->cr0 = vcpu->cr0;
2431         sregs->cr2 = vcpu->cr2;
2432         sregs->cr3 = vcpu->cr3;
2433         sregs->cr4 = vcpu->cr4;
2434         sregs->cr8 = get_cr8(vcpu);
2435         sregs->efer = vcpu->shadow_efer;
2436         sregs->apic_base = kvm_get_apic_base(vcpu);
2437
2438         if (irqchip_in_kernel(vcpu->kvm)) {
2439                 memset(sregs->interrupt_bitmap, 0,
2440                        sizeof sregs->interrupt_bitmap);
2441                 pending_vec = kvm_x86_ops->get_irq(vcpu);
2442                 if (pending_vec >= 0)
2443                         set_bit(pending_vec,
2444                                 (unsigned long *)sregs->interrupt_bitmap);
2445         } else
2446                 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2447                        sizeof sregs->interrupt_bitmap);
2448
2449         vcpu_put(vcpu);
2450
2451         return 0;
2452 }
2453
2454 static void set_segment(struct kvm_vcpu *vcpu,
2455                         struct kvm_segment *var, int seg)
2456 {
2457         kvm_x86_ops->set_segment(vcpu, var, seg);
2458 }
2459
2460 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2461                                     struct kvm_sregs *sregs)
2462 {
2463         int mmu_reset_needed = 0;
2464         int i, pending_vec, max_bits;
2465         struct descriptor_table dt;
2466
2467         vcpu_load(vcpu);
2468
2469         dt.limit = sregs->idt.limit;
2470         dt.base = sregs->idt.base;
2471         kvm_x86_ops->set_idt(vcpu, &dt);
2472         dt.limit = sregs->gdt.limit;
2473         dt.base = sregs->gdt.base;
2474         kvm_x86_ops->set_gdt(vcpu, &dt);
2475
2476         vcpu->cr2 = sregs->cr2;
2477         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2478         vcpu->cr3 = sregs->cr3;
2479
2480         set_cr8(vcpu, sregs->cr8);
2481
2482         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2483 #ifdef CONFIG_X86_64
2484         kvm_x86_ops->set_efer(vcpu, sregs->efer);
2485 #endif
2486         kvm_set_apic_base(vcpu, sregs->apic_base);
2487
2488         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2489
2490         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2491         vcpu->cr0 = sregs->cr0;
2492         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2493
2494         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2495         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2496         if (!is_long_mode(vcpu) && is_pae(vcpu))
2497                 load_pdptrs(vcpu, vcpu->cr3);
2498
2499         if (mmu_reset_needed)
2500                 kvm_mmu_reset_context(vcpu);
2501
2502         if (!irqchip_in_kernel(vcpu->kvm)) {
2503                 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2504                        sizeof vcpu->irq_pending);
2505                 vcpu->irq_summary = 0;
2506                 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2507                         if (vcpu->irq_pending[i])
2508                                 __set_bit(i, &vcpu->irq_summary);
2509         } else {
2510                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2511                 pending_vec = find_first_bit(
2512                         (const unsigned long *)sregs->interrupt_bitmap,
2513                         max_bits);
2514                 /* Only pending external irq is handled here */
2515                 if (pending_vec < max_bits) {
2516                         kvm_x86_ops->set_irq(vcpu, pending_vec);
2517                         pr_debug("Set back pending irq %d\n",
2518                                  pending_vec);
2519                 }
2520         }
2521
2522         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2523         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2524         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2525         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2526         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2527         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2528
2529         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2530         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2531
2532         vcpu_put(vcpu);
2533
2534         return 0;
2535 }
2536
2537 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2538 {
2539         struct kvm_segment cs;
2540
2541         get_segment(vcpu, &cs, VCPU_SREG_CS);
2542         *db = cs.db;
2543         *l = cs.l;
2544 }
2545 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2546
2547 /*
2548  * Translate a guest virtual address to a guest physical address.
2549  */
2550 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2551                                     struct kvm_translation *tr)
2552 {
2553         unsigned long vaddr = tr->linear_address;
2554         gpa_t gpa;
2555
2556         vcpu_load(vcpu);
2557         mutex_lock(&vcpu->kvm->lock);
2558         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2559         tr->physical_address = gpa;
2560         tr->valid = gpa != UNMAPPED_GVA;
2561         tr->writeable = 1;
2562         tr->usermode = 0;
2563         mutex_unlock(&vcpu->kvm->lock);
2564         vcpu_put(vcpu);
2565
2566         return 0;
2567 }
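/*
 * Illustrative sketch (not part of this file): userspace reaches this through
 * the KVM_TRANSLATE vcpu ioctl, e.g. (error handling omitted):
 *
 *	struct kvm_translation tr = { .linear_address = gva };
 *
 *	ioctl(vcpu_fd, KVM_TRANSLATE, &tr);
 *	if (tr.valid)
 *		printf("gva %llx -> gpa %llx\n", gva, tr.physical_address);
 */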
2568
2569 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2570                                     struct kvm_interrupt *irq)
2571 {
2572         if (irq->irq < 0 || irq->irq >= 256)
2573                 return -EINVAL;
2574         if (irqchip_in_kernel(vcpu->kvm))
2575                 return -ENXIO;
2576         vcpu_load(vcpu);
2577
2578         set_bit(irq->irq, vcpu->irq_pending);
2579         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2580
2581         vcpu_put(vcpu);
2582
2583         return 0;
2584 }
2585
2586 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2587                                       struct kvm_debug_guest *dbg)
2588 {
2589         int r;
2590
2591         vcpu_load(vcpu);
2592
2593         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2594
2595         vcpu_put(vcpu);
2596
2597         return r;
2598 }
2599
2600 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2601                                     unsigned long address,
2602                                     int *type)
2603 {
2604         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2605         unsigned long pgoff;
2606         struct page *page;
2607
2608         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2609         if (pgoff == 0)
2610                 page = virt_to_page(vcpu->run);
2611         else if (pgoff == KVM_PIO_PAGE_OFFSET)
2612                 page = virt_to_page(vcpu->pio_data);
2613         else
2614                 return NOPAGE_SIGBUS;
2615         get_page(page);
2616         if (type != NULL)
2617                 *type = VM_FAULT_MINOR;
2618
2619         return page;
2620 }
2621
2622 static struct vm_operations_struct kvm_vcpu_vm_ops = {
2623         .nopage = kvm_vcpu_nopage,
2624 };
2625
2626 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2627 {
2628         vma->vm_ops = &kvm_vcpu_vm_ops;
2629         return 0;
2630 }
2631
2632 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2633 {
2634         struct kvm_vcpu *vcpu = filp->private_data;
2635
2636         fput(vcpu->kvm->filp);
2637         return 0;
2638 }
2639
2640 static struct file_operations kvm_vcpu_fops = {
2641         .release        = kvm_vcpu_release,
2642         .unlocked_ioctl = kvm_vcpu_ioctl,
2643         .compat_ioctl   = kvm_vcpu_ioctl,
2644         .mmap           = kvm_vcpu_mmap,
2645 };
2646
2647 /*
2648  * Allocates an inode for the vcpu.
2649  */
2650 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2651 {
2652         int fd, r;
2653         struct inode *inode;
2654         struct file *file;
2655
2656         r = anon_inode_getfd(&fd, &inode, &file,
2657                              "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2658         if (r)
2659                 return r;
2660         atomic_inc(&vcpu->kvm->filp->f_count);
2661         return fd;
2662 }
2663
2664 /*
2665  * Creates some virtual cpus.  Good luck creating more than one.
2666  */
2667 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2668 {
2669         int r;
2670         struct kvm_vcpu *vcpu;
2671
2672         if (!valid_vcpu(n))
2673                 return -EINVAL;
2674
2675         vcpu = kvm_x86_ops->vcpu_create(kvm, n);
2676         if (IS_ERR(vcpu))
2677                 return PTR_ERR(vcpu);
2678
2679         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2680
2681         /* We do fxsave: this must be aligned. */
2682         BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2683
2684         vcpu_load(vcpu);
2685         r = kvm_x86_ops->vcpu_reset(vcpu);
2686         if (r == 0)
2687                 r = kvm_mmu_setup(vcpu);
2688         vcpu_put(vcpu);
2689         if (r < 0)
2690                 goto free_vcpu;
2691
2692         mutex_lock(&kvm->lock);
2693         if (kvm->vcpus[n]) {
2694                 r = -EEXIST;
2695                 mutex_unlock(&kvm->lock);
2696                 goto mmu_unload;
2697         }
2698         kvm->vcpus[n] = vcpu;
2699         mutex_unlock(&kvm->lock);
2700
2701         /* Now it's all set up, let userspace reach it */
2702         r = create_vcpu_fd(vcpu);
2703         if (r < 0)
2704                 goto unlink;
2705         return r;
2706
2707 unlink:
2708         mutex_lock(&kvm->lock);
2709         kvm->vcpus[n] = NULL;
2710         mutex_unlock(&kvm->lock);
2711
2712 mmu_unload:
2713         vcpu_load(vcpu);
2714         kvm_mmu_unload(vcpu);
2715         vcpu_put(vcpu);
2716
2717 free_vcpu:
2718         kvm_x86_ops->vcpu_free(vcpu);
2719         return r;
2720 }
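/*
 * Illustrative sketch (not part of this file): userspace creates and maps a
 * vcpu roughly like this (error handling omitted):
 *
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *	long sz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *
 * The mapping covers the kvm_run structure plus the pio data page served by
 * kvm_vcpu_nopage() above.
 */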
2721
2722 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2723 {
2724         if (sigset) {
2725                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2726                 vcpu->sigset_active = 1;
2727                 vcpu->sigset = *sigset;
2728         } else
2729                 vcpu->sigset_active = 0;
2730         return 0;
2731 }
2732
2733 /*
2734  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2735  * we have asm/x86/processor.h
2736  */
2737 struct fxsave {
2738         u16     cwd;
2739         u16     swd;
2740         u16     twd;
2741         u16     fop;
2742         u64     rip;
2743         u64     rdp;
2744         u32     mxcsr;
2745         u32     mxcsr_mask;
2746         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2747 #ifdef CONFIG_X86_64
2748         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2749 #else
2750         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2751 #endif
2752 };
2753
2754 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2755 {
2756         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2757
2758         vcpu_load(vcpu);
2759
2760         memcpy(fpu->fpr, fxsave->st_space, 128);
2761         fpu->fcw = fxsave->cwd;
2762         fpu->fsw = fxsave->swd;
2763         fpu->ftwx = fxsave->twd;
2764         fpu->last_opcode = fxsave->fop;
2765         fpu->last_ip = fxsave->rip;
2766         fpu->last_dp = fxsave->rdp;
2767         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2768
2769         vcpu_put(vcpu);
2770
2771         return 0;
2772 }
2773
2774 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2775 {
2776         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2777
2778         vcpu_load(vcpu);
2779
2780         memcpy(fxsave->st_space, fpu->fpr, 128);
2781         fxsave->cwd = fpu->fcw;
2782         fxsave->swd = fpu->fsw;
2783         fxsave->twd = fpu->ftwx;
2784         fxsave->fop = fpu->last_opcode;
2785         fxsave->rip = fpu->last_ip;
2786         fxsave->rdp = fpu->last_dp;
2787         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2788
2789         vcpu_put(vcpu);
2790
2791         return 0;
2792 }
2793
2794 static long kvm_vcpu_ioctl(struct file *filp,
2795                            unsigned int ioctl, unsigned long arg)
2796 {
2797         struct kvm_vcpu *vcpu = filp->private_data;
2798         void __user *argp = (void __user *)arg;
2799         int r;
2800
2801         switch (ioctl) {
2802         case KVM_RUN:
2803                 r = -EINVAL;
2804                 if (arg)
2805                         goto out;
2806                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2807                 break;
2808         case KVM_GET_REGS: {
2809                 struct kvm_regs kvm_regs;
2810
2811                 memset(&kvm_regs, 0, sizeof kvm_regs);
2812                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2813                 if (r)
2814                         goto out;
2815                 r = -EFAULT;
2816                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2817                         goto out;
2818                 r = 0;
2819                 break;
2820         }
2821         case KVM_SET_REGS: {
2822                 struct kvm_regs kvm_regs;
2823
2824                 r = -EFAULT;
2825                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2826                         goto out;
2827                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2828                 if (r)
2829                         goto out;
2830                 r = 0;
2831                 break;
2832         }
2833         case KVM_GET_SREGS: {
2834                 struct kvm_sregs kvm_sregs;
2835
2836                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2837                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2838                 if (r)
2839                         goto out;
2840                 r = -EFAULT;
2841                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2842                         goto out;
2843                 r = 0;
2844                 break;
2845         }
2846         case KVM_SET_SREGS: {
2847                 struct kvm_sregs kvm_sregs;
2848
2849                 r = -EFAULT;
2850                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2851                         goto out;
2852                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2853                 if (r)
2854                         goto out;
2855                 r = 0;
2856                 break;
2857         }
2858         case KVM_TRANSLATE: {
2859                 struct kvm_translation tr;
2860
2861                 r = -EFAULT;
2862                 if (copy_from_user(&tr, argp, sizeof tr))
2863                         goto out;
2864                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2865                 if (r)
2866                         goto out;
2867                 r = -EFAULT;
2868                 if (copy_to_user(argp, &tr, sizeof tr))
2869                         goto out;
2870                 r = 0;
2871                 break;
2872         }
2873         case KVM_INTERRUPT: {
2874                 struct kvm_interrupt irq;
2875
2876                 r = -EFAULT;
2877                 if (copy_from_user(&irq, argp, sizeof irq))
2878                         goto out;
2879                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2880                 if (r)
2881                         goto out;
2882                 r = 0;
2883                 break;
2884         }
2885         case KVM_DEBUG_GUEST: {
2886                 struct kvm_debug_guest dbg;
2887
2888                 r = -EFAULT;
2889                 if (copy_from_user(&dbg, argp, sizeof dbg))
2890                         goto out;
2891                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2892                 if (r)
2893                         goto out;
2894                 r = 0;
2895                 break;
2896         }
2897         case KVM_SET_SIGNAL_MASK: {
2898                 struct kvm_signal_mask __user *sigmask_arg = argp;
2899                 struct kvm_signal_mask kvm_sigmask;
2900                 sigset_t sigset, *p;
2901
2902                 p = NULL;
2903                 if (argp) {
2904                         r = -EFAULT;
2905                         if (copy_from_user(&kvm_sigmask, argp,
2906                                            sizeof kvm_sigmask))
2907                                 goto out;
2908                         r = -EINVAL;
2909                         if (kvm_sigmask.len != sizeof sigset)
2910                                 goto out;
2911                         r = -EFAULT;
2912                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2913                                            sizeof sigset))
2914                                 goto out;
2915                         p = &sigset;
2916                 }
2917                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2918                 break;
2919         }
2920         case KVM_GET_FPU: {
2921                 struct kvm_fpu fpu;
2922
2923                 memset(&fpu, 0, sizeof fpu);
2924                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2925                 if (r)
2926                         goto out;
2927                 r = -EFAULT;
2928                 if (copy_to_user(argp, &fpu, sizeof fpu))
2929                         goto out;
2930                 r = 0;
2931                 break;
2932         }
2933         case KVM_SET_FPU: {
2934                 struct kvm_fpu fpu;
2935
2936                 r = -EFAULT;
2937                 if (copy_from_user(&fpu, argp, sizeof fpu))
2938                         goto out;
2939                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2940                 if (r)
2941                         goto out;
2942                 r = 0;
2943                 break;
2944         }
2945         default:
2946                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
2947         }
2948 out:
2949         return r;
2950 }
2951
2952 static long kvm_vm_ioctl(struct file *filp,
2953                            unsigned int ioctl, unsigned long arg)
2954 {
2955         struct kvm *kvm = filp->private_data;
2956         void __user *argp = (void __user *)arg;
2957         int r = -EINVAL;
2958
2959         switch (ioctl) {
2960         case KVM_CREATE_VCPU:
2961                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2962                 if (r < 0)
2963                         goto out;
2964                 break;
2965         case KVM_SET_MEMORY_REGION: {
2966                 struct kvm_memory_region kvm_mem;
2967                 struct kvm_userspace_memory_region kvm_userspace_mem;
2968
2969                 r = -EFAULT;
2970                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2971                         goto out;
2972                 kvm_userspace_mem.slot = kvm_mem.slot;
2973                 kvm_userspace_mem.flags = kvm_mem.flags;
2974                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2975                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2976                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2977                 if (r)
2978                         goto out;
2979                 break;
2980         }
2981         case KVM_SET_USER_MEMORY_REGION: {
2982                 struct kvm_userspace_memory_region kvm_userspace_mem;
2983
2984                 r = -EFAULT;
2985                 if (copy_from_user(&kvm_userspace_mem, argp,
2986                                                 sizeof kvm_userspace_mem))
2987                         goto out;
2988
2989                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
2990                 if (r)
2991                         goto out;
2992                 break;
2993         }
2994         case KVM_SET_NR_MMU_PAGES:
2995                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2996                 if (r)
2997                         goto out;
2998                 break;
2999         case KVM_GET_NR_MMU_PAGES:
3000                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
3001                 break;
3002         case KVM_GET_DIRTY_LOG: {
3003                 struct kvm_dirty_log log;
3004
3005                 r = -EFAULT;
3006                 if (copy_from_user(&log, argp, sizeof log))
3007                         goto out;
3008                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3009                 if (r)
3010                         goto out;
3011                 break;
3012         }
3013         case KVM_SET_MEMORY_ALIAS: {
3014                 struct kvm_memory_alias alias;
3015
3016                 r = -EFAULT;
3017                 if (copy_from_user(&alias, argp, sizeof alias))
3018                         goto out;
3019                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
3020                 if (r)
3021                         goto out;
3022                 break;
3023         }
3024         case KVM_CREATE_IRQCHIP:
3025                 r = -ENOMEM;
3026                 kvm->vpic = kvm_create_pic(kvm);
3027                 if (kvm->vpic) {
3028                         r = kvm_ioapic_init(kvm);
3029                         if (r) {
3030                                 kfree(kvm->vpic);
3031                                 kvm->vpic = NULL;
3032                                 goto out;
3033                         }
3034                 } else
3035                         goto out;
3036                 break;
3037         case KVM_IRQ_LINE: {
3038                 struct kvm_irq_level irq_event;
3039
3040                 r = -EFAULT;
3041                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3042                         goto out;
3043                 if (irqchip_in_kernel(kvm)) {
3044                         mutex_lock(&kvm->lock);
3045                         if (irq_event.irq < 16)
3046                                 kvm_pic_set_irq(pic_irqchip(kvm),
3047                                         irq_event.irq,
3048                                         irq_event.level);
3049                         kvm_ioapic_set_irq(kvm->vioapic,
3050                                         irq_event.irq,
3051                                         irq_event.level);
3052                         mutex_unlock(&kvm->lock);
3053                         r = 0;
3054                 }
3055                 break;
3056         }
3057         case KVM_GET_IRQCHIP: {
3058                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3059                 struct kvm_irqchip chip;
3060
3061                 r = -EFAULT;
3062                 if (copy_from_user(&chip, argp, sizeof chip))
3063                         goto out;
3064                 r = -ENXIO;
3065                 if (!irqchip_in_kernel(kvm))
3066                         goto out;
3067                 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
3068                 if (r)
3069                         goto out;
3070                 r = -EFAULT;
3071                 if (copy_to_user(argp, &chip, sizeof chip))
3072                         goto out;
3073                 r = 0;
3074                 break;
3075         }
3076         case KVM_SET_IRQCHIP: {
3077                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3078                 struct kvm_irqchip chip;
3079
3080                 r = -EFAULT;
3081                 if (copy_from_user(&chip, argp, sizeof chip))
3082                         goto out;
3083                 r = -ENXIO;
3084                 if (!irqchip_in_kernel(kvm))
3085                         goto out;
3086                 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
3087                 if (r)
3088                         goto out;
3089                 r = 0;
3090                 break;
3091         }
3092         default:
3093                 ;
3094         }
3095 out:
3096         return r;
3097 }
3098
3099 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
3100                                   unsigned long address,
3101                                   int *type)
3102 {
3103         struct kvm *kvm = vma->vm_file->private_data;
3104         unsigned long pgoff;
3105         struct page *page;
3106
3107         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3108         if (!kvm_is_visible_gfn(kvm, pgoff))
3109                 return NOPAGE_SIGBUS;
3110         page = gfn_to_page(kvm, pgoff);
3111         if (is_error_page(page)) {
3112                 kvm_release_page(page);
3113                 return NOPAGE_SIGBUS;
3114         }
3115         if (type != NULL)
3116                 *type = VM_FAULT_MINOR;
3117
3118         return page;
3119 }
3120
3121 static struct vm_operations_struct kvm_vm_vm_ops = {
3122         .nopage = kvm_vm_nopage,
3123 };
3124
3125 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
3126 {
3127         vma->vm_ops = &kvm_vm_vm_ops;
3128         return 0;
3129 }
3130
3131 static struct file_operations kvm_vm_fops = {
3132         .release        = kvm_vm_release,
3133         .unlocked_ioctl = kvm_vm_ioctl,
3134         .compat_ioctl   = kvm_vm_ioctl,
3135         .mmap           = kvm_vm_mmap,
3136 };
3137
3138 static int kvm_dev_ioctl_create_vm(void)
3139 {
3140         int fd, r;
3141         struct inode *inode;
3142         struct file *file;
3143         struct kvm *kvm;
3144
3145         kvm = kvm_create_vm();
3146         if (IS_ERR(kvm))
3147                 return PTR_ERR(kvm);
3148         r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
3149         if (r) {
3150                 kvm_destroy_vm(kvm);
3151                 return r;
3152         }
3153
3154         kvm->filp = file;
3155
3156         return fd;
3157 }
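/*
 * Illustrative sketch (not part of this file): the fd returned here carries
 * all further VM-level ioctls, e.g. (error handling omitted):
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR);
 *	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
 *	ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 */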
3158
3159 static long kvm_dev_ioctl(struct file *filp,
3160                           unsigned int ioctl, unsigned long arg)
3161 {
3162         void __user *argp = (void __user *)arg;
3163         long r = -EINVAL;
3164
3165         switch (ioctl) {
3166         case KVM_GET_API_VERSION:
3167                 r = -EINVAL;
3168                 if (arg)
3169                         goto out;
3170                 r = KVM_API_VERSION;
3171                 break;
3172         case KVM_CREATE_VM:
3173                 r = -EINVAL;
3174                 if (arg)
3175                         goto out;
3176                 r = kvm_dev_ioctl_create_vm();
3177                 break;
3178         case KVM_CHECK_EXTENSION: {
3179                 int ext = (long)argp;
3180
3181                 switch (ext) {
3182                 case KVM_CAP_IRQCHIP:
3183                 case KVM_CAP_HLT:
3184                 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
3185                 case KVM_CAP_USER_MEMORY:
3186                         r = 1;
3187                         break;
3188                 default:
3189                         r = 0;
3190                         break;
3191                 }
3192                 break;
3193         }
3194         case KVM_GET_VCPU_MMAP_SIZE:
3195                 r = -EINVAL;
3196                 if (arg)
3197                         goto out;
3198                 r = 2 * PAGE_SIZE;
3199                 break;
3200         default:
3201                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
3202         }
3203 out:
3204         return r;
3205 }
3206
3207 static struct file_operations kvm_chardev_ops = {
3208         .unlocked_ioctl = kvm_dev_ioctl,
3209         .compat_ioctl   = kvm_dev_ioctl,
3210 };
3211
3212 static struct miscdevice kvm_dev = {
3213         KVM_MINOR,
3214         "kvm",
3215         &kvm_chardev_ops,
3216 };
3217
3218 /*
3219  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3220  * cached on it.
3221  */
3222 static void decache_vcpus_on_cpu(int cpu)
3223 {
3224         struct kvm *vm;
3225         struct kvm_vcpu *vcpu;
3226         int i;
3227
3228         spin_lock(&kvm_lock);
3229         list_for_each_entry(vm, &vm_list, vm_list)
3230                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3231                         vcpu = vm->vcpus[i];
3232                         if (!vcpu)
3233                                 continue;
3234                         /*
3235                          * If the vcpu is locked, then it is running on some
3236                          * other cpu and therefore it is not cached on the
3237                          * cpu in question.
3238                          *
3239                          * If it's not locked, check the last cpu it executed
3240                          * on.
3241                          */
3242                         if (mutex_trylock(&vcpu->mutex)) {
3243                                 if (vcpu->cpu == cpu) {
3244                                         kvm_x86_ops->vcpu_decache(vcpu);
3245                                         vcpu->cpu = -1;
3246                                 }
3247                                 mutex_unlock(&vcpu->mutex);
3248                         }
3249                 }
3250         spin_unlock(&kvm_lock);
3251 }
3252
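/*
 * Enable hardware virtualization on the current cpu if it is not already
 * enabled; hardware_disable() is the inverse and additionally decaches
 * any vcpus that last ran on this cpu.  Both are meant to be invoked via
 * on_each_cpu()/smp_call_function_single(), hence the unused argument.
 */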
3253 static void hardware_enable(void *junk)
3254 {
3255         int cpu = raw_smp_processor_id();
3256
3257         if (cpu_isset(cpu, cpus_hardware_enabled))
3258                 return;
3259         cpu_set(cpu, cpus_hardware_enabled);
3260         kvm_x86_ops->hardware_enable(NULL);
3261 }
3262
3263 static void hardware_disable(void *junk)
3264 {
3265         int cpu = raw_smp_processor_id();
3266
3267         if (!cpu_isset(cpu, cpus_hardware_enabled))
3268                 return;
3269         cpu_clear(cpu, cpus_hardware_enabled);
3270         decache_vcpus_on_cpu(cpu);
3271         kvm_x86_ops->hardware_disable(NULL);
3272 }
3273
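/*
 * CPU hotplug notifier: virtualization is turned off before a cpu goes
 * away and turned back on when one comes (back) online.  CPU_DYING runs
 * on the dying cpu itself, so hardware_disable() is called directly; the
 * other cases run on another cpu and use smp_call_function_single().
 */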
3274 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
3275                            void *v)
3276 {
3277         int cpu = (long)v;
3278
3279         switch (val) {
3280         case CPU_DYING:
3281         case CPU_DYING_FROZEN:
3282                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3283                        cpu);
3284                 hardware_disable(NULL);
3285                 break;
3286         case CPU_UP_CANCELED:
3287         case CPU_UP_CANCELED_FROZEN:
3288                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3289                        cpu);
3290                 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3291                 break;
3292         case CPU_ONLINE:
3293         case CPU_ONLINE_FROZEN:
3294                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3295                        cpu);
3296                 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3297                 break;
3298         }
3299         return NOTIFY_OK;
3300 }
3301
3302 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3303                       void *v)
3304 {
3305         if (val == SYS_RESTART) {
3306                 /*
3307                  * Some BIOSes (including the author's) hang on reboot
3308                  * if the cpu is still in vmx root mode.
3309                  */
3310                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3311                 on_each_cpu(hardware_disable, NULL, 0, 1);
3312         }
3313         return NOTIFY_OK;
3314 }
3315
3316 static struct notifier_block kvm_reboot_notifier = {
3317         .notifier_call = kvm_reboot,
3318         .priority = 0,
3319 };
3320
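/*
 * kvm_io_bus: a flat array of in-kernel I/O devices.  Lookup is a linear
 * scan that asks each device, via its in_range() callback, whether it
 * claims the given guest physical address.
 */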
3321 void kvm_io_bus_init(struct kvm_io_bus *bus)
3322 {
3323         memset(bus, 0, sizeof(*bus));
3324 }
3325
3326 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3327 {
3328         int i;
3329
3330         for (i = 0; i < bus->dev_count; i++) {
3331                 struct kvm_io_device *pos = bus->devs[i];
3332
3333                 kvm_iodevice_destructor(pos);
3334         }
3335 }
3336
3337 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3338 {
3339         int i;
3340
3341         for (i = 0; i < bus->dev_count; i++) {
3342                 struct kvm_io_device *pos = bus->devs[i];
3343
3344                 if (pos->in_range(pos, addr))
3345                         return pos;
3346         }
3347
3348         return NULL;
3349 }
3350
3351 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3352 {
3353         BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3354
3355         bus->devs[bus->dev_count++] = dev;
3356 }
3357
3358 static struct notifier_block kvm_cpu_notifier = {
3359         .notifier_call = kvm_cpu_hotplug,
3360         .priority = 20, /* must be > scheduler priority */
3361 };
3362
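/*
 * Back end for the debugfs statistics files: sum a single per-vcpu
 * counter, identified by its byte offset into struct kvm_vcpu, over all
 * vcpus of all vms.
 */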
3363 static u64 stat_get(void *_offset)
3364 {
3365         unsigned offset = (long)_offset;
3366         u64 total = 0;
3367         struct kvm *kvm;
3368         struct kvm_vcpu *vcpu;
3369         int i;
3370
3371         spin_lock(&kvm_lock);
3372         list_for_each_entry(kvm, &vm_list, vm_list)
3373                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3374                         vcpu = kvm->vcpus[i];
3375                         if (vcpu)
3376                                 total += *(u32 *)((void *)vcpu + offset);
3377                 }
3378         spin_unlock(&kvm_lock);
3379         return total;
3380 }
3381
3382 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
3383
3384 static __init void kvm_init_debug(void)
3385 {
3386         struct kvm_stats_debugfs_item *p;
3387
3388         debugfs_dir = debugfs_create_dir("kvm", NULL);
3389         for (p = debugfs_entries; p->name; ++p)
3390                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3391                                                 (void *)(long)p->offset,
3392                                                 &stat_fops);
3393 }
3394
3395 static void kvm_exit_debug(void)
3396 {
3397         struct kvm_stats_debugfs_item *p;
3398
3399         for (p = debugfs_entries; p->name; ++p)
3400                 debugfs_remove(p->dentry);
3401         debugfs_remove(debugfs_dir);
3402 }
3403
3404 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3405 {
3406         hardware_disable(NULL);
3407         return 0;
3408 }
3409
3410 static int kvm_resume(struct sys_device *dev)
3411 {
3412         hardware_enable(NULL);
3413         return 0;
3414 }
3415
3416 static struct sysdev_class kvm_sysdev_class = {
3417         .name = "kvm",
3418         .suspend = kvm_suspend,
3419         .resume = kvm_resume,
3420 };
3421
3422 static struct sys_device kvm_sysdev = {
3423         .id = 0,
3424         .cls = &kvm_sysdev_class,
3425 };
3426
3427 struct page *bad_page;
3428
3429 static inline
3430 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3431 {
3432         return container_of(pn, struct kvm_vcpu, preempt_notifier);
3433 }
3434
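/*
 * Preempt notifier hooks: reload the vcpu's hardware state when its task
 * is scheduled back in, and let the arch code save it when the task is
 * scheduled out.
 */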
3435 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3436 {
3437         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3438
3439         kvm_x86_ops->vcpu_load(vcpu, cpu);
3440 }
3441
3442 static void kvm_sched_out(struct preempt_notifier *pn,
3443                           struct task_struct *next)
3444 {
3445         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3446
3447         kvm_x86_ops->vcpu_put(vcpu);
3448 }
3449
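/*
 * Called by the hardware-specific module (e.g. the VMX or SVM one) to
 * register its kvm_x86_ops, verify processor support, enable
 * virtualization on all online cpus and register the notifiers, the
 * sysdev and the /dev/kvm device.
 */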
3450 int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
3451                   struct module *module)
3452 {
3453         int r;
3454         int cpu;
3455
3456         if (kvm_x86_ops) {
3457                 printk(KERN_ERR "kvm: a hardware support module is already loaded\n");
3458                 return -EEXIST;
3459         }
3460
3461         if (!ops->cpu_has_kvm_support()) {
3462                 printk(KERN_ERR "kvm: no hardware support\n");
3463                 return -EOPNOTSUPP;
3464         }
3465         if (ops->disabled_by_bios()) {
3466                 printk(KERN_ERR "kvm: disabled by bios\n");
3467                 return -EOPNOTSUPP;
3468         }
3469
3470         kvm_x86_ops = ops;
3471
3472         r = kvm_x86_ops->hardware_setup();
3473         if (r < 0)
3474                 goto out;
3475
3476         for_each_online_cpu(cpu) {
3477                 smp_call_function_single(cpu,
3478                                 kvm_x86_ops->check_processor_compatibility,
3479                                 &r, 0, 1);
3480                 if (r < 0)
3481                         goto out_free_0;
3482         }
3483
3484         on_each_cpu(hardware_enable, NULL, 0, 1);
3485         r = register_cpu_notifier(&kvm_cpu_notifier);
3486         if (r)
3487                 goto out_free_1;
3488         register_reboot_notifier(&kvm_reboot_notifier);
3489
3490         r = sysdev_class_register(&kvm_sysdev_class);
3491         if (r)
3492                 goto out_free_2;
3493
3494         r = sysdev_register(&kvm_sysdev);
3495         if (r)
3496                 goto out_free_3;
3497
3498         /* A kmem cache lets us meet the alignment requirements of fx_save. */
3499         kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3500                                            __alignof__(struct kvm_vcpu), 0, 0);
3501         if (!kvm_vcpu_cache) {
3502                 r = -ENOMEM;
3503                 goto out_free_4;
3504         }
3505
3506         kvm_chardev_ops.owner = module;
3507
3508         r = misc_register(&kvm_dev);
3509         if (r) {
3510                 printk(KERN_ERR "kvm: misc device register failed\n");
3511                 goto out_free;
3512         }
3513
3514         kvm_preempt_ops.sched_in = kvm_sched_in;
3515         kvm_preempt_ops.sched_out = kvm_sched_out;
3516
3517         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
3518
3519         return 0;
3520
3521 out_free:
3522         kmem_cache_destroy(kvm_vcpu_cache);
3523 out_free_4:
3524         sysdev_unregister(&kvm_sysdev);
3525 out_free_3:
3526         sysdev_class_unregister(&kvm_sysdev_class);
3527 out_free_2:
3528         unregister_reboot_notifier(&kvm_reboot_notifier);
3529         unregister_cpu_notifier(&kvm_cpu_notifier);
3530 out_free_1:
3531         on_each_cpu(hardware_disable, NULL, 0, 1);
3532 out_free_0:
3533         kvm_x86_ops->hardware_unsetup();
3534 out:
3535         kvm_x86_ops = NULL;
3536         return r;
3537 }
3538 EXPORT_SYMBOL_GPL(kvm_init_x86);
3539
3540 void kvm_exit_x86(void)
3541 {
3542         misc_deregister(&kvm_dev);
3543         kmem_cache_destroy(kvm_vcpu_cache);
3544         sysdev_unregister(&kvm_sysdev);
3545         sysdev_class_unregister(&kvm_sysdev_class);
3546         unregister_reboot_notifier(&kvm_reboot_notifier);
3547         unregister_cpu_notifier(&kvm_cpu_notifier);
3548         on_each_cpu(hardware_disable, NULL, 0, 1);
3549         kvm_x86_ops->hardware_unsetup();
3550         kvm_x86_ops = NULL;
3551 }
3552 EXPORT_SYMBOL_GPL(kvm_exit_x86);
3553
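/*
 * Module init/exit for the core module: set up the mmu caches, debugfs
 * statistics, arch-specific state and the shared bad_page.  The hardware
 * modules hook in later through kvm_init_x86()/kvm_exit_x86().
 */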
3554 static __init int kvm_init(void)
3555 {
3556         int r;
3557
3558         r = kvm_mmu_module_init();
3559         if (r)
3560                 goto out4;
3561
3562         kvm_init_debug();
3563
3564         kvm_arch_init();
3565
3566         bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3567
3568         if (bad_page == NULL) {
3569                 r = -ENOMEM;
3570                 goto out;
3571         }
3572
3573         return 0;
3574
3575 out:
3576         kvm_exit_debug();
3577         kvm_mmu_module_exit();
3578 out4:
3579         return r;
3580 }
3581
3582 static __exit void kvm_exit(void)
3583 {
3584         kvm_exit_debug();
3585         __free_page(bad_page);
3586         kvm_mmu_module_exit();
3587 }
3588
3589 module_init(kvm_init)
3590 module_exit(kvm_exit)