KVM: Portability: Move x86 instruction emulation code to x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19 #include "x86.h"
20 #include "x86_emulate.h"
21 #include "irq.h"
22
23 #include <linux/kvm.h>
24 #include <linux/module.h>
25 #include <linux/errno.h>
26 #include <linux/percpu.h>
27 #include <linux/gfp.h>
28 #include <linux/mm.h>
29 #include <linux/miscdevice.h>
30 #include <linux/vmalloc.h>
31 #include <linux/reboot.h>
32 #include <linux/debugfs.h>
33 #include <linux/highmem.h>
34 #include <linux/file.h>
35 #include <linux/sysdev.h>
36 #include <linux/cpu.h>
37 #include <linux/sched.h>
38 #include <linux/cpumask.h>
39 #include <linux/smp.h>
40 #include <linux/anon_inodes.h>
41 #include <linux/profile.h>
42 #include <linux/kvm_para.h>
43 #include <linux/pagemap.h>
44 #include <linux/mman.h>
45
46 #include <asm/processor.h>
47 #include <asm/msr.h>
48 #include <asm/io.h>
49 #include <asm/uaccess.h>
50 #include <asm/desc.h>
51
52 MODULE_AUTHOR("Qumranet");
53 MODULE_LICENSE("GPL");
54
55 static DEFINE_SPINLOCK(kvm_lock);
56 static LIST_HEAD(vm_list);
57
58 static cpumask_t cpus_hardware_enabled;
59
60 struct kvm_x86_ops *kvm_x86_ops;
61 struct kmem_cache *kvm_vcpu_cache;
62 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
63
64 static __read_mostly struct preempt_ops kvm_preempt_ops;
65
66 static struct dentry *debugfs_dir;
67
68 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
69                            unsigned long arg);
70
71 static inline int valid_vcpu(int n)
72 {
73         return likely(n >= 0 && n < KVM_MAX_VCPUS);
74 }
75
76 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
77 {
78         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
79                 return;
80
81         vcpu->guest_fpu_loaded = 1;
82         fx_save(&vcpu->host_fx_image);
83         fx_restore(&vcpu->guest_fx_image);
84 }
85 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
86
87 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
88 {
89         if (!vcpu->guest_fpu_loaded)
90                 return;
91
92         vcpu->guest_fpu_loaded = 0;
93         fx_save(&vcpu->guest_fx_image);
94         fx_restore(&vcpu->host_fx_image);
95 }
96 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
97
98 /*
99  * Switches to the specified vcpu, until a matching vcpu_put().
100  */
101 void vcpu_load(struct kvm_vcpu *vcpu)
102 {
103         int cpu;
104
105         mutex_lock(&vcpu->mutex);
106         cpu = get_cpu();
107         preempt_notifier_register(&vcpu->preempt_notifier);
108         kvm_arch_vcpu_load(vcpu, cpu);
109         put_cpu();
110 }
111
112 void vcpu_put(struct kvm_vcpu *vcpu)
113 {
114         preempt_disable();
115         kvm_arch_vcpu_put(vcpu);
116         preempt_notifier_unregister(&vcpu->preempt_notifier);
117         preempt_enable();
118         mutex_unlock(&vcpu->mutex);
119 }
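/*
 * Usage note (added, illustrative): callers bracket access to per-vcpu state
 * with vcpu_load()/vcpu_put(), as the vcpu ioctl handlers below do:
 *
 *	vcpu_load(vcpu);
 *	kvm_x86_ops->cache_regs(vcpu);
 *	...inspect or modify vcpu state...
 *	vcpu_put(vcpu);
 *
 * vcpu_load() takes vcpu->mutex and registers the preempt notifier, so the
 * architecture code is notified whenever the loaded vcpu is scheduled in
 * or out.
 */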
120
121 static void ack_flush(void *_completed)
122 {
123 }
124
125 void kvm_flush_remote_tlbs(struct kvm *kvm)
126 {
127         int i, cpu;
128         cpumask_t cpus;
129         struct kvm_vcpu *vcpu;
130
131         cpus_clear(cpus);
132         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
133                 vcpu = kvm->vcpus[i];
134                 if (!vcpu)
135                         continue;
136                 if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
137                         continue;
138                 cpu = vcpu->cpu;
139                 if (cpu != -1 && cpu != raw_smp_processor_id())
140                         cpu_set(cpu, cpus);
141         }
142         smp_call_function_mask(cpus, ack_flush, NULL, 1);
143 }
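/*
 * Note (added): kvm_flush_remote_tlbs() does not flush anything itself.  It
 * sets KVM_REQ_TLB_FLUSH in each vcpu's request bitmap and sends an empty
 * IPI (ack_flush) to every physical cpu currently running one of this VM's
 * vcpus, kicking them out of guest mode; the next __vcpu_run() iteration
 * sees the request bit and calls kvm_x86_ops->tlb_flush() before re-entry.
 */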
144
145 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
146 {
147         struct page *page;
148         int r;
149
150         mutex_init(&vcpu->mutex);
151         vcpu->cpu = -1;
152         vcpu->mmu.root_hpa = INVALID_PAGE;
153         vcpu->kvm = kvm;
154         vcpu->vcpu_id = id;
155         if (!irqchip_in_kernel(kvm) || id == 0)
156                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
157         else
158                 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
159         init_waitqueue_head(&vcpu->wq);
160
161         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
162         if (!page) {
163                 r = -ENOMEM;
164                 goto fail;
165         }
166         vcpu->run = page_address(page);
167
168         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
169         if (!page) {
170                 r = -ENOMEM;
171                 goto fail_free_run;
172         }
173         vcpu->pio_data = page_address(page);
174
175         r = kvm_mmu_create(vcpu);
176         if (r < 0)
177                 goto fail_free_pio_data;
178
179         if (irqchip_in_kernel(kvm)) {
180                 r = kvm_create_lapic(vcpu);
181                 if (r < 0)
182                         goto fail_mmu_destroy;
183         }
184
185         return 0;
186
187 fail_mmu_destroy:
188         kvm_mmu_destroy(vcpu);
189 fail_free_pio_data:
190         free_page((unsigned long)vcpu->pio_data);
191 fail_free_run:
192         free_page((unsigned long)vcpu->run);
193 fail:
194         return r;
195 }
196 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
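/*
 * Note (added): the two pages allocated above back the userspace mapping of
 * the vcpu fd: page 0 holds struct kvm_run and page KVM_PIO_PAGE_OFFSET
 * holds the PIO transfer buffer (see kvm_vcpu_nopage() below), which is why
 * KVM_GET_VCPU_MMAP_SIZE reports 2 * PAGE_SIZE.
 */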
197
198 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
199 {
200         kvm_free_lapic(vcpu);
201         kvm_mmu_destroy(vcpu);
202         free_page((unsigned long)vcpu->pio_data);
203         free_page((unsigned long)vcpu->run);
204 }
205 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
206
207 static struct kvm *kvm_create_vm(void)
208 {
209         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
210
211         if (!kvm)
212                 return ERR_PTR(-ENOMEM);
213
214         kvm_io_bus_init(&kvm->pio_bus);
215         mutex_init(&kvm->lock);
216         INIT_LIST_HEAD(&kvm->active_mmu_pages);
217         kvm_io_bus_init(&kvm->mmio_bus);
218         spin_lock(&kvm_lock);
219         list_add(&kvm->vm_list, &vm_list);
220         spin_unlock(&kvm_lock);
221         return kvm;
222 }
223
224 /*
225  * Free any memory in @free but not in @dont.
226  */
227 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
228                                   struct kvm_memory_slot *dont)
229 {
230         if (!dont || free->rmap != dont->rmap)
231                 vfree(free->rmap);
232
233         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
234                 vfree(free->dirty_bitmap);
235
236         free->npages = 0;
237         free->dirty_bitmap = NULL;
238         free->rmap = NULL;
239 }
240
241 static void kvm_free_physmem(struct kvm *kvm)
242 {
243         int i;
244
245         for (i = 0; i < kvm->nmemslots; ++i)
246                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
247 }
248
249 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
250 {
251         vcpu_load(vcpu);
252         kvm_mmu_unload(vcpu);
253         vcpu_put(vcpu);
254 }
255
256 static void kvm_free_vcpus(struct kvm *kvm)
257 {
258         unsigned int i;
259
260         /*
261          * Unpin any mmu pages first.
262          */
263         for (i = 0; i < KVM_MAX_VCPUS; ++i)
264                 if (kvm->vcpus[i])
265                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
266         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
267                 if (kvm->vcpus[i]) {
268                         kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
269                         kvm->vcpus[i] = NULL;
270                 }
271         }
272
273 }
274
275 static void kvm_destroy_vm(struct kvm *kvm)
276 {
277         spin_lock(&kvm_lock);
278         list_del(&kvm->vm_list);
279         spin_unlock(&kvm_lock);
280         kvm_io_bus_destroy(&kvm->pio_bus);
281         kvm_io_bus_destroy(&kvm->mmio_bus);
282         kfree(kvm->vpic);
283         kfree(kvm->vioapic);
284         kvm_free_vcpus(kvm);
285         kvm_free_physmem(kvm);
286         kfree(kvm);
287 }
288
289 static int kvm_vm_release(struct inode *inode, struct file *filp)
290 {
291         struct kvm *kvm = filp->private_data;
292
293         kvm_destroy_vm(kvm);
294         return 0;
295 }
296
297 void fx_init(struct kvm_vcpu *vcpu)
298 {
299         unsigned after_mxcsr_mask;
300
301         /* Initialize guest FPU by resetting ours and saving into guest's */
302         preempt_disable();
303         fx_save(&vcpu->host_fx_image);
304         fpu_init();
305         fx_save(&vcpu->guest_fx_image);
306         fx_restore(&vcpu->host_fx_image);
307         preempt_enable();
308
309         vcpu->cr0 |= X86_CR0_ET;
310         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
311         vcpu->guest_fx_image.mxcsr = 0x1f80;
312         memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
313                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
314 }
315 EXPORT_SYMBOL_GPL(fx_init);
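/*
 * Note (added, assuming standard FXSAVE semantics): 0x1f80 is the power-on
 * value of MXCSR (all SSE exceptions masked, round to nearest).  fx_init()
 * keeps the control state produced by fpu_init() and only clears the
 * FP/SSE register area, i.e. everything from st_space onwards in the
 * fxsave image.
 */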
316
317 /*
318  * Allocate some memory and give it an address in the guest physical address
319  * space.
320  *
321  * Discontiguous memory is allowed, mostly for framebuffers.
322  *
323  * Must be called holding kvm->lock.
324  */
325 int __kvm_set_memory_region(struct kvm *kvm,
326                             struct kvm_userspace_memory_region *mem,
327                             int user_alloc)
328 {
329         int r;
330         gfn_t base_gfn;
331         unsigned long npages;
332         unsigned long i;
333         struct kvm_memory_slot *memslot;
334         struct kvm_memory_slot old, new;
335
336         r = -EINVAL;
337         /* General sanity checks */
338         if (mem->memory_size & (PAGE_SIZE - 1))
339                 goto out;
340         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
341                 goto out;
342         if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
343                 goto out;
344         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
345                 goto out;
346
347         memslot = &kvm->memslots[mem->slot];
348         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
349         npages = mem->memory_size >> PAGE_SHIFT;
350
351         if (!npages)
352                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
353
354         new = old = *memslot;
355
356         new.base_gfn = base_gfn;
357         new.npages = npages;
358         new.flags = mem->flags;
359
360         /* Disallow changing a memory slot's size. */
361         r = -EINVAL;
362         if (npages && old.npages && npages != old.npages)
363                 goto out_free;
364
365         /* Check for overlaps */
366         r = -EEXIST;
367         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
368                 struct kvm_memory_slot *s = &kvm->memslots[i];
369
370                 if (s == memslot)
371                         continue;
372                 if (!((base_gfn + npages <= s->base_gfn) ||
373                       (base_gfn >= s->base_gfn + s->npages)))
374                         goto out_free;
375         }
376
377         /* Free page dirty bitmap if unneeded */
378         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
379                 new.dirty_bitmap = NULL;
380
381         r = -ENOMEM;
382
383         /* Allocate if a slot is being created */
384         if (npages && !new.rmap) {
385                 new.rmap = vmalloc(npages * sizeof(struct page *));
386
387                 if (!new.rmap)
388                         goto out_free;
389
390                 memset(new.rmap, 0, npages * sizeof(*new.rmap));
391
392                 new.user_alloc = user_alloc;
393                 if (user_alloc)
394                         new.userspace_addr = mem->userspace_addr;
395                 else {
396                         down_write(&current->mm->mmap_sem);
397                         new.userspace_addr = do_mmap(NULL, 0,
398                                                      npages * PAGE_SIZE,
399                                                      PROT_READ | PROT_WRITE,
400                                                      MAP_SHARED | MAP_ANONYMOUS,
401                                                      0);
402                         up_write(&current->mm->mmap_sem);
403
404                         if (IS_ERR((void *)new.userspace_addr))
405                                 goto out_free;
406                 }
407         } else {
408                 if (!old.user_alloc && old.rmap) {
409                         int ret;
410
411                         down_write(&current->mm->mmap_sem);
412                         ret = do_munmap(current->mm, old.userspace_addr,
413                                         old.npages * PAGE_SIZE);
414                         up_write(&current->mm->mmap_sem);
415                         if (ret < 0)
416                                 printk(KERN_WARNING
417                                        "kvm_vm_ioctl_set_memory_region: "
418                                        "failed to munmap memory\n");
419                 }
420         }
421
422         /* Allocate page dirty bitmap if needed */
423         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
424                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
425
426                 new.dirty_bitmap = vmalloc(dirty_bytes);
427                 if (!new.dirty_bitmap)
428                         goto out_free;
429                 memset(new.dirty_bitmap, 0, dirty_bytes);
430         }
431
432         if (mem->slot >= kvm->nmemslots)
433                 kvm->nmemslots = mem->slot + 1;
434
435         if (!kvm->n_requested_mmu_pages) {
436                 unsigned int n_pages;
437
438                 if (npages) {
439                         n_pages = npages * KVM_PERMILLE_MMU_PAGES / 1000;
440                         kvm_mmu_change_mmu_pages(kvm, kvm->n_alloc_mmu_pages +
441                                                  n_pages);
442                 } else {
443                         unsigned int nr_mmu_pages;
444
445                         n_pages = old.npages * KVM_PERMILLE_MMU_PAGES / 1000;
446                         nr_mmu_pages = kvm->n_alloc_mmu_pages - n_pages;
447                         nr_mmu_pages = max(nr_mmu_pages,
448                                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
449                         kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
450                 }
451         }
452
453         *memslot = new;
454
455         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
456         kvm_flush_remote_tlbs(kvm);
457
458         kvm_free_physmem_slot(&old, &new);
459         return 0;
460
461 out_free:
462         kvm_free_physmem_slot(&new, &old);
463 out:
464         return r;
465
466 }
467 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
468
469 int kvm_set_memory_region(struct kvm *kvm,
470                           struct kvm_userspace_memory_region *mem,
471                           int user_alloc)
472 {
473         int r;
474
475         mutex_lock(&kvm->lock);
476         r = __kvm_set_memory_region(kvm, mem, user_alloc);
477         mutex_unlock(&kvm->lock);
478         return r;
479 }
480 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
481
482 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
483                                    struct
484                                    kvm_userspace_memory_region *mem,
485                                    int user_alloc)
486 {
487         if (mem->slot >= KVM_MEMORY_SLOTS)
488                 return -EINVAL;
489         return kvm_set_memory_region(kvm, mem, user_alloc);
490 }
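/*
 * Illustrative userspace call (sketch, not part of this file): guest memory
 * is registered through the VM fd and lands in
 * kvm_vm_ioctl_set_memory_region() above with user_alloc = 1.  vm_fd and
 * host_mem are hypothetical names for the VM file descriptor and a
 * page-aligned buffer owned by the caller:
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.flags           = 0,	   (or KVM_MEM_LOG_DIRTY_PAGES)
 *		.guest_phys_addr = 0,
 *		.memory_size     = size,   (multiple of PAGE_SIZE)
 *		.userspace_addr  = (__u64)(unsigned long)host_mem,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 */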
491
492 /*
493  * Get (and clear) the dirty memory log for a memory slot.
494  */
495 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
496                                       struct kvm_dirty_log *log)
497 {
498         struct kvm_memory_slot *memslot;
499         int r, i;
500         int n;
501         unsigned long any = 0;
502
503         mutex_lock(&kvm->lock);
504
505         r = -EINVAL;
506         if (log->slot >= KVM_MEMORY_SLOTS)
507                 goto out;
508
509         memslot = &kvm->memslots[log->slot];
510         r = -ENOENT;
511         if (!memslot->dirty_bitmap)
512                 goto out;
513
514         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
515
516         for (i = 0; !any && i < n/sizeof(long); ++i)
517                 any = memslot->dirty_bitmap[i];
518
519         r = -EFAULT;
520         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
521                 goto out;
522
523         /* If nothing is dirty, don't bother messing with page tables. */
524         if (any) {
525                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
526                 kvm_flush_remote_tlbs(kvm);
527                 memset(memslot->dirty_bitmap, 0, n);
528         }
529
530         r = 0;
531
532 out:
533         mutex_unlock(&kvm->lock);
534         return r;
535 }
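/*
 * Illustrative userspace call (sketch, not part of this file); bitmap is a
 * hypothetical buffer of ALIGN(npages, BITS_PER_LONG) / 8 bytes:
 *
 *	struct kvm_dirty_log log = {
 *		.slot         = 0,
 *		.dirty_bitmap = bitmap,
 *	};
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 *
 * The handler above copies the slot's dirty bitmap out, clears it, and
 * removes write access from the slot's shadow ptes so that future guest
 * writes fault again and re-dirty their pages.
 */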
536
537 int is_error_page(struct page *page)
538 {
539         return page == bad_page;
540 }
541 EXPORT_SYMBOL_GPL(is_error_page);
542
543 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
544 {
545         int i;
546         struct kvm_mem_alias *alias;
547
548         for (i = 0; i < kvm->naliases; ++i) {
549                 alias = &kvm->aliases[i];
550                 if (gfn >= alias->base_gfn
551                     && gfn < alias->base_gfn + alias->npages)
552                         return alias->target_gfn + gfn - alias->base_gfn;
553         }
554         return gfn;
555 }
556
557 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
558 {
559         int i;
560
561         for (i = 0; i < kvm->nmemslots; ++i) {
562                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
563
564                 if (gfn >= memslot->base_gfn
565                     && gfn < memslot->base_gfn + memslot->npages)
566                         return memslot;
567         }
568         return NULL;
569 }
570
571 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
572 {
573         gfn = unalias_gfn(kvm, gfn);
574         return __gfn_to_memslot(kvm, gfn);
575 }
576
577 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
578 {
579         int i;
580
581         gfn = unalias_gfn(kvm, gfn);
582         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
583                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
584
585                 if (gfn >= memslot->base_gfn
586                     && gfn < memslot->base_gfn + memslot->npages)
587                         return 1;
588         }
589         return 0;
590 }
591 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
592
593 /*
594  * Requires current->mm->mmap_sem to be held
595  */
596 static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
597 {
598         struct kvm_memory_slot *slot;
599         struct page *page[1];
600         int npages;
601
602         might_sleep();
603
604         gfn = unalias_gfn(kvm, gfn);
605         slot = __gfn_to_memslot(kvm, gfn);
606         if (!slot) {
607                 get_page(bad_page);
608                 return bad_page;
609         }
610
611         npages = get_user_pages(current, current->mm,
612                                 slot->userspace_addr
613                                 + (gfn - slot->base_gfn) * PAGE_SIZE, 1,
614                                 1, 1, page, NULL);
615         if (npages != 1) {
616                 get_page(bad_page);
617                 return bad_page;
618         }
619
620         return page[0];
621 }
622
623 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
624 {
625         struct page *page;
626
627         down_read(&current->mm->mmap_sem);
628         page = __gfn_to_page(kvm, gfn);
629         up_read(&current->mm->mmap_sem);
630
631         return page;
632 }
633
634 EXPORT_SYMBOL_GPL(gfn_to_page);
635
636 void kvm_release_page(struct page *page)
637 {
638         if (!PageReserved(page))
639                 SetPageDirty(page);
640         put_page(page);
641 }
642 EXPORT_SYMBOL_GPL(kvm_release_page);
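/*
 * Note (added): gfn_to_page() always returns a page with an elevated
 * reference count, either from get_user_pages() or from get_page() on
 * bad_page, so every caller must pair it with kvm_release_page() even when
 * is_error_page() is true, as the access helpers below do.
 */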
643
644 static int next_segment(unsigned long len, int offset)
645 {
646         if (len > PAGE_SIZE - offset)
647                 return PAGE_SIZE - offset;
648         else
649                 return len;
650 }
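/*
 * Worked example (added): kvm_read_guest()/kvm_write_guest() use
 * next_segment() to split a guest-physical range on page boundaries.  With
 * PAGE_SIZE = 4096, a copy of len = 16 starting at offset 4092 is done as a
 * 4-byte segment in the first page followed by a 12-byte segment at offset
 * 0 of the next gfn.
 */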
651
652 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
653                         int len)
654 {
655         void *page_virt;
656         struct page *page;
657
658         page = gfn_to_page(kvm, gfn);
659         if (is_error_page(page)) {
660                 kvm_release_page(page);
661                 return -EFAULT;
662         }
663         page_virt = kmap_atomic(page, KM_USER0);
664
665         memcpy(data, page_virt + offset, len);
666
667         kunmap_atomic(page_virt, KM_USER0);
668         kvm_release_page(page);
669         return 0;
670 }
671 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
672
673 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
674 {
675         gfn_t gfn = gpa >> PAGE_SHIFT;
676         int seg;
677         int offset = offset_in_page(gpa);
678         int ret;
679
680         while ((seg = next_segment(len, offset)) != 0) {
681                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
682                 if (ret < 0)
683                         return ret;
684                 offset = 0;
685                 len -= seg;
686                 data += seg;
687                 ++gfn;
688         }
689         return 0;
690 }
691 EXPORT_SYMBOL_GPL(kvm_read_guest);
692
693 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
694                          int offset, int len)
695 {
696         void *page_virt;
697         struct page *page;
698
699         page = gfn_to_page(kvm, gfn);
700         if (is_error_page(page)) {
701                 kvm_release_page(page);
702                 return -EFAULT;
703         }
704         page_virt = kmap_atomic(page, KM_USER0);
705
706         memcpy(page_virt + offset, data, len);
707
708         kunmap_atomic(page_virt, KM_USER0);
709         mark_page_dirty(kvm, gfn);
710         kvm_release_page(page);
711         return 0;
712 }
713 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
714
715 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
716                     unsigned long len)
717 {
718         gfn_t gfn = gpa >> PAGE_SHIFT;
719         int seg;
720         int offset = offset_in_page(gpa);
721         int ret;
722
723         while ((seg = next_segment(len, offset)) != 0) {
724                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
725                 if (ret < 0)
726                         return ret;
727                 offset = 0;
728                 len -= seg;
729                 data += seg;
730                 ++gfn;
731         }
732         return 0;
733 }
734
735 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
736 {
737         void *page_virt;
738         struct page *page;
739
740         page = gfn_to_page(kvm, gfn);
741         if (is_error_page(page)) {
742                 kvm_release_page(page);
743                 return -EFAULT;
744         }
745         page_virt = kmap_atomic(page, KM_USER0);
746
747         memset(page_virt + offset, 0, len);
748
749         kunmap_atomic(page_virt, KM_USER0);
750         kvm_release_page(page);
751         return 0;
752 }
753 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
754
755 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
756 {
757         gfn_t gfn = gpa >> PAGE_SHIFT;
758         int seg;
759         int offset = offset_in_page(gpa);
760         int ret;
761
762         while ((seg = next_segment(len, offset)) != 0) {
763                 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
764                 if (ret < 0)
765                         return ret;
766                 offset = 0;
767                 len -= seg;
768                 ++gfn;
769         }
770         return 0;
771 }
772 EXPORT_SYMBOL_GPL(kvm_clear_guest);
773
774 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
775 {
776         struct kvm_memory_slot *memslot;
777
778         gfn = unalias_gfn(kvm, gfn);
779         memslot = __gfn_to_memslot(kvm, gfn);
780         if (memslot && memslot->dirty_bitmap) {
781                 unsigned long rel_gfn = gfn - memslot->base_gfn;
782
783                 /* avoid RMW */
784                 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
785                         set_bit(rel_gfn, memslot->dirty_bitmap);
786         }
787 }
788
789 /*
790  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
791  */
792 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
793 {
794         DECLARE_WAITQUEUE(wait, current);
795
796         add_wait_queue(&vcpu->wq, &wait);
797
798         /*
799          * We will block until either an interrupt or a signal wakes us up
800          */
801         while (!kvm_cpu_has_interrupt(vcpu)
802                && !signal_pending(current)
803                && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
804                && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
805                 set_current_state(TASK_INTERRUPTIBLE);
806                 vcpu_put(vcpu);
807                 schedule();
808                 vcpu_load(vcpu);
809         }
810
811         __set_current_state(TASK_RUNNING);
812         remove_wait_queue(&vcpu->wq, &wait);
813 }
814
815 void kvm_resched(struct kvm_vcpu *vcpu)
816 {
817         if (!need_resched())
818                 return;
819         cond_resched();
820 }
821 EXPORT_SYMBOL_GPL(kvm_resched);
822
823 /*
824  * Check if userspace requested an interrupt window, and that the
825  * interrupt window is open.
826  *
827  * No need to exit to userspace if we already have an interrupt queued.
828  */
829 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
830                                           struct kvm_run *kvm_run)
831 {
832         return (!vcpu->irq_summary &&
833                 kvm_run->request_interrupt_window &&
834                 vcpu->interrupt_window_open &&
835                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
836 }
837
838 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
839                               struct kvm_run *kvm_run)
840 {
841         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
842         kvm_run->cr8 = get_cr8(vcpu);
843         kvm_run->apic_base = kvm_get_apic_base(vcpu);
844         if (irqchip_in_kernel(vcpu->kvm))
845                 kvm_run->ready_for_interrupt_injection = 1;
846         else
847                 kvm_run->ready_for_interrupt_injection =
848                                         (vcpu->interrupt_window_open &&
849                                          vcpu->irq_summary == 0);
850 }
851
852 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
853 {
854         int r;
855
856         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
857                 pr_debug("vcpu %d received sipi with vector # %x\n",
858                        vcpu->vcpu_id, vcpu->sipi_vector);
859                 kvm_lapic_reset(vcpu);
860                 r = kvm_x86_ops->vcpu_reset(vcpu);
861                 if (r)
862                         return r;
863                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
864         }
865
866 preempted:
867         if (vcpu->guest_debug.enabled)
868                 kvm_x86_ops->guest_debug_pre(vcpu);
869
870 again:
871         r = kvm_mmu_reload(vcpu);
872         if (unlikely(r))
873                 goto out;
874
875         kvm_inject_pending_timer_irqs(vcpu);
876
877         preempt_disable();
878
879         kvm_x86_ops->prepare_guest_switch(vcpu);
880         kvm_load_guest_fpu(vcpu);
881
882         local_irq_disable();
883
884         if (signal_pending(current)) {
885                 local_irq_enable();
886                 preempt_enable();
887                 r = -EINTR;
888                 kvm_run->exit_reason = KVM_EXIT_INTR;
889                 ++vcpu->stat.signal_exits;
890                 goto out;
891         }
892
893         if (irqchip_in_kernel(vcpu->kvm))
894                 kvm_x86_ops->inject_pending_irq(vcpu);
895         else if (!vcpu->mmio_read_completed)
896                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
897
898         vcpu->guest_mode = 1;
899         kvm_guest_enter();
900
901         if (vcpu->requests)
902                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
903                         kvm_x86_ops->tlb_flush(vcpu);
904
905         kvm_x86_ops->run(vcpu, kvm_run);
906
907         vcpu->guest_mode = 0;
908         local_irq_enable();
909
910         ++vcpu->stat.exits;
911
912         /*
913          * We must have an instruction between local_irq_enable() and
914          * kvm_guest_exit(), so the timer interrupt isn't delayed by
915          * the interrupt shadow.  The stat.exits increment will do nicely.
916          * But we need to prevent reordering, hence this barrier():
917          */
918         barrier();
919
920         kvm_guest_exit();
921
922         preempt_enable();
923
924         /*
925          * Profile KVM exit RIPs:
926          */
927         if (unlikely(prof_on == KVM_PROFILING)) {
928                 kvm_x86_ops->cache_regs(vcpu);
929                 profile_hit(KVM_PROFILING, (void *)vcpu->rip);
930         }
931
932         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
933
934         if (r > 0) {
935                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
936                         r = -EINTR;
937                         kvm_run->exit_reason = KVM_EXIT_INTR;
938                         ++vcpu->stat.request_irq_exits;
939                         goto out;
940                 }
941                 if (!need_resched()) {
942                         ++vcpu->stat.light_exits;
943                         goto again;
944                 }
945         }
946
947 out:
948         if (r > 0) {
949                 kvm_resched(vcpu);
950                 goto preempted;
951         }
952
953         post_kvm_run_save(vcpu, kvm_run);
954
955         return r;
956 }
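/*
 * Note (added): each __vcpu_run() iteration reloads the mmu, injects any
 * pending timer/external interrupts, disables interrupts and preemption,
 * enters the guest via kvm_x86_ops->run(), and finally handles the exit
 * with kvm_x86_ops->handle_exit().  A positive return from handle_exit()
 * means the exit was handled in the kernel and the guest is resumed (the
 * "again"/"preempted" labels); zero or a negative value falls through to
 * userspace with kvm_run describing the exit reason.
 */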
957
958
959 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
960 {
961         int r;
962         sigset_t sigsaved;
963
964         vcpu_load(vcpu);
965
966         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
967                 kvm_vcpu_block(vcpu);
968                 vcpu_put(vcpu);
969                 return -EAGAIN;
970         }
971
972         if (vcpu->sigset_active)
973                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
974
975         /* re-sync apic's tpr */
976         if (!irqchip_in_kernel(vcpu->kvm))
977                 set_cr8(vcpu, kvm_run->cr8);
978
979         if (vcpu->pio.cur_count) {
980                 r = complete_pio(vcpu);
981                 if (r)
982                         goto out;
983         }
984 #ifdef CONFIG_HAS_IOMEM
985         if (vcpu->mmio_needed) {
986                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
987                 vcpu->mmio_read_completed = 1;
988                 vcpu->mmio_needed = 0;
989                 r = emulate_instruction(vcpu, kvm_run,
990                                         vcpu->mmio_fault_cr2, 0, 1);
991                 if (r == EMULATE_DO_MMIO) {
992                         /*
993                          * Read-modify-write.  Back to userspace.
994                          */
995                         r = 0;
996                         goto out;
997                 }
998         }
999 #endif
1000         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1001                 kvm_x86_ops->cache_regs(vcpu);
1002                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1003                 kvm_x86_ops->decache_regs(vcpu);
1004         }
1005
1006         r = __vcpu_run(vcpu, kvm_run);
1007
1008 out:
1009         if (vcpu->sigset_active)
1010                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1011
1012         vcpu_put(vcpu);
1013         return r;
1014 }
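/*
 * Illustrative userspace run loop (sketch, not part of this file).  The
 * shared kvm_run structure is the first page of the vcpu fd mapping; on an
 * MMIO read exit userspace fills run->mmio.data and simply calls KVM_RUN
 * again, which re-enters the emulator above with mmio_read_completed set.
 * mmap_size comes from KVM_GET_VCPU_MMAP_SIZE and vcpu_fd from
 * KVM_CREATE_VCPU:
 *
 *	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_MMIO:
 *			...emulate the access, fill run->mmio.data for reads...
 *			break;
 *		case KVM_EXIT_INTR:
 *			...a signal interrupted guest execution...
 *			break;
 *		}
 *	}
 */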
1015
1016 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1017                                    struct kvm_regs *regs)
1018 {
1019         vcpu_load(vcpu);
1020
1021         kvm_x86_ops->cache_regs(vcpu);
1022
1023         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1024         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1025         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1026         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1027         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1028         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1029         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1030         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1031 #ifdef CONFIG_X86_64
1032         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1033         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1034         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1035         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1036         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1037         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1038         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1039         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1040 #endif
1041
1042         regs->rip = vcpu->rip;
1043         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
1044
1045         /*
1046          * Don't leak debug flags in case they were set for guest debugging
1047          */
1048         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1049                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1050
1051         vcpu_put(vcpu);
1052
1053         return 0;
1054 }
1055
1056 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
1057                                    struct kvm_regs *regs)
1058 {
1059         vcpu_load(vcpu);
1060
1061         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1062         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1063         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1064         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1065         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1066         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1067         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1068         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1069 #ifdef CONFIG_X86_64
1070         vcpu->regs[VCPU_REGS_R8] = regs->r8;
1071         vcpu->regs[VCPU_REGS_R9] = regs->r9;
1072         vcpu->regs[VCPU_REGS_R10] = regs->r10;
1073         vcpu->regs[VCPU_REGS_R11] = regs->r11;
1074         vcpu->regs[VCPU_REGS_R12] = regs->r12;
1075         vcpu->regs[VCPU_REGS_R13] = regs->r13;
1076         vcpu->regs[VCPU_REGS_R14] = regs->r14;
1077         vcpu->regs[VCPU_REGS_R15] = regs->r15;
1078 #endif
1079
1080         vcpu->rip = regs->rip;
1081         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
1082
1083         kvm_x86_ops->decache_regs(vcpu);
1084
1085         vcpu_put(vcpu);
1086
1087         return 0;
1088 }
1089
1090 static void get_segment(struct kvm_vcpu *vcpu,
1091                         struct kvm_segment *var, int seg)
1092 {
1093         return kvm_x86_ops->get_segment(vcpu, var, seg);
1094 }
1095
1096 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
1097                                     struct kvm_sregs *sregs)
1098 {
1099         struct descriptor_table dt;
1100         int pending_vec;
1101
1102         vcpu_load(vcpu);
1103
1104         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1105         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1106         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1107         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1108         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1109         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1110
1111         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1112         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1113
1114         kvm_x86_ops->get_idt(vcpu, &dt);
1115         sregs->idt.limit = dt.limit;
1116         sregs->idt.base = dt.base;
1117         kvm_x86_ops->get_gdt(vcpu, &dt);
1118         sregs->gdt.limit = dt.limit;
1119         sregs->gdt.base = dt.base;
1120
1121         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1122         sregs->cr0 = vcpu->cr0;
1123         sregs->cr2 = vcpu->cr2;
1124         sregs->cr3 = vcpu->cr3;
1125         sregs->cr4 = vcpu->cr4;
1126         sregs->cr8 = get_cr8(vcpu);
1127         sregs->efer = vcpu->shadow_efer;
1128         sregs->apic_base = kvm_get_apic_base(vcpu);
1129
1130         if (irqchip_in_kernel(vcpu->kvm)) {
1131                 memset(sregs->interrupt_bitmap, 0,
1132                        sizeof sregs->interrupt_bitmap);
1133                 pending_vec = kvm_x86_ops->get_irq(vcpu);
1134                 if (pending_vec >= 0)
1135                         set_bit(pending_vec,
1136                                 (unsigned long *)sregs->interrupt_bitmap);
1137         } else
1138                 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1139                        sizeof sregs->interrupt_bitmap);
1140
1141         vcpu_put(vcpu);
1142
1143         return 0;
1144 }
1145
1146 static void set_segment(struct kvm_vcpu *vcpu,
1147                         struct kvm_segment *var, int seg)
1148 {
1149         return kvm_x86_ops->set_segment(vcpu, var, seg);
1150 }
1151
1152 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1153                                     struct kvm_sregs *sregs)
1154 {
1155         int mmu_reset_needed = 0;
1156         int i, pending_vec, max_bits;
1157         struct descriptor_table dt;
1158
1159         vcpu_load(vcpu);
1160
1161         dt.limit = sregs->idt.limit;
1162         dt.base = sregs->idt.base;
1163         kvm_x86_ops->set_idt(vcpu, &dt);
1164         dt.limit = sregs->gdt.limit;
1165         dt.base = sregs->gdt.base;
1166         kvm_x86_ops->set_gdt(vcpu, &dt);
1167
1168         vcpu->cr2 = sregs->cr2;
1169         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
1170         vcpu->cr3 = sregs->cr3;
1171
1172         set_cr8(vcpu, sregs->cr8);
1173
1174         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
1175 #ifdef CONFIG_X86_64
1176         kvm_x86_ops->set_efer(vcpu, sregs->efer);
1177 #endif
1178         kvm_set_apic_base(vcpu, sregs->apic_base);
1179
1180         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1181
1182         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1183         vcpu->cr0 = sregs->cr0;
1184         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
1185
1186         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1187         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
1188         if (!is_long_mode(vcpu) && is_pae(vcpu))
1189                 load_pdptrs(vcpu, vcpu->cr3);
1190
1191         if (mmu_reset_needed)
1192                 kvm_mmu_reset_context(vcpu);
1193
1194         if (!irqchip_in_kernel(vcpu->kvm)) {
1195                 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
1196                        sizeof vcpu->irq_pending);
1197                 vcpu->irq_summary = 0;
1198                 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
1199                         if (vcpu->irq_pending[i])
1200                                 __set_bit(i, &vcpu->irq_summary);
1201         } else {
1202                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
1203                 pending_vec = find_first_bit(
1204                         (const unsigned long *)sregs->interrupt_bitmap,
1205                         max_bits);
1206                 /* Only pending external irq is handled here */
1207                 if (pending_vec < max_bits) {
1208                         kvm_x86_ops->set_irq(vcpu, pending_vec);
1209                         pr_debug("Set back pending irq %d\n",
1210                                  pending_vec);
1211                 }
1212         }
1213
1214         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1215         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1216         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1217         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1218         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1219         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1220
1221         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1222         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1223
1224         vcpu_put(vcpu);
1225
1226         return 0;
1227 }
1228
1229 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1230 {
1231         struct kvm_segment cs;
1232
1233         get_segment(vcpu, &cs, VCPU_SREG_CS);
1234         *db = cs.db;
1235         *l = cs.l;
1236 }
1237 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
1238
1239 /*
1240  * Translate a guest virtual address to a guest physical address.
1241  */
1242 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
1243                                     struct kvm_translation *tr)
1244 {
1245         unsigned long vaddr = tr->linear_address;
1246         gpa_t gpa;
1247
1248         vcpu_load(vcpu);
1249         mutex_lock(&vcpu->kvm->lock);
1250         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
1251         tr->physical_address = gpa;
1252         tr->valid = gpa != UNMAPPED_GVA;
1253         tr->writeable = 1;
1254         tr->usermode = 0;
1255         mutex_unlock(&vcpu->kvm->lock);
1256         vcpu_put(vcpu);
1257
1258         return 0;
1259 }
1260
1261 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1262                                     struct kvm_interrupt *irq)
1263 {
1264         if (irq->irq < 0 || irq->irq >= 256)
1265                 return -EINVAL;
1266         if (irqchip_in_kernel(vcpu->kvm))
1267                 return -ENXIO;
1268         vcpu_load(vcpu);
1269
1270         set_bit(irq->irq, vcpu->irq_pending);
1271         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
1272
1273         vcpu_put(vcpu);
1274
1275         return 0;
1276 }
1277
1278 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
1279                                       struct kvm_debug_guest *dbg)
1280 {
1281         int r;
1282
1283         vcpu_load(vcpu);
1284
1285         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
1286
1287         vcpu_put(vcpu);
1288
1289         return r;
1290 }
1291
1292 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
1293                                     unsigned long address,
1294                                     int *type)
1295 {
1296         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1297         unsigned long pgoff;
1298         struct page *page;
1299
1300         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1301         if (pgoff == 0)
1302                 page = virt_to_page(vcpu->run);
1303         else if (pgoff == KVM_PIO_PAGE_OFFSET)
1304                 page = virt_to_page(vcpu->pio_data);
1305         else
1306                 return NOPAGE_SIGBUS;
1307         get_page(page);
1308         if (type != NULL)
1309                 *type = VM_FAULT_MINOR;
1310
1311         return page;
1312 }
1313
1314 static struct vm_operations_struct kvm_vcpu_vm_ops = {
1315         .nopage = kvm_vcpu_nopage,
1316 };
1317
1318 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1319 {
1320         vma->vm_ops = &kvm_vcpu_vm_ops;
1321         return 0;
1322 }
1323
1324 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1325 {
1326         struct kvm_vcpu *vcpu = filp->private_data;
1327
1328         fput(vcpu->kvm->filp);
1329         return 0;
1330 }
1331
1332 static struct file_operations kvm_vcpu_fops = {
1333         .release        = kvm_vcpu_release,
1334         .unlocked_ioctl = kvm_vcpu_ioctl,
1335         .compat_ioctl   = kvm_vcpu_ioctl,
1336         .mmap           = kvm_vcpu_mmap,
1337 };
1338
1339 /*
1340  * Allocates an inode for the vcpu.
1341  */
1342 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1343 {
1344         int fd, r;
1345         struct inode *inode;
1346         struct file *file;
1347
1348         r = anon_inode_getfd(&fd, &inode, &file,
1349                              "kvm-vcpu", &kvm_vcpu_fops, vcpu);
1350         if (r)
1351                 return r;
1352         atomic_inc(&vcpu->kvm->filp->f_count);
1353         return fd;
1354 }
1355
1356 /*
1357  * Creates some virtual cpus.  Good luck creating more than one.
1358  */
1359 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1360 {
1361         int r;
1362         struct kvm_vcpu *vcpu;
1363
1364         if (!valid_vcpu(n))
1365                 return -EINVAL;
1366
1367         vcpu = kvm_x86_ops->vcpu_create(kvm, n);
1368         if (IS_ERR(vcpu))
1369                 return PTR_ERR(vcpu);
1370
1371         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1372
1373         /* We do fxsave: this must be aligned. */
1374         BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
1375
1376         vcpu_load(vcpu);
1377         r = kvm_x86_ops->vcpu_reset(vcpu);
1378         if (r == 0)
1379                 r = kvm_mmu_setup(vcpu);
1380         vcpu_put(vcpu);
1381         if (r < 0)
1382                 goto free_vcpu;
1383
1384         mutex_lock(&kvm->lock);
1385         if (kvm->vcpus[n]) {
1386                 r = -EEXIST;
1387                 mutex_unlock(&kvm->lock);
1388                 goto mmu_unload;
1389         }
1390         kvm->vcpus[n] = vcpu;
1391         mutex_unlock(&kvm->lock);
1392
1393         /* Now it's all set up, let userspace reach it */
1394         r = create_vcpu_fd(vcpu);
1395         if (r < 0)
1396                 goto unlink;
1397         return r;
1398
1399 unlink:
1400         mutex_lock(&kvm->lock);
1401         kvm->vcpus[n] = NULL;
1402         mutex_unlock(&kvm->lock);
1403
1404 mmu_unload:
1405         vcpu_load(vcpu);
1406         kvm_mmu_unload(vcpu);
1407         vcpu_put(vcpu);
1408
1409 free_vcpu:
1410         kvm_x86_ops->vcpu_free(vcpu);
1411         return r;
1412 }
1413
1414 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1415 {
1416         if (sigset) {
1417                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1418                 vcpu->sigset_active = 1;
1419                 vcpu->sigset = *sigset;
1420         } else
1421                 vcpu->sigset_active = 0;
1422         return 0;
1423 }
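/*
 * Note (added): the sigset installed here is used as the thread's blocked
 * signal mask only while KVM_RUN executes; kvm_vcpu_ioctl_run() swaps it in
 * before __vcpu_run() and restores the original mask afterwards.  Userspace
 * can thus keep signals blocked in its own code and have them interrupt
 * guest execution (KVM_EXIT_INTR) only inside KVM_RUN.
 */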
1424
1425 /*
1426  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
1427  * we have asm/x86/processor.h
1428  */
1429 struct fxsave {
1430         u16     cwd;
1431         u16     swd;
1432         u16     twd;
1433         u16     fop;
1434         u64     rip;
1435         u64     rdp;
1436         u32     mxcsr;
1437         u32     mxcsr_mask;
1438         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
1439 #ifdef CONFIG_X86_64
1440         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
1441 #else
1442         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
1443 #endif
1444 };
1445
1446 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
1447 {
1448         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
1449
1450         vcpu_load(vcpu);
1451
1452         memcpy(fpu->fpr, fxsave->st_space, 128);
1453         fpu->fcw = fxsave->cwd;
1454         fpu->fsw = fxsave->swd;
1455         fpu->ftwx = fxsave->twd;
1456         fpu->last_opcode = fxsave->fop;
1457         fpu->last_ip = fxsave->rip;
1458         fpu->last_dp = fxsave->rdp;
1459         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
1460
1461         vcpu_put(vcpu);
1462
1463         return 0;
1464 }
1465
1466 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
1467 {
1468         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
1469
1470         vcpu_load(vcpu);
1471
1472         memcpy(fxsave->st_space, fpu->fpr, 128);
1473         fxsave->cwd = fpu->fcw;
1474         fxsave->swd = fpu->fsw;
1475         fxsave->twd = fpu->ftwx;
1476         fxsave->fop = fpu->last_opcode;
1477         fxsave->rip = fpu->last_ip;
1478         fxsave->rdp = fpu->last_dp;
1479         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
1480
1481         vcpu_put(vcpu);
1482
1483         return 0;
1484 }
1485
1486 static long kvm_vcpu_ioctl(struct file *filp,
1487                            unsigned int ioctl, unsigned long arg)
1488 {
1489         struct kvm_vcpu *vcpu = filp->private_data;
1490         void __user *argp = (void __user *)arg;
1491         int r;
1492
1493         switch (ioctl) {
1494         case KVM_RUN:
1495                 r = -EINVAL;
1496                 if (arg)
1497                         goto out;
1498                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
1499                 break;
1500         case KVM_GET_REGS: {
1501                 struct kvm_regs kvm_regs;
1502
1503                 memset(&kvm_regs, 0, sizeof kvm_regs);
1504                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
1505                 if (r)
1506                         goto out;
1507                 r = -EFAULT;
1508                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
1509                         goto out;
1510                 r = 0;
1511                 break;
1512         }
1513         case KVM_SET_REGS: {
1514                 struct kvm_regs kvm_regs;
1515
1516                 r = -EFAULT;
1517                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
1518                         goto out;
1519                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
1520                 if (r)
1521                         goto out;
1522                 r = 0;
1523                 break;
1524         }
1525         case KVM_GET_SREGS: {
1526                 struct kvm_sregs kvm_sregs;
1527
1528                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
1529                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
1530                 if (r)
1531                         goto out;
1532                 r = -EFAULT;
1533                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
1534                         goto out;
1535                 r = 0;
1536                 break;
1537         }
1538         case KVM_SET_SREGS: {
1539                 struct kvm_sregs kvm_sregs;
1540
1541                 r = -EFAULT;
1542                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
1543                         goto out;
1544                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
1545                 if (r)
1546                         goto out;
1547                 r = 0;
1548                 break;
1549         }
1550         case KVM_TRANSLATE: {
1551                 struct kvm_translation tr;
1552
1553                 r = -EFAULT;
1554                 if (copy_from_user(&tr, argp, sizeof tr))
1555                         goto out;
1556                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
1557                 if (r)
1558                         goto out;
1559                 r = -EFAULT;
1560                 if (copy_to_user(argp, &tr, sizeof tr))
1561                         goto out;
1562                 r = 0;
1563                 break;
1564         }
1565         case KVM_INTERRUPT: {
1566                 struct kvm_interrupt irq;
1567
1568                 r = -EFAULT;
1569                 if (copy_from_user(&irq, argp, sizeof irq))
1570                         goto out;
1571                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1572                 if (r)
1573                         goto out;
1574                 r = 0;
1575                 break;
1576         }
1577         case KVM_DEBUG_GUEST: {
1578                 struct kvm_debug_guest dbg;
1579
1580                 r = -EFAULT;
1581                 if (copy_from_user(&dbg, argp, sizeof dbg))
1582                         goto out;
1583                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
1584                 if (r)
1585                         goto out;
1586                 r = 0;
1587                 break;
1588         }
1589         case KVM_SET_SIGNAL_MASK: {
1590                 struct kvm_signal_mask __user *sigmask_arg = argp;
1591                 struct kvm_signal_mask kvm_sigmask;
1592                 sigset_t sigset, *p;
1593
1594                 p = NULL;
1595                 if (argp) {
1596                         r = -EFAULT;
1597                         if (copy_from_user(&kvm_sigmask, argp,
1598                                            sizeof kvm_sigmask))
1599                                 goto out;
1600                         r = -EINVAL;
1601                         if (kvm_sigmask.len != sizeof sigset)
1602                                 goto out;
1603                         r = -EFAULT;
1604                         if (copy_from_user(&sigset, sigmask_arg->sigset,
1605                                            sizeof sigset))
1606                                 goto out;
1607                         p = &sigset;
1608                 }
1609                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1610                 break;
1611         }
1612         case KVM_GET_FPU: {
1613                 struct kvm_fpu fpu;
1614
1615                 memset(&fpu, 0, sizeof fpu);
1616                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
1617                 if (r)
1618                         goto out;
1619                 r = -EFAULT;
1620                 if (copy_to_user(argp, &fpu, sizeof fpu))
1621                         goto out;
1622                 r = 0;
1623                 break;
1624         }
1625         case KVM_SET_FPU: {
1626                 struct kvm_fpu fpu;
1627
1628                 r = -EFAULT;
1629                 if (copy_from_user(&fpu, argp, sizeof fpu))
1630                         goto out;
1631                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
1632                 if (r)
1633                         goto out;
1634                 r = 0;
1635                 break;
1636         }
1637         default:
1638                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1639         }
1640 out:
1641         return r;
1642 }
1643
1644 static long kvm_vm_ioctl(struct file *filp,
1645                            unsigned int ioctl, unsigned long arg)
1646 {
1647         struct kvm *kvm = filp->private_data;
1648         void __user *argp = (void __user *)arg;
1649         int r;
1650
1651         switch (ioctl) {
1652         case KVM_CREATE_VCPU:
1653                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1654                 if (r < 0)
1655                         goto out;
1656                 break;
1657         case KVM_SET_USER_MEMORY_REGION: {
1658                 struct kvm_userspace_memory_region kvm_userspace_mem;
1659
1660                 r = -EFAULT;
1661                 if (copy_from_user(&kvm_userspace_mem, argp,
1662                                                 sizeof kvm_userspace_mem))
1663                         goto out;
1664
1665                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
1666                 if (r)
1667                         goto out;
1668                 break;
1669         }
1670         case KVM_GET_DIRTY_LOG: {
1671                 struct kvm_dirty_log log;
1672
1673                 r = -EFAULT;
1674                 if (copy_from_user(&log, argp, sizeof log))
1675                         goto out;
1676                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1677                 if (r)
1678                         goto out;
1679                 break;
1680         }
1681         default:
1682                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1683         }
1684 out:
1685         return r;
1686 }
1687
1688 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
1689                                   unsigned long address,
1690                                   int *type)
1691 {
1692         struct kvm *kvm = vma->vm_file->private_data;
1693         unsigned long pgoff;
1694         struct page *page;
1695
1696         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1697         if (!kvm_is_visible_gfn(kvm, pgoff))
1698                 return NOPAGE_SIGBUS;
1699         /* current->mm->mmap_sem is already held so call lockless version */
1700         page = __gfn_to_page(kvm, pgoff);
1701         if (is_error_page(page)) {
1702                 kvm_release_page(page);
1703                 return NOPAGE_SIGBUS;
1704         }
1705         if (type != NULL)
1706                 *type = VM_FAULT_MINOR;
1707
1708         return page;
1709 }
1710
1711 static struct vm_operations_struct kvm_vm_vm_ops = {
1712         .nopage = kvm_vm_nopage,
1713 };
1714
1715 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1716 {
1717         vma->vm_ops = &kvm_vm_vm_ops;
1718         return 0;
1719 }
1720
1721 static struct file_operations kvm_vm_fops = {
1722         .release        = kvm_vm_release,
1723         .unlocked_ioctl = kvm_vm_ioctl,
1724         .compat_ioctl   = kvm_vm_ioctl,
1725         .mmap           = kvm_vm_mmap,
1726 };
1727
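/*
 * Create a VM and wrap it in an anonymous inode; the returned fd is the
 * handle userspace uses for all subsequent VM ioctls.
 */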
1728 static int kvm_dev_ioctl_create_vm(void)
1729 {
1730         int fd, r;
1731         struct inode *inode;
1732         struct file *file;
1733         struct kvm *kvm;
1734
1735         kvm = kvm_create_vm();
1736         if (IS_ERR(kvm))
1737                 return PTR_ERR(kvm);
1738         r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
1739         if (r) {
1740                 kvm_destroy_vm(kvm);
1741                 return r;
1742         }
1743
1744         kvm->filp = file;
1745
1746         return fd;
1747 }
1748
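/*
 * ioctls on /dev/kvm itself: API version and capability queries, VM
 * creation, and the size of the vcpu mmap area.
 */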
1749 static long kvm_dev_ioctl(struct file *filp,
1750                           unsigned int ioctl, unsigned long arg)
1751 {
1752         void __user *argp = (void __user *)arg;
1753         long r = -EINVAL;
1754
1755         switch (ioctl) {
1756         case KVM_GET_API_VERSION:
1757                 r = -EINVAL;
1758                 if (arg)
1759                         goto out;
1760                 r = KVM_API_VERSION;
1761                 break;
1762         case KVM_CREATE_VM:
1763                 r = -EINVAL;
1764                 if (arg)
1765                         goto out;
1766                 r = kvm_dev_ioctl_create_vm();
1767                 break;
1768         case KVM_CHECK_EXTENSION: {
1769                 int ext = (long)argp;
1770
1771                 switch (ext) {
1772                 case KVM_CAP_IRQCHIP:
1773                 case KVM_CAP_HLT:
1774                 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1775                 case KVM_CAP_USER_MEMORY:
1776                 case KVM_CAP_SET_TSS_ADDR:
1777                         r = 1;
1778                         break;
1779                 default:
1780                         r = 0;
1781                         break;
1782                 }
1783                 break;
1784         }
1785         case KVM_GET_VCPU_MMAP_SIZE:
1786                 r = -EINVAL;
1787                 if (arg)
1788                         goto out;
1789                 r = 2 * PAGE_SIZE;
1790                 break;
1791         default:
1792                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
1793         }
1794 out:
1795         return r;
1796 }
1797
1798 static struct file_operations kvm_chardev_ops = {
1799         .unlocked_ioctl = kvm_dev_ioctl,
1800         .compat_ioctl   = kvm_dev_ioctl,
1801 };
1802
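/* /dev/kvm is registered as a misc character device in kvm_init_x86(). */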
1803 static struct miscdevice kvm_dev = {
1804         KVM_MINOR,
1805         "kvm",
1806         &kvm_chardev_ops,
1807 };
1808
1809 /*
1810  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
1811  * cached on it.
1812  */
1813 static void decache_vcpus_on_cpu(int cpu)
1814 {
1815         struct kvm *vm;
1816         struct kvm_vcpu *vcpu;
1817         int i;
1818
1819         spin_lock(&kvm_lock);
1820         list_for_each_entry(vm, &vm_list, vm_list)
1821                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
1822                         vcpu = vm->vcpus[i];
1823                         if (!vcpu)
1824                                 continue;
1825                         /*
1826                          * If the vcpu is locked, then it is running on some
1827                          * other cpu and therefore it is not cached on the
1828                          * cpu in question.
1829                          *
1830                          * If it's not locked, check the last cpu it executed
1831                          * on.
1832                          */
1833                         if (mutex_trylock(&vcpu->mutex)) {
1834                                 if (vcpu->cpu == cpu) {
1835                                         kvm_x86_ops->vcpu_decache(vcpu);
1836                                         vcpu->cpu = -1;
1837                                 }
1838                                 mutex_unlock(&vcpu->mutex);
1839                         }
1840                 }
1841         spin_unlock(&kvm_lock);
1842 }
1843
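/*
 * hardware_enable/hardware_disable are called both directly and via IPI;
 * cpus_hardware_enabled tracks which CPUs have the virtualization
 * extensions turned on, making repeated calls harmless.
 */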
1844 static void hardware_enable(void *junk)
1845 {
1846         int cpu = raw_smp_processor_id();
1847
1848         if (cpu_isset(cpu, cpus_hardware_enabled))
1849                 return;
1850         cpu_set(cpu, cpus_hardware_enabled);
1851         kvm_x86_ops->hardware_enable(NULL);
1852 }
1853
1854 static void hardware_disable(void *junk)
1855 {
1856         int cpu = raw_smp_processor_id();
1857
1858         if (!cpu_isset(cpu, cpus_hardware_enabled))
1859                 return;
1860         cpu_clear(cpu, cpus_hardware_enabled);
1861         decache_vcpus_on_cpu(cpu);
1862         kvm_x86_ops->hardware_disable(NULL);
1863 }
1864
1865 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
1866                            void *v)
1867 {
1868         int cpu = (long)v;
1869
1870         switch (val) {
1871         case CPU_DYING:
1872         case CPU_DYING_FROZEN:
1873                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1874                        cpu);
1875                 hardware_disable(NULL);
1876                 break;
1877         case CPU_UP_CANCELED:
1878         case CPU_UP_CANCELED_FROZEN:
1879                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1880                        cpu);
1881                 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
1882                 break;
1883         case CPU_ONLINE:
1884         case CPU_ONLINE_FROZEN:
1885                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
1886                        cpu);
1887                 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
1888                 break;
1889         }
1890         return NOTIFY_OK;
1891 }
1892
1893 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
1894                       void *v)
1895 {
1896         if (val == SYS_RESTART) {
1897                 /*
1898                  * Some BIOSes hang on warm reboot if the CPU is still in
1899                  * VMX root mode, so disable virtualization everywhere first.
1900                  */
1901                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1902                 on_each_cpu(hardware_disable, NULL, 0, 1);
1903         }
1904         return NOTIFY_OK;
1905 }
1906
1907 static struct notifier_block kvm_reboot_notifier = {
1908         .notifier_call = kvm_reboot,
1909         .priority = 0,
1910 };
1911
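/*
 * The io bus holds the in-kernel emulated devices; kvm_io_bus_find_dev()
 * returns the first registered device whose in_range() callback claims
 * the address, or NULL if none does.
 */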
1912 void kvm_io_bus_init(struct kvm_io_bus *bus)
1913 {
1914         memset(bus, 0, sizeof(*bus));
1915 }
1916
1917 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
1918 {
1919         int i;
1920
1921         for (i = 0; i < bus->dev_count; i++) {
1922                 struct kvm_io_device *pos = bus->devs[i];
1923
1924                 kvm_iodevice_destructor(pos);
1925         }
1926 }
1927
1928 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
1929 {
1930         int i;
1931
1932         for (i = 0; i < bus->dev_count; i++) {
1933                 struct kvm_io_device *pos = bus->devs[i];
1934
1935                 if (pos->in_range(pos, addr))
1936                         return pos;
1937         }
1938
1939         return NULL;
1940 }
1941
1942 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
1943 {
1944         BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
1945
1946         bus->devs[bus->dev_count++] = dev;
1947 }
1948
1949 static struct notifier_block kvm_cpu_notifier = {
1950         .notifier_call = kvm_cpu_hotplug,
1951         .priority = 20, /* must be > scheduler priority */
1952 };
1953
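/*
 * debugfs statistics: each debugfs_entries[] item records the offset of
 * a u32 counter within struct kvm_vcpu, and stat_get() sums that counter
 * over every vcpu of every VM.
 */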
1954 static u64 stat_get(void *_offset)
1955 {
1956         unsigned offset = (long)_offset;
1957         u64 total = 0;
1958         struct kvm *kvm;
1959         struct kvm_vcpu *vcpu;
1960         int i;
1961
1962         spin_lock(&kvm_lock);
1963         list_for_each_entry(kvm, &vm_list, vm_list)
1964                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
1965                         vcpu = kvm->vcpus[i];
1966                         if (vcpu)
1967                                 total += *(u32 *)((void *)vcpu + offset);
1968                 }
1969         spin_unlock(&kvm_lock);
1970         return total;
1971 }
1972
1973 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
1974
1975 static __init void kvm_init_debug(void)
1976 {
1977         struct kvm_stats_debugfs_item *p;
1978
1979         debugfs_dir = debugfs_create_dir("kvm", NULL);
1980         for (p = debugfs_entries; p->name; ++p)
1981                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
1982                                                 (void *)(long)p->offset,
1983                                                 &stat_fops);
1984 }
1985
1986 static void kvm_exit_debug(void)
1987 {
1988         struct kvm_stats_debugfs_item *p;
1989
1990         for (p = debugfs_entries; p->name; ++p)
1991                 debugfs_remove(p->dentry);
1992         debugfs_remove(debugfs_dir);
1993 }
1994
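/*
 * sysdev suspend/resume hooks: virtualization is switched off before the
 * system sleeps and re-enabled on resume.
 */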
1995 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
1996 {
1997         hardware_disable(NULL);
1998         return 0;
1999 }
2000
2001 static int kvm_resume(struct sys_device *dev)
2002 {
2003         hardware_enable(NULL);
2004         return 0;
2005 }
2006
2007 static struct sysdev_class kvm_sysdev_class = {
2008         .name = "kvm",
2009         .suspend = kvm_suspend,
2010         .resume = kvm_resume,
2011 };
2012
2013 static struct sys_device kvm_sysdev = {
2014         .id = 0,
2015         .cls = &kvm_sysdev_class,
2016 };
2017
2018 struct page *bad_page;
2019
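/*
 * Preempt notifiers: if a vcpu thread is scheduled out and back in
 * between vcpu_load() and vcpu_put(), the sched_out/sched_in hooks save
 * and reload the per-cpu vcpu state.
 */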
2020 static inline
2021 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2022 {
2023         return container_of(pn, struct kvm_vcpu, preempt_notifier);
2024 }
2025
2026 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2027 {
2028         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2029
2030         kvm_x86_ops->vcpu_load(vcpu, cpu);
2031 }
2032
2033 static void kvm_sched_out(struct preempt_notifier *pn,
2034                           struct task_struct *next)
2035 {
2036         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2037
2038         kvm_x86_ops->vcpu_put(vcpu);
2039 }
2040
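/*
 * kvm_init_x86()/kvm_exit_x86() are the entry points used by the
 * hardware-specific module (VT-x or SVM) to register its kvm_x86_ops,
 * verify and enable support on every online CPU, and expose /dev/kvm.
 */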
2041 int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
2042                   struct module *module)
2043 {
2044         int r;
2045         int cpu;
2046
2047         if (kvm_x86_ops) {
2048                 printk(KERN_ERR "kvm: already loaded the other module\n");
2049                 return -EEXIST;
2050         }
2051
2052         if (!ops->cpu_has_kvm_support()) {
2053                 printk(KERN_ERR "kvm: no hardware support\n");
2054                 return -EOPNOTSUPP;
2055         }
2056         if (ops->disabled_by_bios()) {
2057                 printk(KERN_ERR "kvm: disabled by bios\n");
2058                 return -EOPNOTSUPP;
2059         }
2060
2061         kvm_x86_ops = ops;
2062
2063         r = kvm_x86_ops->hardware_setup();
2064         if (r < 0)
2065                 goto out;
2066
2067         for_each_online_cpu(cpu) {
2068                 smp_call_function_single(cpu,
2069                                 kvm_x86_ops->check_processor_compatibility,
2070                                 &r, 0, 1);
2071                 if (r < 0)
2072                         goto out_free_0;
2073         }
2074
2075         on_each_cpu(hardware_enable, NULL, 0, 1);
2076         r = register_cpu_notifier(&kvm_cpu_notifier);
2077         if (r)
2078                 goto out_free_1;
2079         register_reboot_notifier(&kvm_reboot_notifier);
2080
2081         r = sysdev_class_register(&kvm_sysdev_class);
2082         if (r)
2083                 goto out_free_2;
2084
2085         r = sysdev_register(&kvm_sysdev);
2086         if (r)
2087                 goto out_free_3;
2088
2089         /* A kmem cache lets us meet the alignment requirements of fx_save. */
2090         kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
2091                                            __alignof__(struct kvm_vcpu), 0, 0);
2092         if (!kvm_vcpu_cache) {
2093                 r = -ENOMEM;
2094                 goto out_free_4;
2095         }
2096
2097         kvm_chardev_ops.owner = module;
2098
2099         r = misc_register(&kvm_dev);
2100         if (r) {
2101                 printk(KERN_ERR "kvm: misc device register failed\n");
2102                 goto out_free;
2103         }
2104
2105         kvm_preempt_ops.sched_in = kvm_sched_in;
2106         kvm_preempt_ops.sched_out = kvm_sched_out;
2107
2108         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2109
2110         return 0;
2111
2112 out_free:
2113         kmem_cache_destroy(kvm_vcpu_cache);
2114 out_free_4:
2115         sysdev_unregister(&kvm_sysdev);
2116 out_free_3:
2117         sysdev_class_unregister(&kvm_sysdev_class);
2118 out_free_2:
2119         unregister_reboot_notifier(&kvm_reboot_notifier);
2120         unregister_cpu_notifier(&kvm_cpu_notifier);
2121 out_free_1:
2122         on_each_cpu(hardware_disable, NULL, 0, 1);
2123 out_free_0:
2124         kvm_x86_ops->hardware_unsetup();
2125 out:
2126         kvm_x86_ops = NULL;
2127         return r;
2128 }
2129 EXPORT_SYMBOL_GPL(kvm_init_x86);
2130
2131 void kvm_exit_x86(void)
2132 {
2133         misc_deregister(&kvm_dev);
2134         kmem_cache_destroy(kvm_vcpu_cache);
2135         sysdev_unregister(&kvm_sysdev);
2136         sysdev_class_unregister(&kvm_sysdev_class);
2137         unregister_reboot_notifier(&kvm_reboot_notifier);
2138         unregister_cpu_notifier(&kvm_cpu_notifier);
2139         on_each_cpu(hardware_disable, NULL, 0, 1);
2140         kvm_x86_ops->hardware_unsetup();
2141         kvm_x86_ops = NULL;
2142 }
2143 EXPORT_SYMBOL_GPL(kvm_exit_x86);
2144
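/*
 * Core module init: set up MMU and debugfs state, run the arch init hook
 * and allocate bad_page, which the rest of kvm uses as an error marker
 * for guest page lookups (cf. is_error_page()).
 */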
2145 static __init int kvm_init(void)
2146 {
2147         int r;
2148
2149         r = kvm_mmu_module_init();
2150         if (r)
2151                 goto out4;
2152
2153         kvm_init_debug();
2154
2155         kvm_arch_init();
2156
2157         bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2158
2159         if (bad_page == NULL) {
2160                 r = -ENOMEM;
2161                 goto out;
2162         }
2163
2164         return 0;
2165
2166 out:
2167         kvm_exit_debug();
2168         kvm_mmu_module_exit();
2169 out4:
2170         return r;
2171 }
2172
2173 static __exit void kvm_exit(void)
2174 {
2175         kvm_exit_debug();
2176         __free_page(bad_page);
2177         kvm_mmu_module_exit();
2178 }
2179
2180 module_init(kvm_init)
2181 module_exit(kvm_exit)