#include "vmx.h"
#include "kvm.h"
-#define pgprintk(x...) do { printk(x); } while (0)
-#define rmap_printk(x...) do { printk(x); } while (0)
+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif
#define ASSERT(x) \
if (!(x)) { \
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
+#define PFERR_FETCH_MASK (1U << 4)
#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
return 1;
}
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+ return vcpu->shadow_efer & EFER_NX;
+}
+
static int is_present_pte(unsigned long pte)
{
return pte & PT_PRESENT_MASK;
== (PT_WRITABLE_MASK | PT_PRESENT_MASK);
}
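+/*
+ * Per-vcpu object caches, filled outside the fault path: code that
+ * walks or modifies shadow page tables takes pte chains and rmap
+ * descriptors from these caches, so it never has to call the
+ * allocator (or handle allocation failure) mid-update.
+ */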
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ size_t objsize, int min)
+{
+ void *obj;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ obj = kzalloc(objsize, GFP_NOWAIT);
+ if (!obj)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = obj;
+ }
+ return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+ while (mc->nobjs)
+ kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
+ sizeof(struct kvm_pte_chain), 4);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
+ sizeof(struct kvm_rmap_desc), 1);
+out:
+ return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+ size_t size)
+{
+ void *p;
+
+ BUG_ON(!mc->nobjs);
+ p = mc->objects[--mc->nobjs];
+ memset(p, 0, size);
+ return p;
+}
+
+static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj)
+{
+ if (mc->nobjs < KVM_NR_MEM_OBJS)
+ mc->objects[mc->nobjs++] = obj;
+ else
+ kfree(obj);
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
+ sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_vcpu *vcpu,
+ struct kvm_pte_chain *pc)
+{
+ mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
+ sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu,
+ struct kvm_rmap_desc *rd)
+{
+ mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd);
+}
+
/*
* Reverse mapping data structures:
*
 * If page->private bit zero is zero, then page->private points to the
 * shadow page table entry that points to page_address(page).
 *
 * If page->private bit zero is one, then (page->private & ~1) points
 * to a struct kvm_rmap_desc containing more mappings.
*/
-static void rmap_add(struct kvm *kvm, u64 *spte)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
{
struct page *page;
struct kvm_rmap_desc *desc;
page->private = (unsigned long)spte;
} else if (!(page->private & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
- desc = kzalloc(sizeof *desc, GFP_NOWAIT);
- if (!desc)
- BUG(); /* FIXME: return error */
+ desc = mmu_alloc_rmap_desc(vcpu);
desc->shadow_ptes[0] = (u64 *)page->private;
desc->shadow_ptes[1] = spte;
page->private = (unsigned long)desc | 1;
while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
desc = desc->more;
if (desc->shadow_ptes[RMAP_EXT-1]) {
- desc->more = kzalloc(sizeof *desc->more, GFP_NOWAIT);
- if (!desc->more)
- BUG(); /* FIXME: return error */
+ desc->more = mmu_alloc_rmap_desc(vcpu);
desc = desc->more;
}
for (i = 0; desc->shadow_ptes[i]; ++i)
}
}
-static void rmap_desc_remove_entry(struct page *page,
+static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu,
+ struct page *page,
struct kvm_rmap_desc *desc,
int i,
struct kvm_rmap_desc *prev_desc)
for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
;
desc->shadow_ptes[i] = desc->shadow_ptes[j];
- desc->shadow_ptes[j] = 0;
+ desc->shadow_ptes[j] = NULL;
if (j != 0)
return;
if (!prev_desc && !desc->more)
prev_desc->more = desc->more;
else
page->private = (unsigned long)desc->more | 1;
- kfree(desc);
+ mmu_free_rmap_desc(vcpu, desc);
}
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte)
{
struct page *page;
struct kvm_rmap_desc *desc;
while (desc) {
for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
if (desc->shadow_ptes[i] == spte) {
- rmap_desc_remove_entry(page, desc, i,
+ rmap_desc_remove_entry(vcpu, page,
+ desc, i,
prev_desc);
return;
}
}
}
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
+ struct kvm *kvm = vcpu->kvm;
struct page *page;
struct kvm_memory_slot *slot;
struct kvm_rmap_desc *desc;
BUG_ON(!(*spte & PT_PRESENT_MASK));
BUG_ON(!(*spte & PT_WRITABLE_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
+ rmap_remove(vcpu, spte);
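+		/* Flush so no stale writable translation survives in the TLB. */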
+ kvm_arch_ops->tlb_flush(vcpu);
*spte &= ~(u64)PT_WRITABLE_MASK;
}
}
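+/*
+ * Debug check: a shadow page handed back to the free pool must not
+ * contain any live shadow ptes.
+ */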
+static int is_empty_shadow_page(hpa_t page_hpa)
+{
+ u64 *pos;
+ u64 *end;
+
+ for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64);
+ pos != end; pos++)
+ if (*pos != 0) {
+ printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+ pos, *pos);
+ return 0;
+ }
+ return 1;
+}
+
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
{
struct kvm_mmu_page *page_head = page_header(page_hpa);
+ ASSERT(is_empty_shadow_page(page_hpa));
list_del(&page_head->link);
page_head->page_hpa = page_hpa;
list_add(&page_head->link, &vcpu->free_pages);
-}
-
-static int is_empty_shadow_page(hpa_t page_hpa)
-{
- u32 *pos;
- u32 *end;
- for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
- pos != end; pos++)
- if (*pos != 0)
- return 0;
- return 1;
+ ++vcpu->kvm->n_free_mmu_pages;
}
static unsigned kvm_page_table_hashfn(gfn_t gfn)
page->global = 1;
page->multimapped = 0;
page->parent_pte = parent_pte;
+ --vcpu->kvm->n_free_mmu_pages;
return page;
}
-static void mmu_page_add_parent_pte(struct kvm_mmu_page *page, u64 *parent_pte)
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page, u64 *parent_pte)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
return;
}
page->multimapped = 1;
- pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
- BUG_ON(!pte_chain);
+ pte_chain = mmu_alloc_pte_chain(vcpu);
INIT_HLIST_HEAD(&page->parent_ptes);
hlist_add_head(&pte_chain->link, &page->parent_ptes);
pte_chain->parent_ptes[0] = old;
return;
}
}
- pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
+ pte_chain = mmu_alloc_pte_chain(vcpu);
BUG_ON(!pte_chain);
hlist_add_head(&pte_chain->link, &page->parent_ptes);
pte_chain->parent_ptes[0] = parent_pte;
}
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
+static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page,
u64 *parent_pte)
{
struct kvm_pte_chain *pte_chain;
break;
if (pte_chain->parent_ptes[i] != parent_pte)
continue;
- while (i + 1 < NR_PTE_CHAIN_ENTRIES) {
+ while (i + 1 < NR_PTE_CHAIN_ENTRIES
+ && pte_chain->parent_ptes[i + 1]) {
pte_chain->parent_ptes[i]
= pte_chain->parent_ptes[i + 1];
++i;
}
pte_chain->parent_ptes[i] = NULL;
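+		/*
+		 * Chain now empty: free it; if no chains remain, the
+		 * page no longer has any parent ptes.
+		 */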
+ if (i == 0) {
+ hlist_del(&pte_chain->link);
+ mmu_free_pte_chain(vcpu, pte_chain);
+ if (hlist_empty(&page->parent_ptes)) {
+ page->multimapped = 0;
+ page->parent_pte = NULL;
+ }
+ }
return;
}
BUG();
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry(page, node, bucket, hash_link)
if (page->gfn == gfn && page->role.word == role.word) {
- mmu_page_add_parent_pte(page, parent_pte);
+ mmu_page_add_parent_pte(vcpu, page, parent_pte);
pgprintk("%s: found\n", __FUNCTION__);
return page;
}
page->role = role;
hlist_add_head(&page->hash_link, bucket);
if (!metaphysical)
- rmap_write_protect(vcpu->kvm, gfn);
+ rmap_write_protect(vcpu, gfn);
return page;
}
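+/*
+ * Clear every entry in a shadow page: leaf ptes lose their rmap
+ * entries, and higher-level entries are detached from their child
+ * pages' parent-pte chains.
+ */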
+static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page)
+{
+ unsigned i;
+ u64 *pt;
+ u64 ent;
+
+ pt = __va(page->page_hpa);
+
+ if (page->role.level == PT_PAGE_TABLE_LEVEL) {
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (pt[i] & PT_PRESENT_MASK)
+ rmap_remove(vcpu, &pt[i]);
+ pt[i] = 0;
+ }
+ kvm_arch_ops->tlb_flush(vcpu);
+ return;
+ }
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ ent = pt[i];
+
+ pt[i] = 0;
+ if (!(ent & PT_PRESENT_MASK))
+ continue;
+ ent &= PT64_BASE_ADDR_MASK;
+ mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
+ }
+}
+
static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page,
u64 *parent_pte)
{
- mmu_page_remove_parent_pte(page, parent_pte);
+ mmu_page_remove_parent_pte(vcpu, page, parent_pte);
+}
+
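+/*
+ * Tear down a shadow page: sever every parent pte pointing at it,
+ * unlink its children, then free it (a page still referenced as a
+ * root only moves within the active list).
+ */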
+static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page)
+{
+ u64 *parent_pte;
+
+ while (page->multimapped || page->parent_pte) {
+ if (!page->multimapped)
+ parent_pte = page->parent_pte;
+ else {
+ struct kvm_pte_chain *chain;
+
+ chain = container_of(page->parent_ptes.first,
+ struct kvm_pte_chain, link);
+ parent_pte = chain->parent_ptes[0];
+ }
+ BUG_ON(!parent_pte);
+ kvm_mmu_put_page(vcpu, page, parent_pte);
+ *parent_pte = 0;
+ }
+ kvm_mmu_page_unlink_children(vcpu, page);
+ if (!page->root_count) {
+ hlist_del(&page->hash_link);
+ kvm_mmu_free_page(vcpu, page->page_hpa);
+ } else {
+ list_del(&page->link);
+ list_add(&page->link, &vcpu->kvm->active_mmu_pages);
+ }
+}
+
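+/*
+ * Zap all shadow pages for a guest page-table frame so the guest can
+ * write to it directly again; returns 1 if anything was zapped.
+ */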
+static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *page;
+ struct hlist_node *node, *n;
+ int r;
+
+ pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+ r = 0;
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &vcpu->kvm->mmu_page_hash[index];
+ hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
+ if (page->gfn == gfn && !page->role.metaphysical) {
+ pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+ page->role.word);
+ kvm_mmu_zap_page(vcpu, page);
+ r = 1;
+ }
+ return r;
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
return gpa_to_hpa(vcpu, gpa);
}
-
-static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
- int level)
-{
- u64 *pos;
- u64 *end;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(page_hpa));
- ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
-
- for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
- pos != end; pos++) {
- u64 current_ent = *pos;
-
- if (is_present_pte(current_ent)) {
- if (level != 1)
- release_pt_page_64(vcpu,
- current_ent &
- PT64_BASE_ADDR_MASK,
- level - 1);
- else
- rmap_remove(vcpu->kvm, pos);
- }
- *pos = 0;
- }
- kvm_mmu_free_page(vcpu, page_hpa);
-}
-
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
page_header_update_slot(vcpu->kvm, table, v);
table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
PT_USER_MASK;
- rmap_add(vcpu->kvm, &table[index]);
+ rmap_add(vcpu, &table[index]);
return 0;
}
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
+ struct kvm_mmu_page *page;
#ifdef CONFIG_X86_64
if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->mmu.root_hpa;
ASSERT(VALID_PAGE(root));
+ page = page_header(root);
+ --page->root_count;
vcpu->mmu.root_hpa = INVALID_PAGE;
return;
}
ASSERT(VALID_PAGE(root));
root &= PT64_BASE_ADDR_MASK;
+ page = page_header(root);
+ --page->root_count;
vcpu->mmu.pae_root[i] = INVALID_PAGE;
}
vcpu->mmu.root_hpa = INVALID_PAGE;
{
int i;
gfn_t root_gfn;
+ struct kvm_mmu_page *page;
+
root_gfn = vcpu->cr3 >> PAGE_SHIFT;
#ifdef CONFIG_X86_64
hpa_t root = vcpu->mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
- root = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, 0, NULL)->page_hpa;
+ page = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ PT64_ROOT_LEVEL, 0, NULL);
+ root = page->page_hpa;
+ ++page->root_count;
vcpu->mmu.root_hpa = root;
return;
}
root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
else if (vcpu->mmu.root_level == 0)
root_gfn = 0;
- root = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+ page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, !is_paging(vcpu),
- NULL)->page_hpa;
+ NULL);
+ root = page->page_hpa;
+ ++page->root_count;
vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
}
-static void nonpaging_flush(struct kvm_vcpu *vcpu)
-{
- hpa_t root = vcpu->mmu.root_hpa;
-
- ++kvm_stat.tlb_flush;
- pgprintk("nonpaging_flush\n");
- mmu_free_roots(vcpu);
- mmu_alloc_roots(vcpu);
- kvm_arch_ops->set_cr3(vcpu, root);
- kvm_arch_ops->tlb_flush(vcpu);
-}
-
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
- int ret;
gpa_t addr = gva;
+ hpa_t paddr;
+ int r;
+
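+	/*
+	 * Refill the object caches before mapping; rmap_add() below
+	 * allocates descriptors and must not fail.
+	 */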
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
- for (;;) {
- hpa_t paddr;
- paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
+	paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);
- if (is_error_hpa(paddr))
- return 1;
-
- ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
- if (ret) {
- nonpaging_flush(vcpu);
- continue;
- }
- break;
- }
- return ret;
-}
+ if (is_error_hpa(paddr))
+ return 1;
-static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
+ return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
}
static void nonpaging_free(struct kvm_vcpu *vcpu)
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
- context->inval_page = nonpaging_inval_page;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
context->root_level = 0;
{
pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
mmu_free_roots(vcpu);
+ if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+ kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
u64 *shadow_pte,
gpa_t gaddr,
int dirty,
- u64 access_bits)
+ u64 access_bits,
+ gfn_t gfn)
{
hpa_t paddr;
if (access_bits & PT_WRITABLE_MASK) {
struct kvm_mmu_page *shadow;
- shadow = kvm_mmu_lookup_page(vcpu, gaddr >> PAGE_SHIFT);
+ shadow = kvm_mmu_lookup_page(vcpu, gfn);
if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
- __FUNCTION__, (gfn_t)(gaddr >> PAGE_SHIFT));
+ __FUNCTION__, gfn);
access_bits &= ~PT_WRITABLE_MASK;
- *shadow_pte &= ~PT_WRITABLE_MASK;
+ if (is_writeble_pte(*shadow_pte)) {
+ *shadow_pte &= ~PT_WRITABLE_MASK;
+ kvm_arch_ops->tlb_flush(vcpu);
+ }
}
}
mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
- rmap_add(vcpu->kvm, shadow_pte);
+ rmap_add(vcpu, shadow_pte);
}
static void inject_page_fault(struct kvm_vcpu *vcpu,
return 0;
}
-static int may_access(u64 pte, int write, int user)
-{
-
- if (user && !(pte & PT_USER_MASK))
- return 0;
- if (write && !(pte & PT_WRITABLE_MASK))
- return 0;
- return 1;
-}
-
-/*
- * Remove a shadow pte.
- */
-static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
- hpa_t page_addr = vcpu->mmu.root_hpa;
- int level = vcpu->mmu.shadow_root_level;
-
- ++kvm_stat.invlpg;
-
- for (; ; level--) {
- u32 index = PT64_INDEX(addr, level);
- u64 *table = __va(page_addr);
-
- if (level == PT_PAGE_TABLE_LEVEL ) {
- rmap_remove(vcpu->kvm, &table[index]);
- table[index] = 0;
- return;
- }
-
- if (!is_present_pte(table[index]))
- return;
-
- page_addr = table[index] & PT64_BASE_ADDR_MASK;
-
- if (level == PT_DIRECTORY_LEVEL &&
- (table[index] & PT_SHADOW_PS_MARK)) {
- table[index] = 0;
- release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);
-
- kvm_arch_ops->tlb_flush(vcpu);
- return;
- }
- }
-}
-
static void paging_free(struct kvm_vcpu *vcpu)
{
nonpaging_free(vcpu);
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
- context->inval_page = paging_inval_page;
context->gva_to_gpa = paging64_gva_to_gpa;
context->free = paging_free;
context->root_level = level;
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
- context->inval_page = paging_inval_page;
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
context->root_level = PT32_ROOT_LEVEL;
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
+ int r;
+
destroy_kvm_mmu(vcpu);
- return init_kvm_mmu(vcpu);
+ r = init_kvm_mmu(vcpu);
+ if (r < 0)
+ goto out;
+ r = mmu_topup_memory_caches(vcpu);
+out:
+ return r;
}
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
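+/*
+ * Called before an emulated write to guest memory: if the target page
+ * is shadowed as a page table, drop the shadow ptes derived from the
+ * written bytes, or zap the whole shadow page when misaligned or
+ * repeated writes suggest it is no longer a page table.
+ */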
+void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
{
- while (!list_empty(&vcpu->free_pages)) {
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_mmu_page *page;
+ struct kvm_mmu_page *child;
+ struct hlist_node *node, *n;
+ struct hlist_head *bucket;
+ unsigned index;
+ u64 *spte;
+ u64 pte;
+ unsigned offset = offset_in_page(gpa);
+ unsigned pte_size;
+ unsigned page_offset;
+ unsigned misaligned;
+ int level;
+ int flooded = 0;
+
+ pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+ if (gfn == vcpu->last_pt_write_gfn) {
+ ++vcpu->last_pt_write_count;
+ if (vcpu->last_pt_write_count >= 3)
+ flooded = 1;
+ } else {
+ vcpu->last_pt_write_gfn = gfn;
+ vcpu->last_pt_write_count = 1;
+ }
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &vcpu->kvm->mmu_page_hash[index];
+ hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
+ if (page->gfn != gfn || page->role.metaphysical)
+ continue;
+ pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ if (misaligned || flooded) {
+ /*
+ * Misaligned accesses are too much trouble to fix
+ * up; also, they usually indicate a page is not used
+ * as a page table.
+ *
+ * If we're seeing too many writes to a page,
+ * it may no longer be a page table, or we may be
+ * forking, in which case it is better to unmap the
+ * page.
+ */
+ pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+ gpa, bytes, page->role.word);
+ kvm_mmu_zap_page(vcpu, page);
+ continue;
+ }
+ page_offset = offset;
+ level = page->role.level;
+ if (page->role.glevels == PT32_ROOT_LEVEL) {
+ page_offset <<= 1; /* 32->64 */
+ page_offset &= ~PAGE_MASK;
+ }
+ spte = __va(page->page_hpa);
+ spte += page_offset / sizeof(*spte);
+ pte = *spte;
+ if (is_present_pte(pte)) {
+ if (level == PT_PAGE_TABLE_LEVEL)
+ rmap_remove(vcpu, spte);
+ else {
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(vcpu, child, spte);
+ }
+ }
+ *spte = 0;
+ }
+}
+
+void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
+{
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
+
+ return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
+}
+
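+/*
+ * Recycle shadow pages from the tail of the active list until the
+ * free pool is back above the refill watermark.
+ */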
+void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+ while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
struct kvm_mmu_page *page;
+ page = container_of(vcpu->kvm->active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_zap_page(vcpu, page);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *page;
+
+ while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
+ page = container_of(vcpu->kvm->active_mmu_pages.next,
+ struct kvm_mmu_page, link);
+ kvm_mmu_zap_page(vcpu, page);
+ }
+ while (!list_empty(&vcpu->free_pages)) {
page = list_entry(vcpu->free_pages.next,
struct kvm_mmu_page, link);
list_del(&page->link);
page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
list_add(&page_header->link, &vcpu->free_pages);
+ ++vcpu->kvm->n_free_mmu_pages;
}
/*
destroy_kvm_mmu(vcpu);
free_mmu_pages(vcpu);
+ mmu_free_memory_caches(vcpu);
}
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
{
+ struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *page;
list_for_each_entry(page, &kvm->active_mmu_pages, link) {
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
if (pt[i] & PT_WRITABLE_MASK) {
- rmap_remove(kvm, &pt[i]);
+ rmap_remove(vcpu, &pt[i]);
pt[i] &= ~PT_WRITABLE_MASK;
}
}
}
+
+#ifdef AUDIT
+
+static const char *audit_msg;
+
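+/* Sign-extend bit 47, yielding a canonical 48-bit virtual address. */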
+static gva_t canonicalize(gva_t gva)
+{
+#ifdef CONFIG_X86_64
+ gva = (long long)(gva << 16) >> 16;
+#endif
+ return gva;
+}
+
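+/*
+ * Recursively compare each present shadow pte against the translation
+ * the guest page tables currently produce for the same va.
+ */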
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+ gva_t va, int level)
+{
+ u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+ int i;
+ gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+ u64 ent = pt[i];
+
+		if (!(ent & PT_PRESENT_MASK))
+ continue;
+
+ va = canonicalize(va);
+ if (level > 1)
+ audit_mappings_page(vcpu, ent, va, level - 1);
+ else {
+ gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
+ hpa_t hpa = gpa_to_hpa(vcpu, gpa);
+
+ if ((ent & PT_PRESENT_MASK)
+ && (ent & PT64_BASE_ADDR_MASK) != hpa)
+ printk(KERN_ERR "audit error: (%s) levels %d"
+ " gva %lx gpa %llx hpa %llx ent %llx\n",
+ audit_msg, vcpu->mmu.root_level,
+ va, gpa, hpa, ent);
+ }
+ }
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ if (vcpu->mmu.root_level == 4)
+ audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
+ else
+ for (i = 0; i < 4; ++i)
+ if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
+ audit_mappings_page(vcpu,
+ vcpu->mmu.pae_root[i],
+ i << 30,
+ 2);
+}
+
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+ int nmaps = 0;
+ int i, j, k;
+
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+ struct kvm_rmap_desc *d;
+
+ for (j = 0; j < m->npages; ++j) {
+ struct page *page = m->phys_mem[j];
+
+ if (!page->private)
+ continue;
+ if (!(page->private & 1)) {
+ ++nmaps;
+ continue;
+ }
+ d = (struct kvm_rmap_desc *)(page->private & ~1ul);
+ while (d) {
+ for (k = 0; k < RMAP_EXT; ++k)
+ if (d->shadow_ptes[k])
+ ++nmaps;
+ else
+ break;
+ d = d->more;
+ }
+ }
+ }
+ return nmaps;
+}
+
+static int count_writable_mappings(struct kvm_vcpu *vcpu)
+{
+ int nmaps = 0;
+ struct kvm_mmu_page *page;
+ int i;
+
+ list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
+ u64 *pt = __va(page->page_hpa);
+
+ if (page->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = pt[i];
+
+ if (!(ent & PT_PRESENT_MASK))
+ continue;
+ if (!(ent & PT_WRITABLE_MASK))
+ continue;
+ ++nmaps;
+ }
+ }
+ return nmaps;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+ int n_rmap = count_rmaps(vcpu);
+ int n_actual = count_writable_mappings(vcpu);
+
+ if (n_rmap != n_actual)
+ printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
+ __FUNCTION__, audit_msg, n_rmap, n_actual);
+}
+
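+/*
+ * Every page shadowed as a page table should have been write
+ * protected; a surviving rmap entry for its frame means a writable
+ * mapping slipped through.
+ */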
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *page;
+
+ list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
+ hfn_t hfn;
+ struct page *pg;
+
+ if (page->role.metaphysical)
+ continue;
+
+ hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
+ >> PAGE_SHIFT;
+ pg = pfn_to_page(hfn);
+ if (pg->private)
+ printk(KERN_ERR "%s: (%s) shadow page has writable"
+ " mappings: gfn %lx role %x\n",
+ __FUNCTION__, audit_msg, page->gfn,
+ page->role.word);
+ }
+}
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+ int olddbg = dbg;
+
+ dbg = 0;
+ audit_msg = msg;
+ audit_rmap(vcpu);
+ audit_write_protection(vcpu);
+ audit_mappings(vcpu);
+ dbg = olddbg;
+}
+
+#endif