arch/i386/xen/mmu.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
   35  * when it is not actively in use.  This means that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/highmem.h>
  42 #include <linux/bug.h>
  43 #include <linux/sched.h>
  44
  45 #include <asm/pgtable.h>
  46 #include <asm/tlbflush.h>
  47 #include <asm/mmu_context.h>
  48 #include <asm/paravirt.h>
  49
  50 #include <asm/xen/hypercall.h>
  51 #include <asm/xen/hypervisor.h>
  52
  53 #include <xen/page.h>
  54 #include <xen/interface/xen.h>
  55
  56 #include "multicalls.h"
  57 #include "mmu.h"
  58
  59 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
  60 {
  61         pte_t *pte = lookup_address(address);
  62         unsigned offset = address & PAGE_MASK;
  63
  64         BUG_ON(pte == NULL);
  65
  66         return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
  67 }
  68
  69 void make_lowmem_page_readonly(void *vaddr)
  70 {
  71         pte_t *pte, ptev;
  72         unsigned long address = (unsigned long)vaddr;
  73
  74         pte = lookup_address(address);
  75         BUG_ON(pte == NULL);
  76
  77         ptev = pte_wrprotect(*pte);
  78
  79         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
  80                 BUG();
  81 }
  82
  83 void make_lowmem_page_readwrite(void *vaddr)
  84 {
  85         pte_t *pte, ptev;
  86         unsigned long address = (unsigned long)vaddr;
  87
  88         pte = lookup_address(address);
  89         BUG_ON(pte == NULL);
  90
  91         ptev = pte_mkwrite(*pte);
  92
  93         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
  94                 BUG();
  95 }
  96
  97
  98 void xen_set_pmd(pmd_t *ptr, pmd_t val)
  99 {
 100         struct mmu_update u;
 101
 102         u.ptr = virt_to_machine(ptr).maddr;
 103         u.val = pmd_val_ma(val);
 104         if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
 105                 BUG();
 106 }
 107
 108 /*
 109  * Associate a virtual page frame with a given physical page frame
 110  * and protection flags for that frame.
 111  */
 112 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 113 {
 114         pgd_t *pgd;
 115         pud_t *pud;
 116         pmd_t *pmd;
 117         pte_t *pte;
 118
 119         pgd = swapper_pg_dir + pgd_index(vaddr);
 120         if (pgd_none(*pgd)) {
 121                 BUG();
 122                 return;
 123         }
 124         pud = pud_offset(pgd, vaddr);
 125         if (pud_none(*pud)) {
 126                 BUG();
 127                 return;
 128         }
 129         pmd = pmd_offset(pud, vaddr);
 130         if (pmd_none(*pmd)) {
 131                 BUG();
 132                 return;
 133         }
 134         pte = pte_offset_kernel(pmd, vaddr);
 135         /* <mfn,flags> stored as-is, to permit clearing entries */
 136         xen_set_pte(pte, mfn_pte(mfn, flags));
 137
 138         /*
 139          * It's enough to flush this one mapping.
 140          * (PGE mappings get flushed as well)
 141          */
 142         __flush_tlb_one(vaddr);
 143 }
 144
 145 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 146                     pte_t *ptep, pte_t pteval)
 147 {
 148         if ((mm != current->mm && mm != &init_mm) ||
 149             HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0)
 150                 xen_set_pte(ptep, pteval);
 151 }
 152
 153 #ifdef CONFIG_X86_PAE
 154 void xen_set_pud(pud_t *ptr, pud_t val)
 155 {
 156         struct mmu_update u;
 157
 158         u.ptr = virt_to_machine(ptr).maddr;
 159         u.val = pud_val_ma(val);
 160         if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
 161                 BUG();
 162 }
 163
 164 void xen_set_pte(pte_t *ptep, pte_t pte)
 165 {
 166         ptep->pte_high = pte.pte_high;
 167         smp_wmb();
 168         ptep->pte_low = pte.pte_low;
 169 }
 170
 171 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 172 {
 173         set_64bit((u64 *)ptep, pte_val_ma(pte));
 174 }
 175
 176 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 177 {
 178         ptep->pte_low = 0;
 179         smp_wmb();              /* make sure low gets written first */
 180         ptep->pte_high = 0;
 181 }
 182
 183 void xen_pmd_clear(pmd_t *pmdp)
 184 {
 185         xen_set_pmd(pmdp, __pmd(0));
 186 }
 187
 188 unsigned long long xen_pte_val(pte_t pte)
 189 {
 190         unsigned long long ret = 0;
 191
 192         if (pte.pte_low) {
 193                 ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
 194                 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
 195         }
 196
 197         return ret;
 198 }
 199
 200 unsigned long long xen_pmd_val(pmd_t pmd)
 201 {
 202         unsigned long long ret = pmd.pmd;
 203         if (ret)
 204                 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
 205         return ret;
 206 }
 207
 208 unsigned long long xen_pgd_val(pgd_t pgd)
 209 {
 210         unsigned long long ret = pgd.pgd;
 211         if (ret)
 212                 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
 213         return ret;
 214 }
 215
 216 pte_t xen_make_pte(unsigned long long pte)
 217 {
 218         if (pte & 1)
 219                 pte = phys_to_machine(XPADDR(pte)).maddr;
 220
 221         return (pte_t){ pte, pte >> 32 };
 222 }
 223
 224 pmd_t xen_make_pmd(unsigned long long pmd)
 225 {
 226         if (pmd & 1)
 227                 pmd = phys_to_machine(XPADDR(pmd)).maddr;
 228
 229         return (pmd_t){ pmd };
 230 }
 231
 232 pgd_t xen_make_pgd(unsigned long long pgd)
 233 {
 234         if (pgd & _PAGE_PRESENT)
 235                 pgd = phys_to_machine(XPADDR(pgd)).maddr;
 236
 237         return (pgd_t){ pgd };
 238 }
 239 #else  /* !PAE */
 240 void xen_set_pte(pte_t *ptep, pte_t pte)
 241 {
 242         *ptep = pte;
 243 }
 244
 245 unsigned long xen_pte_val(pte_t pte)
 246 {
 247         unsigned long ret = pte.pte_low;
 248
 249         if (ret & _PAGE_PRESENT)
 250                 ret = machine_to_phys(XMADDR(ret)).paddr;
 251
 252         return ret;
 253 }
 254
 255 unsigned long xen_pgd_val(pgd_t pgd)
 256 {
 257         unsigned long ret = pgd.pgd;
 258         if (ret)
 259                 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
 260         return ret;
 261 }
 262
 263 pte_t xen_make_pte(unsigned long pte)
 264 {
 265         if (pte & _PAGE_PRESENT)
 266                 pte = phys_to_machine(XPADDR(pte)).maddr;
 267
 268         return (pte_t){ pte };
 269 }
 270
 271 pgd_t xen_make_pgd(unsigned long pgd)
 272 {
 273         if (pgd & _PAGE_PRESENT)
 274                 pgd = phys_to_machine(XPADDR(pgd)).maddr;
 275
 276         return (pgd_t){ pgd };
 277 }
 278 #endif  /* CONFIG_X86_PAE */
 279
 280
 281
 282 /*
 283   (Yet another) pagetable walker.  This one is intended for pinning a
 284   pagetable.  This means that it walks a pagetable and calls the
 285   callback function on each page it finds making up the page table,
 286   at every level.  It walks the entire pagetable, but it only bothers
 287   pinning pte pages which are below pte_limit.  In the normal case
 288   this will be TASK_SIZE, but at boot we need to pin up to
 289   FIXADDR_TOP.  But the important bit is that we don't pin beyond
 290   there, because then we start getting into Xen's ptes.
 291 */
 292 static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 293                     unsigned long limit)
 294 {
 295         pgd_t *pgd = pgd_base;
 296         int flush = 0;
 297         unsigned long addr = 0;
 298         unsigned long pgd_next;
 299
 300         BUG_ON(limit > FIXADDR_TOP);
 301
 302         if (xen_feature(XENFEAT_auto_translated_physmap))
 303                 return 0;
 304
 305         for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
 306                 pud_t *pud;
 307                 unsigned long pud_limit, pud_next;
 308
 309                 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
 310
 311                 if (!pgd_val(*pgd))
 312                         continue;
 313
 314                 pud = pud_offset(pgd, 0);
 315
 316                 if (PTRS_PER_PUD > 1) /* not folded */
 317                         flush |= (*func)(virt_to_page(pud), 0);
 318
 319                 for (; addr != pud_limit; pud++, addr = pud_next) {
 320                         pmd_t *pmd;
 321                         unsigned long pmd_limit;
 322
 323                         pud_next = pud_addr_end(addr, pud_limit);
 324
 325                         if (pud_next < limit)
 326                                 pmd_limit = pud_next;
 327                         else
 328                                 pmd_limit = limit;
 329
 330                         if (pud_none(*pud))
 331                                 continue;
 332
 333                         pmd = pmd_offset(pud, 0);
 334
 335                         if (PTRS_PER_PMD > 1) /* not folded */
 336                                 flush |= (*func)(virt_to_page(pmd), 0);
 337
 338                         for (; addr != pmd_limit; pmd++) {
 339                                 addr += (PAGE_SIZE * PTRS_PER_PTE);
 340                                 if ((pmd_limit-1) < (addr-1)) {
 341                                         addr = pmd_limit;
 342                                         break;
 343                                 }
 344
 345                                 if (pmd_none(*pmd))
 346                                         continue;
 347
 348                                 flush |= (*func)(pmd_page(*pmd), 0);
 349                         }
 350                 }
 351         }
 352
 353         flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
 354
 355         return flush;
 356 }
 357
 358 static int pin_page(struct page *page, unsigned flags)
 359 {
 360         unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 361         int flush;
 362
 363         if (pgfl)
 364                 flush = 0;              /* already pinned */
 365         else if (PageHighMem(page))
 366                 /* kmaps need flushing if we found an unpinned
 367                    highpage */
 368                 flush = 1;
 369         else {
 370                 void *pt = lowmem_page_address(page);
 371                 unsigned long pfn = page_to_pfn(page);
 372                 struct multicall_space mcs = __xen_mc_entry(0);
 373
 374                 flush = 0;
 375
 376                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 377                                         pfn_pte(pfn, PAGE_KERNEL_RO),
 378                                         flags);
 379         }
 380
 381         return flush;
 382 }
 383
 384 /* This is called just after a mm has been created, but it has not
 385    been used yet.  We need to make sure that its pagetable is all
 386    read-only, and can be pinned. */
 387 void xen_pgd_pin(pgd_t *pgd)
 388 {
 389         struct multicall_space mcs;
 390         struct mmuext_op *op;
 391
 392         xen_mc_batch();
 393
 394         if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
 395                 /* re-enable interrupts for kmap_flush_unused */
 396                 xen_mc_issue(0);
 397                 kmap_flush_unused();
 398                 xen_mc_batch();
 399         }
 400
 401         mcs = __xen_mc_entry(sizeof(*op));
 402         op = mcs.args;
 403
 404 #ifdef CONFIG_X86_PAE
 405         op->cmd = MMUEXT_PIN_L3_TABLE;
 406 #else
 407         op->cmd = MMUEXT_PIN_L2_TABLE;
 408 #endif
 409         op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
 410         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 411
 412         xen_mc_issue(0);
 413 }
 414
 415 /* The init_mm pagetable is really pinned as soon as its created, but
 416    that's before we have page structures to store the bits.  So do all
 417    the book-keeping now. */
 418 static __init int mark_pinned(struct page *page, unsigned flags)
 419 {
 420         SetPagePinned(page);
 421         return 0;
 422 }
 423
 424 void __init xen_mark_init_mm_pinned(void)
 425 {
 426         pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 427 }
 428
 429 static int unpin_page(struct page *page, unsigned flags)
 430 {
 431         unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 432
 433         if (pgfl && !PageHighMem(page)) {
 434                 void *pt = lowmem_page_address(page);
 435                 unsigned long pfn = page_to_pfn(page);
 436                 struct multicall_space mcs = __xen_mc_entry(0);
 437
 438                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 439                                         pfn_pte(pfn, PAGE_KERNEL),
 440                                         flags);
 441         }
 442
 443         return 0;               /* never need to flush on unpin */
 444 }
 445
 446 /* Release a pagetables pages back as normal RW */
 447 static void xen_pgd_unpin(pgd_t *pgd)
 448 {
 449         struct mmuext_op *op;
 450         struct multicall_space mcs;
 451
 452         xen_mc_batch();
 453
 454         mcs = __xen_mc_entry(sizeof(*op));
 455
 456         op = mcs.args;
 457         op->cmd = MMUEXT_UNPIN_TABLE;
 458         op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
 459
 460         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 461
 462         pgd_walk(pgd, unpin_page, TASK_SIZE);
 463
 464         xen_mc_issue(0);
 465 }
 466
 467 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 468 {
 469         spin_lock(&next->page_table_lock);
 470         xen_pgd_pin(next->pgd);
 471         spin_unlock(&next->page_table_lock);
 472 }
 473
 474 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 475 {
 476         spin_lock(&mm->page_table_lock);
 477         xen_pgd_pin(mm->pgd);
 478         spin_unlock(&mm->page_table_lock);
 479 }
 480
 481
 482 #ifdef CONFIG_SMP
 483 /* Another cpu may still have their %cr3 pointing at the pagetable, so
 484    we need to repoint it somewhere else before we can unpin it. */
 485 static void drop_other_mm_ref(void *info)
 486 {
 487         struct mm_struct *mm = info;
 488
 489         if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
 490                 leave_mm(smp_processor_id());
 491 }
 492
 493 static void drop_mm_ref(struct mm_struct *mm)
 494 {
 495         if (current->active_mm == mm) {
 496                 if (current->mm == mm)
 497                         load_cr3(swapper_pg_dir);
 498                 else
 499                         leave_mm(smp_processor_id());
 500         }
 501
 502         if (!cpus_empty(mm->cpu_vm_mask))
 503                 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
 504                                            mm, 1);
 505 }
 506 #else
 507 static void drop_mm_ref(struct mm_struct *mm)
 508 {
 509         if (current->active_mm == mm)
 510                 load_cr3(swapper_pg_dir);
 511 }
 512 #endif
 513
 514 /*
 515  * While a process runs, Xen pins its pagetables, which means that the
 516  * hypervisor forces it to be read-only, and it controls all updates
 517  * to it.  This means that all pagetable updates have to go via the
 518  * hypervisor, which is moderately expensive.
 519  *
 520  * Since we're pulling the pagetable down, we switch to use init_mm,
 521  * unpin old process pagetable and mark it all read-write, which
 522  * allows further operations on it to be simple memory accesses.
 523  *
 524  * The only subtle point is that another CPU may be still using the
 525  * pagetable because of lazy tlb flushing.  This means we need need to
 526  * switch all CPUs off this pagetable before we can unpin it.
 527  */
 528 void xen_exit_mmap(struct mm_struct *mm)
 529 {
 530         get_cpu();              /* make sure we don't move around */
 531         drop_mm_ref(mm);
 532         put_cpu();
 533
 534         xen_pgd_unpin(mm->pgd);
 535 }