 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
/*
 * Page fault error code bits:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
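/*
 * For example: a user-mode write to a not-present page arrives with
 * error_code == (PF_USER|PF_WRITE) == 0x6, while a user-mode instruction
 * fetch blocked by NX shows up as (PF_USER|PF_INSTR|PF_PROT) == 0x15.
 */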
static inline int notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}
	return ret;
}
/*
 * Return EIP plus the CS segment base.  The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long ip = regs->ip;
	unsigned seg = regs->cs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* Unlikely, but must come before segment checks. */
	if (unlikely(regs->flags & VM_MASK)) {
		base = seg << 4;	/* vm86 mode: segment * 16 */
		*eip_limit = base + 0xffff;
		return base + (ip & 0xffff);
	}
	/* The standard kernel/user address space limit. */
	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;

	/* By far the most common cases. */
	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
		return ip;

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__("larl %3,%0; lsll %3,%1"
		: "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
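	/* LAR returns the descriptor's access-rights byte in bits 8-15:
	   the 0x9800 mask below checks P (present, bit 15), S (code/data,
	   bit 12) and the "executable" type bit (bit 11); if any of them
	   is clear, CS no longer names a usable code segment. */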
	if ((~seg_ar & 0x9800) || ip > seg_limit) {
		*eip_limit = 0;
		return 1;	 /* So that returned ip > *eip_limit. */
	}
	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */
	if (seg & (1 << 2)) {
		/* Must lock the LDT while reading it. */
		mutex_lock(&current->mm->context.lock);
		desc = current->mm->context.ldt;
		desc = (void *)desc + (seg & ~7);
	} else {
		/* Must disable preemption while reading the GDT. */
		desc = (u32 *)get_cpu_gdt_table(get_cpu());
		desc = (void *)desc + (seg & ~7);
	}

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((struct desc_struct *)desc);

	if (seg & (1 << 2))
		mutex_unlock(&current->mm->context.lock);
	else
		put_cpu();

	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	seg_limit += base;
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
	return ip + base;
}
/*
 * X86_32: Sometimes AMD Athlon/Opteron CPUs report invalid exceptions
 * on prefetch.
 * X86_64: Sometimes the CPU reports invalid exceptions on prefetch.
 * Check for that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;
	unsigned long limit;

#ifdef CONFIG_X86_32
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	}

	instr = (unsigned char *)get_segment_eip(regs, &limit);
#else
	limit = -1UL;		/* no segment limit in long mode */

	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
#endif
	max_instr = instr + 15;
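	/* x86 instructions are at most 15 bytes long; this bounds the
	   prefix scan below. */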
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (instr > (unsigned char *)limit)
			break;
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;
		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26, 0x2E, 0x36, 0x3E are valid x86
			 * prefixes.  In X86_64 long mode, the CPU will
			 * signal invalid opcode if some of these prefixes
			 * are present, so X86_64 will never get here anyway.
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
			 * We need to figure out under what instruction mode the
			 * instruction was issued.  We could check the LDT for lm,
			 * but for now it's good enough to assume that long mode
			 * only uses well-known segments or the kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
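			/* (0x0F 0x0D is AMD 3DNow! PREFETCH/PREFETCHW;
			   0x0F 0x18 is the SSE PREFETCHh family.) */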
			scan_more = 0;
			if (instr > (unsigned char *)limit)
				break;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}
static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C-stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in the kernel here. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;

	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
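	/* The erratum clears the upper 32 bits of RIP; with them restored
	   to all ones, see whether the address lands in kernel text or in
	   the module area, which would make errata #93 the likely cause. */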
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
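/*
 * Only the top-level pgd entry can legitimately be out of sync here:
 * once a task's pgd slot has been copied from the reference page tables
 * (init_mm.pgd), every lower level is shared, which is why mismatches
 * below the pgd are treated as bugs rather than missed syncs.
 */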
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;
	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);

	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
int show_unhandled_signals = 1;
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write = 0, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;
	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
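	/* (In PF_* terms: 4 == PF_USER and 9 == PF_RSVD|PF_PROT, so this
	   requires PF_USER clear and neither PF_RSVD nor PF_PROT set.) */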
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
	if (notify_page_fault(regs))
		return;

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);
	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
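		/* ('enter $65535, $31' can touch up to 64KiB of frame plus
		   32 pushed frame pointers below the current stack pointer,
		   hence the 65536 + 32 * sizeof(unsigned long) slack.) */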
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	si_code = SEGV_ACCERR;
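	/* Map the PF_PROT/PF_WRITE bits onto the VMA rights this fault
	   needs: a write requires VM_WRITE, a read of a present page is
	   a protection error, and a read of a missing page requires any
	   of VM_READ, VM_EXEC or VM_WRITE. */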
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:	/* read, present */
		goto bad_area;
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;
	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */

		if (is_prefetch(regs, address, error_code))
			return;
		/* Work around K8 erratum #100: a K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;
		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
			tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, tsk->pid, address, regs->ip,
			regs->sp, error_code);
		}
		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
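		/* ((address >= TASK_SIZE) evaluates to 0 or 1, i.e. this
		   ORs PF_PROT into the saved error code for kernel
		   addresses.) */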
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}
no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;
do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}
DEFINE_SPINLOCK(pgd_lock);
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;
	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;

				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}