 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
/*
 * Page fault error code bits:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
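/*
 * For example: a user-mode write to a not-present page arrives with
 * error_code == (PF_USER|PF_WRITE) == 0x6, while a user-mode instruction
 * fetch blocked by NX shows up as (PF_USER|PF_INSTR|PF_PROT) == 0x15.
 */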
static inline int notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}
	return ret;
}
/*
 * Return EIP plus the CS segment base.  The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long ip = regs->ip;
	unsigned seg = regs->cs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* Unlikely, but must come before segment checks. */
	if (unlikely(regs->flags & VM_MASK)) {
		base = seg << 4;	/* vm86 mode: segment * 16 */
		*eip_limit = base + 0xffff;
		return base + (ip & 0xffff);
	}
	/* The standard kernel/user address space limit. */
	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;

	/* By far the most common cases. */
	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
		return ip;

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__("larl %3,%0; lsll %3,%1"
		: "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
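	/* LAR returns the descriptor's access-rights byte in bits 8-15:
	   the 0x9800 mask below checks P (present, bit 15), S (code/data,
	   bit 12) and the "executable" type bit (bit 11); if any of them
	   is clear, CS no longer names a usable code segment. */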
	if ((~seg_ar & 0x9800) || ip > seg_limit) {
		*eip_limit = 0;
		return 1;	 /* So that returned ip > *eip_limit. */
	}
	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */
	if (seg & (1 << 2)) {
		/* Must lock the LDT while reading it. */
		mutex_lock(&current->mm->context.lock);
		desc = current->mm->context.ldt;
		desc = (void *)desc + (seg & ~7);
	} else {
		/* Must disable preemption while reading the GDT. */
		desc = (u32 *)get_cpu_gdt_table(get_cpu());
		desc = (void *)desc + (seg & ~7);
	}

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((struct desc_struct *)desc);

	if (seg & (1 << 2))
		mutex_unlock(&current->mm->context.lock);
	else
		put_cpu();

	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	seg_limit += base;
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
	return ip + base;
}
/*
 * X86_32: Sometimes AMD Athlon/Opteron CPUs report invalid exceptions
 * on prefetch.
 * X86_64: Sometimes the CPU reports invalid exceptions on prefetch.
 * Check for that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;
	unsigned long limit;

#ifdef CONFIG_X86_32
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	}

	instr = (unsigned char *)get_segment_eip(regs, &limit);
#else
	limit = -1UL;		/* no segment limit in long mode */

	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
#endif
	max_instr = instr + 15;
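	/* x86 instructions are at most 15 bytes long; this bounds the
	   prefix scan below. */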
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (instr > (unsigned char *)limit)
			break;
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;
		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26, 0x2E, 0x36, 0x3E are valid x86
			 * prefixes.  In X86_64 long mode, the CPU will
			 * signal invalid opcode if some of these prefixes
			 * are present, so X86_64 will never get here anyway.
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
			 * We need to figure out under what instruction mode the
			 * instruction was issued.  We could check the LDT for lm,
			 * but for now it's good enough to assume that long mode
			 * only uses well-known segments or the kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
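			/* (0x0F 0x0D is AMD 3DNow! PREFETCH/PREFETCHW;
			   0x0F 0x18 is the SSE PREFETCHh family.) */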
			scan_more = 0;
			if (instr > (unsigned char *)limit)
				break;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}
static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C-stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in the kernel here. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;

	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
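	/* The erratum clears the upper 32 bits of RIP; with them restored
	   to all ones, see whether the address lands in kernel text or in
	   the module area, which would make errata #93 the likely cause. */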
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
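/*
 * Only the top-level pgd entry can legitimately be out of sync here:
 * once a task's pgd slot has been copied from the reference page tables
 * (init_mm.pgd), every lower level is shared, which is why mismatches
 * below the pgd are treated as bugs rather than missed syncs.
 */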
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;
	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);

	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
int show_unhandled_signals = 1;
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write = 0, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;
	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
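	/* (In PF_* terms: 4 == PF_USER and 9 == PF_RSVD|PF_PROT, so this
	   requires PF_USER clear and neither PF_RSVD nor PF_PROT set.) */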
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
	if (notify_page_fault(regs))
		return;

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);
	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
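		/* ('enter $65535, $31' can touch up to 64KiB of frame plus
		   32 pushed frame pointers below the current stack pointer,
		   hence the 65536 + 32 * sizeof(unsigned long) slack.) */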
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	si_code = SEGV_ACCERR;
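	/* Map the PF_PROT/PF_WRITE bits onto the VMA rights this fault
	   needs: a write requires VM_WRITE, a read of a present page is
	   a protection error, and a read of a missing page requires any
	   of VM_READ, VM_EXEC or VM_WRITE. */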
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:	/* read, present */
		goto bad_area;
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;
	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */

		if (is_prefetch(regs, address, error_code))
			return;
		/* Work around K8 erratum #100: a K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;
		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
			tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, tsk->pid, address, regs->ip,
			regs->sp, error_code);
		}
		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
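		/* ((address >= TASK_SIZE) evaluates to 0 or 1, i.e. this
		   ORs PF_PROT into the saved error code for kernel
		   addresses.) */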
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}
no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;
do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}
DEFINE_SPINLOCK(pgd_lock);
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;
	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;

				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}