/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
23 #include <linux/kernel.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/utsname.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/random.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39 #include <linux/prctl.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
45 #include <asm/processor.h>
47 #include <asm/mmu_context.h>
49 #include <asm/prctl.h>
51 #include <asm/proto.h>
55 asmlinkage extern void ret_from_fork(void);
57 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59 unsigned long boot_option_idle_override = 0;
60 EXPORT_SYMBOL(boot_option_idle_override);
63 * Powermanagement idle function, if any..
65 void (*pm_idle)(void);
66 EXPORT_SYMBOL(pm_idle);
68 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
70 void idle_notifier_register(struct notifier_block *n)
72 atomic_notifier_chain_register(&idle_notifier, n);
78 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
81 static void __exit_idle(void)
83 if (test_and_clear_bit_pda(0, isidle) == 0)
85 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
88 /* Called from interrupts to signify idle end */
91 /* idle loop has pid 0 */
98 * We use this if we don't have any better
101 void default_idle(void)
103 current_thread_info()->status &= ~TS_POLLING;
105 * TS_POLLING-cleared state must be visible before we
110 if (!need_resched()) {
111 safe_halt(); /* enables interrupts racelessly */
115 current_thread_info()->status |= TS_POLLING;
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	/* Halt forever; the CPU is physically offlined from here. */
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
154 * The idle thread. There's no useful work to be
155 * done, so just try to conserve power and have a
156 * low exit latency (ie sit in a loop waiting for
157 * somebody to say that they'd like to reschedule)
161 current_thread_info()->status |= TS_POLLING;
162 /* endless idle loop with no priority at all */
164 tick_nohz_stop_sched_tick();
165 while (!need_resched()) {
172 if (cpu_is_offline(smp_processor_id()))
175 * Idle routines should keep interrupts disabled
176 * from here on, until they go to idle.
177 * Otherwise, idle callbacks can misfire.
182 /* In many cases the interrupt that ended idle
183 has already called exit_idle. But some idle
184 loops can be woken up without interrupt. */
188 tick_nohz_restart_sched_tick();
189 preempt_enable_no_resched();
195 static void do_nothing(void *unused)
200 * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
201 * pm_idle and update to new pm_idle value. Required while changing pm_idle
202 * handler on SMP systems.
204 * Caller must have changed pm_idle to the new value before the call. Old
205 * pm_idle value will not be used by any CPU after the return of this function.
207 void cpu_idle_wait(void)
210 /* kick all the CPUs so that they exit out of pm_idle */
211 smp_call_function(do_nothing, NULL, 0, 1);
213 EXPORT_SYMBOL_GPL(cpu_idle_wait);
216 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
217 * which can obviate IPI to trigger checking of need_resched.
218 * We execute MONITOR against need_resched and enter optimized wait state
219 * through MWAIT. Whenever someone changes need_resched, we would be woken
220 * up from MWAIT (without an IPI).
222 * New with Core Duo processors, MWAIT can take some hints based on CPU
225 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
227 if (!need_resched()) {
228 __monitor((void *)¤t_thread_info()->flags, 0, 0);
235 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
236 static void mwait_idle(void)
238 if (!need_resched()) {
239 __monitor((void *)¤t_thread_info()->flags, 0, 0);
251 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
255 /* Any C1 states supported? */
256 return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
259 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
265 #ifdef CONFIG_X86_SMP
266 if (pm_idle == poll_idle && smp_num_siblings > 1) {
267 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
268 " performance may degrade.\n");
271 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
273 * Skip, if setup has overridden idle.
274 * One CPU supports mwait => All CPUs supports mwait
277 printk(KERN_INFO "using mwait in idle threads.\n");
278 pm_idle = mwait_idle;
284 static int __init idle_setup(char *str)
286 if (!strcmp(str, "poll")) {
287 printk("using polling idle threads.\n");
289 } else if (!strcmp(str, "mwait"))
294 boot_option_idle_override = 1;
297 early_param("idle", idle_setup);
299 /* Prints also some state that isn't saved in the pt_regs */
300 void __show_regs(struct pt_regs * regs)
302 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
303 unsigned long d0, d1, d2, d3, d6, d7;
304 unsigned int fsindex, gsindex;
305 unsigned int ds, cs, es;
309 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
310 current->pid, current->comm, print_tainted(),
311 init_utsname()->release,
312 (int)strcspn(init_utsname()->version, " "),
313 init_utsname()->version);
314 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
315 printk_address(regs->ip, 1);
316 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
318 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
319 regs->ax, regs->bx, regs->cx);
320 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
321 regs->dx, regs->si, regs->di);
322 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
323 regs->bp, regs->r8, regs->r9);
324 printk("R10: %016lx R11: %016lx R12: %016lx\n",
325 regs->r10, regs->r11, regs->r12);
326 printk("R13: %016lx R14: %016lx R15: %016lx\n",
327 regs->r13, regs->r14, regs->r15);
329 asm("movl %%ds,%0" : "=r" (ds));
330 asm("movl %%cs,%0" : "=r" (cs));
331 asm("movl %%es,%0" : "=r" (es));
332 asm("movl %%fs,%0" : "=r" (fsindex));
333 asm("movl %%gs,%0" : "=r" (gsindex));
335 rdmsrl(MSR_FS_BASE, fs);
336 rdmsrl(MSR_GS_BASE, gs);
337 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
344 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
345 fs,fsindex,gs,gsindex,shadowgs);
346 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
347 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
352 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
356 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
359 void show_regs(struct pt_regs *regs)
361 printk("CPU %d:", smp_processor_id());
363 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
367 * Free current thread data structures etc..
369 void exit_thread(void)
371 struct task_struct *me = current;
372 struct thread_struct *t = &me->thread;
374 if (me->thread.io_bitmap_ptr) {
375 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
377 kfree(t->io_bitmap_ptr);
378 t->io_bitmap_ptr = NULL;
379 clear_thread_flag(TIF_IO_BITMAP);
381 * Careful, clear this in the TSS too:
383 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
384 t->io_bitmap_max = 0;
389 void flush_thread(void)
391 struct task_struct *tsk = current;
393 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
394 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
395 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
396 clear_tsk_thread_flag(tsk, TIF_IA32);
398 set_tsk_thread_flag(tsk, TIF_IA32);
399 current_thread_info()->status |= TS_COMPAT;
402 clear_tsk_thread_flag(tsk, TIF_DEBUG);
404 tsk->thread.debugreg0 = 0;
405 tsk->thread.debugreg1 = 0;
406 tsk->thread.debugreg2 = 0;
407 tsk->thread.debugreg3 = 0;
408 tsk->thread.debugreg6 = 0;
409 tsk->thread.debugreg7 = 0;
410 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
412 * Forget coprocessor state..
418 void release_thread(struct task_struct *dead_task)
421 if (dead_task->mm->context.size) {
422 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
424 dead_task->mm->context.ldt,
425 dead_task->mm->context.size);
431 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
433 struct user_desc ud = {
440 struct desc_struct *desc = t->thread.tls_array;
445 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
447 return get_desc_base(&t->thread.tls_array[tls]);
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
459 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
460 unsigned long unused,
461 struct task_struct * p, struct pt_regs * regs)
464 struct pt_regs * childregs;
465 struct task_struct *me = current;
467 childregs = ((struct pt_regs *)
468 (THREAD_SIZE + task_stack_page(p))) - 1;
474 childregs->sp = (unsigned long)childregs;
476 p->thread.sp = (unsigned long) childregs;
477 p->thread.sp0 = (unsigned long) (childregs+1);
478 p->thread.usersp = me->thread.usersp;
480 set_tsk_thread_flag(p, TIF_FORK);
482 p->thread.fs = me->thread.fs;
483 p->thread.gs = me->thread.gs;
485 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
486 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
487 asm("mov %%es,%0" : "=m" (p->thread.es));
488 asm("mov %%ds,%0" : "=m" (p->thread.ds));
490 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
491 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
492 if (!p->thread.io_bitmap_ptr) {
493 p->thread.io_bitmap_max = 0;
496 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
498 set_tsk_thread_flag(p, TIF_IO_BITMAP);
502 * Set a new TLS for the child thread?
504 if (clone_flags & CLONE_SETTLS) {
505 #ifdef CONFIG_IA32_EMULATION
506 if (test_thread_flag(TIF_IA32))
507 err = do_set_thread_area(p, -1,
508 (struct user_desc __user *)childregs->si, 0);
511 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
517 if (err && p->thread.io_bitmap_ptr) {
518 kfree(p->thread.io_bitmap_ptr);
519 p->thread.io_bitmap_max = 0;
525 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
527 asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
531 write_pda(oldrsp, new_sp);
532 regs->cs = __USER_CS;
533 regs->ss = __USER_DS;
537 EXPORT_SYMBOL_GPL(start_thread);
539 static void hard_disable_TSC(void)
541 write_cr4(read_cr4() | X86_CR4_TSD);
544 void disable_TSC(void)
547 if (!test_and_set_thread_flag(TIF_NOTSC))
549 * Must flip the CPU state synchronously with
550 * TIF_NOTSC in the current running context.
556 static void hard_enable_TSC(void)
558 write_cr4(read_cr4() & ~X86_CR4_TSD);
561 void enable_TSC(void)
564 if (test_and_clear_thread_flag(TIF_NOTSC))
566 * Must flip the CPU state synchronously with
567 * TIF_NOTSC in the current running context.
573 int get_tsc_mode(unsigned long adr)
577 if (test_thread_flag(TIF_NOTSC))
578 val = PR_TSC_SIGSEGV;
582 return put_user(val, (unsigned int __user *)adr);
585 int set_tsc_mode(unsigned int val)
587 if (val == PR_TSC_SIGSEGV)
589 else if (val == PR_TSC_ENABLE)
598 * This special macro can be used to load a debugging register
600 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
602 static inline void __switch_to_xtra(struct task_struct *prev_p,
603 struct task_struct *next_p,
604 struct tss_struct *tss)
606 struct thread_struct *prev, *next;
607 unsigned long debugctl;
609 prev = &prev_p->thread,
610 next = &next_p->thread;
612 debugctl = prev->debugctlmsr;
613 if (next->ds_area_msr != prev->ds_area_msr) {
614 /* we clear debugctl to make sure DS
615 * is not in use when we change it */
617 update_debugctlmsr(0);
618 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
621 if (next->debugctlmsr != debugctl)
622 update_debugctlmsr(next->debugctlmsr);
624 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
634 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
635 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
636 /* prev and next are different */
637 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
643 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
645 * Copy the relevant range of the IO bitmap.
646 * Normally this is 128 bytes or less:
648 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
649 max(prev->io_bitmap_max, next->io_bitmap_max));
650 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
652 * Clear any possible leftover bits:
654 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
658 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
659 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
661 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
662 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
667 * switch_to(x,y) should switch tasks from x to y.
669 * This could still be optimized:
670 * - fold all the options into a flag word and test it with a single test.
671 * - could test fs/gs bitsliced
673 * Kprobes not supported here. Set the probe on schedule instead.
676 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
678 struct thread_struct *prev = &prev_p->thread,
679 *next = &next_p->thread;
680 int cpu = smp_processor_id();
681 struct tss_struct *tss = &per_cpu(init_tss, cpu);
683 /* we're going to use this soon, after a few expensive things */
684 if (next_p->fpu_counter>5)
685 prefetch(&next->i387.fxsave);
688 * Reload esp0, LDT and the page table pointer:
694 * This won't pick up thread selector changes, but I guess that is ok.
696 asm volatile("mov %%es,%0" : "=m" (prev->es));
697 if (unlikely(next->es | prev->es))
698 loadsegment(es, next->es);
700 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
701 if (unlikely(next->ds | prev->ds))
702 loadsegment(ds, next->ds);
711 asm volatile("movl %%fs,%0" : "=r" (fsindex));
712 /* segment register != 0 always requires a reload.
713 also reload when it has changed.
714 when prev process used 64bit base always reload
715 to avoid an information leak. */
716 if (unlikely(fsindex | next->fsindex | prev->fs)) {
717 loadsegment(fs, next->fsindex);
718 /* check if the user used a selector != 0
719 * if yes clear 64bit base, since overloaded base
720 * is always mapped to the Null selector
725 /* when next process has a 64bit base use it */
727 wrmsrl(MSR_FS_BASE, next->fs);
728 prev->fsindex = fsindex;
732 asm volatile("movl %%gs,%0" : "=r" (gsindex));
733 if (unlikely(gsindex | next->gsindex | prev->gs)) {
734 load_gs_index(next->gsindex);
739 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
740 prev->gsindex = gsindex;
743 /* Must be after DS reload */
747 * Switch the PDA and FPU contexts.
749 prev->usersp = read_pda(oldrsp);
750 write_pda(oldrsp, next->usersp);
751 write_pda(pcurrent, next_p);
753 write_pda(kernelstack,
754 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
755 #ifdef CONFIG_CC_STACKPROTECTOR
756 write_pda(stack_canary, next_p->stack_canary);
758 * Build time only check to make sure the stack_canary is at
759 * offset 40 in the pda; this is a gcc ABI requirement
761 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
765 * Now maybe reload the debug registers and handle I/O bitmaps
767 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
768 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
769 __switch_to_xtra(prev_p, next_p, tss);
771 /* If the task has used fpu the last 5 timeslices, just do a full
772 * restore of the math state immediately to avoid the trap; the
773 * chances of needing FPU soon are obviously high now
775 if (next_p->fpu_counter>5)
776 math_state_restore();
781 * sys_execve() executes a new program.
784 long sys_execve(char __user *name, char __user * __user *argv,
785 char __user * __user *envp, struct pt_regs *regs)
790 filename = getname(name);
791 error = PTR_ERR(filename);
792 if (IS_ERR(filename))
794 error = do_execve(filename, argv, envp, regs);
799 void set_personality_64bit(void)
801 /* inherit personality from parent */
803 /* Make sure to be in 64bit mode */
804 clear_thread_flag(TIF_IA32);
806 /* TBD: overwrites user setup. Should have two bits.
807 But 64bit processes have always behaved this way,
808 so it's not too bad. The main problem is just that
809 32bit childs are affected again. */
810 current->personality &= ~READ_IMPLIES_EXEC;
813 asmlinkage long sys_fork(struct pt_regs *regs)
815 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
819 sys_clone(unsigned long clone_flags, unsigned long newsp,
820 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
824 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
828 * This is trivial, and on the face of it looks like it
829 * could equally well be done in user mode.
831 * Not so, for quite unobvious reasons - register pressure.
832 * In user mode vfork() cannot have a stack frame, and if
833 * done by calling the "clone()" system call directly, you
834 * do not have enough call-clobbered registers to hold all
835 * the information you need.
837 asmlinkage long sys_vfork(struct pt_regs *regs)
839 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
843 unsigned long get_wchan(struct task_struct *p)
849 if (!p || p == current || p->state==TASK_RUNNING)
851 stack = (unsigned long)task_stack_page(p);
852 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
854 fp = *(u64 *)(p->thread.sp);
856 if (fp < (unsigned long)stack ||
857 fp > (unsigned long)stack+THREAD_SIZE)
860 if (!in_sched_functions(ip))
863 } while (count++ < 16);
867 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
870 int doit = task == current;
875 if (addr >= TASK_SIZE_OF(task))
878 /* handle small bases via the GDT because that's faster to
880 if (addr <= 0xffffffff) {
881 set_32bit_tls(task, GS_TLS, addr);
883 load_TLS(&task->thread, cpu);
884 load_gs_index(GS_TLS_SEL);
886 task->thread.gsindex = GS_TLS_SEL;
889 task->thread.gsindex = 0;
890 task->thread.gs = addr;
893 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
899 /* Not strictly needed for fs, but do it for symmetry
901 if (addr >= TASK_SIZE_OF(task))
904 /* handle small bases via the GDT because that's faster to
906 if (addr <= 0xffffffff) {
907 set_32bit_tls(task, FS_TLS, addr);
909 load_TLS(&task->thread, cpu);
910 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
912 task->thread.fsindex = FS_TLS_SEL;
915 task->thread.fsindex = 0;
916 task->thread.fs = addr;
918 /* set the selector to 0 to not confuse
920 asm volatile("movl %0,%%fs" :: "r" (0));
921 ret = checking_wrmsrl(MSR_FS_BASE, addr);
928 if (task->thread.fsindex == FS_TLS_SEL)
929 base = read_32bit_tls(task, FS_TLS);
931 rdmsrl(MSR_FS_BASE, base);
933 base = task->thread.fs;
934 ret = put_user(base, (unsigned long __user *)addr);
940 if (task->thread.gsindex == GS_TLS_SEL)
941 base = read_32bit_tls(task, GS_TLS);
943 asm("movl %%gs,%0" : "=r" (gsindex));
945 rdmsrl(MSR_KERNEL_GS_BASE, base);
947 base = task->thread.gs;
950 base = task->thread.gs;
951 ret = put_user(base, (unsigned long __user *)addr);
963 long sys_arch_prctl(int code, unsigned long addr)
965 return do_arch_prctl(current, code, addr);
968 unsigned long arch_align_stack(unsigned long sp)
970 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
971 sp -= get_random_int() % 8192;
975 unsigned long arch_randomize_brk(struct mm_struct *mm)
977 unsigned long range_end = mm->brk + 0x02000000;
978 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;