/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * CPU hotplug support - ashok.raj@intel.com
 *
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/proto.h>
asmlinkage extern void ret_from_fork(void);
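
/*
 * Default clone flags for kernel_thread(): the new thread shares the
 * kernel address space and is not automatically traced even when the
 * caller itself is being ptraced.
 */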
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power-management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
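
/*
 * Per-CPU flag used by cpu_idle_wait(): it is set to 1 for every online
 * CPU and cleared again by that CPU the next time it passes through its
 * idle loop, which tells cpu_idle_wait() the old idle routine was left.
 */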
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
	atomic_notifier_chain_register(&idle_notifier, n);

	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);

static void __exit_idle(void)
	if (test_and_clear_bit_pda(0, isidle) == 0)
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);

/* Called from interrupts to signify idle end */

	/* idle loop has pid 0 */

/*
 * We use this if we don't have any better idle routine.
 */
void default_idle(void)
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we test
	 * NEED_RESCHED:
	 */
	if (!need_resched()) {
		t0n = ktime_to_ns(t0);
		safe_halt();	/* enables interrupts racelessly */
		t1n = ktime_to_ns(t1);
		sched_clock_idle_wakeup_event(t1n - t0n);
	current_thread_info()->status |= TS_POLLING;

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
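/* Selected with the "idle=poll" boot parameter; see idle_setup() below. */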
static void poll_idle(void)

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
	__get_cpu_var(cpu_state) = CPU_DEAD;

static inline void play_dead(void)
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
		tick_nohz_stop_sched_tick();
		while (!need_resched()) {
			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			if (cpu_is_offline(smp_processor_id()))
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
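
/*
 * do_nothing() is an empty IPI callback: cpu_idle_wait() below uses it
 * purely to kick CPUs out of whatever idle routine they are sitting in.
 */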
static void do_nothing(void *unused)

void cpu_idle_wait(void)
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));

	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;

	__get_cpu_var(cpu_idle_state) = 0;

		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
		cpus_and(map, map, cpu_online_map);
		/*
		 * We waited 1 sec; if a CPU still did not call idle,
		 * it may be because it is in idle and not waking up
		 * because it has nothing to do.
		 * Give all the remaining CPUs a kick.
		 */
		smp_call_function_mask(map, do_nothing, 0, 0);
	} while (!cpus_empty(map));

	set_cpus_allowed(current, tmp);
EXPORT_SYMBOL_GPL(cpu_idle_wait);

/*
 * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate the IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
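		/*
		 * The monitored address is the thread flags word, so a remote
		 * write that sets TIF_NEED_RESCHED wakes the subsequent mwait
		 * without needing an IPI.
		 */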

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);

static int mwait_usable(const struct cpuinfo_x86 *c)
	/* Any C1 states supported? */
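	/* CPUID leaf 5 EDX bits 7:4 give the number of C1 MWAIT sub-states. */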
	return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
		/*
		 * Skip if setup has overridden idle.
		 * One CPU supports mwait => all CPUs support mwait.
		 */
		printk(KERN_INFO "using mwait in idle threads.\n");
		pm_idle = mwait_idle;

static int __init idle_setup(char *str)
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
	} else if (!strcmp(str, "mwait"))

	boot_option_idle_override = 1;

early_param("idle", idle_setup);

/* Also prints some state that isn't saved in the pt_regs. */
void __show_regs(struct pt_regs *regs)
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
		regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
		regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
		regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
		regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
		regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
		fs, fsindex, gs, gsindex, shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);

void show_regs(struct pt_regs *regs)
	printk("CPU %d:", smp_processor_id());
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
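		/* A set bit in the I/O bitmap means access to that port is denied. */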
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;

void flush_thread(void)
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */

void release_thread(struct task_struct *dead_task)
	if (dead_task->mm->context.size) {
		printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
			dead_task->mm->context.ldt,
			dead_task->mm->context.size);

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
	struct user_desc ud = {
	struct desc_struct *desc = (void *)t->thread.tls_array;

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
	return get_desc_base(&t->thread.tls_array[tls]);

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)

int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;

	childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));
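
	/* Inherit the parent's I/O permission bitmap, if it has one. */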
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
		set_tsk_thread_flag(p, TIF_IO_BITMAP);

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);

	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);

	if (next->debugctlmsr != debugctl)
		wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
			max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);

	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */

	/*
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	asm volatile("movl %%fs,%0" : "=r" (fsindex));
	/* A segment register value != 0 always requires a reload; also
	   reload when it has changed. When the previous process used a
	   64-bit base, always reload to avoid an information leak. */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/* check if the user used a selector != 0;
		 * if yes, clear the 64-bit base, since an overloaded base
		 * is always mapped to the NULL selector
		 */

	/* when the next process has a 64-bit base, use it */
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	asm volatile("movl %%gs,%0" : "=r" (gsindex));
	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);

		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now.
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();

/*
 * sys_execve() executes a new program.
 */
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
	error = do_execve(filename, argv, envp, &regs);

void set_personality_64bit(void)
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;

asmlinkage long sys_fork(struct pt_regs *regs)
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);

sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
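
/*
 * get_wchan() reports where (in kernel text) a blocked task is sleeping:
 * it walks the task's saved frame pointers until it finds a return
 * address outside the scheduler functions.
 */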
unsigned long get_wchan(struct task_struct *p)
	if (!p || p == current || p->state == TASK_RUNNING)
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack + THREAD_SIZE)
	fp = *(u64 *)(p->thread.sp);
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack + THREAD_SIZE)
		if (!in_sched_functions(ip))
	} while (count++ < 16);
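
/*
 * do_arch_prctl() implements ARCH_SET_FS/ARCH_SET_GS and
 * ARCH_GET_FS/ARCH_GET_GS: it reads or writes the 64-bit FS/GS base of a
 * task, either through a GDT slot (for bases below 4GB) or via the
 * FS_BASE/KERNEL_GS_BASE MSRs.
 */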
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
	int doit = task == current;

		if (addr >= TASK_SIZE_OF(task))

		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gsindex = 0;
			task->thread.gs = addr;
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);

		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))

		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fsindex = 0;
			task->thread.fs = addr;
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
			rdmsrl(MSR_FS_BASE, base);
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
			asm("movl %%gs,%0" : "=r" (gsindex));
			rdmsrl(MSR_KERNEL_GS_BASE, base);
			base = task->thread.gs;
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);

long sys_arch_prctl(int code, unsigned long addr)
	return do_arch_prctl(current, code, addr);
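
/*
 * Randomize the top of the process stack slightly (within an 8KB window)
 * unless the task's personality or the randomize_va_space sysctl disables
 * address-space randomization.
 */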
unsigned long arch_align_stack(unsigned long sp)
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
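
/* Place the heap break at a random offset within 32MB above mm->brk. */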
unsigned long arch_randomize_brk(struct mm_struct *mm)
	unsigned long range_end = mm->brk + 0x02000000;

	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;