@  r2  = faulted PC+4
 @  r9  = successful return
 @  r10 = vfp_state union
+@  r11 = CPU number
 @  lr  = failure return
 
        .globl  vfp_support_entry
        DBGSTR1 "enable %x", r10
        ldr     r3, last_VFP_context_address
        orr     r1, r1, #FPEXC_ENABLE   @ user FPEXC has the enable bit set
-       ldr     r4, [r3]                @ last_VFP_context pointer
+       ldr     r4, [r3, r11, lsl #2]   @ last_VFP_context pointer
        bic     r5, r1, #FPEXC_EXCEPTION @ make sure exceptions are disabled
        cmp     r4, r10
        beq     check_for_exception     @ we are returning to the same
                                        @ exceptions, so we can get at the
                                        @ rest of it
 
+#ifndef CONFIG_SMP
        @ Save out the current registers to the old thread state
+       @ No need for SMP since this is not done lazily
 
        DBGSTR1 "save old state %p", r4
        cmp     r4, #0
        stmia   r4, {r1, r5, r6, r8}    @ save FPEXC, FPSCR, FPINST, FPINST2
                                        @ and point r4 at the word at the
                                        @ start of the register dump
+#endif
 
 no_old_VFP_process:
        DBGSTR1 "load state %p", r10
-       str     r10, [r3]               @ update the last_VFP_context pointer
+       str     r10, [r3, r11, lsl #2]  @ update the last_VFP_context pointer
                                        @ Load the saved state back into the VFP
        VFPFLDMIA r10                   @ reload the working registers while
                                        @ FPEXC is in a safe state
                                        @ required. If not, the user code will
                                        @ retry the faulted instruction
 
+#ifdef CONFIG_SMP
+       .globl  vfp_save_state
+       .type   vfp_save_state, %function
+vfp_save_state:
+       @ Save the current VFP state: working registers plus FPEXC, FPSCR, FPINST, FPINST2
+       @ r0 - save location (the C caller in this patch passes a union vfp_state pointer)
+       @ r1 - FPEXC value supplied by the caller; tested for FPEXC_FPV2 and stored as given
+       DBGSTR1 "save VFP state %p", r0
+       VFPFMRX r2, FPSCR               @ r2 = current status (FPSCR)
+       VFPFMRX r3, FPINST              @ r3 = FPINST (always there, rev0 onwards)
+       tst     r1, #FPEXC_FPV2         @ is there an FPINST2 to read?
+       VFPFMRX r12, FPINST2, NE        @ r12 = FPINST2 if needed - avoids reading
+                                       @ nonexistent reg on rev0; if skipped, the stmia below stores r12 as found
+       VFPFSTMIA r0                    @ save the working registers (NOTE(review): macro may advance r0 - confirm)
+       stmia   r0, {r1, r2, r3, r12}   @ save FPEXC, FPSCR, FPINST, FPINST2 starting at r0 (no writeback)
+       mov     pc, lr                  @ return to caller; r2, r3 and possibly r12 have been overwritten
+#endif
+
 last_VFP_context_address:
        .word   last_VFP_context
 
 
 void vfp_support_entry(void);
 
 void (*vfp_vector)(void) = vfp_testing_entry;
-union vfp_state *last_VFP_context;
+union vfp_state *last_VFP_context[NR_CPUS];
 
 /*
  * Dual-use variable.
 {
        struct thread_info *thread = v;
        union vfp_state *vfp;
+       __u32 cpu = thread->cpu;
 
        if (likely(cmd == THREAD_NOTIFY_SWITCH)) {
+               u32 fpexc = fmrx(FPEXC);
+
+#ifdef CONFIG_SMP
+               /*
+                * On SMP, if VFP is enabled, save the old state in
+                * case the thread migrates to a different CPU. The
+                * restoring is done lazily.
+                */
+               if ((fpexc & FPEXC_ENABLE) && last_VFP_context[cpu]) {
+                       vfp_save_state(last_VFP_context[cpu], fpexc);
+                       last_VFP_context[cpu]->hard.cpu = cpu;
+               }
+               /*
+                * Thread migration, just force the reloading of the
+                * state on the new CPU in case the VFP registers
+                * contain stale data.
+                */
+               if (thread->vfpstate.hard.cpu != cpu)
+                       last_VFP_context[cpu] = NULL;
+#endif
+
                /*
                 * Always disable VFP so we can lazily save/restore the
                 * old state.
                 */
-               fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_ENABLE);
+               fmxr(FPEXC, fpexc & ~FPEXC_ENABLE);
                return NOTIFY_DONE;
        }
 
        }
 
        /* flush and release case: Per-thread VFP cleanup. */
-       if (last_VFP_context == vfp)
-               last_VFP_context = NULL;
+       if (last_VFP_context[cpu] == vfp)
+               last_VFP_context[cpu] = NULL;
 
        return NOTIFY_DONE;
 }