2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
14 //#include <asm/asm-offsets.h>
15 #include <asm/thread_info.h>
16 #include <asm/processor-flags.h>
17 #include <asm/segment.h>
19 #include <xen/interface/xen.h>
24 Force an event check by making a hypercall,
25 but preserve regs before making the call.
31 call xen_force_evtchn_callback
38 We can't use sysexit directly, because we're not running in ring0.
39 But we can easily fake it up using iret. Assuming xen_sysexit
40 is jumped to with a standard stack frame, we can just strip it
41 back to a standard iret frame and use iret.
44 movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
45 orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
46 lea PT_EIP(%esp), %esp
52 This is run where a normal iret would be run, with the same stack setup:
57 This attempts to make sure that any pending events are dealt
58 with on return to usermode, but there is a small window in
59 which an event can happen just before entering usermode. If
60 the nested interrupt ends up setting one of the TIF_WORK_MASK
61 pending work flags, they will not be tested again before
62 returning to usermode. This means that a process can end up
63 with pending work, which will be unprocessed until the process
64 enters and leaves the kernel again, which could be an
65 unbounded amount of time. This means that a pending signal or
66 reschedule event could be indefinitely delayed.
68 The fix is to notice a nested interrupt in the critical
69 window, and if one occurs, then fold the nested interrupt into
70 the current interrupt stack frame, and re-process it
71 iteratively rather than recursively. This means that it will
72 exit via the normal path, and all pending work will be dealt with before returning to usermode.
75 Because the nested interrupt handler needs to deal with the
76 current stack state in whatever form it's in, we keep things
77 simple by only using a single register which is pushed/popped on its way through the fixup code
81 /* test eflags for special cases */
82 testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
86 ESP_OFFSET=4 # bytes pushed onto stack
88 /* Store vcpu_info pointer for easy access. Do it this
89 way to avoid having to reload %fs */
92 movl TI_cpu(%eax),%eax
93 movl __per_cpu_offset(,%eax,4),%eax
94 mov per_cpu__xen_vcpu(%eax),%eax
96 movl per_cpu__xen_vcpu, %eax
99 /* check IF state we're restoring */
100 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
102 /* Maybe enable events. Once this happens we could get a
103 recursive event, so the critical region starts immediately
104 afterwards. However, if that happens we don't end up
105 resuming the code, so we don't have to be worried about
106 being preempted to another CPU. */
107 setz XEN_vcpu_info_mask(%eax)
110 /* check for unmasked and pending */
111 cmpw $0x0001, XEN_vcpu_info_pending(%eax)
113 /* If there's something pending, mask events again so we
114 can jump back into xen_hypervisor_callback */
115 sete XEN_vcpu_info_mask(%eax)
119 /* From this point on the registers are restored and the stack
120 updated, so we don't need to worry about it if we're preempted */
123 /* Jump to hypervisor_callback after fixing up the stack.
124 Events are masked, so jumping out of the critical region is OK.
126 je xen_hypervisor_callback
130 .section __ex_table,"a"
136 /* put this out of line since it's very rarely used */
137 jmp hypercall_page + __HYPERVISOR_iret * 32
139 .globl xen_iret_start_crit, xen_iret_end_crit
142 This is called by xen_hypervisor_callback in entry.S when it sees
143 that the EIP at the time of interrupt was between xen_iret_start_crit
144 and xen_iret_end_crit. We're passed the EIP in %eax so we can do
145 a more refined determination of what to do.
147 The stack format at this point is:
149 ss : (ss/esp may be present if we came from usermode)
151 eflags } outer exception info
154 ---------------- <- edi (copy dest)
155 eax : outer eax if it hasn't been restored
157 eflags } nested exception info
158 cs } (no ss/esp because we're nested
159 eip } from the same ring)
160 orig_eax }<- esi (copy src)
170 In order to deliver the nested exception properly, we need to shift
171 everything from the return addr up to the error code so it
172 sits just under the outer exception info. This means that when we
173 handle the exception, we do it in the context of the outer exception
174 rather than starting a new one.
176 The only caveat is that if the outer eax hasn't been
177 restored yet (ie, it's still on stack), we need to insert
178 its value into the SAVE_ALL state before going on, since
179 it's usermode state which we eventually need to restore.
181 ENTRY(xen_iret_crit_fixup)
183 Paranoia: Make sure we're really coming from kernel space.
184 One could imagine a case where userspace jumps into the
185 critical range address, but just before the CPU delivers a GP,
186 it decides to deliver an interrupt instead. Unlikely?
187 Definitely. Easy to avoid? Yes. The Intel documents
188 explicitly say that the reported EIP for a bad jump is the
189 jump instruction itself, not the destination, but some virtual
190 environments get this wrong.
192 movl PT_CS(%esp), %ecx
193 andl $SEGMENT_RPL_MASK, %ecx
197 lea PT_ORIG_EAX(%esp), %esi
198 lea PT_EFLAGS(%esp), %edi
200 /* If eip is before iret_restore_end then stack
201 hasn't been restored yet. */
202 cmp $iret_restore_end, %eax
205 movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
206 movl %eax, PT_EAX(%esp)
208 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
210 /* set up the copy */
212 mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
216 lea 4(%edi),%esp /* point esp to new frame */