]> pilppa.org Git - linux-2.6-omap-h63xx.git/blobdiff - kernel/hrtimer.c
ocfs2: fix regression in ocfs2_read_blocks_sync()
[linux-2.6-omap-h63xx.git] / kernel / hrtimer.c
index ae307feec74c2435c62f0ab6214aeecd33e6eca6..47e63349d1b2d262be2d8be568af8d3d93de3333 100644 (file)
@@ -664,25 +664,17 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 
                /* Timer is expired, act upon the callback mode */
                switch(timer->cb_mode) {
-               case HRTIMER_CB_IRQSAFE_NO_RESTART:
-                       debug_hrtimer_deactivate(timer);
-                       /*
-                        * We can call the callback from here. No restart
-                        * happens, so no danger of recursion
-                        */
-                       BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
-                       return 1;
-               case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+               case HRTIMER_CB_IRQSAFE_PERCPU:
+               case HRTIMER_CB_IRQSAFE_UNLOCKED:
                        /*
                         * This is solely for the sched tick emulation with
                         * dynamic tick support to ensure that we do not
                         * restart the tick right on the edge and end up with
                         * the tick timer in the softirq ! The calling site
-                        * takes care of this.
+                        * takes care of this. Also used for hrtimer sleeper !
                         */
                        debug_hrtimer_deactivate(timer);
                        return 1;
-               case HRTIMER_CB_IRQSAFE:
                case HRTIMER_CB_SOFTIRQ:
                        /*
                         * Move everything else into the softirq pending list !
@@ -945,9 +937,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 }
 
 /**
- * hrtimer_start - (re)start an relative timer on the current CPU
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
  * @timer:     the timer to be added
  * @tim:       expiry time
+ * @delta_ns:  "slack" range for the timer
  * @mode:      expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
  *
  * Returns:
@@ -955,7 +948,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
  *  1 when the timer was active
  */
 int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
+                       const enum hrtimer_mode mode)
 {
        struct hrtimer_clock_base *base, *new_base;
        unsigned long flags;
@@ -983,7 +977,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 #endif
        }
 
-       hrtimer_set_expires(timer, tim);
+       hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
        timer_stats_hrtimer_set_start_info(timer);
 
@@ -1016,8 +1010,26 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 
        return ret;
 }
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+/**
+ * hrtimer_start - (re)start an hrtimer on the current CPU
+ * @timer:     the timer to be added
+ * @tim:       expiry time
+ * @mode:      expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+int
+hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+       return hrtimer_start_range_ns(timer, tim, 0, mode);
+}
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
+
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
  * @timer:     hrtimer to stop
@@ -1188,6 +1200,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
                enum hrtimer_restart (*fn)(struct hrtimer *);
                struct hrtimer *timer;
                int restart;
+               int emulate_hardirq_ctx = 0;
 
                timer = list_entry(cpu_base->cb_pending.next,
                                   struct hrtimer, cb_entry);
@@ -1196,10 +1209,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
                timer_stats_account_hrtimer(timer);
 
                fn = timer->function;
+               /*
+                * A timer might have been added to the cb_pending list
+                * when it was migrated during a cpu-offline operation.
+                * Emulate hardirq context for such timers.
+                */
+               if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+                   timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
+                       emulate_hardirq_ctx = 1;
+
                __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
                spin_unlock_irq(&cpu_base->lock);
 
-               restart = fn(timer);
+               if (unlikely(emulate_hardirq_ctx)) {
+                       local_irq_disable();
+                       restart = fn(timer);
+                       local_irq_enable();
+               } else
+                       restart = fn(timer);
 
                spin_lock_irq(&cpu_base->lock);
 
@@ -1246,7 +1273,8 @@ static void __run_hrtimer(struct hrtimer *timer)
        timer_stats_account_hrtimer(timer);
 
        fn = timer->function;
-       if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+       if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+           timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
                /*
                 * Used for scheduler timers, avoid lock inversion with
                 * rq->lock and tasklist_lock.
@@ -1309,7 +1337,20 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
                        timer = rb_entry(node, struct hrtimer, node);
 
-                       if (basenow.tv64 < hrtimer_get_expires_tv64(timer)) {
+                       /*
+                        * The immediate goal for using the softexpires is
+                        * minimizing wakeups, not running timers at the
+                        * earliest interrupt after their soft expiration.
+                        * This allows us to avoid using a Priority Search
+                        * Tree, which can answer a stabbing querry for
+                        * overlapping intervals and instead use the simple
+                        * BST we already have.
+                        * We don't add extra wakeups by delaying timers that
+                        * are right-of a not yet expired timer, because that
+                        * timer will have to trigger a wakeup anyway.
+                        */
+
+                       if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
                                ktime_t expires;
 
                                expires = ktime_sub(hrtimer_get_expires(timer),
@@ -1348,6 +1389,30 @@ void hrtimer_interrupt(struct clock_event_device *dev)
                raise_softirq(HRTIMER_SOFTIRQ);
 }
 
+/**
+ * hrtimer_peek_ahead_timers -- run soft-expired timers now
+ *
+ * hrtimer_peek_ahead_timers will peek at the timer queue of
+ * the current cpu and check if there are any timers for which
+ * the soft expires time has passed. If any such timers exist,
+ * they are run immediately and then removed from the timer queue.
+ *
+ */
+void hrtimer_peek_ahead_timers(void)
+{
+       struct tick_device *td;
+       unsigned long flags;
+
+       if (!hrtimer_hres_active())
+               return;
+
+       local_irq_save(flags);
+       td = &__get_cpu_var(tick_cpu_device);
+       if (td && td->evtdev)
+               hrtimer_interrupt(td->evtdev);
+       local_irq_restore(flags);
+}
+
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
        run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
@@ -1402,9 +1467,7 @@ void hrtimer_run_queues(void)
                if (!base->first)
                        continue;
 
-               if (base->get_softirq_time)
-                       base->softirq_time = base->get_softirq_time();
-               else if (gettime) {
+               if (gettime) {
                        hrtimer_get_softirq_time(cpu_base);
                        gettime = 0;
                }
@@ -1454,7 +1517,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
        sl->timer.function = hrtimer_wakeup;
        sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-       sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 #endif
 }
 
@@ -1530,9 +1593,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
        struct restart_block *restart;
        struct hrtimer_sleeper t;
        int ret = 0;
+       unsigned long slack;
+
+       slack = current->timer_slack_ns;
+       if (rt_task(current))
+               slack = 0;
 
        hrtimer_init_on_stack(&t.timer, clockid, mode);
-       hrtimer_set_expires(&t.timer, timespec_to_ktime(*rqtp));
+       hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
        if (do_nanosleep(&t, mode))
                goto out;
 
@@ -1593,49 +1661,123 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-                               struct hrtimer_clock_base *new_base)
+static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+                               struct hrtimer_clock_base *new_base, int dcpu)
 {
        struct hrtimer *timer;
        struct rb_node *node;
+       int raise = 0;
 
        while ((node = rb_first(&old_base->active))) {
                timer = rb_entry(node, struct hrtimer, node);
                BUG_ON(hrtimer_callback_running(timer));
                debug_hrtimer_deactivate(timer);
-               __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
+
+               /*
+                * Should not happen. Per CPU timers should be
+                * canceled _before_ the migration code is called
+                */
+               if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+                       __remove_hrtimer(timer, old_base,
+                                        HRTIMER_STATE_INACTIVE, 0);
+                       WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+                            timer, timer->function, dcpu);
+                       continue;
+               }
+
+               /*
+                * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+                * timer could be seen as !active and just vanish away
+                * under us on another CPU
+                */
+               __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
                timer->base = new_base;
                /*
                 * Enqueue the timer. Allow reprogramming of the event device
                 */
                enqueue_hrtimer(timer, new_base, 1);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+               /*
+                * Happens with high res enabled when the timer was
+                * already expired and the callback mode is
+                * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+                * enqueue code does not move them to the soft irq
+                * pending list for performance/latency reasons, but
+                * in the migration state, we need to do that
+                * otherwise we end up with a stale timer.
+                */
+               if (timer->state == HRTIMER_STATE_MIGRATE) {
+                       timer->state = HRTIMER_STATE_PENDING;
+                       list_add_tail(&timer->cb_entry,
+                                     &new_base->cpu_base->cb_pending);
+                       raise = 1;
+               }
+#endif
+               /* Clear the migration state bit */
+               timer->state &= ~HRTIMER_STATE_MIGRATE;
+       }
+       return raise;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+                                  struct hrtimer_cpu_base *new_base)
+{
+       struct hrtimer *timer;
+       int raise = 0;
+
+       while (!list_empty(&old_base->cb_pending)) {
+               timer = list_entry(old_base->cb_pending.next,
+                                  struct hrtimer, cb_entry);
+
+               __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+               timer->base = &new_base->clock_base[timer->base->index];
+               list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+               raise = 1;
        }
+       return raise;
+}
+#else
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+                                  struct hrtimer_cpu_base *new_base)
+{
+       return 0;
 }
+#endif
 
 static void migrate_hrtimers(int cpu)
 {
        struct hrtimer_cpu_base *old_base, *new_base;
-       int i;
+       int i, raise = 0;
 
        BUG_ON(cpu_online(cpu));
        old_base = &per_cpu(hrtimer_bases, cpu);
        new_base = &get_cpu_var(hrtimer_bases);
 
        tick_cancel_sched_timer(cpu);
-
-       local_irq_disable();
-       spin_lock(&new_base->lock);
+       /*
+        * The caller is globally serialized and nobody else
+        * takes two locks at once, deadlock is not possible.
+        */
+       spin_lock_irq(&new_base->lock);
        spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-               migrate_hrtimer_list(&old_base->clock_base[i],
-                                    &new_base->clock_base[i]);
+               if (migrate_hrtimer_list(&old_base->clock_base[i],
+                                        &new_base->clock_base[i], cpu))
+                       raise = 1;
        }
 
+       if (migrate_hrtimer_pending(old_base, new_base))
+               raise = 1;
+
        spin_unlock(&old_base->lock);
-       spin_unlock(&new_base->lock);
-       local_irq_enable();
+       spin_unlock_irq(&new_base->lock);
        put_cpu_var(hrtimer_bases);
+
+       if (raise)
+               hrtimer_raise_softirq();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
@@ -1681,14 +1823,20 @@ void __init hrtimers_init(void)
 }
 
 /**
- * schedule_hrtimeout - sleep until timeout
+ * schedule_hrtimeout_range - sleep until timeout
  * @expires:   timeout value (ktime_t)
+ * @delta:     slack in expires timeout (ktime_t)
  * @mode:      timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
  *
  * Make the current task sleep until the given expiry time has
  * elapsed. The routine will return immediately unless
  * the current task state has been set (see set_current_state()).
  *
+ * The @delta argument gives the kernel the freedom to schedule the
+ * actual wakeup to a time that is both power and performance friendly.
+ * The kernel give the normal best effort behavior for "@expires+@delta",
+ * but may decide to fire the timer earlier, but no earlier than @expires.
+ *
  * You can set the task state as follows -
  *
  * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
@@ -1702,7 +1850,7 @@ void __init hrtimers_init(void)
  *
  * Returns 0 when the timer has expired otherwise -EINTR
  */
-int __sched schedule_hrtimeout(ktime_t *expires,
+int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
                               const enum hrtimer_mode mode)
 {
        struct hrtimer_sleeper t;
@@ -1726,7 +1874,7 @@ int __sched schedule_hrtimeout(ktime_t *expires,
        }
 
        hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
-       hrtimer_set_expires(&t.timer, *expires);
+       hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
 
        hrtimer_init_sleeper(&t, current);
 
@@ -1744,4 +1892,33 @@ int __sched schedule_hrtimeout(ktime_t *expires,
 
        return !t.task ? 0 : -EINTR;
 }
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires:   timeout value (ktime_t)
+ * @mode:      timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+                              const enum hrtimer_mode mode)
+{
+       return schedule_hrtimeout_range(expires, 0, mode);
+}
 EXPORT_SYMBOL_GPL(schedule_hrtimeout);