/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via top level device
 *   spinlock dev->ingress_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
{
        spin_lock_bh(&dev->queue_lock);
        spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock(&dev->ingress_lock);
        spin_unlock_bh(&dev->queue_lock);
}
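
/*
 * Usage sketch (illustrative only, not copied from this tree): code that
 * replaces the qdisc attached to a device -- e.g. dev_graft_qdisc() in
 * net/sched/sch_api.c -- runs with the rtnl mutex held and brackets the
 * pointer updates with the helpers above, per the locking rules described
 * at the top of this file:
 *
 *      ASSERT_RTNL();
 *      qdisc_lock_tree(dev);
 *      old = dev->qdisc_sleeping;
 *      dev->qdisc_sleeping = new;      new root for the next activation
 *      dev->qdisc = &noop_qdisc;       packets are dropped until
 *      qdisc_unlock_tree(dev);         dev_activate() swaps "new" in
 */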

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is held, the other must be free.
 */


/* Kick device.

   Returns:  0  - queue is empty or throttled.
            >0  - queue is not empty.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);

                dev->gso_skb = NULL;

                /*
                 * When the driver has LLTX set it does its own locking
                 * in start_xmit.  There is no need to add the overhead of
                 * locking again; these checks are worth it because even
                 * uncontended locks can be quite expensive.  The driver
                 * can likewise use a trylock, and on lock contention it
                 * should return NETDEV_TX_LOCKED so that the packet is
                 * requeued.
                 */
                if (!nolock) {
                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error,
                                   when hard_start_xmit() recurses. We detect
                                   it by checking the xmit owner and drop the
                                   packet when a dead loop is detected.
                                */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        goto out;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                }

                {
                        /* And release the queue lock while the driver transmits */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;

                                ret = dev_hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                netif_tx_unlock(dev);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        q = dev->qdisc;
                                        goto out;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        q = dev->qdisc;
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                netif_tx_unlock(dev);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* Device kicked us out :(
                   This is possible in four cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (e.g. dialout)
                   3. device is buggy (ppp)
                 */

requeue:
                if (unlikely(q == &noop_qdisc))
                        kfree_skb(skb);
                else if (skb->next)
                        /* Partially sent GSO segment list: stash it so the
                           next pass resumes with it before the qdisc. */
                        dev->gso_skb = skb;
                else
                        q->ops->requeue(skb, q);
                netif_schedule(dev);
        }
        return 0;

out:
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}
179
180 void __qdisc_run(struct net_device *dev)
181 {
182         do {
183                 if (!qdisc_restart(dev))
184                         break;
185         } while (!netif_queue_stopped(dev));
186
187         clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
188 }
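
/*
 * For reference, a rough sketch (not authoritative for this tree; the real
 * definition lives in include/net/pkt_sched.h) of the qdisc_run() wrapper
 * used by dev_queue_xmit() and net_tx_action().  It is the counterpart of
 * the clear_bit() above: only one CPU at a time may own
 * __LINK_STATE_QDISC_RUNNING and hence run __qdisc_run().
 *
 *      static inline void qdisc_run(struct net_device *dev)
 *      {
 *              if (!netif_queue_stopped(dev) &&
 *                  !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *                      __qdisc_run(dev);
 *      }
 */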

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer,
                               round_jiffies(jiffies + dev->watchdog_timeo)))
                        dev_hold(dev);
        }
}
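
/*
 * Driver-side sketch (illustrative; the mydrv_* name is hypothetical, the
 * net_device fields are real): a driver opts into the transmit watchdog by
 * installing a tx_timeout handler before register_netdev().  The timer is
 * then armed by __netdev_watchdog_up() from dev_activate()/netif_carrier_on().
 *
 *      dev->tx_timeout     = mydrv_tx_timeout;   called from dev_watchdog()
 *      dev->watchdog_timeo = 2 * HZ;             0 would default to 5*HZ above
 *      register_netdev(dev);
 */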

static void dev_watchdog_up(struct net_device *dev)
{
        __netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}
void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper: it simply drops every packet handed to it.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
};

static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}
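
/*
 * Worked example of the prio2band[] mapping above (TC_PRIO_* values come
 * from <linux/pkt_sched.h>; skb->priority is masked with TC_PRIO_MAX, so
 * only its low four bits are used):
 *
 *      TC_PRIO_CONTROL     (7)  ->  band 0  (dequeued first)
 *      TC_PRIO_INTERACTIVE (6)  ->  band 0
 *      TC_PRIO_BESTEFFORT  (0)  ->  band 1
 *      TC_PRIO_BULK        (2)  ->  band 2  (dequeued last)
 */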

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             =       "pfifo_fast",
        .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        /* err is already negative; negating it again would defeat IS_ERR() */
        return ERR_PTR(err);
}
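
/*
 * Worked example of the size/alignment arithmetic above, assuming the usual
 * QDISC_ALIGNTO of 32 from include/net/sch_generic.h and purely hypothetical
 * sizes, sizeof(*sch) == 140 and ops->priv_size == 48:
 *
 *      QDISC_ALIGN(140)     = 160      (rounded up to a multiple of 32)
 *      size = 160 + 48 + 31 = 239      (bytes passed to kzalloc())
 *
 * kzalloc() need not return a 32-byte-aligned pointer; QDISC_ALIGN() of that
 * address rounds it up to the next 32-byte boundary (the extra 31 bytes keep
 * the private data inside the allocation), and sch->padded records the offset
 * so that __qdisc_destroy() can kfree() the original pointer.
 */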

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
                                 unsigned int parentid)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;
        sch->stats_lock = &dev->queue_lock;
        sch->parent = parentid;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}
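
/*
 * Usage sketch (illustrative, modelled on what classful schedulers such as
 * prio do elsewhere in net/sched/): a parent qdisc creates a default child
 * per band with qdisc_create_dflt() and falls back to noop_qdisc when the
 * allocation fails:
 *
 *      child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
 *                                TC_H_MAKE(sch->handle, band + 1));
 *      if (child == NULL)
 *              child = &noop_qdisc;
 */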

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it. */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (qdisc->flags & TCQ_F_BUILTIN ||
            !atomic_dec_and_test(&qdisc->refcnt))
                return;

        list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);

        module_put(ops->owner);
        dev_put(qdisc->dev);
        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
        /* No queueing discipline is attached to the device;
           create a default one, i.e. pfifo_fast for devices
           which need queueing and noqueue_qdisc for virtual
           interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
                                                  TC_H_ROOT);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                dev->qdisc_sleeping = qdisc;
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;
        struct sk_buff *skb;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        skb = dev->gso_skb;
        dev->gso_skb = NULL;
        spin_unlock_bh(&dev->queue_lock);

        kfree_skb(skb);

        dev_watchdog_down(dev);

        /* Wait for outstanding dev_queue_xmit calls. */
        synchronize_rcu();

        /* Wait for outstanding qdisc_run calls. */
        while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                yield();
}
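
/*
 * For context, a rough and abridged sketch (not a verbatim copy) of the
 * transmit-path reader in net/core/dev.c that the two waits above
 * synchronize with: dev_queue_xmit() dereferences dev->qdisc inside an
 * rcu_read_lock_bh() section, which synchronize_rcu() flushes, and
 * qdisc_run() holds __LINK_STATE_QDISC_RUNNING, which the yield() loop
 * waits to see cleared.
 *
 *      rcu_read_lock_bh();
 *      q = rcu_dereference(dev->qdisc);
 *      if (q->enqueue) {
 *              spin_lock(&dev->queue_lock);
 *              rc = q->enqueue(skb, q);
 *              qdisc_run(dev);
 *              spin_unlock(&dev->queue_lock);
 *      }
 *      rcu_read_unlock_bh();
 */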

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);