/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work that is common to all
   qdiscs, and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.
 */
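/*
 * Illustrative aside (not part of the original file): the handles
 * mentioned above are 32-bit values split into a 16-bit major and a
 * 16-bit minor number, manipulated with the TC_H_* macros from
 * <linux/pkt_sched.h>. A minimal sketch of the encoding:
 *
 *	u32 h   = TC_H_MAKE(0x8001U << 16, 0);	// qdisc handle 8001:0
 *	u32 maj = TC_H_MAJ(h);			// 0x80010000 (still shifted)
 *	u32 min = TC_H_MIN(h);			// 0
 *
 * so "1:10" in tc(8) notation corresponds to TC_H_MAKE(0x10000U, 0x10).
 */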
/*
   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:

   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   ---requeue

   requeues a once-dequeued packet. It is used by non-standard or
   just buggy devices, which can defer output even when
   netif_queue_stopped() == 0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers and counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
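/*
 * Illustrative sketch (not part of the original file): how a simple
 * leaf qdisc module typically wires these routines together. The
 * example_* names and EXAMPLE_LIMIT are hypothetical; real modules
 * such as sch_fifo.c follow the same shape.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (sch->q.qlen < EXAMPLE_LIMIT)
 *			return qdisc_enqueue_tail(skb, sch);
 *		return qdisc_drop(skb, sch);	// yields NET_XMIT_DROP
 *	}
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= qdisc_dequeue_head,
 *		.requeue	= qdisc_requeue,
 *		.drop		= qdisc_queue_drop,
 *		.owner		= THIS_MODULE,
 *	};
 */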
/* Protects list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;
/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);
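/*
 * Illustrative sketch (not part of the original file): a scheduler
 * module registers its ops on load and unregisters them on unload;
 * example_qdisc_ops is the hypothetical structure sketched earlier.
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 */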
int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		struct Qdisc *q, *txq_root = txq->qdisc;

		if (!(txq_root->flags & TCQ_F_BUILTIN) &&
		    txq_root->handle == handle)
			return txq_root;

		list_for_each_entry(q, &txq_root->list, list) {
			if (q->handle == handle)
				return q;
		}
	}
	return NULL;
}
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}
/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}
static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
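/*
 * Illustrative sketch (not part of the original file): a shaping
 * qdisc looks its rate table up in ->init()/->change() and drops the
 * reference in ->destroy(); the table itself stays shared and
 * refcounted. TCA_EXAMPLE_RTAB is a hypothetical attribute name.
 *
 *	rtab = qdisc_get_rtab(&ratespec, tb[TCA_EXAMPLE_RTAB]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *	...
 *	qdisc_put_rtab(rtab);	// later, in ->destroy()
 */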
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	smp_wmb();
	__netif_schedule(wd->qdisc);

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);
void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_US2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
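/*
 * Illustrative sketch (not part of the original file): a rate-limiting
 * qdisc uses the watchdog to get rescheduled once the next packet may
 * be sent; 'q' is the hypothetical private data of the qdisc.
 *
 *	// in ->init():
 *	qdisc_watchdog_init(&q->watchdog, sch);
 *
 *	// in ->dequeue(), when the head packet is not yet due:
 *	qdisc_watchdog_schedule(&q->watchdog, next_send_time);
 *	return NULL;	// throttled, not empty
 *
 *	// in ->reset()/->destroy():
 *	qdisc_watchdog_cancel(&q->watchdog);
 */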
struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
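/*
 * Illustrative sketch (not part of the original file): the expected
 * lifecycle of a Qdisc_class_hash inside a classful qdisc; 'q' and
 * 'cl' are the hypothetical private data and class of that qdisc.
 *
 *	// ->init():
 *	err = qdisc_class_hash_init(&q->clhash);
 *
 *	// creating a class (->change()):
 *	cl->common.classid = classid;
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	qdisc_class_hash_grow(sch, &q->clhash);	// rehash if loaded
 *
 *	// deleting a class:
 *	qdisc_class_hash_remove(&q->clhash, &cl->common);
 *
 *	// ->destroy():
 *	qdisc_class_hash_destroy(&q->clhash);
 */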
/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
/* Attach toplevel qdisc to device queue. */

static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	spinlock_t *root_lock;
	struct Qdisc *oqdisc;
	int ingress;

	ingress = 0;
	if (qdisc && qdisc->flags&TCQ_F_INGRESS)
		ingress = 1;

	if (ingress) {
		oqdisc = dev_queue->qdisc;
	} else {
		oqdisc = dev_queue->qdisc_sleeping;
	}

	root_lock = qdisc_root_lock(oqdisc);
	spin_lock_bh(root_lock);

	if (ingress) {
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev_queue->qdisc = NULL;
		} else {  /* new */
			dev_queue->qdisc = qdisc;
		}

	} else {
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev_queue->qdisc_sleeping = qdisc;
		dev_queue->qdisc = &noop_qdisc;
	}

	spin_unlock_bh(root_lock);

	return oqdisc;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
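/*
 * Illustrative sketch (not part of the original file): a child qdisc
 * that drops packets behind its parent's back (e.g. from a timer)
 * must tell its ancestors so their qlen stays consistent;
 * example_purge_expired() is a hypothetical helper.
 *
 *	dropped = example_purge_expired(sch);
 *	qdisc_tree_decrease_qlen(sch, dropped);
 */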
static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old) {
		spin_lock_bh(&old->q.lock);
		qdisc_destroy(old);
		spin_unlock_bh(&old->q.lock);
	}
}
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if (q && q->flags & TCQ_F_INGRESS) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			if (ingress) {
				old = dev_graft_qdisc(dev_queue, q);
			} else {
				old = dev_graft_qdisc(dev_queue, new);
				if (new && i > 0)
					atomic_inc(&new->refcnt);
			}
			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}
/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_RATE]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						qdisc_root_lock(sch),
						tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
		if (parent)
			list_add_tail(&sch->list, &dev_queue->qdisc->list);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	if (tca[TCA_OPTIONS]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}
	if (tca[TCA_RATE])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_lock(sch), tca[TCA_RATE]);
	return 0;
}
struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a
				 *   choice: either to change it or to
				 *   create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted
				 *   to say that the qdisc tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are
				 *   set. Alas, it is sort of a hole in the
				 *   API; we cannot decide what to do
				 *   unambiguously. For now we select
				 *   create/graft if the user gave a KIND
				 *   which does not match the existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
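				/*
				 * Illustrative note (not part of the
				 * original file): with iproute2's tc(8),
				 * "tc qdisc add" sets NLM_F_CREATE|NLM_F_EXCL,
				 * "tc qdisc replace" sets
				 * NLM_F_CREATE|NLM_F_REPLACE, and
				 * "tc qdisc change" sets no flags, so the
				 * cases 1-3 above map onto those commands.
				 */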
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		spinlock_t *root_lock;

		err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
		if (err) {
			if (q) {
				root_lock = qdisc_root_lock(q);
				spin_lock_bh(root_lock);
				qdisc_destroy(q);
				spin_unlock_bh(root_lock);
			}
			return err;
		}
	}

	return 0;
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in the hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
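	/*
	 * Illustrative note (not part of the original file): for
	 * "tc class add dev eth0 parent 1:1 classid 1:10 ..." the
	 * request arrives with tcm_parent == 1:1 (pid above) and
	 * tcm_handle == 1:10 (clid above), so qid resolves to 1:0.
	 */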
	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);
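/*
 * Illustrative sketch (not part of the original file): a classful
 * qdisc typically resolves the target class in its ->enqueue() like
 * this; 'q->filter_list' and 'struct example_class' are hypothetical.
 *
 *	struct tcf_result res;
 *
 *	if (tc_classify(skb, q->filter_list, &res) >= 0)
 *		cl = (struct example_class *)res.class;	// or look up res.classid
 */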
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk("rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif
static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);