/*
 * net/sched/sch_htb.c  Hierarchical token bucket, feed tree version
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors: Martin Devera, <devik@cdi.cz>
 *
 * Credits (in time order) for older HTB versions:
 *     Stef Coene <stef.coene@docum.org>
 *         HTB support at LARTC mailing list
 *     Ondrej Kraus, <krauso@barr.cz>
 *         found missing INIT_QDISC(htb)
 *     Vladimir Smelhaus, Aamer Akhter, Bert Hubert
 *         helped a lot to locate nasty class stall bug
 *     Andi Kleen, Jamal Hadi, Bert Hubert
 *         code review and helpful comments on shaping
 *     Tomasz Wrona, <tw@eter.tym.pl>
 *         created test case so that I was able to fix nasty bug
 *     spotted bug in dequeue code and helped with fix
 *     fixed requeue routine
 *     and many others. thanks.
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/*
 * ========================================================================
 * HTB is like TBF with multiple classes. It is also similar to CBQ because
 * it allows assigning a priority to each class in the hierarchy.
 * In fact it is another implementation of Floyd's formal sharing.
 *
 * Each class is assigned a level. A leaf always has level 0 and root
 * classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
 * one less than their parent.
 */
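/*
 * Purely as an illustration of the hierarchy described above (the interface
 * name and the rates below are made up for the example, not mandated by this
 * file), a two-level setup is typically created from user space with tc:
 *
 *     tc qdisc add dev eth0 root handle 1: htb default 20
 *     tc class add dev eth0 parent 1:  classid 1:1  htb rate 1mbit ceil 1mbit
 *     tc class add dev eth0 parent 1:1 classid 1:10 htb rate 600kbit ceil 1mbit
 *     tc class add dev eth0 parent 1:1 classid 1:20 htb rate 400kbit ceil 1mbit
 *
 * Here 1:10 and 1:20 are leaves (level 0), 1:1 is their parent, and packets
 * matched by no filter fall through to the default class 1:20.
 */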
static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
#define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */

#if HTB_VER >> 16 != TC_HTB_PROTOVER
#error "Mismatched sch_htb.c and pkt_sch.h"
#endif

/* Module parameter and sysfs export */
module_param(htb_hysteresis, int, 0640);
MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
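/* For example, the parameter can be set when the module is loaded,
 * "modprobe sch_htb htb_hysteresis=1", or toggled at runtime through
 * /sys/module/sch_htb/parameters/htb_hysteresis (the 0640 mode above).
 */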
/* used internally to keep the status of a single class */
enum htb_cmode {
    HTB_CANT_SEND,		/* class can't send and can't borrow */
    HTB_MAY_BORROW,		/* class can't send but may borrow */
    HTB_CAN_SEND		/* class can send */
};
/* interior & leaf nodes; props specific to leaves are marked L: */
struct htb_class {
    struct Qdisc_class_common common;
    /* general class parameters */
    struct gnet_stats_basic bstats;
    struct gnet_stats_queue qstats;
    struct gnet_stats_rate_est rate_est;
    struct tc_htb_xstats xstats;	/* our special stats */
    int refcnt;				/* usage count of this class */

    int level;				/* our level (see above) */
    unsigned int children;
    struct htb_class *parent;		/* parent class */

    union {
        struct htb_class_leaf {
            struct Qdisc *q;
            int prio;
            int quantum;
            int deficit[TC_HTB_MAXDEPTH];
            struct list_head drop_list;
        } leaf;
        struct htb_class_inner {
            struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
            struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
            /* When class changes from state 1->2 and disconnects from
               parent's feed then we lost ptr value and start from the
               first child again. Here we store classid of the
               last valid ptr (used when ptr is NULL). */
            u32 last_ptr_id[TC_HTB_NUMPRIO];
        } inner;
    } un;
    struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
    struct rb_node pq_node;			/* node for event queue */
    psched_time_t pq_key;

    int prio_activity;		/* for which prios are we active */
    enum htb_cmode cmode;	/* current mode of the class */

    /* class attached filters */
    struct tcf_proto *filter_list;
    int filter_cnt;

    int warned;			/* only one warning about non-work-conserving .. */

    /* token bucket parameters */
    struct qdisc_rate_table *rate;	/* rate table of the class itself */
    struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
    long buffer, cbuffer;		/* token bucket depth/rate */
    psched_tdiff_t mbuffer;		/* max wait time */
    long tokens, ctokens;		/* current number of tokens */
    psched_time_t t_c;			/* checkpoint time */

    int prio;			/* backup of un.leaf.prio for parent-to-leaf conversion */
    int quantum;		/* backup of un.leaf.quantum; eventually the un.leaf
                                   originals should be fully replaced */
};
static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
                       int size)
{
    long result = qdisc_l2t(rate, size);
    return result;
}
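/* L2T ("length to time") looks the packet size up in the precomputed rate
 * table and returns how long, in psched ticks, it takes to transmit that many
 * bytes at the table's rate; this is the amount later subtracted from the
 * token buckets in htb_charge_class().
 */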
struct htb_sched {
    struct Qdisc_class_hash clhash;
    struct list_head drops[TC_HTB_NUMPRIO];	/* active leaves (for drops) */

    /* self list - roots of the self generating tree */
    struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
    int row_mask[TC_HTB_MAXDEPTH];
    struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
    u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];

    /* self wait list - roots of wait PQs per row */
    struct rb_root wait_pq[TC_HTB_MAXDEPTH];

    /* time of nearest event per level (row) */
    psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];

    /* whether we hit a non-work-conserving class during this dequeue; we use
       this to disable the mindelay complaint in dequeue */
    int nwc_hit;

    int defcls;			/* class where unclassified flows go to */

    /* filters for qdisc itself */
    struct tcf_proto *filter_list;

    int rate2quantum;		/* quant = rate / rate2quantum */
    psched_time_t now;		/* cached dequeue time */
    struct qdisc_watchdog watchdog;

    /* non-shaped skbs; let them go directly thru */
    struct sk_buff_head direct_queue;
    int direct_qlen;		/* max qlen of above */
    int direct_pkts;		/* count of skbs sent via the direct queue */
};
/* find class in global hash table using given handle */
static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
{
    struct htb_sched *q = qdisc_priv(sch);
    struct Qdisc_class_common *clc;

    clc = qdisc_class_find(&q->clhash, handle);
    if (clc == NULL)
        return NULL;
    return container_of(clc, struct htb_class, common);
}
/**
 * htb_classify - classify a packet into a class
 *
 * It returns NULL if the packet should be dropped or -1 if the packet
 * should be passed directly thru. In all other cases a leaf class is returned.
 * We allow direct class selection by classid in skb->priority. Then we examine
 * filters in the qdisc and in inner nodes (if a higher filter points to the
 * inner node). If we end up with classid MAJOR:0 we enqueue the skb into the
 * special internal fifo (direct). These packets then go directly thru. If we
 * still have no valid leaf we try to use the MAJOR:default leaf. If that is
 * still unsuccessful, finish and return the direct queue.
 */
#define HTB_DIRECT ((struct htb_class *)-1)
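/* HTB_DIRECT is a sentinel pointer value that is never dereferenced; it is
 * what htb_classify() returns to tell htb_enqueue() to bypass all classes
 * and use q->direct_queue instead.
 */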
static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,

    struct htb_sched *q = qdisc_priv(sch);
    struct htb_class *cl;
    struct tcf_result res;
    struct tcf_proto *tcf;

    /* allow selecting a class by setting skb->priority to a valid classid;
       note that nfmark can be used too by attaching filter fw with no
    if (skb->priority == sch->handle)
        return HTB_DIRECT;	/* X:0 (direct flow) selected */
    if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)

    *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
    tcf = q->filter_list;
    while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;

        if ((cl = (void *)res.class) == NULL) {
            if (res.classid == sch->handle)
                return HTB_DIRECT;	/* X:0 (direct flow) */
            if ((cl = htb_find(res.classid, sch)) == NULL)
                break;	/* filter selected an invalid classid */

            return cl;	/* we hit a leaf; return it */

        /* we have got an inner class; apply the inner filter chain */
        tcf = cl->filter_list;

    /* classification failed; try to use the default class */
    cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
    if (!cl || cl->level)
        return HTB_DIRECT;	/* bad default .. this is a safe bet */
/**
 * htb_add_to_id_tree - adds class to the round robin list
 *
 * Routine adds class to the list (actually tree) sorted by classid.
 * Make sure that class is not already on such a list for the given prio.
 */
static void htb_add_to_id_tree(struct rb_root *root,
                               struct htb_class *cl, int prio)
{
    struct rb_node **p = &root->rb_node, *parent = NULL;

        c = rb_entry(parent, struct htb_class, node[prio]);

        if (cl->common.classid > c->common.classid)
            p = &parent->rb_right;
        else
            p = &parent->rb_left;

    rb_link_node(&cl->node[prio], parent, p);
    rb_insert_color(&cl->node[prio], root);
}
/**
 * htb_add_to_wait_tree - adds class to the event queue with delay
 *
 * The class is added to the priority event queue to indicate that the class
 * will change its mode in cl->pq_key microseconds. Make sure that the class
 * is not already in the queue.
 */
static void htb_add_to_wait_tree(struct htb_sched *q,
                                 struct htb_class *cl, long delay)
{
    struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;

    cl->pq_key = q->now + delay;
    if (cl->pq_key == q->now)
        cl->pq_key++;

    /* update the nearest event cache */
    if (q->near_ev_cache[cl->level] > cl->pq_key)
        q->near_ev_cache[cl->level] = cl->pq_key;

        c = rb_entry(parent, struct htb_class, pq_node);
        if (cl->pq_key >= c->pq_key)
            p = &parent->rb_right;
        else
            p = &parent->rb_left;

    rb_link_node(&cl->pq_node, parent, p);
    rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
}
/**
 * htb_next_rb_node - finds next node in binary tree
 *
 * When we are past the last key we return NULL.
 * Average complexity is 2 steps per call.
 */
static inline void htb_next_rb_node(struct rb_node **n)
{
    *n = rb_next(*n);
}

/**
 * htb_add_class_to_row - add class to its row
 *
 * The class is added to the row at the priorities marked in mask.
 * It does nothing if mask == 0.
 */
static inline void htb_add_class_to_row(struct htb_sched *q,
                                        struct htb_class *cl, int mask)
{
    q->row_mask[cl->level] |= mask;
    while (mask) {
        int prio = ffz(~mask);
        mask &= ~(1 << prio);
        htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
    }
}

/* If this triggers, it is a bug in this code, but it need not be fatal */
static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
{
    if (RB_EMPTY_NODE(rb)) {
/**
 * htb_remove_class_from_row - removes class from its row
 *
 * The class is removed from the row at the priorities marked in mask.
 * It does nothing if mask == 0.
 */
static inline void htb_remove_class_from_row(struct htb_sched *q,
                                             struct htb_class *cl, int mask)
{
    int m = 0;

    while (mask) {
        int prio = ffz(~mask);

        mask &= ~(1 << prio);
        if (q->ptr[cl->level][prio] == cl->node + prio)
            htb_next_rb_node(q->ptr[cl->level] + prio);

        htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
        if (!q->row[cl->level][prio].rb_node)
            m |= 1 << prio;
    }
    q->row_mask[cl->level] &= ~m;
}
/**
 * htb_activate_prios - creates active class's feed chain
 *
 * The class is connected to ancestors and/or appropriate rows
 * for the priorities it is participating in. cl->cmode must be the new
 * (activated) mode. It does nothing if cl->prio_activity == 0.
 */
static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
{
    struct htb_class *p = cl->parent;
    long m, mask = cl->prio_activity;

    while (cl->cmode == HTB_MAY_BORROW && p && mask) {

            if (p->un.inner.feed[prio].rb_node)
                /* parent already has its feed in use, so
                   reset the bit in mask as parent is already ok */
                mask &= ~(1 << prio);

            htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);

        p->prio_activity |= mask;
    }

    if (cl->cmode == HTB_CAN_SEND && mask)
        htb_add_class_to_row(q, cl, mask);
}
/**
 * htb_deactivate_prios - remove class from feed chain
 *
 * cl->cmode must represent the old mode (before deactivation). It does
 * nothing if cl->prio_activity == 0. The class is removed from all feed
 * chains and rows.
 */
static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
{
    struct htb_class *p = cl->parent;
    long m, mask = cl->prio_activity;

    while (cl->cmode == HTB_MAY_BORROW && p && mask) {

            if (p->un.inner.ptr[prio] == cl->node + prio) {
                /* we are removing a child which is pointed to from the
                   parent feed - forget the pointer but remember
                   its classid */
                p->un.inner.last_ptr_id[prio] = cl->common.classid;
                p->un.inner.ptr[prio] = NULL;
            }

            htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);

            if (!p->un.inner.feed[prio].rb_node)

        p->prio_activity &= ~mask;
    }

    if (cl->cmode == HTB_CAN_SEND && mask)
        htb_remove_class_from_row(q, cl, mask);
}
static inline long htb_lowater(const struct htb_class *cl)
{
    if (htb_hysteresis)
        return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
    else
        return 0;
}

static inline long htb_hiwater(const struct htb_class *cl)
{
    if (htb_hysteresis)
        return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
    else
        return 0;
}
/**
 * htb_class_mode - computes and returns current class mode
 *
 * It computes cl's mode at time cl->t_c+diff and returns it. If mode
 * is not HTB_CAN_SEND then cl->pq_key is updated to the time difference
 * from now to the time when cl will change its state.
 * It is also worth noting that class mode doesn't change simply
 * at cl->{c,}tokens == 0 but there can rather be a hysteresis of
 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
 * mode transitions per time unit. The speed gain is about 1/6.
 */
static inline enum htb_cmode
htb_class_mode(struct htb_class *cl, long *diff)
{
    long toks;

    if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
        *diff = -toks;
        return HTB_CANT_SEND;
    }

    if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
        return HTB_CAN_SEND;

    *diff = -toks;
    return HTB_MAY_BORROW;
}
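/* A rough numeric reading of the decision above: with htb_hysteresis off,
 * lowater and hiwater are both 0, so a class whose ctokens + *diff went
 * negative is HTB_CANT_SEND, one whose tokens + *diff is still >= 0 is
 * HTB_CAN_SEND, and anything in between (own rate exhausted, ceil not yet)
 * may only borrow.  When the class cannot send at full mode, *diff is
 * rewritten to the time until the deciding bucket refills, which becomes
 * cl->pq_key via htb_add_to_wait_tree().
 */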
/**
 * htb_change_class_mode - changes class's mode
 *
 * This should be the only way to change a class's mode under normal
 * circumstances. The routine will update feed list linkage, change the mode
 * and add the class to the wait event queue if appropriate. The new mode
 * should be different from the old one and cl->pq_key has to be valid if
 * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
 */
static void
htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
{
    enum htb_cmode new_mode = htb_class_mode(cl, diff);

    if (new_mode == cl->cmode)
        return;

    if (cl->prio_activity) {	/* not necessary: speed optimization */
        if (cl->cmode != HTB_CANT_SEND)
            htb_deactivate_prios(q, cl);
        cl->cmode = new_mode;
        if (new_mode != HTB_CANT_SEND)
            htb_activate_prios(q, cl);
    } else
        cl->cmode = new_mode;
}
/**
 * htb_activate - inserts leaf cl into appropriate active feeds
 *
 * The routine learns the (new) priority of the leaf and activates the feed
 * chain for that prio. It can safely be called on an already active leaf.
 * It also adds the leaf into the droplist.
 */
static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
{
    WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);

    if (!cl->prio_activity) {
        cl->prio_activity = 1 << cl->un.leaf.prio;
        htb_activate_prios(q, cl);
        list_add_tail(&cl->un.leaf.drop_list,
                      q->drops + cl->un.leaf.prio);
    }
}

/**
 * htb_deactivate - remove leaf cl from active feeds
 *
 * Make sure that the leaf is active. In other words, it can't be called
 * with a non-active leaf. It also removes the class from the drop list.
 */
static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
{
    WARN_ON(!cl->prio_activity);

    htb_deactivate_prios(q, cl);
    cl->prio_activity = 0;
    list_del_init(&cl->un.leaf.drop_list);
}
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
    int uninitialized_var(ret);
    struct htb_sched *q = qdisc_priv(sch);
    struct htb_class *cl = htb_classify(skb, sch, &ret);

    if (cl == HTB_DIRECT) {
        /* enqueue to helper queue */
        if (q->direct_queue.qlen < q->direct_qlen) {
            __skb_queue_tail(&q->direct_queue, skb);

            return NET_XMIT_DROP;

#ifdef CONFIG_NET_CLS_ACT

        if (ret & __NET_XMIT_BYPASS)

    } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
        if (net_xmit_drop_count(ret)) {

        cl->bstats.packets +=
            skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
        cl->bstats.bytes += qdisc_pkt_len(skb);

    sch->bstats.packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
    sch->bstats.bytes += qdisc_pkt_len(skb);
    return NET_XMIT_SUCCESS;
}
/**
 * htb_charge_class - charges amount "bytes" to leaf and ancestors
 *
 * The routine assumes that a packet "bytes" long was dequeued from leaf cl
 * borrowing from "level". It accounts bytes to the ceil leaky bucket for
 * the leaf and all ancestors and to the rate bucket for ancestors at levels
 * "level" and higher. It also handles a possible change of mode resulting
 * from the update. Note that the mode can also increase here (MAY_BORROW to
 * CAN_SEND) because we can use a more precise clock than the event queue
 * here. In such a case we remove the class from the event queue first.
 */
static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
                             int level, struct sk_buff *skb)
{
    int bytes = qdisc_pkt_len(skb);

    enum htb_cmode old_mode;

#define HTB_ACCNT(T, B, R) toks = diff + cl->T; \
    if (toks > cl->B) toks = cl->B; \
    toks -= L2T(cl, cl->R, bytes); \
    if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; \
    cl->T = toks

        diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
        if (cl->level >= level) {
            if (cl->level == level)

            HTB_ACCNT(tokens, buffer, rate);

            cl->xstats.borrows++;
            cl->tokens += diff;	/* we moved t_c; update tokens */

        HTB_ACCNT(ctokens, cbuffer, ceil);

        old_mode = cl->cmode;

        htb_change_class_mode(q, cl, &diff);
        if (old_mode != cl->cmode) {
            if (old_mode != HTB_CAN_SEND)
                htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
            if (cl->cmode != HTB_CAN_SEND)
                htb_add_to_wait_tree(q, cl, diff);
        }

        /* update byte stats except for leaves which are already updated */

            cl->bstats.bytes += bytes;
            cl->bstats.packets += skb_is_gso(skb) ?
                skb_shinfo(skb)->gso_segs : 1;
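/* To make the HTB_ACCNT() arithmetic above concrete with made-up numbers:
 * suppose cl->buffer corresponds to 10 ms worth of tokens, the class was
 * last updated 4 ms ago (diff) and the dequeued packet needs 2 ms at the
 * class rate.  Then toks = old tokens + 4 ms, clamped to at most 10 ms,
 * minus 2 ms; the result is clamped from below at roughly -mbuffer, stored
 * back into the bucket, and cl->t_c is advanced to q->now.  The same is
 * done for ctokens against the ceil rate table.
 */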
/**
 * htb_do_events - make mode changes to classes at the level
 *
 * Scans the event queue for pending events and applies them. Returns the
 * time of the next pending event (0 if there is no event in the pq).
 * Note: only events with cl->pq_key <= q->now are applied.
 */
static psched_time_t htb_do_events(struct htb_sched *q, int level)
{
    /* don't run for longer than 2 jiffies; 2 is used instead of
       1 to simplify things when the jiffy is going to be incremented
    unsigned long stop_at = jiffies + 2;
    while (time_before(jiffies, stop_at)) {
        struct htb_class *cl;

        struct rb_node *p = rb_first(&q->wait_pq[level]);

        cl = rb_entry(p, struct htb_class, pq_node);
        if (cl->pq_key > q->now)

        htb_safe_rb_erase(p, q->wait_pq + level);
        diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
        htb_change_class_mode(q, cl, &diff);
        if (cl->cmode != HTB_CAN_SEND)
            htb_add_to_wait_tree(q, cl, diff);
    }
    /* too much load - let's continue on the next jiffy */
    return q->now + PSCHED_TICKS_PER_SEC / HZ;
}
/* Returns class->node+prio from the id-tree where the class's id is >= id.
   NULL if no such one exists. */
static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,

    struct rb_node *r = NULL;

        struct htb_class *cl =
            rb_entry(n, struct htb_class, node[prio]);
        if (id == cl->common.classid)

        if (id > cl->common.classid) {

/**
 * htb_lookup_leaf - returns next leaf class in DRR order
 *
 * Find the leaf that the current feed pointer points to.
 */
static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
                                         struct rb_node **pptr, u32 *pid)
{
    struct {
        struct rb_node *root;
        struct rb_node **pptr;
        u32 *pid;
    } stk[TC_HTB_MAXDEPTH], *sp = stk;

    WARN_ON(!tree->rb_node);
    sp->root = tree->rb_node;

    for (i = 0; i < 65535; i++) {
        if (!*sp->pptr && *sp->pid) {
            /* ptr was invalidated but id is valid - try to recover
               the original or next ptr */
                htb_id_find_next_upper(prio, sp->root, *sp->pid);

        *sp->pid = 0;	/* ptr is valid now, so remove this hint as it
                           can become out of date quickly */
        if (!*sp->pptr) {	/* we are at the right end; rewind & go up */
            *sp->pptr = sp->root;
            while ((*sp->pptr)->rb_left)
                *sp->pptr = (*sp->pptr)->rb_left;

            htb_next_rb_node(sp->pptr);

            struct htb_class *cl;
            cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);

            (++sp)->root = cl->un.inner.feed[prio].rb_node;
            sp->pptr = cl->un.inner.ptr + prio;
            sp->pid = cl->un.inner.last_ptr_id + prio;
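/* The pptr/pid pair above implements a resumable round-robin walk: *pptr
 * remembers the rb-tree node served last, and when that node disappears
 * (the class went inactive) *pid keeps its classid so the walk can resume
 * at the next higher classid via htb_id_find_next_upper() instead of
 * restarting at the leftmost child.
 */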
/* dequeues a packet at the given priority and level; call only if
   you are sure that there is an active class at prio/level */
static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,

    struct sk_buff *skb = NULL;
    struct htb_class *cl, *start;
    /* look the initial class up in the row */
    start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
                                 q->ptr[level] + prio,
                                 q->last_ptr_id[level] + prio);

        /* class can be empty - it is unlikely but can be true if the leaf
           qdisc drops packets in the enqueue routine or if someone used
           the graft operation on the leaf since the last dequeue;
           simply deactivate and skip such a class */
        if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
            struct htb_class *next;
            htb_deactivate(q, cl);

            /* row/level might become empty */
            if ((q->row_mask[level] & (1 << prio)) == 0)

            next = htb_lookup_leaf(q->row[level] + prio,
                                   prio, q->ptr[level] + prio,
                                   q->last_ptr_id[level] + prio);

            if (cl == start)	/* fix start if we just deleted it */

        skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
        if (likely(skb != NULL))

               "htb: class %X isn't work conserving ?!\n",

            htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->

        cl = htb_lookup_leaf(q->row[level] + prio, prio,
                             q->ptr[level] + prio,
                             q->last_ptr_id[level] + prio);

    } while (cl != start);

    if (likely(skb != NULL)) {
        cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
        if (cl->un.leaf.deficit[level] < 0) {
            cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
            htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->

        /* this used to be after charge_class but this constellation
           gives us slightly better performance */
        if (!cl->un.leaf.q->q.qlen)
            htb_deactivate(q, cl);
        htb_charge_class(q, cl, level, skb);
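/* The quantum/deficit bookkeeping above is classic deficit round robin: a
 * leaf keeps being served while its per-level deficit stays non-negative;
 * once a packet drives it below zero, one quantum is added back and the
 * feed pointer advances to the next class at the same prio, so over time
 * each active leaf gets bandwidth roughly proportional to its quantum.
 */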
static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
    struct sk_buff *skb = NULL;
    struct htb_sched *q = qdisc_priv(sch);

    psched_time_t next_event;

    /* try to dequeue direct packets as high prio (!) to minimize cpu work */
    skb = __skb_dequeue(&q->direct_queue);

        sch->flags &= ~TCQ_F_THROTTLED;

    q->now = psched_get_time();

    next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;

    for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
        /* common case optimization - skip event handler quickly */

        if (q->now >= q->near_ev_cache[level]) {
            event = htb_do_events(q, level);

                event = q->now + PSCHED_TICKS_PER_SEC;
            q->near_ev_cache[level] = event;

            event = q->near_ev_cache[level];

        if (event && next_event > event)

        m = ~q->row_mask[level];
        while (m != (int)(-1)) {

            skb = htb_dequeue_tree(q, prio, level);
            if (likely(skb != NULL)) {

                sch->flags &= ~TCQ_F_THROTTLED;

    sch->qstats.overlimits++;
    qdisc_watchdog_schedule(&q->watchdog, next_event);
/* try to drop from each class (by prio) until one succeeds */
static unsigned int htb_drop(struct Qdisc *sch)
{
    struct htb_sched *q = qdisc_priv(sch);

    for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {

        list_for_each(p, q->drops + prio) {
            struct htb_class *cl = list_entry(p, struct htb_class,

            if (cl->un.leaf.q->ops->drop &&
                (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {

                if (!cl->un.leaf.q->q.qlen)
                    htb_deactivate(q, cl);
/* reset all classes */
/* always called under BH & queue lock */
static void htb_reset(struct Qdisc *sch)
{
    struct htb_sched *q = qdisc_priv(sch);
    struct htb_class *cl;
    struct hlist_node *n;

    for (i = 0; i < q->clhash.hashsize; i++) {
        hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {

                memset(&cl->un.inner, 0, sizeof(cl->un.inner));

                    qdisc_reset(cl->un.leaf.q);
                INIT_LIST_HEAD(&cl->un.leaf.drop_list);

            cl->prio_activity = 0;
            cl->cmode = HTB_CAN_SEND;

    qdisc_watchdog_cancel(&q->watchdog);
    __skb_queue_purge(&q->direct_queue);

    memset(q->row, 0, sizeof(q->row));
    memset(q->row_mask, 0, sizeof(q->row_mask));
    memset(q->wait_pq, 0, sizeof(q->wait_pq));
    memset(q->ptr, 0, sizeof(q->ptr));
    for (i = 0; i < TC_HTB_NUMPRIO; i++)
        INIT_LIST_HEAD(q->drops + i);
static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
    [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) },
    [TCA_HTB_INIT]  = { .len = sizeof(struct tc_htb_glob) },
    [TCA_HTB_CTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
    [TCA_HTB_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};
static int htb_init(struct Qdisc *sch, struct nlattr *opt)
{
    struct htb_sched *q = qdisc_priv(sch);
    struct nlattr *tb[TCA_HTB_INIT + 1];
    struct tc_htb_glob *gopt;

    err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);

    if (tb[TCA_HTB_INIT] == NULL) {
        printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");

    gopt = nla_data(tb[TCA_HTB_INIT]);
    if (gopt->version != HTB_VER >> 16) {
               "HTB: need tc/htb version %d (minor is %d), you have %d\n",
               HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);

    err = qdisc_class_hash_init(&q->clhash);

    for (i = 0; i < TC_HTB_NUMPRIO; i++)
        INIT_LIST_HEAD(q->drops + i);

    qdisc_watchdog_init(&q->watchdog, sch);
    skb_queue_head_init(&q->direct_queue);

    q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
    if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
        q->direct_qlen = 2;

    if ((q->rate2quantum = gopt->rate2quantum) < 1)
        q->rate2quantum = 1;
    q->defcls = gopt->defcls;
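/* The two knobs taken from the TCA_HTB_INIT blob correspond directly to
 * the tc command line, e.g. "tc qdisc add dev eth0 root handle 1: htb
 * default 20 r2q 10" (device name is illustrative): defcls is the minor
 * class id htb_classify() falls back to, and rate2quantum is the divisor
 * used in htb_change_class() when a class is created without an explicit
 * quantum.
 */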
static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
{
    spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
    struct htb_sched *q = qdisc_priv(sch);
    struct nlattr *nest;
    struct tc_htb_glob gopt;

    spin_lock_bh(root_lock);

    gopt.direct_pkts = q->direct_pkts;
    gopt.version = HTB_VER;
    gopt.rate2quantum = q->rate2quantum;
    gopt.defcls = q->defcls;

    nest = nla_nest_start(skb, TCA_OPTIONS);
        goto nla_put_failure;
    NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
    nla_nest_end(skb, nest);

    spin_unlock_bh(root_lock);

    spin_unlock_bh(root_lock);
    nla_nest_cancel(skb, nest);
static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
    struct htb_class *cl = (struct htb_class *)arg;
    spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
    struct nlattr *nest;
    struct tc_htb_opt opt;

    spin_lock_bh(root_lock);
    tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
    tcm->tcm_handle = cl->common.classid;
    if (!cl->level && cl->un.leaf.q)
        tcm->tcm_info = cl->un.leaf.q->handle;

    nest = nla_nest_start(skb, TCA_OPTIONS);
        goto nla_put_failure;

    memset(&opt, 0, sizeof(opt));

    opt.rate = cl->rate->rate;
    opt.buffer = cl->buffer;
    opt.ceil = cl->ceil->rate;
    opt.cbuffer = cl->cbuffer;
    opt.quantum = cl->un.leaf.quantum;
    opt.prio = cl->un.leaf.prio;
    opt.level = cl->level;
    NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);

    nla_nest_end(skb, nest);
    spin_unlock_bh(root_lock);

    spin_unlock_bh(root_lock);
    nla_nest_cancel(skb, nest);
htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
{
    struct htb_class *cl = (struct htb_class *)arg;

    if (!cl->level && cl->un.leaf.q)
        cl->qstats.qlen = cl->un.leaf.q->q.qlen;
    cl->xstats.tokens = cl->tokens;
    cl->xstats.ctokens = cl->ctokens;

    if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
        gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
        gnet_stats_copy_queue(d, &cl->qstats) < 0)

    return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,

    struct htb_class *cl = (struct htb_class *)arg;

    if (cl && !cl->level) {
            (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
                                     cl->common.classid))

        *old = cl->un.leaf.q;
        cl->un.leaf.q = new;

        qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);

        sch_tree_unlock(sch);

static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
{
    struct htb_class *cl = (struct htb_class *)arg;
    return (cl && !cl->level) ? cl->un.leaf.q : NULL;
}
static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
    struct htb_class *cl = (struct htb_class *)arg;

    if (cl->un.leaf.q->q.qlen == 0)
        htb_deactivate(qdisc_priv(sch), cl);
}

static unsigned long htb_get(struct Qdisc *sch, u32 classid)
{
    struct htb_class *cl = htb_find(classid, sch);

    return (unsigned long)cl;
}

static inline int htb_parent_last_child(struct htb_class *cl)
{
        /* the root class */

    if (cl->parent->children > 1)
        /* not the last child */
static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
                               struct Qdisc *new_q)
{
    struct htb_class *parent = cl->parent;

    WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);

    if (parent->cmode != HTB_CAN_SEND)
        htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);

    memset(&parent->un.inner, 0, sizeof(parent->un.inner));
    INIT_LIST_HEAD(&parent->un.leaf.drop_list);
    parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
    parent->un.leaf.quantum = parent->quantum;
    parent->un.leaf.prio = parent->prio;
    parent->tokens = parent->buffer;
    parent->ctokens = parent->cbuffer;
    parent->t_c = psched_get_time();
    parent->cmode = HTB_CAN_SEND;
static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
{
        WARN_ON(!cl->un.leaf.q);
        qdisc_destroy(cl->un.leaf.q);

    gen_kill_estimator(&cl->bstats, &cl->rate_est);
    qdisc_put_rtab(cl->rate);
    qdisc_put_rtab(cl->ceil);

    tcf_destroy_chain(&cl->filter_list);
/* always called under BH & queue lock */
static void htb_destroy(struct Qdisc *sch)
{
    struct htb_sched *q = qdisc_priv(sch);
    struct hlist_node *n, *next;
    struct htb_class *cl;

    qdisc_watchdog_cancel(&q->watchdog);
    /* This line used to be after the htb_destroy_class call below
       and surprisingly it worked in 2.4. But it must precede it
       because filters need their target class alive to be able to call
       unbind_filter on it (without an Oops). */
    tcf_destroy_chain(&q->filter_list);

    for (i = 0; i < q->clhash.hashsize; i++) {
        hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
            tcf_destroy_chain(&cl->filter_list);
    }
    for (i = 0; i < q->clhash.hashsize; i++) {
        hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
            htb_destroy_class(sch, cl);
    }
    qdisc_class_hash_destroy(&q->clhash);
    __skb_queue_purge(&q->direct_queue);
static int htb_delete(struct Qdisc *sch, unsigned long arg)
{
    struct htb_sched *q = qdisc_priv(sch);
    struct htb_class *cl = (struct htb_class *)arg;

    struct Qdisc *new_q = NULL;

    // TODO: why don't we allow deleting a subtree? references? does the
    // tc subsystem guarantee us that in htb_destroy it holds no class
    // refs so that we can remove children safely there?
    if (cl->children || cl->filter_cnt)

    if (!cl->level && htb_parent_last_child(cl)) {
        new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
                                  cl->parent->common.classid);

        qlen = cl->un.leaf.q->q.qlen;
        qdisc_reset(cl->un.leaf.q);
        qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);

    /* delete from hash and active; remainder in destroy_class */
    qdisc_class_hash_remove(&q->clhash, &cl->common);

        cl->parent->children--;

    if (cl->prio_activity)
        htb_deactivate(q, cl);

    if (cl->cmode != HTB_CAN_SEND)
        htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);

        htb_parent_to_leaf(q, cl, new_q);

    if (--cl->refcnt == 0)
        htb_destroy_class(sch, cl);

    sch_tree_unlock(sch);
static void htb_put(struct Qdisc *sch, unsigned long arg)
{
    struct htb_class *cl = (struct htb_class *)arg;

    if (--cl->refcnt == 0)
        htb_destroy_class(sch, cl);
}
static int htb_change_class(struct Qdisc *sch, u32 classid,
                            u32 parentid, struct nlattr **tca,

    struct htb_sched *q = qdisc_priv(sch);
    struct htb_class *cl = (struct htb_class *)*arg, *parent;
    struct nlattr *opt = tca[TCA_OPTIONS];
    struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
    struct nlattr *tb[TCA_HTB_RTAB + 1];
    struct tc_htb_opt *hopt;

    /* extract all subattrs from the opt attr */

    err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);

    if (tb[TCA_HTB_PARMS] == NULL)

    parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);

    hopt = nla_data(tb[TCA_HTB_PARMS]);

    rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
    ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);

    if (!cl) {		/* new class */
        struct Qdisc *new_q;

            struct gnet_estimator opt;

                .nla_len = nla_attr_size(sizeof(est.opt)),
                .nla_type = TCA_RATE,

                /* 4s interval, 16s averaging constant */

        /* check for a valid classid */
        if (!classid || TC_H_MAJ(classid ^ sch->handle)
            || htb_find(classid, sch))

        /* check maximal depth */
        if (parent && parent->parent && parent->parent->level < 2) {
            printk(KERN_ERR "htb: tree is too deep\n");

        if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)

        err = gen_new_estimator(&cl->bstats, &cl->rate_est,
                                qdisc_root_sleeping_lock(sch),
                                tca[TCA_RATE] ? : &est.nla);

        INIT_LIST_HEAD(&cl->un.leaf.drop_list);
        RB_CLEAR_NODE(&cl->pq_node);

        for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
            RB_CLEAR_NODE(&cl->node[prio]);

        /* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
           so it can't be used inside of sch_tree_lock
           -- thanks to Karlis Peisenieks */
        new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
                                  &pfifo_qdisc_ops, classid);
        if (parent && !parent->level) {
            unsigned int qlen = parent->un.leaf.q->q.qlen;

            /* turn parent into an inner node */
            qdisc_reset(parent->un.leaf.q);
            qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
            qdisc_destroy(parent->un.leaf.q);
            if (parent->prio_activity)
                htb_deactivate(q, parent);

            /* remove from evt list because of level change */
            if (parent->cmode != HTB_CAN_SEND) {
                htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
                parent->cmode = HTB_CAN_SEND;
            }
            parent->level = (parent->parent ? parent->parent->level
                             : TC_HTB_MAXDEPTH) - 1;
            memset(&parent->un.inner, 0, sizeof(parent->un.inner));

        /* leaf (we) needs an elementary qdisc */
        cl->un.leaf.q = new_q ? new_q : &noop_qdisc;

        cl->common.classid = classid;
        cl->parent = parent;

        /* set class to be in HTB_CAN_SEND state */
        cl->tokens = hopt->buffer;
        cl->ctokens = hopt->cbuffer;
        cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
        cl->t_c = psched_get_time();
        cl->cmode = HTB_CAN_SEND;

        /* attach to the hash list and parent's family */
        qdisc_class_hash_insert(&q->clhash, &cl->common);

        if (tca[TCA_RATE]) {
            err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
                                        qdisc_root_sleeping_lock(sch),

    /* there used to be a nasty bug here; we have to check that the node
       is really a leaf before changing cl->un.leaf! */

        cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
        if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
                   "HTB: quantum of class %X is small. Consider r2q change.\n",
                   cl->common.classid);
            cl->un.leaf.quantum = 1000;
        }
        if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
                   "HTB: quantum of class %X is big. Consider r2q change.\n",
                   cl->common.classid);
            cl->un.leaf.quantum = 200000;
        }
        if (hopt->quantum)
            cl->un.leaf.quantum = hopt->quantum;
        if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
            cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;

        /* backup for htb_parent_to_leaf */
        cl->quantum = cl->un.leaf.quantum;
        cl->prio = cl->un.leaf.prio;
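        /* A worked example of the r2q heuristic above (the numbers are only
         * illustrative): with the commonly used r2q of 10, a class whose
         * rate is 100 kbyte/s gets quantum 10000 bytes; rates below roughly
         * 10 kbyte/s or above roughly 2 Mbyte/s fall outside the
         * 1000..200000 byte window and trigger the warnings, suggesting
         * either an explicit quantum or a different r2q value.
         */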
    cl->buffer = hopt->buffer;
    cl->cbuffer = hopt->cbuffer;

        qdisc_put_rtab(cl->rate);

        qdisc_put_rtab(cl->ceil);

    sch_tree_unlock(sch);

    qdisc_class_hash_grow(sch, &q->clhash);

    *arg = (unsigned long)cl;

    qdisc_put_rtab(rtab);

    qdisc_put_rtab(ctab);
static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
{
    struct htb_sched *q = qdisc_priv(sch);
    struct htb_class *cl = (struct htb_class *)arg;
    struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;

static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,

    struct htb_class *cl = htb_find(classid, sch);

    /*if (cl && !cl->level) return 0;
       The line above used to be there to prevent attaching filters to
       leaves. But at least the tc_index filter uses this just to get the
       class for other reasons, so we have to allow for it.

       19.6.2002 As Werner explained, it is ok - bind filter is just
       another way to "lock" the class - unlike "get" this lock can
       be broken by the class during destroy, IIUC.
     */

    return (unsigned long)cl;
static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
    struct htb_class *cl = (struct htb_class *)arg;

static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
    struct htb_sched *q = qdisc_priv(sch);
    struct htb_class *cl;
    struct hlist_node *n;

    for (i = 0; i < q->clhash.hashsize; i++) {
        hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
            if (arg->count < arg->skip) {

            if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
static const struct Qdisc_class_ops htb_class_ops = {
    .qlen_notify = htb_qlen_notify,
    .change      = htb_change_class,
    .delete      = htb_delete,
    .tcf_chain   = htb_find_tcf,
    .bind_tcf    = htb_bind_filter,
    .unbind_tcf  = htb_unbind_filter,
    .dump        = htb_dump_class,
    .dump_stats  = htb_dump_class_stats,

static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
    .cl_ops      = &htb_class_ops,
    .priv_size   = sizeof(struct htb_sched),
    .enqueue     = htb_enqueue,
    .dequeue     = htb_dequeue,
    .peek        = qdisc_peek_dequeued,
    .destroy     = htb_destroy,
    .change      = NULL /* htb_change */,
    .owner       = THIS_MODULE,

static int __init htb_module_init(void)
{
    return register_qdisc(&htb_qdisc_ops);
}

static void __exit htb_module_exit(void)
{
    unregister_qdisc(&htb_qdisc_ops);
}

module_init(htb_module_init)
module_exit(htb_module_exit)
MODULE_LICENSE("GPL");