/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/netlink.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
/*	Simple Token Bucket Filter.
	=======================================
	A data flow obeys TBF with rate R and depth B if, for any
	time interval t_i...t_f, the number of transmitted bits
	does not exceed B + R*(t_f - t_i).
	Packetized version of this definition:
	the sequence of packets of sizes s_i served at moments t_i
	obeys TBF if, for any i <= k:

	s_i + ... + s_k <= B + R*(t_k - t_i)
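
	(Worked example, purely illustrative: with R = 1 Mbit/s and
	B = 10 kbit, any 10 ms window may carry at most
	B + R*0.01s = 10 kbit + 10 kbit = 20 kbit - the depth bounds the
	burst, the rate bounds the long-term average.)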

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
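
	As a minimal illustrative sketch of this rule (userspace C with a
	hypothetical refill() helper and floating point; the real code
	below works in scaled psched ticks and rate-table lookups instead):

	    double refill(double N, double delta, double B, double R)
	    {
	        N += delta;            // credit grows with elapsed time
	        if (N > B / R)
	            N = B / R;         // capped at depth/rate seconds
	        return N;
	    }
	    // a packet of S bits may go once S/R <= N; then N -= S/R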

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P > R and B > M. If P is infinite, this
	double TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency for a backlog of L bits
	is estimated as:

	lat = max((L-B)/R, (L-M)/P)
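
	(For example, with L = 80 kbit, B = 16 kbit, R = 1 Mbit/s,
	M = 12 kbit and P = 10 Mbit/s:
	lat = max(64 kbit / 1 Mbit/s, 68 kbit / 10 Mbit/s)
	    = max(64 ms, 6.8 ms) = 64 ms.)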

	If TBF throttles, it starts a watchdog timer, which will wake it
	up when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.

	This means that with depth B the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is
	~10Kbytes.

	Note that the peak-rate TBF is much tougher: with MTU 1500 the
	same formula gives P_crit = M*HZ = 150Kbytes/sec at HZ=100. So,
	if you need greater peak rates, use an alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/

struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;		/* Peak bucket depth: same time units as buffer */
	u32		max_size;	/* Largest packet we let through: bytes */
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long		tokens;		/* Current number of B tokens */
	long		ptokens;	/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};
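
/* The L2T ("length to time") helpers below map a packet length L to its
 * transmission time at the configured rate, in psched ticks: the 256-slot
 * rate table supplied by userspace is indexed by L >> cell_log, so each
 * slot covers one cell of 2^cell_log bytes.
 */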
#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
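
/* Note that a packet longer than max_size could never be sent at all:
 * even a completely full bucket would not hold enough tokens for it.
 * tbf_enqueue() therefore drops such packets up front instead of letting
 * them pin the head of the queue forever.
 */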
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (skb->len > q->max_size) {
		sch->qstats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
		if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
			kfree_skb(skb);

		return NET_XMIT_DROP;
	}

	if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
		sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return 0;
}

static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int tbf_drop(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;
		unsigned int len = skb->len;

		PSCHED_GET_TIME(now);

		/* Tokens earned since the last check-point, capped at
		   the bucket depth. */
		toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, len);

		/* Both buckets hold enough tokens: charge them and
		   release the packet. */
		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog,
					now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds tempting, but it
		   would be wrong in principle: we MUST NOT reorder
		   packets under these circumstances.

		   If the flow were split into independent subflows,
		   serving another subflow would be a very good
		   solution; that is the main idea behind all FQ
		   algorithms (cf. CSZ, HPFQ, HFSC).
		 */

		if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
			/* When requeue fails the skb is dropped */
			qdisc_tree_decrease_qlen(q->qdisc, 1);
			sch->qstats.drops++;
		}

		sch->flags |= TCQ_F_THROTTLED;
		sch->qstats.overlimits++;
	}
	return NULL;
}
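
/* A worked example of the token arithmetic in tbf_dequeue() above
 * (numbers in psched ticks, purely illustrative): say q->buffer = 1000,
 * q->tokens = 100 and 500 ticks have elapsed since q->t_c, so
 * toks = 500 + 100 = 600.  If L2T(q, len) = 400, toks ends at 200 >= 0
 * and the skb is released; had the packet cost 800 ticks instead, toks
 * would end at -200 and the watchdog would be scheduled 200 ticks into
 * the future.
 */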

static void tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	PSCHED_GET_TIME(q->t_c);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit)
{
	struct Qdisc *q;
	struct rtattr *rta;
	int ret;

	q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops,
			      TC_H_MAKE(sch->handle, 1));
	if (q) {
		rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)),
			      GFP_KERNEL);
		if (rta) {
			rta->rta_type = RTM_NEWQDISC;
			rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
			((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

			ret = q->ops->change(q, rta);
			kfree(rta);

			if (ret == 0)
				return q;
		}
		qdisc_destroy(q);
	}

	return NULL;
}

static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

	/* max_size is the largest packet that still fits the bucket:
	   the first rate-table slot that costs more time than the
	   bucket depth bounds it from above. */
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer)
			break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu)
				break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size)
			max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (qopt->limit > 0) {
		if ((child = tbf_create_dflt_qdisc(sch, qopt->limit)) == NULL)
			goto done;
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(xchg(&q->qdisc, child));
	}
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}
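
/* For reference, a typical iproute2 invocation that exercises the
 * tbf_change() path above (device name and values are only an example):
 *
 *	tc qdisc add dev eth0 root tbf rate 256kbit buffer 1600 limit 3000
 *
 * tc turns rate/buffer/limit (and optionally peakrate/mtu) into the
 * tc_tbf_qopt and rate tables parsed above.
 */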

static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	PSCHED_GET_TIME(q->t_c);
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb_tail_pointer(skb) - b;

	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;	/* the single class always exists */
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int tbf_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops tbf_class_ops =
{
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.change		=	tbf_change_class,
	.delete		=	tbf_delete,
	.walk		=	tbf_walk,
	.tcf_chain	=	tbf_find_tcf,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.requeue	=	tbf_requeue,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}

module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");