net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
   38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/sched.h>
  53 #include <linux/mm.h>
  54 #include <linux/string.h>
  55 #include <linux/errno.h>
  56 #include <linux/config.h>
  57
  58 #include <linux/socket.h>
  59 #include <linux/sockios.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/etherdevice.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/stat.h>
  66 #include <linux/init.h>
  67
  68 #include <net/snmp.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <net/tcp.h>
  73 #include <net/udp.h>
  74 #include <linux/skbuff.h>
  75 #include <net/sock.h>
  76 #include <net/arp.h>
  77 #include <net/icmp.h>
  78 #include <net/raw.h>
  79 #include <net/checksum.h>
  80 #include <net/inetpeer.h>
  81 #include <net/checksum.h>
  82 #include <linux/igmp.h>
  83 #include <linux/netfilter_ipv4.h>
  84 #include <linux/netfilter_bridge.h>
  85 #include <linux/mroute.h>
  86 #include <linux/netlink.h>
  87
  88 /*
  89  *      Shall we try to damage output packets if routing dev changes?
  90  */
  91
  92 int sysctl_ip_dynaddr;
  93 int sysctl_ip_default_ttl = IPDEFTTL;
  94
  95 /* Generate a checksum for an outgoing IP datagram. */
  96 __inline__ void ip_send_check(struct iphdr *iph)
  97 {
  98         iph->check = 0;
  99         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 100 }
 101
 102 /* dev_loopback_xmit for use with netfilter. */
 103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 104 {
 105         newskb->mac.raw = newskb->data;
 106         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 107         newskb->pkt_type = PACKET_LOOPBACK;
 108         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 109         BUG_TRAP(newskb->dst);
 110
 111 #ifdef CONFIG_NETFILTER_DEBUG
 112         nf_debug_ip_loopback_xmit(newskb);
 113 #endif
 114         netif_rx(newskb);
 115         return 0;
 116 }
 117
 118 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 119 {
 120         int ttl = inet->uc_ttl;
 121
 122         if (ttl < 0)
 123                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 124         return ttl;
 125 }
 126
 127 /*
 128  *              Add an ip header to a skbuff and send it out.
 129  *
 130  */
 131 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 132                           u32 saddr, u32 daddr, struct ip_options *opt)
 133 {
 134         struct inet_sock *inet = inet_sk(sk);
 135         struct rtable *rt = (struct rtable *)skb->dst;
 136         struct iphdr *iph;
 137
 138         /* Build the IP header. */
 139         if (opt)
 140                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 141         else
 142                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 143
 144         iph->version  = 4;
 145         iph->ihl      = 5;
 146         iph->tos      = inet->tos;
 147         if (ip_dont_fragment(sk, &rt->u.dst))
 148                 iph->frag_off = htons(IP_DF);
 149         else
 150                 iph->frag_off = 0;
 151         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 152         iph->daddr    = rt->rt_dst;
 153         iph->saddr    = rt->rt_src;
 154         iph->protocol = sk->sk_protocol;
 155         iph->tot_len  = htons(skb->len);
 156         ip_select_ident(iph, &rt->u.dst, sk);
 157         skb->nh.iph   = iph;
 158
 159         if (opt && opt->optlen) {
 160                 iph->ihl += opt->optlen>>2;
 161                 ip_options_build(skb, opt, daddr, rt, 0);
 162         }
 163         ip_send_check(iph);
 164
 165         skb->priority = sk->sk_priority;
 166
 167         /* Send it out. */
 168         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 169                        dst_output);
 170 }
 171
 172 static inline int ip_finish_output2(struct sk_buff *skb)
 173 {
 174         struct dst_entry *dst = skb->dst;
 175         struct hh_cache *hh = dst->hh;
 176         struct net_device *dev = dst->dev;
 177         int hh_len = LL_RESERVED_SPACE(dev);
 178
 179         /* Be paranoid, rather than too clever. */
 180         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 181                 struct sk_buff *skb2;
 182
 183                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 184                 if (skb2 == NULL) {
 185                         kfree_skb(skb);
 186                         return -ENOMEM;
 187                 }
 188                 if (skb->sk)
 189                         skb_set_owner_w(skb2, skb->sk);
 190                 kfree_skb(skb);
 191                 skb = skb2;
 192         }
 193
 194 #ifdef CONFIG_NETFILTER_DEBUG
 195         nf_debug_ip_finish_output2(skb);
 196 #endif /*CONFIG_NETFILTER_DEBUG*/
 197
 198         nf_reset(skb);
 199
 200         if (hh) {
 201                 int hh_alen;
 202
 203                 read_lock_bh(&hh->hh_lock);
 204                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
 205                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
 206                 read_unlock_bh(&hh->hh_lock);
 207                 skb_push(skb, hh->hh_len);
 208                 return hh->hh_output(skb);
 209         } else if (dst->neighbour)
 210                 return dst->neighbour->output(skb);
 211
 212         if (net_ratelimit())
 213                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 214         kfree_skb(skb);
 215         return -EINVAL;
 216 }
 217
 218 int ip_finish_output(struct sk_buff *skb)
 219 {
 220         struct net_device *dev = skb->dst->dev;
 221
 222         skb->dev = dev;
 223         skb->protocol = htons(ETH_P_IP);
 224
 225         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 226                        ip_finish_output2);
 227 }
 228
 229 int ip_mc_output(struct sk_buff *skb)
 230 {
 231         struct sock *sk = skb->sk;
 232         struct rtable *rt = (struct rtable*)skb->dst;
 233         struct net_device *dev = rt->u.dst.dev;
 234
 235         /*
 236          *      If the indicated interface is up and running, send the packet.
 237          */
 238         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 239
 240         skb->dev = dev;
 241         skb->protocol = htons(ETH_P_IP);
 242
 243         /*
 244          *      Multicasts are looped back for other local users
 245          */
 246
 247         if (rt->rt_flags&RTCF_MULTICAST) {
 248                 if ((!sk || inet_sk(sk)->mc_loop)
 249 #ifdef CONFIG_IP_MROUTE
 250                 /* Small optimization: do not loopback not local frames,
 251                    which returned after forwarding; they will be  dropped
 252                    by ip_mr_input in any case.
 253                    Note, that local frames are looped back to be delivered
 254                    to local recipients.
 255
 256                    This check is duplicated in ip_mr_input at the moment.
 257                  */
 258                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 259 #endif
 260                 ) {
 261                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 262                         if (newskb)
 263                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 264                                         newskb->dev,
 265                                         ip_dev_loopback_xmit);
 266                 }
 267
 268                 /* Multicasts with ttl 0 must not go beyond the host */
 269
 270                 if (skb->nh.iph->ttl == 0) {
 271                         kfree_skb(skb);
 272                         return 0;
 273                 }
 274         }
 275
 276         if (rt->rt_flags&RTCF_BROADCAST) {
 277                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 278                 if (newskb)
 279                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 280                                 newskb->dev, ip_dev_loopback_xmit);
 281         }
 282
 283         if (skb->len > dst_mtu(&rt->u.dst))
 284                 return ip_fragment(skb, ip_finish_output);
 285         else
 286                 return ip_finish_output(skb);
 287 }
 288
 289 int ip_output(struct sk_buff *skb)
 290 {
 291         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 292
 293         if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
 294                 return ip_fragment(skb, ip_finish_output);
 295         else
 296                 return ip_finish_output(skb);
 297 }
 298
 299 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 300 {
 301         struct sock *sk = skb->sk;
 302         struct inet_sock *inet = inet_sk(sk);
 303         struct ip_options *opt = inet->opt;
 304         struct rtable *rt;
 305         struct iphdr *iph;
 306
 307         /* Skip all of this if the packet is already routed,
 308          * f.e. by something like SCTP.
 309          */
 310         rt = (struct rtable *) skb->dst;
 311         if (rt != NULL)
 312                 goto packet_routed;
 313
 314         /* Make sure we can route this packet. */
 315         rt = (struct rtable *)__sk_dst_check(sk, 0);
 316         if (rt == NULL) {
 317                 u32 daddr;
 318
 319                 /* Use correct destination address if we have options. */
 320                 daddr = inet->daddr;
 321                 if(opt && opt->srr)
 322                         daddr = opt->faddr;
 323
 324                 {
 325                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 326                                             .nl_u = { .ip4_u =
 327                                                       { .daddr = daddr,
 328                                                         .saddr = inet->saddr,
 329                                                         .tos = RT_CONN_FLAGS(sk) } },
 330                                             .proto = sk->sk_protocol,
 331                                             .uli_u = { .ports =
 332                                                        { .sport = inet->sport,
 333                                                          .dport = inet->dport } } };
 334
 335                         /* If this fails, retransmit mechanism of transport layer will
 336                          * keep trying until route appears or the connection times
 337                          * itself out.
 338                          */
 339                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 340                                 goto no_route;
 341                 }
 342                 __sk_dst_set(sk, &rt->u.dst);
 343                 tcp_v4_setup_caps(sk, &rt->u.dst);
 344         }
 345         skb->dst = dst_clone(&rt->u.dst);
 346
 347 packet_routed:
 348         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 349                 goto no_route;
 350
 351         /* OK, we know where to send it, allocate and build IP header. */
 352         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 353         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 354         iph->tot_len = htons(skb->len);
 355         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 356                 iph->frag_off = htons(IP_DF);
 357         else
 358                 iph->frag_off = 0;
 359         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 360         iph->protocol = sk->sk_protocol;
 361         iph->saddr    = rt->rt_src;
 362         iph->daddr    = rt->rt_dst;
 363         skb->nh.iph   = iph;
 364         /* Transport layer set skb->h.foo itself. */
 365
 366         if (opt && opt->optlen) {
 367                 iph->ihl += opt->optlen >> 2;
 368                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 369         }
 370
 371         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
 372
 373         /* Add an IP checksum. */
 374         ip_send_check(iph);
 375
 376         skb->priority = sk->sk_priority;
 377
 378         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 379                        dst_output);
 380
 381 no_route:
 382         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 383         kfree_skb(skb);
 384         return -EHOSTUNREACH;
 385 }
 386
 387
 388 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 389 {
 390         to->pkt_type = from->pkt_type;
 391         to->priority = from->priority;
 392         to->protocol = from->protocol;
 393         to->security = from->security;
 394         dst_release(to->dst);
 395         to->dst = dst_clone(from->dst);
 396         to->dev = from->dev;
 397
 398         /* Copy the flags to each fragment. */
 399         IPCB(to)->flags = IPCB(from)->flags;
 400
 401 #ifdef CONFIG_NET_SCHED
 402         to->tc_index = from->tc_index;
 403 #endif
 404 #ifdef CONFIG_NETFILTER
 405         to->nfmark = from->nfmark;
 406         to->nfcache = from->nfcache;
 407         /* Connection association is same as pre-frag packet */
 408         nf_conntrack_put(to->nfct);
 409         to->nfct = from->nfct;
 410         nf_conntrack_get(to->nfct);
 411         to->nfctinfo = from->nfctinfo;
 412 #ifdef CONFIG_BRIDGE_NETFILTER
 413         nf_bridge_put(to->nf_bridge);
 414         to->nf_bridge = from->nf_bridge;
 415         nf_bridge_get(to->nf_bridge);
 416 #endif
 417 #ifdef CONFIG_NETFILTER_DEBUG
 418         to->nf_debug = from->nf_debug;
 419 #endif
 420 #endif
 421 }
 422
 423 /*
 424  *      This IP datagram is too large to be sent in one piece.  Break it up into
 425  *      smaller pieces (each of size equal to IP header plus
 426  *      a block of the data of the original IP data part) that will yet fit in a
 427  *      single device frame, and queue such a frame for sending.
 428  */
 429
 430 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 431 {
 432         struct iphdr *iph;
 433         int raw = 0;
 434         int ptr;
 435         struct net_device *dev;
 436         struct sk_buff *skb2;
 437         unsigned int mtu, hlen, left, len, ll_rs;
 438         int offset;
 439         int not_last_frag;
 440         struct rtable *rt = (struct rtable*)skb->dst;
 441         int err = 0;
 442
 443         dev = rt->u.dst.dev;
 444
 445         /*
 446          *      Point into the IP datagram header.
 447          */
 448
 449         iph = skb->nh.iph;
 450
 451         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 452                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 453                           htonl(dst_mtu(&rt->u.dst)));
 454                 kfree_skb(skb);
 455                 return -EMSGSIZE;
 456         }
 457
 458         /*
 459          *      Setup starting values.
 460          */
 461
 462         hlen = iph->ihl * 4;
 463         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 464
 465         /* When frag_list is given, use it. First, check its validity:
 466          * some transformers could create wrong frag_list or break existing
 467          * one, it is not prohibited. In this case fall back to copying.
 468          *
 469          * LATER: this step can be merged to real generation of fragments,
 470          * we can switch to copy when see the first bad fragment.
 471          */
 472         if (skb_shinfo(skb)->frag_list) {
 473                 struct sk_buff *frag;
 474                 int first_len = skb_pagelen(skb);
 475
 476                 if (first_len - hlen > mtu ||
 477                     ((first_len - hlen) & 7) ||
 478                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 479                     skb_cloned(skb))
 480                         goto slow_path;
 481
 482                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 483                         /* Correct geometry. */
 484                         if (frag->len > mtu ||
 485                             ((frag->len & 7) && frag->next) ||
 486                             skb_headroom(frag) < hlen)
 487                             goto slow_path;
 488
 489                         /* Partially cloned skb? */
 490                         if (skb_shared(frag))
 491                                 goto slow_path;
 492                 }
 493
 494                 /* Everything is OK. Generate! */
 495
 496                 err = 0;
 497                 offset = 0;
 498                 frag = skb_shinfo(skb)->frag_list;
 499                 skb_shinfo(skb)->frag_list = NULL;
 500                 skb->data_len = first_len - skb_headlen(skb);
 501                 skb->len = first_len;
 502                 iph->tot_len = htons(first_len);
 503                 iph->frag_off = htons(IP_MF);
 504                 ip_send_check(iph);
 505
 506                 for (;;) {
 507                         /* Prepare header of the next frame,
 508                          * before previous one went down. */
 509                         if (frag) {
 510                                 frag->ip_summed = CHECKSUM_NONE;
 511                                 frag->h.raw = frag->data;
 512                                 frag->nh.raw = __skb_push(frag, hlen);
 513                                 memcpy(frag->nh.raw, iph, hlen);
 514                                 iph = frag->nh.iph;
 515                                 iph->tot_len = htons(frag->len);
 516                                 ip_copy_metadata(frag, skb);
 517                                 if (offset == 0)
 518                                         ip_options_fragment(frag);
 519                                 offset += skb->len - hlen;
 520                                 iph->frag_off = htons(offset>>3);
 521                                 if (frag->next != NULL)
 522                                         iph->frag_off |= htons(IP_MF);
 523                                 /* Ready, complete checksum */
 524                                 ip_send_check(iph);
 525                         }
 526
 527                         err = output(skb);
 528
 529                         if (err || !frag)
 530                                 break;
 531
 532                         skb = frag;
 533                         frag = skb->next;
 534                         skb->next = NULL;
 535                 }
 536
 537                 if (err == 0) {
 538                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 539                         return 0;
 540                 }
 541
 542                 while (frag) {
 543                         skb = frag->next;
 544                         kfree_skb(frag);
 545                         frag = skb;
 546                 }
 547                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 548                 return err;
 549         }
 550
 551 slow_path:
 552         left = skb->len - hlen;         /* Space per frame */
 553         ptr = raw + hlen;               /* Where to start from */
 554
 555 #ifdef CONFIG_BRIDGE_NETFILTER
 556         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 557          * we need to make room for the encapsulating header */
 558         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
 559         mtu -= nf_bridge_pad(skb);
 560 #else
 561         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
 562 #endif
 563         /*
 564          *      Fragment the datagram.
 565          */
 566
 567         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 568         not_last_frag = iph->frag_off & htons(IP_MF);
 569
 570         /*
 571          *      Keep copying data until we run out.
 572          */
 573
 574         while(left > 0) {
 575                 len = left;
 576                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 577                 if (len > mtu)
 578                         len = mtu;
 579                 /* IF: we are not sending upto and including the packet end
 580                    then align the next start on an eight byte boundary */
 581                 if (len < left) {
 582                         len &= ~7;
 583                 }
 584                 /*
 585                  *      Allocate buffer.
 586                  */
 587
 588                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 589                         NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 590                         err = -ENOMEM;
 591                         goto fail;
 592                 }
 593
 594                 /*
 595                  *      Set up data on packet
 596                  */
 597
 598                 ip_copy_metadata(skb2, skb);
 599                 skb_reserve(skb2, ll_rs);
 600                 skb_put(skb2, len + hlen);
 601                 skb2->nh.raw = skb2->data;
 602                 skb2->h.raw = skb2->data + hlen;
 603
 604                 /*
 605                  *      Charge the memory for the fragment to any owner
 606                  *      it might possess
 607                  */
 608
 609                 if (skb->sk)
 610                         skb_set_owner_w(skb2, skb->sk);
 611
 612                 /*
 613                  *      Copy the packet header into the new buffer.
 614                  */
 615
 616                 memcpy(skb2->nh.raw, skb->data, hlen);
 617
 618                 /*
 619                  *      Copy a block of the IP datagram.
 620                  */
 621                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 622                         BUG();
 623                 left -= len;
 624
 625                 /*
 626                  *      Fill in the new header fields.
 627                  */
 628                 iph = skb2->nh.iph;
 629                 iph->frag_off = htons((offset >> 3));
 630
 631                 /* ANK: dirty, but effective trick. Upgrade options only if
 632                  * the segment to be fragmented was THE FIRST (otherwise,
 633                  * options are already fixed) and make it ONCE
 634                  * on the initial skb, so that all the following fragments
 635                  * will inherit fixed options.
 636                  */
 637                 if (offset == 0)
 638                         ip_options_fragment(skb);
 639
 640                 /*
 641                  *      Added AC : If we are fragmenting a fragment that's not the
 642                  *                 last fragment then keep MF on each bit
 643                  */
 644                 if (left > 0 || not_last_frag)
 645                         iph->frag_off |= htons(IP_MF);
 646                 ptr += len;
 647                 offset += len;
 648
 649                 /*
 650                  *      Put this fragment into the sending queue.
 651                  */
 652
 653                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 654
 655                 iph->tot_len = htons(len + hlen);
 656
 657                 ip_send_check(iph);
 658
 659                 err = output(skb2);
 660                 if (err)
 661                         goto fail;
 662         }
 663         kfree_skb(skb);
 664         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 665         return err;
 666
 667 fail:
 668         kfree_skb(skb);
 669         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 670         return err;
 671 }
 672
 673 int
 674 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 675 {
 676         struct iovec *iov = from;
 677
 678         if (skb->ip_summed == CHECKSUM_HW) {
 679                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 680                         return -EFAULT;
 681         } else {
 682                 unsigned int csum = 0;
 683                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 684                         return -EFAULT;
 685                 skb->csum = csum_block_add(skb->csum, csum, odd);
 686         }
 687         return 0;
 688 }
 689
 690 static inline unsigned int
 691 csum_page(struct page *page, int offset, int copy)
 692 {
 693         char *kaddr;
 694         unsigned int csum;
 695         kaddr = kmap(page);
 696         csum = csum_partial(kaddr + offset, copy, 0);
 697         kunmap(page);
 698         return csum;
 699 }
 700
 701 /*
 702  *      ip_append_data() and ip_append_page() can make one large IP datagram
  703  *      from many pieces of data. Each piece will be held on the socket
 704  *      until ip_push_pending_frames() is called. Each piece can be a page
 705  *      or non-page data.
 706  *
 707  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 708  *      this interface potentially.
 709  *
 710  *      LATER: length must be adjusted by pad at tail, when it is required.
 711  */
 712 int ip_append_data(struct sock *sk,
 713                    int getfrag(void *from, char *to, int offset, int len,
 714                                int odd, struct sk_buff *skb),
 715                    void *from, int length, int transhdrlen,
 716                    struct ipcm_cookie *ipc, struct rtable *rt,
 717                    unsigned int flags)
 718 {
 719         struct inet_sock *inet = inet_sk(sk);
 720         struct sk_buff *skb;
 721
 722         struct ip_options *opt = NULL;
 723         int hh_len;
 724         int exthdrlen;
 725         int mtu;
 726         int copy;
 727         int err;
 728         int offset = 0;
 729         unsigned int maxfraglen, fragheaderlen;
 730         int csummode = CHECKSUM_NONE;
 731
 732         if (flags&MSG_PROBE)
 733                 return 0;
 734
 735         if (skb_queue_empty(&sk->sk_write_queue)) {
 736                 /*
 737                  * setup for corking.
 738                  */
 739                 opt = ipc->opt;
 740                 if (opt) {
 741                         if (inet->cork.opt == NULL) {
 742                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 743                                 if (unlikely(inet->cork.opt == NULL))
 744                                         return -ENOBUFS;
 745                         }
 746                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 747                         inet->cork.flags |= IPCORK_OPT;
 748                         inet->cork.addr = ipc->addr;
 749                 }
 750                 dst_hold(&rt->u.dst);
 751                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 752                 inet->cork.rt = rt;
 753                 inet->cork.length = 0;
 754                 sk->sk_sndmsg_page = NULL;
 755                 sk->sk_sndmsg_off = 0;
 756                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 757                         length += exthdrlen;
 758                         transhdrlen += exthdrlen;
 759                 }
 760         } else {
 761                 rt = inet->cork.rt;
 762                 if (inet->cork.flags & IPCORK_OPT)
 763                         opt = inet->cork.opt;
 764
 765                 transhdrlen = 0;
 766                 exthdrlen = 0;
 767                 mtu = inet->cork.fragsize;
 768         }
 769         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 770
 771         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 772         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 773
 774         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 775                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 776                 return -EMSGSIZE;
 777         }
 778
 779         /*
 780          * transhdrlen > 0 means that this is the first fragment and we wish
 781          * it won't be fragmented in the future.
 782          */
 783         if (transhdrlen &&
 784             length + fragheaderlen <= mtu &&
 785             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
 786             !exthdrlen)
 787                 csummode = CHECKSUM_HW;
 788
 789         inet->cork.length += length;
 790
 791         /* So, what's going on in the loop below?
 792          *
 793          * We use calculated fragment length to generate chained skb,
 794          * each of segments is IP fragment ready for sending to network after
 795          * adding appropriate IP header.
 796          */
 797
 798         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 799                 goto alloc_new_skb;
 800
 801         while (length > 0) {
 802                 /* Check if the remaining data fits into current packet. */
 803                 copy = mtu - skb->len;
 804                 if (copy < length)
 805                         copy = maxfraglen - skb->len;
 806                 if (copy <= 0) {
 807                         char *data;
 808                         unsigned int datalen;
 809                         unsigned int fraglen;
 810                         unsigned int fraggap;
 811                         unsigned int alloclen;
 812                         struct sk_buff *skb_prev;
 813 alloc_new_skb:
 814                         skb_prev = skb;
 815                         if (skb_prev)
 816                                 fraggap = skb_prev->len - maxfraglen;
 817                         else
 818                                 fraggap = 0;
 819
 820                         /*
 821                          * If remaining data exceeds the mtu,
 822                          * we know we need more fragment(s).
 823                          */
 824                         datalen = length + fraggap;
 825                         if (datalen > mtu - fragheaderlen)
 826                                 datalen = maxfraglen - fragheaderlen;
 827                         fraglen = datalen + fragheaderlen;
 828
 829                         if ((flags & MSG_MORE) &&
 830                             !(rt->u.dst.dev->features&NETIF_F_SG))
 831                                 alloclen = mtu;
 832                         else
 833                                 alloclen = datalen + fragheaderlen;
 834
 835                         /* The last fragment gets additional space at tail.
 836                          * Note, with MSG_MORE we overallocate on fragments,
 837                          * because we have no idea what fragment will be
 838                          * the last.
 839                          */
 840                         if (datalen == length)
 841                                 alloclen += rt->u.dst.trailer_len;
 842
 843                         if (transhdrlen) {
 844                                 skb = sock_alloc_send_skb(sk,
 845                                                 alloclen + hh_len + 15,
 846                                                 (flags & MSG_DONTWAIT), &err);
 847                         } else {
 848                                 skb = NULL;
 849                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 850                                     2 * sk->sk_sndbuf)
 851                                         skb = sock_wmalloc(sk,
 852                                                            alloclen + hh_len + 15, 1,
 853                                                            sk->sk_allocation);
 854                                 if (unlikely(skb == NULL))
 855                                         err = -ENOBUFS;
 856                         }
 857                         if (skb == NULL)
 858                                 goto error;
 859
 860                         /*
 861                          *      Fill in the control structures
 862                          */
 863                         skb->ip_summed = csummode;
 864                         skb->csum = 0;
 865                         skb_reserve(skb, hh_len);
 866
 867                         /*
 868                          *      Find where to start putting bytes.
 869                          */
 870                         data = skb_put(skb, fraglen);
 871                         skb->nh.raw = data + exthdrlen;
 872                         data += fragheaderlen;
 873                         skb->h.raw = data + exthdrlen;
 874
 875                         if (fraggap) {
 876                                 skb->csum = skb_copy_and_csum_bits(
 877                                         skb_prev, maxfraglen,
 878                                         data + transhdrlen, fraggap, 0);
 879                                 skb_prev->csum = csum_sub(skb_prev->csum,
 880                                                           skb->csum);
 881                                 data += fraggap;
 882                                 skb_trim(skb_prev, maxfraglen);
 883                         }
 884
 885                         copy = datalen - transhdrlen - fraggap;
 886                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 887                                 err = -EFAULT;
 888                                 kfree_skb(skb);
 889                                 goto error;
 890                         }
 891
 892                         offset += copy;
 893                         length -= datalen - fraggap;
 894                         transhdrlen = 0;
 895                         exthdrlen = 0;
 896                         csummode = CHECKSUM_NONE;
 897
 898                         /*
 899                          * Put the packet on the pending queue.
 900                          */
 901                         __skb_queue_tail(&sk->sk_write_queue, skb);
 902                         continue;
 903                 }
 904
 905                 if (copy > length)
 906                         copy = length;
 907
 908                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 909                         unsigned int off;
 910
 911                         off = skb->len;
 912                         if (getfrag(from, skb_put(skb, copy),
 913                                         offset, copy, off, skb) < 0) {
 914                                 __skb_trim(skb, off);
 915                                 err = -EFAULT;
 916                                 goto error;
 917                         }
 918                 } else {
 919                         int i = skb_shinfo(skb)->nr_frags;
 920                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 921                         struct page *page = sk->sk_sndmsg_page;
 922                         int off = sk->sk_sndmsg_off;
 923                         unsigned int left;
 924
 925                         if (page && (left = PAGE_SIZE - off) > 0) {
 926                                 if (copy >= left)
 927                                         copy = left;
 928                                 if (page != frag->page) {
 929                                         if (i == MAX_SKB_FRAGS) {
 930                                                 err = -EMSGSIZE;
 931                                                 goto error;
 932                                         }
 933                                         get_page(page);
 934                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 935                                         frag = &skb_shinfo(skb)->frags[i];
 936                                 }
 937                         } else if (i < MAX_SKB_FRAGS) {
 938                                 if (copy > PAGE_SIZE)
 939                                         copy = PAGE_SIZE;
 940                                 page = alloc_pages(sk->sk_allocation, 0);
 941                                 if (page == NULL)  {
 942                                         err = -ENOMEM;
 943                                         goto error;
 944                                 }
 945                                 sk->sk_sndmsg_page = page;
 946                                 sk->sk_sndmsg_off = 0;
 947
 948                                 skb_fill_page_desc(skb, i, page, 0, 0);
 949                                 frag = &skb_shinfo(skb)->frags[i];
 950                                 skb->truesize += PAGE_SIZE;
 951                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
 952                         } else {
 953                                 err = -EMSGSIZE;
 954                                 goto error;
 955                         }
 956                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
 957                                 err = -EFAULT;
 958                                 goto error;
 959                         }
 960                         sk->sk_sndmsg_off += copy;
 961                         frag->size += copy;
 962                         skb->len += copy;
 963                         skb->data_len += copy;
 964                 }
 965                 offset += copy;
 966                 length -= copy;
 967         }
 968
 969         return 0;
 970
 971 error:
 972         inet->cork.length -= length;
 973         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 974         return err;
 975 }
 976
     /*
      *      Append @size bytes of @page, starting at @offset, to the datagram
      *      that is currently pending on the socket's write queue.  The page is
      *      attached to the tail skb as a page fragment (a reference is taken
      *      with get_page()); its contents are not copied.
      *
      *      The route, IP options and fragment size are taken from inet->cork,
      *      and the write queue must already hold at least one skb.
      *
      *      Returns 0 on success (not the number of bytes appended) or a
      *      negative errno:
      *        -EPERM       inet->hdrincl is set on the socket
      *        -EINVAL      nothing is queued on the socket
      *        -EOPNOTSUPP  the output device lacks NETIF_F_SG
      *        -EMSGSIZE    header plus queued data plus @size would exceed
      *                     0xFFFF bytes (ip_local_error() is called first), or
      *                     the tail skb has no free page-fragment slot left
      *        -ENOBUFS     a new fragment skb could not be allocated
      *      With MSG_PROBE in @flags (and hdrincl clear) nothing is queued and
      *      0 is returned.
      *
      *      When the loop fails, the bytes not yet consumed are subtracted from
      *      inet->cork.length again and IPSTATS_MIB_OUTDISCARDS is incremented;
      *      data attached before the failure stays on the queue.
      */
 977 ssize_t ip_append_page(struct sock *sk, struct page *page,
 978                        int offset, size_t size, int flags)
 979 {
 980         struct inet_sock *inet = inet_sk(sk);
 981         struct sk_buff *skb;
 982         struct rtable *rt;
 983         struct ip_options *opt = NULL;
 984         int hh_len;
 985         int mtu;
 986         int len;
 987         int err;
 988         unsigned int maxfraglen, fragheaderlen, fraggap;
 989
 990         if (inet->hdrincl)
 991                 return -EPERM;
 992
             /* MSG_PROBE: report success without queueing anything. */
 993         if (flags&MSG_PROBE)
 994                 return 0;
 995
             /* Pages can only be added to a datagram that was already started. */
 996         if (skb_queue_empty(&sk->sk_write_queue))
 997                 return -EINVAL;
 998
             /* Reuse the route and options saved in the cork state. */
 999         rt = inet->cork.rt;
1000         if (inet->cork.flags & IPCORK_OPT)
1001                 opt = inet->cork.opt;
1002
             /* Page fragments need scatter-gather support on the output device. */
1003         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1004                 return -EOPNOTSUPP;
1005
1006         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1007         mtu = inet->cork.fragsize;
1008
             /*
              * fragheaderlen: IP header plus options carried by every fragment.
              * maxfraglen: header plus the largest payload that fits in the mtu
              * once rounded down to a multiple of 8 bytes.
              */
1009         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1010         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1011
             /* Header plus all queued data must not exceed 0xFFFF bytes. */
1012         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1013                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1014                 return -EMSGSIZE;
1015         }
1016
             /* New data is appended starting at the tail skb. */
1017         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1018                 return -EINVAL;
1019
             /*
              * Account for the new data up front; the error path gives back
              * whatever was not consumed.
              */
1020         inet->cork.length += size;
1021
1022         while (size > 0) {
1023                 int i;
1024
1025                 /* Check if the remaining data fits into current packet. */
                     /*
                      * len is the room left in the tail skb: up to the mtu if
                      * the rest of the data fits, otherwise only up to
                      * maxfraglen.  len is an int and size a size_t, so the
                      * comparison is done unsigned: a negative len does not
                      * satisfy it and is caught by the len <= 0 test below.
                      */
1026                 len = mtu - skb->len;
1027                 if (len < size)
1028                         len = maxfraglen - skb->len;
1029                 if (len <= 0) {
1030                         struct sk_buff *skb_prev;
1031                         char *data;
1032                         struct iphdr *iph;
1033                         int alloclen;
1034
                             /*
                              * fraggap is the number of bytes the previous skb
                              * holds beyond maxfraglen; they are moved into
                              * the new skb below.  skb was checked against
                              * NULL before the loop and is only replaced by a
                              * successful allocation, so the else branch is
                              * not expected to run.
                              */
1035                         skb_prev = skb;
1036                         if (skb_prev)
1037                                 fraggap = skb_prev->len - maxfraglen;
1038                         else
1039                                 fraggap = 0;
1040
                             /*
                              * The linear area only has to hold the link-layer
                              * reserve, the IP header and the moved gap bytes;
                              * payload is attached as page fragments.  The
                              * extra 15 bytes are slack (presumably for
                              * alignment - TODO confirm).
                              */
1041                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1042                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1043                         if (unlikely(!skb)) {
1044                                 err = -ENOBUFS;
1045                                 goto error;
1046                         }
1047
1048                         /*
1049                          *      Fill in the control structures
1050                          */
1051                         skb->ip_summed = CHECKSUM_NONE;
1052                         skb->csum = 0;
1053                         skb_reserve(skb, hh_len);
1054
1055                         /*
1056                          *      Find where to start putting bytes.
1057                          */
                             /* iph is only assigned here; it is not read below. */
1058                         data = skb_put(skb, fragheaderlen + fraggap);
1059                         skb->nh.iph = iph = (struct iphdr *)data;
1060                         data += fragheaderlen;
1061                         skb->h.raw = data;
1062
                             /*
                              * Copy the gap bytes out of the previous skb,
                              * checksumming them on the way, take their
                              * checksum out of the previous skb and cut the
                              * previous skb back to maxfraglen.
                              */
1063                         if (fraggap) {
1064                                 skb->csum = skb_copy_and_csum_bits(
1065                                         skb_prev, maxfraglen,
1066                                         data, fraggap, 0);
1067                                 skb_prev->csum = csum_sub(skb_prev->csum,
1068                                                           skb->csum);
1069                                 skb_trim(skb_prev, maxfraglen);
1070                         }
1071
1072                         /*
1073                          * Put the packet on the pending queue.
1074                          */
1075                         __skb_queue_tail(&sk->sk_write_queue, skb);
1076                         continue;
1077                 }
1078
                     /*
                      * Attach at most len bytes of the page: grow the last
                      * fragment when skb_can_coalesce() allows it, otherwise
                      * take a page reference and fill the next free slot.
                      */
1079                 i = skb_shinfo(skb)->nr_frags;
1080                 if (len > size)
1081                         len = size;
1082                 if (skb_can_coalesce(skb, i, page, offset)) {
1083                         skb_shinfo(skb)->frags[i-1].size += len;
1084                 } else if (i < MAX_SKB_FRAGS) {
1085                         get_page(page);
1086                         skb_fill_page_desc(skb, i, page, offset, len);
1087                 } else {
1088                         err = -EMSGSIZE;
1089                         goto error;
1090                 }
1091
                     /*
                      * Checksum kept in software: add the page data at its
                      * offset (the current skb->len) within the skb.
                      */
1092                 if (skb->ip_summed == CHECKSUM_NONE) {
1093                         unsigned int csum;
1094                         csum = csum_page(page, offset, len);
1095                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1096                 }
1097
1098                 skb->len += len;
1099                 skb->data_len += len;
1100                 offset += len;
1101                 size -= len;
1102         }
1103         return 0;
1104
1105 error:
             /* size now holds only the bytes that were never queued. */
1106         inet->cork.length -= size;
1107         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1108         return err;
1109 }
1110
1111 /*
1112  *      Combine all pending IP fragments on the socket into one IP datagram
1113  *      and push them out.
      *
      *      The first skb on sk->sk_write_queue becomes the head of the
      *      datagram; every further queued skb has its header space pulled off
      *      and is chained onto the head's frag_list.  The IP header is then
      *      built in the head skb from the corked route and options, and the
      *      packet is passed through the NF_IP_LOCAL_OUT hook to dst_output.
      *
      *      The corked options and route are released before returning, whether
      *      or not anything was sent.
      *
      *      Returns 0 if the queue was empty or the packet was handed off.  A
      *      negative result from the hook is returned as is; a positive one is
      *      mapped through net_xmit_errno() when inet->recverr is set and
      *      turned into 0 otherwise.  Any non-zero final result also increments
      *      IPSTATS_MIB_OUTDISCARDS.
1114  */
1115 int ip_push_pending_frames(struct sock *sk)
1116 {
1117         struct sk_buff *skb, *tmp_skb;
1118         struct sk_buff **tail_skb;
1119         struct inet_sock *inet = inet_sk(sk);
1120         struct ip_options *opt = NULL;
1121         struct rtable *rt = inet->cork.rt;
1122         struct iphdr *iph;
1123         int df = 0;
1124         __u8 ttl;
1125         int err = 0;
1126
             /* Nothing queued: just release the cork state. */
1127         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1128                 goto out;
1129         tail_skb = &(skb_shinfo(skb)->frag_list);
1130
1131         /* move skb->data to ip header from ext header */
1132         if (skb->data < skb->nh.raw)
1133                 __skb_pull(skb, skb->nh.raw - skb->data);
             /*
              * Chain the remaining skbs onto the head.  Each one gives up the
              * header space (same length as the head's IP header), its length
              * and truesize are added to the head, and it is detached from the
              * socket (reference dropped, destructor and sk cleared) -
              * presumably so the memory is accounted once, through the head.
              */
1134         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1135                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1136                 *tail_skb = tmp_skb;
1137                 tail_skb = &(tmp_skb->next);
1138                 skb->len += tmp_skb->len;
1139                 skb->data_len += tmp_skb->len;
1140                 skb->truesize += tmp_skb->truesize;
1141                 __sock_put(tmp_skb->sk);
1142                 tmp_skb->destructor = NULL;
1143                 tmp_skb->sk = NULL;
1144         }
1145
1146         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1147          * to fragment the frame generated here. No matter how transforms
1148          * change the size of the packet, it will come out.
1149          */
1150         if (inet->pmtudisc != IP_PMTUDISC_DO)
1151                 skb->local_df = 1;
1152
1153         /* DF bit is set when we want to see DF on outgoing frames.
1154          * If local_df is set too, we still allow to fragment this frame
1155          * locally. */
1156         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1157             (skb->len <= dst_mtu(&rt->u.dst) &&
1158              ip_dont_fragment(sk, &rt->u.dst)))
1159                 df = htons(IP_DF);
1160
1161         if (inet->cork.flags & IPCORK_OPT)
1162                 opt = inet->cork.opt;
1163
             /* Multicast routes use the socket's multicast TTL. */
1164         if (rt->rt_type == RTN_MULTICAST)
1165                 ttl = inet->mc_ttl;
1166         else
1167                 ttl = ip_select_ttl(inet, &rt->u.dst);
1168
             /*
              * Build the IP header in place.  ihl counts 32-bit words: 5 for
              * the bare header plus optlen / 4 when options are present.
              */
1169         iph = (struct iphdr *)skb->data;
1170         iph->version = 4;
1171         iph->ihl = 5;
1172         if (opt) {
1173                 iph->ihl += opt->optlen>>2;
1174                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1175         }
1176         iph->tos = inet->tos;
1177         iph->tot_len = htons(skb->len);
1178         iph->frag_off = df;
             /*
              * Without DF the ID comes from __ip_select_ident(); with DF the
              * socket's own counter is used.
              */
1179         if (!df) {
1180                 __ip_select_ident(iph, &rt->u.dst, 0);
1181         } else {
1182                 iph->id = htons(inet->id++);
1183         }
1184         iph->ttl = ttl;
1185         iph->protocol = sk->sk_protocol;
1186         iph->saddr = rt->rt_src;
1187         iph->daddr = rt->rt_dst;
1188         ip_send_check(iph);
1189
1190         skb->priority = sk->sk_priority;
1191         skb->dst = dst_clone(&rt->u.dst);
1192
1193         /* Netfilter gets the whole, not yet fragmented skb. */
1194         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1195                       skb->dst->dev, dst_output);
1196         if (err) {
                     /* Positive codes are only reported when recverr is set. */
1197                 if (err > 0)
1198                         err = inet->recverr ? net_xmit_errno(err) : 0;
1199                 if (err)
1200                         goto error;
1201         }
1202
1203 out:
             /* Drop the corked options and route on every exit path. */
1204         inet->cork.flags &= ~IPCORK_OPT;
1205         if (inet->cork.opt) {
1206                 kfree(inet->cork.opt);
1207                 inet->cork.opt = NULL;
1208         }
1209         if (inet->cork.rt) {
1210                 ip_rt_put(inet->cork.rt);
1211                 inet->cork.rt = NULL;
1212         }
1213         return err;
1214
1215 error:
1216         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1217         goto out;
1218 }
1219
1220 /*
1221  *      Discard everything that is still pending on the socket: every skb
      *      on the write queue is freed (taken from the tail), and the corked
      *      IP options and route are released.
1222  */
1223 void ip_flush_pending_frames(struct sock *sk)
1224 {
1225         struct inet_sock *isk = inet_sk(sk);
1226         struct sk_buff *doomed;
1227
1228         for (doomed = __skb_dequeue_tail(&sk->sk_write_queue); doomed != NULL;
1229              doomed = __skb_dequeue_tail(&sk->sk_write_queue))
1230                 kfree_skb(doomed);
1231
1232         /* Forget the corked options and route. */
1233         isk->cork.flags &= ~IPCORK_OPT;
1234         if (isk->cork.opt != NULL)
1235                 kfree(isk->cork.opt);
1236         isk->cork.opt = NULL;
1237         if (isk->cork.rt != NULL)
1238                 ip_rt_put(isk->cork.rt);
1239         isk->cork.rt = NULL;
1240 }
1241
1242
1243 /*
1244  *      Fragment-fill callback handed to ip_append_data() by ip_send_reply().
      *      Copies @len bytes from the kernel buffer at @dptr + @offset into @to
      *      and adds their checksum to skb->csum, using @odd as the block
      *      offset.  Always returns 0.
1245  */
1246 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1247                               int len, int odd, struct sk_buff *skb)
1248 {
1249         void *src = dptr + offset;
1250         unsigned int partial = csum_partial_copy_nocheck(src, to, len, 0);
1251
1252         skb->csum = csum_block_add(skb->csum, partial, odd);
1253         return 0;
1254 }
1255
1256 /*
1257  *      Generic function to send a packet as reply to another packet.
1258  *      Used to send TCP resets so far. ICMP should use this function too.
1259  *
1260  *      Should run single threaded per socket because it uses the sock
1261  *      structure to pass arguments.
1262  *
1263  *      LATER: switch from ip_build_xmit to ip_append_*
      *      NOTE(review): the body already uses ip_append_data() and
      *      ip_push_pending_frames(), so the LATER item looks stale - confirm.
      *
      *      sk:   socket used as carrier; its tos, sk_priority and sk_protocol
      *            are overwritten from the packet being answered.
      *      skb:  the packet being replied to.  Its dst supplies the route
      *            whose rt_src becomes the reply destination (or the faddr
      *            of the echoed options when they contain a source route).
      *      arg:  reply payload (arg->iov->iov_base), plus the partial
      *            checksum and the checksum field offset to patch.
      *      len:  number of payload bytes to send.
      *
      *      Nothing is returned: the function gives up silently when the
      *      options cannot be echoed or no output route is found.
1264  */
1265 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1266                    unsigned int len)
1267 {
1268         struct inet_sock *inet = inet_sk(sk);
             /*
              * Options structure followed by 40 bytes of storage, presumably
              * for the option bytes themselves - TODO confirm against
              * struct ip_options.
              */
1269         struct {
1270                 struct ip_options       opt;
1271                 char                    data[40];
1272         } replyopts;
1273         struct ipcm_cookie ipc;
1274         u32 daddr;
1275         struct rtable *rt = (struct rtable*)skb->dst;
1276
1277         if (ip_options_echo(&replyopts.opt, skb))
1278                 return;
1279
             /* By default answer to the source address of the incoming route. */
1280         daddr = ipc.addr = rt->rt_src;
1281         ipc.opt = NULL;
1282
1283         if (replyopts.opt.optlen) {
1284                 ipc.opt = &replyopts.opt;
1285
                     /* With a source route, route towards its first hop. */
1286                 if (ipc.opt->srr)
1287                         daddr = replyopts.opt.faddr;
1288         }
1289
             /*
              * Look up the output route with the ports of the incoming
              * packet swapped.  On success rt presumably refers to the new
              * route, which is released by ip_rt_put() at the end.
              */
1290         {
1291                 struct flowi fl = { .nl_u = { .ip4_u =
1292                                               { .daddr = daddr,
1293                                                 .saddr = rt->rt_spec_dst,
1294                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1295                                     /* Not quite clean, but right. */
1296                                     .uli_u = { .ports =
1297                                                { .sport = skb->h.th->dest,
1298                                                  .dport = skb->h.th->source } },
1299                                     .proto = sk->sk_protocol };
1300                 if (ip_route_output_key(&rt, &fl))
1301                         return;
1302         }
1303
1304         /* And let IP do all the hard work.
1305
1306            This chunk is not reenterable, hence spinlock.
1307            Note that it uses the fact, that this function is called
1308            with locally disabled BH and that sk cannot be already spinlocked.
1309          */
1310         bh_lock_sock(sk);
1311         inet->tos = skb->nh.iph->tos;
1312         sk->sk_priority = skb->priority;
1313         sk->sk_protocol = skb->nh.iph->protocol;
             /*
              * The return value of ip_append_data() is not checked; the reply
              * is only pushed if something ended up on the write queue.
              */
1314         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1315                        &ipc, rt, MSG_DONTWAIT);
1316         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                     /*
                      * csumoffset is counted in 16-bit words from the
                      * transport header; a negative value means there is no
                      * checksum field to fill in.
                      */
1317                 if (arg->csumoffset >= 0)
1318                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1319                 skb->ip_summed = CHECKSUM_NONE;
1320                 ip_push_pending_frames(sk);
1321         }
1322
1323         bh_unlock_sock(sk);
1324
1325         ip_rt_put(rt);
1326 }
1327
1328 /*
1329  *      Packet type entry for IP: associates ETH_P_IP with ip_rcv().
      *      It is registered by ip_init() through dev_add_pack().
1330  */
1331
1332 static struct packet_type ip_packet_type = {
1333         .type = __constant_htons(ETH_P_IP),
1334         .func = ip_rcv,
1335 };
1336
1337 /*
1338  *      IP registers the packet type and then calls the subprotocol initialisers
      *
      *      Order: dev_add_pack(&ip_packet_type), ip_rt_init(),
      *      inet_initpeers(), and igmp_mc_proc_init() when both
      *      CONFIG_IP_MULTICAST and CONFIG_PROC_FS are defined.
1339  */
1340
1341 void __init ip_init(void)
1342 {
1343         dev_add_pack(&ip_packet_type);
1344
1345         ip_rt_init();
1346         inet_initpeers();
1347
1348 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1349         igmp_mc_proc_init();
1350 #endif
1351 }
1352
     /*
      *      Symbols of the IP output path exported to other kernel code.
      *      sysctl_ip_default_ttl is exported only when CONFIG_SYSCTL is defined.
      */
1353 EXPORT_SYMBOL(ip_finish_output);
1354 EXPORT_SYMBOL(ip_fragment);
1355 EXPORT_SYMBOL(ip_generic_getfrag);
1356 EXPORT_SYMBOL(ip_queue_xmit);
1357 EXPORT_SYMBOL(ip_send_check);
1358
1359 #ifdef CONFIG_SYSCTL
1360 EXPORT_SYMBOL(sysctl_ip_default_ttl);
1361 #endif