2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/if_ether.h>
35 #include <net/protocol.h>
38 #include <net/checksum.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
42 #include <net/net_namespace.h>
43 #include <net/netns/generic.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
55 1. The most important issue is detecting local dead loops.
56 They would cause complete host lockup in transmit, which
57 would be "resolved" by stack overflow or, if queueing is enabled,
58 with infinite looping in net_bh.
60 We cannot track such dead loops during route installation,
61 it is infeasible task. The most general solutions would be
62 to keep skb->encapsulation counter (sort of local ttl),
63 and silently drop packet when it expires. It is the best
64 solution, but it supposes maintaining a new variable in ALL
65 skb, even if no tunneling is used.
67 Current solution: t->recursion lock breaks dead loops. It looks
68 like dev->tbusy flag, but I preferred new variable, because
69 the semantics is different. One day, when hard_start_xmit
70 will be multithreaded we will have to use skb->encapsulation.
74 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case,
76 if we copy it from packet being encapsulated to upper header.
77 It is very good solution, but it introduces two problems:
79 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80 do not work over tunnels.
81 - traceroute does not work. I planned to relay ICMP from tunnel,
82 so that this problem would be solved and traceroute output
83 would be even more informative. This idea appeared to be wrong:
84 only Linux complies to rfc1812 now (yes, guys, Linux is the only
85 true router now :-)), all routers (at least, in neighbourhood of mine)
86 return only 8 bytes of payload. It is the end.
88 Hence, if we want that OSPF worked or traceroute said something reasonable,
89 we should search for another solution.
91 One of them is to parse packet trying to detect inner encapsulation
92 made by our node. It is difficult or even impossible, especially,
93 taking into account fragmentation. To be short, it is not a solution at all.
95 Current solution: The solution was UNEXPECTEDLY SIMPLE.
96 We force DF flag on tunnels with preconfigured hop limit,
97 that is ALL. :-) Well, it does not remove the problem completely,
98 but exponential growth of network traffic is changed to linear
99 (branches, that exceed pmtu are pruned) and tunnel mtu
100 quickly degrades to a value <68, where looping stops.
101 Yes, it is not good if there exists a router in the loop,
102 which does not force DF, even when encapsulating packets have DF set.
103 But it is not our problem! Nobody could accuse us, we made
104 all that we could make. Even if it is your gated who injected
105 fatal route to network, even if it were you who configured
106 fatal static route: you are innocent. :-)
110 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111 practically identical code. It would be good to glue them
112 together, but it is not very evident, how to make them modular.
113 sit is integral part of IPv6, ipip and gre are naturally modular.
114 We could extract common parts (hash table, ioctl etc)
115 to a separate module (ip_tunnel.c).
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
123 /* Fallback tunnel: no source, no destination, no key, no options */
125 static int ipgre_fb_tunnel_init(struct net_device *dev);
129 static int ipgre_net_id;
131 struct ip_tunnel *tunnels[4][HASH_SIZE];
133 struct net_device *fb_tunnel_dev;
136 /* Tunnel hash table */
146 We require exact key match i.e. if a key is present in packet
147 it will match only tunnel with the same key; if it is not present,
148 it will match only keyless tunnel.
150 All keyless packets, if not matching configured keyless tunnels,
151 will match fallback tunnel.
154 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
156 #define tunnels_r_l tunnels[3]
157 #define tunnels_r tunnels[2]
158 #define tunnels_l tunnels[1]
159 #define tunnels_wc tunnels[0]
161 static DEFINE_RWLOCK(ipgre_lock);
163 /* Given src, dst and key, find appropriate for input tunnel. */
165 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
166 __be32 remote, __be32 local, __be32 key)
168 unsigned h0 = HASH(remote);
169 unsigned h1 = HASH(key);
171 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
173 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
174 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
175 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
179 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
180 if (remote == t->parms.iph.daddr) {
181 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
185 for (t = ign->tunnels_l[h1]; t; t = t->next) {
186 if (local == t->parms.iph.saddr ||
187 (local == t->parms.iph.daddr &&
188 ipv4_is_multicast(local))) {
189 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
193 for (t = ign->tunnels_wc[h1]; t; t = t->next) {
194 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
198 if (ign->fb_tunnel_dev->flags&IFF_UP)
199 return netdev_priv(ign->fb_tunnel_dev);
203 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
204 struct ip_tunnel_parm *parms)
206 __be32 remote = parms->iph.daddr;
207 __be32 local = parms->iph.saddr;
208 __be32 key = parms->i_key;
209 unsigned h = HASH(key);
214 if (remote && !ipv4_is_multicast(remote)) {
219 return &ign->tunnels[prio][h];
222 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
225 return __ipgre_bucket(ign, &t->parms);
228 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
230 struct ip_tunnel **tp = ipgre_bucket(ign, t);
233 write_lock_bh(&ipgre_lock);
235 write_unlock_bh(&ipgre_lock);
238 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
240 struct ip_tunnel **tp;
242 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
244 write_lock_bh(&ipgre_lock);
246 write_unlock_bh(&ipgre_lock);
252 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
253 struct ip_tunnel_parm *parms, int create)
255 __be32 remote = parms->iph.daddr;
256 __be32 local = parms->iph.saddr;
257 __be32 key = parms->i_key;
258 struct ip_tunnel *t, **tp, *nt;
259 struct net_device *dev;
261 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
263 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
264 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
265 if (key == t->parms.i_key)
273 strlcpy(name, parms->name, IFNAMSIZ);
275 sprintf(name, "gre%%d");
277 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
281 if (strchr(name, '%')) {
282 if (dev_alloc_name(dev, name) < 0)
286 dev->init = ipgre_tunnel_init;
287 nt = netdev_priv(dev);
290 if (register_netdevice(dev) < 0)
294 ipgre_tunnel_link(ign, nt);
302 static void ipgre_tunnel_uninit(struct net_device *dev)
304 struct net *net = dev_net(dev);
305 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
307 ipgre_tunnel_unlink(ign, netdev_priv(dev));
312 static void ipgre_err(struct sk_buff *skb, u32 info)
314 #ifndef I_WISH_WORLD_WERE_PERFECT
316 /* It is not :-( All the routers (except for Linux) return only
317 8 bytes of packet payload. It means, that precise relaying of
318 ICMP in the real Internet is absolutely infeasible.
320 Moreover, Cisco "wise men" put GRE key to the third word
321 in GRE header. It makes impossible maintaining even soft state for keyed
322 GRE tunnels with enabled checksum. Tell them "thank you".
324 Well, I wonder, rfc1812 was written by Cisco employee,
325 what the hell these idiots break standrads established
329 struct iphdr *iph = (struct iphdr*)skb->data;
330 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
331 int grehlen = (iph->ihl<<2) + 4;
332 const int type = icmp_hdr(skb)->type;
333 const int code = icmp_hdr(skb)->code;
338 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
339 if (flags&(GRE_VERSION|GRE_ROUTING))
348 /* If only 8 bytes returned, keyed message will be dropped here */
349 if (skb_headlen(skb) < grehlen)
354 case ICMP_PARAMETERPROB:
357 case ICMP_DEST_UNREACH:
360 case ICMP_PORT_UNREACH:
361 /* Impossible event. */
363 case ICMP_FRAG_NEEDED:
364 /* Soft state for pmtu is maintained by IP core. */
367 /* All others are translated to HOST_UNREACH.
368 rfc2003 contains "deep thoughts" about NET_UNREACH,
369 I believe they are just ether pollution. --ANK
374 case ICMP_TIME_EXCEEDED:
375 if (code != ICMP_EXC_TTL)
380 read_lock(&ipgre_lock);
381 t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
383 *(((__be32*)p) + (grehlen>>2) - 1) : 0);
384 if (t == NULL || t->parms.iph.daddr == 0 ||
385 ipv4_is_multicast(t->parms.iph.daddr))
388 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
391 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
395 t->err_time = jiffies;
397 read_unlock(&ipgre_lock);
400 struct iphdr *iph = (struct iphdr*)dp;
402 __be16 *p = (__be16*)(dp+(iph->ihl<<2));
403 const int type = icmp_hdr(skb)->type;
404 const int code = icmp_hdr(skb)->code;
410 int grehlen = (iph->ihl<<2) + 4;
411 struct sk_buff *skb2;
415 if (p[1] != htons(ETH_P_IP))
419 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
420 if (flags&(GRE_VERSION|GRE_ROUTING))
429 if (len < grehlen + sizeof(struct iphdr))
431 eiph = (struct iphdr*)(dp + grehlen);
436 case ICMP_PARAMETERPROB:
437 n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
438 if (n < (iph->ihl<<2))
441 /* So... This guy found something strange INSIDE encapsulated
442 packet. Well, he is fool, but what can we do ?
444 rel_type = ICMP_PARAMETERPROB;
446 rel_info = htonl(n << 24);
449 case ICMP_DEST_UNREACH:
452 case ICMP_PORT_UNREACH:
453 /* Impossible event. */
455 case ICMP_FRAG_NEEDED:
456 /* And it is the only really necessary thing :-) */
457 n = ntohs(icmp_hdr(skb)->un.frag.mtu);
461 /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
462 if (n > ntohs(eiph->tot_len))
467 /* All others are translated to HOST_UNREACH.
468 rfc2003 contains "deep thoughts" about NET_UNREACH,
469 I believe, it is just ether pollution. --ANK
471 rel_type = ICMP_DEST_UNREACH;
472 rel_code = ICMP_HOST_UNREACH;
476 case ICMP_TIME_EXCEEDED:
477 if (code != ICMP_EXC_TTL)
482 /* Prepare fake skb to feed it to icmp_send */
483 skb2 = skb_clone(skb, GFP_ATOMIC);
486 dst_release(skb2->dst);
488 skb_pull(skb2, skb->data - (u8*)eiph);
489 skb_reset_network_header(skb2);
491 /* Try to guess incoming interface */
492 memset(&fl, 0, sizeof(fl));
493 fl.fl4_dst = eiph->saddr;
494 fl.fl4_tos = RT_TOS(eiph->tos);
495 fl.proto = IPPROTO_GRE;
496 if (ip_route_output_key(&init_net, &rt, &fl)) {
500 skb2->dev = rt->u.dst.dev;
502 /* route "incoming" packet */
503 if (rt->rt_flags&RTCF_LOCAL) {
506 fl.fl4_dst = eiph->daddr;
507 fl.fl4_src = eiph->saddr;
508 fl.fl4_tos = eiph->tos;
509 if (ip_route_output_key(&init_net, &rt, &fl) ||
510 rt->u.dst.dev->type != ARPHRD_IPGRE) {
517 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
518 skb2->dst->dev->type != ARPHRD_IPGRE) {
524 /* change mtu on this route */
525 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
526 if (n > dst_mtu(skb2->dst)) {
530 skb2->dst->ops->update_pmtu(skb2->dst, n);
531 } else if (type == ICMP_TIME_EXCEEDED) {
532 struct ip_tunnel *t = netdev_priv(skb2->dev);
533 if (t->parms.iph.ttl) {
534 rel_type = ICMP_DEST_UNREACH;
535 rel_code = ICMP_HOST_UNREACH;
539 icmp_send(skb2, rel_type, rel_code, rel_info);
544 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
546 if (INET_ECN_is_ce(iph->tos)) {
547 if (skb->protocol == htons(ETH_P_IP)) {
548 IP_ECN_set_ce(ip_hdr(skb));
549 } else if (skb->protocol == htons(ETH_P_IPV6)) {
550 IP6_ECN_set_ce(ipv6_hdr(skb));
556 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
559 if (skb->protocol == htons(ETH_P_IP))
560 inner = old_iph->tos;
561 else if (skb->protocol == htons(ETH_P_IPV6))
562 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
563 return INET_ECN_encapsulate(tos, inner);
566 static int ipgre_rcv(struct sk_buff *skb)
574 struct ip_tunnel *tunnel;
577 if (!pskb_may_pull(skb, 16))
584 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
585 /* - Version must be 0.
586 - We do not support routing headers.
588 if (flags&(GRE_VERSION|GRE_ROUTING))
591 if (flags&GRE_CSUM) {
592 switch (skb->ip_summed) {
593 case CHECKSUM_COMPLETE:
594 csum = csum_fold(skb->csum);
600 csum = __skb_checksum_complete(skb);
601 skb->ip_summed = CHECKSUM_COMPLETE;
606 key = *(__be32*)(h + offset);
610 seqno = ntohl(*(__be32*)(h + offset));
615 read_lock(&ipgre_lock);
616 if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
617 iph->saddr, iph->daddr, key)) != NULL) {
620 skb->protocol = *(__be16*)(h + 2);
621 /* WCCP version 1 and 2 protocol decoding.
622 * - Change protocol to IP
623 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
626 skb->protocol == htons(ETH_P_WCCP)) {
627 skb->protocol = htons(ETH_P_IP);
628 if ((*(h + offset) & 0xF0) != 0x40)
632 skb->mac_header = skb->network_header;
633 __pskb_pull(skb, offset);
634 skb_reset_network_header(skb);
635 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
636 skb->pkt_type = PACKET_HOST;
637 #ifdef CONFIG_NET_IPGRE_BROADCAST
638 if (ipv4_is_multicast(iph->daddr)) {
639 /* Looped back packet, drop it! */
640 if (skb->rtable->fl.iif == 0)
642 tunnel->stat.multicast++;
643 skb->pkt_type = PACKET_BROADCAST;
647 if (((flags&GRE_CSUM) && csum) ||
648 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
649 tunnel->stat.rx_crc_errors++;
650 tunnel->stat.rx_errors++;
653 if (tunnel->parms.i_flags&GRE_SEQ) {
654 if (!(flags&GRE_SEQ) ||
655 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
656 tunnel->stat.rx_fifo_errors++;
657 tunnel->stat.rx_errors++;
660 tunnel->i_seqno = seqno + 1;
662 tunnel->stat.rx_packets++;
663 tunnel->stat.rx_bytes += skb->len;
664 skb->dev = tunnel->dev;
665 dst_release(skb->dst);
668 ipgre_ecn_decapsulate(iph, skb);
670 read_unlock(&ipgre_lock);
673 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
676 read_unlock(&ipgre_lock);
682 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
684 struct ip_tunnel *tunnel = netdev_priv(dev);
685 struct net_device_stats *stats = &tunnel->stat;
686 struct iphdr *old_iph = ip_hdr(skb);
690 struct rtable *rt; /* Route to the other host */
691 struct net_device *tdev; /* Device to other host */
692 struct iphdr *iph; /* Our new IP header */
693 unsigned int max_headroom; /* The extra header space needed */
698 if (tunnel->recursion++) {
699 tunnel->stat.collisions++;
703 if (dev->header_ops) {
705 tiph = (struct iphdr*)skb->data;
707 gre_hlen = tunnel->hlen;
708 tiph = &tunnel->parms.iph;
711 if ((dst = tiph->daddr) == 0) {
714 if (skb->dst == NULL) {
715 tunnel->stat.tx_fifo_errors++;
719 if (skb->protocol == htons(ETH_P_IP)) {
721 if ((dst = rt->rt_gateway) == 0)
725 else if (skb->protocol == htons(ETH_P_IPV6)) {
726 struct in6_addr *addr6;
728 struct neighbour *neigh = skb->dst->neighbour;
733 addr6 = (struct in6_addr*)&neigh->primary_key;
734 addr_type = ipv6_addr_type(addr6);
736 if (addr_type == IPV6_ADDR_ANY) {
737 addr6 = &ipv6_hdr(skb)->daddr;
738 addr_type = ipv6_addr_type(addr6);
741 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
744 dst = addr6->s6_addr32[3];
753 if (skb->protocol == htons(ETH_P_IP))
759 struct flowi fl = { .oif = tunnel->parms.link,
762 .saddr = tiph->saddr,
763 .tos = RT_TOS(tos) } },
764 .proto = IPPROTO_GRE };
765 if (ip_route_output_key(&init_net, &rt, &fl)) {
766 tunnel->stat.tx_carrier_errors++;
770 tdev = rt->u.dst.dev;
774 tunnel->stat.collisions++;
780 mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
782 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
785 skb->dst->ops->update_pmtu(skb->dst, mtu);
787 if (skb->protocol == htons(ETH_P_IP)) {
788 df |= (old_iph->frag_off&htons(IP_DF));
790 if ((old_iph->frag_off&htons(IP_DF)) &&
791 mtu < ntohs(old_iph->tot_len)) {
792 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
798 else if (skb->protocol == htons(ETH_P_IPV6)) {
799 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
801 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
802 if ((tunnel->parms.iph.daddr &&
803 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
804 rt6->rt6i_dst.plen == 128) {
805 rt6->rt6i_flags |= RTF_MODIFIED;
806 skb->dst->metrics[RTAX_MTU-1] = mtu;
810 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
811 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
818 if (tunnel->err_count > 0) {
819 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
822 dst_link_failure(skb);
824 tunnel->err_count = 0;
827 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
829 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
830 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
831 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
840 skb_set_owner_w(new_skb, skb->sk);
843 old_iph = ip_hdr(skb);
846 skb->transport_header = skb->network_header;
847 skb_push(skb, gre_hlen);
848 skb_reset_network_header(skb);
849 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
850 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
852 dst_release(skb->dst);
853 skb->dst = &rt->u.dst;
856 * Push down and install the IPIP header.
861 iph->ihl = sizeof(struct iphdr) >> 2;
863 iph->protocol = IPPROTO_GRE;
864 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
865 iph->daddr = rt->rt_dst;
866 iph->saddr = rt->rt_src;
868 if ((iph->ttl = tiph->ttl) == 0) {
869 if (skb->protocol == htons(ETH_P_IP))
870 iph->ttl = old_iph->ttl;
872 else if (skb->protocol == htons(ETH_P_IPV6))
873 iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
876 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
879 ((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
880 ((__be16*)(iph+1))[1] = skb->protocol;
882 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
883 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
885 if (tunnel->parms.o_flags&GRE_SEQ) {
887 *ptr = htonl(tunnel->o_seqno);
890 if (tunnel->parms.o_flags&GRE_KEY) {
891 *ptr = tunnel->parms.o_key;
894 if (tunnel->parms.o_flags&GRE_CSUM) {
896 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
907 dst_link_failure(skb);
916 static void ipgre_tunnel_bind_dev(struct net_device *dev)
918 struct net_device *tdev = NULL;
919 struct ip_tunnel *tunnel;
921 int hlen = LL_MAX_HEADER;
922 int mtu = ETH_DATA_LEN;
923 int addend = sizeof(struct iphdr) + 4;
925 tunnel = netdev_priv(dev);
926 iph = &tunnel->parms.iph;
928 /* Guess output device to choose reasonable mtu and hard_header_len */
931 struct flowi fl = { .oif = tunnel->parms.link,
933 { .daddr = iph->daddr,
935 .tos = RT_TOS(iph->tos) } },
936 .proto = IPPROTO_GRE };
938 if (!ip_route_output_key(&init_net, &rt, &fl)) {
939 tdev = rt->u.dst.dev;
942 dev->flags |= IFF_POINTOPOINT;
945 if (!tdev && tunnel->parms.link)
946 tdev = __dev_get_by_index(&init_net, tunnel->parms.link);
949 hlen = tdev->hard_header_len;
952 dev->iflink = tunnel->parms.link;
954 /* Precalculate GRE options length */
955 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
956 if (tunnel->parms.o_flags&GRE_CSUM)
958 if (tunnel->parms.o_flags&GRE_KEY)
960 if (tunnel->parms.o_flags&GRE_SEQ)
963 dev->hard_header_len = hlen + addend;
964 dev->mtu = mtu - addend;
965 tunnel->hlen = addend;
970 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
973 struct ip_tunnel_parm p;
975 struct net *net = dev_net(dev);
976 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
981 if (dev == ign->fb_tunnel_dev) {
982 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
986 t = ipgre_tunnel_locate(net, &p, 0);
989 t = netdev_priv(dev);
990 memcpy(&p, &t->parms, sizeof(p));
991 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
998 if (!capable(CAP_NET_ADMIN))
1002 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1006 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1007 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1008 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1011 p.iph.frag_off |= htons(IP_DF);
1013 if (!(p.i_flags&GRE_KEY))
1015 if (!(p.o_flags&GRE_KEY))
1018 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1020 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1022 if (t->dev != dev) {
1029 t = netdev_priv(dev);
1031 if (ipv4_is_multicast(p.iph.daddr))
1032 nflags = IFF_BROADCAST;
1033 else if (p.iph.daddr)
1034 nflags = IFF_POINTOPOINT;
1036 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1040 ipgre_tunnel_unlink(ign, t);
1041 t->parms.iph.saddr = p.iph.saddr;
1042 t->parms.iph.daddr = p.iph.daddr;
1043 t->parms.i_key = p.i_key;
1044 t->parms.o_key = p.o_key;
1045 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1046 memcpy(dev->broadcast, &p.iph.daddr, 4);
1047 ipgre_tunnel_link(ign, t);
1048 netdev_state_change(dev);
1054 if (cmd == SIOCCHGTUNNEL) {
1055 t->parms.iph.ttl = p.iph.ttl;
1056 t->parms.iph.tos = p.iph.tos;
1057 t->parms.iph.frag_off = p.iph.frag_off;
1058 if (t->parms.link != p.link) {
1059 t->parms.link = p.link;
1060 ipgre_tunnel_bind_dev(dev);
1061 netdev_state_change(dev);
1064 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1067 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1072 if (!capable(CAP_NET_ADMIN))
1075 if (dev == ign->fb_tunnel_dev) {
1077 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1080 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1083 if (t == netdev_priv(ign->fb_tunnel_dev))
1087 unregister_netdevice(dev);
1099 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1101 return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
1104 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1106 struct ip_tunnel *tunnel = netdev_priv(dev);
1107 if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
1113 /* Nice toy. Unfortunately, useless in real life :-)
1114 It allows to construct virtual multiprotocol broadcast "LAN"
1115 over the Internet, provided multicast routing is tuned.
1118 I have no idea whether this bicycle was invented before me,
1119 so that I had to set ARPHRD_IPGRE to a random value.
1120 I have an impression, that Cisco could make something similar,
1121 but this feature is apparently missing in IOS<=11.2(8).
1123 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1124 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1126 ping -t 255 224.66.66.66
1128 If nobody answers, mbone does not work.
1130 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1131 ip addr add 10.66.66.<somewhat>/24 dev Universe
1132 ifconfig Universe up
1133 ifconfig Universe add fe80::<Your_real_addr>/10
1134 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1137 ftp fec0:6666:6666::193.233.7.65
1142 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1143 unsigned short type,
1144 const void *daddr, const void *saddr, unsigned len)
1146 struct ip_tunnel *t = netdev_priv(dev);
1147 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1148 __be16 *p = (__be16*)(iph+1);
1150 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1151 p[0] = t->parms.o_flags;
1155 * Set the source hardware address.
1159 memcpy(&iph->saddr, saddr, 4);
1162 memcpy(&iph->daddr, daddr, 4);
1165 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1171 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1173 struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1174 memcpy(haddr, &iph->saddr, 4);
1178 static const struct header_ops ipgre_header_ops = {
1179 .create = ipgre_header,
1180 .parse = ipgre_header_parse,
1183 #ifdef CONFIG_NET_IPGRE_BROADCAST
1184 static int ipgre_open(struct net_device *dev)
1186 struct ip_tunnel *t = netdev_priv(dev);
1188 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1189 struct flowi fl = { .oif = t->parms.link,
1191 { .daddr = t->parms.iph.daddr,
1192 .saddr = t->parms.iph.saddr,
1193 .tos = RT_TOS(t->parms.iph.tos) } },
1194 .proto = IPPROTO_GRE };
1196 if (ip_route_output_key(&init_net, &rt, &fl))
1197 return -EADDRNOTAVAIL;
1198 dev = rt->u.dst.dev;
1200 if (__in_dev_get_rtnl(dev) == NULL)
1201 return -EADDRNOTAVAIL;
1202 t->mlink = dev->ifindex;
1203 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1208 static int ipgre_close(struct net_device *dev)
1210 struct ip_tunnel *t = netdev_priv(dev);
1211 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1212 struct in_device *in_dev;
1213 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1215 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1224 static void ipgre_tunnel_setup(struct net_device *dev)
1226 dev->uninit = ipgre_tunnel_uninit;
1227 dev->destructor = free_netdev;
1228 dev->hard_start_xmit = ipgre_tunnel_xmit;
1229 dev->get_stats = ipgre_tunnel_get_stats;
1230 dev->do_ioctl = ipgre_tunnel_ioctl;
1231 dev->change_mtu = ipgre_tunnel_change_mtu;
1233 dev->type = ARPHRD_IPGRE;
1234 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1235 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1236 dev->flags = IFF_NOARP;
1241 static int ipgre_tunnel_init(struct net_device *dev)
1243 struct ip_tunnel *tunnel;
1246 tunnel = netdev_priv(dev);
1247 iph = &tunnel->parms.iph;
1250 strcpy(tunnel->parms.name, dev->name);
1252 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1253 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1255 ipgre_tunnel_bind_dev(dev);
1258 #ifdef CONFIG_NET_IPGRE_BROADCAST
1259 if (ipv4_is_multicast(iph->daddr)) {
1262 dev->flags = IFF_BROADCAST;
1263 dev->header_ops = &ipgre_header_ops;
1264 dev->open = ipgre_open;
1265 dev->stop = ipgre_close;
1269 dev->header_ops = &ipgre_header_ops;
1274 static int ipgre_fb_tunnel_init(struct net_device *dev)
1276 struct ip_tunnel *tunnel = netdev_priv(dev);
1277 struct iphdr *iph = &tunnel->parms.iph;
1278 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1281 strcpy(tunnel->parms.name, dev->name);
1284 iph->protocol = IPPROTO_GRE;
1286 tunnel->hlen = sizeof(struct iphdr) + 4;
1289 ign->tunnels_wc[0] = tunnel;
1294 static struct net_protocol ipgre_protocol = {
1295 .handler = ipgre_rcv,
1296 .err_handler = ipgre_err,
1299 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1303 for (prio = 0; prio < 4; prio++) {
1305 for (h = 0; h < HASH_SIZE; h++) {
1306 struct ip_tunnel *t;
1307 while ((t = ign->tunnels[prio][h]) != NULL)
1308 unregister_netdevice(t->dev);
1313 static int ipgre_init_net(struct net *net)
1316 struct ipgre_net *ign;
1319 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1323 err = net_assign_generic(net, ipgre_net_id, ign);
1327 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1328 ipgre_tunnel_setup);
1329 if (!ign->fb_tunnel_dev) {
1334 ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1335 dev_net_set(ign->fb_tunnel_dev, net);
1337 if ((err = register_netdev(ign->fb_tunnel_dev)))
1343 free_netdev(ign->fb_tunnel_dev);
1352 static void ipgre_exit_net(struct net *net)
1354 struct ipgre_net *ign;
1356 ign = net_generic(net, ipgre_net_id);
1358 ipgre_destroy_tunnels(ign);
1363 static struct pernet_operations ipgre_net_ops = {
1364 .init = ipgre_init_net,
1365 .exit = ipgre_exit_net,
1369 * And now the modules code and kernel interface.
1372 static int __init ipgre_init(void)
1376 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1378 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1379 printk(KERN_INFO "ipgre init: can't add protocol\n");
1383 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1385 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1390 static void __exit ipgre_fini(void)
1392 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1393 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1395 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1398 module_init(ipgre_init);
1399 module_exit(ipgre_fini);
1400 MODULE_LICENSE("GPL");