/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>

#include <net/protocol.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include <net/ip6_fib.h>
#include <net/ip6_route.h>
/*
   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. That is the best
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: the t->recursion lock breaks dead loops. It looks
   like the dev->tbusy flag, but I preferred a new variable, because
   the semantics are different. One day, when hard_start_xmit
   becomes multithreaded, we will have to use skb->encapsulation.
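
   For illustration, the recursion guard amounts to roughly the
   following pattern in the transmit path (a simplified sketch of the
   idea, not the literal code; the real ipgre_tunnel_xmit() below also
   updates the error statistics):

	static int example_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct ip_tunnel *tunnel = netdev_priv(dev);

		if (tunnel->recursion++) {
			tunnel->stat.collisions++;	<- already inside our own xmit
			dev_kfree_skb(skb);		<- drop instead of looping
			tunnel->recursion--;
			return 0;
		}
		... build the outer header, hand the skb to IP ...
		tunnel->recursion--;
		return 0;
	}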

   2. Networking dead loops would not kill routers, but they would really
   kill the network. The IP hop limit plays the role of "t->recursion" in
   this case, if we copy it from the packet being encapsulated to the
   outer header. It is a very good solution, but it introduces two
   problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea turned out to be wrong:
     only Linux complies with RFC 1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least in my neighbourhood)
     return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we have to search for another solution.

   One option is to parse the packet and try to detect an inner
   encapsulation made by our own node. That is difficult or even
   impossible, especially once fragmentation is taken into account.
   In short, it is no solution at all.

   Current solution: the solution turned out to be UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   and that is ALL. :-) It does not remove the problem completely,
   but exponential growth of network traffic is reduced to linear
   growth (branches that exceed the pmtu are pruned), and the tunnel
   mtu quickly degrades to a value below 68, at which point the looping
   stops. Yes, it is not good if there is a router in the loop
   which does not force DF even when the encapsulated packets have DF set.
   But that is not our problem! Nobody could accuse us; we did
   all that we could do. Even if it was your gated that injected the
   fatal route into the network, even if it was you who configured the
   fatal static route: you are innocent. :-)
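
   For illustration, forcing DF when a hop limit is preconfigured is
   essentially a one-liner at tunnel configuration time (a sketch of
   the idea; the corresponding check lives in ipgre_tunnel_ioctl()
   below):

	if (p.iph.ttl)
		p.iph.frag_off |= htons(IP_DF);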

   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl etc.)
   into a separate module (ip_tunnel.c).
 */

static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

static int ipgre_net_id;

static struct net_device *ipgre_fb_tunnel_dev;

/* Tunnel hash table */

/*
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if they do not match a configured keyless tunnel,
   will match the fallback tunnel.
 */

#define HASH_SIZE	16
#define HASH(addr)	(((__force u32)addr^((__force u32)addr>>4))&0xF)

static struct ip_tunnel *tunnels[4][HASH_SIZE];

#define tunnels_r_l	(tunnels[3])
#define tunnels_r	(tunnels[2])
#define tunnels_l	(tunnels[1])
#define tunnels_wc	(tunnels[0])
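
/*
 * For reference, ipgre_tunnel_lookup() below walks these tables from the
 * most specific to the least specific bucket:
 *
 *	tunnels_r_l  - both remote and local addresses match
 *	tunnels_r    - only the remote address matches
 *	tunnels_l    - only the local (or a multicast destination) address matches
 *	tunnels_wc   - wildcard tunnels, matched by key alone
 *
 * A minimal sketch of the bucket selection for a received packet
 * (illustration only, not a replacement for the real lookup):
 *
 *	unsigned h0 = HASH(iph->saddr);			hash of the sending peer
 *	unsigned h1 = HASH(key);			0 for keyless packets
 *	struct ip_tunnel *t = tunnels_r_l[h0 ^ h1];	most specific bucket first
 */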

static DEFINE_RWLOCK(ipgre_lock);

/* Given src, dst and key, find the appropriate tunnel for the incoming packet. */

static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be32 key)
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);

	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))

	for (t = tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))

	for (t = tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		    (local == t->parms.iph.daddr &&
		     ipv4_is_multicast(local))) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))

	for (t = tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))

	if (ipgre_fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ipgre_fb_tunnel_dev);

static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms)
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned h = HASH(key);
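	/*
	 * prio selects one of the four tunnel tables above: bit 0 is set
	 * when a local address is configured, bit 1 when a unicast remote
	 * address is configured, in which case HASH(remote) is folded
	 * into the bucket index as well.
	 */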
	if (remote && !ipv4_is_multicast(remote)) {

	return &tunnels[prio][h];

static inline struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
	return __ipgre_bucket(&t->parms);

static void ipgre_tunnel_link(struct ip_tunnel *t)
	struct ip_tunnel **tp = ipgre_bucket(t);

	write_lock_bh(&ipgre_lock);
	write_unlock_bh(&ipgre_lock);

static void ipgre_tunnel_unlink(struct ip_tunnel *t)
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
		write_lock_bh(&ipgre_lock);
		write_unlock_bh(&ipgre_lock);

static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;

	for (tp = __ipgre_bucket(parms); (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)

	strlcpy(name, parms->name, IFNAMSIZ);
	sprintf(name, "gre%%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);

	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)

	dev->init = ipgre_tunnel_init;
	nt = netdev_priv(dev);

	if (register_netdevice(dev) < 0)

	ipgre_tunnel_link(nt);

static void ipgre_tunnel_uninit(struct net_device *dev)
	ipgre_tunnel_unlink(netdev_priv(dev));

static void ipgre_err(struct sk_buff *skb, u32 info)
#ifndef I_WISH_WORLD_WERE_PERFECT

/* It is not :-( All the routers (except for Linux) return only
   8 bytes of packet payload. That means precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, the Cisco "wise men" put the GRE key in the third word
   of the GRE header. That makes it impossible to maintain even soft state
   for keyed GRE tunnels with checksums enabled. Tell them "thank you".

   Well, I wonder: RFC 1812 was written by a Cisco employee, so what
   the hell, these idiots break the standards they established
   themselves???
 */
	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)

	case ICMP_PARAMETERPROB:

	case ICMP_DEST_UNREACH:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */

			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)

	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((__be32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)

	t->err_time = jiffies;

	read_unlock(&ipgre_lock);
	struct iphdr *iph = (struct iphdr*)dp;
	__be16 *p = (__be16*)(dp+(iph->ihl<<2));
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	int grehlen = (iph->ihl<<2) + 4;
	struct sk_buff *skb2;

	if (p[1] != htons(ETH_P_IP))

	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))

	if (len < grehlen + sizeof(struct iphdr))
	eiph = (struct iphdr*)(dp + grehlen);

	case ICMP_PARAMETERPROB:
		n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
		if (n < (iph->ihl<<2))

		/* So... This guy found something strange INSIDE the
		   encapsulated packet. Well, he is a fool, but what can we do?
		 */
		rel_type = ICMP_PARAMETERPROB;
		rel_info = htonl(n << 24);

	case ICMP_DEST_UNREACH:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
		case ICMP_FRAG_NEEDED:
			/* And it is the only really necessary thing :-) */
			n = ntohs(icmp_hdr(skb)->un.frag.mtu);
			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
			if (n > ntohs(eiph->tot_len))

			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe it is just ether pollution. --ANK
			 */
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)

	/* Prepare fake skb to feed it to icmp_send */
	skb2 = skb_clone(skb, GFP_ATOMIC);

	dst_release(skb2->dst);
	skb_pull(skb2, skb->data - (u8*)eiph);
	skb_reset_network_header(skb2);

	/* Try to guess incoming interface */
	memset(&fl, 0, sizeof(fl));
	fl.fl4_dst = eiph->saddr;
	fl.fl4_tos = RT_TOS(eiph->tos);
	fl.proto = IPPROTO_GRE;
	if (ip_route_output_key(&init_net, &rt, &fl)) {

	skb2->dev = rt->u.dst.dev;

	/* route "incoming" packet */
	if (rt->rt_flags&RTCF_LOCAL) {
		fl.fl4_dst = eiph->daddr;
		fl.fl4_src = eiph->saddr;
		fl.fl4_tos = eiph->tos;
		if (ip_route_output_key(&init_net, &rt, &fl) ||
		    rt->u.dst.dev->type != ARPHRD_IPGRE) {

		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
		    skb2->dst->dev->type != ARPHRD_IPGRE) {

	/* change mtu on this route */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		if (n > dst_mtu(skb2->dst)) {

		skb2->dst->ops->update_pmtu(skb2->dst, n);
	} else if (type == ICMP_TIME_EXCEEDED) {
		struct ip_tunnel *t = netdev_priv(skb2->dev);
		if (t->parms.iph.ttl) {
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;

	icmp_send(skb2, rel_type, rel_code, rel_info);

static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));

static inline u8
ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
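
/*
 * Taken together, the two helpers above implement the usual tunnel ECN
 * behaviour: on encapsulation the inner ECN bits are copied into the
 * outer TOS (with CE downgraded to ECT(0) by INET_ECN_encapsulate()),
 * and on decapsulation a CE mark on the outer header is propagated to
 * the inner packet. A rough illustration of the intended outcome:
 *
 *	inner ECT(0), outer not CE  ->  inner stays ECT(0)
 *	inner ECT(0), outer CE      ->  inner becomes CE
 *	inner Not-ECT, outer CE     ->  inner left as Not-ECT
 *	                                (IP_ECN_set_ce() refuses to mark
 *	                                 a Not-ECT packet)
 */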

static int ipgre_rcv(struct sk_buff *skb)
	struct ip_tunnel *tunnel;

	if (!pskb_may_pull(skb, 16))

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			key = *(__be32*)(h + offset);
			seqno = ntohl(*(__be32*)(h + offset));

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {

		skb->protocol = *(__be16*)(h + 2);
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		    skb->protocol == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_reset_network_header(skb);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb->rtable->fl.iif == 0)
			tunnel->stat.multicast++;
			skb->pkt_type = PACKET_BROADCAST;

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->stat.rx_crc_errors++;
			tunnel->stat.rx_errors++;
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->stat.rx_fifo_errors++;
				tunnel->stat.rx_errors++;
			tunnel->i_seqno = seqno + 1;

		tunnel->stat.rx_packets++;
		tunnel->stat.rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		ipgre_ecn_decapsulate(iph, skb);
		read_unlock(&ipgre_lock);

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	read_unlock(&ipgre_lock);

static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->stat;
	struct iphdr *old_iph = ip_hdr(skb);
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */

	if (tunnel->recursion++) {
		tunnel->stat.collisions++;

	if (dev->header_ops) {
		tiph = (struct iphdr*)skb->data;
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;

	if ((dst = tiph->daddr) == 0) {

		if (skb->dst == NULL) {
			tunnel->stat.tx_fifo_errors++;

		if (skb->protocol == htons(ETH_P_IP)) {
			if ((dst = rt->rt_gateway) == 0)

		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			struct neighbour *neigh = skb->dst->neighbour;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)

			dst = addr6->s6_addr32[3];

	if (skb->protocol == htons(ETH_P_IP))

	struct flowi fl = { .oif = tunnel->parms.link,
			    .saddr = tiph->saddr,
			    .tos = RT_TOS(tos) } },
			    .proto = IPPROTO_GRE };
	if (ip_route_output_key(&init_net, &rt, &fl)) {
		tunnel->stat.tx_carrier_errors++;

	tdev = rt->u.dst.dev;

		tunnel->stat.collisions++;

		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));

	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);

	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {

			dst_link_failure(skb);
			tunnel->err_count = 0;

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
			skb_set_owner_w(new_skb, skb->sk);
		old_iph = ip_hdr(skb);

	skb->transport_header = skb->network_header;
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IP + GRE header.
	 */

	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->protocol = IPPROTO_GRE;
	iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);

	((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
	((__be16*)(iph+1))[1] = skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			*ptr = htonl(tunnel->o_seqno);
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
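
		/*
		 * For orientation, the outer headers built above look
		 * roughly like this on the wire; the optional words are
		 * present only when the corresponding o_flags bit is set,
		 * in the order checksum, key, sequence number:
		 *
		 *	[ outer IPv4 header, proto = IPPROTO_GRE     ]  20 bytes
		 *	[ GRE flags (o_flags) | protocol (ETH_P_...) ]   4 bytes
		 *	[ checksum | reserved                        ]   4 bytes, GRE_CSUM
		 *	[ key                                        ]   4 bytes, GRE_KEY
		 *	[ sequence number                            ]   4 bytes, GRE_SEQ
		 */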

	dst_link_failure(skb);

static void ipgre_tunnel_bind_dev(struct net_device *dev)
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and hard_header_len */

		struct flowi fl = { .oif = tunnel->parms.link,
				    { .daddr = iph->daddr,
				      .tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };

		if (!ip_route_output_key(&init_net, &rt, &fl)) {
			tdev = rt->u.dst.dev;
		dev->flags |= IFF_POINTOPOINT;

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(&init_net, tunnel->parms.link);

		hlen = tdev->hard_header_len;
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
		if (tunnel->parms.o_flags&GRE_KEY)
		if (tunnel->parms.o_flags&GRE_SEQ)
	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;
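	/*
	 * Example: with both GRE_KEY and GRE_CSUM configured, addend works
	 * out to 20 (outer IP) + 4 (GRE flags/protocol) + 4 (checksum) +
	 * 4 (key) = 32 bytes, which is reserved in hard_header_len and
	 * subtracted from the device mtu above.
	 */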

static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
	struct ip_tunnel_parm p;

		if (dev == ipgre_fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
			t = ipgre_tunnel_locate(&p, 0);
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))

		if (!capable(CAP_NET_ADMIN))

		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))

		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))

			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
		if (!(p.o_flags&GRE_KEY))

		t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);

		if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
				if (t->dev != dev) {
			t = netdev_priv(dev);

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {

			ipgre_tunnel_unlink(t);
			t->parms.iph.saddr = p.iph.saddr;
			t->parms.iph.daddr = p.iph.daddr;
			t->parms.i_key = p.i_key;
			t->parms.o_key = p.o_key;
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
			ipgre_tunnel_link(t);
			netdev_state_change(dev);

			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);

			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);

		if (!capable(CAP_NET_ADMIN))

		if (dev == ipgre_fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
			if (t == netdev_priv(ipgre_fb_tunnel_dev))

		unregister_netdevice(dev);

static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
	return &(((struct ip_tunnel*)netdev_priv(dev))->stat);

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could do something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96

   ftp fec0:6666:6666::193.233.7.65
 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;

	/*
	 *	Set the source hardware address.
	 */

	memcpy(&iph->saddr, saddr, 4);

	memcpy(&iph->daddr, daddr, 4);

	if (iph->daddr && !ipv4_is_multicast(iph->daddr))

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
	struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    { .daddr = t->parms.iph.daddr,
				      .saddr = t->parms.iph.saddr,
				      .tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };

		if (ip_route_output_key(&init_net, &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;

		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);

static int ipgre_close(struct net_device *dev)
	struct ip_tunnel *t = netdev_priv(dev);
	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);

static void ipgre_tunnel_setup(struct net_device *dev)
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor		= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->get_stats		= ipgre_tunnel_get_stats;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->type		= ARPHRD_IPGRE;
	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;

static int ipgre_tunnel_init(struct net_device *dev)
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	ipgre_tunnel_bind_dev(dev);

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		dev->flags = IFF_BROADCAST;
		dev->header_ops = &ipgre_header_ops;
		dev->open = ipgre_open;
		dev->stop = ipgre_close;

		dev->header_ops = &ipgre_header_ops;

static int __init ipgre_fb_tunnel_init(struct net_device *dev)
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	strcpy(tunnel->parms.name, dev->name);

	iph->protocol = IPPROTO_GRE;

	tunnel->hlen = sizeof(struct iphdr) + 4;

	tunnels_wc[0] = tunnel;

static struct net_protocol ipgre_protocol = {
	.handler	= ipgre_rcv,
	.err_handler	= ipgre_err,

static int ipgre_init_net(struct net *net)
	struct ipgre_net *ign;

	ign = kmalloc(sizeof(struct ipgre_net), GFP_KERNEL);

	err = net_assign_generic(net, ipgre_net_id, ign);

static void ipgre_exit_net(struct net *net)
	struct ipgre_net *ign;

	ign = net_generic(net, ipgre_net_id);

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");

	ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ipgre_fb_tunnel_dev) {

	ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;

	if ((err = register_netdev(ipgre_fb_tunnel_dev)))

	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);

	free_netdev(ipgre_fb_tunnel_dev);
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
	unregister_netdevice(ipgre_fb_tunnel_dev);

static void __exit ipgre_destroy_tunnels(void)
	for (prio = 0; prio < 4; prio++) {
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;
			while ((t = tunnels[prio][h]) != NULL)
				unregister_netdevice(t->dev);

static void __exit ipgre_fini(void)
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

	ipgre_destroy_tunnels();

	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");