 *	Linux NET3:	GRE over IP protocol decoder.
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <net/protocol.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local TTL)
   and silently drop the packet when it expires. It is the best
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: the t->recursion lock breaks dead loops
   (see the sketch below this comment). It looks like the dev->tbusy
   flag, but I preferred a new variable, because the semantics are
   different. One day, when hard_start_xmit becomes multithreaded,
   we will have to use skb->encapsulation.
   2. Networking dead loops would not kill routers, but they would
   really kill the network. The IP hop limit plays the role of
   "t->recursion" in this case, if we copy it from the packet being
   encapsulated to the upper header. It is a very good solution,
   but it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIPv2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output would be
     even more informative. This idea turned out to be wrong: only Linux
     complies with RFC 1812 now (yes, guys, Linux is the only true router
     now :-)); all routers (at least in my neighbourhood) return only
     8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.
   One of them is to parse the packet, trying to detect inner encapsulation
   made by our own node. It is difficult or even impossible, especially
   once fragmentation is taken into account. In short, it is not a solution
   at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the PMTU are pruned) and the tunnel MTU
   quickly degrades to a value < 68, where looping stops.
   Yes, it is not good if there is a router in the loop which does
   not force DF, even when the encapsulating packets have DF set.
   But that is not our problem! Nobody could accuse us; we did
   everything we could. Even if it was your gated that injected the
   fatal route into the network, even if it was you who configured the
   fatal static route: you are innocent. :-)

   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl, etc.)
   into a separate module (ip_tunnel.c).
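/*
 * Illustrative sketch, not part of the original code: the shape of the
 * t->recursion guard described in (1) above, as used at the top of
 * ipgre_tunnel_xmit() below.  Documentation only.
 */
#if 0
	if (tunnel->recursion++) {	/* re-entered our own transmit path */
		tunnel->dev->stats.collisions++;
		goto tx_error;		/* drop the packet instead of looping */
	}
	/* ... encapsulate and transmit ... */
	tunnel->recursion--;		/* on every exit path */
#endif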
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */
static int ipgre_fb_tunnel_init(struct net_device *dev);

static int ipgre_net_id;
	struct ip_tunnel *tunnels[4][HASH_SIZE];
	struct net_device *fb_tunnel_dev;
/* Tunnel hash table */

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
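/*
 * Illustrative sketch, not part of the original code: how a packet's outer
 * addresses and key select a bucket under the scheme above.  HASH() folds a
 * 32-bit value into one of the 16 buckets of each sub-table; for the
 * "remote and local" table the two hashes are combined, exactly as
 * ipgre_tunnel_lookup() does below.  The helper name is hypothetical.
 */
static inline unsigned int example_rl_bucket(__be32 remote, __be32 key)
{
	unsigned int h0 = HASH(remote);	/* 0..15 from the remote address */
	unsigned int h1 = HASH(key);	/* 0..15 from the key, 0 if keyless */

	return h0 ^ h1;			/* index into tunnels_r_l[] */
}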
static DEFINE_RWLOCK(ipgre_lock);

/* Given src, dst and key, find the appropriate tunnel for an input packet. */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t2 = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;

	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
				if (t->dev->type == dev_type)
				if (t->dev->type == ARPHRD_IPGRE && !t2)

	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
				if (t->dev->type == dev_type)
				if (t->dev->type == ARPHRD_IPGRE && !t2)

	for (t = ign->tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		    (local == t->parms.iph.daddr &&
		     ipv4_is_multicast(local))) {
			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
				if (t->dev->type == dev_type)
				if (t->dev->type == ARPHRD_IPGRE && !t2)

	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
			if (t->dev->type == dev_type)
			if (t->dev->type == ARPHRD_IPGRE && !t2)

	if (ign->fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ign->fb_tunnel_dev);
static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
					 struct ip_tunnel_parm *parms)
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned h = HASH(key);

	if (remote && !ipv4_is_multicast(remote)) {

	return &ign->tunnels[prio][h];

static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
	return __ipgre_bucket(ign, &t->parms);

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

	write_lock_bh(&ipgre_lock);
	write_unlock_bh(&ipgre_lock);

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
	write_lock_bh(&ipgre_lock);
	write_unlock_bh(&ipgre_lock);

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	struct ip_tunnel *t, **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    type == t->dev->type)
static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);

		strlcpy(name, parms->name, IFNAMSIZ);
		sprintf(name, "gre%%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);

	dev_net_set(dev, net);

	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)

	nt = netdev_priv(dev);
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)

	ipgre_tunnel_link(ign, nt);

static void ipgre_tunnel_uninit(struct net_device *dev)
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
static void ipgre_err(struct sk_buff *skb, u32 info)

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, the Cisco "wise men" put the GRE key in the third word
   of the GRE header. This makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksum enabled. Tell them "thank you".

   Well, I wonder, RFC 1812 was written by a Cisco employee,
   so why the hell do these idiots break the standards they established
	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)

	case ICMP_PARAMETERPROB:

	case ICMP_DEST_UNREACH:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)

	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
	t->err_time = jiffies;
	read_unlock(&ipgre_lock);
static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));

ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
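/*
 * Illustrative sketch, not part of the original code: the ECN rule the two
 * helpers above implement.  On receive, a CE (Congestion Experienced) mark
 * on the outer IP header is propagated onto the inner IPv4 or IPv6 packet;
 * on transmit, the outer TOS inherits the inner packet's ECN bits via
 * INET_ECN_encapsulate().  The helper name below is hypothetical.
 */
static inline bool example_outer_ce_propagates(const struct iphdr *outer_iph,
					       const struct sk_buff *skb)
{
	/* True exactly when ipgre_ecn_decapsulate() would mark the inner
	 * packet with CE for this outer header and inner protocol.
	 */
	return INET_ECN_is_ce(outer_iph->tos) &&
	       (skb->protocol == htons(ETH_P_IP) ||
		skb->protocol == htons(ETH_P_IPV6));
}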
static int ipgre_rcv(struct sk_buff *skb)
	struct ip_tunnel *tunnel;

	if (!pskb_may_pull(skb, 16))

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		if (flags&(GRE_VERSION|GRE_ROUTING))

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;

			key = *(__be32*)(h + offset);
			seqno = ntohl(*(__be32*)(h + offset));

	gre_proto = *(__be16 *)(h + 2);

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
					  iph->saddr, iph->daddr, key,
		struct net_device_stats *stats = &tunnel->dev->stats;

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb->rtable->fl.iif == 0)
			skb->pkt_type = PACKET_BROADCAST;

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
			tunnel->i_seqno = seqno + 1;

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				stats->rx_length_errors++;

			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

		stats->rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);
		read_unlock(&ipgre_lock);

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	read_unlock(&ipgre_lock);
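/*
 * Illustrative sketch, not part of the original code: the on-the-wire GRE
 * header layout (RFC 2784/2890) assumed by the offset arithmetic in
 * ipgre_rcv() above and ipgre_tunnel_xmit() below.  The struct is for
 * documentation only; the driver works on raw offsets instead.
 */
struct ipgre_hdr_example {
	__be16	flags;		/* GRE_CSUM | GRE_KEY | GRE_SEQ | GRE_VERSION ... */
	__be16	protocol;	/* e.g. htons(ETH_P_IP) or htons(ETH_P_TEB) */
	/* Optional fields follow, each present only when its flag is set,
	 * in this order:
	 *	__be16 csum; __be16 reserved;	(GRE_CSUM)
	 *	__be32 key;			(GRE_KEY)
	 *	__be32 seq;			(GRE_SEQ)
	 */
};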
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->dev->stats;
	struct iphdr *old_iph = ip_hdr(skb);
	struct rtable *rt;		/* Route to the other host */
	struct net_device *tdev;	/* Device to other host */
	struct iphdr *iph;		/* Our new IP header */
	unsigned int max_headroom;	/* The extra header space needed */

	if (tunnel->recursion++) {

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		tiph = (struct iphdr*)skb->data;
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;

	if ((dst = tiph->daddr) == 0) {
		if (skb->dst == NULL) {
			stats->tx_fifo_errors++;

		if (skb->protocol == htons(ETH_P_IP)) {
			if ((dst = rt->rt_gateway) == 0)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			struct neighbour *neigh = skb->dst->neighbour;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)

			dst = addr6->s6_addr32[3];

	if (skb->protocol == htons(ETH_P_IP))

		struct flowi fl = { .oif = tunnel->parms.link,
					.saddr = tiph->saddr,
					.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
	tdev = rt->u.dst.dev;

		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);

	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			dst_link_failure(skb);
			tunnel->err_count = 0;

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
			skb_set_owner_w(new_skb, skb->sk);
		old_iph = ip_hdr(skb);

	skb->transport_header = skb->network_header;
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
	 *	Push down and install the IP header.
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->protocol = IPPROTO_GRE;
	iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			*ptr = htonl(tunnel->o_seqno);
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));

	dst_link_failure(skb);
static int ipgre_tunnel_bind_dev(struct net_device *dev)
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

		struct flowi fl = { .oif = tunnel->parms.link,
				    { .daddr = iph->daddr,
				      .tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };

		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

		hlen = tdev->hard_header_len + tdev->needed_headroom;
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
		if (tunnel->parms.o_flags&GRE_KEY)
		if (tunnel->parms.o_flags&GRE_SEQ)
	dev->needed_headroom = addend + hlen;

	mtu -= dev->hard_header_len - addend;

	tunnel->hlen = addend;
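/*
 * Illustrative sketch, not part of the original code: how the encapsulation
 * overhead ("addend") computed by ipgre_tunnel_bind_dev() above breaks down.
 * The helper name is hypothetical; the driver does the same arithmetic inline.
 */
static inline int example_gre_overhead(__be16 o_flags)
{
	int addend = sizeof(struct iphdr) + 4;	/* outer IP header + base GRE header */

	if (o_flags & GRE_CSUM)
		addend += 4;	/* 16-bit checksum + 16 reserved bits */
	if (o_flags & GRE_KEY)
		addend += 4;	/* 32-bit key */
	if (o_flags & GRE_SEQ)
		addend += 4;	/* 32-bit sequence number */

	return addend;	/* 24 for a plain tunnel, up to 36 with all three options */
}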
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
	struct ip_tunnel_parm p;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
			t = ipgre_tunnel_locate(net, &p, 0);
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))

		if (!capable(CAP_NET_ADMIN))

		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))

		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))

			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
		if (!(p.o_flags&GRE_KEY))

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);

			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);

		if (!capable(CAP_NET_ADMIN))

		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
			if (t == netdev_priv(ign->fb_tunnel_dev))
		unregister_netdevice(dev);

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
	struct ip_tunnel *tunnel = netdev_priv(dev);

	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp fec0:6666:6666::193.233.7.65
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;

	 *	Set the source hardware address.

		memcpy(&iph->saddr, saddr, 4);
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr && !ipv4_is_multicast(iph->daddr))

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
	struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);

	memcpy(haddr, &iph->saddr, 4);

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    { .daddr = t->parms.iph.daddr,
				      .saddr = t->parms.iph.saddr,
				      .tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };

		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);

static int ipgre_close(struct net_device *dev)
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);

static void ipgre_tunnel_setup(struct net_device *dev)
	dev->init = ipgre_tunnel_init;
	dev->uninit = ipgre_tunnel_uninit;
	dev->destructor = free_netdev;
	dev->hard_start_xmit = ipgre_tunnel_xmit;
	dev->do_ioctl = ipgre_tunnel_ioctl;
	dev->change_mtu = ipgre_tunnel_change_mtu;

	dev->type = ARPHRD_IPGRE;
	dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags = IFF_NOARP;

	dev->features |= NETIF_F_NETNS_LOCAL;
static int ipgre_tunnel_init(struct net_device *dev)
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		dev->header_ops = &ipgre_header_ops;

static int ipgre_fb_tunnel_init(struct net_device *dev)
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	strcpy(tunnel->parms.name, dev->name);

	iph->protocol = IPPROTO_GRE;

	tunnel->hlen = sizeof(struct iphdr) + 4;

	ign->tunnels_wc[0] = tunnel;
static struct net_protocol ipgre_protocol = {
	.handler = ipgre_rcv,
	.err_handler = ipgre_err,

static void ipgre_destroy_tunnels(struct ipgre_net *ign)
	for (prio = 0; prio < 4; prio++) {
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;
			while ((t = ign->tunnels[prio][h]) != NULL)
				unregister_netdevice(t->dev);

static int ipgre_init_net(struct net *net)
	struct ipgre_net *ign;

	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);

	err = net_assign_generic(net, ipgre_net_id, ign);

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					  ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {

	ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
	dev_net_set(ign->fb_tunnel_dev, net);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))

	free_netdev(ign->fb_tunnel_dev);

static void ipgre_exit_net(struct net *net)
	struct ipgre_net *ign;

	ign = net_generic(net, ipgre_net_id);
	ipgre_destroy_tunnels(ign);

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);

	return ipgre_tunnel_validate(tb, data);

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
	memset(parms, 0, sizeof(*parms));
	parms->iph.protocol = IPPROTO_GRE;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		memcpy(&parms->iph.saddr, nla_data(data[IFLA_GRE_LOCAL]), 4);

	if (data[IFLA_GRE_REMOTE])
		memcpy(&parms->iph.daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
static int ipgre_tap_init(struct net_device *dev)
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

static void ipgre_tap_setup(struct net_device *dev)
	dev->init = ipgre_tap_init;
	dev->uninit = ipgre_tunnel_uninit;
	dev->destructor = free_netdev;
	dev->hard_start_xmit = ipgre_tunnel_xmit;
	dev->change_mtu = ipgre_tunnel_change_mtu;

	dev->features |= NETIF_F_NETNS_LOCAL;

static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);

	err = register_netdevice(dev);

	ipgre_tunnel_link(ign, nt);
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;

	if (dev == ign->fb_tunnel_dev)

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

		unsigned nflags = 0;

		if (ipv4_is_multicast(p.iph.daddr))
			nflags = IFF_BROADCAST;
		else if (p.iph.daddr)
			nflags = IFF_POINTOPOINT;

		if ((dev->flags ^ nflags) &
		    (IFF_POINTOPOINT | IFF_BROADCAST))

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		memcpy(dev->dev_addr, &p.iph.saddr, 4);
		memcpy(dev->broadcast, &p.iph.daddr, 4);
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);

		netdev_state_change(dev);
static size_t ipgre_get_size(const struct net_device *dev)
		/* IFLA_GRE_IFLAGS */
		/* IFLA_GRE_OFLAGS */
		/* IFLA_GRE_LOCAL */
		/* IFLA_GRE_REMOTE */
		/* IFLA_GRE_PMTUDISC */

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT(skb, IFLA_GRE_LOCAL, 4, &p->iph.saddr);
	NLA_PUT(skb, IFLA_GRE_REMOTE, 4, &p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK] = { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
	[IFLA_GRE_IKEY] = { .type = NLA_U32 },
	[IFLA_GRE_OKEY] = { .type = NLA_U32 },
	[IFLA_GRE_LOCAL] = { .len = 4 },
	[IFLA_GRE_REMOTE] = { .len = 4 },
	[IFLA_GRE_TTL] = { .type = NLA_U8 },
	[IFLA_GRE_TOS] = { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.maxtype = IFLA_GRE_MAX,
	.policy = ipgre_policy,
	.priv_size = sizeof(struct ip_tunnel),
	.setup = ipgre_tunnel_setup,
	.validate = ipgre_tunnel_validate,
	.newlink = ipgre_newlink,
	.changelink = ipgre_changelink,
	.get_size = ipgre_get_size,
	.fill_info = ipgre_fill_info,

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.maxtype = IFLA_GRE_MAX,
	.policy = ipgre_policy,
	.priv_size = sizeof(struct ip_tunnel),
	.setup = ipgre_tap_setup,
	.validate = ipgre_tap_validate,
	.newlink = ipgre_newlink,
	.changelink = ipgre_changelink,
	.get_size = ipgre_get_size,
	.fill_info = ipgre_fill_info,
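/*
 * Example usage (assuming a reasonably recent iproute2): the rtnl_link_ops
 * registered above are what make the following work, in addition to the
 * legacy "ip tunnel" ioctl interface.  Addresses and names are placeholders.
 *
 *	ip link add gre1 type gre remote 192.0.2.2 local 192.0.2.1 ttl 64 key 42
 *	ip link add tap1 type gretap remote 192.0.2.2 local 192.0.2.1
 *	ip link set gre1 up
 */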
 *	And now the module's code and kernel interface.
static int __init ipgre_init(void)
	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");

	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
		goto gen_device_failed;

	err = rtnl_link_register(&ipgre_link_ops);
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
		goto tap_ops_failed;

	rtnl_link_unregister(&ipgre_link_ops);
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);

static void __exit ipgre_fini(void)
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS("rtnl-link-gre");
MODULE_ALIAS("rtnl-link-gretap");