• source navigation • diff markup • identifier search • freetext search •
Version: 2.6.32 2.6.33 2.6.34 2.6.35 2.6.36 2.6.37 2.6.38 2.6.39 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9
Architecture: x86 arm avr32 blackfin m68k m68knommu microblaze mips powerpc sh
1 /* 2 * VXLAN: Virtual eXtensible Local Area Network 3 * 4 * Copyright (c) 2012 Vyatta Inc. 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 2 as 8 * published by the Free Software Foundation. 9 * 10 * TODO 11 * - use IANA UDP port number (when defined) 12 * - IPv6 (not in RFC) 13 */ 14 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/kernel.h> 18 #include <linux/types.h> 19 #include <linux/module.h> 20 #include <linux/errno.h> 21 #include <linux/slab.h> 22 #include <linux/skbuff.h> 23 #include <linux/rculist.h> 24 #include <linux/netdevice.h> 25 #include <linux/in.h> 26 #include <linux/ip.h> 27 #include <linux/udp.h> 28 #include <linux/igmp.h> 29 #include <linux/etherdevice.h> 30 #include <linux/if_ether.h> 31 #include <linux/hash.h> 32 #include <linux/ethtool.h> 33 #include <net/arp.h> 34 #include <net/ndisc.h> 35 #include <net/ip.h> 36 #include <net/ipip.h> 37 #include <net/icmp.h> 38 #include <net/udp.h> 39 #include <net/rtnetlink.h> 40 #include <net/route.h> 41 #include <net/dsfield.h> 42 #include <net/inet_ecn.h> 43 #include <net/net_namespace.h> 44 #include <net/netns/generic.h> 45 46 #define VXLAN_VERSION "0.1" 47 48 #define VNI_HASH_BITS 10 49 #define VNI_HASH_SIZE (1<<VNI_HASH_BITS) 50 #define FDB_HASH_BITS 8 51 #define FDB_HASH_SIZE (1<<FDB_HASH_BITS) 52 #define FDB_AGE_DEFAULT 300 /* 5 min */ 53 #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ 54 55 #define VXLAN_N_VID (1u << 24) 56 #define VXLAN_VID_MASK (VXLAN_N_VID - 1) 57 /* IP header + UDP + VXLAN + Ethernet header */ 58 #define VXLAN_HEADROOM (20 + 8 + 8 + 14) 59 60 #define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */ 61 62 /* VXLAN protocol header */ 63 struct vxlanhdr { 64 __be32 vx_flags; 65 __be32 vx_vni; 66 }; 67 68 /* UDP port for VXLAN traffic. */ 69 static unsigned int vxlan_port __read_mostly = 8472; 70 module_param_named(udp_port, vxlan_port, uint, 0444); 71 MODULE_PARM_DESC(udp_port, "Destination UDP port"); 72 73 static bool log_ecn_error = true; 74 module_param(log_ecn_error, bool, 0644); 75 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 76 77 /* per-net private data for this module */ 78 static unsigned int vxlan_net_id; 79 struct vxlan_net { 80 struct socket *sock; /* UDP encap socket */ 81 struct hlist_head vni_list[VNI_HASH_SIZE]; 82 }; 83 84 /* Forwarding table entry */ 85 struct vxlan_fdb { 86 struct hlist_node hlist; /* linked list of entries */ 87 struct rcu_head rcu; 88 unsigned long updated; /* jiffies */ 89 unsigned long used; 90 __be32 remote_ip; 91 u16 state; /* see ndm_state */ 92 u8 eth_addr[ETH_ALEN]; 93 }; 94 95 /* Per-cpu network traffic stats */ 96 struct vxlan_stats { 97 u64 rx_packets; 98 u64 rx_bytes; 99 u64 tx_packets; 100 u64 tx_bytes; 101 struct u64_stats_sync syncp; 102 }; 103 104 /* Pseudo network device */ 105 struct vxlan_dev { 106 struct hlist_node hlist; 107 struct net_device *dev; 108 struct vxlan_stats __percpu *stats; 109 __u32 vni; /* virtual network id */ 110 __be32 gaddr; /* multicast group */ 111 __be32 saddr; /* source address */ 112 unsigned int link; /* link to multicast over */ 113 __u16 port_min; /* source port range */ 114 __u16 port_max; 115 __u8 tos; /* TOS override */ 116 __u8 ttl; 117 u32 flags; /* VXLAN_F_* below */ 118 119 unsigned long age_interval; 120 struct timer_list age_timer; 121 spinlock_t hash_lock; 122 unsigned int addrcnt; 123 unsigned int addrmax; 124 125 struct hlist_head fdb_head[FDB_HASH_SIZE]; 126 }; 127 128 #define VXLAN_F_LEARN 0x01 129 #define VXLAN_F_PROXY 0x02 130 #define VXLAN_F_RSC 0x04 131 #define VXLAN_F_L2MISS 0x08 132 #define VXLAN_F_L3MISS 0x10 133 134 /* salt for hash table */ 135 static u32 vxlan_salt __read_mostly; 136 137 static inline struct hlist_head *vni_head(struct net *net, u32 id) 138 { 139 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 140 141 return &vn->vni_list[hash_32(id, VNI_HASH_BITS)]; 142 } 143 144 /* Look up VNI in a per net namespace table */ 145 static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id) 146 { 147 struct vxlan_dev *vxlan; 148 149 hlist_for_each_entry_rcu(vxlan, vni_head(net, id), hlist) { 150 if (vxlan->vni == id) 151 return vxlan; 152 } 153 154 return NULL; 155 } 156 157 /* Fill in neighbour message in skbuff. */ 158 static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, 159 const struct vxlan_fdb *fdb, 160 u32 portid, u32 seq, int type, unsigned int flags) 161 { 162 unsigned long now = jiffies; 163 struct nda_cacheinfo ci; 164 struct nlmsghdr *nlh; 165 struct ndmsg *ndm; 166 bool send_ip, send_eth; 167 168 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); 169 if (nlh == NULL) 170 return -EMSGSIZE; 171 172 ndm = nlmsg_data(nlh); 173 memset(ndm, 0, sizeof(*ndm)); 174 175 send_eth = send_ip = true; 176 177 if (type == RTM_GETNEIGH) { 178 ndm->ndm_family = AF_INET; 179 send_ip = fdb->remote_ip != 0; 180 send_eth = !is_zero_ether_addr(fdb->eth_addr); 181 } else 182 ndm->ndm_family = AF_BRIDGE; 183 ndm->ndm_state = fdb->state; 184 ndm->ndm_ifindex = vxlan->dev->ifindex; 185 ndm->ndm_flags = NTF_SELF; 186 ndm->ndm_type = NDA_DST; 187 188 if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) 189 goto nla_put_failure; 190 191 if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip)) 192 goto nla_put_failure; 193 194 ci.ndm_used = jiffies_to_clock_t(now - fdb->used); 195 ci.ndm_confirmed = 0; 196 ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); 197 ci.ndm_refcnt = 0; 198 199 if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) 200 goto nla_put_failure; 201 202 return nlmsg_end(skb, nlh); 203 204 nla_put_failure: 205 nlmsg_cancel(skb, nlh); 206 return -EMSGSIZE; 207 } 208 209 static inline size_t vxlan_nlmsg_size(void) 210 { 211 return NLMSG_ALIGN(sizeof(struct ndmsg)) 212 + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ 213 + nla_total_size(sizeof(__be32)) /* NDA_DST */ 214 + nla_total_size(sizeof(struct nda_cacheinfo)); 215 } 216 217 static void vxlan_fdb_notify(struct vxlan_dev *vxlan, 218 const struct vxlan_fdb *fdb, int type) 219 { 220 struct net *net = dev_net(vxlan->dev); 221 struct sk_buff *skb; 222 int err = -ENOBUFS; 223 224 skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); 225 if (skb == NULL) 226 goto errout; 227 228 err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0); 229 if (err < 0) { 230 /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ 231 WARN_ON(err == -EMSGSIZE); 232 kfree_skb(skb); 233 goto errout; 234 } 235 236 rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); 237 return; 238 errout: 239 if (err < 0) 240 rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); 241 } 242 243 static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) 244 { 245 struct vxlan_dev *vxlan = netdev_priv(dev); 246 struct vxlan_fdb f; 247 248 memset(&f, 0, sizeof f); 249 f.state = NUD_STALE; 250 f.remote_ip = ipa; /* goes to NDA_DST */ 251 252 vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); 253 } 254 255 static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) 256 { 257 struct vxlan_fdb f; 258 259 memset(&f, 0, sizeof f); 260 f.state = NUD_STALE; 261 memcpy(f.eth_addr, eth_addr, ETH_ALEN); 262 263 vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); 264 } 265 266 /* Hash Ethernet address */ 267 static u32 eth_hash(const unsigned char *addr) 268 { 269 u64 value = get_unaligned((u64 *)addr); 270 271 /* only want 6 bytes */ 272 #ifdef __BIG_ENDIAN 273 value >>= 16; 274 #else 275 value <<= 16; 276 #endif 277 return hash_64(value, FDB_HASH_BITS); 278 } 279 280 /* Hash chain to use given mac address */ 281 static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, 282 const u8 *mac) 283 { 284 return &vxlan->fdb_head[eth_hash(mac)]; 285 } 286 287 /* Look up Ethernet address in forwarding table */ 288 static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, 289 const u8 *mac) 290 291 { 292 struct hlist_head *head = vxlan_fdb_head(vxlan, mac); 293 struct vxlan_fdb *f; 294 295 hlist_for_each_entry_rcu(f, head, hlist) { 296 if (compare_ether_addr(mac, f->eth_addr) == 0) 297 return f; 298 } 299 300 return NULL; 301 } 302 303 /* Add new entry to forwarding table -- assumes lock held */ 304 static int vxlan_fdb_create(struct vxlan_dev *vxlan, 305 const u8 *mac, __be32 ip, 306 __u16 state, __u16 flags) 307 { 308 struct vxlan_fdb *f; 309 int notify = 0; 310 311 f = vxlan_find_mac(vxlan, mac); 312 if (f) { 313 if (flags & NLM_F_EXCL) { 314 netdev_dbg(vxlan->dev, 315 "lost race to create %pM\n", mac); 316 return -EEXIST; 317 } 318 if (f->state != state) { 319 f->state = state; 320 f->updated = jiffies; 321 notify = 1; 322 } 323 } else { 324 if (!(flags & NLM_F_CREATE)) 325 return -ENOENT; 326 327 if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax) 328 return -ENOSPC; 329 330 netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip); 331 f = kmalloc(sizeof(*f), GFP_ATOMIC); 332 if (!f) 333 return -ENOMEM; 334 335 notify = 1; 336 f->remote_ip = ip; 337 f->state = state; 338 f->updated = f->used = jiffies; 339 memcpy(f->eth_addr, mac, ETH_ALEN); 340 341 ++vxlan->addrcnt; 342 hlist_add_head_rcu(&f->hlist, 343 vxlan_fdb_head(vxlan, mac)); 344 } 345 346 if (notify) 347 vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH); 348 349 return 0; 350 } 351 352 static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) 353 { 354 netdev_dbg(vxlan->dev, 355 "delete %pM\n", f->eth_addr); 356 357 --vxlan->addrcnt; 358 vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH); 359 360 hlist_del_rcu(&f->hlist); 361 kfree_rcu(f, rcu); 362 } 363 364 /* Add static entry (via netlink) */ 365 static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], 366 struct net_device *dev, 367 const unsigned char *addr, u16 flags) 368 { 369 struct vxlan_dev *vxlan = netdev_priv(dev); 370 __be32 ip; 371 int err; 372 373 if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { 374 pr_info("RTM_NEWNEIGH with invalid state %#x\n", 375 ndm->ndm_state); 376 return -EINVAL; 377 } 378 379 if (tb[NDA_DST] == NULL) 380 return -EINVAL; 381 382 if (nla_len(tb[NDA_DST]) != sizeof(__be32)) 383 return -EAFNOSUPPORT; 384 385 ip = nla_get_be32(tb[NDA_DST]); 386 387 spin_lock_bh(&vxlan->hash_lock); 388 err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags); 389 spin_unlock_bh(&vxlan->hash_lock); 390 391 return err; 392 } 393 394 /* Delete entry (via netlink) */ 395 static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], 396 struct net_device *dev, 397 const unsigned char *addr) 398 { 399 struct vxlan_dev *vxlan = netdev_priv(dev); 400 struct vxlan_fdb *f; 401 int err = -ENOENT; 402 403 spin_lock_bh(&vxlan->hash_lock); 404 f = vxlan_find_mac(vxlan, addr); 405 if (f) { 406 vxlan_fdb_destroy(vxlan, f); 407 err = 0; 408 } 409 spin_unlock_bh(&vxlan->hash_lock); 410 411 return err; 412 } 413 414 /* Dump forwarding table */ 415 static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, 416 struct net_device *dev, int idx) 417 { 418 struct vxlan_dev *vxlan = netdev_priv(dev); 419 unsigned int h; 420 421 for (h = 0; h < FDB_HASH_SIZE; ++h) { 422 struct vxlan_fdb *f; 423 int err; 424 425 hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { 426 if (idx < cb->args[0]) 427 goto skip; 428 429 err = vxlan_fdb_info(skb, vxlan, f, 430 NETLINK_CB(cb->skb).portid, 431 cb->nlh->nlmsg_seq, 432 RTM_NEWNEIGH, 433 NLM_F_MULTI); 434 if (err < 0) 435 break; 436 skip: 437 ++idx; 438 } 439 } 440 441 return idx; 442 } 443 444 /* Watch incoming packets to learn mapping between Ethernet address 445 * and Tunnel endpoint. 446 */ 447 static void vxlan_snoop(struct net_device *dev, 448 __be32 src_ip, const u8 *src_mac) 449 { 450 struct vxlan_dev *vxlan = netdev_priv(dev); 451 struct vxlan_fdb *f; 452 int err; 453 454 f = vxlan_find_mac(vxlan, src_mac); 455 if (likely(f)) { 456 f->used = jiffies; 457 if (likely(f->remote_ip == src_ip)) 458 return; 459 460 if (net_ratelimit()) 461 netdev_info(dev, 462 "%pM migrated from %pI4 to %pI4\n", 463 src_mac, &f->remote_ip, &src_ip); 464 465 f->remote_ip = src_ip; 466 f->updated = jiffies; 467 } else { 468 /* learned new entry */ 469 spin_lock(&vxlan->hash_lock); 470 err = vxlan_fdb_create(vxlan, src_mac, src_ip, 471 NUD_REACHABLE, 472 NLM_F_EXCL|NLM_F_CREATE); 473 spin_unlock(&vxlan->hash_lock); 474 } 475 } 476 477 478 /* See if multicast group is already in use by other ID */ 479 static bool vxlan_group_used(struct vxlan_net *vn, 480 const struct vxlan_dev *this) 481 { 482 const struct vxlan_dev *vxlan; 483 unsigned h; 484 485 for (h = 0; h < VNI_HASH_SIZE; ++h) 486 hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist) { 487 if (vxlan == this) 488 continue; 489 490 if (!netif_running(vxlan->dev)) 491 continue; 492 493 if (vxlan->gaddr == this->gaddr) 494 return true; 495 } 496 497 return false; 498 } 499 500 /* kernel equivalent to IP_ADD_MEMBERSHIP */ 501 static int vxlan_join_group(struct net_device *dev) 502 { 503 struct vxlan_dev *vxlan = netdev_priv(dev); 504 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 505 struct sock *sk = vn->sock->sk; 506 struct ip_mreqn mreq = { 507 .imr_multiaddr.s_addr = vxlan->gaddr, 508 .imr_ifindex = vxlan->link, 509 }; 510 int err; 511 512 /* Already a member of group */ 513 if (vxlan_group_used(vn, vxlan)) 514 return 0; 515 516 /* Need to drop RTNL to call multicast join */ 517 rtnl_unlock(); 518 lock_sock(sk); 519 err = ip_mc_join_group(sk, &mreq); 520 release_sock(sk); 521 rtnl_lock(); 522 523 return err; 524 } 525 526 527 /* kernel equivalent to IP_DROP_MEMBERSHIP */ 528 static int vxlan_leave_group(struct net_device *dev) 529 { 530 struct vxlan_dev *vxlan = netdev_priv(dev); 531 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 532 int err = 0; 533 struct sock *sk = vn->sock->sk; 534 struct ip_mreqn mreq = { 535 .imr_multiaddr.s_addr = vxlan->gaddr, 536 .imr_ifindex = vxlan->link, 537 }; 538 539 /* Only leave group when last vxlan is done. */ 540 if (vxlan_group_used(vn, vxlan)) 541 return 0; 542 543 /* Need to drop RTNL to call multicast leave */ 544 rtnl_unlock(); 545 lock_sock(sk); 546 err = ip_mc_leave_group(sk, &mreq); 547 release_sock(sk); 548 rtnl_lock(); 549 550 return err; 551 } 552 553 /* Callback from net/ipv4/udp.c to receive packets */ 554 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) 555 { 556 struct iphdr *oip; 557 struct vxlanhdr *vxh; 558 struct vxlan_dev *vxlan; 559 struct vxlan_stats *stats; 560 __u32 vni; 561 int err; 562 563 /* pop off outer UDP header */ 564 __skb_pull(skb, sizeof(struct udphdr)); 565 566 /* Need Vxlan and inner Ethernet header to be present */ 567 if (!pskb_may_pull(skb, sizeof(struct vxlanhdr))) 568 goto error; 569 570 /* Drop packets with reserved bits set */ 571 vxh = (struct vxlanhdr *) skb->data; 572 if (vxh->vx_flags != htonl(VXLAN_FLAGS) || 573 (vxh->vx_vni & htonl(0xff))) { 574 netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", 575 ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); 576 goto error; 577 } 578 579 __skb_pull(skb, sizeof(struct vxlanhdr)); 580 581 /* Is this VNI defined? */ 582 vni = ntohl(vxh->vx_vni) >> 8; 583 vxlan = vxlan_find_vni(sock_net(sk), vni); 584 if (!vxlan) { 585 netdev_dbg(skb->dev, "unknown vni %d\n", vni); 586 goto drop; 587 } 588 589 if (!pskb_may_pull(skb, ETH_HLEN)) { 590 vxlan->dev->stats.rx_length_errors++; 591 vxlan->dev->stats.rx_errors++; 592 goto drop; 593 } 594 595 skb_reset_mac_header(skb); 596 597 /* Re-examine inner Ethernet packet */ 598 oip = ip_hdr(skb); 599 skb->protocol = eth_type_trans(skb, vxlan->dev); 600 601 /* Ignore packet loops (and multicast echo) */ 602 if (compare_ether_addr(eth_hdr(skb)->h_source, 603 vxlan->dev->dev_addr) == 0) 604 goto drop; 605 606 if (vxlan->flags & VXLAN_F_LEARN) 607 vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source); 608 609 __skb_tunnel_rx(skb, vxlan->dev); 610 skb_reset_network_header(skb); 611 612 /* If the NIC driver gave us an encapsulated packet with 613 * CHECKSUM_UNNECESSARY and Rx checksum feature is enabled, 614 * leave the CHECKSUM_UNNECESSARY, the device checksummed it 615 * for us. Otherwise force the upper layers to verify it. 616 */ 617 if (skb->ip_summed != CHECKSUM_UNNECESSARY || !skb->encapsulation || 618 !(vxlan->dev->features & NETIF_F_RXCSUM)) 619 skb->ip_summed = CHECKSUM_NONE; 620 621 skb->encapsulation = 0; 622 623 err = IP_ECN_decapsulate(oip, skb); 624 if (unlikely(err)) { 625 if (log_ecn_error) 626 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", 627 &oip->saddr, oip->tos); 628 if (err > 1) { 629 ++vxlan->dev->stats.rx_frame_errors; 630 ++vxlan->dev->stats.rx_errors; 631 goto drop; 632 } 633 } 634 635 stats = this_cpu_ptr(vxlan->stats); 636 u64_stats_update_begin(&stats->syncp); 637 stats->rx_packets++; 638 stats->rx_bytes += skb->len; 639 u64_stats_update_end(&stats->syncp); 640 641 netif_rx(skb); 642 643 return 0; 644 error: 645 /* Put UDP header back */ 646 __skb_push(skb, sizeof(struct udphdr)); 647 648 return 1; 649 drop: 650 /* Consume bad packet */ 651 kfree_skb(skb); 652 return 0; 653 } 654 655 static int arp_reduce(struct net_device *dev, struct sk_buff *skb) 656 { 657 struct vxlan_dev *vxlan = netdev_priv(dev); 658 struct arphdr *parp; 659 u8 *arpptr, *sha; 660 __be32 sip, tip; 661 struct neighbour *n; 662 663 if (dev->flags & IFF_NOARP) 664 goto out; 665 666 if (!pskb_may_pull(skb, arp_hdr_len(dev))) { 667 dev->stats.tx_dropped++; 668 goto out; 669 } 670 parp = arp_hdr(skb); 671 672 if ((parp->ar_hrd != htons(ARPHRD_ETHER) && 673 parp->ar_hrd != htons(ARPHRD_IEEE802)) || 674 parp->ar_pro != htons(ETH_P_IP) || 675 parp->ar_op != htons(ARPOP_REQUEST) || 676 parp->ar_hln != dev->addr_len || 677 parp->ar_pln != 4) 678 goto out; 679 arpptr = (u8 *)parp + sizeof(struct arphdr); 680 sha = arpptr; 681 arpptr += dev->addr_len; /* sha */ 682 memcpy(&sip, arpptr, sizeof(sip)); 683 arpptr += sizeof(sip); 684 arpptr += dev->addr_len; /* tha */ 685 memcpy(&tip, arpptr, sizeof(tip)); 686 687 if (ipv4_is_loopback(tip) || 688 ipv4_is_multicast(tip)) 689 goto out; 690 691 n = neigh_lookup(&arp_tbl, &tip, dev); 692 693 if (n) { 694 struct vxlan_dev *vxlan = netdev_priv(dev); 695 struct vxlan_fdb *f; 696 struct sk_buff *reply; 697 698 if (!(n->nud_state & NUD_CONNECTED)) { 699 neigh_release(n); 700 goto out; 701 } 702 703 f = vxlan_find_mac(vxlan, n->ha); 704 if (f && f->remote_ip == 0) { 705 /* bridge-local neighbor */ 706 neigh_release(n); 707 goto out; 708 } 709 710 reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, 711 n->ha, sha); 712 713 neigh_release(n); 714 715 skb_reset_mac_header(reply); 716 __skb_pull(reply, skb_network_offset(reply)); 717 reply->ip_summed = CHECKSUM_UNNECESSARY; 718 reply->pkt_type = PACKET_HOST; 719 720 if (netif_rx_ni(reply) == NET_RX_DROP) 721 dev->stats.rx_dropped++; 722 } else if (vxlan->flags & VXLAN_F_L3MISS) 723 vxlan_ip_miss(dev, tip); 724 out: 725 consume_skb(skb); 726 return NETDEV_TX_OK; 727 } 728 729 static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) 730 { 731 struct vxlan_dev *vxlan = netdev_priv(dev); 732 struct neighbour *n; 733 struct iphdr *pip; 734 735 if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) 736 return false; 737 738 n = NULL; 739 switch (ntohs(eth_hdr(skb)->h_proto)) { 740 case ETH_P_IP: 741 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 742 return false; 743 pip = ip_hdr(skb); 744 n = neigh_lookup(&arp_tbl, &pip->daddr, dev); 745 break; 746 default: 747 return false; 748 } 749 750 if (n) { 751 bool diff; 752 753 diff = compare_ether_addr(eth_hdr(skb)->h_dest, n->ha) != 0; 754 if (diff) { 755 memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 756 dev->addr_len); 757 memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len); 758 } 759 neigh_release(n); 760 return diff; 761 } else if (vxlan->flags & VXLAN_F_L3MISS) 762 vxlan_ip_miss(dev, pip->daddr); 763 return false; 764 } 765 766 /* Extract dsfield from inner protocol */ 767 static inline u8 vxlan_get_dsfield(const struct iphdr *iph, 768 const struct sk_buff *skb) 769 { 770 if (skb->protocol == htons(ETH_P_IP)) 771 return iph->tos; 772 else if (skb->protocol == htons(ETH_P_IPV6)) 773 return ipv6_get_dsfield((const struct ipv6hdr *)iph); 774 else 775 return 0; 776 } 777 778 /* Propogate ECN bits out */ 779 static inline u8 vxlan_ecn_encap(u8 tos, 780 const struct iphdr *iph, 781 const struct sk_buff *skb) 782 { 783 u8 inner = vxlan_get_dsfield(iph, skb); 784 785 return INET_ECN_encapsulate(tos, inner); 786 } 787 788 static void vxlan_sock_free(struct sk_buff *skb) 789 { 790 sock_put(skb->sk); 791 } 792 793 /* On transmit, associate with the tunnel socket */ 794 static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) 795 { 796 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 797 struct sock *sk = vn->sock->sk; 798 799 skb_orphan(skb); 800 sock_hold(sk); 801 skb->sk = sk; 802 skb->destructor = vxlan_sock_free; 803 } 804 805 /* Compute source port for outgoing packet 806 * first choice to use L4 flow hash since it will spread 807 * better and maybe available from hardware 808 * secondary choice is to use jhash on the Ethernet header 809 */ 810 static u16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb) 811 { 812 unsigned int range = (vxlan->port_max - vxlan->port_min) + 1; 813 u32 hash; 814 815 hash = skb_get_rxhash(skb); 816 if (!hash) 817 hash = jhash(skb->data, 2 * ETH_ALEN, 818 (__force u32) skb->protocol); 819 820 return (((u64) hash * range) >> 32) + vxlan->port_min; 821 } 822 823 /* Transmit local packets over Vxlan 824 * 825 * Outer IP header inherits ECN and DF from inner header. 826 * Outer UDP destination is the VXLAN assigned port. 827 * source port is based on hash of flow 828 */ 829 static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) 830 { 831 struct vxlan_dev *vxlan = netdev_priv(dev); 832 struct rtable *rt; 833 const struct iphdr *old_iph; 834 struct ethhdr *eth; 835 struct iphdr *iph; 836 struct vxlanhdr *vxh; 837 struct udphdr *uh; 838 struct flowi4 fl4; 839 unsigned int pkt_len = skb->len; 840 __be32 dst; 841 __u16 src_port; 842 __be16 df = 0; 843 __u8 tos, ttl; 844 int err; 845 bool did_rsc = false; 846 const struct vxlan_fdb *f; 847 848 skb_reset_mac_header(skb); 849 eth = eth_hdr(skb); 850 851 if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP) 852 return arp_reduce(dev, skb); 853 else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP) 854 did_rsc = route_shortcircuit(dev, skb); 855 856 f = vxlan_find_mac(vxlan, eth->h_dest); 857 if (f == NULL) { 858 did_rsc = false; 859 dst = vxlan->gaddr; 860 if (!dst && (vxlan->flags & VXLAN_F_L2MISS) && 861 !is_multicast_ether_addr(eth->h_dest)) 862 vxlan_fdb_miss(vxlan, eth->h_dest); 863 } else 864 dst = f->remote_ip; 865 866 if (!dst) { 867 if (did_rsc) { 868 __skb_pull(skb, skb_network_offset(skb)); 869 skb->ip_summed = CHECKSUM_NONE; 870 skb->pkt_type = PACKET_HOST; 871 872 /* short-circuited back to local bridge */ 873 if (netif_rx(skb) == NET_RX_SUCCESS) { 874 struct vxlan_stats *stats = 875 this_cpu_ptr(vxlan->stats); 876 877 u64_stats_update_begin(&stats->syncp); 878 stats->tx_packets++; 879 stats->tx_bytes += pkt_len; 880 u64_stats_update_end(&stats->syncp); 881 } else { 882 dev->stats.tx_errors++; 883 dev->stats.tx_aborted_errors++; 884 } 885 return NETDEV_TX_OK; 886 } 887 goto drop; 888 } 889 890 if (!skb->encapsulation) { 891 skb_reset_inner_headers(skb); 892 skb->encapsulation = 1; 893 } 894 895 /* Need space for new headers (invalidates iph ptr) */ 896 if (skb_cow_head(skb, VXLAN_HEADROOM)) 897 goto drop; 898 899 old_iph = ip_hdr(skb); 900 901 ttl = vxlan->ttl; 902 if (!ttl && IN_MULTICAST(ntohl(dst))) 903 ttl = 1; 904 905 tos = vxlan->tos; 906 if (tos == 1) 907 tos = vxlan_get_dsfield(old_iph, skb); 908 909 src_port = vxlan_src_port(vxlan, skb); 910 911 memset(&fl4, 0, sizeof(fl4)); 912 fl4.flowi4_oif = vxlan->link; 913 fl4.flowi4_tos = RT_TOS(tos); 914 fl4.daddr = dst; 915 fl4.saddr = vxlan->saddr; 916 917 rt = ip_route_output_key(dev_net(dev), &fl4); 918 if (IS_ERR(rt)) { 919 netdev_dbg(dev, "no route to %pI4\n", &dst); 920 dev->stats.tx_carrier_errors++; 921 goto tx_error; 922 } 923 924 if (rt->dst.dev == dev) { 925 netdev_dbg(dev, "circular route to %pI4\n", &dst); 926 ip_rt_put(rt); 927 dev->stats.collisions++; 928 goto tx_error; 929 } 930 931 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 932 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 933 IPSKB_REROUTED); 934 skb_dst_drop(skb); 935 skb_dst_set(skb, &rt->dst); 936 937 vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); 938 vxh->vx_flags = htonl(VXLAN_FLAGS); 939 vxh->vx_vni = htonl(vxlan->vni << 8); 940 941 __skb_push(skb, sizeof(*uh)); 942 skb_reset_transport_header(skb); 943 uh = udp_hdr(skb); 944 945 uh->dest = htons(vxlan_port); 946 uh->source = htons(src_port); 947 948 uh->len = htons(skb->len); 949 uh->check = 0; 950 951 __skb_push(skb, sizeof(*iph)); 952 skb_reset_network_header(skb); 953 iph = ip_hdr(skb); 954 iph->version = 4; 955 iph->ihl = sizeof(struct iphdr) >> 2; 956 iph->frag_off = df; 957 iph->protocol = IPPROTO_UDP; 958 iph->tos = vxlan_ecn_encap(tos, old_iph, skb); 959 iph->daddr = dst; 960 iph->saddr = fl4.saddr; 961 iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); 962 tunnel_ip_select_ident(skb, old_iph, &rt->dst); 963 964 nf_reset(skb); 965 966 vxlan_set_owner(dev, skb); 967 968 /* See iptunnel_xmit() */ 969 if (skb->ip_summed != CHECKSUM_PARTIAL) 970 skb->ip_summed = CHECKSUM_NONE; 971 972 err = ip_local_out(skb); 973 if (likely(net_xmit_eval(err) == 0)) { 974 struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats); 975 976 u64_stats_update_begin(&stats->syncp); 977 stats->tx_packets++; 978 stats->tx_bytes += pkt_len; 979 u64_stats_update_end(&stats->syncp); 980 } else { 981 dev->stats.tx_errors++; 982 dev->stats.tx_aborted_errors++; 983 } 984 return NETDEV_TX_OK; 985 986 drop: 987 dev->stats.tx_dropped++; 988 goto tx_free; 989 990 tx_error: 991 dev->stats.tx_errors++; 992 tx_free: 993 dev_kfree_skb(skb); 994 return NETDEV_TX_OK; 995 } 996 997 /* Walk the forwarding table and purge stale entries */ 998 static void vxlan_cleanup(unsigned long arg) 999 { 1000 struct vxlan_dev *vxlan = (struct vxlan_dev *) arg; 1001 unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; 1002 unsigned int h; 1003 1004 if (!netif_running(vxlan->dev)) 1005 return; 1006 1007 spin_lock_bh(&vxlan->hash_lock); 1008 for (h = 0; h < FDB_HASH_SIZE; ++h) { 1009 struct hlist_node *p, *n; 1010 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 1011 struct vxlan_fdb *f 1012 = container_of(p, struct vxlan_fdb, hlist); 1013 unsigned long timeout; 1014 1015 if (f->state & NUD_PERMANENT) 1016 continue; 1017 1018 timeout = f->used + vxlan->age_interval * HZ; 1019 if (time_before_eq(timeout, jiffies)) { 1020 netdev_dbg(vxlan->dev, 1021 "garbage collect %pM\n", 1022 f->eth_addr); 1023 f->state = NUD_STALE; 1024 vxlan_fdb_destroy(vxlan, f); 1025 } else if (time_before(timeout, next_timer)) 1026 next_timer = timeout; 1027 } 1028 } 1029 spin_unlock_bh(&vxlan->hash_lock); 1030 1031 mod_timer(&vxlan->age_timer, next_timer); 1032 } 1033 1034 /* Setup stats when device is created */ 1035 static int vxlan_init(struct net_device *dev) 1036 { 1037 struct vxlan_dev *vxlan = netdev_priv(dev); 1038 1039 vxlan->stats = alloc_percpu(struct vxlan_stats); 1040 if (!vxlan->stats) 1041 return -ENOMEM; 1042 1043 return 0; 1044 } 1045 1046 /* Start ageing timer and join group when device is brought up */ 1047 static int vxlan_open(struct net_device *dev) 1048 { 1049 struct vxlan_dev *vxlan = netdev_priv(dev); 1050 int err; 1051 1052 if (vxlan->gaddr) { 1053 err = vxlan_join_group(dev); 1054 if (err) 1055 return err; 1056 } 1057 1058 if (vxlan->age_interval) 1059 mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); 1060 1061 return 0; 1062 } 1063 1064 /* Purge the forwarding table */ 1065 static void vxlan_flush(struct vxlan_dev *vxlan) 1066 { 1067 unsigned h; 1068 1069 spin_lock_bh(&vxlan->hash_lock); 1070 for (h = 0; h < FDB_HASH_SIZE; ++h) { 1071 struct hlist_node *p, *n; 1072 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 1073 struct vxlan_fdb *f 1074 = container_of(p, struct vxlan_fdb, hlist); 1075 vxlan_fdb_destroy(vxlan, f); 1076 } 1077 } 1078 spin_unlock_bh(&vxlan->hash_lock); 1079 } 1080 1081 /* Cleanup timer and forwarding table on shutdown */ 1082 static int vxlan_stop(struct net_device *dev) 1083 { 1084 struct vxlan_dev *vxlan = netdev_priv(dev); 1085 1086 if (vxlan->gaddr) 1087 vxlan_leave_group(dev); 1088 1089 del_timer_sync(&vxlan->age_timer); 1090 1091 vxlan_flush(vxlan); 1092 1093 return 0; 1094 } 1095 1096 /* Merge per-cpu statistics */ 1097 static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev, 1098 struct rtnl_link_stats64 *stats) 1099 { 1100 struct vxlan_dev *vxlan = netdev_priv(dev); 1101 struct vxlan_stats tmp, sum = { 0 }; 1102 unsigned int cpu; 1103 1104 for_each_possible_cpu(cpu) { 1105 unsigned int start; 1106 const struct vxlan_stats *stats 1107 = per_cpu_ptr(vxlan->stats, cpu); 1108 1109 do { 1110 start = u64_stats_fetch_begin_bh(&stats->syncp); 1111 memcpy(&tmp, stats, sizeof(tmp)); 1112 } while (u64_stats_fetch_retry_bh(&stats->syncp, start)); 1113 1114 sum.tx_bytes += tmp.tx_bytes; 1115 sum.tx_packets += tmp.tx_packets; 1116 sum.rx_bytes += tmp.rx_bytes; 1117 sum.rx_packets += tmp.rx_packets; 1118 } 1119 1120 stats->tx_bytes = sum.tx_bytes; 1121 stats->tx_packets = sum.tx_packets; 1122 stats->rx_bytes = sum.rx_bytes; 1123 stats->rx_packets = sum.rx_packets; 1124 1125 stats->multicast = dev->stats.multicast; 1126 stats->rx_length_errors = dev->stats.rx_length_errors; 1127 stats->rx_frame_errors = dev->stats.rx_frame_errors; 1128 stats->rx_errors = dev->stats.rx_errors; 1129 1130 stats->tx_dropped = dev->stats.tx_dropped; 1131 stats->tx_carrier_errors = dev->stats.tx_carrier_errors; 1132 stats->tx_aborted_errors = dev->stats.tx_aborted_errors; 1133 stats->collisions = dev->stats.collisions; 1134 stats->tx_errors = dev->stats.tx_errors; 1135 1136 return stats; 1137 } 1138 1139 /* Stub, nothing needs to be done. */ 1140 static void vxlan_set_multicast_list(struct net_device *dev) 1141 { 1142 } 1143 1144 static const struct net_device_ops vxlan_netdev_ops = { 1145 .ndo_init = vxlan_init, 1146 .ndo_open = vxlan_open, 1147 .ndo_stop = vxlan_stop, 1148 .ndo_start_xmit = vxlan_xmit, 1149 .ndo_get_stats64 = vxlan_stats64, 1150 .ndo_set_rx_mode = vxlan_set_multicast_list, 1151 .ndo_change_mtu = eth_change_mtu, 1152 .ndo_validate_addr = eth_validate_addr, 1153 .ndo_set_mac_address = eth_mac_addr, 1154 .ndo_fdb_add = vxlan_fdb_add, 1155 .ndo_fdb_del = vxlan_fdb_delete, 1156 .ndo_fdb_dump = vxlan_fdb_dump, 1157 }; 1158 1159 /* Info for udev, that this is a virtual tunnel endpoint */ 1160 static struct device_type vxlan_type = { 1161 .name = "vxlan", 1162 }; 1163 1164 static void vxlan_free(struct net_device *dev) 1165 { 1166 struct vxlan_dev *vxlan = netdev_priv(dev); 1167 1168 free_percpu(vxlan->stats); 1169 free_netdev(dev); 1170 } 1171 1172 /* Initialize the device structure. */ 1173 static void vxlan_setup(struct net_device *dev) 1174 { 1175 struct vxlan_dev *vxlan = netdev_priv(dev); 1176 unsigned h; 1177 int low, high; 1178 1179 eth_hw_addr_random(dev); 1180 ether_setup(dev); 1181 dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; 1182 1183 dev->netdev_ops = &vxlan_netdev_ops; 1184 dev->destructor = vxlan_free; 1185 SET_NETDEV_DEVTYPE(dev, &vxlan_type); 1186 1187 dev->tx_queue_len = 0; 1188 dev->features |= NETIF_F_LLTX; 1189 dev->features |= NETIF_F_NETNS_LOCAL; 1190 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; 1191 dev->features |= NETIF_F_RXCSUM; 1192 1193 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; 1194 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 1195 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1196 1197 spin_lock_init(&vxlan->hash_lock); 1198 1199 init_timer_deferrable(&vxlan->age_timer); 1200 vxlan->age_timer.function = vxlan_cleanup; 1201 vxlan->age_timer.data = (unsigned long) vxlan; 1202 1203 inet_get_local_port_range(&low, &high); 1204 vxlan->port_min = low; 1205 vxlan->port_max = high; 1206 1207 vxlan->dev = dev; 1208 1209 for (h = 0; h < FDB_HASH_SIZE; ++h) 1210 INIT_HLIST_HEAD(&vxlan->fdb_head[h]); 1211 } 1212 1213 static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { 1214 [IFLA_VXLAN_ID] = { .type = NLA_U32 }, 1215 [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, 1216 [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, 1217 [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, 1218 [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, 1219 [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, 1220 [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, 1221 [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, 1222 [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, 1223 [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, 1224 [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, 1225 [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, 1226 [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, 1227 [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, 1228 }; 1229 1230 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) 1231 { 1232 if (tb[IFLA_ADDRESS]) { 1233 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { 1234 pr_debug("invalid link address (not ethernet)\n"); 1235 return -EINVAL; 1236 } 1237 1238 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { 1239 pr_debug("invalid all zero ethernet address\n"); 1240 return -EADDRNOTAVAIL; 1241 } 1242 } 1243 1244 if (!data) 1245 return -EINVAL; 1246 1247 if (data[IFLA_VXLAN_ID]) { 1248 __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); 1249 if (id >= VXLAN_VID_MASK) 1250 return -ERANGE; 1251 } 1252 1253 if (data[IFLA_VXLAN_GROUP]) { 1254 __be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]); 1255 if (!IN_MULTICAST(ntohl(gaddr))) { 1256 pr_debug("group address is not IPv4 multicast\n"); 1257 return -EADDRNOTAVAIL; 1258 } 1259 } 1260 1261 if (data[IFLA_VXLAN_PORT_RANGE]) { 1262 const struct ifla_vxlan_port_range *p 1263 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 1264 1265 if (ntohs(p->high) < ntohs(p->low)) { 1266 pr_debug("port range %u .. %u not valid\n", 1267 ntohs(p->low), ntohs(p->high)); 1268 return -EINVAL; 1269 } 1270 } 1271 1272 return 0; 1273 } 1274 1275 static void vxlan_get_drvinfo(struct net_device *netdev, 1276 struct ethtool_drvinfo *drvinfo) 1277 { 1278 strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); 1279 strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); 1280 } 1281 1282 static const struct ethtool_ops vxlan_ethtool_ops = { 1283 .get_drvinfo = vxlan_get_drvinfo, 1284 .get_link = ethtool_op_get_link, 1285 }; 1286 1287 static int vxlan_newlink(struct net *net, struct net_device *dev, 1288 struct nlattr *tb[], struct nlattr *data[]) 1289 { 1290 struct vxlan_dev *vxlan = netdev_priv(dev); 1291 __u32 vni; 1292 int err; 1293 1294 if (!data[IFLA_VXLAN_ID]) 1295 return -EINVAL; 1296 1297 vni = nla_get_u32(data[IFLA_VXLAN_ID]); 1298 if (vxlan_find_vni(net, vni)) { 1299 pr_info("duplicate VNI %u\n", vni); 1300 return -EEXIST; 1301 } 1302 vxlan->vni = vni; 1303 1304 if (data[IFLA_VXLAN_GROUP]) 1305 vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]); 1306 1307 if (data[IFLA_VXLAN_LOCAL]) 1308 vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]); 1309 1310 if (data[IFLA_VXLAN_LINK] && 1311 (vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]))) { 1312 struct net_device *lowerdev 1313 = __dev_get_by_index(net, vxlan->link); 1314 1315 if (!lowerdev) { 1316 pr_info("ifindex %d does not exist\n", vxlan->link); 1317 return -ENODEV; 1318 } 1319 1320 if (!tb[IFLA_MTU]) 1321 dev->mtu = lowerdev->mtu - VXLAN_HEADROOM; 1322 1323 /* update header length based on lower device */ 1324 dev->hard_header_len = lowerdev->hard_header_len + 1325 VXLAN_HEADROOM; 1326 } 1327 1328 if (data[IFLA_VXLAN_TOS]) 1329 vxlan->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); 1330 1331 if (data[IFLA_VXLAN_TTL]) 1332 vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); 1333 1334 if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING])) 1335 vxlan->flags |= VXLAN_F_LEARN; 1336 1337 if (data[IFLA_VXLAN_AGEING]) 1338 vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); 1339 else 1340 vxlan->age_interval = FDB_AGE_DEFAULT; 1341 1342 if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY])) 1343 vxlan->flags |= VXLAN_F_PROXY; 1344 1345 if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC])) 1346 vxlan->flags |= VXLAN_F_RSC; 1347 1348 if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS])) 1349 vxlan->flags |= VXLAN_F_L2MISS; 1350 1351 if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS])) 1352 vxlan->flags |= VXLAN_F_L3MISS; 1353 1354 if (data[IFLA_VXLAN_LIMIT]) 1355 vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); 1356 1357 if (data[IFLA_VXLAN_PORT_RANGE]) { 1358 const struct ifla_vxlan_port_range *p 1359 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 1360 vxlan->port_min = ntohs(p->low); 1361 vxlan->port_max = ntohs(p->high); 1362 } 1363 1364 SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops); 1365 1366 err = register_netdevice(dev); 1367 if (!err) 1368 hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni)); 1369 1370 return err; 1371 } 1372 1373 static void vxlan_dellink(struct net_device *dev, struct list_head *head) 1374 { 1375 struct vxlan_dev *vxlan = netdev_priv(dev); 1376 1377 hlist_del_rcu(&vxlan->hlist); 1378 1379 unregister_netdevice_queue(dev, head); 1380 } 1381 1382 static size_t vxlan_get_size(const struct net_device *dev) 1383 { 1384 1385 return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ 1386 nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */ 1387 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ 1388 nla_total_size(sizeof(__be32))+ /* IFLA_VXLAN_LOCAL */ 1389 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ 1390 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ 1391 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ 1392 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ 1393 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ 1394 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ 1395 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ 1396 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ 1397 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ 1398 nla_total_size(sizeof(struct ifla_vxlan_port_range)) + 1399 0; 1400 } 1401 1402 static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) 1403 { 1404 const struct vxlan_dev *vxlan = netdev_priv(dev); 1405 struct ifla_vxlan_port_range ports = { 1406 .low = htons(vxlan->port_min), 1407 .high = htons(vxlan->port_max), 1408 }; 1409 1410 if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni)) 1411 goto nla_put_failure; 1412 1413 if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr)) 1414 goto nla_put_failure; 1415 1416 if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link)) 1417 goto nla_put_failure; 1418 1419 if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr)) 1420 goto nla_put_failure; 1421 1422 if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || 1423 nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || 1424 nla_put_u8(skb, IFLA_VXLAN_LEARNING, 1425 !!(vxlan->flags & VXLAN_F_LEARN)) || 1426 nla_put_u8(skb, IFLA_VXLAN_PROXY, 1427 !!(vxlan->flags & VXLAN_F_PROXY)) || 1428 nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) || 1429 nla_put_u8(skb, IFLA_VXLAN_L2MISS, 1430 !!(vxlan->flags & VXLAN_F_L2MISS)) || 1431 nla_put_u8(skb, IFLA_VXLAN_L3MISS, 1432 !!(vxlan->flags & VXLAN_F_L3MISS)) || 1433 nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || 1434 nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax)) 1435 goto nla_put_failure; 1436 1437 if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) 1438 goto nla_put_failure; 1439 1440 return 0; 1441 1442 nla_put_failure: 1443 return -EMSGSIZE; 1444 } 1445 1446 static struct rtnl_link_ops vxlan_link_ops __read_mostly = { 1447 .kind = "vxlan", 1448 .maxtype = IFLA_VXLAN_MAX, 1449 .policy = vxlan_policy, 1450 .priv_size = sizeof(struct vxlan_dev), 1451 .setup = vxlan_setup, 1452 .validate = vxlan_validate, 1453 .newlink = vxlan_newlink, 1454 .dellink = vxlan_dellink, 1455 .get_size = vxlan_get_size, 1456 .fill_info = vxlan_fill_info, 1457 }; 1458 1459 static __net_init int vxlan_init_net(struct net *net) 1460 { 1461 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 1462 struct sock *sk; 1463 struct sockaddr_in vxlan_addr = { 1464 .sin_family = AF_INET, 1465 .sin_addr.s_addr = htonl(INADDR_ANY), 1466 }; 1467 int rc; 1468 unsigned h; 1469 1470 /* Create UDP socket for encapsulation receive. */ 1471 rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock); 1472 if (rc < 0) { 1473 pr_debug("UDP socket create failed\n"); 1474 return rc; 1475 } 1476 /* Put in proper namespace */ 1477 sk = vn->sock->sk; 1478 sk_change_net(sk, net); 1479 1480 vxlan_addr.sin_port = htons(vxlan_port); 1481 1482 rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr, 1483 sizeof(vxlan_addr)); 1484 if (rc < 0) { 1485 pr_debug("bind for UDP socket %pI4:%u (%d)\n", 1486 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc); 1487 sk_release_kernel(sk); 1488 vn->sock = NULL; 1489 return rc; 1490 } 1491 1492 /* Disable multicast loopback */ 1493 inet_sk(sk)->mc_loop = 0; 1494 1495 /* Mark socket as an encapsulation socket. */ 1496 udp_sk(sk)->encap_type = 1; 1497 udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; 1498 udp_encap_enable(); 1499 1500 for (h = 0; h < VNI_HASH_SIZE; ++h) 1501 INIT_HLIST_HEAD(&vn->vni_list[h]); 1502 1503 return 0; 1504 } 1505 1506 static __net_exit void vxlan_exit_net(struct net *net) 1507 { 1508 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 1509 struct vxlan_dev *vxlan; 1510 unsigned h; 1511 1512 rtnl_lock(); 1513 for (h = 0; h < VNI_HASH_SIZE; ++h) 1514 hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist) 1515 dev_close(vxlan->dev); 1516 rtnl_unlock(); 1517 1518 if (vn->sock) { 1519 sk_release_kernel(vn->sock->sk); 1520 vn->sock = NULL; 1521 } 1522 } 1523 1524 static struct pernet_operations vxlan_net_ops = { 1525 .init = vxlan_init_net, 1526 .exit = vxlan_exit_net, 1527 .id = &vxlan_net_id, 1528 .size = sizeof(struct vxlan_net), 1529 }; 1530 1531 static int __init vxlan_init_module(void) 1532 { 1533 int rc; 1534 1535 get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); 1536 1537 rc = register_pernet_device(&vxlan_net_ops); 1538 if (rc) 1539 goto out1; 1540 1541 rc = rtnl_link_register(&vxlan_link_ops); 1542 if (rc) 1543 goto out2; 1544 1545 return 0; 1546 1547 out2: 1548 unregister_pernet_device(&vxlan_net_ops); 1549 out1: 1550 return rc; 1551 } 1552 module_init(vxlan_init_module); 1553 1554 static void __exit vxlan_cleanup_module(void) 1555 { 1556 rtnl_link_unregister(&vxlan_link_ops); 1557 unregister_pernet_device(&vxlan_net_ops); 1558 } 1559 module_exit(vxlan_cleanup_module); 1560 1561 MODULE_LICENSE("GPL"); 1562 MODULE_VERSION(VXLAN_VERSION); 1563 MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>"); 1564 MODULE_ALIAS_RTNL_LINK("vxlan"); 1565
This page was automatically generated by LXR 0.3.1 (source). • Linux is a registered trademark of Linus Torvalds