/* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */
/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *	Authors:	Ross Biro
 *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *			Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>
#include "net-sysfs.h"

#include <typedefs.h>
#include <bcmdefs.h>

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16? Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *		sure which should go first, but I bet it won't make much
 *		difference if we are running VLANs.  The good news is that
 *		this protocol won't be in the list unless compiled in, so
 *		the average user (w/out VLANs) will not be adversely affected.
 *		--BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * For example usages, see register_netdevice() and unregister_netdevice(),
 * which must be called with the rtnl semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}
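
/*
 * Illustrative sketch (not part of the original file): a pure reader can walk
 * the device list under rcu_read_lock() instead of taking dev_base_lock, as
 * described in the locking comment above.  The function name is hypothetical.
 *
 *	static int count_devices(struct net *net)
 *	{
 *		struct net_device *dev;
 *		int count = 0;
 *
 *		rcu_read_lock();
 *		for_each_netdev_rcu(net, dev)
 *			count++;
 *		rcu_read_unlock();
 *		return count;
 *	}
 */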
/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles the packet
 *	were first on the list, it could not sense that the packet is cloned
 *	and should be copied-on-write; it would change the packet and
 *	subsequent readers would get a broken packet.
 *							--ANK (980803)
 */

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head;
	struct packet_type *pt1;

	spin_lock_bh(&ptype_lock);

	if (pt->type == htons(ETH_P_ALL))
		head = &ptype_all;
	else
		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
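
/*
 * Illustrative sketch (not part of the original file): a protocol module
 * typically registers a static struct packet_type with dev_add_pack() and
 * removes it with dev_remove_pack() on unload.  The handler below is
 * hypothetical.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 *	...
 *	dev_remove_pack(&my_ptype);
 */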
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
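
/*
 * Illustrative sketch (not part of the original file): the refcounted and the
 * RCU lookup variants above are used as follows.  "eth0" is just an example
 * name.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		... use dev, possibly sleeping ...
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "eth0");
 *	if (dev)
 *		... use dev; no sleeping, no use past rcu_read_unlock() ...
 *	rcu_read_unlock();
 */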
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
					unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
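
/*
 * Illustrative sketch (not part of the original file): a driver that does not
 * care about the exact unit number lets dev_alloc_name() pick one from a
 * format string before registering the device.  "dummy%d" and struct my_priv
 * are hypothetical.
 *
 *	struct net_device *dev = alloc_etherdev(sizeof(struct my_priv));
 *	int err;
 *
 *	err = dev_alloc_name(dev, "dummy%d");	becomes e.g. "dummy0"
 *	if (err < 0)
 *		goto fail;
 *	err = register_netdev(dev);
 */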
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load 	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for it while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/*
	 * Tell people we are down
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
	    dev->ethtool_ops->set_flags) {
		u32 flags = dev->ethtool_ops->get_flags(dev);
		if (flags & ETH_FLAG_LRO) {
			flags &= ~ETH_FLAG_LRO;
			dev->ethtool_ops->set_flags(dev, flags);
		}
	}
	WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);


static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */
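
/*
 * Illustrative sketch (not part of the original file): a typical consumer of
 * the netdev notifier chain registers a notifier_block and switches on the
 * event.  In this kernel the void pointer is the struct net_device itself.
 * The callback name is hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *		case NETDEV_DOWN:
 *			... react to dev ...
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_notifier = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_notifier);
 *	...
 *	unregister_netdevice_notifier(&my_notifier);
 */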
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered, all registration and up events are replayed
 *	to the new notifier to allow the device to have a race-free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				break;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
		}
	}

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}

/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	if (atomic_read(&netstamp_needed))
		__net_timestamp(skb);
	else
		skb->tstamp.tv64 = 0;
}

static inline void net_timestamp_check(struct sk_buff *skb)
{
	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
		__net_timestamp(skb);
}

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	skb_orphan(skb);
	nf_reset(skb);

	if (!(dev->flags & IFF_UP) ||
	    (skb->len > (dev->mtu + dev->hard_header_len))) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	skb_set_dev(skb, dev);
	skb->tstamp.tv64 = 0;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;

#ifdef CONFIG_NET_CLS_ACT
	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
		net_timestamp_set(skb);
#else
	net_timestamp_set(skb);
#endif

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			/* skb->nh should be correctly
			 * set by sender, so that the second statement is
			 * just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       ntohs(skb2->protocol),
					       dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			ptype->func(skb2, skb->dev, ptype, skb->dev);
		}
	}
	rcu_read_unlock();
}
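
/*
 * Illustrative sketch (not part of the original file): a virtual device pair
 * (veth-like) can call dev_forward_skb() from its ndo_start_xmit to inject
 * the frame into the peer's receive path.  peer_of() is hypothetical.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = peer_of(dev);
 *		unsigned int len = skb->len;
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
 *			dev->stats.tx_packets++;
 *			dev->stats.tx_bytes += len;
 *		}
 *		return NETDEV_TX_OK;
 *	}
 */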
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 */
void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	unsigned int real_num = dev->real_num_tx_queues;

	if (unlikely(txq > dev->num_tx_queues))
		;
	else if (txq > real_num)
		dev->real_num_tx_queues = txq;
	else if (txq < real_num) {
		dev->real_num_tx_queues = txq;
		qdisc_reset_all_tx_gt(dev, txq);
	}
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);
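
/*
 * Illustrative sketch (not part of the original file): PCI network drivers
 * commonly pair netif_device_detach() with netif_device_attach() (below)
 * around suspend and resume so the stack stops queueing packets while the
 * hardware is powered down.  Function names are hypothetical.
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		... save state, power the hardware down ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		... power the hardware up, restore state ...
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */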
/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached to the system and restart it if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);

static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_NO_CSUM) ||
		((features & NETIF_F_V4_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_V6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)) ||
		((features & NETIF_F_FCOE_CRC) &&
		 protocol == htons(ETH_P_FCOE)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}

/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
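
/*
 * Illustrative sketch (not part of the original file): a driver whose
 * hardware cannot checksum a particular packet can fall back to
 * skb_checksum_help() in its xmit path, mirroring what dev_hard_start_xmit()
 * below does for devices lacking the feature.  my_hw_can_csum() is a
 * hypothetical capability test.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !my_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */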
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
			"ip_summed=%d",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
		       dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Actually, we should eliminate this check as soon as we know that:
 * 1. The IOMMU is present and allows mapping all the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}

struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)

static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}

/**
 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
 *	@skb: buffer to segment
 *
 *	This function segments the given skb and stores the list of segments
 *	in skb->next.
 */
static int dev_gso_segment(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *segs;
	int features = dev->features & ~(illegal_highdma(dev, skb) ?
					 NETIF_F_SG : 0);

	segs = skb_gso_segment(skb, features);

	/* Verifying header integrity only. */
	if (!segs)
		return 0;

	if (IS_ERR(segs))
		return PTR_ERR(segs);

	skb->next = segs;
	DEV_GSO_CB(skb)->destructor = skb->destructor;
	skb->destructor = dev_gso_skb_destructor;

	return 0;
}

/*
 * Try to orphan skb early, right before transmission by the device.
 * We cannot orphan skb if tx timestamp is requested, since
 * drivers need to call skb_tstamp_tx() to send the timestamp.
 */
static inline void skb_orphan_try(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (sk && !skb_tx(skb)->flags) {
		/* skb_tx_hash() won't be able to get sk.
		 * We copy sk_hash into skb->rxhash
		 */
		if (!skb->rxhash)
			skb->rxhash = sk->sk_hash;
		skb_orphan(skb);
	}
}

/*
 * Returns true if either:
 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
 *	2. skb is fragmented and the device does not support SG, or if
 *	   at least one of fragments is in highmem and device does not
 *	   support DMA from it.
 */
static inline int skb_needs_linearize(struct sk_buff *skb,
				      struct net_device *dev)
{
	return skb_is_nonlinear(skb) &&
	       ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
		(skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
					       illegal_highdma(dev, skb))));
}

int BCMFASTPATH_HOST dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
					 struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;

	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		skb_orphan_try(skb);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
			else {
				DEV_GSO_CB(skb)->destructor = skb->destructor;
				skb->destructor = dev_gso_skb_destructor;
				goto out_kfree_gso_skb;
			}
		} else {
			if (skb_needs_linearize(skb, dev) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
1979 */ 1980 if (skb->ip_summed == CHECKSUM_PARTIAL) { 1981 skb_set_transport_header(skb, skb->csum_start - 1982 skb_headroom(skb)); 1983 if (!dev_can_checksum(dev, skb) && 1984 skb_checksum_help(skb)) 1985 goto out_kfree_skb; 1986 } 1987 } 1988 1989 rc = ops->ndo_start_xmit(skb, dev); 1990 if (rc == NETDEV_TX_OK) 1991 txq_trans_update(txq); 1992 return rc; 1993 } 1994 1995gso: 1996 do { 1997 struct sk_buff *nskb = skb->next; 1998 1999 skb->next = nskb->next; 2000 nskb->next = NULL; 2001 2002 /* 2003 * If device doesnt need nskb->dst, release it right now while 2004 * its hot in this cpu cache 2005 */ 2006 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 2007 skb_dst_drop(nskb); 2008 2009 rc = ops->ndo_start_xmit(nskb, dev); 2010 if (unlikely(rc != NETDEV_TX_OK)) { 2011 if (rc & ~NETDEV_TX_MASK) 2012 goto out_kfree_gso_skb; 2013 nskb->next = skb->next; 2014 skb->next = nskb; 2015 return rc; 2016 } 2017 txq_trans_update(txq); 2018 if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) 2019 return NETDEV_TX_BUSY; 2020 } while (skb->next); 2021 2022out_kfree_gso_skb: 2023 if (likely(skb->next == NULL)) 2024 skb->destructor = DEV_GSO_CB(skb)->destructor; 2025out_kfree_skb: 2026 kfree_skb(skb); 2027 return rc; 2028} 2029 2030static u32 hashrnd __read_mostly; 2031 2032u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) 2033{ 2034 u32 hash; 2035 2036 if (skb_rx_queue_recorded(skb)) { 2037 hash = skb_get_rx_queue(skb); 2038 while (unlikely(hash >= dev->real_num_tx_queues)) 2039 hash -= dev->real_num_tx_queues; 2040 return hash; 2041 } 2042 2043 if (skb->sk && skb->sk->sk_hash) 2044 hash = skb->sk->sk_hash; 2045 else 2046 hash = (__force u16) skb->protocol ^ skb->rxhash; 2047 hash = jhash_1word(hash, hashrnd); 2048 2049 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 2050} 2051EXPORT_SYMBOL(skb_tx_hash); 2052 2053static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) 2054{ 2055 if (unlikely(queue_index >= dev->real_num_tx_queues)) { 2056 if (net_ratelimit()) { 2057 pr_warning("%s selects TX queue %d, but " 2058 "real number of TX queues is %d\n", 2059 dev->name, queue_index, dev->real_num_tx_queues); 2060 } 2061 return 0; 2062 } 2063 return queue_index; 2064} 2065 2066static struct netdev_queue *dev_pick_tx(struct net_device *dev, 2067 struct sk_buff *skb) 2068{ 2069 int queue_index; 2070 const struct net_device_ops *ops = dev->netdev_ops; 2071 2072 if (ops->ndo_select_queue) { 2073 queue_index = ops->ndo_select_queue(dev, skb); 2074 queue_index = dev_cap_txqueue(dev, queue_index); 2075 } else { 2076 struct sock *sk = skb->sk; 2077 queue_index = sk_tx_queue_get(sk); 2078 if (queue_index < 0) { 2079 2080 queue_index = 0; 2081 if (dev->real_num_tx_queues > 1) 2082 queue_index = skb_tx_hash(dev, skb); 2083 2084 if (sk) { 2085 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1); 2086 2087 if (dst && skb_dst(skb) == dst) 2088 sk_tx_queue_set(sk, queue_index); 2089 } 2090 } 2091 } 2092 2093 skb_set_queue_mapping(skb, queue_index); 2094 return netdev_get_tx_queue(dev, queue_index); 2095} 2096 2097static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 2098 struct net_device *dev, 2099 struct netdev_queue *txq) 2100{ 2101 spinlock_t *root_lock = qdisc_lock(q); 2102 int rc; 2103 2104 /* 2105 * Heuristic to force contended enqueues to serialize on a 2106 * separate lock before trying to get qdisc main lock. 2107 * This permits __QDISC_STATE_RUNNING owner to get the lock more often 2108 * and dequeue packets faster. 
2109 */ 2110 2111 spin_lock(root_lock); 2112 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 2113 kfree_skb(skb); 2114 rc = NET_XMIT_DROP; 2115 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 2116 qdisc_run_begin(q)) { 2117 /* 2118 * This is a work-conserving queue; there are no old skbs 2119 * waiting to be sent out; and the qdisc is not running - 2120 * xmit the skb directly. 2121 */ 2122 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) 2123 skb_dst_force(skb); 2124 __qdisc_update_bstats(q, skb->len); 2125 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) 2126 __qdisc_run(q); 2127 else 2128 qdisc_run_end(q); 2129 2130 rc = NET_XMIT_SUCCESS; 2131 } else { 2132 skb_dst_force(skb); 2133 rc = qdisc_enqueue_root(skb, q); 2134 if (qdisc_run_begin(q)) 2135 __qdisc_run(q); 2136 } 2137 spin_unlock(root_lock); 2138 2139 return rc; 2140} 2141 2142/** 2143 * dev_queue_xmit - transmit a buffer 2144 * @skb: buffer to transmit 2145 * 2146 * Queue a buffer for transmission to a network device. The caller must 2147 * have set the device and priority and built the buffer before calling 2148 * this function. The function can be called from an interrupt. 2149 * 2150 * A negative errno code is returned on a failure. A success does not 2151 * guarantee the frame will be transmitted as it may be dropped due 2152 * to congestion or traffic shaping. 2153 * 2154 * ----------------------------------------------------------------------------------- 2155 * I notice this method can also return errors from the queue disciplines, 2156 * including NET_XMIT_DROP, which is a positive value. So, errors can also 2157 * be positive. 2158 * 2159 * Regardless of the return value, the skb is consumed, so it is currently 2160 * difficult to retry a send to this method. (You can bump the ref count 2161 * before sending to hold a reference for retry if you are careful.) 2162 * 2163 * When calling this method, interrupts MUST be enabled. This is because 2164 * the BH enable code must have IRQs enabled so that it will not deadlock. 2165 * --BLG 2166 */ 2167int BCMFASTPATH_HOST dev_queue_xmit(struct sk_buff *skb) 2168{ 2169 struct net_device *dev = skb->dev; 2170 struct netdev_queue *txq; 2171 struct Qdisc *q; 2172 int rc = -ENOMEM; 2173 2174 /* Disable soft irqs for various locks below. Also 2175 * stops preemption for RCU. 2176 */ 2177 rcu_read_lock_bh(); 2178 2179 txq = dev_pick_tx(dev, skb); 2180 q = rcu_dereference_bh(txq->qdisc); 2181 2182#ifdef CONFIG_NET_CLS_ACT 2183 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2184#endif 2185#ifdef CONFIG_IP_NF_LFP 2186 if (q->enqueue && !(skb->nfcache & NFC_LFP_ENABLE)) { 2187#else 2188 if (q->enqueue) { 2189#endif 2190 rc = __dev_xmit_skb(skb, q, dev, txq); 2191 goto out; 2192 } 2193 2194 /* The device has no queue. Common case for software devices: 2195 loopback, all the sorts of tunnels... 2196 2197 Really, it is unlikely that netif_tx_lock protection is necessary 2198 here. (f.e. loopback and IP tunnels are clean ignoring statistics 2199 counters.) 2200 However, it is possible, that they rely on protection 2201 made by us here. 2202 2203 Check this and shot the lock. It is not prone from deadlocks. 
2204 Either shot noqueue qdisc, it is even simpler 8) 2205 */ 2206 if (dev->flags & IFF_UP) { 2207 int cpu = smp_processor_id(); /* ok because BHs are off */ 2208 2209 if (txq->xmit_lock_owner != cpu) { 2210 2211 HARD_TX_LOCK(dev, txq, cpu); 2212 2213 if (!netif_tx_queue_stopped(txq)) { 2214 rc = dev_hard_start_xmit(skb, dev, txq); 2215 if (dev_xmit_complete(rc)) { 2216 HARD_TX_UNLOCK(dev, txq); 2217 goto out; 2218 } 2219 } 2220 HARD_TX_UNLOCK(dev, txq); 2221 if (net_ratelimit()) 2222 printk(KERN_CRIT "Virtual device %s asks to " 2223 "queue packet!\n", dev->name); 2224 } else { 2225 /* Recursion is detected! It is possible, 2226 * unfortunately */ 2227 if (net_ratelimit()) 2228 printk(KERN_CRIT "Dead loop on virtual device " 2229 "%s, fix it urgently!\n", dev->name); 2230 } 2231 } 2232 2233 rc = -ENETDOWN; 2234 rcu_read_unlock_bh(); 2235 2236 kfree_skb(skb); 2237 return rc; 2238out: 2239 rcu_read_unlock_bh(); 2240 return rc; 2241} 2242EXPORT_SYMBOL(dev_queue_xmit); 2243 2244 2245/*======================================================================= 2246 Receiver routines 2247 =======================================================================*/ 2248 2249int netdev_max_backlog __read_mostly = 1000; 2250int netdev_tstamp_prequeue __read_mostly = 1; 2251int netdev_budget __read_mostly = 300; 2252int weight_p __read_mostly = 64; /* old backlog weight */ 2253 2254/* Called with irq disabled */ 2255static inline void ____napi_schedule(struct softnet_data *sd, 2256 struct napi_struct *napi) 2257{ 2258 list_add_tail(&napi->poll_list, &sd->poll_list); 2259 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2260} 2261 2262#ifdef CONFIG_RPS 2263 2264/* One global table that all flow-based protocols share. */ 2265struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; 2266EXPORT_SYMBOL(rps_sock_flow_table); 2267 2268/* 2269 * get_rps_cpu is called from netif_receive_skb and returns the target 2270 * CPU from the RPS map of the receiving queue for a given skb. 2271 * rcu_read_lock must be held on entry. 
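 * If the driver did not record a hardware receive hash in skb->rxhash,
 * a flow hash is computed here from the IP addresses and, when present,
 * the transport ports; that hash then indexes the queue's rps_map and,
 * for RFS, the rps_dev_flow_table/rps_sock_flow_table pair.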
2272 */ 2273static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 2274 struct rps_dev_flow **rflowp) 2275{ 2276 struct ipv6hdr *ip6; 2277 struct iphdr *ip; 2278 struct netdev_rx_queue *rxqueue; 2279 struct rps_map *map; 2280 struct rps_dev_flow_table *flow_table; 2281 struct rps_sock_flow_table *sock_flow_table; 2282 int cpu = -1; 2283 u8 ip_proto; 2284 u16 tcpu; 2285 u32 addr1, addr2, ihl; 2286 union { 2287 u32 v32; 2288 u16 v16[2]; 2289 } ports; 2290 2291 if (skb_rx_queue_recorded(skb)) { 2292 u16 index = skb_get_rx_queue(skb); 2293 if (unlikely(index >= dev->num_rx_queues)) { 2294 WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " 2295 "on queue %u, but number of RX queues is %u\n", 2296 dev->name, index, dev->num_rx_queues); 2297 goto done; 2298 } 2299 rxqueue = dev->_rx + index; 2300 } else 2301 rxqueue = dev->_rx; 2302 2303 if (!rxqueue->rps_map && !rxqueue->rps_flow_table) 2304 goto done; 2305 2306 if (skb->rxhash) 2307 goto got_hash; /* Skip hash computation on packet header */ 2308 2309 switch (skb->protocol) { 2310 case __constant_htons(ETH_P_IP): 2311 if (!pskb_may_pull(skb, sizeof(*ip))) 2312 goto done; 2313 2314 ip = (struct iphdr *) skb->data; 2315 ip_proto = ip->protocol; 2316 addr1 = (__force u32) ip->saddr; 2317 addr2 = (__force u32) ip->daddr; 2318 ihl = ip->ihl; 2319 break; 2320 case __constant_htons(ETH_P_IPV6): 2321 if (!pskb_may_pull(skb, sizeof(*ip6))) 2322 goto done; 2323 2324 ip6 = (struct ipv6hdr *) skb->data; 2325 ip_proto = ip6->nexthdr; 2326 addr1 = (__force u32) ip6->saddr.s6_addr32[3]; 2327 addr2 = (__force u32) ip6->daddr.s6_addr32[3]; 2328 ihl = (40 >> 2); 2329 break; 2330 default: 2331 goto done; 2332 } 2333 switch (ip_proto) { 2334 case IPPROTO_TCP: 2335 case IPPROTO_UDP: 2336 case IPPROTO_DCCP: 2337 case IPPROTO_ESP: 2338 case IPPROTO_AH: 2339 case IPPROTO_SCTP: 2340 case IPPROTO_UDPLITE: 2341 if (pskb_may_pull(skb, (ihl * 4) + 4)) { 2342 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4)); 2343 if (ports.v16[1] < ports.v16[0]) 2344 swap(ports.v16[0], ports.v16[1]); 2345 break; 2346 } 2347 default: 2348 ports.v32 = 0; 2349 break; 2350 } 2351 2352 /* get a consistent hash (same value on both flow directions) */ 2353 if (addr2 < addr1) 2354 swap(addr1, addr2); 2355 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd); 2356 if (!skb->rxhash) 2357 skb->rxhash = 1; 2358 2359got_hash: 2360 flow_table = rcu_dereference(rxqueue->rps_flow_table); 2361 sock_flow_table = rcu_dereference(rps_sock_flow_table); 2362 if (flow_table && sock_flow_table) { 2363 u16 next_cpu; 2364 struct rps_dev_flow *rflow; 2365 2366 rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; 2367 tcpu = rflow->cpu; 2368 2369 next_cpu = sock_flow_table->ents[skb->rxhash & 2370 sock_flow_table->mask]; 2371 2372 /* 2373 * If the desired CPU (where last recvmsg was done) is 2374 * different from current CPU (one in the rx-queue flow 2375 * table entry), switch if one of the following holds: 2376 * - Current CPU is unset (equal to RPS_NO_CPU). 2377 * - Current CPU is offline. 2378 * - The current CPU's queue tail has advanced beyond the 2379 * last packet that was enqueued using this table entry. 2380 * This guarantees that all previous packets for the flow 2381 * have been dequeued, thus preserving in order delivery. 
2382 */ 2383 if (unlikely(tcpu != next_cpu) && 2384 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || 2385 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 2386 rflow->last_qtail)) >= 0)) { 2387 tcpu = rflow->cpu = next_cpu; 2388 if (tcpu != RPS_NO_CPU) 2389 rflow->last_qtail = per_cpu(softnet_data, 2390 tcpu).input_queue_head; 2391 } 2392 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { 2393 *rflowp = rflow; 2394 cpu = tcpu; 2395 goto done; 2396 } 2397 } 2398 2399 map = rcu_dereference(rxqueue->rps_map); 2400 if (map) { 2401 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; 2402 2403 if (cpu_online(tcpu)) { 2404 cpu = tcpu; 2405 goto done; 2406 } 2407 } 2408 2409done: 2410 return cpu; 2411} 2412 2413/* Called from hardirq (IPI) context */ 2414static void rps_trigger_softirq(void *data) 2415{ 2416 struct softnet_data *sd = data; 2417 2418 ____napi_schedule(sd, &sd->backlog); 2419 sd->received_rps++; 2420} 2421 2422#endif /* CONFIG_RPS */ 2423 2424/* 2425 * Check if this softnet_data structure is another cpu one 2426 * If yes, queue it to our IPI list and return 1 2427 * If no, return 0 2428 */ 2429static int rps_ipi_queued(struct softnet_data *sd) 2430{ 2431#ifdef CONFIG_RPS 2432 struct softnet_data *mysd = &__get_cpu_var(softnet_data); 2433 2434 if (sd != mysd) { 2435 sd->rps_ipi_next = mysd->rps_ipi_list; 2436 mysd->rps_ipi_list = sd; 2437 2438 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2439 return 1; 2440 } 2441#endif /* CONFIG_RPS */ 2442 return 0; 2443} 2444 2445/* 2446 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 2447 * queue (may be a remote CPU queue). 2448 */ 2449static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 2450 unsigned int *qtail) 2451{ 2452 struct softnet_data *sd; 2453 unsigned long flags; 2454 2455 sd = &per_cpu(softnet_data, cpu); 2456 2457 local_irq_save(flags); 2458 2459 rps_lock(sd); 2460 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { 2461 if (skb_queue_len(&sd->input_pkt_queue)) { 2462enqueue: 2463 __skb_queue_tail(&sd->input_pkt_queue, skb); 2464 input_queue_tail_incr_save(sd, qtail); 2465 rps_unlock(sd); 2466 local_irq_restore(flags); 2467 return NET_RX_SUCCESS; 2468 } 2469 2470 /* Schedule NAPI for backlog device 2471 * We can use non atomic operation since we own the queue lock 2472 */ 2473 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 2474 if (!rps_ipi_queued(sd)) 2475 ____napi_schedule(sd, &sd->backlog); 2476 } 2477 goto enqueue; 2478 } 2479 2480 sd->dropped++; 2481 rps_unlock(sd); 2482 2483 local_irq_restore(flags); 2484 2485 kfree_skb(skb); 2486 return NET_RX_DROP; 2487} 2488 2489/** 2490 * netif_rx - post buffer to the network code 2491 * @skb: buffer to post 2492 * 2493 * This function receives a packet from a device driver and queues it for 2494 * the upper (protocol) levels to process. It always succeeds. The buffer 2495 * may be dropped during processing for congestion control or by the 2496 * protocol layers. 
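 *
 * Non-NAPI drivers typically hand each received frame to the stack from
 * their receive interrupt roughly like this (sketch only; "dev" here is
 * the driver's own net_device):
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);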
2497 * 2498 * return values: 2499 * NET_RX_SUCCESS (no congestion) 2500 * NET_RX_DROP (packet was dropped) 2501 * 2502 */ 2503 2504int BCMFASTPATH_HOST netif_rx(struct sk_buff *skb) 2505{ 2506 int ret; 2507 2508 /* if netpoll wants it, pretend we never saw it */ 2509 if (netpoll_rx(skb)) 2510 return NET_RX_DROP; 2511 2512 if (netdev_tstamp_prequeue) 2513 net_timestamp_check(skb); 2514 2515#ifdef CONFIG_RPS 2516 { 2517 struct rps_dev_flow voidflow, *rflow = &voidflow; 2518 int cpu; 2519 2520 preempt_disable(); 2521 rcu_read_lock(); 2522 2523 cpu = get_rps_cpu(skb->dev, skb, &rflow); 2524 if (cpu < 0) 2525 cpu = smp_processor_id(); 2526 2527 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 2528 2529 rcu_read_unlock(); 2530 preempt_enable(); 2531 } 2532#else 2533 { 2534 unsigned int qtail; 2535 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 2536 put_cpu(); 2537 } 2538#endif 2539 return ret; 2540} 2541EXPORT_SYMBOL(netif_rx); 2542 2543int netif_rx_ni(struct sk_buff *skb) 2544{ 2545 int err; 2546 2547 preempt_disable(); 2548 err = netif_rx(skb); 2549 if (local_softirq_pending()) 2550 do_softirq(); 2551 preempt_enable(); 2552 2553 return err; 2554} 2555EXPORT_SYMBOL(netif_rx_ni); 2556 2557static void net_tx_action(struct softirq_action *h) 2558{ 2559 struct softnet_data *sd = &__get_cpu_var(softnet_data); 2560 2561 if (sd->completion_queue) { 2562 struct sk_buff *clist; 2563 2564 local_irq_disable(); 2565 clist = sd->completion_queue; 2566 sd->completion_queue = NULL; 2567 local_irq_enable(); 2568 2569 while (clist) { 2570 struct sk_buff *skb = clist; 2571 clist = clist->next; 2572 2573 WARN_ON(atomic_read(&skb->users)); 2574 __kfree_skb(skb); 2575 } 2576 } 2577 2578 if (sd->output_queue) { 2579 struct Qdisc *head; 2580 2581 local_irq_disable(); 2582 head = sd->output_queue; 2583 sd->output_queue = NULL; 2584 sd->output_queue_tailp = &sd->output_queue; 2585 local_irq_enable(); 2586 2587 while (head) { 2588 struct Qdisc *q = head; 2589 spinlock_t *root_lock; 2590 2591 head = head->next_sched; 2592 2593 root_lock = qdisc_lock(q); 2594 if (spin_trylock(root_lock)) { 2595 smp_mb__before_clear_bit(); 2596 clear_bit(__QDISC_STATE_SCHED, 2597 &q->state); 2598 qdisc_run(q); 2599 spin_unlock(root_lock); 2600 } else { 2601 if (!test_bit(__QDISC_STATE_DEACTIVATED, 2602 &q->state)) { 2603 __netif_reschedule(q); 2604 } else { 2605 smp_mb__before_clear_bit(); 2606 clear_bit(__QDISC_STATE_SCHED, 2607 &q->state); 2608 } 2609 } 2610 } 2611 } 2612} 2613 2614static inline int deliver_skb(struct sk_buff *skb, 2615 struct packet_type *pt_prev, 2616 struct net_device *orig_dev) 2617{ 2618 atomic_inc(&skb->users); 2619 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 2620} 2621 2622#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 2623 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) 2624/* This hook is defined here for ATM LANE */ 2625int (*br_fdb_test_addr_hook)(struct net_device *dev, 2626 unsigned char *addr) __read_mostly; 2627EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 2628#endif 2629 2630#ifdef CONFIG_NET_CLS_ACT 2631/* TODO: Maybe we should just force sch_ingress to be compiled in 2632 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 2633 * a compare and 2 stores extra right now if we dont have it on 2634 * but have CONFIG_NET_CLS_ACT 2635 * NOTE: This doesnt stop any functionality; if you dont have 2636 * the ingress scheduler, you just cant add policies on ingress. 
2637 * 2638 */ 2639static int ing_filter(struct sk_buff *skb) 2640{ 2641 struct net_device *dev = skb->dev; 2642 u32 ttl = G_TC_RTTL(skb->tc_verd); 2643 struct netdev_queue *rxq; 2644 int result = TC_ACT_OK; 2645 struct Qdisc *q; 2646 2647 if (unlikely(MAX_RED_LOOP < ttl++)) { 2648 if (net_ratelimit()) 2649 pr_warning( "Redir loop detected Dropping packet (%d->%d)\n", 2650 skb->skb_iif, dev->ifindex); 2651 return TC_ACT_SHOT; 2652 } 2653 2654 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 2655 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 2656 2657 rxq = &dev->rx_queue; 2658 2659 q = rxq->qdisc; 2660 if (q != &noop_qdisc) { 2661 spin_lock(qdisc_lock(q)); 2662 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) 2663 result = qdisc_enqueue_root(skb, q); 2664 spin_unlock(qdisc_lock(q)); 2665 } 2666 2667 return result; 2668} 2669 2670static inline struct sk_buff *handle_ing(struct sk_buff *skb, 2671 struct packet_type **pt_prev, 2672 int *ret, struct net_device *orig_dev) 2673{ 2674 if (skb->dev->rx_queue.qdisc == &noop_qdisc) 2675 goto out; 2676 2677 if (*pt_prev) { 2678 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2679 *pt_prev = NULL; 2680 } 2681 2682 switch (ing_filter(skb)) { 2683 case TC_ACT_SHOT: 2684 case TC_ACT_STOLEN: 2685 kfree_skb(skb); 2686 return NULL; 2687 } 2688 2689out: 2690 skb->tc_verd = 0; 2691 return skb; 2692} 2693#endif 2694 2695/* 2696 * netif_nit_deliver - deliver received packets to network taps 2697 * @skb: buffer 2698 * 2699 * This function is used to deliver incoming packets to network 2700 * taps. It should be used when the normal netif_receive_skb path 2701 * is bypassed, for example because of VLAN acceleration. 2702 */ 2703void netif_nit_deliver(struct sk_buff *skb) 2704{ 2705 struct packet_type *ptype; 2706 2707 if (list_empty(&ptype_all)) 2708 return; 2709 2710 skb_reset_network_header(skb); 2711 skb_reset_transport_header(skb); 2712 skb->mac_len = skb->network_header - skb->mac_header; 2713 2714 rcu_read_lock(); 2715 list_for_each_entry_rcu(ptype, &ptype_all, list) { 2716 if (!ptype->dev || ptype->dev == skb->dev) 2717 deliver_skb(skb, ptype, skb->dev); 2718 } 2719 rcu_read_unlock(); 2720} 2721 2722/** 2723 * netdev_rx_handler_register - register receive handler 2724 * @dev: device to register a handler for 2725 * @rx_handler: receive handler to register 2726 * @rx_handler_data: data pointer that is used by rx handler 2727 * 2728 * Register a receive hander for a device. This handler will then be 2729 * called from __netif_receive_skb. A negative errno code is returned 2730 * on a failure. 2731 * 2732 * The caller must hold the rtnl_mutex. 2733 */ 2734int netdev_rx_handler_register(struct net_device *dev, 2735 rx_handler_func_t *rx_handler, 2736 void *rx_handler_data) 2737{ 2738 ASSERT_RTNL(); 2739 2740 if (dev->rx_handler) 2741 return -EBUSY; 2742 2743 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 2744 rcu_assign_pointer(dev->rx_handler, rx_handler); 2745 2746 return 0; 2747} 2748EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 2749 2750/** 2751 * netdev_rx_handler_unregister - unregister receive handler 2752 * @dev: device to unregister a handler from 2753 * 2754 * Unregister a receive hander from a device. 2755 * 2756 * The caller must hold the rtnl_mutex. 
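 *
 * Note: the handler is looked up under rcu_read_lock() in
 * __netif_receive_skb(), so callers normally wait for an RCU grace
 * period (e.g. synchronize_net()) after unregistering before freeing
 * anything the handler or rx_handler_data points to.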
2757 */ 2758void netdev_rx_handler_unregister(struct net_device *dev) 2759{ 2760 2761 ASSERT_RTNL(); 2762 rcu_assign_pointer(dev->rx_handler, NULL); 2763 rcu_assign_pointer(dev->rx_handler_data, NULL); 2764} 2765EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 2766 2767static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, 2768 struct net_device *master) 2769{ 2770 if (skb->pkt_type == PACKET_HOST) { 2771 u16 *dest = (u16 *) eth_hdr(skb)->h_dest; 2772 2773 memcpy(dest, master->dev_addr, ETH_ALEN); 2774 } 2775} 2776 2777/* On bonding slaves other than the currently active slave, suppress 2778 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and 2779 * ARP on active-backup slaves with arp_validate enabled. 2780 */ 2781int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) 2782{ 2783 struct net_device *dev = skb->dev; 2784 2785 if (master->priv_flags & IFF_MASTER_ARPMON) 2786 dev->last_rx = jiffies; 2787 2788 if ((master->priv_flags & IFF_MASTER_ALB) && 2789 (master->priv_flags & IFF_BRIDGE_PORT)) { 2790 /* Do address unmangle. The local destination address 2791 * will be always the one master has. Provides the right 2792 * functionality in a bridge. 2793 */ 2794 skb_bond_set_mac_by_master(skb, master); 2795 } 2796 2797 if (dev->priv_flags & IFF_SLAVE_INACTIVE) { 2798 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && 2799 skb->protocol == __cpu_to_be16(ETH_P_ARP)) 2800 return 0; 2801 2802 if (master->priv_flags & IFF_MASTER_ALB) { 2803 if (skb->pkt_type != PACKET_BROADCAST && 2804 skb->pkt_type != PACKET_MULTICAST) 2805 return 0; 2806 } 2807 if (master->priv_flags & IFF_MASTER_8023AD && 2808 skb->protocol == __cpu_to_be16(ETH_P_SLOW)) 2809 return 0; 2810 2811 return 1; 2812 } 2813 return 0; 2814} 2815EXPORT_SYMBOL(__skb_bond_should_drop); 2816 2817static int __netif_receive_skb(struct sk_buff *skb) 2818{ 2819 struct packet_type *ptype, *pt_prev; 2820 rx_handler_func_t *rx_handler; 2821 struct net_device *orig_dev; 2822 struct net_device *master; 2823 struct net_device *null_or_orig; 2824 struct net_device *orig_or_bond; 2825 int ret = NET_RX_DROP; 2826 __be16 type; 2827 2828 if (!netdev_tstamp_prequeue) 2829 net_timestamp_check(skb); 2830 2831 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) 2832 return NET_RX_SUCCESS; 2833 2834 /* if we've gotten here through NAPI, check netpoll */ 2835 if (netpoll_receive_skb(skb)) 2836 return NET_RX_DROP; 2837 2838 if (!skb->skb_iif) 2839 skb->skb_iif = skb->dev->ifindex; 2840 2841 /* 2842 * bonding note: skbs received on inactive slaves should only 2843 * be delivered to pkt handlers that are exact matches. Also 2844 * the deliver_no_wcard flag will be set. If packet handlers 2845 * are sensitive to duplicate packets these skbs will need to 2846 * be dropped at the handler. The vlan accel path may have 2847 * already set the deliver_no_wcard flag. 
2848 */ 2849 null_or_orig = NULL; 2850 orig_dev = skb->dev; 2851 master = ACCESS_ONCE(orig_dev->master); 2852 if (skb->deliver_no_wcard) 2853 null_or_orig = orig_dev; 2854 else if (master) { 2855 if (skb_bond_should_drop(skb, master)) { 2856 skb->deliver_no_wcard = 1; 2857 null_or_orig = orig_dev; /* deliver only exact match */ 2858 } else 2859 skb->dev = master; 2860 } 2861 2862 __this_cpu_inc(softnet_data.processed); 2863 skb_reset_network_header(skb); 2864 skb_reset_transport_header(skb); 2865 skb->mac_len = skb->network_header - skb->mac_header; 2866 2867 pt_prev = NULL; 2868 2869 rcu_read_lock(); 2870 2871#ifdef CONFIG_NET_CLS_ACT 2872 if (skb->tc_verd & TC_NCLS) { 2873 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 2874 goto ncls; 2875 } 2876#endif 2877 2878 list_for_each_entry_rcu(ptype, &ptype_all, list) { 2879 if (ptype->dev == null_or_orig || ptype->dev == skb->dev || 2880 ptype->dev == orig_dev) { 2881 if (pt_prev) 2882 ret = deliver_skb(skb, pt_prev, orig_dev); 2883 pt_prev = ptype; 2884 } 2885 } 2886 2887#ifdef CONFIG_NET_CLS_ACT 2888 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 2889 if (!skb) 2890 goto out; 2891ncls: 2892#endif 2893 2894 /* If we got this far with a hardware accelerated VLAN tag, it means 2895 * that we were put in promiscuous mode but nobody is interested in 2896 * this vid. Drop the packet now to prevent it from getting propagated 2897 * to other parts of the stack that won't know how to deal with packets 2898 * tagged in this manner. 2899 */ 2900 if (unlikely(vlan_tx_tag_present(skb))) 2901 goto bypass; 2902 2903 /* Handle special case of bridge or macvlan */ 2904 rx_handler = rcu_dereference(skb->dev->rx_handler); 2905 if (rx_handler) { 2906 if (pt_prev) { 2907 ret = deliver_skb(skb, pt_prev, orig_dev); 2908 pt_prev = NULL; 2909 } 2910 skb = rx_handler(skb); 2911 if (!skb) 2912 goto out; 2913 } 2914 2915 /* 2916 * Make sure frames received on VLAN interfaces stacked on 2917 * bonding interfaces still make their way to any base bonding 2918 * device that may have registered for a specific ptype. The 2919 * handler may have to adjust skb->dev and orig_dev. 2920 */ 2921 orig_or_bond = orig_dev; 2922 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && 2923 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { 2924 orig_or_bond = vlan_dev_real_dev(skb->dev); 2925 } 2926 2927 type = skb->protocol; 2928 list_for_each_entry_rcu(ptype, 2929 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { 2930 if (ptype->type == type && (ptype->dev == null_or_orig || 2931 ptype->dev == skb->dev || ptype->dev == orig_dev || 2932 ptype->dev == orig_or_bond)) { 2933 if (pt_prev) 2934 ret = deliver_skb(skb, pt_prev, orig_dev); 2935 pt_prev = ptype; 2936 } 2937 } 2938 2939bypass: 2940 if (pt_prev) { 2941 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 2942 } else { 2943 kfree_skb(skb); 2944 /* Jamal, now you will not able to escape explaining 2945 * me how you were going to use this. :-) 2946 */ 2947 ret = NET_RX_DROP; 2948 } 2949 2950out: 2951 rcu_read_unlock(); 2952 return ret; 2953} 2954 2955/** 2956 * netif_receive_skb - process receive buffer from network 2957 * @skb: buffer to process 2958 * 2959 * netif_receive_skb() is the main receive data processing function. 2960 * It always succeeds. The buffer may be dropped during processing 2961 * for congestion control or by the protocol layers. 2962 * 2963 * This function may only be called from softirq context and interrupts 2964 * should be enabled. 
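 *
 * NAPI drivers call this from their ->poll() handler for each completed
 * frame; GRO-capable drivers usually go through napi_gro_receive()
 * instead, which falls back to netif_receive_skb() for traffic it does
 * not aggregate (see napi_skb_finish() below).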
2965 * 2966 * Return values (usually ignored): 2967 * NET_RX_SUCCESS: no congestion 2968 * NET_RX_DROP: packet was dropped 2969 */ 2970int BCMFASTPATH_HOST netif_receive_skb(struct sk_buff *skb) 2971{ 2972 if (netdev_tstamp_prequeue) 2973 net_timestamp_check(skb); 2974 2975 if (skb_defer_rx_timestamp(skb)) 2976 return NET_RX_SUCCESS; 2977 2978#ifdef CONFIG_RPS 2979 { 2980 struct rps_dev_flow voidflow, *rflow = &voidflow; 2981 int cpu, ret; 2982 2983 rcu_read_lock(); 2984 2985 cpu = get_rps_cpu(skb->dev, skb, &rflow); 2986 2987 if (cpu >= 0) { 2988 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 2989 rcu_read_unlock(); 2990 } else { 2991 rcu_read_unlock(); 2992 ret = __netif_receive_skb(skb); 2993 } 2994 2995 return ret; 2996 } 2997#else 2998 return __netif_receive_skb(skb); 2999#endif 3000} 3001EXPORT_SYMBOL(netif_receive_skb); 3002 3003/* Network device is going away, flush any packets still pending 3004 * Called with irqs disabled. 3005 */ 3006static void flush_backlog(void *arg) 3007{ 3008 struct net_device *dev = arg; 3009 struct softnet_data *sd = &__get_cpu_var(softnet_data); 3010 struct sk_buff *skb, *tmp; 3011 3012 rps_lock(sd); 3013 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 3014 if (skb->dev == dev) { 3015 __skb_unlink(skb, &sd->input_pkt_queue); 3016 kfree_skb(skb); 3017 input_queue_head_incr(sd); 3018 } 3019 } 3020 rps_unlock(sd); 3021 3022 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 3023 if (skb->dev == dev) { 3024 __skb_unlink(skb, &sd->process_queue); 3025 kfree_skb(skb); 3026 input_queue_head_incr(sd); 3027 } 3028 } 3029} 3030 3031static int BCMFASTPATH_HOST napi_gro_complete(struct sk_buff *skb) 3032{ 3033 struct packet_type *ptype; 3034 __be16 type = skb->protocol; 3035 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 3036 int err = -ENOENT; 3037 3038 if (NAPI_GRO_CB(skb)->count == 1) { 3039 skb_shinfo(skb)->gso_size = 0; 3040 goto out; 3041 } 3042 3043 rcu_read_lock(); 3044 list_for_each_entry_rcu(ptype, head, list) { 3045 if (ptype->type != type || ptype->dev || !ptype->gro_complete) 3046 continue; 3047 3048 err = ptype->gro_complete(skb); 3049 break; 3050 } 3051 rcu_read_unlock(); 3052 3053 if (err) { 3054 WARN_ON(&ptype->list == head); 3055 kfree_skb(skb); 3056 return NET_RX_SUCCESS; 3057 } 3058 3059out: 3060 return netif_receive_skb(skb); 3061} 3062 3063static void BCMFASTPATH_HOST napi_gro_flush(struct napi_struct *napi) 3064{ 3065 struct sk_buff *skb, *next; 3066 3067 for (skb = napi->gro_list; skb; skb = next) { 3068 next = skb->next; 3069 skb->next = NULL; 3070 napi_gro_complete(skb); 3071 } 3072 3073 napi->gro_count = 0; 3074 napi->gro_list = NULL; 3075} 3076 3077#ifdef CONFIG_INET_GRO 3078void BCMFASTPATH_HOST generic_napi_gro_flush(struct napi_struct *napi) 3079{ 3080 napi_gro_flush(napi); 3081} 3082EXPORT_SYMBOL(generic_napi_gro_flush); 3083#endif /* CONFIG_INET_GRO */ 3084 3085enum gro_result BCMFASTPATH_HOST dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3086{ 3087 struct sk_buff **pp = NULL; 3088 struct packet_type *ptype; 3089 __be16 type = skb->protocol; 3090 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 3091 int same_flow; 3092 int mac_len; 3093 enum gro_result ret; 3094 3095 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) 3096 goto normal; 3097 3098 if (skb_is_gso(skb) || skb_has_frags(skb)) 3099 goto normal; 3100 3101 rcu_read_lock(); 3102 list_for_each_entry_rcu(ptype, head, list) { 3103 if (ptype->type != type || ptype->dev || !ptype->gro_receive) 
3104 continue; 3105 3106 skb_set_network_header(skb, skb_gro_offset(skb)); 3107 mac_len = skb->network_header - skb->mac_header; 3108 skb->mac_len = mac_len; 3109 NAPI_GRO_CB(skb)->same_flow = 0; 3110 NAPI_GRO_CB(skb)->flush = 0; 3111 NAPI_GRO_CB(skb)->free = 0; 3112 3113 pp = ptype->gro_receive(&napi->gro_list, skb); 3114 break; 3115 } 3116 rcu_read_unlock(); 3117 3118 if (&ptype->list == head) 3119 goto normal; 3120 3121 same_flow = NAPI_GRO_CB(skb)->same_flow; 3122 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 3123 3124 if (pp) { 3125 struct sk_buff *nskb = *pp; 3126 3127 *pp = nskb->next; 3128 nskb->next = NULL; 3129 napi_gro_complete(nskb); 3130 napi->gro_count--; 3131 } 3132 3133 if (same_flow) 3134 goto ok; 3135 3136 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) 3137 goto normal; 3138 3139 napi->gro_count++; 3140 NAPI_GRO_CB(skb)->count = 1; 3141 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 3142 skb->next = napi->gro_list; 3143 napi->gro_list = skb; 3144 ret = GRO_HELD; 3145 3146pull: 3147 if (skb_headlen(skb) < skb_gro_offset(skb)) { 3148 int grow = skb_gro_offset(skb) - skb_headlen(skb); 3149 3150 BUG_ON(skb->end - skb->tail < grow); 3151 3152 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 3153 3154 skb->tail += grow; 3155 skb->data_len -= grow; 3156 3157 skb_shinfo(skb)->frags[0].page_offset += grow; 3158 skb_shinfo(skb)->frags[0].size -= grow; 3159 3160 if (unlikely(!skb_shinfo(skb)->frags[0].size)) { 3161 put_page(skb_shinfo(skb)->frags[0].page); 3162 memmove(skb_shinfo(skb)->frags, 3163 skb_shinfo(skb)->frags + 1, 3164 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); 3165 } 3166 } 3167 3168ok: 3169 return ret; 3170 3171normal: 3172 ret = GRO_NORMAL; 3173 goto pull; 3174} 3175EXPORT_SYMBOL(dev_gro_receive); 3176 3177static gro_result_t BCMFASTPATH_HOST 3178__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3179{ 3180 struct sk_buff *p; 3181 3182 for (p = napi->gro_list; p; p = p->next) { 3183 NAPI_GRO_CB(p)->same_flow = 3184 (p->dev == skb->dev) && 3185 !compare_ether_header(skb_mac_header(p), 3186 skb_gro_mac_header(skb)); 3187 NAPI_GRO_CB(p)->flush = 0; 3188 } 3189 3190 return dev_gro_receive(napi, skb); 3191} 3192 3193gro_result_t BCMFASTPATH_HOST napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 3194{ 3195 switch (ret) { 3196 case GRO_NORMAL: 3197 if (netif_receive_skb(skb)) 3198 ret = GRO_DROP; 3199 break; 3200 3201 case GRO_DROP: 3202 case GRO_MERGED_FREE: 3203 kfree_skb(skb); 3204 break; 3205 3206 case GRO_HELD: 3207 case GRO_MERGED: 3208 break; 3209 } 3210 3211 return ret; 3212} 3213EXPORT_SYMBOL(napi_skb_finish); 3214 3215void skb_gro_reset_offset(struct sk_buff *skb) 3216{ 3217 NAPI_GRO_CB(skb)->data_offset = 0; 3218 NAPI_GRO_CB(skb)->frag0 = NULL; 3219 NAPI_GRO_CB(skb)->frag0_len = 0; 3220 3221 if (skb->mac_header == skb->tail && 3222 !PageHighMem(skb_shinfo(skb)->frags[0].page)) { 3223 NAPI_GRO_CB(skb)->frag0 = 3224 page_address(skb_shinfo(skb)->frags[0].page) + 3225 skb_shinfo(skb)->frags[0].page_offset; 3226 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; 3227 } 3228} 3229EXPORT_SYMBOL(skb_gro_reset_offset); 3230 3231gro_result_t BCMFASTPATH_HOST napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3232{ 3233 skb_gro_reset_offset(skb); 3234 3235 return napi_skb_finish(__napi_gro_receive(napi, skb), skb); 3236} 3237EXPORT_SYMBOL(napi_gro_receive); 3238 3239void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 3240{ 3241 __skb_pull(skb, skb_headlen(skb)); 
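	/* All linear data has just been pulled; restore the standard
	 * NET_IP_ALIGN headroom so napi_get_frags() can hand this skb out
	 * again for the next aggregated receive.
	 */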
3242 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); 3243 3244 napi->skb = skb; 3245} 3246EXPORT_SYMBOL(napi_reuse_skb); 3247 3248struct sk_buff *napi_get_frags(struct napi_struct *napi) 3249{ 3250 struct sk_buff *skb = napi->skb; 3251 3252 if (!skb) { 3253 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); 3254 if (skb) 3255 napi->skb = skb; 3256 } 3257 return skb; 3258} 3259EXPORT_SYMBOL(napi_get_frags); 3260 3261gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, 3262 gro_result_t ret) 3263{ 3264 switch (ret) { 3265 case GRO_NORMAL: 3266 case GRO_HELD: 3267 skb->protocol = eth_type_trans(skb, skb->dev); 3268 3269 if (ret == GRO_HELD) 3270 skb_gro_pull(skb, -ETH_HLEN); 3271 else if (netif_receive_skb(skb)) 3272 ret = GRO_DROP; 3273 break; 3274 3275 case GRO_DROP: 3276 case GRO_MERGED_FREE: 3277 napi_reuse_skb(napi, skb); 3278 break; 3279 3280 case GRO_MERGED: 3281 break; 3282 } 3283 3284 return ret; 3285} 3286EXPORT_SYMBOL(napi_frags_finish); 3287 3288struct sk_buff *napi_frags_skb(struct napi_struct *napi) 3289{ 3290 struct sk_buff *skb = napi->skb; 3291 struct ethhdr *eth; 3292 unsigned int hlen; 3293 unsigned int off; 3294 3295 napi->skb = NULL; 3296 3297 skb_reset_mac_header(skb); 3298 skb_gro_reset_offset(skb); 3299 3300 off = skb_gro_offset(skb); 3301 hlen = off + sizeof(*eth); 3302 eth = skb_gro_header_fast(skb, off); 3303 if (skb_gro_header_hard(skb, hlen)) { 3304 eth = skb_gro_header_slow(skb, hlen, off); 3305 if (unlikely(!eth)) { 3306 napi_reuse_skb(napi, skb); 3307 skb = NULL; 3308 goto out; 3309 } 3310 } 3311 3312 skb_gro_pull(skb, sizeof(*eth)); 3313 3314 /* 3315 * This works because the only protocols we care about don't require 3316 * special handling. We'll fix it up properly at the end. 3317 */ 3318 skb->protocol = eth->h_proto; 3319 3320out: 3321 return skb; 3322} 3323EXPORT_SYMBOL(napi_frags_skb); 3324 3325gro_result_t napi_gro_frags(struct napi_struct *napi) 3326{ 3327 struct sk_buff *skb = napi_frags_skb(napi); 3328 3329 if (!skb) 3330 return GRO_DROP; 3331 3332 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); 3333} 3334EXPORT_SYMBOL(napi_gro_frags); 3335 3336/* 3337 * net_rps_action sends any pending IPI's for rps. 3338 * Note: called with local irq disabled, but exits with local irq enabled. 3339 */ 3340static void net_rps_action_and_irq_enable(struct softnet_data *sd) 3341{ 3342#ifdef CONFIG_RPS 3343 struct softnet_data *remsd = sd->rps_ipi_list; 3344 3345 if (remsd) { 3346 sd->rps_ipi_list = NULL; 3347 3348 local_irq_enable(); 3349 3350 /* Send pending IPI's to kick RPS processing on remote cpus. */ 3351 while (remsd) { 3352 struct softnet_data *next = remsd->rps_ipi_next; 3353 3354 if (cpu_online(remsd->cpu)) 3355 __smp_call_function_single(remsd->cpu, 3356 &remsd->csd, 0); 3357 remsd = next; 3358 } 3359 } else 3360#endif 3361 local_irq_enable(); 3362} 3363 3364#ifdef CONFIG_INET_GRO 3365struct napi_struct gro_napi = {0}; 3366atomic_t gro_timer_init = {0}; 3367extern spinlock_t gro_lock; 3368#endif /* CONFIG_INET_GRO */ 3369 3370static int process_backlog(struct napi_struct *napi, int quota) 3371{ 3372 int work = 0; 3373 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 3374 3375#ifdef CONFIG_RPS 3376 /* Check if we have pending ipi, its better to send them now, 3377 * not waiting net_rx_action() end. 
3378 */ 3379 if (sd->rps_ipi_list) { 3380 local_irq_disable(); 3381 net_rps_action_and_irq_enable(sd); 3382 } 3383#endif 3384 napi->weight = weight_p; 3385 local_irq_disable(); 3386 while (work < quota) { 3387 struct sk_buff *skb; 3388 unsigned int qlen; 3389 3390 while ((skb = __skb_dequeue(&sd->process_queue))) { 3391 local_irq_enable(); 3392#ifdef CONFIG_INET_GRO 3393 if (atomic_read(&gro_timer_init)) { 3394 spin_lock_bh(&gro_lock); 3395 napi_gro_receive(&gro_napi, skb); 3396 spin_unlock_bh(&gro_lock); 3397 } 3398 else 3399#endif /* CONFIG_INET_GRO */ 3400 __netif_receive_skb(skb); 3401 local_irq_disable(); 3402 input_queue_head_incr(sd); 3403 if (++work >= quota) { 3404 local_irq_enable(); 3405 return work; 3406 } 3407 } 3408 3409 rps_lock(sd); 3410 qlen = skb_queue_len(&sd->input_pkt_queue); 3411 if (qlen) 3412 skb_queue_splice_tail_init(&sd->input_pkt_queue, 3413 &sd->process_queue); 3414 3415 if (qlen < quota - work) { 3416 /* 3417 * Inline a custom version of __napi_complete(). 3418 * only current cpu owns and manipulates this napi, 3419 * and NAPI_STATE_SCHED is the only possible flag set on backlog. 3420 * we can use a plain write instead of clear_bit(), 3421 * and we dont need an smp_mb() memory barrier. 3422 */ 3423 list_del(&napi->poll_list); 3424 napi->state = 0; 3425 3426 quota = work + qlen; 3427 } 3428 rps_unlock(sd); 3429 } 3430 local_irq_enable(); 3431 3432 return work; 3433} 3434 3435/** 3436 * __napi_schedule - schedule for receive 3437 * @n: entry to schedule 3438 * 3439 * The entry's receive function will be scheduled to run 3440 */ 3441void __napi_schedule(struct napi_struct *n) 3442{ 3443 unsigned long flags; 3444 3445 local_irq_save(flags); 3446 ____napi_schedule(&__get_cpu_var(softnet_data), n); 3447 local_irq_restore(flags); 3448} 3449EXPORT_SYMBOL(__napi_schedule); 3450 3451void __napi_complete(struct napi_struct *n) 3452{ 3453 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 3454 BUG_ON(n->gro_list); 3455 3456 list_del(&n->poll_list); 3457 smp_mb__before_clear_bit(); 3458 clear_bit(NAPI_STATE_SCHED, &n->state); 3459} 3460EXPORT_SYMBOL(__napi_complete); 3461 3462void napi_complete(struct napi_struct *n) 3463{ 3464 unsigned long flags; 3465 3466 /* 3467 * don't let napi dequeue from the cpu poll list 3468 * just in case its running on a different cpu 3469 */ 3470 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 3471 return; 3472 3473 napi_gro_flush(n); 3474 local_irq_save(flags); 3475 __napi_complete(n); 3476 local_irq_restore(flags); 3477} 3478EXPORT_SYMBOL(napi_complete); 3479 3480void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 3481 int (*poll)(struct napi_struct *, int), int weight) 3482{ 3483 INIT_LIST_HEAD(&napi->poll_list); 3484 napi->gro_count = 0; 3485 napi->gro_list = NULL; 3486 napi->skb = NULL; 3487 napi->poll = poll; 3488 napi->weight = weight; 3489 list_add(&napi->dev_list, &dev->napi_list); 3490 napi->dev = dev; 3491#ifdef CONFIG_NETPOLL 3492 spin_lock_init(&napi->poll_lock); 3493 napi->poll_owner = -1; 3494#endif 3495 set_bit(NAPI_STATE_SCHED, &napi->state); 3496} 3497EXPORT_SYMBOL(netif_napi_add); 3498 3499void netif_napi_del(struct napi_struct *napi) 3500{ 3501 struct sk_buff *skb, *next; 3502 3503 list_del_init(&napi->dev_list); 3504 napi_free_frags(napi); 3505 3506 for (skb = napi->gro_list; skb; skb = next) { 3507 next = skb->next; 3508 skb->next = NULL; 3509 kfree_skb(skb); 3510 } 3511 3512 napi->gro_list = NULL; 3513 napi->gro_count = 0; 3514} 3515EXPORT_SYMBOL(netif_napi_del); 3516 3517static void BCMFASTPATH_HOST 
net_rx_action(struct softirq_action *h) 3518{ 3519 struct softnet_data *sd = &__get_cpu_var(softnet_data); 3520 unsigned long time_limit = jiffies + 2; 3521 int budget = netdev_budget; 3522 void *have; 3523 3524 local_irq_disable(); 3525 3526 while (!list_empty(&sd->poll_list)) { 3527 struct napi_struct *n; 3528 int work, weight; 3529 3530 /* If softirq window is exhuasted then punt. 3531 * Allow this to run for 2 jiffies since which will allow 3532 * an average latency of 1.5/HZ. 3533 */ 3534 if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) 3535 goto softnet_break; 3536 3537 local_irq_enable(); 3538 3539 /* Even though interrupts have been re-enabled, this 3540 * access is safe because interrupts can only add new 3541 * entries to the tail of this list, and only ->poll() 3542 * calls can remove this head entry from the list. 3543 */ 3544 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); 3545 3546 have = netpoll_poll_lock(n); 3547 3548 weight = n->weight; 3549 3550 /* This NAPI_STATE_SCHED test is for avoiding a race 3551 * with netpoll's poll_napi(). Only the entity which 3552 * obtains the lock and sees NAPI_STATE_SCHED set will 3553 * actually make the ->poll() call. Therefore we avoid 3554 * accidently calling ->poll() when NAPI is not scheduled. 3555 */ 3556 work = 0; 3557 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 3558 work = n->poll(n, weight); 3559 trace_napi_poll(n); 3560 } 3561 3562 WARN_ON_ONCE(work > weight); 3563 3564 budget -= work; 3565 3566 local_irq_disable(); 3567 3568 /* Drivers must not modify the NAPI state if they 3569 * consume the entire weight. In such cases this code 3570 * still "owns" the NAPI instance and therefore can 3571 * move the instance around on the list at-will. 3572 */ 3573 if (unlikely(work == weight)) { 3574 if (unlikely(napi_disable_pending(n))) { 3575 local_irq_enable(); 3576 napi_complete(n); 3577 local_irq_disable(); 3578 } else 3579 list_move_tail(&n->poll_list, &sd->poll_list); 3580 } 3581 3582 netpoll_poll_unlock(have); 3583 } 3584out: 3585 net_rps_action_and_irq_enable(sd); 3586 3587#ifdef CONFIG_NET_DMA 3588 /* 3589 * There may not be any more sk_buffs coming right now, so push 3590 * any pending DMA copies to hardware 3591 */ 3592 dma_issue_pending_all(); 3593#endif 3594 3595 return; 3596 3597softnet_break: 3598 sd->time_squeeze++; 3599 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3600 goto out; 3601} 3602 3603static gifconf_func_t *gifconf_list[NPROTO]; 3604 3605/** 3606 * register_gifconf - register a SIOCGIF handler 3607 * @family: Address family 3608 * @gifconf: Function handler 3609 * 3610 * Register protocol dependent address dumping routines. The handler 3611 * that is passed must not be freed or reused until it has been replaced 3612 * by another handler. 3613 */ 3614int register_gifconf(unsigned int family, gifconf_func_t *gifconf) 3615{ 3616 if (family >= NPROTO) 3617 return -EINVAL; 3618 gifconf_list[family] = gifconf; 3619 return 0; 3620} 3621EXPORT_SYMBOL(register_gifconf); 3622 3623 3624/* 3625 * Map an interface index to its name (SIOCGIFNAME) 3626 */ 3627 3628/* 3629 * We need this ioctl for efficient implementation of the 3630 * if_indextoname() function required by the IPv6 API. Without 3631 * it, we would have to search all the interfaces to find a 3632 * match. --pb 3633 */ 3634 3635static int dev_ifname(struct net *net, struct ifreq __user *arg) 3636{ 3637 struct net_device *dev; 3638 struct ifreq ifr; 3639 3640 /* 3641 * Fetch the caller's info block. 
3642 */ 3643 3644 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 3645 return -EFAULT; 3646 3647 rcu_read_lock(); 3648 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); 3649 if (!dev) { 3650 rcu_read_unlock(); 3651 return -ENODEV; 3652 } 3653 3654 strcpy(ifr.ifr_name, dev->name); 3655 rcu_read_unlock(); 3656 3657 if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) 3658 return -EFAULT; 3659 return 0; 3660} 3661 3662/* 3663 * Perform a SIOCGIFCONF call. This structure will change 3664 * size eventually, and there is nothing I can do about it. 3665 * Thus we will need a 'compatibility mode'. 3666 */ 3667 3668static int dev_ifconf(struct net *net, char __user *arg) 3669{ 3670 struct ifconf ifc; 3671 struct net_device *dev; 3672 char __user *pos; 3673 int len; 3674 int total; 3675 int i; 3676 3677 /* 3678 * Fetch the caller's info block. 3679 */ 3680 3681 if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) 3682 return -EFAULT; 3683 3684 pos = ifc.ifc_buf; 3685 len = ifc.ifc_len; 3686 3687 /* 3688 * Loop over the interfaces, and write an info block for each. 3689 */ 3690 3691 total = 0; 3692 for_each_netdev(net, dev) { 3693 for (i = 0; i < NPROTO; i++) { 3694 if (gifconf_list[i]) { 3695 int done; 3696 if (!pos) 3697 done = gifconf_list[i](dev, NULL, 0); 3698 else 3699 done = gifconf_list[i](dev, pos + total, 3700 len - total); 3701 if (done < 0) 3702 return -EFAULT; 3703 total += done; 3704 } 3705 } 3706 } 3707 3708 /* 3709 * All done. Write the updated control block back to the caller. 3710 */ 3711 ifc.ifc_len = total; 3712 3713 /* 3714 * Both BSD and Solaris return 0 here, so we do too. 3715 */ 3716 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; 3717} 3718 3719#ifdef CONFIG_PROC_FS 3720/* 3721 * This is invoked by the /proc filesystem handler to display a device 3722 * in detail. 3723 */ 3724void *dev_seq_start(struct seq_file *seq, loff_t *pos) 3725 __acquires(RCU) 3726{ 3727 struct net *net = seq_file_net(seq); 3728 loff_t off; 3729 struct net_device *dev; 3730 3731 rcu_read_lock(); 3732 if (!*pos) 3733 return SEQ_START_TOKEN; 3734 3735 off = 1; 3736 for_each_netdev_rcu(net, dev) 3737 if (off++ == *pos) 3738 return dev; 3739 3740 return NULL; 3741} 3742 3743void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3744{ 3745 struct net_device *dev = (v == SEQ_START_TOKEN) ? 
3746 first_net_device(seq_file_net(seq)) : 3747 next_net_device((struct net_device *)v); 3748 3749 ++*pos; 3750 return rcu_dereference(dev); 3751} 3752 3753void dev_seq_stop(struct seq_file *seq, void *v) 3754 __releases(RCU) 3755{ 3756 rcu_read_unlock(); 3757} 3758 3759static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) 3760{ 3761 struct rtnl_link_stats64 temp; 3762 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); 3763 3764 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " 3765 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", 3766 dev->name, stats->rx_bytes, stats->rx_packets, 3767 stats->rx_errors, 3768 stats->rx_dropped + stats->rx_missed_errors, 3769 stats->rx_fifo_errors, 3770 stats->rx_length_errors + stats->rx_over_errors + 3771 stats->rx_crc_errors + stats->rx_frame_errors, 3772 stats->rx_compressed, stats->multicast, 3773 stats->tx_bytes, stats->tx_packets, 3774 stats->tx_errors, stats->tx_dropped, 3775 stats->tx_fifo_errors, stats->collisions, 3776 stats->tx_carrier_errors + 3777 stats->tx_aborted_errors + 3778 stats->tx_window_errors + 3779 stats->tx_heartbeat_errors, 3780 stats->tx_compressed); 3781} 3782 3783/* 3784 * Called from the PROCfs module. This now uses the new arbitrary sized 3785 * /proc/net interface to create /proc/net/dev 3786 */ 3787static int dev_seq_show(struct seq_file *seq, void *v) 3788{ 3789 if (v == SEQ_START_TOKEN) 3790 seq_puts(seq, "Inter-| Receive " 3791 " | Transmit\n" 3792 " face |bytes packets errs drop fifo frame " 3793 "compressed multicast|bytes packets errs " 3794 "drop fifo colls carrier compressed\n"); 3795 else 3796 dev_seq_printf_stats(seq, v); 3797 return 0; 3798} 3799 3800static struct softnet_data *softnet_get_online(loff_t *pos) 3801{ 3802 struct softnet_data *sd = NULL; 3803 3804 while (*pos < nr_cpu_ids) 3805 if (cpu_online(*pos)) { 3806 sd = &per_cpu(softnet_data, *pos); 3807 break; 3808 } else 3809 ++*pos; 3810 return sd; 3811} 3812 3813static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) 3814{ 3815 return softnet_get_online(pos); 3816} 3817 3818static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3819{ 3820 ++*pos; 3821 return softnet_get_online(pos); 3822} 3823 3824static void softnet_seq_stop(struct seq_file *seq, void *v) 3825{ 3826} 3827 3828static int softnet_seq_show(struct seq_file *seq, void *v) 3829{ 3830 struct softnet_data *sd = v; 3831 3832 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 3833 sd->processed, sd->dropped, sd->time_squeeze, 0, 3834 0, 0, 0, 0, /* was fastroute */ 3835 sd->cpu_collision, sd->received_rps); 3836 return 0; 3837} 3838 3839static const struct seq_operations dev_seq_ops = { 3840 .start = dev_seq_start, 3841 .next = dev_seq_next, 3842 .stop = dev_seq_stop, 3843 .show = dev_seq_show, 3844}; 3845 3846static int dev_seq_open(struct inode *inode, struct file *file) 3847{ 3848 return seq_open_net(inode, file, &dev_seq_ops, 3849 sizeof(struct seq_net_private)); 3850} 3851 3852static const struct file_operations dev_seq_fops = { 3853 .owner = THIS_MODULE, 3854 .open = dev_seq_open, 3855 .read = seq_read, 3856 .llseek = seq_lseek, 3857 .release = seq_release_net, 3858}; 3859 3860static const struct seq_operations softnet_seq_ops = { 3861 .start = softnet_seq_start, 3862 .next = softnet_seq_next, 3863 .stop = softnet_seq_stop, 3864 .show = softnet_seq_show, 3865}; 3866 3867static int softnet_seq_open(struct inode *inode, struct file *file) 3868{ 3869 return seq_open(file, 
&softnet_seq_ops); 3870} 3871 3872static const struct file_operations softnet_seq_fops = { 3873 .owner = THIS_MODULE, 3874 .open = softnet_seq_open, 3875 .read = seq_read, 3876 .llseek = seq_lseek, 3877 .release = seq_release, 3878}; 3879 3880static void *ptype_get_idx(loff_t pos) 3881{ 3882 struct packet_type *pt = NULL; 3883 loff_t i = 0; 3884 int t; 3885 3886 list_for_each_entry_rcu(pt, &ptype_all, list) { 3887 if (i == pos) 3888 return pt; 3889 ++i; 3890 } 3891 3892 for (t = 0; t < PTYPE_HASH_SIZE; t++) { 3893 list_for_each_entry_rcu(pt, &ptype_base[t], list) { 3894 if (i == pos) 3895 return pt; 3896 ++i; 3897 } 3898 } 3899 return NULL; 3900} 3901 3902static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) 3903 __acquires(RCU) 3904{ 3905 rcu_read_lock(); 3906 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; 3907} 3908 3909static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3910{ 3911 struct packet_type *pt; 3912 struct list_head *nxt; 3913 int hash; 3914 3915 ++*pos; 3916 if (v == SEQ_START_TOKEN) 3917 return ptype_get_idx(0); 3918 3919 pt = v; 3920 nxt = pt->list.next; 3921 if (pt->type == htons(ETH_P_ALL)) { 3922 if (nxt != &ptype_all) 3923 goto found; 3924 hash = 0; 3925 nxt = ptype_base[0].next; 3926 } else 3927 hash = ntohs(pt->type) & PTYPE_HASH_MASK; 3928 3929 while (nxt == &ptype_base[hash]) { 3930 if (++hash >= PTYPE_HASH_SIZE) 3931 return NULL; 3932 nxt = ptype_base[hash].next; 3933 } 3934found: 3935 return list_entry(nxt, struct packet_type, list); 3936} 3937 3938static void ptype_seq_stop(struct seq_file *seq, void *v) 3939 __releases(RCU) 3940{ 3941 rcu_read_unlock(); 3942} 3943 3944static int ptype_seq_show(struct seq_file *seq, void *v) 3945{ 3946 struct packet_type *pt = v; 3947 3948 if (v == SEQ_START_TOKEN) 3949 seq_puts(seq, "Type Device Function\n"); 3950 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { 3951 if (pt->type == htons(ETH_P_ALL)) 3952 seq_puts(seq, "ALL "); 3953 else 3954 seq_printf(seq, "%04x", ntohs(pt->type)); 3955 3956 seq_printf(seq, " %-8s %pF\n", 3957 pt->dev ? 
pt->dev->name : "", pt->func); 3958 } 3959 3960 return 0; 3961} 3962 3963static const struct seq_operations ptype_seq_ops = { 3964 .start = ptype_seq_start, 3965 .next = ptype_seq_next, 3966 .stop = ptype_seq_stop, 3967 .show = ptype_seq_show, 3968}; 3969 3970static int ptype_seq_open(struct inode *inode, struct file *file) 3971{ 3972 return seq_open_net(inode, file, &ptype_seq_ops, 3973 sizeof(struct seq_net_private)); 3974} 3975 3976static const struct file_operations ptype_seq_fops = { 3977 .owner = THIS_MODULE, 3978 .open = ptype_seq_open, 3979 .read = seq_read, 3980 .llseek = seq_lseek, 3981 .release = seq_release_net, 3982}; 3983 3984 3985static int __net_init dev_proc_net_init(struct net *net) 3986{ 3987 int rc = -ENOMEM; 3988 3989 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) 3990 goto out; 3991 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) 3992 goto out_dev; 3993 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) 3994 goto out_softnet; 3995 3996 if (wext_proc_init(net)) 3997 goto out_ptype; 3998 rc = 0; 3999out: 4000 return rc; 4001out_ptype: 4002 proc_net_remove(net, "ptype"); 4003out_softnet: 4004 proc_net_remove(net, "softnet_stat"); 4005out_dev: 4006 proc_net_remove(net, "dev"); 4007 goto out; 4008} 4009 4010static void __net_exit dev_proc_net_exit(struct net *net) 4011{ 4012 wext_proc_exit(net); 4013 4014 proc_net_remove(net, "ptype"); 4015 proc_net_remove(net, "softnet_stat"); 4016 proc_net_remove(net, "dev"); 4017} 4018 4019static struct pernet_operations __net_initdata dev_proc_ops = { 4020 .init = dev_proc_net_init, 4021 .exit = dev_proc_net_exit, 4022}; 4023 4024static int __init dev_proc_init(void) 4025{ 4026 return register_pernet_subsys(&dev_proc_ops); 4027} 4028#else 4029#define dev_proc_init() 0 4030#endif /* CONFIG_PROC_FS */ 4031 4032 4033/** 4034 * netdev_set_master - set up master/slave pair 4035 * @slave: slave device 4036 * @master: new master device 4037 * 4038 * Changes the master device of the slave. Pass %NULL to break the 4039 * bonding. The caller must hold the RTNL semaphore. On a failure 4040 * a negative errno code is returned. On success the reference counts 4041 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the 4042 * function returns zero. 4043 */ 4044int netdev_set_master(struct net_device *slave, struct net_device *master) 4045{ 4046 struct net_device *old = slave->master; 4047 4048 ASSERT_RTNL(); 4049 4050 if (master) { 4051 if (old) 4052 return -EBUSY; 4053 dev_hold(master); 4054 } 4055 4056 slave->master = master; 4057 4058 if (old) { 4059 synchronize_net(); 4060 dev_put(old); 4061 } 4062 if (master) 4063 slave->flags |= IFF_SLAVE; 4064 else 4065 slave->flags &= ~IFF_SLAVE; 4066 4067 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); 4068 return 0; 4069} 4070EXPORT_SYMBOL(netdev_set_master); 4071 4072static void dev_change_rx_flags(struct net_device *dev, int flags) 4073{ 4074 const struct net_device_ops *ops = dev->netdev_ops; 4075 4076 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) 4077 ops->ndo_change_rx_flags(dev, flags); 4078} 4079 4080static int __dev_set_promiscuity(struct net_device *dev, int inc) 4081{ 4082 unsigned short old_flags = dev->flags; 4083 uid_t uid; 4084 gid_t gid; 4085 4086 ASSERT_RTNL(); 4087 4088 dev->flags |= IFF_PROMISC; 4089 dev->promiscuity += inc; 4090 if (dev->promiscuity == 0) { 4091 /* 4092 * Avoid overflow. 4093 * If inc causes overflow, untouch promisc and return error. 
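		 * A zero count after a negative inc just means the last user
		 * dropped promiscuity, so IFF_PROMISC is cleared; a zero
		 * count after a positive inc means the unsigned counter
		 * wrapped, so the change is backed out and -EOVERFLOW
		 * returned.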
4094 */ 4095 if (inc < 0) 4096 dev->flags &= ~IFF_PROMISC; 4097 else { 4098 dev->promiscuity -= inc; 4099 printk(KERN_WARNING "%s: promiscuity touches roof, " 4100 "set promiscuity failed, promiscuity feature " 4101 "of device might be broken.\n", dev->name); 4102 return -EOVERFLOW; 4103 } 4104 } 4105 if (dev->flags != old_flags) { 4106 printk(KERN_INFO "device %s %s promiscuous mode\n", 4107 dev->name, (dev->flags & IFF_PROMISC) ? "entered" : 4108 "left"); 4109 if (audit_enabled) { 4110 current_uid_gid(&uid, &gid); 4111 audit_log(current->audit_context, GFP_ATOMIC, 4112 AUDIT_ANOM_PROMISCUOUS, 4113 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 4114 dev->name, (dev->flags & IFF_PROMISC), 4115 (old_flags & IFF_PROMISC), 4116 audit_get_loginuid(current), 4117 uid, gid, 4118 audit_get_sessionid(current)); 4119 } 4120 4121 dev_change_rx_flags(dev, IFF_PROMISC); 4122 } 4123 return 0; 4124} 4125 4126/** 4127 * dev_set_promiscuity - update promiscuity count on a device 4128 * @dev: device 4129 * @inc: modifier 4130 * 4131 * Add or remove promiscuity from a device. While the count in the device 4132 * remains above zero the interface remains promiscuous. Once it hits zero 4133 * the device reverts back to normal filtering operation. A negative inc 4134 * value is used to drop promiscuity on the device. 4135 * Return 0 if successful or a negative errno code on error. 4136 */ 4137int dev_set_promiscuity(struct net_device *dev, int inc) 4138{ 4139 unsigned short old_flags = dev->flags; 4140 int err; 4141 4142 err = __dev_set_promiscuity(dev, inc); 4143 if (err < 0) 4144 return err; 4145 if (dev->flags != old_flags) 4146 dev_set_rx_mode(dev); 4147 return err; 4148} 4149EXPORT_SYMBOL(dev_set_promiscuity); 4150 4151/** 4152 * dev_set_allmulti - update allmulti count on a device 4153 * @dev: device 4154 * @inc: modifier 4155 * 4156 * Add or remove reception of all multicast frames to a device. While the 4157 * count in the device remains above zero the interface remains listening 4158 * to all interfaces. Once it hits zero the device reverts back to normal 4159 * filtering operation. A negative @inc value is used to drop the counter 4160 * when releasing a resource needing all multicasts. 4161 * Return 0 if successful or a negative errno code on error. 4162 */ 4163 4164int dev_set_allmulti(struct net_device *dev, int inc) 4165{ 4166 unsigned short old_flags = dev->flags; 4167 4168 ASSERT_RTNL(); 4169 4170 dev->flags |= IFF_ALLMULTI; 4171 dev->allmulti += inc; 4172 if (dev->allmulti == 0) { 4173 /* 4174 * Avoid overflow. 4175 * If inc causes overflow, untouch allmulti and return error. 4176 */ 4177 if (inc < 0) 4178 dev->flags &= ~IFF_ALLMULTI; 4179 else { 4180 dev->allmulti -= inc; 4181 printk(KERN_WARNING "%s: allmulti touches roof, " 4182 "set allmulti failed, allmulti feature of " 4183 "device might be broken.\n", dev->name); 4184 return -EOVERFLOW; 4185 } 4186 } 4187 if (dev->flags ^ old_flags) { 4188 dev_change_rx_flags(dev, IFF_ALLMULTI); 4189 dev_set_rx_mode(dev); 4190 } 4191 return 0; 4192} 4193EXPORT_SYMBOL(dev_set_allmulti); 4194 4195/* 4196 * Upload unicast and multicast address lists to device and 4197 * configure RX filtering. When the device doesn't support unicast 4198 * filtering it is put in promiscuous mode while unicast addresses 4199 * are present. 4200 */ 4201void __dev_set_rx_mode(struct net_device *dev) 4202{ 4203 const struct net_device_ops *ops = dev->netdev_ops; 4204 4205 /* dev_open will call this function so the list will stay sane. 
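 * Returning early while the device is down or absent is therefore safe:
 * the accumulated unicast/multicast lists are pushed to the driver once
 * the device is brought up.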
*/ 4206 if (!(dev->flags&IFF_UP)) 4207 return; 4208 4209 if (!netif_device_present(dev)) 4210 return; 4211 4212 if (ops->ndo_set_rx_mode) 4213 ops->ndo_set_rx_mode(dev); 4214 else { 4215 /* Unicast addresses changes may only happen under the rtnl, 4216 * therefore calling __dev_set_promiscuity here is safe. 4217 */ 4218 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 4219 __dev_set_promiscuity(dev, 1); 4220 dev->uc_promisc = 1; 4221 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 4222 __dev_set_promiscuity(dev, -1); 4223 dev->uc_promisc = 0; 4224 } 4225 4226 if (ops->ndo_set_multicast_list) 4227 ops->ndo_set_multicast_list(dev); 4228 } 4229} 4230 4231void dev_set_rx_mode(struct net_device *dev) 4232{ 4233 netif_addr_lock_bh(dev); 4234 __dev_set_rx_mode(dev); 4235 netif_addr_unlock_bh(dev); 4236} 4237 4238/** 4239 * dev_get_flags - get flags reported to userspace 4240 * @dev: device 4241 * 4242 * Get the combination of flag bits exported through APIs to userspace. 4243 */ 4244unsigned dev_get_flags(const struct net_device *dev) 4245{ 4246 unsigned flags; 4247 4248 flags = (dev->flags & ~(IFF_PROMISC | 4249 IFF_ALLMULTI | 4250 IFF_RUNNING | 4251 IFF_LOWER_UP | 4252 IFF_DORMANT)) | 4253 (dev->gflags & (IFF_PROMISC | 4254 IFF_ALLMULTI)); 4255 4256 if (netif_running(dev)) { 4257 if (netif_oper_up(dev)) 4258 flags |= IFF_RUNNING; 4259 if (netif_carrier_ok(dev)) 4260 flags |= IFF_LOWER_UP; 4261 if (netif_dormant(dev)) 4262 flags |= IFF_DORMANT; 4263 } 4264 4265 return flags; 4266} 4267EXPORT_SYMBOL(dev_get_flags); 4268 4269int __dev_change_flags(struct net_device *dev, unsigned int flags) 4270{ 4271 int old_flags = dev->flags; 4272 int ret; 4273 4274 ASSERT_RTNL(); 4275 4276 /* 4277 * Set the flags on our device. 4278 */ 4279 4280 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 4281 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 4282 IFF_AUTOMEDIA)) | 4283 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 4284 IFF_ALLMULTI)); 4285 4286 /* 4287 * Load in the correct multicast list now the flags have changed. 4288 */ 4289 4290 if ((old_flags ^ flags) & IFF_MULTICAST) 4291 dev_change_rx_flags(dev, IFF_MULTICAST); 4292 4293 dev_set_rx_mode(dev); 4294 4295 /* 4296 * Have we downed the interface. We handle IFF_UP ourselves 4297 * according to user attempts to set it, rather than blindly 4298 * setting it. 4299 */ 4300 4301 ret = 0; 4302 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ 4303 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 4304 4305 if (!ret) 4306 dev_set_rx_mode(dev); 4307 } 4308 4309 if ((flags ^ dev->gflags) & IFF_PROMISC) { 4310 int inc = (flags & IFF_PROMISC) ? 1 : -1; 4311 4312 dev->gflags ^= IFF_PROMISC; 4313 dev_set_promiscuity(dev, inc); 4314 } 4315 4316 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 4317 is important. Some (broken) drivers set IFF_PROMISC, when 4318 IFF_ALLMULTI is requested not asking us and not reporting. 4319 */ 4320 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 4321 int inc = (flags & IFF_ALLMULTI) ? 
1 : -1; 4322 4323 dev->gflags ^= IFF_ALLMULTI; 4324 dev_set_allmulti(dev, inc); 4325 } 4326 4327 return ret; 4328} 4329 4330void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) 4331{ 4332 unsigned int changes = dev->flags ^ old_flags; 4333 4334 if (changes & IFF_UP) { 4335 if (dev->flags & IFF_UP) 4336 call_netdevice_notifiers(NETDEV_UP, dev); 4337 else 4338 call_netdevice_notifiers(NETDEV_DOWN, dev); 4339 } 4340 4341 if (dev->flags & IFF_UP && 4342 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) 4343 call_netdevice_notifiers(NETDEV_CHANGE, dev); 4344} 4345 4346/** 4347 * dev_change_flags - change device settings 4348 * @dev: device 4349 * @flags: device state flags 4350 * 4351 * Change settings on device based state flags. The flags are 4352 * in the userspace exported format. 4353 */ 4354int dev_change_flags(struct net_device *dev, unsigned flags) 4355{ 4356 int ret, changes; 4357 int old_flags = dev->flags; 4358 4359 ret = __dev_change_flags(dev, flags); 4360 if (ret < 0) 4361 return ret; 4362 4363 changes = old_flags ^ dev->flags; 4364 if (changes) 4365 rtmsg_ifinfo(RTM_NEWLINK, dev, changes); 4366 4367 __dev_notify_flags(dev, old_flags); 4368 return ret; 4369} 4370EXPORT_SYMBOL(dev_change_flags); 4371 4372/** 4373 * dev_set_mtu - Change maximum transfer unit 4374 * @dev: device 4375 * @new_mtu: new transfer unit 4376 * 4377 * Change the maximum transfer size of the network device. 4378 */ 4379int dev_set_mtu(struct net_device *dev, int new_mtu) 4380{ 4381 const struct net_device_ops *ops = dev->netdev_ops; 4382 int err; 4383 4384 if (new_mtu == dev->mtu) 4385 return 0; 4386 4387 /* MTU must be positive. */ 4388 if (new_mtu < 0) 4389 return -EINVAL; 4390 4391 if (!netif_device_present(dev)) 4392 return -ENODEV; 4393 4394 err = 0; 4395 if (ops->ndo_change_mtu) 4396 err = ops->ndo_change_mtu(dev, new_mtu); 4397 else 4398 dev->mtu = new_mtu; 4399 4400 if (!err && dev->flags & IFF_UP) 4401 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 4402 return err; 4403} 4404EXPORT_SYMBOL(dev_set_mtu); 4405 4406/** 4407 * dev_set_mac_address - Change Media Access Control Address 4408 * @dev: device 4409 * @sa: new address 4410 * 4411 * Change the hardware (MAC) address of the device 4412 */ 4413int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 4414{ 4415 const struct net_device_ops *ops = dev->netdev_ops; 4416 int err; 4417 4418 if (!ops->ndo_set_mac_address) 4419 return -EOPNOTSUPP; 4420 if (sa->sa_family != dev->type) 4421 return -EINVAL; 4422 if (!netif_device_present(dev)) 4423 return -ENODEV; 4424 err = ops->ndo_set_mac_address(dev, sa); 4425 if (!err) 4426 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 4427 return err; 4428} 4429EXPORT_SYMBOL(dev_set_mac_address); 4430 4431/* 4432 * Perform the SIOCxIFxxx calls, inside rcu_read_lock() 4433 */ 4434static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) 4435{ 4436 int err; 4437 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); 4438 4439 if (!dev) 4440 return -ENODEV; 4441 4442 switch (cmd) { 4443 case SIOCGIFFLAGS: /* Get interface flags */ 4444 ifr->ifr_flags = (short) dev_get_flags(dev); 4445 return 0; 4446 4447 case SIOCGIFMETRIC: /* Get the metric on the interface 4448 (currently unused) */ 4449 ifr->ifr_metric = 0; 4450 return 0; 4451 4452 case SIOCGIFMTU: /* Get the MTU of a device */ 4453 ifr->ifr_mtu = dev->mtu; 4454 return 0; 4455 4456 case SIOCGIFHWADDR: 4457 if (!dev->addr_len) 4458 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof 
ifr->ifr_hwaddr.sa_data); 4459 else 4460 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, 4461 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 4462 ifr->ifr_hwaddr.sa_family = dev->type; 4463 return 0; 4464 4465 case SIOCGIFSLAVE: 4466 err = -EINVAL; 4467 break; 4468 4469 case SIOCGIFMAP: 4470 ifr->ifr_map.mem_start = dev->mem_start; 4471 ifr->ifr_map.mem_end = dev->mem_end; 4472 ifr->ifr_map.base_addr = dev->base_addr; 4473 ifr->ifr_map.irq = dev->irq; 4474 ifr->ifr_map.dma = dev->dma; 4475 ifr->ifr_map.port = dev->if_port; 4476 return 0; 4477 4478 case SIOCGIFINDEX: 4479 ifr->ifr_ifindex = dev->ifindex; 4480 return 0; 4481 4482 case SIOCGIFTXQLEN: 4483 ifr->ifr_qlen = dev->tx_queue_len; 4484 return 0; 4485 4486 default: 4487 /* dev_ioctl() should ensure this case 4488 * is never reached 4489 */ 4490 WARN_ON(1); 4491 err = -EINVAL; 4492 break; 4493 4494 } 4495 return err; 4496} 4497 4498/* 4499 * Perform the SIOCxIFxxx calls, inside rtnl_lock() 4500 */ 4501static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) 4502{ 4503 int err; 4504 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); 4505 const struct net_device_ops *ops; 4506 4507 if (!dev) 4508 return -ENODEV; 4509 4510 ops = dev->netdev_ops; 4511 4512 switch (cmd) { 4513 case SIOCSIFFLAGS: /* Set interface flags */ 4514 return dev_change_flags(dev, ifr->ifr_flags); 4515 4516 case SIOCSIFMETRIC: /* Set the metric on the interface 4517 (currently unused) */ 4518 return -EOPNOTSUPP; 4519 4520 case SIOCSIFMTU: /* Set the MTU of a device */ 4521 return dev_set_mtu(dev, ifr->ifr_mtu); 4522 4523 case SIOCSIFHWADDR: 4524 return dev_set_mac_address(dev, &ifr->ifr_hwaddr); 4525 4526 case SIOCSIFHWBROADCAST: 4527 if (ifr->ifr_hwaddr.sa_family != dev->type) 4528 return -EINVAL; 4529 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, 4530 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 4531 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 4532 return 0; 4533 4534 case SIOCSIFMAP: 4535 if (ops->ndo_set_config) { 4536 if (!netif_device_present(dev)) 4537 return -ENODEV; 4538 return ops->ndo_set_config(dev, &ifr->ifr_map); 4539 } 4540 return -EOPNOTSUPP; 4541 4542 case SIOCADDMULTI: 4543 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 4544 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 4545 return -EINVAL; 4546 if (!netif_device_present(dev)) 4547 return -ENODEV; 4548 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); 4549 4550 case SIOCDELMULTI: 4551 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 4552 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 4553 return -EINVAL; 4554 if (!netif_device_present(dev)) 4555 return -ENODEV; 4556 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); 4557 4558 case SIOCSIFTXQLEN: 4559 if (ifr->ifr_qlen < 0) 4560 return -EINVAL; 4561 dev->tx_queue_len = ifr->ifr_qlen; 4562 return 0; 4563 4564 case SIOCSIFNAME: 4565 ifr->ifr_newname[IFNAMSIZ-1] = '\0'; 4566 return dev_change_name(dev, ifr->ifr_newname); 4567 4568 /* 4569 * Unknown or private ioctl 4570 */ 4571 default: 4572 if ((cmd >= SIOCDEVPRIVATE && 4573 cmd <= SIOCDEVPRIVATE + 15) || 4574 cmd == SIOCBONDENSLAVE || 4575 cmd == SIOCBONDRELEASE || 4576 cmd == SIOCBONDSETHWADDR || 4577 cmd == SIOCBONDSLAVEINFOQUERY || 4578 cmd == SIOCBONDINFOQUERY || 4579 cmd == SIOCBONDCHANGEACTIVE || 4580 cmd == SIOCGMIIPHY || 4581 cmd == SIOCGMIIREG || 4582 cmd == SIOCSMIIREG || 4583 cmd == SIOCBRADDIF || 4584 cmd == SIOCBRDELIF || 4585 cmd == SIOCSHWTSTAMP || 4586 cmd == SIOCWANDEV) { 4587 err = 
-EOPNOTSUPP; 4588 if (ops->ndo_do_ioctl) { 4589 if (netif_device_present(dev)) 4590 err = ops->ndo_do_ioctl(dev, ifr, cmd); 4591 else 4592 err = -ENODEV; 4593 } 4594 } else 4595 err = -EINVAL; 4596 4597 } 4598 return err; 4599} 4600 4601/* 4602 * This function handles all "interface"-type I/O control requests. The actual 4603 * 'doing' part of this is dev_ifsioc above. 4604 */ 4605 4606/** 4607 * dev_ioctl - network device ioctl 4608 * @net: the applicable net namespace 4609 * @cmd: command to issue 4610 * @arg: pointer to a struct ifreq in user space 4611 * 4612 * Issue ioctl functions to devices. This is normally called by the 4613 * user space syscall interfaces but can sometimes be useful for 4614 * other purposes. The return value is the return from the syscall if 4615 * positive or a negative errno code on error. 4616 */ 4617 4618int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) 4619{ 4620 struct ifreq ifr; 4621 int ret; 4622 char *colon; 4623 4624 /* One special case: SIOCGIFCONF takes ifconf argument 4625 and requires shared lock, because it sleeps writing 4626 to user space. 4627 */ 4628 4629 if (cmd == SIOCGIFCONF) { 4630 rtnl_lock(); 4631 ret = dev_ifconf(net, (char __user *) arg); 4632 rtnl_unlock(); 4633 return ret; 4634 } 4635 if (cmd == SIOCGIFNAME) 4636 return dev_ifname(net, (struct ifreq __user *)arg); 4637 4638 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 4639 return -EFAULT; 4640 4641 ifr.ifr_name[IFNAMSIZ-1] = 0; 4642 4643 colon = strchr(ifr.ifr_name, ':'); 4644 if (colon) 4645 *colon = 0; 4646 4647 /* 4648 * See which interface the caller is talking about. 4649 */ 4650 4651 switch (cmd) { 4652 /* 4653 * These ioctl calls: 4654 * - can be done by all. 4655 * - atomic and do not require locking. 4656 * - return a value 4657 */ 4658 case SIOCGIFFLAGS: 4659 case SIOCGIFMETRIC: 4660 case SIOCGIFMTU: 4661 case SIOCGIFHWADDR: 4662 case SIOCGIFSLAVE: 4663 case SIOCGIFMAP: 4664 case SIOCGIFINDEX: 4665 case SIOCGIFTXQLEN: 4666 dev_load(net, ifr.ifr_name); 4667 rcu_read_lock(); 4668 ret = dev_ifsioc_locked(net, &ifr, cmd); 4669 rcu_read_unlock(); 4670 if (!ret) { 4671 if (colon) 4672 *colon = ':'; 4673 if (copy_to_user(arg, &ifr, 4674 sizeof(struct ifreq))) 4675 ret = -EFAULT; 4676 } 4677 return ret; 4678 4679 case SIOCETHTOOL: 4680 dev_load(net, ifr.ifr_name); 4681 rtnl_lock(); 4682 ret = dev_ethtool(net, &ifr); 4683 rtnl_unlock(); 4684 if (!ret) { 4685 if (colon) 4686 *colon = ':'; 4687 if (copy_to_user(arg, &ifr, 4688 sizeof(struct ifreq))) 4689 ret = -EFAULT; 4690 } 4691 return ret; 4692 4693 /* 4694 * These ioctl calls: 4695 * - require superuser power. 4696 * - require strict serialization. 4697 * - return a value 4698 */ 4699 case SIOCGMIIPHY: 4700 case SIOCGMIIREG: 4701 case SIOCSIFNAME: 4702 if (!capable(CAP_NET_ADMIN)) 4703 return -EPERM; 4704 dev_load(net, ifr.ifr_name); 4705 rtnl_lock(); 4706 ret = dev_ifsioc(net, &ifr, cmd); 4707 rtnl_unlock(); 4708 if (!ret) { 4709 if (colon) 4710 *colon = ':'; 4711 if (copy_to_user(arg, &ifr, 4712 sizeof(struct ifreq))) 4713 ret = -EFAULT; 4714 } 4715 return ret; 4716 4717 /* 4718 * These ioctl calls: 4719 * - require superuser power. 4720 * - require strict serialization. 
4721 * - do not return a value 4722 */ 4723 case SIOCSIFFLAGS: 4724 case SIOCSIFMETRIC: 4725 case SIOCSIFMTU: 4726 case SIOCSIFMAP: 4727 case SIOCSIFHWADDR: 4728 case SIOCSIFSLAVE: 4729 case SIOCADDMULTI: 4730 case SIOCDELMULTI: 4731 case SIOCSIFHWBROADCAST: 4732 case SIOCSIFTXQLEN: 4733 case SIOCSMIIREG: 4734 case SIOCBONDENSLAVE: 4735 case SIOCBONDRELEASE: 4736 case SIOCBONDSETHWADDR: 4737 case SIOCBONDCHANGEACTIVE: 4738 case SIOCBRADDIF: 4739 case SIOCBRDELIF: 4740 case SIOCSHWTSTAMP: 4741 if (!capable(CAP_NET_ADMIN)) 4742 return -EPERM; 4743 /* fall through */ 4744 case SIOCBONDSLAVEINFOQUERY: 4745 case SIOCBONDINFOQUERY: 4746 dev_load(net, ifr.ifr_name); 4747 rtnl_lock(); 4748 ret = dev_ifsioc(net, &ifr, cmd); 4749 rtnl_unlock(); 4750 return ret; 4751 4752 case SIOCGIFMEM: 4753 /* Get the per device memory space. We can add this but 4754 * currently do not support it */ 4755 case SIOCSIFMEM: 4756 /* Set the per device memory buffer space. 4757 * Not applicable in our case */ 4758 case SIOCSIFLINK: 4759 return -EINVAL; 4760 4761 /* 4762 * Unknown or private ioctl. 4763 */ 4764 default: 4765 if (cmd == SIOCWANDEV || 4766 (cmd >= SIOCDEVPRIVATE && 4767 cmd <= SIOCDEVPRIVATE + 15)) { 4768 dev_load(net, ifr.ifr_name); 4769 rtnl_lock(); 4770 ret = dev_ifsioc(net, &ifr, cmd); 4771 rtnl_unlock(); 4772 if (!ret && copy_to_user(arg, &ifr, 4773 sizeof(struct ifreq))) 4774 ret = -EFAULT; 4775 return ret; 4776 } 4777 /* Take care of Wireless Extensions */ 4778 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) 4779 return wext_handle_ioctl(net, &ifr, cmd, arg); 4780 return -EINVAL; 4781 } 4782} 4783 4784 4785/** 4786 * dev_new_index - allocate an ifindex 4787 * @net: the applicable net namespace 4788 * 4789 * Returns a suitable unique value for a new device interface 4790 * number. The caller must hold the rtnl semaphore or the 4791 * dev_base_lock to be sure it remains unique. 4792 */ 4793static int dev_new_index(struct net *net) 4794{ 4795 static int ifindex; 4796 for (;;) { 4797 if (++ifindex <= 0) 4798 ifindex = 1; 4799 if (!__dev_get_by_index(net, ifindex)) 4800 return ifindex; 4801 } 4802} 4803 4804/* Delayed registration/unregisteration */ 4805static LIST_HEAD(net_todo_list); 4806 4807static void net_set_todo(struct net_device *dev) 4808{ 4809 list_add_tail(&dev->todo_list, &net_todo_list); 4810} 4811 4812static void rollback_registered_many(struct list_head *head) 4813{ 4814 struct net_device *dev, *tmp; 4815 4816 BUG_ON(dev_boot_phase); 4817 ASSERT_RTNL(); 4818 4819 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 4820 /* Some devices call without registering 4821 * for initialization unwind. Remove those 4822 * devices and proceed with the remaining. 4823 */ 4824 if (dev->reg_state == NETREG_UNINITIALIZED) { 4825 pr_debug("unregister_netdevice: device %s/%p never " 4826 "was registered\n", dev->name, dev); 4827 4828 WARN_ON(1); 4829 list_del(&dev->unreg_list); 4830 continue; 4831 } 4832 4833 BUG_ON(dev->reg_state != NETREG_REGISTERED); 4834 4835 /* If device is running, close it first. */ 4836 dev_close(dev); 4837 4838 /* And unlink it from device chain. */ 4839 unlist_netdevice(dev); 4840 4841 dev->reg_state = NETREG_UNREGISTERING; 4842 } 4843 4844 synchronize_net(); 4845 4846 list_for_each_entry(dev, head, unreg_list) { 4847 /* Shutdown queueing discipline. */ 4848 dev_shutdown(dev); 4849 4850 4851 /* Notify protocols, that we are about to destroy 4852 this device. They should clean all the things. 
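 * NETDEV_UNREGISTER is raised per device here; the single
 * NETDEV_UNREGISTER_BATCH notification for the whole list is sent
 * further below, after the per-device teardown.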
4853 */ 4854 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4855 4856 if (!dev->rtnl_link_ops || 4857 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 4858 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); 4859 4860 /* 4861 * Flush the unicast and multicast chains 4862 */ 4863 dev_uc_flush(dev); 4864 dev_mc_flush(dev); 4865 4866 if (dev->netdev_ops->ndo_uninit) 4867 dev->netdev_ops->ndo_uninit(dev); 4868 4869 /* Notifier chain MUST detach us from master device. */ 4870 WARN_ON(dev->master); 4871 4872 /* Remove entries from kobject tree */ 4873 netdev_unregister_kobject(dev); 4874 } 4875 4876 /* Process any work delayed until the end of the batch */ 4877 dev = list_first_entry(head, struct net_device, unreg_list); 4878 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 4879 4880 rcu_barrier(); 4881 4882 list_for_each_entry(dev, head, unreg_list) 4883 dev_put(dev); 4884} 4885 4886static void rollback_registered(struct net_device *dev) 4887{ 4888 LIST_HEAD(single); 4889 4890 list_add(&dev->unreg_list, &single); 4891 rollback_registered_many(&single); 4892} 4893 4894static void __netdev_init_queue_locks_one(struct net_device *dev, 4895 struct netdev_queue *dev_queue, 4896 void *_unused) 4897{ 4898 spin_lock_init(&dev_queue->_xmit_lock); 4899 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); 4900 dev_queue->xmit_lock_owner = -1; 4901} 4902 4903static void netdev_init_queue_locks(struct net_device *dev) 4904{ 4905 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); 4906 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); 4907} 4908 4909unsigned long netdev_fix_features(unsigned long features, const char *name) 4910{ 4911 /* Fix illegal SG+CSUM combinations. */ 4912 if ((features & NETIF_F_SG) && 4913 !(features & NETIF_F_ALL_CSUM)) { 4914 if (name) 4915 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " 4916 "checksum feature.\n", name); 4917 features &= ~NETIF_F_SG; 4918 } 4919 4920 /* TSO requires that SG is present as well. */ 4921 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { 4922 if (name) 4923 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " 4924 "SG feature.\n", name); 4925 features &= ~NETIF_F_TSO; 4926 } 4927 4928 if (features & NETIF_F_UFO) { 4929 if (!(features & NETIF_F_GEN_CSUM)) { 4930 if (name) 4931 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 4932 "since no NETIF_F_HW_CSUM feature.\n", 4933 name); 4934 features &= ~NETIF_F_UFO; 4935 } 4936 4937 if (!(features & NETIF_F_SG)) { 4938 if (name) 4939 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 4940 "since no NETIF_F_SG feature.\n", name); 4941 features &= ~NETIF_F_UFO; 4942 } 4943 } 4944 4945 return features; 4946} 4947EXPORT_SYMBOL(netdev_fix_features); 4948 4949/** 4950 * netif_stacked_transfer_operstate - transfer operstate 4951 * @rootdev: the root or lower level device to transfer state from 4952 * @dev: the device to transfer operstate to 4953 * 4954 * Transfer operational state from root to device. This is normally 4955 * called when a stacking relationship exists between the root 4956 * device and the device(a leaf device). 
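 * A typical example is a VLAN-style device stacked on a physical NIC:
 * the carrier and dormant state of the lower device are mirrored onto
 * the upper one.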
4957 */ 4958void netif_stacked_transfer_operstate(const struct net_device *rootdev, 4959 struct net_device *dev) 4960{ 4961 if (rootdev->operstate == IF_OPER_DORMANT) 4962 netif_dormant_on(dev); 4963 else 4964 netif_dormant_off(dev); 4965 4966 if (netif_carrier_ok(rootdev)) { 4967 if (!netif_carrier_ok(dev)) 4968 netif_carrier_on(dev); 4969 } else { 4970 if (netif_carrier_ok(dev)) 4971 netif_carrier_off(dev); 4972 } 4973} 4974EXPORT_SYMBOL(netif_stacked_transfer_operstate); 4975 4976/** 4977 * register_netdevice - register a network device 4978 * @dev: device to register 4979 * 4980 * Take a completed network device structure and add it to the kernel 4981 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 4982 * chain. 0 is returned on success. A negative errno code is returned 4983 * on a failure to set up the device, or if the name is a duplicate. 4984 * 4985 * Callers must hold the rtnl semaphore. You may want 4986 * register_netdev() instead of this. 4987 * 4988 * BUGS: 4989 * The locking appears insufficient to guarantee two parallel registers 4990 * will not get the same name. 4991 */ 4992 4993int register_netdevice(struct net_device *dev) 4994{ 4995 int ret; 4996 struct net *net = dev_net(dev); 4997 4998 BUG_ON(dev_boot_phase); 4999 ASSERT_RTNL(); 5000 5001 might_sleep(); 5002 5003 /* When net_device's are persistent, this will be fatal. */ 5004 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 5005 BUG_ON(!net); 5006 5007 spin_lock_init(&dev->addr_list_lock); 5008 netdev_set_addr_lockdep_class(dev); 5009 netdev_init_queue_locks(dev); 5010 5011 dev->iflink = -1; 5012 5013#ifdef CONFIG_RPS 5014 if (!dev->num_rx_queues) { 5015 /* 5016 * Allocate a single RX queue if driver never called 5017 * alloc_netdev_mq 5018 */ 5019 5020 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL); 5021 if (!dev->_rx) { 5022 ret = -ENOMEM; 5023 goto out; 5024 } 5025 5026 dev->_rx->first = dev->_rx; 5027 atomic_set(&dev->_rx->count, 1); 5028 dev->num_rx_queues = 1; 5029 } 5030#endif 5031 /* Init, if this function is available */ 5032 if (dev->netdev_ops->ndo_init) { 5033 ret = dev->netdev_ops->ndo_init(dev); 5034 if (ret) { 5035 if (ret > 0) 5036 ret = -EIO; 5037 goto out; 5038 } 5039 } 5040 5041 ret = dev_get_valid_name(dev, dev->name, 0); 5042 if (ret) 5043 goto err_uninit; 5044 5045 dev->ifindex = dev_new_index(net); 5046 if (dev->iflink == -1) 5047 dev->iflink = dev->ifindex; 5048 5049 /* Fix illegal checksum combinations */ 5050 if ((dev->features & NETIF_F_HW_CSUM) && 5051 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5052 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", 5053 dev->name); 5054 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 5055 } 5056 5057 if ((dev->features & NETIF_F_NO_CSUM) && 5058 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5059 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", 5060 dev->name); 5061 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); 5062 } 5063 5064 dev->features = netdev_fix_features(dev->features, dev->name); 5065 5066 /* Enable software GSO if SG is supported. 
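 * Software GSO segmentation relies on scatter/gather, so the flag is
 * only turned on when NETIF_F_SG survived netdev_fix_features() above.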
*/ 5067 if (dev->features & NETIF_F_SG) 5068 dev->features |= NETIF_F_GSO; 5069 5070 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 5071 ret = notifier_to_errno(ret); 5072 if (ret) 5073 goto err_uninit; 5074 5075 ret = netdev_register_kobject(dev); 5076 if (ret) 5077 goto err_uninit; 5078 dev->reg_state = NETREG_REGISTERED; 5079 5080 /* 5081 * Default initial state at registry is that the 5082 * device is present. 5083 */ 5084 5085 set_bit(__LINK_STATE_PRESENT, &dev->state); 5086 5087 dev_init_scheduler(dev); 5088 dev_hold(dev); 5089 list_netdevice(dev); 5090 5091 /* Notify protocols, that a new device appeared. */ 5092 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 5093 ret = notifier_to_errno(ret); 5094 if (ret) { 5095 rollback_registered(dev); 5096 dev->reg_state = NETREG_UNREGISTERED; 5097 } 5098 /* 5099 * Prevent userspace races by waiting until the network 5100 * device is fully setup before sending notifications. 5101 */ 5102 if (!dev->rtnl_link_ops || 5103 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 5104 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); 5105 5106out: 5107 return ret; 5108 5109err_uninit: 5110 if (dev->netdev_ops->ndo_uninit) 5111 dev->netdev_ops->ndo_uninit(dev); 5112 goto out; 5113} 5114EXPORT_SYMBOL(register_netdevice); 5115 5116/** 5117 * init_dummy_netdev - init a dummy network device for NAPI 5118 * @dev: device to init 5119 * 5120 * This takes a network device structure and initialize the minimum 5121 * amount of fields so it can be used to schedule NAPI polls without 5122 * registering a full blown interface. This is to be used by drivers 5123 * that need to tie several hardware interfaces to a single NAPI 5124 * poll scheduler due to HW limitations. 5125 */ 5126int init_dummy_netdev(struct net_device *dev) 5127{ 5128 /* Clear everything. Note we don't initialize spinlocks 5129 * are they aren't supposed to be taken by any of the 5130 * NAPI code and this dummy netdev is supposed to be 5131 * only ever used for NAPI polls 5132 */ 5133 memset(dev, 0, sizeof(struct net_device)); 5134 5135 /* make sure we BUG if trying to hit standard 5136 * register/unregister code path 5137 */ 5138 dev->reg_state = NETREG_DUMMY; 5139 5140 /* initialize the ref count */ 5141 atomic_set(&dev->refcnt, 1); 5142 5143 /* NAPI wants this */ 5144 INIT_LIST_HEAD(&dev->napi_list); 5145 5146 /* a dummy interface is started by default */ 5147 set_bit(__LINK_STATE_PRESENT, &dev->state); 5148 set_bit(__LINK_STATE_START, &dev->state); 5149 5150 return 0; 5151} 5152EXPORT_SYMBOL_GPL(init_dummy_netdev); 5153 5154 5155/** 5156 * register_netdev - register a network device 5157 * @dev: device to register 5158 * 5159 * Take a completed network device structure and add it to the kernel 5160 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 5161 * chain. 0 is returned on success. A negative errno code is returned 5162 * on a failure to set up the device, or if the name is a duplicate. 5163 * 5164 * This is a wrapper around register_netdevice that takes the rtnl semaphore 5165 * and expands the device name if you passed a format string to 5166 * alloc_netdev. 5167 */ 5168int register_netdev(struct net_device *dev) 5169{ 5170 int err; 5171 5172 rtnl_lock(); 5173 5174 /* 5175 * If the name is a format string the caller wants us to do a 5176 * name allocation. 
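 * e.g. a name of "eth%d" is expanded by dev_alloc_name() below to the
 * first index not already in use.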
5177 */ 5178 if (strchr(dev->name, '%')) { 5179 err = dev_alloc_name(dev, dev->name); 5180 if (err < 0) 5181 goto out; 5182 } 5183 5184 err = register_netdevice(dev); 5185out: 5186 rtnl_unlock(); 5187 return err; 5188} 5189EXPORT_SYMBOL(register_netdev); 5190 5191/* 5192 * netdev_wait_allrefs - wait until all references are gone. 5193 * 5194 * This is called when unregistering network devices. 5195 * 5196 * Any protocol or device that holds a reference should register 5197 * for netdevice notification, and cleanup and put back the 5198 * reference if they receive an UNREGISTER event. 5199 * We can get stuck here if buggy protocols don't correctly 5200 * call dev_put. 5201 */ 5202static void netdev_wait_allrefs(struct net_device *dev) 5203{ 5204 unsigned long rebroadcast_time, warning_time; 5205 5206 linkwatch_forget_dev(dev); 5207 5208 rebroadcast_time = warning_time = jiffies; 5209 while (atomic_read(&dev->refcnt) != 0) { 5210 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 5211 rtnl_lock(); 5212 5213 /* Rebroadcast unregister notification */ 5214 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5215 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users 5216 * should have already handle it the first time */ 5217 5218 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 5219 &dev->state)) { 5220 /* We must not have linkwatch events 5221 * pending on unregister. If this 5222 * happens, we simply run the queue 5223 * unscheduled, resulting in a noop 5224 * for this device. 5225 */ 5226 linkwatch_run_queue(); 5227 } 5228 5229 __rtnl_unlock(); 5230 5231 rebroadcast_time = jiffies; 5232 } 5233 5234 msleep(250); 5235 5236 if (time_after(jiffies, warning_time + 10 * HZ)) { 5237 printk(KERN_EMERG "unregister_netdevice: " 5238 "waiting for %s to become free. Usage " 5239 "count = %d\n", 5240 dev->name, atomic_read(&dev->refcnt)); 5241 warning_time = jiffies; 5242 } 5243 } 5244} 5245 5246/* The sequence is: 5247 * 5248 * rtnl_lock(); 5249 * ... 5250 * register_netdevice(x1); 5251 * register_netdevice(x2); 5252 * ... 5253 * unregister_netdevice(y1); 5254 * unregister_netdevice(y2); 5255 * ... 5256 * rtnl_unlock(); 5257 * free_netdev(y1); 5258 * free_netdev(y2); 5259 * 5260 * We are invoked by rtnl_unlock(). 5261 * This allows us to deal with problems: 5262 * 1) We can delete sysfs objects which invoke hotplug 5263 * without deadlocking with linkwatch via keventd. 5264 * 2) Since we run with the RTNL semaphore not held, we can sleep 5265 * safely in order to wait for the netdev refcnt to drop to zero. 5266 * 5267 * We must not return until all unregister events added during 5268 * the interval the lock was held have been completed. 
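 * netdev_run_todo() below snapshots net_todo_list, waits in
 * netdev_wait_allrefs() for each device's reference count to reach
 * zero, and only then invokes the destructor and drops the kobject.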
5269 */ 5270void netdev_run_todo(void) 5271{ 5272 struct list_head list; 5273 5274 /* Snapshot list, allow later requests */ 5275 list_replace_init(&net_todo_list, &list); 5276 5277 __rtnl_unlock(); 5278 5279 while (!list_empty(&list)) { 5280 struct net_device *dev 5281 = list_first_entry(&list, struct net_device, todo_list); 5282 list_del(&dev->todo_list); 5283 5284 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 5285 printk(KERN_ERR "network todo '%s' but state %d\n", 5286 dev->name, dev->reg_state); 5287 dump_stack(); 5288 continue; 5289 } 5290 5291 dev->reg_state = NETREG_UNREGISTERED; 5292 5293 on_each_cpu(flush_backlog, dev, 1); 5294 5295 netdev_wait_allrefs(dev); 5296 5297 /* paranoia */ 5298 BUG_ON(atomic_read(&dev->refcnt)); 5299 WARN_ON(dev->ip_ptr); 5300 WARN_ON(dev->ip6_ptr); 5301 WARN_ON(dev->dn_ptr); 5302 5303 if (dev->destructor) 5304 dev->destructor(dev); 5305 5306 /* Free network device */ 5307 kobject_put(&dev->dev.kobj); 5308 } 5309} 5310 5311/** 5312 * dev_txq_stats_fold - fold tx_queues stats 5313 * @dev: device to get statistics from 5314 * @stats: struct rtnl_link_stats64 to hold results 5315 */ 5316void dev_txq_stats_fold(const struct net_device *dev, 5317 struct rtnl_link_stats64 *stats) 5318{ 5319 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0; 5320 unsigned int i; 5321 struct netdev_queue *txq; 5322 5323 for (i = 0; i < dev->num_tx_queues; i++) { 5324 txq = netdev_get_tx_queue(dev, i); 5325 spin_lock_bh(&txq->_xmit_lock); 5326 tx_bytes += txq->tx_bytes; 5327 tx_packets += txq->tx_packets; 5328 tx_dropped += txq->tx_dropped; 5329 spin_unlock_bh(&txq->_xmit_lock); 5330 } 5331 if (tx_bytes || tx_packets || tx_dropped) { 5332 stats->tx_bytes = tx_bytes; 5333 stats->tx_packets = tx_packets; 5334 stats->tx_dropped = tx_dropped; 5335 } 5336} 5337EXPORT_SYMBOL(dev_txq_stats_fold); 5338 5339/* Convert net_device_stats to rtnl_link_stats64. They have the same 5340 * fields in the same order, with only the type differing. 5341 */ 5342static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 5343 const struct net_device_stats *netdev_stats) 5344{ 5345#if BITS_PER_LONG == 64 5346 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); 5347 memcpy(stats64, netdev_stats, sizeof(*stats64)); 5348#else 5349 size_t i, n = sizeof(*stats64) / sizeof(u64); 5350 const unsigned long *src = (const unsigned long *)netdev_stats; 5351 u64 *dst = (u64 *)stats64; 5352 5353 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != 5354 sizeof(*stats64) / sizeof(u64)); 5355 for (i = 0; i < n; i++) 5356 dst[i] = src[i]; 5357#endif 5358} 5359 5360/** 5361 * dev_get_stats - get network device statistics 5362 * @dev: device to get statistics from 5363 * @storage: place to store stats 5364 * 5365 * Get network statistics from device. Return @storage. 5366 * The device driver may provide its own method by setting 5367 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 5368 * otherwise the internal statistics structure is used. 
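 * The order of precedence is: ndo_get_stats64, then ndo_get_stats
 * (converted via netdev_stats_to_stats64()), and finally dev->stats
 * plus the per-queue TX counters folded in by dev_txq_stats_fold().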
5369 */ 5370struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 5371 struct rtnl_link_stats64 *storage) 5372{ 5373 const struct net_device_ops *ops = dev->netdev_ops; 5374 5375 if (ops->ndo_get_stats64) { 5376 memset(storage, 0, sizeof(*storage)); 5377 return ops->ndo_get_stats64(dev, storage); 5378 } 5379 if (ops->ndo_get_stats) { 5380 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 5381 return storage; 5382 } 5383 netdev_stats_to_stats64(storage, &dev->stats); 5384 dev_txq_stats_fold(dev, storage); 5385 return storage; 5386} 5387EXPORT_SYMBOL(dev_get_stats); 5388 5389static void netdev_init_one_queue(struct net_device *dev, 5390 struct netdev_queue *queue, 5391 void *_unused) 5392{ 5393 queue->dev = dev; 5394} 5395 5396static void netdev_init_queues(struct net_device *dev) 5397{ 5398 netdev_init_one_queue(dev, &dev->rx_queue, NULL); 5399 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 5400 spin_lock_init(&dev->tx_global_lock); 5401} 5402 5403/** 5404 * alloc_netdev_mq - allocate network device 5405 * @sizeof_priv: size of private data to allocate space for 5406 * @name: device name format string 5407 * @setup: callback to initialize device 5408 * @queue_count: the number of subqueues to allocate 5409 * 5410 * Allocates a struct net_device with private data area for driver use 5411 * and performs basic initialization. Also allocates subquue structs 5412 * for each queue on the device at the end of the netdevice. 5413 */ 5414struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, 5415 void (*setup)(struct net_device *), unsigned int queue_count) 5416{ 5417 struct netdev_queue *tx; 5418 struct net_device *dev; 5419 size_t alloc_size; 5420 struct net_device *p; 5421#ifdef CONFIG_RPS 5422 struct netdev_rx_queue *rx; 5423 int i; 5424#endif 5425 5426 BUG_ON(strlen(name) >= sizeof(dev->name)); 5427 5428 alloc_size = sizeof(struct net_device); 5429 if (sizeof_priv) { 5430 /* ensure 32-byte alignment of private area */ 5431 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 5432 alloc_size += sizeof_priv; 5433 } 5434 /* ensure 32-byte alignment of whole construct */ 5435 alloc_size += NETDEV_ALIGN - 1; 5436 5437 p = kzalloc(alloc_size, GFP_KERNEL); 5438 if (!p) { 5439 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); 5440 return NULL; 5441 } 5442 5443 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); 5444 if (!tx) { 5445 printk(KERN_ERR "alloc_netdev: Unable to allocate " 5446 "tx qdiscs.\n"); 5447 goto free_p; 5448 } 5449 5450#ifdef CONFIG_RPS 5451 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL); 5452 if (!rx) { 5453 printk(KERN_ERR "alloc_netdev: Unable to allocate " 5454 "rx queues.\n"); 5455 goto free_tx; 5456 } 5457 5458 atomic_set(&rx->count, queue_count); 5459 5460 /* 5461 * Set a pointer to first element in the array which holds the 5462 * reference count. 
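 * register_netdevice() sets up the same layout when it allocates a
 * single fallback RX queue for drivers that never called
 * alloc_netdev_mq().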
5463 */ 5464 for (i = 0; i < queue_count; i++) 5465 rx[i].first = rx; 5466#endif 5467 5468 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5469 dev->padded = (char *)dev - (char *)p; 5470 5471 if (dev_addr_init(dev)) 5472 goto free_rx; 5473 5474 dev_mc_init(dev); 5475 dev_uc_init(dev); 5476 5477 dev_net_set(dev, &init_net); 5478 5479 dev->_tx = tx; 5480 dev->num_tx_queues = queue_count; 5481 dev->real_num_tx_queues = queue_count; 5482 5483#ifdef CONFIG_RPS 5484 dev->_rx = rx; 5485 dev->num_rx_queues = queue_count; 5486#endif 5487 5488 dev->gso_max_size = GSO_MAX_SIZE; 5489 5490 netdev_init_queues(dev); 5491 5492 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); 5493 dev->ethtool_ntuple_list.count = 0; 5494 INIT_LIST_HEAD(&dev->napi_list); 5495 INIT_LIST_HEAD(&dev->unreg_list); 5496 INIT_LIST_HEAD(&dev->link_watch_list); 5497 dev->priv_flags = IFF_XMIT_DST_RELEASE; 5498 setup(dev); 5499 strcpy(dev->name, name); 5500 return dev; 5501 5502free_rx: 5503#ifdef CONFIG_RPS 5504 kfree(rx); 5505free_tx: 5506#endif 5507 kfree(tx); 5508free_p: 5509 kfree(p); 5510 return NULL; 5511} 5512EXPORT_SYMBOL(alloc_netdev_mq); 5513 5514/** 5515 * free_netdev - free network device 5516 * @dev: device 5517 * 5518 * This function does the last stage of destroying an allocated device 5519 * interface. The reference to the device object is released. 5520 * If this is the last reference then it will be freed. 5521 */ 5522void free_netdev(struct net_device *dev) 5523{ 5524 struct napi_struct *p, *n; 5525 5526 release_net(dev_net(dev)); 5527 5528 kfree(dev->_tx); 5529 5530 /* Flush device addresses */ 5531 dev_addr_flush(dev); 5532 5533 /* Clear ethtool n-tuple list */ 5534 ethtool_ntuple_flush(dev); 5535 5536 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5537 netif_napi_del(p); 5538 5539 /* Compatibility with error handling in drivers */ 5540 if (dev->reg_state == NETREG_UNINITIALIZED) { 5541 kfree((char *)dev - dev->padded); 5542 return; 5543 } 5544 5545 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 5546 dev->reg_state = NETREG_RELEASED; 5547 5548 /* will free via device release */ 5549 put_device(&dev->dev); 5550} 5551EXPORT_SYMBOL(free_netdev); 5552 5553/** 5554 * synchronize_net - Synchronize with packet receive processing 5555 * 5556 * Wait for packets currently being received to be done. 5557 * Does not block later packets from starting. 5558 */ 5559void synchronize_net(void) 5560{ 5561 might_sleep(); 5562 synchronize_rcu(); 5563} 5564EXPORT_SYMBOL(synchronize_net); 5565 5566/** 5567 * unregister_netdevice_queue - remove device from the kernel 5568 * @dev: device 5569 * @head: list 5570 * 5571 * This function shuts down a device interface and removes it 5572 * from the kernel tables. 5573 * If head not NULL, device is queued to be unregistered later. 5574 * 5575 * Callers must hold the rtnl semaphore. You may want 5576 * unregister_netdev() instead of this. 
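 * A minimal sketch of batched removal under the RTNL (the list variable
 * name is illustrative only):
 *
 *	LIST_HEAD(kill_list);
 *
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);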
5577 */ 5578 5579void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 5580{ 5581 ASSERT_RTNL(); 5582 5583 if (head) { 5584 list_move_tail(&dev->unreg_list, head); 5585 } else { 5586 rollback_registered(dev); 5587 /* Finish processing unregister after unlock */ 5588 net_set_todo(dev); 5589 } 5590} 5591EXPORT_SYMBOL(unregister_netdevice_queue); 5592 5593/** 5594 * unregister_netdevice_many - unregister many devices 5595 * @head: list of devices 5596 */ 5597void unregister_netdevice_many(struct list_head *head) 5598{ 5599 struct net_device *dev; 5600 5601 if (!list_empty(head)) { 5602 rollback_registered_many(head); 5603 list_for_each_entry(dev, head, unreg_list) 5604 net_set_todo(dev); 5605 } 5606} 5607EXPORT_SYMBOL(unregister_netdevice_many); 5608 5609/** 5610 * unregister_netdev - remove device from the kernel 5611 * @dev: device 5612 * 5613 * This function shuts down a device interface and removes it 5614 * from the kernel tables. 5615 * 5616 * This is just a wrapper for unregister_netdevice that takes 5617 * the rtnl semaphore. In general you want to use this and not 5618 * unregister_netdevice. 5619 */ 5620void unregister_netdev(struct net_device *dev) 5621{ 5622 rtnl_lock(); 5623 unregister_netdevice(dev); 5624 rtnl_unlock(); 5625} 5626EXPORT_SYMBOL(unregister_netdev); 5627 5628/** 5629 * dev_change_net_namespace - move device to different nethost namespace 5630 * @dev: device 5631 * @net: network namespace 5632 * @pat: If not NULL name pattern to try if the current device name 5633 * is already taken in the destination network namespace. 5634 * 5635 * This function shuts down a device interface and moves it 5636 * to a new network namespace. On success 0 is returned, on 5637 * a failure a netagive errno code is returned. 5638 * 5639 * Callers must hold the rtnl semaphore. 5640 */ 5641 5642int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 5643{ 5644 int err; 5645 5646 ASSERT_RTNL(); 5647 5648 /* Don't allow namespace local devices to be moved. */ 5649 err = -EINVAL; 5650 if (dev->features & NETIF_F_NETNS_LOCAL) 5651 goto out; 5652 5653 /* Ensure the device has been registrered */ 5654 err = -EINVAL; 5655 if (dev->reg_state != NETREG_REGISTERED) 5656 goto out; 5657 5658 /* Get out if there is nothing todo */ 5659 err = 0; 5660 if (net_eq(dev_net(dev), net)) 5661 goto out; 5662 5663 /* Pick the destination device name, and ensure 5664 * we can use it in the destination network namespace. 5665 */ 5666 err = -EEXIST; 5667 if (__dev_get_by_name(net, dev->name)) { 5668 /* We get here if we can't use the current device name */ 5669 if (!pat) 5670 goto out; 5671 if (dev_get_valid_name(dev, pat, 1)) 5672 goto out; 5673 } 5674 5675 /* 5676 * And now a mini version of register_netdevice unregister_netdevice. 5677 */ 5678 5679 /* If device is running close it first. */ 5680 dev_close(dev); 5681 5682 /* And unlink it from device chain */ 5683 err = -ENODEV; 5684 unlist_netdevice(dev); 5685 5686 synchronize_net(); 5687 5688 /* Shutdown queueing discipline. */ 5689 dev_shutdown(dev); 5690 5691 /* Notify protocols, that we are about to destroy 5692 this device. They should clean all the things. 
5693 */ 5694 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5695 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 5696 5697 /* 5698 * Flush the unicast and multicast chains 5699 */ 5700 dev_uc_flush(dev); 5701 dev_mc_flush(dev); 5702 5703 /* Actually switch the network namespace */ 5704 dev_net_set(dev, net); 5705 5706 /* If there is an ifindex conflict assign a new one */ 5707 if (__dev_get_by_index(net, dev->ifindex)) { 5708 int iflink = (dev->iflink == dev->ifindex); 5709 dev->ifindex = dev_new_index(net); 5710 if (iflink) 5711 dev->iflink = dev->ifindex; 5712 } 5713 5714 /* Fixup kobjects */ 5715 err = device_rename(&dev->dev, dev->name); 5716 WARN_ON(err); 5717 5718 /* Add the device back in the hashes */ 5719 list_netdevice(dev); 5720 5721 /* Notify protocols, that a new device appeared. */ 5722 call_netdevice_notifiers(NETDEV_REGISTER, dev); 5723 5724 /* 5725 * Prevent userspace races by waiting until the network 5726 * device is fully setup before sending notifications. 5727 */ 5728 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); 5729 5730 synchronize_net(); 5731 err = 0; 5732out: 5733 return err; 5734} 5735EXPORT_SYMBOL_GPL(dev_change_net_namespace); 5736 5737static int dev_cpu_callback(struct notifier_block *nfb, 5738 unsigned long action, 5739 void *ocpu) 5740{ 5741 struct sk_buff **list_skb; 5742 struct sk_buff *skb; 5743 unsigned int cpu, oldcpu = (unsigned long)ocpu; 5744 struct softnet_data *sd, *oldsd; 5745 5746 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 5747 return NOTIFY_OK; 5748 5749 local_irq_disable(); 5750 cpu = smp_processor_id(); 5751 sd = &per_cpu(softnet_data, cpu); 5752 oldsd = &per_cpu(softnet_data, oldcpu); 5753 5754 /* Find end of our completion_queue. */ 5755 list_skb = &sd->completion_queue; 5756 while (*list_skb) 5757 list_skb = &(*list_skb)->next; 5758 /* Append completion queue from offline CPU. */ 5759 *list_skb = oldsd->completion_queue; 5760 oldsd->completion_queue = NULL; 5761 5762 /* Append output queue from offline CPU. */ 5763 if (oldsd->output_queue) { 5764 *sd->output_queue_tailp = oldsd->output_queue; 5765 sd->output_queue_tailp = oldsd->output_queue_tailp; 5766 oldsd->output_queue = NULL; 5767 oldsd->output_queue_tailp = &oldsd->output_queue; 5768 } 5769 5770 raise_softirq_irqoff(NET_TX_SOFTIRQ); 5771 local_irq_enable(); 5772 5773 /* Process offline CPU's input_pkt_queue */ 5774 while ((skb = __skb_dequeue(&oldsd->process_queue))) { 5775 netif_rx(skb); 5776 input_queue_head_incr(oldsd); 5777 } 5778 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { 5779 netif_rx(skb); 5780 input_queue_head_incr(oldsd); 5781 } 5782 5783 return NOTIFY_OK; 5784} 5785 5786 5787/** 5788 * netdev_increment_features - increment feature set by one 5789 * @all: current feature set 5790 * @one: new feature set 5791 * @mask: mask feature set 5792 * 5793 * Computes a new feature set after adding a device with feature set 5794 * @one to the master device with current feature set @all. Will not 5795 * enable anything that is off in @mask. Returns the new feature set. 5796 */ 5797unsigned long netdev_increment_features(unsigned long all, unsigned long one, 5798 unsigned long mask) 5799{ 5800 /* If device needs checksumming, downgrade to it. */ 5801 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) 5802 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); 5803 else if (mask & NETIF_F_ALL_CSUM) { 5804 /* If one device supports v4/v6 checksumming, set for all. 
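 * This narrowing only happens while the aggregate set lacks
 * NETIF_F_GEN_CSUM; full hardware checksumming is handled by the
 * next test below.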
*/ 5805 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && 5806 !(all & NETIF_F_GEN_CSUM)) { 5807 all &= ~NETIF_F_ALL_CSUM; 5808 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); 5809 } 5810 5811 /* If one device supports hw checksumming, set for all. */ 5812 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { 5813 all &= ~NETIF_F_ALL_CSUM; 5814 all |= NETIF_F_HW_CSUM; 5815 } 5816 } 5817 5818 one |= NETIF_F_ALL_CSUM; 5819 5820 one |= all & NETIF_F_ONE_FOR_ALL; 5821 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; 5822 all |= one & mask & NETIF_F_ONE_FOR_ALL; 5823 5824 return all; 5825} 5826EXPORT_SYMBOL(netdev_increment_features); 5827 5828static struct hlist_head *netdev_create_hash(void) 5829{ 5830 int i; 5831 struct hlist_head *hash; 5832 5833 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 5834 if (hash != NULL) 5835 for (i = 0; i < NETDEV_HASHENTRIES; i++) 5836 INIT_HLIST_HEAD(&hash[i]); 5837 5838 return hash; 5839} 5840 5841/* Initialize per network namespace state */ 5842static int __net_init netdev_init(struct net *net) 5843{ 5844 INIT_LIST_HEAD(&net->dev_base_head); 5845 5846 net->dev_name_head = netdev_create_hash(); 5847 if (net->dev_name_head == NULL) 5848 goto err_name; 5849 5850 net->dev_index_head = netdev_create_hash(); 5851 if (net->dev_index_head == NULL) 5852 goto err_idx; 5853 5854 return 0; 5855 5856err_idx: 5857 kfree(net->dev_name_head); 5858err_name: 5859 return -ENOMEM; 5860} 5861 5862/** 5863 * netdev_drivername - network driver for the device 5864 * @dev: network device 5865 * @buffer: buffer for resulting name 5866 * @len: size of buffer 5867 * 5868 * Determine network driver for device. 5869 */ 5870char *netdev_drivername(const struct net_device *dev, char *buffer, int len) 5871{ 5872 const struct device_driver *driver; 5873 const struct device *parent; 5874 5875 if (len <= 0 || !buffer) 5876 return buffer; 5877 buffer[0] = 0; 5878 5879 parent = dev->dev.parent; 5880 5881 if (!parent) 5882 return buffer; 5883 5884 driver = parent->driver; 5885 if (driver && driver->name) 5886 strlcpy(buffer, driver->name, len); 5887 return buffer; 5888} 5889 5890static int __netdev_printk(const char *level, const struct net_device *dev, 5891 struct va_format *vaf) 5892{ 5893 int r; 5894 5895 if (dev && dev->dev.parent) 5896 r = dev_printk(level, dev->dev.parent, "%s: %pV", 5897 netdev_name(dev), vaf); 5898 else if (dev) 5899 r = printk("%s%s: %pV", level, netdev_name(dev), vaf); 5900 else 5901 r = printk("%s(NULL net_device): %pV", level, vaf); 5902 5903 return r; 5904} 5905 5906int netdev_printk(const char *level, const struct net_device *dev, 5907 const char *format, ...) 5908{ 5909 struct va_format vaf; 5910 va_list args; 5911 int r; 5912 5913 va_start(args, format); 5914 5915 vaf.fmt = format; 5916 vaf.va = &args; 5917 5918 r = __netdev_printk(level, dev, &vaf); 5919 va_end(args); 5920 5921 return r; 5922} 5923EXPORT_SYMBOL(netdev_printk); 5924 5925#define define_netdev_printk_level(func, level) \ 5926int func(const struct net_device *dev, const char *fmt, ...) 
\ 5927{ \ 5928 int r; \ 5929 struct va_format vaf; \ 5930 va_list args; \ 5931 \ 5932 va_start(args, fmt); \ 5933 \ 5934 vaf.fmt = fmt; \ 5935 vaf.va = &args; \ 5936 \ 5937 r = __netdev_printk(level, dev, &vaf); \ 5938 va_end(args); \ 5939 \ 5940 return r; \ 5941} \ 5942EXPORT_SYMBOL(func); 5943 5944define_netdev_printk_level(netdev_emerg, KERN_EMERG); 5945define_netdev_printk_level(netdev_alert, KERN_ALERT); 5946define_netdev_printk_level(netdev_crit, KERN_CRIT); 5947define_netdev_printk_level(netdev_err, KERN_ERR); 5948define_netdev_printk_level(netdev_warn, KERN_WARNING); 5949define_netdev_printk_level(netdev_notice, KERN_NOTICE); 5950define_netdev_printk_level(netdev_info, KERN_INFO); 5951 5952static void __net_exit netdev_exit(struct net *net) 5953{ 5954 kfree(net->dev_name_head); 5955 kfree(net->dev_index_head); 5956} 5957 5958static struct pernet_operations __net_initdata netdev_net_ops = { 5959 .init = netdev_init, 5960 .exit = netdev_exit, 5961}; 5962 5963static void __net_exit default_device_exit(struct net *net) 5964{ 5965 struct net_device *dev, *aux; 5966 /* 5967 * Push all migratable network devices back to the 5968 * initial network namespace 5969 */ 5970 rtnl_lock(); 5971 for_each_netdev_safe(net, dev, aux) { 5972 int err; 5973 char fb_name[IFNAMSIZ]; 5974 5975 /* Ignore unmoveable devices (i.e. loopback) */ 5976 if (dev->features & NETIF_F_NETNS_LOCAL) 5977 continue; 5978 5979 /* Leave virtual devices for the generic cleanup */ 5980 if (dev->rtnl_link_ops) 5981 continue; 5982 5983 /* Push remaing network devices to init_net */ 5984 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 5985 err = dev_change_net_namespace(dev, &init_net, fb_name); 5986 if (err) { 5987 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n", 5988 __func__, dev->name, err); 5989 BUG(); 5990 } 5991 } 5992 rtnl_unlock(); 5993} 5994 5995static void __net_exit default_device_exit_batch(struct list_head *net_list) 5996{ 5997 /* At exit all network devices most be removed from a network 5998 * namespace. Do this in the reverse order of registeration. 5999 * Do this across as many network namespaces as possible to 6000 * improve batching efficiency. 6001 */ 6002 struct net_device *dev; 6003 struct net *net; 6004 LIST_HEAD(dev_kill_list); 6005 6006 rtnl_lock(); 6007 list_for_each_entry(net, net_list, exit_list) { 6008 for_each_netdev_reverse(net, dev) { 6009 if (dev->rtnl_link_ops) 6010 dev->rtnl_link_ops->dellink(dev, &dev_kill_list); 6011 else 6012 unregister_netdevice_queue(dev, &dev_kill_list); 6013 } 6014 } 6015 unregister_netdevice_many(&dev_kill_list); 6016 rtnl_unlock(); 6017} 6018 6019static struct pernet_operations __net_initdata default_device_ops = { 6020 .exit = default_device_exit, 6021 .exit_batch = default_device_exit_batch, 6022}; 6023 6024/* 6025 * Initialize the DEV module. At boot time this walks the device list and 6026 * unhooks any devices that fail to initialise (normally hardware not 6027 * present) and leaves us with a valid list of present and active devices. 6028 * 6029 */ 6030 6031/* 6032 * This is called single threaded during boot, so no need 6033 * to take the rtnl semaphore. 
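 * It is registered with subsys_initcall(), so it runs before the
 * device-driver initcalls that start registering network devices.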
6034 */ 6035static int __init net_dev_init(void) 6036{ 6037 int i, rc = -ENOMEM; 6038 6039 BUG_ON(!dev_boot_phase); 6040 6041 if (dev_proc_init()) 6042 goto out; 6043 6044 if (netdev_kobject_init()) 6045 goto out; 6046 6047 INIT_LIST_HEAD(&ptype_all); 6048 for (i = 0; i < PTYPE_HASH_SIZE; i++) 6049 INIT_LIST_HEAD(&ptype_base[i]); 6050 6051 if (register_pernet_subsys(&netdev_net_ops)) 6052 goto out; 6053 6054 /* 6055 * Initialise the packet receive queues. 6056 */ 6057 6058 for_each_possible_cpu(i) { 6059 struct softnet_data *sd = &per_cpu(softnet_data, i); 6060 6061 memset(sd, 0, sizeof(*sd)); 6062 skb_queue_head_init(&sd->input_pkt_queue); 6063 skb_queue_head_init(&sd->process_queue); 6064 sd->completion_queue = NULL; 6065 INIT_LIST_HEAD(&sd->poll_list); 6066 sd->output_queue = NULL; 6067 sd->output_queue_tailp = &sd->output_queue; 6068#ifdef CONFIG_RPS 6069 sd->csd.func = rps_trigger_softirq; 6070 sd->csd.info = sd; 6071 sd->csd.flags = 0; 6072 sd->cpu = i; 6073#endif 6074 6075 sd->backlog.poll = process_backlog; 6076 sd->backlog.weight = weight_p; 6077 sd->backlog.gro_list = NULL; 6078 sd->backlog.gro_count = 0; 6079 } 6080 6081 dev_boot_phase = 0; 6082 6083 /* The loopback device is special if any other network devices 6084 * is present in a network namespace the loopback device must 6085 * be present. Since we now dynamically allocate and free the 6086 * loopback device ensure this invariant is maintained by 6087 * keeping the loopback device as the first device on the 6088 * list of network devices. Ensuring the loopback devices 6089 * is the first device that appears and the last network device 6090 * that disappears. 6091 */ 6092 if (register_pernet_device(&loopback_net_ops)) 6093 goto out; 6094 6095 if (register_pernet_device(&default_device_ops)) 6096 goto out; 6097 6098 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 6099 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 6100 6101 hotcpu_notifier(dev_cpu_callback, 0); 6102 dst_init(); 6103 dev_mcast_init(); 6104 rc = 0; 6105out: 6106 return rc; 6107} 6108 6109subsys_initcall(net_dev_init); 6110 6111static int __init initialize_hashrnd(void) 6112{ 6113 get_random_bytes(&hashrnd, sizeof(hashrnd)); 6114 return 0; 6115} 6116 6117late_initcall_sync(initialize_hashrnd); 6118