1/* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */
2/*
3 * 	NET3	Protocol independent device support routines.
4 *
5 *		This program is free software; you can redistribute it and/or
6 *		modify it under the terms of the GNU General Public License
7 *		as published by the Free Software Foundation; either version
8 *		2 of the License, or (at your option) any later version.
9 *
10 *	Derived from the non IP parts of dev.c 1.0.19
11 * 		Authors:	Ross Biro
12 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
13 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
14 *
15 *	Additional Authors:
16 *		Florian la Roche <rzsfl@rz.uni-sb.de>
17 *		Alan Cox <gw4pts@gw4pts.ampr.org>
18 *		David Hinds <dahinds@users.sourceforge.net>
19 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
20 *		Adam Sulmicki <adam@cfar.umd.edu>
21 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
22 *
23 *	Changes:
24 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
25 *              			to 2 if register_netdev gets called
26 *              			before net_dev_init & also removed a
27 *              			few lines of code in the process.
28 *		Alan Cox	:	device private ioctl copies fields back.
29 *		Alan Cox	:	Transmit queue code does relevant
30 *					stunts to keep the queue safe.
31 *		Alan Cox	:	Fixed double lock.
32 *		Alan Cox	:	Fixed promisc NULL pointer trap
33 *		????????	:	Support the full private ioctl range
34 *		Alan Cox	:	Moved ioctl permission check into
35 *					drivers
36 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
37 *		Alan Cox	:	100 backlog just doesn't cut it when
38 *					you start doing multicast video 8)
39 *		Alan Cox	:	Rewrote net_bh and list manager.
40 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
41 *		Alan Cox	:	Took out transmit every packet pass
42 *					Saved a few bytes in the ioctl handler
43 *		Alan Cox	:	Network driver sets packet type before
44 *					calling netif_rx. Saves a function
45 *					call a packet.
46 *		Alan Cox	:	Hashed net_bh()
47 *		Richard Kooijman:	Timestamp fixes.
48 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
49 *		Alan Cox	:	Device lock protection.
50 *		Alan Cox	: 	Fixed nasty side effect of device close
51 *					changes.
52 *		Rudi Cilibrasi	:	Pass the right thing to
53 *					set_mac_address()
54 *		Dave Miller	:	32bit quantity for the device lock to
55 *					make it work out on a Sparc.
56 *		Bjorn Ekwall	:	Added KERNELD hack.
57 *		Alan Cox	:	Cleaned up the backlog initialise.
58 *		Craig Metz	:	SIOCGIFCONF fix if space for under
59 *					1 device.
60 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
61 *					is no device open function.
62 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
63 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
64 *		Cyrus Durgin	:	Cleaned for KMOD
65 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
66 *					A network device unload needs to purge
67 *					the backlog queue.
68 *	Paul Rusty Russell	:	SIOCSIFNAME
69 *              Pekka Riikonen  :	Netdev boot-time settings code
70 *              Andrew Morton   :       Make unregister_netdevice wait
71 *              			indefinitely on dev->refcnt
72 * 		J Hadi Salim	:	- Backlog queue sampling
73 *				        - netif_rx() feedback
74 */
75
76#include <asm/uaccess.h>
77#include <asm/system.h>
78#include <linux/bitops.h>
79#include <linux/capability.h>
80#include <linux/cpu.h>
81#include <linux/types.h>
82#include <linux/kernel.h>
83#include <linux/hash.h>
84#include <linux/slab.h>
85#include <linux/sched.h>
86#include <linux/mutex.h>
87#include <linux/string.h>
88#include <linux/mm.h>
89#include <linux/socket.h>
90#include <linux/sockios.h>
91#include <linux/errno.h>
92#include <linux/interrupt.h>
93#include <linux/if_ether.h>
94#include <linux/netdevice.h>
95#include <linux/etherdevice.h>
96#include <linux/ethtool.h>
97#include <linux/notifier.h>
98#include <linux/skbuff.h>
99#include <linux/netfilter_ipv4.h>
100#include <net/net_namespace.h>
101#include <net/sock.h>
102#include <linux/rtnetlink.h>
103#include <linux/proc_fs.h>
104#include <linux/seq_file.h>
105#include <linux/stat.h>
106#include <net/dst.h>
107#include <net/pkt_sched.h>
108#include <net/checksum.h>
109#include <net/xfrm.h>
110#include <linux/highmem.h>
111#include <linux/init.h>
112#include <linux/kmod.h>
113#include <linux/module.h>
114#include <linux/netpoll.h>
115#include <linux/rcupdate.h>
116#include <linux/delay.h>
117#include <net/wext.h>
118#include <net/iw_handler.h>
119#include <asm/current.h>
120#include <linux/audit.h>
121#include <linux/dmaengine.h>
122#include <linux/err.h>
123#include <linux/ctype.h>
124#include <linux/if_arp.h>
125#include <linux/if_vlan.h>
126#include <linux/ip.h>
127#include <net/ip.h>
128#include <linux/ipv6.h>
129#include <linux/in.h>
130#include <linux/jhash.h>
131#include <linux/random.h>
132#include <trace/events/napi.h>
133#include <linux/pci.h>
134#include "net-sysfs.h"
135
136#include <typedefs.h>
137#include <bcmdefs.h>
138
139/* Instead of increasing this, you should create a hash table. */
140#define MAX_GRO_SKBS 8
141
142/* This should be increased if a protocol with a bigger head is added. */
143#define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145/*
146 *	The list of packet types we will receive (as opposed to discard)
147 *	and the routines to invoke.
148 *
149 *	Why 16. Because with 16 the only overlap we get on a hash of the
150 *	low nibble of the protocol value is RARP/SNAP/X.25.
151 *
152 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
153 *             sure which should go first, but I bet it won't make much
154 *             difference if we are running VLANs.  The good news is that
155 *             this protocol won't be in the list unless compiled in, so
156 *             the average user (w/out VLANs) will not be adversely affected.
157 *             --BLG
158 *
159 *		0800	IP
160 *		8100    802.1Q VLAN
161 *		0001	802.3
162 *		0002	AX.25
163 *		0004	802.2
164 *		8035	RARP
165 *		0005	SNAP
166 *		0805	X.25
167 *		0806	ARP
168 *		8137	IPX
169 *		0009	Localtalk
170 *		86DD	IPv6
171 */
172
173#define PTYPE_HASH_SIZE	(16)
174#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
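/*
 * Illustration only (not used by the code below): a handler for a given
 * EtherType lands in the bucket selected by the low nibble of the
 * host-order protocol value, e.g.
 *
 *	ntohs(ETH_P_IP)   & PTYPE_HASH_MASK  ==  0x0800 & 0xf  ==  0x0
 *	ntohs(ETH_P_ARP)  & PTYPE_HASH_MASK  ==  0x0806 & 0xf  ==  0x6
 *	ntohs(ETH_P_RARP) & PTYPE_HASH_MASK  ==  0x8035 & 0xf  ==  0x5
 *
 * ETH_P_ALL handlers never enter these buckets; they live on ptype_all.
 */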
175
176static DEFINE_SPINLOCK(ptype_lock);
177static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178static struct list_head ptype_all __read_mostly;	/* Taps */
179
180/*
181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182 * semaphore.
183 *
184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185 *
186 * Writers must hold the rtnl semaphore while they loop through the
187 * dev_base_head list, and hold dev_base_lock for writing when they do the
188 * actual updates.  This allows pure readers to access the list even
189 * while a writer is preparing to update it.
190 *
191 * To put it another way, dev_base_lock is held for writing only to
192 * protect against pure readers; the rtnl semaphore provides the
193 * protection against other writers.
194 *
195 * See, for example usages, register_netdevice() and
196 * unregister_netdevice(), which must be called with the rtnl
197 * semaphore held.
198 */
199DEFINE_RWLOCK(dev_base_lock);
200EXPORT_SYMBOL(dev_base_lock);
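/*
 * Minimal sketch of the read-side discipline described above (illustration
 * only; the helper shown here is hypothetical and not part of this file):
 *
 *	static struct net_device *example_find_up(struct net *net)
 *	{
 *		struct net_device *dev, *found = NULL;
 *
 *		read_lock(&dev_base_lock);	(or rcu_read_lock())
 *		for_each_netdev(net, dev) {
 *			if (dev->flags & IFF_UP) {
 *				found = dev;
 *				break;
 *			}
 *		}
 *		read_unlock(&dev_base_lock);
 *		return found;			(no reference taken)
 *	}
 *
 * Writers instead take the rtnl lock and write_lock_bh(&dev_base_lock)
 * around the actual list updates, as list_netdevice() does below.
 */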
201
202static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203{
204	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206}
207
208static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209{
210	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211}
212
213static inline void rps_lock(struct softnet_data *sd)
214{
215#ifdef CONFIG_RPS
216	spin_lock(&sd->input_pkt_queue.lock);
217#endif
218}
219
220static inline void rps_unlock(struct softnet_data *sd)
221{
222#ifdef CONFIG_RPS
223	spin_unlock(&sd->input_pkt_queue.lock);
224#endif
225}
226
227/* Device list insertion */
228static int list_netdevice(struct net_device *dev)
229{
230	struct net *net = dev_net(dev);
231
232	ASSERT_RTNL();
233
234	write_lock_bh(&dev_base_lock);
235	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237	hlist_add_head_rcu(&dev->index_hlist,
238			   dev_index_hash(net, dev->ifindex));
239	write_unlock_bh(&dev_base_lock);
240	return 0;
241}
242
243/* Device list removal
244 * caller must respect an RCU grace period before freeing/reusing dev
245 */
246static void unlist_netdevice(struct net_device *dev)
247{
248	ASSERT_RTNL();
249
250	/* Unlink dev from the device chain */
251	write_lock_bh(&dev_base_lock);
252	list_del_rcu(&dev->dev_list);
253	hlist_del_rcu(&dev->name_hlist);
254	hlist_del_rcu(&dev->index_hlist);
255	write_unlock_bh(&dev_base_lock);
256}
257
258/*
259 *	Our notifier list
260 */
261
262static RAW_NOTIFIER_HEAD(netdev_chain);
263
264/*
265 *	Device drivers call our routines to queue packets here. We empty the
266 *	queue in the local softnet handler.
267 */
268
269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272#ifdef CONFIG_LOCKDEP
273/*
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
276 */
277static const unsigned short netdev_lock_type[] =
278	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293	 ARPHRD_VOID, ARPHRD_NONE};
294
295static const char *const netdev_lock_name[] =
296	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311	 "_xmit_VOID", "_xmit_NONE"};
312
313static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315
316static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317{
318	int i;
319
320	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321		if (netdev_lock_type[i] == dev_type)
322			return i;
323	/* the last key is used by default */
324	return ARRAY_SIZE(netdev_lock_type) - 1;
325}
326
327static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328						 unsigned short dev_type)
329{
330	int i;
331
332	i = netdev_lock_pos(dev_type);
333	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334				   netdev_lock_name[i]);
335}
336
337static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338{
339	int i;
340
341	i = netdev_lock_pos(dev->type);
342	lockdep_set_class_and_name(&dev->addr_list_lock,
343				   &netdev_addr_lock_key[i],
344				   netdev_lock_name[i]);
345}
346#else
347static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348						 unsigned short dev_type)
349{
350}
351static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352{
353}
354#endif
355
356/*******************************************************************************
357
358		Protocol management and registration routines
359
360*******************************************************************************/
361
362/*
363 *	Add a protocol ID to the list. Now that the input handler is
364 *	smarter we can dispense with all the messy stuff that used to be
365 *	here.
366 *
367 *	BEWARE!!! Protocol handlers that mangle input packets
368 *	MUST BE last in the hash buckets, and the checking of protocol handlers
369 *	MUST start from the promiscuous ptype_all chain in net_bh.
370 *	This still holds; do not change it.
371 *	Explanation: if a packet-mangling protocol handler were
372 *	first on the list, it could not tell that the packet
373 *	is cloned and should be copied-on-write, so it would
374 *	modify it in place and subsequent readers would get a broken packet.
375 *							--ANK (980803)
376 */
377
378/**
379 *	dev_add_pack - add packet handler
380 *	@pt: packet type declaration
381 *
382 *	Add a protocol handler to the networking stack. The passed &packet_type
383 *	is linked into kernel lists and may not be freed until it has been
384 *	removed from the kernel lists.
385 *
386 *	This call does not sleep, therefore it cannot
387 *	guarantee that CPUs in the middle of receiving packets
388 *	will see the new packet type (until the next received packet).
389 */
390
391void dev_add_pack(struct packet_type *pt)
392{
393	int hash;
394
395	spin_lock_bh(&ptype_lock);
396	if (pt->type == htons(ETH_P_ALL))
397		list_add_rcu(&pt->list, &ptype_all);
398	else {
399		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
400		list_add_rcu(&pt->list, &ptype_base[hash]);
401	}
402	spin_unlock_bh(&ptype_lock);
403}
404EXPORT_SYMBOL(dev_add_pack);
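/*
 * Usage sketch (hypothetical handler, not part of this file), assuming the
 * usual pairing with dev_remove_pack() at module exit:
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_pt __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ARP),
 *		.func	= example_rcv,
 *	};
 *
 *	dev_add_pack(&example_pt);	(module init)
 *	dev_remove_pack(&example_pt);	(module exit, may sleep)
 */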
405
406/**
407 *	__dev_remove_pack	 - remove packet handler
408 *	@pt: packet type declaration
409 *
410 *	Remove a protocol handler that was previously added to the kernel
411 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
412 *	from the kernel lists and can be freed or reused once this function
413 *	returns.
414 *
415 *      The packet type might still be in use by receivers
416 *	and must not be freed until after all the CPUs have gone
417 *	through a quiescent state.
418 */
419void __dev_remove_pack(struct packet_type *pt)
420{
421	struct list_head *head;
422	struct packet_type *pt1;
423
424	spin_lock_bh(&ptype_lock);
425
426	if (pt->type == htons(ETH_P_ALL))
427		head = &ptype_all;
428	else
429		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
430
431	list_for_each_entry(pt1, head, list) {
432		if (pt == pt1) {
433			list_del_rcu(&pt->list);
434			goto out;
435		}
436	}
437
438	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
439out:
440	spin_unlock_bh(&ptype_lock);
441}
442EXPORT_SYMBOL(__dev_remove_pack);
443
444/**
445 *	dev_remove_pack	 - remove packet handler
446 *	@pt: packet type declaration
447 *
448 *	Remove a protocol handler that was previously added to the kernel
449 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
450 *	from the kernel lists and can be freed or reused once this function
451 *	returns.
452 *
453 *	This call sleeps to guarantee that no CPU is looking at the packet
454 *	type after return.
455 */
456void dev_remove_pack(struct packet_type *pt)
457{
458	__dev_remove_pack(pt);
459
460	synchronize_net();
461}
462EXPORT_SYMBOL(dev_remove_pack);
463
464/******************************************************************************
465
466		      Device Boot-time Settings Routines
467
468*******************************************************************************/
469
470/* Boot time configuration table */
471static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
472
473/**
474 *	netdev_boot_setup_add	- add new setup entry
475 *	@name: name of the device
476 *	@map: configured settings for the device
477 *
478 *	Adds a new setup entry to the dev_boot_setup list.  The function
479 *	returns 0 on error and 1 on success.  This is a generic routine
480 *	for all netdevices.
481 */
482static int netdev_boot_setup_add(char *name, struct ifmap *map)
483{
484	struct netdev_boot_setup *s;
485	int i;
486
487	s = dev_boot_setup;
488	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
489		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
490			memset(s[i].name, 0, sizeof(s[i].name));
491			strlcpy(s[i].name, name, IFNAMSIZ);
492			memcpy(&s[i].map, map, sizeof(s[i].map));
493			break;
494		}
495	}
496
497	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
498}
499
500/**
501 *	netdev_boot_setup_check	- check boot time settings
502 *	@dev: the netdevice
503 *
504 * 	Check boot time settings for the device.
505 *	Any settings found are applied to the device for use
506 *	later during device probing.
507 *	Returns 0 if no settings were found, 1 if they were.
508 */
509int netdev_boot_setup_check(struct net_device *dev)
510{
511	struct netdev_boot_setup *s = dev_boot_setup;
512	int i;
513
514	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
515		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
516		    !strcmp(dev->name, s[i].name)) {
517			dev->irq 	= s[i].map.irq;
518			dev->base_addr 	= s[i].map.base_addr;
519			dev->mem_start 	= s[i].map.mem_start;
520			dev->mem_end 	= s[i].map.mem_end;
521			return 1;
522		}
523	}
524	return 0;
525}
526EXPORT_SYMBOL(netdev_boot_setup_check);
527
528
529/**
530 *	netdev_boot_base	- get address from boot time settings
531 *	@prefix: prefix for network device
532 *	@unit: id for network device
533 *
534 * 	Check boot time settings for the base address of the device.
535 *	Any settings found are applied to the device for use
536 *	later during device probing.
537 *	Returns 0 if no settings were found.
538 */
539unsigned long netdev_boot_base(const char *prefix, int unit)
540{
541	const struct netdev_boot_setup *s = dev_boot_setup;
542	char name[IFNAMSIZ];
543	int i;
544
545	sprintf(name, "%s%d", prefix, unit);
546
547	/*
548	 * If the device is already registered then return a base of 1
549	 * to indicate not to probe for this interface
550	 */
551	if (__dev_get_by_name(&init_net, name))
552		return 1;
553
554	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
555		if (!strcmp(name, s[i].name))
556			return s[i].map.base_addr;
557	return 0;
558}
559
560/*
561 * Saves boot-time configured settings for any netdevice.
562 */
563int __init netdev_boot_setup(char *str)
564{
565	int ints[5];
566	struct ifmap map;
567
568	str = get_options(str, ARRAY_SIZE(ints), ints);
569	if (!str || !*str)
570		return 0;
571
572	/* Save settings */
573	memset(&map, 0, sizeof(map));
574	if (ints[0] > 0)
575		map.irq = ints[1];
576	if (ints[0] > 1)
577		map.base_addr = ints[2];
578	if (ints[0] > 2)
579		map.mem_start = ints[3];
580	if (ints[0] > 3)
581		map.mem_end = ints[4];
582
583	/* Add new entry to the list */
584	return netdev_boot_setup_add(str, &map);
585}
586
587__setup("netdev=", netdev_boot_setup);
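/*
 * Example (sketch): booting with
 *
 *	netdev=5,0x300,0,0,eth0
 *
 * stores irq=5, base_addr=0x300, mem_start=0, mem_end=0 for "eth0" via
 * netdev_boot_setup_add(); a later netdev_boot_setup_check() on a device
 * named "eth0" then copies those values into the net_device.
 */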
588
589/*******************************************************************************
590
591			    Device Interface Subroutines
592
593*******************************************************************************/
594
595/**
596 *	__dev_get_by_name	- find a device by its name
597 *	@net: the applicable net namespace
598 *	@name: name to find
599 *
600 *	Find an interface by name. Must be called under RTNL semaphore
601 *	or @dev_base_lock. If the name is found a pointer to the device
602 *	is returned. If the name is not found then %NULL is returned. The
603 *	reference counters are not incremented so the caller must be
604 *	careful with locks.
605 */
606
607struct net_device *__dev_get_by_name(struct net *net, const char *name)
608{
609	struct hlist_node *p;
610	struct net_device *dev;
611	struct hlist_head *head = dev_name_hash(net, name);
612
613	hlist_for_each_entry(dev, p, head, name_hlist)
614		if (!strncmp(dev->name, name, IFNAMSIZ))
615			return dev;
616
617	return NULL;
618}
619EXPORT_SYMBOL(__dev_get_by_name);
620
621/**
622 *	dev_get_by_name_rcu	- find a device by its name
623 *	@net: the applicable net namespace
624 *	@name: name to find
625 *
626 *	Find an interface by name.
627 *	If the name is found a pointer to the device is returned.
628 * 	If the name is not found then %NULL is returned.
629 *	The reference counters are not incremented so the caller must be
630 *	careful with locks. The caller must hold RCU lock.
631 */
632
633struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
634{
635	struct hlist_node *p;
636	struct net_device *dev;
637	struct hlist_head *head = dev_name_hash(net, name);
638
639	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
640		if (!strncmp(dev->name, name, IFNAMSIZ))
641			return dev;
642
643	return NULL;
644}
645EXPORT_SYMBOL(dev_get_by_name_rcu);
646
647/**
648 *	dev_get_by_name		- find a device by its name
649 *	@net: the applicable net namespace
650 *	@name: name to find
651 *
652 *	Find an interface by name. This can be called from any
653 *	context and does its own locking. The returned handle has
654 *	the usage count incremented and the caller must use dev_put() to
655 *	release it when it is no longer needed. %NULL is returned if no
656 *	matching device is found.
657 */
658
659struct net_device *dev_get_by_name(struct net *net, const char *name)
660{
661	struct net_device *dev;
662
663	rcu_read_lock();
664	dev = dev_get_by_name_rcu(net, name);
665	if (dev)
666		dev_hold(dev);
667	rcu_read_unlock();
668	return dev;
669}
670EXPORT_SYMBOL(dev_get_by_name);
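/*
 * Usage sketch (illustration only): the refcounted variant must be paired
 * with dev_put() once the caller is done with the device.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 */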
671
672/**
673 *	__dev_get_by_index - find a device by its ifindex
674 *	@net: the applicable net namespace
675 *	@ifindex: index of device
676 *
677 *	Search for an interface by index. Returns a pointer to the device,
678 *	or %NULL if the device is not found. The device has not
679 *	had its reference counter increased so the caller must be careful
680 *	about locking. The caller must hold either the RTNL semaphore
681 *	or @dev_base_lock.
682 */
683
684struct net_device *__dev_get_by_index(struct net *net, int ifindex)
685{
686	struct hlist_node *p;
687	struct net_device *dev;
688	struct hlist_head *head = dev_index_hash(net, ifindex);
689
690	hlist_for_each_entry(dev, p, head, index_hlist)
691		if (dev->ifindex == ifindex)
692			return dev;
693
694	return NULL;
695}
696EXPORT_SYMBOL(__dev_get_by_index);
697
698/**
699 *	dev_get_by_index_rcu - find a device by its ifindex
700 *	@net: the applicable net namespace
701 *	@ifindex: index of device
702 *
703 *	Search for an interface by index. Returns a pointer to the device,
704 *	or %NULL if the device is not found. The device has not
705 *	had its reference counter increased so the caller must be careful
706 *	about locking. The caller must hold RCU lock.
707 */
708
709struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
710{
711	struct hlist_node *p;
712	struct net_device *dev;
713	struct hlist_head *head = dev_index_hash(net, ifindex);
714
715	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
716		if (dev->ifindex == ifindex)
717			return dev;
718
719	return NULL;
720}
721EXPORT_SYMBOL(dev_get_by_index_rcu);
722
723
724/**
725 *	dev_get_by_index - find a device by its ifindex
726 *	@net: the applicable net namespace
727 *	@ifindex: index of device
728 *
729 *	Search for an interface by index. Returns a pointer to the device,
730 *	or NULL if the device is not found. The device returned has
731 *	had a reference added and the pointer is safe until the user calls
732 *	dev_put to indicate they have finished with it.
733 */
734
735struct net_device *dev_get_by_index(struct net *net, int ifindex)
736{
737	struct net_device *dev;
738
739	rcu_read_lock();
740	dev = dev_get_by_index_rcu(net, ifindex);
741	if (dev)
742		dev_hold(dev);
743	rcu_read_unlock();
744	return dev;
745}
746EXPORT_SYMBOL(dev_get_by_index);
747
748/**
749 *	dev_getbyhwaddr - find a device by its hardware address
750 *	@net: the applicable net namespace
751 *	@type: media type of device
752 *	@ha: hardware address
753 *
754 *	Search for an interface by MAC address. Returns a pointer to the
755 *	device, or NULL if the device is not found. The caller must hold the
756 *	rtnl semaphore. The returned device has not had its ref count increased
757 *	and the caller must therefore be careful about locking.
758 *
759 *	BUGS:
760 *	If the API was consistent this would be __dev_get_by_hwaddr
761 */
762
763struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
764{
765	struct net_device *dev;
766
767	ASSERT_RTNL();
768
769	for_each_netdev(net, dev)
770		if (dev->type == type &&
771		    !memcmp(dev->dev_addr, ha, dev->addr_len))
772			return dev;
773
774	return NULL;
775}
776EXPORT_SYMBOL(dev_getbyhwaddr);
777
778struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
779{
780	struct net_device *dev;
781
782	ASSERT_RTNL();
783	for_each_netdev(net, dev)
784		if (dev->type == type)
785			return dev;
786
787	return NULL;
788}
789EXPORT_SYMBOL(__dev_getfirstbyhwtype);
790
791struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
792{
793	struct net_device *dev, *ret = NULL;
794
795	rcu_read_lock();
796	for_each_netdev_rcu(net, dev)
797		if (dev->type == type) {
798			dev_hold(dev);
799			ret = dev;
800			break;
801		}
802	rcu_read_unlock();
803	return ret;
804}
805EXPORT_SYMBOL(dev_getfirstbyhwtype);
806
807/**
808 *	dev_get_by_flags_rcu - find any device with given flags
809 *	@net: the applicable net namespace
810 *	@if_flags: IFF_* values
811 *	@mask: bitmask of bits in if_flags to check
812 *
813 *	Search for any interface with the given flags. Returns a pointer to
814 *	the device, or NULL if no matching device is found. Must be called
815 *	inside rcu_read_lock(); the result's refcount is unchanged.
816 */
817
818struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
819				    unsigned short mask)
820{
821	struct net_device *dev, *ret;
822
823	ret = NULL;
824	for_each_netdev_rcu(net, dev) {
825		if (((dev->flags ^ if_flags) & mask) == 0) {
826			ret = dev;
827			break;
828		}
829	}
830	return ret;
831}
832EXPORT_SYMBOL(dev_get_by_flags_rcu);
833
834/**
835 *	dev_valid_name - check if name is okay for network device
836 *	@name: name string
837 *
838 *	Network device names need to be valid file names
839 *	to allow sysfs to work.  We also disallow any kind of
840 *	whitespace.
841 */
842int dev_valid_name(const char *name)
843{
844	if (*name == '\0')
845		return 0;
846	if (strlen(name) >= IFNAMSIZ)
847		return 0;
848	if (!strcmp(name, ".") || !strcmp(name, ".."))
849		return 0;
850
851	while (*name) {
852		if (*name == '/' || isspace(*name))
853			return 0;
854		name++;
855	}
856	return 1;
857}
858EXPORT_SYMBOL(dev_valid_name);
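/*
 * For illustration: "eth0", "wlan0" and "br-lan" are valid names, while "",
 * ".", "..", "eth 0" (whitespace) and "eth/0" (slash) are rejected, as is
 * any name of IFNAMSIZ characters or more.
 */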
859
860/**
861 *	__dev_alloc_name - allocate a name for a device
862 *	@net: network namespace to allocate the device name in
863 *	@name: name format string
864 *	@buf:  scratch buffer and result name string
865 *
866 *	Passed a format string - eg "lt%d" - it will try to find a suitable
867 *	id. It scans the list of devices to build up a free map, then chooses
868 *	the first empty slot. The caller must hold the dev_base or rtnl lock
869 *	while allocating the name and adding the device in order to avoid
870 *	duplicates.
871 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
872 *	Returns the number of the unit assigned or a negative errno code.
873 */
874
875static int __dev_alloc_name(struct net *net, const char *name, char *buf)
876{
877	int i = 0;
878	const char *p;
879	const int max_netdevices = 8*PAGE_SIZE;
880	unsigned long *inuse;
881	struct net_device *d;
882
883	p = strnchr(name, IFNAMSIZ-1, '%');
884	if (p) {
885		/*
886		 * Verify the string as this thing may have come from
887		 * the user.  There must be exactly one "%d" and no other "%"
888		 * characters.
889		 */
890		if (p[1] != 'd' || strchr(p + 2, '%'))
891			return -EINVAL;
892
893		/* Use one page as a bit array of possible slots */
894		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
895		if (!inuse)
896			return -ENOMEM;
897
898		for_each_netdev(net, d) {
899			if (!sscanf(d->name, name, &i))
900				continue;
901			if (i < 0 || i >= max_netdevices)
902				continue;
903
904			/*  avoid cases where sscanf is not an exact inverse of printf */
905			snprintf(buf, IFNAMSIZ, name, i);
906			if (!strncmp(buf, d->name, IFNAMSIZ))
907				set_bit(i, inuse);
908		}
909
910		i = find_first_zero_bit(inuse, max_netdevices);
911		free_page((unsigned long) inuse);
912	}
913
914	if (buf != name)
915		snprintf(buf, IFNAMSIZ, name, i);
916	if (!__dev_get_by_name(net, buf))
917		return i;
918
919	/* It is possible to run out of possible slots
920	 * when the name is long and there isn't enough space left
921	 * for the digits, or if all bits are used.
922	 */
923	return -ENFILE;
924}
925
926/**
927 *	dev_alloc_name - allocate a name for a device
928 *	@dev: device
929 *	@name: name format string
930 *
931 *	Passed a format string - eg "lt%d" - it will try to find a suitable
932 *	id. It scans the list of devices to build up a free map, then chooses
933 *	the first empty slot. The caller must hold the dev_base or rtnl lock
934 *	while allocating the name and adding the device in order to avoid
935 *	duplicates.
936 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
937 *	Returns the number of the unit assigned or a negative errno code.
938 */
939
940int dev_alloc_name(struct net_device *dev, const char *name)
941{
942	char buf[IFNAMSIZ];
943	struct net *net;
944	int ret;
945
946	BUG_ON(!dev_net(dev));
947	net = dev_net(dev);
948	ret = __dev_alloc_name(net, name, buf);
949	if (ret >= 0)
950		strlcpy(dev->name, buf, IFNAMSIZ);
951	return ret;
952}
953EXPORT_SYMBOL(dev_alloc_name);
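/*
 * Usage sketch (hypothetical): with "eth0" and "eth1" already registered in
 * the namespace, something like
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *
 * would write "eth2" into dev->name and return 2; a negative errno is
 * returned if no free unit number can be found.
 */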
954
955static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
956{
957	struct net *net;
958
959	BUG_ON(!dev_net(dev));
960	net = dev_net(dev);
961
962	if (!dev_valid_name(name))
963		return -EINVAL;
964
965	if (fmt && strchr(name, '%'))
966		return dev_alloc_name(dev, name);
967	else if (__dev_get_by_name(net, name))
968		return -EEXIST;
969	else if (dev->name != name)
970		strlcpy(dev->name, name, IFNAMSIZ);
971
972	return 0;
973}
974
975/**
976 *	dev_change_name - change name of a device
977 *	@dev: device
978 *	@newname: name (or format string) must be at least IFNAMSIZ
979 *
980 *	Change the name of a device. A format string such as "eth%d"
981 *	may be passed for wildcarding.
982 */
983int dev_change_name(struct net_device *dev, const char *newname)
984{
985	char oldname[IFNAMSIZ];
986	int err = 0;
987	int ret;
988	struct net *net;
989
990	ASSERT_RTNL();
991	BUG_ON(!dev_net(dev));
992
993	net = dev_net(dev);
994	if (dev->flags & IFF_UP)
995		return -EBUSY;
996
997	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
998		return 0;
999
1000	memcpy(oldname, dev->name, IFNAMSIZ);
1001
1002	err = dev_get_valid_name(dev, newname, 1);
1003	if (err < 0)
1004		return err;
1005
1006rollback:
1007	ret = device_rename(&dev->dev, dev->name);
1008	if (ret) {
1009		memcpy(dev->name, oldname, IFNAMSIZ);
1010		return ret;
1011	}
1012
1013	write_lock_bh(&dev_base_lock);
1014	hlist_del(&dev->name_hlist);
1015	write_unlock_bh(&dev_base_lock);
1016
1017	synchronize_rcu();
1018
1019	write_lock_bh(&dev_base_lock);
1020	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1021	write_unlock_bh(&dev_base_lock);
1022
1023	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1024	ret = notifier_to_errno(ret);
1025
1026	if (ret) {
1027		/* err >= 0 after dev_alloc_name() or stores the first errno */
1028		if (err >= 0) {
1029			err = ret;
1030			memcpy(dev->name, oldname, IFNAMSIZ);
1031			goto rollback;
1032		} else {
1033			printk(KERN_ERR
1034			       "%s: name change rollback failed: %d.\n",
1035			       dev->name, ret);
1036		}
1037	}
1038
1039	return err;
1040}
1041
1042/**
1043 *	dev_set_alias - change ifalias of a device
1044 *	@dev: device
1045 *	@alias: name up to IFALIASZ
1046 *	@len: limit of bytes to copy from info
1047 *
1048 *	Set the ifalias for a device.
1049 */
1050int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1051{
1052	ASSERT_RTNL();
1053
1054	if (len >= IFALIASZ)
1055		return -EINVAL;
1056
1057	if (!len) {
1058		if (dev->ifalias) {
1059			kfree(dev->ifalias);
1060			dev->ifalias = NULL;
1061		}
1062		return 0;
1063	}
1064
1065	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1066	if (!dev->ifalias)
1067		return -ENOMEM;
1068
1069	strlcpy(dev->ifalias, alias, len+1);
1070	return len;
1071}
1072
1073
1074/**
1075 *	netdev_features_change - device changes features
1076 *	@dev: device to cause notification
1077 *
1078 *	Called to indicate a device has changed features.
1079 */
1080void netdev_features_change(struct net_device *dev)
1081{
1082	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1083}
1084EXPORT_SYMBOL(netdev_features_change);
1085
1086/**
1087 *	netdev_state_change - device changes state
1088 *	@dev: device to cause notification
1089 *
1090 *	Called to indicate a device has changed state. This function calls
1091 *	the notifier chains for netdev_chain and sends a NEWLINK message
1092 *	to the routing socket.
1093 */
1094void netdev_state_change(struct net_device *dev)
1095{
1096	if (dev->flags & IFF_UP) {
1097		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1098		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1099	}
1100}
1101EXPORT_SYMBOL(netdev_state_change);
1102
1103int netdev_bonding_change(struct net_device *dev, unsigned long event)
1104{
1105	return call_netdevice_notifiers(event, dev);
1106}
1107EXPORT_SYMBOL(netdev_bonding_change);
1108
1109/**
1110 *	dev_load 	- load a network module
1111 *	@net: the applicable net namespace
1112 *	@name: name of interface
1113 *
1114 *	If a network interface is not present and the process has suitable
1115 *	privileges, this function loads the module. If module loading is not
1116 *	available in this kernel then it becomes a nop.
1117 */
1118
1119void dev_load(struct net *net, const char *name)
1120{
1121	struct net_device *dev;
1122
1123	rcu_read_lock();
1124	dev = dev_get_by_name_rcu(net, name);
1125	rcu_read_unlock();
1126
1127	if (!dev && capable(CAP_NET_ADMIN))
1128		request_module("%s", name);
1129}
1130EXPORT_SYMBOL(dev_load);
1131
1132static int __dev_open(struct net_device *dev)
1133{
1134	const struct net_device_ops *ops = dev->netdev_ops;
1135	int ret;
1136
1137	ASSERT_RTNL();
1138
1139	/*
1140	 *	Is it even present?
1141	 */
1142	if (!netif_device_present(dev))
1143		return -ENODEV;
1144
1145	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1146	ret = notifier_to_errno(ret);
1147	if (ret)
1148		return ret;
1149
1150	/*
1151	 *	Call device private open method
1152	 */
1153	set_bit(__LINK_STATE_START, &dev->state);
1154
1155	if (ops->ndo_validate_addr)
1156		ret = ops->ndo_validate_addr(dev);
1157
1158	if (!ret && ops->ndo_open)
1159		ret = ops->ndo_open(dev);
1160
1161	/*
1162	 *	If it went open OK then:
1163	 */
1164
1165	if (ret)
1166		clear_bit(__LINK_STATE_START, &dev->state);
1167	else {
1168		/*
1169		 *	Set the flags.
1170		 */
1171		dev->flags |= IFF_UP;
1172
1173		/*
1174		 *	Enable NET_DMA
1175		 */
1176		net_dmaengine_get();
1177
1178		/*
1179		 *	Initialize multicasting status
1180		 */
1181		dev_set_rx_mode(dev);
1182
1183		/*
1184		 *	Wakeup transmit queue engine
1185		 */
1186		dev_activate(dev);
1187	}
1188
1189	return ret;
1190}
1191
1192/**
1193 *	dev_open	- prepare an interface for use.
1194 *	@dev:	device to open
1195 *
1196 *	Takes a device from down to up state. The device's private open
1197 *	function is invoked and then the multicast lists are loaded. Finally
1198 *	the device is moved into the up state and a %NETDEV_UP message is
1199 *	sent to the netdev notifier chain.
1200 *
1201 *	Calling this function on an active interface is a nop. On a failure
1202 *	a negative errno code is returned.
1203 */
1204int dev_open(struct net_device *dev)
1205{
1206	int ret;
1207
1208	/*
1209	 *	Is it already up?
1210	 */
1211	if (dev->flags & IFF_UP)
1212		return 0;
1213
1214	/*
1215	 *	Open device
1216	 */
1217	ret = __dev_open(dev);
1218	if (ret < 0)
1219		return ret;
1220
1221	/*
1222	 *	... and announce new interface.
1223	 */
1224	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1225	call_netdevice_notifiers(NETDEV_UP, dev);
1226
1227	return ret;
1228}
1229EXPORT_SYMBOL(dev_open);
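/*
 * Usage sketch (illustration only): callers bring an interface up under the
 * rtnl lock, e.g.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */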
1230
1231static int __dev_close(struct net_device *dev)
1232{
1233	const struct net_device_ops *ops = dev->netdev_ops;
1234
1235	ASSERT_RTNL();
1236	might_sleep();
1237
1238	/*
1239	 *	Tell people we are going down, so that they can
1240	 *	prepare for death while the device is still operating.
1241	 */
1242	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1243
1244	clear_bit(__LINK_STATE_START, &dev->state);
1245
1246	/* Synchronize to the scheduled poll. We cannot touch the poll list;
1247	 * it may even be on a different CPU. So just clear netif_running().
1248	 *
1249	 * dev->stop() will invoke napi_disable() on all of its
1250	 * napi_struct instances on this device.
1251	 */
1252	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1253
1254	dev_deactivate(dev);
1255
1256	/*
1257	 *	Call the device specific close. This cannot fail.
1258	 *	It is only called if the device is UP.
1259	 *
1260	 *	We allow it to be called even after a DETACH hot-plug
1261	 *	event.
1262	 */
1263	if (ops->ndo_stop)
1264		ops->ndo_stop(dev);
1265
1266	/*
1267	 *	Device is now down.
1268	 */
1269
1270	dev->flags &= ~IFF_UP;
1271
1272	/*
1273	 *	Shutdown NET_DMA
1274	 */
1275	net_dmaengine_put();
1276
1277	return 0;
1278}
1279
1280/**
1281 *	dev_close - shutdown an interface.
1282 *	@dev: device to shutdown
1283 *
1284 *	This function moves an active device into down state. A
1285 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1286 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1287 *	chain.
1288 */
1289int dev_close(struct net_device *dev)
1290{
1291	if (!(dev->flags & IFF_UP))
1292		return 0;
1293
1294	__dev_close(dev);
1295
1296	/*
1297	 * Tell people we are down
1298	 */
1299	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1300	call_netdevice_notifiers(NETDEV_DOWN, dev);
1301
1302	return 0;
1303}
1304EXPORT_SYMBOL(dev_close);
1305
1306
1307/**
1308 *	dev_disable_lro - disable Large Receive Offload on a device
1309 *	@dev: device
1310 *
1311 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1312 *	called under RTNL.  This is needed if received packets may be
1313 *	forwarded to another interface.
1314 */
1315void dev_disable_lro(struct net_device *dev)
1316{
1317	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1318	    dev->ethtool_ops->set_flags) {
1319		u32 flags = dev->ethtool_ops->get_flags(dev);
1320		if (flags & ETH_FLAG_LRO) {
1321			flags &= ~ETH_FLAG_LRO;
1322			dev->ethtool_ops->set_flags(dev, flags);
1323		}
1324	}
1325	WARN_ON(dev->features & NETIF_F_LRO);
1326}
1327EXPORT_SYMBOL(dev_disable_lro);
1328
1329
1330static int dev_boot_phase = 1;
1331
1332/*
1333 *	Device change register/unregister. These are not inline or static
1334 *	as we export them to the world.
1335 */
1336
1337/**
1338 *	register_netdevice_notifier - register a network notifier block
1339 *	@nb: notifier
1340 *
1341 *	Register a notifier to be called when network device events occur.
1342 *	The notifier passed is linked into the kernel structures and must
1343 *	not be reused until it has been unregistered. A negative errno code
1344 *	is returned on a failure.
1345 *
1346 * 	When registered, all registration and up events are replayed
1347 *	to the new notifier to give it a race-free
1348 *	view of the network device list.
1349 */
1350
1351int register_netdevice_notifier(struct notifier_block *nb)
1352{
1353	struct net_device *dev;
1354	struct net_device *last;
1355	struct net *net;
1356	int err;
1357
1358	rtnl_lock();
1359	err = raw_notifier_chain_register(&netdev_chain, nb);
1360	if (err)
1361		goto unlock;
1362	if (dev_boot_phase)
1363		goto unlock;
1364	for_each_net(net) {
1365		for_each_netdev(net, dev) {
1366			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1367			err = notifier_to_errno(err);
1368			if (err)
1369				goto rollback;
1370
1371			if (!(dev->flags & IFF_UP))
1372				continue;
1373
1374			nb->notifier_call(nb, NETDEV_UP, dev);
1375		}
1376	}
1377
1378unlock:
1379	rtnl_unlock();
1380	return err;
1381
1382rollback:
1383	last = dev;
1384	for_each_net(net) {
1385		for_each_netdev(net, dev) {
1386			if (dev == last)
1387				break;
1388
1389			if (dev->flags & IFF_UP) {
1390				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1391				nb->notifier_call(nb, NETDEV_DOWN, dev);
1392			}
1393			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1394			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1395		}
1396	}
1397
1398	raw_notifier_chain_unregister(&netdev_chain, nb);
1399	goto unlock;
1400}
1401EXPORT_SYMBOL(register_netdevice_notifier);
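/*
 * Usage sketch (hypothetical notifier, not part of this file):
 *
 *	static int example_event(struct notifier_block *nb,
 *				 unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_DEBUG "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_event,
 *	};
 *
 *	register_netdevice_notifier(&example_nb);
 *
 * Because REGISTER/UP events are replayed (see above), the callback also
 * fires for devices that already existed when the notifier was registered.
 */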
1402
1403/**
1404 *	unregister_netdevice_notifier - unregister a network notifier block
1405 *	@nb: notifier
1406 *
1407 *	Unregister a notifier previously registered by
1408 *	register_netdevice_notifier(). The notifier is unlinked from the
1409 *	kernel structures and may then be reused. A negative errno code
1410 *	is returned on a failure.
1411 */
1412
1413int unregister_netdevice_notifier(struct notifier_block *nb)
1414{
1415	int err;
1416
1417	rtnl_lock();
1418	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1419	rtnl_unlock();
1420	return err;
1421}
1422EXPORT_SYMBOL(unregister_netdevice_notifier);
1423
1424/**
1425 *	call_netdevice_notifiers - call all network notifier blocks
1426 *      @val: value passed unmodified to notifier function
1427 *      @dev: net_device pointer passed unmodified to notifier function
1428 *
1429 *	Call all network notifier blocks.  Parameters and return value
1430 *	are as for raw_notifier_call_chain().
1431 */
1432
1433int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1434{
1435	ASSERT_RTNL();
1436	return raw_notifier_call_chain(&netdev_chain, val, dev);
1437}
1438
1439/* When > 0 there are consumers of rx skb time stamps */
1440static atomic_t netstamp_needed = ATOMIC_INIT(0);
1441
1442void net_enable_timestamp(void)
1443{
1444	atomic_inc(&netstamp_needed);
1445}
1446EXPORT_SYMBOL(net_enable_timestamp);
1447
1448void net_disable_timestamp(void)
1449{
1450	atomic_dec(&netstamp_needed);
1451}
1452EXPORT_SYMBOL(net_disable_timestamp);
1453
1454static inline void net_timestamp_set(struct sk_buff *skb)
1455{
1456	if (atomic_read(&netstamp_needed))
1457		__net_timestamp(skb);
1458	else
1459		skb->tstamp.tv64 = 0;
1460}
1461
1462static inline void net_timestamp_check(struct sk_buff *skb)
1463{
1464	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1465		__net_timestamp(skb);
1466}
1467
1468/**
1469 * dev_forward_skb - loopback an skb to another netif
1470 *
1471 * @dev: destination network device
1472 * @skb: buffer to forward
1473 *
1474 * return values:
1475 *	NET_RX_SUCCESS	(no congestion)
1476 *	NET_RX_DROP     (packet was dropped, but freed)
1477 *
1478 * dev_forward_skb can be used for injecting an skb from the
1479 * start_xmit function of one device into the receive queue
1480 * of another device.
1481 *
1482 * The receiving device may be in another namespace, so
1483 * we have to clear all information in the skb that could
1484 * impact namespace isolation.
1485 */
1486int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1487{
1488	skb_orphan(skb);
1489	nf_reset(skb);
1490
1491	if (!(dev->flags & IFF_UP) ||
1492	    (skb->len > (dev->mtu + dev->hard_header_len))) {
1493		kfree_skb(skb);
1494		return NET_RX_DROP;
1495	}
1496	skb_set_dev(skb, dev);
1497	skb->tstamp.tv64 = 0;
1498	skb->pkt_type = PACKET_HOST;
1499	skb->protocol = eth_type_trans(skb, dev);
1500	return netif_rx(skb);
1501}
1502EXPORT_SYMBOL_GPL(dev_forward_skb);
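/*
 * Usage sketch (illustration only): a paired virtual device, veth-style,
 * could hand frames to its peer from its ndo_start_xmit() roughly as
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = ...;	(driver-private lookup)
 *
 *		dev_forward_skb(peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 *
 * dev_forward_skb() consumes the skb in all cases, so the caller must not
 * touch it afterwards.
 */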
1503
1504/*
1505 *	Support routine. Sends outgoing frames to any network
1506 *	taps currently in use.
1507 */
1508
1509static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1510{
1511	struct packet_type *ptype;
1512
1513#ifdef CONFIG_NET_CLS_ACT
1514	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1515		net_timestamp_set(skb);
1516#else
1517	net_timestamp_set(skb);
1518#endif
1519
1520	rcu_read_lock();
1521	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1522		/* Never send packets back to the socket
1523		 * they originated from - MvS (miquels@drinkel.ow.org)
1524		 */
1525		if ((ptype->dev == dev || !ptype->dev) &&
1526		    (ptype->af_packet_priv == NULL ||
1527		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1528			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1529			if (!skb2)
1530				break;
1531
1532			/* skb->nh should be correctly
1533			   set by sender, so that the second statement is
1534			   just protection against buggy protocols.
1535			 */
1536			skb_reset_mac_header(skb2);
1537
1538			if (skb_network_header(skb2) < skb2->data ||
1539			    skb2->network_header > skb2->tail) {
1540				if (net_ratelimit())
1541					printk(KERN_CRIT "protocol %04x is "
1542					       "buggy, dev %s\n",
1543					       ntohs(skb2->protocol),
1544					       dev->name);
1545				skb_reset_network_header(skb2);
1546			}
1547
1548			skb2->transport_header = skb2->network_header;
1549			skb2->pkt_type = PACKET_OUTGOING;
1550			ptype->func(skb2, skb->dev, ptype, skb->dev);
1551		}
1552	}
1553	rcu_read_unlock();
1554}
1555
1556/*
1557 * Routine to help set real_num_tx_queues. To avoid skbs being mapped to
1558 * queues greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1559 */
1560void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1561{
1562	unsigned int real_num = dev->real_num_tx_queues;
1563
1564	if (unlikely(txq > dev->num_tx_queues))
1565		;	/* silently ignore requests beyond the allocated queues */
1566	else if (txq > real_num)
1567		dev->real_num_tx_queues = txq;
1568	else if (txq < real_num) {
1569		dev->real_num_tx_queues = txq;
1570		qdisc_reset_all_tx_gt(dev, txq);
1571	}
1572}
1573EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1574
1575static inline void __netif_reschedule(struct Qdisc *q)
1576{
1577	struct softnet_data *sd;
1578	unsigned long flags;
1579
1580	local_irq_save(flags);
1581	sd = &__get_cpu_var(softnet_data);
1582	q->next_sched = NULL;
1583	*sd->output_queue_tailp = q;
1584	sd->output_queue_tailp = &q->next_sched;
1585	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1586	local_irq_restore(flags);
1587}
1588
1589void __netif_schedule(struct Qdisc *q)
1590{
1591	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1592		__netif_reschedule(q);
1593}
1594EXPORT_SYMBOL(__netif_schedule);
1595
1596void dev_kfree_skb_irq(struct sk_buff *skb)
1597{
1598	if (atomic_dec_and_test(&skb->users)) {
1599		struct softnet_data *sd;
1600		unsigned long flags;
1601
1602		local_irq_save(flags);
1603		sd = &__get_cpu_var(softnet_data);
1604		skb->next = sd->completion_queue;
1605		sd->completion_queue = skb;
1606		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1607		local_irq_restore(flags);
1608	}
1609}
1610EXPORT_SYMBOL(dev_kfree_skb_irq);
1611
1612void dev_kfree_skb_any(struct sk_buff *skb)
1613{
1614	if (in_irq() || irqs_disabled())
1615		dev_kfree_skb_irq(skb);
1616	else
1617		dev_kfree_skb(skb);
1618}
1619EXPORT_SYMBOL(dev_kfree_skb_any);
1620
1621
1622/**
1623 * netif_device_detach - mark device as removed
1624 * @dev: network device
1625 *
1626 * Mark device as removed from system and therefore no longer available.
1627 */
1628void netif_device_detach(struct net_device *dev)
1629{
1630	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1631	    netif_running(dev)) {
1632		netif_tx_stop_all_queues(dev);
1633	}
1634}
1635EXPORT_SYMBOL(netif_device_detach);
1636
1637/**
1638 * netif_device_attach - mark device as attached
1639 * @dev: network device
1640 *
1641 * Mark the device as attached to the system and restart it if needed.
1642 */
1643void netif_device_attach(struct net_device *dev)
1644{
1645	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1646	    netif_running(dev)) {
1647		netif_tx_wake_all_queues(dev);
1648		__netdev_watchdog_up(dev);
1649	}
1650}
1651EXPORT_SYMBOL(netif_device_attach);
1652
1653static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1654{
1655	return ((features & NETIF_F_NO_CSUM) ||
1656		((features & NETIF_F_V4_CSUM) &&
1657		 protocol == htons(ETH_P_IP)) ||
1658		((features & NETIF_F_V6_CSUM) &&
1659		 protocol == htons(ETH_P_IPV6)) ||
1660		((features & NETIF_F_FCOE_CRC) &&
1661		 protocol == htons(ETH_P_FCOE)));
1662}
1663
1664static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1665{
1666	if (can_checksum_protocol(dev->features, skb->protocol))
1667		return true;
1668
1669	if (skb->protocol == htons(ETH_P_8021Q)) {
1670		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1671		if (can_checksum_protocol(dev->features & dev->vlan_features,
1672					  veh->h_vlan_encapsulated_proto))
1673			return true;
1674	}
1675
1676	return false;
1677}
1678
1679/**
1680 * skb_dev_set -- assign a new device to a buffer
1681 * @skb: buffer for the new device
1682 * @dev: network device
1683 *
1684 * If an skb is owned by a device already, we have to reset
1685 * all data private to the namespace a device belongs to
1686 * before assigning it a new device.
1687 */
1688#ifdef CONFIG_NET_NS
1689void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1690{
1691	skb_dst_drop(skb);
1692	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1693		secpath_reset(skb);
1694		nf_reset(skb);
1695		skb_init_secmark(skb);
1696		skb->mark = 0;
1697		skb->priority = 0;
1698		skb->nf_trace = 0;
1699		skb->ipvs_property = 0;
1700#ifdef CONFIG_NET_SCHED
1701		skb->tc_index = 0;
1702#endif
1703	}
1704	skb->dev = dev;
1705}
1706EXPORT_SYMBOL(skb_set_dev);
1707#endif /* CONFIG_NET_NS */
1708
1709/*
1710 * Invalidate hardware checksum when packet is to be mangled, and
1711 * complete checksum manually on outgoing path.
1712 */
1713int skb_checksum_help(struct sk_buff *skb)
1714{
1715	__wsum csum;
1716	int ret = 0, offset;
1717
1718	if (skb->ip_summed == CHECKSUM_COMPLETE)
1719		goto out_set_summed;
1720
1721	if (unlikely(skb_shinfo(skb)->gso_size)) {
1722		/* Let GSO fix up the checksum. */
1723		goto out_set_summed;
1724	}
1725
1726	offset = skb->csum_start - skb_headroom(skb);
1727	BUG_ON(offset >= skb_headlen(skb));
1728	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1729
1730	offset += skb->csum_offset;
1731	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1732
1733	if (skb_cloned(skb) &&
1734	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1735		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1736		if (ret)
1737			goto out;
1738	}
1739
1740	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1741out_set_summed:
1742	skb->ip_summed = CHECKSUM_NONE;
1743out:
1744	return ret;
1745}
1746EXPORT_SYMBOL(skb_checksum_help);
1747
1748/**
1749 *	skb_gso_segment - Perform segmentation on skb.
1750 *	@skb: buffer to segment
1751 *	@features: features for the output path (see dev->features)
1752 *
1753 *	This function segments the given skb and returns a list of segments.
1754 *
1755 *	It may return NULL if the skb requires no segmentation.  This is
1756 *	only possible when GSO is used for verifying header integrity.
1757 */
1758struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1759{
1760	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1761	struct packet_type *ptype;
1762	__be16 type = skb->protocol;
1763	int err;
1764
1765	skb_reset_mac_header(skb);
1766	skb->mac_len = skb->network_header - skb->mac_header;
1767	__skb_pull(skb, skb->mac_len);
1768
1769	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1770		struct net_device *dev = skb->dev;
1771		struct ethtool_drvinfo info = {};
1772
1773		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1774			dev->ethtool_ops->get_drvinfo(dev, &info);
1775
1776		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1777			"ip_summed=%d",
1778		     info.driver, dev ? dev->features : 0L,
1779		     skb->sk ? skb->sk->sk_route_caps : 0L,
1780		     skb->len, skb->data_len, skb->ip_summed);
1781
1782		if (skb_header_cloned(skb) &&
1783		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1784			return ERR_PTR(err);
1785	}
1786
1787	rcu_read_lock();
1788	list_for_each_entry_rcu(ptype,
1789			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1790		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1791			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1792				err = ptype->gso_send_check(skb);
1793				segs = ERR_PTR(err);
1794				if (err || skb_gso_ok(skb, features))
1795					break;
1796				__skb_push(skb, (skb->data -
1797						 skb_network_header(skb)));
1798			}
1799			segs = ptype->gso_segment(skb, features);
1800			break;
1801		}
1802	}
1803	rcu_read_unlock();
1804
1805	__skb_push(skb, skb->data - skb_mac_header(skb));
1806
1807	return segs;
1808}
1809EXPORT_SYMBOL(skb_gso_segment);
1810
1811/* Take action when hardware reception checksum errors are detected. */
1812#ifdef CONFIG_BUG
1813void netdev_rx_csum_fault(struct net_device *dev)
1814{
1815	if (net_ratelimit()) {
1816		printk(KERN_ERR "%s: hw csum failure.\n",
1817			dev ? dev->name : "<unknown>");
1818		dump_stack();
1819	}
1820}
1821EXPORT_SYMBOL(netdev_rx_csum_fault);
1822#endif
1823
1824/* Actually, we should eliminate this check as soon as we know that:
1825 * 1. An IOMMU is present and allows mapping all the memory.
1826 * 2. No high memory really exists on this machine.
1827 */
1828
1829static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1830{
1831#ifdef CONFIG_HIGHMEM
1832	int i;
1833	if (!(dev->features & NETIF_F_HIGHDMA)) {
1834		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1835			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1836				return 1;
1837	}
1838
1839	if (PCI_DMA_BUS_IS_PHYS) {
1840		struct device *pdev = dev->dev.parent;
1841
1842		if (!pdev)
1843			return 0;
1844		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1845			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1846			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1847				return 1;
1848		}
1849	}
1850#endif
1851	return 0;
1852}
1853
1854struct dev_gso_cb {
1855	void (*destructor)(struct sk_buff *skb);
1856};
1857
1858#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1859
1860static void dev_gso_skb_destructor(struct sk_buff *skb)
1861{
1862	struct dev_gso_cb *cb;
1863
1864	do {
1865		struct sk_buff *nskb = skb->next;
1866
1867		skb->next = nskb->next;
1868		nskb->next = NULL;
1869		kfree_skb(nskb);
1870	} while (skb->next);
1871
1872	cb = DEV_GSO_CB(skb);
1873	if (cb->destructor)
1874		cb->destructor(skb);
1875}
1876
1877/**
1878 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1879 *	@skb: buffer to segment
1880 *
1881 *	This function segments the given skb and stores the list of segments
1882 *	in skb->next.
1883 */
1884static int dev_gso_segment(struct sk_buff *skb)
1885{
1886	struct net_device *dev = skb->dev;
1887	struct sk_buff *segs;
1888	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1889					 NETIF_F_SG : 0);
1890
1891	segs = skb_gso_segment(skb, features);
1892
1893	/* Verifying header integrity only. */
1894	if (!segs)
1895		return 0;
1896
1897	if (IS_ERR(segs))
1898		return PTR_ERR(segs);
1899
1900	skb->next = segs;
1901	DEV_GSO_CB(skb)->destructor = skb->destructor;
1902	skb->destructor = dev_gso_skb_destructor;
1903
1904	return 0;
1905}
1906
1907/*
1908 * Try to orphan skb early, right before transmission by the device.
1909 * We cannot orphan skb if tx timestamp is requested, since
1910 * drivers need to call skb_tstamp_tx() to send the timestamp.
1911 */
1912static inline void skb_orphan_try(struct sk_buff *skb)
1913{
1914	struct sock *sk = skb->sk;
1915
1916	if (sk && !skb_tx(skb)->flags) {
1917		/* skb_tx_hash() won't be able to get sk.
1918		 * We copy sk_hash into skb->rxhash
1919		 */
1920		if (!skb->rxhash)
1921			skb->rxhash = sk->sk_hash;
1922		skb_orphan(skb);
1923	}
1924}
1925
1926/*
1927 * Returns true if either:
1928 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
1929 *	2. skb is fragmented and the device does not support SG, or if
1930 *	   at least one of fragments is in highmem and device does not
1931 *	   support DMA from it.
1932 */
1933static inline int skb_needs_linearize(struct sk_buff *skb,
1934				      struct net_device *dev)
1935{
1936	return skb_is_nonlinear(skb) &&
1937	       ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
1938	        (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
1939					      illegal_highdma(dev, skb))));
1940}
1941
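/*
 * Transmit one skb (or, after GSO emulation, the list of segments hung
 * off skb->next) on the given device and queue.  Handles delivery to
 * taps, skb->dst release, orphaning, software GSO, linearization and
 * checksum completion before handing the buffer to ndo_start_xmit().
 */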
1942int BCMFASTPATH_HOST dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1943			struct netdev_queue *txq)
1944{
1945	const struct net_device_ops *ops = dev->netdev_ops;
1946	int rc = NETDEV_TX_OK;
1947
1948	if (likely(!skb->next)) {
1949		if (!list_empty(&ptype_all))
1950			dev_queue_xmit_nit(skb, dev);
1951
1952		/*
1953		 * If the device doesn't need skb->dst, release it right now while
1954		 * it's hot in this CPU's cache.
1955		 */
1956		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1957			skb_dst_drop(skb);
1958
1959		skb_orphan_try(skb);
1960
1961		if (netif_needs_gso(dev, skb)) {
1962			if (unlikely(dev_gso_segment(skb)))
1963				goto out_kfree_skb;
1964			if (skb->next)
1965				goto gso;
1966			else {
1967				DEV_GSO_CB(skb)->destructor = skb->destructor;
1968				skb->destructor = dev_gso_skb_destructor;
1969				goto out_kfree_gso_skb;
1970			}
1971		} else {
1972			if (skb_needs_linearize(skb, dev) &&
1973			    __skb_linearize(skb))
1974				goto out_kfree_skb;
1975
1976			/* If packet is not checksummed and device does not
1977			 * support checksumming for this protocol, complete
1978			 * checksumming here.
1979			 */
1980			if (skb->ip_summed == CHECKSUM_PARTIAL) {
1981				skb_set_transport_header(skb, skb->csum_start -
1982					      skb_headroom(skb));
1983				if (!dev_can_checksum(dev, skb) &&
1984				     skb_checksum_help(skb))
1985					goto out_kfree_skb;
1986			}
1987		}
1988
1989		rc = ops->ndo_start_xmit(skb, dev);
1990		if (rc == NETDEV_TX_OK)
1991			txq_trans_update(txq);
1992		return rc;
1993	}
1994
1995gso:
1996	do {
1997		struct sk_buff *nskb = skb->next;
1998
1999		skb->next = nskb->next;
2000		nskb->next = NULL;
2001
2002		/*
2003		 * If the device doesn't need nskb->dst, release it right now while
2004		 * it's hot in this CPU's cache.
2005		 */
2006		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2007			skb_dst_drop(nskb);
2008
2009		rc = ops->ndo_start_xmit(nskb, dev);
2010		if (unlikely(rc != NETDEV_TX_OK)) {
2011			if (rc & ~NETDEV_TX_MASK)
2012				goto out_kfree_gso_skb;
2013			nskb->next = skb->next;
2014			skb->next = nskb;
2015			return rc;
2016		}
2017		txq_trans_update(txq);
2018		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2019			return NETDEV_TX_BUSY;
2020	} while (skb->next);
2021
2022out_kfree_gso_skb:
2023	if (likely(skb->next == NULL))
2024		skb->destructor = DEV_GSO_CB(skb)->destructor;
2025out_kfree_skb:
2026	kfree_skb(skb);
2027	return rc;
2028}
2029
2030static u32 hashrnd __read_mostly;
2031
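/*
 * skb_tx_hash - map a buffer to a transmit queue index on @dev.
 * Reuses the recorded receive queue when one is present; otherwise it
 * hashes the socket hash (or protocol ^ rxhash) with jhash and scales
 * the result to dev->real_num_tx_queues.
 */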
2032u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
2033{
2034	u32 hash;
2035
2036	if (skb_rx_queue_recorded(skb)) {
2037		hash = skb_get_rx_queue(skb);
2038		while (unlikely(hash >= dev->real_num_tx_queues))
2039			hash -= dev->real_num_tx_queues;
2040		return hash;
2041	}
2042
2043	if (skb->sk && skb->sk->sk_hash)
2044		hash = skb->sk->sk_hash;
2045	else
2046		hash = (__force u16) skb->protocol ^ skb->rxhash;
2047	hash = jhash_1word(hash, hashrnd);
2048
2049	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
2050}
2051EXPORT_SYMBOL(skb_tx_hash);
2052
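/*
 * Clamp a driver-selected queue index to the valid range, warning
 * (ratelimited) and falling back to queue 0 when it is out of range.
 */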
2053static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2054{
2055	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2056		if (net_ratelimit()) {
2057			pr_warning("%s selects TX queue %d, but "
2058				"real number of TX queues is %d\n",
2059				dev->name, queue_index, dev->real_num_tx_queues);
2060		}
2061		return 0;
2062	}
2063	return queue_index;
2064}
2065
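/*
 * Select the transmit queue for @skb: prefer the driver's
 * ndo_select_queue() hook, otherwise use the queue cached on the
 * socket or fall back to skb_tx_hash().  The chosen index is stored
 * in the skb's queue mapping (and cached on the socket when its dst
 * still matches the skb's).
 */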
2066static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2067					struct sk_buff *skb)
2068{
2069	int queue_index;
2070	const struct net_device_ops *ops = dev->netdev_ops;
2071
2072	if (ops->ndo_select_queue) {
2073		queue_index = ops->ndo_select_queue(dev, skb);
2074		queue_index = dev_cap_txqueue(dev, queue_index);
2075	} else {
2076		struct sock *sk = skb->sk;
2077		queue_index = sk_tx_queue_get(sk);
2078		if (queue_index < 0) {
2079
2080			queue_index = 0;
2081			if (dev->real_num_tx_queues > 1)
2082				queue_index = skb_tx_hash(dev, skb);
2083
2084			if (sk) {
2085				struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
2086
2087				if (dst && skb_dst(skb) == dst)
2088					sk_tx_queue_set(sk, queue_index);
2089			}
2090		}
2091	}
2092
2093	skb_set_queue_mapping(skb, queue_index);
2094	return netdev_get_tx_queue(dev, queue_index);
2095}
2096
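/*
 * Enqueue @skb on qdisc @q under the qdisc root lock.  When the qdisc
 * is empty, can be bypassed (TCQ_F_CAN_BYPASS) and is not already
 * running, the skb is transmitted directly instead of being queued.
 */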
2097static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2098				 struct net_device *dev,
2099				 struct netdev_queue *txq)
2100{
2101	spinlock_t *root_lock = qdisc_lock(q);
2102	int rc;
2103
2104	/*
2105	 * Heuristic to force contended enqueues to serialize on a
2106	 * separate lock before trying to get the qdisc main lock.
2107	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2108	 * and dequeue packets faster.
2109	 */
2110
2111	spin_lock(root_lock);
2112	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2113		kfree_skb(skb);
2114		rc = NET_XMIT_DROP;
2115	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2116		   qdisc_run_begin(q)) {
2117		/*
2118		 * This is a work-conserving queue; there are no old skbs
2119		 * waiting to be sent out; and the qdisc is not running -
2120		 * xmit the skb directly.
2121		 */
2122		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2123			skb_dst_force(skb);
2124		__qdisc_update_bstats(q, skb->len);
2125		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2126			__qdisc_run(q);
2127		else
2128			qdisc_run_end(q);
2129
2130		rc = NET_XMIT_SUCCESS;
2131	} else {
2132		skb_dst_force(skb);
2133		rc = qdisc_enqueue_root(skb, q);
2134		if (qdisc_run_begin(q))
2135			__qdisc_run(q);
2136	}
2137	spin_unlock(root_lock);
2138
2139	return rc;
2140}
2141
2142/**
2143 *	dev_queue_xmit - transmit a buffer
2144 *	@skb: buffer to transmit
2145 *
2146 *	Queue a buffer for transmission to a network device. The caller must
2147 *	have set the device and priority and built the buffer before calling
2148 *	this function. The function can be called from an interrupt.
2149 *
2150 *	A negative errno code is returned on a failure. A success does not
2151 *	guarantee the frame will be transmitted as it may be dropped due
2152 *	to congestion or traffic shaping.
2153 *
2154 * -----------------------------------------------------------------------------------
2155 *      I notice this method can also return errors from the queue disciplines,
2156 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2157 *      be positive.
2158 *
2159 *      Regardless of the return value, the skb is consumed, so it is currently
2160 *      difficult to retry a send to this method.  (You can bump the ref count
2161 *      before sending to hold a reference for retry if you are careful.)
2162 *
2163 *      When calling this method, interrupts MUST be enabled.  This is because
2164 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2165 *          --BLG
2166 */
2167int BCMFASTPATH_HOST dev_queue_xmit(struct sk_buff *skb)
2168{
2169	struct net_device *dev = skb->dev;
2170	struct netdev_queue *txq;
2171	struct Qdisc *q;
2172	int rc = -ENOMEM;
2173
2174	/* Disable soft irqs for various locks below. Also
2175	 * stops preemption for RCU.
2176	 */
2177	rcu_read_lock_bh();
2178
2179	txq = dev_pick_tx(dev, skb);
2180	q = rcu_dereference_bh(txq->qdisc);
2181
2182#ifdef CONFIG_NET_CLS_ACT
2183	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2184#endif
2185#ifdef CONFIG_IP_NF_LFP
2186	if (q->enqueue && !(skb->nfcache & NFC_LFP_ENABLE)) {
2187#else
2188	if (q->enqueue) {
2189#endif
2190		rc = __dev_xmit_skb(skb, q, dev, txq);
2191		goto out;
2192	}
2193
2194	/* The device has no queue. Common case for software devices:
2195	   loopback, all sorts of tunnels...
2196
2197	   Really, it is unlikely that netif_tx_lock protection is necessary
2198	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2199	   counters.)
2200	   However, it is possible that they rely on the protection
2201	   made by us here.
2202
2203	   Check this and shoot the lock. It is not prone to deadlocks.
2204	   Either that, or shoot the noqueue qdisc; it is even simpler 8)
2205	 */
2206	if (dev->flags & IFF_UP) {
2207		int cpu = smp_processor_id(); /* ok because BHs are off */
2208
2209		if (txq->xmit_lock_owner != cpu) {
2210
2211			HARD_TX_LOCK(dev, txq, cpu);
2212
2213			if (!netif_tx_queue_stopped(txq)) {
2214				rc = dev_hard_start_xmit(skb, dev, txq);
2215				if (dev_xmit_complete(rc)) {
2216					HARD_TX_UNLOCK(dev, txq);
2217					goto out;
2218				}
2219			}
2220			HARD_TX_UNLOCK(dev, txq);
2221			if (net_ratelimit())
2222				printk(KERN_CRIT "Virtual device %s asks to "
2223				       "queue packet!\n", dev->name);
2224		} else {
2225			/* Recursion is detected! It is possible,
2226			 * unfortunately */
2227			if (net_ratelimit())
2228				printk(KERN_CRIT "Dead loop on virtual device "
2229				       "%s, fix it urgently!\n", dev->name);
2230		}
2231	}
2232
2233	rc = -ENETDOWN;
2234	rcu_read_unlock_bh();
2235
2236	kfree_skb(skb);
2237	return rc;
2238out:
2239	rcu_read_unlock_bh();
2240	return rc;
2241}
2242EXPORT_SYMBOL(dev_queue_xmit);
2243
2244
2245/*=======================================================================
2246			Receiver routines
2247  =======================================================================*/
2248
2249int netdev_max_backlog __read_mostly = 1000;
2250int netdev_tstamp_prequeue __read_mostly = 1;
2251int netdev_budget __read_mostly = 300;
2252int weight_p __read_mostly = 64;            /* old backlog weight */
2253
2254/* Called with irq disabled */
2255static inline void ____napi_schedule(struct softnet_data *sd,
2256				     struct napi_struct *napi)
2257{
2258	list_add_tail(&napi->poll_list, &sd->poll_list);
2259	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2260}
2261
2262#ifdef CONFIG_RPS
2263
2264/* One global table that all flow-based protocols share. */
2265struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2266EXPORT_SYMBOL(rps_sock_flow_table);
2267
2268/*
2269 * get_rps_cpu is called from netif_receive_skb and returns the target
2270 * CPU from the RPS map of the receiving queue for a given skb.
2271 * rcu_read_lock must be held on entry.
2272 */
2273static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2274		       struct rps_dev_flow **rflowp)
2275{
2276	struct ipv6hdr *ip6;
2277	struct iphdr *ip;
2278	struct netdev_rx_queue *rxqueue;
2279	struct rps_map *map;
2280	struct rps_dev_flow_table *flow_table;
2281	struct rps_sock_flow_table *sock_flow_table;
2282	int cpu = -1;
2283	u8 ip_proto;
2284	u16 tcpu;
2285	u32 addr1, addr2, ihl;
2286	union {
2287		u32 v32;
2288		u16 v16[2];
2289	} ports;
2290
2291	if (skb_rx_queue_recorded(skb)) {
2292		u16 index = skb_get_rx_queue(skb);
2293		if (unlikely(index >= dev->num_rx_queues)) {
2294			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
2295				"on queue %u, but number of RX queues is %u\n",
2296				dev->name, index, dev->num_rx_queues);
2297			goto done;
2298		}
2299		rxqueue = dev->_rx + index;
2300	} else
2301		rxqueue = dev->_rx;
2302
2303	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2304		goto done;
2305
2306	if (skb->rxhash)
2307		goto got_hash; /* Skip hash computation on packet header */
2308
2309	switch (skb->protocol) {
2310	case __constant_htons(ETH_P_IP):
2311		if (!pskb_may_pull(skb, sizeof(*ip)))
2312			goto done;
2313
2314		ip = (struct iphdr *) skb->data;
2315		ip_proto = ip->protocol;
2316		addr1 = (__force u32) ip->saddr;
2317		addr2 = (__force u32) ip->daddr;
2318		ihl = ip->ihl;
2319		break;
2320	case __constant_htons(ETH_P_IPV6):
2321		if (!pskb_may_pull(skb, sizeof(*ip6)))
2322			goto done;
2323
2324		ip6 = (struct ipv6hdr *) skb->data;
2325		ip_proto = ip6->nexthdr;
2326		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2327		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2328		ihl = (40 >> 2);
2329		break;
2330	default:
2331		goto done;
2332	}
2333	switch (ip_proto) {
2334	case IPPROTO_TCP:
2335	case IPPROTO_UDP:
2336	case IPPROTO_DCCP:
2337	case IPPROTO_ESP:
2338	case IPPROTO_AH:
2339	case IPPROTO_SCTP:
2340	case IPPROTO_UDPLITE:
2341		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
2342			ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2343			if (ports.v16[1] < ports.v16[0])
2344				swap(ports.v16[0], ports.v16[1]);
2345			break;
2346		}
2347	default:
2348		ports.v32 = 0;
2349		break;
2350	}
2351
2352	/* get a consistent hash (same value on both flow directions) */
2353	if (addr2 < addr1)
2354		swap(addr1, addr2);
2355	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2356	if (!skb->rxhash)
2357		skb->rxhash = 1;
2358
2359got_hash:
2360	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2361	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2362	if (flow_table && sock_flow_table) {
2363		u16 next_cpu;
2364		struct rps_dev_flow *rflow;
2365
2366		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2367		tcpu = rflow->cpu;
2368
2369		next_cpu = sock_flow_table->ents[skb->rxhash &
2370		    sock_flow_table->mask];
2371
2372		/*
2373		 * If the desired CPU (where last recvmsg was done) is
2374		 * different from current CPU (one in the rx-queue flow
2375		 * table entry), switch if one of the following holds:
2376		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2377		 *   - Current CPU is offline.
2378		 *   - The current CPU's queue tail has advanced beyond the
2379		 *     last packet that was enqueued using this table entry.
2380		 *     This guarantees that all previous packets for the flow
2381		 *     have been dequeued, thus preserving in order delivery.
2382		 */
2383		if (unlikely(tcpu != next_cpu) &&
2384		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2385		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2386		      rflow->last_qtail)) >= 0)) {
2387			tcpu = rflow->cpu = next_cpu;
2388			if (tcpu != RPS_NO_CPU)
2389				rflow->last_qtail = per_cpu(softnet_data,
2390				    tcpu).input_queue_head;
2391		}
2392		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2393			*rflowp = rflow;
2394			cpu = tcpu;
2395			goto done;
2396		}
2397	}
2398
2399	map = rcu_dereference(rxqueue->rps_map);
2400	if (map) {
2401		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2402
2403		if (cpu_online(tcpu)) {
2404			cpu = tcpu;
2405			goto done;
2406		}
2407	}
2408
2409done:
2410	return cpu;
2411}
2412
2413/* Called from hardirq (IPI) context */
2414static void rps_trigger_softirq(void *data)
2415{
2416	struct softnet_data *sd = data;
2417
2418	____napi_schedule(sd, &sd->backlog);
2419	sd->received_rps++;
2420}
2421
2422#endif /* CONFIG_RPS */
2423
2424/*
2425 * Check if this softnet_data structure belongs to another CPU.
2426 * If yes, queue it to our IPI list and return 1.
2427 * If no, return 0.
2428 */
2429static int rps_ipi_queued(struct softnet_data *sd)
2430{
2431#ifdef CONFIG_RPS
2432	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2433
2434	if (sd != mysd) {
2435		sd->rps_ipi_next = mysd->rps_ipi_list;
2436		mysd->rps_ipi_list = sd;
2437
2438		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2439		return 1;
2440	}
2441#endif /* CONFIG_RPS */
2442	return 0;
2443}
2444
2445/*
2446 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2447 * queue (which may be a remote CPU's queue).
2448 */
2449static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2450			      unsigned int *qtail)
2451{
2452	struct softnet_data *sd;
2453	unsigned long flags;
2454
2455	sd = &per_cpu(softnet_data, cpu);
2456
2457	local_irq_save(flags);
2458
2459	rps_lock(sd);
2460	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2461		if (skb_queue_len(&sd->input_pkt_queue)) {
2462enqueue:
2463			__skb_queue_tail(&sd->input_pkt_queue, skb);
2464			input_queue_tail_incr_save(sd, qtail);
2465			rps_unlock(sd);
2466			local_irq_restore(flags);
2467			return NET_RX_SUCCESS;
2468		}
2469
2470		/* Schedule NAPI for the backlog device.
2471		 * We can use a non-atomic operation since we own the queue lock.
2472		 */
2473		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2474			if (!rps_ipi_queued(sd))
2475				____napi_schedule(sd, &sd->backlog);
2476		}
2477		goto enqueue;
2478	}
2479
2480	sd->dropped++;
2481	rps_unlock(sd);
2482
2483	local_irq_restore(flags);
2484
2485	kfree_skb(skb);
2486	return NET_RX_DROP;
2487}
2488
2489/**
2490 *	netif_rx	-	post buffer to the network code
2491 *	@skb: buffer to post
2492 *
2493 *	This function receives a packet from a device driver and queues it for
2494 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2495 *	may be dropped during processing for congestion control or by the
2496 *	protocol layers.
2497 *
2498 *	return values:
2499 *	NET_RX_SUCCESS	(no congestion)
2500 *	NET_RX_DROP     (packet was dropped)
2501 *
2502 */
2503
2504int BCMFASTPATH_HOST netif_rx(struct sk_buff *skb)
2505{
2506	int ret;
2507
2508	/* if netpoll wants it, pretend we never saw it */
2509	if (netpoll_rx(skb))
2510		return NET_RX_DROP;
2511
2512	if (netdev_tstamp_prequeue)
2513		net_timestamp_check(skb);
2514
2515#ifdef CONFIG_RPS
2516	{
2517		struct rps_dev_flow voidflow, *rflow = &voidflow;
2518		int cpu;
2519
2520		preempt_disable();
2521		rcu_read_lock();
2522
2523		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2524		if (cpu < 0)
2525			cpu = smp_processor_id();
2526
2527		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2528
2529		rcu_read_unlock();
2530		preempt_enable();
2531	}
2532#else
2533	{
2534		unsigned int qtail;
2535		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2536		put_cpu();
2537	}
2538#endif
2539	return ret;
2540}
2541EXPORT_SYMBOL(netif_rx);
2542
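/*
 * netif_rx_ni - netif_rx() for callers running in process context.
 * Any softirq raised while queueing the packet is run before
 * returning, so the packet does not sit in the backlog until the
 * next interrupt.
 */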
2543int netif_rx_ni(struct sk_buff *skb)
2544{
2545	int err;
2546
2547	preempt_disable();
2548	err = netif_rx(skb);
2549	if (local_softirq_pending())
2550		do_softirq();
2551	preempt_enable();
2552
2553	return err;
2554}
2555EXPORT_SYMBOL(netif_rx_ni);
2556
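/*
 * Transmit softirq handler: free the skbs parked on this CPU's
 * completion queue and run (or reschedule) every qdisc that was put
 * on its output queue.
 */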
2557static void net_tx_action(struct softirq_action *h)
2558{
2559	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2560
2561	if (sd->completion_queue) {
2562		struct sk_buff *clist;
2563
2564		local_irq_disable();
2565		clist = sd->completion_queue;
2566		sd->completion_queue = NULL;
2567		local_irq_enable();
2568
2569		while (clist) {
2570			struct sk_buff *skb = clist;
2571			clist = clist->next;
2572
2573			WARN_ON(atomic_read(&skb->users));
2574			__kfree_skb(skb);
2575		}
2576	}
2577
2578	if (sd->output_queue) {
2579		struct Qdisc *head;
2580
2581		local_irq_disable();
2582		head = sd->output_queue;
2583		sd->output_queue = NULL;
2584		sd->output_queue_tailp = &sd->output_queue;
2585		local_irq_enable();
2586
2587		while (head) {
2588			struct Qdisc *q = head;
2589			spinlock_t *root_lock;
2590
2591			head = head->next_sched;
2592
2593			root_lock = qdisc_lock(q);
2594			if (spin_trylock(root_lock)) {
2595				smp_mb__before_clear_bit();
2596				clear_bit(__QDISC_STATE_SCHED,
2597					  &q->state);
2598				qdisc_run(q);
2599				spin_unlock(root_lock);
2600			} else {
2601				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2602					      &q->state)) {
2603					__netif_reschedule(q);
2604				} else {
2605					smp_mb__before_clear_bit();
2606					clear_bit(__QDISC_STATE_SCHED,
2607						  &q->state);
2608				}
2609			}
2610		}
2611	}
2612}
2613
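/*
 * Hand @skb to one packet_type handler.  An extra reference is taken
 * because the handler consumes one.
 */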
2614static inline int deliver_skb(struct sk_buff *skb,
2615			      struct packet_type *pt_prev,
2616			      struct net_device *orig_dev)
2617{
2618	atomic_inc(&skb->users);
2619	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2620}
2621
2622#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2623	(defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2624/* This hook is defined here for ATM LANE */
2625int (*br_fdb_test_addr_hook)(struct net_device *dev,
2626			     unsigned char *addr) __read_mostly;
2627EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2628#endif
2629
2630#ifdef CONFIG_NET_CLS_ACT
2631/* TODO: Maybe we should just force sch_ingress to be compiled in
2632 * whenever CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2633 * instructions (a compare and two extra stores) when it is not
2634 * enabled but CONFIG_NET_CLS_ACT is.
2635 * NOTE: This doesn't stop any functionality; if you don't have
2636 * the ingress scheduler, you just can't add policies on ingress.
2637 *
2638 */
2639static int ing_filter(struct sk_buff *skb)
2640{
2641	struct net_device *dev = skb->dev;
2642	u32 ttl = G_TC_RTTL(skb->tc_verd);
2643	struct netdev_queue *rxq;
2644	int result = TC_ACT_OK;
2645	struct Qdisc *q;
2646
2647	if (unlikely(MAX_RED_LOOP < ttl++)) {
2648		if (net_ratelimit())
2649			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2650			       skb->skb_iif, dev->ifindex);
2651		return TC_ACT_SHOT;
2652	}
2653
2654	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2655	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2656
2657	rxq = &dev->rx_queue;
2658
2659	q = rxq->qdisc;
2660	if (q != &noop_qdisc) {
2661		spin_lock(qdisc_lock(q));
2662		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2663			result = qdisc_enqueue_root(skb, q);
2664		spin_unlock(qdisc_lock(q));
2665	}
2666
2667	return result;
2668}
2669
2670static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2671					 struct packet_type **pt_prev,
2672					 int *ret, struct net_device *orig_dev)
2673{
2674	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2675		goto out;
2676
2677	if (*pt_prev) {
2678		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2679		*pt_prev = NULL;
2680	}
2681
2682	switch (ing_filter(skb)) {
2683	case TC_ACT_SHOT:
2684	case TC_ACT_STOLEN:
2685		kfree_skb(skb);
2686		return NULL;
2687	}
2688
2689out:
2690	skb->tc_verd = 0;
2691	return skb;
2692}
2693#endif
2694
2695/*
2696 * 	netif_nit_deliver - deliver received packets to network taps
2697 * 	@skb: buffer
2698 *
2699 * 	This function is used to deliver incoming packets to network
2700 * 	taps. It should be used when the normal netif_receive_skb path
2701 * 	is bypassed, for example because of VLAN acceleration.
2702 */
2703void netif_nit_deliver(struct sk_buff *skb)
2704{
2705	struct packet_type *ptype;
2706
2707	if (list_empty(&ptype_all))
2708		return;
2709
2710	skb_reset_network_header(skb);
2711	skb_reset_transport_header(skb);
2712	skb->mac_len = skb->network_header - skb->mac_header;
2713
2714	rcu_read_lock();
2715	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2716		if (!ptype->dev || ptype->dev == skb->dev)
2717			deliver_skb(skb, ptype, skb->dev);
2718	}
2719	rcu_read_unlock();
2720}
2721
2722/**
2723 *	netdev_rx_handler_register - register receive handler
2724 *	@dev: device to register a handler for
2725 *	@rx_handler: receive handler to register
2726 *	@rx_handler_data: data pointer that is used by rx handler
2727 *
2728 *	Register a receive handler for a device. This handler will then be
2729 *	called from __netif_receive_skb. A negative errno code is returned
2730 *	on a failure.
2731 *
2732 *	The caller must hold the rtnl_mutex.
2733 */
2734int netdev_rx_handler_register(struct net_device *dev,
2735			       rx_handler_func_t *rx_handler,
2736			       void *rx_handler_data)
2737{
2738	ASSERT_RTNL();
2739
2740	if (dev->rx_handler)
2741		return -EBUSY;
2742
2743	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2744	rcu_assign_pointer(dev->rx_handler, rx_handler);
2745
2746	return 0;
2747}
2748EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2749
2750/**
2751 *	netdev_rx_handler_unregister - unregister receive handler
2752 *	@dev: device to unregister a handler from
2753 *
2754 *	Unregister a receive handler from a device.
2755 *
2756 *	The caller must hold the rtnl_mutex.
2757 */
2758void netdev_rx_handler_unregister(struct net_device *dev)
2759{
2760
2761	ASSERT_RTNL();
2762	rcu_assign_pointer(dev->rx_handler, NULL);
2763	rcu_assign_pointer(dev->rx_handler_data, NULL);
2764}
2765EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2766
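/*
 * Rewrite the destination MAC of frames addressed to this host so
 * that it matches the bonding master's address.
 */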
2767static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2768					      struct net_device *master)
2769{
2770	if (skb->pkt_type == PACKET_HOST) {
2771		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2772
2773		memcpy(dest, master->dev_addr, ETH_ALEN);
2774	}
2775}
2776
2777/* On bonding slaves other than the currently active slave, suppress
2778 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2779 * ARP on active-backup slaves with arp_validate enabled.
2780 */
2781int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2782{
2783	struct net_device *dev = skb->dev;
2784
2785	if (master->priv_flags & IFF_MASTER_ARPMON)
2786		dev->last_rx = jiffies;
2787
2788	if ((master->priv_flags & IFF_MASTER_ALB) &&
2789	    (master->priv_flags & IFF_BRIDGE_PORT)) {
2790		/* Do address unmangling. The local destination address
2791		 * will always be the one the master has. This provides the
2792		 * right functionality in a bridge.
2793		 */
2794		skb_bond_set_mac_by_master(skb, master);
2795	}
2796
2797	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2798		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2799		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
2800			return 0;
2801
2802		if (master->priv_flags & IFF_MASTER_ALB) {
2803			if (skb->pkt_type != PACKET_BROADCAST &&
2804			    skb->pkt_type != PACKET_MULTICAST)
2805				return 0;
2806		}
2807		if (master->priv_flags & IFF_MASTER_8023AD &&
2808		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2809			return 0;
2810
2811		return 1;
2812	}
2813	return 0;
2814}
2815EXPORT_SYMBOL(__skb_bond_should_drop);
2816
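/*
 * Core receive path.  Delivers the skb to the ETH_P_ALL taps, runs
 * ingress classification (CONFIG_NET_CLS_ACT), gives the device's
 * rx_handler (bridge/macvlan) a chance to take the packet, and
 * finally hands it to the protocol handlers matching skb->protocol.
 */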
2817static int __netif_receive_skb(struct sk_buff *skb)
2818{
2819	struct packet_type *ptype, *pt_prev;
2820	rx_handler_func_t *rx_handler;
2821	struct net_device *orig_dev;
2822	struct net_device *master;
2823	struct net_device *null_or_orig;
2824	struct net_device *orig_or_bond;
2825	int ret = NET_RX_DROP;
2826	__be16 type;
2827
2828	if (!netdev_tstamp_prequeue)
2829		net_timestamp_check(skb);
2830
2831	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2832		return NET_RX_SUCCESS;
2833
2834	/* if we've gotten here through NAPI, check netpoll */
2835	if (netpoll_receive_skb(skb))
2836		return NET_RX_DROP;
2837
2838	if (!skb->skb_iif)
2839		skb->skb_iif = skb->dev->ifindex;
2840
2841	/*
2842	 * bonding note: skbs received on inactive slaves should only
2843	 * be delivered to pkt handlers that are exact matches.  Also
2844	 * the deliver_no_wcard flag will be set.  If packet handlers
2845	 * are sensitive to duplicate packets, these skbs will need to
2846	 * be dropped at the handler.  The vlan accel path may have
2847	 * already set the deliver_no_wcard flag.
2848	 */
2849	null_or_orig = NULL;
2850	orig_dev = skb->dev;
2851	master = ACCESS_ONCE(orig_dev->master);
2852	if (skb->deliver_no_wcard)
2853		null_or_orig = orig_dev;
2854	else if (master) {
2855		if (skb_bond_should_drop(skb, master)) {
2856			skb->deliver_no_wcard = 1;
2857			null_or_orig = orig_dev; /* deliver only exact match */
2858		} else
2859			skb->dev = master;
2860	}
2861
2862	__this_cpu_inc(softnet_data.processed);
2863	skb_reset_network_header(skb);
2864	skb_reset_transport_header(skb);
2865	skb->mac_len = skb->network_header - skb->mac_header;
2866
2867	pt_prev = NULL;
2868
2869	rcu_read_lock();
2870
2871#ifdef CONFIG_NET_CLS_ACT
2872	if (skb->tc_verd & TC_NCLS) {
2873		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2874		goto ncls;
2875	}
2876#endif
2877
2878	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2879		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2880		    ptype->dev == orig_dev) {
2881			if (pt_prev)
2882				ret = deliver_skb(skb, pt_prev, orig_dev);
2883			pt_prev = ptype;
2884		}
2885	}
2886
2887#ifdef CONFIG_NET_CLS_ACT
2888	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2889	if (!skb)
2890		goto out;
2891ncls:
2892#endif
2893
2894	/* If we got this far with a hardware accelerated VLAN tag, it means
2895	 * that we were put in promiscuous mode but nobody is interested in
2896	 * this vid. Drop the packet now to prevent it from getting propagated
2897	 * to other parts of the stack that won't know how to deal with packets
2898	 * tagged in this manner.
2899	 */
2900	if (unlikely(vlan_tx_tag_present(skb)))
2901		goto bypass;
2902
2903	/* Handle special case of bridge or macvlan */
2904	rx_handler = rcu_dereference(skb->dev->rx_handler);
2905	if (rx_handler) {
2906		if (pt_prev) {
2907			ret = deliver_skb(skb, pt_prev, orig_dev);
2908			pt_prev = NULL;
2909		}
2910		skb = rx_handler(skb);
2911		if (!skb)
2912			goto out;
2913	}
2914
2915	/*
2916	 * Make sure frames received on VLAN interfaces stacked on
2917	 * bonding interfaces still make their way to any base bonding
2918	 * device that may have registered for a specific ptype.  The
2919	 * handler may have to adjust skb->dev and orig_dev.
2920	 */
2921	orig_or_bond = orig_dev;
2922	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2923	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2924		orig_or_bond = vlan_dev_real_dev(skb->dev);
2925	}
2926
2927	type = skb->protocol;
2928	list_for_each_entry_rcu(ptype,
2929			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2930		if (ptype->type == type && (ptype->dev == null_or_orig ||
2931		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
2932		     ptype->dev == orig_or_bond)) {
2933			if (pt_prev)
2934				ret = deliver_skb(skb, pt_prev, orig_dev);
2935			pt_prev = ptype;
2936		}
2937	}
2938
2939bypass:
2940	if (pt_prev) {
2941		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2942	} else {
2943		kfree_skb(skb);
2944		/* Jamal, now you will not be able to escape explaining
2945		 * to me how you were going to use this. :-)
2946		 */
2947		ret = NET_RX_DROP;
2948	}
2949
2950out:
2951	rcu_read_unlock();
2952	return ret;
2953}
2954
2955/**
2956 *	netif_receive_skb - process receive buffer from network
2957 *	@skb: buffer to process
2958 *
2959 *	netif_receive_skb() is the main receive data processing function.
2960 *	It always succeeds. The buffer may be dropped during processing
2961 *	for congestion control or by the protocol layers.
2962 *
2963 *	This function may only be called from softirq context and interrupts
2964 *	should be enabled.
2965 *
2966 *	Return values (usually ignored):
2967 *	NET_RX_SUCCESS: no congestion
2968 *	NET_RX_DROP: packet was dropped
2969 */
2970int BCMFASTPATH_HOST netif_receive_skb(struct sk_buff *skb)
2971{
2972	if (netdev_tstamp_prequeue)
2973		net_timestamp_check(skb);
2974
2975	if (skb_defer_rx_timestamp(skb))
2976		return NET_RX_SUCCESS;
2977
2978#ifdef CONFIG_RPS
2979	{
2980		struct rps_dev_flow voidflow, *rflow = &voidflow;
2981		int cpu, ret;
2982
2983		rcu_read_lock();
2984
2985		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2986
2987		if (cpu >= 0) {
2988			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2989			rcu_read_unlock();
2990		} else {
2991			rcu_read_unlock();
2992			ret = __netif_receive_skb(skb);
2993		}
2994
2995		return ret;
2996	}
2997#else
2998	return __netif_receive_skb(skb);
2999#endif
3000}
3001EXPORT_SYMBOL(netif_receive_skb);
3002
3003/* Network device is going away; flush any packets still pending.
3004 * Called with irqs disabled.
3005 */
3006static void flush_backlog(void *arg)
3007{
3008	struct net_device *dev = arg;
3009	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3010	struct sk_buff *skb, *tmp;
3011
3012	rps_lock(sd);
3013	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3014		if (skb->dev == dev) {
3015			__skb_unlink(skb, &sd->input_pkt_queue);
3016			kfree_skb(skb);
3017			input_queue_head_incr(sd);
3018		}
3019	}
3020	rps_unlock(sd);
3021
3022	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3023		if (skb->dev == dev) {
3024			__skb_unlink(skb, &sd->process_queue);
3025			kfree_skb(skb);
3026			input_queue_head_incr(sd);
3027		}
3028	}
3029}
3030
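/*
 * Finish a GRO'd skb: unless it is a single segment, call the
 * protocol's gro_complete() to fix up the merged headers, then feed
 * the result to netif_receive_skb().
 */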
3031static int BCMFASTPATH_HOST napi_gro_complete(struct sk_buff *skb)
3032{
3033	struct packet_type *ptype;
3034	__be16 type = skb->protocol;
3035	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3036	int err = -ENOENT;
3037
3038	if (NAPI_GRO_CB(skb)->count == 1) {
3039		skb_shinfo(skb)->gso_size = 0;
3040		goto out;
3041	}
3042
3043	rcu_read_lock();
3044	list_for_each_entry_rcu(ptype, head, list) {
3045		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3046			continue;
3047
3048		err = ptype->gro_complete(skb);
3049		break;
3050	}
3051	rcu_read_unlock();
3052
3053	if (err) {
3054		WARN_ON(&ptype->list == head);
3055		kfree_skb(skb);
3056		return NET_RX_SUCCESS;
3057	}
3058
3059out:
3060	return netif_receive_skb(skb);
3061}
3062
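/*
 * Flush every packet held on the NAPI GRO list through
 * napi_gro_complete() and reset the list.
 */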
3063static void BCMFASTPATH_HOST napi_gro_flush(struct napi_struct *napi)
3064{
3065	struct sk_buff *skb, *next;
3066
3067	for (skb = napi->gro_list; skb; skb = next) {
3068		next = skb->next;
3069		skb->next = NULL;
3070		napi_gro_complete(skb);
3071	}
3072
3073	napi->gro_count = 0;
3074	napi->gro_list = NULL;
3075}
3076
3077#ifdef CONFIG_INET_GRO
3078void BCMFASTPATH_HOST generic_napi_gro_flush(struct napi_struct *napi)
3079{
3080	napi_gro_flush(napi);
3081}
3082EXPORT_SYMBOL(generic_napi_gro_flush);
3083#endif /* CONFIG_INET_GRO */
3084
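/*
 * Try to merge @skb into this NAPI's GRO list via the protocol's
 * gro_receive() handler.  Returns GRO_MERGED/GRO_MERGED_FREE when the
 * skb was coalesced into an existing entry, GRO_HELD when it starts a
 * new entry on the list, or GRO_NORMAL when it must take the regular
 * receive path.
 */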
3085enum gro_result BCMFASTPATH_HOST dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3086{
3087	struct sk_buff **pp = NULL;
3088	struct packet_type *ptype;
3089	__be16 type = skb->protocol;
3090	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3091	int same_flow;
3092	int mac_len;
3093	enum gro_result ret;
3094
3095	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3096		goto normal;
3097
3098	if (skb_is_gso(skb) || skb_has_frags(skb))
3099		goto normal;
3100
3101	rcu_read_lock();
3102	list_for_each_entry_rcu(ptype, head, list) {
3103		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3104			continue;
3105
3106		skb_set_network_header(skb, skb_gro_offset(skb));
3107		mac_len = skb->network_header - skb->mac_header;
3108		skb->mac_len = mac_len;
3109		NAPI_GRO_CB(skb)->same_flow = 0;
3110		NAPI_GRO_CB(skb)->flush = 0;
3111		NAPI_GRO_CB(skb)->free = 0;
3112
3113		pp = ptype->gro_receive(&napi->gro_list, skb);
3114		break;
3115	}
3116	rcu_read_unlock();
3117
3118	if (&ptype->list == head)
3119		goto normal;
3120
3121	same_flow = NAPI_GRO_CB(skb)->same_flow;
3122	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3123
3124	if (pp) {
3125		struct sk_buff *nskb = *pp;
3126
3127		*pp = nskb->next;
3128		nskb->next = NULL;
3129		napi_gro_complete(nskb);
3130		napi->gro_count--;
3131	}
3132
3133	if (same_flow)
3134		goto ok;
3135
3136	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3137		goto normal;
3138
3139	napi->gro_count++;
3140	NAPI_GRO_CB(skb)->count = 1;
3141	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3142	skb->next = napi->gro_list;
3143	napi->gro_list = skb;
3144	ret = GRO_HELD;
3145
3146pull:
3147	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3148		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3149
3150		BUG_ON(skb->end - skb->tail < grow);
3151
3152		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3153
3154		skb->tail += grow;
3155		skb->data_len -= grow;
3156
3157		skb_shinfo(skb)->frags[0].page_offset += grow;
3158		skb_shinfo(skb)->frags[0].size -= grow;
3159
3160		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3161			put_page(skb_shinfo(skb)->frags[0].page);
3162			memmove(skb_shinfo(skb)->frags,
3163				skb_shinfo(skb)->frags + 1,
3164				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3165		}
3166	}
3167
3168ok:
3169	return ret;
3170
3171normal:
3172	ret = GRO_NORMAL;
3173	goto pull;
3174}
3175EXPORT_SYMBOL(dev_gro_receive);
3176
3177static gro_result_t BCMFASTPATH_HOST
3178__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3179{
3180	struct sk_buff *p;
3181
3182	for (p = napi->gro_list; p; p = p->next) {
3183		NAPI_GRO_CB(p)->same_flow =
3184			(p->dev == skb->dev) &&
3185			!compare_ether_header(skb_mac_header(p),
3186					      skb_gro_mac_header(skb));
3187		NAPI_GRO_CB(p)->flush = 0;
3188	}
3189
3190	return dev_gro_receive(napi, skb);
3191}
3192
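/*
 * Act on the verdict from dev_gro_receive(): GRO_NORMAL skbs are
 * passed to netif_receive_skb(), dropped or merged-and-freed skbs are
 * released, and held or merged skbs are left alone.
 */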
3193gro_result_t BCMFASTPATH_HOST napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3194{
3195	switch (ret) {
3196	case GRO_NORMAL:
3197		if (netif_receive_skb(skb))
3198			ret = GRO_DROP;
3199		break;
3200
3201	case GRO_DROP:
3202	case GRO_MERGED_FREE:
3203		kfree_skb(skb);
3204		break;
3205
3206	case GRO_HELD:
3207	case GRO_MERGED:
3208		break;
3209	}
3210
3211	return ret;
3212}
3213EXPORT_SYMBOL(napi_skb_finish);
3214
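/*
 * Reset the GRO offsets for a freshly received skb.  If the packet
 * data lives entirely in the first page fragment (empty linear area)
 * and that page is not in highmem, set up frag0 so the GRO header
 * helpers can read it directly.
 */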
3215void skb_gro_reset_offset(struct sk_buff *skb)
3216{
3217	NAPI_GRO_CB(skb)->data_offset = 0;
3218	NAPI_GRO_CB(skb)->frag0 = NULL;
3219	NAPI_GRO_CB(skb)->frag0_len = 0;
3220
3221	if (skb->mac_header == skb->tail &&
3222	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3223		NAPI_GRO_CB(skb)->frag0 =
3224			page_address(skb_shinfo(skb)->frags[0].page) +
3225			skb_shinfo(skb)->frags[0].page_offset;
3226		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3227	}
3228}
3229EXPORT_SYMBOL(skb_gro_reset_offset);
3230
3231gro_result_t BCMFASTPATH_HOST napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3232{
3233	skb_gro_reset_offset(skb);
3234
3235	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3236}
3237EXPORT_SYMBOL(napi_gro_receive);
3238
3239void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3240{
3241	__skb_pull(skb, skb_headlen(skb));
3242	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3243
3244	napi->skb = skb;
3245}
3246EXPORT_SYMBOL(napi_reuse_skb);
3247
3248struct sk_buff *napi_get_frags(struct napi_struct *napi)
3249{
3250	struct sk_buff *skb = napi->skb;
3251
3252	if (!skb) {
3253		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3254		if (skb)
3255			napi->skb = skb;
3256	}
3257	return skb;
3258}
3259EXPORT_SYMBOL(napi_get_frags);
3260
3261gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3262			       gro_result_t ret)
3263{
3264	switch (ret) {
3265	case GRO_NORMAL:
3266	case GRO_HELD:
3267		skb->protocol = eth_type_trans(skb, skb->dev);
3268
3269		if (ret == GRO_HELD)
3270			skb_gro_pull(skb, -ETH_HLEN);
3271		else if (netif_receive_skb(skb))
3272			ret = GRO_DROP;
3273		break;
3274
3275	case GRO_DROP:
3276	case GRO_MERGED_FREE:
3277		napi_reuse_skb(napi, skb);
3278		break;
3279
3280	case GRO_MERGED:
3281		break;
3282	}
3283
3284	return ret;
3285}
3286EXPORT_SYMBOL(napi_frags_finish);
3287
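/*
 * Prepare the frags-only skb held by @napi for GRO: pull the Ethernet
 * header out of the fragments and set skb->protocol.  Returns NULL
 * (after recycling the skb) if the header cannot be read.
 */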
3288struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3289{
3290	struct sk_buff *skb = napi->skb;
3291	struct ethhdr *eth;
3292	unsigned int hlen;
3293	unsigned int off;
3294
3295	napi->skb = NULL;
3296
3297	skb_reset_mac_header(skb);
3298	skb_gro_reset_offset(skb);
3299
3300	off = skb_gro_offset(skb);
3301	hlen = off + sizeof(*eth);
3302	eth = skb_gro_header_fast(skb, off);
3303	if (skb_gro_header_hard(skb, hlen)) {
3304		eth = skb_gro_header_slow(skb, hlen, off);
3305		if (unlikely(!eth)) {
3306			napi_reuse_skb(napi, skb);
3307			skb = NULL;
3308			goto out;
3309		}
3310	}
3311
3312	skb_gro_pull(skb, sizeof(*eth));
3313
3314	/*
3315	 * This works because the only protocols we care about don't require
3316	 * special handling.  We'll fix it up properly at the end.
3317	 */
3318	skb->protocol = eth->h_proto;
3319
3320out:
3321	return skb;
3322}
3323EXPORT_SYMBOL(napi_frags_skb);
3324
3325gro_result_t napi_gro_frags(struct napi_struct *napi)
3326{
3327	struct sk_buff *skb = napi_frags_skb(napi);
3328
3329	if (!skb)
3330		return GRO_DROP;
3331
3332	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3333}
3334EXPORT_SYMBOL(napi_gro_frags);
3335
3336/*
3337 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
3338 * Note: called with local irq disabled, but exits with local irq enabled.
3339 */
3340static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3341{
3342#ifdef CONFIG_RPS
3343	struct softnet_data *remsd = sd->rps_ipi_list;
3344
3345	if (remsd) {
3346		sd->rps_ipi_list = NULL;
3347
3348		local_irq_enable();
3349
3350		/* Send pending IPIs to kick RPS processing on remote CPUs. */
3351		while (remsd) {
3352			struct softnet_data *next = remsd->rps_ipi_next;
3353
3354			if (cpu_online(remsd->cpu))
3355				__smp_call_function_single(remsd->cpu,
3356							   &remsd->csd, 0);
3357			remsd = next;
3358		}
3359	} else
3360#endif
3361		local_irq_enable();
3362}
3363
3364#ifdef CONFIG_INET_GRO
3365struct napi_struct gro_napi = {0};
3366atomic_t gro_timer_init = {0};
3367extern spinlock_t gro_lock;
3368#endif /* CONFIG_INET_GRO */
3369
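/*
 * NAPI poll handler for the per-CPU backlog device: drain the process
 * queue into the stack (through GRO when CONFIG_INET_GRO is active)
 * and refill it from input_pkt_queue under the RPS lock until the
 * quota is spent or both queues are empty.
 */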
3370static int process_backlog(struct napi_struct *napi, int quota)
3371{
3372	int work = 0;
3373	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3374
3375#ifdef CONFIG_RPS
3376	/* Check if we have pending IPIs; it's better to send them now
3377	 * rather than waiting for net_rx_action() to end.
3378	 */
3379	if (sd->rps_ipi_list) {
3380		local_irq_disable();
3381		net_rps_action_and_irq_enable(sd);
3382	}
3383#endif
3384	napi->weight = weight_p;
3385	local_irq_disable();
3386	while (work < quota) {
3387		struct sk_buff *skb;
3388		unsigned int qlen;
3389
3390		while ((skb = __skb_dequeue(&sd->process_queue))) {
3391			local_irq_enable();
3392#ifdef CONFIG_INET_GRO
3393			if (atomic_read(&gro_timer_init)) {
3394				spin_lock_bh(&gro_lock);
3395				napi_gro_receive(&gro_napi, skb);
3396				spin_unlock_bh(&gro_lock);
3397			}
3398			else
3399#endif /* CONFIG_INET_GRO */
3400			__netif_receive_skb(skb);
3401			local_irq_disable();
3402			input_queue_head_incr(sd);
3403			if (++work >= quota) {
3404				local_irq_enable();
3405				return work;
3406			}
3407		}
3408
3409		rps_lock(sd);
3410		qlen = skb_queue_len(&sd->input_pkt_queue);
3411		if (qlen)
3412			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3413						   &sd->process_queue);
3414
3415		if (qlen < quota - work) {
3416			/*
3417			 * Inline a custom version of __napi_complete().
3418			 * Only the current CPU owns and manipulates this napi,
3419			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3420			 * so we can use a plain write instead of clear_bit(),
3421			 * and we don't need an smp_mb() memory barrier.
3422			 */
3423			list_del(&napi->poll_list);
3424			napi->state = 0;
3425
3426			quota = work + qlen;
3427		}
3428		rps_unlock(sd);
3429	}
3430	local_irq_enable();
3431
3432	return work;
3433}
3434
3435/**
3436 * __napi_schedule - schedule for receive
3437 * @n: entry to schedule
3438 *
3439 * The entry's receive function will be scheduled to run
3440 */
3441void __napi_schedule(struct napi_struct *n)
3442{
3443	unsigned long flags;
3444
3445	local_irq_save(flags);
3446	____napi_schedule(&__get_cpu_var(softnet_data), n);
3447	local_irq_restore(flags);
3448}
3449EXPORT_SYMBOL(__napi_schedule);
3450
3451void __napi_complete(struct napi_struct *n)
3452{
3453	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3454	BUG_ON(n->gro_list);
3455
3456	list_del(&n->poll_list);
3457	smp_mb__before_clear_bit();
3458	clear_bit(NAPI_STATE_SCHED, &n->state);
3459}
3460EXPORT_SYMBOL(__napi_complete);
3461
3462void napi_complete(struct napi_struct *n)
3463{
3464	unsigned long flags;
3465
3466	/*
3467	 * Don't let napi dequeue from the CPU poll list,
3468	 * just in case it's running on a different CPU.
3469	 */
3470	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3471		return;
3472
3473	napi_gro_flush(n);
3474	local_irq_save(flags);
3475	__napi_complete(n);
3476	local_irq_restore(flags);
3477}
3478EXPORT_SYMBOL(napi_complete);
3479
3480void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3481		    int (*poll)(struct napi_struct *, int), int weight)
3482{
3483	INIT_LIST_HEAD(&napi->poll_list);
3484	napi->gro_count = 0;
3485	napi->gro_list = NULL;
3486	napi->skb = NULL;
3487	napi->poll = poll;
3488	napi->weight = weight;
3489	list_add(&napi->dev_list, &dev->napi_list);
3490	napi->dev = dev;
3491#ifdef CONFIG_NETPOLL
3492	spin_lock_init(&napi->poll_lock);
3493	napi->poll_owner = -1;
3494#endif
3495	set_bit(NAPI_STATE_SCHED, &napi->state);
3496}
3497EXPORT_SYMBOL(netif_napi_add);
3498
3499void netif_napi_del(struct napi_struct *napi)
3500{
3501	struct sk_buff *skb, *next;
3502
3503	list_del_init(&napi->dev_list);
3504	napi_free_frags(napi);
3505
3506	for (skb = napi->gro_list; skb; skb = next) {
3507		next = skb->next;
3508		skb->next = NULL;
3509		kfree_skb(skb);
3510	}
3511
3512	napi->gro_list = NULL;
3513	napi->gro_count = 0;
3514}
3515EXPORT_SYMBOL(netif_napi_del);
3516
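/*
 * Receive softirq handler: poll each NAPI instance scheduled on this
 * CPU, bounded by netdev_budget and a 2 jiffy time limit; if either
 * runs out, the softirq is raised again and the rest is deferred.
 */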
3517static void BCMFASTPATH_HOST net_rx_action(struct softirq_action *h)
3518{
3519	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3520	unsigned long time_limit = jiffies + 2;
3521	int budget = netdev_budget;
3522	void *have;
3523
3524	local_irq_disable();
3525
3526	while (!list_empty(&sd->poll_list)) {
3527		struct napi_struct *n;
3528		int work, weight;
3529
3530		/* If the softirq window is exhausted then punt.
3531		 * Allow this to run for 2 jiffies, which will allow
3532		 * an average latency of 1.5/HZ.
3533		 */
3534		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3535			goto softnet_break;
3536
3537		local_irq_enable();
3538
3539		/* Even though interrupts have been re-enabled, this
3540		 * access is safe because interrupts can only add new
3541		 * entries to the tail of this list, and only ->poll()
3542		 * calls can remove this head entry from the list.
3543		 */
3544		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3545
3546		have = netpoll_poll_lock(n);
3547
3548		weight = n->weight;
3549
3550		/* This NAPI_STATE_SCHED test is for avoiding a race
3551		 * with netpoll's poll_napi().  Only the entity which
3552		 * obtains the lock and sees NAPI_STATE_SCHED set will
3553		 * actually make the ->poll() call.  Therefore we avoid
3554		 * accidentally calling ->poll() when NAPI is not scheduled.
3555		 */
3556		work = 0;
3557		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3558			work = n->poll(n, weight);
3559			trace_napi_poll(n);
3560		}
3561
3562		WARN_ON_ONCE(work > weight);
3563
3564		budget -= work;
3565
3566		local_irq_disable();
3567
3568		/* Drivers must not modify the NAPI state if they
3569		 * consume the entire weight.  In such cases this code
3570		 * still "owns" the NAPI instance and therefore can
3571		 * move the instance around on the list at-will.
3572		 */
3573		if (unlikely(work == weight)) {
3574			if (unlikely(napi_disable_pending(n))) {
3575				local_irq_enable();
3576				napi_complete(n);
3577				local_irq_disable();
3578			} else
3579				list_move_tail(&n->poll_list, &sd->poll_list);
3580		}
3581
3582		netpoll_poll_unlock(have);
3583	}
3584out:
3585	net_rps_action_and_irq_enable(sd);
3586
3587#ifdef CONFIG_NET_DMA
3588	/*
3589	 * There may not be any more sk_buffs coming right now, so push
3590	 * any pending DMA copies to hardware
3591	 */
3592	dma_issue_pending_all();
3593#endif
3594
3595	return;
3596
3597softnet_break:
3598	sd->time_squeeze++;
3599	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3600	goto out;
3601}
3602
3603static gifconf_func_t *gifconf_list[NPROTO];
3604
3605/**
3606 *	register_gifconf	-	register a SIOCGIF handler
3607 *	@family: Address family
3608 *	@gifconf: Function handler
3609 *
3610 *	Register protocol dependent address dumping routines. The handler
3611 *	that is passed must not be freed or reused until it has been replaced
3612 *	by another handler.
3613 */
3614int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3615{
3616	if (family >= NPROTO)
3617		return -EINVAL;
3618	gifconf_list[family] = gifconf;
3619	return 0;
3620}
3621EXPORT_SYMBOL(register_gifconf);
3622
3623
3624/*
3625 *	Map an interface index to its name (SIOCGIFNAME)
3626 */
3627
3628/*
3629 *	We need this ioctl for efficient implementation of the
3630 *	if_indextoname() function required by the IPv6 API.  Without
3631 *	it, we would have to search all the interfaces to find a
3632 *	match.  --pb
3633 */
3634
3635static int dev_ifname(struct net *net, struct ifreq __user *arg)
3636{
3637	struct net_device *dev;
3638	struct ifreq ifr;
3639
3640	/*
3641	 *	Fetch the caller's info block.
3642	 */
3643
3644	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3645		return -EFAULT;
3646
3647	rcu_read_lock();
3648	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3649	if (!dev) {
3650		rcu_read_unlock();
3651		return -ENODEV;
3652	}
3653
3654	strcpy(ifr.ifr_name, dev->name);
3655	rcu_read_unlock();
3656
3657	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3658		return -EFAULT;
3659	return 0;
3660}
3661
3662/*
3663 *	Perform a SIOCGIFCONF call. This structure will change
3664 *	size eventually, and there is nothing I can do about it.
3665 *	Thus we will need a 'compatibility mode'.
3666 */
3667
3668static int dev_ifconf(struct net *net, char __user *arg)
3669{
3670	struct ifconf ifc;
3671	struct net_device *dev;
3672	char __user *pos;
3673	int len;
3674	int total;
3675	int i;
3676
3677	/*
3678	 *	Fetch the caller's info block.
3679	 */
3680
3681	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3682		return -EFAULT;
3683
3684	pos = ifc.ifc_buf;
3685	len = ifc.ifc_len;
3686
3687	/*
3688	 *	Loop over the interfaces, and write an info block for each.
3689	 */
3690
3691	total = 0;
3692	for_each_netdev(net, dev) {
3693		for (i = 0; i < NPROTO; i++) {
3694			if (gifconf_list[i]) {
3695				int done;
3696				if (!pos)
3697					done = gifconf_list[i](dev, NULL, 0);
3698				else
3699					done = gifconf_list[i](dev, pos + total,
3700							       len - total);
3701				if (done < 0)
3702					return -EFAULT;
3703				total += done;
3704			}
3705		}
3706	}
3707
3708	/*
3709	 *	All done.  Write the updated control block back to the caller.
3710	 */
3711	ifc.ifc_len = total;
3712
3713	/*
3714	 * 	Both BSD and Solaris return 0 here, so we do too.
3715	 */
3716	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3717}
3718
3719#ifdef CONFIG_PROC_FS
3720/*
3721 *	This is invoked by the /proc filesystem handler to display a device
3722 *	in detail.
3723 */
3724void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3725	__acquires(RCU)
3726{
3727	struct net *net = seq_file_net(seq);
3728	loff_t off;
3729	struct net_device *dev;
3730
3731	rcu_read_lock();
3732	if (!*pos)
3733		return SEQ_START_TOKEN;
3734
3735	off = 1;
3736	for_each_netdev_rcu(net, dev)
3737		if (off++ == *pos)
3738			return dev;
3739
3740	return NULL;
3741}
3742
3743void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3744{
3745	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3746				  first_net_device(seq_file_net(seq)) :
3747				  next_net_device((struct net_device *)v);
3748
3749	++*pos;
3750	return rcu_dereference(dev);
3751}
3752
3753void dev_seq_stop(struct seq_file *seq, void *v)
3754	__releases(RCU)
3755{
3756	rcu_read_unlock();
3757}
3758
3759static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3760{
3761	struct rtnl_link_stats64 temp;
3762	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3763
3764	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3765		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3766		   dev->name, stats->rx_bytes, stats->rx_packets,
3767		   stats->rx_errors,
3768		   stats->rx_dropped + stats->rx_missed_errors,
3769		   stats->rx_fifo_errors,
3770		   stats->rx_length_errors + stats->rx_over_errors +
3771		    stats->rx_crc_errors + stats->rx_frame_errors,
3772		   stats->rx_compressed, stats->multicast,
3773		   stats->tx_bytes, stats->tx_packets,
3774		   stats->tx_errors, stats->tx_dropped,
3775		   stats->tx_fifo_errors, stats->collisions,
3776		   stats->tx_carrier_errors +
3777		    stats->tx_aborted_errors +
3778		    stats->tx_window_errors +
3779		    stats->tx_heartbeat_errors,
3780		   stats->tx_compressed);
3781}
3782
3783/*
3784 *	Called from the PROCfs module. This now uses the new arbitrarily sized
3785 *	/proc/net interface to create /proc/net/dev.
3786 */
3787static int dev_seq_show(struct seq_file *seq, void *v)
3788{
3789	if (v == SEQ_START_TOKEN)
3790		seq_puts(seq, "Inter-|   Receive                            "
3791			      "                    |  Transmit\n"
3792			      " face |bytes    packets errs drop fifo frame "
3793			      "compressed multicast|bytes    packets errs "
3794			      "drop fifo colls carrier compressed\n");
3795	else
3796		dev_seq_printf_stats(seq, v);
3797	return 0;
3798}
3799
3800static struct softnet_data *softnet_get_online(loff_t *pos)
3801{
3802	struct softnet_data *sd = NULL;
3803
3804	while (*pos < nr_cpu_ids)
3805		if (cpu_online(*pos)) {
3806			sd = &per_cpu(softnet_data, *pos);
3807			break;
3808		} else
3809			++*pos;
3810	return sd;
3811}
3812
3813static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3814{
3815	return softnet_get_online(pos);
3816}
3817
3818static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3819{
3820	++*pos;
3821	return softnet_get_online(pos);
3822}
3823
3824static void softnet_seq_stop(struct seq_file *seq, void *v)
3825{
3826}
3827
3828static int softnet_seq_show(struct seq_file *seq, void *v)
3829{
3830	struct softnet_data *sd = v;
3831
3832	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3833		   sd->processed, sd->dropped, sd->time_squeeze, 0,
3834		   0, 0, 0, 0, /* was fastroute */
3835		   sd->cpu_collision, sd->received_rps);
3836	return 0;
3837}
3838
3839static const struct seq_operations dev_seq_ops = {
3840	.start = dev_seq_start,
3841	.next  = dev_seq_next,
3842	.stop  = dev_seq_stop,
3843	.show  = dev_seq_show,
3844};
3845
3846static int dev_seq_open(struct inode *inode, struct file *file)
3847{
3848	return seq_open_net(inode, file, &dev_seq_ops,
3849			    sizeof(struct seq_net_private));
3850}
3851
3852static const struct file_operations dev_seq_fops = {
3853	.owner	 = THIS_MODULE,
3854	.open    = dev_seq_open,
3855	.read    = seq_read,
3856	.llseek  = seq_lseek,
3857	.release = seq_release_net,
3858};
3859
3860static const struct seq_operations softnet_seq_ops = {
3861	.start = softnet_seq_start,
3862	.next  = softnet_seq_next,
3863	.stop  = softnet_seq_stop,
3864	.show  = softnet_seq_show,
3865};
3866
3867static int softnet_seq_open(struct inode *inode, struct file *file)
3868{
3869	return seq_open(file, &softnet_seq_ops);
3870}
3871
3872static const struct file_operations softnet_seq_fops = {
3873	.owner	 = THIS_MODULE,
3874	.open    = softnet_seq_open,
3875	.read    = seq_read,
3876	.llseek  = seq_lseek,
3877	.release = seq_release,
3878};
3879
3880static void *ptype_get_idx(loff_t pos)
3881{
3882	struct packet_type *pt = NULL;
3883	loff_t i = 0;
3884	int t;
3885
3886	list_for_each_entry_rcu(pt, &ptype_all, list) {
3887		if (i == pos)
3888			return pt;
3889		++i;
3890	}
3891
3892	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3893		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3894			if (i == pos)
3895				return pt;
3896			++i;
3897		}
3898	}
3899	return NULL;
3900}
3901
3902static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3903	__acquires(RCU)
3904{
3905	rcu_read_lock();
3906	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3907}
3908
3909static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3910{
3911	struct packet_type *pt;
3912	struct list_head *nxt;
3913	int hash;
3914
3915	++*pos;
3916	if (v == SEQ_START_TOKEN)
3917		return ptype_get_idx(0);
3918
3919	pt = v;
3920	nxt = pt->list.next;
3921	if (pt->type == htons(ETH_P_ALL)) {
3922		if (nxt != &ptype_all)
3923			goto found;
3924		hash = 0;
3925		nxt = ptype_base[0].next;
3926	} else
3927		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3928
3929	while (nxt == &ptype_base[hash]) {
3930		if (++hash >= PTYPE_HASH_SIZE)
3931			return NULL;
3932		nxt = ptype_base[hash].next;
3933	}
3934found:
3935	return list_entry(nxt, struct packet_type, list);
3936}
3937
3938static void ptype_seq_stop(struct seq_file *seq, void *v)
3939	__releases(RCU)
3940{
3941	rcu_read_unlock();
3942}
3943
3944static int ptype_seq_show(struct seq_file *seq, void *v)
3945{
3946	struct packet_type *pt = v;
3947
3948	if (v == SEQ_START_TOKEN)
3949		seq_puts(seq, "Type Device      Function\n");
3950	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3951		if (pt->type == htons(ETH_P_ALL))
3952			seq_puts(seq, "ALL ");
3953		else
3954			seq_printf(seq, "%04x", ntohs(pt->type));
3955
3956		seq_printf(seq, " %-8s %pF\n",
3957			   pt->dev ? pt->dev->name : "", pt->func);
3958	}
3959
3960	return 0;
3961}
3962
3963static const struct seq_operations ptype_seq_ops = {
3964	.start = ptype_seq_start,
3965	.next  = ptype_seq_next,
3966	.stop  = ptype_seq_stop,
3967	.show  = ptype_seq_show,
3968};
3969
3970static int ptype_seq_open(struct inode *inode, struct file *file)
3971{
3972	return seq_open_net(inode, file, &ptype_seq_ops,
3973			sizeof(struct seq_net_private));
3974}
3975
3976static const struct file_operations ptype_seq_fops = {
3977	.owner	 = THIS_MODULE,
3978	.open    = ptype_seq_open,
3979	.read    = seq_read,
3980	.llseek  = seq_lseek,
3981	.release = seq_release_net,
3982};
3983
3984
3985static int __net_init dev_proc_net_init(struct net *net)
3986{
3987	int rc = -ENOMEM;
3988
3989	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3990		goto out;
3991	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3992		goto out_dev;
3993	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3994		goto out_softnet;
3995
3996	if (wext_proc_init(net))
3997		goto out_ptype;
3998	rc = 0;
3999out:
4000	return rc;
4001out_ptype:
4002	proc_net_remove(net, "ptype");
4003out_softnet:
4004	proc_net_remove(net, "softnet_stat");
4005out_dev:
4006	proc_net_remove(net, "dev");
4007	goto out;
4008}
4009
4010static void __net_exit dev_proc_net_exit(struct net *net)
4011{
4012	wext_proc_exit(net);
4013
4014	proc_net_remove(net, "ptype");
4015	proc_net_remove(net, "softnet_stat");
4016	proc_net_remove(net, "dev");
4017}
4018
4019static struct pernet_operations __net_initdata dev_proc_ops = {
4020	.init = dev_proc_net_init,
4021	.exit = dev_proc_net_exit,
4022};
4023
4024static int __init dev_proc_init(void)
4025{
4026	return register_pernet_subsys(&dev_proc_ops);
4027}
4028#else
4029#define dev_proc_init() 0
4030#endif	/* CONFIG_PROC_FS */
4031
4032
4033/**
4034 *	netdev_set_master	-	set up master/slave pair
4035 *	@slave: slave device
4036 *	@master: new master device
4037 *
4038 *	Changes the master device of the slave. Pass %NULL to break the
4039 *	bonding. The caller must hold the RTNL semaphore. On a failure
4040 *	a negative errno code is returned. On success the reference counts
4041 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4042 *	function returns zero.
4043 */
4044int netdev_set_master(struct net_device *slave, struct net_device *master)
4045{
4046	struct net_device *old = slave->master;
4047
4048	ASSERT_RTNL();
4049
4050	if (master) {
4051		if (old)
4052			return -EBUSY;
4053		dev_hold(master);
4054	}
4055
4056	slave->master = master;
4057
4058	if (old) {
4059		synchronize_net();
4060		dev_put(old);
4061	}
4062	if (master)
4063		slave->flags |= IFF_SLAVE;
4064	else
4065		slave->flags &= ~IFF_SLAVE;
4066
4067	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4068	return 0;
4069}
4070EXPORT_SYMBOL(netdev_set_master);
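
/*
 * Illustrative usage sketch (not part of this file): a bonding-style
 * driver pairs and unpairs a slave with its master under RTNL; the
 * device names below are hypothetical, and passing NULL breaks the
 * pairing again:
 *
 *	rtnl_lock();
 *	err = netdev_set_master(slave_dev, bond_dev);
 *	...
 *	netdev_set_master(slave_dev, NULL);
 *	rtnl_unlock();
 */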
4071
4072static void dev_change_rx_flags(struct net_device *dev, int flags)
4073{
4074	const struct net_device_ops *ops = dev->netdev_ops;
4075
4076	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4077		ops->ndo_change_rx_flags(dev, flags);
4078}
4079
4080static int __dev_set_promiscuity(struct net_device *dev, int inc)
4081{
4082	unsigned short old_flags = dev->flags;
4083	uid_t uid;
4084	gid_t gid;
4085
4086	ASSERT_RTNL();
4087
4088	dev->flags |= IFF_PROMISC;
4089	dev->promiscuity += inc;
4090	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc would overflow the counter, leave promisc untouched
		 * and return an error.
		 */
4095		if (inc < 0)
4096			dev->flags &= ~IFF_PROMISC;
4097		else {
4098			dev->promiscuity -= inc;
			printk(KERN_WARNING "%s: promiscuity counter overflow, "
				"promiscuity was not changed; the promiscuity "
				"feature of this device may be broken.\n", dev->name);
4102			return -EOVERFLOW;
4103		}
4104	}
4105	if (dev->flags != old_flags) {
4106		printk(KERN_INFO "device %s %s promiscuous mode\n",
4107		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4108							       "left");
4109		if (audit_enabled) {
4110			current_uid_gid(&uid, &gid);
4111			audit_log(current->audit_context, GFP_ATOMIC,
4112				AUDIT_ANOM_PROMISCUOUS,
4113				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4114				dev->name, (dev->flags & IFF_PROMISC),
4115				(old_flags & IFF_PROMISC),
4116				audit_get_loginuid(current),
4117				uid, gid,
4118				audit_get_sessionid(current));
4119		}
4120
4121		dev_change_rx_flags(dev, IFF_PROMISC);
4122	}
4123	return 0;
4124}
4125
4126/**
4127 *	dev_set_promiscuity	- update promiscuity count on a device
4128 *	@dev: device
4129 *	@inc: modifier
4130 *
 *	Add or remove promiscuity from a device. While the count in the device
 *	remains above zero the interface remains promiscuous. Once it hits zero
 *	the device reverts to normal filtering operation. A negative @inc
 *	value is used to drop promiscuity on the device.
 *	Return 0 if successful or a negative errno code on error.
4136 */
4137int dev_set_promiscuity(struct net_device *dev, int inc)
4138{
4139	unsigned short old_flags = dev->flags;
4140	int err;
4141
4142	err = __dev_set_promiscuity(dev, inc);
4143	if (err < 0)
4144		return err;
4145	if (dev->flags != old_flags)
4146		dev_set_rx_mode(dev);
4147	return err;
4148}
4149EXPORT_SYMBOL(dev_set_promiscuity);
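
/*
 * Usage sketch (illustrative only, not part of this file): a
 * capture-style consumer bumps the promiscuity count while it listens
 * and drops it again on the way out, keeping the two calls balanced;
 * both calls run under RTNL and "dev" is assumed to be a held
 * net_device.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 */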
4150
4151/**
4152 *	dev_set_allmulti	- update allmulti count on a device
4153 *	@dev: device
4154 *	@inc: modifier
4155 *
 *	Add or remove reception of all multicast frames to a device. While the
 *	count in the device remains above zero the interface keeps receiving
 *	all multicast frames. Once it hits zero the device reverts to normal
 *	filtering operation. A negative @inc value is used to drop the counter
 *	when releasing a resource needing all multicasts.
 *	Return 0 if successful or a negative errno code on error.
4162 */
4163
4164int dev_set_allmulti(struct net_device *dev, int inc)
4165{
4166	unsigned short old_flags = dev->flags;
4167
4168	ASSERT_RTNL();
4169
4170	dev->flags |= IFF_ALLMULTI;
4171	dev->allmulti += inc;
4172	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc would overflow the counter, leave allmulti untouched
		 * and return an error.
		 */
4177		if (inc < 0)
4178			dev->flags &= ~IFF_ALLMULTI;
4179		else {
4180			dev->allmulti -= inc;
			printk(KERN_WARNING "%s: allmulti counter overflow, "
				"allmulti was not changed; the allmulti feature "
				"of this device may be broken.\n", dev->name);
4184			return -EOVERFLOW;
4185		}
4186	}
4187	if (dev->flags ^ old_flags) {
4188		dev_change_rx_flags(dev, IFF_ALLMULTI);
4189		dev_set_rx_mode(dev);
4190	}
4191	return 0;
4192}
4193EXPORT_SYMBOL(dev_set_allmulti);
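
/*
 * Usage sketch (illustrative only): a protocol that needs every
 * multicast frame (hypothetical example) takes and releases the
 * allmulti count symmetrically, under RTNL:
 *
 *	rtnl_lock();
 *	err = dev_set_allmulti(dev, 1);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_allmulti(dev, -1);
 *	rtnl_unlock();
 */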
4194
4195/*
4196 *	Upload unicast and multicast address lists to device and
4197 *	configure RX filtering. When the device doesn't support unicast
4198 *	filtering it is put in promiscuous mode while unicast addresses
4199 *	are present.
4200 */
4201void __dev_set_rx_mode(struct net_device *dev)
4202{
4203	const struct net_device_ops *ops = dev->netdev_ops;
4204
4205	/* dev_open will call this function so the list will stay sane. */
4206	if (!(dev->flags&IFF_UP))
4207		return;
4208
4209	if (!netif_device_present(dev))
4210		return;
4211
4212	if (ops->ndo_set_rx_mode)
4213		ops->ndo_set_rx_mode(dev);
4214	else {
		/* Unicast address changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
4218		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4219			__dev_set_promiscuity(dev, 1);
4220			dev->uc_promisc = 1;
4221		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4222			__dev_set_promiscuity(dev, -1);
4223			dev->uc_promisc = 0;
4224		}
4225
4226		if (ops->ndo_set_multicast_list)
4227			ops->ndo_set_multicast_list(dev);
4228	}
4229}
4230
4231void dev_set_rx_mode(struct net_device *dev)
4232{
4233	netif_addr_lock_bh(dev);
4234	__dev_set_rx_mode(dev);
4235	netif_addr_unlock_bh(dev);
4236}
4237
4238/**
4239 *	dev_get_flags - get flags reported to userspace
4240 *	@dev: device
4241 *
4242 *	Get the combination of flag bits exported through APIs to userspace.
4243 */
4244unsigned dev_get_flags(const struct net_device *dev)
4245{
4246	unsigned flags;
4247
4248	flags = (dev->flags & ~(IFF_PROMISC |
4249				IFF_ALLMULTI |
4250				IFF_RUNNING |
4251				IFF_LOWER_UP |
4252				IFF_DORMANT)) |
4253		(dev->gflags & (IFF_PROMISC |
4254				IFF_ALLMULTI));
4255
4256	if (netif_running(dev)) {
4257		if (netif_oper_up(dev))
4258			flags |= IFF_RUNNING;
4259		if (netif_carrier_ok(dev))
4260			flags |= IFF_LOWER_UP;
4261		if (netif_dormant(dev))
4262			flags |= IFF_DORMANT;
4263	}
4264
4265	return flags;
4266}
4267EXPORT_SYMBOL(dev_get_flags);
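
/*
 * Usage sketch (illustrative): code that reports interface state to
 * userspace reads the combined flag word rather than dev->flags
 * directly, e.g.
 *
 *	unsigned flags = dev_get_flags(dev);
 *
 *	if (flags & IFF_RUNNING)
 *		... the interface is operationally up ...
 */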
4268
4269int __dev_change_flags(struct net_device *dev, unsigned int flags)
4270{
4271	int old_flags = dev->flags;
4272	int ret;
4273
4274	ASSERT_RTNL();
4275
4276	/*
4277	 *	Set the flags on our device.
4278	 */
4279
4280	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4281			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4282			       IFF_AUTOMEDIA)) |
4283		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4284				    IFF_ALLMULTI));
4285
4286	/*
4287	 *	Load in the correct multicast list now the flags have changed.
4288	 */
4289
4290	if ((old_flags ^ flags) & IFF_MULTICAST)
4291		dev_change_rx_flags(dev, IFF_MULTICAST);
4292
4293	dev_set_rx_mode(dev);
4294
4295	/*
	 *	Have we downed the interface? We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
4299	 */
4300
4301	ret = 0;
4302	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4303		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4304
4305		if (!ret)
4306			dev_set_rx_mode(dev);
4307	}
4308
4309	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4310		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4311
4312		dev->gflags ^= IFF_PROMISC;
4313		dev_set_promiscuity(dev, inc);
4314	}
4315
	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC when
	   IFF_ALLMULTI is requested, without asking us and without
	   reporting it.
	 */
4320	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4321		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4322
4323		dev->gflags ^= IFF_ALLMULTI;
4324		dev_set_allmulti(dev, inc);
4325	}
4326
4327	return ret;
4328}
4329
4330void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4331{
4332	unsigned int changes = dev->flags ^ old_flags;
4333
4334	if (changes & IFF_UP) {
4335		if (dev->flags & IFF_UP)
4336			call_netdevice_notifiers(NETDEV_UP, dev);
4337		else
4338			call_netdevice_notifiers(NETDEV_DOWN, dev);
4339	}
4340
4341	if (dev->flags & IFF_UP &&
4342	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4343		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4344}
4345
4346/**
4347 *	dev_change_flags - change device settings
4348 *	@dev: device
4349 *	@flags: device state flags
4350 *
 *	Change settings on a device based on the supplied state flags.
 *	The flags are in the userspace exported format.
4353 */
4354int dev_change_flags(struct net_device *dev, unsigned flags)
4355{
4356	int ret, changes;
4357	int old_flags = dev->flags;
4358
4359	ret = __dev_change_flags(dev, flags);
4360	if (ret < 0)
4361		return ret;
4362
4363	changes = old_flags ^ dev->flags;
4364	if (changes)
4365		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4366
4367	__dev_notify_flags(dev, old_flags);
4368	return ret;
4369}
4370EXPORT_SYMBOL(dev_change_flags);
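
/*
 * Usage sketch (illustrative, "dev" is hypothetical): bringing an
 * interface administratively up from kernel code mirrors what the
 * SIOCSIFFLAGS ioctl does and must run under RTNL:
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */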
4371
4372/**
4373 *	dev_set_mtu - Change maximum transfer unit
4374 *	@dev: device
4375 *	@new_mtu: new transfer unit
4376 *
4377 *	Change the maximum transfer size of the network device.
4378 */
4379int dev_set_mtu(struct net_device *dev, int new_mtu)
4380{
4381	const struct net_device_ops *ops = dev->netdev_ops;
4382	int err;
4383
4384	if (new_mtu == dev->mtu)
4385		return 0;
4386
4387	/*	MTU must be positive.	 */
4388	if (new_mtu < 0)
4389		return -EINVAL;
4390
4391	if (!netif_device_present(dev))
4392		return -ENODEV;
4393
4394	err = 0;
4395	if (ops->ndo_change_mtu)
4396		err = ops->ndo_change_mtu(dev, new_mtu);
4397	else
4398		dev->mtu = new_mtu;
4399
4400	if (!err && dev->flags & IFF_UP)
4401		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4402	return err;
4403}
4404EXPORT_SYMBOL(dev_set_mtu);
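
/*
 * Usage sketch (illustrative, device names hypothetical): a stacked
 * driver keeping its own MTU within the lower device's limit might
 * do, under RTNL:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(upper_dev, min(upper_dev->mtu, lower_dev->mtu));
 *	rtnl_unlock();
 */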
4405
4406/**
4407 *	dev_set_mac_address - Change Media Access Control Address
4408 *	@dev: device
4409 *	@sa: new address
4410 *
4411 *	Change the hardware (MAC) address of the device
4412 */
4413int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4414{
4415	const struct net_device_ops *ops = dev->netdev_ops;
4416	int err;
4417
4418	if (!ops->ndo_set_mac_address)
4419		return -EOPNOTSUPP;
4420	if (sa->sa_family != dev->type)
4421		return -EINVAL;
4422	if (!netif_device_present(dev))
4423		return -ENODEV;
4424	err = ops->ndo_set_mac_address(dev, sa);
4425	if (!err)
4426		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4427	return err;
4428}
4429EXPORT_SYMBOL(dev_set_mac_address);
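
/*
 * Usage sketch (illustrative, "new_mac" is hypothetical): the address
 * is passed as a struct sockaddr whose family must match dev->type,
 * e.g. ARPHRD_ETHER for Ethernet devices:
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */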
4430
4431/*
4432 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4433 */
4434static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4435{
4436	int err;
4437	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4438
4439	if (!dev)
4440		return -ENODEV;
4441
4442	switch (cmd) {
4443	case SIOCGIFFLAGS:	/* Get interface flags */
4444		ifr->ifr_flags = (short) dev_get_flags(dev);
4445		return 0;
4446
4447	case SIOCGIFMETRIC:	/* Get the metric on the interface
4448				   (currently unused) */
4449		ifr->ifr_metric = 0;
4450		return 0;
4451
4452	case SIOCGIFMTU:	/* Get the MTU of a device */
4453		ifr->ifr_mtu = dev->mtu;
4454		return 0;
4455
4456	case SIOCGIFHWADDR:
4457		if (!dev->addr_len)
4458			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4459		else
4460			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4461			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4462		ifr->ifr_hwaddr.sa_family = dev->type;
4463		return 0;
4464
4465	case SIOCGIFSLAVE:
4466		err = -EINVAL;
4467		break;
4468
4469	case SIOCGIFMAP:
4470		ifr->ifr_map.mem_start = dev->mem_start;
4471		ifr->ifr_map.mem_end   = dev->mem_end;
4472		ifr->ifr_map.base_addr = dev->base_addr;
4473		ifr->ifr_map.irq       = dev->irq;
4474		ifr->ifr_map.dma       = dev->dma;
4475		ifr->ifr_map.port      = dev->if_port;
4476		return 0;
4477
4478	case SIOCGIFINDEX:
4479		ifr->ifr_ifindex = dev->ifindex;
4480		return 0;
4481
4482	case SIOCGIFTXQLEN:
4483		ifr->ifr_qlen = dev->tx_queue_len;
4484		return 0;
4485
4486	default:
4487		/* dev_ioctl() should ensure this case
4488		 * is never reached
4489		 */
4490		WARN_ON(1);
4491		err = -EINVAL;
4492		break;
4493
4494	}
4495	return err;
4496}
4497
4498/*
4499 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4500 */
4501static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4502{
4503	int err;
4504	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4505	const struct net_device_ops *ops;
4506
4507	if (!dev)
4508		return -ENODEV;
4509
4510	ops = dev->netdev_ops;
4511
4512	switch (cmd) {
4513	case SIOCSIFFLAGS:	/* Set interface flags */
4514		return dev_change_flags(dev, ifr->ifr_flags);
4515
4516	case SIOCSIFMETRIC:	/* Set the metric on the interface
4517				   (currently unused) */
4518		return -EOPNOTSUPP;
4519
4520	case SIOCSIFMTU:	/* Set the MTU of a device */
4521		return dev_set_mtu(dev, ifr->ifr_mtu);
4522
4523	case SIOCSIFHWADDR:
4524		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4525
4526	case SIOCSIFHWBROADCAST:
4527		if (ifr->ifr_hwaddr.sa_family != dev->type)
4528			return -EINVAL;
4529		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4530		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4531		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4532		return 0;
4533
4534	case SIOCSIFMAP:
4535		if (ops->ndo_set_config) {
4536			if (!netif_device_present(dev))
4537				return -ENODEV;
4538			return ops->ndo_set_config(dev, &ifr->ifr_map);
4539		}
4540		return -EOPNOTSUPP;
4541
4542	case SIOCADDMULTI:
4543		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4544		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4545			return -EINVAL;
4546		if (!netif_device_present(dev))
4547			return -ENODEV;
4548		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4549
4550	case SIOCDELMULTI:
4551		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4552		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4553			return -EINVAL;
4554		if (!netif_device_present(dev))
4555			return -ENODEV;
4556		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4557
4558	case SIOCSIFTXQLEN:
4559		if (ifr->ifr_qlen < 0)
4560			return -EINVAL;
4561		dev->tx_queue_len = ifr->ifr_qlen;
4562		return 0;
4563
4564	case SIOCSIFNAME:
4565		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4566		return dev_change_name(dev, ifr->ifr_newname);
4567
4568	/*
4569	 *	Unknown or private ioctl
4570	 */
4571	default:
4572		if ((cmd >= SIOCDEVPRIVATE &&
4573		    cmd <= SIOCDEVPRIVATE + 15) ||
4574		    cmd == SIOCBONDENSLAVE ||
4575		    cmd == SIOCBONDRELEASE ||
4576		    cmd == SIOCBONDSETHWADDR ||
4577		    cmd == SIOCBONDSLAVEINFOQUERY ||
4578		    cmd == SIOCBONDINFOQUERY ||
4579		    cmd == SIOCBONDCHANGEACTIVE ||
4580		    cmd == SIOCGMIIPHY ||
4581		    cmd == SIOCGMIIREG ||
4582		    cmd == SIOCSMIIREG ||
4583		    cmd == SIOCBRADDIF ||
4584		    cmd == SIOCBRDELIF ||
4585		    cmd == SIOCSHWTSTAMP ||
4586		    cmd == SIOCWANDEV) {
4587			err = -EOPNOTSUPP;
4588			if (ops->ndo_do_ioctl) {
4589				if (netif_device_present(dev))
4590					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4591				else
4592					err = -ENODEV;
4593			}
4594		} else
4595			err = -EINVAL;
4596
4597	}
4598	return err;
4599}
4600
4601/*
4602 *	This function handles all "interface"-type I/O control requests. The actual
4603 *	'doing' part of this is dev_ifsioc above.
4604 */
4605
4606/**
4607 *	dev_ioctl	-	network device ioctl
4608 *	@net: the applicable net namespace
4609 *	@cmd: command to issue
4610 *	@arg: pointer to a struct ifreq in user space
4611 *
4612 *	Issue ioctl functions to devices. This is normally called by the
4613 *	user space syscall interfaces but can sometimes be useful for
4614 *	other purposes. The return value is the return from the syscall if
4615 *	positive or a negative errno code on error.
4616 */
4617
4618int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4619{
4620	struct ifreq ifr;
4621	int ret;
4622	char *colon;
4623
4624	/* One special case: SIOCGIFCONF takes ifconf argument
4625	   and requires shared lock, because it sleeps writing
4626	   to user space.
4627	 */
4628
4629	if (cmd == SIOCGIFCONF) {
4630		rtnl_lock();
4631		ret = dev_ifconf(net, (char __user *) arg);
4632		rtnl_unlock();
4633		return ret;
4634	}
4635	if (cmd == SIOCGIFNAME)
4636		return dev_ifname(net, (struct ifreq __user *)arg);
4637
4638	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4639		return -EFAULT;
4640
4641	ifr.ifr_name[IFNAMSIZ-1] = 0;
4642
4643	colon = strchr(ifr.ifr_name, ':');
4644	if (colon)
4645		*colon = 0;
4646
4647	/*
4648	 *	See which interface the caller is talking about.
4649	 */
4650
4651	switch (cmd) {
4652	/*
4653	 *	These ioctl calls:
4654	 *	- can be done by all.
4655	 *	- atomic and do not require locking.
4656	 *	- return a value
4657	 */
4658	case SIOCGIFFLAGS:
4659	case SIOCGIFMETRIC:
4660	case SIOCGIFMTU:
4661	case SIOCGIFHWADDR:
4662	case SIOCGIFSLAVE:
4663	case SIOCGIFMAP:
4664	case SIOCGIFINDEX:
4665	case SIOCGIFTXQLEN:
4666		dev_load(net, ifr.ifr_name);
4667		rcu_read_lock();
4668		ret = dev_ifsioc_locked(net, &ifr, cmd);
4669		rcu_read_unlock();
4670		if (!ret) {
4671			if (colon)
4672				*colon = ':';
4673			if (copy_to_user(arg, &ifr,
4674					 sizeof(struct ifreq)))
4675				ret = -EFAULT;
4676		}
4677		return ret;
4678
4679	case SIOCETHTOOL:
4680		dev_load(net, ifr.ifr_name);
4681		rtnl_lock();
4682		ret = dev_ethtool(net, &ifr);
4683		rtnl_unlock();
4684		if (!ret) {
4685			if (colon)
4686				*colon = ':';
4687			if (copy_to_user(arg, &ifr,
4688					 sizeof(struct ifreq)))
4689				ret = -EFAULT;
4690		}
4691		return ret;
4692
4693	/*
4694	 *	These ioctl calls:
4695	 *	- require superuser power.
4696	 *	- require strict serialization.
4697	 *	- return a value
4698	 */
4699	case SIOCGMIIPHY:
4700	case SIOCGMIIREG:
4701	case SIOCSIFNAME:
4702		if (!capable(CAP_NET_ADMIN))
4703			return -EPERM;
4704		dev_load(net, ifr.ifr_name);
4705		rtnl_lock();
4706		ret = dev_ifsioc(net, &ifr, cmd);
4707		rtnl_unlock();
4708		if (!ret) {
4709			if (colon)
4710				*colon = ':';
4711			if (copy_to_user(arg, &ifr,
4712					 sizeof(struct ifreq)))
4713				ret = -EFAULT;
4714		}
4715		return ret;
4716
4717	/*
4718	 *	These ioctl calls:
4719	 *	- require superuser power.
4720	 *	- require strict serialization.
4721	 *	- do not return a value
4722	 */
4723	case SIOCSIFFLAGS:
4724	case SIOCSIFMETRIC:
4725	case SIOCSIFMTU:
4726	case SIOCSIFMAP:
4727	case SIOCSIFHWADDR:
4728	case SIOCSIFSLAVE:
4729	case SIOCADDMULTI:
4730	case SIOCDELMULTI:
4731	case SIOCSIFHWBROADCAST:
4732	case SIOCSIFTXQLEN:
4733	case SIOCSMIIREG:
4734	case SIOCBONDENSLAVE:
4735	case SIOCBONDRELEASE:
4736	case SIOCBONDSETHWADDR:
4737	case SIOCBONDCHANGEACTIVE:
4738	case SIOCBRADDIF:
4739	case SIOCBRDELIF:
4740	case SIOCSHWTSTAMP:
4741		if (!capable(CAP_NET_ADMIN))
4742			return -EPERM;
4743		/* fall through */
4744	case SIOCBONDSLAVEINFOQUERY:
4745	case SIOCBONDINFOQUERY:
4746		dev_load(net, ifr.ifr_name);
4747		rtnl_lock();
4748		ret = dev_ifsioc(net, &ifr, cmd);
4749		rtnl_unlock();
4750		return ret;
4751
4752	case SIOCGIFMEM:
4753		/* Get the per device memory space. We can add this but
4754		 * currently do not support it */
4755	case SIOCSIFMEM:
4756		/* Set the per device memory buffer space.
4757		 * Not applicable in our case */
4758	case SIOCSIFLINK:
4759		return -EINVAL;
4760
4761	/*
4762	 *	Unknown or private ioctl.
4763	 */
4764	default:
4765		if (cmd == SIOCWANDEV ||
4766		    (cmd >= SIOCDEVPRIVATE &&
4767		     cmd <= SIOCDEVPRIVATE + 15)) {
4768			dev_load(net, ifr.ifr_name);
4769			rtnl_lock();
4770			ret = dev_ifsioc(net, &ifr, cmd);
4771			rtnl_unlock();
4772			if (!ret && copy_to_user(arg, &ifr,
4773						 sizeof(struct ifreq)))
4774				ret = -EFAULT;
4775			return ret;
4776		}
4777		/* Take care of Wireless Extensions */
4778		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4779			return wext_handle_ioctl(net, &ifr, cmd, arg);
4780		return -EINVAL;
4781	}
4782}
4783
4784
4785/**
4786 *	dev_new_index	-	allocate an ifindex
4787 *	@net: the applicable net namespace
4788 *
4789 *	Returns a suitable unique value for a new device interface
4790 *	number.  The caller must hold the rtnl semaphore or the
4791 *	dev_base_lock to be sure it remains unique.
4792 */
4793static int dev_new_index(struct net *net)
4794{
4795	static int ifindex;
4796	for (;;) {
4797		if (++ifindex <= 0)
4798			ifindex = 1;
4799		if (!__dev_get_by_index(net, ifindex))
4800			return ifindex;
4801	}
4802}
4803
/* Delayed registration/unregistration */
4805static LIST_HEAD(net_todo_list);
4806
4807static void net_set_todo(struct net_device *dev)
4808{
4809	list_add_tail(&dev->todo_list, &net_todo_list);
4810}
4811
4812static void rollback_registered_many(struct list_head *head)
4813{
4814	struct net_device *dev, *tmp;
4815
4816	BUG_ON(dev_boot_phase);
4817	ASSERT_RTNL();
4818
4819	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call this without ever having been
		 * registered, as part of initialization unwind. Remove
		 * those devices and proceed with the remaining ones.
		 */
4824		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p was "
				 "never registered\n", dev->name, dev);
4827
4828			WARN_ON(1);
4829			list_del(&dev->unreg_list);
4830			continue;
4831		}
4832
4833		BUG_ON(dev->reg_state != NETREG_REGISTERED);
4834
4835		/* If device is running, close it first. */
4836		dev_close(dev);
4837
4838		/* And unlink it from device chain. */
4839		unlist_netdevice(dev);
4840
4841		dev->reg_state = NETREG_UNREGISTERING;
4842	}
4843
4844	synchronize_net();
4845
4846	list_for_each_entry(dev, head, unreg_list) {
4847		/* Shutdown queueing discipline. */
4848		dev_shutdown(dev);
4849
4850
4851		/* Notify protocols, that we are about to destroy
4852		   this device. They should clean all the things.
4853		*/
4854		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4855
4856		if (!dev->rtnl_link_ops ||
4857		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4858			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4859
4860		/*
4861		 *	Flush the unicast and multicast chains
4862		 */
4863		dev_uc_flush(dev);
4864		dev_mc_flush(dev);
4865
4866		if (dev->netdev_ops->ndo_uninit)
4867			dev->netdev_ops->ndo_uninit(dev);
4868
4869		/* Notifier chain MUST detach us from master device. */
4870		WARN_ON(dev->master);
4871
4872		/* Remove entries from kobject tree */
4873		netdev_unregister_kobject(dev);
4874	}
4875
4876	/* Process any work delayed until the end of the batch */
4877	dev = list_first_entry(head, struct net_device, unreg_list);
4878	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4879
4880	rcu_barrier();
4881
4882	list_for_each_entry(dev, head, unreg_list)
4883		dev_put(dev);
4884}
4885
4886static void rollback_registered(struct net_device *dev)
4887{
4888	LIST_HEAD(single);
4889
4890	list_add(&dev->unreg_list, &single);
4891	rollback_registered_many(&single);
4892}
4893
4894static void __netdev_init_queue_locks_one(struct net_device *dev,
4895					  struct netdev_queue *dev_queue,
4896					  void *_unused)
4897{
4898	spin_lock_init(&dev_queue->_xmit_lock);
4899	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4900	dev_queue->xmit_lock_owner = -1;
4901}
4902
4903static void netdev_init_queue_locks(struct net_device *dev)
4904{
4905	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4906	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4907}
4908
4909unsigned long netdev_fix_features(unsigned long features, const char *name)
4910{
4911	/* Fix illegal SG+CSUM combinations. */
4912	if ((features & NETIF_F_SG) &&
4913	    !(features & NETIF_F_ALL_CSUM)) {
4914		if (name)
4915			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4916			       "checksum feature.\n", name);
4917		features &= ~NETIF_F_SG;
4918	}
4919
4920	/* TSO requires that SG is present as well. */
4921	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4922		if (name)
4923			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4924			       "SG feature.\n", name);
4925		features &= ~NETIF_F_TSO;
4926	}
4927
4928	if (features & NETIF_F_UFO) {
4929		if (!(features & NETIF_F_GEN_CSUM)) {
4930			if (name)
4931				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4932				       "since no NETIF_F_HW_CSUM feature.\n",
4933				       name);
4934			features &= ~NETIF_F_UFO;
4935		}
4936
4937		if (!(features & NETIF_F_SG)) {
4938			if (name)
4939				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4940				       "since no NETIF_F_SG feature.\n", name);
4941			features &= ~NETIF_F_UFO;
4942		}
4943	}
4944
4945	return features;
4946}
4947EXPORT_SYMBOL(netdev_fix_features);
4948
4949/**
4950 *	netif_stacked_transfer_operstate -	transfer operstate
4951 *	@rootdev: the root or lower level device to transfer state from
4952 *	@dev: the device to transfer operstate to
4953 *
 *	Transfer operational state from root to device. This is normally
 *	called when a stacking relationship exists between the root
 *	device and the device (a leaf device).
4957 */
4958void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4959					struct net_device *dev)
4960{
4961	if (rootdev->operstate == IF_OPER_DORMANT)
4962		netif_dormant_on(dev);
4963	else
4964		netif_dormant_off(dev);
4965
4966	if (netif_carrier_ok(rootdev)) {
4967		if (!netif_carrier_ok(dev))
4968			netif_carrier_on(dev);
4969	} else {
4970		if (netif_carrier_ok(dev))
4971			netif_carrier_off(dev);
4972	}
4973}
4974EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4975
4976/**
4977 *	register_netdevice	- register a network device
4978 *	@dev: device to register
4979 *
4980 *	Take a completed network device structure and add it to the kernel
4981 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4982 *	chain. 0 is returned on success. A negative errno code is returned
4983 *	on a failure to set up the device, or if the name is a duplicate.
4984 *
4985 *	Callers must hold the rtnl semaphore. You may want
4986 *	register_netdev() instead of this.
4987 *
4988 *	BUGS:
4989 *	The locking appears insufficient to guarantee two parallel registers
4990 *	will not get the same name.
4991 */
4992
4993int register_netdevice(struct net_device *dev)
4994{
4995	int ret;
4996	struct net *net = dev_net(dev);
4997
4998	BUG_ON(dev_boot_phase);
4999	ASSERT_RTNL();
5000
5001	might_sleep();
5002
	/* When net_devices are persistent, this will be fatal. */
5004	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5005	BUG_ON(!net);
5006
5007	spin_lock_init(&dev->addr_list_lock);
5008	netdev_set_addr_lockdep_class(dev);
5009	netdev_init_queue_locks(dev);
5010
5011	dev->iflink = -1;
5012
5013#ifdef CONFIG_RPS
5014	if (!dev->num_rx_queues) {
5015		/*
5016		 * Allocate a single RX queue if driver never called
5017		 * alloc_netdev_mq
5018		 */
5019
5020		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
5021		if (!dev->_rx) {
5022			ret = -ENOMEM;
5023			goto out;
5024		}
5025
5026		dev->_rx->first = dev->_rx;
5027		atomic_set(&dev->_rx->count, 1);
5028		dev->num_rx_queues = 1;
5029	}
5030#endif
5031	/* Init, if this function is available */
5032	if (dev->netdev_ops->ndo_init) {
5033		ret = dev->netdev_ops->ndo_init(dev);
5034		if (ret) {
5035			if (ret > 0)
5036				ret = -EIO;
5037			goto out;
5038		}
5039	}
5040
5041	ret = dev_get_valid_name(dev, dev->name, 0);
5042	if (ret)
5043		goto err_uninit;
5044
5045	dev->ifindex = dev_new_index(net);
5046	if (dev->iflink == -1)
5047		dev->iflink = dev->ifindex;
5048
5049	/* Fix illegal checksum combinations */
5050	if ((dev->features & NETIF_F_HW_CSUM) &&
5051	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5052		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5053		       dev->name);
5054		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5055	}
5056
5057	if ((dev->features & NETIF_F_NO_CSUM) &&
5058	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5059		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5060		       dev->name);
5061		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5062	}
5063
5064	dev->features = netdev_fix_features(dev->features, dev->name);
5065
5066	/* Enable software GSO if SG is supported. */
5067	if (dev->features & NETIF_F_SG)
5068		dev->features |= NETIF_F_GSO;
5069
5070	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5071	ret = notifier_to_errno(ret);
5072	if (ret)
5073		goto err_uninit;
5074
5075	ret = netdev_register_kobject(dev);
5076	if (ret)
5077		goto err_uninit;
5078	dev->reg_state = NETREG_REGISTERED;
5079
	/*
	 *	Default initial state at registration is that the
	 *	device is present.
	 */
5084
5085	set_bit(__LINK_STATE_PRESENT, &dev->state);
5086
5087	dev_init_scheduler(dev);
5088	dev_hold(dev);
5089	list_netdevice(dev);
5090
5091	/* Notify protocols, that a new device appeared. */
5092	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5093	ret = notifier_to_errno(ret);
5094	if (ret) {
5095		rollback_registered(dev);
5096		dev->reg_state = NETREG_UNREGISTERED;
5097	}
5098	/*
5099	 *	Prevent userspace races by waiting until the network
5100	 *	device is fully setup before sending notifications.
5101	 */
5102	if (!dev->rtnl_link_ops ||
5103	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5104		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5105
5106out:
5107	return ret;
5108
5109err_uninit:
5110	if (dev->netdev_ops->ndo_uninit)
5111		dev->netdev_ops->ndo_uninit(dev);
5112	goto out;
5113}
5114EXPORT_SYMBOL(register_netdevice);
5115
5116/**
5117 *	init_dummy_netdev	- init a dummy network device for NAPI
5118 *	@dev: device to init
5119 *
 *	This takes a network device structure and initializes the minimum
 *	set of fields so it can be used to schedule NAPI polls without
 *	registering a full-blown interface. This is to be used by drivers
 *	that need to tie several hardware interfaces to a single NAPI
 *	poll scheduler due to HW limitations.
5125 */
5126int init_dummy_netdev(struct net_device *dev)
5127{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * used only for NAPI polls.
	 */
5133	memset(dev, 0, sizeof(struct net_device));
5134
5135	/* make sure we BUG if trying to hit standard
5136	 * register/unregister code path
5137	 */
5138	dev->reg_state = NETREG_DUMMY;
5139
5140	/* initialize the ref count */
5141	atomic_set(&dev->refcnt, 1);
5142
5143	/* NAPI wants this */
5144	INIT_LIST_HEAD(&dev->napi_list);
5145
5146	/* a dummy interface is started by default */
5147	set_bit(__LINK_STATE_PRESENT, &dev->state);
5148	set_bit(__LINK_STATE_START, &dev->state);
5149
5150	return 0;
5151}
5152EXPORT_SYMBOL_GPL(init_dummy_netdev);
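
/*
 * Usage sketch (illustrative, all names hypothetical): a driver whose
 * single interrupt services several hardware ports can hang its NAPI
 * context off a dummy netdev instead of one of the real interfaces:
 *
 *	struct my_hw {
 *		struct net_device napi_dev;
 *		struct napi_struct napi;
 *	};
 *
 *	init_dummy_netdev(&hw->napi_dev);
 *	netif_napi_add(&hw->napi_dev, &hw->napi, my_poll, 64);
 *	napi_enable(&hw->napi);
 */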
5153
5154
5155/**
5156 *	register_netdev	- register a network device
5157 *	@dev: device to register
5158 *
5159 *	Take a completed network device structure and add it to the kernel
5160 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5161 *	chain. 0 is returned on success. A negative errno code is returned
5162 *	on a failure to set up the device, or if the name is a duplicate.
5163 *
5164 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5165 *	and expands the device name if you passed a format string to
5166 *	alloc_netdev.
5167 */
5168int register_netdev(struct net_device *dev)
5169{
5170	int err;
5171
5172	rtnl_lock();
5173
5174	/*
5175	 * If the name is a format string the caller wants us to do a
5176	 * name allocation.
5177	 */
5178	if (strchr(dev->name, '%')) {
5179		err = dev_alloc_name(dev, dev->name);
5180		if (err < 0)
5181			goto out;
5182	}
5183
5184	err = register_netdevice(dev);
5185out:
5186	rtnl_unlock();
5187	return err;
5188}
5189EXPORT_SYMBOL(register_netdev);
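
/*
 * Usage sketch (illustrative, names hypothetical): the common
 * probe-time pattern for an Ethernet driver:
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err)
 *		free_netdev(dev);
 */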
5190
5191/*
5192 * netdev_wait_allrefs - wait until all references are gone.
5193 *
5194 * This is called when unregistering network devices.
5195 *
5196 * Any protocol or device that holds a reference should register
5197 * for netdevice notification, and cleanup and put back the
5198 * reference if they receive an UNREGISTER event.
5199 * We can get stuck here if buggy protocols don't correctly
5200 * call dev_put.
5201 */
5202static void netdev_wait_allrefs(struct net_device *dev)
5203{
5204	unsigned long rebroadcast_time, warning_time;
5205
5206	linkwatch_forget_dev(dev);
5207
5208	rebroadcast_time = warning_time = jiffies;
5209	while (atomic_read(&dev->refcnt) != 0) {
5210		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5211			rtnl_lock();
5212
5213			/* Rebroadcast unregister notification */
5214			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
			 * should have already handled it the first time */
5217
5218			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5219				     &dev->state)) {
5220				/* We must not have linkwatch events
5221				 * pending on unregister. If this
5222				 * happens, we simply run the queue
5223				 * unscheduled, resulting in a noop
5224				 * for this device.
5225				 */
5226				linkwatch_run_queue();
5227			}
5228
5229			__rtnl_unlock();
5230
5231			rebroadcast_time = jiffies;
5232		}
5233
5234		msleep(250);
5235
5236		if (time_after(jiffies, warning_time + 10 * HZ)) {
5237			printk(KERN_EMERG "unregister_netdevice: "
5238			       "waiting for %s to become free. Usage "
5239			       "count = %d\n",
5240			       dev->name, atomic_read(&dev->refcnt));
5241			warning_time = jiffies;
5242		}
5243	}
5244}
5245
5246/* The sequence is:
5247 *
5248 *	rtnl_lock();
5249 *	...
5250 *	register_netdevice(x1);
5251 *	register_netdevice(x2);
5252 *	...
5253 *	unregister_netdevice(y1);
5254 *	unregister_netdevice(y2);
5255 *      ...
5256 *	rtnl_unlock();
5257 *	free_netdev(y1);
5258 *	free_netdev(y2);
5259 *
5260 * We are invoked by rtnl_unlock().
5261 * This allows us to deal with problems:
5262 * 1) We can delete sysfs objects which invoke hotplug
5263 *    without deadlocking with linkwatch via keventd.
5264 * 2) Since we run with the RTNL semaphore not held, we can sleep
5265 *    safely in order to wait for the netdev refcnt to drop to zero.
5266 *
5267 * We must not return until all unregister events added during
5268 * the interval the lock was held have been completed.
5269 */
5270void netdev_run_todo(void)
5271{
5272	struct list_head list;
5273
5274	/* Snapshot list, allow later requests */
5275	list_replace_init(&net_todo_list, &list);
5276
5277	__rtnl_unlock();
5278
5279	while (!list_empty(&list)) {
5280		struct net_device *dev
5281			= list_first_entry(&list, struct net_device, todo_list);
5282		list_del(&dev->todo_list);
5283
5284		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5285			printk(KERN_ERR "network todo '%s' but state %d\n",
5286			       dev->name, dev->reg_state);
5287			dump_stack();
5288			continue;
5289		}
5290
5291		dev->reg_state = NETREG_UNREGISTERED;
5292
5293		on_each_cpu(flush_backlog, dev, 1);
5294
5295		netdev_wait_allrefs(dev);
5296
5297		/* paranoia */
5298		BUG_ON(atomic_read(&dev->refcnt));
5299		WARN_ON(dev->ip_ptr);
5300		WARN_ON(dev->ip6_ptr);
5301		WARN_ON(dev->dn_ptr);
5302
5303		if (dev->destructor)
5304			dev->destructor(dev);
5305
5306		/* Free network device */
5307		kobject_put(&dev->dev.kobj);
5308	}
5309}
5310
5311/**
5312 *	dev_txq_stats_fold - fold tx_queues stats
5313 *	@dev: device to get statistics from
5314 *	@stats: struct rtnl_link_stats64 to hold results
5315 */
5316void dev_txq_stats_fold(const struct net_device *dev,
5317			struct rtnl_link_stats64 *stats)
5318{
5319	u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5320	unsigned int i;
5321	struct netdev_queue *txq;
5322
5323	for (i = 0; i < dev->num_tx_queues; i++) {
5324		txq = netdev_get_tx_queue(dev, i);
5325		spin_lock_bh(&txq->_xmit_lock);
5326		tx_bytes   += txq->tx_bytes;
5327		tx_packets += txq->tx_packets;
5328		tx_dropped += txq->tx_dropped;
5329		spin_unlock_bh(&txq->_xmit_lock);
5330	}
5331	if (tx_bytes || tx_packets || tx_dropped) {
5332		stats->tx_bytes   = tx_bytes;
5333		stats->tx_packets = tx_packets;
5334		stats->tx_dropped = tx_dropped;
5335	}
5336}
5337EXPORT_SYMBOL(dev_txq_stats_fold);
5338
5339/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5340 * fields in the same order, with only the type differing.
5341 */
5342static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5343				    const struct net_device_stats *netdev_stats)
5344{
5345#if BITS_PER_LONG == 64
	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
	memcpy(stats64, netdev_stats, sizeof(*stats64));
5348#else
5349	size_t i, n = sizeof(*stats64) / sizeof(u64);
5350	const unsigned long *src = (const unsigned long *)netdev_stats;
5351	u64 *dst = (u64 *)stats64;
5352
5353	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5354		     sizeof(*stats64) / sizeof(u64));
5355	for (i = 0; i < n; i++)
5356		dst[i] = src[i];
5357#endif
5358}
5359
5360/**
5361 *	dev_get_stats	- get network device statistics
5362 *	@dev: device to get statistics from
5363 *	@storage: place to store stats
5364 *
5365 *	Get network statistics from device. Return @storage.
5366 *	The device driver may provide its own method by setting
 *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5368 *	otherwise the internal statistics structure is used.
5369 */
5370struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5371					struct rtnl_link_stats64 *storage)
5372{
5373	const struct net_device_ops *ops = dev->netdev_ops;
5374
5375	if (ops->ndo_get_stats64) {
5376		memset(storage, 0, sizeof(*storage));
5377		return ops->ndo_get_stats64(dev, storage);
5378	}
5379	if (ops->ndo_get_stats) {
5380		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5381		return storage;
5382	}
5383	netdev_stats_to_stats64(storage, &dev->stats);
5384	dev_txq_stats_fold(dev, storage);
5385	return storage;
5386}
5387EXPORT_SYMBOL(dev_get_stats);
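
/*
 * Usage sketch (illustrative): callers provide the storage and then
 * use the returned pointer, e.g.
 *
 *	struct rtnl_link_stats64 temp;
 *	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *	seq_printf(seq, "rx %llu tx %llu\n",
 *		   stats->rx_packets, stats->tx_packets);
 */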
5388
5389static void netdev_init_one_queue(struct net_device *dev,
5390				  struct netdev_queue *queue,
5391				  void *_unused)
5392{
5393	queue->dev = dev;
5394}
5395
5396static void netdev_init_queues(struct net_device *dev)
5397{
5398	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5399	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5400	spin_lock_init(&dev->tx_global_lock);
5401}
5402
5403/**
5404 *	alloc_netdev_mq - allocate network device
5405 *	@sizeof_priv:	size of private data to allocate space for
5406 *	@name:		device name format string
5407 *	@setup:		callback to initialize device
5408 *	@queue_count:	the number of subqueues to allocate
5409 *
5410 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device at the end of the netdevice.
5413 */
5414struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5415		void (*setup)(struct net_device *), unsigned int queue_count)
5416{
5417	struct netdev_queue *tx;
5418	struct net_device *dev;
5419	size_t alloc_size;
5420	struct net_device *p;
5421#ifdef CONFIG_RPS
5422	struct netdev_rx_queue *rx;
5423	int i;
5424#endif
5425
5426	BUG_ON(strlen(name) >= sizeof(dev->name));
5427
5428	alloc_size = sizeof(struct net_device);
5429	if (sizeof_priv) {
5430		/* ensure 32-byte alignment of private area */
5431		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5432		alloc_size += sizeof_priv;
5433	}
5434	/* ensure 32-byte alignment of whole construct */
5435	alloc_size += NETDEV_ALIGN - 1;
5436
5437	p = kzalloc(alloc_size, GFP_KERNEL);
5438	if (!p) {
5439		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5440		return NULL;
5441	}
5442
5443	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5444	if (!tx) {
5445		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5446		       "tx qdiscs.\n");
5447		goto free_p;
5448	}
5449
5450#ifdef CONFIG_RPS
5451	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5452	if (!rx) {
5453		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5454		       "rx queues.\n");
5455		goto free_tx;
5456	}
5457
5458	atomic_set(&rx->count, queue_count);
5459
5460	/*
5461	 * Set a pointer to first element in the array which holds the
5462	 * reference count.
5463	 */
5464	for (i = 0; i < queue_count; i++)
5465		rx[i].first = rx;
5466#endif
5467
5468	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5469	dev->padded = (char *)dev - (char *)p;
5470
5471	if (dev_addr_init(dev))
5472		goto free_rx;
5473
5474	dev_mc_init(dev);
5475	dev_uc_init(dev);
5476
5477	dev_net_set(dev, &init_net);
5478
5479	dev->_tx = tx;
5480	dev->num_tx_queues = queue_count;
5481	dev->real_num_tx_queues = queue_count;
5482
5483#ifdef CONFIG_RPS
5484	dev->_rx = rx;
5485	dev->num_rx_queues = queue_count;
5486#endif
5487
5488	dev->gso_max_size = GSO_MAX_SIZE;
5489
5490	netdev_init_queues(dev);
5491
5492	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5493	dev->ethtool_ntuple_list.count = 0;
5494	INIT_LIST_HEAD(&dev->napi_list);
5495	INIT_LIST_HEAD(&dev->unreg_list);
5496	INIT_LIST_HEAD(&dev->link_watch_list);
5497	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5498	setup(dev);
5499	strcpy(dev->name, name);
5500	return dev;
5501
5502free_rx:
5503#ifdef CONFIG_RPS
5504	kfree(rx);
5505free_tx:
5506#endif
5507	kfree(tx);
5508free_p:
5509	kfree(p);
5510	return NULL;
5511}
5512EXPORT_SYMBOL(alloc_netdev_mq);
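
/*
 * Usage sketch (illustrative, names hypothetical): a multiqueue
 * Ethernet driver typically allocates one queue per hardware ring and
 * lets ether_setup() fill in the Ethernet defaults:
 *
 *	dev = alloc_netdev_mq(sizeof(struct my_priv), "eth%d",
 *			      ether_setup, MY_NUM_RINGS);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);
 */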
5513
5514/**
5515 *	free_netdev - free network device
5516 *	@dev: device
5517 *
5518 *	This function does the last stage of destroying an allocated device
5519 * 	interface. The reference to the device object is released.
5520 *	If this is the last reference then it will be freed.
5521 */
5522void free_netdev(struct net_device *dev)
5523{
5524	struct napi_struct *p, *n;
5525
5526	release_net(dev_net(dev));
5527
5528	kfree(dev->_tx);
5529
5530	/* Flush device addresses */
5531	dev_addr_flush(dev);
5532
5533	/* Clear ethtool n-tuple list */
5534	ethtool_ntuple_flush(dev);
5535
5536	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5537		netif_napi_del(p);
5538
5539	/*  Compatibility with error handling in drivers */
5540	if (dev->reg_state == NETREG_UNINITIALIZED) {
5541		kfree((char *)dev - dev->padded);
5542		return;
5543	}
5544
5545	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5546	dev->reg_state = NETREG_RELEASED;
5547
5548	/* will free via device release */
5549	put_device(&dev->dev);
5550}
5551EXPORT_SYMBOL(free_netdev);
5552
5553/**
5554 *	synchronize_net -  Synchronize with packet receive processing
5555 *
5556 *	Wait for packets currently being received to be done.
5557 *	Does not block later packets from starting.
5558 */
5559void synchronize_net(void)
5560{
5561	might_sleep();
5562	synchronize_rcu();
5563}
5564EXPORT_SYMBOL(synchronize_net);
5565
5566/**
5567 *	unregister_netdevice_queue - remove device from the kernel
5568 *	@dev: device
5569 *	@head: list
5570 *
5571 *	This function shuts down a device interface and removes it
5572 *	from the kernel tables.
 *	If @head is not NULL, the device is queued to be unregistered later.
5574 *
5575 *	Callers must hold the rtnl semaphore.  You may want
5576 *	unregister_netdev() instead of this.
5577 */
5578
5579void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5580{
5581	ASSERT_RTNL();
5582
5583	if (head) {
5584		list_move_tail(&dev->unreg_list, head);
5585	} else {
5586		rollback_registered(dev);
5587		/* Finish processing unregister after unlock */
5588		net_set_todo(dev);
5589	}
5590}
5591EXPORT_SYMBOL(unregister_netdevice_queue);
5592
5593/**
5594 *	unregister_netdevice_many - unregister many devices
5595 *	@head: list of devices
5596 */
5597void unregister_netdevice_many(struct list_head *head)
5598{
5599	struct net_device *dev;
5600
5601	if (!list_empty(head)) {
5602		rollback_registered_many(head);
5603		list_for_each_entry(dev, head, unreg_list)
5604			net_set_todo(dev);
5605	}
5606}
5607EXPORT_SYMBOL(unregister_netdevice_many);
5608
5609/**
5610 *	unregister_netdev - remove device from the kernel
5611 *	@dev: device
5612 *
5613 *	This function shuts down a device interface and removes it
5614 *	from the kernel tables.
5615 *
5616 *	This is just a wrapper for unregister_netdevice that takes
5617 *	the rtnl semaphore.  In general you want to use this and not
5618 *	unregister_netdevice.
5619 */
5620void unregister_netdev(struct net_device *dev)
5621{
5622	rtnl_lock();
5623	unregister_netdevice(dev);
5624	rtnl_unlock();
5625}
5626EXPORT_SYMBOL(unregister_netdev);
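
/*
 * Usage sketch (illustrative): the usual remove-time teardown pairs
 * unregister_netdev() with free_netdev() once the unregister todo
 * work has completed:
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */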
5627
5628/**
 *	dev_change_net_namespace - move device to a different network namespace
5630 *	@dev: device
5631 *	@net: network namespace
5632 *	@pat: If not NULL name pattern to try if the current device name
5633 *	      is already taken in the destination network namespace.
5634 *
5635 *	This function shuts down a device interface and moves it
5636 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
5638 *
5639 *	Callers must hold the rtnl semaphore.
5640 */
5641
5642int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5643{
5644	int err;
5645
5646	ASSERT_RTNL();
5647
5648	/* Don't allow namespace local devices to be moved. */
5649	err = -EINVAL;
5650	if (dev->features & NETIF_F_NETNS_LOCAL)
5651		goto out;
5652
	/* Ensure the device has been registered */
5654	err = -EINVAL;
5655	if (dev->reg_state != NETREG_REGISTERED)
5656		goto out;
5657
	/* Get out if there is nothing to do */
5659	err = 0;
5660	if (net_eq(dev_net(dev), net))
5661		goto out;
5662
5663	/* Pick the destination device name, and ensure
5664	 * we can use it in the destination network namespace.
5665	 */
5666	err = -EEXIST;
5667	if (__dev_get_by_name(net, dev->name)) {
5668		/* We get here if we can't use the current device name */
5669		if (!pat)
5670			goto out;
5671		if (dev_get_valid_name(dev, pat, 1))
5672			goto out;
5673	}
5674
5675	/*
	 * And now a mini version of register_netdevice and unregister_netdevice.
5677	 */
5678
5679	/* If device is running close it first. */
5680	dev_close(dev);
5681
5682	/* And unlink it from device chain */
5683	err = -ENODEV;
5684	unlist_netdevice(dev);
5685
5686	synchronize_net();
5687
5688	/* Shutdown queueing discipline. */
5689	dev_shutdown(dev);
5690
5691	/* Notify protocols, that we are about to destroy
5692	   this device. They should clean all the things.
5693	*/
5694	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5695	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5696
5697	/*
5698	 *	Flush the unicast and multicast chains
5699	 */
5700	dev_uc_flush(dev);
5701	dev_mc_flush(dev);
5702
5703	/* Actually switch the network namespace */
5704	dev_net_set(dev, net);
5705
5706	/* If there is an ifindex conflict assign a new one */
5707	if (__dev_get_by_index(net, dev->ifindex)) {
5708		int iflink = (dev->iflink == dev->ifindex);
5709		dev->ifindex = dev_new_index(net);
5710		if (iflink)
5711			dev->iflink = dev->ifindex;
5712	}
5713
5714	/* Fixup kobjects */
5715	err = device_rename(&dev->dev, dev->name);
5716	WARN_ON(err);
5717
5718	/* Add the device back in the hashes */
5719	list_netdevice(dev);
5720
5721	/* Notify protocols, that a new device appeared. */
5722	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5723
5724	/*
5725	 *	Prevent userspace races by waiting until the network
5726	 *	device is fully setup before sending notifications.
5727	 */
5728	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5729
5730	synchronize_net();
5731	err = 0;
5732out:
5733	return err;
5734}
5735EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5736
5737static int dev_cpu_callback(struct notifier_block *nfb,
5738			    unsigned long action,
5739			    void *ocpu)
5740{
5741	struct sk_buff **list_skb;
5742	struct sk_buff *skb;
5743	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5744	struct softnet_data *sd, *oldsd;
5745
5746	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5747		return NOTIFY_OK;
5748
5749	local_irq_disable();
5750	cpu = smp_processor_id();
5751	sd = &per_cpu(softnet_data, cpu);
5752	oldsd = &per_cpu(softnet_data, oldcpu);
5753
5754	/* Find end of our completion_queue. */
5755	list_skb = &sd->completion_queue;
5756	while (*list_skb)
5757		list_skb = &(*list_skb)->next;
5758	/* Append completion queue from offline CPU. */
5759	*list_skb = oldsd->completion_queue;
5760	oldsd->completion_queue = NULL;
5761
5762	/* Append output queue from offline CPU. */
5763	if (oldsd->output_queue) {
5764		*sd->output_queue_tailp = oldsd->output_queue;
5765		sd->output_queue_tailp = oldsd->output_queue_tailp;
5766		oldsd->output_queue = NULL;
5767		oldsd->output_queue_tailp = &oldsd->output_queue;
5768	}
5769
5770	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5771	local_irq_enable();
5772
5773	/* Process offline CPU's input_pkt_queue */
5774	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5775		netif_rx(skb);
5776		input_queue_head_incr(oldsd);
5777	}
5778	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5779		netif_rx(skb);
5780		input_queue_head_incr(oldsd);
5781	}
5782
5783	return NOTIFY_OK;
5784}
5785
5786
5787/**
5788 *	netdev_increment_features - increment feature set by one
5789 *	@all: current feature set
5790 *	@one: new feature set
5791 *	@mask: mask feature set
5792 *
5793 *	Computes a new feature set after adding a device with feature set
5794 *	@one to the master device with current feature set @all.  Will not
5795 *	enable anything that is off in @mask. Returns the new feature set.
5796 */
5797unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5798					unsigned long mask)
5799{
5800	/* If device needs checksumming, downgrade to it. */
5801	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5802		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5803	else if (mask & NETIF_F_ALL_CSUM) {
5804		/* If one device supports v4/v6 checksumming, set for all. */
5805		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5806		    !(all & NETIF_F_GEN_CSUM)) {
5807			all &= ~NETIF_F_ALL_CSUM;
5808			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5809		}
5810
5811		/* If one device supports hw checksumming, set for all. */
5812		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5813			all &= ~NETIF_F_ALL_CSUM;
5814			all |= NETIF_F_HW_CSUM;
5815		}
5816	}
5817
5818	one |= NETIF_F_ALL_CSUM;
5819
5820	one |= all & NETIF_F_ONE_FOR_ALL;
5821	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5822	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5823
5824	return all;
5825}
5826EXPORT_SYMBOL(netdev_increment_features);
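
/*
 * Usage sketch (illustrative, names hypothetical): a bonding-style
 * master recomputing its feature set as slaves come and go folds each
 * slave's features in turn; starting from the mask is one possible
 * choice of initial value:
 *
 *	unsigned long features = mask;
 *
 *	list_for_each_entry(slave, &priv->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *	master->features = features;
 */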
5827
5828static struct hlist_head *netdev_create_hash(void)
5829{
5830	int i;
5831	struct hlist_head *hash;
5832
5833	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5834	if (hash != NULL)
5835		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5836			INIT_HLIST_HEAD(&hash[i]);
5837
5838	return hash;
5839}
5840
5841/* Initialize per network namespace state */
5842static int __net_init netdev_init(struct net *net)
5843{
5844	INIT_LIST_HEAD(&net->dev_base_head);
5845
5846	net->dev_name_head = netdev_create_hash();
5847	if (net->dev_name_head == NULL)
5848		goto err_name;
5849
5850	net->dev_index_head = netdev_create_hash();
5851	if (net->dev_index_head == NULL)
5852		goto err_idx;
5853
5854	return 0;
5855
5856err_idx:
5857	kfree(net->dev_name_head);
5858err_name:
5859	return -ENOMEM;
5860}
5861
5862/**
5863 *	netdev_drivername - network driver for the device
5864 *	@dev: network device
5865 *	@buffer: buffer for resulting name
5866 *	@len: size of buffer
5867 *
5868 *	Determine network driver for device.
5869 */
5870char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5871{
5872	const struct device_driver *driver;
5873	const struct device *parent;
5874
5875	if (len <= 0 || !buffer)
5876		return buffer;
5877	buffer[0] = 0;
5878
5879	parent = dev->dev.parent;
5880
5881	if (!parent)
5882		return buffer;
5883
5884	driver = parent->driver;
5885	if (driver && driver->name)
5886		strlcpy(buffer, driver->name, len);
5887	return buffer;
5888}
5889
5890static int __netdev_printk(const char *level, const struct net_device *dev,
5891			   struct va_format *vaf)
5892{
5893	int r;
5894
5895	if (dev && dev->dev.parent)
5896		r = dev_printk(level, dev->dev.parent, "%s: %pV",
5897			       netdev_name(dev), vaf);
5898	else if (dev)
5899		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
5900	else
5901		r = printk("%s(NULL net_device): %pV", level, vaf);
5902
5903	return r;
5904}
5905
5906int netdev_printk(const char *level, const struct net_device *dev,
5907		  const char *format, ...)
5908{
5909	struct va_format vaf;
5910	va_list args;
5911	int r;
5912
5913	va_start(args, format);
5914
5915	vaf.fmt = format;
5916	vaf.va = &args;
5917
5918	r = __netdev_printk(level, dev, &vaf);
5919	va_end(args);
5920
5921	return r;
5922}
5923EXPORT_SYMBOL(netdev_printk);
5924
5925#define define_netdev_printk_level(func, level)			\
5926int func(const struct net_device *dev, const char *fmt, ...)	\
5927{								\
5928	int r;							\
5929	struct va_format vaf;					\
5930	va_list args;						\
5931								\
5932	va_start(args, fmt);					\
5933								\
5934	vaf.fmt = fmt;						\
5935	vaf.va = &args;						\
5936								\
5937	r = __netdev_printk(level, dev, &vaf);			\
5938	va_end(args);						\
5939								\
5940	return r;						\
5941}								\
5942EXPORT_SYMBOL(func);
5943
5944define_netdev_printk_level(netdev_emerg, KERN_EMERG);
5945define_netdev_printk_level(netdev_alert, KERN_ALERT);
5946define_netdev_printk_level(netdev_crit, KERN_CRIT);
5947define_netdev_printk_level(netdev_err, KERN_ERR);
5948define_netdev_printk_level(netdev_warn, KERN_WARNING);
5949define_netdev_printk_level(netdev_notice, KERN_NOTICE);
5950define_netdev_printk_level(netdev_info, KERN_INFO);
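
/*
 * Usage sketch (illustrative): drivers use these helpers rather than
 * raw printk() so that messages are consistently prefixed with the
 * driver and device name:
 *
 *	netdev_warn(dev, "link down, resetting (attempt %d)\n", tries);
 *	netdev_info(dev, "link is up, %u Mbps\n", speed);
 */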
5951
5952static void __net_exit netdev_exit(struct net *net)
5953{
5954	kfree(net->dev_name_head);
5955	kfree(net->dev_index_head);
5956}
5957
5958static struct pernet_operations __net_initdata netdev_net_ops = {
5959	.init = netdev_init,
5960	.exit = netdev_exit,
5961};
5962
5963static void __net_exit default_device_exit(struct net *net)
5964{
5965	struct net_device *dev, *aux;
5966	/*
5967	 * Push all migratable network devices back to the
5968	 * initial network namespace
5969	 */
5970	rtnl_lock();
5971	for_each_netdev_safe(net, dev, aux) {
5972		int err;
5973		char fb_name[IFNAMSIZ];
5974
5975		/* Ignore unmovable devices (e.g. loopback) */
5976		if (dev->features & NETIF_F_NETNS_LOCAL)
5977			continue;
5978
5979		/* Leave virtual devices for the generic cleanup */
5980		if (dev->rtnl_link_ops)
5981			continue;
5982
5983		/* Push remaining network devices to init_net */
5984		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5985		err = dev_change_net_namespace(dev, &init_net, fb_name);
5986		if (err) {
5987			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5988				__func__, dev->name, err);
5989			BUG();
5990		}
5991	}
5992	rtnl_unlock();
5993}
5994
5995static void __net_exit default_device_exit_batch(struct list_head *net_list)
5996{
5997	/* At exit all network devices must be removed from a network
5998	 * namespace.  Do this in the reverse order of registration.
5999	 * Do this across as many network namespaces as possible to
6000	 * improve batching efficiency.
6001	 */
6002	struct net_device *dev;
6003	struct net *net;
6004	LIST_HEAD(dev_kill_list);
6005
6006	rtnl_lock();
6007	list_for_each_entry(net, net_list, exit_list) {
6008		for_each_netdev_reverse(net, dev) {
6009			if (dev->rtnl_link_ops)
6010				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6011			else
6012				unregister_netdevice_queue(dev, &dev_kill_list);
6013		}
6014	}
6015	unregister_netdevice_many(&dev_kill_list);
6016	rtnl_unlock();
6017}
6018
6019static struct pernet_operations __net_initdata default_device_ops = {
6020	.exit = default_device_exit,
6021	.exit_batch = default_device_exit_batch,
6022};
6023
6024/*
6025 *	Initialize the DEV module.  At boot time this sets up the per-CPU
6026 *	packet receive queues, the protocol handler lists, the per network
6027 *	namespace infrastructure and the NET_TX/NET_RX softirqs.
6028 *
6029 */
6030
6031/*
6032 *       This is called single-threaded during boot, so there is no need
6033 *       to take the rtnl semaphore.
6034 */
6035static int __init net_dev_init(void)
6036{
6037	int i, rc = -ENOMEM;
6038
6039	BUG_ON(!dev_boot_phase);
6040
6041	if (dev_proc_init())
6042		goto out;
6043
6044	if (netdev_kobject_init())
6045		goto out;
6046
6047	INIT_LIST_HEAD(&ptype_all);
6048	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6049		INIT_LIST_HEAD(&ptype_base[i]);
6050
6051	if (register_pernet_subsys(&netdev_net_ops))
6052		goto out;
6053
6054	/*
6055	 *	Initialise the packet receive queues.
6056	 */
6057
6058	for_each_possible_cpu(i) {
6059		struct softnet_data *sd = &per_cpu(softnet_data, i);
6060
6061		memset(sd, 0, sizeof(*sd));
6062		skb_queue_head_init(&sd->input_pkt_queue);
6063		skb_queue_head_init(&sd->process_queue);
6064		sd->completion_queue = NULL;
6065		INIT_LIST_HEAD(&sd->poll_list);
6066		sd->output_queue = NULL;
6067		sd->output_queue_tailp = &sd->output_queue;
6068#ifdef CONFIG_RPS
6069		sd->csd.func = rps_trigger_softirq;
6070		sd->csd.info = sd;
6071		sd->csd.flags = 0;
6072		sd->cpu = i;
6073#endif
6074
6075		sd->backlog.poll = process_backlog;
6076		sd->backlog.weight = weight_p;
6077		sd->backlog.gro_list = NULL;
6078		sd->backlog.gro_count = 0;
6079	}
6080
6081	dev_boot_phase = 0;
6082
6083	/* The loopback device is special: if any other network device
6084	 * is present in a network namespace, the loopback device must
6085	 * be present too.  Since we now dynamically allocate and free
6086	 * the loopback device, ensure this invariant is maintained by
6087	 * keeping the loopback device as the first device on the list
6088	 * of network devices, so that the loopback device is the first
6089	 * device to appear in a namespace and the last network device
6090	 * to disappear when the namespace is torn down.
6091	 */
6092	if (register_pernet_device(&loopback_net_ops))
6093		goto out;
6094
6095	if (register_pernet_device(&default_device_ops))
6096		goto out;
6097
6098	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6099	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6100
6101	hotcpu_notifier(dev_cpu_callback, 0);
6102	dst_init();
6103	dev_mcast_init();
6104	rc = 0;
6105out:
6106	return rc;
6107}
6108
6109subsys_initcall(net_dev_init);
6110
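/*
 * Note (informational): hashrnd seeds the jhash-based flow hashing used
 * earlier in this file (for example by skb_tx_hash() when spreading
 * skbs across transmit queues), roughly:
 *
 *	hash = jhash_1word(hash, hashrnd);
 *	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
 *
 * Seeding it from get_random_bytes() in a late initcall keeps the
 * mapping unpredictable across boots.
 */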
6111static int __init initialize_hashrnd(void)
6112{
6113	get_random_bytes(&hashrnd, sizeof(hashrnd));
6114	return 0;
6115}
6116
6117late_initcall_sync(initialize_hashrnd);
6118