/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

#include <typedefs.h>
#include <bcmdefs.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
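/*
 * Illustrative sketch (not part of the original file): a portable version of
 * the RFC 1071 one's-complement sum that ip_fast_csum() computes in
 * arch-optimized code.  The header length is given in 32-bit words, as in
 * iph->ihl, and the check field must be zeroed first, as ip_send_check()
 * does above.  Kept under #if 0 so it does not affect the build.
 */
#if 0
static u16 example_ip_header_csum(const void *hdr, unsigned int words)
{
	const u16 *p = hdr;
	u32 sum = 0;
	unsigned int i;

	for (i = 0; i < words * 2; i++)	/* two 16-bit halves per 32-bit word */
		sum += p[i];
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (u16)~sum;
}
#endif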

int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);

	/* Mark skb to identify SMB data packet */
	if ((ip_hdr(skb)->protocol == IPPROTO_TCP) && tcp_hdr(skb))
		skb->tcpf_smb = (tcp_hdr(skb)->source == htons(0x01bd));

	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int BCMFASTPATH_HOST ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rcu_read_lock();
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .mark = sk->sk_mark,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->inet_saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .flags = inet_sk_flowi_flags(sk),
					    .uli_u = { .ports =
						       { .sport = inet->inet_sport,
							 .dport = inet->inet_dport } } };

			/* If this fails, the transport layer's retransmit
			 * mechanism will keep trying until a route appears
			 * or the connection times out.
			 */
			security_sk_classify_flow(sk, &fl);
			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	/* Transport layer set skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
	defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each one carrying the IP header plus a block of
 *	the original datagram's data) so that each piece fits into a single
 *	device frame, and queue such frames for sending.
 */

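/*
 * Worked example (illustrative, not from the original source): with a
 * 1500-byte MTU and a 20-byte header, each fragment can carry 1480 bytes of
 * payload, which is already a multiple of 8.  A 4020-byte datagram
 * (20 + 4000) therefore becomes three fragments:
 *
 *	payload bytes    0..1479  frag_off =   0 (0/8),    MF set
 *	payload bytes 1480..2959  frag_off = 185 (1480/8), MF set
 *	payload bytes 2960..3999  frag_off = 370 (2960/8), MF clear
 *
 * Each fragment gets its own copy of the IP header with tot_len and the
 * checksum recomputed, which is what the slow path below does.
 */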
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it.  First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited.  In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_has_frags(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;		/* Where to start from */

	/* For bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header.
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each piece.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload, so create
	 * one single skb packet containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece is held on the socket until
 *	ip_push_pending_frames() is called.  Each piece can be a page or
 *	non-page data.
 *
 *	Not only UDP, but other transport protocols - e.g. raw sockets -
 *	can potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
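/*
 * Typical calling sequence (illustrative sketch, not part of this file):
 * a datagram protocol appends one or more chunks under the socket lock and
 * then pushes the corked queue out as a single IP datagram, roughly the way
 * UDP does it:
 *
 *	lock_sock(sk);
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     sizeof(struct udphdr), &ipc, &rt,
 *			     msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!corking)
 *		err = ip_push_pending_frames(sk);
 *	release_sock(sk);
 *
 * Here "corking" stands for whatever MSG_MORE/UDP_CORK logic the caller
 * uses; the real UDP code wraps ip_push_pending_frames() so that it can
 * fill in the UDP header first.
 */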
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		rt = *rtp;
		if (unlikely(!rt))
			return -EFAULT;
		/*
		 * We steal a reference to this route; the caller should not
		 * release it.
		 */
		*rtp = NULL;
		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
					    rt->dst.dev->mtu :
					    dst_mtu(rt->dst.path);
		inet->cork.dst = &rt->dst;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = (struct rtable *)inet->cork.dst;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we do
	 * not wish it to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	skb = skb_peek_tail(&sk->sk_write_queue);

	inet->cork.length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
					 fragheaderlen, transhdrlen, mtu,
					 flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chained skb;
	 * each segment is an IP fragment that is ready to be sent to the
	 * network once an appropriate IP header has been added.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					ipc->shtx.flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			*skb_tx(skb) = ipc->shtx;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = (struct rtable *)inet->cork.dst;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_sock *inet)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(inet->cork.opt);
	inet->cork.opt = NULL;
	dst_release(inet->cork.dst);
	inet->cork.dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)inet->cork.dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
	 * allow the frame generated here to be fragmented.  No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	inet->cork.dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	/* Netfilter gets the whole, not yet fragmented skb. */
	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip_cork_release(inet);
	return err;

error:
	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(inet_sk(sk));
}


/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;
	ipc.shtx.flags = 0;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .oif = arg->bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(ip_hdr(skb)->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = tcp_hdr(skb)->dest,
						 .dport = tcp_hdr(skb)->source } },
				    .proto = sk->sk_protocol,
				    .flags = ip_reply_arg_flowi_flags(arg) };
		security_skb_classify_flow(skb, &fl);
		if (ip_route_output_key(sock_net(sk), &rt, &fl))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.  Note that it
	   relies on the fact that this function is called with BHs locally
	   disabled and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}
