/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layers all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
 *						     udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff    :       Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>

/*
 *	Is a socket 'connection oriented'?
 */
static inline int connection_based(struct sock *sk)
{
	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync,
				  void *key)
{
	unsigned long bits = (unsigned long)key;

	/*
	 * Avoid a wakeup if the event is not interesting to us
	 */
	if (bits && !(bits & (POLLIN | POLLERR)))
		return 0;
	return autoremove_wake_function(wait, mode, sync, key);
}

/*
 * Wait for a packet...
 */
static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
	int error;
	DEFINE_WAIT_FUNC(wait, receiver_wake_function);

	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (!skb_queue_empty(&sk->sk_receive_queue))
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so we report the problem.
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk_sleep(sk), &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}

/**
 *	__skb_recv_datagram - Receive a datagram skbuff
 *	@sk: socket
 *	@flags: MSG_ flags
 *	@peeked: returns non-zero if this packet has been seen before
 *	@err: error code returned
 *
 *	Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *	and possible races. This replaces identical code in packet, raw and
 *	udp, as well as in IPX, AX.25 and AppleTalk. It also finally fixes
 *	the long standing peek and read race for datagram sockets. If you
 *	alter this routine remember it must be re-entrant.
 *
 *	This function does not lock the socket. The caller owns the
 *	returned skb and must eventually release it, usually with
 *	skb_free_datagram().
 *
 *	* It does not lock socket since today. This function is
 *	* free of race conditions. This measure should/can improve
 *	* significantly datagram socket latencies at high loads,
 *	* when data copying to user space takes lots of time.
 *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *	*  8) Great win.)
 *	*			                    --ANK (980729)
 *
 *	The order of the tests when we find no data waiting is specified
 *	quite explicitly by POSIX 1003.1g; don't change it without having
 *	the standard around please.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
				    int *peeked, int *err)
{
	struct sk_buff *skb;
	long timeo;
	/*
	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
	 */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		/* Again only user level code calls this function, so nothing
		 * at interrupt level will suddenly eat the receive_queue.
		 *
		 * Look at current nfs client by the way...
		 * However, this function was correct in any case. 8)
		 */
		unsigned long cpu_flags;

		spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb) {
			*peeked = skb->peeked;
			if (flags & MSG_PEEK) {
				skb->peeked = 1;
				atomic_inc(&skb->users);
			} else
				__skb_unlink(skb, &sk->sk_receive_queue);
		}
		spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);

		if (skb)
			return skb;

		/* User doesn't want to wait */
		error = -EAGAIN;
		if (!timeo)
			goto no_packet;

	} while (!wait_for_packet(sk, err, &timeo));

	return NULL;

no_packet:
	*err = error;
	return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);

struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
				  int noblock, int *err)
{
	int peeked;

	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
				   &peeked, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
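
/*
 * Example (illustrative sketch only, not part of this file): the
 * typical shape of a datagram recvmsg() built on skb_recv_datagram().
 * The protocol name "foo" is hypothetical and the signature is
 * simplified; error paths are trimmed for brevity.
 *
 *	int foo_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 *			int noblock, int flags)
 *	{
 *		struct sk_buff *skb;
 *		int err, copied;
 *
 *		skb = skb_recv_datagram(sk, flags, noblock, &err);
 *		if (!skb)
 *			return err;
 *
 *		copied = skb->len;
 *		if (copied > len) {
 *			copied = len;
 *			msg->msg_flags |= MSG_TRUNC;
 *		}
 *		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
 *		skb_free_datagram(sk, skb);
 *		return err ? err : copied;
 *	}
 */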

void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	consume_skb(skb);
	sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);

void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
{
	bool slow;

	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;

	slow = lock_sock_fast(sk);
	skb_orphan(skb);
	sk_mem_reclaim_partial(sk);
	unlock_sock_fast(sk, slow);

	/* skb is now orphaned, can be freed outside of locked section */
	__kfree_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram_locked);

/**
 *	skb_kill_datagram - Free a datagram skbuff forcibly
 *	@sk: socket
 *	@skb: datagram skbuff
 *	@flags: MSG_ flags
 *
 *	This function frees a datagram skbuff that was received by
 *	skb_recv_datagram.  The flags argument must match the one
 *	used for skb_recv_datagram.
 *
 *	If the MSG_PEEK flag is set, and the packet is still on the
 *	receive queue of the socket, it will be taken off the queue
 *	before it is freed.
 *
 *	This function currently only disables BH when acquiring the
 *	sk_receive_queue lock.  Therefore it must not be used in a
 *	context where that lock is acquired in an IRQ context.
 *
 *	It returns 0 if the packet was removed by us.
 */

int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
	int err = 0;

	if (flags & MSG_PEEK) {
		err = -ENOENT;
		spin_lock_bh(&sk->sk_receive_queue.lock);
		if (skb == skb_peek(&sk->sk_receive_queue)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			atomic_dec(&skb->users);
			err = 0;
		}
		spin_unlock_bh(&sk->sk_receive_queue.lock);
	}

	kfree_skb(skb);
	atomic_inc(&sk->sk_drops);
	sk_mem_reclaim_partial(sk);

	return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
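
/*
 * Example (illustrative sketch): a recvmsg() error path that throws a
 * datagram away after a failed copy or checksum, in the style of the
 * UDP receive path. FOO_INC_STATS() and FOO_MIB_INERRORS are
 * hypothetical stand-ins for a protocol's MIB accounting.
 *
 *	csum_copy_err:
 *		if (!skb_kill_datagram(sk, skb, flags))
 *			FOO_INC_STATS(sock_net(sk), FOO_MIB_INERRORS);
 *		goto try_again;
 */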

/**
 *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: io vector to copy to
 *	@len: amount of data to copy from buffer to iovec
 *
 *	Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
			    struct iovec *to, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	trace_skb_copy_datagram_iovec(skb, len);

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovec(to, skb->data + offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			int err;
			u8  *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovec(to, vaddr + frag->page_offset +
					     offset - start, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_iovec(frag_iter,
						    offset - start,
						    to, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_iovec);
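
/*
 * Example (illustrative sketch): copying a datagram's payload to user
 * space while skipping a protocol header of FOO_HLEN bytes (FOO_HLEN
 * is a hypothetical constant):
 *
 *	err = skb_copy_datagram_iovec(skb, FOO_HLEN, msg->msg_iov,
 *				      skb->len - FOO_HLEN);
 *
 * Because the iovec is advanced in place, a second call continues
 * writing where the first one stopped.
 */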

/**
 *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: io vector to copy to
 *	@to_offset: offset in the io vector to start copying to
 *	@len: amount of data to copy from buffer to iovec
 *
 *	Returns 0 or -EFAULT.
 *	Note: the iovec is not modified during the copy.
 */
int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
				  const struct iovec *to, int to_offset,
				  int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to_offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			int err;
			u8  *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
						offset - start, to_offset, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
			to_offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_const_iovec(frag_iter,
							  offset - start,
							  to, to_offset,
							  copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to_offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
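
/*
 * Example (illustrative sketch): because this variant leaves the iovec
 * untouched, a caller can copy two skbs back to back by tracking the
 * destination offset itself:
 *
 *	err = skb_copy_datagram_const_iovec(skb1, 0, iov, 0, skb1->len);
 *	if (!err)
 *		err = skb_copy_datagram_const_iovec(skb2, 0, iov,
 *						    skb1->len, skb2->len);
 */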

/**
 *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
 *	@skb: buffer to copy to
 *	@offset: offset in the buffer to start copying to
 *	@from: io vector to copy from
 *	@from_offset: offset in the io vector to start copying from
 *	@len: amount of data to copy to buffer from iovec
 *
 *	Returns 0 or -EFAULT.
 *	Note: the iovec is not modified during the copy.
 */
int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
				 const struct iovec *from, int from_offset,
				 int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
					copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		from_offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			int err;
			u8  *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_fromiovecend(vaddr + frag->page_offset +
						  offset - start,
						  from, from_offset, copy);
			kunmap(page);
			if (err)
				goto fault;

			if (!(len -= copy))
				return 0;
			offset += copy;
			from_offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_from_iovec(frag_iter,
							 offset - start,
							 from,
							 from_offset,
							 copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			from_offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
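
/*
 * Example (illustrative sketch): a send path filling a freshly
 * allocated skb from the user's iovec; allocation, header setup and
 * the error path are elided:
 *
 *	skb_put(skb, len);
 *	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
 *	if (err)
 *		goto free_skb;
 */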

static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
				      u8 __user *to, int len,
				      __wsum *csump)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int pos = 0;

	/* Copy header. */
	if (copy > 0) {
		int err = 0;
		if (copy > len)
			copy = len;
		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
					       *csump, &err);
		if (err)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			int err = 0;
			u8  *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			csum2 = csum_and_copy_to_user(vaddr +
							frag->page_offset +
							offset - start,
						      to, copy, 0, &err);
			kunmap(page);
			if (err)
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if (!(len -= copy))
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			__wsum csum2 = 0;
			if (copy > len)
				copy = len;
			if (skb_copy_and_csum_datagram(frag_iter,
						       offset - start,
						       to, copy,
						       &csum2))
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}

__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
	__sum16 sum;

	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
			netdev_rx_csum_fault(skb->dev);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
	return __skb_checksum_complete_head(skb, skb->len);
}
EXPORT_SYMBOL(__skb_checksum_complete);
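
/*
 * Example (illustrative sketch): forcing full checksum verification
 * before trusting a packet, unless the device has already vouched
 * for it:
 *
 *	if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
 *	    __skb_checksum_complete(skb))
 *		goto csum_error;
 */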

/**
 *	skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
 *	@skb: skbuff
 *	@hlen: header length -- the first @hlen bytes are folded into the
 *	       checksum but not copied; copying starts at this offset
 *	@iov: io vector
 *
 *	Caller _must_ check that skb will fit to this iovec.
 *
 *	Returns: 0       - success.
 *		 -EINVAL - checksum failure.
 *		 -EFAULT - fault during copy. Beware, in this case iovec
 *			   can be modified!
 */
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
				     int hlen, struct iovec *iov)
{
	__wsum csum;
	int chunk = skb->len - hlen;

	if (!chunk)
		return 0;

	/* Skip filled elements.
	 * Pretty silly, look at memcpy_toiovec, though 8)
	 */
	while (!iov->iov_len)
		iov++;

	if (iov->iov_len < chunk) {
		if (__skb_checksum_complete(skb))
			goto csum_error;
		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
			goto fault;
	} else {
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
					       chunk, &csum))
			goto fault;
		if (csum_fold(csum))
			goto csum_error;
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
			netdev_rx_csum_fault(skb->dev);
		iov->iov_len -= chunk;
		iov->iov_base += chunk;
	}
	return 0;
csum_error:
	return -EINVAL;
fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
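
/*
 * Example (illustrative sketch): a UDP-style receive copy that skips
 * checksum verification when the datagram was already verified;
 * FOO_HLEN stands in for the protocol header size:
 *
 *	if (skb_csum_unnecessary(skb))
 *		err = skb_copy_datagram_iovec(skb, FOO_HLEN,
 *					      msg->msg_iov, copied);
 *	else {
 *		err = skb_copy_and_csum_datagram_iovec(skb, FOO_HLEN,
 *						       msg->msg_iov);
 *		if (err == -EINVAL)
 *			goto csum_copy_err;
 *	}
 */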

/**
 * 	datagram_poll - generic datagram poll
 *	@file: file struct
 *	@sock: socket
 *	@wait: poll table
 *
 *	Datagram poll: Again totally generic. This also handles
 *	sequenced packet sockets providing the socket receive queue
 *	is only ever holding data ready to receive.
 *
 *	Note: if you _don't_ use this routine for this protocol but
 *	use a write policy different from sock_writeable(), then
 *	please supply your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue) ||
	    (sk->sk_shutdown & RCV_SHUTDOWN))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}
EXPORT_SYMBOL(datagram_poll);

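/*
 * Example (illustrative sketch): wiring datagram_poll() into a
 * protocol's proto_ops, as UDP and most other datagram families do;
 * PF_FOO is hypothetical and the remaining operations are elided:
 *
 *	static const struct proto_ops foo_dgram_ops = {
 *		.family		= PF_FOO,
 *		.poll		= datagram_poll,
 *		...
 *	};
 */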