/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layers all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>.
 *			(datagram_poll() from old udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non-IP protocol
 *					crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms.
 *		Pete Wyckoff	:	Unconnected accept() fix.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>

/*
 * Is a socket 'connection oriented' ?
 */
static inline int connection_based(struct sock *sk)
{
        return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync,
                                  void *key)
{
        unsigned long bits = (unsigned long)key;

        /*
         * Avoid a wakeup if the event is not interesting to us.
         */
        if (bits && !(bits & (POLLIN | POLLERR)))
                return 0;
        return autoremove_wake_function(wait, mode, sync, key);
}

/*
 * Wait for a packet.
 */
static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
        int error;
        DEFINE_WAIT_FUNC(wait, receiver_wake_function);

        prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

        /* Socket errors? */
        error = sock_error(sk);
        if (error)
                goto out_err;

        if (!skb_queue_empty(&sk->sk_receive_queue))
                goto out;

        /* Socket shut down? */
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                goto out_noerr;

        /* Sequenced packets can come disconnected.
         * If so, we report the problem.
         */
        error = -ENOTCONN;
        if (connection_based(sk) &&
            !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
                goto out_err;

        /* handle signals */
        if (signal_pending(current))
                goto interrupted;

        error = 0;
        *timeo_p = schedule_timeout(*timeo_p);
out:
        finish_wait(sk_sleep(sk), &wait);
        return error;
interrupted:
        error = sock_intr_errno(*timeo_p);
out_err:
        *err = error;
        goto out;
out_noerr:
        *err = 0;
        error = 1;
        goto out;
}

/**
 * __skb_recv_datagram - Receive a datagram skbuff
 * @sk: socket
 * @flags: MSG_ flags
 * @peeked: returns non-zero if this packet has been seen before
 * @err: error code returned
 *
 * Get a datagram skbuff, understands the peeking, nonblocking wakeups
 * and possible races. This replaces identical code in packet, raw and
 * udp, as well as IPX, AX.25 and AppleTalk. It also finally fixes
 * the long standing peek and read race for datagram sockets. If you
 * alter this routine remember it must be re-entrant.
 *
 * This function does not lock the socket; it only takes the receive
 * queue spinlock, so it is free of race conditions:
 *
 * * This measure should/can improve
 * * significantly datagram socket latencies at high loads,
 * * when data copying to user space takes lots of time.
 * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 * * 8) Great win.)
 * *			--ANK (980729)
 *
 * The order of the tests when we find no data waiting is specified
 * quite explicitly by POSIX 1003.1g; don't change it without having
 * the standard around, please.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
                                    int *peeked, int *err)
{
        struct sk_buff *skb;
        long timeo;
        /*
         * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
         */
        int error = sock_error(sk);

        if (error)
                goto no_packet;

        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        do {
                /* Again only user level code calls this function, so nothing
                 * interrupt level will suddenly eat the receive_queue.
                 *
                 * Look at current nfs client by the way...
                 * However, this function was correct in any case. 8)
                 */
                unsigned long cpu_flags;

                spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
                skb = skb_peek(&sk->sk_receive_queue);
                if (skb) {
                        *peeked = skb->peeked;
                        if (flags & MSG_PEEK) {
                                skb->peeked = 1;
                                atomic_inc(&skb->users);
                        } else
                                __skb_unlink(skb, &sk->sk_receive_queue);
                }
                spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);

                if (skb)
                        return skb;

                /* User doesn't want to wait */
                error = -EAGAIN;
                if (!timeo)
                        goto no_packet;

        } while (!wait_for_packet(sk, err, &timeo));

        return NULL;

no_packet:
        *err = error;
        return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);

struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
                                  int noblock, int *err)
{
        int peeked;

        return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
                                   &peeked, err);
}
EXPORT_SYMBOL(skb_recv_datagram);

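/*
 * Usage sketch (illustrative only, not code from this file): a typical
 * protocol recvmsg() pairs skb_recv_datagram() with one of the copy
 * helpers below and then releases the skb.  "xxx_recvmsg" and its
 * locals are hypothetical names; MSG_TRUNC and address handling are
 * omitted for brevity:
 *
 *	static int xxx_recvmsg(struct kiocb *iocb, struct socket *sock,
 *			       struct msghdr *msg, size_t len, int flags)
 *	{
 *		struct sock *sk = sock->sk;
 *		struct sk_buff *skb;
 *		size_t copied;
 *		int err;
 *
 *		skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
 *		if (!skb)
 *			return err;
 *
 *		copied = min_t(size_t, len, skb->len);
 *		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
 *		skb_free_datagram(sk, skb);
 *		return err ? err : copied;
 *	}
 */
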
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
        consume_skb(skb);
        sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);

void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
{
        bool slow;

        if (likely(atomic_read(&skb->users) == 1))
                smp_rmb();
        else if (likely(!atomic_dec_and_test(&skb->users)))
                return;

        slow = lock_sock_fast(sk);
        skb_orphan(skb);
        sk_mem_reclaim_partial(sk);
        unlock_sock_fast(sk, slow);

        /* skb is now orphaned, can be freed outside of locked section */
        __kfree_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram_locked);

/**
 * skb_kill_datagram - Free a datagram skbuff forcibly
 * @sk: socket
 * @skb: datagram skbuff
 * @flags: MSG_ flags
 *
 * This function frees a datagram skbuff that was received by
 * skb_recv_datagram. The flags argument must match the one
 * used for skb_recv_datagram.
 *
 * If the MSG_PEEK flag is set, and the packet is still on the
 * receive queue of the socket, it will be taken off the queue
 * before it is freed.
 *
 * This function currently only disables BH when acquiring the
 * sk_receive_queue lock. Therefore it must not be used in a
 * context where that lock is acquired in an IRQ context.
 *
 * It returns 0 if the packet was removed by us, and -ENOENT if
 * MSG_PEEK was set but the packet was no longer on the queue.
 */
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
        int err = 0;

        if (flags & MSG_PEEK) {
                err = -ENOENT;
                spin_lock_bh(&sk->sk_receive_queue.lock);
                if (skb == skb_peek(&sk->sk_receive_queue)) {
                        __skb_unlink(skb, &sk->sk_receive_queue);
                        atomic_dec(&skb->users);
                        err = 0;
                }
                spin_unlock_bh(&sk->sk_receive_queue.lock);
        }

        kfree_skb(skb);
        atomic_inc(&sk->sk_drops);
        sk_mem_reclaim_partial(sk);

        return err;
}
EXPORT_SYMBOL(skb_kill_datagram);

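/*
 * Usage sketch (illustrative): skb_kill_datagram() is the error-path
 * counterpart of skb_free_datagram().  UDP-style receive code uses it
 * when a datagram turns out to be corrupt after it was dequeued or
 * peeked, so that a later MSG_PEEK does not see the bad packet again;
 * the -EAGAIN return shown here mirrors that pattern but is the
 * caller's choice:
 *
 *	if (__skb_checksum_complete(skb)) {
 *		skb_kill_datagram(sk, skb, flags);
 *		return -EAGAIN;
 *	}
 */
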
/**
 * skb_copy_datagram_iovec - Copy a datagram to an iovec.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying from
 * @to: io vector to copy to
 * @len: amount of data to copy from buffer to iovec
 *
 * Returns 0 or -EFAULT.
 * Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
                            struct iovec *to, int len)
{
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;

        trace_skb_copy_datagram_iovec(skb, len);

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                if (memcpy_toiovec(to, skb->data + offset, copy))
                        goto fault;
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
        }

        /* Copy paged appendix. Hmm... why does this look so complicated? */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_shinfo(skb)->frags[i].size;
                if ((copy = end - offset) > 0) {
                        int err;
                        u8 *vaddr;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        struct page *page = frag->page;

                        if (copy > len)
                                copy = len;
                        vaddr = kmap(page);
                        err = memcpy_toiovec(to, vaddr + frag->page_offset +
                                             offset - start, copy);
                        kunmap(page);
                        if (err)
                                goto fault;
                        if (!(len -= copy))
                                return 0;
                        offset += copy;
                }
                start = end;
        }

        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        if (skb_copy_datagram_iovec(frag_iter,
                                                    offset - start,
                                                    to, copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_iovec);

/**
 * skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying from
 * @to: io vector to copy to
 * @to_offset: offset in the io vector to start copying to
 * @len: amount of data to copy from buffer to iovec
 *
 * Returns 0 or -EFAULT.
 * Note: the iovec is not modified during the copy.
 */
int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
                                  const struct iovec *to, int to_offset,
                                  int len)
{
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
                        goto fault;
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                to_offset += copy;
        }

        /* Copy paged appendix. Hmm... why does this look so complicated? */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_shinfo(skb)->frags[i].size;
                if ((copy = end - offset) > 0) {
                        int err;
                        u8 *vaddr;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        struct page *page = frag->page;

                        if (copy > len)
                                copy = len;
                        vaddr = kmap(page);
                        err = memcpy_toiovecend(to, vaddr + frag->page_offset +
                                                offset - start, to_offset, copy);
                        kunmap(page);
                        if (err)
                                goto fault;
                        if (!(len -= copy))
                                return 0;
                        offset += copy;
                        to_offset += copy;
                }
                start = end;
        }

        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        if (skb_copy_datagram_const_iovec(frag_iter,
                                                          offset - start,
                                                          to, to_offset,
                                                          copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        to_offset += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_const_iovec);

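/*
 * Usage sketch (illustrative): because the _const_ variant leaves the
 * iovec unmodified, a caller can first write its own header into the
 * iovec and then place the packet behind it.  "pi" is a hypothetical
 * per-packet header struct:
 *
 *	memcpy_toiovecend(iv, (unsigned char *)&pi, 0, sizeof(pi));
 *	skb_copy_datagram_const_iovec(skb, 0, iv, sizeof(pi), skb->len);
 */
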
/**
 * skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying to
 * @from: io vector to copy from
 * @from_offset: offset in the io vector to start copying from
 * @len: amount of data to copy to buffer from iovec
 *
 * Returns 0 or -EFAULT.
 * Note: the iovec is not modified during the copy.
 */
int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
                                 const struct iovec *from, int from_offset,
                                 int len)
{
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
                                        copy))
                        goto fault;
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                from_offset += copy;
        }

        /* Copy paged appendix. Hmm... why does this look so complicated? */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_shinfo(skb)->frags[i].size;
                if ((copy = end - offset) > 0) {
                        int err;
                        u8 *vaddr;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        struct page *page = frag->page;

                        if (copy > len)
                                copy = len;
                        vaddr = kmap(page);
                        err = memcpy_fromiovecend(vaddr + frag->page_offset +
                                                  offset - start,
                                                  from, from_offset, copy);
                        kunmap(page);
                        if (err)
                                goto fault;

                        if (!(len -= copy))
                                return 0;
                        offset += copy;
                        from_offset += copy;
                }
                start = end;
        }

        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        if (skb_copy_datagram_from_iovec(frag_iter,
                                                         offset - start,
                                                         from,
                                                         from_offset,
                                                         copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        from_offset += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iovec);

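/*
 * Usage sketch (illustrative): on the send side, a sendmsg()
 * implementation can fill a freshly allocated skb from the user's
 * iovec.  Error handling is elided and "hlen" (reserved header room)
 * is a hypothetical value:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len, noblock, &err);
 *	skb_reserve(skb, hlen);
 *	skb_put(skb, len);
 *	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
 */
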
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
                                      u8 __user *to, int len,
                                      __wsum *csump)
{
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        int pos = 0;

        /* Copy header. */
        if (copy > 0) {
                int err = 0;
                if (copy > len)
                        copy = len;
                *csump = csum_and_copy_to_user(skb->data + offset, to, copy,
                                               *csump, &err);
                if (err)
                        goto fault;
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                to += copy;
                pos = copy;
        }

        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_shinfo(skb)->frags[i].size;
                if ((copy = end - offset) > 0) {
                        __wsum csum2;
                        int err = 0;
                        u8 *vaddr;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        struct page *page = frag->page;

                        if (copy > len)
                                copy = len;
                        vaddr = kmap(page);
                        csum2 = csum_and_copy_to_user(vaddr +
                                                      frag->page_offset +
                                                      offset - start,
                                                      to, copy, 0, &err);
                        kunmap(page);
                        if (err)
                                goto fault;
                        *csump = csum_block_add(*csump, csum2, pos);
                        if (!(len -= copy))
                                return 0;
                        offset += copy;
                        to += copy;
                        pos += copy;
                }
                start = end;
        }

        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        __wsum csum2 = 0;
                        if (copy > len)
                                copy = len;
                        if (skb_copy_and_csum_datagram(frag_iter,
                                                       offset - start,
                                                       to, copy,
                                                       &csum2))
                                goto fault;
                        *csump = csum_block_add(*csump, csum2, pos);
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        to += copy;
                        pos += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

fault:
        return -EFAULT;
}

__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
        __sum16 sum;

        sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
        if (likely(!sum)) {
                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
                        netdev_rx_csum_fault(skb->dev);
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        }
        return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
        return __skb_checksum_complete_head(skb, skb->len);
}
EXPORT_SYMBOL(__skb_checksum_complete);

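/*
 * Usage sketch (illustrative): receive paths that want to validate a
 * packet without copying it (for example before honouring MSG_PEEK)
 * can do:
 *
 *	if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
 *	    __skb_checksum_complete(skb))
 *		goto csum_error;
 *
 * On success __skb_checksum_complete() sets CHECKSUM_UNNECESSARY so
 * the verification is not repeated for the same skb.
 */
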
/**
 * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
 * @skb: skbuff
 * @hlen: header length (the bytes at the start of @skb that are folded
 *        into the checksum but not copied)
 * @iov: io vector
 *
 * Caller _must_ check that the skb will fit into this iovec.
 *
 * Returns: 0 - success.
 *          -EINVAL - checksum failure.
 *          -EFAULT - fault during copy. Beware, in this case the iovec
 *                    can be modified!
 */
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
                                     int hlen, struct iovec *iov)
{
        __wsum csum;
        int chunk = skb->len - hlen;

        if (!chunk)
                return 0;

        /* Skip filled elements.
         * Pretty silly, look at memcpy_toiovec, though 8)
         */
        while (!iov->iov_len)
                iov++;

        if (iov->iov_len < chunk) {
                if (__skb_checksum_complete(skb))
                        goto csum_error;
                if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
                        goto fault;
        } else {
                csum = csum_partial(skb->data, hlen, skb->csum);
                if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
                                               chunk, &csum))
                        goto fault;
                if (csum_fold(csum))
                        goto csum_error;
                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
                        netdev_rx_csum_fault(skb->dev);
                iov->iov_len -= chunk;
                iov->iov_base += chunk;
        }
        return 0;
csum_error:
        return -EINVAL;
fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);

/**
 * datagram_poll - generic datagram poll
 * @file: file struct
 * @sock: socket
 * @wait: poll table
 *
 * Datagram poll: Again totally generic. This also handles
 * sequenced packet sockets provided the socket receive queue
 * only ever holds data ready to receive.
 *
 * Note: when you _don't_ use this routine for this protocol,
 * and you use a different write policy from sock_writeable(),
 * then please supply your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
                           poll_table *wait)
{
        struct sock *sk = sock->sk;
        unsigned int mask;

        sock_poll_wait(file, sk_sleep(sk), wait);
        mask = 0;

        /* exceptional events? */
        if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
                mask |= POLLERR;
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= POLLRDHUP;
        if (sk->sk_shutdown == SHUTDOWN_MASK)
                mask |= POLLHUP;

        /* readable? */
        if (!skb_queue_empty(&sk->sk_receive_queue) ||
            (sk->sk_shutdown & RCV_SHUTDOWN))
                mask |= POLLIN | POLLRDNORM;

        /* Connection-based sockets need to check for termination and startup */
        if (connection_based(sk)) {
                if (sk->sk_state == TCP_CLOSE)
                        mask |= POLLHUP;
                /* connection hasn't started yet? */
                if (sk->sk_state == TCP_SYN_SENT)
                        return mask;
        }

        /* writable? */
        if (sock_writeable(sk))
                mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
        else
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

        return mask;
}
EXPORT_SYMBOL(datagram_poll);

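/*
 * Usage sketch (illustrative): datagram_poll() is designed to be
 * plugged directly into a protocol's proto_ops table; only the .poll
 * line matters here, the surrounding ops table is hypothetical:
 *
 *	static const struct proto_ops xxx_dgram_ops = {
 *		.family	= PF_XXX,
 *		.poll	= datagram_poll,
 *		...
 *	};
 */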