/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layer all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@redhat.com>. (datagram_poll() from old
 *						     udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff	:	Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>

/*
 *	Is a socket 'connection oriented' ?
 */
static inline int connection_based(struct sock *sk)
{
	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

/*
 * Wait for a packet..
 *
 * Returns 0 when the caller should re-check the receive queue, 1 when
 * the socket was shut down for receive (*err is set to 0 so the caller
 * can report end-of-file), and a negative value when *err carries a
 * real error, or -ERESTARTSYS/-EINTR after a signal.
 */
static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
	int error;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (!skb_queue_empty(&sk->sk_receive_queue))
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so, we report the problem.
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk->sk_sleep, &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}

/**
 *	skb_recv_datagram - Receive a datagram skbuff
 *	@sk: socket
 *	@flags: MSG_ flags
 *	@noblock: if non-zero, don't block on an empty queue
 *	@err: error code returned
 *
 *	Get a datagram skbuff; understands the peeking, nonblocking wakeups
 *	and possible races. This replaces identical code in packet, raw and
 *	udp, as well as IPX, AX.25 and AppleTalk. It also finally fixes
 *	the long standing peek and read race for datagram sockets. If you
 *	alter this routine remember it must be re-entrant.
 *
 *	A returned skb carries its own reference; the caller owns it and
 *	must release it when done, usually by calling skb_free_datagram().
 *
 *	* It does not lock socket since today. This function is
 *	* free of race conditions. This measure should/can improve
 *	* significantly datagram socket latencies at high loads,
 *	* when data copying to user space takes lots of time.
 *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *	* 8) Great win.)
 *	*			--ANK (980729)
 *
 *	The order of the tests when we find no data waiting is specified
 *	quite explicitly by POSIX 1003.1g; don't change them without having
 *	the standard around, please.
 */
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
				  int noblock, int *err)
{
	struct sk_buff *skb;
	long timeo;
	/*
	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
	 */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	timeo = sock_rcvtimeo(sk, noblock);

	do {
		/* Again only user level code calls this function, so nothing
		 * interrupt level will suddenly eat the receive_queue.
		 *
		 * Look at current nfs client by the way...
		 * However, this function was correct in any case. 8)
		 */
		if (flags & MSG_PEEK) {
			unsigned long cpu_flags;

			spin_lock_irqsave(&sk->sk_receive_queue.lock,
					  cpu_flags);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				atomic_inc(&skb->users);
			spin_unlock_irqrestore(&sk->sk_receive_queue.lock,
					       cpu_flags);
		} else
			skb = skb_dequeue(&sk->sk_receive_queue);

		if (skb)
			return skb;

		/* User doesn't want to wait */
		error = -EAGAIN;
		if (!timeo)
			goto no_packet;

	} while (!wait_for_packet(sk, err, &timeo));

	return NULL;

no_packet:
	*err = error;
	return NULL;
}

void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
}

/**
 *	skb_kill_datagram - Free a datagram skbuff forcibly
 *	@sk: socket
 *	@skb: datagram skbuff
 *	@flags: MSG_ flags
 *
 *	This function frees a datagram skbuff that was received by
 *	skb_recv_datagram.  The flags argument must match the one
 *	used for skb_recv_datagram.
 *
 *	If the MSG_PEEK flag is set, and the packet is still on the
 *	receive queue of the socket, it will be taken off the queue
 *	before it is freed.
 *
 *	This function currently only disables BH when acquiring the
 *	sk_receive_queue lock.  Therefore it must not be used in a
 *	context where that lock is acquired in an IRQ context.
 */

void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
	if (flags & MSG_PEEK) {
		spin_lock_bh(&sk->sk_receive_queue.lock);
		if (skb == skb_peek(&sk->sk_receive_queue)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			atomic_dec(&skb->users);
		}
		spin_unlock_bh(&sk->sk_receive_queue.lock);
	}

	kfree_skb(skb);
}

EXPORT_SYMBOL(skb_kill_datagram);
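
/*
 * Illustrative sketch only (not part of the original file): the typical
 * recvmsg() shape that the helpers above were factored out of, as found
 * in packet, raw, udp and friends.  Everything prefixed my_proto_ is
 * hypothetical; sockaddr, cmsg and MSG_ERRQUEUE handling are elided.
 */
static int my_proto_recvmsg(struct kiocb *iocb, struct socket *sock,
			    struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;

	/* Dequeue (or peek, if MSG_PEEK) one datagram, translating
	 * O_NONBLOCK into the noblock argument via MSG_DONTWAIT. */
	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

	/* Releases the peek reference or frees the dequeued skb. */
	skb_free_datagram(sk, skb);
	return err ? err : copied;
}
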
/**
 *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: io vector to copy to
 *	@len: amount of data to copy from buffer to iovec
 *
 *	Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
			    struct iovec *to, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovec(to, skb->data + offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		BUG_TRAP(start <= offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			int err;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovec(to, vaddr + frag->page_offset +
					     offset - start, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;

		for (; list; list = list->next) {
			int end;

			BUG_TRAP(start <= offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				if (copy > len)
					copy = len;
				if (skb_copy_datagram_iovec(list,
							    offset - start,
							    to, copy))
					goto fault;
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
			}
			start = end;
		}
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
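
/*
 * Worked example of the start/end walk above, assuming a hypothetical
 * skb with a 100-byte linear head and two 50-byte page frags, and a
 * caller asking for offset=120, len=60:
 *
 *	head:   start=0,   end=100 -> copy = 100 - 120 < 0, skipped
 *	frag 0: start=100, end=150 -> copy = 30, reads frag 0 bytes 20..49
 *	frag 1: start=150, end=200 -> copy = 30, reads frag 1 bytes 0..29
 *
 * "start" always names the skb offset where the current area begins,
 * so "offset - start" converts an skb offset into an offset within the
 * current area.  That is all the complexity amounts to.
 */
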
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
				      u8 __user *to, int len,
				      __wsum *csump)
{
	int start = skb_headlen(skb);
	int pos = 0;
	int i, copy = start - offset;

	/* Copy header. */
	if (copy > 0) {
		int err = 0;
		if (copy > len)
			copy = len;
		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
					       *csump, &err);
		if (err)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		BUG_TRAP(start <= offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			int err = 0;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			csum2 = csum_and_copy_to_user(vaddr +
						      frag->page_offset +
						      offset - start,
						      to, copy, 0, &err);
			kunmap(page);
			if (err)
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if (!(len -= copy))
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;

		for (; list; list = list->next) {
			int end;

			BUG_TRAP(start <= offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				__wsum csum2 = 0;
				if (copy > len)
					copy = len;
				if (skb_copy_and_csum_datagram(list,
							       offset - start,
							       to, copy,
							       &csum2))
					goto fault;
				*csump = csum_block_add(*csump, csum2, pos);
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
				to += copy;
				pos += copy;
			}
			start = end;
		}
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}

__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
	__sum16 sum;

	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
			netdev_rx_csum_fault(skb->dev);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
	return __skb_checksum_complete_head(skb, skb->len);
}
EXPORT_SYMBOL(__skb_checksum_complete);
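
/*
 * Sketch of a typical caller (hypothetical name, not part of the
 * original file): a protocol that has seeded skb->csum with its
 * pseudo-header sum can validate the whole packet before copying.
 * This mirrors the skb_checksum_complete() helper in <linux/skbuff.h>.
 */
static inline int my_proto_checksum_ok(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
		return 1;	/* hardware already verified it */
	return __skb_checksum_complete(skb) == 0;
}
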
/**
 *	skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
 *	@skb: skbuff
 *	@hlen: length of the protocol header at the start of @skb
 *	       (copying starts past it)
 *	@iov: io vector
 *
 *	Caller _must_ check that skb will fit into this iovec.
 *
 *	Returns: 0       - success.
 *		 -EINVAL - checksum failure.
 *		 -EFAULT - fault during copy. Beware, in this case iovec
 *			   can be modified!
 */
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
				     int hlen, struct iovec *iov)
{
	__wsum csum;
	int chunk = skb->len - hlen;

	/* Skip filled elements.
	 * Pretty silly, look at memcpy_toiovec, though 8)
	 */
	while (!iov->iov_len)
		iov++;

	if (iov->iov_len < chunk) {
		if (__skb_checksum_complete(skb))
			goto csum_error;
		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
			goto fault;
	} else {
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
					       chunk, &csum))
			goto fault;
		if (csum_fold(csum))
			goto csum_error;
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
			netdev_rx_csum_fault(skb->dev);
		iov->iov_len -= chunk;
		iov->iov_base += chunk;
	}
	return 0;
csum_error:
	return -EINVAL;
fault:
	return -EFAULT;
}

/**
 *	datagram_poll - generic datagram poll
 *	@file: file struct
 *	@sock: socket
 *	@wait: poll table
 *
 *	Datagram poll: Again totally generic. This also handles
 *	sequenced packet sockets, provided the socket receive queue
 *	only ever holds data ready to receive.
 *
 *	Note: if you _don't_ use this routine for a protocol and you use
 *	a write policy different from sock_writeable(), please supply
 *	your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	poll_wait(file, sk->sk_sleep, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue) ||
	    (sk->sk_shutdown & RCV_SHUTDOWN))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}

EXPORT_SYMBOL(datagram_poll);
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
EXPORT_SYMBOL(skb_copy_datagram_iovec);
EXPORT_SYMBOL(skb_free_datagram);
EXPORT_SYMBOL(skb_recv_datagram);
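
/*
 * Sketch (hypothetical, not part of the original file): datagram
 * protocols typically plug datagram_poll straight into their proto_ops,
 * which is exactly the sharing the header comment describes.  Field
 * values here are illustrative only; the remaining ops are elided.
 */
static const struct proto_ops my_dgram_ops = {
	.family	= PF_INET,
	.owner	= THIS_MODULE,
	.poll	= datagram_poll,	/* the generic poll above */
	/* .release, .bind, .sendmsg, .recvmsg, ... */
};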