1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * RAW - implementation of IP "raw" sockets. 7 * 8 * Version: $Id: raw.c,v 1.1.1.1 2008/10/15 03:27:33 james26_jang Exp $ 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * 13 * Fixes: 14 * Alan Cox : verify_area() fixed up 15 * Alan Cox : ICMP error handling 16 * Alan Cox : EMSGSIZE if you send too big a packet 17 * Alan Cox : Now uses generic datagrams and shared 18 * skbuff library. No more peek crashes, 19 * no more backlogs 20 * Alan Cox : Checks sk->broadcast. 21 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram 22 * Alan Cox : Raw passes ip options too 23 * Alan Cox : Setsocketopt added 24 * Alan Cox : Fixed error return for broadcasts 25 * Alan Cox : Removed wake_up calls 26 * Alan Cox : Use ttl/tos 27 * Alan Cox : Cleaned up old debugging 28 * Alan Cox : Use new kernel side addresses 29 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. 30 * Alan Cox : BSD style RAW socket demultiplexing. 31 * Alan Cox : Beginnings of mrouted support. 32 * Alan Cox : Added IP_HDRINCL option. 33 * Alan Cox : Skip broadcast check if BSDism set. 34 * David S. Miller : New socket lookup architecture. 35 * 36 * This program is free software; you can redistribute it and/or 37 * modify it under the terms of the GNU General Public License 38 * as published by the Free Software Foundation; either version 39 * 2 of the License, or (at your option) any later version. 40 */ 41 42#include <linux/config.h> 43#include <asm/system.h> 44#include <asm/uaccess.h> 45#include <asm/ioctls.h> 46#include <linux/types.h> 47#include <linux/sched.h> 48#include <linux/errno.h> 49#include <linux/timer.h> 50#include <linux/mm.h> 51#include <linux/kernel.h> 52#include <linux/fcntl.h> 53#include <linux/socket.h> 54#include <linux/in.h> 55#include <linux/inet.h> 56#include <linux/netdevice.h> 57#include <linux/mroute.h> 58#include <net/ip.h> 59#include <net/protocol.h> 60#include <linux/skbuff.h> 61#include <net/sock.h> 62#include <net/icmp.h> 63#include <net/udp.h> 64#include <net/raw.h> 65#include <net/inet_common.h> 66#include <net/checksum.h> 67 68struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE]; 69rwlock_t raw_v4_lock = RW_LOCK_UNLOCKED; 70 71static void raw_v4_hash(struct sock *sk) 72{ 73 struct sock **skp = &raw_v4_htable[sk->num & (RAWV4_HTABLE_SIZE - 1)]; 74 75 write_lock_bh(&raw_v4_lock); 76 if ((sk->next = *skp) != NULL) 77 (*skp)->pprev = &sk->next; 78 *skp = sk; 79 sk->pprev = skp; 80 sock_prot_inc_use(sk->prot); 81 sock_hold(sk); 82 write_unlock_bh(&raw_v4_lock); 83} 84 85static void raw_v4_unhash(struct sock *sk) 86{ 87 write_lock_bh(&raw_v4_lock); 88 if (sk->pprev) { 89 if (sk->next) 90 sk->next->pprev = sk->pprev; 91 *sk->pprev = sk->next; 92 sk->pprev = NULL; 93 sock_prot_dec_use(sk->prot); 94 __sock_put(sk); 95 } 96 write_unlock_bh(&raw_v4_lock); 97} 98 99struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, 100 unsigned long raddr, unsigned long laddr, 101 int dif) 102{ 103 struct sock *s = sk; 104 105 for (s = sk; s; s = s->next) { 106 if (s->num == num && 107 !(s->daddr && s->daddr != raddr) && 108 !(s->rcv_saddr && s->rcv_saddr != laddr) && 109 !(s->bound_dev_if && s->bound_dev_if != dif)) 110 break; /* gotcha */ 111 } 112 return s; 113} 114 115/* 116 * 0 - deliver 117 * 1 - block 118 */ 119static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) 120{ 121 int type; 122 123 type = skb->h.icmph->type; 124 if (type < 32) { 125 __u32 data = sk->tp_pinfo.tp_raw4.filter.data; 126 127 return ((1 << type) & data) != 0; 128 } 129 130 /* Do not block unknown ICMP types */ 131 return 0; 132} 133 134/* IP input processing comes here for RAW socket delivery. 135 * This is fun as to avoid copies we want to make no surplus 136 * copies. 137 * 138 * RFC 1122: SHOULD pass TOS value up to the transport layer. 139 * -> It does. And not only TOS, but all IP header. 140 */ 141struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) 142{ 143 struct sock *sk; 144 145 read_lock(&raw_v4_lock); 146 if ((sk = raw_v4_htable[hash]) == NULL) 147 goto out; 148 sk = __raw_v4_lookup(sk, iph->protocol, 149 iph->saddr, iph->daddr, 150 skb->dev->ifindex); 151 152 while (sk) { 153 struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol, 154 iph->saddr, iph->daddr, 155 skb->dev->ifindex); 156 if (iph->protocol != IPPROTO_ICMP || 157 !icmp_filter(sk, skb)) { 158 struct sk_buff *clone; 159 160 if (!sknext) 161 break; 162 clone = skb_clone(skb, GFP_ATOMIC); 163 /* Not releasing hash table! */ 164 if (clone) 165 raw_rcv(sk, clone); 166 } 167 sk = sknext; 168 } 169out: 170 if (sk) 171 sock_hold(sk); 172 read_unlock(&raw_v4_lock); 173 174 return sk; 175} 176 177void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) 178{ 179 int type = skb->h.icmph->type; 180 int code = skb->h.icmph->code; 181 int err = 0; 182 int harderr = 0; 183 184 /* Report error on raw socket, if: 185 1. User requested ip_recverr. 186 2. Socket is connected (otherwise the error indication 187 is useless without ip_recverr and error is hard. 188 */ 189 if (!sk->protinfo.af_inet.recverr && sk->state != TCP_ESTABLISHED) 190 return; 191 192 switch (type) { 193 default: 194 case ICMP_TIME_EXCEEDED: 195 err = EHOSTUNREACH; 196 break; 197 case ICMP_SOURCE_QUENCH: 198 return; 199 case ICMP_PARAMETERPROB: 200 err = EPROTO; 201 harderr = 1; 202 break; 203 case ICMP_DEST_UNREACH: 204 err = EHOSTUNREACH; 205 if (code > NR_ICMP_UNREACH) 206 break; 207 err = icmp_err_convert[code].errno; 208 harderr = icmp_err_convert[code].fatal; 209 if (code == ICMP_FRAG_NEEDED) { 210 harderr = sk->protinfo.af_inet.pmtudisc != 211 IP_PMTUDISC_DONT; 212 err = EMSGSIZE; 213 } 214 } 215 216 if (sk->protinfo.af_inet.recverr) { 217 struct iphdr *iph = (struct iphdr*)skb->data; 218 u8 *payload = skb->data + (iph->ihl << 2); 219 220 if (sk->protinfo.af_inet.hdrincl) 221 payload = skb->data; 222 ip_icmp_error(sk, skb, err, 0, info, payload); 223 } 224 225 if (sk->protinfo.af_inet.recverr || harderr) { 226 sk->err = err; 227 sk->error_report(sk); 228 } 229} 230 231static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) 232{ 233 /* Charge it to the socket. */ 234 235 if (sock_queue_rcv_skb(sk, skb) < 0) { 236 IP_INC_STATS(IpInDiscards); 237 kfree_skb(skb); 238 return NET_RX_DROP; 239 } 240 241 IP_INC_STATS(IpInDelivers); 242 return NET_RX_SUCCESS; 243} 244 245int raw_rcv(struct sock *sk, struct sk_buff *skb) 246{ 247 skb_push(skb, skb->data - skb->nh.raw); 248 249 raw_rcv_skb(sk, skb); 250 return 0; 251} 252 253struct rawfakehdr 254{ 255 struct iovec *iov; 256 u32 saddr; 257 struct dst_entry *dst; 258}; 259 260/* 261 * Send a RAW IP packet. 262 */ 263 264/* 265 * Callback support is trivial for SOCK_RAW 266 */ 267 268static int raw_getfrag(const void *p, char *to, unsigned int offset, 269 unsigned int fraglen) 270{ 271 struct rawfakehdr *rfh = (struct rawfakehdr *) p; 272 return memcpy_fromiovecend(to, rfh->iov, offset, fraglen); 273} 274 275/* 276 * IPPROTO_RAW needs extra work. 277 */ 278 279static int raw_getrawfrag(const void *p, char *to, unsigned int offset, 280 unsigned int fraglen) 281{ 282 struct rawfakehdr *rfh = (struct rawfakehdr *) p; 283 284 if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen)) 285 return -EFAULT; 286 287 if (!offset) { 288 struct iphdr *iph = (struct iphdr *)to; 289 if (!iph->saddr) 290 iph->saddr = rfh->saddr; 291 iph->check = 0; 292 iph->tot_len = htons(fraglen); /* This is right as you can't 293 frag RAW packets */ 294 /* 295 * Deliberate breach of modularity to keep 296 * ip_build_xmit clean (well less messy). 297 */ 298 if (!iph->id) 299 ip_select_ident(iph, rfh->dst, NULL); 300 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 301 } 302 return 0; 303} 304 305static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) 306{ 307 struct ipcm_cookie ipc; 308 struct rawfakehdr rfh; 309 struct rtable *rt = NULL; 310 int free = 0; 311 u32 daddr; 312 u8 tos; 313 int err; 314 315 /* This check is ONLY to check for arithmetic overflow 316 on integer(!) len. Not more! Real check will be made 317 in ip_build_xmit --ANK 318 319 BTW socket.c -> af_*.c -> ... make multiple 320 invalid conversions size_t -> int. We MUST repair it f.e. 321 by replacing all of them with size_t and revise all 322 the places sort of len += sizeof(struct iphdr) 323 If len was ULONG_MAX-10 it would be cathastrophe --ANK 324 */ 325 326 err = -EMSGSIZE; 327 if (len < 0 || len > 0xFFFF) 328 goto out; 329 330 /* 331 * Check the flags. 332 */ 333 334 err = -EOPNOTSUPP; 335 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ 336 goto out; /* compatibility */ 337 338 /* 339 * Get and verify the address. 340 */ 341 342 if (msg->msg_namelen) { 343 struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name; 344 err = -EINVAL; 345 if (msg->msg_namelen < sizeof(*usin)) 346 goto out; 347 if (usin->sin_family != AF_INET) { 348 static int complained; 349 if (!complained++) 350 printk(KERN_INFO "%s forgot to set AF_INET in " 351 "raw sendmsg. Fix it!\n", 352 current->comm); 353 err = -EINVAL; 354 if (usin->sin_family) 355 goto out; 356 } 357 daddr = usin->sin_addr.s_addr; 358 /* ANK: I did not forget to get protocol from port field. 359 * I just do not know, who uses this weirdness. 360 * IP_HDRINCL is much more convenient. 361 */ 362 } else { 363 err = -EINVAL; 364 if (sk->state != TCP_ESTABLISHED) 365 goto out; 366 daddr = sk->daddr; 367 } 368 369 ipc.addr = sk->saddr; 370 ipc.opt = NULL; 371 ipc.oif = sk->bound_dev_if; 372 373 if (msg->msg_controllen) { 374 err = ip_cmsg_send(msg, &ipc); 375 if (err) 376 goto out; 377 if (ipc.opt) 378 free = 1; 379 } 380 381 rfh.saddr = ipc.addr; 382 ipc.addr = daddr; 383 384 if (!ipc.opt) 385 ipc.opt = sk->protinfo.af_inet.opt; 386 387 if (ipc.opt) { 388 err = -EINVAL; 389 /* Linux does not mangle headers on raw sockets, 390 * so that IP options + IP_HDRINCL is non-sense. 391 */ 392 if (sk->protinfo.af_inet.hdrincl) 393 goto done; 394 if (ipc.opt->srr) { 395 if (!daddr) 396 goto done; 397 daddr = ipc.opt->faddr; 398 } 399 } 400 tos = RT_TOS(sk->protinfo.af_inet.tos) | sk->localroute; 401 if (msg->msg_flags & MSG_DONTROUTE) 402 tos |= RTO_ONLINK; 403 404 if (MULTICAST(daddr)) { 405 if (!ipc.oif) 406 ipc.oif = sk->protinfo.af_inet.mc_index; 407 if (!rfh.saddr) 408 rfh.saddr = sk->protinfo.af_inet.mc_addr; 409 } 410 411 err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif); 412 413 if (err) 414 goto done; 415 416 err = -EACCES; 417 if (rt->rt_flags & RTCF_BROADCAST && !sk->broadcast) 418 goto done; 419 420 if (msg->msg_flags & MSG_CONFIRM) 421 goto do_confirm; 422back_from_confirm: 423 424 rfh.iov = msg->msg_iov; 425 rfh.saddr = rt->rt_src; 426 rfh.dst = &rt->u.dst; 427 if (!ipc.addr) 428 ipc.addr = rt->rt_dst; 429 err = ip_build_xmit(sk, sk->protinfo.af_inet.hdrincl ? raw_getrawfrag : 430 raw_getfrag, &rfh, len, &ipc, rt, msg->msg_flags); 431 432done: 433 if (free) 434 kfree(ipc.opt); 435 ip_rt_put(rt); 436 437out: return err < 0 ? err : len; 438 439do_confirm: 440 dst_confirm(&rt->u.dst); 441 if (!(msg->msg_flags & MSG_PROBE) || len) 442 goto back_from_confirm; 443 err = 0; 444 goto done; 445} 446 447static void raw_close(struct sock *sk, long timeout) 448{ 449 /* 450 * Raw sockets may have direct kernel refereneces. Kill them. 451 */ 452 ip_ra_control(sk, 0, NULL); 453 454 inet_sock_release(sk); 455} 456 457/* This gets rid of all the nasties in af_inet. -DaveM */ 458static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) 459{ 460 struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; 461 int ret = -EINVAL; 462 int chk_addr_ret; 463 464 if (sk->state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) 465 goto out; 466 chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); 467 ret = -EADDRNOTAVAIL; 468 if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && 469 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) 470 goto out; 471 sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr; 472 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) 473 sk->saddr = 0; /* Use device */ 474 sk_dst_reset(sk); 475 ret = 0; 476out: return ret; 477} 478 479/* 480 * This should be easy, if there is something there 481 * we return it, otherwise we block. 482 */ 483 484int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, 485 int noblock, int flags, int *addr_len) 486{ 487 int copied = 0; 488 int err = -EOPNOTSUPP; 489 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; 490 struct sk_buff *skb; 491 492 if (flags & MSG_OOB) 493 goto out; 494 495 if (addr_len) 496 *addr_len = sizeof(*sin); 497 498 if (flags & MSG_ERRQUEUE) { 499 err = ip_recv_error(sk, msg, len); 500 goto out; 501 } 502 503 skb = skb_recv_datagram(sk, flags, noblock, &err); 504 if (!skb) 505 goto out; 506 507 copied = skb->len; 508 if (len < copied) { 509 msg->msg_flags |= MSG_TRUNC; 510 copied = len; 511 } 512 513 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 514 if (err) 515 goto done; 516 517 sock_recv_timestamp(msg, sk, skb); 518 519 /* Copy the address. */ 520 if (sin) { 521 sin->sin_family = AF_INET; 522 sin->sin_addr.s_addr = skb->nh.iph->saddr; 523 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); 524 } 525 if (sk->protinfo.af_inet.cmsg_flags) 526 ip_cmsg_recv(msg, skb); 527done: 528 skb_free_datagram(sk, skb); 529out: return err ? : copied; 530} 531 532static int raw_init(struct sock *sk) 533{ 534 struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4); 535 if (sk->num == IPPROTO_ICMP) 536 memset(&tp->filter, 0, sizeof(tp->filter)); 537 return 0; 538} 539 540static int raw_seticmpfilter(struct sock *sk, char *optval, int optlen) 541{ 542 if (optlen > sizeof(struct icmp_filter)) 543 optlen = sizeof(struct icmp_filter); 544 if (copy_from_user(&sk->tp_pinfo.tp_raw4.filter, optval, optlen)) 545 return -EFAULT; 546 return 0; 547} 548 549static int raw_geticmpfilter(struct sock *sk, char *optval, int *optlen) 550{ 551 int len, ret = -EFAULT; 552 553 if (get_user(len, optlen)) 554 goto out; 555 ret = -EINVAL; 556 if (len < 0) 557 goto out; 558 if (len > sizeof(struct icmp_filter)) 559 len = sizeof(struct icmp_filter); 560 ret = -EFAULT; 561 if (put_user(len, optlen) || 562 copy_to_user(optval, &sk->tp_pinfo.tp_raw4.filter, len)) 563 goto out; 564 ret = 0; 565out: return ret; 566} 567 568static int raw_setsockopt(struct sock *sk, int level, int optname, 569 char *optval, int optlen) 570{ 571 if (level != SOL_RAW) 572 return ip_setsockopt(sk, level, optname, optval, optlen); 573 574 if (optname == ICMP_FILTER) { 575 if (sk->num != IPPROTO_ICMP) 576 return -EOPNOTSUPP; 577 else 578 return raw_seticmpfilter(sk, optval, optlen); 579 } 580 return -ENOPROTOOPT; 581} 582 583static int raw_getsockopt(struct sock *sk, int level, int optname, 584 char *optval, int *optlen) 585{ 586 if (level != SOL_RAW) 587 return ip_getsockopt(sk, level, optname, optval, optlen); 588 589 if (optname == ICMP_FILTER) { 590 if (sk->num != IPPROTO_ICMP) 591 return -EOPNOTSUPP; 592 else 593 return raw_geticmpfilter(sk, optval, optlen); 594 } 595 return -ENOPROTOOPT; 596} 597 598static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) 599{ 600 switch (cmd) { 601 case SIOCOUTQ: { 602 int amount = atomic_read(&sk->wmem_alloc); 603 return put_user(amount, (int *)arg); 604 } 605 case SIOCINQ: { 606 struct sk_buff *skb; 607 int amount = 0; 608 609 spin_lock_irq(&sk->receive_queue.lock); 610 skb = skb_peek(&sk->receive_queue); 611 if (skb != NULL) 612 amount = skb->len; 613 spin_unlock_irq(&sk->receive_queue.lock); 614 return put_user(amount, (int *)arg); 615 } 616 617 default: 618#ifdef CONFIG_IP_MROUTE 619 return ipmr_ioctl(sk, cmd, arg); 620#else 621 return -ENOIOCTLCMD; 622#endif 623 } 624} 625 626static void get_raw_sock(struct sock *sp, char *tmpbuf, int i) 627{ 628 unsigned int dest = sp->daddr, 629 src = sp->rcv_saddr; 630 __u16 destp = 0, 631 srcp = sp->num; 632 633 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" 634 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p", 635 i, src, srcp, dest, destp, sp->state, 636 atomic_read(&sp->wmem_alloc), atomic_read(&sp->rmem_alloc), 637 0, 0L, 0, 638 sock_i_uid(sp), 0, 639 sock_i_ino(sp), 640 atomic_read(&sp->refcnt), sp); 641} 642 643int raw_get_info(char *buffer, char **start, off_t offset, int length) 644{ 645 int len = 0, num = 0, i; 646 off_t pos = 128; 647 off_t begin; 648 char tmpbuf[129]; 649 650 if (offset < 128) 651 len += sprintf(buffer, "%-127s\n", 652 " sl local_address rem_address st tx_queue " 653 "rx_queue tr tm->when retrnsmt uid timeout " 654 "inode"); 655 read_lock(&raw_v4_lock); 656 for (i = 0; i < RAWV4_HTABLE_SIZE; i++) { 657 struct sock *sk; 658 659 for (sk = raw_v4_htable[i]; sk; sk = sk->next, num++) { 660 if (sk->family != PF_INET) 661 continue; 662 pos += 128; 663 if (pos <= offset) 664 continue; 665 get_raw_sock(sk, tmpbuf, i); 666 len += sprintf(buffer + len, "%-127s\n", tmpbuf); 667 if (len >= length) 668 goto out; 669 } 670 } 671out: 672 read_unlock(&raw_v4_lock); 673 begin = len - (pos - offset); 674 *start = buffer + begin; 675 len -= begin; 676 if (len > length) 677 len = length; 678 if (len < 0) 679 len = 0; 680 return len; 681} 682 683struct proto raw_prot = { 684 name: "RAW", 685 close: raw_close, 686 connect: udp_connect, 687 disconnect: udp_disconnect, 688 ioctl: raw_ioctl, 689 init: raw_init, 690 setsockopt: raw_setsockopt, 691 getsockopt: raw_getsockopt, 692 sendmsg: raw_sendmsg, 693 recvmsg: raw_recvmsg, 694 bind: raw_bind, 695 backlog_rcv: raw_rcv_skb, 696 hash: raw_v4_hash, 697 unhash: raw_v4_unhash, 698}; 699