#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */

/* Fragment cache limits. We will commit 256K at one time. Should we
 * cross that limit we will prune down to 192K. This should cope with
 * even the most extreme cases without allowing an attacker to measurably
 * harm machine performance.
 */
int sysctl_ipfrag_high_thresh __read_mostly = 256*1024;
int sysctl_ipfrag_low_thresh __read_mostly = 192*1024;

int sysctl_ipfrag_max_dist __read_mostly = 64;

/* Important NOTE! The fragment queue must be destroyed before the MSL expires.
 * RFC 791 is wrong in proposing to prolong the timer by the TTL on each
 * fragment arrival.
 */
int sysctl_ipfrag_time __read_mostly = IP_FRAG_TIME;

struct ipfrag_skb_cb
{
	struct inet_skb_parm	h;
	int			offset;
};

#define FRAG_CB(skb)	((struct ipfrag_skb_cb*)((skb)->cb))

/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
	struct hlist_node list;
	struct list_head lru_list;	/* lru list member */
	u32		user;
	__be32		saddr;
	__be32		daddr;
	__be16		id;
	u8		protocol;
	u8		last_in;
#define COMPLETE	4
#define FIRST_IN	2
#define LAST_IN		1

	struct sk_buff	*fragments;	/* linked list of received fragments */
	int		len;		/* total length of original datagram */
	int		meat;		/* bytes of fragment data received so far */
	spinlock_t	lock;
	atomic_t	refcnt;
	struct timer_list timer;	/* when will this queue expire? */
	int		iif;
	ktime_t		stamp;
	unsigned int	rid;
	struct inet_peer *peer;
};
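/* A worked example of the bookkeeping above (the values are illustrative):
 * two fragments sharing the (id, saddr, daddr, protocol, user) key always
 * land in the same ipq.
 *
 *	frag A: offset 0,    1480 data bytes, MF set   -> FIRST_IN, meat = 1480
 *	frag B: offset 1480,  520 data bytes, MF clear -> LAST_IN,  len  = 2000
 *
 * ip_defrag() triggers reassembly once last_in == (FIRST_IN|LAST_IN)
 * and meat == len.
 */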
/* Hash table. */

#define IPQ_HASHSZ	64

/* Per-bucket lock is easy to add now. */
static struct hlist_head ipq_hash[IPQ_HASHSZ];
static DEFINE_RWLOCK(ipfrag_lock);
static u32 ipfrag_hash_rnd;
static LIST_HEAD(ipq_lru_list);
int ip_frag_nqueues = 0;

static __inline__ void __ipq_unlink(struct ipq *qp)
{
	hlist_del(&qp->list);
	list_del(&qp->lru_list);
	ip_frag_nqueues--;
}

static __inline__ void ipq_unlink(struct ipq *ipq)
{
	write_lock(&ipfrag_lock);
	__ipq_unlink(ipq);
	write_unlock(&ipfrag_lock);
}

static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
	return jhash_3words((__force u32)id << 16 | prot,
			    (__force u32)saddr, (__force u32)daddr,
			    ipfrag_hash_rnd) & (IPQ_HASHSZ - 1);
}

static struct timer_list ipfrag_secret_timer;
int sysctl_ipfrag_secret_interval __read_mostly = 10 * 60 * HZ;

static void ipfrag_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;
	int i;

	write_lock(&ipfrag_lock);
	get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
	for (i = 0; i < IPQ_HASHSZ; i++) {
		struct ipq *q;
		struct hlist_node *p, *n;

		hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) {
			unsigned int hval = ipqhashfn(q->id, q->saddr,
						      q->daddr, q->protocol);

			if (hval != i) {
				hlist_del(&q->list);

				/* Relink to new hash chain. */
				hlist_add_head(&q->list, &ipq_hash[hval]);
			}
		}
	}
	write_unlock(&ipfrag_lock);

	mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval);
}

atomic_t ip_frag_mem = ATOMIC_INIT(0);	/* Memory used for fragments */

/* Memory Tracking Functions. */
static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
{
	if (work)
		*work -= skb->truesize;
	atomic_sub(skb->truesize, &ip_frag_mem);
	kfree_skb(skb);
}

static __inline__ void frag_free_queue(struct ipq *qp, int *work)
{
	if (work)
		*work -= sizeof(struct ipq);
	atomic_sub(sizeof(struct ipq), &ip_frag_mem);
	kfree(qp);
}

static __inline__ struct ipq *frag_alloc_queue(void)
{
	struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);

	if (!qp)
		return NULL;
	atomic_add(sizeof(struct ipq), &ip_frag_mem);
	return qp;
}


/* Destruction primitives. */

/* Complete destruction of ipq. */
static void ip_frag_destroy(struct ipq *qp, int *work)
{
	struct sk_buff *fp;

	BUG_TRAP(qp->last_in & COMPLETE);
	BUG_TRAP(del_timer(&qp->timer) == 0);

	if (qp->peer)
		inet_putpeer(qp->peer);

	/* Release all fragment data. */
	fp = qp->fragments;
	while (fp) {
		struct sk_buff *xp = fp->next;

		frag_kfree_skb(fp, work);
		fp = xp;
	}

	/* Finally, release the queue descriptor itself. */
	frag_free_queue(qp, work);
}

static __inline__ void ipq_put(struct ipq *ipq, int *work)
{
	if (atomic_dec_and_test(&ipq->refcnt))
		ip_frag_destroy(ipq, work);
}

/* Kill ipq entry. It is not destroyed immediately,
 * because the caller (and possibly others) holds a reference.
 */
static void ipq_kill(struct ipq *ipq)
{
	if (del_timer(&ipq->timer))
		atomic_dec(&ipq->refcnt);

	if (!(ipq->last_in & COMPLETE)) {
		ipq_unlink(ipq);
		atomic_dec(&ipq->refcnt);
		ipq->last_in |= COMPLETE;
	}
}

/* Memory limiting on fragments. Evictor trashes the oldest
 * fragment queue until we are back under the threshold.
 */
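/* Worked example of the eviction hysteresis, using the default sysctl
 * values (both thresholds are tunables, so the numbers are illustrative):
 *
 *	ip_frag_mem = 260K > sysctl_ipfrag_high_thresh (256K)
 *	    -> ip_defrag() invokes ip_evictor()
 *	work = 260K - sysctl_ipfrag_low_thresh (192K) = 68K
 *	    -> at least 68K worth of queues are freed, oldest
 *	       (head of ipq_lru_list) first, then eviction stops.
 */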
static void ip_evictor(void)
{
	struct ipq *qp;
	struct list_head *tmp;
	int work;

	work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
	if (work <= 0)
		return;

	while (work > 0) {
		read_lock(&ipfrag_lock);
		if (list_empty(&ipq_lru_list)) {
			read_unlock(&ipfrag_lock);
			return;
		}
		tmp = ipq_lru_list.next;
		qp = list_entry(tmp, struct ipq, lru_list);
		atomic_inc(&qp->refcnt);
		read_unlock(&ipfrag_lock);

		spin_lock(&qp->lock);
		if (!(qp->last_in & COMPLETE))
			ipq_kill(qp);
		spin_unlock(&qp->lock);

		ipq_put(qp, &work);
		IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
	}
}

/*
 * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
 */
static void ip_expire(unsigned long arg)
{
	struct ipq *qp = (struct ipq *) arg;

	spin_lock(&qp->lock);

	if (qp->last_in & COMPLETE)
		goto out;

	ipq_kill(qp);

	IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT);
	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);

	if ((qp->last_in & FIRST_IN) && qp->fragments != NULL) {
		struct sk_buff *head = qp->fragments;
		/* Send an ICMP "Fragment Reassembly Timeout" message. */
		if ((head->dev = dev_get_by_index(qp->iif)) != NULL) {
			icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
			dev_put(head->dev);
		}
	}
out:
	spin_unlock(&qp->lock);
	ipq_put(qp, NULL);
}

/* Creation primitives. */

static struct ipq *ip_frag_intern(struct ipq *qp_in)
{
	struct ipq *qp;
#ifdef CONFIG_SMP
	struct hlist_node *n;
#endif
	unsigned int hash;

	write_lock(&ipfrag_lock);
	hash = ipqhashfn(qp_in->id, qp_in->saddr, qp_in->daddr,
			 qp_in->protocol);
#ifdef CONFIG_SMP
	/* On SMP we have to recheck the hash table, because such an
	 * entry could have been created on another CPU while we
	 * promoted the read lock to the write lock.
	 */
	hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
		if (qp->id == qp_in->id &&
		    qp->saddr == qp_in->saddr &&
		    qp->daddr == qp_in->daddr &&
		    qp->protocol == qp_in->protocol &&
		    qp->user == qp_in->user) {
			atomic_inc(&qp->refcnt);
			write_unlock(&ipfrag_lock);
			qp_in->last_in |= COMPLETE;
			ipq_put(qp_in, NULL);
			return qp;
		}
	}
#endif
	qp = qp_in;

	if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time))
		atomic_inc(&qp->refcnt);

	atomic_inc(&qp->refcnt);
	hlist_add_head(&qp->list, &ipq_hash[hash]);
	INIT_LIST_HEAD(&qp->lru_list);
	list_add_tail(&qp->lru_list, &ipq_lru_list);
	ip_frag_nqueues++;
	write_unlock(&ipfrag_lock);
	return qp;
}
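/* The duplicate-insert race that ip_frag_intern() rechecks for, as a
 * hypothetical interleaving of two CPUs receiving fragments of the
 * same datagram:
 *
 *	CPU0: ip_find() misses under the read lock and drops it
 *	CPU1: ip_find() also misses, creates and interns its own qp
 *	CPU0: ip_frag_create() -> ip_frag_intern() takes the write lock,
 *	      re-scans the chain, finds CPU1's qp, marks its own copy
 *	      COMPLETE, drops it, and returns the existing queue.
 */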
/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
static struct ipq *ip_frag_create(struct iphdr *iph, u32 user)
{
	struct ipq *qp;

	if ((qp = frag_alloc_queue()) == NULL)
		goto out_nomem;

	qp->protocol = iph->protocol;
	qp->last_in = 0;
	qp->id = iph->id;
	qp->saddr = iph->saddr;
	qp->daddr = iph->daddr;
	qp->user = user;
	qp->len = 0;
	qp->meat = 0;
	qp->fragments = NULL;
	qp->iif = 0;
	qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;

	/* Initialize a timer for this entry. */
	init_timer(&qp->timer);
	qp->timer.data = (unsigned long) qp;	/* pointer to queue */
	qp->timer.function = ip_expire;		/* expire function */
	spin_lock_init(&qp->lock);
	atomic_set(&qp->refcnt, 1);

	return ip_frag_intern(qp);

out_nomem:
	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left!\n");
	return NULL;
}

/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create a new one, if nothing is found.
 */
static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
{
	__be16 id = iph->id;
	__be32 saddr = iph->saddr;
	__be32 daddr = iph->daddr;
	__u8 protocol = iph->protocol;
	unsigned int hash;
	struct ipq *qp;
	struct hlist_node *n;

	read_lock(&ipfrag_lock);
	hash = ipqhashfn(id, saddr, daddr, protocol);
	hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
		if (qp->id == id &&
		    qp->saddr == saddr &&
		    qp->daddr == daddr &&
		    qp->protocol == protocol &&
		    qp->user == user) {
			atomic_inc(&qp->refcnt);
			read_unlock(&ipfrag_lock);
			return qp;
		}
	}
	read_unlock(&ipfrag_lock);

	return ip_frag_create(iph, user);
}

/* Is the fragment too far ahead to be part of ipq? */
static inline int ip_frag_too_far(struct ipq *qp)
{
	struct inet_peer *peer = qp->peer;
	unsigned int max = sysctl_ipfrag_max_dist;
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

	rc = qp->fragments && (end - start) > max;

	if (rc) {
		IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *fp;

	if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
		atomic_inc(&qp->refcnt);
		return -ETIMEDOUT;
	}

	fp = qp->fragments;
	do {
		struct sk_buff *xp = fp->next;
		frag_kfree_skb(fp, NULL);
		fp = xp;
	} while (fp);

	qp->last_in = 0;
	qp->len = 0;
	qp->meat = 0;
	qp->fragments = NULL;
	qp->iif = 0;

	return 0;
}

/* Add new segment to existing queue. */
static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct sk_buff *prev, *next;
	int flags, offset;
	int ihl, end;

	if (qp->last_in & COMPLETE)
		goto err;

	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
	    unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
		ipq_kill(qp);
		goto err;
	}

	offset = ntohs(ip_hdr(skb)->frag_off);
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
	ihl = ip_hdrlen(skb);
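	/* Decoding frag_off, worked through for an illustrative value.
	 * With frag_off = 0x2005 on the wire:
	 *
	 *	ntohs()                      -> 0x2005
	 *	flags  = 0x2005 & ~IP_OFFSET = 0x2000 (IP_MF set, more coming)
	 *	offset = 0x2005 &  IP_OFFSET = 5
	 *	offset <<= 3                 -> 40 (data starts at byte 40)
	 */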
	/* Determine the position of this fragment. */
	end = offset + skb->len - ihl;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have a different end, the segment is corrupted.
		 */
		if (end < qp->len ||
		    ((qp->last_in & LAST_IN) && end != qp->len))
			goto err;
		qp->last_in |= LAST_IN;
		qp->len = end;
	} else {
		if (end & 7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
		if (end > qp->len) {
			/* Some bits beyond end -> corruption. */
			if (qp->last_in & LAST_IN)
				goto err;
			qp->len = end;
		}
	}
	if (end == offset)
		goto err;

	if (pskb_pull(skb, ihl) == NULL)
		goto err;
	if (pskb_trim_rcsum(skb, end - offset))
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far. We must know where to put
	 * this fragment, right?
	 */
	prev = NULL;
	for (next = qp->fragments; next != NULL; next = next->next) {
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

	/* We found where to put this one. Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			offset += i;
			if (end <= offset)
				goto err;
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
			qp->meat -= i;
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

			/* The old fragment is completely overridden by
			 * the new one; drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
				qp->fragments = next;

			qp->meat -= free_it->len;
			frag_kfree_skb(free_it, NULL);
		}
	}

	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (prev)
		prev->next = skb;
	else
		qp->fragments = skb;

	if (skb->dev)
		qp->iif = skb->dev->ifindex;
	skb->dev = NULL;
	qp->stamp = skb->tstamp;
	qp->meat += skb->len;
	atomic_add(skb->truesize, &ip_frag_mem);
	if (offset == 0)
		qp->last_in |= FIRST_IN;

	write_lock(&ipfrag_lock);
	list_move_tail(&qp->lru_list, &ipq_lru_list);
	write_unlock(&ipfrag_lock);

	return;

err:
	kfree_skb(skb);
}
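/* The overlap trimming in ip_frag_queue(), worked through with
 * illustrative byte ranges (note that new data wins over old on overlap):
 *
 *	queued:  [0,1000) -> [1000,2000)
 *	new skb: offset 800, end 1800
 *
 *	prev = [0,1000):    i = 1000 - 800 = 200 > 0, so the NEW skb is
 *	    pulled by 200 bytes and now covers [1000,1800).
 *	next = [1000,2000): i = 1800 - 1000 = 800 < next->len, so the OLD
 *	    fragment loses its first 800 bytes and becomes [1800,2000);
 *	    qp->meat is reduced by 800 and regains it when the new skb
 *	    is accounted, leaving [0,2000) fully covered.
 */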
/* Build a new IP datagram from all its fragments. */

static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
{
	struct iphdr *iph;
	struct sk_buff *fp, *head = qp->fragments;
	int len;
	int ihlen;

	ipq_kill(qp);

	BUG_TRAP(head != NULL);
	BUG_TRAP(FRAG_CB(head)->offset == 0);

	/* Allocate a new buffer for the datagram. */
	ihlen = ip_hdrlen(head);
	len = ihlen + qp->len;

	if (len > 65535)
		goto out_oversize;

	/* Head of list must not be cloned. */
	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it into two chunks: the first holding the data and paged part,
	 * and the second holding only the fragment list.
	 */
	if (skb_shinfo(head)->frag_list) {
		struct sk_buff *clone;
		int i, plen = 0;

		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
			goto out_nomem;
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_shinfo(head)->frag_list = NULL;
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_shinfo(head)->frags[i].size;
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		atomic_add(clone->truesize, &ip_frag_mem);
	}

	skb_shinfo(head)->frag_list = head->next;
	skb_push(head, head->data - skb_network_header(head));
	atomic_sub(head->truesize, &ip_frag_mem);

	for (fp = head->next; fp; fp = fp->next) {
		head->data_len += fp->len;
		head->len += fp->len;
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
		else if (head->ip_summed == CHECKSUM_COMPLETE)
			head->csum = csum_add(head->csum, fp->csum);
		head->truesize += fp->truesize;
		atomic_sub(fp->truesize, &ip_frag_mem);
	}

	head->next = NULL;
	head->dev = dev;
	head->tstamp = qp->stamp;

	iph = ip_hdr(head);
	iph->frag_off = 0;
	iph->tot_len = htons(len);
	IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
	qp->fragments = NULL;
	return head;

out_nomem:
	LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
			        "queue %p\n", qp);
	goto out_fail;
out_oversize:
	if (net_ratelimit())
		printk(KERN_INFO
		       "Oversized IP packet from %d.%d.%d.%d.\n",
		       NIPQUAD(qp->saddr));
out_fail:
	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
	return NULL;
}

/* Process an incoming IP datagram fragment. */
struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
{
	struct ipq *qp;
	struct net_device *dev;

	IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);

	/* Start by cleaning up the memory. */
	if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
		ip_evictor();

	dev = skb->dev;

	/* Lookup (or create) queue header */
	if ((qp = ip_find(ip_hdr(skb), user)) != NULL) {
		struct sk_buff *ret = NULL;

		spin_lock(&qp->lock);

		ip_frag_queue(qp, skb);

		if (qp->last_in == (FIRST_IN | LAST_IN) &&
		    qp->meat == qp->len)
			ret = ip_frag_reasm(qp, dev);

		spin_unlock(&qp->lock);
		ipq_put(qp, NULL);
		return ret;
	}

	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
	return NULL;
}

void __init ipfrag_init(void)
{
	ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages >> 7)) ^
				 (jiffies ^ (jiffies >> 6)));

	init_timer(&ipfrag_secret_timer);
	ipfrag_secret_timer.function = ipfrag_secret_rebuild;
	ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
	add_timer(&ipfrag_secret_timer);
}

EXPORT_SYMBOL(ip_defrag);
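/* Typical caller pattern, sketched after ip_local_deliver() (details vary
 * between callers and kernel versions; shown for illustration only):
 *
 *	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
 *		skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
 *		if (!skb)
 *			return 0;	// fragment queued; wait for the rest
 *	}
 *	// skb now holds the complete, reassembled datagram
 */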