1/* 2 * ip_vs_xmit.c: various packet transmitters for IPVS 3 * 4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 5 * Julian Anastasov <ja@ssi.bg> 6 * 7 * This program is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License 9 * as published by the Free Software Foundation; either version 10 * 2 of the License, or (at your option) any later version. 11 * 12 * Changes: 13 * 14 */ 15 16#define KMSG_COMPONENT "IPVS" 17#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 18 19#include <linux/kernel.h> 20#include <linux/slab.h> 21#include <linux/tcp.h> /* for tcphdr */ 22#include <net/ip.h> 23#include <net/tcp.h> /* for csum_tcpudp_magic */ 24#include <net/udp.h> 25#include <net/icmp.h> /* for icmp_send */ 26#include <net/route.h> /* for ip_route_output */ 27#include <net/ipv6.h> 28#include <net/ip6_route.h> 29#include <linux/icmpv6.h> 30#include <linux/netfilter.h> 31#include <net/netfilter/nf_conntrack.h> 32#include <linux/netfilter_ipv4.h> 33 34#include <net/ip_vs.h> 35 36 37/* 38 * Destination cache to speed up outgoing route lookup 39 */ 40static inline void 41__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) 42{ 43 struct dst_entry *old_dst; 44 45 old_dst = dest->dst_cache; 46 dest->dst_cache = dst; 47 dest->dst_rtos = rtos; 48 dst_release(old_dst); 49} 50 51static inline struct dst_entry * 52__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) 53{ 54 struct dst_entry *dst = dest->dst_cache; 55 56 if (!dst) 57 return NULL; 58 if ((dst->obsolete 59 || (dest->af == AF_INET && rtos != dest->dst_rtos)) && 60 dst->ops->check(dst, cookie) == NULL) { 61 dest->dst_cache = NULL; 62 dst_release(dst); 63 return NULL; 64 } 65 dst_hold(dst); 66 return dst; 67} 68 69static struct rtable * 70__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) 71{ 72 struct rtable *rt; /* Route to the other host */ 73 struct ip_vs_dest *dest = cp->dest; 74 75 if (dest) { 76 spin_lock(&dest->dst_lock); 77 if (!(rt = (struct rtable *) 78 __ip_vs_dst_check(dest, rtos, 0))) { 79 struct flowi fl = { 80 .oif = 0, 81 .nl_u = { 82 .ip4_u = { 83 .daddr = dest->addr.ip, 84 .saddr = 0, 85 .tos = rtos, } }, 86 }; 87 88 if (ip_route_output_key(&init_net, &rt, &fl)) { 89 spin_unlock(&dest->dst_lock); 90 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", 91 &dest->addr.ip); 92 return NULL; 93 } 94 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst)); 95 IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n", 96 &dest->addr.ip, 97 atomic_read(&rt->dst.__refcnt), rtos); 98 } 99 spin_unlock(&dest->dst_lock); 100 } else { 101 struct flowi fl = { 102 .oif = 0, 103 .nl_u = { 104 .ip4_u = { 105 .daddr = cp->daddr.ip, 106 .saddr = 0, 107 .tos = rtos, } }, 108 }; 109 110 if (ip_route_output_key(&init_net, &rt, &fl)) { 111 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", 112 &cp->daddr.ip); 113 return NULL; 114 } 115 } 116 117 return rt; 118} 119 120#ifdef CONFIG_IP_VS_IPV6 121static struct rt6_info * 122__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) 123{ 124 struct rt6_info *rt; /* Route to the other host */ 125 struct ip_vs_dest *dest = cp->dest; 126 127 if (dest) { 128 spin_lock(&dest->dst_lock); 129 rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); 130 if (!rt) { 131 struct flowi fl = { 132 .oif = 0, 133 .nl_u = { 134 .ip6_u = { 135 .daddr = dest->addr.in6, 136 .saddr = { 137 .s6_addr32 = 138 { 0, 0, 0, 0 }, 139 }, 140 }, 141 }, 142 }; 143 144 rt = (struct rt6_info *)ip6_route_output(&init_net, 145 NULL, &fl); 146 if (!rt) { 147 spin_unlock(&dest->dst_lock); 148 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", 149 &dest->addr.in6); 150 return NULL; 151 } 152 __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst)); 153 IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n", 154 &dest->addr.in6, 155 atomic_read(&rt->dst.__refcnt)); 156 } 157 spin_unlock(&dest->dst_lock); 158 } else { 159 struct flowi fl = { 160 .oif = 0, 161 .nl_u = { 162 .ip6_u = { 163 .daddr = cp->daddr.in6, 164 .saddr = { 165 .s6_addr32 = { 0, 0, 0, 0 }, 166 }, 167 }, 168 }, 169 }; 170 171 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); 172 if (!rt) { 173 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", 174 &cp->daddr.in6); 175 return NULL; 176 } 177 } 178 179 return rt; 180} 181#endif 182 183 184/* 185 * Release dest->dst_cache before a dest is removed 186 */ 187void 188ip_vs_dst_reset(struct ip_vs_dest *dest) 189{ 190 struct dst_entry *old_dst; 191 192 old_dst = dest->dst_cache; 193 dest->dst_cache = NULL; 194 dst_release(old_dst); 195} 196 197#define IP_VS_XMIT(pf, skb, rt) \ 198do { \ 199 (skb)->ipvs_property = 1; \ 200 skb_forward_csum(skb); \ 201 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ 202 (rt)->dst.dev, dst_output); \ 203} while (0) 204 205 206/* 207 * NULL transmitter (do nothing except return NF_ACCEPT) 208 */ 209int 210ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 211 struct ip_vs_protocol *pp) 212{ 213 /* we do not touch skb and do not need pskb ptr */ 214 return NF_ACCEPT; 215} 216 217 218/* 219 * Bypass transmitter 220 * Let packets bypass the destination when the destination is not 221 * available, it may be only used in transparent cache cluster. 222 */ 223int 224ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 225 struct ip_vs_protocol *pp) 226{ 227 struct rtable *rt; /* Route to the other host */ 228 struct iphdr *iph = ip_hdr(skb); 229 u8 tos = iph->tos; 230 int mtu; 231 struct flowi fl = { 232 .oif = 0, 233 .nl_u = { 234 .ip4_u = { 235 .daddr = iph->daddr, 236 .saddr = 0, 237 .tos = RT_TOS(tos), } }, 238 }; 239 240 EnterFunction(10); 241 242 if (ip_route_output_key(&init_net, &rt, &fl)) { 243 IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n", 244 __func__, &iph->daddr); 245 goto tx_error_icmp; 246 } 247 248 /* MTU checking */ 249 mtu = dst_mtu(&rt->dst); 250 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 251 ip_rt_put(rt); 252 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 253 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 254 goto tx_error; 255 } 256 257 /* 258 * Call ip_send_check because we are not sure it is called 259 * after ip_defrag. Is copy-on-write needed? 260 */ 261 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { 262 ip_rt_put(rt); 263 return NF_STOLEN; 264 } 265 ip_send_check(ip_hdr(skb)); 266 267 /* drop old route */ 268 skb_dst_drop(skb); 269 skb_dst_set(skb, &rt->dst); 270 271 /* Another hack: avoid icmp_send in ip_fragment */ 272 skb->local_df = 1; 273 274 IP_VS_XMIT(NFPROTO_IPV4, skb, rt); 275 276 LeaveFunction(10); 277 return NF_STOLEN; 278 279 tx_error_icmp: 280 dst_link_failure(skb); 281 tx_error: 282 kfree_skb(skb); 283 LeaveFunction(10); 284 return NF_STOLEN; 285} 286 287#ifdef CONFIG_IP_VS_IPV6 288int 289ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 290 struct ip_vs_protocol *pp) 291{ 292 struct rt6_info *rt; /* Route to the other host */ 293 struct ipv6hdr *iph = ipv6_hdr(skb); 294 int mtu; 295 struct flowi fl = { 296 .oif = 0, 297 .nl_u = { 298 .ip6_u = { 299 .daddr = iph->daddr, 300 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, 301 }; 302 303 EnterFunction(10); 304 305 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); 306 if (!rt) { 307 IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n", 308 __func__, &iph->daddr); 309 goto tx_error_icmp; 310 } 311 312 /* MTU checking */ 313 mtu = dst_mtu(&rt->dst); 314 if (skb->len > mtu) { 315 dst_release(&rt->dst); 316 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 317 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 318 goto tx_error; 319 } 320 321 /* 322 * Call ip_send_check because we are not sure it is called 323 * after ip_defrag. Is copy-on-write needed? 324 */ 325 skb = skb_share_check(skb, GFP_ATOMIC); 326 if (unlikely(skb == NULL)) { 327 dst_release(&rt->dst); 328 return NF_STOLEN; 329 } 330 331 /* drop old route */ 332 skb_dst_drop(skb); 333 skb_dst_set(skb, &rt->dst); 334 335 /* Another hack: avoid icmp_send in ip_fragment */ 336 skb->local_df = 1; 337 338 IP_VS_XMIT(NFPROTO_IPV6, skb, rt); 339 340 LeaveFunction(10); 341 return NF_STOLEN; 342 343 tx_error_icmp: 344 dst_link_failure(skb); 345 tx_error: 346 kfree_skb(skb); 347 LeaveFunction(10); 348 return NF_STOLEN; 349} 350#endif 351 352void 353ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) 354{ 355 struct nf_conn *ct = (struct nf_conn *)skb->nfct; 356 struct nf_conntrack_tuple new_tuple; 357 358 if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct)) 359 return; 360 361 /* 362 * The connection is not yet in the hashtable, so we update it. 363 * CIP->VIP will remain the same, so leave the tuple in 364 * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the 365 * real-server we will see RIP->DIP. 366 */ 367 new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; 368 if (outin) 369 new_tuple.src.u3 = cp->daddr; 370 else 371 new_tuple.dst.u3 = cp->vaddr; 372 /* 373 * This will also take care of UDP and other protocols. 374 */ 375 if (outin) 376 new_tuple.src.u.tcp.port = cp->dport; 377 else 378 new_tuple.dst.u.tcp.port = cp->vport; 379 nf_conntrack_alter_reply(ct, &new_tuple); 380} 381 382/* 383 * NAT transmitter (only for outside-to-inside nat forwarding) 384 * Not used for related ICMP 385 */ 386int 387ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 388 struct ip_vs_protocol *pp) 389{ 390 struct rtable *rt; /* Route to the other host */ 391 int mtu; 392 struct iphdr *iph = ip_hdr(skb); 393 394 EnterFunction(10); 395 396 /* check if it is a connection of no-client-port */ 397 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { 398 __be16 _pt, *p; 399 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); 400 if (p == NULL) 401 goto tx_error; 402 ip_vs_conn_fill_cport(cp, *p); 403 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 404 } 405 406 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 407 goto tx_error_icmp; 408 409 /* MTU checking */ 410 mtu = dst_mtu(&rt->dst); 411 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 412 ip_rt_put(rt); 413 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 414 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); 415 goto tx_error; 416 } 417 418 /* copy-on-write the packet before mangling it */ 419 if (!skb_make_writable(skb, sizeof(struct iphdr))) 420 goto tx_error_put; 421 422 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 423 goto tx_error_put; 424 425 /* drop old route */ 426 skb_dst_drop(skb); 427 skb_dst_set(skb, &rt->dst); 428 429 /* mangle the packet */ 430 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) 431 goto tx_error; 432 ip_hdr(skb)->daddr = cp->daddr.ip; 433 ip_send_check(ip_hdr(skb)); 434 435 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 436 437 ip_vs_update_conntrack(skb, cp, 1); 438 439 440 /* Another hack: avoid icmp_send in ip_fragment */ 441 skb->local_df = 1; 442 443 IP_VS_XMIT(NFPROTO_IPV4, skb, rt); 444 445 LeaveFunction(10); 446 return NF_STOLEN; 447 448 tx_error_icmp: 449 dst_link_failure(skb); 450 tx_error: 451 LeaveFunction(10); 452 kfree_skb(skb); 453 return NF_STOLEN; 454 tx_error_put: 455 ip_rt_put(rt); 456 goto tx_error; 457} 458 459#ifdef CONFIG_IP_VS_IPV6 460int 461ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 462 struct ip_vs_protocol *pp) 463{ 464 struct rt6_info *rt; /* Route to the other host */ 465 int mtu; 466 467 EnterFunction(10); 468 469 /* check if it is a connection of no-client-port */ 470 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { 471 __be16 _pt, *p; 472 p = skb_header_pointer(skb, sizeof(struct ipv6hdr), 473 sizeof(_pt), &_pt); 474 if (p == NULL) 475 goto tx_error; 476 ip_vs_conn_fill_cport(cp, *p); 477 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 478 } 479 480 rt = __ip_vs_get_out_rt_v6(cp); 481 if (!rt) 482 goto tx_error_icmp; 483 484 /* MTU checking */ 485 mtu = dst_mtu(&rt->dst); 486 if (skb->len > mtu) { 487 dst_release(&rt->dst); 488 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 489 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 490 "ip_vs_nat_xmit_v6(): frag needed for"); 491 goto tx_error; 492 } 493 494 /* copy-on-write the packet before mangling it */ 495 if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) 496 goto tx_error_put; 497 498 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 499 goto tx_error_put; 500 501 /* drop old route */ 502 skb_dst_drop(skb); 503 skb_dst_set(skb, &rt->dst); 504 505 /* mangle the packet */ 506 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) 507 goto tx_error; 508 ipv6_hdr(skb)->daddr = cp->daddr.in6; 509 510 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 511 512 ip_vs_update_conntrack(skb, cp, 1); 513 514 515 /* Another hack: avoid icmp_send in ip_fragment */ 516 skb->local_df = 1; 517 518 IP_VS_XMIT(NFPROTO_IPV6, skb, rt); 519 520 LeaveFunction(10); 521 return NF_STOLEN; 522 523tx_error_icmp: 524 dst_link_failure(skb); 525tx_error: 526 LeaveFunction(10); 527 kfree_skb(skb); 528 return NF_STOLEN; 529tx_error_put: 530 dst_release(&rt->dst); 531 goto tx_error; 532} 533#endif 534 535 536/* 537 * IP Tunneling transmitter 538 * 539 * This function encapsulates the packet in a new IP packet, its 540 * destination will be set to cp->daddr. Most code of this function 541 * is taken from ipip.c. 542 * 543 * It is used in VS/TUN cluster. The load balancer selects a real 544 * server from a cluster based on a scheduling algorithm, 545 * encapsulates the request packet and forwards it to the selected 546 * server. For example, all real servers are configured with 547 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives 548 * the encapsulated packet, it will decapsulate the packet, processe 549 * the request and return the response packets directly to the client 550 * without passing the load balancer. This can greatly increase the 551 * scalability of virtual server. 552 * 553 * Used for ANY protocol 554 */ 555int 556ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 557 struct ip_vs_protocol *pp) 558{ 559 struct rtable *rt; /* Route to the other host */ 560 struct net_device *tdev; /* Device to other host */ 561 struct iphdr *old_iph = ip_hdr(skb); 562 u8 tos = old_iph->tos; 563 __be16 df = old_iph->frag_off; 564 sk_buff_data_t old_transport_header = skb->transport_header; 565 struct iphdr *iph; /* Our new IP header */ 566 unsigned int max_headroom; /* The extra header space needed */ 567 int mtu; 568 569 EnterFunction(10); 570 571 if (skb->protocol != htons(ETH_P_IP)) { 572 IP_VS_DBG_RL("%s(): protocol error, " 573 "ETH_P_IP: %d, skb protocol: %d\n", 574 __func__, htons(ETH_P_IP), skb->protocol); 575 goto tx_error; 576 } 577 578 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) 579 goto tx_error_icmp; 580 581 tdev = rt->dst.dev; 582 583 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 584 if (mtu < 68) { 585 ip_rt_put(rt); 586 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); 587 goto tx_error; 588 } 589 if (skb_dst(skb)) 590 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 591 592 df |= (old_iph->frag_off & htons(IP_DF)); 593 594 if ((old_iph->frag_off & htons(IP_DF)) 595 && mtu < ntohs(old_iph->tot_len)) { 596 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 597 ip_rt_put(rt); 598 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 599 goto tx_error; 600 } 601 602 /* 603 * Okay, now see if we can stuff it in the buffer as-is. 604 */ 605 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); 606 607 if (skb_headroom(skb) < max_headroom 608 || skb_cloned(skb) || skb_shared(skb)) { 609 struct sk_buff *new_skb = 610 skb_realloc_headroom(skb, max_headroom); 611 if (!new_skb) { 612 ip_rt_put(rt); 613 kfree_skb(skb); 614 IP_VS_ERR_RL("%s(): no memory\n", __func__); 615 return NF_STOLEN; 616 } 617 kfree_skb(skb); 618 skb = new_skb; 619 old_iph = ip_hdr(skb); 620 } 621 622 skb->transport_header = old_transport_header; 623 624 /* fix old IP header checksum */ 625 ip_send_check(old_iph); 626 627 skb_push(skb, sizeof(struct iphdr)); 628 skb_reset_network_header(skb); 629 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 630 631 /* drop old route */ 632 skb_dst_drop(skb); 633 skb_dst_set(skb, &rt->dst); 634 635 /* 636 * Push down and install the IPIP header. 637 */ 638 iph = ip_hdr(skb); 639 iph->version = 4; 640 iph->ihl = sizeof(struct iphdr)>>2; 641 iph->frag_off = df; 642 iph->protocol = IPPROTO_IPIP; 643 iph->tos = tos; 644 iph->daddr = rt->rt_dst; 645 iph->saddr = rt->rt_src; 646 iph->ttl = old_iph->ttl; 647 ip_select_ident(iph, &rt->dst, NULL); 648 649 /* Another hack: avoid icmp_send in ip_fragment */ 650 skb->local_df = 1; 651 652 ip_local_out(skb); 653 654 LeaveFunction(10); 655 656 return NF_STOLEN; 657 658 tx_error_icmp: 659 dst_link_failure(skb); 660 tx_error: 661 kfree_skb(skb); 662 LeaveFunction(10); 663 return NF_STOLEN; 664} 665 666#ifdef CONFIG_IP_VS_IPV6 667int 668ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 669 struct ip_vs_protocol *pp) 670{ 671 struct rt6_info *rt; /* Route to the other host */ 672 struct net_device *tdev; /* Device to other host */ 673 struct ipv6hdr *old_iph = ipv6_hdr(skb); 674 sk_buff_data_t old_transport_header = skb->transport_header; 675 struct ipv6hdr *iph; /* Our new IP header */ 676 unsigned int max_headroom; /* The extra header space needed */ 677 int mtu; 678 679 EnterFunction(10); 680 681 if (skb->protocol != htons(ETH_P_IPV6)) { 682 IP_VS_DBG_RL("%s(): protocol error, " 683 "ETH_P_IPV6: %d, skb protocol: %d\n", 684 __func__, htons(ETH_P_IPV6), skb->protocol); 685 goto tx_error; 686 } 687 688 rt = __ip_vs_get_out_rt_v6(cp); 689 if (!rt) 690 goto tx_error_icmp; 691 692 tdev = rt->dst.dev; 693 694 mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); 695 /* TODO IPv6: do we need this check in IPv6? */ 696 if (mtu < 1280) { 697 dst_release(&rt->dst); 698 IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__); 699 goto tx_error; 700 } 701 if (skb_dst(skb)) 702 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 703 704 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { 705 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 706 dst_release(&rt->dst); 707 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 708 goto tx_error; 709 } 710 711 /* 712 * Okay, now see if we can stuff it in the buffer as-is. 713 */ 714 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); 715 716 if (skb_headroom(skb) < max_headroom 717 || skb_cloned(skb) || skb_shared(skb)) { 718 struct sk_buff *new_skb = 719 skb_realloc_headroom(skb, max_headroom); 720 if (!new_skb) { 721 dst_release(&rt->dst); 722 kfree_skb(skb); 723 IP_VS_ERR_RL("%s(): no memory\n", __func__); 724 return NF_STOLEN; 725 } 726 kfree_skb(skb); 727 skb = new_skb; 728 old_iph = ipv6_hdr(skb); 729 } 730 731 skb->transport_header = old_transport_header; 732 733 skb_push(skb, sizeof(struct ipv6hdr)); 734 skb_reset_network_header(skb); 735 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 736 737 /* drop old route */ 738 skb_dst_drop(skb); 739 skb_dst_set(skb, &rt->dst); 740 741 /* 742 * Push down and install the IPIP header. 743 */ 744 iph = ipv6_hdr(skb); 745 iph->version = 6; 746 iph->nexthdr = IPPROTO_IPV6; 747 iph->payload_len = old_iph->payload_len; 748 be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); 749 iph->priority = old_iph->priority; 750 memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); 751 iph->daddr = rt->rt6i_dst.addr; 752 iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ 753 iph->hop_limit = old_iph->hop_limit; 754 755 /* Another hack: avoid icmp_send in ip_fragment */ 756 skb->local_df = 1; 757 758 ip6_local_out(skb); 759 760 LeaveFunction(10); 761 762 return NF_STOLEN; 763 764tx_error_icmp: 765 dst_link_failure(skb); 766tx_error: 767 kfree_skb(skb); 768 LeaveFunction(10); 769 return NF_STOLEN; 770} 771#endif 772 773 774/* 775 * Direct Routing transmitter 776 * Used for ANY protocol 777 */ 778int 779ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 780 struct ip_vs_protocol *pp) 781{ 782 struct rtable *rt; /* Route to the other host */ 783 struct iphdr *iph = ip_hdr(skb); 784 int mtu; 785 786 EnterFunction(10); 787 788 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 789 goto tx_error_icmp; 790 791 /* MTU checking */ 792 mtu = dst_mtu(&rt->dst); 793 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { 794 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 795 ip_rt_put(rt); 796 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 797 goto tx_error; 798 } 799 800 /* 801 * Call ip_send_check because we are not sure it is called 802 * after ip_defrag. Is copy-on-write needed? 803 */ 804 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { 805 ip_rt_put(rt); 806 return NF_STOLEN; 807 } 808 ip_send_check(ip_hdr(skb)); 809 810 /* drop old route */ 811 skb_dst_drop(skb); 812 skb_dst_set(skb, &rt->dst); 813 814 /* Another hack: avoid icmp_send in ip_fragment */ 815 skb->local_df = 1; 816 817 IP_VS_XMIT(NFPROTO_IPV4, skb, rt); 818 819 LeaveFunction(10); 820 return NF_STOLEN; 821 822 tx_error_icmp: 823 dst_link_failure(skb); 824 tx_error: 825 kfree_skb(skb); 826 LeaveFunction(10); 827 return NF_STOLEN; 828} 829 830#ifdef CONFIG_IP_VS_IPV6 831int 832ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 833 struct ip_vs_protocol *pp) 834{ 835 struct rt6_info *rt; /* Route to the other host */ 836 int mtu; 837 838 EnterFunction(10); 839 840 rt = __ip_vs_get_out_rt_v6(cp); 841 if (!rt) 842 goto tx_error_icmp; 843 844 /* MTU checking */ 845 mtu = dst_mtu(&rt->dst); 846 if (skb->len > mtu) { 847 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 848 dst_release(&rt->dst); 849 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 850 goto tx_error; 851 } 852 853 /* 854 * Call ip_send_check because we are not sure it is called 855 * after ip_defrag. Is copy-on-write needed? 856 */ 857 skb = skb_share_check(skb, GFP_ATOMIC); 858 if (unlikely(skb == NULL)) { 859 dst_release(&rt->dst); 860 return NF_STOLEN; 861 } 862 863 /* drop old route */ 864 skb_dst_drop(skb); 865 skb_dst_set(skb, &rt->dst); 866 867 /* Another hack: avoid icmp_send in ip_fragment */ 868 skb->local_df = 1; 869 870 IP_VS_XMIT(NFPROTO_IPV6, skb, rt); 871 872 LeaveFunction(10); 873 return NF_STOLEN; 874 875tx_error_icmp: 876 dst_link_failure(skb); 877tx_error: 878 kfree_skb(skb); 879 LeaveFunction(10); 880 return NF_STOLEN; 881} 882#endif 883 884 885/* 886 * ICMP packet transmitter 887 * called by the ip_vs_in_icmp 888 */ 889int 890ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 891 struct ip_vs_protocol *pp, int offset) 892{ 893 struct rtable *rt; /* Route to the other host */ 894 int mtu; 895 int rc; 896 897 EnterFunction(10); 898 899 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be 900 forwarded directly here, because there is no need to 901 translate address/port back */ 902 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { 903 if (cp->packet_xmit) 904 rc = cp->packet_xmit(skb, cp, pp); 905 else 906 rc = NF_ACCEPT; 907 /* do not touch skb anymore */ 908 atomic_inc(&cp->in_pkts); 909 goto out; 910 } 911 912 /* 913 * mangle and send the packet here (only for VS/NAT) 914 */ 915 916 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) 917 goto tx_error_icmp; 918 919 /* MTU checking */ 920 mtu = dst_mtu(&rt->dst); 921 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { 922 ip_rt_put(rt); 923 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 924 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 925 goto tx_error; 926 } 927 928 /* copy-on-write the packet before mangling it */ 929 if (!skb_make_writable(skb, offset)) 930 goto tx_error_put; 931 932 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 933 goto tx_error_put; 934 935 /* drop the old route when skb is not shared */ 936 skb_dst_drop(skb); 937 skb_dst_set(skb, &rt->dst); 938 939 ip_vs_nat_icmp(skb, pp, cp, 0); 940 941 /* Another hack: avoid icmp_send in ip_fragment */ 942 skb->local_df = 1; 943 944 IP_VS_XMIT(NFPROTO_IPV4, skb, rt); 945 946 rc = NF_STOLEN; 947 goto out; 948 949 tx_error_icmp: 950 dst_link_failure(skb); 951 tx_error: 952 dev_kfree_skb(skb); 953 rc = NF_STOLEN; 954 out: 955 LeaveFunction(10); 956 return rc; 957 tx_error_put: 958 ip_rt_put(rt); 959 goto tx_error; 960} 961 962#ifdef CONFIG_IP_VS_IPV6 963int 964ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 965 struct ip_vs_protocol *pp, int offset) 966{ 967 struct rt6_info *rt; /* Route to the other host */ 968 int mtu; 969 int rc; 970 971 EnterFunction(10); 972 973 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be 974 forwarded directly here, because there is no need to 975 translate address/port back */ 976 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { 977 if (cp->packet_xmit) 978 rc = cp->packet_xmit(skb, cp, pp); 979 else 980 rc = NF_ACCEPT; 981 /* do not touch skb anymore */ 982 atomic_inc(&cp->in_pkts); 983 goto out; 984 } 985 986 /* 987 * mangle and send the packet here (only for VS/NAT) 988 */ 989 990 rt = __ip_vs_get_out_rt_v6(cp); 991 if (!rt) 992 goto tx_error_icmp; 993 994 /* MTU checking */ 995 mtu = dst_mtu(&rt->dst); 996 if (skb->len > mtu) { 997 dst_release(&rt->dst); 998 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 999 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 1000 goto tx_error; 1001 } 1002 1003 /* copy-on-write the packet before mangling it */ 1004 if (!skb_make_writable(skb, offset)) 1005 goto tx_error_put; 1006 1007 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 1008 goto tx_error_put; 1009 1010 /* drop the old route when skb is not shared */ 1011 skb_dst_drop(skb); 1012 skb_dst_set(skb, &rt->dst); 1013 1014 ip_vs_nat_icmp_v6(skb, pp, cp, 0); 1015 1016 /* Another hack: avoid icmp_send in ip_fragment */ 1017 skb->local_df = 1; 1018 1019 IP_VS_XMIT(NFPROTO_IPV6, skb, rt); 1020 1021 rc = NF_STOLEN; 1022 goto out; 1023 1024tx_error_icmp: 1025 dst_link_failure(skb); 1026tx_error: 1027 dev_kfree_skb(skb); 1028 rc = NF_STOLEN; 1029out: 1030 LeaveFunction(10); 1031 return rc; 1032tx_error_put: 1033 dst_release(&rt->dst); 1034 goto tx_error; 1035} 1036#endif 1037