1/* 2 * ip_vs_xmit.c: various packet transmitters for IPVS 3 * 4 * Version: $Id: ip_vs_xmit.c,v 1.1.1.1 2007/08/03 18:53:52 Exp $ 5 * 6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 7 * Julian Anastasov <ja@ssi.bg> 8 * 9 * This program is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU General Public License 11 * as published by the Free Software Foundation; either version 12 * 2 of the License, or (at your option) any later version. 13 * 14 * Changes: 15 * 16 */ 17 18#include <linux/kernel.h> 19#include <linux/ip.h> 20#include <linux/tcp.h> /* for tcphdr */ 21#include <net/tcp.h> /* for csum_tcpudp_magic */ 22#include <net/udp.h> 23#include <net/icmp.h> /* for icmp_send */ 24#include <net/route.h> /* for ip_route_output */ 25#include <linux/netfilter.h> 26#include <linux/netfilter_ipv4.h> 27 28#include <net/ip_vs.h> 29 30 31/* 32 * Destination cache to speed up outgoing route lookup 33 */ 34static inline void 35__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) 36{ 37 struct dst_entry *old_dst; 38 39 old_dst = dest->dst_cache; 40 dest->dst_cache = dst; 41 dest->dst_rtos = rtos; 42 dst_release(old_dst); 43} 44 45static inline struct dst_entry * 46__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) 47{ 48 struct dst_entry *dst = dest->dst_cache; 49 50 if (!dst) 51 return NULL; 52 if ((dst->obsolete || rtos != dest->dst_rtos) && 53 dst->ops->check(dst, cookie) == NULL) { 54 dest->dst_cache = NULL; 55 dst_release(dst); 56 return NULL; 57 } 58 dst_hold(dst); 59 return dst; 60} 61 62static inline struct rtable * 63__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) 64{ 65 struct rtable *rt; /* Route to the other host */ 66 struct ip_vs_dest *dest = cp->dest; 67 68 if (dest) { 69 spin_lock(&dest->dst_lock); 70 if (!(rt = (struct rtable *) 71 __ip_vs_dst_check(dest, rtos, 0))) { 72 struct flowi fl = { 73 .oif = 0, 74 .nl_u = { 75 .ip4_u = { 76 .daddr = dest->addr, 77 .saddr = 0, 78 .tos = rtos, } }, 79 }; 80 81 if (ip_route_output_key(&rt, &fl)) { 82 spin_unlock(&dest->dst_lock); 83 IP_VS_DBG_RL("ip_route_output error, " 84 "dest: %u.%u.%u.%u\n", 85 NIPQUAD(dest->addr)); 86 return NULL; 87 } 88 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); 89 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", 90 NIPQUAD(dest->addr), 91 atomic_read(&rt->u.dst.__refcnt), rtos); 92 } 93 spin_unlock(&dest->dst_lock); 94 } else { 95 struct flowi fl = { 96 .oif = 0, 97 .nl_u = { 98 .ip4_u = { 99 .daddr = cp->daddr, 100 .saddr = 0, 101 .tos = rtos, } }, 102 }; 103 104 if (ip_route_output_key(&rt, &fl)) { 105 IP_VS_DBG_RL("ip_route_output error, dest: " 106 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); 107 return NULL; 108 } 109 } 110 111 return rt; 112} 113 114 115/* 116 * Release dest->dst_cache before a dest is removed 117 */ 118void 119ip_vs_dst_reset(struct ip_vs_dest *dest) 120{ 121 struct dst_entry *old_dst; 122 123 old_dst = dest->dst_cache; 124 dest->dst_cache = NULL; 125 dst_release(old_dst); 126} 127 128#define IP_VS_XMIT(skb, rt) \ 129do { \ 130 (skb)->ipvs_property = 1; \ 131 (skb)->ip_summed = CHECKSUM_NONE; \ 132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ 133 (rt)->u.dst.dev, dst_output); \ 134} while (0) 135 136 137/* 138 * NULL transmitter (do nothing except return NF_ACCEPT) 139 */ 140int 141ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 142 struct ip_vs_protocol *pp) 143{ 144 /* we do not touch skb and do not need pskb ptr */ 145 return NF_ACCEPT; 146} 147 148 149/* 150 * Bypass transmitter 151 * Let packets bypass the destination when the destination is not 152 * available, it may be only used in transparent cache cluster. 153 */ 154int 155ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 156 struct ip_vs_protocol *pp) 157{ 158 struct rtable *rt; /* Route to the other host */ 159 struct iphdr *iph = ip_hdr(skb); 160 u8 tos = iph->tos; 161 int mtu; 162 struct flowi fl = { 163 .oif = 0, 164 .nl_u = { 165 .ip4_u = { 166 .daddr = iph->daddr, 167 .saddr = 0, 168 .tos = RT_TOS(tos), } }, 169 }; 170 171 EnterFunction(10); 172 173 if (ip_route_output_key(&rt, &fl)) { 174 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " 175 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); 176 goto tx_error_icmp; 177 } 178 179 /* MTU checking */ 180 mtu = dst_mtu(&rt->u.dst); 181 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 182 ip_rt_put(rt); 183 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 184 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); 185 goto tx_error; 186 } 187 188 /* 189 * Call ip_send_check because we are not sure it is called 190 * after ip_defrag. Is copy-on-write needed? 191 */ 192 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { 193 ip_rt_put(rt); 194 return NF_STOLEN; 195 } 196 ip_send_check(ip_hdr(skb)); 197 198 /* drop old route */ 199 dst_release(skb->dst); 200 skb->dst = &rt->u.dst; 201 202 /* Another hack: avoid icmp_send in ip_fragment */ 203 skb->local_df = 1; 204 205 IP_VS_XMIT(skb, rt); 206 207 LeaveFunction(10); 208 return NF_STOLEN; 209 210 tx_error_icmp: 211 dst_link_failure(skb); 212 tx_error: 213 kfree_skb(skb); 214 LeaveFunction(10); 215 return NF_STOLEN; 216} 217 218 219/* 220 * NAT transmitter (only for outside-to-inside nat forwarding) 221 * Not used for related ICMP 222 */ 223int 224ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 225 struct ip_vs_protocol *pp) 226{ 227 struct rtable *rt; /* Route to the other host */ 228 int mtu; 229 struct iphdr *iph = ip_hdr(skb); 230 231 EnterFunction(10); 232 233 /* check if it is a connection of no-client-port */ 234 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { 235 __be16 _pt, *p; 236 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); 237 if (p == NULL) 238 goto tx_error; 239 ip_vs_conn_fill_cport(cp, *p); 240 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 241 } 242 243 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 244 goto tx_error_icmp; 245 246 /* MTU checking */ 247 mtu = dst_mtu(&rt->u.dst); 248 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 249 ip_rt_put(rt); 250 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 251 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); 252 goto tx_error; 253 } 254 255 /* copy-on-write the packet before mangling it */ 256 if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr))) 257 goto tx_error_put; 258 259 if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) 260 goto tx_error_put; 261 262 /* drop old route */ 263 dst_release(skb->dst); 264 skb->dst = &rt->u.dst; 265 266 /* mangle the packet */ 267 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) 268 goto tx_error; 269 ip_hdr(skb)->daddr = cp->daddr; 270 ip_send_check(ip_hdr(skb)); 271 272 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 273 274 275 /* Another hack: avoid icmp_send in ip_fragment */ 276 skb->local_df = 1; 277 278 IP_VS_XMIT(skb, rt); 279 280 LeaveFunction(10); 281 return NF_STOLEN; 282 283 tx_error_icmp: 284 dst_link_failure(skb); 285 tx_error: 286 LeaveFunction(10); 287 kfree_skb(skb); 288 return NF_STOLEN; 289 tx_error_put: 290 ip_rt_put(rt); 291 goto tx_error; 292} 293 294 295/* 296 * IP Tunneling transmitter 297 * 298 * This function encapsulates the packet in a new IP packet, its 299 * destination will be set to cp->daddr. Most code of this function 300 * is taken from ipip.c. 301 * 302 * It is used in VS/TUN cluster. The load balancer selects a real 303 * server from a cluster based on a scheduling algorithm, 304 * encapsulates the request packet and forwards it to the selected 305 * server. For example, all real servers are configured with 306 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives 307 * the encapsulated packet, it will decapsulate the packet, processe 308 * the request and return the response packets directly to the client 309 * without passing the load balancer. This can greatly increase the 310 * scalability of virtual server. 311 * 312 * Used for ANY protocol 313 */ 314int 315ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 316 struct ip_vs_protocol *pp) 317{ 318 struct rtable *rt; /* Route to the other host */ 319 struct net_device *tdev; /* Device to other host */ 320 struct iphdr *old_iph = ip_hdr(skb); 321 u8 tos = old_iph->tos; 322 __be16 df = old_iph->frag_off; 323 sk_buff_data_t old_transport_header = skb->transport_header; 324 struct iphdr *iph; /* Our new IP header */ 325 int max_headroom; /* The extra header space needed */ 326 int mtu; 327 328 EnterFunction(10); 329 330 if (skb->protocol != htons(ETH_P_IP)) { 331 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " 332 "ETH_P_IP: %d, skb protocol: %d\n", 333 htons(ETH_P_IP), skb->protocol); 334 goto tx_error; 335 } 336 337 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) 338 goto tx_error_icmp; 339 340 tdev = rt->u.dst.dev; 341 342 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); 343 if (mtu < 68) { 344 ip_rt_put(rt); 345 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); 346 goto tx_error; 347 } 348 if (skb->dst) 349 skb->dst->ops->update_pmtu(skb->dst, mtu); 350 351 df |= (old_iph->frag_off & htons(IP_DF)); 352 353 if ((old_iph->frag_off & htons(IP_DF)) 354 && mtu < ntohs(old_iph->tot_len)) { 355 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 356 ip_rt_put(rt); 357 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); 358 goto tx_error; 359 } 360 361 /* 362 * Okay, now see if we can stuff it in the buffer as-is. 363 */ 364 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); 365 366 if (skb_headroom(skb) < max_headroom 367 || skb_cloned(skb) || skb_shared(skb)) { 368 struct sk_buff *new_skb = 369 skb_realloc_headroom(skb, max_headroom); 370 if (!new_skb) { 371 ip_rt_put(rt); 372 kfree_skb(skb); 373 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); 374 return NF_STOLEN; 375 } 376 kfree_skb(skb); 377 skb = new_skb; 378 old_iph = ip_hdr(skb); 379 } 380 381 skb->transport_header = old_transport_header; 382 383 /* fix old IP header checksum */ 384 ip_send_check(old_iph); 385 386 skb_push(skb, sizeof(struct iphdr)); 387 skb_reset_network_header(skb); 388 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 389 390 /* drop old route */ 391 dst_release(skb->dst); 392 skb->dst = &rt->u.dst; 393 394 /* 395 * Push down and install the IPIP header. 396 */ 397 iph = ip_hdr(skb); 398 iph->version = 4; 399 iph->ihl = sizeof(struct iphdr)>>2; 400 iph->frag_off = df; 401 iph->protocol = IPPROTO_IPIP; 402 iph->tos = tos; 403 iph->daddr = rt->rt_dst; 404 iph->saddr = rt->rt_src; 405 iph->ttl = old_iph->ttl; 406 iph->tot_len = htons(skb->len); 407 ip_select_ident(iph, &rt->u.dst, NULL); 408 ip_send_check(iph); 409 410 /* Another hack: avoid icmp_send in ip_fragment */ 411 skb->local_df = 1; 412 413 IP_VS_XMIT(skb, rt); 414 415 LeaveFunction(10); 416 417 return NF_STOLEN; 418 419 tx_error_icmp: 420 dst_link_failure(skb); 421 tx_error: 422 kfree_skb(skb); 423 LeaveFunction(10); 424 return NF_STOLEN; 425} 426 427 428/* 429 * Direct Routing transmitter 430 * Used for ANY protocol 431 */ 432int 433ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 434 struct ip_vs_protocol *pp) 435{ 436 struct rtable *rt; /* Route to the other host */ 437 struct iphdr *iph = ip_hdr(skb); 438 int mtu; 439 440 EnterFunction(10); 441 442 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 443 goto tx_error_icmp; 444 445 /* MTU checking */ 446 mtu = dst_mtu(&rt->u.dst); 447 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { 448 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 449 ip_rt_put(rt); 450 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); 451 goto tx_error; 452 } 453 454 /* 455 * Call ip_send_check because we are not sure it is called 456 * after ip_defrag. Is copy-on-write needed? 457 */ 458 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { 459 ip_rt_put(rt); 460 return NF_STOLEN; 461 } 462 ip_send_check(ip_hdr(skb)); 463 464 /* drop old route */ 465 dst_release(skb->dst); 466 skb->dst = &rt->u.dst; 467 468 /* Another hack: avoid icmp_send in ip_fragment */ 469 skb->local_df = 1; 470 471 IP_VS_XMIT(skb, rt); 472 473 LeaveFunction(10); 474 return NF_STOLEN; 475 476 tx_error_icmp: 477 dst_link_failure(skb); 478 tx_error: 479 kfree_skb(skb); 480 LeaveFunction(10); 481 return NF_STOLEN; 482} 483 484 485/* 486 * ICMP packet transmitter 487 * called by the ip_vs_in_icmp 488 */ 489int 490ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 491 struct ip_vs_protocol *pp, int offset) 492{ 493 struct rtable *rt; /* Route to the other host */ 494 int mtu; 495 int rc; 496 497 EnterFunction(10); 498 499 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be 500 forwarded directly here, because there is no need to 501 translate address/port back */ 502 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { 503 if (cp->packet_xmit) 504 rc = cp->packet_xmit(skb, cp, pp); 505 else 506 rc = NF_ACCEPT; 507 /* do not touch skb anymore */ 508 atomic_inc(&cp->in_pkts); 509 goto out; 510 } 511 512 /* 513 * mangle and send the packet here (only for VS/NAT) 514 */ 515 516 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) 517 goto tx_error_icmp; 518 519 /* MTU checking */ 520 mtu = dst_mtu(&rt->u.dst); 521 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { 522 ip_rt_put(rt); 523 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 524 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); 525 goto tx_error; 526 } 527 528 /* copy-on-write the packet before mangling it */ 529 if (!ip_vs_make_skb_writable(&skb, offset)) 530 goto tx_error_put; 531 532 if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) 533 goto tx_error_put; 534 535 /* drop the old route when skb is not shared */ 536 dst_release(skb->dst); 537 skb->dst = &rt->u.dst; 538 539 ip_vs_nat_icmp(skb, pp, cp, 0); 540 541 /* Another hack: avoid icmp_send in ip_fragment */ 542 skb->local_df = 1; 543 544 IP_VS_XMIT(skb, rt); 545 546 rc = NF_STOLEN; 547 goto out; 548 549 tx_error_icmp: 550 dst_link_failure(skb); 551 tx_error: 552 dev_kfree_skb(skb); 553 rc = NF_STOLEN; 554 out: 555 LeaveFunction(10); 556 return rc; 557 tx_error_put: 558 ip_rt_put(rt); 559 goto tx_error; 560} 561