1// SPDX-License-Identifier: GPL-2.0-or-later 2/* 3 * ip_vs_xmit.c: various packet transmitters for IPVS 4 * 5 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 6 * Julian Anastasov <ja@ssi.bg> 7 * 8 * Changes: 9 * 10 * Description of forwarding methods: 11 * - all transmitters are called from LOCAL_IN (remote clients) and 12 * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD 13 * - not all connections have destination server, for example, 14 * connections in backup server when fwmark is used 15 * - bypass connections use daddr from packet 16 * - we can use dst without ref while sending in RCU section, we use 17 * ref when returning NF_ACCEPT for NAT-ed packet via loopback 18 * LOCAL_OUT rules: 19 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) 20 * - skb->pkt_type is not set yet 21 * - the only place where we can see skb->sk != NULL 22 */ 23 24#define KMSG_COMPONENT "IPVS" 25#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 26 27#include <linux/kernel.h> 28#include <linux/slab.h> 29#include <linux/tcp.h> /* for tcphdr */ 30#include <net/ip.h> 31#include <net/gue.h> 32#include <net/gre.h> 33#include <net/tcp.h> /* for csum_tcpudp_magic */ 34#include <net/udp.h> 35#include <net/icmp.h> /* for icmp_send */ 36#include <net/route.h> /* for ip_route_output */ 37#include <net/ipv6.h> 38#include <net/ip6_route.h> 39#include <net/ip_tunnels.h> 40#include <net/ip6_checksum.h> 41#include <net/addrconf.h> 42#include <linux/icmpv6.h> 43#include <linux/netfilter.h> 44#include <linux/netfilter_ipv4.h> 45 46#include <net/ip_vs.h> 47 48enum { 49 IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */ 50 IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */ 51 IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to 52 * local 53 */ 54 IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ 55 IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ 56 IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */ 57}; 58 59static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) 60{ 61 return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); 62} 63 64static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst) 65{ 66 kfree(dest_dst); 67} 68 69/* 70 * Destination cache to speed up outgoing route lookup 71 */ 72static inline void 73__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, 74 struct dst_entry *dst, u32 dst_cookie) 75{ 76 struct ip_vs_dest_dst *old; 77 78 old = rcu_dereference_protected(dest->dest_dst, 79 lockdep_is_held(&dest->dst_lock)); 80 81 if (dest_dst) { 82 dest_dst->dst_cache = dst; 83 dest_dst->dst_cookie = dst_cookie; 84 } 85 rcu_assign_pointer(dest->dest_dst, dest_dst); 86 87 if (old) 88 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); 89} 90 91static inline struct ip_vs_dest_dst * 92__ip_vs_dst_check(struct ip_vs_dest *dest) 93{ 94 struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); 95 struct dst_entry *dst; 96 97 if (!dest_dst) 98 return NULL; 99 dst = dest_dst->dst_cache; 100 if (dst->obsolete && 101 dst->ops->check(dst, dest_dst->dst_cookie) == NULL) 102 return NULL; 103 return dest_dst; 104} 105 106static inline bool 107__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) 108{ 109 if (IP6CB(skb)->frag_max_size) { 110 /* frag_max_size tell us that, this packet have been 111 * defragmented by netfilter IPv6 conntrack module. 112 */ 113 if (IP6CB(skb)->frag_max_size > mtu) 114 return true; /* largest fragment violate MTU */ 115 } 116 else if (skb->len > mtu && !skb_is_gso(skb)) { 117 return true; /* Packet size violate MTU size */ 118 } 119 return false; 120} 121 122/* Get route to daddr, update *saddr, optionally bind route to saddr */ 123static struct rtable *do_output_route4(struct net *net, __be32 daddr, 124 int rt_mode, __be32 *saddr) 125{ 126 struct flowi4 fl4; 127 struct rtable *rt; 128 bool loop = false; 129 130 memset(&fl4, 0, sizeof(fl4)); 131 fl4.daddr = daddr; 132 fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? 133 FLOWI_FLAG_KNOWN_NH : 0; 134 135retry: 136 rt = ip_route_output_key(net, &fl4); 137 if (IS_ERR(rt)) { 138 /* Invalid saddr ? */ 139 if (PTR_ERR(rt) == -EINVAL && *saddr && 140 rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { 141 *saddr = 0; 142 flowi4_update_output(&fl4, 0, daddr, 0); 143 goto retry; 144 } 145 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); 146 return NULL; 147 } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { 148 ip_rt_put(rt); 149 *saddr = fl4.saddr; 150 flowi4_update_output(&fl4, 0, daddr, fl4.saddr); 151 loop = true; 152 goto retry; 153 } 154 *saddr = fl4.saddr; 155 return rt; 156} 157 158#ifdef CONFIG_IP_VS_IPV6 159static inline int __ip_vs_is_local_route6(struct rt6_info *rt) 160{ 161 return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; 162} 163#endif 164 165static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, 166 int rt_mode, 167 bool new_rt_is_local) 168{ 169 bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); 170 bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL); 171 bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR); 172 bool source_is_loopback; 173 bool old_rt_is_local; 174 175#ifdef CONFIG_IP_VS_IPV6 176 if (skb_af == AF_INET6) { 177 int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); 178 179 source_is_loopback = 180 (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && 181 (addr_type & IPV6_ADDR_LOOPBACK); 182 old_rt_is_local = __ip_vs_is_local_route6( 183 (struct rt6_info *)skb_dst(skb)); 184 } else 185#endif 186 { 187 source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr); 188 old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; 189 } 190 191 if (unlikely(new_rt_is_local)) { 192 if (!rt_mode_allow_local) 193 return true; 194 if (!rt_mode_allow_redirect && !old_rt_is_local) 195 return true; 196 } else { 197 if (!rt_mode_allow_non_local) 198 return true; 199 if (source_is_loopback) 200 return true; 201 } 202 return false; 203} 204 205static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) 206{ 207 struct sock *sk = skb->sk; 208 struct rtable *ort = skb_rtable(skb); 209 210 if (!skb->dev && sk && sk_fullsock(sk)) 211 ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true); 212} 213 214static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, 215 int rt_mode, 216 struct ip_vs_iphdr *ipvsh, 217 struct sk_buff *skb, int mtu) 218{ 219#ifdef CONFIG_IP_VS_IPV6 220 if (skb_af == AF_INET6) { 221 struct net *net = ipvs->net; 222 223 if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { 224 if (!skb->dev) 225 skb->dev = net->loopback_dev; 226 /* only send ICMP too big on first fragment */ 227 if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh)) 228 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 229 IP_VS_DBG(1, "frag needed for %pI6c\n", 230 &ipv6_hdr(skb)->saddr); 231 return false; 232 } 233 } else 234#endif 235 { 236 /* If we're going to tunnel the packet and pmtu discovery 237 * is disabled, we'll just fragment it anyway 238 */ 239 if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs)) 240 return true; 241 242 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && 243 skb->len > mtu && !skb_is_gso(skb) && 244 !ip_vs_iph_icmp(ipvsh))) { 245 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 246 htonl(mtu)); 247 IP_VS_DBG(1, "frag needed for %pI4\n", 248 &ip_hdr(skb)->saddr); 249 return false; 250 } 251 } 252 253 return true; 254} 255 256static inline bool decrement_ttl(struct netns_ipvs *ipvs, 257 int skb_af, 258 struct sk_buff *skb) 259{ 260 struct net *net = ipvs->net; 261 262#ifdef CONFIG_IP_VS_IPV6 263 if (skb_af == AF_INET6) { 264 struct dst_entry *dst = skb_dst(skb); 265 266 /* check and decrement ttl */ 267 if (ipv6_hdr(skb)->hop_limit <= 1) { 268 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); 269 270 /* Force OUTPUT device used as source address */ 271 skb->dev = dst->dev; 272 icmpv6_send(skb, ICMPV6_TIME_EXCEED, 273 ICMPV6_EXC_HOPLIMIT, 0); 274 IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 275 276 return false; 277 } 278 279 /* don't propagate ttl change to cloned packets */ 280 if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) 281 return false; 282 283 ipv6_hdr(skb)->hop_limit--; 284 } else 285#endif 286 { 287 if (ip_hdr(skb)->ttl <= 1) { 288 /* Tell the sender its packet died... */ 289 IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); 290 icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); 291 return false; 292 } 293 294 /* don't propagate ttl change to cloned packets */ 295 if (skb_ensure_writable(skb, sizeof(struct iphdr))) 296 return false; 297 298 /* Decrease ttl */ 299 ip_decrease_ttl(ip_hdr(skb)); 300 } 301 302 return true; 303} 304 305/* Get route to destination or remote server */ 306static int 307__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, 308 struct ip_vs_dest *dest, 309 __be32 daddr, int rt_mode, __be32 *ret_saddr, 310 struct ip_vs_iphdr *ipvsh) 311{ 312 struct net *net = ipvs->net; 313 struct ip_vs_dest_dst *dest_dst; 314 struct rtable *rt; /* Route to the other host */ 315 int mtu; 316 int local, noref = 1; 317 318 if (dest) { 319 dest_dst = __ip_vs_dst_check(dest); 320 if (likely(dest_dst)) 321 rt = (struct rtable *) dest_dst->dst_cache; 322 else { 323 dest_dst = ip_vs_dest_dst_alloc(); 324 spin_lock_bh(&dest->dst_lock); 325 if (!dest_dst) { 326 __ip_vs_dst_set(dest, NULL, NULL, 0); 327 spin_unlock_bh(&dest->dst_lock); 328 goto err_unreach; 329 } 330 rt = do_output_route4(net, dest->addr.ip, rt_mode, 331 &dest_dst->dst_saddr.ip); 332 if (!rt) { 333 __ip_vs_dst_set(dest, NULL, NULL, 0); 334 spin_unlock_bh(&dest->dst_lock); 335 ip_vs_dest_dst_free(dest_dst); 336 goto err_unreach; 337 } 338 __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); 339 spin_unlock_bh(&dest->dst_lock); 340 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", 341 &dest->addr.ip, &dest_dst->dst_saddr.ip, 342 rcuref_read(&rt->dst.__rcuref)); 343 } 344 if (ret_saddr) 345 *ret_saddr = dest_dst->dst_saddr.ip; 346 } else { 347 __be32 saddr = htonl(INADDR_ANY); 348 349 noref = 0; 350 351 /* For such unconfigured boxes avoid many route lookups 352 * for performance reasons because we do not remember saddr 353 */ 354 rt_mode &= ~IP_VS_RT_MODE_CONNECT; 355 rt = do_output_route4(net, daddr, rt_mode, &saddr); 356 if (!rt) 357 goto err_unreach; 358 if (ret_saddr) 359 *ret_saddr = saddr; 360 } 361 362 local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; 363 if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, 364 local))) { 365 IP_VS_DBG_RL("We are crossing local and non-local addresses" 366 " daddr=%pI4\n", &daddr); 367 goto err_put; 368 } 369 370 if (unlikely(local)) { 371 /* skb to local stack, preserve old route */ 372 if (!noref) 373 ip_rt_put(rt); 374 return local; 375 } 376 377 if (!decrement_ttl(ipvs, skb_af, skb)) 378 goto err_put; 379 380 if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { 381 mtu = dst_mtu(&rt->dst); 382 } else { 383 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 384 if (!dest) 385 goto err_put; 386 if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 387 mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); 388 if ((dest->tun_flags & 389 IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && 390 skb->ip_summed == CHECKSUM_PARTIAL) 391 mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; 392 } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { 393 __be16 tflags = 0; 394 395 if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) 396 tflags |= TUNNEL_CSUM; 397 mtu -= gre_calc_hlen(tflags); 398 } 399 if (mtu < 68) { 400 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); 401 goto err_put; 402 } 403 maybe_update_pmtu(skb_af, skb, mtu); 404 } 405 406 if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) 407 goto err_put; 408 409 skb_dst_drop(skb); 410 if (noref) 411 skb_dst_set_noref(skb, &rt->dst); 412 else 413 skb_dst_set(skb, &rt->dst); 414 415 return local; 416 417err_put: 418 if (!noref) 419 ip_rt_put(rt); 420 return -1; 421 422err_unreach: 423 dst_link_failure(skb); 424 return -1; 425} 426 427#ifdef CONFIG_IP_VS_IPV6 428static struct dst_entry * 429__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, 430 struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) 431{ 432 struct dst_entry *dst; 433 struct flowi6 fl6 = { 434 .daddr = *daddr, 435 }; 436 437 if (rt_mode & IP_VS_RT_MODE_KNOWN_NH) 438 fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; 439 440 dst = ip6_route_output(net, NULL, &fl6); 441 if (dst->error) 442 goto out_err; 443 if (!ret_saddr) 444 return dst; 445 if (ipv6_addr_any(&fl6.saddr) && 446 ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, 447 &fl6.daddr, 0, &fl6.saddr) < 0) 448 goto out_err; 449 if (do_xfrm) { 450 dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); 451 if (IS_ERR(dst)) { 452 dst = NULL; 453 goto out_err; 454 } 455 } 456 *ret_saddr = fl6.saddr; 457 return dst; 458 459out_err: 460 dst_release(dst); 461 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); 462 return NULL; 463} 464 465/* 466 * Get route to destination or remote server 467 */ 468static int 469__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, 470 struct ip_vs_dest *dest, 471 struct in6_addr *daddr, struct in6_addr *ret_saddr, 472 struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) 473{ 474 struct net *net = ipvs->net; 475 struct ip_vs_dest_dst *dest_dst; 476 struct rt6_info *rt; /* Route to the other host */ 477 struct dst_entry *dst; 478 int mtu; 479 int local, noref = 1; 480 481 if (dest) { 482 dest_dst = __ip_vs_dst_check(dest); 483 if (likely(dest_dst)) 484 rt = (struct rt6_info *) dest_dst->dst_cache; 485 else { 486 u32 cookie; 487 488 dest_dst = ip_vs_dest_dst_alloc(); 489 spin_lock_bh(&dest->dst_lock); 490 if (!dest_dst) { 491 __ip_vs_dst_set(dest, NULL, NULL, 0); 492 spin_unlock_bh(&dest->dst_lock); 493 goto err_unreach; 494 } 495 dst = __ip_vs_route_output_v6(net, &dest->addr.in6, 496 &dest_dst->dst_saddr.in6, 497 do_xfrm, rt_mode); 498 if (!dst) { 499 __ip_vs_dst_set(dest, NULL, NULL, 0); 500 spin_unlock_bh(&dest->dst_lock); 501 ip_vs_dest_dst_free(dest_dst); 502 goto err_unreach; 503 } 504 rt = (struct rt6_info *) dst; 505 cookie = rt6_get_cookie(rt); 506 __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); 507 spin_unlock_bh(&dest->dst_lock); 508 IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", 509 &dest->addr.in6, &dest_dst->dst_saddr.in6, 510 rcuref_read(&rt->dst.__rcuref)); 511 } 512 if (ret_saddr) 513 *ret_saddr = dest_dst->dst_saddr.in6; 514 } else { 515 noref = 0; 516 dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, 517 rt_mode); 518 if (!dst) 519 goto err_unreach; 520 rt = (struct rt6_info *) dst; 521 } 522 523 local = __ip_vs_is_local_route6(rt); 524 525 if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, 526 local))) { 527 IP_VS_DBG_RL("We are crossing local and non-local addresses" 528 " daddr=%pI6\n", daddr); 529 goto err_put; 530 } 531 532 if (unlikely(local)) { 533 /* skb to local stack, preserve old route */ 534 if (!noref) 535 dst_release(&rt->dst); 536 return local; 537 } 538 539 if (!decrement_ttl(ipvs, skb_af, skb)) 540 goto err_put; 541 542 /* MTU checking */ 543 if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) 544 mtu = dst_mtu(&rt->dst); 545 else { 546 mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); 547 if (!dest) 548 goto err_put; 549 if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 550 mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); 551 if ((dest->tun_flags & 552 IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && 553 skb->ip_summed == CHECKSUM_PARTIAL) 554 mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; 555 } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { 556 __be16 tflags = 0; 557 558 if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) 559 tflags |= TUNNEL_CSUM; 560 mtu -= gre_calc_hlen(tflags); 561 } 562 if (mtu < IPV6_MIN_MTU) { 563 IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, 564 IPV6_MIN_MTU); 565 goto err_put; 566 } 567 maybe_update_pmtu(skb_af, skb, mtu); 568 } 569 570 if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) 571 goto err_put; 572 573 skb_dst_drop(skb); 574 if (noref) 575 skb_dst_set_noref(skb, &rt->dst); 576 else 577 skb_dst_set(skb, &rt->dst); 578 579 return local; 580 581err_put: 582 if (!noref) 583 dst_release(&rt->dst); 584 return -1; 585 586err_unreach: 587 /* The ip6_link_failure function requires the dev field to be set 588 * in order to get the net (further for the sake of fwmark 589 * reflection). 590 */ 591 if (!skb->dev) 592 skb->dev = skb_dst(skb)->dev; 593 594 dst_link_failure(skb); 595 return -1; 596} 597#endif 598 599 600/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ 601static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, 602 struct ip_vs_conn *cp) 603{ 604 int ret = NF_ACCEPT; 605 606 skb->ipvs_property = 1; 607 if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) 608 ret = ip_vs_confirm_conntrack(skb); 609 if (ret == NF_ACCEPT) { 610 nf_reset_ct(skb); 611 skb_forward_csum(skb); 612 if (skb->dev) 613 skb_clear_tstamp(skb); 614 } 615 return ret; 616} 617 618/* In the event of a remote destination, it's possible that we would have 619 * matches against an old socket (particularly a TIME-WAIT socket). This 620 * causes havoc down the line (ip_local_out et. al. expect regular sockets 621 * and invalid memory accesses will happen) so simply drop the association 622 * in this case. 623*/ 624static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb) 625{ 626 /* If dev is set, the packet came from the LOCAL_IN callback and 627 * not from a local TCP socket. 628 */ 629 if (skb->dev) 630 skb_orphan(skb); 631} 632 633/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ 634static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, 635 struct ip_vs_conn *cp, int local) 636{ 637 int ret = NF_STOLEN; 638 639 skb->ipvs_property = 1; 640 if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) 641 ip_vs_notrack(skb); 642 else 643 ip_vs_update_conntrack(skb, cp, 1); 644 645 /* Remove the early_demux association unless it's bound for the 646 * exact same port and address on this host after translation. 647 */ 648 if (!local || cp->vport != cp->dport || 649 !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr)) 650 ip_vs_drop_early_demux_sk(skb); 651 652 if (!local) { 653 skb_forward_csum(skb); 654 if (skb->dev) 655 skb_clear_tstamp(skb); 656 NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, 657 NULL, skb_dst(skb)->dev, dst_output); 658 } else 659 ret = NF_ACCEPT; 660 661 return ret; 662} 663 664/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ 665static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, 666 struct ip_vs_conn *cp, int local) 667{ 668 int ret = NF_STOLEN; 669 670 skb->ipvs_property = 1; 671 if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) 672 ip_vs_notrack(skb); 673 if (!local) { 674 ip_vs_drop_early_demux_sk(skb); 675 skb_forward_csum(skb); 676 if (skb->dev) 677 skb_clear_tstamp(skb); 678 NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, 679 NULL, skb_dst(skb)->dev, dst_output); 680 } else 681 ret = NF_ACCEPT; 682 return ret; 683} 684 685 686/* 687 * NULL transmitter (do nothing except return NF_ACCEPT) 688 */ 689int 690ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 691 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 692{ 693 /* we do not touch skb and do not need pskb ptr */ 694 return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); 695} 696 697 698/* 699 * Bypass transmitter 700 * Let packets bypass the destination when the destination is not 701 * available, it may be only used in transparent cache cluster. 702 */ 703int 704ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 705 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 706{ 707 struct iphdr *iph = ip_hdr(skb); 708 709 if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, 710 IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) 711 goto tx_error; 712 713 ip_send_check(iph); 714 715 /* Another hack: avoid icmp_send in ip_fragment */ 716 skb->ignore_df = 1; 717 718 ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); 719 720 return NF_STOLEN; 721 722 tx_error: 723 kfree_skb(skb); 724 return NF_STOLEN; 725} 726 727#ifdef CONFIG_IP_VS_IPV6 728int 729ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 730 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 731{ 732 struct ipv6hdr *iph = ipv6_hdr(skb); 733 734 if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, 735 &iph->daddr, NULL, 736 ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) 737 goto tx_error; 738 739 /* Another hack: avoid icmp_send in ip_fragment */ 740 skb->ignore_df = 1; 741 742 ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); 743 744 return NF_STOLEN; 745 746 tx_error: 747 kfree_skb(skb); 748 return NF_STOLEN; 749} 750#endif 751 752/* 753 * NAT transmitter (only for outside-to-inside nat forwarding) 754 * Not used for related ICMP 755 */ 756int 757ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 758 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 759{ 760 struct rtable *rt; /* Route to the other host */ 761 int local, rc, was_input; 762 763 /* check if it is a connection of no-client-port */ 764 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { 765 __be16 _pt, *p; 766 767 p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); 768 if (p == NULL) 769 goto tx_error; 770 ip_vs_conn_fill_cport(cp, *p); 771 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 772 } 773 774 was_input = rt_is_input_route(skb_rtable(skb)); 775 local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, 776 IP_VS_RT_MODE_LOCAL | 777 IP_VS_RT_MODE_NON_LOCAL | 778 IP_VS_RT_MODE_RDR, NULL, ipvsh); 779 if (local < 0) 780 goto tx_error; 781 rt = skb_rtable(skb); 782 /* 783 * Avoid duplicate tuple in reply direction for NAT traffic 784 * to local address when connection is sync-ed 785 */ 786#if IS_ENABLED(CONFIG_NF_CONNTRACK) 787 if (cp->flags & IP_VS_CONN_F_SYNC && local) { 788 enum ip_conntrack_info ctinfo; 789 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 790 791 if (ct) { 792 IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off, 793 "ip_vs_nat_xmit(): " 794 "stopping DNAT to local address"); 795 goto tx_error; 796 } 797 } 798#endif 799 800 /* From world but DNAT to loopback address? */ 801 if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { 802 IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off, 803 "ip_vs_nat_xmit(): stopping DNAT to loopback " 804 "address"); 805 goto tx_error; 806 } 807 808 /* copy-on-write the packet before mangling it */ 809 if (skb_ensure_writable(skb, sizeof(struct iphdr))) 810 goto tx_error; 811 812 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 813 goto tx_error; 814 815 /* mangle the packet */ 816 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) 817 goto tx_error; 818 ip_hdr(skb)->daddr = cp->daddr.ip; 819 ip_send_check(ip_hdr(skb)); 820 821 IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT"); 822 823 /* FIXME: when application helper enlarges the packet and the length 824 is larger than the MTU of outgoing device, there will be still 825 MTU problem. */ 826 827 /* Another hack: avoid icmp_send in ip_fragment */ 828 skb->ignore_df = 1; 829 830 rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); 831 832 return rc; 833 834 tx_error: 835 kfree_skb(skb); 836 return NF_STOLEN; 837} 838 839#ifdef CONFIG_IP_VS_IPV6 840int 841ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 842 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 843{ 844 struct rt6_info *rt; /* Route to the other host */ 845 int local, rc; 846 847 /* check if it is a connection of no-client-port */ 848 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { 849 __be16 _pt, *p; 850 p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); 851 if (p == NULL) 852 goto tx_error; 853 ip_vs_conn_fill_cport(cp, *p); 854 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 855 } 856 857 local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, 858 &cp->daddr.in6, 859 NULL, ipvsh, 0, 860 IP_VS_RT_MODE_LOCAL | 861 IP_VS_RT_MODE_NON_LOCAL | 862 IP_VS_RT_MODE_RDR); 863 if (local < 0) 864 goto tx_error; 865 rt = (struct rt6_info *) skb_dst(skb); 866 /* 867 * Avoid duplicate tuple in reply direction for NAT traffic 868 * to local address when connection is sync-ed 869 */ 870#if IS_ENABLED(CONFIG_NF_CONNTRACK) 871 if (cp->flags & IP_VS_CONN_F_SYNC && local) { 872 enum ip_conntrack_info ctinfo; 873 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 874 875 if (ct) { 876 IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off, 877 "ip_vs_nat_xmit_v6(): " 878 "stopping DNAT to local address"); 879 goto tx_error; 880 } 881 } 882#endif 883 884 /* From world but DNAT to loopback address? */ 885 if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && 886 ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { 887 IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off, 888 "ip_vs_nat_xmit_v6(): " 889 "stopping DNAT to loopback address"); 890 goto tx_error; 891 } 892 893 /* copy-on-write the packet before mangling it */ 894 if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) 895 goto tx_error; 896 897 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 898 goto tx_error; 899 900 /* mangle the packet */ 901 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) 902 goto tx_error; 903 ipv6_hdr(skb)->daddr = cp->daddr.in6; 904 905 IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT"); 906 907 /* FIXME: when application helper enlarges the packet and the length 908 is larger than the MTU of outgoing device, there will be still 909 MTU problem. */ 910 911 /* Another hack: avoid icmp_send in ip_fragment */ 912 skb->ignore_df = 1; 913 914 rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); 915 916 return rc; 917 918tx_error: 919 kfree_skb(skb); 920 return NF_STOLEN; 921} 922#endif 923 924/* When forwarding a packet, we must ensure that we've got enough headroom 925 * for the encapsulation packet in the skb. This also gives us an 926 * opportunity to figure out what the payload_len, dsfield, ttl, and df 927 * values should be, so that we won't need to look at the old ip header 928 * again 929 */ 930static struct sk_buff * 931ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, 932 unsigned int max_headroom, __u8 *next_protocol, 933 __u32 *payload_len, __u8 *dsfield, __u8 *ttl, 934 __be16 *df) 935{ 936 struct sk_buff *new_skb = NULL; 937 struct iphdr *old_iph = NULL; 938 __u8 old_dsfield; 939#ifdef CONFIG_IP_VS_IPV6 940 struct ipv6hdr *old_ipv6h = NULL; 941#endif 942 943 ip_vs_drop_early_demux_sk(skb); 944 945 if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { 946 new_skb = skb_realloc_headroom(skb, max_headroom); 947 if (!new_skb) 948 goto error; 949 if (skb->sk) 950 skb_set_owner_w(new_skb, skb->sk); 951 consume_skb(skb); 952 skb = new_skb; 953 } 954 955#ifdef CONFIG_IP_VS_IPV6 956 if (skb_af == AF_INET6) { 957 old_ipv6h = ipv6_hdr(skb); 958 *next_protocol = IPPROTO_IPV6; 959 if (payload_len) 960 *payload_len = 961 ntohs(old_ipv6h->payload_len) + 962 sizeof(*old_ipv6h); 963 old_dsfield = ipv6_get_dsfield(old_ipv6h); 964 *ttl = old_ipv6h->hop_limit; 965 if (df) 966 *df = 0; 967 } else 968#endif 969 { 970 old_iph = ip_hdr(skb); 971 /* Copy DF, reset fragment offset and MF */ 972 if (df) 973 *df = (old_iph->frag_off & htons(IP_DF)); 974 *next_protocol = IPPROTO_IPIP; 975 976 /* fix old IP header checksum */ 977 ip_send_check(old_iph); 978 old_dsfield = ipv4_get_dsfield(old_iph); 979 *ttl = old_iph->ttl; 980 if (payload_len) 981 *payload_len = skb_ip_totlen(skb); 982 } 983 984 /* Implement full-functionality option for ECN encapsulation */ 985 *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield); 986 987 return skb; 988error: 989 kfree_skb(skb); 990 return ERR_PTR(-ENOMEM); 991} 992 993static inline int __tun_gso_type_mask(int encaps_af, int orig_af) 994{ 995 switch (encaps_af) { 996 case AF_INET: 997 return SKB_GSO_IPXIP4; 998 case AF_INET6: 999 return SKB_GSO_IPXIP6; 1000 default: 1001 return 0; 1002 } 1003} 1004 1005static int 1006ipvs_gue_encap(struct net *net, struct sk_buff *skb, 1007 struct ip_vs_conn *cp, __u8 *next_protocol) 1008{ 1009 __be16 dport; 1010 __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); 1011 struct udphdr *udph; /* Our new UDP header */ 1012 struct guehdr *gueh; /* Our new GUE header */ 1013 size_t hdrlen, optlen = 0; 1014 void *data; 1015 bool need_priv = false; 1016 1017 if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && 1018 skb->ip_summed == CHECKSUM_PARTIAL) { 1019 optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; 1020 need_priv = true; 1021 } 1022 1023 hdrlen = sizeof(struct guehdr) + optlen; 1024 1025 skb_push(skb, hdrlen); 1026 1027 gueh = (struct guehdr *)skb->data; 1028 1029 gueh->control = 0; 1030 gueh->version = 0; 1031 gueh->hlen = optlen >> 2; 1032 gueh->flags = 0; 1033 gueh->proto_ctype = *next_protocol; 1034 1035 data = &gueh[1]; 1036 1037 if (need_priv) { 1038 __be32 *flags = data; 1039 u16 csum_start = skb_checksum_start_offset(skb); 1040 __be16 *pd; 1041 1042 gueh->flags |= GUE_FLAG_PRIV; 1043 *flags = 0; 1044 data += GUE_LEN_PRIV; 1045 1046 if (csum_start < hdrlen) 1047 return -EINVAL; 1048 1049 csum_start -= hdrlen; 1050 pd = data; 1051 pd[0] = htons(csum_start); 1052 pd[1] = htons(csum_start + skb->csum_offset); 1053 1054 if (!skb_is_gso(skb)) { 1055 skb->ip_summed = CHECKSUM_NONE; 1056 skb->encapsulation = 0; 1057 } 1058 1059 *flags |= GUE_PFLAG_REMCSUM; 1060 data += GUE_PLEN_REMCSUM; 1061 } 1062 1063 skb_push(skb, sizeof(struct udphdr)); 1064 skb_reset_transport_header(skb); 1065 1066 udph = udp_hdr(skb); 1067 1068 dport = cp->dest->tun_port; 1069 udph->dest = dport; 1070 udph->source = sport; 1071 udph->len = htons(skb->len); 1072 udph->check = 0; 1073 1074 *next_protocol = IPPROTO_UDP; 1075 1076 return 0; 1077} 1078 1079static void 1080ipvs_gre_encap(struct net *net, struct sk_buff *skb, 1081 struct ip_vs_conn *cp, __u8 *next_protocol) 1082{ 1083 __be16 proto = *next_protocol == IPPROTO_IPIP ? 1084 htons(ETH_P_IP) : htons(ETH_P_IPV6); 1085 __be16 tflags = 0; 1086 size_t hdrlen; 1087 1088 if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) 1089 tflags |= TUNNEL_CSUM; 1090 1091 hdrlen = gre_calc_hlen(tflags); 1092 gre_build_header(skb, hdrlen, tflags, proto, 0, 0); 1093 1094 *next_protocol = IPPROTO_GRE; 1095} 1096 1097/* 1098 * IP Tunneling transmitter 1099 * 1100 * This function encapsulates the packet in a new IP packet, its 1101 * destination will be set to cp->daddr. Most code of this function 1102 * is taken from ipip.c. 1103 * 1104 * It is used in VS/TUN cluster. The load balancer selects a real 1105 * server from a cluster based on a scheduling algorithm, 1106 * encapsulates the request packet and forwards it to the selected 1107 * server. For example, all real servers are configured with 1108 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives 1109 * the encapsulated packet, it will decapsulate the packet, processe 1110 * the request and return the response packets directly to the client 1111 * without passing the load balancer. This can greatly increase the 1112 * scalability of virtual server. 1113 * 1114 * Used for ANY protocol 1115 */ 1116int 1117ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 1118 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 1119{ 1120 struct netns_ipvs *ipvs = cp->ipvs; 1121 struct net *net = ipvs->net; 1122 struct rtable *rt; /* Route to the other host */ 1123 __be32 saddr; /* Source for tunnel */ 1124 struct net_device *tdev; /* Device to other host */ 1125 __u8 next_protocol = 0; 1126 __u8 dsfield = 0; 1127 __u8 ttl = 0; 1128 __be16 df = 0; 1129 __be16 *dfp = NULL; 1130 struct iphdr *iph; /* Our new IP header */ 1131 unsigned int max_headroom; /* The extra header space needed */ 1132 int ret, local; 1133 int tun_type, gso_type; 1134 int tun_flags; 1135 1136 local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, 1137 IP_VS_RT_MODE_LOCAL | 1138 IP_VS_RT_MODE_NON_LOCAL | 1139 IP_VS_RT_MODE_CONNECT | 1140 IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh); 1141 if (local < 0) 1142 goto tx_error; 1143 if (local) 1144 return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); 1145 1146 rt = skb_rtable(skb); 1147 tdev = rt->dst.dev; 1148 1149 /* 1150 * Okay, now see if we can stuff it in the buffer as-is. 1151 */ 1152 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); 1153 1154 tun_type = cp->dest->tun_type; 1155 tun_flags = cp->dest->tun_flags; 1156 1157 if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1158 size_t gue_hdrlen, gue_optlen = 0; 1159 1160 if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && 1161 skb->ip_summed == CHECKSUM_PARTIAL) { 1162 gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; 1163 } 1164 gue_hdrlen = sizeof(struct guehdr) + gue_optlen; 1165 1166 max_headroom += sizeof(struct udphdr) + gue_hdrlen; 1167 } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { 1168 size_t gre_hdrlen; 1169 __be16 tflags = 0; 1170 1171 if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) 1172 tflags |= TUNNEL_CSUM; 1173 gre_hdrlen = gre_calc_hlen(tflags); 1174 1175 max_headroom += gre_hdrlen; 1176 } 1177 1178 /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ 1179 dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; 1180 skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, 1181 &next_protocol, NULL, &dsfield, 1182 &ttl, dfp); 1183 if (IS_ERR(skb)) 1184 return NF_STOLEN; 1185 1186 gso_type = __tun_gso_type_mask(AF_INET, cp->af); 1187 if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1188 if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || 1189 (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) 1190 gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; 1191 else 1192 gso_type |= SKB_GSO_UDP_TUNNEL; 1193 if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && 1194 skb->ip_summed == CHECKSUM_PARTIAL) { 1195 gso_type |= SKB_GSO_TUNNEL_REMCSUM; 1196 } 1197 } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { 1198 if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) 1199 gso_type |= SKB_GSO_GRE_CSUM; 1200 else 1201 gso_type |= SKB_GSO_GRE; 1202 } 1203 1204 if (iptunnel_handle_offloads(skb, gso_type)) 1205 goto tx_error; 1206 1207 skb->transport_header = skb->network_header; 1208 1209 skb_set_inner_ipproto(skb, next_protocol); 1210 skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); 1211 1212 if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1213 bool check = false; 1214 1215 if (ipvs_gue_encap(net, skb, cp, &next_protocol)) 1216 goto tx_error; 1217 1218 if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || 1219 (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) 1220 check = true; 1221 1222 udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); 1223 } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) 1224 ipvs_gre_encap(net, skb, cp, &next_protocol); 1225 1226 skb_push(skb, sizeof(struct iphdr)); 1227 skb_reset_network_header(skb); 1228 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 1229 1230 /* 1231 * Push down and install the IPIP header. 1232 */ 1233 iph = ip_hdr(skb); 1234 iph->version = 4; 1235 iph->ihl = sizeof(struct iphdr)>>2; 1236 iph->frag_off = df; 1237 iph->protocol = next_protocol; 1238 iph->tos = dsfield; 1239 iph->daddr = cp->daddr.ip; 1240 iph->saddr = saddr; 1241 iph->ttl = ttl; 1242 ip_select_ident(net, skb, NULL); 1243 1244 /* Another hack: avoid icmp_send in ip_fragment */ 1245 skb->ignore_df = 1; 1246 1247 ret = ip_vs_tunnel_xmit_prepare(skb, cp); 1248 if (ret == NF_ACCEPT) 1249 ip_local_out(net, skb->sk, skb); 1250 else if (ret == NF_DROP) 1251 kfree_skb(skb); 1252 1253 return NF_STOLEN; 1254 1255 tx_error: 1256 kfree_skb(skb); 1257 return NF_STOLEN; 1258} 1259 1260#ifdef CONFIG_IP_VS_IPV6 1261int 1262ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 1263 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 1264{ 1265 struct netns_ipvs *ipvs = cp->ipvs; 1266 struct net *net = ipvs->net; 1267 struct rt6_info *rt; /* Route to the other host */ 1268 struct in6_addr saddr; /* Source for tunnel */ 1269 struct net_device *tdev; /* Device to other host */ 1270 __u8 next_protocol = 0; 1271 __u32 payload_len = 0; 1272 __u8 dsfield = 0; 1273 __u8 ttl = 0; 1274 struct ipv6hdr *iph; /* Our new IP header */ 1275 unsigned int max_headroom; /* The extra header space needed */ 1276 int ret, local; 1277 int tun_type, gso_type; 1278 int tun_flags; 1279 1280 local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest, 1281 &cp->daddr.in6, 1282 &saddr, ipvsh, 1, 1283 IP_VS_RT_MODE_LOCAL | 1284 IP_VS_RT_MODE_NON_LOCAL | 1285 IP_VS_RT_MODE_TUNNEL); 1286 if (local < 0) 1287 goto tx_error; 1288 if (local) 1289 return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); 1290 1291 rt = (struct rt6_info *) skb_dst(skb); 1292 tdev = rt->dst.dev; 1293 1294 /* 1295 * Okay, now see if we can stuff it in the buffer as-is. 1296 */ 1297 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); 1298 1299 tun_type = cp->dest->tun_type; 1300 tun_flags = cp->dest->tun_flags; 1301 1302 if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1303 size_t gue_hdrlen, gue_optlen = 0; 1304 1305 if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && 1306 skb->ip_summed == CHECKSUM_PARTIAL) { 1307 gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; 1308 } 1309 gue_hdrlen = sizeof(struct guehdr) + gue_optlen; 1310 1311 max_headroom += sizeof(struct udphdr) + gue_hdrlen; 1312 } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { 1313 size_t gre_hdrlen; 1314 __be16 tflags = 0; 1315 1316 if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) 1317 tflags |= TUNNEL_CSUM; 1318 gre_hdrlen = gre_calc_hlen(tflags); 1319 1320 max_headroom += gre_hdrlen; 1321 } 1322 1323 skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, 1324 &next_protocol, &payload_len, 1325 &dsfield, &ttl, NULL); 1326 if (IS_ERR(skb)) 1327 return NF_STOLEN; 1328 1329 gso_type = __tun_gso_type_mask(AF_INET6, cp->af); 1330 if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1331 if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || 1332 (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) 1333 gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; 1334 else 1335 gso_type |= SKB_GSO_UDP_TUNNEL; 1336 if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && 1337 skb->ip_summed == CHECKSUM_PARTIAL) { 1338 gso_type |= SKB_GSO_TUNNEL_REMCSUM; 1339 } 1340 } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { 1341 if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) 1342 gso_type |= SKB_GSO_GRE_CSUM; 1343 else 1344 gso_type |= SKB_GSO_GRE; 1345 } 1346 1347 if (iptunnel_handle_offloads(skb, gso_type)) 1348 goto tx_error; 1349 1350 skb->transport_header = skb->network_header; 1351 1352 skb_set_inner_ipproto(skb, next_protocol); 1353 skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); 1354 1355 if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1356 bool check = false; 1357 1358 if (ipvs_gue_encap(net, skb, cp, &next_protocol)) 1359 goto tx_error; 1360 1361 if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || 1362 (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) 1363 check = true; 1364 1365 udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); 1366 } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) 1367 ipvs_gre_encap(net, skb, cp, &next_protocol); 1368 1369 skb_push(skb, sizeof(struct ipv6hdr)); 1370 skb_reset_network_header(skb); 1371 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 1372 1373 /* 1374 * Push down and install the IPIP header. 1375 */ 1376 iph = ipv6_hdr(skb); 1377 iph->version = 6; 1378 iph->nexthdr = next_protocol; 1379 iph->payload_len = htons(payload_len); 1380 memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); 1381 ipv6_change_dsfield(iph, 0, dsfield); 1382 iph->daddr = cp->daddr.in6; 1383 iph->saddr = saddr; 1384 iph->hop_limit = ttl; 1385 1386 /* Another hack: avoid icmp_send in ip_fragment */ 1387 skb->ignore_df = 1; 1388 1389 ret = ip_vs_tunnel_xmit_prepare(skb, cp); 1390 if (ret == NF_ACCEPT) 1391 ip6_local_out(net, skb->sk, skb); 1392 else if (ret == NF_DROP) 1393 kfree_skb(skb); 1394 1395 return NF_STOLEN; 1396 1397tx_error: 1398 kfree_skb(skb); 1399 return NF_STOLEN; 1400} 1401#endif 1402 1403 1404/* 1405 * Direct Routing transmitter 1406 * Used for ANY protocol 1407 */ 1408int 1409ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 1410 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 1411{ 1412 int local; 1413 1414 local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, 1415 IP_VS_RT_MODE_LOCAL | 1416 IP_VS_RT_MODE_NON_LOCAL | 1417 IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); 1418 if (local < 0) 1419 goto tx_error; 1420 if (local) 1421 return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); 1422 1423 ip_send_check(ip_hdr(skb)); 1424 1425 /* Another hack: avoid icmp_send in ip_fragment */ 1426 skb->ignore_df = 1; 1427 1428 ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); 1429 1430 return NF_STOLEN; 1431 1432 tx_error: 1433 kfree_skb(skb); 1434 return NF_STOLEN; 1435} 1436 1437#ifdef CONFIG_IP_VS_IPV6 1438int 1439ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 1440 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 1441{ 1442 int local; 1443 1444 local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, 1445 &cp->daddr.in6, 1446 NULL, ipvsh, 0, 1447 IP_VS_RT_MODE_LOCAL | 1448 IP_VS_RT_MODE_NON_LOCAL | 1449 IP_VS_RT_MODE_KNOWN_NH); 1450 if (local < 0) 1451 goto tx_error; 1452 if (local) 1453 return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); 1454 1455 /* Another hack: avoid icmp_send in ip_fragment */ 1456 skb->ignore_df = 1; 1457 1458 ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); 1459 1460 return NF_STOLEN; 1461 1462tx_error: 1463 kfree_skb(skb); 1464 return NF_STOLEN; 1465} 1466#endif 1467 1468 1469/* 1470 * ICMP packet transmitter 1471 * called by the ip_vs_in_icmp 1472 */ 1473int 1474ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 1475 struct ip_vs_protocol *pp, int offset, unsigned int hooknum, 1476 struct ip_vs_iphdr *iph) 1477{ 1478 struct rtable *rt; /* Route to the other host */ 1479 int rc; 1480 int local; 1481 int rt_mode, was_input; 1482 1483 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be 1484 forwarded directly here, because there is no need to 1485 translate address/port back */ 1486 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { 1487 if (cp->packet_xmit) 1488 rc = cp->packet_xmit(skb, cp, pp, iph); 1489 else 1490 rc = NF_ACCEPT; 1491 /* do not touch skb anymore */ 1492 atomic_inc(&cp->in_pkts); 1493 return rc; 1494 } 1495 1496 /* 1497 * mangle and send the packet here (only for VS/NAT) 1498 */ 1499 was_input = rt_is_input_route(skb_rtable(skb)); 1500 1501 /* LOCALNODE from FORWARD hook is not supported */ 1502 rt_mode = (hooknum != NF_INET_FORWARD) ? 1503 IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | 1504 IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; 1505 local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, 1506 NULL, iph); 1507 if (local < 0) 1508 goto tx_error; 1509 rt = skb_rtable(skb); 1510 1511 /* 1512 * Avoid duplicate tuple in reply direction for NAT traffic 1513 * to local address when connection is sync-ed 1514 */ 1515#if IS_ENABLED(CONFIG_NF_CONNTRACK) 1516 if (cp->flags & IP_VS_CONN_F_SYNC && local) { 1517 enum ip_conntrack_info ctinfo; 1518 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 1519 1520 if (ct) { 1521 IP_VS_DBG(10, "%s(): " 1522 "stopping DNAT to local address %pI4\n", 1523 __func__, &cp->daddr.ip); 1524 goto tx_error; 1525 } 1526 } 1527#endif 1528 1529 /* From world but DNAT to loopback address? */ 1530 if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { 1531 IP_VS_DBG(1, "%s(): " 1532 "stopping DNAT to loopback %pI4\n", 1533 __func__, &cp->daddr.ip); 1534 goto tx_error; 1535 } 1536 1537 /* copy-on-write the packet before mangling it */ 1538 if (skb_ensure_writable(skb, offset)) 1539 goto tx_error; 1540 1541 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 1542 goto tx_error; 1543 1544 ip_vs_nat_icmp(skb, pp, cp, 0); 1545 1546 /* Another hack: avoid icmp_send in ip_fragment */ 1547 skb->ignore_df = 1; 1548 1549 return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); 1550 1551 tx_error: 1552 kfree_skb(skb); 1553 rc = NF_STOLEN; 1554 return rc; 1555} 1556 1557#ifdef CONFIG_IP_VS_IPV6 1558int 1559ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 1560 struct ip_vs_protocol *pp, int offset, unsigned int hooknum, 1561 struct ip_vs_iphdr *ipvsh) 1562{ 1563 struct rt6_info *rt; /* Route to the other host */ 1564 int rc; 1565 int local; 1566 int rt_mode; 1567 1568 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be 1569 forwarded directly here, because there is no need to 1570 translate address/port back */ 1571 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { 1572 if (cp->packet_xmit) 1573 rc = cp->packet_xmit(skb, cp, pp, ipvsh); 1574 else 1575 rc = NF_ACCEPT; 1576 /* do not touch skb anymore */ 1577 atomic_inc(&cp->in_pkts); 1578 return rc; 1579 } 1580 1581 /* 1582 * mangle and send the packet here (only for VS/NAT) 1583 */ 1584 1585 /* LOCALNODE from FORWARD hook is not supported */ 1586 rt_mode = (hooknum != NF_INET_FORWARD) ? 1587 IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | 1588 IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; 1589 local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, 1590 &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); 1591 if (local < 0) 1592 goto tx_error; 1593 rt = (struct rt6_info *) skb_dst(skb); 1594 /* 1595 * Avoid duplicate tuple in reply direction for NAT traffic 1596 * to local address when connection is sync-ed 1597 */ 1598#if IS_ENABLED(CONFIG_NF_CONNTRACK) 1599 if (cp->flags & IP_VS_CONN_F_SYNC && local) { 1600 enum ip_conntrack_info ctinfo; 1601 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 1602 1603 if (ct) { 1604 IP_VS_DBG(10, "%s(): " 1605 "stopping DNAT to local address %pI6\n", 1606 __func__, &cp->daddr.in6); 1607 goto tx_error; 1608 } 1609 } 1610#endif 1611 1612 /* From world but DNAT to loopback address? */ 1613 if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && 1614 ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { 1615 IP_VS_DBG(1, "%s(): " 1616 "stopping DNAT to loopback %pI6\n", 1617 __func__, &cp->daddr.in6); 1618 goto tx_error; 1619 } 1620 1621 /* copy-on-write the packet before mangling it */ 1622 if (skb_ensure_writable(skb, offset)) 1623 goto tx_error; 1624 1625 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 1626 goto tx_error; 1627 1628 ip_vs_nat_icmp_v6(skb, pp, cp, 0); 1629 1630 /* Another hack: avoid icmp_send in ip_fragment */ 1631 skb->ignore_df = 1; 1632 1633 return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); 1634 1635tx_error: 1636 kfree_skb(skb); 1637 rc = NF_STOLEN; 1638 return rc; 1639} 1640#endif 1641