1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the NetFilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * 10 * ip_vs_sync: sync connection info from master load balancer to backups 11 * through multicast 12 * 13 * Changes: 14 * Alexandre Cassen : Added master & backup support at a time. 15 * Alexandre Cassen : Added SyncID support for incoming sync 16 * messages filtering. 17 * Justin Ossevoort : Fix endian problem on sync message size. 18 */ 19 20#define KMSG_COMPONENT "IPVS" 21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 22 23#include <linux/module.h> 24#include <linux/slab.h> 25#include <linux/inetdevice.h> 26#include <linux/net.h> 27#include <linux/completion.h> 28#include <linux/delay.h> 29#include <linux/skbuff.h> 30#include <linux/in.h> 31#include <linux/igmp.h> /* for ip_mc_join_group */ 32#include <linux/udp.h> 33#include <linux/err.h> 34#include <linux/kthread.h> 35#include <linux/wait.h> 36#include <linux/kernel.h> 37 38#include <net/ip.h> 39#include <net/sock.h> 40 41#include <net/ip_vs.h> 42 43#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ 44#define IP_VS_SYNC_PORT 8848 /* multicast port */ 45 46 47/* 48 * IPVS sync connection entry 49 */ 50struct ip_vs_sync_conn { 51 __u8 reserved; 52 53 /* Protocol, addresses and port numbers */ 54 __u8 protocol; /* Which protocol (TCP/UDP) */ 55 __be16 cport; 56 __be16 vport; 57 __be16 dport; 58 __be32 caddr; /* client address */ 59 __be32 vaddr; /* virtual address */ 60 __be32 daddr; /* destination address */ 61 62 /* Flags and state transition */ 63 __be16 flags; /* status flags */ 64 __be16 state; /* state info */ 65 66 /* The sequence options start here */ 67}; 68 69struct ip_vs_sync_conn_options { 70 struct ip_vs_seq in_seq; /* incoming seq. struct */ 71 struct ip_vs_seq out_seq; /* outgoing seq. struct */ 72}; 73 74struct ip_vs_sync_thread_data { 75 struct socket *sock; 76 char *buf; 77}; 78 79#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) 80#define FULL_CONN_SIZE \ 81(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) 82 83 84/* 85 The master mulitcasts messages to the backup load balancers in the 86 following format. 87 88 0 1 2 3 89 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 90 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 91 | Count Conns | SyncID | Size | 92 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 93 | | 94 | IPVS Sync Connection (1) | 95 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 96 | . | 97 | . | 98 | . | 99 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 100 | | 101 | IPVS Sync Connection (n) | 102 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 103*/ 104 105#define SYNC_MESG_HEADER_LEN 4 106#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ 107 108struct ip_vs_sync_mesg { 109 __u8 nr_conns; 110 __u8 syncid; 111 __u16 size; 112 113 /* ip_vs_sync_conn entries start here */ 114}; 115 116/* the maximum length of sync (sending/receiving) message */ 117static int sync_send_mesg_maxlen; 118static int sync_recv_mesg_maxlen; 119 120struct ip_vs_sync_buff { 121 struct list_head list; 122 unsigned long firstuse; 123 124 /* pointers for the message data */ 125 struct ip_vs_sync_mesg *mesg; 126 unsigned char *head; 127 unsigned char *end; 128}; 129 130 131/* the sync_buff list head and the lock */ 132static LIST_HEAD(ip_vs_sync_queue); 133static DEFINE_SPINLOCK(ip_vs_sync_lock); 134 135/* current sync_buff for accepting new conn entries */ 136static struct ip_vs_sync_buff *curr_sb = NULL; 137static DEFINE_SPINLOCK(curr_sb_lock); 138 139/* ipvs sync daemon state */ 140volatile int ip_vs_sync_state = IP_VS_STATE_NONE; 141volatile int ip_vs_master_syncid = 0; 142volatile int ip_vs_backup_syncid = 0; 143 144/* multicast interface name */ 145char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; 146char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; 147 148/* sync daemon tasks */ 149static struct task_struct *sync_master_thread; 150static struct task_struct *sync_backup_thread; 151 152/* multicast addr */ 153static struct sockaddr_in mcast_addr = { 154 .sin_family = AF_INET, 155 .sin_port = cpu_to_be16(IP_VS_SYNC_PORT), 156 .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), 157}; 158 159 160static inline struct ip_vs_sync_buff *sb_dequeue(void) 161{ 162 struct ip_vs_sync_buff *sb; 163 164 spin_lock_bh(&ip_vs_sync_lock); 165 if (list_empty(&ip_vs_sync_queue)) { 166 sb = NULL; 167 } else { 168 sb = list_entry(ip_vs_sync_queue.next, 169 struct ip_vs_sync_buff, 170 list); 171 list_del(&sb->list); 172 } 173 spin_unlock_bh(&ip_vs_sync_lock); 174 175 return sb; 176} 177 178static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) 179{ 180 struct ip_vs_sync_buff *sb; 181 182 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 183 return NULL; 184 185 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { 186 kfree(sb); 187 return NULL; 188 } 189 sb->mesg->nr_conns = 0; 190 sb->mesg->syncid = ip_vs_master_syncid; 191 sb->mesg->size = 4; 192 sb->head = (unsigned char *)sb->mesg + 4; 193 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; 194 sb->firstuse = jiffies; 195 return sb; 196} 197 198static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 199{ 200 kfree(sb->mesg); 201 kfree(sb); 202} 203 204static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) 205{ 206 spin_lock(&ip_vs_sync_lock); 207 if (ip_vs_sync_state & IP_VS_STATE_MASTER) 208 list_add_tail(&sb->list, &ip_vs_sync_queue); 209 else 210 ip_vs_sync_buff_release(sb); 211 spin_unlock(&ip_vs_sync_lock); 212} 213 214/* 215 * Get the current sync buffer if it has been created for more 216 * than the specified time or the specified time is zero. 217 */ 218static inline struct ip_vs_sync_buff * 219get_curr_sync_buff(unsigned long time) 220{ 221 struct ip_vs_sync_buff *sb; 222 223 spin_lock_bh(&curr_sb_lock); 224 if (curr_sb && (time == 0 || 225 time_before(jiffies - curr_sb->firstuse, time))) { 226 sb = curr_sb; 227 curr_sb = NULL; 228 } else 229 sb = NULL; 230 spin_unlock_bh(&curr_sb_lock); 231 return sb; 232} 233 234 235/* 236 * Add an ip_vs_conn information into the current sync_buff. 237 * Called by ip_vs_in. 238 */ 239void ip_vs_sync_conn(struct ip_vs_conn *cp) 240{ 241 struct ip_vs_sync_mesg *m; 242 struct ip_vs_sync_conn *s; 243 int len; 244 245 spin_lock(&curr_sb_lock); 246 if (!curr_sb) { 247 if (!(curr_sb=ip_vs_sync_buff_create())) { 248 spin_unlock(&curr_sb_lock); 249 pr_err("ip_vs_sync_buff_create failed.\n"); 250 return; 251 } 252 } 253 254 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : 255 SIMPLE_CONN_SIZE; 256 m = curr_sb->mesg; 257 s = (struct ip_vs_sync_conn *)curr_sb->head; 258 259 /* copy members */ 260 s->protocol = cp->protocol; 261 s->cport = cp->cport; 262 s->vport = cp->vport; 263 s->dport = cp->dport; 264 s->caddr = cp->caddr.ip; 265 s->vaddr = cp->vaddr.ip; 266 s->daddr = cp->daddr.ip; 267 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); 268 s->state = htons(cp->state); 269 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 270 struct ip_vs_sync_conn_options *opt = 271 (struct ip_vs_sync_conn_options *)&s[1]; 272 memcpy(opt, &cp->in_seq, sizeof(*opt)); 273 } 274 275 m->nr_conns++; 276 m->size += len; 277 curr_sb->head += len; 278 279 /* check if there is a space for next one */ 280 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { 281 sb_queue_tail(curr_sb); 282 curr_sb = NULL; 283 } 284 spin_unlock(&curr_sb_lock); 285 286 /* synchronize its controller if it has */ 287 if (cp->control) 288 ip_vs_sync_conn(cp->control); 289} 290 291 292/* 293 * Process received multicast message and create the corresponding 294 * ip_vs_conn entries. 295 */ 296static void ip_vs_process_message(const char *buffer, const size_t buflen) 297{ 298 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; 299 struct ip_vs_sync_conn *s; 300 struct ip_vs_sync_conn_options *opt; 301 struct ip_vs_conn *cp; 302 struct ip_vs_protocol *pp; 303 struct ip_vs_dest *dest; 304 char *p; 305 int i; 306 307 if (buflen < sizeof(struct ip_vs_sync_mesg)) { 308 IP_VS_ERR_RL("sync message header too short\n"); 309 return; 310 } 311 312 /* Convert size back to host byte order */ 313 m->size = ntohs(m->size); 314 315 if (buflen != m->size) { 316 IP_VS_ERR_RL("bogus sync message size\n"); 317 return; 318 } 319 320 /* SyncID sanity check */ 321 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { 322 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", 323 m->syncid); 324 return; 325 } 326 327 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); 328 for (i=0; i<m->nr_conns; i++) { 329 unsigned flags, state; 330 331 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 332 IP_VS_ERR_RL("bogus conn in sync message\n"); 333 return; 334 } 335 s = (struct ip_vs_sync_conn *) p; 336 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 337 flags &= ~IP_VS_CONN_F_HASHED; 338 if (flags & IP_VS_CONN_F_SEQ_MASK) { 339 opt = (struct ip_vs_sync_conn_options *)&s[1]; 340 p += FULL_CONN_SIZE; 341 if (p > buffer+buflen) { 342 IP_VS_ERR_RL("bogus conn options in sync message\n"); 343 return; 344 } 345 } else { 346 opt = NULL; 347 p += SIMPLE_CONN_SIZE; 348 } 349 350 state = ntohs(s->state); 351 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 352 pp = ip_vs_proto_get(s->protocol); 353 if (!pp) { 354 IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", 355 s->protocol); 356 continue; 357 } 358 if (state >= pp->num_states) { 359 IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", 360 pp->name, state); 361 continue; 362 } 363 } else { 364 /* protocol in templates is not used for state/timeout */ 365 pp = NULL; 366 if (state > 0) { 367 IP_VS_DBG(2, "Invalid template state %u in sync msg\n", 368 state); 369 state = 0; 370 } 371 } 372 373 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 374 cp = ip_vs_conn_in_get(AF_INET, s->protocol, 375 (union nf_inet_addr *)&s->caddr, 376 s->cport, 377 (union nf_inet_addr *)&s->vaddr, 378 s->vport); 379 else 380 cp = ip_vs_ct_in_get(AF_INET, s->protocol, 381 (union nf_inet_addr *)&s->caddr, 382 s->cport, 383 (union nf_inet_addr *)&s->vaddr, 384 s->vport); 385 if (!cp) { 386 /* 387 * Find the appropriate destination for the connection. 388 * If it is not found the connection will remain unbound 389 * but still handled. 390 */ 391 dest = ip_vs_find_dest(AF_INET, 392 (union nf_inet_addr *)&s->daddr, 393 s->dport, 394 (union nf_inet_addr *)&s->vaddr, 395 s->vport, 396 s->protocol); 397 /* Set the approprite ativity flag */ 398 if (s->protocol == IPPROTO_TCP) { 399 if (state != IP_VS_TCP_S_ESTABLISHED) 400 flags |= IP_VS_CONN_F_INACTIVE; 401 else 402 flags &= ~IP_VS_CONN_F_INACTIVE; 403 } else if (s->protocol == IPPROTO_SCTP) { 404 if (state != IP_VS_SCTP_S_ESTABLISHED) 405 flags |= IP_VS_CONN_F_INACTIVE; 406 else 407 flags &= ~IP_VS_CONN_F_INACTIVE; 408 } 409 cp = ip_vs_conn_new(AF_INET, s->protocol, 410 (union nf_inet_addr *)&s->caddr, 411 s->cport, 412 (union nf_inet_addr *)&s->vaddr, 413 s->vport, 414 (union nf_inet_addr *)&s->daddr, 415 s->dport, 416 flags, dest); 417 if (dest) 418 atomic_dec(&dest->refcnt); 419 if (!cp) { 420 pr_err("ip_vs_conn_new failed\n"); 421 return; 422 } 423 } else if (!cp->dest) { 424 dest = ip_vs_try_bind_dest(cp); 425 if (dest) 426 atomic_dec(&dest->refcnt); 427 } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && 428 (cp->state != state)) { 429 /* update active/inactive flag for the connection */ 430 dest = cp->dest; 431 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && 432 (state != IP_VS_TCP_S_ESTABLISHED)) { 433 atomic_dec(&dest->activeconns); 434 atomic_inc(&dest->inactconns); 435 cp->flags |= IP_VS_CONN_F_INACTIVE; 436 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && 437 (state == IP_VS_TCP_S_ESTABLISHED)) { 438 atomic_inc(&dest->activeconns); 439 atomic_dec(&dest->inactconns); 440 cp->flags &= ~IP_VS_CONN_F_INACTIVE; 441 } 442 } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && 443 (cp->state != state)) { 444 dest = cp->dest; 445 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && 446 (state != IP_VS_SCTP_S_ESTABLISHED)) { 447 atomic_dec(&dest->activeconns); 448 atomic_inc(&dest->inactconns); 449 cp->flags &= ~IP_VS_CONN_F_INACTIVE; 450 } 451 } 452 453 if (opt) 454 memcpy(&cp->in_seq, opt, sizeof(*opt)); 455 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); 456 cp->state = state; 457 cp->old_state = cp->state; 458 /* 459 * We can not recover the right timeout for templates 460 * in all cases, we can not find the right fwmark 461 * virtual service. If needed, we can do it for 462 * non-fwmark persistent services. 463 */ 464 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) 465 cp->timeout = pp->timeout_table[state]; 466 else 467 cp->timeout = (3*60*HZ); 468 ip_vs_conn_put(cp); 469 } 470} 471 472 473/* 474 * Setup loopback of outgoing multicasts on a sending socket 475 */ 476static void set_mcast_loop(struct sock *sk, u_char loop) 477{ 478 struct inet_sock *inet = inet_sk(sk); 479 480 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 481 lock_sock(sk); 482 inet->mc_loop = loop ? 1 : 0; 483 release_sock(sk); 484} 485 486/* 487 * Specify TTL for outgoing multicasts on a sending socket 488 */ 489static void set_mcast_ttl(struct sock *sk, u_char ttl) 490{ 491 struct inet_sock *inet = inet_sk(sk); 492 493 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 494 lock_sock(sk); 495 inet->mc_ttl = ttl; 496 release_sock(sk); 497} 498 499/* 500 * Specifiy default interface for outgoing multicasts 501 */ 502static int set_mcast_if(struct sock *sk, char *ifname) 503{ 504 struct net_device *dev; 505 struct inet_sock *inet = inet_sk(sk); 506 507 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) 508 return -ENODEV; 509 510 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 511 return -EINVAL; 512 513 lock_sock(sk); 514 inet->mc_index = dev->ifindex; 515 /* inet->mc_addr = 0; */ 516 release_sock(sk); 517 518 return 0; 519} 520 521 522/* 523 * Set the maximum length of sync message according to the 524 * specified interface's MTU. 525 */ 526static int set_sync_mesg_maxlen(int sync_state) 527{ 528 struct net_device *dev; 529 int num; 530 531 if (sync_state == IP_VS_STATE_MASTER) { 532 if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) 533 return -ENODEV; 534 535 num = (dev->mtu - sizeof(struct iphdr) - 536 sizeof(struct udphdr) - 537 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; 538 sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + 539 SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); 540 IP_VS_DBG(7, "setting the maximum length of sync sending " 541 "message %d.\n", sync_send_mesg_maxlen); 542 } else if (sync_state == IP_VS_STATE_BACKUP) { 543 if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) 544 return -ENODEV; 545 546 sync_recv_mesg_maxlen = dev->mtu - 547 sizeof(struct iphdr) - sizeof(struct udphdr); 548 IP_VS_DBG(7, "setting the maximum length of sync receiving " 549 "message %d.\n", sync_recv_mesg_maxlen); 550 } 551 552 return 0; 553} 554 555 556/* 557 * Join a multicast group. 558 * the group is specified by a class D multicast address 224.0.0.0/8 559 * in the in_addr structure passed in as a parameter. 560 */ 561static int 562join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) 563{ 564 struct ip_mreqn mreq; 565 struct net_device *dev; 566 int ret; 567 568 memset(&mreq, 0, sizeof(mreq)); 569 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 570 571 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) 572 return -ENODEV; 573 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 574 return -EINVAL; 575 576 mreq.imr_ifindex = dev->ifindex; 577 578 lock_sock(sk); 579 ret = ip_mc_join_group(sk, &mreq); 580 release_sock(sk); 581 582 return ret; 583} 584 585 586static int bind_mcastif_addr(struct socket *sock, char *ifname) 587{ 588 struct net_device *dev; 589 __be32 addr; 590 struct sockaddr_in sin; 591 592 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) 593 return -ENODEV; 594 595 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 596 if (!addr) 597 pr_err("You probably need to specify IP address on " 598 "multicast interface.\n"); 599 600 IP_VS_DBG(7, "binding socket with (%s) %pI4\n", 601 ifname, &addr); 602 603 /* Now bind the socket with the address of multicast interface */ 604 sin.sin_family = AF_INET; 605 sin.sin_addr.s_addr = addr; 606 sin.sin_port = 0; 607 608 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 609} 610 611/* 612 * Set up sending multicast socket over UDP 613 */ 614static struct socket * make_send_sock(void) 615{ 616 struct socket *sock; 617 int result; 618 619 /* First create a socket */ 620 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); 621 if (result < 0) { 622 pr_err("Error during creation of socket; terminating\n"); 623 return ERR_PTR(result); 624 } 625 626 result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); 627 if (result < 0) { 628 pr_err("Error setting outbound mcast interface\n"); 629 goto error; 630 } 631 632 set_mcast_loop(sock->sk, 0); 633 set_mcast_ttl(sock->sk, 1); 634 635 result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); 636 if (result < 0) { 637 pr_err("Error binding address of the mcast interface\n"); 638 goto error; 639 } 640 641 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, 642 sizeof(struct sockaddr), 0); 643 if (result < 0) { 644 pr_err("Error connecting to the multicast addr\n"); 645 goto error; 646 } 647 648 return sock; 649 650 error: 651 sock_release(sock); 652 return ERR_PTR(result); 653} 654 655 656/* 657 * Set up receiving multicast socket over UDP 658 */ 659static struct socket * make_receive_sock(void) 660{ 661 struct socket *sock; 662 int result; 663 664 /* First create a socket */ 665 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); 666 if (result < 0) { 667 pr_err("Error during creation of socket; terminating\n"); 668 return ERR_PTR(result); 669 } 670 671 /* it is equivalent to the REUSEADDR option in user-space */ 672 sock->sk->sk_reuse = 1; 673 674 result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, 675 sizeof(struct sockaddr)); 676 if (result < 0) { 677 pr_err("Error binding to the multicast addr\n"); 678 goto error; 679 } 680 681 /* join the multicast group */ 682 result = join_mcast_group(sock->sk, 683 (struct in_addr *) &mcast_addr.sin_addr, 684 ip_vs_backup_mcast_ifn); 685 if (result < 0) { 686 pr_err("Error joining to the multicast group\n"); 687 goto error; 688 } 689 690 return sock; 691 692 error: 693 sock_release(sock); 694 return ERR_PTR(result); 695} 696 697 698static int 699ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 700{ 701 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 702 struct kvec iov; 703 int len; 704 705 EnterFunction(7); 706 iov.iov_base = (void *)buffer; 707 iov.iov_len = length; 708 709 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 710 711 LeaveFunction(7); 712 return len; 713} 714 715static void 716ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 717{ 718 int msize; 719 720 msize = msg->size; 721 722 /* Put size in network byte order */ 723 msg->size = htons(msg->size); 724 725 if (ip_vs_send_async(sock, (char *)msg, msize) != msize) 726 pr_err("ip_vs_send_async error\n"); 727} 728 729static int 730ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 731{ 732 struct msghdr msg = {NULL,}; 733 struct kvec iov; 734 int len; 735 736 EnterFunction(7); 737 738 /* Receive a packet */ 739 iov.iov_base = buffer; 740 iov.iov_len = (size_t)buflen; 741 742 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); 743 744 if (len < 0) 745 return -1; 746 747 LeaveFunction(7); 748 return len; 749} 750 751 752static int sync_thread_master(void *data) 753{ 754 struct ip_vs_sync_thread_data *tinfo = data; 755 struct ip_vs_sync_buff *sb; 756 757 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 758 "syncid = %d\n", 759 ip_vs_master_mcast_ifn, ip_vs_master_syncid); 760 761 while (!kthread_should_stop()) { 762 while ((sb = sb_dequeue())) { 763 ip_vs_send_sync_msg(tinfo->sock, sb->mesg); 764 ip_vs_sync_buff_release(sb); 765 } 766 767 /* check if entries stay in curr_sb for 2 seconds */ 768 sb = get_curr_sync_buff(2 * HZ); 769 if (sb) { 770 ip_vs_send_sync_msg(tinfo->sock, sb->mesg); 771 ip_vs_sync_buff_release(sb); 772 } 773 774 schedule_timeout_interruptible(HZ); 775 } 776 777 /* clean up the sync_buff queue */ 778 while ((sb=sb_dequeue())) { 779 ip_vs_sync_buff_release(sb); 780 } 781 782 /* clean up the current sync_buff */ 783 if ((sb = get_curr_sync_buff(0))) { 784 ip_vs_sync_buff_release(sb); 785 } 786 787 /* release the sending multicast socket */ 788 sock_release(tinfo->sock); 789 kfree(tinfo); 790 791 return 0; 792} 793 794 795static int sync_thread_backup(void *data) 796{ 797 struct ip_vs_sync_thread_data *tinfo = data; 798 int len; 799 800 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 801 "syncid = %d\n", 802 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); 803 804 while (!kthread_should_stop()) { 805 wait_event_interruptible(*sk_sleep(tinfo->sock->sk), 806 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) 807 || kthread_should_stop()); 808 809 /* do we have data now? */ 810 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { 811 len = ip_vs_receive(tinfo->sock, tinfo->buf, 812 sync_recv_mesg_maxlen); 813 if (len <= 0) { 814 pr_err("receiving message error\n"); 815 break; 816 } 817 818 /* disable bottom half, because it accesses the data 819 shared by softirq while getting/creating conns */ 820 local_bh_disable(); 821 ip_vs_process_message(tinfo->buf, len); 822 local_bh_enable(); 823 } 824 } 825 826 /* release the sending multicast socket */ 827 sock_release(tinfo->sock); 828 kfree(tinfo->buf); 829 kfree(tinfo); 830 831 return 0; 832} 833 834 835int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) 836{ 837 struct ip_vs_sync_thread_data *tinfo; 838 struct task_struct **realtask, *task; 839 struct socket *sock; 840 char *name, *buf = NULL; 841 int (*threadfn)(void *data); 842 int result = -ENOMEM; 843 844 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 845 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", 846 sizeof(struct ip_vs_sync_conn)); 847 848 if (state == IP_VS_STATE_MASTER) { 849 if (sync_master_thread) 850 return -EEXIST; 851 852 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, 853 sizeof(ip_vs_master_mcast_ifn)); 854 ip_vs_master_syncid = syncid; 855 realtask = &sync_master_thread; 856 name = "ipvs_syncmaster"; 857 threadfn = sync_thread_master; 858 sock = make_send_sock(); 859 } else if (state == IP_VS_STATE_BACKUP) { 860 if (sync_backup_thread) 861 return -EEXIST; 862 863 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, 864 sizeof(ip_vs_backup_mcast_ifn)); 865 ip_vs_backup_syncid = syncid; 866 realtask = &sync_backup_thread; 867 name = "ipvs_syncbackup"; 868 threadfn = sync_thread_backup; 869 sock = make_receive_sock(); 870 } else { 871 return -EINVAL; 872 } 873 874 if (IS_ERR(sock)) { 875 result = PTR_ERR(sock); 876 goto out; 877 } 878 879 set_sync_mesg_maxlen(state); 880 if (state == IP_VS_STATE_BACKUP) { 881 buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); 882 if (!buf) 883 goto outsocket; 884 } 885 886 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); 887 if (!tinfo) 888 goto outbuf; 889 890 tinfo->sock = sock; 891 tinfo->buf = buf; 892 893 task = kthread_run(threadfn, tinfo, name); 894 if (IS_ERR(task)) { 895 result = PTR_ERR(task); 896 goto outtinfo; 897 } 898 899 /* mark as active */ 900 *realtask = task; 901 ip_vs_sync_state |= state; 902 903 /* increase the module use count */ 904 ip_vs_use_count_inc(); 905 906 return 0; 907 908outtinfo: 909 kfree(tinfo); 910outbuf: 911 kfree(buf); 912outsocket: 913 sock_release(sock); 914out: 915 return result; 916} 917 918 919int stop_sync_thread(int state) 920{ 921 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 922 923 if (state == IP_VS_STATE_MASTER) { 924 if (!sync_master_thread) 925 return -ESRCH; 926 927 pr_info("stopping master sync thread %d ...\n", 928 task_pid_nr(sync_master_thread)); 929 930 /* 931 * The lock synchronizes with sb_queue_tail(), so that we don't 932 * add sync buffers to the queue, when we are already in 933 * progress of stopping the master sync daemon. 934 */ 935 936 spin_lock_bh(&ip_vs_sync_lock); 937 ip_vs_sync_state &= ~IP_VS_STATE_MASTER; 938 spin_unlock_bh(&ip_vs_sync_lock); 939 kthread_stop(sync_master_thread); 940 sync_master_thread = NULL; 941 } else if (state == IP_VS_STATE_BACKUP) { 942 if (!sync_backup_thread) 943 return -ESRCH; 944 945 pr_info("stopping backup sync thread %d ...\n", 946 task_pid_nr(sync_backup_thread)); 947 948 ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; 949 kthread_stop(sync_backup_thread); 950 sync_backup_thread = NULL; 951 } else { 952 return -EINVAL; 953 } 954 955 /* decrease the module use count */ 956 ip_vs_use_count_dec(); 957 958 return 0; 959} 960