1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the NetFilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Version: $Id: ip_vs_sync.c,v 1.1.1.1 2007/08/03 18:53:52 Exp $ 9 * 10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 11 * 12 * ip_vs_sync: sync connection info from master load balancer to backups 13 * through multicast 14 * 15 * Changes: 16 * Alexandre Cassen : Added master & backup support at a time. 17 * Alexandre Cassen : Added SyncID support for incoming sync 18 * messages filtering. 19 * Justin Ossevoort : Fix endian problem on sync message size. 20 */ 21 22#include <linux/module.h> 23#include <linux/slab.h> 24#include <linux/inetdevice.h> 25#include <linux/net.h> 26#include <linux/completion.h> 27#include <linux/delay.h> 28#include <linux/skbuff.h> 29#include <linux/in.h> 30#include <linux/igmp.h> /* for ip_mc_join_group */ 31#include <linux/udp.h> 32 33#include <net/ip.h> 34#include <net/sock.h> 35#include <asm/uaccess.h> /* for get_fs and set_fs */ 36 37#include <net/ip_vs.h> 38 39#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ 40#define IP_VS_SYNC_PORT 8848 /* multicast port */ 41 42 43/* 44 * IPVS sync connection entry 45 */ 46struct ip_vs_sync_conn { 47 __u8 reserved; 48 49 /* Protocol, addresses and port numbers */ 50 __u8 protocol; /* Which protocol (TCP/UDP) */ 51 __be16 cport; 52 __be16 vport; 53 __be16 dport; 54 __be32 caddr; /* client address */ 55 __be32 vaddr; /* virtual address */ 56 __be32 daddr; /* destination address */ 57 58 /* Flags and state transition */ 59 __be16 flags; /* status flags */ 60 __be16 state; /* state info */ 61 62 /* The sequence options start here */ 63}; 64 65struct ip_vs_sync_conn_options { 66 struct ip_vs_seq in_seq; /* incoming seq. struct */ 67 struct ip_vs_seq out_seq; /* outgoing seq. struct */ 68}; 69 70struct ip_vs_sync_thread_data { 71 struct completion *startup; 72 int state; 73}; 74 75#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ) 76#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) 77#define FULL_CONN_SIZE \ 78(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) 79 80 81/* 82 The master mulitcasts messages to the backup load balancers in the 83 following format. 84 85 0 1 2 3 86 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 87 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 88 | Count Conns | SyncID | Size | 89 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 90 | | 91 | IPVS Sync Connection (1) | 92 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 93 | . | 94 | . | 95 | . | 96 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 97 | | 98 | IPVS Sync Connection (n) | 99 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 100*/ 101 102#define SYNC_MESG_HEADER_LEN 4 103 104struct ip_vs_sync_mesg { 105 __u8 nr_conns; 106 __u8 syncid; 107 __u16 size; 108 109 /* ip_vs_sync_conn entries start here */ 110}; 111 112/* the maximum length of sync (sending/receiving) message */ 113static int sync_send_mesg_maxlen; 114static int sync_recv_mesg_maxlen; 115 116struct ip_vs_sync_buff { 117 struct list_head list; 118 unsigned long firstuse; 119 120 /* pointers for the message data */ 121 struct ip_vs_sync_mesg *mesg; 122 unsigned char *head; 123 unsigned char *end; 124}; 125 126 127/* the sync_buff list head and the lock */ 128static LIST_HEAD(ip_vs_sync_queue); 129static DEFINE_SPINLOCK(ip_vs_sync_lock); 130 131/* current sync_buff for accepting new conn entries */ 132static struct ip_vs_sync_buff *curr_sb = NULL; 133static DEFINE_SPINLOCK(curr_sb_lock); 134 135/* ipvs sync daemon state */ 136volatile int ip_vs_sync_state = IP_VS_STATE_NONE; 137volatile int ip_vs_master_syncid = 0; 138volatile int ip_vs_backup_syncid = 0; 139 140/* multicast interface name */ 141char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; 142char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; 143 144/* multicast addr */ 145static struct sockaddr_in mcast_addr; 146 147 148static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) 149{ 150 spin_lock(&ip_vs_sync_lock); 151 list_add_tail(&sb->list, &ip_vs_sync_queue); 152 spin_unlock(&ip_vs_sync_lock); 153} 154 155static inline struct ip_vs_sync_buff * sb_dequeue(void) 156{ 157 struct ip_vs_sync_buff *sb; 158 159 spin_lock_bh(&ip_vs_sync_lock); 160 if (list_empty(&ip_vs_sync_queue)) { 161 sb = NULL; 162 } else { 163 sb = list_entry(ip_vs_sync_queue.next, 164 struct ip_vs_sync_buff, 165 list); 166 list_del(&sb->list); 167 } 168 spin_unlock_bh(&ip_vs_sync_lock); 169 170 return sb; 171} 172 173static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) 174{ 175 struct ip_vs_sync_buff *sb; 176 177 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 178 return NULL; 179 180 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { 181 kfree(sb); 182 return NULL; 183 } 184 sb->mesg->nr_conns = 0; 185 sb->mesg->syncid = ip_vs_master_syncid; 186 sb->mesg->size = 4; 187 sb->head = (unsigned char *)sb->mesg + 4; 188 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; 189 sb->firstuse = jiffies; 190 return sb; 191} 192 193static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 194{ 195 kfree(sb->mesg); 196 kfree(sb); 197} 198 199/* 200 * Get the current sync buffer if it has been created for more 201 * than the specified time or the specified time is zero. 202 */ 203static inline struct ip_vs_sync_buff * 204get_curr_sync_buff(unsigned long time) 205{ 206 struct ip_vs_sync_buff *sb; 207 208 spin_lock_bh(&curr_sb_lock); 209 if (curr_sb && (time == 0 || 210 time_before(jiffies - curr_sb->firstuse, time))) { 211 sb = curr_sb; 212 curr_sb = NULL; 213 } else 214 sb = NULL; 215 spin_unlock_bh(&curr_sb_lock); 216 return sb; 217} 218 219 220/* 221 * Add an ip_vs_conn information into the current sync_buff. 222 * Called by ip_vs_in. 223 */ 224void ip_vs_sync_conn(struct ip_vs_conn *cp) 225{ 226 struct ip_vs_sync_mesg *m; 227 struct ip_vs_sync_conn *s; 228 int len; 229 230 spin_lock(&curr_sb_lock); 231 if (!curr_sb) { 232 if (!(curr_sb=ip_vs_sync_buff_create())) { 233 spin_unlock(&curr_sb_lock); 234 IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); 235 return; 236 } 237 } 238 239 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : 240 SIMPLE_CONN_SIZE; 241 m = curr_sb->mesg; 242 s = (struct ip_vs_sync_conn *)curr_sb->head; 243 244 /* copy members */ 245 s->protocol = cp->protocol; 246 s->cport = cp->cport; 247 s->vport = cp->vport; 248 s->dport = cp->dport; 249 s->caddr = cp->caddr; 250 s->vaddr = cp->vaddr; 251 s->daddr = cp->daddr; 252 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); 253 s->state = htons(cp->state); 254 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 255 struct ip_vs_sync_conn_options *opt = 256 (struct ip_vs_sync_conn_options *)&s[1]; 257 memcpy(opt, &cp->in_seq, sizeof(*opt)); 258 } 259 260 m->nr_conns++; 261 m->size += len; 262 curr_sb->head += len; 263 264 /* check if there is a space for next one */ 265 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { 266 sb_queue_tail(curr_sb); 267 curr_sb = NULL; 268 } 269 spin_unlock(&curr_sb_lock); 270 271 /* synchronize its controller if it has */ 272 if (cp->control) 273 ip_vs_sync_conn(cp->control); 274} 275 276 277/* 278 * Process received multicast message and create the corresponding 279 * ip_vs_conn entries. 280 */ 281static void ip_vs_process_message(const char *buffer, const size_t buflen) 282{ 283 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; 284 struct ip_vs_sync_conn *s; 285 struct ip_vs_sync_conn_options *opt; 286 struct ip_vs_conn *cp; 287 char *p; 288 int i; 289 290 /* Convert size back to host byte order */ 291 m->size = ntohs(m->size); 292 293 if (buflen != m->size) { 294 IP_VS_ERR("bogus message\n"); 295 return; 296 } 297 298 /* SyncID sanity check */ 299 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { 300 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", 301 m->syncid); 302 return; 303 } 304 305 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); 306 for (i=0; i<m->nr_conns; i++) { 307 unsigned flags; 308 309 s = (struct ip_vs_sync_conn *)p; 310 flags = ntohs(s->flags); 311 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 312 cp = ip_vs_conn_in_get(s->protocol, 313 s->caddr, s->cport, 314 s->vaddr, s->vport); 315 else 316 cp = ip_vs_ct_in_get(s->protocol, 317 s->caddr, s->cport, 318 s->vaddr, s->vport); 319 if (!cp) { 320 cp = ip_vs_conn_new(s->protocol, 321 s->caddr, s->cport, 322 s->vaddr, s->vport, 323 s->daddr, s->dport, 324 flags, NULL); 325 if (!cp) { 326 IP_VS_ERR("ip_vs_conn_new failed\n"); 327 return; 328 } 329 cp->state = ntohs(s->state); 330 } else if (!cp->dest) { 331 /* it is an entry created by the synchronization */ 332 cp->state = ntohs(s->state); 333 cp->flags = flags | IP_VS_CONN_F_HASHED; 334 } /* Note that we don't touch its state and flags 335 if it is a normal entry. */ 336 337 if (flags & IP_VS_CONN_F_SEQ_MASK) { 338 opt = (struct ip_vs_sync_conn_options *)&s[1]; 339 memcpy(&cp->in_seq, opt, sizeof(*opt)); 340 p += FULL_CONN_SIZE; 341 } else 342 p += SIMPLE_CONN_SIZE; 343 344 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); 345 cp->timeout = IP_VS_SYNC_CONN_TIMEOUT; 346 ip_vs_conn_put(cp); 347 348 if (p > buffer+buflen) { 349 IP_VS_ERR("bogus message\n"); 350 return; 351 } 352 } 353} 354 355 356/* 357 * Setup loopback of outgoing multicasts on a sending socket 358 */ 359static void set_mcast_loop(struct sock *sk, u_char loop) 360{ 361 struct inet_sock *inet = inet_sk(sk); 362 363 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 364 lock_sock(sk); 365 inet->mc_loop = loop ? 1 : 0; 366 release_sock(sk); 367} 368 369/* 370 * Specify TTL for outgoing multicasts on a sending socket 371 */ 372static void set_mcast_ttl(struct sock *sk, u_char ttl) 373{ 374 struct inet_sock *inet = inet_sk(sk); 375 376 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 377 lock_sock(sk); 378 inet->mc_ttl = ttl; 379 release_sock(sk); 380} 381 382/* 383 * Specifiy default interface for outgoing multicasts 384 */ 385static int set_mcast_if(struct sock *sk, char *ifname) 386{ 387 struct net_device *dev; 388 struct inet_sock *inet = inet_sk(sk); 389 390 if ((dev = __dev_get_by_name(ifname)) == NULL) 391 return -ENODEV; 392 393 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 394 return -EINVAL; 395 396 lock_sock(sk); 397 inet->mc_index = dev->ifindex; 398 /* inet->mc_addr = 0; */ 399 release_sock(sk); 400 401 return 0; 402} 403 404 405/* 406 * Set the maximum length of sync message according to the 407 * specified interface's MTU. 408 */ 409static int set_sync_mesg_maxlen(int sync_state) 410{ 411 struct net_device *dev; 412 int num; 413 414 if (sync_state == IP_VS_STATE_MASTER) { 415 if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL) 416 return -ENODEV; 417 418 num = (dev->mtu - sizeof(struct iphdr) - 419 sizeof(struct udphdr) - 420 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; 421 sync_send_mesg_maxlen = 422 SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; 423 IP_VS_DBG(7, "setting the maximum length of sync sending " 424 "message %d.\n", sync_send_mesg_maxlen); 425 } else if (sync_state == IP_VS_STATE_BACKUP) { 426 if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL) 427 return -ENODEV; 428 429 sync_recv_mesg_maxlen = dev->mtu - 430 sizeof(struct iphdr) - sizeof(struct udphdr); 431 IP_VS_DBG(7, "setting the maximum length of sync receiving " 432 "message %d.\n", sync_recv_mesg_maxlen); 433 } 434 435 return 0; 436} 437 438 439/* 440 * Join a multicast group. 441 * the group is specified by a class D multicast address 224.0.0.0/8 442 * in the in_addr structure passed in as a parameter. 443 */ 444static int 445join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) 446{ 447 struct ip_mreqn mreq; 448 struct net_device *dev; 449 int ret; 450 451 memset(&mreq, 0, sizeof(mreq)); 452 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 453 454 if ((dev = __dev_get_by_name(ifname)) == NULL) 455 return -ENODEV; 456 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 457 return -EINVAL; 458 459 mreq.imr_ifindex = dev->ifindex; 460 461 lock_sock(sk); 462 ret = ip_mc_join_group(sk, &mreq); 463 release_sock(sk); 464 465 return ret; 466} 467 468 469static int bind_mcastif_addr(struct socket *sock, char *ifname) 470{ 471 struct net_device *dev; 472 __be32 addr; 473 struct sockaddr_in sin; 474 475 if ((dev = __dev_get_by_name(ifname)) == NULL) 476 return -ENODEV; 477 478 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 479 if (!addr) 480 IP_VS_ERR("You probably need to specify IP address on " 481 "multicast interface.\n"); 482 483 IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", 484 ifname, NIPQUAD(addr)); 485 486 /* Now bind the socket with the address of multicast interface */ 487 sin.sin_family = AF_INET; 488 sin.sin_addr.s_addr = addr; 489 sin.sin_port = 0; 490 491 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 492} 493 494/* 495 * Set up sending multicast socket over UDP 496 */ 497static struct socket * make_send_sock(void) 498{ 499 struct socket *sock; 500 501 /* First create a socket */ 502 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { 503 IP_VS_ERR("Error during creation of socket; terminating\n"); 504 return NULL; 505 } 506 507 if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) { 508 IP_VS_ERR("Error setting outbound mcast interface\n"); 509 goto error; 510 } 511 512 set_mcast_loop(sock->sk, 0); 513 set_mcast_ttl(sock->sk, 1); 514 515 if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) { 516 IP_VS_ERR("Error binding address of the mcast interface\n"); 517 goto error; 518 } 519 520 if (sock->ops->connect(sock, 521 (struct sockaddr*)&mcast_addr, 522 sizeof(struct sockaddr), 0) < 0) { 523 IP_VS_ERR("Error connecting to the multicast addr\n"); 524 goto error; 525 } 526 527 return sock; 528 529 error: 530 sock_release(sock); 531 return NULL; 532} 533 534 535/* 536 * Set up receiving multicast socket over UDP 537 */ 538static struct socket * make_receive_sock(void) 539{ 540 struct socket *sock; 541 542 /* First create a socket */ 543 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { 544 IP_VS_ERR("Error during creation of socket; terminating\n"); 545 return NULL; 546 } 547 548 /* it is equivalent to the REUSEADDR option in user-space */ 549 sock->sk->sk_reuse = 1; 550 551 if (sock->ops->bind(sock, 552 (struct sockaddr*)&mcast_addr, 553 sizeof(struct sockaddr)) < 0) { 554 IP_VS_ERR("Error binding to the multicast addr\n"); 555 goto error; 556 } 557 558 /* join the multicast group */ 559 if (join_mcast_group(sock->sk, 560 (struct in_addr*)&mcast_addr.sin_addr, 561 ip_vs_backup_mcast_ifn) < 0) { 562 IP_VS_ERR("Error joining to the multicast group\n"); 563 goto error; 564 } 565 566 return sock; 567 568 error: 569 sock_release(sock); 570 return NULL; 571} 572 573 574static int 575ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 576{ 577 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 578 struct kvec iov; 579 int len; 580 581 EnterFunction(7); 582 iov.iov_base = (void *)buffer; 583 iov.iov_len = length; 584 585 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 586 587 LeaveFunction(7); 588 return len; 589} 590 591static void 592ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 593{ 594 int msize; 595 596 msize = msg->size; 597 598 /* Put size in network byte order */ 599 msg->size = htons(msg->size); 600 601 if (ip_vs_send_async(sock, (char *)msg, msize) != msize) 602 IP_VS_ERR("ip_vs_send_async error\n"); 603} 604 605static int 606ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 607{ 608 struct msghdr msg = {NULL,}; 609 struct kvec iov; 610 int len; 611 612 EnterFunction(7); 613 614 /* Receive a packet */ 615 iov.iov_base = buffer; 616 iov.iov_len = (size_t)buflen; 617 618 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); 619 620 if (len < 0) 621 return -1; 622 623 LeaveFunction(7); 624 return len; 625} 626 627 628static DECLARE_WAIT_QUEUE_HEAD(sync_wait); 629static pid_t sync_master_pid = 0; 630static pid_t sync_backup_pid = 0; 631 632static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait); 633static int stop_master_sync = 0; 634static int stop_backup_sync = 0; 635 636static void sync_master_loop(void) 637{ 638 struct socket *sock; 639 struct ip_vs_sync_buff *sb; 640 641 /* create the sending multicast socket */ 642 sock = make_send_sock(); 643 if (!sock) 644 return; 645 646 IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " 647 "syncid = %d\n", 648 ip_vs_master_mcast_ifn, ip_vs_master_syncid); 649 650 for (;;) { 651 while ((sb=sb_dequeue())) { 652 ip_vs_send_sync_msg(sock, sb->mesg); 653 ip_vs_sync_buff_release(sb); 654 } 655 656 /* check if entries stay in curr_sb for 2 seconds */ 657 if ((sb = get_curr_sync_buff(2*HZ))) { 658 ip_vs_send_sync_msg(sock, sb->mesg); 659 ip_vs_sync_buff_release(sb); 660 } 661 662 if (stop_master_sync) 663 break; 664 665 msleep_interruptible(1000); 666 } 667 668 /* clean up the sync_buff queue */ 669 while ((sb=sb_dequeue())) { 670 ip_vs_sync_buff_release(sb); 671 } 672 673 /* clean up the current sync_buff */ 674 if ((sb = get_curr_sync_buff(0))) { 675 ip_vs_sync_buff_release(sb); 676 } 677 678 /* release the sending multicast socket */ 679 sock_release(sock); 680} 681 682 683static void sync_backup_loop(void) 684{ 685 struct socket *sock; 686 char *buf; 687 int len; 688 689 if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) { 690 IP_VS_ERR("sync_backup_loop: kmalloc error\n"); 691 return; 692 } 693 694 /* create the receiving multicast socket */ 695 sock = make_receive_sock(); 696 if (!sock) 697 goto out; 698 699 IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " 700 "syncid = %d\n", 701 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); 702 703 for (;;) { 704 /* do you have data now? */ 705 while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) { 706 if ((len = 707 ip_vs_receive(sock, buf, 708 sync_recv_mesg_maxlen)) <= 0) { 709 IP_VS_ERR("receiving message error\n"); 710 break; 711 } 712 /* disable bottom half, because it accessed the data 713 shared by softirq while getting/creating conns */ 714 local_bh_disable(); 715 ip_vs_process_message(buf, len); 716 local_bh_enable(); 717 } 718 719 if (stop_backup_sync) 720 break; 721 722 msleep_interruptible(1000); 723 } 724 725 /* release the sending multicast socket */ 726 sock_release(sock); 727 728 out: 729 kfree(buf); 730} 731 732 733static void set_sync_pid(int sync_state, pid_t sync_pid) 734{ 735 if (sync_state == IP_VS_STATE_MASTER) 736 sync_master_pid = sync_pid; 737 else if (sync_state == IP_VS_STATE_BACKUP) 738 sync_backup_pid = sync_pid; 739} 740 741static void set_stop_sync(int sync_state, int set) 742{ 743 if (sync_state == IP_VS_STATE_MASTER) 744 stop_master_sync = set; 745 else if (sync_state == IP_VS_STATE_BACKUP) 746 stop_backup_sync = set; 747 else { 748 stop_master_sync = set; 749 stop_backup_sync = set; 750 } 751} 752 753static int sync_thread(void *startup) 754{ 755 DECLARE_WAITQUEUE(wait, current); 756 mm_segment_t oldmm; 757 int state; 758 const char *name; 759 struct ip_vs_sync_thread_data *tinfo = startup; 760 761 /* increase the module use count */ 762 ip_vs_use_count_inc(); 763 764 if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) { 765 state = IP_VS_STATE_MASTER; 766 name = "ipvs_syncmaster"; 767 } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) { 768 state = IP_VS_STATE_BACKUP; 769 name = "ipvs_syncbackup"; 770 } else { 771 IP_VS_BUG(); 772 ip_vs_use_count_dec(); 773 return -EINVAL; 774 } 775 776 daemonize(name); 777 778 oldmm = get_fs(); 779 set_fs(KERNEL_DS); 780 781 /* Block all signals */ 782 spin_lock_irq(¤t->sighand->siglock); 783 siginitsetinv(¤t->blocked, 0); 784 recalc_sigpending(); 785 spin_unlock_irq(¤t->sighand->siglock); 786 787 /* set the maximum length of sync message */ 788 set_sync_mesg_maxlen(state); 789 790 /* set up multicast address */ 791 mcast_addr.sin_family = AF_INET; 792 mcast_addr.sin_port = htons(IP_VS_SYNC_PORT); 793 mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP); 794 795 add_wait_queue(&sync_wait, &wait); 796 797 set_sync_pid(state, current->pid); 798 complete(tinfo->startup); 799 800 /* 801 * once we call the completion queue above, we should 802 * null out that reference, since its allocated on the 803 * stack of the creating kernel thread 804 */ 805 tinfo->startup = NULL; 806 807 /* processing master/backup loop here */ 808 if (state == IP_VS_STATE_MASTER) 809 sync_master_loop(); 810 else if (state == IP_VS_STATE_BACKUP) 811 sync_backup_loop(); 812 else IP_VS_BUG(); 813 814 remove_wait_queue(&sync_wait, &wait); 815 816 /* thread exits */ 817 818 /* 819 * If we weren't explicitly stopped, then we 820 * exited in error, and should undo our state 821 */ 822 if ((!stop_master_sync) && (!stop_backup_sync)) 823 ip_vs_sync_state -= tinfo->state; 824 825 set_sync_pid(state, 0); 826 IP_VS_INFO("sync thread stopped!\n"); 827 828 set_fs(oldmm); 829 830 /* decrease the module use count */ 831 ip_vs_use_count_dec(); 832 833 set_stop_sync(state, 0); 834 wake_up(&stop_sync_wait); 835 836 /* 837 * we need to free the structure that was allocated 838 * for us in start_sync_thread 839 */ 840 kfree(tinfo); 841 return 0; 842} 843 844 845static int fork_sync_thread(void *startup) 846{ 847 pid_t pid; 848 849 /* fork the sync thread here, then the parent process of the 850 sync thread is the init process after this thread exits. */ 851 repeat: 852 if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) { 853 IP_VS_ERR("could not create sync_thread due to %d... " 854 "retrying.\n", pid); 855 msleep_interruptible(1000); 856 goto repeat; 857 } 858 859 return 0; 860} 861 862 863int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) 864{ 865 DECLARE_COMPLETION_ONSTACK(startup); 866 pid_t pid; 867 struct ip_vs_sync_thread_data *tinfo; 868 869 if ((state == IP_VS_STATE_MASTER && sync_master_pid) || 870 (state == IP_VS_STATE_BACKUP && sync_backup_pid)) 871 return -EEXIST; 872 873 /* 874 * Note that tinfo will be freed in sync_thread on exit 875 */ 876 tinfo = kmalloc(sizeof(struct ip_vs_sync_thread_data), GFP_KERNEL); 877 if (!tinfo) 878 return -ENOMEM; 879 880 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); 881 IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n", 882 sizeof(struct ip_vs_sync_conn)); 883 884 ip_vs_sync_state |= state; 885 if (state == IP_VS_STATE_MASTER) { 886 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, 887 sizeof(ip_vs_master_mcast_ifn)); 888 ip_vs_master_syncid = syncid; 889 } else { 890 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, 891 sizeof(ip_vs_backup_mcast_ifn)); 892 ip_vs_backup_syncid = syncid; 893 } 894 895 tinfo->state = state; 896 tinfo->startup = &startup; 897 898 repeat: 899 if ((pid = kernel_thread(fork_sync_thread, tinfo, 0)) < 0) { 900 IP_VS_ERR("could not create fork_sync_thread due to %d... " 901 "retrying.\n", pid); 902 msleep_interruptible(1000); 903 goto repeat; 904 } 905 906 wait_for_completion(&startup); 907 908 return 0; 909} 910 911 912int stop_sync_thread(int state) 913{ 914 DECLARE_WAITQUEUE(wait, current); 915 916 if ((state == IP_VS_STATE_MASTER && !sync_master_pid) || 917 (state == IP_VS_STATE_BACKUP && !sync_backup_pid)) 918 return -ESRCH; 919 920 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); 921 IP_VS_INFO("stopping sync thread %d ...\n", 922 (state == IP_VS_STATE_MASTER) ? 923 sync_master_pid : sync_backup_pid); 924 925 __set_current_state(TASK_UNINTERRUPTIBLE); 926 add_wait_queue(&stop_sync_wait, &wait); 927 set_stop_sync(state, 1); 928 ip_vs_sync_state -= state; 929 wake_up(&sync_wait); 930 schedule(); 931 __set_current_state(TASK_RUNNING); 932 remove_wait_queue(&stop_sync_wait, &wait); 933 934 /* Note: no need to reap the sync thread, because its parent 935 process is the init process */ 936 937 if ((state == IP_VS_STATE_MASTER && stop_master_sync) || 938 (state == IP_VS_STATE_BACKUP && stop_backup_sync)) 939 IP_VS_BUG(); 940 941 return 0; 942} 943