1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/types.h>
28#include <sys/socket.h>
29#include <sys/list.h>
30#include <sys/stropts.h>
31#include <sys/siginfo.h>
32#include <sys/wait.h>
33#include <arpa/inet.h>
34#include <netinet/in.h>
35#include <stdlib.h>
36#include <stdio.h>
37#include <strings.h>
38#include <stddef.h>
39#include <unistd.h>
40#include <libilb.h>
41#include <port.h>
42#include <time.h>
43#include <signal.h>
44#include <assert.h>
45#include <errno.h>
46#include <spawn.h>
47#include <fcntl.h>
48#include <limits.h>
49#include "libilb_impl.h"
50#include "ilbd.h"
51
/*
 * Global list of HC objects.  Each element is an ilbd_hc_t linked through
 * its ihc_link member (see i_ilbd_setup_hc_list()).
 */
list_t ilbd_hc_list;

/* Timer queue for all hc related timers. */
static iu_tq_t *ilbd_hc_timer_q;

/*
 * Indicate whether the timer needs to be updated.  It is set whenever a
 * hc timer is scheduled, cancelled or expired in this file, and it is
 * cleared by ilbd_hc_timer_update() once it has dealt with the change.
 */
static boolean_t hc_timer_restarted;

/* Forward declarations of routines that are used before being defined. */
static void ilbd_hc_probe_timer(iu_tq_t *, void *);
static ilb_status_t ilbd_hc_restart_timer(ilbd_hc_t *, ilbd_hc_srv_t *);
static boolean_t ilbd_run_probe(ilbd_hc_srv_t *);

/* Note that the larger argument is evaluated twice. */
#define	MAX(a, b)	((a) > (b) ? (a) : (b))
66
/*
 * Number of arguments passed to a probe.  argv[0] is the path name of
 * the probe.
 */
71#define	HC_PROBE_ARGC	8
72
73/*
74 * Max number of characters to be read from the output of a probe.  It
75 * is long enough to read in a 64 bit integer.
76 */
77#define	HC_MAX_PROBE_OUTPUT	24
78
79void
80i_ilbd_setup_hc_list(void)
81{
82	list_create(&ilbd_hc_list, sizeof (ilbd_hc_t),
83	    offsetof(ilbd_hc_t, ihc_link));
84}
85
86/*
87 * Given a hc object name, return a pointer to hc object if found.
88 */
89ilbd_hc_t *
90ilbd_get_hc(const char *name)
91{
92	ilbd_hc_t *hc;
93
94	for (hc = list_head(&ilbd_hc_list); hc != NULL;
95	    hc = list_next(&ilbd_hc_list, hc)) {
96		if (strcasecmp(hc->ihc_name, name) == 0)
97			return (hc);
98	}
99	return (NULL);
100}
101
102/*
103 * Generates an audit record for create-healthcheck,
104 * delete-healtcheck subcommands.
105 */
106static void
107ilbd_audit_hc_event(const char *audit_hcname,
108    const ilb_hc_info_t *audit_hcinfo, ilbd_cmd_t cmd,
109    ilb_status_t rc, ucred_t *ucredp)
110{
111	adt_session_data_t	*ah;
112	adt_event_data_t	*event;
113	au_event_t	flag;
114	int	audit_error;
115
116	if ((ucredp == NULL) && (cmd == ILBD_CREATE_HC))  {
117		/*
118		 * we came here from the path where ilbd incorporates
119		 * the configuration that is listed in SCF:
120		 * i_ilbd_read_config->ilbd_walk_hc_pgs->
121		 *   ->ilbd_scf_instance_walk_pg->ilbd_create_hc
122		 * We skip auditing in that case
123		 */
124		logdebug("ilbd_audit_hc_event: skipping auditing");
125		return;
126	}
127
128	if (adt_start_session(&ah, NULL, 0) != 0) {
129		logerr("ilbd_audit_hc_event: adt_start_session failed");
130		exit(EXIT_FAILURE);
131	}
132	if (adt_set_from_ucred(ah, ucredp, ADT_NEW) != 0) {
133		(void) adt_end_session(ah);
134		logerr("ilbd_audit_rule_event: adt_set_from_ucred failed");
135		exit(EXIT_FAILURE);
136	}
137	if (cmd == ILBD_CREATE_HC)
138		flag = ADT_ilb_create_healthcheck;
139	else if (cmd == ILBD_DESTROY_HC)
140		flag = ADT_ilb_delete_healthcheck;
141
142	if ((event = adt_alloc_event(ah, flag)) == NULL) {
143		logerr("ilbd_audit_hc_event: adt_alloc_event failed");
144		exit(EXIT_FAILURE);
145	}
146	(void) memset((char *)event, 0, sizeof (adt_event_data_t));
147
148	switch (cmd) {
149	case ILBD_CREATE_HC:
150		event->adt_ilb_create_healthcheck.auth_used =
151		    NET_ILB_CONFIG_AUTH;
152		event->adt_ilb_create_healthcheck.hc_test =
153		    (char *)audit_hcinfo->hci_test;
154		event->adt_ilb_create_healthcheck.hc_name =
155		    (char *)audit_hcinfo->hci_name;
156
157		/*
158		 * If the value 0 is stored, the default values are
159		 * set in the kernel. User land does not know about them
160		 * So if the user does not specify them, audit record
161		 * will show them as 0
162		 */
163		event->adt_ilb_create_healthcheck.hc_timeout =
164		    audit_hcinfo->hci_timeout;
165		event->adt_ilb_create_healthcheck.hc_count =
166		    audit_hcinfo->hci_count;
167		event->adt_ilb_create_healthcheck.hc_interval =
168		    audit_hcinfo->hci_interval;
169		break;
170	case ILBD_DESTROY_HC:
171		event->adt_ilb_delete_healthcheck.auth_used =
172		    NET_ILB_CONFIG_AUTH;
173		event->adt_ilb_delete_healthcheck.hc_name =
174		    (char *)audit_hcname;
175		break;
176	}
177
178	/* Fill in success/failure */
179	if (rc == ILB_STATUS_OK) {
180		if (adt_put_event(event, ADT_SUCCESS, ADT_SUCCESS) != 0) {
181			logerr("ilbd_audit_hc_event: adt_put_event failed");
182			exit(EXIT_FAILURE);
183		}
184	} else {
185		audit_error = ilberror2auditerror(rc);
186		if (adt_put_event(event, ADT_FAILURE, audit_error) != 0) {
187			logerr("ilbd_audit_hc_event: adt_put_event failed");
188			exit(EXIT_FAILURE);
189		}
190	}
191	adt_free_event(event);
192	(void) adt_end_session(ah);
193}
194
195/*
196 * Given the ilb_hc_info_t passed in (from the libilb), create a hc object
197 * in ilbd.  The parameter ev_port is not used, refer to comments of
198 * ilbd_create_sg() in ilbd_sg.c
199 */
200/* ARGSUSED */
201ilb_status_t
202ilbd_create_hc(const ilb_hc_info_t *hc_info, int ev_port,
203    const struct passwd *ps, ucred_t *ucredp)
204{
205	ilbd_hc_t *hc;
206	ilb_status_t ret = ILB_STATUS_OK;
207
208	/*
209	 * ps == NULL is from the daemon when it starts and load configuration
210	 * ps != NULL is from client.
211	 */
212	if (ps != NULL) {
213		ret = ilbd_check_client_config_auth(ps);
214		if (ret != ILB_STATUS_OK) {
215			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
216			    ret, ucredp);
217			return (ret);
218		}
219	}
220
221	if (hc_info->hci_name[0] == '\0') {
222		logdebug("ilbd_create_hc: missing healthcheck info");
223		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
224		    ILB_STATUS_ENOHCINFO, ucredp);
225		return (ILB_STATUS_ENOHCINFO);
226	}
227
228	hc = ilbd_get_hc(hc_info->hci_name);
229	if (hc != NULL) {
230		logdebug("ilbd_create_hc: healthcheck name %s already"
231		    " exists", hc_info->hci_name);
232		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
233		    ILB_STATUS_EEXIST, ucredp);
234		return (ILB_STATUS_EEXIST);
235	}
236
237	/*
238	 * Sanity check on user supplied probe.  The given path name
239	 * must be a full path name (starts with '/') and is
240	 * executable.
241	 */
242	if (strcasecmp(hc_info->hci_test, ILB_HC_STR_TCP) != 0 &&
243	    strcasecmp(hc_info->hci_test, ILB_HC_STR_UDP) != 0 &&
244	    strcasecmp(hc_info->hci_test, ILB_HC_STR_PING) != 0 &&
245	    (hc_info->hci_test[0] != '/' ||
246	    access(hc_info->hci_test, X_OK) == -1)) {
247		if (errno == ENOENT) {
248			logdebug("ilbd_create_hc: user script %s doesn't "
249			    "exist", hc_info->hci_test);
250			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
251			    ILB_STATUS_ENOENT, ucredp);
252			return (ILB_STATUS_ENOENT);
253		} else {
254			logdebug("ilbd_create_hc: user script %s is "
255			    "invalid", hc_info->hci_test);
256			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
257			    ILB_STATUS_EINVAL, ucredp);
258			return (ILB_STATUS_EINVAL);
259		}
260	}
261
262	/* Create and add the hc object */
263	hc = calloc(1, sizeof (ilbd_hc_t));
264	if (hc == NULL) {
265		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
266		    ILB_STATUS_ENOMEM, ucredp);
267		return (ILB_STATUS_ENOMEM);
268	}
269	(void) memcpy(&hc->ihc_info, hc_info, sizeof (ilb_hc_info_t));
270	if (strcasecmp(hc->ihc_test, ILB_HC_STR_TCP) == 0)
271		hc->ihc_test_type = ILBD_HC_TCP;
272	else if (strcasecmp(hc->ihc_test, ILB_HC_STR_UDP) == 0)
273		hc->ihc_test_type = ILBD_HC_UDP;
274	else if (strcasecmp(hc->ihc_test, ILB_HC_STR_PING) == 0)
275		hc->ihc_test_type = ILBD_HC_PING;
276	else
277		hc->ihc_test_type = ILBD_HC_USER;
278	list_create(&hc->ihc_rules, sizeof (ilbd_hc_rule_t),
279	    offsetof(ilbd_hc_rule_t, hcr_link));
280
281	/* Update SCF */
282	if (ps != NULL) {
283		if ((ret = ilbd_create_pg(ILBD_SCF_HC, (void *)hc)) !=
284		    ILB_STATUS_OK) {
285			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
286			    ret, ucredp);
287			free(hc);
288			return (ret);
289		}
290	}
291
292	/* Everything is fine, now add it to the global list. */
293	list_insert_tail(&ilbd_hc_list, hc);
294	ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC, ret, ucredp);
295	return (ret);
296}
297
298/*
299 * Given a name of a hc object, destroy it.
300 */
301ilb_status_t
302ilbd_destroy_hc(const char *hc_name, const struct passwd *ps,
303    ucred_t *ucredp)
304{
305	ilb_status_t ret;
306	ilbd_hc_t *hc;
307
308	/*
309	 * No need to check ps == NULL, daemon won't call any destroy func
310	 * at start up.
311	 */
312	ret = ilbd_check_client_config_auth(ps);
313	if (ret != ILB_STATUS_OK) {
314		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
315		    ret, ucredp);
316		return (ret);
317	}
318
319	hc = ilbd_get_hc(hc_name);
320	if (hc == NULL) {
321		logdebug("ilbd_destroy_hc: healthcheck %s does not exist",
322		    hc_name);
323		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
324		    ILB_STATUS_ENOENT, ucredp);
325		return (ILB_STATUS_ENOENT);
326	}
327
328	/* If hc is in use, cannot delete it */
329	if (hc->ihc_rule_cnt > 0) {
330		logdebug("ilbd_destroy_hc: healthcheck %s is associated"
331		    " with a rule - cannot remove", hc_name);
332		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
333		    ILB_STATUS_INUSE, ucredp);
334		return (ILB_STATUS_INUSE);
335	}
336
337	if ((ret = ilbd_destroy_pg(ILBD_SCF_HC, hc_name)) !=
338	    ILB_STATUS_OK) {
339		logdebug("ilbd_destroy_hc: cannot destroy healthcheck %s "
340		    "property group", hc_name);
341		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
342		    ret, ucredp);
343		return (ret);
344	}
345
346	list_remove(&ilbd_hc_list, hc);
347	free(hc);
348	ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC, ret, ucredp);
349	return (ret);
350}
351
352/*
353 * Given a hc object name, return its information.  Used by libilb to
354 * get hc info.
355 */
356ilb_status_t
357ilbd_get_hc_info(const char *hc_name, uint32_t *rbuf, size_t *rbufsz)
358{
359	ilbd_hc_t	*hc;
360	ilb_hc_info_t	*hc_info;
361	ilb_comm_t	*ic = (ilb_comm_t *)rbuf;
362
363	hc = ilbd_get_hc(hc_name);
364	if (hc == NULL) {
365		logdebug("%s: healthcheck %s does not exist", __func__,
366		    hc_name);
367		return (ILB_STATUS_ENOENT);
368	}
369	ilbd_reply_ok(rbuf, rbufsz);
370	hc_info = (ilb_hc_info_t *)&ic->ic_data;
371
372	(void) strlcpy(hc_info->hci_name, hc->ihc_name, sizeof (hc->ihc_name));
373	(void) strlcpy(hc_info->hci_test, hc->ihc_test, sizeof (hc->ihc_test));
374	hc_info->hci_timeout = hc->ihc_timeout;
375	hc_info->hci_count = hc->ihc_count;
376	hc_info->hci_interval = hc->ihc_interval;
377	hc_info->hci_def_ping = hc->ihc_def_ping;
378
379	*rbufsz += sizeof (ilb_hc_info_t);
380
381	return (ILB_STATUS_OK);
382}
383
384static void
385ilbd_hc_copy_srvs(uint32_t *rbuf, size_t *rbufsz, ilbd_hc_rule_t *hc_rule,
386    const char *rulename)
387{
388	ilbd_hc_srv_t		*tmp_srv;
389	ilb_hc_srv_t		*dst_srv;
390	ilb_hc_rule_srv_t	*srvs;
391	size_t			tmp_rbufsz;
392	int			i;
393
394	tmp_rbufsz = *rbufsz;
395	/* Set up the reply buffer.  rbufsz will be set to the new size. */
396	ilbd_reply_ok(rbuf, rbufsz);
397
398	/* Calculate how much space is left for holding server info. */
399	*rbufsz += sizeof (ilb_hc_rule_srv_t);
400	tmp_rbufsz -= *rbufsz;
401
402	srvs = (ilb_hc_rule_srv_t *)&((ilb_comm_t *)rbuf)->ic_data;
403
404	tmp_srv = list_head(&hc_rule->hcr_servers);
405	for (i = 0; tmp_srv != NULL && tmp_rbufsz >= sizeof (*dst_srv); i++) {
406		dst_srv = &srvs->rs_srvs[i];
407
408		(void) strlcpy(dst_srv->hcs_rule_name, rulename, ILB_NAMESZ);
409		(void) strlcpy(dst_srv->hcs_ID, tmp_srv->shc_sg_srv->sgs_srvID,
410		    ILB_NAMESZ);
411		(void) strlcpy(dst_srv->hcs_hc_name,
412		    tmp_srv->shc_hc->ihc_name, ILB_NAMESZ);
413		dst_srv->hcs_IP = tmp_srv->shc_sg_srv->sgs_addr;
414		dst_srv->hcs_fail_cnt = tmp_srv->shc_fail_cnt;
415		dst_srv->hcs_status = tmp_srv->shc_status;
416		dst_srv->hcs_rtt = tmp_srv->shc_rtt;
417		dst_srv->hcs_lasttime = tmp_srv->shc_lasttime;
418		dst_srv->hcs_nexttime = tmp_srv->shc_nexttime;
419
420		tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv);
421		tmp_rbufsz -= sizeof (*dst_srv);
422	}
423	srvs->rs_num_srvs = i;
424	*rbufsz += i * sizeof (*dst_srv);
425}
426
427/*
428 * Given a rule name, return the hc status of its servers.
429 */
430ilb_status_t
431ilbd_get_hc_srvs(const char *rulename, uint32_t *rbuf, size_t *rbufsz)
432{
433	ilbd_hc_t	*hc;
434	ilbd_hc_rule_t	*hc_rule;
435
436	for (hc = list_head(&ilbd_hc_list); hc != NULL;
437	    hc = list_next(&ilbd_hc_list, hc)) {
438		for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
439		    hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
440			if (strcasecmp(hc_rule->hcr_rule->irl_name,
441			    rulename) != 0) {
442				continue;
443			}
444			ilbd_hc_copy_srvs(rbuf, rbufsz, hc_rule, rulename);
445			return (ILB_STATUS_OK);
446		}
447	}
448	return (ILB_STATUS_RULE_NO_HC);
449}
450
451/*
452 * Initialize the hc timer and associate the notification of timeout to
453 * the given event port.
454 */
455void
456ilbd_hc_timer_init(int ev_port, ilbd_timer_event_obj_t *ev_obj)
457{
458	struct sigevent sigev;
459	port_notify_t notify;
460
461	if ((ilbd_hc_timer_q = iu_tq_create()) == NULL) {
462		logerr("%s: cannot create hc timer queue", __func__);
463		exit(EXIT_FAILURE);
464	}
465	hc_timer_restarted = B_FALSE;
466
467	ev_obj->ev = ILBD_EVENT_TIMER;
468	ev_obj->timerid = -1;
469
470	notify.portnfy_port = ev_port;
471	notify.portnfy_user = ev_obj;
472	sigev.sigev_notify = SIGEV_PORT;
473	sigev.sigev_value.sival_ptr = &notify;
474	if (timer_create(CLOCK_REALTIME, &sigev, &ev_obj->timerid) == -1) {
475		logerr("%s: cannot create timer", __func__);
476		exit(EXIT_FAILURE);
477	}
478}
479
480/*
481 * HC timeout handler.
482 */
483void
484ilbd_hc_timeout(void)
485{
486	(void) iu_expire_timers(ilbd_hc_timer_q);
487	hc_timer_restarted = B_TRUE;
488}
489
490/*
491 * Set up the timer to fire at the earliest timeout.
492 */
493void
494ilbd_hc_timer_update(ilbd_timer_event_obj_t *ev_obj)
495{
496	itimerspec_t itimeout;
497	int timeout;
498
499	/*
500	 * There is no change on the timer list, so no need to set up the
501	 * timer again.
502	 */
503	if (!hc_timer_restarted)
504		return;
505
506restart:
507	if ((timeout = iu_earliest_timer(ilbd_hc_timer_q)) == INFTIM) {
508		hc_timer_restarted = B_FALSE;
509		return;
510	} else if (timeout == 0) {
511		/*
512		 * Handle the timeout immediately.  After that (clearing all
513		 * the expired timers), check to  see if there are still
514		 * timers running.  If yes, start them.
515		 */
516		(void) iu_expire_timers(ilbd_hc_timer_q);
517		goto restart;
518	}
519
520	itimeout.it_value.tv_sec = timeout / MILLISEC + 1;
521	itimeout.it_value.tv_nsec = 0;
522	itimeout.it_interval.tv_sec = 0;
523	itimeout.it_interval.tv_nsec = 0;
524
525	/*
526	 * Failure to set a timeout is "OK" since hopefully there will be
527	 * other events and timer_settime() will be called again.  So
528	 * we will only miss some timeouts.  But in the worst case, no event
529	 * will happen and ilbd will get stuck...
530	 */
531	if (timer_settime(ev_obj->timerid, 0, &itimeout, NULL) == -1)
532		logerr("%s: cannot set timer", __func__);
533	hc_timer_restarted = B_FALSE;
534}
535
536/*
537 * Kill the probe process of a server.
538 */
539static void
540ilbd_hc_kill_probe(ilbd_hc_srv_t *srv)
541{
542	/*
543	 * First dissociate the fd from the event port.  It should not
544	 * fail.
545	 */
546	if (port_dissociate(srv->shc_ev_port, PORT_SOURCE_FD,
547	    srv->shc_child_fd) != 0) {
548		logdebug("%s: port_dissociate: %s", __func__, strerror(errno));
549	}
550	(void) close(srv->shc_child_fd);
551	free(srv->shc_ev);
552	srv->shc_ev = NULL;
553
554	/* Then kill the probe process. */
555	if (kill(srv->shc_child_pid, SIGKILL) != 0) {
556		logerr("%s: rule %s server %s: %s", __func__,
557		    srv->shc_hc_rule->hcr_rule->irl_name,
558		    srv->shc_sg_srv->sgs_srvID, strerror(errno));
559	}
560	/* Should not fail... */
561	if (waitpid(srv->shc_child_pid, NULL, 0) != srv->shc_child_pid) {
562		logdebug("%s: waitpid: rule %s server %s", __func__,
563		    srv->shc_hc_rule->hcr_rule->irl_name,
564		    srv->shc_sg_srv->sgs_srvID);
565	}
566	srv->shc_child_pid = 0;
567}
568
569/*
570 * Disable the server, either because the server is dead or because a timer
571 * cannot be started for this server.  Note that this only affects the
572 * transient configuration, meaning only in memory.  The persistent
573 * configuration is not affected.
574 */
575static void
576ilbd_mark_server_disabled(ilbd_hc_srv_t *srv)
577{
578	srv->shc_status = ILB_HCS_DISABLED;
579
580	/* Disable the server in kernel. */
581	if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
582	    srv->shc_hc_rule->hcr_rule->irl_name,
583	    stat_declare_srv_dead) != ILB_STATUS_OK) {
584		logerr("%s: cannot disable server in kernel: rule %s "
585		    "server %s", __func__,
586		    srv->shc_hc_rule->hcr_rule->irl_name,
587		    srv->shc_sg_srv->sgs_srvID);
588	}
589}
590
591/*
592 * A probe fails, set the state of the server.
593 */
594static void
595ilbd_set_fail_state(ilbd_hc_srv_t *srv)
596{
597	if (++srv->shc_fail_cnt < srv->shc_hc->ihc_count) {
598		/* Probe again */
599		ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
600		return;
601	}
602
603	logdebug("%s: rule %s server %s fails %u", __func__,
604	    srv->shc_hc_rule->hcr_rule->irl_name, srv->shc_sg_srv->sgs_srvID,
605	    srv->shc_fail_cnt);
606
607	/*
608	 * If this is a ping test, mark the server as
609	 * unreachable instead of dead.
610	 */
611	if (srv->shc_hc->ihc_test_type == ILBD_HC_PING ||
612	    srv->shc_state == ilbd_hc_def_pinging) {
613		srv->shc_status = ILB_HCS_UNREACH;
614	} else {
615		srv->shc_status = ILB_HCS_DEAD;
616	}
617
618	/* Disable the server in kernel. */
619	if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
620	    srv->shc_hc_rule->hcr_rule->irl_name, stat_declare_srv_dead) !=
621	    ILB_STATUS_OK) {
622		logerr("%s: cannot disable server in kernel: rule %s "
623		    "server %s", __func__,
624		    srv->shc_hc_rule->hcr_rule->irl_name,
625		    srv->shc_sg_srv->sgs_srvID);
626	}
627
628	/* Still keep probing in case the server is alive again. */
629	if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
630		/* Only thing to do is to disable the server... */
631		logerr("%s: cannot restart timer: rule %s server %s", __func__,
632		    srv->shc_hc_rule->hcr_rule->irl_name,
633		    srv->shc_sg_srv->sgs_srvID);
634		srv->shc_status = ILB_HCS_DISABLED;
635	}
636}
637
638/*
639 * A probe process has not returned for the ihc_timeout period, we should
640 * kill it.  This function is the handler of this.
641 */
642/* ARGSUSED */
643static void
644ilbd_hc_kill_timer(iu_tq_t *tq, void *arg)
645{
646	ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
647
648	ilbd_hc_kill_probe(srv);
649	ilbd_set_fail_state(srv);
650}
651
652/*
653 * Probe timeout handler.  Send out the appropriate probe.
654 */
655/* ARGSUSED */
656static void
657ilbd_hc_probe_timer(iu_tq_t *tq, void *arg)
658{
659	ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
660
661	/*
662	 * If starting the probe fails, just pretend that the timeout has
663	 * extended.
664	 */
665	if (!ilbd_run_probe(srv)) {
666		/*
667		 * If we cannot restart the timer, the only thing we can do
668		 * is to disable this server.  Hopefully the sys admin will
669		 * notice this and enable this server again later.
670		 */
671		if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
672			logerr("%s: cannot restart timer: rule %s server %s, "
673			    "disabling it", __func__,
674			    srv->shc_hc_rule->hcr_rule->irl_name,
675			    srv->shc_sg_srv->sgs_srvID);
676			ilbd_mark_server_disabled(srv);
677		}
678		return;
679	}
680
681	/*
682	 * Similar to above, if kill timer cannot be started, disable the
683	 * server.
684	 */
685	if ((srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q,
686	    srv->shc_hc->ihc_timeout, ilbd_hc_kill_timer, srv)) == -1) {
687		logerr("%s: cannot start kill timer: rule %s server %s, "
688		    "disabling it", __func__,
689		    srv->shc_hc_rule->hcr_rule->irl_name,
690		    srv->shc_sg_srv->sgs_srvID);
691		ilbd_mark_server_disabled(srv);
692	}
693	hc_timer_restarted = B_TRUE;
694}
695
696/* Restart the periodic timer for a given server. */
697static ilb_status_t
698ilbd_hc_restart_timer(ilbd_hc_t *hc, ilbd_hc_srv_t *srv)
699{
700	int timeout;
701
702	/* Don't allow the timeout interval to be less than 1s */
703	timeout = MAX((hc->ihc_interval >> 1) + (gethrtime() %
704	    (hc->ihc_interval + 1)), 1);
705
706	/*
707	 * If the probe is actually a ping probe, there is no need to
708	 * do default pinging.  Just skip the step.
709	 */
710	if (hc->ihc_def_ping && hc->ihc_test_type != ILBD_HC_PING)
711		srv->shc_state = ilbd_hc_def_pinging;
712	else
713		srv->shc_state = ilbd_hc_probing;
714	srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q, timeout,
715	    ilbd_hc_probe_timer, srv);
716
717	if (srv->shc_tid == -1)
718		return (ILB_STATUS_TIMER);
719	srv->shc_lasttime = time(NULL);
720	srv->shc_nexttime = time(NULL) + timeout;
721
722	hc_timer_restarted = B_TRUE;
723	return (ILB_STATUS_OK);
724}
725
726/* Helper routine to associate a server with its hc object. */
727static ilb_status_t
728ilbd_hc_srv_add(ilbd_hc_t *hc, ilbd_hc_rule_t *hc_rule,
729    const ilb_sg_srv_t *srv, int ev_port)
730{
731	ilbd_hc_srv_t *new_srv;
732	ilb_status_t ret;
733
734	if ((new_srv = calloc(1, sizeof (ilbd_hc_srv_t))) == NULL)
735		return (ILB_STATUS_ENOMEM);
736	new_srv->shc_hc = hc;
737	new_srv->shc_hc_rule = hc_rule;
738	new_srv->shc_sg_srv = srv;
739	new_srv->shc_ev_port = ev_port;
740	new_srv->shc_tid = -1;
741	new_srv->shc_nexttime = time(NULL);
742	new_srv->shc_lasttime = new_srv->shc_nexttime;
743
744	if ((hc_rule->hcr_rule->irl_flags & ILB_FLAGS_RULE_ENABLED) &&
745	    ILB_IS_SRV_ENABLED(srv->sgs_flags)) {
746		new_srv->shc_status = ILB_HCS_UNINIT;
747		ret = ilbd_hc_restart_timer(hc, new_srv);
748		if (ret != ILB_STATUS_OK) {
749			free(new_srv);
750			return (ret);
751		}
752	} else {
753		new_srv->shc_status = ILB_HCS_DISABLED;
754	}
755
756	list_insert_tail(&hc_rule->hcr_servers, new_srv);
757	return (ILB_STATUS_OK);
758}
759
/*
 * Handy macro to cancel a server's timer.  If the server has a timer
 * (shc_tid is not -1), the timer is cancelled and shc_tid is reset to -1.
 * In any case hc_timer_restarted is set.
 *
 * The body is wrapped in do { } while (0) so that the macro acts as a
 * single statement wherever it is used, and its locals carry a prefix so
 * that they cannot shadow variables of the caller.
 */
#define	HC_CANCEL_TIMER(srv)						\
do {									\
	void *hct_arg;							\
	int hct_ret;							\
	if ((srv)->shc_tid != -1) {					\
		hct_ret = iu_cancel_timer(ilbd_hc_timer_q,		\
		    (srv)->shc_tid, &hct_arg);				\
		(srv)->shc_tid = -1;					\
		assert(hct_ret == 1);					\
		assert(hct_arg == (srv));				\
	}								\
	hc_timer_restarted = B_TRUE;					\
} while (0)
773
774/* Helper routine to dissociate a server from its hc object. */
775static ilb_status_t
776ilbd_hc_srv_rem(ilbd_hc_rule_t *hc_rule, const ilb_sg_srv_t *srv)
777{
778	ilbd_hc_srv_t *tmp_srv;
779
780	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
781	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
782		if (tmp_srv->shc_sg_srv == srv) {
783			list_remove(&hc_rule->hcr_servers, tmp_srv);
784			HC_CANCEL_TIMER(tmp_srv);
785			if (tmp_srv->shc_child_pid != 0)
786				ilbd_hc_kill_probe(tmp_srv);
787			free(tmp_srv);
788			return (ILB_STATUS_OK);
789		}
790	}
791	return (ILB_STATUS_ENOENT);
792}
793
794/* Helper routine to dissociate all servers of a rule from its hc object. */
795static void
796ilbd_hc_srv_rem_all(ilbd_hc_rule_t *hc_rule)
797{
798	ilbd_hc_srv_t *srv;
799
800	while ((srv = list_remove_head(&hc_rule->hcr_servers)) != NULL) {
801		HC_CANCEL_TIMER(srv);
802		if (srv->shc_child_pid != 0)
803			ilbd_hc_kill_probe(srv);
804		free(srv);
805	}
806}
807
808/* Associate a rule with its hc object. */
809ilb_status_t
810ilbd_hc_associate_rule(const ilbd_rule_t *rule, int ev_port)
811{
812	ilbd_hc_t	*hc;
813	ilbd_hc_rule_t	*hc_rule;
814	ilb_status_t	ret;
815	ilbd_sg_t	*sg;
816	ilbd_srv_t	*ilbd_srv;
817
818	/* The rule is assumed to be initialized appropriately. */
819	if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
820		logdebug("ilbd_hc_associate_rule: healthcheck %s does not "
821		    "exist", rule->irl_hcname);
822		return (ILB_STATUS_ENOHCINFO);
823	}
824	if ((hc->ihc_test_type == ILBD_HC_TCP &&
825	    rule->irl_proto != IPPROTO_TCP) ||
826	    (hc->ihc_test_type == ILBD_HC_UDP &&
827	    rule->irl_proto != IPPROTO_UDP)) {
828		return (ILB_STATUS_RULE_HC_MISMATCH);
829	}
830	if ((hc_rule = calloc(1, sizeof (ilbd_hc_rule_t))) == NULL) {
831		logdebug("ilbd_hc_associate_rule: out of memory");
832		return (ILB_STATUS_ENOMEM);
833	}
834
835	hc_rule->hcr_rule = rule;
836	list_create(&hc_rule->hcr_servers, sizeof (ilbd_hc_srv_t),
837	    offsetof(ilbd_hc_srv_t, shc_srv_link));
838
839	/* Add all the servers. */
840	sg = rule->irl_sg;
841	for (ilbd_srv = list_head(&sg->isg_srvlist); ilbd_srv != NULL;
842	    ilbd_srv = list_next(&sg->isg_srvlist, ilbd_srv)) {
843		if ((ret = ilbd_hc_srv_add(hc, hc_rule, &ilbd_srv->isv_srv,
844		    ev_port)) != ILB_STATUS_OK) {
845			/* Remove all previously added servers */
846			ilbd_hc_srv_rem_all(hc_rule);
847			free(hc_rule);
848			return (ret);
849		}
850	}
851	list_insert_tail(&hc->ihc_rules, hc_rule);
852	hc->ihc_rule_cnt++;
853
854	return (ILB_STATUS_OK);
855}
856
857/* Dissociate a rule from its hc object. */
858ilb_status_t
859ilbd_hc_dissociate_rule(const ilbd_rule_t *rule)
860{
861	ilbd_hc_t	*hc;
862	ilbd_hc_rule_t	*hc_rule;
863
864	/* The rule is assumed to be initialized appropriately. */
865	if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
866		logdebug("ilbd_hc_dissociate_rule: healthcheck %s does not "
867		    "exist", rule->irl_hcname);
868		return (ILB_STATUS_ENOENT);
869	}
870	for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
871	    hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
872		if (hc_rule->hcr_rule == rule)
873			break;
874	}
875	if (hc_rule == NULL) {
876		logdebug("ilbd_hc_dissociate_rule: rule %s is not associated "
877		    "with healtcheck %s", rule->irl_hcname, hc->ihc_name);
878		return (ILB_STATUS_ENOENT);
879	}
880	ilbd_hc_srv_rem_all(hc_rule);
881	list_remove(&hc->ihc_rules, hc_rule);
882	hc->ihc_rule_cnt--;
883	return (ILB_STATUS_OK);
884}
885
886/*
887 * Given a hc object name and a rule, check to see if the rule is associated
888 * with the hc object.  If it is, the hc object is returned in **hc and the
889 * ilbd_hc_rule_t is returned in **hc_rule.
890 */
891static boolean_t
892ilbd_hc_check_rule(const char *hc_name, const ilbd_rule_t *rule,
893    ilbd_hc_t **hc, ilbd_hc_rule_t **hc_rule)
894{
895	ilbd_hc_t	*tmp_hc;
896	ilbd_hc_rule_t	*tmp_hc_rule;
897
898	if ((tmp_hc = ilbd_get_hc(hc_name)) == NULL)
899		return (B_FALSE);
900	for (tmp_hc_rule = list_head(&tmp_hc->ihc_rules); tmp_hc_rule != NULL;
901	    tmp_hc_rule = list_next(&tmp_hc->ihc_rules, tmp_hc_rule)) {
902		if (tmp_hc_rule->hcr_rule == rule) {
903			*hc = tmp_hc;
904			*hc_rule = tmp_hc_rule;
905			return (B_TRUE);
906		}
907	}
908	return (B_FALSE);
909}
910
911/* Associate a server with its hc object. */
912ilb_status_t
913ilbd_hc_add_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
914    int ev_port)
915{
916	ilbd_hc_t	*hc;
917	ilbd_hc_rule_t	*hc_rule;
918
919	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
920		return (ILB_STATUS_ENOENT);
921	return (ilbd_hc_srv_add(hc, hc_rule, srv, ev_port));
922}
923
924/* Dissociate a server from its hc object. */
925ilb_status_t
926ilbd_hc_del_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
927{
928	ilbd_hc_t	*hc;
929	ilbd_hc_rule_t	*hc_rule;
930
931	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
932		return (ILB_STATUS_ENOENT);
933	return (ilbd_hc_srv_rem(hc_rule, srv));
934}
935
936/* Helper routine to enable/disable a server's hc probe. */
937static ilb_status_t
938ilbd_hc_toggle_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
939    boolean_t enable)
940{
941	ilbd_hc_t	*hc;
942	ilbd_hc_rule_t	*hc_rule;
943	ilbd_hc_srv_t	*tmp_srv;
944	ilb_status_t	ret;
945
946	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
947		return (ILB_STATUS_ENOENT);
948	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
949	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
950		if (tmp_srv->shc_sg_srv != srv) {
951			continue;
952		}
953		if (enable) {
954			if (tmp_srv->shc_status == ILB_HCS_DISABLED) {
955				ret = ilbd_hc_restart_timer(hc, tmp_srv);
956				if (ret != ILB_STATUS_OK) {
957					logerr("%s: cannot start timers for "
958					    "rule %s server %s", __func__,
959					    rule->irl_name,
960					    tmp_srv->shc_sg_srv->sgs_srvID);
961					return (ret);
962				}
963				/* Start from fresh... */
964				tmp_srv->shc_status = ILB_HCS_UNINIT;
965				tmp_srv->shc_rtt = 0;
966				tmp_srv->shc_fail_cnt = 0;
967			}
968		} else {
969			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
970				tmp_srv->shc_status = ILB_HCS_DISABLED;
971				HC_CANCEL_TIMER(tmp_srv);
972				if (tmp_srv->shc_child_pid != 0)
973					ilbd_hc_kill_probe(tmp_srv);
974			}
975		}
976		return (ILB_STATUS_OK);
977	}
978	return (ILB_STATUS_ENOENT);
979}
980
981ilb_status_t
982ilbd_hc_enable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
983{
984	return (ilbd_hc_toggle_server(rule, srv, B_TRUE));
985}
986
987ilb_status_t
988ilbd_hc_disable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
989{
990	return (ilbd_hc_toggle_server(rule, srv, B_FALSE));
991}
992
993/*
994 * Helper routine to enable/disable a rule's hc probe (including all its
995 * servers).
996 */
997static ilb_status_t
998ilbd_hc_toggle_rule(const ilbd_rule_t *rule, boolean_t enable)
999{
1000	ilbd_hc_t	*hc;
1001	ilbd_hc_rule_t	*hc_rule;
1002	ilbd_hc_srv_t	*tmp_srv;
1003	int		ret;
1004
1005	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
1006		return (ILB_STATUS_ENOENT);
1007
1008	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
1009	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
1010		if (enable) {
1011			/*
1012			 * If the server is disabled in the rule, do not
1013			 * restart its timer.
1014			 */
1015			if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1016			    ILB_IS_SRV_ENABLED(
1017			    tmp_srv->shc_sg_srv->sgs_flags)) {
1018				ret = ilbd_hc_restart_timer(hc, tmp_srv);
1019				if (ret != ILB_STATUS_OK) {
1020					logerr("%s: cannot start timers for "
1021					    "rule %s server %s", __func__,
1022					    rule->irl_name,
1023					    tmp_srv->shc_sg_srv->sgs_srvID);
1024					goto rollback;
1025				} else {
1026					/* Start from fresh... */
1027					tmp_srv->shc_status = ILB_HCS_UNINIT;
1028					tmp_srv->shc_rtt = 0;
1029					tmp_srv->shc_fail_cnt = 0;
1030				}
1031			}
1032		} else {
1033			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1034				HC_CANCEL_TIMER(tmp_srv);
1035				tmp_srv->shc_status = ILB_HCS_DISABLED;
1036				if (tmp_srv->shc_child_pid != 0)
1037					ilbd_hc_kill_probe(tmp_srv);
1038			}
1039		}
1040	}
1041	return (ILB_STATUS_OK);
1042rollback:
1043	enable = !enable;
1044	for (tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv);
1045	    tmp_srv != NULL;
1046	    tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv)) {
1047		if (enable) {
1048			if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1049			    ILB_IS_SRV_ENABLED(
1050			    tmp_srv->shc_sg_srv->sgs_flags)) {
1051				(void) ilbd_hc_restart_timer(hc, tmp_srv);
1052				tmp_srv->shc_status = ILB_HCS_UNINIT;
1053				tmp_srv->shc_rtt = 0;
1054				tmp_srv->shc_fail_cnt = 0;
1055			}
1056		} else {
1057			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1058				HC_CANCEL_TIMER(tmp_srv);
1059				tmp_srv->shc_status = ILB_HCS_DISABLED;
1060				if (tmp_srv->shc_child_pid != 0)
1061					ilbd_hc_kill_probe(tmp_srv);
1062			}
1063		}
1064	}
1065	return (ret);
1066}
1067
1068ilb_status_t
1069ilbd_hc_enable_rule(const ilbd_rule_t *rule)
1070{
1071	return (ilbd_hc_toggle_rule(rule, B_TRUE));
1072}
1073
1074ilb_status_t
1075ilbd_hc_disable_rule(const ilbd_rule_t *rule)
1076{
1077	return (ilbd_hc_toggle_rule(rule, B_FALSE));
1078}
1079
1080static const char *
1081topo_2_str(ilb_topo_t topo)
1082{
1083	switch (topo) {
1084	case ILB_TOPO_DSR:
1085		return ("DSR");
1086		break;
1087	case ILB_TOPO_NAT:
1088		return ("NAT");
1089		break;
1090	case ILB_TOPO_HALF_NAT:
1091		return ("HALF_NAT");
1092		break;
1093	default:
1094		/* Should not happen. */
1095		logerr("%s: unknown topology", __func__);
1096		break;
1097	}
1098	return ("");
1099}
1100
1101/*
1102 * Create the argument list to be passed to a hc probe command.
1103 * The passed in argv is assumed to have HC_PROBE_ARGC elements.
1104 */
1105static boolean_t
1106create_argv(ilbd_hc_srv_t *srv, char *argv[])
1107{
1108	char buf[INET6_ADDRSTRLEN];
1109	ilbd_rule_t const *rule;
1110	ilb_sg_srv_t const *sg_srv;
1111	struct in_addr v4_addr;
1112	in_port_t port;
1113	int i;
1114
1115	rule = srv->shc_hc_rule->hcr_rule;
1116	sg_srv = srv->shc_sg_srv;
1117
1118	if (srv->shc_state == ilbd_hc_def_pinging) {
1119		if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL)
1120			return (B_FALSE);
1121	} else {
1122		switch (srv->shc_hc->ihc_test_type) {
1123		case ILBD_HC_USER:
1124			if ((argv[0] = strdup(srv->shc_hc->ihc_test)) == NULL)
1125				return (B_FALSE);
1126			break;
1127		case ILBD_HC_TCP:
1128		case ILBD_HC_UDP:
1129			if ((argv[0] = strdup(ILB_PROBE_PROTO)) ==
1130			    NULL) {
1131				return (B_FALSE);
1132			}
1133			break;
1134		case ILBD_HC_PING:
1135			if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL) {
1136				return (B_FALSE);
1137			}
1138			break;
1139		}
1140	}
1141
1142	/*
1143	 * argv[1] is the VIP.
1144	 *
1145	 * Right now, the VIP and the backend server addresses should be
1146	 * in the same IP address family.  Here we don't do that in case
1147	 * this assumption is changed in future.
1148	 */
1149	if (IN6_IS_ADDR_V4MAPPED(&rule->irl_vip)) {
1150		IN6_V4MAPPED_TO_INADDR(&rule->irl_vip, &v4_addr);
1151		if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1152			goto cleanup;
1153	} else {
1154		if (inet_ntop(AF_INET6, &rule->irl_vip, buf,
1155		    sizeof (buf)) == NULL) {
1156			goto cleanup;
1157		}
1158	}
1159	if ((argv[1] = strdup(buf)) == NULL)
1160		goto cleanup;
1161
1162	/*
1163	 * argv[2] is the backend server address.
1164	 */
1165	if (IN6_IS_ADDR_V4MAPPED(&sg_srv->sgs_addr)) {
1166		IN6_V4MAPPED_TO_INADDR(&sg_srv->sgs_addr, &v4_addr);
1167		if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1168			goto cleanup;
1169	} else {
1170		if (inet_ntop(AF_INET6, &sg_srv->sgs_addr, buf,
1171		    sizeof (buf)) == NULL) {
1172			goto cleanup;
1173		}
1174	}
1175	if ((argv[2] = strdup(buf)) == NULL)
1176		goto cleanup;
1177
1178	/*
1179	 * argv[3] is the transport protocol used in the rule.
1180	 */
1181	switch (rule->irl_proto) {
1182	case IPPROTO_TCP:
1183		argv[3] = strdup("TCP");
1184		break;
1185	case IPPROTO_UDP:
1186		argv[3] = strdup("UDP");
1187		break;
1188	default:
1189		logerr("%s: unknown protocol", __func__);
1190		goto cleanup;
1191		break;
1192	}
1193	if (argv[3] == NULL)
1194		goto cleanup;
1195
1196	/*
1197	 * argv[4] is the load balance mode, DSR, NAT, HALF-NAT.
1198	 */
1199	if ((argv[4] = strdup(topo_2_str(rule->irl_topo))) == NULL)
1200		goto cleanup;
1201
1202	/*
1203	 * argv[5] is the port range.  Right now, there should only be 1 port.
1204	 */
1205	switch (rule->irl_hcpflag) {
1206	case ILB_HCI_PROBE_FIX:
1207		port = ntohs(rule->irl_hcport);
1208		break;
1209	case ILB_HCI_PROBE_ANY: {
1210		in_port_t min, max;
1211
1212		if (ntohs(sg_srv->sgs_minport) == 0) {
1213			min = ntohs(rule->irl_minport);
1214			max = ntohs(rule->irl_maxport);
1215		} else {
1216			min = ntohs(sg_srv->sgs_minport);
1217			max = ntohs(sg_srv->sgs_maxport);
1218		}
1219		if (max > min)
1220			port = min + gethrtime() % (max - min + 1);
1221		else
1222			port = min;
1223		break;
1224	}
1225	default:
1226		logerr("%s: unknown HC flag", __func__);
1227		goto cleanup;
1228		break;
1229	}
1230	(void) sprintf(buf, "%d", port);
1231	if ((argv[5] = strdup(buf)) == NULL)
1232		goto cleanup;
1233
1234	/*
1235	 * argv[6] is the probe timeout.
1236	 */
1237	(void) sprintf(buf, "%d", srv->shc_hc->ihc_timeout);
1238	if ((argv[6] = strdup(buf)) == NULL)
1239		goto cleanup;
1240
1241	argv[7] = NULL;
1242	return (B_TRUE);
1243
1244cleanup:
1245	for (i = 0; i < HC_PROBE_ARGC; i++) {
1246		if (argv[i] != NULL)
1247			free(argv[i]);
1248	}
1249	return (B_FALSE);
1250}
1251
/*
 * Free every string of a NULL terminated argument list.  Each freed slot
 * is reset to NULL, so calling this routine a second time on the same
 * array is a harmless no-op instead of a double free.
 */
static void
destroy_argv(char *argv[])
{
	int i;

	for (i = 0; argv[i] != NULL; i++) {
		free(argv[i]);
		argv[i] = NULL;
	}
}
1260
1261/* Spawn a process to run the hc probe on the given server. */
1262static boolean_t
1263ilbd_run_probe(ilbd_hc_srv_t *srv)
1264{
1265	posix_spawn_file_actions_t	fd_actions;
1266	posix_spawnattr_t		attr;
1267	sigset_t			child_sigset;
1268	int				fds[2];
1269	int				fdflags;
1270	pid_t				pid;
1271	char				*child_argv[HC_PROBE_ARGC];
1272	ilbd_hc_probe_event_t		*probe_ev;
1273	char				*probe_name;
1274
1275	bzero(child_argv, HC_PROBE_ARGC * sizeof (char *));
1276	if ((probe_ev = calloc(1, sizeof (*probe_ev))) == NULL) {
1277		logdebug("ilbd_run_probe: calloc");
1278		return (B_FALSE);
1279	}
1280
1281	/* Set up a pipe to get output from probe command. */
1282	if (pipe(fds) < 0) {
1283		logdebug("ilbd_run_probe: cannot create pipe");
1284		free(probe_ev);
1285		return (B_FALSE);
1286	}
1287	/* Set our side of the pipe to be non-blocking */
1288	if ((fdflags = fcntl(fds[0], F_GETFL, 0)) == -1) {
1289		logdebug("ilbd_run_probe: fcntl(F_GETFL)");
1290		goto cleanup;
1291	}
1292	if (fcntl(fds[0], F_SETFL, fdflags | O_NONBLOCK) == -1) {
1293		logdebug("ilbd_run_probe: fcntl(F_SETFL)");
1294		goto cleanup;
1295	}
1296
1297	if (posix_spawn_file_actions_init(&fd_actions) != 0) {
1298		logdebug("ilbd_run_probe: posix_spawn_file_actions_init");
1299		goto cleanup;
1300	}
1301	if (posix_spawnattr_init(&attr) != 0) {
1302		logdebug("ilbd_run_probe: posix_spawnattr_init");
1303		goto cleanup;
1304	}
1305	if (posix_spawn_file_actions_addclose(&fd_actions, fds[0]) != 0) {
1306		logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1307		goto cleanup;
1308	}
1309	if (posix_spawn_file_actions_adddup2(&fd_actions, fds[1],
1310	    STDOUT_FILENO) != 0) {
1311		logdebug("ilbd_run_probe: posix_spawn_file_actions_dup2");
1312		goto cleanup;
1313	}
1314	if (posix_spawn_file_actions_addclose(&fd_actions, fds[1]) != 0) {
1315		logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1316		goto cleanup;
1317	}
1318
1319	/* Reset all signal handling of the child to default. */
1320	(void) sigfillset(&child_sigset);
1321	if (posix_spawnattr_setsigdefault(&attr, &child_sigset) != 0) {
1322		logdebug("ilbd_run_probe: posix_spawnattr_setsigdefault");
1323		goto cleanup;
1324	}
1325	/* Don't want SIGCHLD. */
1326	if (posix_spawnattr_setflags(&attr, POSIX_SPAWN_NOSIGCHLD_NP|
1327	    POSIX_SPAWN_SETSIGDEF) != 0) {
1328		logdebug("ilbd_run_probe: posix_spawnattr_setflags");
1329		goto cleanup;
1330	}
1331
1332	if (!create_argv(srv, child_argv)) {
1333		logdebug("ilbd_run_probe: create_argv");
1334		goto cleanup;
1335	}
1336
1337	/*
1338	 * If we are doing default pinging or not using a user supplied
1339	 * probe, we should execute our standard supplied probe.  The
1340	 * supplied probe command handles all types of probes.  And the
1341	 * type used depends on argv[0], as filled in by create_argv().
1342	 */
1343	if (srv->shc_state == ilbd_hc_def_pinging ||
1344	    srv->shc_hc->ihc_test_type != ILBD_HC_USER) {
1345		probe_name = ILB_PROBE_PROTO;
1346	} else {
1347		probe_name = srv->shc_hc->ihc_test;
1348	}
1349	if (posix_spawn(&pid, probe_name, &fd_actions, &attr, child_argv,
1350	    NULL) != 0) {
1351		logerr("%s: posix_spawn: %s for server %s: %s", __func__,
1352		    srv->shc_hc->ihc_test, srv->shc_sg_srv->sgs_srvID,
1353		    strerror(errno));
1354		goto cleanup;
1355	}
1356
1357	(void) close(fds[1]);
1358	destroy_argv(child_argv);
1359	srv->shc_child_pid = pid;
1360	srv->shc_child_fd = fds[0];
1361	srv->shc_ev = probe_ev;
1362
1363	probe_ev->ihp_ev = ILBD_EVENT_PROBE;
1364	probe_ev->ihp_srv = srv;
1365	probe_ev->ihp_pid = pid;
1366	if (port_associate(srv->shc_ev_port, PORT_SOURCE_FD, fds[0],
1367	    POLLRDNORM, probe_ev) != 0) {
1368		/*
1369		 * Need to kill the child.  It will free the srv->shc_ev,
1370		 * which is probe_ev.  So set probe_ev to NULL.
1371		 */
1372		ilbd_hc_kill_probe(srv);
1373		probe_ev = NULL;
1374		goto cleanup;
1375	}
1376
1377	return (B_TRUE);
1378
1379cleanup:
1380	(void) close(fds[0]);
1381	(void) close(fds[1]);
1382	destroy_argv(child_argv);
1383	if (probe_ev != NULL)
1384		free(probe_ev);
1385	return (B_FALSE);
1386}
1387
1388/*
1389 * Called by ild_hc_probe_return() to re-associate the fd to a child to
1390 * the event port.
1391 */
1392static void
1393reassociate_port(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1394{
1395	if (port_associate(ev_port, PORT_SOURCE_FD, fd,
1396	    POLLRDNORM, ev) != 0) {
1397		/*
1398		 * If we cannot reassociate with the port, the only
1399		 * thing we can do now is to kill the child and
1400		 * do a blocking wait here...
1401		 */
1402		logdebug("%s: port_associate: %s", __func__, strerror(errno));
1403		if (kill(ev->ihp_pid, SIGKILL) != 0)
1404			logerr("%s: kill: %s", __func__, strerror(errno));
1405		if (waitpid(ev->ihp_pid, NULL, 0) != ev->ihp_pid)
1406			logdebug("%s: waitpid: %s", __func__, strerror(errno));
1407		free(ev);
1408	}
1409}
1410
1411/*
1412 * To handle a child probe process hanging up.
1413 */
1414static void
1415ilbd_hc_child_hup(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1416{
1417	ilbd_hc_srv_t *srv;
1418	pid_t ret_pid;
1419	int ret;
1420
1421	srv = ev->ihp_srv;
1422
1423	if (!ev->ihp_done) {
1424		/* ilbd does not care about this process anymore ... */
1425		ev->ihp_done = B_TRUE;
1426		srv->shc_ev = NULL;
1427		srv->shc_child_pid = 0;
1428		HC_CANCEL_TIMER(srv);
1429		ilbd_set_fail_state(srv);
1430	}
1431	ret_pid = waitpid(ev->ihp_pid, &ret, WNOHANG);
1432	switch (ret_pid) {
1433	case -1:
1434		logperror("ilbd_hc_child_hup: waitpid");
1435		/* FALLTHROUGH */
1436	case 0:
1437		/* The child has not completed the exit. Wait again. */
1438		reassociate_port(ev_port, fd, ev);
1439		break;
1440	default:
1441		/* Right now, we just ignore the exit status. */
1442		if (WIFEXITED(ret))
1443			ret = WEXITSTATUS(ret);
1444		(void) close(fd);
1445		free(ev);
1446	}
1447}
1448
1449/*
1450 * To read the output of a child probe process.
1451 */
1452static void
1453ilbd_hc_child_data(int fd, ilbd_hc_probe_event_t *ev)
1454{
1455	ilbd_hc_srv_t *srv;
1456	char buf[HC_MAX_PROBE_OUTPUT];
1457	int ret;
1458	int64_t rtt;
1459
1460	srv = ev->ihp_srv;
1461
1462	bzero(buf, HC_MAX_PROBE_OUTPUT);
1463	ret = read(fd, buf, HC_MAX_PROBE_OUTPUT - 1);
1464	/* Should not happen since event port should have caught this. */
1465	assert(ret > 0);
1466
1467	/*
1468	 * We expect the probe command to print out the RTT only.  But
1469	 * the command may misbehave and print out more than what we intend to
1470	 * read in.  So need to do this check below to "flush" out all the
1471	 * output from the command.
1472	 */
1473	if (!ev->ihp_done) {
1474		ev->ihp_done = B_TRUE;
1475		/* We don't need to know about this event anymore. */
1476		srv->shc_ev = NULL;
1477		srv->shc_child_pid = 0;
1478		HC_CANCEL_TIMER(srv);
1479	} else {
1480		return;
1481	}
1482
1483	rtt = strtoll(buf, NULL, 10);
1484
1485	/*
1486	 * -1 means the server is dead or the probe somehow fails.  Treat
1487	 * them both as server is dead.
1488	 */
1489	if (rtt == -1) {
1490		ilbd_set_fail_state(srv);
1491		return;
1492	} else if (rtt > 0) {
1493		/* If the returned RTT value is not valid, just ignore it. */
1494		if (rtt > 0 && rtt <= UINT_MAX) {
1495			/* Set rtt to be the simple smoothed average. */
1496			if (srv->shc_rtt == 0) {
1497				srv->shc_rtt = rtt;
1498			} else {
1499				srv->shc_rtt = 3 * ((srv)->shc_rtt >> 2) +
1500				    (rtt >> 2);
1501			}
1502		}
1503
1504	}
1505
1506	switch (srv->shc_state) {
1507	case ilbd_hc_def_pinging:
1508		srv->shc_state = ilbd_hc_probing;
1509
1510		/* Ping is OK, now start the probe. */
1511		ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
1512		break;
1513	case ilbd_hc_probing:
1514		srv->shc_fail_cnt = 0;
1515
1516		/* Server is dead before, re-enable it. */
1517		if (srv->shc_status == ILB_HCS_UNREACH ||
1518		    srv->shc_status == ILB_HCS_DEAD) {
1519			/*
1520			 * If enabling the server in kernel fails now,
1521			 * hopefully when the timer fires again later, the
1522			 * enabling can be done.
1523			 */
1524			if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
1525			    srv->shc_hc_rule->hcr_rule->irl_name,
1526			    stat_declare_srv_alive) != ILB_STATUS_OK) {
1527				logerr("%s: cannot enable server in kernel: "
1528				    " rule %s server %s", __func__,
1529				    srv->shc_hc_rule->hcr_rule->irl_name,
1530				    srv->shc_sg_srv->sgs_srvID);
1531			} else {
1532				srv->shc_status = ILB_HCS_ALIVE;
1533			}
1534		} else {
1535			srv->shc_status = ILB_HCS_ALIVE;
1536		}
1537		if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
1538			logerr("%s: cannot restart timer: rule %s server %s",
1539			    __func__, srv->shc_hc_rule->hcr_rule->irl_name,
1540			    srv->shc_sg_srv->sgs_srvID);
1541			ilbd_mark_server_disabled(srv);
1542		}
1543		break;
1544	default:
1545		logdebug("%s: unknown state", __func__);
1546		break;
1547	}
1548}
1549
1550/*
1551 * Handle the return event of a child probe fd.
1552 */
1553void
1554ilbd_hc_probe_return(int ev_port, int fd, int port_events,
1555    ilbd_hc_probe_event_t *ev)
1556{
1557	/*
1558	 * Note that there can be more than one events delivered to us at
1559	 * the same time.  So we need to check them individually.
1560	 */
1561	if (port_events & POLLRDNORM)
1562		ilbd_hc_child_data(fd, ev);
1563
1564	if (port_events & (POLLHUP|POLLERR)) {
1565		ilbd_hc_child_hup(ev_port, fd, ev);
1566		return;
1567	}
1568
1569	/*
1570	 * Re-associate the fd with the port so that when the child
1571	 * exits, we can reap the status.
1572	 */
1573	reassociate_port(ev_port, fd, ev);
1574}
1575