mpd_main.c revision 11042:2d6e217af1b4
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include "mpd_defs.h"
27#include "mpd_tables.h"
28
int debug = 0;				/* Debug flag bitmask; tested against D_* bits */
static int pollfd_num = 0;		/* Num. of slots in pollfds, used or not */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors; a slot */
					/* whose fd is -1 is unused */
					/* All times below in ms */
int	user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int	user_probe_interval;		/* derived from user specified fdt */
36
/*
 * Structure to store mib2 information returned by the kernel.
 * This is used to process routing table information.
 * Items are chained into a singly linked list through mi_next.
 */
typedef struct mib_item_s {
	struct mib_item_s	*mi_next;	/* next item in the list */
	struct opthdr		mi_opthdr;	/* option header of this item */
	void			*mi_valp;	/* presumably the item's data; */
						/* TODO confirm in mibwalk() */
} mib_item_t;
46
static int	rtsock_v4;		/* AF_INET routing socket */
static int	rtsock_v6;		/* AF_INET6 routing socket */
int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
static int	lsock_v4;		/* Listen socket to detect mpathd */
static int	lsock_v6;		/* Listen socket to detect mpathd */
static int	mibfd = -1;		/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

static uint_t	last_initifs_time;	/* Time when initifs was last run, */
					/* in ms as given by getcurrenttime() */
static	char **argv0;			/* Saved for re-exec on SIGHUP */
/* NOTE(review): presumably enables link up/down event handling -- confirm */
boolean_t handle_link_notifications = _B_TRUE;
static int	ipRouteEntrySize;	/* Size of IPv4 route entry */
static int	ipv6RouteEntrySize;	/* Size of IPv6 route entry */
61
/*
 * Forward declarations of the file-local functions, so that they can be
 * called ahead of their definitions.
 */
static void	initlog(void);
static void	run_timeouts(void);
static void	initifs(void);
static void	check_if_removed(struct phyint_instance *pii);
static void	select_test_ifs(void);
static void	update_router_list(mib_item_t *item);
static void	mib_get_constants(mib_item_t *item);
static int	mibwalk(void (*proc)(mib_item_t *));
static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
static void	router_add_common(int af, char *ifname,
    struct in6_addr nexthop);
/* Declared with an empty (unspecified) parameter list, unlike its peers. */
static void	init_router_targets();
static void	cleanup(void);
static int	setup_listener(int af);
static void	check_config(void);
static void	check_testconfig(void);
static void	check_addr_unique(struct phyint_instance *,
    struct sockaddr_storage *);
static void	init_host_targets(void);
static void	dup_host_targets(struct phyint_instance *desired_pii);
static void	loopback_cmd(int sock, int family);
static boolean_t daemonize(void);
static int	closefunc(void *, int);
static unsigned int process_cmd(int newfd, union mi_commands *mpi);
static unsigned int process_query(int fd, mi_query_t *miq);
static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop);
static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
static unsigned int send_result(int fd, unsigned int error, int syserror);
93
/*
 * List of local addresses.  initifs() frees it at the start of every scan
 * and re-adds each address whose interface flags include IFF_UP.
 */
addrlist_t *localaddrs;
95
96/*
97 * Return the current time in milliseconds (from an arbitrary reference)
98 * truncated to fit into an int. Truncation is ok since we are interested
99 * only in differences and not the absolute values.
100 */
101uint_t
102getcurrenttime(void)
103{
104	uint_t	cur_time;	/* In ms */
105
106	/*
107	 * Use of a non-user-adjustable source of time is
108	 * required. However millisecond precision is sufficient.
109	 * divide by 10^6
110	 */
111	cur_time = (uint_t)(gethrtime() / 1000000LL);
112	return (cur_time);
113}
114
115uint64_t
116getcurrentsec(void)
117{
118	return (gethrtime() / NANOSEC);
119}
120
121/*
122 * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
123 */
124int
125poll_add(int fd)
126{
127	int i;
128	int new_num;
129	struct pollfd *newfds;
130retry:
131	/* Check if already present */
132	for (i = 0; i < pollfd_num; i++) {
133		if (pollfds[i].fd == fd)
134			return (0);
135	}
136	/* Check for empty spot already present */
137	for (i = 0; i < pollfd_num; i++) {
138		if (pollfds[i].fd == -1) {
139			pollfds[i].fd = fd;
140			return (0);
141		}
142	}
143
144	/* Allocate space for 32 more fds and initialize to -1 */
145	new_num = pollfd_num + 32;
146	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
147	if (newfds == NULL) {
148		logperror("poll_add: realloc");
149		return (-1);
150	}
151	for (i = pollfd_num; i < new_num; i++) {
152		newfds[i].fd = -1;
153		newfds[i].events = POLLIN;
154	}
155	pollfd_num = new_num;
156	pollfds = newfds;
157	goto retry;
158}
159
160/*
161 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
162 */
163int
164poll_remove(int fd)
165{
166	int i;
167
168	/* Check if already present */
169	for (i = 0; i < pollfd_num; i++) {
170		if (pollfds[i].fd == fd) {
171			pollfds[i].fd = -1;
172			return (0);
173		}
174	}
175	return (-1);
176}
177
178/*
179 * Extract information about the phyint instance. If the phyint instance still
180 * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
181 * will use it to detect phyint instances that don't exist any longer and
182 * remove them, from our database of phyint instances.
183 * Return value:
184 *	returns true if the phyint instance exists in the kernel,
185 *	returns false otherwise
186 */
187static boolean_t
188pii_process(int af, char *name, struct phyint_instance **pii_p)
189{
190	int err;
191	struct phyint_instance *pii;
192	struct phyint_instance *pii_other;
193
194	if (debug & D_PHYINT)
195		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
196
197	pii = phyint_inst_lookup(af, name);
198	if (pii == NULL) {
199		/*
200		 * Phyint instance does not exist in our tables,
201		 * create new phyint instance
202		 */
203		pii = phyint_inst_init_from_k(af, name);
204	} else {
205		/* Phyint exists in our tables */
206		err = phyint_inst_update_from_k(pii);
207
208		switch (err) {
209		case PI_IOCTL_ERROR:
210			/* Some ioctl error. don't change anything */
211			pii->pii_in_use = 1;
212			break;
213
214		case PI_GROUP_CHANGED:
215		case PI_IFINDEX_CHANGED:
216			/*
217			 * Interface index or group membership has changed.
218			 * Delete the old state and recreate based on the new
219			 * state (it may no longer be in a group).
220			 */
221			pii_other = phyint_inst_other(pii);
222			if (pii_other != NULL)
223				phyint_inst_delete(pii_other);
224			phyint_inst_delete(pii);
225			pii = phyint_inst_init_from_k(af, name);
226			break;
227
228		case PI_DELETED:
229			/* Phyint instance has disappeared from kernel */
230			pii->pii_in_use = 0;
231			break;
232
233		case PI_OK:
234			/* Phyint instance exists and is fine */
235			pii->pii_in_use = 1;
236			break;
237
238		default:
239			/* Unknown status */
240			logerr("pii_process: Unknown status %d\n", err);
241			break;
242		}
243	}
244
245	*pii_p = pii;
246	if (pii != NULL)
247		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
248	else
249		return (_B_FALSE);
250}
251
252/*
253 * Scan all interfaces to detect changes as well as new and deleted interfaces
254 */
255static void
256initifs()
257{
258	int	i, nlifr;
259	int	af;
260	char	*cp;
261	char	*buf;
262	int	sockfd;
263	uint64_t	flags;
264	struct lifnum	lifn;
265	struct lifconf	lifc;
266	struct lifreq	lifreq;
267	struct lifreq	*lifr;
268	struct logint	*li;
269	struct phyint_instance *pii;
270	struct phyint_instance *next_pii;
271	struct phyint_group *pg, *next_pg;
272	char		pi_name[LIFNAMSIZ + 1];
273
274	if (debug & D_PHYINT)
275		logdebug("initifs: Scanning interfaces\n");
276
277	last_initifs_time = getcurrenttime();
278
279	/*
280	 * Free the existing local address list; we'll build a new list below.
281	 */
282	addrlist_free(&localaddrs);
283
284	/*
285	 * Mark the interfaces so that we can find phyints and logints
286	 * which have disappeared from the kernel. pii_process() and
287	 * logint_init_from_k() will set {pii,li}_in_use when they find
288	 * the interface in the kernel. Also, clear dupaddr bit on probe
289	 * logint. check_addr_unique() will set the dupaddr bit on the
290	 * probe logint, if the testaddress is not unique.
291	 */
292	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
293		pii->pii_in_use = 0;
294		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
295			li->li_in_use = 0;
296			if (pii->pii_probe_logint == li)
297				li->li_dupaddr = 0;
298		}
299	}
300
301	/*
302	 * As above, mark groups so that we can detect IPMP interfaces which
303	 * have been removed from the kernel.  Also, delete the group address
304	 * list since we'll iteratively recreate it below.
305	 */
306	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
307		pg->pg_in_use = _B_FALSE;
308		addrlist_free(&pg->pg_addrs);
309	}
310
311	lifn.lifn_family = AF_UNSPEC;
312	lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
313again:
314	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
315		logperror("initifs: ioctl (get interface count)");
316		return;
317	}
318	/*
319	 * Pad the interface count to detect when additional interfaces have
320	 * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
321	 */
322	lifn.lifn_count += 4;
323
324	if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
325		logperror("initifs: calloc");
326		return;
327	}
328
329	lifc.lifc_family = AF_UNSPEC;
330	lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
331	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
332	lifc.lifc_buf = buf;
333
334	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
335		logperror("initifs: ioctl (get interface configuration)");
336		free(buf);
337		return;
338	}
339
340	/*
341	 * If every lifr_req slot is taken, then additional interfaces must
342	 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
343	 * Recalculate to make sure we didn't miss any interfaces.
344	 */
345	nlifr = lifc.lifc_len / sizeof (struct lifreq);
346	if (nlifr >= lifn.lifn_count) {
347		free(buf);
348		goto again;
349	}
350
351	/*
352	 * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the
353	 * global list of addresses, phyint groups, phyints, and logints.
354	 */
355	for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) {
356		af = lifr->lifr_addr.ss_family;
357		sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
358		(void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);
359
360		if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
361			if (errno != ENXIO)
362				logperror("initifs: ioctl (SIOCGLIFFLAGS)");
363			continue;
364		}
365		flags = lifreq.lifr_flags;
366
367		/*
368		 * If the address is IFF_UP, add it to the local address list.
369		 * (We ignore addresses that aren't IFF_UP since another node
370		 * might legitimately have that address IFF_UP.)
371		 */
372		if (flags & IFF_UP) {
373			(void) addrlist_add(&localaddrs, lifr->lifr_name, flags,
374			    &lifr->lifr_addr);
375		}
376
377		/*
378		 * If this address is on an IPMP meta-interface, update our
379		 * phyint_group information (either by recording that group
380		 * still exists or creating a new group), and track what
381		 * group the address is part of.
382		 */
383		if (flags & IFF_IPMP) {
384			if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) {
385				if (errno != ENXIO)
386					logperror("initifs: ioctl "
387					    "(SIOCGLIFGROUPNAME)");
388				continue;
389			}
390
391			pg = phyint_group_lookup(lifreq.lifr_groupname);
392			if (pg == NULL) {
393				pg = phyint_group_create(lifreq.lifr_groupname);
394				if (pg == NULL) {
395					logerr("initifs: cannot create group "
396					    "%s\n", lifreq.lifr_groupname);
397					continue;
398				}
399				phyint_group_insert(pg);
400			}
401			pg->pg_in_use = _B_TRUE;
402
403			/*
404			 * Add this to the group's list of data addresses.
405			 */
406			if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags,
407			    &lifr->lifr_addr)) {
408				logerr("initifs: insufficient memory to track "
409				    "data address information for %s\n",
410				    lifr->lifr_name);
411			}
412			continue;
413		}
414
415		/*
416		 * This isn't an address on an IPMP meta-interface, so it's
417		 * either on an underlying interface or not related to any
418		 * group.  Update our phyint and logint information (via
419		 * pii_process() and logint_init_from_k()) -- but first,
420		 * convert the logint name to a phyint name so we can call
421		 * pii_process().
422		 */
423		(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
424		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
425			*cp = '\0';
426
427		if (pii_process(af, pi_name, &pii)) {
428			/* The phyint is fine. So process the logint */
429			logint_init_from_k(pii, lifr->lifr_name);
430			check_addr_unique(pii, &lifr->lifr_addr);
431		}
432	}
433	free(buf);
434
435	/*
436	 * Scan for groups, phyints and logints that have disappeared from the
437	 * kernel, and delete them.
438	 */
439	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
440		next_pii = pii->pii_next;
441		check_if_removed(pii);
442	}
443
444	for (pg = phyint_groups; pg != NULL; pg = next_pg) {
445		next_pg = pg->pg_next;
446		if (!pg->pg_in_use) {
447			phyint_group_delete(pg);
448			continue;
449		}
450		/*
451		 * Refresh the group's state.  This is necessary since the
452		 * group's state is defined by the set of usable interfaces in
453		 * the group, and an interface is considered unusable if all
454		 * of its addresses are down.  When an address goes down/up,
455		 * the RTM_DELADDR/RTM_NEWADDR brings us through here.
456		 */
457		phyint_group_refresh_state(pg);
458	}
459
460	/*
461	 * Select a test address for sending probes on each phyint instance
462	 */
463	select_test_ifs();
464
465	/*
466	 * Handle link up/down notifications.
467	 */
468	process_link_state_changes();
469}
470
471/*
472 * Check that a given test address is unique across all of the interfaces in a
473 * group.  (e.g., IPv6 link-locals may not be inherently unique, and binding
474 * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
475 * Any issues will be reported by check_testconfig().
476 */
477static void
478check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
479{
480	struct phyint		*pi;
481	struct phyint_group	*pg;
482	struct in6_addr		addr;
483	struct phyint_instance	*pii;
484	struct sockaddr_in	*sin;
485
486	if (ss->ss_family == AF_INET) {
487		sin = (struct sockaddr_in *)ss;
488		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
489	} else {
490		assert(ss->ss_family == AF_INET6);
491		addr = ((struct sockaddr_in6 *)ss)->sin6_addr;
492	}
493
494	/*
495	 * For anonymous groups, every interface is assumed to be on its own
496	 * link, so there is no chance of overlapping addresses.
497	 */
498	pg = ourpii->pii_phyint->pi_group;
499	if (pg == phyint_anongroup)
500		return;
501
502	/*
503	 * Walk the list of phyint instances in the group and check for test
504	 * addresses matching ours.  Of course, we skip ourself.
505	 */
506	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
507		pii = PHYINT_INSTANCE(pi, ss->ss_family);
508		if (pii == NULL || pii == ourpii ||
509		    pii->pii_probe_logint == NULL)
510			continue;
511
512		/*
513		 * If this test address is not unique, set the dupaddr bit.
514		 */
515		if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr))
516			pii->pii_probe_logint->li_dupaddr = 1;
517	}
518}
519
520/*
521 * Stop probing an interface.  Called when an interface is offlined.
522 * The probe socket is closed on each interface instance, and the
523 * interface state set to PI_OFFLINE.
524 */
525void
526stop_probing(struct phyint *pi)
527{
528	struct phyint_instance *pii;
529
530	pii = pi->pi_v4;
531	if (pii != NULL) {
532		if (pii->pii_probe_sock != -1)
533			close_probe_socket(pii, _B_TRUE);
534		pii->pii_probe_logint = NULL;
535	}
536
537	pii = pi->pi_v6;
538	if (pii != NULL) {
539		if (pii->pii_probe_sock != -1)
540			close_probe_socket(pii, _B_TRUE);
541		pii->pii_probe_logint = NULL;
542	}
543
544	phyint_chstate(pi, PI_OFFLINE);
545}
546
547enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
548
549/*
550 * Rate the provided test flags.  By definition, IFF_NOFAILOVER must be set.
551 * IFF_UP must also be set so that the associated address can be used as a
552 * source address.  Further, we must be able to exchange packets with local
553 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear.  For historical
554 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
555 */
556static int
557rate_testflags(uint64_t flags)
558{
559	if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
560		return (BAD_TESTFLAGS);
561
562	if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
563		return (BAD_TESTFLAGS);
564
565	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
566		return (BEST_TESTFLAGS);
567
568	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
569		return (BEST_TESTFLAGS);
570
571	return (OK_TESTFLAGS);
572}
573
574/*
575 * Attempt to select a test address for each phyint instance.
576 * Call phyint_inst_sockinit() to complete the initializations.
577 */
578static void
579select_test_ifs(void)
580{
581	struct phyint		*pi;
582	struct phyint_instance	*pii;
583	struct phyint_instance	*next_pii;
584	struct logint		*li;
585	struct logint  		*probe_logint;
586	boolean_t		target_scan_reqd = _B_FALSE;
587	int			rating;
588
589	if (debug & D_PHYINT)
590		logdebug("select_test_ifs\n");
591
592	/*
593	 * For each phyint instance, do the test address selection
594	 */
595	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
596		next_pii = pii->pii_next;
597		probe_logint = NULL;
598
599		/*
600		 * An interface that is offline should not be probed.
601		 * IFF_OFFLINE interfaces should always be PI_OFFLINE
602		 * unless some other entity has set the offline flag.
603		 */
604		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
605			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
606				logerr("shouldn't be probing offline"
607				    " interface %s (state is: %u)."
608				    " Stopping probes.\n",
609				    pii->pii_phyint->pi_name,
610				    pii->pii_phyint->pi_state);
611				stop_probing(pii->pii_phyint);
612			}
613			continue;
614		} else {
615			/*
616			 * If something cleared IFF_OFFLINE (e.g., by accident
617			 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
618			 * inherently racy), the phyint may still be offline.
619			 * Just ignore it.
620			 */
621			if (pii->pii_phyint->pi_state == PI_OFFLINE)
622				continue;
623		}
624
625		li = pii->pii_probe_logint;
626		if (li != NULL) {
627			/*
628			 * We've already got a test address; only proceed
629			 * if it's suboptimal.
630			 */
631			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
632				continue;
633		}
634
635		/*
636		 * Walk the logints of this phyint instance, and select
637		 * the best available test address
638		 */
639		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
640			/*
641			 * Skip 0.0.0.0 addresses, as those are never
642			 * actually usable.
643			 */
644			if (pii->pii_af == AF_INET &&
645			    IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr))
646				continue;
647
648			/*
649			 * Skip any IPv6 logints that are not link-local,
650			 * since we should always have a link-local address
651			 * anyway and in6_data() expects link-local replies.
652			 */
653			if (pii->pii_af == AF_INET6 &&
654			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
655				continue;
656
657			/*
658			 * Rate the testflags. If we've found an optimal
659			 * match, then break out; otherwise, record the most
660			 * recent OK one.
661			 */
662			rating = rate_testflags(li->li_flags);
663			if (rating == BAD_TESTFLAGS)
664				continue;
665
666			probe_logint = li;
667			if (rating == BEST_TESTFLAGS)
668				break;
669		}
670
671		/*
672		 * If the probe logint has changed, ditch the old one.
673		 */
674		if (pii->pii_probe_logint != NULL &&
675		    pii->pii_probe_logint != probe_logint) {
676			if (pii->pii_probe_sock != -1)
677				close_probe_socket(pii, _B_TRUE);
678			pii->pii_probe_logint = NULL;
679		}
680
681		if (probe_logint == NULL) {
682			/*
683			 * We don't have a test address; zero out the probe
684			 * stats array since it is no longer relevant.
685			 * Optimize by checking if it is already zeroed out.
686			 */
687			int pr_ndx;
688
689			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
690			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
691				clear_pii_probe_stats(pii);
692				reset_crtt_all(pii->pii_phyint);
693			}
694			continue;
695		} else if (probe_logint == pii->pii_probe_logint) {
696			/*
697			 * If we didn't find any new test addr, go to the
698			 * next phyint.
699			 */
700			continue;
701		}
702
703		/*
704		 * The phyint is either being assigned a new testaddr
705		 * or is being assigned a testaddr for the 1st time.
706		 * Need to initialize the phyint socket
707		 */
708		pii->pii_probe_logint = probe_logint;
709		if (!phyint_inst_sockinit(pii)) {
710			if (debug & D_PHYINT) {
711				logdebug("select_test_ifs: "
712				    "phyint_sockinit failed\n");
713			}
714			phyint_inst_delete(pii);
715			continue;
716		}
717
718		/*
719		 * This phyint instance is now enabled for probes; this
720		 * impacts our state machine in two ways:
721		 *
722		 * 1. If we're probe *capable* as well (i.e., we have
723		 *    probe targets) and the interface is in PI_NOTARGETS,
724		 *    then transition to PI_RUNNING.
725		 *
726		 * 2. If we're not probe capable, and the other phyint
727		 *    instance is also not probe capable, and we were in
728		 *    PI_RUNNING, then transition to PI_NOTARGETS.
729		 *
730		 * Also see the state diagram in mpd_probe.c.
731		 */
732		if (PROBE_CAPABLE(pii)) {
733			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
734				phyint_chstate(pii->pii_phyint, PI_RUNNING);
735		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
736			if (pii->pii_phyint->pi_state == PI_RUNNING)
737				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
738		}
739
740		/*
741		 * If no targets are currently known for this phyint
742		 * we need to call init_router_targets. Since
743		 * init_router_targets() initializes the list of targets
744		 * for all phyints it is done below the loop.
745		 */
746		if (pii->pii_targets == NULL)
747			target_scan_reqd = _B_TRUE;
748
749		/*
750		 * Start the probe timer for this instance.
751		 */
752		if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) {
753			start_timer(pii);
754			pii->pii_basetime_inited = 1;
755		}
756	}
757
758	/*
759	 * Scan the interface list for any interfaces that are PI_FAILED or
760	 * PI_NOTARGETS but no longer enabled to send probes, and call
761	 * phyint_check_for_repair() to see if the link state indicates that
762	 * the interface should be repaired.  Also see the state diagram in
763	 * mpd_probe.c.
764	 */
765	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
766		if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) &&
767		    (pi->pi_state == PI_FAILED ||
768		    pi->pi_state == PI_NOTARGETS)) {
769			phyint_check_for_repair(pi);
770		}
771	}
772
773	check_testconfig();
774
775	/*
776	 * Try to populate the target list. init_router_targets populates
777	 * the target list from the routing table. If our target list is
778	 * still empty, init_host_targets adds host targets based on the
779	 * host target list of other phyints in the group.
780	 */
781	if (target_scan_reqd) {
782		init_router_targets();
783		init_host_targets();
784	}
785}
786
787/*
788 * Check test address configuration, and log notices/errors if appropriate.
789 * Note that this function only logs pre-existing conditions (e.g., that
790 * probe-based failure detection is disabled).
791 */
792static void
793check_testconfig(void)
794{
795	struct phyint	*pi;
796	struct logint  	*li;
797	char		abuf[INET6_ADDRSTRLEN];
798	int		pri;
799
800	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
801		if (pi->pi_flags & IFF_OFFLINE)
802			continue;
803
804		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) {
805			if (pi->pi_taddrmsg_printed ||
806			    pi->pi_duptaddrmsg_printed) {
807				if (pi->pi_duptaddrmsg_printed)
808					pri = LOG_ERR;
809				else
810					pri = LOG_INFO;
811				logmsg(pri, "Test address now configured on "
812				    "interface %s; enabling probe-based "
813				    "failure detection on it\n", pi->pi_name);
814				pi->pi_taddrmsg_printed = 0;
815				pi->pi_duptaddrmsg_printed = 0;
816			}
817			continue;
818		}
819
820		li = NULL;
821		if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL &&
822		    pi->pi_v4->pii_probe_logint->li_dupaddr)
823			li = pi->pi_v4->pii_probe_logint;
824
825		if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL &&
826		    pi->pi_v6->pii_probe_logint->li_dupaddr)
827			li = pi->pi_v6->pii_probe_logint;
828
829		if (li != NULL && li->li_dupaddr) {
830			if (pi->pi_duptaddrmsg_printed)
831				continue;
832			logerr("Test address %s is not unique in group; "
833			    "disabling probe-based failure detection on %s\n",
834			    pr_addr(li->li_phyint_inst->pii_af,
835			    li->li_addr, abuf, sizeof (abuf)), pi->pi_name);
836			pi->pi_duptaddrmsg_printed = 1;
837			continue;
838		}
839
840		if (getcurrentsec() < pi->pi_taddrthresh)
841			continue;
842
843		if (!pi->pi_taddrmsg_printed) {
844			logtrace("No test address configured on interface %s; "
845			    "disabling probe-based failure detection on it\n",
846			    pi->pi_name);
847			pi->pi_taddrmsg_printed = 1;
848		}
849	}
850}
851
852/*
853 * Check phyint group configuration, to detect any inconsistencies,
854 * and log an error message. This is called from runtimeouts every
855 * 20 secs. But the error message is displayed once. If the
856 * consistency is resolved by the admin, a recovery message is displayed
857 * once.
858 */
859static void
860check_config(void)
861{
862	struct phyint_group *pg;
863	struct phyint *pi;
864	boolean_t v4_in_group;
865	boolean_t v6_in_group;
866
867	/*
868	 * All phyints of a group must be homogeneous to ensure that they can
869	 * take over for one another.  If any phyint in a group has IPv4
870	 * plumbed, check that all phyints have IPv4 plumbed.  Do a similar
871	 * check for IPv6.
872	 */
873	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
874		if (pg == phyint_anongroup)
875			continue;
876
877		v4_in_group = _B_FALSE;
878		v6_in_group = _B_FALSE;
879		/*
880		 * 1st pass. Determine if at least 1 phyint in the group
881		 * has IPv4 plumbed and if so set v4_in_group to true.
882		 * Repeat similarly for IPv6.
883		 */
884		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
885			if (pi->pi_v4 != NULL)
886				v4_in_group = _B_TRUE;
887			if (pi->pi_v6 != NULL)
888				v6_in_group = _B_TRUE;
889		}
890
891		/*
892		 * 2nd pass. If v4_in_group is true, check that phyint
893		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
894		 * out a message the 1st time only.
895		 */
896		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
897			if (pi->pi_flags & IFF_OFFLINE)
898				continue;
899
900			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
901				if (!pi->pi_cfgmsg_printed) {
902					logerr("IP interface %s in group %s is"
903					    " not plumbed for IPv4, affecting"
904					    " IPv4 connectivity\n",
905					    pi->pi_name,
906					    pi->pi_group->pg_name);
907					pi->pi_cfgmsg_printed = 1;
908				}
909			} else if (v6_in_group == _B_TRUE &&
910			    pi->pi_v6 == NULL) {
911				if (!pi->pi_cfgmsg_printed) {
912					logerr("IP interface %s in group %s is"
913					    " not plumbed for IPv6, affecting"
914					    " IPv6 connectivity\n",
915					    pi->pi_name,
916					    pi->pi_group->pg_name);
917					pi->pi_cfgmsg_printed = 1;
918				}
919			} else {
920				/*
921				 * The phyint matches the group configuration,
922				 * if we have reached this point. If it was
923				 * improperly configured earlier, log an
924				 * error recovery message
925				 */
926				if (pi->pi_cfgmsg_printed) {
927					logerr("IP interface %s is now"
928					    " consistent with group %s "
929					    " and connectivity is restored\n",
930					    pi->pi_name, pi->pi_group->pg_name);
931					pi->pi_cfgmsg_printed = 0;
932				}
933			}
934
935		}
936	}
937}
938
939/*
940 * Timer mechanism using relative time (in milliseconds) from the
941 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
942 * will fire after TIMER_INFINITY milliseconds.
943 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
944 * time values. Hence 2 consecutive timer events cannot be spaced farther
945 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
946 * that can be passed for the delay parameter of timer_schedule()
947 */
/* Time, in ms as given by getcurrenttime(), of the scheduled timeout */
static uint_t timer_next;	/* Currently scheduled timeout */
/* Set by timer_schedule() just before it arms the interval timer */
static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
950
951static void
952timer_init(void)
953{
954	timer_next = getcurrenttime() + TIMER_INFINITY;
955	/*
956	 * The call to run_timeouts() will get the timer started
957	 * Since there are no phyints at this point, the timer will
958	 * be set for IF_SCAN_INTERVAL ms.
959	 */
960	run_timeouts();
961}
962
963/*
964 * Make sure the next SIGALRM occurs delay milliseconds from the current
965 * time if not earlier. We are interested only in time differences.
966 */
967void
968timer_schedule(uint_t delay)
969{
970	uint_t now;
971	struct itimerval itimerval;
972
973	if (debug & D_TIMER)
974		logdebug("timer_schedule(%u)\n", delay);
975
976	assert(delay <= TIMER_INFINITY);
977
978	now = getcurrenttime();
979	if (delay == 0) {
980		/* Minimum allowed delay */
981		delay = 1;
982	}
983	/* Will this timer occur before the currently scheduled SIGALRM? */
984	if (timer_active && TIME_GE(now + delay, timer_next)) {
985		if (debug & D_TIMER) {
986			logdebug("timer_schedule(%u) - no action: "
987			    "now %u next %u\n", delay, now, timer_next);
988		}
989		return;
990	}
991	timer_next = now + delay;
992
993	itimerval.it_value.tv_sec = delay / 1000;
994	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
995	itimerval.it_interval.tv_sec = 0;
996	itimerval.it_interval.tv_usec = 0;
997	if (debug & D_TIMER) {
998		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
999		    delay, itimerval.it_value.tv_sec,
1000		    itimerval.it_value.tv_usec);
1001	}
1002	timer_active = _B_TRUE;
1003	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1004		logperror("timer_schedule: setitimer");
1005		exit(2);
1006	}
1007}
1008
1009static void
1010timer_cancel(void)
1011{
1012	struct itimerval itimerval;
1013
1014	if (debug & D_TIMER)
1015		logdebug("timer_cancel()\n");
1016
1017	bzero(&itimerval, sizeof (itimerval));
1018	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0)
1019		logperror("timer_cancel: setitimer");
1020}
1021
1022/*
1023 * Timer has fired. Determine when the next timer event will occur by asking
1024 * all the timer routines. Should not be called from a timer routine.
1025 */
1026static void
1027run_timeouts(void)
1028{
1029	uint_t next;
1030	uint_t next_event_time;
1031	struct phyint_instance *pii;
1032	struct phyint_instance *next_pii;
1033	static boolean_t timeout_running;
1034
1035	/* assert that recursive timeouts don't happen. */
1036	assert(!timeout_running);
1037
1038	timeout_running = _B_TRUE;
1039
1040	if (debug & D_TIMER)
1041		logdebug("run_timeouts()\n");
1042
1043	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1044		initifs();
1045		check_config();
1046	}
1047
1048	next = TIMER_INFINITY;
1049
1050	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1051		next_pii = pii->pii_next;
1052		next_event_time = phyint_inst_timer(pii);
1053		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1054			next = next_event_time;
1055
1056		if (debug & D_TIMER) {
1057			logdebug("run_timeouts(%s %s): next scheduled for"
1058			    " this phyint inst %u, next scheduled global"
1059			    " %u ms\n",
1060			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1061			    next_event_time, next);
1062		}
1063	}
1064
1065	/*
1066	 * Make sure initifs() is called at least once every
1067	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1068	 * with the kernel, in case we have missed any routing
1069	 * socket messages.
1070	 */
1071	if (next > IF_SCAN_INTERVAL)
1072		next = IF_SCAN_INTERVAL;
1073
1074	if (debug & D_TIMER)
1075		logdebug("run_timeouts: %u ms\n", next);
1076
1077	timer_schedule(next);
1078	timeout_running = _B_FALSE;
1079}
1080
1081static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
1082static int eventpipe_write = -1;
1083boolean_t cleanup_started = _B_FALSE;	/* true if we're going away */
1084
1085/*
1086 * Ensure that signals are processed synchronously with the rest of
1087 * the code by just writing a one character signal number on the pipe.
1088 * The poll loop will pick this up and process the signal event.
1089 */
1090static void
1091sig_handler(int signo)
1092{
1093	uchar_t buf = (uchar_t)signo;
1094
1095	/*
1096	 * Don't write to pipe if cleanup has already begun. cleanup()
1097	 * might have closed the pipe already
1098	 */
1099	if (cleanup_started)
1100		return;
1101
1102	if (eventpipe_write == -1) {
1103		logerr("sig_handler: no pipe found\n");
1104		return;
1105	}
1106	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1107		logperror("sig_handler: write");
1108}
1109
1110extern struct probes_missed probes_missed;
1111
1112/*
1113 * Pick up a signal "byte" from the pipe and process it.
1114 */
1115static void
1116in_signal(int fd)
1117{
1118	uchar_t buf;
1119	uint64_t  sent, acked, lost, unacked, unknown;
1120	struct phyint_instance *pii;
1121	int pr_ndx;
1122
1123	switch (read(fd, &buf, sizeof (buf))) {
1124	case -1:
1125		logperror("in_signal: read");
1126		exit(1);
1127		/* NOTREACHED */
1128	case 1:
1129		break;
1130	case 0:
1131		logerr("in_signal: read end of file\n");
1132		exit(1);
1133		/* NOTREACHED */
1134	default:
1135		logerr("in_signal: read > 1\n");
1136		exit(1);
1137	}
1138
1139	if (debug & D_TIMER)
1140		logdebug("in_signal() got %d\n", buf);
1141
1142	switch (buf) {
1143	case SIGALRM:
1144		if (debug & D_TIMER) {
1145			uint_t now = getcurrenttime();
1146
1147			logdebug("in_signal(SIGALRM) delta %u\n",
1148			    now - timer_next);
1149		}
1150		timer_active = _B_FALSE;
1151		run_timeouts();
1152		break;
1153	case SIGUSR1:
1154		logdebug("Printing configuration:\n");
1155		/* Print out the internal tables */
1156		phyint_inst_print_all();
1157
1158		/*
1159		 * Print out the accumulated statistics about missed
1160		 * probes (happens due to scheduling delay).
1161		 */
1162		logerr("Missed sending total of %d probes spread over"
1163		    " %d occurrences\n", probes_missed.pm_nprobes,
1164		    probes_missed.pm_ntimes);
1165
1166		/*
1167		 * Print out the accumulated statistics about probes
1168		 * that were sent.
1169		 */
1170		for (pii = phyint_instances; pii != NULL;
1171		    pii = pii->pii_next) {
1172			unacked = 0;
1173			acked = pii->pii_cum_stats.acked;
1174			lost = pii->pii_cum_stats.lost;
1175			sent = pii->pii_cum_stats.sent;
1176			unknown = pii->pii_cum_stats.unknown;
1177			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1178				switch (pii->pii_probes[pr_ndx].pr_status) {
1179				case PR_ACKED:
1180					acked++;
1181					break;
1182				case PR_LOST:
1183					lost++;
1184					break;
1185				case PR_UNACKED:
1186					unacked++;
1187					break;
1188				}
1189			}
1190			logerr("\nProbe stats on (%s %s)\n"
1191			    "Number of probes sent %lld\n"
1192			    "Number of probe acks received %lld\n"
1193			    "Number of probes/acks lost %lld\n"
1194			    "Number of valid unacknowledged probes %lld\n"
1195			    "Number of ambiguous probe acks received %lld\n",
1196			    AF_STR(pii->pii_af), pii->pii_name,
1197			    sent, acked, lost, unacked, unknown);
1198		}
1199		break;
1200	case SIGHUP:
1201		logerr("SIGHUP: restart and reread config file\n");
1202		/*
1203		 * Cancel the interval timer.  Needed since setitimer() uses
1204		 * alarm() and the time left is inherited across exec(), and
1205		 * thus the SIGALRM may be delivered before a handler has been
1206		 * setup, causing in.mpathd to erroneously exit.
1207		 */
1208		timer_cancel();
1209		cleanup();
1210		(void) execv(argv0[0], argv0);
1211		_exit(0177);
1212		/* NOTREACHED */
1213	case SIGINT:
1214	case SIGTERM:
1215	case SIGQUIT:
1216		cleanup();
1217		exit(0);
1218		/* NOTREACHED */
1219	default:
1220		logerr("in_signal: unknown signal: %d\n", buf);
1221	}
1222}
1223
1224static void
1225cleanup(void)
1226{
1227	struct phyint_instance *pii;
1228	struct phyint_instance *next_pii;
1229
1230	/*
1231	 * Make sure that we don't write to eventpipe in
1232	 * sig_handler() if any signal notably SIGALRM,
1233	 * occurs after we close the eventpipe descriptor below
1234	 */
1235	cleanup_started = _B_TRUE;
1236
1237	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1238		next_pii = pii->pii_next;
1239		phyint_inst_delete(pii);
1240	}
1241
1242	(void) close(ifsock_v4);
1243	(void) close(ifsock_v6);
1244	(void) close(rtsock_v4);
1245	(void) close(rtsock_v6);
1246	(void) close(lsock_v4);
1247	(void) close(lsock_v6);
1248	(void) close(0);
1249	(void) close(1);
1250	(void) close(2);
1251	(void) close(mibfd);
1252	(void) close(eventpipe_read);
1253	(void) close(eventpipe_write);
1254}
1255
1256/*
1257 * Create pipe for signal delivery and set up signal handlers.
1258 */
1259static void
1260setup_eventpipe(void)
1261{
1262	int fds[2];
1263	struct sigaction act;
1264
1265	if ((pipe(fds)) < 0) {
1266		logperror("setup_eventpipe: pipe");
1267		exit(1);
1268	}
1269	eventpipe_read = fds[0];
1270	eventpipe_write = fds[1];
1271	if (poll_add(eventpipe_read) == -1) {
1272		exit(1);
1273	}
1274
1275	act.sa_handler = sig_handler;
1276	act.sa_flags = SA_RESTART;
1277	(void) sigaction(SIGALRM, &act, NULL);
1278
1279	(void) sigset(SIGHUP, sig_handler);
1280	(void) sigset(SIGUSR1, sig_handler);
1281	(void) sigset(SIGTERM, sig_handler);
1282	(void) sigset(SIGINT, sig_handler);
1283	(void) sigset(SIGQUIT, sig_handler);
1284}
1285
1286/*
1287 * Create a routing socket for receiving RTM_IFINFO messages.
1288 */
1289static int
1290setup_rtsock(int af)
1291{
1292	int	s;
1293	int	flags;
1294	int	aware = RTAW_UNDER_IPMP;
1295
1296	s = socket(PF_ROUTE, SOCK_RAW, af);
1297	if (s == -1) {
1298		logperror("setup_rtsock: socket PF_ROUTE");
1299		exit(1);
1300	}
1301
1302	if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) {
1303		logperror("setup_rtsock: setsockopt RT_AWARE");
1304		(void) close(s);
1305		exit(1);
1306	}
1307
1308	if ((flags = fcntl(s, F_GETFL, 0)) < 0) {
1309		logperror("setup_rtsock: fcntl F_GETFL");
1310		(void) close(s);
1311		exit(1);
1312	}
1313	if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) {
1314		logperror("setup_rtsock: fcntl F_SETFL");
1315		(void) close(s);
1316		exit(1);
1317	}
1318	if (poll_add(s) == -1) {
1319		(void) close(s);
1320		exit(1);
1321	}
1322	return (s);
1323}
1324
/*
 * Process an RTM_IFINFO message received on a routing socket.
 *
 * `ifm' is the message and `type' identifies the routing socket it was read
 * from: AF_INET selects the IPv4 instance of the phyint, anything else the
 * IPv6 instance.
 *
 * The return value indicates whether a full interface scan is required.
 * Link up/down notifications are reflected in the IFF_RUNNING flag.
 * If just the state of the IFF_RUNNING interface flag has changed, a
 * full interface scan isn't required and _B_FALSE is returned.  In every
 * other case, including when the phyint or the instance is not known,
 * _B_TRUE is returned.
 */
static boolean_t
process_rtm_ifinfo(if_msghdr_t *ifm, int type)
{
	struct sockaddr_dl *sdl;
	struct phyint *pi;
	uint64_t old_flags;
	struct phyint_instance *pii;

	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);

	/*
	 * The sockaddr_dl structure directly follows the if_msghdr_t
	 * structure.  However, at the time of writing, the size of the
	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
	 * to the presence of a timeval structure, which contains longs,
	 * in the if_data structure.  Since we know where the message ends,
	 * we work backwards to get the start of the sockaddr_dl structure.
	 */
	/*LINTED*/
	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
	    sizeof (struct sockaddr_dl));

	assert(sdl->sdl_family == AF_LINK);

	/*
	 * The interface name is in sdl_data.
	 * RTM_IFINFO messages are only generated for logical interface
	 * zero, so there is no colon and logical interface number to
	 * strip from the name.  The name is not null terminated, but
	 * there should be enough space in sdl_data to add the null.
	 * If there is not, give up on this message and ask for a full scan.
	 */
	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: phyint name too long\n");
		return (_B_TRUE);
	}
	/* Terminate the name in place, inside the message buffer. */
	sdl->sdl_data[sdl->sdl_nlen] = 0;

	pi = phyint_lookup(sdl->sdl_data);
	if (pi == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: phyint lookup failed"
			    " for %s\n", sdl->sdl_data);
		return (_B_TRUE);
	}

	/*
	 * We want to try and avoid doing a full interface scan for
	 * link state notifications from the datalink layer, as indicated
	 * by the state of the IFF_RUNNING flag.  If just the
	 * IFF_RUNNING flag has changed state, the link state changes
	 * are processed without a full scan.
	 * If there is both an IPv4 and IPv6 instance associated with
	 * the physical interface, we will get an RTM_IFINFO message
	 * for each instance.  If we just maintained a single copy of
	 * the physical interface flags, it would appear that no flags
	 * had changed when the second message is processed, leading us
	 * to believe that the message wasn't generated by a flags change,
	 * and that a full interface scan is required.
	 * To get around this problem, two additional copies of the flags
	 * are kept, one copy for each instance.  These are only used in
	 * this routine.  At any one time, all three copies of the flags
	 * should be identical except for the IFF_RUNNING flag.  The
	 * copy of the flags in the "phyint" structure is always up to
	 * date.
	 */
	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
	if (pii == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: no instance of address "
			    "family %s for %s\n", AF_STR(type), pi->pi_name);
		return (_B_TRUE);
	}

	/*
	 * Remember this instance's previous flags, then update both the
	 * instance copy and the phyint copy from the message.
	 */
	old_flags = pii->pii_flags;
	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
	pi->pi_flags = pii->pii_flags;

	if (debug & D_LINKNOTE) {
		logdebug("process_rtm_ifinfo: %s address family: %s, "
		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
		    AF_STR(type), old_flags, pi->pi_flags);
	}

	/*
	 * If IFF_STANDBY has changed, indicate that the interface has changed
	 * types and refresh IFF_INACTIVE if need be.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) {
		phyint_changed(pi);
		if (pii->pii_flags & IFF_STANDBY)
			phyint_standby_refresh_inactive(pi);
	}

	/*
	 * Has just the IFF_RUNNING flag changed state ?
	 * NOTE(review): pii_flags is deliberately re-read here rather than
	 * reusing an earlier difference; presumably the calls above may
	 * update the flag copies -- confirm before restructuring.
	 */
	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
		struct phyint_instance *pii_other;
		/*
		 * It wasn't just a link state change.  Update
		 * the other instance's copy of the flags.
		 */
		pii_other = phyint_inst_other(pii);
		if (pii_other != NULL)
			pii_other->pii_flags = pii->pii_flags;
		return (_B_TRUE);
	}

	return (_B_FALSE);
}
1441
1442/*
1443 * Retrieve as many routing socket messages as possible, and try to
1444 * empty the routing sockets. Initiate full scan of targets or interfaces
1445 * as needed.
1446 * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1447 * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1448 */
1449static void
1450process_rtsock(int rtsock_v4, int rtsock_v6)
1451{
1452	int	nbytes;
1453	int64_t msg[2048 / 8];
1454	struct rt_msghdr *rtm;
1455	boolean_t need_if_scan = _B_FALSE;
1456	boolean_t need_rt_scan = _B_FALSE;
1457	boolean_t rtm_ifinfo_seen = _B_FALSE;
1458	int type;
1459
1460	/* Read as many messages as possible and try to empty the sockets */
1461	for (type = AF_INET; ; type = AF_INET6) {
1462		for (;;) {
1463			nbytes = read((type == AF_INET) ? rtsock_v4 :
1464			    rtsock_v6, msg, sizeof (msg));
1465			if (nbytes <= 0) {
1466				/* No more messages */
1467				break;
1468			}
1469			rtm = (struct rt_msghdr *)msg;
1470			if (rtm->rtm_version != RTM_VERSION) {
1471				logerr("process_rtsock: version %d "
1472				    "not understood\n", rtm->rtm_version);
1473				break;
1474			}
1475
1476			if (debug & D_PHYINT) {
1477				logdebug("process_rtsock: message %d\n",
1478				    rtm->rtm_type);
1479			}
1480
1481			switch (rtm->rtm_type) {
1482			case RTM_NEWADDR:
1483			case RTM_DELADDR:
1484				/*
1485				 * Some logical interface has changed,
1486				 * have to scan everything to determine
1487				 * what actually changed.
1488				 */
1489				need_if_scan = _B_TRUE;
1490				break;
1491
1492			case RTM_IFINFO:
1493				rtm_ifinfo_seen = _B_TRUE;
1494				need_if_scan |= process_rtm_ifinfo(
1495				    (if_msghdr_t *)rtm, type);
1496				break;
1497
1498			case RTM_ADD:
1499			case RTM_DELETE:
1500			case RTM_CHANGE:
1501			case RTM_OLDADD:
1502			case RTM_OLDDEL:
1503				need_rt_scan = _B_TRUE;
1504				break;
1505
1506			default:
1507				/* Not interesting */
1508				break;
1509			}
1510		}
1511		if (type == AF_INET6)
1512			break;
1513	}
1514
1515	if (need_if_scan) {
1516		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1517			logdebug("process_rtsock: synchronizing with kernel\n");
1518		initifs();
1519	} else if (rtm_ifinfo_seen) {
1520		if (debug & D_LINKNOTE)
1521			logdebug("process_rtsock: "
1522			    "link up/down notification(s) seen\n");
1523		process_link_state_changes();
1524	}
1525
1526	if (need_rt_scan)
1527		init_router_targets();
1528}
1529
1530/*
1531 * Look if the phyint instance or one of its logints have been removed from
1532 * the kernel and take appropriate action.
1533 * Uses {pii,li}_in_use.
1534 */
1535static void
1536check_if_removed(struct phyint_instance *pii)
1537{
1538	struct logint *li;
1539	struct logint *next_li;
1540
1541	/* Detect phyints that have been removed from the kernel. */
1542	if (!pii->pii_in_use) {
1543		logtrace("%s %s has been removed from kernel\n",
1544		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1545		phyint_inst_delete(pii);
1546	} else {
1547		/* Detect logints that have been removed. */
1548		for (li = pii->pii_logint; li != NULL; li = next_li) {
1549			next_li = li->li_next;
1550			if (!li->li_in_use) {
1551				logint_delete(li);
1552			}
1553		}
1554	}
1555}
1556
1557/*
1558 * Parse the supplied mib2 information to extract the routing information
1559 * table. Process the routing table to get the list of known onlink routers
1560 * and update our database. These onlink routers will serve as probe
1561 * targets.
1562 */
1563static void
1564update_router_list(mib_item_t *item)
1565{
1566	for (; item != NULL; item = item->mi_next) {
1567		if (item->mi_opthdr.name == 0)
1568			continue;
1569		if (item->mi_opthdr.level == MIB2_IP &&
1570		    item->mi_opthdr.name == MIB2_IP_ROUTE) {
1571			ire_process_v4((mib2_ipRouteEntry_t *)item->mi_valp,
1572			    item->mi_opthdr.len);
1573		} else if (item->mi_opthdr.level == MIB2_IP6 &&
1574		    item->mi_opthdr.name == MIB2_IP6_ROUTE) {
1575			ire_process_v6((mib2_ipv6RouteEntry_t *)item->mi_valp,
1576			    item->mi_opthdr.len);
1577		}
1578	}
1579}
1580
1581
1582/*
1583 * Convert octet `octp' to a phyint name and store in `ifname'
1584 */
1585static void
1586oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize)
1587{
1588	char *cp;
1589	size_t len = MIN(octp->o_length, ifsize - 1);
1590
1591	(void) strncpy(ifname, octp->o_bytes, len);
1592	ifname[len] = '\0';
1593
1594	if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL)
1595		*cp = '\0';
1596}
1597
1598/*
1599 * Examine the IPv4 routing table `buf' for possible targets.  For each
1600 * possible target, if it's on the same subnet an interface route, pass
1601 * it to router_add_common() for further consideration.
1602 */
1603static void
1604ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1605{
1606	char ifname[LIFNAMSIZ];
1607	mib2_ipRouteEntry_t	*rp, *rp1, *endp;
1608	struct in_addr		nexthop_v4;
1609	struct in6_addr		nexthop;
1610
1611	if (debug & D_TARGET)
1612		logdebug("ire_process_v4(len %d)\n", len);
1613
1614	if (len == 0)
1615		return;
1616
1617	assert((len % ipRouteEntrySize) == 0);
1618	endp = buf + (len / ipRouteEntrySize);
1619
1620	/*
1621	 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1622	 * cross-reference them with the interface routes to determine if
1623	 * they're possible probe targets.
1624	 */
1625	for (rp = buf; rp < endp; rp++) {
1626		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1627			continue;
1628
1629		/* Get the nexthop address. */
1630		nexthop_v4.s_addr = rp->ipRouteNextHop;
1631
1632		/*
1633		 * Rescan the routing table looking for interface routes that
1634		 * are on the same subnet, and try to add them.  If they're
1635		 * not relevant (e.g., the interface route isn't part of an
1636		 * IPMP group, router_add_common() will discard).
1637		 */
1638		for (rp1 = buf; rp1 < endp; rp1++) {
1639			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) ||
1640			    rp1->ipRouteIfIndex.o_length == 0)
1641				continue;
1642
1643			if ((rp1->ipRouteDest & rp1->ipRouteMask) !=
1644			    (nexthop_v4.s_addr & rp1->ipRouteMask))
1645				continue;
1646
1647			oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ);
1648			IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1649			router_add_common(AF_INET, ifname, nexthop);
1650		}
1651	}
1652}
1653
1654void
1655router_add_common(int af, char *ifname, struct in6_addr nexthop)
1656{
1657	struct phyint_instance *pii;
1658	struct phyint *pi;
1659
1660	if (debug & D_TARGET)
1661		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1662
1663	/*
1664	 * Retrieve the phyint instance; bail if it's not known to us yet.
1665	 */
1666	pii = phyint_inst_lookup(af, ifname);
1667	if (pii == NULL)
1668		return;
1669
1670	/*
1671	 * Don't use our own addresses as targets.
1672	 */
1673	if (own_address(nexthop))
1674		return;
1675
1676	/*
1677	 * If the phyint is part a named group, then add the address to all
1678	 * members of the group; note that this is suboptimal in the IPv4 case
1679	 * as it has already been added to all matching interfaces in
1680	 * ire_process_v4(). Otherwise, add the address only to the phyint
1681	 * itself, since other phyints in the anongroup may not be on the same
1682	 * subnet.
1683	 */
1684	pi = pii->pii_phyint;
1685	if (pi->pi_group == phyint_anongroup) {
1686		target_add(pii, nexthop, _B_TRUE);
1687	} else {
1688		pi = pi->pi_group->pg_phyint;
1689		for (; pi != NULL; pi = pi->pi_pgnext)
1690			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1691	}
1692}
1693
1694/*
1695 * Examine the IPv6 routing table `buf' for possible link-local targets, and
1696 * pass any contenders to router_add_common() for further consideration.
1697 */
1698static void
1699ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1700{
1701	struct lifreq lifr;
1702	char ifname[LIFNAMSIZ];
1703	char grname[LIFGRNAMSIZ];
1704	mib2_ipv6RouteEntry_t *rp, *rp1, *endp;
1705	struct in6_addr nexthop_v6;
1706
1707	if (debug & D_TARGET)
1708		logdebug("ire_process_v6(len %d)\n", len);
1709
1710	if (len == 0)
1711		return;
1712
1713	assert((len % ipv6RouteEntrySize) == 0);
1714	endp = buf + (len / ipv6RouteEntrySize);
1715
1716	/*
1717	 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1718	 * cross-reference them with the interface routes to determine if
1719	 * they're possible probe targets.
1720	 */
1721	for (rp = buf; rp < endp; rp++) {
1722		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) ||
1723		    !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop))
1724			continue;
1725
1726		/* Get the nexthop address. */
1727		nexthop_v6 = rp->ipv6RouteNextHop;
1728
1729		/*
1730		 * The interface name should always exist for link-locals;
1731		 * we use it to map this entry to an IPMP group name.
1732		 */
1733		if (rp->ipv6RouteIfIndex.o_length == 0)
1734			continue;
1735
1736		oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ);
1737		if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 ||
1738		    strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) {
1739			continue;
1740		}
1741
1742		/*
1743		 * Rescan the list of routes for interface routes, and add the
1744		 * above target to any interfaces in the same IPMP group.
1745		 */
1746		for (rp1 = buf; rp1 < endp; rp1++) {
1747			if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) ||
1748			    rp1->ipv6RouteIfIndex.o_length == 0) {
1749				continue;
1750			}
1751			oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ);
1752			(void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
1753
1754			if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 &&
1755			    strcmp(lifr.lifr_groupname, grname) == 0) {
1756				router_add_common(AF_INET6, ifname, nexthop_v6);
1757			}
1758		}
1759	}
1760}
1761
1762/*
1763 * Build a list of target routers, by scanning the routing tables.
1764 * It is assumed that interface routes exist, to reach the routers.
1765 */
1766static void
1767init_router_targets(void)
1768{
1769	struct	target *tg;
1770	struct	target *next_tg;
1771	struct	phyint_instance *pii;
1772	struct	phyint *pi;
1773
1774	if (force_mcast)
1775		return;
1776
1777	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1778		pi = pii->pii_phyint;
1779		/*
1780		 * Set tg_in_use to false only for router targets.
1781		 */
1782		if (!pii->pii_targets_are_routers)
1783			continue;
1784
1785		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1786			tg->tg_in_use = 0;
1787	}
1788
1789	if (mibwalk(update_router_list) == -1)
1790		exit(1);
1791
1792	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1793		pi = pii->pii_phyint;
1794		if (!pii->pii_targets_are_routers)
1795			continue;
1796
1797		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
1798			next_tg = tg->tg_next;
1799			/*
1800			 * If the group has failed, it's likely the route was
1801			 * removed by an application affected by that failure.
1802			 * In that case, we keep the target so that we can
1803			 * reliably repair, at which point we'll refresh the
1804			 * target list again.
1805			 */
1806			if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group))
1807				target_delete(tg);
1808		}
1809	}
1810}
1811
1812/*
1813 * Attempt to assign host targets to any interfaces that do not currently
1814 * have probe targets by sharing targets with other interfaces in the group.
1815 */
1816static void
1817init_host_targets(void)
1818{
1819	struct phyint_instance *pii;
1820	struct phyint_group *pg;
1821
1822	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1823		pg = pii->pii_phyint->pi_group;
1824		if (pg != phyint_anongroup && pii->pii_targets == NULL)
1825			dup_host_targets(pii);
1826	}
1827}
1828
1829/*
1830 * Duplicate host targets from other phyints of the group to
1831 * the phyint instance 'desired_pii'.
1832 */
1833static void
1834dup_host_targets(struct phyint_instance	 *desired_pii)
1835{
1836	int af;
1837	struct phyint *pi;
1838	struct phyint_instance *pii;
1839	struct target *tg;
1840
1841	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
1842
1843	af = desired_pii->pii_af;
1844
1845	/*
1846	 * For every phyint in the same group as desired_pii, check if
1847	 * it has any host targets. If so add them to desired_pii.
1848	 */
1849	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
1850		pii = PHYINT_INSTANCE(pi, af);
1851		/*
1852		 * We know that we don't have targets on this phyint instance
1853		 * since we have been called. But we still check for
1854		 * pii_targets_are_routers because another phyint instance
1855		 * could have router targets, since IFF_NOFAILOVER addresses
1856		 * on different phyint instances may belong to different
1857		 * subnets.
1858		 */
1859		if ((pii == NULL) || (pii == desired_pii) ||
1860		    pii->pii_targets_are_routers)
1861			continue;
1862		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1863			target_create(desired_pii, tg->tg_address, _B_FALSE);
1864		}
1865	}
1866}
1867
/*
 * Print a one-line usage message to stderr, naming the program as `cmd'.
 */
static void
usage(char *cmd)
{
	(void) fprintf(stderr, "usage: %s\n", cmd);
}
1873
1874
1875#define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
1876
1877/* Get an option from the /etc/default/mpathd file */
1878static char *
1879getdefault(char *name)
1880{
1881	char namebuf[BUFSIZ];
1882	char *value = NULL;
1883
1884	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
1885		char	*cp;
1886		int	flags;
1887
1888		/*
1889		 * ignore case
1890		 */
1891		flags = defcntl(DC_GETFLAGS, 0);
1892		TURNOFF(flags, DC_CASE);
1893		(void) defcntl(DC_SETFLAGS, flags);
1894
1895		/* Add "=" to the name */
1896		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
1897		(void) strncat(namebuf, "=", 2);
1898
1899		if ((cp = defread(namebuf)) != NULL)
1900			value = strdup(cp);
1901
1902		/* close */
1903		(void) defopen((char *)NULL);
1904	}
1905	return (value);
1906}
1907
1908
1909/*
1910 * Command line options below
1911 */
1912boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
1913boolean_t	track_all_phyints = _B_FALSE;	/* track all IP interfaces */
1914static boolean_t adopt = _B_FALSE;
1915static boolean_t foreground = _B_FALSE;
1916
1917int
1918main(int argc, char *argv[])
1919{
1920	int i;
1921	int c;
1922	struct phyint *pi;
1923	struct phyint_instance *pii;
1924	char *value;
1925
1926	argv0 = argv;		/* Saved for re-exec on SIGHUP */
1927	srandom(gethostid());	/* Initialize the random number generator */
1928
1929	/*
1930	 * NOTE: The messages output by in.mpathd are not suitable for
1931	 * translation, so we do not call textdomain().
1932	 */
1933	(void) setlocale(LC_ALL, "");
1934
1935	/*
1936	 * Get the user specified value of 'failure detection time'
1937	 * from /etc/default/mpathd
1938	 */
1939	value = getdefault("FAILURE_DETECTION_TIME");
1940	if (value != NULL) {
1941		user_failure_detection_time =
1942		    (int)strtol((char *)value, NULL, 0);
1943
1944		if (user_failure_detection_time <= 0) {
1945			user_failure_detection_time = FAILURE_DETECTION_TIME;
1946			logerr("Invalid failure detection time %s, assuming "
1947			    "default of %d ms\n", value,
1948			    user_failure_detection_time);
1949
1950		} else if (user_failure_detection_time <
1951		    MIN_FAILURE_DETECTION_TIME) {
1952			user_failure_detection_time =
1953			    MIN_FAILURE_DETECTION_TIME;
1954			logerr("Too small failure detection time of %s, "
1955			    "assuming minimum of %d ms\n", value,
1956			    user_failure_detection_time);
1957		}
1958		free(value);
1959	} else {
1960		/* User has not specified the parameter, Use default value */
1961		user_failure_detection_time = FAILURE_DETECTION_TIME;
1962	}
1963
1964	/*
1965	 * This gives the frequency at which probes will be sent.
1966	 * When fdt ms elapses, we should be able to determine
1967	 * whether 5 consecutive probes have failed or not.
1968	 * 1 probe will be sent in every user_probe_interval ms,
1969	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
1970	 * user_probe_interval. Thus when we send out probe 'n' we
1971	 * can be sure that probe 'n - 2' is lost, if we have not
1972	 * got the ack. (since the probe interval is > crtt). But
1973	 * probe 'n - 1' may be a valid unacked probe, since the
1974	 * time between 2 successive probes could be as small as
1975	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
1976	 */
1977	user_probe_interval = user_failure_detection_time /
1978	    (NUM_PROBE_FAILS + 2);
1979
1980	/*
1981	 * Get the user specified value of failback_enabled from
1982	 * /etc/default/mpathd
1983	 */
1984	value = getdefault("FAILBACK");
1985	if (value != NULL) {
1986		if (strcasecmp(value, "yes") == 0)
1987			failback_enabled = _B_TRUE;
1988		else if (strcasecmp(value, "no") == 0)
1989			failback_enabled = _B_FALSE;
1990		else
1991			logerr("Invalid value for FAILBACK %s\n", value);
1992		free(value);
1993	} else {
1994		failback_enabled = _B_TRUE;
1995	}
1996
1997	/*
1998	 * Get the user specified value of track_all_phyints from
1999	 * /etc/default/mpathd. The sense is reversed in
2000	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2001	 */
2002	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2003	if (value != NULL) {
2004		if (strcasecmp(value, "yes") == 0)
2005			track_all_phyints = _B_FALSE;
2006		else if (strcasecmp(value, "no") == 0)
2007			track_all_phyints = _B_TRUE;
2008		else
2009			logerr("Invalid value for "
2010			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2011		free(value);
2012	} else {
2013		track_all_phyints = _B_FALSE;
2014	}
2015
2016	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2017		switch (c) {
2018		case 'a':
2019			adopt = _B_TRUE;
2020			break;
2021		case 'm':
2022			force_mcast = _B_TRUE;
2023			break;
2024		case 'd':
2025			debug = D_ALL;
2026			foreground = _B_TRUE;
2027			break;
2028		case 'D':
2029			i = (int)strtol(optarg, NULL, 0);
2030			if (i == 0) {
2031				(void) fprintf(stderr, "Bad debug flags: %s\n",
2032				    optarg);
2033				exit(1);
2034			}
2035			debug |= i;
2036			foreground = _B_TRUE;
2037			break;
2038		case 'l':
2039			/*
2040			 * Turn off link state notification handling.
2041			 * Undocumented command line flag, for debugging
2042			 * purposes.
2043			 */
2044			handle_link_notifications = _B_FALSE;
2045			break;
2046		default:
2047			usage(argv[0]);
2048			exit(1);
2049		}
2050	}
2051
2052	/*
2053	 * The sockets for the loopback command interface should be listening
2054	 * before we fork and exit in daemonize(). This way, whoever started us
2055	 * can use the loopback interface as soon as they get a zero exit
2056	 * status.
2057	 */
2058	lsock_v4 = setup_listener(AF_INET);
2059	lsock_v6 = setup_listener(AF_INET6);
2060
2061	if (lsock_v4 < 0 && lsock_v6 < 0) {
2062		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2063		exit(1);
2064	}
2065
2066	if (!foreground) {
2067		if (!daemonize()) {
2068			logerr("cannot daemonize\n");
2069			exit(EXIT_FAILURE);
2070		}
2071		initlog();
2072	}
2073
2074	/*
2075	 * Initializations:
2076	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2077	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2078	 * 2. Initialize a pipe for handling/recording signal events.
2079	 * 3. Create the routing sockets,  used for listening
2080	 *    to routing / interface changes.
2081	 * 4. phyint_init() - Initialize physical interface state
2082	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2083	 *    which timer_init() does indirectly.
2084	 * 5. Query kernel for route entry sizes (v4 and v6).
2085	 * 6. timer_init()  - Initialize timer related stuff
2086	 * 7. initifs() - Initialize our database of all known interfaces
2087	 * 8. init_router_targets() - Initialize our database of all known
2088	 *    router targets.
2089	 */
2090	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2091	if (ifsock_v4 < 0) {
2092		logperror("main: IPv4 socket open");
2093		exit(1);
2094	}
2095
2096	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2097	if (ifsock_v6 < 0) {
2098		logperror("main: IPv6 socket open");
2099		exit(1);
2100	}
2101
2102	setup_eventpipe();
2103
2104	rtsock_v4 = setup_rtsock(AF_INET);
2105	rtsock_v6 = setup_rtsock(AF_INET6);
2106
2107	if (phyint_init() == -1) {
2108		logerr("cannot initialize physical interface structures");
2109		exit(1);
2110	}
2111
2112	if (mibwalk(mib_get_constants) == -1)
2113		exit(1);
2114
2115	timer_init();
2116
2117	initifs();
2118
2119	/*
2120	 * If we're operating in "adopt" mode and no interfaces need to be
2121	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2122	 * interfaces are subsequently put into multipathing groups).
2123	 */
2124	if (adopt && phyint_instances == NULL)
2125		exit(0);
2126
2127	/*
2128	 * Main body. Keep listening for activity on any of the sockets
2129	 * that we are monitoring and take appropriate action as necessary.
2130	 * signals are also handled synchronously.
2131	 */
2132	for (;;) {
2133		if (poll(pollfds, pollfd_num, -1) < 0) {
2134			if (errno == EINTR)
2135				continue;
2136			logperror("main: poll");
2137			exit(1);
2138		}
2139		for (i = 0; i < pollfd_num; i++) {
2140			if ((pollfds[i].fd == -1) ||
2141			    !(pollfds[i].revents & POLLIN))
2142				continue;
2143			if (pollfds[i].fd == eventpipe_read) {
2144				in_signal(eventpipe_read);
2145				break;
2146			}
2147			if (pollfds[i].fd == rtsock_v4 ||
2148			    pollfds[i].fd == rtsock_v6) {
2149				process_rtsock(rtsock_v4, rtsock_v6);
2150				break;
2151			}
2152
2153			for (pii = phyint_instances; pii != NULL;
2154			    pii = pii->pii_next) {
2155				if (pollfds[i].fd == pii->pii_probe_sock) {
2156					if (pii->pii_af == AF_INET)
2157						in_data(pii);
2158					else
2159						in6_data(pii);
2160					break;
2161				}
2162			}
2163
2164			for (pi = phyints; pi != NULL; pi = pi->pi_next) {
2165				if (pi->pi_notes != 0 &&
2166				    pollfds[i].fd == dlpi_fd(pi->pi_dh)) {
2167					(void) dlpi_recv(pi->pi_dh, NULL, NULL,
2168					    NULL, NULL, 0, NULL);
2169					break;
2170				}
2171			}
2172
2173			if (pollfds[i].fd == lsock_v4)
2174				loopback_cmd(lsock_v4, AF_INET);
2175			else if (pollfds[i].fd == lsock_v6)
2176				loopback_cmd(lsock_v6, AF_INET6);
2177		}
2178	}
2179	/* NOTREACHED */
2180	return (EXIT_SUCCESS);
2181}
2182
2183static int
2184setup_listener(int af)
2185{
2186	int sock;
2187	int on;
2188	int len;
2189	int ret;
2190	struct sockaddr_storage laddr;
2191	struct sockaddr_in  *sin;
2192	struct sockaddr_in6 *sin6;
2193	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2194
2195	assert(af == AF_INET || af == AF_INET6);
2196
2197	sock = socket(af, SOCK_STREAM, 0);
2198	if (sock < 0) {
2199		logperror("setup_listener: socket");
2200		exit(1);
2201	}
2202
2203	on = 1;
2204	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2205	    sizeof (on)) < 0) {
2206		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2207		exit(1);
2208	}
2209
2210	bzero(&laddr, sizeof (laddr));
2211	laddr.ss_family = af;
2212
2213	if (af == AF_INET) {
2214		sin = (struct sockaddr_in *)&laddr;
2215		sin->sin_port = htons(MPATHD_PORT);
2216		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2217		len = sizeof (struct sockaddr_in);
2218	} else {
2219		sin6 = (struct sockaddr_in6 *)&laddr;
2220		sin6->sin6_port = htons(MPATHD_PORT);
2221		sin6->sin6_addr = loopback_addr;
2222		len = sizeof (struct sockaddr_in6);
2223	}
2224
2225	ret = bind(sock, (struct sockaddr *)&laddr, len);
2226	if (ret < 0) {
2227		if (errno == EADDRINUSE) {
2228			/*
2229			 * Another instance of mpathd may be already active.
2230			 */
2231			logerr("main: is another instance of in.mpathd "
2232			    "already active?\n");
2233			exit(1);
2234		} else {
2235			(void) close(sock);
2236			return (-1);
2237		}
2238	}
2239	if (listen(sock, 30) < 0) {
2240		logperror("main: listen");
2241		exit(1);
2242	}
2243	if (poll_add(sock) == -1) {
2244		(void) close(sock);
2245		exit(1);
2246	}
2247
2248	return (sock);
2249}
2250
/*
 * Table of commands and their expected size; used by loopback_cmd().
 *
 * loopback_cmd() indexes this table directly with the received command id
 * (after checking it against MI_NCMD), so the entries must stay in
 * command-id order.  NOTE(review): this presumes MI_PING is command id 0 and
 * the ids are contiguous -- confirm against the MI_* definitions.
 */
static struct {
	const char	*name;	/* command name, used in log messages */
	unsigned int	size;	/* minimum number of bytes that must be read */
} commands[] = {
	{ "MI_PING",		sizeof (uint32_t)	},
	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
	{ "MI_QUERY",		sizeof (mi_query_t)	}
};
2263
2264/*
2265 * Commands received over the loopback interface come here (via libipmp).
2266 */
2267static void
2268loopback_cmd(int sock, int family)
2269{
2270	int newfd;
2271	ssize_t len;
2272	boolean_t is_priv = _B_FALSE;
2273	struct sockaddr_storage	peer;
2274	struct sockaddr_in	*peer_sin;
2275	struct sockaddr_in6	*peer_sin6;
2276	socklen_t peerlen;
2277	union mi_commands mpi;
2278	char abuf[INET6_ADDRSTRLEN];
2279	uint_t cmd;
2280	int retval;
2281
2282	peerlen = sizeof (peer);
2283	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2284	if (newfd < 0) {
2285		logperror("loopback_cmd: accept");
2286		return;
2287	}
2288
2289	switch (family) {
2290	case AF_INET:
2291		/*
2292		 * Validate the address and port to make sure that
2293		 * non privileged processes don't connect and start
2294		 * talking to us.
2295		 */
2296		if (peerlen != sizeof (struct sockaddr_in)) {
2297			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2298			(void) close(newfd);
2299			return;
2300		}
2301		peer_sin = (struct sockaddr_in *)&peer;
2302		is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED;
2303		(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2304		    abuf, sizeof (abuf));
2305
2306		if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) {
2307			logerr("Attempt to connect from addr %s port %d\n",
2308			    abuf, ntohs(peer_sin->sin_port));
2309			(void) close(newfd);
2310			return;
2311		}
2312		break;
2313
2314	case AF_INET6:
2315		if (peerlen != sizeof (struct sockaddr_in6)) {
2316			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2317			(void) close(newfd);
2318			return;
2319		}
2320		/*
2321		 * Validate the address and port to make sure that
2322		 * non privileged processes don't connect and start
2323		 * talking to us.
2324		 */
2325		peer_sin6 = (struct sockaddr_in6 *)&peer;
2326		is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED;
2327		(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2328		    sizeof (abuf));
2329		if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) {
2330			logerr("Attempt to connect from addr %s port %d\n",
2331			    abuf, ntohs(peer_sin6->sin6_port));
2332			(void) close(newfd);
2333			return;
2334		}
2335
2336	default:
2337		logdebug("loopback_cmd: family %d\n", family);
2338		(void) close(newfd);
2339		return;
2340	}
2341
2342	/*
2343	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2344	 * all supported commands
2345	 */
2346	len = read(newfd, &mpi, sizeof (mpi));
2347
2348	/*
2349	 * In theory, we can receive any sized message for a stream socket,
2350	 * but we don't expect that to happen for a small message over a
2351	 * loopback connection.
2352	 */
2353	if (len < sizeof (uint32_t)) {
2354		logerr("loopback_cmd: bad command format or read returns "
2355		    "partial data %d\n", len);
2356		(void) close(newfd);
2357		return;
2358	}
2359
2360	cmd = mpi.mi_command;
2361	if (cmd >= MI_NCMD) {
2362		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2363		(void) close(newfd);
2364		return;
2365	}
2366
2367	/*
2368	 * Only MI_PING and MI_QUERY can come from unprivileged sources.
2369	 */
2370	if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) {
2371		logerr("Unprivileged request from %s for privileged "
2372		    "command %s\n", abuf, commands[cmd].name);
2373		(void) close(newfd);
2374		return;
2375	}
2376
2377	if (len < commands[cmd].size) {
2378		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2379		    commands[cmd].name, commands[cmd].size, len);
2380		(void) close(newfd);
2381		return;
2382	}
2383
2384	retval = process_cmd(newfd, &mpi);
2385	if (retval != IPMP_SUCCESS) {
2386		logerr("failed processing %s: %s\n", commands[cmd].name,
2387		    ipmp_errmsg(retval));
2388	}
2389	(void) close(newfd);
2390}
2391
2392/*
2393 * Process the commands received via libipmp.
2394 */
2395static unsigned int
2396process_cmd(int newfd, union mi_commands *mpi)
2397{
2398	struct phyint *pi;
2399	struct mi_offline *mio;
2400	struct mi_undo_offline *miu;
2401	unsigned int retval;
2402
2403	switch (mpi->mi_command) {
2404	case MI_PING:
2405		return (send_result(newfd, IPMP_SUCCESS, 0));
2406
2407	case MI_OFFLINE:
2408		mio = &mpi->mi_ocmd;
2409
2410		pi = phyint_lookup(mio->mio_ifname);
2411		if (pi == NULL)
2412			return (send_result(newfd, IPMP_EUNKIF, 0));
2413
2414		retval = phyint_offline(pi, mio->mio_min_redundancy);
2415		if (retval == IPMP_FAILURE)
2416			return (send_result(newfd, IPMP_FAILURE, errno));
2417
2418		return (send_result(newfd, retval, 0));
2419
2420	case MI_UNDO_OFFLINE:
2421		miu = &mpi->mi_ucmd;
2422
2423		pi = phyint_lookup(miu->miu_ifname);
2424		if (pi == NULL)
2425			return (send_result(newfd, IPMP_EUNKIF, 0));
2426
2427		retval = phyint_undo_offline(pi);
2428		if (retval == IPMP_FAILURE)
2429			return (send_result(newfd, IPMP_FAILURE, errno));
2430
2431		return (send_result(newfd, retval, 0));
2432
2433	case MI_QUERY:
2434		return (process_query(newfd, &mpi->mi_qcmd));
2435
2436	default:
2437		break;
2438	}
2439
2440	return (send_result(newfd, IPMP_EPROTO, 0));
2441}
2442
2443/*
2444 * Process the query request pointed to by `miq' and send a reply on file
2445 * descriptor `fd'.  Returns an IPMP error code.
2446 */
2447static unsigned int
2448process_query(int fd, mi_query_t *miq)
2449{
2450	ipmp_addrinfo_t		*adinfop;
2451	ipmp_addrinfolist_t	*adlp;
2452	ipmp_groupinfo_t	*grinfop;
2453	ipmp_groupinfolist_t	*grlp;
2454	ipmp_grouplist_t	*grlistp;
2455	ipmp_ifinfo_t		*ifinfop;
2456	ipmp_ifinfolist_t	*iflp;
2457	ipmp_snap_t		*snap;
2458	unsigned int		retval;
2459
2460	switch (miq->miq_inforeq) {
2461	case IPMP_ADDRINFO:
2462		retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr,
2463		    &adinfop);
2464		if (retval != IPMP_SUCCESS)
2465			return (send_result(fd, retval, errno));
2466
2467		retval = send_result(fd, IPMP_SUCCESS, 0);
2468		if (retval == IPMP_SUCCESS)
2469			retval = send_addrinfo(fd, adinfop);
2470
2471		ipmp_freeaddrinfo(adinfop);
2472		return (retval);
2473
2474	case IPMP_GROUPLIST:
2475		retval = getgrouplist(&grlistp);
2476		if (retval != IPMP_SUCCESS)
2477			return (send_result(fd, retval, errno));
2478
2479		retval = send_result(fd, IPMP_SUCCESS, 0);
2480		if (retval == IPMP_SUCCESS)
2481			retval = send_grouplist(fd, grlistp);
2482
2483		ipmp_freegrouplist(grlistp);
2484		return (retval);
2485
2486	case IPMP_GROUPINFO:
2487		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2488		retval = getgroupinfo(miq->miq_grname, &grinfop);
2489		if (retval != IPMP_SUCCESS)
2490			return (send_result(fd, retval, errno));
2491
2492		retval = send_result(fd, IPMP_SUCCESS, 0);
2493		if (retval == IPMP_SUCCESS)
2494			retval = send_groupinfo(fd, grinfop);
2495
2496		ipmp_freegroupinfo(grinfop);
2497		return (retval);
2498
2499	case IPMP_IFINFO:
2500		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2501		retval = getifinfo(miq->miq_ifname, &ifinfop);
2502		if (retval != IPMP_SUCCESS)
2503			return (send_result(fd, retval, errno));
2504
2505		retval = send_result(fd, IPMP_SUCCESS, 0);
2506		if (retval == IPMP_SUCCESS)
2507			retval = send_ifinfo(fd, ifinfop);
2508
2509		ipmp_freeifinfo(ifinfop);
2510		return (retval);
2511
2512	case IPMP_SNAP:
2513		/*
2514		 * Before taking the snapshot, sync with the kernel.
2515		 */
2516		initifs();
2517
2518		retval = getsnap(&snap);
2519		if (retval != IPMP_SUCCESS)
2520			return (send_result(fd, retval, errno));
2521
2522		retval = send_result(fd, IPMP_SUCCESS, 0);
2523		if (retval != IPMP_SUCCESS)
2524			goto out;
2525
2526		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2527		if (retval != IPMP_SUCCESS)
2528			goto out;
2529
2530		retval = send_grouplist(fd, snap->sn_grlistp);
2531		if (retval != IPMP_SUCCESS)
2532			goto out;
2533
2534		iflp = snap->sn_ifinfolistp;
2535		for (; iflp != NULL; iflp = iflp->ifl_next) {
2536			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2537			if (retval != IPMP_SUCCESS)
2538				goto out;
2539		}
2540
2541		grlp = snap->sn_grinfolistp;
2542		for (; grlp != NULL; grlp = grlp->grl_next) {
2543			retval = send_groupinfo(fd, grlp->grl_grinfop);
2544			if (retval != IPMP_SUCCESS)
2545				goto out;
2546		}
2547
2548		adlp = snap->sn_adinfolistp;
2549		for (; adlp != NULL; adlp = adlp->adl_next) {
2550			retval = send_addrinfo(fd, adlp->adl_adinfop);
2551			if (retval != IPMP_SUCCESS)
2552				goto out;
2553		}
2554	out:
2555		ipmp_snap_free(snap);
2556		return (retval);
2557
2558	default:
2559		break;
2560
2561	}
2562	return (send_result(fd, IPMP_EPROTO, 0));
2563}
2564
2565/*
2566 * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2567 * Returns an IPMP error code.
2568 */
2569static unsigned int
2570send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2571{
2572	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2573	ipmp_addrlist_t	*adlistp = grinfop->gr_adlistp;
2574	unsigned int	retval;
2575
2576	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2577	if (retval != IPMP_SUCCESS)
2578		return (retval);
2579
2580	retval = ipmp_writetlv(fd, IPMP_IFLIST,
2581	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp);
2582	if (retval != IPMP_SUCCESS)
2583		return (retval);
2584
2585	return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2586	    IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp));
2587}
2588
2589/*
2590 * Send the interface information pointed to by `ifinfop' on file descriptor
2591 * `fd'.  Returns an IPMP error code.
2592 */
2593static unsigned int
2594send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2595{
2596	ipmp_addrlist_t	*adlist4p = ifinfop->if_targinfo4.it_targlistp;
2597	ipmp_addrlist_t	*adlist6p = ifinfop->if_targinfo6.it_targlistp;
2598	unsigned int	retval;
2599
2600	retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop);
2601	if (retval != IPMP_SUCCESS)
2602		return (retval);
2603
2604	retval = ipmp_writetlv(fd, IPMP_ADDRLIST,
2605	    IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p);
2606	if (retval != IPMP_SUCCESS)
2607		return (retval);
2608
2609	return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2610	    IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p));
2611}
2612
2613/*
2614 * Send the address information pointed to by `adinfop' on file descriptor
2615 * `fd'.  Returns an IPMP error code.
2616 */
2617static unsigned int
2618send_addrinfo(int fd, ipmp_addrinfo_t *adinfop)
2619{
2620	return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop));
2621}
2622
2623/*
2624 * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2625 * Returns an IPMP error code.
2626 */
2627static unsigned int
2628send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2629{
2630	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2631	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2632}
2633
2634/*
2635 * Initialize an mi_result_t structure using `error' and `syserror' and
2636 * send it on file descriptor `fd'.  Returns an IPMP error code.
2637 */
2638static unsigned int
2639send_result(int fd, unsigned int error, int syserror)
2640{
2641	mi_result_t me;
2642
2643	me.me_mpathd_error = error;
2644	if (error == IPMP_FAILURE)
2645		me.me_sys_error = syserror;
2646	else
2647		me.me_sys_error = 0;
2648
2649	return (ipmp_write(fd, &me, sizeof (me)));
2650}
2651
2652/*
2653 * Daemonize the process.
2654 */
2655static boolean_t
2656daemonize(void)
2657{
2658	switch (fork()) {
2659	case -1:
2660		return (_B_FALSE);
2661
2662	case  0:
2663		/*
2664		 * Lose our controlling terminal, and become both a session
2665		 * leader and a process group leader.
2666		 */
2667		if (setsid() == -1)
2668			return (_B_FALSE);
2669
2670		/*
2671		 * Under POSIX, a session leader can accidentally (through
2672		 * open(2)) acquire a controlling terminal if it does not
2673		 * have one.  Just to be safe, fork() again so we are not a
2674		 * session leader.
2675		 */
2676		switch (fork()) {
2677		case -1:
2678			return (_B_FALSE);
2679
2680		case 0:
2681			(void) chdir("/");
2682			(void) umask(022);
2683			(void) fdwalk(closefunc, NULL);
2684			break;
2685
2686		default:
2687			_exit(EXIT_SUCCESS);
2688		}
2689		break;
2690
2691	default:
2692		_exit(EXIT_SUCCESS);
2693	}
2694
2695	return (_B_TRUE);
2696}
2697
2698/*
2699 * The parent has created some fds before forking on purpose, keep them open.
2700 */
2701static int
2702closefunc(void *not_used, int fd)
2703/* ARGSUSED */
2704{
2705	if (fd != lsock_v4 && fd != lsock_v6)
2706		(void) close(fd);
2707	return (0);
2708}
2709
2710/* LOGGER */
2711
2712#include <syslog.h>
2713
2714/*
2715 * Logging routines.  All routines log to syslog, unless the daemon is
2716 * running in the foreground, in which case the logging goes to stderr.
2717 *
2718 * The following routines are available:
2719 *
2720 *	logdebug(): A printf-like function for outputting debug messages
2721 *	(messages at LOG_DEBUG) that are only of use to developers.
2722 *
2723 *	logtrace(): A printf-like function for outputting tracing messages
2724 *	(messages at LOG_INFO) from the daemon.	 This is typically used
2725 *	to log the receipt of interesting network-related conditions.
2726 *
2727 *	logerr(): A printf-like function for outputting error messages
2728 *	(messages at LOG_ERR) from the daemon.
2729 *
2730 *	logperror*(): A set of functions used to output error messages
2731 *	(messages at LOG_ERR); these automatically append strerror(errno)
2732 *	and a newline to the message passed to them.
2733 *
2734 * NOTE: since the logging functions write to syslog, the messages passed
2735 *	 to them are not eligible for localization.  Thus, gettext() must
2736 *	 *not* be used.
2737 */
2738
2739static int logging = 0;
2740
2741static void
2742initlog(void)
2743{
2744	logging++;
2745	openlog("in.mpathd", LOG_PID, LOG_DAEMON);
2746}
2747
2748/* PRINTFLIKE2 */
2749void
2750logmsg(int pri, const char *fmt, ...)
2751{
2752	va_list ap;
2753
2754	va_start(ap, fmt);
2755
2756	if (logging)
2757		vsyslog(pri, fmt, ap);
2758	else
2759		(void) vfprintf(stderr, fmt, ap);
2760	va_end(ap);
2761}
2762
2763/* PRINTFLIKE1 */
2764void
2765logperror(const char *str)
2766{
2767	if (logging)
2768		syslog(LOG_ERR, "%s: %m\n", str);
2769	else
2770		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
2771}
2772
2773void
2774logperror_pii(struct phyint_instance *pii, const char *str)
2775{
2776	if (logging) {
2777		syslog(LOG_ERR, "%s (%s %s): %m\n",
2778		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
2779	} else {
2780		(void) fprintf(stderr, "%s (%s %s): %s\n",
2781		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
2782		    strerror(errno));
2783	}
2784}
2785
2786void
2787logperror_li(struct logint *li, const char *str)
2788{
2789	struct	phyint_instance	*pii = li->li_phyint_inst;
2790
2791	if (logging) {
2792		syslog(LOG_ERR, "%s (%s %s): %m\n",
2793		    str, AF_STR(pii->pii_af), li->li_name);
2794	} else {
2795		(void) fprintf(stderr, "%s (%s %s): %s\n",
2796		    str, AF_STR(pii->pii_af), li->li_name,
2797		    strerror(errno));
2798	}
2799}
2800
2801void
2802close_probe_socket(struct phyint_instance *pii, boolean_t polled)
2803{
2804	if (polled)
2805		(void) poll_remove(pii->pii_probe_sock);
2806	(void) close(pii->pii_probe_sock);
2807	pii->pii_probe_sock = -1;
2808	pii->pii_basetime_inited = 0;
2809}
2810
2811boolean_t
2812addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags,
2813    struct sockaddr_storage *ssp)
2814{
2815	addrlist_t *addrp;
2816
2817	if ((addrp = malloc(sizeof (addrlist_t))) == NULL)
2818		return (_B_FALSE);
2819
2820	(void) strlcpy(addrp->al_name, name, LIFNAMSIZ);
2821	addrp->al_flags = flags;
2822	addrp->al_addr = *ssp;
2823	addrp->al_next = *addrsp;
2824	*addrsp = addrp;
2825	return (_B_TRUE);
2826}
2827
2828void
2829addrlist_free(addrlist_t **addrsp)
2830{
2831	addrlist_t *addrp, *next_addrp;
2832
2833	for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) {
2834		next_addrp = addrp->al_next;
2835		free(addrp);
2836	}
2837	*addrsp = NULL;
2838}
2839
/*
 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
 * tables defined by mib2.h. Pass the table information returned to the
 * supplied function.
 *
 * The `mibfd' descriptor for /dev/ip is opened on first use and is not
 * closed here.  The replies are collected into a linked list of mib_item_t;
 * `proc' is invoked on that list only if every message was retrieved
 * successfully, and the list is freed before returning in all cases.
 *
 * Returns 0 on success and -1 on failure.
 */
static int
mibwalk(void (*proc)(mib_item_t *))
{
	mib_item_t		*head_item = NULL;
	mib_item_t		*last_item = NULL;
	mib_item_t		*tmp;
	struct strbuf		ctlbuf, databuf;
	int			flags;
	int			rval;
	/* 512-byte control buffer; tor, toa and tea all alias it */
	uintptr_t		buf[512 / sizeof (uintptr_t)];
	struct T_optmgmt_req	*tor = (struct T_optmgmt_req *)buf;
	struct T_optmgmt_ack	*toa = (struct T_optmgmt_ack *)buf;
	struct T_error_ack	*tea = (struct T_error_ack *)buf;
	struct opthdr		*req, *optp;
	int			status = -1;	/* pessimistic until the end */

	if (mibfd == -1) {
		if ((mibfd = open("/dev/ip", O_RDWR)) < 0) {
			logperror("mibwalk(): ip open");
			return (status);
		}
	}

	/* Build the request: a T_optmgmt_req followed by one opthdr. */
	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
	tor->OPT_offset = sizeof (struct T_optmgmt_req);
	tor->OPT_length = sizeof (struct opthdr);
	tor->MGMT_flags = T_CURRENT;

	/*
	 * Note: we use the special level value below so that IP will return
	 * us information concerning IRE_MARK_TESTHIDDEN routes.
	 */
	req = (struct opthdr *)&tor[1];
	req->level = EXPER_IP_AND_ALL_IRES;
	req->name  = 0;
	req->len   = 0;

	ctlbuf.buf = (char *)&buf;
	ctlbuf.len = tor->OPT_length + tor->OPT_offset;

	if (putmsg(mibfd, &ctlbuf, NULL, 0) == -1) {
		logperror("mibwalk(): putmsg(ctl)");
		return (status);
	}

	/*
	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
	 * a control and data part. The control part contains a struct
	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
	 * the level, name and length of the data in the data part. The
	 * data part contains the actual table data. The last message
	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
	 * single option with zero optlen.
	 */
	for (;;) {
		/*
		 * Clear errno so that the EINTR test below cannot be
		 * satisfied by a stale value when getmsg() did not fail.
		 */
		errno = flags = 0;
		ctlbuf.maxlen = sizeof (buf);
		rval = getmsg(mibfd, &ctlbuf, NULL, &flags);
		if (rval & MORECTL || rval < 0) {
			if (errno == EINTR)
				continue;
			logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n",
			    rval, errno);
			goto error;
		}
		/*
		 * NOTE(review): ctlbuf.len is a signed int compared against
		 * a sizeof expression here and below, so a negative length
		 * would be converted to a large unsigned value and pass
		 * these checks -- confirm whether getmsg() can leave len
		 * negative on this path.
		 */
		if (ctlbuf.len < sizeof (t_scalar_t)) {
			logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf.len);
			goto error;
		}

		switch (toa->PRIM_type) {
		case T_ERROR_ACK:
			if (ctlbuf.len < sizeof (struct T_error_ack)) {
				logerr("mibwalk(): T_ERROR_ACK ctlbuf "
				    "too short: %d\n", ctlbuf.len);
				goto error;
			}
			logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n"
			    " UNIX_err = 0x%lx\n", tea->TLI_error,
			    t_strerror(tea->TLI_error), tea->UNIX_error);
			goto error;

		case T_OPTMGMT_ACK:
			/* The opthdr immediately follows the ack header. */
			optp = (struct opthdr *)&toa[1];
			if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) +
			    sizeof (struct opthdr))) {
				logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too "
				    "short: %d\n", ctlbuf.len);
				goto error;
			}
			if (toa->MGMT_flags != T_SUCCESS) {
				logerr("mibwalk(): MGMT_flags != T_SUCCESS: "
				    "0x%lx\n", toa->MGMT_flags);
				goto error;
			}
			break;

		default:
			goto error;
		}
		/* The following assert also implies MGMT_flags == T_SUCCESS */
		assert(toa->PRIM_type == T_OPTMGMT_ACK);

		/*
		 * We have reached the end of this T_OPTMGMT_ACK
		 * message. If this is the last message i.e EOD,
		 * break, else process the next T_OPTMGMT_ACK msg.
		 */
		if (rval == 0) {
			if (optp->len == 0 && optp->name == 0 &&
			    optp->level == 0) {
				/* This is the EOD message. */
				break;
			}
			/* Not EOD but no data to retrieve */
			continue;
		}

		/*
		 * We should only be here if MOREDATA was set.
		 * Allocate an empty mib_item_t and link into the list
		 * of MIB items.
		 */
		if ((tmp = malloc(sizeof (*tmp))) == NULL) {
			logperror("mibwalk(): malloc() failed.");
			goto error;
		}
		if (last_item != NULL)
			last_item->mi_next = tmp;
		else
			head_item = tmp;
		last_item = tmp;
		last_item->mi_next = NULL;
		last_item->mi_opthdr = *optp;
		/*
		 * The item is already linked into the list, so if this
		 * allocation fails the cleanup at `error' still frees it.
		 */
		last_item->mi_valp = malloc(optp->len);
		if (last_item->mi_valp == NULL) {
			logperror("mibwalk(): malloc() failed.");
			goto error;
		}

		databuf.maxlen = last_item->mi_opthdr.len;
		databuf.buf = (char *)last_item->mi_valp;
		databuf.len = 0;

		/* Retrieve the actual MIB data */
		for (;;) {
			flags = 0;
			if ((rval = getmsg(mibfd, NULL, &databuf,
			    &flags)) != 0) {
				if (rval < 0 && errno == EINTR)
					continue;
				/*
				 * We shouldn't get MOREDATA here so treat that
				 * as an error.
				 */
				logperror("mibwalk(): getmsg(data)");
				goto error;
			}
			break;
		}
	}
	status = 0;
	/* Pass the accumulated MIB data to the supplied function pointer */
	(*proc)(head_item);
	/*
	 * The success path falls through to here as well: the list is freed
	 * on every exit and `status' tells the two cases apart.
	 */
error:
	while (head_item != NULL) {
		tmp = head_item;
		head_item = tmp->mi_next;
		free(tmp->mi_valp);
		free(tmp);
	}
	return (status);
}
3019
3020/*
3021 * Parse the supplied mib2 information to get the size of routing table
3022 * entries. This is needed when running in a branded zone where the
3023 * Solaris application environment and the Solaris kernel may not be the
3024 * the same release version.
3025 */
3026static void
3027mib_get_constants(mib_item_t *item)
3028{
3029	mib2_ip_t		*ipv4;
3030	mib2_ipv6IfStatsEntry_t	*ipv6;
3031
3032	for (; item != NULL; item = item->mi_next) {
3033		if (item->mi_opthdr.name != 0)
3034			continue;
3035		if (item->mi_opthdr.level == MIB2_IP) {
3036			ipv4 = (mib2_ip_t *)item->mi_valp;
3037			ipRouteEntrySize = ipv4->ipRouteEntrySize;
3038		} else if (item->mi_opthdr.level == MIB2_IP6) {
3039			ipv6 = (mib2_ipv6IfStatsEntry_t *)item->mi_valp;
3040			ipv6RouteEntrySize = ipv6->ipv6RouteEntrySize;
3041		}
3042	}
3043}
3044