1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 *	nfs_cast.c : broadcast to a specific group of NFS servers
24 *
25 *	Copyright (c) 1988-1996,1998,1999,2001 by Sun Microsystems, Inc.
26 *	All rights reserved.
27 */
28
29/*
30 * Portions Copyright 2007-2011 Apple Inc.
31 */
32
33#pragma ident	"@(#)nfs_cast.c	1.26	05/06/08 SMI"
34
35#include <stdio.h>
36#include <syslog.h>
37#include <errno.h>
38#include <string.h>
39#include <sys/types.h>
40#include <sys/time.h>
41#include <sys/resource.h>
42#include <unistd.h>
43#include <stdlib.h>
44#include <oncrpc/rpc.h>
45#include <oncrpc/pmap_prot.h>
46#include <sys/socket.h>
47#include <netdb.h>
48#define	NFSCLIENT
49#include <locale.h>
50
51#include "automount.h"
52
53#define	PENALTY_WEIGHT    100000
54
/*
 * Per-address bookkeeping: one entry exists for every address in a
 * host's address list.
 */
struct tstamps {
	struct tstamps	*ts_next;	/* next entry in the list */
	int		ts_penalty;	/* copied from the map entry's mfs_penalty */
	int		ts_inx;		/* address index, added to the base xid */
	int		ts_rcvd;	/* set to 1 once a matching reply arrives */
	struct timeval	ts_timeval;	/* send time; becomes elapsed time on reply */
};
62
/* A list of addresses - all belonging to the same transport */

struct addrs {
	struct addrs		*addr_next;	/* next host on this transport */
	struct mapfs		*addr_mfs;	/* map entry these addresses belong to */
	struct hostent		*addr_addrs;	/* lookup result; released with freehostent() */
	struct tstamps		*addr_if_tstamps; /* one tstamps per entry in h_addr_list */
};
71
/* A list of connectionless transports */

struct transp {
	struct transp		*tr_next;	/* next transport in the list */
	int			tr_fd;		/* UDP socket used to send and receive */
	const char		*tr_afname;	/* address family name, for log messages */
	struct addrs		*tr_addrs;	/* hosts reached through this socket */
};
80
/* A list of map entries and their roundtrip times, for sorting */

struct sm {
	struct mapfs *mfs;		/* map entry that responded */
	struct timeval timeval;		/* its response time, penalty included */
};
87
/* Forward declarations for the static helpers defined later in this file. */
static void free_transports(struct transp *);
static void calc_resp_time(struct timeval *);
static struct mapfs *sort_responses(struct transp *);
/* qsort(3) comparators over struct sm: by host name, then by time. */
static int host_sm(const void *, const void *b);
static int time_sm(const void *, const void *b);
/* Declared extern; its definition is not in this section of the source. */
extern struct mapfs *add_mfs(struct mapfs *, int, struct mapfs **,
	struct mapfs **);
95
/* Pairs an address family number with a printable name for log messages. */
struct aftype {
	int	afnum;		/* AF_* constant passed to socket() and the resolver */
	char	*name;		/* human-readable name, e.g. "IPv4" */
};
100
101/*
102 * This routine is designed to be able to "ping"
103 * a list of hosts and create a list of responding
104 * hosts sorted by response time.
105 * This must be done without any prior
106 * contact with the host - therefore the "ping"
107 * must be to a "well-known" address.  The outstanding
108 * candidate here is the address of the portmapper/rpcbind.
109 *
110 * A response to a ping is no guarantee that the host
111 * is running NFS, has a mount daemon, or exports
112 * the required filesystem.  If the subsequent
113 * mount attempt fails then the host will be marked
114 * "ignore" and the host list will be re-pinged
115 * (sans the bad host). This process continues
116 * until a successful mount is achieved or until
117 * there are no hosts left to try.
118 */
119enum clnt_stat
120nfs_cast(struct mapfs *mfs_in, struct mapfs **mfs_out, int timeout)
121{
122	struct servent *portmap;
123	enum clnt_stat clnt_stat;
124	AUTH *sys_auth = authunix_create_default();
125	XDR xdr_stream;
126	register XDR *xdrs = &xdr_stream;
127	int outlen;
128	static const struct aftype aflist[] = {
129		{ AF_INET, "IPv4" },
130#ifdef HAVE_IPV6_SUPPORT
131		{ AF_INET6, "IPv6" }
132#endif
133	};
134#define N_AFS	(sizeof aflist / sizeof aflist[0])
135	int if_inx;
136	int tsec;
137	int sent, addr_cnt, rcvd;
138	fd_set readfds, mask;
139	register uint32_t xid;		/* xid - unique per addr */
140	register int i;
141	struct rpc_msg msg;
142	struct timeval t, rcv_timeout;
143	char outbuf[UDPMSGSIZE], inbuf[UDPMSGSIZE];
144	struct hostent *hp;
145	int error_num;
146	char **hostaddrs;
147	struct sockaddr_storage to_addr;
148	struct sockaddr *to;
149	struct sockaddr_storage from_addr;
150	socklen_t fromlen;
151	ssize_t len;
152	struct transp *tr_head;
153	struct transp *trans, *prev_trans;
154	struct addrs *a, *prev_addr;
155	struct tstamps *ts, *prev_ts;
156	size_t af_idx;
157	int af;
158	struct rlimit rl;
159	int dtbsize;
160	struct mapfs *mfs;
161
162	portmap = getservbyname("sunrpc", "udp");
163
164	/*
165	 * For each connectionless transport get a list of
166	 * host addresses.  Any single host may have
167	 * addresses on several transports.
168	 */
169	addr_cnt = sent = rcvd = 0;
170	tr_head = NULL;
171	FD_ZERO(&mask);
172
173	/*
174	 * Set the default select size to be the maximum FD_SETSIZE, unless
175	 * the current rlimit is lower.
176	 */
177	dtbsize = FD_SETSIZE;
178	if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
179		if (rl.rlim_cur < FD_SETSIZE)
180			dtbsize = (int)rl.rlim_cur;
181	}
182
183	prev_trans = NULL;
184	prev_addr = NULL;
185	prev_ts = NULL;
186	for (mfs = mfs_in; mfs; mfs = mfs->mfs_next) {
187
188		if (trace > 2)
189			trace_prt(1, "nfs_cast: host=%s\n", mfs->mfs_host);
190
191		for (af_idx = 0; af_idx < N_AFS; af_idx++) {
192			af = aflist[af_idx].afnum;
193			trans = (struct transp *)malloc(sizeof (*trans));
194			if (trans == NULL) {
195				syslog(LOG_ERR, "no memory");
196				clnt_stat = RPC_CANTSEND;
197				goto done_broad;
198			}
199			(void) memset(trans, 0, sizeof (*trans));
200			if (tr_head == NULL)
201				tr_head = trans;
202			else
203				prev_trans->tr_next = trans;
204			prev_trans = trans;
205
206			trans->tr_fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
207			if (trans->tr_fd < 0) {
208				syslog(LOG_ERR, "nfscast: UDP %s socket: %m",
209				    aflist[af_idx].name);
210				clnt_stat = RPC_CANTSEND;
211				goto done_broad;
212			}
213			trans->tr_afname = aflist[af_idx].name;
214
215			FD_SET(trans->tr_fd, &mask);
216
217			if_inx = 0;
218			hp = getipnodebyname(mfs->mfs_host, af, AI_DEFAULT, &error_num);
219			if (hp != NULL) {
220				/*
221				 * If mfs->ignore is previously set for
222				 * this map, clear it. Because a host can
223				 * have either v6 or v4 address
224				 */
225				if (mfs->mfs_ignore == 1)
226					mfs->mfs_ignore = 0;
227
228				a = (struct addrs *)malloc(sizeof (*a));
229				if (a == NULL) {
230					syslog(LOG_ERR, "no memory");
231					clnt_stat = RPC_CANTSEND;
232					freehostent(hp);
233					goto done_broad;
234				}
235				(void) memset(a, 0, sizeof (*a));
236				if (trans->tr_addrs == NULL)
237					trans->tr_addrs = a;
238				else
239					prev_addr->addr_next = a;
240				prev_addr = a;
241				a->addr_if_tstamps = NULL;
242				a->addr_mfs = mfs;
243				a->addr_addrs = hp;
244				hostaddrs = hp->h_addr_list;
245				while (*hostaddrs) {
246					ts = (struct tstamps *)
247						malloc(sizeof (*ts));
248					if (ts == NULL) {
249						syslog(LOG_ERR, "no memory");
250						clnt_stat = RPC_CANTSEND;
251						goto done_broad;
252					}
253					(void) memset(ts, 0, sizeof (*ts));
254					ts->ts_penalty = mfs->mfs_penalty;
255					if (a->addr_if_tstamps == NULL)
256						a->addr_if_tstamps = ts;
257					else
258						prev_ts->ts_next = ts;
259					prev_ts = ts;
260					ts->ts_inx = if_inx++;
261					addr_cnt++;
262					hostaddrs++;
263				}
264				break;
265			} else {
266				mfs->mfs_ignore = 1;
267				if (verbose)
268					syslog(LOG_ERR,
269				"%s:%s address not known",
270				mfs->mfs_host,
271				aflist[af_idx].name);
272			}
273		} /* for */
274	} /* for */
275	if (addr_cnt == 0) {
276		syslog(LOG_ERR, "nfscast: couldn't find addresses");
277		clnt_stat = RPC_CANTSEND;
278		goto done_broad;
279	}
280
281	(void) gettimeofday(&t, (struct timezone *)0);
282	xid = (uint32_t)(getpid() ^ t.tv_sec ^ t.tv_usec) & ~0xFF;
283	t.tv_usec = 0;
284
285	/* serialize the RPC header */
286
287	msg.rm_direction = CALL;
288	msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
289	msg.rm_call.cb_prog = PMAPPROG;
290	/*
291	 * we can not use RPCBVERS here since it doesn't exist in 4.X,
292	 * the fix to Sun bug 1139883 has made the 4.X portmapper silent to
293	 * version mismatches. This causes the RPC call to the remote
294	 * portmapper to simply be ignored if it's not Version 2.
295	 */
296	msg.rm_call.cb_vers = PMAPVERS;
297	msg.rm_call.cb_proc = NULLPROC;
298	if (sys_auth == (AUTH *)NULL) {
299		clnt_stat = RPC_SYSTEMERROR;
300		goto done_broad;
301	}
302	msg.rm_call.cb_cred = sys_auth->ah_cred;
303	msg.rm_call.cb_verf = sys_auth->ah_verf;
304	xdrmem_create(xdrs, (uint8_t *) outbuf, sizeof (outbuf), XDR_ENCODE);
305	if (! xdr_callmsg(xdrs, &msg)) {
306		clnt_stat = RPC_CANTENCODEARGS;
307		goto done_broad;
308	}
309	outlen = (int)xdr_getpos(xdrs);
310	xdr_destroy(xdrs);
311
312	/*
313	 * Basic loop: send packet to all hosts and wait for response(s).
314	 * The response timeout grows larger per iteration.
315	 * A unique xid is assigned to each address in order to
316	 * correctly match the replies.
317	 */
318	for (tsec = 4; timeout > 0; tsec *= 2) {
319
320		timeout -= tsec;
321		if (timeout <= 0)
322			tsec += timeout;
323
324		rcv_timeout.tv_sec = tsec;
325		rcv_timeout.tv_usec = 0;
326
327		sent = 0;
328		for (trans = tr_head; trans; trans = trans->tr_next) {
329			for (a = trans->tr_addrs; a; a = a->addr_next) {
330				ts = a->addr_if_tstamps;
331				hp = a->addr_addrs;
332				hostaddrs = hp->h_addr_list;
333				while (*hostaddrs) {
334					/*
335					 * xid is the first thing in
336					 * preserialized buffer
337					 */
338					/* LINTED pointer alignment */
339					*((uint32_t *)outbuf) =
340						htonl(xid + ts->ts_inx);
341					(void) gettimeofday(&(ts->ts_timeval),
342						(struct timezone *)0);
343					/*
344					 * Check if already received
345					 * from a previous iteration.
346					 */
347					if (ts->ts_rcvd) {
348						sent++;
349						ts = ts->ts_next;
350						continue;
351					}
352
353					to = (struct sockaddr *)&to_addr;
354					to->sa_family = hp->h_addrtype;
355
356					if (to->sa_family == AF_INET) {
357						struct sockaddr_in *sin;
358
359						sin = (struct sockaddr_in *)to;
360						to->sa_len = sizeof(*sin);
361						sin->sin_port = portmap->s_port;
362						memcpy(&sin->sin_addr,
363						    *hostaddrs++, hp->h_length);
364					} else {	/* must be AF_INET6 */
365						struct sockaddr_in6 *sin6;
366
367						sin6 = (struct sockaddr_in6 *)to;
368						to->sa_len = sizeof(*sin6);
369						sin6->sin6_port = portmap->s_port;
370						memcpy(&sin6->sin6_addr,
371						    *hostaddrs++, hp->h_length);
372					}
373
374					if (sendto(trans->tr_fd, outbuf,
375					    outlen, 0, to, to->sa_len) != -1) {
376						sent++;
377					}
378
379					ts = ts->ts_next;
380				}
381			}
382		}
383		if (sent == 0) {		/* no packets sent ? */
384			clnt_stat = RPC_CANTSEND;
385			goto done_broad;
386		}
387
388		/*
389		 * Have sent all the packets.  Now collect the responses...
390		 */
391		rcvd = 0;
392	recv_again:
393		msg.acpted_rply.ar_verf = _null_auth;
394		msg.acpted_rply.ar_results.proc = (xdrproc_t)xdr_void;
395		readfds = mask;
396
397		switch (select(dtbsize, &readfds,
398			(fd_set *)NULL, (fd_set *)NULL, &rcv_timeout)) {
399
400		case 0: /* Timed out */
401			/*
402			 * If we got at least one response in the
403			 * last interval, then don't wait for any
404			 * more.  In theory we should wait for
405			 * the max weighting (penalty) value so
406			 * that a very slow server has a chance to
407			 * respond but this could take a long time
408			 * if the admin has set a high weighting
409			 * value.
410			 */
411			if (rcvd > 0)
412				goto done_broad;
413
414			clnt_stat = RPC_TIMEDOUT;
415			continue;
416
417		case -1:  /* some kind of error */
418			if (errno == EINTR)
419				goto recv_again;
420			syslog(LOG_ERR, "nfscast: select: %m");
421			if (rcvd == 0)
422				clnt_stat = RPC_CANTRECV;
423			goto done_broad;
424
425		}  /* end of select results switch */
426
427		for (trans = tr_head; trans; trans = trans->tr_next) {
428			if (FD_ISSET(trans->tr_fd, &readfds))
429				break;
430		}
431		if (trans == NULL)
432			goto recv_again;
433
434	try_again:
435		len = recvfrom(trans->tr_fd, inbuf, sizeof (inbuf), 0,
436		    (struct sockaddr *)&from_addr, &fromlen);
437		if (len < 0) {
438			if (errno == EINTR)
439				goto try_again;
440			syslog(LOG_ERR, "nfscast: recvfrom: UDP %s:%m",
441				trans->tr_afname);
442			clnt_stat = RPC_CANTRECV;
443			continue;
444		}
445		if ((size_t)len < sizeof (uint32_t))
446			goto recv_again;
447
448		/*
449		 * see if reply transaction id matches sent id.
450		 * If so, decode the results.
451		 * Note: received addr is ignored, it could be
452		 * different from the send addr if the host has
453		 * more than one addr.
454		 */
455		xdrmem_create(xdrs, (uint8_t *) inbuf, (uint_t)len,	XDR_DECODE);
456		if (xdr_replymsg(xdrs, &msg)) {
457		    if (msg.rm_reply.rp_stat == MSG_ACCEPTED &&
458			(msg.rm_xid & ~0xFF) == xid) {
459			struct addrs *curr_addr;
460
461			i = msg.rm_xid & 0xFF;
462			for (curr_addr = trans->tr_addrs; curr_addr;
463			    curr_addr = curr_addr->addr_next) {
464			    for (ts = curr_addr->addr_if_tstamps; ts;
465				ts = ts->ts_next)
466				if (ts->ts_inx == i && !ts->ts_rcvd) {
467					ts->ts_rcvd = 1;
468					calc_resp_time(&ts->ts_timeval);
469					clnt_stat = RPC_SUCCESS;
470					rcvd++;
471					break;
472				}
473			}
474		    } /* otherwise, we just ignore the errors ... */
475		}
476		xdrs->x_op = XDR_FREE;
477		msg.acpted_rply.ar_results.proc = (xdrproc_t)xdr_void;
478		(void) xdr_replymsg(xdrs, &msg);
479		XDR_DESTROY(xdrs);
480		if (rcvd == sent)
481			goto done_broad;
482		else
483			goto recv_again;
484	}
485	if (!rcvd)
486		clnt_stat = RPC_TIMEDOUT;
487
488done_broad:
489	if (rcvd) {
490		*mfs_out = sort_responses(tr_head);
491		clnt_stat = RPC_SUCCESS;
492	}
493	free_transports(tr_head);
494	AUTH_DESTROY(sys_auth);
495	return (clnt_stat);
496}
497
498/*
499 * Go through all the responses and sort fastest to slowest.
500 * Note that any penalty is added to the response time - so the
501 * fastest response isn't necessarily the one that arrived first.
502 */
503static struct mapfs *
504sort_responses(trans)
505	struct transp *trans;
506{
507	struct transp *t;
508	struct addrs *a;
509	struct tstamps *ti;
510	int i, size = 0, allocsize = 10;
511	struct mapfs *p, *mfs_head = NULL, *mfs_tail = NULL;
512	struct sm *buffer;
513
514	buffer = (struct sm *)malloc(allocsize * sizeof (struct sm));
515	if (!buffer) {
516		syslog(LOG_ERR, "sort_responses: malloc error.\n");
517		return (NULL);
518	}
519
520	for (t = trans; t; t = t->tr_next) {
521		for (a = t->tr_addrs; a; a = a->addr_next) {
522			for (ti = a->addr_if_tstamps;
523				ti; ti = ti->ts_next) {
524				if (!ti->ts_rcvd)
525					continue;
526				ti->ts_timeval.tv_usec +=
527					(ti->ts_penalty * PENALTY_WEIGHT);
528				if (ti->ts_timeval.tv_usec >= 1000000) {
529					ti->ts_timeval.tv_sec +=
530					(ti->ts_timeval.tv_usec / 1000000);
531					ti->ts_timeval.tv_usec =
532					(ti->ts_timeval.tv_usec % 1000000);
533				}
534
535				if (size >= allocsize) {
536					allocsize += 10;
537					buffer = (struct sm *)realloc(buffer,
538					    allocsize * sizeof (struct sm));
539					if (!buffer) {
540						syslog(LOG_ERR,
541					    "sort_responses: malloc error.\n");
542						return (NULL);
543					}
544				}
545				buffer[size].timeval = ti->ts_timeval;
546				buffer[size].mfs = a->addr_mfs;
547				size++;
548			}
549		}
550	}
551
552#ifdef DEBUG
553	if (trace > 3) {
554		trace_prt(1, "  sort_responses: before host sort:\n");
555		for (i = 0; i < size; i++)
556			trace_prt(1, "    %s %d.%d\n", buffer[i].mfs->mfs_host,
557			buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
558		trace_prt(0, "\n");
559	}
560#endif
561
562	qsort((void *)buffer, size, sizeof (struct sm), host_sm);
563
564	/*
565	 * Cope with multiply listed hosts  by choosing first time
566	 */
567	for (i = 1; i < size; i++) {
568#ifdef DEBUG
569		if (trace > 3) {
570			trace_prt(1, "  sort_responses: comparing %s and %s\n",
571				buffer[i-1].mfs->mfs_host,
572				buffer[i].mfs->mfs_host);
573		}
574#endif
575		if (strcmp(buffer[i-1].mfs->mfs_host,
576		    buffer[i].mfs->mfs_host) == 0)
577			memcpy(&buffer[i].timeval, &buffer[i-1].timeval,
578				sizeof (struct timeval));
579	}
580	if (trace > 3)
581		trace_prt(0, "\n");
582
583#ifdef DEBUG
584	if (trace > 3) {
585		trace_prt(1, "  sort_responses: before time sort:\n");
586		for (i = 0; i < size; i++)
587			trace_prt(1, "    %s %d.%d\n", buffer[i].mfs->mfs_host,
588			buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
589		trace_prt(0, "\n");
590	}
591#endif
592
593	qsort((void *)buffer, size, sizeof (struct sm), time_sm);
594
595#ifdef DEBUG
596	if (trace > 3) {
597		trace_prt(1, "  sort_responses: after sort:\n");
598		for (i = 0; i < size; i++)
599			trace_prt(1, "    %s %d.%d\n", buffer[i].mfs->mfs_host,
600			buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
601		trace_prt(0, "\n");
602	}
603#endif
604
605	for (i = 0; i < size; i++) {
606#ifdef DEBUG
607		if (trace > 3) {
608			trace_prt(1, "  sort_responses: adding %s\n",
609				buffer[i].mfs->mfs_host);
610		}
611#endif
612		p = add_mfs(buffer[i].mfs, 0, &mfs_head, &mfs_tail);
613		if (!p)
614			return (NULL);
615	}
616	free(buffer);
617
618	return (mfs_head);
619}
620
621
622/*
623 * Comparison routines called by qsort(3).
624 */
625static int host_sm(const void *a, const void *b)
626{
627	return (strcmp(((struct sm *)a)->mfs->mfs_host,
628			((struct sm *)b)->mfs->mfs_host));
629}
630
631static int time_sm(const void *a, const void *b)
632{
633	if (timercmp(&(((struct sm *)a)->timeval),
634	    &(((struct sm *)b)->timeval), < /* cstyle */))
635		return (-1);
636	else if (timercmp(&(((struct sm *)a)->timeval),
637	    &(((struct sm *)b)->timeval), > /* cstyle */))
638		return (1);
639	else
640		return (0);
641}
642
/*
 * Given send_time which is the time a request
 * was transmitted to a server, subtract it
 * from the time "now" thereby converting it
 * to an elapsed time.
 *
 * send_time is overwritten in place with the difference; a microsecond
 * borrow is taken from the seconds field when needed, so the resulting
 * tv_usec is never negative.
 */
static void
calc_resp_time(struct timeval *send_time)
{
	struct timeval time_now;

	(void) gettimeofday(&time_now, NULL);
	if (time_now.tv_usec < send_time->tv_usec) {
		/* Borrow one second so the usec subtraction stays >= 0. */
		time_now.tv_sec--;
		time_now.tv_usec += 1000000;
	}
	send_time->tv_sec = time_now.tv_sec - send_time->tv_sec;
	send_time->tv_usec = time_now.tv_usec - send_time->tv_usec;
}
663
664static void
665free_transports(trans)
666	struct transp *trans;
667{
668	struct transp *t, *tmpt = NULL;
669	struct addrs *a, *tmpa = NULL;
670	struct tstamps *ts, *tmpts = NULL;
671
672	for (t = trans; t; t = tmpt) {
673		if (t->tr_fd > 0)
674			(void) close(t->tr_fd);
675		for (a = t->tr_addrs; a; a = tmpa) {
676			for (ts = a->addr_if_tstamps; ts; ts = tmpts) {
677				tmpts = ts->ts_next;
678				free(ts);
679			}
680			freehostent(a->addr_addrs);
681			tmpa = a->addr_next;
682			free(a);
683		}
684		tmpt = t->tr_next;
685		free(t);
686	}
687}
688