1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22/* 23 * nfs_cast.c : broadcast to a specific group of NFS servers 24 * 25 * Copyright (c) 1988-1996,1998,1999,2001 by Sun Microsystems, Inc. 26 * All rights reserved. 27 */ 28 29/* 30 * Portions Copyright 2007-2011 Apple Inc. 31 */ 32 33#pragma ident "@(#)nfs_cast.c 1.26 05/06/08 SMI" 34 35#include <stdio.h> 36#include <syslog.h> 37#include <errno.h> 38#include <string.h> 39#include <sys/types.h> 40#include <sys/time.h> 41#include <sys/resource.h> 42#include <unistd.h> 43#include <stdlib.h> 44#include <oncrpc/rpc.h> 45#include <oncrpc/pmap_prot.h> 46#include <sys/socket.h> 47#include <netdb.h> 48#define NFSCLIENT 49#include <locale.h> 50 51#include "automount.h" 52 53#define PENALTY_WEIGHT 100000 54 55struct tstamps { 56 struct tstamps *ts_next; 57 int ts_penalty; 58 int ts_inx; 59 int ts_rcvd; 60 struct timeval ts_timeval; 61}; 62 63/* A list of addresses - all belonging to the same transport */ 64 65struct addrs { 66 struct addrs *addr_next; 67 struct mapfs *addr_mfs; 68 struct hostent *addr_addrs; 69 struct tstamps *addr_if_tstamps; 70}; 71 72/* A list of connectionless transports */ 73 74struct transp { 75 struct transp *tr_next; 76 int tr_fd; 77 const char *tr_afname; 78 struct addrs *tr_addrs; 79}; 80 81/* A list of map entries and their roundtrip times, for sorting */ 82 83struct sm { 84 struct mapfs *mfs; 85 struct timeval timeval; 86}; 87 88static void free_transports(struct transp *); 89static void calc_resp_time(struct timeval *); 90static struct mapfs *sort_responses(struct transp *); 91static int host_sm(const void *, const void *b); 92static int time_sm(const void *, const void *b); 93extern struct mapfs *add_mfs(struct mapfs *, int, struct mapfs **, 94 struct mapfs **); 95 96struct aftype { 97 int afnum; 98 char *name; 99}; 100 101/* 102 * This routine is designed to be able to "ping" 103 * a list of hosts and create a list of responding 104 * hosts sorted by response time. 105 * This must be done without any prior 106 * contact with the host - therefore the "ping" 107 * must be to a "well-known" address. The outstanding 108 * candidate here is the address of the portmapper/rpcbind. 109 * 110 * A response to a ping is no guarantee that the host 111 * is running NFS, has a mount daemon, or exports 112 * the required filesystem. If the subsequent 113 * mount attempt fails then the host will be marked 114 * "ignore" and the host list will be re-pinged 115 * (sans the bad host). This process continues 116 * until a successful mount is achieved or until 117 * there are no hosts left to try. 118 */ 119enum clnt_stat 120nfs_cast(struct mapfs *mfs_in, struct mapfs **mfs_out, int timeout) 121{ 122 struct servent *portmap; 123 enum clnt_stat clnt_stat; 124 AUTH *sys_auth = authunix_create_default(); 125 XDR xdr_stream; 126 register XDR *xdrs = &xdr_stream; 127 int outlen; 128 static const struct aftype aflist[] = { 129 { AF_INET, "IPv4" }, 130#ifdef HAVE_IPV6_SUPPORT 131 { AF_INET6, "IPv6" } 132#endif 133 }; 134#define N_AFS (sizeof aflist / sizeof aflist[0]) 135 int if_inx; 136 int tsec; 137 int sent, addr_cnt, rcvd; 138 fd_set readfds, mask; 139 register uint32_t xid; /* xid - unique per addr */ 140 register int i; 141 struct rpc_msg msg; 142 struct timeval t, rcv_timeout; 143 char outbuf[UDPMSGSIZE], inbuf[UDPMSGSIZE]; 144 struct hostent *hp; 145 int error_num; 146 char **hostaddrs; 147 struct sockaddr_storage to_addr; 148 struct sockaddr *to; 149 struct sockaddr_storage from_addr; 150 socklen_t fromlen; 151 ssize_t len; 152 struct transp *tr_head; 153 struct transp *trans, *prev_trans; 154 struct addrs *a, *prev_addr; 155 struct tstamps *ts, *prev_ts; 156 size_t af_idx; 157 int af; 158 struct rlimit rl; 159 int dtbsize; 160 struct mapfs *mfs; 161 162 portmap = getservbyname("sunrpc", "udp"); 163 164 /* 165 * For each connectionless transport get a list of 166 * host addresses. Any single host may have 167 * addresses on several transports. 168 */ 169 addr_cnt = sent = rcvd = 0; 170 tr_head = NULL; 171 FD_ZERO(&mask); 172 173 /* 174 * Set the default select size to be the maximum FD_SETSIZE, unless 175 * the current rlimit is lower. 176 */ 177 dtbsize = FD_SETSIZE; 178 if (getrlimit(RLIMIT_NOFILE, &rl) == 0) { 179 if (rl.rlim_cur < FD_SETSIZE) 180 dtbsize = (int)rl.rlim_cur; 181 } 182 183 prev_trans = NULL; 184 prev_addr = NULL; 185 prev_ts = NULL; 186 for (mfs = mfs_in; mfs; mfs = mfs->mfs_next) { 187 188 if (trace > 2) 189 trace_prt(1, "nfs_cast: host=%s\n", mfs->mfs_host); 190 191 for (af_idx = 0; af_idx < N_AFS; af_idx++) { 192 af = aflist[af_idx].afnum; 193 trans = (struct transp *)malloc(sizeof (*trans)); 194 if (trans == NULL) { 195 syslog(LOG_ERR, "no memory"); 196 clnt_stat = RPC_CANTSEND; 197 goto done_broad; 198 } 199 (void) memset(trans, 0, sizeof (*trans)); 200 if (tr_head == NULL) 201 tr_head = trans; 202 else 203 prev_trans->tr_next = trans; 204 prev_trans = trans; 205 206 trans->tr_fd = socket(af, SOCK_DGRAM, IPPROTO_UDP); 207 if (trans->tr_fd < 0) { 208 syslog(LOG_ERR, "nfscast: UDP %s socket: %m", 209 aflist[af_idx].name); 210 clnt_stat = RPC_CANTSEND; 211 goto done_broad; 212 } 213 trans->tr_afname = aflist[af_idx].name; 214 215 FD_SET(trans->tr_fd, &mask); 216 217 if_inx = 0; 218 hp = getipnodebyname(mfs->mfs_host, af, AI_DEFAULT, &error_num); 219 if (hp != NULL) { 220 /* 221 * If mfs->ignore is previously set for 222 * this map, clear it. Because a host can 223 * have either v6 or v4 address 224 */ 225 if (mfs->mfs_ignore == 1) 226 mfs->mfs_ignore = 0; 227 228 a = (struct addrs *)malloc(sizeof (*a)); 229 if (a == NULL) { 230 syslog(LOG_ERR, "no memory"); 231 clnt_stat = RPC_CANTSEND; 232 freehostent(hp); 233 goto done_broad; 234 } 235 (void) memset(a, 0, sizeof (*a)); 236 if (trans->tr_addrs == NULL) 237 trans->tr_addrs = a; 238 else 239 prev_addr->addr_next = a; 240 prev_addr = a; 241 a->addr_if_tstamps = NULL; 242 a->addr_mfs = mfs; 243 a->addr_addrs = hp; 244 hostaddrs = hp->h_addr_list; 245 while (*hostaddrs) { 246 ts = (struct tstamps *) 247 malloc(sizeof (*ts)); 248 if (ts == NULL) { 249 syslog(LOG_ERR, "no memory"); 250 clnt_stat = RPC_CANTSEND; 251 goto done_broad; 252 } 253 (void) memset(ts, 0, sizeof (*ts)); 254 ts->ts_penalty = mfs->mfs_penalty; 255 if (a->addr_if_tstamps == NULL) 256 a->addr_if_tstamps = ts; 257 else 258 prev_ts->ts_next = ts; 259 prev_ts = ts; 260 ts->ts_inx = if_inx++; 261 addr_cnt++; 262 hostaddrs++; 263 } 264 break; 265 } else { 266 mfs->mfs_ignore = 1; 267 if (verbose) 268 syslog(LOG_ERR, 269 "%s:%s address not known", 270 mfs->mfs_host, 271 aflist[af_idx].name); 272 } 273 } /* for */ 274 } /* for */ 275 if (addr_cnt == 0) { 276 syslog(LOG_ERR, "nfscast: couldn't find addresses"); 277 clnt_stat = RPC_CANTSEND; 278 goto done_broad; 279 } 280 281 (void) gettimeofday(&t, (struct timezone *)0); 282 xid = (uint32_t)(getpid() ^ t.tv_sec ^ t.tv_usec) & ~0xFF; 283 t.tv_usec = 0; 284 285 /* serialize the RPC header */ 286 287 msg.rm_direction = CALL; 288 msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 289 msg.rm_call.cb_prog = PMAPPROG; 290 /* 291 * we can not use RPCBVERS here since it doesn't exist in 4.X, 292 * the fix to Sun bug 1139883 has made the 4.X portmapper silent to 293 * version mismatches. This causes the RPC call to the remote 294 * portmapper to simply be ignored if it's not Version 2. 295 */ 296 msg.rm_call.cb_vers = PMAPVERS; 297 msg.rm_call.cb_proc = NULLPROC; 298 if (sys_auth == (AUTH *)NULL) { 299 clnt_stat = RPC_SYSTEMERROR; 300 goto done_broad; 301 } 302 msg.rm_call.cb_cred = sys_auth->ah_cred; 303 msg.rm_call.cb_verf = sys_auth->ah_verf; 304 xdrmem_create(xdrs, (uint8_t *) outbuf, sizeof (outbuf), XDR_ENCODE); 305 if (! xdr_callmsg(xdrs, &msg)) { 306 clnt_stat = RPC_CANTENCODEARGS; 307 goto done_broad; 308 } 309 outlen = (int)xdr_getpos(xdrs); 310 xdr_destroy(xdrs); 311 312 /* 313 * Basic loop: send packet to all hosts and wait for response(s). 314 * The response timeout grows larger per iteration. 315 * A unique xid is assigned to each address in order to 316 * correctly match the replies. 317 */ 318 for (tsec = 4; timeout > 0; tsec *= 2) { 319 320 timeout -= tsec; 321 if (timeout <= 0) 322 tsec += timeout; 323 324 rcv_timeout.tv_sec = tsec; 325 rcv_timeout.tv_usec = 0; 326 327 sent = 0; 328 for (trans = tr_head; trans; trans = trans->tr_next) { 329 for (a = trans->tr_addrs; a; a = a->addr_next) { 330 ts = a->addr_if_tstamps; 331 hp = a->addr_addrs; 332 hostaddrs = hp->h_addr_list; 333 while (*hostaddrs) { 334 /* 335 * xid is the first thing in 336 * preserialized buffer 337 */ 338 /* LINTED pointer alignment */ 339 *((uint32_t *)outbuf) = 340 htonl(xid + ts->ts_inx); 341 (void) gettimeofday(&(ts->ts_timeval), 342 (struct timezone *)0); 343 /* 344 * Check if already received 345 * from a previous iteration. 346 */ 347 if (ts->ts_rcvd) { 348 sent++; 349 ts = ts->ts_next; 350 continue; 351 } 352 353 to = (struct sockaddr *)&to_addr; 354 to->sa_family = hp->h_addrtype; 355 356 if (to->sa_family == AF_INET) { 357 struct sockaddr_in *sin; 358 359 sin = (struct sockaddr_in *)to; 360 to->sa_len = sizeof(*sin); 361 sin->sin_port = portmap->s_port; 362 memcpy(&sin->sin_addr, 363 *hostaddrs++, hp->h_length); 364 } else { /* must be AF_INET6 */ 365 struct sockaddr_in6 *sin6; 366 367 sin6 = (struct sockaddr_in6 *)to; 368 to->sa_len = sizeof(*sin6); 369 sin6->sin6_port = portmap->s_port; 370 memcpy(&sin6->sin6_addr, 371 *hostaddrs++, hp->h_length); 372 } 373 374 if (sendto(trans->tr_fd, outbuf, 375 outlen, 0, to, to->sa_len) != -1) { 376 sent++; 377 } 378 379 ts = ts->ts_next; 380 } 381 } 382 } 383 if (sent == 0) { /* no packets sent ? */ 384 clnt_stat = RPC_CANTSEND; 385 goto done_broad; 386 } 387 388 /* 389 * Have sent all the packets. Now collect the responses... 390 */ 391 rcvd = 0; 392 recv_again: 393 msg.acpted_rply.ar_verf = _null_auth; 394 msg.acpted_rply.ar_results.proc = (xdrproc_t)xdr_void; 395 readfds = mask; 396 397 switch (select(dtbsize, &readfds, 398 (fd_set *)NULL, (fd_set *)NULL, &rcv_timeout)) { 399 400 case 0: /* Timed out */ 401 /* 402 * If we got at least one response in the 403 * last interval, then don't wait for any 404 * more. In theory we should wait for 405 * the max weighting (penalty) value so 406 * that a very slow server has a chance to 407 * respond but this could take a long time 408 * if the admin has set a high weighting 409 * value. 410 */ 411 if (rcvd > 0) 412 goto done_broad; 413 414 clnt_stat = RPC_TIMEDOUT; 415 continue; 416 417 case -1: /* some kind of error */ 418 if (errno == EINTR) 419 goto recv_again; 420 syslog(LOG_ERR, "nfscast: select: %m"); 421 if (rcvd == 0) 422 clnt_stat = RPC_CANTRECV; 423 goto done_broad; 424 425 } /* end of select results switch */ 426 427 for (trans = tr_head; trans; trans = trans->tr_next) { 428 if (FD_ISSET(trans->tr_fd, &readfds)) 429 break; 430 } 431 if (trans == NULL) 432 goto recv_again; 433 434 try_again: 435 len = recvfrom(trans->tr_fd, inbuf, sizeof (inbuf), 0, 436 (struct sockaddr *)&from_addr, &fromlen); 437 if (len < 0) { 438 if (errno == EINTR) 439 goto try_again; 440 syslog(LOG_ERR, "nfscast: recvfrom: UDP %s:%m", 441 trans->tr_afname); 442 clnt_stat = RPC_CANTRECV; 443 continue; 444 } 445 if ((size_t)len < sizeof (uint32_t)) 446 goto recv_again; 447 448 /* 449 * see if reply transaction id matches sent id. 450 * If so, decode the results. 451 * Note: received addr is ignored, it could be 452 * different from the send addr if the host has 453 * more than one addr. 454 */ 455 xdrmem_create(xdrs, (uint8_t *) inbuf, (uint_t)len, XDR_DECODE); 456 if (xdr_replymsg(xdrs, &msg)) { 457 if (msg.rm_reply.rp_stat == MSG_ACCEPTED && 458 (msg.rm_xid & ~0xFF) == xid) { 459 struct addrs *curr_addr; 460 461 i = msg.rm_xid & 0xFF; 462 for (curr_addr = trans->tr_addrs; curr_addr; 463 curr_addr = curr_addr->addr_next) { 464 for (ts = curr_addr->addr_if_tstamps; ts; 465 ts = ts->ts_next) 466 if (ts->ts_inx == i && !ts->ts_rcvd) { 467 ts->ts_rcvd = 1; 468 calc_resp_time(&ts->ts_timeval); 469 clnt_stat = RPC_SUCCESS; 470 rcvd++; 471 break; 472 } 473 } 474 } /* otherwise, we just ignore the errors ... */ 475 } 476 xdrs->x_op = XDR_FREE; 477 msg.acpted_rply.ar_results.proc = (xdrproc_t)xdr_void; 478 (void) xdr_replymsg(xdrs, &msg); 479 XDR_DESTROY(xdrs); 480 if (rcvd == sent) 481 goto done_broad; 482 else 483 goto recv_again; 484 } 485 if (!rcvd) 486 clnt_stat = RPC_TIMEDOUT; 487 488done_broad: 489 if (rcvd) { 490 *mfs_out = sort_responses(tr_head); 491 clnt_stat = RPC_SUCCESS; 492 } 493 free_transports(tr_head); 494 AUTH_DESTROY(sys_auth); 495 return (clnt_stat); 496} 497 498/* 499 * Go through all the responses and sort fastest to slowest. 500 * Note that any penalty is added to the response time - so the 501 * fastest response isn't necessarily the one that arrived first. 502 */ 503static struct mapfs * 504sort_responses(trans) 505 struct transp *trans; 506{ 507 struct transp *t; 508 struct addrs *a; 509 struct tstamps *ti; 510 int i, size = 0, allocsize = 10; 511 struct mapfs *p, *mfs_head = NULL, *mfs_tail = NULL; 512 struct sm *buffer; 513 514 buffer = (struct sm *)malloc(allocsize * sizeof (struct sm)); 515 if (!buffer) { 516 syslog(LOG_ERR, "sort_responses: malloc error.\n"); 517 return (NULL); 518 } 519 520 for (t = trans; t; t = t->tr_next) { 521 for (a = t->tr_addrs; a; a = a->addr_next) { 522 for (ti = a->addr_if_tstamps; 523 ti; ti = ti->ts_next) { 524 if (!ti->ts_rcvd) 525 continue; 526 ti->ts_timeval.tv_usec += 527 (ti->ts_penalty * PENALTY_WEIGHT); 528 if (ti->ts_timeval.tv_usec >= 1000000) { 529 ti->ts_timeval.tv_sec += 530 (ti->ts_timeval.tv_usec / 1000000); 531 ti->ts_timeval.tv_usec = 532 (ti->ts_timeval.tv_usec % 1000000); 533 } 534 535 if (size >= allocsize) { 536 allocsize += 10; 537 buffer = (struct sm *)realloc(buffer, 538 allocsize * sizeof (struct sm)); 539 if (!buffer) { 540 syslog(LOG_ERR, 541 "sort_responses: malloc error.\n"); 542 return (NULL); 543 } 544 } 545 buffer[size].timeval = ti->ts_timeval; 546 buffer[size].mfs = a->addr_mfs; 547 size++; 548 } 549 } 550 } 551 552#ifdef DEBUG 553 if (trace > 3) { 554 trace_prt(1, " sort_responses: before host sort:\n"); 555 for (i = 0; i < size; i++) 556 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host, 557 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec); 558 trace_prt(0, "\n"); 559 } 560#endif 561 562 qsort((void *)buffer, size, sizeof (struct sm), host_sm); 563 564 /* 565 * Cope with multiply listed hosts by choosing first time 566 */ 567 for (i = 1; i < size; i++) { 568#ifdef DEBUG 569 if (trace > 3) { 570 trace_prt(1, " sort_responses: comparing %s and %s\n", 571 buffer[i-1].mfs->mfs_host, 572 buffer[i].mfs->mfs_host); 573 } 574#endif 575 if (strcmp(buffer[i-1].mfs->mfs_host, 576 buffer[i].mfs->mfs_host) == 0) 577 memcpy(&buffer[i].timeval, &buffer[i-1].timeval, 578 sizeof (struct timeval)); 579 } 580 if (trace > 3) 581 trace_prt(0, "\n"); 582 583#ifdef DEBUG 584 if (trace > 3) { 585 trace_prt(1, " sort_responses: before time sort:\n"); 586 for (i = 0; i < size; i++) 587 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host, 588 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec); 589 trace_prt(0, "\n"); 590 } 591#endif 592 593 qsort((void *)buffer, size, sizeof (struct sm), time_sm); 594 595#ifdef DEBUG 596 if (trace > 3) { 597 trace_prt(1, " sort_responses: after sort:\n"); 598 for (i = 0; i < size; i++) 599 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host, 600 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec); 601 trace_prt(0, "\n"); 602 } 603#endif 604 605 for (i = 0; i < size; i++) { 606#ifdef DEBUG 607 if (trace > 3) { 608 trace_prt(1, " sort_responses: adding %s\n", 609 buffer[i].mfs->mfs_host); 610 } 611#endif 612 p = add_mfs(buffer[i].mfs, 0, &mfs_head, &mfs_tail); 613 if (!p) 614 return (NULL); 615 } 616 free(buffer); 617 618 return (mfs_head); 619} 620 621 622/* 623 * Comparison routines called by qsort(3). 624 */ 625static int host_sm(const void *a, const void *b) 626{ 627 return (strcmp(((struct sm *)a)->mfs->mfs_host, 628 ((struct sm *)b)->mfs->mfs_host)); 629} 630 631static int time_sm(const void *a, const void *b) 632{ 633 if (timercmp(&(((struct sm *)a)->timeval), 634 &(((struct sm *)b)->timeval), < /* cstyle */)) 635 return (-1); 636 else if (timercmp(&(((struct sm *)a)->timeval), 637 &(((struct sm *)b)->timeval), > /* cstyle */)) 638 return (1); 639 else 640 return (0); 641} 642 643/* 644 * Given send_time which is the time a request 645 * was transmitted to a server, subtract it 646 * from the time "now" thereby converting it 647 * to an elapsed time. 648 */ 649static void 650calc_resp_time(send_time) 651struct timeval *send_time; 652{ 653 struct timeval time_now; 654 655 (void) gettimeofday(&time_now, (struct timezone *)0); 656 if (time_now.tv_usec < send_time->tv_usec) { 657 time_now.tv_sec--; 658 time_now.tv_usec += 1000000; 659 } 660 send_time->tv_sec = time_now.tv_sec - send_time->tv_sec; 661 send_time->tv_usec = time_now.tv_usec - send_time->tv_usec; 662} 663 664static void 665free_transports(trans) 666 struct transp *trans; 667{ 668 struct transp *t, *tmpt = NULL; 669 struct addrs *a, *tmpa = NULL; 670 struct tstamps *ts, *tmpts = NULL; 671 672 for (t = trans; t; t = tmpt) { 673 if (t->tr_fd > 0) 674 (void) close(t->tr_fd); 675 for (a = t->tr_addrs; a; a = tmpa) { 676 for (ts = a->addr_if_tstamps; ts; ts = tmpts) { 677 tmpts = ts->ts_next; 678 free(ts); 679 } 680 freehostent(a->addr_addrs); 681 tmpa = a->addr_next; 682 free(a); 683 } 684 tmpt = t->tr_next; 685 free(t); 686 } 687} 688