/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include "rcm_impl.h"
#include "rcm_module.h"

/*
 * Global locks
 */
mutex_t rcm_req_lock;	/* protects global dr & info request list */

/*
 * Daemon state file
 */
static int state_fd;
#define	RCM_STATE_FILE	"/var/run/rcm_daemon_state"
#define	N_REQ_CHUNK	10	/* grow 10 entries at a time */

/*
 * Daemon timeout value
 */
#define	RCM_DAEMON_TIMEOUT	300	/* 5 minutes idle time */

/*
 * Struct for a list of outstanding rcm requests
 */
typedef struct {
	int	seq_num;		/* sequence number of request */
	int	state;			/* current state */
	pid_t	pid;			/* pid of initiator */
	uint_t	flag;			/* request flags */
	int	type;			/* resource(device) type */
	timespec_t interval;		/* suspend interval */
	char	device[MAXPATHLEN];	/* name of device or resource */
} req_t;

typedef struct {
	int	n_req;		/* number of entries in use */
	int	n_req_max;	/* number of req_t's to follow */
	int	n_seq_max;	/* last sequence number */
	int	idle_timeout;	/* persist idle timeout value */
	req_t	req[1];
	/* more req_t follows */
} req_list_t;
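
/*
 * A req_list_t is a growable array: req[1] is the first slot and
 * additional req_t entries are appended immediately after the struct
 * (the pre-C99 flexible-array idiom).  n_req counts the entries in
 * use; n_req_max is the allocated capacity.  The dr request list is
 * backed by the mmap'ed state file so it survives daemon restarts,
 * while the info request list lives on the heap.
 */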

static req_list_t *dr_req_list;
static req_list_t *info_req_list;

static const char *locked_info = "DR operation in progress";
static const char *locked_err = "Resource is busy";

static int rcmd_get_state();
static void add_to_polling_list(pid_t);
static void remove_from_polling_list(pid_t);

void start_polling_thread();
static void stop_polling_thread();

/*
 * Initialize request lists required for locking
 */
void
rcmd_lock_init(void)
{
	int size;
	struct stat fbuf;

	/*
	 * Start info list with one slot, then grow on demand.
	 */
	info_req_list = s_calloc(1, sizeof (req_list_t));
	info_req_list->n_req_max = 1;

	/*
	 * Open daemon state file and map in contents
	 */
	state_fd = open(RCM_STATE_FILE, O_CREAT|O_RDWR, 0600);
	if (state_fd == -1) {
		rcm_log_message(RCM_ERROR, gettext("cannot open %s: %s\n"),
		    RCM_STATE_FILE, strerror(errno));
		rcmd_exit(errno);
	}

	if (fstat(state_fd, &fbuf) != 0) {
		rcm_log_message(RCM_ERROR, gettext("cannot stat %s: %s\n"),
		    RCM_STATE_FILE, strerror(errno));
		rcmd_exit(errno);
	}

	size = fbuf.st_size;
	if (size == 0) {
		size = sizeof (req_list_t);
		if (ftruncate(state_fd, size) != 0) {
			rcm_log_message(RCM_ERROR,
			    gettext("cannot truncate %s: %s\n"),
			    RCM_STATE_FILE, strerror(errno));
			rcmd_exit(errno);
		}
	}

	/*LINTED*/
	dr_req_list = (req_list_t *)mmap(NULL, size, PROT_READ|PROT_WRITE,
	    MAP_SHARED, state_fd, 0);
	if (dr_req_list == MAP_FAILED) {
		rcm_log_message(RCM_ERROR, gettext("cannot mmap %s: %s\n"),
		    RCM_STATE_FILE, strerror(errno));
		rcmd_exit(errno);
	}

	/*
	 * Initial size is one entry
	 */
	if (dr_req_list->n_req_max == 0) {
		dr_req_list->n_req_max = 1;
		(void) fsync(state_fd);
		return;
	}

	rcm_log_message(RCM_DEBUG, "n_req = %d, n_req_max = %d\n",
	    dr_req_list->n_req, dr_req_list->n_req_max);

	/*
	 * Recover the daemon state
	 */
	clean_dr_list();
}
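
/*
 * Sequence numbers are issued in two parts: the base number for a DR
 * operation lives in the high-order bits (shifted left by SEQ_NUM_SHIFT)
 * and the low-order bits (SEQ_NUM_MASK) count cascade operations spawned
 * on behalf of that base operation.  For illustration only, if
 * SEQ_NUM_SHIFT were 8, the third DR operation would get base 0x300 and
 * its first cascade request 0x301; the actual shift and mask values come
 * from rcm_impl.h.
 */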

/*
 * Get a unique sequence number--to be called with rcm_req_lock held.
 */
static int
get_seq_number()
{
	int number;

	if (dr_req_list == NULL)
		return (0);

	dr_req_list->n_seq_max++;
	number = (dr_req_list->n_seq_max << SEQ_NUM_SHIFT);
	(void) fsync(state_fd);

	return (number);
}

/*
 * Find entry in list with the same resource name and sequence number.
 * If seq_num == -1, no seq_num matching is required.
 */
static req_t *
find_req_entry(char *device, uint_t flag, int seq_num, req_list_t *list)
{
	int i;

	/*
	 * Look for entry with the same resource and seq_num.
	 * Also match RCM_FILESYS field in flag.
	 */
	for (i = 0; i < list->n_req_max; i++) {
		if (list->req[i].state == RCM_STATE_REMOVE)
			/* stale entry */
			continue;
		/*
		 * We need to distinguish a file system root from the
		 * directory it is mounted on.
		 *
		 * Applications are not aware of any difference between
		 * the two, but the system keeps track of it internally by
		 * checking for mount points while traversing the file path.
		 * In a similar spirit, RCM keeps this difference as an
		 * implementation detail.
		 */
		if ((strcmp(device, list->req[i].device) != 0) ||
		    (list->req[i].flag & RCM_FILESYS) != (flag & RCM_FILESYS))
			/* different resource */
			continue;

		if ((seq_num != -1) && ((seq_num >> SEQ_NUM_SHIFT) !=
		    (list->req[i].seq_num >> SEQ_NUM_SHIFT)))
			/* different base seqnum */
			continue;

		return (&list->req[i]);
	}

	return (NULL);
}

/*
 * Get the next empty req_t entry. If no entry exists, grow the list.
 */
static req_t *
get_req_entry(req_list_t **listp)
{
	int i;
	int n_req = (*listp)->n_req;
	int n_req_max = (*listp)->n_req_max;

	/*
	 * If the list is full, grow the list and return the first
	 * entry in the new portion.
	 */
	if (n_req == n_req_max) {
		int newsize;

		n_req_max += N_REQ_CHUNK;
		newsize = sizeof (req_list_t) + (n_req_max - 1) *
		    sizeof (req_t);

		if (listp == &info_req_list) {
			*listp = s_realloc(*listp, newsize);
		} else if (ftruncate(state_fd, newsize) != 0) {
			rcm_log_message(RCM_ERROR,
			    gettext("cannot truncate %s: %s\n"),
			    RCM_STATE_FILE, strerror(errno));
			rcmd_exit(errno);
		/*LINTED*/
		} else if ((*listp = (req_list_t *)mmap(NULL, newsize,
		    PROT_READ|PROT_WRITE, MAP_SHARED, state_fd, 0)) ==
		    MAP_FAILED) {
			rcm_log_message(RCM_ERROR,
			    gettext("cannot mmap %s: %s\n"),
			    RCM_STATE_FILE, strerror(errno));
			rcmd_exit(errno);
		}

		/* Initialize the new entries */
		for (i = (*listp)->n_req_max; i < n_req_max; i++) {
			(*listp)->req[i].state = RCM_STATE_REMOVE;
			(void) strcpy((*listp)->req[i].device, "");
		}

		(*listp)->n_req_max = n_req_max;
		(*listp)->n_req++;
		return (&(*listp)->req[n_req]);
	}

	/*
	 * List contains empty slots; find one.
	 */
	for (i = 0; i < n_req_max; i++) {
		if (((*listp)->req[i].device[0] == '\0') ||
		    ((*listp)->req[i].state == RCM_STATE_REMOVE)) {
			break;
		}
	}

	assert(i < n_req_max);	/* empty slot must exist */

	(*listp)->n_req++;
	return (&(*listp)->req[i]);
}
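
/*
 * Note on growth of the dr list above: after ftruncate() extends the
 * state file, a fresh mapping is created and the previous (smaller)
 * mapping is simply abandoned rather than munmap'ed.  Growth happens
 * N_REQ_CHUNK entries at a time, so the number of stale mappings
 * stays small for the life of the daemon.
 */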

/*
 * When one resource depends on multiple resources, it's possible that
 * rcm_get_info can be called multiple times on the resource, resulting
 * in duplicate information. By assigning a unique sequence number to
 * each rcm_get_info operation, this duplication can be eliminated.
 *
 * Insert a dr entry in info_req_list
 */
int
info_req_add(char *rsrcname, uint_t flag, int seq_num)
{
	int error = 0;
	char *device;
	req_t *req;

	rcm_log_message(RCM_TRACE2, "info_req_add(%s, %d)\n",
	    rsrcname, seq_num);

	device = resolve_name(rsrcname);
	(void) mutex_lock(&rcm_req_lock);

	/*
	 * Look for entry with the same resource and seq_num.
	 * If it exists, we return an error so that such
	 * information is not gathered more than once.
	 */
	if (find_req_entry(device, flag, seq_num, info_req_list) != NULL) {
		rcm_log_message(RCM_DEBUG, "getinfo cycle: %s %d\n",
		    device, seq_num);
		error = -1;
		goto out;
	}

	/*
	 * Get empty entry and fill in seq_num and device.
	 */
	req = get_req_entry(&info_req_list);
	req->seq_num = seq_num;
	req->state = RCM_STATE_ONLINE;  /* mark that the entry is in use */
	req->flag = flag;
	(void) strcpy(req->device, device);

out:
	(void) mutex_unlock(&rcm_req_lock);
	free(device);

	return (error);
}

/*
 * Remove all entries associated with seq_num from info_req_list
 */
void
info_req_remove(int seq_num)
{
	int i;

	rcm_log_message(RCM_TRACE3, "info_req_remove(%d)\n", seq_num);

	seq_num >>= SEQ_NUM_SHIFT;
	(void) mutex_lock(&rcm_req_lock);

	/* remove all entries with seq_num */
	for (i = 0; i < info_req_list->n_req_max; i++) {
		if (info_req_list->req[i].state == RCM_STATE_REMOVE)
			continue;

		if ((info_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != seq_num)
			continue;

		info_req_list->req[i].state = RCM_STATE_REMOVE;
		info_req_list->n_req--;
	}

	/*
	 * We don't shrink the info_req_list size for now.
	 */
	(void) mutex_unlock(&rcm_req_lock);
}

/*
 * Check for lock conflicts. There is a conflict if:
 * - an attempt is made to DR a node when either its ancestor or
 *	descendant is in the process of DR
 * - an attempt is made to register for a node when its ancestor is
 *	locked for DR
 */
static int
check_lock(char *device, uint_t flag, int cflag, rcm_info_t **info)
{
	int i, ret = RCM_SUCCESS;

	if (info)
		*info = NULL;

	/*
	 * During daemon initialization, don't check locks
	 */
	if (dr_req_list == NULL)
		return (ret);

	for (i = 0; i < dr_req_list->n_req; i++) {
		req_t *req = &dr_req_list->req[i];
		char *dr_dev = req->device;

		/*
		 * Skip empty entries
		 */
		if ((req->state == RCM_STATE_REMOVE) || (dr_dev[0] == '\0'))
			continue;

		/*
		 * Make sure that neither device itself nor any of its
		 * ancestors is being operated upon.
		 */
		if (EQUAL(device, dr_dev) || DESCENDENT(device, dr_dev)) {
			/*
			 * An exception to this is a file system.
			 * We should allow a file system rooted at a
			 * child directory to be unmounted.
			 */
			if ((flag & RCM_FILESYS) && (!EQUAL(device, dr_dev) ||
			    ((dr_req_list->req[i].flag & RCM_FILESYS) == 0)))
				continue;

			assert(info != 0);

			add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
			    dr_req_list->req[i].state,
			    dr_req_list->req[i].seq_num, NULL, locked_info,
			    locked_err, NULL, info);
			ret = RCM_CONFLICT;
			break;
		}

		if ((cflag == LOCK_FOR_DR) && DESCENDENT(dr_dev, device)) {
			/*
			 * Check descendants only for a DR request.
			 *
			 * Multiple descendants could be doing DR;
			 * we want to find them all.
			 */
			assert(info != 0);

			add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
			    dr_req_list->req[i].state,
			    dr_req_list->req[i].seq_num, NULL, locked_info,
			    locked_err, NULL, info);
			ret = RCM_CONFLICT;
			/* don't break here, need to find all conflicts */
		}
	}

	return (ret);
}
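
/*
 * For illustration (assuming EQUAL/DESCENDENT, from the daemon headers,
 * compare resolved pathnames by prefix): with "/dev/dsk/c0t0d0" locked
 * for DR, a DR or registration request on "/dev/dsk/c0t0d0s0" conflicts
 * because an ancestor is locked, and a DR request on "/dev/dsk"
 * conflicts because a descendant is locked; a registration on
 * "/dev/dsk" is allowed, since only DR requests are checked against
 * descendants.
 */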

/*
 * Check for lock conflicts for DR operation or client registration
 */
int
rsrc_check_lock_conflicts(char *rsrcname, uint_t flag, int cflag,
    rcm_info_t **info)
{
	int result;
	char *device;

	device = resolve_name(rsrcname);
	result = check_lock(device, flag, cflag, info);
	free(device);

	return (result);
}

static int
transition_state(int state)
{
	/*
	 * If the resource state is in transition, ask caller to
	 * try again.
	 */
	switch (state) {
	case RCM_STATE_OFFLINING:
	case RCM_STATE_SUSPENDING:
	case RCM_STATE_RESUMING:
	case RCM_STATE_ONLINING:
	case RCM_STATE_REMOVING:
		return (1);

	default:
		break;
	}
	return (0);
}

/*
 * Update a dr entry in dr_req_list
 */
/*ARGSUSED*/
static int
dr_req_update_entry(char *device, pid_t pid, uint_t flag, int state,
    int seq_num, timespec_t *interval, rcm_info_t **infop)
{
	req_t *req;

	/*
	 * Find request entry. If not found, return RCM_FAILURE
	 */
	req = find_req_entry(device, flag, -1, dr_req_list);

	if (req == NULL) {
		switch (state) {
		case RCM_STATE_OFFLINE_QUERYING:
		case RCM_STATE_SUSPEND_QUERYING:
		case RCM_STATE_OFFLINING:
		case RCM_STATE_SUSPENDING:
			/* could be re-do operation, no error message */
			break;

		default:
			rcm_log_message(RCM_DEBUG,
			    "update non-existing resource %s\n", device);
		}
		return (RCM_FAILURE);
	}

	/*
	 * During initialization, update is unconditional (forced)
	 * in order to bring the daemon up in a sane state.
	 */
	if (rcmd_get_state() == RCMD_INIT)
		goto update;

	/*
	 * Don't allow update with mismatched initiator pid. This could happen
	 * as part of normal operation.
	 */
	if (pid != req->pid) {
		rcm_log_message(RCM_INFO,
		    gettext("mismatched dr initiator pid: %ld %ld\n"),
		    req->pid, pid);
		goto failure;
	}

	rcm_log_message(RCM_TRACE4,
	    "dr_req_update_entry: state=%d, device=%s\n",
	    req->state, req->device);

	/*
	 * Check that the state transition is valid
	 */
	switch (state) {
	case RCM_STATE_OFFLINE_QUERYING:
	case RCM_STATE_OFFLINING:
		/*
		 * This is the case of re-offlining, which applies only
		 * if a previous attempt failed.
		 */
		if ((req->state != RCM_STATE_OFFLINE_FAIL) &&
		    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
		    (req->state != RCM_STATE_OFFLINE)) {
			rcm_log_message(RCM_WARNING,
			    gettext("%s: invalid offlining from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_SUSPEND_QUERYING:
	case RCM_STATE_SUSPENDING:
		/*
		 * This is the case of re-suspending, which applies only
		 * if a previous attempt failed.
		 */
		if ((req->state != RCM_STATE_SUSPEND_FAIL) &&
		    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
		    (req->state != RCM_STATE_SUSPEND)) {
			rcm_log_message(RCM_WARNING,
			    gettext("%s: invalid suspending from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_RESUMING:
		if ((req->state != RCM_STATE_SUSPEND) &&
		    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
		    (req->state != RCM_STATE_SUSPEND_FAIL)) {
			rcm_log_message(RCM_DEBUG,
			    "%s: invalid resuming from state %d\n",
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_ONLINING:
		if ((req->state != RCM_STATE_OFFLINE) &&
		    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
		    (req->state != RCM_STATE_OFFLINE_FAIL)) {
			rcm_log_message(RCM_INFO,
			    gettext("%s: invalid onlining from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_REMOVING:
		if ((req->state != RCM_STATE_OFFLINE) &&
		    (req->state != RCM_STATE_OFFLINE_FAIL)) {
			rcm_log_message(RCM_INFO,
			    gettext("%s: invalid removing from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_SUSPEND_FAIL:
		assert(req->state == RCM_STATE_SUSPENDING);
		break;

	case RCM_STATE_OFFLINE_FAIL:
		assert(req->state == RCM_STATE_OFFLINING);
		break;

	case RCM_STATE_SUSPEND:
		assert(req->state == RCM_STATE_SUSPENDING);
		break;

	case RCM_STATE_OFFLINE:
		assert(req->state == RCM_STATE_OFFLINING);
		break;

	case RCM_STATE_ONLINE:
		assert((req->state == RCM_STATE_RESUMING) ||
		    (req->state == RCM_STATE_ONLINING));
		break;

	default:	/* shouldn't be here */
		rcm_log_message(RCM_ERROR,
		    gettext("invalid update to dr state: %d\n"), state);
		return (RCM_FAILURE);
	}

update:
	/*
	 * update the state, interval, and sequence number; sync state file
	 */
	req->state = state;
	req->seq_num = seq_num;

	if (interval)
		req->interval = *interval;
	else
		bzero(&req->interval, sizeof (timespec_t));

	(void) fsync(state_fd);
	return (RCM_SUCCESS);

failure:
	if (infop != NULL) {
		add_busy_rsrc_to_list(req->device, req->pid, req->state,
		    req->seq_num, NULL, locked_info, locked_err, NULL, infop);
	}

	/*
	 * A request may be left in a transition state because the operator
	 * typed ctrl-C. In this case, the daemon thread continues to run
	 * and will eventually put the state in a non-transitional state.
	 *
	 * To be safe, we return EAGAIN to allow librcm to loop and retry.
	 * If we are called from a module, loop & retry could result in a
	 * deadlock. The caller will check for this case and turn EAGAIN
	 * into RCM_CONFLICT.
	 */
	if (transition_state(req->state)) {
		return (EAGAIN);
	}

	return (RCM_CONFLICT);
}

/*
 * Insert a dr entry in dr_req_list
 */
int
dr_req_add(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
    timespec_t *interval, rcm_info_t **info)
{
	int error;
	char *device;
	req_t *req;

	rcm_log_message(RCM_TRACE3, "dr_req_add(%s, %ld, 0x%x, %d, %d, %p)\n",
	    rsrcname, pid, flag, state, seq_num, (void *)info);

	device = resolve_name(rsrcname);
	if (device == NULL)
		return (EINVAL);

	(void) mutex_lock(&rcm_req_lock);

	/*
	 * In the re-offline/suspend case, attempt to update dr request.
	 *
	 * If this succeeds, return success;
	 * if this fails because of a conflict, return error;
	 * if this fails because no entry exists, add a new entry.
	 */
	error = dr_req_update_entry(device, pid, flag, state, seq_num, interval,
	    info);

	switch (error) {
	case RCM_FAILURE:
		/* proceed to add a new entry */
		break;

	case RCM_CONFLICT:
	case RCM_SUCCESS:
	case EAGAIN:
	default:
		goto out;
	}

	/*
	 * Check for lock conflicts
	 */
	error = check_lock(device, flag, LOCK_FOR_DR, info);
	if (error != RCM_SUCCESS) {
		error = RCM_CONFLICT;
		goto out;
	}

	/*
	 * Get empty request entry, fill in values and sync state file
	 */
	req = get_req_entry(&dr_req_list);

	req->seq_num = seq_num;
	req->pid = pid;
	req->flag = flag;
	req->state = state;
	req->type = rsrc_get_type(device);
	(void) strcpy(req->device, device);

	/* cache interval for failure recovery */
	if (interval)
		req->interval = *interval;
	else
		bzero(&req->interval, sizeof (timespec_t));

	(void) fsync(state_fd);

	/*
	 * Add initiator pid to polling list
	 */
	add_to_polling_list(req->pid);

out:
	(void) mutex_unlock(&rcm_req_lock);
	free(device);

	return (error);
}

/*
 * Update a dr entry in dr_req_list
 */
/*ARGSUSED*/
int
dr_req_update(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
    rcm_info_t **info)
{
	int error;
	char *device = resolve_name(rsrcname);

	rcm_log_message(RCM_TRACE3, "dr_req_update(%s, %ld, 0x%x, %d, %d)\n",
	    rsrcname, pid, flag, state, seq_num);

	(void) mutex_lock(&rcm_req_lock);
	error = dr_req_update_entry(device, pid, flag, state, seq_num, NULL,
	    info);
	(void) mutex_unlock(&rcm_req_lock);
	free(device);

	return (error);
}

/*
 * This function scans the DR request list for the next, non-removed
 * entry that is part of the specified sequence.  The 'device' name
 * of the entry is copied into the provided 'rsrc' buffer.
 *
 * The 'rsrc' buffer is required because the DR request list is only
 * locked during the duration of this lookup.  Giving a direct pointer
 * to something in the list would be unsafe.
 */
int
dr_req_lookup(int seq_num, char *rsrc)
{
	int	i;
	int	len;
	int	base = (seq_num >> SEQ_NUM_SHIFT);
	int	retval = RCM_FAILURE;

	if (rsrc == NULL) {
		return (RCM_FAILURE);
	}

	(void) mutex_lock(&rcm_req_lock);

	for (i = 0; i < dr_req_list->n_req_max; i++) {

		/* Skip removed or non-matching entries */
		if ((dr_req_list->req[i].state == RCM_STATE_REMOVE) ||
		    ((dr_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != base)) {
			continue;
		}

		/* Copy the next-matching 'device' name into 'rsrc' */
		len = strlcpy(rsrc, dr_req_list->req[i].device, MAXPATHLEN);
		if (len < MAXPATHLEN) {
			retval = RCM_SUCCESS;
		}
		break;
	}

	(void) mutex_unlock(&rcm_req_lock);

	return (retval);
}

/*
 * Remove a dr entry in dr_req_list
 */
void
dr_req_remove(char *rsrcname, uint_t flag)
{
	req_t *req;
	char *device = resolve_name(rsrcname);

	rcm_log_message(RCM_TRACE3, "dr_req_remove(%s)\n", rsrcname);

	(void) mutex_lock(&rcm_req_lock);

	/* find entry */
	req = find_req_entry(device, flag, -1, dr_req_list);
	free(device);

	if (req == NULL) {
		(void) mutex_unlock(&rcm_req_lock);
		rcm_log_message(RCM_WARNING,
		    gettext("dr_req entry %s not found\n"), rsrcname);
		return;
	}

	req->state = RCM_STATE_REMOVE;
	dr_req_list->n_req--;
	(void) fsync(state_fd);

	/*
	 * remove pid from polling list
	 */
	remove_from_polling_list(req->pid);

	/*
	 * We don't shrink the dr_req_list size for now.
	 * Shouldn't cause big memory leaks.
	 */
	(void) mutex_unlock(&rcm_req_lock);
}

/*
 * Return the list of ongoing dr operation requests
 */
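/*
 * (Each returned element carries an nvlist with the resource name,
 * initiator pid (RCM_CLIENT_ID), sequence number, state, and the
 * "DR operation in progress" info string.  The "errno = nvlist_*()"
 * tests below are intentional assignments that capture the nvlist
 * error code in errno for logging.)
 */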
rcm_info_t *
rsrc_dr_info()
{
	int i;
	rcm_info_t *info;
	rcm_info_t *result = NULL;
	char *rsrc;
	int len;

	rcm_log_message(RCM_TRACE2, "rsrc_dr_info()\n");

	(void) mutex_lock(&rcm_req_lock);
	for (i = 0; i < dr_req_list->n_req_max; i++) {
		if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
			continue;

		if (dr_req_list->req[i].device[0] == '\0')
			continue;

		if (dr_req_list->req[i].flag & RCM_FILESYS) {
			len = strlen(dr_req_list->req[i].device) + 5;
			rsrc = s_malloc(len);
			(void) snprintf(rsrc, len, "%s(fs)",
			    dr_req_list->req[i].device);
		} else {
			rsrc = s_strdup(dr_req_list->req[i].device);
		}

		info = s_calloc(1, sizeof (*info));
		if (errno = nvlist_alloc(&(info->info), NV_UNIQUE_NAME, 0)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_alloc=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		if (errno = nvlist_add_string(info->info, RCM_RSRCNAME, rsrc)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}
		(void) free(rsrc);

		if (errno = nvlist_add_int64(info->info, RCM_CLIENT_ID,
		    dr_req_list->req[i].pid)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		if (errno = nvlist_add_int32(info->info, RCM_SEQ_NUM,
		    dr_req_list->req[i].seq_num)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		if (errno = nvlist_add_int32(info->info, RCM_RSRCSTATE,
		    dr_req_list->req[i].state)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		if (errno = nvlist_add_string(info->info, RCM_CLIENT_INFO,
		    (char *)locked_info)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		info->next = result;
		result = info;
	}
	(void) mutex_unlock(&rcm_req_lock);

	return (result);
}

/*
 * Eliminate entries whose dr initiator is no longer running
 * and recover daemon state during daemon restart.
 *
 * This routine is called either during daemon initialization, after
 * all modules have registered resources, or from the cleanup thread.
 * In either case, it is the only thread running in the daemon.
 */
void
clean_dr_list()
{
	int i;
	struct clean_list {
		struct clean_list *next;
		char *rsrcname;
		pid_t pid;
		int seq_num;
		int state;
		timespec_t interval;
	} *tmp, *list = NULL;
	char *rsrcnames[2];

	rcm_log_message(RCM_TRACE3,
	    "clean_dr_list(): look for stale dr initiators\n");

	rsrcnames[1] = NULL;

	/*
	 * Make a list of entries to recover. This is necessary because
	 * the recovery operation will modify dr_req_list.
	 */
	(void) mutex_lock(&rcm_req_lock);
	for (i = 0; i < dr_req_list->n_req_max; i++) {
		/* skip empty entries */
		if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
			continue;

		if (dr_req_list->req[i].device[0] == '\0')
			continue;

		/* skip cascade operations */
		if (dr_req_list->req[i].seq_num & SEQ_NUM_MASK)
			continue;

		/*
		 * In the cleanup case, ignore entries with initiators alive
		 */
		if ((rcmd_get_state() == RCMD_CLEANUP) &&
		    proc_exist(dr_req_list->req[i].pid))
			continue;

		rcm_log_message(RCM_TRACE1,
		    "found stale entry: %s\n", dr_req_list->req[i].device);

		tmp = s_malloc(sizeof (*tmp));
		tmp->rsrcname = s_strdup(dr_req_list->req[i].device);
		tmp->state = dr_req_list->req[i].state;
		tmp->pid = dr_req_list->req[i].pid;
		tmp->seq_num = dr_req_list->req[i].seq_num;
		tmp->interval = dr_req_list->req[i].interval;
		tmp->next = list;
		list = tmp;
	}
	(void) mutex_unlock(&rcm_req_lock);

	if (list == NULL)
		return;

	/*
	 * If everything worked normally, we shouldn't be here.
	 * Since we are here, something went wrong, so say something.
	 */
	if (rcmd_get_state() == RCMD_INIT) {
		rcm_log_message(RCM_NOTICE, gettext("rcm_daemon died "
		    "unexpectedly, recovering previous daemon state\n"));
	} else {
		rcm_log_message(RCM_INFO, gettext("one or more dr initiators "
		    "died, attempting automatic recovery\n"));
	}

	while (list) {
		tmp = list;
		list = tmp->next;

		switch (tmp->state) {
		case RCM_STATE_OFFLINE_QUERY:
		case RCM_STATE_OFFLINE_QUERY_FAIL:
			rsrcnames[0] = tmp->rsrcname;
			if (proc_exist(tmp->pid)) {
				/* redo */
				(void) process_resource_offline(rsrcnames,
				    tmp->pid, RCM_QUERY, tmp->seq_num, NULL);
			} else {
				/* undo */
				(void) notify_resource_online(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			}
			break;

		case RCM_STATE_OFFLINE:
		case RCM_STATE_OFFLINE_FAIL:
			rsrcnames[0] = tmp->rsrcname;
			if (proc_exist(tmp->pid)) {
				/* redo */
				(void) process_resource_offline(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			} else {
				/* undo */
				(void) notify_resource_online(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			}
			break;

		case RCM_STATE_SUSPEND_QUERY:
		case RCM_STATE_SUSPEND_QUERY_FAIL:
			rsrcnames[0] = tmp->rsrcname;
			if (proc_exist(tmp->pid)) {
				/* redo */
				(void) process_resource_suspend(rsrcnames,
				    tmp->pid, RCM_QUERY, tmp->seq_num,
				    &tmp->interval, NULL);
			} else {
				/* undo */
				(void) notify_resource_resume(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			}
			break;

		case RCM_STATE_SUSPEND:
		case RCM_STATE_SUSPEND_FAIL:
			rsrcnames[0] = tmp->rsrcname;
			if (proc_exist(tmp->pid)) {
				/* redo */
				(void) process_resource_suspend(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, &tmp->interval,
				    NULL);
			} else {
				/* undo */
				(void) notify_resource_resume(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			}
			break;

		case RCM_STATE_OFFLINING:
		case RCM_STATE_ONLINING:
			rsrcnames[0] = tmp->rsrcname;
			(void) notify_resource_online(rsrcnames, tmp->pid, 0,
			    tmp->seq_num, NULL);
			break;

		case RCM_STATE_SUSPENDING:
		case RCM_STATE_RESUMING:
			rsrcnames[0] = tmp->rsrcname;
			(void) notify_resource_resume(rsrcnames, tmp->pid, 0,
			    tmp->seq_num, NULL);
			break;

		case RCM_STATE_REMOVING:
			rsrcnames[0] = tmp->rsrcname;
			(void) notify_resource_remove(rsrcnames, tmp->pid, 0,
			    tmp->seq_num, NULL);
			break;

		default:
			rcm_log_message(RCM_WARNING,
			    gettext("%s in unknown state %d\n"),
			    tmp->rsrcname, tmp->state);
			break;
		}
		free(tmp->rsrcname);
		free(tmp);
	}
}

/*
 * Selective thread blocking based on event type
 */
barrier_t barrier;
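
/*
 * A sketch of the barrier protocol (barrier_t itself is defined in the
 * daemon headers): service threads increment thr_count while they run
 * and block on the cv whenever state != RCMD_NORMAL; the cleanup thread
 * waits for thr_count to drain to 0, then parks it at -1 to keep new
 * threads out until the state returns to RCMD_NORMAL.  The "wanted"
 * field counts threads waiting on the cv.
 */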

/*
 * Change barrier state:
 *	RCMD_INIT - daemon is initializing, only register allowed
 *	RCMD_NORMAL - normal daemon processing
 *	RCMD_CLEANUP - cleanup thread is waiting or running
 */
int
rcmd_get_state()
{
	return (barrier.state);
}

void
rcmd_set_state(int state)
{
	/*
	 * The state transition is as follows:
	 *	INIT --> NORMAL <---> CLEANUP
	 * The implementation favors the cleanup thread
	 */

	(void) mutex_lock(&barrier.lock);
	barrier.state = state;

	switch (state) {
	case RCMD_CLEANUP:
		/*
		 * Wait for existing threads to exit
		 */
		barrier.wanted++;
		while (barrier.thr_count != 0)
			(void) cond_wait(&barrier.cv, &barrier.lock);
		barrier.wanted--;
		barrier.thr_count = -1;
		break;

	case RCMD_INIT:
	case RCMD_NORMAL:
	default:
		if (barrier.thr_count == -1)
			barrier.thr_count = 0;
		if (barrier.wanted)
			(void) cond_broadcast(&barrier.cv);
		break;
	}

	(void) mutex_unlock(&barrier.lock);
}

/*
 * Increment daemon thread count
 */
int
rcmd_thr_incr(int cmd)
{
	int seq_num;

	(void) mutex_lock(&barrier.lock);
	/*
	 * Set wanted flag
	 */
	barrier.wanted++;

	/*
	 * Wait till it is safe for daemon to perform the operation
	 *
	 * NOTE: if a module registers by passing a request to the
	 *	client process, we may need to allow register
	 *	to come through during daemon initialization.
	 */
	while (barrier.state != RCMD_NORMAL)
		(void) cond_wait(&barrier.cv, &barrier.lock);

	if ((cmd == CMD_EVENT) ||
	    (cmd == CMD_REGISTER) ||
	    (cmd == CMD_UNREGISTER)) {
		/*
		 * Event passthru and register ops don't need a sequence number
		 */
		seq_num = -1;
	} else {
		/*
		 * Non-register operations get a sequence number
		 */
		seq_num = get_seq_number();
	}
	barrier.wanted--;
	barrier.thr_count++;
	(void) mutex_unlock(&barrier.lock);

	if ((cmd == CMD_OFFLINE) ||
	    (cmd == CMD_SUSPEND) ||
	    (cmd == CMD_GETINFO)) {
		/*
		 * For these operations, we need to ask modules to
		 * register any new resources that came online.
		 *
		 * This is because mount/umount are not instrumented
		 * to register with rcm before using system resources.
		 * Certain registration ops may fail during sync, which
		 * indicates race conditions. This cannot be avoided
		 * without changing mount/umount.
		 */
		rcmd_db_sync();
	}

	return (seq_num);
}

/*
 * Decrement thread count
 */
void
rcmd_thr_decr()
{
	/*
	 * Decrement thread count and wake up reload/cleanup thread.
	 */
	(void) mutex_lock(&barrier.lock);
	barrier.last_update = time(NULL);
	if (--barrier.thr_count == 0)
		(void) cond_broadcast(&barrier.cv);
	(void) mutex_unlock(&barrier.lock);
}

/*
 * Wakeup all waiting threads as a result of SIGHUP
 */
static int sighup_received = 0;

void
rcmd_thr_signal()
{
	(void) mutex_lock(&barrier.lock);
	sighup_received = 1;
	(void) cond_broadcast(&barrier.cv);
	(void) mutex_unlock(&barrier.lock);
}

void
rcmd_start_timer(int timeout)
{
	timestruc_t abstime;

	if (timeout == 0)
		timeout = RCM_DAEMON_TIMEOUT;	/* default to 5 minutes */
	else
		dr_req_list->idle_timeout = timeout;	/* persist timeout */

	if (timeout > 0) {
		abstime.tv_sec = time(NULL) + timeout;
	}

	(void) mutex_lock(&barrier.lock);
	for (;;) {
		int idletime;
		int is_active;

		if (timeout > 0)
			(void) cond_timedwait(&barrier.cv, &barrier.lock,
			    &abstime);
		else
			(void) cond_wait(&barrier.cv, &barrier.lock);

		/*
		 * If sighup received, change timeout to 0 so the daemon is
		 * shut down at the first possible moment
		 */
		if (sighup_received)
			timeout = 0;

		/*
		 * If timeout is negative, never shut down the daemon
		 */
		if (timeout < 0)
			continue;

		/*
		 * Check for ongoing/pending activity
		 */
		is_active = (barrier.thr_count || barrier.wanted ||
		    (dr_req_list->n_req != 0));
		if (is_active) {
			abstime.tv_sec = time(NULL) + timeout;
			continue;
		}

		/*
		 * If idletime is less than timeout, continue to wait
		 */
		idletime = time(NULL) - barrier.last_update;
		if (idletime < timeout) {
			abstime.tv_sec = barrier.last_update + timeout;
			continue;
		}
		break;
	}

	(void) script_main_fini();

	rcm_log_message(RCM_INFO, gettext("rcm_daemon is shut down.\n"));
}

/*
 * Code related to polling client pids.
 * Not declared as static so that we can find this structure easily
 * in the core file.
 */
struct {
	int		n_pids;
	int		n_max_pids;
	thread_t	poll_tid;	/* poll thread id */
	int		signaled;
	pid_t		*pids;
	int		*refcnt;
	struct pollfd	*fds;
	cond_t		cv;	/* the associated lock is rcm_req_lock */
} polllist;
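
/*
 * polllist invariants (a summary of how the code below uses it):
 * pids[], refcnt[], and fds[] are parallel arrays indexed together;
 * entry i tracks one DR initiator, with refcnt[i] counting the
 * outstanding requests from that pid.  All fields are protected by
 * rcm_req_lock, and the arrays may only be reshaped while the poll
 * thread is stopped.
 */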

static int
find_pid_index(pid_t pid)
{
	int i;

	for (i = 0; i < polllist.n_pids; i++) {
		if (polllist.pids[i] == pid) {
			return (i);
		}
	}
	return (-1);
}

/*
 * Resize buffer for new pids
 */
static int
get_pid_index()
{
	const int n_chunk = 10;

	int n_max;
	int index = polllist.n_pids;

	if (polllist.n_pids < polllist.n_max_pids) {
		polllist.n_pids++;
		return (index);
	}

	if (polllist.n_max_pids == 0) {
		n_max = n_chunk;
		polllist.pids = s_calloc(n_max, sizeof (pid_t));
		polllist.refcnt = s_calloc(n_max, sizeof (int));
		polllist.fds = s_calloc(n_max, sizeof (struct pollfd));
	} else {
		n_max = polllist.n_max_pids + n_chunk;
		polllist.pids = s_realloc(polllist.pids,
		    n_max * sizeof (pid_t));
		polllist.refcnt = s_realloc(polllist.refcnt,
		    n_max * sizeof (int));
		polllist.fds = s_realloc(polllist.fds,
		    n_max * sizeof (struct pollfd));
	}
	polllist.n_max_pids = n_max;
	polllist.n_pids++;
	return (index);
}

/*
 * rcm_req_lock must be held
 */
static void
add_to_polling_list(pid_t pid)
{
	int fd, index;
	char procfile[MAXPATHLEN];

	if (pid == (pid_t)0)
		return;

	rcm_log_message(RCM_TRACE1, "add_to_polling_list(%ld)\n", pid);

	/*
	 * Need to stop the poll thread before manipulating the polllist
	 * since the poll thread may possibly be using polllist.fds[] and
	 * polllist.n_pids. As an optimization, first check if the pid
	 * is already in the polllist. If it is, there is no need to
	 * stop the poll thread. Just increment the pid reference count
	 * and return.
	 */
	index = find_pid_index(pid);
	if (index != -1) {
		polllist.refcnt[index]++;
		return;
	}

	stop_polling_thread();

	/*
	 * In an attempt to stop the poll thread we may have released
	 * and reacquired rcm_req_lock. So find the index again.
	 */
	index = find_pid_index(pid);
	if (index != -1) {
		polllist.refcnt[index]++;
		goto done;
	}

	/*
	 * Open a /proc file
	 */
	(void) sprintf(procfile, "/proc/%ld/as", pid);
	if ((fd = open(procfile, O_RDONLY)) == -1) {
		rcm_log_message(RCM_NOTICE, gettext("open(%s): %s\n"),
		    procfile, strerror(errno));
		goto done;
	}

	/*
	 * add pid to polllist
	 */
	index = get_pid_index();
	polllist.pids[index] = pid;
	polllist.refcnt[index] = 1;
	polllist.fds[index].fd = fd;
	polllist.fds[index].events = 0;
	polllist.fds[index].revents = 0;

	rcm_log_message(RCM_DEBUG, "add pid %ld at index %d\n", pid, index);

done:
	start_polling_thread();
}

/*
 * rcm_req_lock must be held
 */
static void
remove_from_polling_list(pid_t pid)
{
	int i, index;

	if (pid == (pid_t)0)
		return;

	rcm_log_message(RCM_TRACE1, "remove_from_polling_list(%ld)\n", pid);

	/*
	 * Need to stop the poll thread before manipulating the polllist
	 * since the poll thread may possibly be using polllist.fds[] and
	 * polllist.n_pids. As an optimization, first check the pid
	 * reference count. If the pid reference count is greater than 1,
	 * there is no need to stop the polling thread.
	 */

	index = find_pid_index(pid);
	if (index == -1) {
		rcm_log_message(RCM_NOTICE,
		    gettext("error removing pid %ld from polling list\n"), pid);
		return;
	}

	/*
	 * decrement the pid refcnt
	 */
	if (polllist.refcnt[index] > 1) {
		polllist.refcnt[index]--;
		return;
	}

	stop_polling_thread();

	/*
	 * In an attempt to stop the poll thread we may have released
	 * and reacquired rcm_req_lock. So find the index again.
	 */
	index = find_pid_index(pid);
	if (index == -1) {
		rcm_log_message(RCM_NOTICE,
		    gettext("error removing pid %ld from polling list\n"), pid);
		goto done;
	}

	if (--polllist.refcnt[index] > 0)
		goto done;

	/*
	 * refcnt down to zero, delete pid from polling list
	 */
	(void) close(polllist.fds[index].fd);
	polllist.n_pids--;

	for (i = index; i < polllist.n_pids; i++) {
		polllist.pids[i] = polllist.pids[i + 1];
		polllist.refcnt[i] = polllist.refcnt[i + 1];
		bcopy(&polllist.fds[i + 1], &polllist.fds[i],
		    sizeof (struct pollfd));
	}

	rcm_log_message(RCM_DEBUG, "remove pid %ld at index %d\n", pid, index);

done:
	start_polling_thread();
}

void
init_poll_thread()
{
	polllist.poll_tid = (thread_t)-1;
}

void
cleanup_poll_thread()
{
	(void) mutex_lock(&rcm_req_lock);
	if (polllist.poll_tid == thr_self()) {
		rcm_log_message(RCM_TRACE2,
		    "cleanup_poll_thread: n_pids = %d\n", polllist.n_pids);
		polllist.poll_tid = (thread_t)-1;
		(void) cond_broadcast(&polllist.cv);
	}
	(void) mutex_unlock(&rcm_req_lock);
}
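
/*
 * How initiator death is detected (a sketch of the /proc idiom used
 * here): each polllist entry holds an fd for /proc/<pid>/as with no
 * requested events, so poll() blocks until the kernel posts POLLHUP
 * on an fd whose process has exited.  The poll thread then exits
 * through cleanup_poll_thread() and flags the main loop (via
 * need_cleanup) to run clean_dr_list() and recover the dead
 * initiator's requests.
 */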

/*ARGSUSED*/
static void *
pollfunc(void *arg)
{
	sigset_t mask;

	rcm_log_message(RCM_TRACE2, "poll thread started. n_pids = %d\n",
	    polllist.n_pids);

	/*
	 * Unblock SIGUSR1 to allow polling thread to be killed
	 */
	(void) sigemptyset(&mask);
	(void) sigaddset(&mask, SIGUSR1);
	(void) thr_sigsetmask(SIG_UNBLOCK, &mask, NULL);

	(void) poll(polllist.fds, polllist.n_pids, (time_t)-1);

	/*
	 * Block SIGUSR1 to avoid being killed while holding a lock
	 */
	(void) sigemptyset(&mask);
	(void) sigaddset(&mask, SIGUSR1);
	(void) thr_sigsetmask(SIG_BLOCK, &mask, NULL);

	rcm_log_message(RCM_TRACE2, "returned from poll()\n");

	cleanup_poll_thread();

	(void) mutex_lock(&barrier.lock);
	need_cleanup = 1;
	(void) cond_broadcast(&barrier.cv);
	(void) mutex_unlock(&barrier.lock);

	return (NULL);
}

/*
 * rcm_req_lock must be held
 */
void
start_polling_thread()
{
	int err;

	if (rcmd_get_state() != RCMD_NORMAL)
		return;

	if (polllist.poll_tid != (thread_t)-1 || polllist.n_pids == 0)
		return;

	if ((err = thr_create(NULL, 0, pollfunc, NULL, THR_DETACHED,
	    &polllist.poll_tid)) == 0)
		polllist.signaled = 0;
	else
		rcm_log_message(RCM_ERROR,
		    gettext("failed to create polling thread: %s\n"),
		    strerror(err));
}

/*
 * rcm_req_lock must be held
 */
static void
stop_polling_thread()
{
	int err;

	while (polllist.poll_tid != (thread_t)-1) {
		if (polllist.signaled == 0) {
			if ((err = thr_kill(polllist.poll_tid, SIGUSR1)) == 0)
				polllist.signaled = 1;
			else
				/*
				 * thr_kill shouldn't have failed since the
				 * poll thread id and the signal are valid.
				 * So log an error. Since no signal is sent
				 * when thr_kill fails (as per the man page),
				 * the cond_wait below will wait until the
				 * poll thread exits by some other means.
				 * The poll thread, for example, exits on its
				 * own when any DR initiator process that it
				 * is currently polling exits.
				 */
				rcm_log_message(RCM_ERROR,
				    gettext(
				    "failed to kill polling thread %d: %s\n"),
				    polllist.poll_tid, strerror(err));
		}
		(void) cond_wait(&polllist.cv, &rcm_req_lock);
	}
}