fmd_sysevent.c revision 9967:e0258b956de2
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/sysevent/eventdefs.h>
28#include <sys/sysevent.h>
29#include <sys/sysevent_impl.h>
30#include <sys/fm/protocol.h>
31#include <sys/sysmacros.h>
32#include <sys/dumphdr.h>
33#include <sys/dumpadm.h>
34#include <sys/fm/util.h>
35
36#include <libsysevent.h>
37#include <libnvpair.h>
38#include <alloca.h>
39#include <limits.h>
40#include <strings.h>
41#include <unistd.h>
42#include <fcntl.h>
43#include <errno.h>
44
45#undef MUTEX_HELD
46#undef RW_READ_HELD
47#undef RW_WRITE_HELD
48
49#include <fmd_api.h>
50#include <fmd_log.h>
51#include <fmd_subr.h>
52#include <fmd_dispq.h>
53#include <fmd_dr.h>
54#include <fmd_module.h>
55#include <fmd_protocol.h>
56#include <fmd_scheme.h>
57#include <fmd_error.h>
58
59#include <fmd.h>
60
61static char *sysev_channel;	/* event channel to which we are subscribed */
62static char *sysev_class;	/* event class to which we are subscribed */
63static char *sysev_device;	/* device path to use for replaying events */
64static char *sysev_sid;		/* event channel subscriber identifier */
65static void *sysev_evc;		/* event channel cookie from evc_bind */
66
67static fmd_xprt_t *sysev_xprt;
68static int sysev_xprt_refcnt;
69static fmd_hdl_t *sysev_hdl;
70
71static struct sysev_stats {
72	fmd_stat_t dump_replay;
73	fmd_stat_t dump_lost;
74	fmd_stat_t bad_class;
75	fmd_stat_t bad_attr;
76	fmd_stat_t eagain;
77} sysev_stats = {
78	{ "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" },
79	{ "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" },
80	{ "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" },
81	{ "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" },
82	{ "eagain", FMD_TYPE_UINT64, "events retried due to low memory" },
83};
84
/*
 * sysev_mutex protects sysev_replay_wait, sysev_exiting and
 * sysev_xprt_refcnt; sysev_cv is broadcast whenever a waiter on one of
 * them may be able to proceed.
 */
static pthread_mutex_t sysev_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sysev_cv = PTHREAD_COND_INITIALIZER;
static int sysev_replay_wait = 1;	/* cleared once sysev_replay() is done */
static int sysev_exiting;		/* set to 1 by sysev_fini() */
89
90/*
91 * Entry point for legacy sysevents.  This function is responsible for two
92 * things: passing off interesting events to the DR handler, and converting
93 * sysevents into resource events that modules can then subscribe to.
94 */
95static void
96sysev_legacy(sysevent_t *sep)
97{
98	const char *class = sysevent_get_class_name(sep);
99	const char *subclass = sysevent_get_subclass_name(sep);
100	char *fullclass;
101	size_t len;
102	nvlist_t *attr, *nvl;
103	fmd_event_t *e;
104	hrtime_t hrt;
105
106	/* notify the DR subsystem of the event */
107	fmd_dr_event(sep);
108
109	/* get the matching sysevent name */
110	len = snprintf(NULL, 0, "%s%s.%s", SYSEVENT_RSRC_CLASS,
111	    class, subclass);
112	fullclass = alloca(len + 1);
113	(void) snprintf(fullclass, len + 1, "%s%s.%s",
114	    SYSEVENT_RSRC_CLASS, class, subclass);
115
116	/* construct the event payload */
117	(void) nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &fmd.d_nva);
118	if (sysevent_get_attr_list(sep, &attr) == 0) {
119		(void) nvlist_merge(nvl, attr, 0);
120		nvlist_free(attr);
121	}
122
123	/*
124	 * Add class and version after the nvlist_merge() just in case
125	 * the sysevent has an attribute called class or version.
126	 */
127	(void) nvlist_add_string(nvl, FM_CLASS, fullclass);
128	(void) nvlist_add_uint8(nvl, FM_VERSION, FM_RSRC_VERSION);
129
130	/*
131	 * Dispatch the event.  Ideally, we'd like to use the same transport
132	 * interface as sysev_recv(), but because the legacy sysevent mechanism
133	 * puts in a thread outside fmd's control, using the module APIs is
134	 * impossible.
135	 */
136	sysevent_get_time(sep, &hrt);
137	(void) nvlist_lookup_string(nvl, FM_CLASS, &fullclass);
138	e = fmd_event_create(FMD_EVT_PROTOCOL, hrt, nvl, fullclass);
139	fmd_dispq_dispatch(fmd.d_disp, e, fullclass);
140}
141
142/*
143 * Receive an event from the SysEvent channel and post it to our transport.
144 * Under extreme low-memory situations where we cannot event unpack the event,
145 * we can request that SysEvent redeliver the event later by returning EAGAIN.
146 * If we do this too many times, the kernel will drop the event.  Rather than
147 * keeping state per-event, we simply attempt a garbage-collect, hoping that
148 * enough free memory will be available by the time the event is redelivered.
149 */
150static int
151sysev_recv(sysevent_t *sep, void *arg)
152{
153	uint64_t seq = sysevent_get_seq(sep);
154	fmd_xprt_t *xp = arg;
155	nvlist_t *nvl;
156	hrtime_t hrt;
157	int rc = 0;
158
159	(void) pthread_mutex_lock(&sysev_mutex);
160	if (sysev_exiting == 1) {
161		while (sysev_xprt_refcnt > 0)
162			(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
163		(void) pthread_mutex_unlock(&sysev_mutex);
164		return (EAGAIN);
165	}
166	sysev_xprt_refcnt++;
167	while (sysev_replay_wait)
168		(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
169	(void) pthread_mutex_unlock(&sysev_mutex);
170
171	if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) {
172		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected"
173		    " transport class %s\n", seq, sysevent_get_class_name(sep));
174		sysev_stats.bad_class.fmds_value.ui64++;
175	} else if (sysevent_get_attr_list(sep, &nvl) != 0) {
176		if (errno == EAGAIN || errno == ENOMEM) {
177			fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc);
178			fmd_scheme_hash_trygc(fmd.d_schemes);
179			sysev_stats.eagain.fmds_value.ui64++;
180			rc = EAGAIN;
181		} else {
182			fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: "
183			    "missing or invalid payload", seq);
184			sysev_stats.bad_attr.fmds_value.ui64++;
185		}
186	} else {
187		sysevent_get_time(sep, &hrt);
188		fmd_xprt_post(sysev_hdl, xp, nvl, hrt);
189	}
190
191	(void) pthread_mutex_lock(&sysev_mutex);
192	if (--sysev_xprt_refcnt == 0 && sysev_exiting == 1)
193		(void) pthread_cond_broadcast(&sysev_cv);
194	(void) pthread_mutex_unlock(&sysev_mutex);
195
196	return (rc);
197}
198
/*
 * Compute the 32-bit checksum that the dump transport stores with each saved
 * error report (the same algorithm as the kernel's checksum32()).  For every
 * input byte the running sum is rotated right by one bit and the byte is then
 * added to it.  An empty buffer yields zero.
 */
static uint32_t
sysev_checksum(void *cp_arg, size_t length)
{
	const unsigned char *bytes = cp_arg;
	uint32_t acc = 0;
	size_t i;

	for (i = 0; i < length; i++) {
		acc = (acc >> 1) | (acc << 31);	/* rotate right one bit */
		acc += bytes[i];
	}

	return (acc);
}
214
215/*
216 * Replay saved events from the dump transport.  This function is installed as
217 * the timer callback and is called only once during the module's lifetime.
218 */
219/*ARGSUSED*/
220static void
221sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg)
222{
223	char *dumpdev;
224	off64_t off, off0;
225	int fd, err;
226
227	/*
228	 * Determine the appropriate dump device to use for replaying pending
229	 * error reports.  If the device property is NULL (default), we
230	 * open and query /dev/dump to determine the current dump device.
231	 */
232	if ((dumpdev = sysev_device) == NULL) {
233		if ((fd = open("/dev/dump", O_RDONLY)) == -1) {
234			fmd_hdl_error(hdl, "failed to open /dev/dump "
235			    "to locate dump device for event replay");
236			goto done;
237		}
238
239		dumpdev = alloca(PATH_MAX);
240		err = ioctl(fd, DIOCGETDEV, dumpdev);
241		(void) close(fd);
242
243		if (err == -1) {
244			if (errno != ENODEV) {
245				fmd_hdl_error(hdl, "failed to obtain "
246				    "path to dump device for event replay");
247			}
248			goto done;
249		}
250	}
251
252	if (strcmp(dumpdev, "/dev/null") == 0)
253		goto done; /* return silently and skip replay for /dev/null */
254
255	/*
256	 * Open the appropriate device and then determine the offset of the
257	 * start of the ereport dump region located at the end of the device.
258	 */
259	if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) {
260		fmd_hdl_error(hdl, "failed to open dump transport %s "
261		    "(pending events will not be replayed)", dumpdev);
262		goto done;
263	}
264
265	off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE;
266	off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET;
267
268	if (off == (off64_t)-1LL) {
269		fmd_hdl_error(hdl, "failed to seek dump transport %s "
270		    "(pending events will not be replayed)", dumpdev);
271		(void) close(fd);
272		goto done;
273	}
274
275	/*
276	 * The ereport dump region is a sequence of erpt_dump_t headers each of
277	 * which is followed by packed nvlist data.  We iterate over them in
278	 * order, unpacking and dispatching each one to our dispatch queue.
279	 */
280	for (;;) {
281		char nvbuf[ERPT_DATA_SZ];
282		uint32_t chksum;
283		erpt_dump_t ed;
284		nvlist_t *nvl;
285
286		fmd_timeval_t ftv, tod;
287		hrtime_t hrt;
288		uint64_t ena;
289
290		if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
291			fmd_hdl_error(hdl, "failed to read from dump "
292			    "transport %s (pending events lost)", dumpdev);
293			break;
294		}
295
296		if (ed.ed_magic == 0 && ed.ed_size == 0)
297			break; /* end of list: all zero */
298
299		if (ed.ed_magic == 0) {
300			off += sizeof (ed) + ed.ed_size;
301			continue; /* continue searching */
302		}
303
304		if (ed.ed_magic != ERPT_MAGIC) {
305			/*
306			 * Stop reading silently if the first record has the
307			 * wrong magic number; this likely indicates that we
308			 * rebooted from non-FMA bits or paged over the dump.
309			 */
310			if (off == off0)
311				break;
312
313			fmd_hdl_error(hdl, "invalid dump transport "
314			    "record at %llx (magic number %x, expected %x)\n",
315			    (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC);
316			break;
317		}
318
319		if (ed.ed_size > ERPT_DATA_SZ) {
320			fmd_hdl_error(hdl, "invalid dump transport "
321			    "record at %llx size (%u exceeds limit)\n",
322			    (u_longlong_t)off, ed.ed_size);
323			break;
324		}
325
326		if (pread64(fd, nvbuf, ed.ed_size,
327		    off + sizeof (ed)) != ed.ed_size) {
328			fmd_hdl_error(hdl, "failed to read dump "
329			    "transport event (offset %llx)", (u_longlong_t)off);
330
331			sysev_stats.dump_lost.fmds_value.ui64++;
332			goto next;
333		}
334
335		if ((chksum = sysev_checksum(nvbuf,
336		    ed.ed_size)) != ed.ed_chksum) {
337			fmd_hdl_error(hdl, "dump transport event at "
338			    "offset %llx is corrupt (checksum %x != %x)\n",
339			    (u_longlong_t)off, chksum, ed.ed_chksum);
340
341			sysev_stats.dump_lost.fmds_value.ui64++;
342			goto next;
343		}
344
345		if ((err = nvlist_xunpack(nvbuf,
346		    ed.ed_size, &nvl, &fmd.d_nva)) != 0) {
347			fmd_hdl_error(hdl, "failed to unpack dump "
348			    "transport event at offset %llx: %s\n",
349			    (u_longlong_t)off, fmd_strerror(err));
350
351			sysev_stats.dump_lost.fmds_value.ui64++;
352			goto next;
353		}
354
355		/*
356		 * If ed_hrt_nsec is set it contains the gethrtime() value from
357		 * when the event was originally enqueued for the transport.
358		 * If it is zero, we use the weaker bound ed_hrt_base instead.
359		 */
360		if (ed.ed_hrt_nsec != 0)
361			hrt = ed.ed_hrt_nsec;
362		else
363			hrt = ed.ed_hrt_base;
364
365		/*
366		 * If this is an FMA protocol event of class "ereport.*" that
367		 * contains valid ENA, we can improve the precision of 'hrt'.
368		 */
369		if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0)
370			hrt = fmd_time_ena2hrt(hrt, ena);
371
372		/*
373		 * Now convert 'hrt' to an adjustable TOD based on the values
374		 * in ed_tod_base which correspond to one another and are
375		 * sampled before reboot using the old gethrtime() clock.
376		 * fmd_event_recreate() will use this TOD value to re-assign
377		 * the event an updated gethrtime() value based on the current
378		 * value of the non-adjustable gethrtime() clock.  Phew.
379		 */
380		tod.ftv_sec = ed.ed_tod_base.sec;
381		tod.ftv_nsec = ed.ed_tod_base.nsec;
382		fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv);
383
384		(void) nvlist_remove_all(nvl, FMD_EVN_TOD);
385		(void) nvlist_add_uint64_array(nvl,
386		    FMD_EVN_TOD, (uint64_t *)&ftv, 2);
387
388		fmd_xprt_post(hdl, sysev_xprt, nvl, 0);
389		sysev_stats.dump_replay.fmds_value.ui64++;
390
391next:
392		/*
393		 * Reset the magic number for the event record to zero so that
394		 * we do not replay the same event multiple times.
395		 */
396		ed.ed_magic = 0;
397
398		if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
399			fmd_hdl_error(hdl, "failed to mark dump "
400			    "transport event (offset %llx)", (u_longlong_t)off);
401		}
402
403		off += sizeof (ed) + ed.ed_size;
404	}
405
406	(void) close(fd);
407done:
408	(void) pthread_mutex_lock(&sysev_mutex);
409	sysev_replay_wait = 0;
410	(void) pthread_cond_broadcast(&sysev_cv);
411	(void) pthread_mutex_unlock(&sysev_mutex);
412}
413
414static const fmd_prop_t sysev_props[] = {
415	{ "class", FMD_TYPE_STRING, EC_ALL },		/* event class */
416	{ "device", FMD_TYPE_STRING, NULL },		/* replay device */
417	{ "channel", FMD_TYPE_STRING, FM_ERROR_CHAN },	/* channel name */
418	{ "sid", FMD_TYPE_STRING, "fmd" },		/* subscriber id */
419	{ NULL, 0, NULL }
420};
421
422static const fmd_hdl_ops_t sysev_ops = {
423	NULL,		/* fmdo_recv */
424	sysev_replay,	/* fmdo_timeout */
425	NULL,		/* fmdo_close */
426	NULL,		/* fmdo_stats */
427	NULL,		/* fmdo_gc */
428	NULL,		/* fmdo_send */
429};
430
431static const fmd_hdl_info_t sysev_info = {
432	"SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props
433};
434
435/*
436 * Bind to the sysevent channel we use for listening for error events and then
437 * subscribe to appropriate events received over this channel.  Setup the
438 * legacy sysevent handler for creating sysevent resources and forwarding DR
439 * events.
440 */
441void
442sysev_init(fmd_hdl_t *hdl)
443{
444	uint_t flags;
445	const char *subclasses[] = { EC_SUB_ALL };
446
447	if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0)
448		return; /* invalid property settings */
449
450	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) /
451	    sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats);
452
453	sysev_channel = fmd_prop_get_string(hdl, "channel");
454	sysev_class = fmd_prop_get_string(hdl, "class");
455	sysev_device = fmd_prop_get_string(hdl, "device");
456	sysev_sid = fmd_prop_get_string(hdl, "sid");
457
458	if (sysev_channel == NULL)
459		fmd_hdl_abort(hdl, "channel property must be defined\n");
460
461	if (sysev_sid == NULL)
462		fmd_hdl_abort(hdl, "sid property must be defined\n");
463
464	if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc,
465	    EVCH_CREAT | EVCH_HOLD_PEND)) != 0) {
466		fmd_hdl_abort(hdl, "failed to bind to event transport "
467		    "channel %s", sysev_channel);
468	}
469
470	sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY |
471	    FMD_XPRT_CACHE_AS_LOCAL, NULL, NULL);
472	sysev_hdl = hdl;
473
474	/*
475	 * If we're subscribing to the default channel, keep our subscription
476	 * active even if we die unexpectedly so we continue queuing events.
477	 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so
478	 * that our event channel will be destroyed if we die unpleasantly.
479	 */
480	if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0)
481		flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP;
482	else
483		flags = EVCH_SUB_DUMP;
484
485	errno = sysevent_evc_subscribe(sysev_evc,
486	    sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags);
487
488	if (errno != 0) {
489		if (errno == EEXIST) {
490			fmd_hdl_abort(hdl, "another fault management daemon is "
491			    "active on transport channel %s\n", sysev_channel);
492		} else {
493			fmd_hdl_abort(hdl, "failed to subscribe to %s on "
494			    "transport channel %s", sysev_class, sysev_channel);
495		}
496	}
497
498	/*
499	 * Once the transport is open, install a single timer to fire at once
500	 * in the context of the module's thread to run sysev_replay().  This
501	 * thread will block in its first fmd_xprt_post() until fmd is ready.
502	 */
503	fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel);
504	(void) fmd_timer_install(hdl, NULL, NULL, 0);
505
506	/*
507	 * Open the legacy sysevent handle and subscribe to all events.  These
508	 * are automatically converted to "resource.sysevent.*" events so that
509	 * modules can manage these events without additional infrastructure.
510	 */
511	if (geteuid() != 0)
512		return;
513
514	if ((fmd.d_sysev_hdl =
515	    sysevent_bind_handle(sysev_legacy)) == NULL)
516		fmd_hdl_abort(hdl, "failed to bind to legacy sysevent channel");
517
518	if (sysevent_subscribe_event(fmd.d_sysev_hdl, EC_ALL,
519	    subclasses, 1) != 0)
520		fmd_hdl_abort(hdl, "failed to subscribe to legacy sysevents");
521}
522
523/*
524 * Close the channel by unsubscribing and unbinding.  We only do this when a
525 * a non-default channel has been selected.  If we're using FM_ERROR_CHAN,
526 * the system default, we do *not* want to unsubscribe because the kernel will
527 * remove the subscriber queue and any events published in our absence will
528 * therefore be lost.  This scenario may occur when, for example, fmd is sent
529 * a SIGTERM by init(1M) during reboot but an error is detected and makes it
530 * into the sysevent channel queue before init(1M) manages to call uadmin(2).
531 */
532void
533sysev_fini(fmd_hdl_t *hdl)
534{
535	if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) {
536		sysevent_evc_unsubscribe(sysev_evc, sysev_sid);
537		sysevent_evc_unbind(sysev_evc);
538	}
539
540	if (fmd.d_sysev_hdl != NULL)
541		sysevent_unbind_handle(fmd.d_sysev_hdl);
542
543	if (sysev_xprt != NULL) {
544		/*
545		 * Wait callback returns before destroy the transport.
546		 */
547		(void) pthread_mutex_lock(&sysev_mutex);
548		sysev_exiting = 1;
549		while (sysev_xprt_refcnt > 0)
550			(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
551		(void) pthread_mutex_unlock(&sysev_mutex);
552		fmd_xprt_close(hdl, sysev_xprt);
553	}
554
555	fmd_prop_free_string(hdl, sysev_class);
556	fmd_prop_free_string(hdl, sysev_channel);
557	fmd_prop_free_string(hdl, sysev_device);
558	fmd_prop_free_string(hdl, sysev_sid);
559}
560