fmd_sysevent.c revision 2914:266e6e5b5218
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/sysevent/eventdefs.h>
30#include <sys/sysevent.h>
31#include <sys/sysevent_impl.h>
32#include <sys/fm/protocol.h>
33#include <sys/sysmacros.h>
34#include <sys/dumphdr.h>
35#include <sys/dumpadm.h>
36#include <sys/fm/util.h>
37
38#include <libsysevent.h>
39#include <libnvpair.h>
40#include <alloca.h>
41#include <limits.h>
42#include <strings.h>
43#include <unistd.h>
44#include <fcntl.h>
45#include <errno.h>
46
47#undef MUTEX_HELD
48#undef RW_READ_HELD
49#undef RW_WRITE_HELD
50
51#include <fmd_api.h>
52#include <fmd_log.h>
53#include <fmd_subr.h>
54#include <fmd_dispq.h>
55#include <fmd_module.h>
56#include <fmd_scheme.h>
57#include <fmd_error.h>
58
59#include <fmd.h>
60
61static char *sysev_channel;	/* event channel to which we are subscribed */
62static char *sysev_class;	/* event class to which we are subscribed */
63static char *sysev_device;	/* device path to use for replaying events */
64static char *sysev_sid;		/* event channel subscriber identifier */
65static void *sysev_evc;		/* event channel cookie from evc_bind */
66
67static fmd_xprt_t *sysev_xprt;
68static fmd_hdl_t *sysev_hdl;
69
70static struct sysev_stats {
71	fmd_stat_t dump_replay;
72	fmd_stat_t dump_lost;
73	fmd_stat_t bad_class;
74	fmd_stat_t bad_attr;
75	fmd_stat_t eagain;
76} sysev_stats = {
77	{ "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" },
78	{ "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" },
79	{ "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" },
80	{ "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" },
81	{ "eagain", FMD_TYPE_UINT64, "events retried due to low memory" },
82};
83
/*
 * Replay gate.  sysev_replay_wait starts out set; sysev_recv() sleeps on
 * sysev_replay_cv while it remains set, and sysev_replay() clears it and
 * broadcasts once the replay attempt has finished.  The flag is only
 * examined or changed with sysev_replay_mutex held.
 */
static pthread_mutex_t sysev_replay_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sysev_replay_cv = PTHREAD_COND_INITIALIZER;
static int sysev_replay_wait = 1;
87
88/*
89 * Receive an event from the SysEvent channel and post it to our transport.
90 * Under extreme low-memory situations where we cannot event unpack the event,
91 * we can request that SysEvent redeliver the event later by returning EAGAIN.
92 * If we do this too many times, the kernel will drop the event.  Rather than
93 * keeping state per-event, we simply attempt a garbage-collect, hoping that
94 * enough free memory will be available by the time the event is redelivered.
95 */
96static int
97sysev_recv(sysevent_t *sep, void *arg)
98{
99	uint64_t seq = sysevent_get_seq(sep);
100	fmd_xprt_t *xp = arg;
101	nvlist_t *nvl;
102	hrtime_t hrt;
103
104	(void) pthread_mutex_lock(&sysev_replay_mutex);
105	while (sysev_replay_wait)
106		(void) pthread_cond_wait(&sysev_replay_cv, &sysev_replay_mutex);
107	(void) pthread_mutex_unlock(&sysev_replay_mutex);
108
109	if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) {
110		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected"
111		    " transport class %s\n", seq, sysevent_get_class_name(sep));
112		sysev_stats.bad_class.fmds_value.ui64++;
113		return (0);
114	}
115
116	if (sysevent_get_attr_list(sep, &nvl) != 0) {
117		if (errno == EAGAIN || errno == ENOMEM) {
118			fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc);
119			fmd_scheme_hash_trygc(fmd.d_schemes);
120			sysev_stats.eagain.fmds_value.ui64++;
121			return (EAGAIN);
122		}
123
124		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: missing "
125		    "or invalid payload", seq);
126		sysev_stats.bad_attr.fmds_value.ui64++;
127		return (0);
128	}
129
130	sysevent_get_time(sep, &hrt);
131	fmd_xprt_post(sysev_hdl, xp, nvl, hrt);
132	return (0);
133}
134
/*
 * Checksum algorithm used by the dump transport for verifying the content of
 * error reports saved on the dump device (copy of the kernel's checksum32()).
 *
 * For each of the 'length' bytes starting at 'cp_arg', the running 32-bit sum
 * is rotated right by one bit and the byte value is added to it.  The buffer
 * is only read, so it is taken through a pointer to const.  A zero-length
 * buffer yields a checksum of zero.
 */
static uint32_t
sysev_checksum(const void *cp_arg, size_t length)
{
	const unsigned char *cp = cp_arg;
	const unsigned char *ep = cp + length;
	uint32_t sum = 0;

	for (; cp < ep; cp++)
		sum = ((sum >> 1) | (sum << 31)) + *cp;

	return (sum);
}
150
151/*
152 * Replay saved events from the dump transport.  This function is installed as
153 * the timer callback and is called only once during the module's lifetime.
154 */
155/*ARGSUSED*/
156static void
157sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg)
158{
159	char *dumpdev;
160	off64_t off, off0;
161	int fd, err;
162
163	/*
164	 * Determine the appropriate dump device to use for replaying pending
165	 * error reports.  If the device property is NULL (default), we
166	 * open and query /dev/dump to determine the current dump device.
167	 */
168	if ((dumpdev = sysev_device) == NULL) {
169		if ((fd = open("/dev/dump", O_RDONLY)) == -1) {
170			fmd_hdl_error(hdl, "failed to open /dev/dump "
171			    "to locate dump device for event replay");
172			goto done;
173		}
174
175		dumpdev = alloca(PATH_MAX);
176		err = ioctl(fd, DIOCGETDEV, dumpdev);
177		(void) close(fd);
178
179		if (err == -1) {
180			if (errno != ENODEV) {
181				fmd_hdl_error(hdl, "failed to obtain "
182				    "path to dump device for event replay");
183			}
184			goto done;
185		}
186	}
187
188	if (strcmp(dumpdev, "/dev/null") == 0)
189		goto done; /* return silently and skip replay for /dev/null */
190
191	/*
192	 * Open the appropriate device and then determine the offset of the
193	 * start of the ereport dump region located at the end of the device.
194	 */
195	if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) {
196		fmd_hdl_error(hdl, "failed to open dump transport %s "
197		    "(pending events will not be replayed)", dumpdev);
198		goto done;
199	}
200
201	off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE;
202	off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET;
203
204	if (off == (off64_t)-1LL) {
205		fmd_hdl_error(hdl, "failed to seek dump transport %s "
206		    "(pending events will not be replayed)", dumpdev);
207		(void) close(fd);
208		goto done;
209	}
210
211	/*
212	 * The ereport dump region is a sequence of erpt_dump_t headers each of
213	 * which is followed by packed nvlist data.  We iterate over them in
214	 * order, unpacking and dispatching each one to our dispatch queue.
215	 */
216	for (;;) {
217		char nvbuf[ERPT_DATA_SZ];
218		uint32_t chksum;
219		erpt_dump_t ed;
220		nvlist_t *nvl;
221
222		fmd_timeval_t ftv, tod;
223		hrtime_t hrt;
224		uint64_t ena;
225
226		if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
227			fmd_hdl_error(hdl, "failed to read from dump "
228			    "transport %s (pending events lost)", dumpdev);
229			break;
230		}
231
232		if (ed.ed_magic == 0 && ed.ed_size == 0)
233			break; /* end of list: all zero */
234
235		if (ed.ed_magic == 0) {
236			off += sizeof (ed) + ed.ed_size;
237			continue; /* continue searching */
238		}
239
240		if (ed.ed_magic != ERPT_MAGIC) {
241			/*
242			 * Stop reading silently if the first record has the
243			 * wrong magic number; this likely indicates that we
244			 * rebooted from non-FMA bits or paged over the dump.
245			 */
246			if (off == off0)
247				break;
248
249			fmd_hdl_error(hdl, "invalid dump transport "
250			    "record at %llx (magic number %x, expected %x)\n",
251			    (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC);
252			break;
253		}
254
255		if (ed.ed_size > ERPT_DATA_SZ) {
256			fmd_hdl_error(hdl, "invalid dump transport "
257			    "record at %llx size (%u exceeds limit)\n",
258			    (u_longlong_t)off, ed.ed_size);
259			break;
260		}
261
262		if (pread64(fd, nvbuf, ed.ed_size,
263		    off + sizeof (ed)) != ed.ed_size) {
264			fmd_hdl_error(hdl, "failed to read dump "
265			    "transport event (offset %llx)", (u_longlong_t)off);
266
267			sysev_stats.dump_lost.fmds_value.ui64++;
268			goto next;
269		}
270
271		if ((chksum = sysev_checksum(nvbuf,
272		    ed.ed_size)) != ed.ed_chksum) {
273			fmd_hdl_error(hdl, "dump transport event at "
274			    "offset %llx is corrupt (checksum %x != %x)\n",
275			    (u_longlong_t)off, chksum, ed.ed_chksum);
276
277			sysev_stats.dump_lost.fmds_value.ui64++;
278			goto next;
279		}
280
281		if ((err = nvlist_xunpack(nvbuf,
282		    ed.ed_size, &nvl, &fmd.d_nva)) != 0) {
283			fmd_hdl_error(hdl, "failed to unpack dump "
284			    "transport event at offset %llx: %s\n",
285			    (u_longlong_t)off, fmd_strerror(err));
286
287			sysev_stats.dump_lost.fmds_value.ui64++;
288			goto next;
289		}
290
291		/*
292		 * If ed_hrt_nsec is set it contains the gethrtime() value from
293		 * when the event was originally enqueued for the transport.
294		 * If it is zero, we use the weaker bound ed_hrt_base instead.
295		 */
296		if (ed.ed_hrt_nsec != 0)
297			hrt = ed.ed_hrt_nsec;
298		else
299			hrt = ed.ed_hrt_base;
300
301		/*
302		 * If this is an FMA protocol event of class "ereport.*" that
303		 * contains valid ENA, we can improve the precision of 'hrt'.
304		 */
305		if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0)
306			hrt = fmd_time_ena2hrt(hrt, ena);
307
308		/*
309		 * Now convert 'hrt' to an adjustable TOD based on the values
310		 * in ed_tod_base which correspond to one another and are
311		 * sampled before reboot using the old gethrtime() clock.
312		 * fmd_event_recreate() will use this TOD value to re-assign
313		 * the event an updated gethrtime() value based on the current
314		 * value of the non-adjustable gethrtime() clock.  Phew.
315		 */
316		tod.ftv_sec = ed.ed_tod_base.sec;
317		tod.ftv_nsec = ed.ed_tod_base.nsec;
318		fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv);
319
320		(void) nvlist_remove_all(nvl, FMD_EVN_TOD);
321		(void) nvlist_add_uint64_array(nvl,
322		    FMD_EVN_TOD, (uint64_t *)&ftv, 2);
323
324		fmd_xprt_post(hdl, sysev_xprt, nvl, 0);
325		sysev_stats.dump_replay.fmds_value.ui64++;
326
327next:
328		/*
329		 * Reset the magic number for the event record to zero so that
330		 * we do not replay the same event multiple times.
331		 */
332		ed.ed_magic = 0;
333
334		if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
335			fmd_hdl_error(hdl, "failed to mark dump "
336			    "transport event (offset %llx)", (u_longlong_t)off);
337		}
338
339		off += sizeof (ed) + ed.ed_size;
340	}
341
342	(void) close(fd);
343done:
344	(void) pthread_mutex_lock(&sysev_replay_mutex);
345	sysev_replay_wait = 0;
346	(void) pthread_cond_broadcast(&sysev_replay_cv);
347	(void) pthread_mutex_unlock(&sysev_replay_mutex);
348}
349
350static const fmd_prop_t sysev_props[] = {
351	{ "class", FMD_TYPE_STRING, EC_ALL },		/* event class */
352	{ "device", FMD_TYPE_STRING, NULL },		/* replay device */
353	{ "channel", FMD_TYPE_STRING, FM_ERROR_CHAN },	/* channel name */
354	{ "sid", FMD_TYPE_STRING, "fmd" },		/* subscriber id */
355	{ NULL, 0, NULL }
356};
357
358static const fmd_hdl_ops_t sysev_ops = {
359	NULL,		/* fmdo_recv */
360	sysev_replay,	/* fmdo_timeout */
361	NULL,		/* fmdo_close */
362	NULL,		/* fmdo_stats */
363	NULL,		/* fmdo_gc */
364	NULL,		/* fmdo_send */
365};
366
367static const fmd_hdl_info_t sysev_info = {
368	"SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props
369};
370
371/*
372 * Bind to the sysevent channel we use for listening for error events and then
373 * subscribe to appropriate events received over this channel.
374 */
375void
376sysev_init(fmd_hdl_t *hdl)
377{
378	uint_t flags;
379
380	if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0)
381		return; /* invalid property settings */
382
383	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) /
384	    sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats);
385
386	sysev_channel = fmd_prop_get_string(hdl, "channel");
387	sysev_class = fmd_prop_get_string(hdl, "class");
388	sysev_device = fmd_prop_get_string(hdl, "device");
389	sysev_sid = fmd_prop_get_string(hdl, "sid");
390
391	if (sysev_channel == NULL)
392		fmd_hdl_abort(hdl, "channel property must be defined\n");
393
394	if (sysev_sid == NULL)
395		fmd_hdl_abort(hdl, "sid property must be defined\n");
396
397	if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc,
398	    EVCH_CREAT | EVCH_HOLD_PEND)) != 0) {
399		fmd_hdl_abort(hdl, "failed to bind to event transport "
400		    "channel %s", sysev_channel);
401	}
402
403	sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL);
404	sysev_hdl = hdl;
405
406	/*
407	 * If we're subscribing to the default channel, keep our subscription
408	 * active even if we die unexpectedly so we continue queuing events.
409	 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so
410	 * that our event channel will be destroyed if we die unpleasantly.
411	 */
412	if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0)
413		flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP;
414	else
415		flags = EVCH_SUB_DUMP;
416
417	errno = sysevent_evc_subscribe(sysev_evc,
418	    sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags);
419
420	if (errno != 0) {
421		if (errno == EEXIST) {
422			fmd_hdl_abort(hdl, "another fault management daemon is "
423			    "active on transport channel %s\n", sysev_channel);
424		} else {
425			fmd_hdl_abort(hdl, "failed to subscribe to %s on "
426			    "transport channel %s", sysev_class, sysev_channel);
427		}
428	}
429
430	/*
431	 * Once the transport is open, install a single timer to fire at once
432	 * in the context of the module's thread to run sysev_replay().  This
433	 * thread will block in its first fmd_xprt_post() until fmd is ready.
434	 */
435	fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel);
436	(void) fmd_timer_install(hdl, NULL, NULL, 0);
437}
438
439/*
440 * Close the channel by unsubscribing and unbinding.  We only do this when a
441 * a non-default channel has been selected.  If we're using FM_ERROR_CHAN,
442 * the system default, we do *not* want to unsubscribe because the kernel will
443 * remove the subscriber queue and any events published in our absence will
444 * therefore be lost.  This scenario may occur when, for example, fmd is sent
445 * a SIGTERM by init(1M) during reboot but an error is detected and makes it
446 * into the sysevent channel queue before init(1M) manages to call uadmin(2).
447 */
448void
449sysev_fini(fmd_hdl_t *hdl)
450{
451	if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) {
452		sysevent_evc_unsubscribe(sysev_evc, sysev_sid);
453		sysevent_evc_unbind(sysev_evc);
454	}
455
456	if (sysev_xprt != NULL)
457		fmd_xprt_close(hdl, sysev_xprt);
458
459	fmd_prop_free_string(hdl, sysev_class);
460	fmd_prop_free_string(hdl, sysev_channel);
461	fmd_prop_free_string(hdl, sysev_device);
462	fmd_prop_free_string(hdl, sysev_sid);
463}
464