1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd
22168404Spjd/*
23219089Spjd * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24307108Smav * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
25296519Smav * Copyright (c) 2014 Integros [integros.com]
26331397Smav * Copyright 2017 Joyent, Inc.
27168404Spjd */
28168404Spjd
29185029Spjd#include <sys/spa.h>
30168404Spjd#include <sys/spa_impl.h>
31168404Spjd#include <sys/zap.h>
32168404Spjd#include <sys/dsl_synctask.h>
33185029Spjd#include <sys/dmu_tx.h>
34185029Spjd#include <sys/dmu_objset.h>
35248571Smm#include <sys/dsl_dataset.h>
36248571Smm#include <sys/dsl_dir.h>
37185029Spjd#include <sys/utsname.h>
38185029Spjd#include <sys/sunddi.h>
39248571Smm#include <sys/cred.h>
40219089Spjd#include "zfs_comutil.h"
41185029Spjd#ifdef _KERNEL
42185029Spjd#include <sys/cmn_err.h>
43185029Spjd#include <sys/zone.h>
44185029Spjd#endif
45168404Spjd
46168404Spjd/*
47168404Spjd * Routines to manage the on-disk history log.
48168404Spjd *
49168404Spjd * The history log is stored as a dmu object containing
50168404Spjd * <packed record length, record nvlist> tuples.
51168404Spjd *
52168404Spjd * Where "record nvlist" is a nvlist containing uint64_ts and strings, and
53168404Spjd * "packed record length" is the packed length of the "record nvlist" stored
54168404Spjd * as a little endian uint64_t.
55168404Spjd *
56168404Spjd * The log is implemented as a ring buffer, though the original creation
57168404Spjd * of the pool ('zpool create') is never overwritten.
58168404Spjd *
59168404Spjd * The history log is tracked as object 'spa_t::spa_history'.  The bonus buffer
60168404Spjd * of 'spa_history' stores the offsets for logging/retrieving history as
61168404Spjd * 'spa_history_phys_t'.  'sh_pool_create_len' is the ending offset in bytes of
62168404Spjd * where the 'zpool create' record is stored.  This allows us to never
63168404Spjd * overwrite the original creation of the pool.  'sh_phys_max_off' is the
64168404Spjd * physical ending offset in bytes of the log.  This tells you the length of
65168404Spjd * the buffer. 'sh_eof' is the logical EOF (in bytes).  Whenever a record
66168404Spjd * is added, 'sh_eof' is incremented by the size of the record.
67168404Spjd * 'sh_eof' is never decremented.  'sh_bof' is the logical BOF (in bytes).
68168404Spjd * This is where the consumer should start reading from after reading in
69168404Spjd * the 'zpool create' portion of the log.
70168404Spjd *
71168404Spjd * 'sh_records_lost' keeps track of how many records have been overwritten
72168404Spjd * and permanently lost.
73168404Spjd */
74168404Spjd
75168404Spjd/* convert a logical offset to physical */
76168404Spjdstatic uint64_t
77168404Spjdspa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
78168404Spjd{
79168404Spjd	uint64_t phys_len;
80168404Spjd
81168404Spjd	phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
82168404Spjd	return ((log_off - shpp->sh_pool_create_len) % phys_len
83168404Spjd	    + shpp->sh_pool_create_len);
84168404Spjd}
85168404Spjd
86168404Spjdvoid
87168404Spjdspa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
88168404Spjd{
89168404Spjd	dmu_buf_t *dbp;
90168404Spjd	spa_history_phys_t *shpp;
91168404Spjd	objset_t *mos = spa->spa_meta_objset;
92168404Spjd
93168404Spjd	ASSERT(spa->spa_history == 0);
94168404Spjd	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
95274337Sdelphij	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
96168404Spjd	    sizeof (spa_history_phys_t), tx);
97168404Spjd
98168404Spjd	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
99168404Spjd	    DMU_POOL_HISTORY, sizeof (uint64_t), 1,
100168404Spjd	    &spa->spa_history, tx) == 0);
101168404Spjd
102168404Spjd	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
103168404Spjd	ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
104168404Spjd
105168404Spjd	shpp = dbp->db_data;
106168404Spjd	dmu_buf_will_dirty(dbp, tx);
107168404Spjd
108168404Spjd	/*
109168404Spjd	 * Figure out maximum size of history log.  We set it at
110228103Smm	 * 0.1% of pool size, with a max of 1G and min of 128KB.
111168404Spjd	 */
112219089Spjd	shpp->sh_phys_max_off =
113228103Smm	    metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
114228103Smm	shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
115168404Spjd	shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
116168404Spjd
117168404Spjd	dmu_buf_rele(dbp, FTAG);
118168404Spjd}
119168404Spjd
120168404Spjd/*
121168404Spjd * Change 'sh_bof' to the beginning of the next record.
122168404Spjd */
123168404Spjdstatic int
124168404Spjdspa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
125168404Spjd{
126168404Spjd	objset_t *mos = spa->spa_meta_objset;
127168404Spjd	uint64_t firstread, reclen, phys_bof;
128168404Spjd	char buf[sizeof (reclen)];
129168404Spjd	int err;
130168404Spjd
131168404Spjd	phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
132168404Spjd	firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
133168404Spjd
134168404Spjd	if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
135209962Smm	    buf, DMU_READ_PREFETCH)) != 0)
136168404Spjd		return (err);
137168404Spjd	if (firstread != sizeof (reclen)) {
138168404Spjd		if ((err = dmu_read(mos, spa->spa_history,
139168404Spjd		    shpp->sh_pool_create_len, sizeof (reclen) - firstread,
140209962Smm		    buf + firstread, DMU_READ_PREFETCH)) != 0)
141168404Spjd			return (err);
142168404Spjd	}
143168404Spjd
144168404Spjd	reclen = LE_64(*((uint64_t *)buf));
145168404Spjd	shpp->sh_bof += reclen + sizeof (reclen);
146168404Spjd	shpp->sh_records_lost++;
147168404Spjd	return (0);
148168404Spjd}
149168404Spjd
150168404Spjdstatic int
151168404Spjdspa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
152168404Spjd    dmu_tx_t *tx)
153168404Spjd{
154168404Spjd	uint64_t firstwrite, phys_eof;
155168404Spjd	objset_t *mos = spa->spa_meta_objset;
156168404Spjd	int err;
157168404Spjd
158168404Spjd	ASSERT(MUTEX_HELD(&spa->spa_history_lock));
159168404Spjd
160168404Spjd	/* see if we need to reset logical BOF */
161168404Spjd	while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
162168404Spjd	    (shpp->sh_eof - shpp->sh_bof) <= len) {
163185029Spjd		if ((err = spa_history_advance_bof(spa, shpp)) != 0) {
164168404Spjd			return (err);
165185029Spjd		}
166168404Spjd	}
167168404Spjd
168168404Spjd	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
169168404Spjd	firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
170168404Spjd	shpp->sh_eof += len;
171168404Spjd	dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
172168404Spjd
173168404Spjd	len -= firstwrite;
174168404Spjd	if (len > 0) {
175168404Spjd		/* write out the rest at the beginning of physical file */
176168404Spjd		dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
177168404Spjd		    len, (char *)buf + firstwrite, tx);
178168404Spjd	}
179168404Spjd
180168404Spjd	return (0);
181168404Spjd}
182168404Spjd
/*
 * Return the hostname of the jail the current thread runs in, or NULL when
 * the thread is not jailed (or when not built for the kernel).
 */
static char *
spa_history_zone(void)
{
	char *zone = NULL;

#ifdef _KERNEL
	/* XXX: pr_hostname can be changed by default from within a jail! */
	if (jailed(curthread->td_ucred))
		zone = curthread->td_ucred->cr_prison->pr_hostname;
#endif
	return (zone);
}
193185029Spjd
194168404Spjd/*
195331397Smav * Post a history sysevent.
196331397Smav *
197331397Smav * The nvlist_t* passed into this function will be transformed into a new
198331397Smav * nvlist where:
199331397Smav *
200331397Smav * 1. Nested nvlists will be flattened to a single level
201331397Smav * 2. Keys will have their names normalized (to remove any problematic
202331397Smav * characters, such as whitespace)
203331397Smav *
204331397Smav * The nvlist_t passed into this function will duplicated and should be freed
205331397Smav * by caller.
206331397Smav *
207331397Smav */
208331397Smavstatic void
209331397Smavspa_history_log_notify(spa_t *spa, nvlist_t *nvl)
210331397Smav{
211331397Smav	nvlist_t *hist_nvl = fnvlist_alloc();
212331397Smav	uint64_t uint64;
213331397Smav	char *string;
214331397Smav
215331397Smav	if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0)
216331397Smav		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string);
217331397Smav
218331397Smav	if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
219331397Smav		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
220331397Smav
221331397Smav	if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0)
222331397Smav		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string);
223331397Smav
224331397Smav	if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0)
225331397Smav		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string);
226331397Smav
227331397Smav	if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0)
228331397Smav		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string);
229331397Smav
230331397Smav	if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0)
231331397Smav		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string);
232331397Smav
233331397Smav	if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0)
234331397Smav		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string);
235331397Smav
236331397Smav	if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
237331397Smav		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
238331397Smav
239331397Smav	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0)
240331397Smav		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64);
241331397Smav
242331397Smav	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0)
243331397Smav		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64);
244331397Smav
245331397Smav	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0)
246331397Smav		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64);
247331397Smav
248331397Smav	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0)
249331397Smav		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64);
250331397Smav
251331397Smav	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0)
252331397Smav		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64);
253331397Smav
254331397Smav	spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT);
255331397Smav
256331397Smav	nvlist_free(hist_nvl);
257331397Smav}
258331397Smav
259331397Smav/*
260168404Spjd * Write out a history event.
261168404Spjd */
262219089Spjd/*ARGSUSED*/
263185029Spjdstatic void
264248571Smmspa_history_log_sync(void *arg, dmu_tx_t *tx)
265168404Spjd{
266248571Smm	nvlist_t	*nvl = arg;
267248571Smm	spa_t		*spa = dmu_tx_pool(tx)->dp_spa;
268168404Spjd	objset_t	*mos = spa->spa_meta_objset;
269168404Spjd	dmu_buf_t	*dbp;
270168404Spjd	spa_history_phys_t *shpp;
271168404Spjd	size_t		reclen;
272168404Spjd	uint64_t	le_len;
273168404Spjd	char		*record_packed = NULL;
274168404Spjd	int		ret;
275168404Spjd
276168404Spjd	/*
277168404Spjd	 * If we have an older pool that doesn't have a command
278168404Spjd	 * history object, create it now.
279168404Spjd	 */
280168404Spjd	mutex_enter(&spa->spa_history_lock);
281168404Spjd	if (!spa->spa_history)
282168404Spjd		spa_history_create_obj(spa, tx);
283168404Spjd	mutex_exit(&spa->spa_history_lock);
284168404Spjd
285168404Spjd	/*
286168404Spjd	 * Get the offset of where we need to write via the bonus buffer.
287168404Spjd	 * Update the offset when the write completes.
288168404Spjd	 */
289248571Smm	VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
290168404Spjd	shpp = dbp->db_data;
291168404Spjd
292168404Spjd	dmu_buf_will_dirty(dbp, tx);
293168404Spjd
294168404Spjd#ifdef ZFS_DEBUG
295168404Spjd	{
296168404Spjd		dmu_object_info_t doi;
297168404Spjd		dmu_object_info_from_db(dbp, &doi);
298168404Spjd		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
299168404Spjd	}
300168404Spjd#endif
301168404Spjd
302248571Smm	fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
303185029Spjd#ifdef _KERNEL
304248571Smm	fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename);
305185029Spjd#endif
306248571Smm	if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
307248571Smm		zfs_dbgmsg("command: %s",
308248571Smm		    fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
309248571Smm	} else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
310248571Smm		if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
311248571Smm			zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
312248571Smm			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
313248571Smm			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
314248571Smm			    fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
315248571Smm			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
316248571Smm			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
317248571Smm		} else {
318248571Smm			zfs_dbgmsg("txg %lld %s %s",
319248571Smm			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
320248571Smm			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
321248571Smm			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
322248571Smm		}
323331397Smav		/*
324331397Smav		 * The history sysevent is posted only for internal history
325331397Smav		 * messages to show what has happened, not how it happened. For
326331397Smav		 * example, the following command:
327331397Smav		 *
328331397Smav		 * # zfs destroy -r tank/foo
329331397Smav		 *
330331397Smav		 * will result in one sysevent posted per dataset that is
331331397Smav		 * destroyed as a result of the command - which could be more
332331397Smav		 * than one event in total.  By contrast, if the sysevent was
333331397Smav		 * posted as a result of the ZPOOL_HIST_CMD key being present
334331397Smav		 * it would result in only one sysevent being posted with the
335331397Smav		 * full command line arguments, requiring the consumer to know
336331397Smav		 * how to parse and understand zfs(1M) command invocations.
337331397Smav		 */
338331397Smav		spa_history_log_notify(spa, nvl);
339248571Smm	} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
340248571Smm		zfs_dbgmsg("ioctl %s",
341248571Smm		    fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
342185029Spjd	}
343185029Spjd
344248571Smm	record_packed = fnvlist_pack(nvl, &reclen);
345185029Spjd
346168404Spjd	mutex_enter(&spa->spa_history_lock);
347168404Spjd
348168404Spjd	/* write out the packed length as little endian */
349168404Spjd	le_len = LE_64((uint64_t)reclen);
350168404Spjd	ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
351168404Spjd	if (!ret)
352168404Spjd		ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
353168404Spjd
354248571Smm	/* The first command is the create, which we keep forever */
355248571Smm	if (ret == 0 && shpp->sh_pool_create_len == 0 &&
356248571Smm	    nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
357248571Smm		shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
358168404Spjd	}
359168404Spjd
360168404Spjd	mutex_exit(&spa->spa_history_lock);
361248571Smm	fnvlist_pack_free(record_packed, reclen);
362168404Spjd	dmu_buf_rele(dbp, FTAG);
363248571Smm	fnvlist_free(nvl);
364168404Spjd}
365168404Spjd
366168404Spjd/*
367168404Spjd * Write out a history event.
368168404Spjd */
369168404Spjdint
370248571Smmspa_history_log(spa_t *spa, const char *msg)
371168404Spjd{
372248571Smm	int err;
373248571Smm	nvlist_t *nvl = fnvlist_alloc();
374248571Smm
375248571Smm	fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg);
376248571Smm	err = spa_history_log_nvl(spa, nvl);
377248571Smm	fnvlist_free(nvl);
378248571Smm	return (err);
379248571Smm}
380248571Smm
381248571Smmint
382248571Smmspa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
383248571Smm{
384219089Spjd	int err = 0;
385219089Spjd	dmu_tx_t *tx;
386248571Smm	nvlist_t *nvarg;
387168404Spjd
388248571Smm	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY)
389248571Smm		return (EINVAL);
390185029Spjd
391240133Smm	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
392249195Smm		return (SET_ERROR(EINVAL));
393240133Smm
394219089Spjd	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
395219089Spjd	err = dmu_tx_assign(tx, TXG_WAIT);
396219089Spjd	if (err) {
397219089Spjd		dmu_tx_abort(tx);
398219089Spjd		return (err);
399219089Spjd	}
400219089Spjd
401248571Smm	nvarg = fnvlist_dup(nvl);
402248571Smm	if (spa_history_zone() != NULL) {
403248571Smm		fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE,
404248571Smm		    spa_history_zone());
405248571Smm	}
406248571Smm	fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
407219089Spjd
408219089Spjd	/* Kick this off asynchronously; errors are ignored. */
409248571Smm	dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
410268473Sdelphij	    nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
411219089Spjd	dmu_tx_commit(tx);
412219089Spjd
413248571Smm	/* spa_history_log_sync will free nvl */
414219089Spjd	return (err);
415248571Smm
416168404Spjd}
417168404Spjd
418168404Spjd/*
419168404Spjd * Read out the command history.
420168404Spjd */
421168404Spjdint
422168404Spjdspa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
423168404Spjd{
424168404Spjd	objset_t *mos = spa->spa_meta_objset;
425168404Spjd	dmu_buf_t *dbp;
426168404Spjd	uint64_t read_len, phys_read_off, phys_eof;
427168404Spjd	uint64_t leftover = 0;
428168404Spjd	spa_history_phys_t *shpp;
429168404Spjd	int err;
430168404Spjd
431168404Spjd	/*
432248571Smm	 * If the command history doesn't exist (older pool),
433168404Spjd	 * that's ok, just return ENOENT.
434168404Spjd	 */
435168404Spjd	if (!spa->spa_history)
436249195Smm		return (SET_ERROR(ENOENT));
437168404Spjd
438219089Spjd	/*
439219089Spjd	 * The history is logged asynchronously, so when they request
440219089Spjd	 * the first chunk of history, make sure everything has been
441219089Spjd	 * synced to disk so that we get it.
442219089Spjd	 */
443219089Spjd	if (*offp == 0 && spa_writeable(spa))
444219089Spjd		txg_wait_synced(spa_get_dsl(spa), 0);
445219089Spjd
446168404Spjd	if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
447168404Spjd		return (err);
448168404Spjd	shpp = dbp->db_data;
449168404Spjd
450168404Spjd#ifdef ZFS_DEBUG
451168404Spjd	{
452168404Spjd		dmu_object_info_t doi;
453168404Spjd		dmu_object_info_from_db(dbp, &doi);
454168404Spjd		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
455168404Spjd	}
456168404Spjd#endif
457168404Spjd
458168404Spjd	mutex_enter(&spa->spa_history_lock);
459168404Spjd	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
460168404Spjd
461168404Spjd	if (*offp < shpp->sh_pool_create_len) {
462168404Spjd		/* read in just the zpool create history */
463168404Spjd		phys_read_off = *offp;
464168404Spjd		read_len = MIN(*len, shpp->sh_pool_create_len -
465168404Spjd		    phys_read_off);
466168404Spjd	} else {
467168404Spjd		/*
468168404Spjd		 * Need to reset passed in offset to BOF if the passed in
469168404Spjd		 * offset has since been overwritten.
470168404Spjd		 */
471168404Spjd		*offp = MAX(*offp, shpp->sh_bof);
472168404Spjd		phys_read_off = spa_history_log_to_phys(*offp, shpp);
473168404Spjd
474168404Spjd		/*
475168404Spjd		 * Read up to the minimum of what the user passed down or
476168404Spjd		 * the EOF (physical or logical).  If we hit physical EOF,
477168404Spjd		 * use 'leftover' to read from the physical BOF.
478168404Spjd		 */
479168404Spjd		if (phys_read_off <= phys_eof) {
480168404Spjd			read_len = MIN(*len, phys_eof - phys_read_off);
481168404Spjd		} else {
482168404Spjd			read_len = MIN(*len,
483168404Spjd			    shpp->sh_phys_max_off - phys_read_off);
484168404Spjd			if (phys_read_off + *len > shpp->sh_phys_max_off) {
485168404Spjd				leftover = MIN(*len - read_len,
486168404Spjd				    phys_eof - shpp->sh_pool_create_len);
487168404Spjd			}
488168404Spjd		}
489168404Spjd	}
490168404Spjd
491168404Spjd	/* offset for consumer to use next */
492168404Spjd	*offp += read_len + leftover;
493168404Spjd
494168404Spjd	/* tell the consumer how much you actually read */
495168404Spjd	*len = read_len + leftover;
496168404Spjd
497168404Spjd	if (read_len == 0) {
498168404Spjd		mutex_exit(&spa->spa_history_lock);
499168404Spjd		dmu_buf_rele(dbp, FTAG);
500168404Spjd		return (0);
501168404Spjd	}
502168404Spjd
503209962Smm	err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
504209962Smm	    DMU_READ_PREFETCH);
505168404Spjd	if (leftover && err == 0) {
506168404Spjd		err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
507209962Smm		    leftover, buf + read_len, DMU_READ_PREFETCH);
508168404Spjd	}
509168404Spjd	mutex_exit(&spa->spa_history_lock);
510168404Spjd
511168404Spjd	dmu_buf_rele(dbp, FTAG);
512168404Spjd	return (err);
513168404Spjd}
514185029Spjd
515248571Smm/*
516248571Smm * The nvlist will be consumed by this call.
517248571Smm */
518219089Spjdstatic void
519248571Smmlog_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
520219089Spjd    dmu_tx_t *tx, const char *fmt, va_list adx)
521185029Spjd{
522248571Smm	char *msg;
523219089Spjd	va_list adx2;
524185029Spjd
525185029Spjd	/*
526185029Spjd	 * If this is part of creating a pool, not everything is
527185029Spjd	 * initialized yet, so don't bother logging the internal events.
528240133Smm	 * Likewise if the pool is not writeable.
529185029Spjd	 */
530248571Smm	if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) {
531248571Smm		fnvlist_free(nvl);
532185029Spjd		return;
533248571Smm	}
534185029Spjd
535219089Spjd	va_copy(adx2, adx);
536185029Spjd
537248571Smm	msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
538248571Smm	(void) vsprintf(msg, fmt, adx2);
539248571Smm	fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg);
540248571Smm	strfree(msg);
541185029Spjd
542219089Spjd	va_end(adx2);
543185029Spjd
544248571Smm	fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation);
545248571Smm	fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);
546219089Spjd
547185029Spjd	if (dmu_tx_is_syncing(tx)) {
548248571Smm		spa_history_log_sync(nvl, tx);
549185029Spjd	} else {
550248571Smm		dsl_sync_task_nowait(spa_get_dsl(spa),
551268473Sdelphij		    spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
552185029Spjd	}
553248571Smm	/* spa_history_log_sync() will free nvl */
554185029Spjd}
555219089Spjd
556219089Spjdvoid
557248571Smmspa_history_log_internal(spa_t *spa, const char *operation,
558219089Spjd    dmu_tx_t *tx, const char *fmt, ...)
559219089Spjd{
560219089Spjd	dmu_tx_t *htx = tx;
561219089Spjd	va_list adx;
562219089Spjd
563219089Spjd	/* create a tx if we didn't get one */
564219089Spjd	if (tx == NULL) {
565219089Spjd		htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
566219089Spjd		if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
567219089Spjd			dmu_tx_abort(htx);
568219089Spjd			return;
569219089Spjd		}
570219089Spjd	}
571219089Spjd
572219089Spjd	va_start(adx, fmt);
573248571Smm	log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx);
574219089Spjd	va_end(adx);
575219089Spjd
576219089Spjd	/* if we didn't get a tx from the caller, commit the one we made */
577219089Spjd	if (tx == NULL)
578219089Spjd		dmu_tx_commit(htx);
579219089Spjd}
580219089Spjd
581219089Spjdvoid
582248571Smmspa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation,
583248571Smm    dmu_tx_t *tx, const char *fmt, ...)
584219089Spjd{
585248571Smm	va_list adx;
586307108Smav	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
587248571Smm	nvlist_t *nvl = fnvlist_alloc();
588219089Spjd
589248571Smm	ASSERT(tx != NULL);
590248571Smm
591248571Smm	dsl_dataset_name(ds, namebuf);
592248571Smm	fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
593248571Smm	fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object);
594248571Smm
595248571Smm	va_start(adx, fmt);
596248571Smm	log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx);
597248571Smm	va_end(adx);
598219089Spjd}
599248571Smm
600248571Smmvoid
601248571Smmspa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
602248571Smm    dmu_tx_t *tx, const char *fmt, ...)
603248571Smm{
604248571Smm	va_list adx;
605307108Smav	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
606248571Smm	nvlist_t *nvl = fnvlist_alloc();
607248571Smm
608248571Smm	ASSERT(tx != NULL);
609248571Smm
610248571Smm	dsl_dir_name(dd, namebuf);
611248571Smm	fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
612248571Smm	fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
613275782Sdelphij	    dsl_dir_phys(dd)->dd_head_dataset_obj);
614248571Smm
615248571Smm	va_start(adx, fmt);
616248571Smm	log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
617248571Smm	va_end(adx);
618248571Smm}
619248571Smm
620248571Smmvoid
621248571Smmspa_history_log_version(spa_t *spa, const char *operation)
622248571Smm{
623248571Smm	spa_history_log_internal(spa, operation, NULL,
624329481Smav	    "pool version %llu; software version %llu/%llu; uts %s %s %s %s",
625248571Smm	    (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION,
626248571Smm	    utsname.nodename, utsname.release, utsname.version,
627248571Smm	    utsname.machine);
628248571Smm}
629