dmu_send.c revision 275811
117680Spst/* 2127668Sbms * CDDL HEADER START 3127668Sbms * 4127668Sbms * The contents of this file are subject to the terms of the 5111726Sfenner * Common Development and Distribution License (the "License"). 6127668Sbms * You may not use this file except in compliance with the License. 7127668Sbms * 8127668Sbms * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9127668Sbms * or http://www.opensolaris.org/os/licensing. 10127668Sbms * See the License for the specific language governing permissions 11127668Sbms * and limitations under the License. 12127668Sbms * 1375115Sfenner * When distributing Covered Code, include this CDDL HEADER in each 14127668Sbms * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15127668Sbms * If applicable, add the following below this CDDL HEADER, with the 16127668Sbms * fields enclosed by brackets "[]" replaced with your own identifying 17127668Sbms * information: Portions Copyright [yyyy] [name of copyright owner] 18127668Sbms * 19127668Sbms * CDDL HEADER END 20127668Sbms */ 21127668Sbms/* 22127668Sbms * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23127668Sbms * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 2475115Sfenner * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25127668Sbms * Copyright (c) 2014, Joyent, Inc. All rights reserved. 26127668Sbms * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27127668Sbms * Copyright 2014 HybridCluster. All rights reserved. 
28127668Sbms */ 29127668Sbms 30127668Sbms#include <sys/dmu.h> 3175115Sfenner#include <sys/dmu_impl.h> 3275115Sfenner#include <sys/dmu_tx.h> 33127668Sbms#include <sys/dbuf.h> 34127668Sbms#include <sys/dnode.h> 35127668Sbms#include <sys/zfs_context.h> 36127668Sbms#include <sys/dmu_objset.h> 37127668Sbms#include <sys/dmu_traverse.h> 3875115Sfenner#include <sys/dsl_dataset.h> 39127668Sbms#include <sys/dsl_dir.h> 40127668Sbms#include <sys/dsl_prop.h> 41127668Sbms#include <sys/dsl_pool.h> 42127668Sbms#include <sys/dsl_synctask.h> 43127668Sbms#include <sys/zfs_ioctl.h> 44127668Sbms#include <sys/zap.h> 45127668Sbms#include <sys/zio_checksum.h> 46127668Sbms#include <sys/zfs_znode.h> 47127668Sbms#include <zfs_fletcher.h> 48127668Sbms#include <sys/avl.h> 49127668Sbms#include <sys/ddt.h> 50127668Sbms#include <sys/zfs_onexit.h> 5175115Sfenner#include <sys/dmu_send.h> 52127668Sbms#include <sys/dsl_destroy.h> 53127668Sbms#include <sys/blkptr.h> 54127668Sbms#include <sys/dsl_bookmark.h> 55127668Sbms#include <sys/zfeature.h> 56127668Sbms 57127668Sbms#ifdef __FreeBSD__ 5875115Sfenner#undef dump_write 59127668Sbms#define dump_write dmu_dump_write 60127668Sbms#endif 61127668Sbms 62127668Sbms/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 63127668Sbmsint zfs_send_corrupt_data = B_FALSE; 6475115Sfenner 6575115Sfennerstatic char *dmu_recv_tag = "dmu_recv_tag"; 66127668Sbmsstatic const char *recv_clone_name = "%recv"; 67127668Sbms 68127668Sbmsstatic int 69127668Sbmsdump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 70127668Sbms{ 71127668Sbms dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; 72127668Sbms struct uio auio; 73127668Sbms struct iovec aiov; 74127668Sbms ASSERT0(len % 8); 75127668Sbms 76127668Sbms fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); 7775115Sfenner aiov.iov_base = buf; 7875115Sfenner aiov.iov_len = len; 79127668Sbms auio.uio_iov = &aiov; 80127668Sbms auio.uio_iovcnt = 1; 81127668Sbms auio.uio_resid = len; 82127668Sbms auio.uio_segflg = 
UIO_SYSSPACE; 83127668Sbms auio.uio_rw = UIO_WRITE; 84127668Sbms auio.uio_offset = (off_t)-1; 85127668Sbms auio.uio_td = dsp->dsa_td; 8675115Sfenner#ifdef _KERNEL 87127668Sbms if (dsp->dsa_fp->f_type == DTYPE_VNODE) 88127668Sbms bwillwrite(); 89127668Sbms dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, 90127668Sbms dsp->dsa_td); 91127668Sbms#else 92127668Sbms fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 93127668Sbms dsp->dsa_err = EOPNOTSUPP; 94127668Sbms#endif 95127668Sbms mutex_enter(&ds->ds_sendstream_lock); 96127668Sbms *dsp->dsa_off += len; 97127668Sbms mutex_exit(&ds->ds_sendstream_lock); 98127668Sbms 9975115Sfenner return (dsp->dsa_err); 10075115Sfenner} 101127668Sbms 102127668Sbmsstatic int 103127668Sbmsdump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 104127668Sbms uint64_t length) 105127668Sbms{ 106127668Sbms struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); 107127668Sbms 108127668Sbms /* 109127668Sbms * When we receive a free record, dbuf_free_range() assumes 110127668Sbms * that the receiving system doesn't have any dbufs in the range 111127668Sbms * being freed. This is always true because there is a one-record 112127668Sbms * constraint: we only send one WRITE record for any given 113127668Sbms * object+offset. We know that the one-record constraint is 114127668Sbms * true because we always send data in increasing order by 115127668Sbms * object,offset. 116127668Sbms * 11775115Sfenner * If the increasing-order constraint ever changes, we should find 118127668Sbms * another way to assert that the one-record constraint is still 119127668Sbms * satisfied. 120127668Sbms */ 121127668Sbms ASSERT(object > dsp->dsa_last_data_object || 122127668Sbms (object == dsp->dsa_last_data_object && 123127668Sbms offset > dsp->dsa_last_data_offset)); 124127668Sbms 125127668Sbms /* 126127668Sbms * If we are doing a non-incremental send, then there can't 127127668Sbms * be any data in the dataset we're receiving into. 
Therefore 128127668Sbms * a free record would simply be a no-op. Save space by not 129127668Sbms * sending it to begin with. 130127668Sbms */ 131127668Sbms if (!dsp->dsa_incremental) 132127668Sbms return (0); 133127668Sbms 134127668Sbms if (length != -1ULL && offset + length < offset) 135127668Sbms length = -1ULL; 136127668Sbms 137127668Sbms /* 138127668Sbms * If there is a pending op, but it's not PENDING_FREE, push it out, 139127668Sbms * since free block aggregation can only be done for blocks of the 140127668Sbms * same type (i.e., DRR_FREE records can only be aggregated with 141127668Sbms * other DRR_FREE records. DRR_FREEOBJECTS records can only be 142127668Sbms * aggregated with other DRR_FREEOBJECTS records. 143127668Sbms */ 144127668Sbms if (dsp->dsa_pending_op != PENDING_NONE && 145127668Sbms dsp->dsa_pending_op != PENDING_FREE) { 146127668Sbms if (dump_bytes(dsp, dsp->dsa_drr, 147127668Sbms sizeof (dmu_replay_record_t)) != 0) 148127668Sbms return (SET_ERROR(EINTR)); 149127668Sbms dsp->dsa_pending_op = PENDING_NONE; 150127668Sbms } 151127668Sbms 152127668Sbms if (dsp->dsa_pending_op == PENDING_FREE) { 153127668Sbms /* 154127668Sbms * There should never be a PENDING_FREE if length is -1 155127668Sbms * (because dump_dnode is the only place where this 15675115Sfenner * function is called with a -1, and only after flushing 157127668Sbms * any pending record). 158127668Sbms */ 159127668Sbms ASSERT(length != -1ULL); 160127668Sbms /* 161127668Sbms * Check to see whether this free block can be aggregated 162127668Sbms * with pending one. 163127668Sbms */ 164127668Sbms if (drrf->drr_object == object && drrf->drr_offset + 165127668Sbms drrf->drr_length == offset) { 166127668Sbms drrf->drr_length += length; 167127668Sbms return (0); 168127668Sbms } else { 169127668Sbms /* not a continuation. 
Push out pending record */ 170127668Sbms if (dump_bytes(dsp, dsp->dsa_drr, 171127668Sbms sizeof (dmu_replay_record_t)) != 0) 172127668Sbms return (SET_ERROR(EINTR)); 173127668Sbms dsp->dsa_pending_op = PENDING_NONE; 174127668Sbms } 175127668Sbms } 176127668Sbms /* create a FREE record and make it pending */ 177127668Sbms bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 17875115Sfenner dsp->dsa_drr->drr_type = DRR_FREE; 179127668Sbms drrf->drr_object = object; 180127668Sbms drrf->drr_offset = offset; 181127668Sbms drrf->drr_length = length; 182127668Sbms drrf->drr_toguid = dsp->dsa_toguid; 183127668Sbms if (length == -1ULL) { 184127668Sbms if (dump_bytes(dsp, dsp->dsa_drr, 185127668Sbms sizeof (dmu_replay_record_t)) != 0) 18675115Sfenner return (SET_ERROR(EINTR)); 18775115Sfenner } else { 188127668Sbms dsp->dsa_pending_op = PENDING_FREE; 189127668Sbms } 190127668Sbms 191127668Sbms return (0); 192127668Sbms} 193127668Sbms 19475115Sfennerstatic int 195127668Sbmsdump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, 196127668Sbms uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) 197127668Sbms{ 198127668Sbms struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); 199127668Sbms 20075115Sfenner /* 201127668Sbms * We send data in increasing object, offset order. 202127668Sbms * See comment in dump_free() for details. 203127668Sbms */ 204127668Sbms ASSERT(object > dsp->dsa_last_data_object || 205127668Sbms (object == dsp->dsa_last_data_object && 206127668Sbms offset > dsp->dsa_last_data_offset)); 207127668Sbms dsp->dsa_last_data_object = object; 208127668Sbms dsp->dsa_last_data_offset = offset + blksz - 1; 209127668Sbms 210127668Sbms /* 211127668Sbms * If there is any kind of pending aggregation (currently either 212127668Sbms * a grouping of free objects or free blocks), push it out to 213127668Sbms * the stream, since aggregation can't be done across operations 214127668Sbms * of different types. 
215127668Sbms */ 216127668Sbms if (dsp->dsa_pending_op != PENDING_NONE) { 217127668Sbms if (dump_bytes(dsp, dsp->dsa_drr, 21875115Sfenner sizeof (dmu_replay_record_t)) != 0) 219127668Sbms return (SET_ERROR(EINTR)); 220127668Sbms dsp->dsa_pending_op = PENDING_NONE; 221127668Sbms } 222127668Sbms /* write a DATA record */ 223127668Sbms bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 22475115Sfenner dsp->dsa_drr->drr_type = DRR_WRITE; 225127668Sbms drrw->drr_object = object; 22675115Sfenner drrw->drr_type = type; 227127668Sbms drrw->drr_offset = offset; 228127668Sbms drrw->drr_length = blksz; 22975115Sfenner drrw->drr_toguid = dsp->dsa_toguid; 230127668Sbms if (bp == NULL || BP_IS_EMBEDDED(bp)) { 231127668Sbms /* 23275115Sfenner * There's no pre-computed checksum for partial-block 23375115Sfenner * writes or embedded BP's, so (like 234127668Sbms * fletcher4-checkummed blocks) userland will have to 235127668Sbms * compute a dedup-capable checksum itself. 236127668Sbms */ 237127668Sbms drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; 238127668Sbms } else { 23975115Sfenner drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 240127668Sbms if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 241127668Sbms drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 24275115Sfenner DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 24375115Sfenner DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 244127668Sbms DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 245127668Sbms drrw->drr_key.ddk_cksum = bp->blk_cksum; 246127668Sbms } 247127668Sbms 24875115Sfenner if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 249127668Sbms return (SET_ERROR(EINTR)); 25075115Sfenner if (dump_bytes(dsp, data, blksz) != 0) 251127668Sbms return (SET_ERROR(EINTR)); 252127668Sbms return (0); 253127668Sbms} 254127668Sbms 255127668Sbmsstatic int 256127668Sbmsdump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 257127668Sbms int blksz, const blkptr_t *bp) 258127668Sbms{ 
259127668Sbms char buf[BPE_PAYLOAD_SIZE]; 260127668Sbms struct drr_write_embedded *drrw = 26175115Sfenner &(dsp->dsa_drr->drr_u.drr_write_embedded); 262127668Sbms 263127668Sbms if (dsp->dsa_pending_op != PENDING_NONE) { 264127668Sbms if (dump_bytes(dsp, dsp->dsa_drr, 265127668Sbms sizeof (dmu_replay_record_t)) != 0) 26675115Sfenner return (EINTR); 267127668Sbms dsp->dsa_pending_op = PENDING_NONE; 268127668Sbms } 269127668Sbms 270127668Sbms ASSERT(BP_IS_EMBEDDED(bp)); 271127668Sbms 272127668Sbms bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 27375115Sfenner dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; 274127668Sbms drrw->drr_object = object; 275127668Sbms drrw->drr_offset = offset; 276127668Sbms drrw->drr_length = blksz; 277127668Sbms drrw->drr_toguid = dsp->dsa_toguid; 278127668Sbms drrw->drr_compression = BP_GET_COMPRESS(bp); 279127668Sbms drrw->drr_etype = BPE_GET_ETYPE(bp); 280127668Sbms drrw->drr_lsize = BPE_GET_LSIZE(bp); 281127668Sbms drrw->drr_psize = BPE_GET_PSIZE(bp); 282127668Sbms 283127668Sbms decode_embedded_bp_compressed(bp, buf); 284127668Sbms 285127668Sbms if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 286127668Sbms return (EINTR); 287127668Sbms if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) 288127668Sbms return (EINTR); 289127668Sbms return (0); 290127668Sbms} 291127668Sbms 292127668Sbmsstatic int 293127668Sbmsdump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) 294127668Sbms{ 295127668Sbms struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); 296127668Sbms 297127668Sbms if (dsp->dsa_pending_op != PENDING_NONE) { 298127668Sbms if (dump_bytes(dsp, dsp->dsa_drr, 299127668Sbms sizeof (dmu_replay_record_t)) != 0) 300127668Sbms return (SET_ERROR(EINTR)); 301127668Sbms dsp->dsa_pending_op = PENDING_NONE; 302127668Sbms } 303127668Sbms 304127668Sbms /* write a SPILL record */ 305127668Sbms bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 306127668Sbms dsp->dsa_drr->drr_type = DRR_SPILL; 
307127668Sbms drrs->drr_object = object; 308127668Sbms drrs->drr_length = blksz; 309127668Sbms drrs->drr_toguid = dsp->dsa_toguid; 310127668Sbms 31175115Sfenner if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) 312127668Sbms return (SET_ERROR(EINTR)); 313127668Sbms if (dump_bytes(dsp, data, blksz)) 31475115Sfenner return (SET_ERROR(EINTR)); 31517680Spst return (0); 316127668Sbms} 317127668Sbms 31817680Spststatic int 31917680Spstdump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) 320127668Sbms{ 32117680Spst struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); 32217680Spst 32317680Spst /* See comment in dump_free(). */ 32417680Spst if (!dsp->dsa_incremental) 32517680Spst return (0); 32617680Spst 32717680Spst /* 32817680Spst * If there is a pending op, but it's not PENDING_FREEOBJECTS, 32917680Spst * push it out, since free block aggregation can only be done for 33017680Spst * blocks of the same type (i.e., DRR_FREE records can only be 33117680Spst * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 33217680Spst * can only be aggregated with other DRR_FREEOBJECTS records. 33317680Spst */ 334127668Sbms if (dsp->dsa_pending_op != PENDING_NONE && 335127668Sbms dsp->dsa_pending_op != PENDING_FREEOBJECTS) { 336127668Sbms if (dump_bytes(dsp, dsp->dsa_drr, 337127668Sbms sizeof (dmu_replay_record_t)) != 0) 338127668Sbms return (SET_ERROR(EINTR)); 339127668Sbms dsp->dsa_pending_op = PENDING_NONE; 34017680Spst } 34117680Spst if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { 34217680Spst /* 34317680Spst * See whether this free object array can be aggregated 34417680Spst * with pending one 34517680Spst */ 34617680Spst if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 34717680Spst drrfo->drr_numobjs += numobjs; 34817680Spst return (0); 34917680Spst } else { 35017680Spst /* can't be aggregated. 
Push out pending record */ 35117680Spst if (dump_bytes(dsp, dsp->dsa_drr, 35217680Spst sizeof (dmu_replay_record_t)) != 0) 35317680Spst return (SET_ERROR(EINTR)); 35417680Spst dsp->dsa_pending_op = PENDING_NONE; 35517680Spst } 35617680Spst } 35717680Spst 35817680Spst /* write a FREEOBJECTS record */ 35917680Spst bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 36017680Spst dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 36117680Spst drrfo->drr_firstobj = firstobj; 36217680Spst drrfo->drr_numobjs = numobjs; 363127668Sbms drrfo->drr_toguid = dsp->dsa_toguid; 36417680Spst 36517680Spst dsp->dsa_pending_op = PENDING_FREEOBJECTS; 36617680Spst 367127668Sbms return (0); 36817680Spst} 36917680Spst 37017680Spststatic int 37117680Spstdump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 372127668Sbms{ 37317680Spst struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 37417680Spst 375127668Sbms if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 37617680Spst return (dump_freeobjects(dsp, object, 1)); 377127668Sbms 37817680Spst if (dsp->dsa_pending_op != PENDING_NONE) { 37917680Spst if (dump_bytes(dsp, dsp->dsa_drr, 38017680Spst sizeof (dmu_replay_record_t)) != 0) 38117680Spst return (SET_ERROR(EINTR)); 38217680Spst dsp->dsa_pending_op = PENDING_NONE; 38317680Spst } 384127668Sbms 38517680Spst /* write an OBJECT record */ 386127668Sbms bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 387127668Sbms dsp->dsa_drr->drr_type = DRR_OBJECT; 388127668Sbms drro->drr_object = object; 38917680Spst drro->drr_type = dnp->dn_type; 39017680Spst drro->drr_bonustype = dnp->dn_bonustype; 39117680Spst drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 39217680Spst drro->drr_bonuslen = dnp->dn_bonuslen; 393127668Sbms drro->drr_checksumtype = dnp->dn_checksum; 39417680Spst drro->drr_compress = dnp->dn_compress; 39517680Spst drro->drr_toguid = dsp->dsa_toguid; 396127668Sbms 39717680Spst if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 398127668Sbms 
drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) 399127668Sbms drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; 400127668Sbms 401127668Sbms if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 402127668Sbms return (SET_ERROR(EINTR)); 40317680Spst 40417680Spst if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) 405127668Sbms return (SET_ERROR(EINTR)); 40617680Spst 407127668Sbms /* Free anything past the end of the file. */ 408127668Sbms if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 409127668Sbms (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) 410127668Sbms return (SET_ERROR(EINTR)); 411127668Sbms if (dsp->dsa_err != 0) 412127668Sbms return (SET_ERROR(EINTR)); 41317680Spst return (0); 41417680Spst} 415127668Sbms 41617680Spststatic boolean_t 41717680Spstbackup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) 41817680Spst{ 41917680Spst if (!BP_IS_EMBEDDED(bp)) 42017680Spst return (B_FALSE); 42117680Spst 42217680Spst /* 42317680Spst * Compression function must be legacy, or explicitly enabled. 424127668Sbms */ 42517680Spst if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && 42617680Spst !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) 42717680Spst return (B_FALSE); 42817680Spst 42917680Spst /* 430127668Sbms * Embed type must be explicitly enabled. 
431127668Sbms */ 432127668Sbms switch (BPE_GET_ETYPE(bp)) { 433127668Sbms case BP_EMBEDDED_TYPE_DATA: 434127668Sbms if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) 435127668Sbms return (B_TRUE); 43617680Spst break; 43717680Spst default: 438127668Sbms return (B_FALSE); 43917680Spst } 440127668Sbms return (B_FALSE); 44117680Spst} 44217680Spst 44317680Spst#define BP_SPAN(dnp, level) \ 44417680Spst (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 44517680Spst (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 44617680Spst 447127668Sbms/* ARGSUSED */ 44817680Spststatic int 44917680Spstbackup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 45017680Spst const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 45117680Spst{ 452127668Sbms dmu_sendarg_t *dsp = arg; 45317680Spst dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 45417680Spst int err = 0; 45517680Spst 45617680Spst if (issig(JUSTLOOKING) && issig(FORREAL)) 457127668Sbms return (SET_ERROR(EINTR)); 45817680Spst 45917680Spst if (zb->zb_object != DMU_META_DNODE_OBJECT && 46017680Spst DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 46117680Spst return (0); 46217680Spst } else if (zb->zb_level == ZB_ZIL_LEVEL) { 46317680Spst /* 464127668Sbms * If we are sending a non-snapshot (which is allowed on 46517680Spst * read-only pools), it may have a ZIL, which must be ignored. 
46617680Spst */ 46717680Spst return (0); 46817680Spst } else if (BP_IS_HOLE(bp) && 46917680Spst zb->zb_object == DMU_META_DNODE_OBJECT) { 47017680Spst uint64_t span = BP_SPAN(dnp, zb->zb_level); 47117680Spst uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 47217680Spst err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); 473127668Sbms } else if (BP_IS_HOLE(bp)) { 47417680Spst uint64_t span = BP_SPAN(dnp, zb->zb_level); 47517680Spst err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); 47617680Spst } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 47717680Spst return (0); 478127668Sbms } else if (type == DMU_OT_DNODE) { 47917680Spst dnode_phys_t *blk; 48017680Spst int i; 48117680Spst int blksz = BP_GET_LSIZE(bp); 48217680Spst arc_flags_t aflags = ARC_FLAG_WAIT; 48317680Spst arc_buf_t *abuf; 48417680Spst 485127668Sbms if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 48617680Spst ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 48717680Spst &aflags, zb) != 0) 48817680Spst return (SET_ERROR(EIO)); 48917680Spst 49017680Spst blk = abuf->b_data; 49117680Spst for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 49217680Spst uint64_t dnobj = (zb->zb_blkid << 49317680Spst (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 49417680Spst err = dump_dnode(dsp, dnobj, blk+i); 49517680Spst if (err != 0) 49617680Spst break; 49717680Spst } 49817680Spst (void) arc_buf_remove_ref(abuf, &abuf); 499127668Sbms } else if (type == DMU_OT_SA) { 50017680Spst arc_flags_t aflags = ARC_FLAG_WAIT; 50117680Spst arc_buf_t *abuf; 50217680Spst int blksz = BP_GET_LSIZE(bp); 50317680Spst 504127668Sbms if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 50517680Spst ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 50617680Spst &aflags, zb) != 0) 50717680Spst return (SET_ERROR(EIO)); 50817680Spst 50917680Spst err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); 51017680Spst (void) arc_buf_remove_ref(abuf, &abuf); 511127668Sbms } else if (backup_do_embed(dsp, bp)) { 51217680Spst /* it's an embedded 
level-0 block of a regular object */ 51317680Spst int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 51417680Spst err = dump_write_embedded(dsp, zb->zb_object, 51517680Spst zb->zb_blkid * blksz, blksz, bp); 51617680Spst } else { /* it's a level-0 block of a regular object */ 51717680Spst arc_flags_t aflags = ARC_FLAG_WAIT; 518127668Sbms arc_buf_t *abuf; 51917680Spst int blksz = BP_GET_LSIZE(bp); 52017680Spst uint64_t offset; 52117680Spst 52217680Spst ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); 52317680Spst ASSERT0(zb->zb_level); 52417680Spst if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 52517680Spst ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 52617680Spst &aflags, zb) != 0) { 52717680Spst if (zfs_send_corrupt_data) { 52817680Spst /* Send a block filled with 0x"zfs badd bloc" */ 52917680Spst abuf = arc_buf_alloc(spa, blksz, &abuf, 53017680Spst ARC_BUFC_DATA); 53117680Spst uint64_t *ptr; 53217680Spst for (ptr = abuf->b_data; 53317680Spst (char *)ptr < (char *)abuf->b_data + blksz; 53417680Spst ptr++) 535127668Sbms *ptr = 0x2f5baddb10c; 53617680Spst } else { 53717680Spst return (SET_ERROR(EIO)); 53817680Spst } 53917680Spst } 54017680Spst 54117680Spst offset = zb->zb_blkid * blksz; 54217680Spst 54317680Spst if (!(dsp->dsa_featureflags & 54417680Spst DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 545127668Sbms blksz > SPA_OLD_MAXBLOCKSIZE) { 54617680Spst char *buf = abuf->b_data; 54717680Spst while (blksz > 0 && err == 0) { 54817680Spst int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); 54917680Spst err = dump_write(dsp, type, zb->zb_object, 55017680Spst offset, n, NULL, buf); 55117680Spst offset += n; 55217680Spst buf += n; 55317680Spst blksz -= n; 55417680Spst } 55517680Spst } else { 556127668Sbms err = dump_write(dsp, type, zb->zb_object, 55717680Spst offset, blksz, bp, abuf->b_data); 55817680Spst } 55917680Spst (void) arc_buf_remove_ref(abuf, &abuf); 56017680Spst } 561127668Sbms 56217680Spst ASSERT(err == 0 || err == EINTR); 56317680Spst return (err); 
56417680Spst} 56517680Spst 566127668Sbms/* 56717680Spst * Releases dp using the specified tag. 56817680Spst */ 56917680Spststatic int 57017680Spstdmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, 57117680Spst zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, 57217680Spst#ifdef illumos 573127668Sbms boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) 57417680Spst#else 57517680Spst boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off) 576127668Sbms#endif 57717680Spst{ 578127668Sbms objset_t *os; 57917680Spst dmu_replay_record_t *drr; 58017680Spst dmu_sendarg_t *dsp; 58117680Spst int err; 58217680Spst uint64_t fromtxg = 0; 583127668Sbms uint64_t featureflags = 0; 584127668Sbms 58517680Spst err = dmu_objset_from_ds(ds, &os); 58617680Spst if (err != 0) { 587127668Sbms dsl_pool_rele(dp, tag); 58817680Spst return (err); 589127668Sbms } 590127668Sbms 591127668Sbms drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 59217680Spst drr->drr_type = DRR_BEGIN; 593127668Sbms drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 594127668Sbms DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 59517680Spst DMU_SUBSTREAM); 59617680Spst 597127668Sbms#ifdef _KERNEL 59817680Spst if (dmu_objset_type(os) == DMU_OST_ZFS) { 59917680Spst uint64_t version; 600127668Sbms if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { 60117680Spst kmem_free(drr, sizeof (dmu_replay_record_t)); 602127668Sbms dsl_pool_rele(dp, tag); 603127668Sbms return (SET_ERROR(EINVAL)); 604127668Sbms } 605127668Sbms if (version >= ZPL_VERSION_SA) { 606127668Sbms featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; 60717680Spst } 60817680Spst } 60917680Spst#endif 61017680Spst 61117680Spst if (large_block_ok && ds->ds_large_blocks) 61217680Spst featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; 61317680Spst if (embedok && 61417680Spst spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { 61517680Spst featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; 
61617680Spst if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 617127668Sbms featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; 61817680Spst } else { 61917680Spst embedok = B_FALSE; 62017680Spst } 62117680Spst 62217680Spst DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, 62317680Spst featureflags); 624127668Sbms 62517680Spst drr->drr_u.drr_begin.drr_creation_time = 626127668Sbms dsl_dataset_phys(ds)->ds_creation_time; 627127668Sbms drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); 628127668Sbms if (is_clone) 62917680Spst drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 63017680Spst drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; 631127668Sbms if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) 632127668Sbms drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 633127668Sbms 634127668Sbms if (fromzb != NULL) { 635127668Sbms drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; 636127668Sbms fromtxg = fromzb->zbm_creation_txg; 637127668Sbms } 638127668Sbms dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 639127668Sbms if (!dsl_dataset_is_snapshot(ds)) { 640127668Sbms (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", 64117680Spst sizeof (drr->drr_u.drr_begin.drr_toname)); 642127668Sbms } 643127668Sbms 644127668Sbms dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 645127668Sbms 646127668Sbms dsp->dsa_drr = drr; 64717680Spst dsp->dsa_outfd = outfd; 64817680Spst dsp->dsa_proc = curproc; 64917680Spst dsp->dsa_td = curthread; 65017680Spst dsp->dsa_fp = fp; 65117680Spst dsp->dsa_os = os; 65217680Spst dsp->dsa_off = off; 653127668Sbms dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; 654127668Sbms ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); 655127668Sbms dsp->dsa_pending_op = PENDING_NONE; 65617680Spst dsp->dsa_incremental = (fromzb != NULL); 65717680Spst dsp->dsa_featureflags = featureflags; 658127668Sbms 659127668Sbms mutex_enter(&ds->ds_sendstream_lock); 660127668Sbms 
list_insert_head(&ds->ds_sendstreams, dsp); 661127668Sbms mutex_exit(&ds->ds_sendstream_lock); 662127668Sbms 663127668Sbms dsl_dataset_long_hold(ds, FTAG); 664127668Sbms dsl_pool_rele(dp, tag); 665127668Sbms 666127668Sbms if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 667127668Sbms err = dsp->dsa_err; 66817680Spst goto out; 669127668Sbms } 670127668Sbms 671127668Sbms err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 67217680Spst backup_cb, dsp); 673127668Sbms 674127668Sbms if (dsp->dsa_pending_op != PENDING_NONE) 675127668Sbms if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) 676127668Sbms err = SET_ERROR(EINTR); 677127668Sbms 67817680Spst if (err != 0) { 67917680Spst if (err == EINTR && dsp->dsa_err != 0) 68017680Spst err = dsp->dsa_err; 681127668Sbms goto out; 682127668Sbms } 683127668Sbms 684127668Sbms bzero(drr, sizeof (dmu_replay_record_t)); 685127668Sbms drr->drr_type = DRR_END; 686127668Sbms drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 68717680Spst drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 688127668Sbms 689127668Sbms if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 690127668Sbms err = dsp->dsa_err; 691127668Sbms goto out; 692127668Sbms } 693127668Sbms 694127668Sbmsout: 695127668Sbms mutex_enter(&ds->ds_sendstream_lock); 696127668Sbms list_remove(&ds->ds_sendstreams, dsp); 697127668Sbms mutex_exit(&ds->ds_sendstream_lock); 69817680Spst 699127668Sbms kmem_free(drr, sizeof (dmu_replay_record_t)); 700127668Sbms kmem_free(dsp, sizeof (dmu_sendarg_t)); 70117680Spst 702127668Sbms dsl_dataset_long_rele(ds, FTAG); 703127668Sbms 704127668Sbms return (err); 70517680Spst} 70617680Spst 70717680Spstint 70817680Spstdmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, 709127668Sbms boolean_t embedok, boolean_t large_block_ok, 710127668Sbms#ifdef illumos 711127668Sbms int outfd, vnode_t *vp, offset_t *off) 712127668Sbms#else 713127668Sbms int outfd, struct file *fp, offset_t *off) 
714127668Sbms#endif 715127668Sbms{ 716127668Sbms dsl_pool_t *dp; 717127668Sbms dsl_dataset_t *ds; 718127668Sbms dsl_dataset_t *fromds = NULL; 719127668Sbms int err; 720127668Sbms 72117680Spst err = dsl_pool_hold(pool, FTAG, &dp); 72217680Spst if (err != 0) 72317680Spst return (err); 72417680Spst 72517680Spst err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); 72617680Spst if (err != 0) { 72717680Spst dsl_pool_rele(dp, FTAG); 72817680Spst return (err); 72917680Spst } 730127668Sbms 731127668Sbms if (fromsnap != 0) { 73217680Spst zfs_bookmark_phys_t zb; 733127668Sbms boolean_t is_clone; 734127668Sbms 73517680Spst err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); 73617680Spst if (err != 0) { 737127668Sbms dsl_dataset_rele(ds, FTAG); 738127668Sbms dsl_pool_rele(dp, FTAG); 739127668Sbms return (err); 740127668Sbms } 741127668Sbms if (!dsl_dataset_is_before(ds, fromds, 0)) 742127668Sbms err = SET_ERROR(EXDEV); 743127668Sbms zb.zbm_creation_time = 744127668Sbms dsl_dataset_phys(fromds)->ds_creation_time; 745127668Sbms zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; 746127668Sbms zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 747127668Sbms is_clone = (fromds->ds_dir != ds->ds_dir); 748127668Sbms dsl_dataset_rele(fromds, FTAG); 749127668Sbms err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 750127668Sbms embedok, large_block_ok, outfd, fp, off); 751127668Sbms } else { 752127668Sbms err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 753127668Sbms embedok, large_block_ok, outfd, fp, off); 754127668Sbms } 755127668Sbms dsl_dataset_rele(ds, FTAG); 756127668Sbms return (err); 757127668Sbms} 758127668Sbms 759127668Sbmsint 760127668Sbmsdmu_send(const char *tosnap, const char *fromsnap, 761127668Sbms boolean_t embedok, boolean_t large_block_ok, 762127668Sbms#ifdef illumos 763127668Sbms int outfd, vnode_t *vp, offset_t *off) 764127668Sbms#else 765127668Sbms int outfd, struct file *fp, offset_t *off) 766127668Sbms#endif 767127668Sbms{ 768127668Sbms dsl_pool_t 
*dp; 769127668Sbms dsl_dataset_t *ds; 770127668Sbms int err; 771127668Sbms boolean_t owned = B_FALSE; 772127668Sbms 77317680Spst if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) 774127668Sbms return (SET_ERROR(EINVAL)); 775127668Sbms 776127668Sbms err = dsl_pool_hold(tosnap, FTAG, &dp); 777127668Sbms if (err != 0) 778127668Sbms return (err); 779127668Sbms 780127668Sbms if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { 781127668Sbms /* 782127668Sbms * We are sending a filesystem or volume. Ensure 783127668Sbms * that it doesn't change by owning the dataset. 784127668Sbms */ 785127668Sbms err = dsl_dataset_own(dp, tosnap, FTAG, &ds); 786127668Sbms owned = B_TRUE; 787127668Sbms } else { 788127668Sbms err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); 789127668Sbms } 790127668Sbms if (err != 0) { 791127668Sbms dsl_pool_rele(dp, FTAG); 792127668Sbms return (err); 793127668Sbms } 794127668Sbms 795127668Sbms if (fromsnap != NULL) { 796127668Sbms zfs_bookmark_phys_t zb; 797127668Sbms boolean_t is_clone = B_FALSE; 798127668Sbms int fsnamelen = strchr(tosnap, '@') - tosnap; 799127668Sbms 800127668Sbms /* 801127668Sbms * If the fromsnap is in a different filesystem, then 802127668Sbms * mark the send stream as a clone. 
803127668Sbms */ 804127668Sbms if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || 805127668Sbms (fromsnap[fsnamelen] != '@' && 806127668Sbms fromsnap[fsnamelen] != '#')) { 807127668Sbms is_clone = B_TRUE; 808127668Sbms } 809127668Sbms 810127668Sbms if (strchr(fromsnap, '@')) { 811127668Sbms dsl_dataset_t *fromds; 812127668Sbms err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); 813127668Sbms if (err == 0) { 814127668Sbms if (!dsl_dataset_is_before(ds, fromds, 0)) 815127668Sbms err = SET_ERROR(EXDEV); 816127668Sbms zb.zbm_creation_time = 817127668Sbms dsl_dataset_phys(fromds)->ds_creation_time; 818127668Sbms zb.zbm_creation_txg = 819127668Sbms dsl_dataset_phys(fromds)->ds_creation_txg; 820127668Sbms zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 821127668Sbms is_clone = (ds->ds_dir != fromds->ds_dir); 822127668Sbms dsl_dataset_rele(fromds, FTAG); 823127668Sbms } 824127668Sbms } else { 825127668Sbms err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); 826127668Sbms } 827127668Sbms if (err != 0) { 828127668Sbms dsl_dataset_rele(ds, FTAG); 829127668Sbms dsl_pool_rele(dp, FTAG); 830127668Sbms return (err); 831127668Sbms } 832127668Sbms err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 833127668Sbms embedok, large_block_ok, outfd, fp, off); 834127668Sbms } else { 835127668Sbms err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 836127668Sbms embedok, large_block_ok, outfd, fp, off); 837127668Sbms } 838127668Sbms if (owned) 839127668Sbms dsl_dataset_disown(ds, FTAG); 840127668Sbms else 841127668Sbms dsl_dataset_rele(ds, FTAG); 842127668Sbms return (err); 843127668Sbms} 844127668Sbms 845127668Sbmsint 846127668Sbmsdmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) 847127668Sbms{ 848127668Sbms dsl_pool_t *dp = ds->ds_dir->dd_pool; 849127668Sbms int err; 850127668Sbms uint64_t size; 851127668Sbms 852127668Sbms ASSERT(dsl_pool_config_held(dp)); 853127668Sbms 854127668Sbms /* tosnap must be a snapshot */ 855127668Sbms if (!dsl_dataset_is_snapshot(ds)) 
return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
		return (SET_ERROR(EXDEV));

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
	} else {
		/* used and comp are required out-parameters; only size is kept */
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err != 0)
			return (err);
	}

	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data. We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize. Assume ditto blocks and
	 * internal fragmentation counter out compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
	uint64_t recordsize;
	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
	if (err != 0)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}

/*
 * Argument bundle shared by dmu_recv_begin_check() and
 * dmu_recv_begin_sync().
 */
typedef struct dmu_recv_begin_arg {
	const char *drba_origin;	/* origin name, or NULL */
	dmu_recv_cookie_t *drba_cookie;	/* caller's receive cookie */
	cred_t *drba_cred;		/* credentials for limit checks */
	uint64_t drba_snapobj;		/* set by the check, used by the sync */
} dmu_recv_begin_arg_t;

/*
 * Check whether a stream can be received into the existing dataset ds.
 *
 * fromguid is the guid of the incremental source snapshot, or 0 for a full
 * stream.  On success drba->drba_snapobj is set to the object number of the
 * snapshot the temporary clone should be based on.
 *
 * Returns 0, or: EBUSY if the temporary receive clone already exists,
 * EEXIST if the target snapshot name already exists, ENODEV if no suitable
 * source snapshot is found, ETXTBSY if ds was modified since the source
 * snapshot and the receive is not forced, or an error from the limit check
 * or ZAP lookups.
 */
static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
	uint64_t val;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	/*
	 * Check snapshot limit before receiving. We'll recheck again at the
	 * end, but might as well abort before receiving if we're already over
	 * the limit.
	 *
	 * Note that we do not check the file system limit with
	 * dsl_dir_fscount_check because the temporary %clones don't count
	 * against that limit.
	 */
	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
	    NULL, drba->drba_cred);
	if (error != 0)
		return (error);

	if (fromguid != 0) {
		dsl_dataset_t *snap;
		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;

		/* Find snapshot in this dir that matches fromguid. */
		while (obj != 0) {
			error = dsl_dataset_hold_obj(dp, obj, FTAG,
			    &snap);
			if (error != 0)
				return (SET_ERROR(ENODEV));
			/* walked out of this dataset's dir: no match here */
			if (snap->ds_dir != ds->ds_dir) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ENODEV));
			}
			/* on a match, leave the loop with snap still held */
			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
				break;
			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
			dsl_dataset_rele(snap, FTAG);
		}
		if (obj == 0)
			return (SET_ERROR(ENODEV));

		if (drba->drba_cookie->drc_force) {
			drba->drba_snapobj = obj;
		} else {
			/*
			 * If we are not forcing, there must be no
			 * changes since fromsnap.
			 */
			if (dsl_dataset_modified_since_snap(ds, snap)) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ETXTBSY));
			}
			drba->drba_snapobj = ds->ds_prev->ds_object;
		}

		dsl_dataset_rele(snap, FTAG);
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= TXG_INITIAL)
			return (SET_ERROR(ENODEV));
		drba->drba_snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	}

	return (0);

}

/*
 * Check half of the receive-begin sync task (registered through
 * dsl_sync_task() in dmu_recv_begin()).  arg is a dmu_recv_begin_arg_t.
 *
 * Validates the stream header against the pool's version and enabled
 * features, then checks the target: if it already exists the stream is
 * validated by recv_begin_check_existing_impl(); if it does not, the parent
 * dataset and (for clones) the origin are checked instead.
 *
 * Returns 0 if the receive may proceed, otherwise an errno value.
 */
static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	int error;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
	 * record to a plain WRITE record, so the pool must have the
	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
	 * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate large blocks
	 * to smaller ones, so the pool must have the LARGE_BLOCKS
	 * feature enabled if the stream has LARGE_BLOCKS.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SET_ERROR(ENOTSUP));

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = recv_begin_check_existing_impl(drba, ds, fromguid);
		dsl_dataset_rele(ds, FTAG);
	} else if (error == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char buf[MAXNAMELEN];

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
			return (SET_ERROR(ENOENT));

		/* Open the parent of tofs */
		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
		/* copy tofs up to, but not including, its last '/' */
		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
		if (error != 0)
			return (error);

		/*
		 * Check filesystem and snapshot limits before receiving. We'll
		 * recheck snapshot limits again at the end (we create the
		 * filesystems and increment those counts during begin_sync).
		 */
		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		if (drba->drba_origin != NULL) {
			dsl_dataset_t *origin;
			error = dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			/* the origin must be a snapshot ... */
			if (!dsl_dataset_is_snapshot(origin)) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}
			/* ... whose guid matches the stream's fromguid */
			if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(ENODEV));
			}
dsl_dataset_rele(origin, FTAG); 1113127668Sbms } 1114127668Sbms dsl_dataset_rele(ds, FTAG); 1115127668Sbms error = 0; 1116127668Sbms } 1117127668Sbms return (error); 1118127668Sbms} 1119127668Sbms 1120127668Sbmsstatic void 1121127668Sbmsdmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 1122127668Sbms{ 1123127668Sbms dmu_recv_begin_arg_t *drba = arg; 1124127668Sbms dsl_pool_t *dp = dmu_tx_pool(tx); 1125127668Sbms struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1126127668Sbms const char *tofs = drba->drba_cookie->drc_tofs; 1127127668Sbms dsl_dataset_t *ds, *newds; 1128127668Sbms uint64_t dsobj; 1129127668Sbms int error; 1130127668Sbms uint64_t crflags; 1131127668Sbms 1132127668Sbms crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 1133127668Sbms DS_FLAG_CI_DATASET : 0; 1134127668Sbms 1135127668Sbms error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1136127668Sbms if (error == 0) { 1137127668Sbms /* create temporary clone */ 1138127668Sbms dsl_dataset_t *snap = NULL; 1139127668Sbms if (drba->drba_snapobj != 0) { 1140127668Sbms VERIFY0(dsl_dataset_hold_obj(dp, 1141127668Sbms drba->drba_snapobj, FTAG, &snap)); 1142127668Sbms } 1143127668Sbms dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, 1144127668Sbms snap, crflags, drba->drba_cred, tx); 1145127668Sbms dsl_dataset_rele(snap, FTAG); 1146127668Sbms dsl_dataset_rele(ds, FTAG); 1147127668Sbms } else { 1148127668Sbms dsl_dir_t *dd; 1149127668Sbms const char *tail; 1150127668Sbms dsl_dataset_t *origin = NULL; 1151127668Sbms 1152127668Sbms VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); 1153127668Sbms 1154127668Sbms if (drba->drba_origin != NULL) { 1155127668Sbms VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, 1156127668Sbms FTAG, &origin)); 1157127668Sbms } 1158127668Sbms 1159127668Sbms /* Create new dataset. 
*/ 1160127668Sbms dsobj = dsl_dataset_create_sync(dd, 1161127668Sbms strrchr(tofs, '/') + 1, 1162127668Sbms origin, crflags, drba->drba_cred, tx); 1163127668Sbms if (origin != NULL) 116417680Spst dsl_dataset_rele(origin, FTAG); 116517680Spst dsl_dir_rele(dd, FTAG); 116617680Spst drba->drba_cookie->drc_newfs = B_TRUE; 116717680Spst } 116817680Spst VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); 116917680Spst 117017680Spst if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & 117117680Spst DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 117217680Spst !newds->ds_large_blocks) { 117317680Spst dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); 1174127668Sbms newds->ds_large_blocks = B_TRUE; 1175127668Sbms } 1176127668Sbms 117717680Spst dmu_buf_will_dirty(newds->ds_dbuf, tx); 117817680Spst dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; 117917680Spst 118017680Spst /* 118117680Spst * If we actually created a non-clone, we need to create the 1182127668Sbms * objset in our new dataset. 1183127668Sbms */ 1184127668Sbms if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { 1185127668Sbms (void) dmu_objset_create_impl(dp->dp_spa, 1186127668Sbms newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); 1187127668Sbms } 1188127668Sbms 1189127668Sbms drba->drba_cookie->drc_ds = newds; 1190127668Sbms 1191127668Sbms spa_history_log_internal_ds(newds, "receive", tx, ""); 119217680Spst} 1193127668Sbms 1194127668Sbms/* 1195127668Sbms * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 119617680Spst * succeeds; otherwise we will leak the holds on the datasets. 
119717680Spst */ 1198127668Sbmsint 1199127668Sbmsdmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, 1200127668Sbms boolean_t force, char *origin, dmu_recv_cookie_t *drc) 1201127668Sbms{ 1202127668Sbms dmu_recv_begin_arg_t drba = { 0 }; 1203127668Sbms dmu_replay_record_t *drr; 1204127668Sbms 1205127668Sbms bzero(drc, sizeof (dmu_recv_cookie_t)); 1206127668Sbms drc->drc_drrb = drrb; 1207127668Sbms drc->drc_tosnap = tosnap; 1208127668Sbms drc->drc_tofs = tofs; 1209127668Sbms drc->drc_force = force; 1210127668Sbms drc->drc_cred = CRED(); 1211127668Sbms 1212127668Sbms if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1213127668Sbms drc->drc_byteswap = B_TRUE; 1214127668Sbms else if (drrb->drr_magic != DMU_BACKUP_MAGIC) 1215127668Sbms return (SET_ERROR(EINVAL)); 1216127668Sbms 1217127668Sbms drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1218127668Sbms drr->drr_type = DRR_BEGIN; 1219127668Sbms drr->drr_u.drr_begin = *drc->drc_drrb; 1220127668Sbms if (drc->drc_byteswap) { 1221127668Sbms fletcher_4_incremental_byteswap(drr, 1222127668Sbms sizeof (dmu_replay_record_t), &drc->drc_cksum); 1223127668Sbms } else { 1224127668Sbms fletcher_4_incremental_native(drr, 1225127668Sbms sizeof (dmu_replay_record_t), &drc->drc_cksum); 1226127668Sbms } 1227127668Sbms kmem_free(drr, sizeof (dmu_replay_record_t)); 1228127668Sbms 1229127668Sbms if (drc->drc_byteswap) { 1230127668Sbms drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1231127668Sbms drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1232127668Sbms drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1233127668Sbms drrb->drr_type = BSWAP_32(drrb->drr_type); 1234127668Sbms drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1235127668Sbms drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1236127668Sbms } 1237127668Sbms 1238127668Sbms drba.drba_origin = origin; 1239127668Sbms drba.drba_cookie = drc; 1240127668Sbms drba.drba_cred = CRED(); 1241127668Sbms 1242127668Sbms return (dsl_sync_task(tofs, 
dmu_recv_begin_check, dmu_recv_begin_sync, 1243127668Sbms &drba, 5, ZFS_SPACE_CHECK_NORMAL)); 1244127668Sbms} 1245127668Sbms 1246127668Sbmsstruct restorearg { 1247127668Sbms int err; 1248127668Sbms boolean_t byteswap; 124917680Spst kthread_t *td; 125017680Spst struct file *fp; 1251127668Sbms char *buf; 1252127668Sbms uint64_t voff; 1253127668Sbms int bufsize; /* amount of memory allocated for buf */ 125417680Spst zio_cksum_t cksum; 125517680Spst avl_tree_t *guid_to_ds_map; 125617680Spst}; 125717680Spst 1258127668Sbmstypedef struct guid_map_entry { 1259127668Sbms uint64_t guid; 1260127668Sbms dsl_dataset_t *gme_ds; 1261127668Sbms avl_node_t avlnode; 1262127668Sbms} guid_map_entry_t; 1263127668Sbms 1264127668Sbmsstatic int 1265127668Sbmsguid_compare(const void *arg1, const void *arg2) 1266127668Sbms{ 1267127668Sbms const guid_map_entry_t *gmep1 = arg1; 1268127668Sbms const guid_map_entry_t *gmep2 = arg2; 1269127668Sbms 1270127668Sbms if (gmep1->guid < gmep2->guid) 1271127668Sbms return (-1); 1272127668Sbms else if (gmep1->guid > gmep2->guid) 1273127668Sbms return (1); 127417680Spst return (0); 127517680Spst} 127617680Spst 127717680Spststatic void 127817680Spstfree_guid_map_onexit(void *arg) 127917680Spst{ 128017680Spst avl_tree_t *ca = arg; 128117680Spst void *cookie = NULL; 128217680Spst guid_map_entry_t *gmep; 128317680Spst 1284127668Sbms while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 1285127668Sbms dsl_dataset_long_rele(gmep->gme_ds, gmep); 1286127668Sbms dsl_dataset_rele(gmep->gme_ds, gmep); 1287127668Sbms kmem_free(gmep, sizeof (guid_map_entry_t)); 128817680Spst } 128917680Spst avl_destroy(ca); 129017680Spst kmem_free(ca, sizeof (avl_tree_t)); 1291127668Sbms} 1292127668Sbms 1293127668Sbmsstatic int 129417680Spstrestore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) 1295127668Sbms{ 1296127668Sbms struct uio auio; 1297127668Sbms struct iovec aiov; 129817680Spst int error; 1299127668Sbms 1300127668Sbms aiov.iov_base = buf; 
aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = off;
	auio.uio_td = ra->td;
#ifdef _KERNEL
	error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
#else
	/* no userland implementation: report the failure and bail out */
	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
	error = EOPNOTSUPP;
#endif
	*resid = auio.uio_resid;
	return (error);
}

/*
 * Read exactly len bytes of the stream into buf (or into ra->buf when buf
 * is NULL), advancing ra->voff and folding the data into ra->cksum.
 *
 * len must be a multiple of 8.  Returns the destination buffer on success.
 * On failure returns NULL with the error in ra->err; a read that transfers
 * no bytes at all is reported as EINVAL.
 */
static void *
restore_read(struct restorearg *ra, int len, char *buf)
{
	int done = 0;

	if (buf == NULL)
		buf = ra->buf;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT0(len % 8);
	ASSERT3U(len, <=, ra->bufsize);

	while (done < len) {
		ssize_t resid;

		ra->err = restore_bytes(ra, buf + done,
		    len - done, ra->voff, &resid);

		/* nothing was transferred: treat as a truncated stream */
		if (resid == len - done)
			ra->err = SET_ERROR(EINVAL);
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err != 0)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
	else
		fletcher_4_incremental_native(buf, len, &ra->cksum);
	return (buf);
}

/*
 * Byteswap a replay record in place: the common drr_type and
 * drr_payloadlen fields first, then the per-type fields listed below.
 * Record types without a case are left otherwise untouched.
 */
static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	/* drr_type was swapped above, so the switch sees the swapped value */
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_WRITE_EMBEDDED:
		DO64(drr_write_embedded.drr_object);
		DO64(drr_write_embedded.drr_offset);
		DO64(drr_write_embedded.drr_length);
		DO64(drr_write_embedded.drr_toguid);
		DO32(drr_write_embedded.drr_lsize);
		DO32(drr_write_embedded.drr_psize);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}

/*
 * Compute the number of block pointers implied by a bonus buffer: always 1
 * for DMU_OT_SA bonus, otherwise 1 plus however many blkptr-sized slots
 * (1 << SPA_BLKPTRSHIFT bytes each) remain unused out of DN_MAX_BONUSLEN.
 */
static inline uint8_t
deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
{
	if (bonus_type == DMU_OT_SA) {
		return (1);
	} else {
		return (1 +
		    ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
	}
}

1451127668Sbmsstatic int 1452127668Sbmsrestore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) 1453127668Sbms{ 1454127668Sbms dmu_object_info_t doi; 1455127668Sbms dmu_tx_t *tx; 1456127668Sbms void *data = NULL; 1457127668Sbms uint64_t object; 1458127668Sbms int err; 1459127668Sbms 1460127668Sbms if (drro->drr_type == DMU_OT_NONE || 1461127668Sbms !DMU_OT_IS_VALID(drro->drr_type) || 1462127668Sbms !DMU_OT_IS_VALID(drro->drr_bonustype) || 1463127668Sbms drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1464127668Sbms drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1465127668Sbms P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1466127668Sbms drro->drr_blksz < SPA_MINBLOCKSIZE || 1467127668Sbms drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) || 146817680Spst drro->drr_bonuslen > DN_MAX_BONUSLEN) { 1469127668Sbms return (SET_ERROR(EINVAL)); 1470127668Sbms } 1471127668Sbms 1472127668Sbms err = dmu_object_info(os, drro->drr_object, &doi); 147317680Spst 1474127668Sbms if (err != 0 && err != ENOENT) 1475127668Sbms return (SET_ERROR(EINVAL)); 1476127668Sbms object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; 1477127668Sbms 1478127668Sbms if (drro->drr_bonuslen) { 1479127668Sbms data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL); 1480127668Sbms if (ra->err != 0) 1481127668Sbms return (ra->err); 1482127668Sbms } 1483127668Sbms 1484127668Sbms /* 1485127668Sbms * If we are losing blkptrs or changing the block size this must 1486127668Sbms * be a new file instance. We must clear out the previous file 1487127668Sbms * contents before we can change this type of metadata in the dnode. 
1488127668Sbms */ 1489127668Sbms if (err == 0) { 1490127668Sbms int nblkptr; 1491127668Sbms 1492127668Sbms nblkptr = deduce_nblkptr(drro->drr_bonustype, 1493127668Sbms drro->drr_bonuslen); 1494127668Sbms 1495127668Sbms if (drro->drr_blksz != doi.doi_data_block_size || 1496127668Sbms nblkptr < doi.doi_nblkptr) { 1497127668Sbms err = dmu_free_long_range(os, drro->drr_object, 1498127668Sbms 0, DMU_OBJECT_END); 1499127668Sbms if (err != 0) 1500127668Sbms return (SET_ERROR(EINVAL)); 1501127668Sbms } 1502127668Sbms } 1503127668Sbms 1504127668Sbms tx = dmu_tx_create(os); 1505127668Sbms dmu_tx_hold_bonus(tx, object); 1506127668Sbms err = dmu_tx_assign(tx, TXG_WAIT); 1507127668Sbms if (err != 0) { 1508127668Sbms dmu_tx_abort(tx); 1509127668Sbms return (err); 1510127668Sbms } 1511127668Sbms 1512127668Sbms if (object == DMU_NEW_OBJECT) { 1513127668Sbms /* currently free, want to be allocated */ 1514127668Sbms err = dmu_object_claim(os, drro->drr_object, 1515127668Sbms drro->drr_type, drro->drr_blksz, 151617680Spst drro->drr_bonustype, drro->drr_bonuslen, tx); 151717680Spst } else if (drro->drr_type != doi.doi_type || 151817680Spst drro->drr_blksz != doi.doi_data_block_size || 1519127668Sbms drro->drr_bonustype != doi.doi_bonus_type || 1520127668Sbms drro->drr_bonuslen != doi.doi_bonus_size) { 1521127668Sbms /* currently allocated, but with different properties */ 1522127668Sbms err = dmu_object_reclaim(os, drro->drr_object, 1523127668Sbms drro->drr_type, drro->drr_blksz, 1524127668Sbms drro->drr_bonustype, drro->drr_bonuslen, tx); 1525127668Sbms } 1526127668Sbms if (err != 0) { 1527127668Sbms dmu_tx_commit(tx); 1528127668Sbms return (SET_ERROR(EINVAL)); 1529127668Sbms } 1530127668Sbms 1531127668Sbms dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, 1532127668Sbms tx); 153317680Spst dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); 153417680Spst 1535127668Sbms if (data != NULL) { 153617680Spst dmu_buf_t *db; 1537127668Sbms 
1538127668Sbms VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); 153917680Spst dmu_buf_will_dirty(db, tx); 1540127668Sbms 1541127668Sbms ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 154217680Spst bcopy(data, db->db_data, drro->drr_bonuslen); 154317680Spst if (ra->byteswap) { 1544127668Sbms dmu_object_byteswap_t byteswap = 1545127668Sbms DMU_OT_BYTESWAP(drro->drr_bonustype); 1546127668Sbms dmu_ot_byteswap[byteswap].ob_func(db->db_data, 1547127668Sbms drro->drr_bonuslen); 1548127668Sbms } 1549127668Sbms dmu_buf_rele(db, FTAG); 1550127668Sbms } 1551127668Sbms dmu_tx_commit(tx); 1552127668Sbms return (0); 1553127668Sbms} 1554127668Sbms 1555127668Sbms/* ARGSUSED */ 1556127668Sbmsstatic int 1557127668Sbmsrestore_freeobjects(struct restorearg *ra, objset_t *os, 1558127668Sbms struct drr_freeobjects *drrfo) 1559127668Sbms{ 1560127668Sbms uint64_t obj; 1561127668Sbms 1562127668Sbms if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1563127668Sbms return (SET_ERROR(EINVAL)); 1564127668Sbms 1565127668Sbms for (obj = drrfo->drr_firstobj; 1566127668Sbms obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1567127668Sbms (void) dmu_object_next(os, &obj, FALSE, 0)) { 1568127668Sbms int err; 1569127668Sbms 1570127668Sbms if (dmu_object_info(os, obj, NULL) != 0) 1571127668Sbms continue; 1572127668Sbms 1573127668Sbms err = dmu_free_long_object(os, obj); 1574127668Sbms if (err != 0) 1575127668Sbms return (err); 1576127668Sbms } 1577127668Sbms return (0); 1578127668Sbms} 1579127668Sbms 1580127668Sbmsstatic int 1581127668Sbmsrestore_write(struct restorearg *ra, objset_t *os, 1582127668Sbms struct drr_write *drrw) 1583127668Sbms{ 1584127668Sbms dmu_tx_t *tx; 1585127668Sbms void *data; 1586127668Sbms int err; 1587127668Sbms 158817680Spst if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 158917680Spst !DMU_OT_IS_VALID(drrw->drr_type)) 159017680Spst return (SET_ERROR(EINVAL)); 1591127668Sbms 1592127668Sbms if (dmu_object_info(os, drrw->drr_object, 
NULL) != 0) 1593127668Sbms return (SET_ERROR(EINVAL)); 1594127668Sbms 159517680Spst dmu_buf_t *bonus; 159617680Spst if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0) 159717680Spst return (SET_ERROR(EINVAL)); 159817680Spst 159917680Spst arc_buf_t *abuf = dmu_request_arcbuf(bonus, drrw->drr_length); 1600127668Sbms 1601127668Sbms data = restore_read(ra, drrw->drr_length, abuf->b_data); 1602127668Sbms if (data == NULL) { 1603127668Sbms dmu_return_arcbuf(abuf); 1604127668Sbms dmu_buf_rele(bonus, FTAG); 1605127668Sbms return (ra->err); 1606127668Sbms } 1607127668Sbms 1608127668Sbms tx = dmu_tx_create(os); 1609127668Sbms 1610127668Sbms dmu_tx_hold_write(tx, drrw->drr_object, 1611127668Sbms drrw->drr_offset, drrw->drr_length); 1612127668Sbms err = dmu_tx_assign(tx, TXG_WAIT); 1613127668Sbms if (err != 0) { 1614127668Sbms dmu_return_arcbuf(abuf); 1615127668Sbms dmu_buf_rele(bonus, FTAG); 1616127668Sbms dmu_tx_abort(tx); 1617127668Sbms return (err); 161817680Spst } 161917680Spst if (ra->byteswap) { 162017680Spst dmu_object_byteswap_t byteswap = 162117680Spst DMU_OT_BYTESWAP(drrw->drr_type); 1622127668Sbms dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); 162317680Spst } 162417680Spst dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); 162517680Spst dmu_tx_commit(tx); 162617680Spst dmu_buf_rele(bonus, FTAG); 1627127668Sbms return (0); 162817680Spst} 162917680Spst 163017680Spst/* 163117680Spst * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 1632127668Sbms * streams to refer to a copy of the data that is already on the 163317680Spst * system because it came in earlier in the stream. This function 1634127668Sbms * finds the earlier copy of the data, and uses that copy instead of 1635127668Sbms * data from the stream to fulfill this write. 
163617680Spst */ 1637127668Sbmsstatic int 1638127668Sbmsrestore_write_byref(struct restorearg *ra, objset_t *os, 163917680Spst struct drr_write_byref *drrwbr) 164017680Spst{ 1641127668Sbms dmu_tx_t *tx; 1642127668Sbms int err; 1643127668Sbms guid_map_entry_t gmesrch; 1644127668Sbms guid_map_entry_t *gmep; 1645127668Sbms avl_index_t where; 1646127668Sbms objset_t *ref_os = NULL; 1647127668Sbms dmu_buf_t *dbp; 1648127668Sbms 1649127668Sbms if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1650127668Sbms return (SET_ERROR(EINVAL)); 1651127668Sbms 165256893Sfenner /* 165356893Sfenner * If the GUID of the referenced dataset is different from the 165456893Sfenner * GUID of the target dataset, find the referenced dataset. 165556893Sfenner */ 1656127668Sbms if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1657127668Sbms gmesrch.guid = drrwbr->drr_refguid; 1658127668Sbms if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1659127668Sbms &where)) == NULL) { 1660127668Sbms return (SET_ERROR(EINVAL)); 1661127668Sbms } 1662127668Sbms if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1663127668Sbms return (SET_ERROR(EINVAL)); 1664127668Sbms } else { 1665127668Sbms ref_os = os; 1666127668Sbms } 1667127668Sbms 1668127668Sbms err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1669127668Sbms drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); 167056893Sfenner if (err != 0) 167156893Sfenner return (err); 1672127668Sbms 167356893Sfenner tx = dmu_tx_create(os); 1674127668Sbms 1675127668Sbms dmu_tx_hold_write(tx, drrwbr->drr_object, 167656893Sfenner drrwbr->drr_offset, drrwbr->drr_length); 1677127668Sbms err = dmu_tx_assign(tx, TXG_WAIT); 1678127668Sbms if (err != 0) { 167956893Sfenner dmu_tx_abort(tx); 1680127668Sbms return (err); 1681127668Sbms } 1682127668Sbms dmu_write(os, drrwbr->drr_object, 1683127668Sbms drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1684127668Sbms dmu_buf_rele(dbp, FTAG); 1685127668Sbms dmu_tx_commit(tx); 1686127668Sbms return (0); 
1687127668Sbms} 1688127668Sbms 1689127668Sbmsstatic int 1690127668Sbmsrestore_write_embedded(struct restorearg *ra, objset_t *os, 1691127668Sbms struct drr_write_embedded *drrwnp) 1692127668Sbms{ 1693127668Sbms dmu_tx_t *tx; 1694127668Sbms int err; 1695127668Sbms void *data; 1696127668Sbms 1697127668Sbms if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) 1698127668Sbms return (EINVAL); 1699127668Sbms 1700127668Sbms if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) 1701127668Sbms return (EINVAL); 1702127668Sbms 1703127668Sbms if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) 1704127668Sbms return (EINVAL); 1705127668Sbms if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) 1706127668Sbms return (EINVAL); 1707127668Sbms 170856893Sfenner data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL); 1709127668Sbms if (data == NULL) 1710127668Sbms return (ra->err); 1711127668Sbms 171217680Spst tx = dmu_tx_create(os); 1713127668Sbms 1714127668Sbms dmu_tx_hold_write(tx, drrwnp->drr_object, 1715127668Sbms drrwnp->drr_offset, drrwnp->drr_length); 1716127668Sbms err = dmu_tx_assign(tx, TXG_WAIT); 1717127668Sbms if (err != 0) { 1718127668Sbms dmu_tx_abort(tx); 1719127668Sbms return (err); 1720127668Sbms } 1721127668Sbms 172217680Spst dmu_write_embedded(os, drrwnp->drr_object, 1723127668Sbms drrwnp->drr_offset, data, drrwnp->drr_etype, 1724127668Sbms drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, 172526180Sfenner ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); 1726127668Sbms 1727127668Sbms dmu_tx_commit(tx); 172826180Sfenner return (0); 1729127668Sbms} 173056893Sfenner 173156893Sfennerstatic int 1732127668Sbmsrestore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) 1733127668Sbms{ 1734127668Sbms dmu_tx_t *tx; 1735127668Sbms void *data; 1736127668Sbms dmu_buf_t *db, *db_spill; 1737127668Sbms int err; 1738127668Sbms 1739127668Sbms if (drrs->drr_length < SPA_MINBLOCKSIZE || 1740127668Sbms drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os))) 
1741127668Sbms return (SET_ERROR(EINVAL)); 1742127668Sbms 1743127668Sbms data = restore_read(ra, drrs->drr_length, NULL); 1744127668Sbms if (data == NULL) 1745127668Sbms return (ra->err); 1746127668Sbms 1747127668Sbms if (dmu_object_info(os, drrs->drr_object, NULL) != 0) 1748127668Sbms return (SET_ERROR(EINVAL)); 1749127668Sbms 1750127668Sbms VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); 1751127668Sbms if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1752127668Sbms dmu_buf_rele(db, FTAG); 1753127668Sbms return (err); 1754127668Sbms } 1755127668Sbms 1756127668Sbms tx = dmu_tx_create(os); 1757127668Sbms 1758127668Sbms dmu_tx_hold_spill(tx, db->db_object); 1759127668Sbms 1760127668Sbms err = dmu_tx_assign(tx, TXG_WAIT); 1761127668Sbms if (err != 0) { 1762127668Sbms dmu_buf_rele(db, FTAG); 1763127668Sbms dmu_buf_rele(db_spill, FTAG); 1764127668Sbms dmu_tx_abort(tx); 1765127668Sbms return (err); 1766127668Sbms } 1767127668Sbms dmu_buf_will_dirty(db_spill, tx); 1768127668Sbms 1769127668Sbms if (db_spill->db_size < drrs->drr_length) 1770127668Sbms VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1771127668Sbms drrs->drr_length, tx)); 1772127668Sbms bcopy(data, db_spill->db_data, drrs->drr_length); 1773127668Sbms 1774127668Sbms dmu_buf_rele(db, FTAG); 1775127668Sbms dmu_buf_rele(db_spill, FTAG); 1776127668Sbms 1777127668Sbms dmu_tx_commit(tx); 1778127668Sbms return (0); 1779127668Sbms} 1780127668Sbms 1781127668Sbms/* ARGSUSED */ 1782127668Sbmsstatic int 1783127668Sbmsrestore_free(struct restorearg *ra, objset_t *os, 1784127668Sbms struct drr_free *drrf) 1785127668Sbms{ 1786127668Sbms int err; 1787127668Sbms 1788127668Sbms if (drrf->drr_length != -1ULL && 1789127668Sbms drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 1790127668Sbms return (SET_ERROR(EINVAL)); 1791127668Sbms 1792127668Sbms if (dmu_object_info(os, drrf->drr_object, NULL) != 0) 1793127668Sbms return (SET_ERROR(EINVAL)); 1794127668Sbms 1795127668Sbms err = 
dmu_free_long_range(os, drrf->drr_object, 1796127668Sbms drrf->drr_offset, drrf->drr_length); 1797127668Sbms return (err); 1798127668Sbms} 1799127668Sbms 1800127668Sbms/* used to destroy the drc_ds on error */ 1801127668Sbmsstatic void 1802127668Sbmsdmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) 1803127668Sbms{ 1804127668Sbms char name[MAXNAMELEN]; 1805127668Sbms dsl_dataset_name(drc->drc_ds, name); 1806127668Sbms dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 1807127668Sbms (void) dsl_destroy_head(name); 1808127668Sbms} 1809127668Sbms 1810127668Sbms/* 1811127668Sbms * NB: callers *must* call dmu_recv_end() if this succeeds. 1812127668Sbms */ 1813127668Sbmsint 1814127668Sbmsdmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, 1815127668Sbms int cleanup_fd, uint64_t *action_handlep) 1816127668Sbms{ 1817127668Sbms struct restorearg ra = { 0 }; 1818127668Sbms dmu_replay_record_t *drr; 1819127668Sbms objset_t *os; 1820127668Sbms zio_cksum_t pcksum; 1821127668Sbms int featureflags; 1822127668Sbms 1823127668Sbms ra.byteswap = drc->drc_byteswap; 1824127668Sbms ra.cksum = drc->drc_cksum; 1825127668Sbms ra.td = curthread; 1826127668Sbms ra.fp = fp; 1827127668Sbms ra.voff = *voffp; 1828127668Sbms ra.bufsize = SPA_MAXBLOCKSIZE; 1829127668Sbms ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); 1830127668Sbms 1831127668Sbms /* these were verified in dmu_recv_begin */ 1832127668Sbms ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, 1833127668Sbms DMU_SUBSTREAM); 1834127668Sbms ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); 1835127668Sbms 1836127668Sbms /* 1837127668Sbms * Open the objset we are modifying. 
1838127668Sbms */ 1839127668Sbms VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); 1840127668Sbms 1841127668Sbms ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); 1842127668Sbms 1843127668Sbms featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); 1844127668Sbms 1845127668Sbms /* if this stream is dedup'ed, set up the avl tree for guid mapping */ 1846127668Sbms if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { 1847127668Sbms minor_t minor; 184826180Sfenner 1849127668Sbms if (cleanup_fd == -1) { 1850127668Sbms ra.err = SET_ERROR(EBADF); 1851127668Sbms goto out; 1852127668Sbms } 1853127668Sbms ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); 1854127668Sbms if (ra.err != 0) { 1855127668Sbms cleanup_fd = -1; 1856127668Sbms goto out; 1857127668Sbms } 1858127668Sbms 1859127668Sbms if (*action_handlep == 0) { 186026180Sfenner ra.guid_to_ds_map = 186126180Sfenner kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 1862127668Sbms avl_create(ra.guid_to_ds_map, guid_compare, 1863127668Sbms sizeof (guid_map_entry_t), 186426180Sfenner offsetof(guid_map_entry_t, avlnode)); 1865127668Sbms ra.err = zfs_onexit_add_cb(minor, 1866127668Sbms free_guid_map_onexit, ra.guid_to_ds_map, 1867127668Sbms action_handlep); 1868127668Sbms if (ra.err != 0) 1869127668Sbms goto out; 1870127668Sbms } else { 1871127668Sbms ra.err = zfs_onexit_cb_data(minor, *action_handlep, 1872127668Sbms (void **)&ra.guid_to_ds_map); 1873127668Sbms if (ra.err != 0) 1874127668Sbms goto out; 1875127668Sbms } 1876127668Sbms 1877127668Sbms drc->drc_guid_to_ds_map = ra.guid_to_ds_map; 1878127668Sbms } 1879127668Sbms 1880127668Sbms /* 1881127668Sbms * Read records and process them. 
1882127668Sbms */ 1883127668Sbms pcksum = ra.cksum; 1884127668Sbms while (ra.err == 0 && 1885127668Sbms NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) { 1886127668Sbms if (issig(JUSTLOOKING) && issig(FORREAL)) { 1887127668Sbms ra.err = SET_ERROR(EINTR); 1888127668Sbms goto out; 1889127668Sbms } 1890127668Sbms 1891127668Sbms if (ra.byteswap) 1892127668Sbms backup_byteswap(drr); 1893127668Sbms 1894127668Sbms switch (drr->drr_type) { 1895127668Sbms case DRR_OBJECT: 1896127668Sbms { 1897127668Sbms /* 1898127668Sbms * We need to make a copy of the record header, 1899127668Sbms * because restore_{object,write} may need to 1900127668Sbms * restore_read(), which will invalidate drr. 190126180Sfenner */ 190226180Sfenner struct drr_object drro = drr->drr_u.drr_object; 1903127668Sbms ra.err = restore_object(&ra, os, &drro); 1904127668Sbms break; 1905127668Sbms } 1906127668Sbms case DRR_FREEOBJECTS: 1907127668Sbms { 1908127668Sbms struct drr_freeobjects drrfo = 1909127668Sbms drr->drr_u.drr_freeobjects; 1910127668Sbms ra.err = restore_freeobjects(&ra, os, &drrfo); 1911127668Sbms break; 1912127668Sbms } 1913127668Sbms case DRR_WRITE: 191417680Spst { 1915127668Sbms struct drr_write drrw = drr->drr_u.drr_write; 1916127668Sbms ra.err = restore_write(&ra, os, &drrw); 1917127668Sbms break; 1918127668Sbms } 1919127668Sbms case DRR_WRITE_BYREF: 1920127668Sbms { 1921127668Sbms struct drr_write_byref drrwbr = 1922127668Sbms drr->drr_u.drr_write_byref; 1923127668Sbms ra.err = restore_write_byref(&ra, os, &drrwbr); 1924127668Sbms break; 1925127668Sbms } 1926127668Sbms case DRR_WRITE_EMBEDDED: 1927127668Sbms { 1928127668Sbms struct drr_write_embedded drrwe = 1929127668Sbms drr->drr_u.drr_write_embedded; 1930127668Sbms ra.err = restore_write_embedded(&ra, os, &drrwe); 1931127668Sbms break; 1932127668Sbms } 1933127668Sbms case DRR_FREE: 1934127668Sbms { 1935127668Sbms struct drr_free drrf = drr->drr_u.drr_free; 1936127668Sbms ra.err = restore_free(&ra, os, &drrf); 1937127668Sbms 
break; 1938127668Sbms } 1939127668Sbms case DRR_END: 1940127668Sbms { 1941127668Sbms struct drr_end drre = drr->drr_u.drr_end; 1942127668Sbms /* 1943127668Sbms * We compare against the *previous* checksum 194417680Spst * value, because the stored checksum is of 1945127668Sbms * everything before the DRR_END record. 1946127668Sbms */ 1947127668Sbms if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) 1948127668Sbms ra.err = SET_ERROR(ECKSUM); 1949127668Sbms goto out; 1950127668Sbms } 1951127668Sbms case DRR_SPILL: 1952127668Sbms { 195317680Spst struct drr_spill drrs = drr->drr_u.drr_spill; 1954127668Sbms ra.err = restore_spill(&ra, os, &drrs); 1955127668Sbms break; 195617680Spst } 1957127668Sbms default: 1958127668Sbms ra.err = SET_ERROR(EINVAL); 1959127668Sbms goto out; 1960127668Sbms } 1961127668Sbms pcksum = ra.cksum; 1962127668Sbms } 1963127668Sbms ASSERT(ra.err != 0); 1964127668Sbms 1965127668Sbmsout: 1966127668Sbms if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) 1967127668Sbms zfs_onexit_fd_rele(cleanup_fd); 1968127668Sbms 1969127668Sbms if (ra.err != 0) { 1970127668Sbms /* 1971127668Sbms * destroy what we created, so we don't leave it in the 1972127668Sbms * inconsistent restoring state. 
197317680Spst */ 1974127668Sbms dmu_recv_cleanup_ds(drc); 1975127668Sbms } 1976127668Sbms 1977127668Sbms kmem_free(ra.buf, ra.bufsize); 1978127668Sbms *voffp = ra.voff; 1979127668Sbms return (ra.err); 198026180Sfenner} 1981127668Sbms 1982127668Sbmsstatic int 1983127668Sbmsdmu_recv_end_check(void *arg, dmu_tx_t *tx) 1984127668Sbms{ 1985127668Sbms dmu_recv_cookie_t *drc = arg; 1986127668Sbms dsl_pool_t *dp = dmu_tx_pool(tx); 1987127668Sbms int error; 1988127668Sbms 1989127668Sbms ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); 1990127668Sbms 1991127668Sbms if (!drc->drc_newfs) { 1992127668Sbms dsl_dataset_t *origin_head; 1993127668Sbms 1994127668Sbms error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); 1995127668Sbms if (error != 0) 1996127668Sbms return (error); 1997127668Sbms if (drc->drc_force) { 199856893Sfenner /* 1999127668Sbms * We will destroy any snapshots in tofs (i.e. before 2000127668Sbms * origin_head) that are after the origin (which is 2001127668Sbms * the snap before drc_ds, because drc_ds can not 2002127668Sbms * have any snaps of its own). 
200356893Sfenner */ 2004127668Sbms uint64_t obj; 2005127668Sbms 200656893Sfenner obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; 2007127668Sbms while (obj != 2008127668Sbms dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { 2009127668Sbms dsl_dataset_t *snap; 2010127668Sbms error = dsl_dataset_hold_obj(dp, obj, FTAG, 2011127668Sbms &snap); 2012127668Sbms if (error != 0) 2013127668Sbms return (error); 2014127668Sbms if (snap->ds_dir != origin_head->ds_dir) 2015127668Sbms error = SET_ERROR(EINVAL); 2016127668Sbms if (error == 0) { 2017127668Sbms error = dsl_destroy_snapshot_check_impl( 201817680Spst snap, B_FALSE); 2019127668Sbms } 2020127668Sbms obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; 2021127668Sbms dsl_dataset_rele(snap, FTAG); 2022127668Sbms if (error != 0) 2023127668Sbms return (error); 2024127668Sbms } 2025127668Sbms } 2026127668Sbms error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, 2027127668Sbms origin_head, drc->drc_force, drc->drc_owner, tx); 2028127668Sbms if (error != 0) { 2029127668Sbms dsl_dataset_rele(origin_head, FTAG); 2030127668Sbms return (error); 2031127668Sbms } 2032127668Sbms error = dsl_dataset_snapshot_check_impl(origin_head, 2033127668Sbms drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 2034127668Sbms dsl_dataset_rele(origin_head, FTAG); 2035127668Sbms if (error != 0) 2036127668Sbms return (error); 2037127668Sbms 2038127668Sbms error = dsl_destroy_head_check_impl(drc->drc_ds, 1); 2039127668Sbms } else { 2040127668Sbms error = dsl_dataset_snapshot_check_impl(drc->drc_ds, 2041127668Sbms drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 2042127668Sbms } 2043127668Sbms return (error); 2044127668Sbms} 2045127668Sbms 2046127668Sbmsstatic void 204726180Sfennerdmu_recv_end_sync(void *arg, dmu_tx_t *tx) 204817680Spst{ 2049127668Sbms dmu_recv_cookie_t *drc = arg; 2050127668Sbms dsl_pool_t *dp = dmu_tx_pool(tx); 205117680Spst 2052127668Sbms spa_history_log_internal_ds(drc->drc_ds, "finish receiving", 205317680Spst tx, "snap=%s", 
drc->drc_tosnap); 2054127668Sbms 2055127668Sbms if (!drc->drc_newfs) { 2056127668Sbms dsl_dataset_t *origin_head; 2057127668Sbms 205856893Sfenner VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, 2059127668Sbms &origin_head)); 206056893Sfenner 206156893Sfenner if (drc->drc_force) { 206226180Sfenner /* 206326180Sfenner * Destroy any snapshots of drc_tofs (origin_head) 206456893Sfenner * after the origin (the snap before drc_ds). 206556893Sfenner */ 206656893Sfenner uint64_t obj; 206756893Sfenner 206826180Sfenner obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; 206956893Sfenner while (obj != 207056893Sfenner dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { 207117680Spst dsl_dataset_t *snap; 207217680Spst VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, 2073127668Sbms &snap)); 2074127668Sbms ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); 2075127668Sbms obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; 2076127668Sbms dsl_destroy_snapshot_sync_impl(snap, 2077127668Sbms B_FALSE, tx); 2078127668Sbms dsl_dataset_rele(snap, FTAG); 2079127668Sbms } 2080127668Sbms } 2081127668Sbms VERIFY3P(drc->drc_ds->ds_prev, ==, 2082127668Sbms origin_head->ds_prev); 2083127668Sbms 2084127668Sbms dsl_dataset_clone_swap_sync_impl(drc->drc_ds, 2085127668Sbms origin_head, tx); 2086127668Sbms dsl_dataset_snapshot_sync_impl(origin_head, 2087127668Sbms drc->drc_tosnap, tx); 2088127668Sbms 2089127668Sbms /* set snapshot's creation time and guid */ 2090127668Sbms dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); 2091127668Sbms dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = 2092127668Sbms drc->drc_drrb->drr_creation_time; 2093127668Sbms dsl_dataset_phys(origin_head->ds_prev)->ds_guid = 2094127668Sbms drc->drc_drrb->drr_toguid; 2095127668Sbms dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= 2096127668Sbms ~DS_FLAG_INCONSISTENT; 2097127668Sbms 2098127668Sbms dmu_buf_will_dirty(origin_head->ds_dbuf, tx); 2099127668Sbms dsl_dataset_phys(origin_head)->ds_flags &= 2100127668Sbms 
~DS_FLAG_INCONSISTENT; 2101127668Sbms 2102127668Sbms dsl_dataset_rele(origin_head, FTAG); 2103127668Sbms dsl_destroy_head_sync_impl(drc->drc_ds, tx); 2104127668Sbms 2105127668Sbms if (drc->drc_owner != NULL) 2106127668Sbms VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); 2107127668Sbms } else { 2108127668Sbms dsl_dataset_t *ds = drc->drc_ds; 2109127668Sbms 2110127668Sbms dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); 2111127668Sbms 2112127668Sbms /* set snapshot's creation time and guid */ 2113127668Sbms dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2114127668Sbms dsl_dataset_phys(ds->ds_prev)->ds_creation_time = 2115127668Sbms drc->drc_drrb->drr_creation_time; 2116127668Sbms dsl_dataset_phys(ds->ds_prev)->ds_guid = 2117127668Sbms drc->drc_drrb->drr_toguid; 2118127668Sbms dsl_dataset_phys(ds->ds_prev)->ds_flags &= 2119127668Sbms ~DS_FLAG_INCONSISTENT; 2120127668Sbms 2121127668Sbms dmu_buf_will_dirty(ds->ds_dbuf, tx); 2122127668Sbms dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; 2123127668Sbms } 2124127668Sbms drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; 2125127668Sbms /* 2126127668Sbms * Release the hold from dmu_recv_begin. This must be done before 2127127668Sbms * we return to open context, so that when we free the dataset's dnode, 2128127668Sbms * we can evict its bonus buffer. 
2129127668Sbms */ 2130127668Sbms dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 2131127668Sbms drc->drc_ds = NULL; 2132127668Sbms} 2133127668Sbms 2134127668Sbmsstatic int 2135127668Sbmsadd_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) 2136127668Sbms{ 2137127668Sbms dsl_pool_t *dp; 2138127668Sbms dsl_dataset_t *snapds; 2139127668Sbms guid_map_entry_t *gmep; 2140127668Sbms int err; 2141127668Sbms 2142127668Sbms ASSERT(guid_map != NULL); 2143127668Sbms 2144127668Sbms err = dsl_pool_hold(name, FTAG, &dp); 2145127668Sbms if (err != 0) 2146127668Sbms return (err); 2147127668Sbms gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); 2148127668Sbms err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); 2149127668Sbms if (err == 0) { 215017680Spst gmep->guid = dsl_dataset_phys(snapds)->ds_guid; 2151127668Sbms gmep->gme_ds = snapds; 2152127668Sbms avl_add(guid_map, gmep); 2153127668Sbms dsl_dataset_long_hold(snapds, gmep); 2154127668Sbms } else 2155127668Sbms kmem_free(gmep, sizeof (*gmep)); 2156127668Sbms 2157127668Sbms dsl_pool_rele(dp, FTAG); 2158127668Sbms return (err); 2159127668Sbms} 2160127668Sbms 2161127668Sbmsstatic int dmu_recv_end_modified_blocks = 3; 2162127668Sbms 2163127668Sbmsstatic int 2164127668Sbmsdmu_recv_existing_end(dmu_recv_cookie_t *drc) 2165127668Sbms{ 2166127668Sbms int error; 2167127668Sbms char name[MAXNAMELEN]; 2168127668Sbms 2169127668Sbms#ifdef _KERNEL 2170127668Sbms /* 2171127668Sbms * We will be destroying the ds; make sure its origin is unmounted if 2172127668Sbms * necessary. 
2173127668Sbms */ 2174127668Sbms dsl_dataset_name(drc->drc_ds, name); 2175127668Sbms zfs_destroy_unmount_origin(name); 2176127668Sbms#endif 2177127668Sbms 2178127668Sbms error = dsl_sync_task(drc->drc_tofs, 2179127668Sbms dmu_recv_end_check, dmu_recv_end_sync, drc, 2180127668Sbms dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); 2181127668Sbms 2182127668Sbms if (error != 0) 2183127668Sbms dmu_recv_cleanup_ds(drc); 2184127668Sbms return (error); 2185127668Sbms} 2186127668Sbms 2187127668Sbmsstatic int 2188127668Sbmsdmu_recv_new_end(dmu_recv_cookie_t *drc) 2189127668Sbms{ 2190127668Sbms int error; 2191127668Sbms 2192127668Sbms error = dsl_sync_task(drc->drc_tofs, 2193127668Sbms dmu_recv_end_check, dmu_recv_end_sync, drc, 2194127668Sbms dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); 2195127668Sbms 2196127668Sbms if (error != 0) { 2197127668Sbms dmu_recv_cleanup_ds(drc); 2198127668Sbms } else if (drc->drc_guid_to_ds_map != NULL) { 2199127668Sbms (void) add_ds_to_guidmap(drc->drc_tofs, 2200127668Sbms drc->drc_guid_to_ds_map, 2201127668Sbms drc->drc_newsnapobj); 2202127668Sbms } 2203127668Sbms return (error); 2204127668Sbms} 2205127668Sbms 2206127668Sbmsint 2207127668Sbmsdmu_recv_end(dmu_recv_cookie_t *drc, void *owner) 2208127668Sbms{ 2209127668Sbms drc->drc_owner = owner; 2210127668Sbms 2211127668Sbms if (drc->drc_newfs) 2212127668Sbms return (dmu_recv_new_end(drc)); 2213127668Sbms else 2214127668Sbms return (dmu_recv_existing_end(drc)); 2215127668Sbms} 2216127668Sbms 2217127668Sbms/* 2218127668Sbms * Return TRUE if this objset is currently being received into. 2219127668Sbms */ 2220127668Sbmsboolean_t 2221127668Sbmsdmu_objset_is_receiving(objset_t *os) 2222127668Sbms{ 2223127668Sbms return (os->os_dsl_dataset != NULL && 2224127668Sbms os->os_dsl_dataset->ds_owner == dmu_recv_tag); 2225127668Sbms} 2226127668Sbms