dmu_send.c revision 321610
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23221263Smm * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24286708Smav * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 25264835Sdelphij * Copyright (c) 2014, Joyent, Inc. All rights reserved. 26235222Smm * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27272810Sdelphij * Copyright 2014 HybridCluster. All rights reserved. 28296516Smav * Copyright 2016 RackTop Systems. 29296519Smav * Copyright (c) 2014 Integros [integros.com] 30221263Smm */ 31168404Spjd 32168404Spjd#include <sys/dmu.h> 33168404Spjd#include <sys/dmu_impl.h> 34168404Spjd#include <sys/dmu_tx.h> 35168404Spjd#include <sys/dbuf.h> 36168404Spjd#include <sys/dnode.h> 37168404Spjd#include <sys/zfs_context.h> 38168404Spjd#include <sys/dmu_objset.h> 39168404Spjd#include <sys/dmu_traverse.h> 40168404Spjd#include <sys/dsl_dataset.h> 41168404Spjd#include <sys/dsl_dir.h> 42219089Spjd#include <sys/dsl_prop.h> 43168404Spjd#include <sys/dsl_pool.h> 44168404Spjd#include <sys/dsl_synctask.h> 45168404Spjd#include <sys/zfs_ioctl.h> 46168404Spjd#include <sys/zap.h> 47168404Spjd#include <sys/zio_checksum.h> 48219089Spjd#include <sys/zfs_znode.h> 49219089Spjd#include <zfs_fletcher.h> 50219089Spjd#include <sys/avl.h> 51219089Spjd#include <sys/ddt.h> 52219089Spjd#include <sys/zfs_onexit.h> 53248571Smm#include <sys/dmu_send.h> 54248571Smm#include <sys/dsl_destroy.h> 55268075Sdelphij#include <sys/blkptr.h> 56260183Sdelphij#include <sys/dsl_bookmark.h> 57268075Sdelphij#include <sys/zfeature.h> 58286705Smav#include <sys/bqueue.h> 59168404Spjd 60268075Sdelphij#ifdef __FreeBSD__ 61268075Sdelphij#undef dump_write 62268075Sdelphij#define dump_write dmu_dump_write 63268075Sdelphij#endif 64268075Sdelphij 65228103Smm/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 66228103Smmint zfs_send_corrupt_data = B_FALSE; 67286705Smavint zfs_send_queue_length = 16 * 1024 * 1024; 68286705Smavint zfs_recv_queue_length = 16 * 1024 * 1024; 69296516Smav/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ 70296516Smavint zfs_send_set_freerecords_bit = B_TRUE; 71228103Smm 72296516Smav#ifdef _KERNEL 73296516SmavTUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit); 74296516Smav#endif 75296516Smav 76185029Spjdstatic char *dmu_recv_tag = "dmu_recv_tag"; 77289362Smavconst char *recv_clone_name = "%recv"; 78185029Spjd 79286705Smav#define BP_SPAN(datablkszsec, indblkshift, level) \ 80286705Smav (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ 81286705Smav (level) * (indblkshift - SPA_BLKPTRSHIFT))) 82286705Smav 83289362Smavstatic void byteswap_record(dmu_replay_record_t *drr); 84289362Smav 85286705Smavstruct send_thread_arg { 86286705Smav bqueue_t q; 87286705Smav dsl_dataset_t *ds; /* Dataset to traverse */ 88286705Smav uint64_t fromtxg; /* Traverse from this txg */ 89286705Smav int flags; /* flags to pass to traverse_dataset */ 90286705Smav int error_code; 91286705Smav boolean_t cancel; 92289362Smav zbookmark_phys_t resume; 93286705Smav}; 94286705Smav 95286705Smavstruct send_block_record { 96286705Smav boolean_t eos_marker; /* Marks the end of the stream */ 97286705Smav blkptr_t bp; 98286705Smav zbookmark_phys_t zb; 99286705Smav uint8_t indblkshift; 100286705Smav uint16_t datablkszsec; 101286705Smav bqueue_node_t ln; 102286705Smav}; 103286705Smav 104168404Spjdstatic int 105235222Smmdump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 106168404Spjd{ 107289362Smav dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); 108168404Spjd struct uio auio; 109168404Spjd struct iovec aiov; 110297509Smav 111297509Smav /* 112297509Smav * The code does not rely on this (len being a multiple of 8). We keep 113297509Smav * this assertion because of the corresponding assertion in 114297509Smav * receive_read(). Keeping this assertion ensures that we do not 115297509Smav * inadvertently break backwards compatibility (causing the assertion 116297509Smav * in receive_read() to trigger on old software). 117297509Smav * 118297509Smav * Removing the assertions could be rolled into a new feature that uses 119297509Smav * data that isn't 8-byte aligned; if the assertions were removed, a 120297509Smav * feature flag would have to be added. 121297509Smav */ 122297509Smav 123240415Smm ASSERT0(len % 8); 124168404Spjd 125168404Spjd aiov.iov_base = buf; 126168404Spjd aiov.iov_len = len; 127168404Spjd auio.uio_iov = &aiov; 128168404Spjd auio.uio_iovcnt = 1; 129168404Spjd auio.uio_resid = len; 130169170Spjd auio.uio_segflg = UIO_SYSSPACE; 131168404Spjd auio.uio_rw = UIO_WRITE; 132168404Spjd auio.uio_offset = (off_t)-1; 133235222Smm auio.uio_td = dsp->dsa_td; 134168404Spjd#ifdef _KERNEL 135235222Smm if (dsp->dsa_fp->f_type == DTYPE_VNODE) 136168404Spjd bwillwrite(); 137235222Smm dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, 138235222Smm dsp->dsa_td); 139168404Spjd#else 140168404Spjd fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 141235222Smm dsp->dsa_err = EOPNOTSUPP; 142168404Spjd#endif 143235222Smm mutex_enter(&ds->ds_sendstream_lock); 144235222Smm *dsp->dsa_off += len; 145235222Smm mutex_exit(&ds->ds_sendstream_lock); 146235222Smm 147235222Smm return (dsp->dsa_err); 148168404Spjd} 149168404Spjd 150286587Smav/* 151286587Smav * For all record types except BEGIN, fill in the checksum (overlaid in 152286587Smav * drr_u.drr_checksum.drr_checksum). The checksum verifies everything 153286587Smav * up to the start of the checksum itself. 154286587Smav */ 155168404Spjdstatic int 156286587Smavdump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) 157286587Smav{ 158286587Smav ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 159286587Smav ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 160321610Smav (void) fletcher_4_incremental_native(dsp->dsa_drr, 161286587Smav offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 162286587Smav &dsp->dsa_zc); 163307284Smav if (dsp->dsa_drr->drr_type == DRR_BEGIN) { 164307284Smav dsp->dsa_sent_begin = B_TRUE; 165307284Smav } else { 166286587Smav ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. 167286587Smav drr_checksum.drr_checksum)); 168286587Smav dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; 169286587Smav } 170307284Smav if (dsp->dsa_drr->drr_type == DRR_END) { 171307284Smav dsp->dsa_sent_end = B_TRUE; 172307284Smav } 173321610Smav (void) fletcher_4_incremental_native(&dsp->dsa_drr-> 174286587Smav drr_u.drr_checksum.drr_checksum, 175286587Smav sizeof (zio_cksum_t), &dsp->dsa_zc); 176286587Smav if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 177286587Smav return (SET_ERROR(EINTR)); 178286587Smav if (payload_len != 0) { 179321610Smav (void) fletcher_4_incremental_native(payload, payload_len, 180286587Smav &dsp->dsa_zc); 181286587Smav if (dump_bytes(dsp, payload, payload_len) != 0) 182286587Smav return (SET_ERROR(EINTR)); 183286587Smav } 184286587Smav return (0); 185286587Smav} 186286587Smav 187294815Smav/* 188294815Smav * Fill in the drr_free struct, or perform aggregation if the previous record is 189294815Smav * also a free record, and the two are adjacent. 190294815Smav * 191294815Smav * Note that we send free records even for a full send, because we want to be 192294815Smav * able to receive a full send as a clone, which requires a list of all the free 193294815Smav * and freeobject records that were generated on the source. 194294815Smav */ 195286587Smavstatic int 196235222Smmdump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 197168404Spjd uint64_t length) 198168404Spjd{ 199235222Smm struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); 200219089Spjd 201253821Sdelphij /* 202253821Sdelphij * When we receive a free record, dbuf_free_range() assumes 203253821Sdelphij * that the receiving system doesn't have any dbufs in the range 204253821Sdelphij * being freed. This is always true because there is a one-record 205253821Sdelphij * constraint: we only send one WRITE record for any given 206289362Smav * object,offset. We know that the one-record constraint is 207253821Sdelphij * true because we always send data in increasing order by 208253821Sdelphij * object,offset. 209253821Sdelphij * 210253821Sdelphij * If the increasing-order constraint ever changes, we should find 211253821Sdelphij * another way to assert that the one-record constraint is still 212253821Sdelphij * satisfied. 213253821Sdelphij */ 214253821Sdelphij ASSERT(object > dsp->dsa_last_data_object || 215253821Sdelphij (object == dsp->dsa_last_data_object && 216253821Sdelphij offset > dsp->dsa_last_data_offset)); 217253821Sdelphij 218237458Smm if (length != -1ULL && offset + length < offset) 219237458Smm length = -1ULL; 220237458Smm 221219089Spjd /* 222219089Spjd * If there is a pending op, but it's not PENDING_FREE, push it out, 223219089Spjd * since free block aggregation can only be done for blocks of the 224219089Spjd * same type (i.e., DRR_FREE records can only be aggregated with 225219089Spjd * other DRR_FREE records. DRR_FREEOBJECTS records can only be 226219089Spjd * aggregated with other DRR_FREEOBJECTS records. 227219089Spjd */ 228235222Smm if (dsp->dsa_pending_op != PENDING_NONE && 229235222Smm dsp->dsa_pending_op != PENDING_FREE) { 230286587Smav if (dump_record(dsp, NULL, 0) != 0) 231249195Smm return (SET_ERROR(EINTR)); 232235222Smm dsp->dsa_pending_op = PENDING_NONE; 233219089Spjd } 234219089Spjd 235235222Smm if (dsp->dsa_pending_op == PENDING_FREE) { 236219089Spjd /* 237219089Spjd * There should never be a PENDING_FREE if length is -1 238219089Spjd * (because dump_dnode is the only place where this 239219089Spjd * function is called with a -1, and only after flushing 240219089Spjd * any pending record). 241219089Spjd */ 242219089Spjd ASSERT(length != -1ULL); 243219089Spjd /* 244219089Spjd * Check to see whether this free block can be aggregated 245219089Spjd * with pending one. 246219089Spjd */ 247219089Spjd if (drrf->drr_object == object && drrf->drr_offset + 248219089Spjd drrf->drr_length == offset) { 249219089Spjd drrf->drr_length += length; 250219089Spjd return (0); 251219089Spjd } else { 252219089Spjd /* not a continuation. Push out pending record */ 253286587Smav if (dump_record(dsp, NULL, 0) != 0) 254249195Smm return (SET_ERROR(EINTR)); 255235222Smm dsp->dsa_pending_op = PENDING_NONE; 256219089Spjd } 257219089Spjd } 258219089Spjd /* create a FREE record and make it pending */ 259235222Smm bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 260235222Smm dsp->dsa_drr->drr_type = DRR_FREE; 261219089Spjd drrf->drr_object = object; 262219089Spjd drrf->drr_offset = offset; 263219089Spjd drrf->drr_length = length; 264235222Smm drrf->drr_toguid = dsp->dsa_toguid; 265219089Spjd if (length == -1ULL) { 266286587Smav if (dump_record(dsp, NULL, 0) != 0) 267249195Smm return (SET_ERROR(EINTR)); 268219089Spjd } else { 269235222Smm dsp->dsa_pending_op = PENDING_FREE; 270219089Spjd } 271168404Spjd 272168404Spjd return (0); 273168404Spjd} 274168404Spjd 275168404Spjdstatic int 276268075Sdelphijdump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, 277321535Smav uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, 278321535Smav void *data) 279168404Spjd{ 280321535Smav uint64_t payload_size; 281235222Smm struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); 282219089Spjd 283253821Sdelphij /* 284253821Sdelphij * We send data in increasing object, offset order. 285253821Sdelphij * See comment in dump_free() for details. 286253821Sdelphij */ 287253821Sdelphij ASSERT(object > dsp->dsa_last_data_object || 288253821Sdelphij (object == dsp->dsa_last_data_object && 289253821Sdelphij offset > dsp->dsa_last_data_offset)); 290253821Sdelphij dsp->dsa_last_data_object = object; 291321535Smav dsp->dsa_last_data_offset = offset + lsize - 1; 292219089Spjd 293219089Spjd /* 294219089Spjd * If there is any kind of pending aggregation (currently either 295219089Spjd * a grouping of free objects or free blocks), push it out to 296219089Spjd * the stream, since aggregation can't be done across operations 297219089Spjd * of different types. 298219089Spjd */ 299235222Smm if (dsp->dsa_pending_op != PENDING_NONE) { 300286587Smav if (dump_record(dsp, NULL, 0) != 0) 301249195Smm return (SET_ERROR(EINTR)); 302235222Smm dsp->dsa_pending_op = PENDING_NONE; 303219089Spjd } 304286587Smav /* write a WRITE record */ 305235222Smm bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 306235222Smm dsp->dsa_drr->drr_type = DRR_WRITE; 307219089Spjd drrw->drr_object = object; 308219089Spjd drrw->drr_type = type; 309219089Spjd drrw->drr_offset = offset; 310235222Smm drrw->drr_toguid = dsp->dsa_toguid; 311321535Smav drrw->drr_logical_size = lsize; 312321535Smav 313321535Smav /* only set the compression fields if the buf is compressed */ 314321535Smav if (lsize != psize) { 315321535Smav ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); 316321535Smav ASSERT(!BP_IS_EMBEDDED(bp)); 317321535Smav ASSERT(!BP_SHOULD_BYTESWAP(bp)); 318321535Smav ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); 319321535Smav ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); 320321535Smav ASSERT3S(psize, >, 0); 321321535Smav ASSERT3S(lsize, >=, psize); 322321535Smav 323321535Smav drrw->drr_compressiontype = BP_GET_COMPRESS(bp); 324321535Smav drrw->drr_compressed_size = psize; 325321535Smav payload_size = drrw->drr_compressed_size; 326321535Smav } else { 327321535Smav payload_size = drrw->drr_logical_size; 328321535Smav } 329321535Smav 330274337Sdelphij if (bp == NULL || BP_IS_EMBEDDED(bp)) { 331268075Sdelphij /* 332274337Sdelphij * There's no pre-computed checksum for partial-block 333274337Sdelphij * writes or embedded BP's, so (like 334274337Sdelphij * fletcher4-checkummed blocks) userland will have to 335274337Sdelphij * compute a dedup-capable checksum itself. 336268075Sdelphij */ 337268075Sdelphij drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; 338268075Sdelphij } else { 339268075Sdelphij drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 340289422Smav if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & 341289422Smav ZCHECKSUM_FLAG_DEDUP) 342268075Sdelphij drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 343268075Sdelphij DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 344268075Sdelphij DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 345268075Sdelphij DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 346268075Sdelphij drrw->drr_key.ddk_cksum = bp->blk_cksum; 347268075Sdelphij } 348168404Spjd 349321535Smav if (dump_record(dsp, data, payload_size) != 0) 350249195Smm return (SET_ERROR(EINTR)); 351219089Spjd return (0); 352219089Spjd} 353219089Spjd 354219089Spjdstatic int 355268075Sdelphijdump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 356268075Sdelphij int blksz, const blkptr_t *bp) 357268075Sdelphij{ 358268075Sdelphij char buf[BPE_PAYLOAD_SIZE]; 359268075Sdelphij struct drr_write_embedded *drrw = 360268075Sdelphij &(dsp->dsa_drr->drr_u.drr_write_embedded); 361268075Sdelphij 362268075Sdelphij if (dsp->dsa_pending_op != PENDING_NONE) { 363286587Smav if (dump_record(dsp, NULL, 0) != 0) 364268075Sdelphij return (EINTR); 365268075Sdelphij dsp->dsa_pending_op = PENDING_NONE; 366268075Sdelphij } 367268075Sdelphij 368268075Sdelphij ASSERT(BP_IS_EMBEDDED(bp)); 369268075Sdelphij 370268075Sdelphij bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 371268075Sdelphij dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; 372268075Sdelphij drrw->drr_object = object; 373268075Sdelphij drrw->drr_offset = offset; 374268075Sdelphij drrw->drr_length = blksz; 375268075Sdelphij drrw->drr_toguid = dsp->dsa_toguid; 376268075Sdelphij drrw->drr_compression = BP_GET_COMPRESS(bp); 377268075Sdelphij drrw->drr_etype = BPE_GET_ETYPE(bp); 378268075Sdelphij drrw->drr_lsize = BPE_GET_LSIZE(bp); 379268075Sdelphij drrw->drr_psize = BPE_GET_PSIZE(bp); 380268075Sdelphij 381268075Sdelphij decode_embedded_bp_compressed(bp, buf); 382268075Sdelphij 383286587Smav if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) 384268075Sdelphij return (EINTR); 385268075Sdelphij return (0); 386268075Sdelphij} 387268075Sdelphij 388268075Sdelphijstatic int 389235222Smmdump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) 390219089Spjd{ 391235222Smm struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); 392219089Spjd 393235222Smm if (dsp->dsa_pending_op != PENDING_NONE) { 394286587Smav if (dump_record(dsp, NULL, 0) != 0) 395249195Smm return (SET_ERROR(EINTR)); 396235222Smm dsp->dsa_pending_op = PENDING_NONE; 397219089Spjd } 398219089Spjd 399219089Spjd /* write a SPILL record */ 400235222Smm bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 401235222Smm dsp->dsa_drr->drr_type = DRR_SPILL; 402219089Spjd drrs->drr_object = object; 403219089Spjd drrs->drr_length = blksz; 404235222Smm drrs->drr_toguid = dsp->dsa_toguid; 405219089Spjd 406286587Smav if (dump_record(dsp, data, blksz) != 0) 407249195Smm return (SET_ERROR(EINTR)); 408168404Spjd return (0); 409168404Spjd} 410168404Spjd 411168404Spjdstatic int 412235222Smmdump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) 413168404Spjd{ 414235222Smm struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); 415219089Spjd 416219089Spjd /* 417219089Spjd * If there is a pending op, but it's not PENDING_FREEOBJECTS, 418219089Spjd * push it out, since free block aggregation can only be done for 419219089Spjd * blocks of the same type (i.e., DRR_FREE records can only be 420219089Spjd * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 421219089Spjd * can only be aggregated with other DRR_FREEOBJECTS records. 422219089Spjd */ 423235222Smm if (dsp->dsa_pending_op != PENDING_NONE && 424235222Smm dsp->dsa_pending_op != PENDING_FREEOBJECTS) { 425286587Smav if (dump_record(dsp, NULL, 0) != 0) 426249195Smm return (SET_ERROR(EINTR)); 427235222Smm dsp->dsa_pending_op = PENDING_NONE; 428219089Spjd } 429235222Smm if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { 430219089Spjd /* 431219089Spjd * See whether this free object array can be aggregated 432219089Spjd * with pending one 433219089Spjd */ 434219089Spjd if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 435219089Spjd drrfo->drr_numobjs += numobjs; 436219089Spjd return (0); 437219089Spjd } else { 438219089Spjd /* can't be aggregated. Push out pending record */ 439286587Smav if (dump_record(dsp, NULL, 0) != 0) 440249195Smm return (SET_ERROR(EINTR)); 441235222Smm dsp->dsa_pending_op = PENDING_NONE; 442219089Spjd } 443219089Spjd } 444219089Spjd 445168404Spjd /* write a FREEOBJECTS record */ 446235222Smm bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 447235222Smm dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 448219089Spjd drrfo->drr_firstobj = firstobj; 449219089Spjd drrfo->drr_numobjs = numobjs; 450235222Smm drrfo->drr_toguid = dsp->dsa_toguid; 451168404Spjd 452235222Smm dsp->dsa_pending_op = PENDING_FREEOBJECTS; 453219089Spjd 454168404Spjd return (0); 455168404Spjd} 456168404Spjd 457168404Spjdstatic int 458235222Smmdump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 459168404Spjd{ 460235222Smm struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 461219089Spjd 462289362Smav if (object < dsp->dsa_resume_object) { 463289362Smav /* 464289362Smav * Note: when resuming, we will visit all the dnodes in 465289362Smav * the block of dnodes that we are resuming from. In 466289362Smav * this case it's unnecessary to send the dnodes prior to 467289362Smav * the one we are resuming from. We should be at most one 468289362Smav * block's worth of dnodes behind the resume point. 469289362Smav */ 470289362Smav ASSERT3U(dsp->dsa_resume_object - object, <, 471289362Smav 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); 472289362Smav return (0); 473289362Smav } 474289362Smav 475168404Spjd if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 476235222Smm return (dump_freeobjects(dsp, object, 1)); 477168404Spjd 478235222Smm if (dsp->dsa_pending_op != PENDING_NONE) { 479286587Smav if (dump_record(dsp, NULL, 0) != 0) 480249195Smm return (SET_ERROR(EINTR)); 481235222Smm dsp->dsa_pending_op = PENDING_NONE; 482219089Spjd } 483219089Spjd 484168404Spjd /* write an OBJECT record */ 485235222Smm bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 486235222Smm dsp->dsa_drr->drr_type = DRR_OBJECT; 487219089Spjd drro->drr_object = object; 488219089Spjd drro->drr_type = dnp->dn_type; 489219089Spjd drro->drr_bonustype = dnp->dn_bonustype; 490219089Spjd drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 491219089Spjd drro->drr_bonuslen = dnp->dn_bonuslen; 492219089Spjd drro->drr_checksumtype = dnp->dn_checksum; 493219089Spjd drro->drr_compress = dnp->dn_compress; 494235222Smm drro->drr_toguid = dsp->dsa_toguid; 495168404Spjd 496274337Sdelphij if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 497274337Sdelphij drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) 498274337Sdelphij drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; 499274337Sdelphij 500286587Smav if (dump_record(dsp, DN_BONUS(dnp), 501286587Smav P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { 502249195Smm return (SET_ERROR(EINTR)); 503286587Smav } 504168404Spjd 505253821Sdelphij /* Free anything past the end of the file. */ 506235222Smm if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 507253821Sdelphij (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) 508249195Smm return (SET_ERROR(EINTR)); 509248571Smm if (dsp->dsa_err != 0) 510249195Smm return (SET_ERROR(EINTR)); 511168404Spjd return (0); 512168404Spjd} 513168404Spjd 514268075Sdelphijstatic boolean_t 515268075Sdelphijbackup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) 516268075Sdelphij{ 517268075Sdelphij if (!BP_IS_EMBEDDED(bp)) 518268075Sdelphij return (B_FALSE); 519268075Sdelphij 520268075Sdelphij /* 521268075Sdelphij * Compression function must be legacy, or explicitly enabled. 522268075Sdelphij */ 523268075Sdelphij if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && 524321535Smav !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) 525268075Sdelphij return (B_FALSE); 526268075Sdelphij 527268075Sdelphij /* 528268075Sdelphij * Embed type must be explicitly enabled. 529268075Sdelphij */ 530268075Sdelphij switch (BPE_GET_ETYPE(bp)) { 531268075Sdelphij case BP_EMBEDDED_TYPE_DATA: 532268075Sdelphij if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) 533268075Sdelphij return (B_TRUE); 534268075Sdelphij break; 535268075Sdelphij default: 536268075Sdelphij return (B_FALSE); 537268075Sdelphij } 538268075Sdelphij return (B_FALSE); 539268075Sdelphij} 540268075Sdelphij 541286705Smav/* 542286705Smav * This is the callback function to traverse_dataset that acts as the worker 543286705Smav * thread for dmu_send_impl. 544286705Smav */ 545286705Smav/*ARGSUSED*/ 546286705Smavstatic int 547286705Smavsend_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 548286705Smav const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) 549286705Smav{ 550286705Smav struct send_thread_arg *sta = arg; 551286705Smav struct send_block_record *record; 552286705Smav uint64_t record_size; 553286705Smav int err = 0; 554168404Spjd 555289362Smav ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || 556289362Smav zb->zb_object >= sta->resume.zb_object); 557289362Smav 558286705Smav if (sta->cancel) 559286705Smav return (SET_ERROR(EINTR)); 560286705Smav 561286705Smav if (bp == NULL) { 562286705Smav ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); 563286705Smav return (0); 564286705Smav } else if (zb->zb_level < 0) { 565286705Smav return (0); 566286705Smav } 567286705Smav 568286705Smav record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); 569286705Smav record->eos_marker = B_FALSE; 570286705Smav record->bp = *bp; 571286705Smav record->zb = *zb; 572286705Smav record->indblkshift = dnp->dn_indblkshift; 573286705Smav record->datablkszsec = dnp->dn_datablkszsec; 574286705Smav record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 575286705Smav bqueue_enqueue(&sta->q, record, record_size); 576286705Smav 577286705Smav return (err); 578286705Smav} 579286705Smav 580286705Smav/* 581286705Smav * This function kicks off the traverse_dataset. It also handles setting the 582286705Smav * error code of the thread in case something goes wrong, and pushes the End of 583286705Smav * Stream record when the traverse_dataset call has finished. If there is no 584286705Smav * dataset to traverse, the thread immediately pushes End of Stream marker. 585286705Smav */ 586286705Smavstatic void 587286705Smavsend_traverse_thread(void *arg) 588286705Smav{ 589286705Smav struct send_thread_arg *st_arg = arg; 590286705Smav int err; 591286705Smav struct send_block_record *data; 592286705Smav 593286705Smav if (st_arg->ds != NULL) { 594289362Smav err = traverse_dataset_resume(st_arg->ds, 595289362Smav st_arg->fromtxg, &st_arg->resume, 596289362Smav st_arg->flags, send_cb, st_arg); 597289362Smav 598286705Smav if (err != EINTR) 599286705Smav st_arg->error_code = err; 600286705Smav } 601286705Smav data = kmem_zalloc(sizeof (*data), KM_SLEEP); 602286705Smav data->eos_marker = B_TRUE; 603286705Smav bqueue_enqueue(&st_arg->q, data, 1); 604286705Smav thread_exit(); 605286705Smav} 606286705Smav 607286705Smav/* 608286705Smav * This function actually handles figuring out what kind of record needs to be 609286705Smav * dumped, reading the data (which has hopefully been prefetched), and calling 610286705Smav * the appropriate helper function. 611286705Smav */ 612168404Spjdstatic int 613286705Smavdo_dump(dmu_sendarg_t *dsa, struct send_block_record *data) 614168404Spjd{ 615286705Smav dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); 616286705Smav const blkptr_t *bp = &data->bp; 617286705Smav const zbookmark_phys_t *zb = &data->zb; 618286705Smav uint8_t indblkshift = data->indblkshift; 619286705Smav uint16_t dblkszsec = data->datablkszsec; 620286705Smav spa_t *spa = ds->ds_dir->dd_pool->dp_spa; 621168404Spjd dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 622168404Spjd int err = 0; 623168404Spjd 624286705Smav ASSERT3U(zb->zb_level, >=, 0); 625168404Spjd 626289362Smav ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || 627289362Smav zb->zb_object >= dsa->dsa_resume_object); 628289362Smav 629219089Spjd if (zb->zb_object != DMU_META_DNODE_OBJECT && 630219089Spjd DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 631209962Smm return (0); 632260150Sdelphij } else if (BP_IS_HOLE(bp) && 633260150Sdelphij zb->zb_object == DMU_META_DNODE_OBJECT) { 634286705Smav uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); 635208047Smm uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 636286705Smav err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); 637260150Sdelphij } else if (BP_IS_HOLE(bp)) { 638286705Smav uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); 639286705Smav uint64_t offset = zb->zb_blkid * span; 640286705Smav err = dump_free(dsa, zb->zb_object, offset, span); 641208047Smm } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 642208047Smm return (0); 643208047Smm } else if (type == DMU_OT_DNODE) { 644168404Spjd int blksz = BP_GET_LSIZE(bp); 645275811Sdelphij arc_flags_t aflags = ARC_FLAG_WAIT; 646208047Smm arc_buf_t *abuf; 647168404Spjd 648286705Smav ASSERT0(zb->zb_level); 649286705Smav 650246666Smm if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 651246666Smm ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 652246666Smm &aflags, zb) != 0) 653249195Smm return (SET_ERROR(EIO)); 654208047Smm 655286705Smav dnode_phys_t *blk = abuf->b_data; 656286705Smav uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT); 657286705Smav for (int i = 0; i < blksz >> DNODE_SHIFT; i++) { 658286705Smav err = dump_dnode(dsa, dnobj + i, blk + i); 659248571Smm if (err != 0) 660168404Spjd break; 661168404Spjd } 662307265Smav arc_buf_destroy(abuf, &abuf); 663219089Spjd } else if (type == DMU_OT_SA) { 664275811Sdelphij arc_flags_t aflags = ARC_FLAG_WAIT; 665208047Smm arc_buf_t *abuf; 666168404Spjd int blksz = BP_GET_LSIZE(bp); 667168404Spjd 668246666Smm if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 669246666Smm ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 670246666Smm &aflags, zb) != 0) 671249195Smm return (SET_ERROR(EIO)); 672168404Spjd 673286705Smav err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); 674307265Smav arc_buf_destroy(abuf, &abuf); 675286705Smav } else if (backup_do_embed(dsa, bp)) { 676268075Sdelphij /* it's an embedded level-0 block of a regular object */ 677286705Smav int blksz = dblkszsec << SPA_MINBLOCKSHIFT; 678286705Smav ASSERT0(zb->zb_level); 679286705Smav err = dump_write_embedded(dsa, zb->zb_object, 680268075Sdelphij zb->zb_blkid * blksz, blksz, bp); 681286705Smav } else { 682286705Smav /* it's a level-0 block of a regular object */ 683275811Sdelphij arc_flags_t aflags = ARC_FLAG_WAIT; 684219089Spjd arc_buf_t *abuf; 685286705Smav int blksz = dblkszsec << SPA_MINBLOCKSHIFT; 686274337Sdelphij uint64_t offset; 687219089Spjd 688321535Smav /* 689321535Smav * If we have large blocks stored on disk but the send flags 690321535Smav * don't allow us to send large blocks, we split the data from 691321535Smav * the arc buf into chunks. 692321535Smav */ 693321535Smav boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && 694321535Smav !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); 695321535Smav /* 696321535Smav * We should only request compressed data from the ARC if all 697321535Smav * the following are true: 698321535Smav * - stream compression was requested 699321535Smav * - we aren't splitting large blocks into smaller chunks 700321535Smav * - the data won't need to be byteswapped before sending 701321535Smav * - this isn't an embedded block 702321535Smav * - this isn't metadata (if receiving on a different endian 703321535Smav * system it can be byteswapped more easily) 704321535Smav */ 705321535Smav boolean_t request_compressed = 706321535Smav (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && 707321535Smav !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && 708321535Smav !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); 709321535Smav 710260183Sdelphij ASSERT0(zb->zb_level); 711289362Smav ASSERT(zb->zb_object > dsa->dsa_resume_object || 712289362Smav (zb->zb_object == dsa->dsa_resume_object && 713289362Smav zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); 714289362Smav 715321535Smav ASSERT0(zb->zb_level); 716321535Smav ASSERT(zb->zb_object > dsa->dsa_resume_object || 717321535Smav (zb->zb_object == dsa->dsa_resume_object && 718321535Smav zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); 719321535Smav 720321535Smav ASSERT3U(blksz, ==, BP_GET_LSIZE(bp)); 721321535Smav 722321535Smav enum zio_flag zioflags = ZIO_FLAG_CANFAIL; 723321535Smav if (request_compressed) 724321535Smav zioflags |= ZIO_FLAG_RAW; 725246666Smm if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 726321535Smav ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { 727228103Smm if (zfs_send_corrupt_data) { 728228103Smm /* Send a block filled with 0x"zfs badd bloc" */ 729321535Smav abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, 730321535Smav blksz); 731228103Smm uint64_t *ptr; 732228103Smm for (ptr = abuf->b_data; 733228103Smm (char *)ptr < (char *)abuf->b_data + blksz; 734228103Smm ptr++) 735286554Smav *ptr = 0x2f5baddb10cULL; 736228103Smm } else { 737249195Smm return (SET_ERROR(EIO)); 738228103Smm } 739228103Smm } 740219089Spjd 741274337Sdelphij offset = zb->zb_blkid * blksz; 742274337Sdelphij 743321535Smav if (split_large_blocks) { 744321535Smav ASSERT3U(arc_get_compression(abuf), ==, 745321535Smav ZIO_COMPRESS_OFF); 746274337Sdelphij char *buf = abuf->b_data; 747274337Sdelphij while (blksz > 0 && err == 0) { 748274337Sdelphij int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); 749286705Smav err = dump_write(dsa, type, zb->zb_object, 750321535Smav offset, n, n, NULL, buf); 751274337Sdelphij offset += n; 752274337Sdelphij buf += n; 753274337Sdelphij blksz -= n; 754274337Sdelphij } 755274337Sdelphij } else { 756321535Smav err = dump_write(dsa, type, zb->zb_object, offset, 757321535Smav blksz, arc_buf_size(abuf), bp, abuf->b_data); 758274337Sdelphij } 759307265Smav arc_buf_destroy(abuf, &abuf); 760168404Spjd } 761168404Spjd 762168404Spjd ASSERT(err == 0 || err == EINTR); 763168404Spjd return (err); 764168404Spjd} 765168404Spjd 766248571Smm/* 767286705Smav * Pop the new data off the queue, and free the old data. 768248571Smm */ 769286705Smavstatic struct send_block_record * 770286705Smavget_next_record(bqueue_t *bq, struct send_block_record *data) 771286705Smav{ 772286705Smav struct send_block_record *tmp = bqueue_dequeue(bq); 773286705Smav kmem_free(data, sizeof (*data)); 774286705Smav return (tmp); 775286705Smav} 776286705Smav 777286705Smav/* 778286705Smav * Actually do the bulk of the work in a zfs send. 779286705Smav * 780286705Smav * Note: Releases dp using the specified tag. 781286705Smav */ 782248571Smmstatic int 783286705Smavdmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, 784321535Smav zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, 785321535Smav boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, 786321535Smav int outfd, uint64_t resumeobj, uint64_t resumeoff, 787248571Smm#ifdef illumos 788289362Smav vnode_t *vp, offset_t *off) 789248571Smm#else 790289362Smav struct file *fp, offset_t *off) 791248571Smm#endif 792168404Spjd{ 793248571Smm objset_t *os; 794168404Spjd dmu_replay_record_t *drr; 795235222Smm dmu_sendarg_t *dsp; 796168404Spjd int err; 797185029Spjd uint64_t fromtxg = 0; 798268075Sdelphij uint64_t featureflags = 0; 799289362Smav struct send_thread_arg to_arg = { 0 }; 800168404Spjd 801286705Smav err = dmu_objset_from_ds(to_ds, &os); 802248571Smm if (err != 0) { 803248571Smm dsl_pool_rele(dp, tag); 804248571Smm return (err); 805185029Spjd } 806185029Spjd 807168404Spjd drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 808168404Spjd drr->drr_type = DRR_BEGIN; 809168404Spjd drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 810219089Spjd DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 811219089Spjd DMU_SUBSTREAM); 812219089Spjd 813219089Spjd#ifdef _KERNEL 814248571Smm if (dmu_objset_type(os) == DMU_OST_ZFS) { 815219089Spjd uint64_t version; 816248571Smm if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { 817235222Smm kmem_free(drr, sizeof (dmu_replay_record_t)); 818248571Smm dsl_pool_rele(dp, tag); 819249195Smm return (SET_ERROR(EINVAL)); 820235222Smm } 821248571Smm if (version >= ZPL_VERSION_SA) { 822268075Sdelphij featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; 823219089Spjd } 824219089Spjd } 825219089Spjd#endif 826219089Spjd 827286708Smav if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) 828274337Sdelphij featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; 829268075Sdelphij if (embedok && 830268075Sdelphij spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { 831268075Sdelphij featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; 832268075Sdelphij if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 833321535Smav featureflags |= DMU_BACKUP_FEATURE_LZ4; 834268075Sdelphij } 835321535Smav if (compressok) { 836321535Smav featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; 837321535Smav } 838321535Smav if ((featureflags & 839321535Smav (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) != 840321535Smav 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { 841321535Smav featureflags |= DMU_BACKUP_FEATURE_LZ4; 842321535Smav } 843268075Sdelphij 844289362Smav if (resumeobj != 0 || resumeoff != 0) { 845289362Smav featureflags |= DMU_BACKUP_FEATURE_RESUMING; 846289362Smav } 847289362Smav 848268075Sdelphij DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, 849268075Sdelphij featureflags); 850268075Sdelphij 851168404Spjd drr->drr_u.drr_begin.drr_creation_time = 852286705Smav dsl_dataset_phys(to_ds)->ds_creation_time; 853248571Smm drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); 854260183Sdelphij if (is_clone) 855185029Spjd drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 856286705Smav drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; 857286705Smav if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) 858185029Spjd drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 859296516Smav if (zfs_send_set_freerecords_bit) 860296516Smav drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; 861185029Spjd 862286705Smav if (ancestor_zb != NULL) { 863286705Smav drr->drr_u.drr_begin.drr_fromguid = 864286705Smav ancestor_zb->zbm_guid; 865286705Smav fromtxg = ancestor_zb->zbm_creation_txg; 866260183Sdelphij } 867286705Smav dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); 868286705Smav if (!to_ds->ds_is_snapshot) { 869260183Sdelphij (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", 870260183Sdelphij sizeof (drr->drr_u.drr_begin.drr_toname)); 871248571Smm } 872185029Spjd 873235222Smm dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 874168404Spjd 875235222Smm dsp->dsa_drr = drr; 876235222Smm dsp->dsa_outfd = outfd; 877235222Smm dsp->dsa_proc = curproc; 878235222Smm dsp->dsa_td = curthread; 879235222Smm dsp->dsa_fp = fp; 880248571Smm dsp->dsa_os = os; 881235222Smm dsp->dsa_off = off; 882286705Smav dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; 883235222Smm dsp->dsa_pending_op = PENDING_NONE; 884268075Sdelphij dsp->dsa_featureflags = featureflags; 885289362Smav dsp->dsa_resume_object = resumeobj; 886289362Smav dsp->dsa_resume_offset = resumeoff; 887235222Smm 888286705Smav mutex_enter(&to_ds->ds_sendstream_lock); 889286705Smav list_insert_head(&to_ds->ds_sendstreams, dsp); 890286705Smav mutex_exit(&to_ds->ds_sendstream_lock); 891235222Smm 892286705Smav dsl_dataset_long_hold(to_ds, FTAG); 893249042Smm dsl_pool_rele(dp, tag); 894249042Smm 895289362Smav void *payload = NULL; 896289362Smav size_t payload_len = 0; 897289362Smav if (resumeobj != 0 || resumeoff != 0) { 898289362Smav dmu_object_info_t to_doi; 899289362Smav err = dmu_object_info(os, resumeobj, &to_doi); 900289362Smav if (err != 0) 901289362Smav goto out; 902289362Smav SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, 903289362Smav resumeoff / to_doi.doi_data_block_size); 904289362Smav 905289362Smav nvlist_t *nvl = fnvlist_alloc(); 906289362Smav fnvlist_add_uint64(nvl, "resume_object", resumeobj); 907289362Smav fnvlist_add_uint64(nvl, "resume_offset", resumeoff); 908289362Smav payload = fnvlist_pack(nvl, &payload_len); 909289362Smav drr->drr_payloadlen = payload_len; 910289362Smav fnvlist_free(nvl); 911289362Smav } 912289362Smav 913289362Smav err = dump_record(dsp, payload, payload_len); 914289362Smav fnvlist_pack_free(payload, payload_len); 915289362Smav if (err != 0) { 916235222Smm err = dsp->dsa_err; 917235222Smm goto out; 918168404Spjd } 919168404Spjd 920286705Smav err = bqueue_init(&to_arg.q, zfs_send_queue_length, 921286705Smav offsetof(struct send_block_record, ln)); 922286705Smav to_arg.error_code = 0; 923286705Smav to_arg.cancel = B_FALSE; 924286705Smav to_arg.ds = to_ds; 925286705Smav to_arg.fromtxg = fromtxg; 926286705Smav to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; 927287280Sdelphij (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0, 928286705Smav TS_RUN, minclsyspri); 929168404Spjd 930286705Smav struct send_block_record *to_data; 931286705Smav to_data = bqueue_dequeue(&to_arg.q); 932286705Smav 933286705Smav while (!to_data->eos_marker && err == 0) { 934286705Smav err = do_dump(dsp, to_data); 935286705Smav to_data = get_next_record(&to_arg.q, to_data); 936286705Smav if (issig(JUSTLOOKING) && issig(FORREAL)) 937286705Smav err = EINTR; 938286705Smav } 939286705Smav 940286705Smav if (err != 0) { 941286705Smav to_arg.cancel = B_TRUE; 942286705Smav while (!to_data->eos_marker) { 943286705Smav to_data = get_next_record(&to_arg.q, to_data); 944286705Smav } 945286705Smav } 946286705Smav kmem_free(to_data, sizeof (*to_data)); 947286705Smav 948286705Smav bqueue_destroy(&to_arg.q); 949286705Smav 950286705Smav if (err == 0 && to_arg.error_code != 0) 951286705Smav err = to_arg.error_code; 952286705Smav 953286705Smav if (err != 0) 954286705Smav goto out; 955286705Smav 956235222Smm if (dsp->dsa_pending_op != PENDING_NONE) 957286587Smav if (dump_record(dsp, NULL, 0) != 0) 958249195Smm err = SET_ERROR(EINTR); 959219089Spjd 960248571Smm if (err != 0) { 961248571Smm if (err == EINTR && dsp->dsa_err != 0) 962235222Smm err = dsp->dsa_err; 963235222Smm goto out; 964168404Spjd } 965168404Spjd 966168404Spjd bzero(drr, sizeof (dmu_replay_record_t)); 967168404Spjd drr->drr_type = DRR_END; 968235222Smm drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 969235222Smm drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 970168404Spjd 971286705Smav if (dump_record(dsp, NULL, 0) != 0) 972235222Smm err = dsp->dsa_err; 973168404Spjd 974235222Smmout: 975286705Smav mutex_enter(&to_ds->ds_sendstream_lock); 976286705Smav list_remove(&to_ds->ds_sendstreams, dsp); 977286705Smav mutex_exit(&to_ds->ds_sendstream_lock); 978235222Smm 979307284Smav VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); 980307284Smav 981168404Spjd kmem_free(drr, sizeof (dmu_replay_record_t)); 982235222Smm kmem_free(dsp, sizeof (dmu_sendarg_t)); 983168404Spjd 984286705Smav dsl_dataset_long_rele(to_ds, FTAG); 985248571Smm 986235222Smm return (err); 987168404Spjd} 988168404Spjd 989228103Smmint 990248571Smmdmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, 991321535Smav boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, 992248571Smm#ifdef illumos 993274337Sdelphij int outfd, vnode_t *vp, offset_t *off) 994248571Smm#else 995274337Sdelphij int outfd, struct file *fp, offset_t *off) 996248571Smm#endif 997228103Smm{ 998248571Smm dsl_pool_t *dp; 999248571Smm dsl_dataset_t *ds; 1000248571Smm dsl_dataset_t *fromds = NULL; 1001248571Smm int err; 1002248571Smm 1003248571Smm err = dsl_pool_hold(pool, FTAG, &dp); 1004248571Smm if (err != 0) 1005248571Smm return (err); 1006248571Smm 1007248571Smm err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); 1008248571Smm if (err != 0) { 1009248571Smm dsl_pool_rele(dp, FTAG); 1010248571Smm return (err); 1011248571Smm } 1012248571Smm 1013248571Smm if (fromsnap != 0) { 1014260183Sdelphij zfs_bookmark_phys_t zb; 1015260183Sdelphij boolean_t is_clone; 1016260183Sdelphij 1017248571Smm err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); 1018248571Smm if (err != 0) { 1019248571Smm dsl_dataset_rele(ds, FTAG); 1020248571Smm dsl_pool_rele(dp, FTAG); 1021248571Smm return (err); 1022248571Smm } 1023260183Sdelphij if (!dsl_dataset_is_before(ds, fromds, 0)) 1024260183Sdelphij err = SET_ERROR(EXDEV); 1025275782Sdelphij zb.zbm_creation_time = 1026275782Sdelphij dsl_dataset_phys(fromds)->ds_creation_time; 1027275782Sdelphij zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; 1028275782Sdelphij zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 1029260183Sdelphij is_clone = (fromds->ds_dir != ds->ds_dir); 1030260183Sdelphij dsl_dataset_rele(fromds, FTAG); 1031274337Sdelphij err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 1032321535Smav embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); 1033260183Sdelphij } else { 1034274337Sdelphij err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 1035321535Smav embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); 1036248571Smm } 1037260183Sdelphij dsl_dataset_rele(ds, FTAG); 1038260183Sdelphij return (err); 1039248571Smm} 1040248571Smm 1041248571Smmint 1042289362Smavdmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, 1043321535Smav boolean_t large_block_ok, boolean_t compressok, int outfd, 1044321535Smav uint64_t resumeobj, uint64_t resumeoff, 1045248571Smm#ifdef illumos 1046289362Smav vnode_t *vp, offset_t *off) 1047248571Smm#else 1048289362Smav struct file *fp, offset_t *off) 1049248571Smm#endif 1050248571Smm{ 1051248571Smm dsl_pool_t *dp; 1052248571Smm dsl_dataset_t *ds; 1053248571Smm int err; 1054260183Sdelphij boolean_t owned = B_FALSE; 1055248571Smm 1056260183Sdelphij if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) 1057249195Smm return (SET_ERROR(EINVAL)); 1058248571Smm 1059248571Smm err = dsl_pool_hold(tosnap, FTAG, &dp); 1060248571Smm if (err != 0) 1061248571Smm return (err); 1062248571Smm 1063260183Sdelphij if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { 1064260183Sdelphij /* 1065260183Sdelphij * We are sending a filesystem or volume. Ensure 1066260183Sdelphij * that it doesn't change by owning the dataset. 1067260183Sdelphij */ 1068260183Sdelphij err = dsl_dataset_own(dp, tosnap, FTAG, &ds); 1069260183Sdelphij owned = B_TRUE; 1070260183Sdelphij } else { 1071260183Sdelphij err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); 1072260183Sdelphij } 1073248571Smm if (err != 0) { 1074248571Smm dsl_pool_rele(dp, FTAG); 1075248571Smm return (err); 1076248571Smm } 1077248571Smm 1078248571Smm if (fromsnap != NULL) { 1079260183Sdelphij zfs_bookmark_phys_t zb; 1080260183Sdelphij boolean_t is_clone = B_FALSE; 1081260183Sdelphij int fsnamelen = strchr(tosnap, '@') - tosnap; 1082260183Sdelphij 1083260183Sdelphij /* 1084260183Sdelphij * If the fromsnap is in a different filesystem, then 1085260183Sdelphij * mark the send stream as a clone. 1086260183Sdelphij */ 1087260183Sdelphij if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || 1088260183Sdelphij (fromsnap[fsnamelen] != '@' && 1089260183Sdelphij fromsnap[fsnamelen] != '#')) { 1090260183Sdelphij is_clone = B_TRUE; 1091260183Sdelphij } 1092260183Sdelphij 1093260183Sdelphij if (strchr(fromsnap, '@')) { 1094260183Sdelphij dsl_dataset_t *fromds; 1095260183Sdelphij err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); 1096260183Sdelphij if (err == 0) { 1097260183Sdelphij if (!dsl_dataset_is_before(ds, fromds, 0)) 1098260183Sdelphij err = SET_ERROR(EXDEV); 1099260183Sdelphij zb.zbm_creation_time = 1100275782Sdelphij dsl_dataset_phys(fromds)->ds_creation_time; 1101260183Sdelphij zb.zbm_creation_txg = 1102275782Sdelphij dsl_dataset_phys(fromds)->ds_creation_txg; 1103275782Sdelphij zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 1104260183Sdelphij is_clone = (ds->ds_dir != fromds->ds_dir); 1105260183Sdelphij dsl_dataset_rele(fromds, FTAG); 1106260183Sdelphij } 1107260183Sdelphij } else { 1108260183Sdelphij err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); 1109260183Sdelphij } 1110248571Smm if (err != 0) { 1111248571Smm dsl_dataset_rele(ds, FTAG); 1112248571Smm dsl_pool_rele(dp, FTAG); 1113248571Smm return (err); 1114248571Smm } 1115274337Sdelphij err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 1116321535Smav embedok, large_block_ok, compressok, 1117289362Smav outfd, resumeobj, resumeoff, fp, off); 1118260183Sdelphij } else { 1119274337Sdelphij err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 1120321535Smav embedok, large_block_ok, compressok, 1121289362Smav outfd, resumeobj, resumeoff, fp, off); 1122248571Smm } 1123260183Sdelphij if (owned) 1124260183Sdelphij dsl_dataset_disown(ds, FTAG); 1125260183Sdelphij else 1126260183Sdelphij dsl_dataset_rele(ds, FTAG); 1127260183Sdelphij return (err); 1128248571Smm} 1129248571Smm 1130286683Smavstatic int 1131321535Smavdmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, 1132321535Smav uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) 1133286683Smav{ 1134286683Smav int err; 1135321535Smav uint64_t size; 1136286683Smav /* 1137286683Smav * Assume that space (both on-disk and in-stream) is dominated by 1138286683Smav * data. We will adjust for indirect blocks and the copies property, 1139286683Smav * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 1140286683Smav */ 1141321535Smav uint64_t recordsize; 1142321535Smav uint64_t record_count; 1143286683Smav 1144321535Smav /* Assume all (uncompressed) blocks are recordsize. */ 1145321535Smav err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), 1146321535Smav &recordsize); 1147321535Smav if (err != 0) 1148321535Smav return (err); 1149321535Smav record_count = uncompressed / recordsize; 1150321535Smav 1151286683Smav /* 1152321535Smav * If we're estimating a send size for a compressed stream, use the 1153321535Smav * compressed data size to estimate the stream size. Otherwise, use the 1154321535Smav * uncompressed data size. 1155321535Smav */ 1156321535Smav size = stream_compressed ? compressed : uncompressed; 1157321535Smav 1158321535Smav /* 1159286683Smav * Subtract out approximate space used by indirect blocks. 1160286683Smav * Assume most space is used by data blocks (non-indirect, non-dnode). 1161321535Smav * Assume no ditto blocks or internal fragmentation. 1162286683Smav * 1163286683Smav * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 1164321535Smav * block. 1165286683Smav */ 1166321535Smav size -= record_count * sizeof (blkptr_t); 1167286683Smav 1168286683Smav /* Add in the space for the record associated with each block. */ 1169321535Smav size += record_count * sizeof (dmu_replay_record_t); 1170286683Smav 1171286683Smav *sizep = size; 1172286683Smav 1173286683Smav return (0); 1174286683Smav} 1175286683Smav 1176248571Smmint 1177321535Smavdmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, 1178321535Smav boolean_t stream_compressed, uint64_t *sizep) 1179248571Smm{ 1180228103Smm dsl_pool_t *dp = ds->ds_dir->dd_pool; 1181228103Smm int err; 1182321535Smav uint64_t uncomp, comp; 1183228103Smm 1184248571Smm ASSERT(dsl_pool_config_held(dp)); 1185248571Smm 1186228103Smm /* tosnap must be a snapshot */ 1187286575Smav if (!ds->ds_is_snapshot) 1188249195Smm return (SET_ERROR(EINVAL)); 1189228103Smm 1190284301Savg /* fromsnap, if provided, must be a snapshot */ 1191286575Smav if (fromds != NULL && !fromds->ds_is_snapshot) 1192284301Savg return (SET_ERROR(EINVAL)); 1193284301Savg 1194248571Smm /* 1195248571Smm * fromsnap must be an earlier snapshot from the same fs as tosnap, 1196248571Smm * or the origin's fs. 1197248571Smm */ 1198260183Sdelphij if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) 1199249195Smm return (SET_ERROR(EXDEV)); 1200228103Smm 1201321535Smav /* Get compressed and uncompressed size estimates of changed data. */ 1202228103Smm if (fromds == NULL) { 1203321535Smav uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; 1204321535Smav comp = dsl_dataset_phys(ds)->ds_compressed_bytes; 1205228103Smm } else { 1206321535Smav uint64_t used; 1207228103Smm err = dsl_dataset_space_written(fromds, ds, 1208321535Smav &used, &comp, &uncomp); 1209248571Smm if (err != 0) 1210228103Smm return (err); 1211228103Smm } 1212228103Smm 1213321535Smav err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, 1214321535Smav stream_compressed, sizep); 1215286683Smav return (err); 1216286683Smav} 1217228103Smm 1218321535Smavstruct calculate_send_arg { 1219321535Smav uint64_t uncompressed; 1220321535Smav uint64_t compressed; 1221321535Smav}; 1222321535Smav 1223286683Smav/* 1224286683Smav * Simple callback used to traverse the blocks of a snapshot and sum their 1225321535Smav * uncompressed and compressed sizes. 1226286683Smav */ 1227286683Smav/* ARGSUSED */ 1228286683Smavstatic int 1229286683Smavdmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1230286683Smav const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 1231286683Smav{ 1232321535Smav struct calculate_send_arg *space = arg; 1233286683Smav if (bp != NULL && !BP_IS_HOLE(bp)) { 1234321535Smav space->uncompressed += BP_GET_UCSIZE(bp); 1235321535Smav space->compressed += BP_GET_PSIZE(bp); 1236286683Smav } 1237286683Smav return (0); 1238286683Smav} 1239286683Smav 1240286683Smav/* 1241286683Smav * Given a desination snapshot and a TXG, calculate the approximate size of a 1242286683Smav * send stream sent from that TXG. from_txg may be zero, indicating that the 1243286683Smav * whole snapshot will be sent. 1244286683Smav */ 1245286683Smavint 1246286683Smavdmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, 1247321535Smav boolean_t stream_compressed, uint64_t *sizep) 1248286683Smav{ 1249286683Smav dsl_pool_t *dp = ds->ds_dir->dd_pool; 1250286683Smav int err; 1251321535Smav struct calculate_send_arg size = { 0 }; 1252286683Smav 1253286683Smav ASSERT(dsl_pool_config_held(dp)); 1254286683Smav 1255286683Smav /* tosnap must be a snapshot */ 1256321535Smav if (!ds->ds_is_snapshot) 1257286683Smav return (SET_ERROR(EINVAL)); 1258286683Smav 1259286683Smav /* verify that from_txg is before the provided snapshot was taken */ 1260286683Smav if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { 1261286683Smav return (SET_ERROR(EXDEV)); 1262286683Smav } 1263286683Smav 1264228103Smm /* 1265286683Smav * traverse the blocks of the snapshot with birth times after 1266286683Smav * from_txg, summing their uncompressed size 1267228103Smm */ 1268286683Smav err = traverse_dataset(ds, from_txg, TRAVERSE_POST, 1269286683Smav dmu_calculate_send_traversal, &size); 1270286683Smav if (err) 1271228103Smm return (err); 1272228103Smm 1273321535Smav err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, 1274321535Smav size.compressed, stream_compressed, sizep); 1275286683Smav return (err); 1276228103Smm} 1277228103Smm 1278248571Smmtypedef struct dmu_recv_begin_arg { 1279248571Smm const char *drba_origin; 1280248571Smm dmu_recv_cookie_t *drba_cookie; 1281248571Smm cred_t *drba_cred; 1282253820Sdelphij uint64_t drba_snapobj; 1283248571Smm} dmu_recv_begin_arg_t; 1284168404Spjd 1285168404Spjdstatic int 1286248571Smmrecv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, 1287248571Smm uint64_t fromguid) 1288168404Spjd{ 1289185029Spjd uint64_t val; 1290248571Smm int error; 1291248571Smm dsl_pool_t *dp = ds->ds_dir->dd_pool; 1292185029Spjd 1293248571Smm /* temporary clone name must not exist */ 1294248571Smm error = zap_lookup(dp->dp_meta_objset, 1295275782Sdelphij dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 1296248571Smm 8, 1, &val); 1297248571Smm if (error != ENOENT) 1298248571Smm return (error == 0 ? EBUSY : error); 1299248571Smm 1300219089Spjd /* new snapshot name must not exist */ 1301248571Smm error = zap_lookup(dp->dp_meta_objset, 1302275782Sdelphij dsl_dataset_phys(ds)->ds_snapnames_zapobj, 1303275782Sdelphij drba->drba_cookie->drc_tosnap, 8, 1, &val); 1304248571Smm if (error != ENOENT) 1305248571Smm return (error == 0 ? EEXIST : error); 1306168404Spjd 1307264835Sdelphij /* 1308264835Sdelphij * Check snapshot limit before receiving. We'll recheck again at the 1309264835Sdelphij * end, but might as well abort before receiving if we're already over 1310264835Sdelphij * the limit. 1311264835Sdelphij * 1312264835Sdelphij * Note that we do not check the file system limit with 1313264835Sdelphij * dsl_dir_fscount_check because the temporary %clones don't count 1314264835Sdelphij * against that limit. 1315264835Sdelphij */ 1316264835Sdelphij error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, 1317264835Sdelphij NULL, drba->drba_cred); 1318264835Sdelphij if (error != 0) 1319264835Sdelphij return (error); 1320264835Sdelphij 1321248571Smm if (fromguid != 0) { 1322253820Sdelphij dsl_dataset_t *snap; 1323275782Sdelphij uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 1324253820Sdelphij 1325253820Sdelphij /* Find snapshot in this dir that matches fromguid. */ 1326253820Sdelphij while (obj != 0) { 1327253820Sdelphij error = dsl_dataset_hold_obj(dp, obj, FTAG, 1328253820Sdelphij &snap); 1329253820Sdelphij if (error != 0) 1330253820Sdelphij return (SET_ERROR(ENODEV)); 1331253820Sdelphij if (snap->ds_dir != ds->ds_dir) { 1332253820Sdelphij dsl_dataset_rele(snap, FTAG); 1333253820Sdelphij return (SET_ERROR(ENODEV)); 1334253820Sdelphij } 1335275782Sdelphij if (dsl_dataset_phys(snap)->ds_guid == fromguid) 1336253820Sdelphij break; 1337275782Sdelphij obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; 1338253820Sdelphij dsl_dataset_rele(snap, FTAG); 1339253820Sdelphij } 1340253820Sdelphij if (obj == 0) 1341249195Smm return (SET_ERROR(ENODEV)); 1342168404Spjd 1343253820Sdelphij if (drba->drba_cookie->drc_force) { 1344253820Sdelphij drba->drba_snapobj = obj; 1345253820Sdelphij } else { 1346253820Sdelphij /* 1347253820Sdelphij * If we are not forcing, there must be no 1348253820Sdelphij * changes since fromsnap. 1349253820Sdelphij */ 1350253820Sdelphij if (dsl_dataset_modified_since_snap(ds, snap)) { 1351219089Spjd dsl_dataset_rele(snap, FTAG); 1352253820Sdelphij return (SET_ERROR(ETXTBSY)); 1353219089Spjd } 1354253820Sdelphij drba->drba_snapobj = ds->ds_prev->ds_object; 1355219089Spjd } 1356253820Sdelphij 1357253820Sdelphij dsl_dataset_rele(snap, FTAG); 1358219089Spjd } else { 1359283525Savg /* if full, then must be forced */ 1360283525Savg if (!drba->drba_cookie->drc_force) 1361283525Savg return (SET_ERROR(EEXIST)); 1362283525Savg /* start from $ORIGIN@$ORIGIN, if supported */ 1363283525Savg drba->drba_snapobj = dp->dp_origin_snap != NULL ? 1364283525Savg dp->dp_origin_snap->ds_object : 0; 1365219089Spjd } 1366219089Spjd 1367248571Smm return (0); 1368168404Spjd 1369168404Spjd} 1370168404Spjd 1371248571Smmstatic int 1372248571Smmdmu_recv_begin_check(void *arg, dmu_tx_t *tx) 1373248571Smm{ 1374248571Smm dmu_recv_begin_arg_t *drba = arg; 1375248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 1376248571Smm struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1377248571Smm uint64_t fromguid = drrb->drr_fromguid; 1378248571Smm int flags = drrb->drr_flags; 1379248571Smm int error; 1380268075Sdelphij uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 1381248571Smm dsl_dataset_t *ds; 1382248571Smm const char *tofs = drba->drba_cookie->drc_tofs; 1383248571Smm 1384248571Smm /* already checked */ 1385248571Smm ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); 1386289362Smav ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); 1387248571Smm 1388248571Smm if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == 1389248571Smm DMU_COMPOUNDSTREAM || 1390248571Smm drrb->drr_type >= DMU_OST_NUMTYPES || 1391248571Smm ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) 1392249195Smm return (SET_ERROR(EINVAL)); 1393248571Smm 1394248571Smm /* Verify pool version supports SA if SA_SPILL feature set */ 1395268075Sdelphij if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 1396268075Sdelphij spa_version(dp->dp_spa) < SPA_VERSION_SA) 1397249195Smm return (SET_ERROR(ENOTSUP)); 1398248571Smm 1399289362Smav if (drba->drba_cookie->drc_resumable && 1400289362Smav !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) 1401289362Smav return (SET_ERROR(ENOTSUP)); 1402289362Smav 1403268075Sdelphij /* 1404268075Sdelphij * The receiving code doesn't know how to translate a WRITE_EMBEDDED 1405321535Smav * record to a plain WRITE record, so the pool must have the 1406268075Sdelphij * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED 1407268075Sdelphij * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 1408268075Sdelphij */ 1409268075Sdelphij if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && 1410268075Sdelphij !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) 1411268075Sdelphij return (SET_ERROR(ENOTSUP)); 1412321535Smav if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && 1413268075Sdelphij !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 1414268075Sdelphij return (SET_ERROR(ENOTSUP)); 1415268075Sdelphij 1416274337Sdelphij /* 1417274337Sdelphij * The receiving code doesn't know how to translate large blocks 1418274337Sdelphij * to smaller ones, so the pool must have the LARGE_BLOCKS 1419274337Sdelphij * feature enabled if the stream has LARGE_BLOCKS. 1420274337Sdelphij */ 1421274337Sdelphij if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 1422274337Sdelphij !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) 1423274337Sdelphij return (SET_ERROR(ENOTSUP)); 1424274337Sdelphij 1425248571Smm error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1426248571Smm if (error == 0) { 1427248571Smm /* target fs already exists; recv into temp clone */ 1428248571Smm 1429248571Smm /* Can't recv a clone into an existing fs */ 1430294815Smav if (flags & DRR_FLAG_CLONE || drba->drba_origin) { 1431248571Smm dsl_dataset_rele(ds, FTAG); 1432249195Smm return (SET_ERROR(EINVAL)); 1433248571Smm } 1434248571Smm 1435248571Smm error = recv_begin_check_existing_impl(drba, ds, fromguid); 1436248571Smm dsl_dataset_rele(ds, FTAG); 1437248571Smm } else if (error == ENOENT) { 1438248571Smm /* target fs does not exist; must be a full backup or clone */ 1439307108Smav char buf[ZFS_MAX_DATASET_NAME_LEN]; 1440248571Smm 1441248571Smm /* 1442248571Smm * If it's a non-clone incremental, we are missing the 1443248571Smm * target fs, so fail the recv. 1444248571Smm */ 1445286705Smav if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || 1446286705Smav drba->drba_origin)) 1447249195Smm return (SET_ERROR(ENOENT)); 1448248571Smm 1449294815Smav /* 1450294815Smav * If we're receiving a full send as a clone, and it doesn't 1451294815Smav * contain all the necessary free records and freeobject 1452294815Smav * records, reject it. 1453294815Smav */ 1454294815Smav if (fromguid == 0 && drba->drba_origin && 1455294815Smav !(flags & DRR_FLAG_FREERECORDS)) 1456294815Smav return (SET_ERROR(EINVAL)); 1457294815Smav 1458248571Smm /* Open the parent of tofs */ 1459307108Smav ASSERT3U(strlen(tofs), <, sizeof (buf)); 1460248571Smm (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); 1461248571Smm error = dsl_dataset_hold(dp, buf, FTAG, &ds); 1462248571Smm if (error != 0) 1463248571Smm return (error); 1464248571Smm 1465264835Sdelphij /* 1466264835Sdelphij * Check filesystem and snapshot limits before receiving. We'll 1467264835Sdelphij * recheck snapshot limits again at the end (we create the 1468264835Sdelphij * filesystems and increment those counts during begin_sync). 1469264835Sdelphij */ 1470264835Sdelphij error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1471264835Sdelphij ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); 1472264835Sdelphij if (error != 0) { 1473264835Sdelphij dsl_dataset_rele(ds, FTAG); 1474264835Sdelphij return (error); 1475264835Sdelphij } 1476264835Sdelphij 1477264835Sdelphij error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1478264835Sdelphij ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); 1479264835Sdelphij if (error != 0) { 1480264835Sdelphij dsl_dataset_rele(ds, FTAG); 1481264835Sdelphij return (error); 1482264835Sdelphij } 1483264835Sdelphij 1484248571Smm if (drba->drba_origin != NULL) { 1485248571Smm dsl_dataset_t *origin; 1486248571Smm error = dsl_dataset_hold(dp, drba->drba_origin, 1487248571Smm FTAG, &origin); 1488248571Smm if (error != 0) { 1489248571Smm dsl_dataset_rele(ds, FTAG); 1490248571Smm return (error); 1491248571Smm } 1492286575Smav if (!origin->ds_is_snapshot) { 1493248571Smm dsl_dataset_rele(origin, FTAG); 1494248571Smm dsl_dataset_rele(ds, FTAG); 1495249195Smm return (SET_ERROR(EINVAL)); 1496248571Smm } 1497294815Smav if (dsl_dataset_phys(origin)->ds_guid != fromguid && 1498294815Smav fromguid != 0) { 1499248571Smm dsl_dataset_rele(origin, FTAG); 1500248571Smm dsl_dataset_rele(ds, FTAG); 1501249195Smm return (SET_ERROR(ENODEV)); 1502248571Smm } 1503248571Smm dsl_dataset_rele(origin, FTAG); 1504248571Smm } 1505248571Smm dsl_dataset_rele(ds, FTAG); 1506248571Smm error = 0; 1507248571Smm } 1508248571Smm return (error); 1509248571Smm} 1510248571Smm 1511168404Spjdstatic void 1512248571Smmdmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 1513168404Spjd{ 1514248571Smm dmu_recv_begin_arg_t *drba = arg; 1515248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 1516289362Smav objset_t *mos = dp->dp_meta_objset; 1517248571Smm struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1518248571Smm const char *tofs = drba->drba_cookie->drc_tofs; 1519248571Smm dsl_dataset_t *ds, *newds; 1520185029Spjd uint64_t dsobj; 1521248571Smm int error; 1522289362Smav uint64_t crflags = 0; 1523168404Spjd 1524289362Smav if (drrb->drr_flags & DRR_FLAG_CI_DATA) 1525289362Smav crflags |= DS_FLAG_CI_DATASET; 1526168404Spjd 1527248571Smm error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1528248571Smm if (error == 0) { 1529248571Smm /* create temporary clone */ 1530253820Sdelphij dsl_dataset_t *snap = NULL; 1531253820Sdelphij if (drba->drba_snapobj != 0) { 1532253820Sdelphij VERIFY0(dsl_dataset_hold_obj(dp, 1533253820Sdelphij drba->drba_snapobj, FTAG, &snap)); 1534253820Sdelphij } 1535248571Smm dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, 1536253820Sdelphij snap, crflags, drba->drba_cred, tx); 1537282632Savg if (drba->drba_snapobj != 0) 1538282632Savg dsl_dataset_rele(snap, FTAG); 1539248571Smm dsl_dataset_rele(ds, FTAG); 1540248571Smm } else { 1541248571Smm dsl_dir_t *dd; 1542248571Smm const char *tail; 1543248571Smm dsl_dataset_t *origin = NULL; 1544248571Smm 1545248571Smm VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); 1546248571Smm 1547248571Smm if (drba->drba_origin != NULL) { 1548248571Smm VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, 1549248571Smm FTAG, &origin)); 1550248571Smm } 1551248571Smm 1552248571Smm /* Create new dataset. */ 1553248571Smm dsobj = dsl_dataset_create_sync(dd, 1554248571Smm strrchr(tofs, '/') + 1, 1555248571Smm origin, crflags, drba->drba_cred, tx); 1556248571Smm if (origin != NULL) 1557248571Smm dsl_dataset_rele(origin, FTAG); 1558248571Smm dsl_dir_rele(dd, FTAG); 1559248571Smm drba->drba_cookie->drc_newfs = B_TRUE; 1560248571Smm } 1561248571Smm VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); 1562248571Smm 1563289362Smav if (drba->drba_cookie->drc_resumable) { 1564289362Smav dsl_dataset_zapify(newds, tx); 1565289362Smav if (drrb->drr_fromguid != 0) { 1566289362Smav VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, 1567289362Smav 8, 1, &drrb->drr_fromguid, tx)); 1568289362Smav } 1569289362Smav VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, 1570289362Smav 8, 1, &drrb->drr_toguid, tx)); 1571289362Smav VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, 1572289362Smav 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); 1573289362Smav uint64_t one = 1; 1574289362Smav uint64_t zero = 0; 1575289362Smav VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, 1576289362Smav 8, 1, &one, tx)); 1577289362Smav VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, 1578289362Smav 8, 1, &zero, tx)); 1579289362Smav VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, 1580289362Smav 8, 1, &zero, tx)); 1581289362Smav if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & 1582321535Smav DMU_BACKUP_FEATURE_LARGE_BLOCKS) { 1583321535Smav VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, 1584321535Smav 8, 1, &one, tx)); 1585321535Smav } 1586321535Smav if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & 1587289362Smav DMU_BACKUP_FEATURE_EMBED_DATA) { 1588289362Smav VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, 1589289362Smav 8, 1, &one, tx)); 1590289362Smav } 1591321535Smav if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & 1592321535Smav DMU_BACKUP_FEATURE_COMPRESSED) { 1593321535Smav VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, 1594321535Smav 8, 1, &one, tx)); 1595321535Smav } 1596289362Smav } 1597289362Smav 1598248571Smm dmu_buf_will_dirty(newds->ds_dbuf, tx); 1599275782Sdelphij dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; 1600248571Smm 1601219089Spjd /* 1602219089Spjd * If we actually created a non-clone, we need to create the 1603219089Spjd * objset in our new dataset. 1604219089Spjd */ 1605308082Smav rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); 1606248571Smm if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { 1607219089Spjd (void) dmu_objset_create_impl(dp->dp_spa, 1608248571Smm newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); 1609219089Spjd } 1610308082Smav rrw_exit(&newds->ds_bp_rwlock, FTAG); 1611168404Spjd 1612248571Smm drba->drba_cookie->drc_ds = newds; 1613185029Spjd 1614248571Smm spa_history_log_internal_ds(newds, "receive", tx, ""); 1615168404Spjd} 1616168404Spjd 1617289362Smavstatic int 1618289362Smavdmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) 1619289362Smav{ 1620289362Smav dmu_recv_begin_arg_t *drba = arg; 1621289362Smav dsl_pool_t *dp = dmu_tx_pool(tx); 1622289362Smav struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1623289362Smav int error; 1624289362Smav uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 1625289362Smav dsl_dataset_t *ds; 1626289362Smav const char *tofs = drba->drba_cookie->drc_tofs; 1627289362Smav 1628289362Smav /* already checked */ 1629289362Smav ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); 1630289362Smav ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); 1631289362Smav 1632289362Smav if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == 1633289362Smav DMU_COMPOUNDSTREAM || 1634289362Smav drrb->drr_type >= DMU_OST_NUMTYPES) 1635289362Smav return (SET_ERROR(EINVAL)); 1636289362Smav 1637289362Smav /* Verify pool version supports SA if SA_SPILL feature set */ 1638289362Smav if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 1639289362Smav spa_version(dp->dp_spa) < SPA_VERSION_SA) 1640289362Smav return (SET_ERROR(ENOTSUP)); 1641289362Smav 1642289362Smav /* 1643289362Smav * The receiving code doesn't know how to translate a WRITE_EMBEDDED 1644289362Smav * record to a plain WRITE record, so the pool must have the 1645289362Smav * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED 1646289362Smav * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 1647289362Smav */ 1648289362Smav if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && 1649289362Smav !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) 1650289362Smav return (SET_ERROR(ENOTSUP)); 1651321535Smav if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && 1652289362Smav !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 1653289362Smav return (SET_ERROR(ENOTSUP)); 1654289362Smav 1655307108Smav /* 6 extra bytes for /%recv */ 1656307108Smav char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; 1657289362Smav 1658289362Smav (void) snprintf(recvname, sizeof (recvname), "%s/%s", 1659289362Smav tofs, recv_clone_name); 1660289362Smav 1661289362Smav if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { 1662289362Smav /* %recv does not exist; continue in tofs */ 1663289362Smav error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1664289362Smav if (error != 0) 1665289362Smav return (error); 1666289362Smav } 1667289362Smav 1668289362Smav /* check that ds is marked inconsistent */ 1669289362Smav if (!DS_IS_INCONSISTENT(ds)) { 1670289362Smav dsl_dataset_rele(ds, FTAG); 1671289362Smav return (SET_ERROR(EINVAL)); 1672289362Smav } 1673289362Smav 1674289362Smav /* check that there is resuming data, and that the toguid matches */ 1675289362Smav if (!dsl_dataset_is_zapified(ds)) { 1676289362Smav dsl_dataset_rele(ds, FTAG); 1677289362Smav return (SET_ERROR(EINVAL)); 1678289362Smav } 1679289362Smav uint64_t val; 1680289362Smav error = zap_lookup(dp->dp_meta_objset, ds->ds_object, 1681289362Smav DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); 1682289362Smav if (error != 0 || drrb->drr_toguid != val) { 1683289362Smav dsl_dataset_rele(ds, FTAG); 1684289362Smav return (SET_ERROR(EINVAL)); 1685289362Smav } 1686289362Smav 1687289362Smav /* 1688289362Smav * Check if the receive is still running. If so, it will be owned. 1689289362Smav * Note that nothing else can own the dataset (e.g. after the receive 1690289362Smav * fails) because it will be marked inconsistent. 1691289362Smav */ 1692289362Smav if (dsl_dataset_has_owner(ds)) { 1693289362Smav dsl_dataset_rele(ds, FTAG); 1694289362Smav return (SET_ERROR(EBUSY)); 1695289362Smav } 1696289362Smav 1697289362Smav /* There should not be any snapshots of this fs yet. */ 1698289362Smav if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { 1699289362Smav dsl_dataset_rele(ds, FTAG); 1700289362Smav return (SET_ERROR(EINVAL)); 1701289362Smav } 1702289362Smav 1703289362Smav /* 1704289362Smav * Note: resume point will be checked when we process the first WRITE 1705289362Smav * record. 1706289362Smav */ 1707289362Smav 1708289362Smav /* check that the origin matches */ 1709289362Smav val = 0; 1710289362Smav (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, 1711289362Smav DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); 1712289362Smav if (drrb->drr_fromguid != val) { 1713289362Smav dsl_dataset_rele(ds, FTAG); 1714289362Smav return (SET_ERROR(EINVAL)); 1715289362Smav } 1716289362Smav 1717289362Smav dsl_dataset_rele(ds, FTAG); 1718289362Smav return (0); 1719289362Smav} 1720289362Smav 1721289362Smavstatic void 1722289362Smavdmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) 1723289362Smav{ 1724289362Smav dmu_recv_begin_arg_t *drba = arg; 1725289362Smav dsl_pool_t *dp = dmu_tx_pool(tx); 1726289362Smav const char *tofs = drba->drba_cookie->drc_tofs; 1727289362Smav dsl_dataset_t *ds; 1728289362Smav uint64_t dsobj; 1729307108Smav /* 6 extra bytes for /%recv */ 1730307108Smav char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; 1731289362Smav 1732289362Smav (void) snprintf(recvname, sizeof (recvname), "%s/%s", 1733289362Smav tofs, recv_clone_name); 1734289362Smav 1735289362Smav if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { 1736289362Smav /* %recv does not exist; continue in tofs */ 1737289362Smav VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); 1738289362Smav drba->drba_cookie->drc_newfs = B_TRUE; 1739289362Smav } 1740289362Smav 1741289362Smav /* clear the inconsistent flag so that we can own it */ 1742289362Smav ASSERT(DS_IS_INCONSISTENT(ds)); 1743289362Smav dmu_buf_will_dirty(ds->ds_dbuf, tx); 1744289362Smav dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; 1745289362Smav dsobj = ds->ds_object; 1746289362Smav dsl_dataset_rele(ds, FTAG); 1747289362Smav 1748289362Smav VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); 1749289362Smav 1750289362Smav dmu_buf_will_dirty(ds->ds_dbuf, tx); 1751289362Smav dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; 1752289362Smav 1753308082Smav rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); 1754289362Smav ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); 1755308082Smav rrw_exit(&ds->ds_bp_rwlock, FTAG); 1756289362Smav 1757289362Smav drba->drba_cookie->drc_ds = ds; 1758289362Smav 1759289362Smav spa_history_log_internal_ds(ds, "resume receive", tx, ""); 1760289362Smav} 1761289362Smav 1762185029Spjd/* 1763185029Spjd * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 1764185029Spjd * succeeds; otherwise we will leak the holds on the datasets. 1765185029Spjd */ 1766185029Spjdint 1767289362Smavdmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, 1768289362Smav boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) 1769168404Spjd{ 1770248571Smm dmu_recv_begin_arg_t drba = { 0 }; 1771168404Spjd 1772185029Spjd bzero(drc, sizeof (dmu_recv_cookie_t)); 1773289362Smav drc->drc_drr_begin = drr_begin; 1774289362Smav drc->drc_drrb = &drr_begin->drr_u.drr_begin; 1775185029Spjd drc->drc_tosnap = tosnap; 1776248571Smm drc->drc_tofs = tofs; 1777185029Spjd drc->drc_force = force; 1778289362Smav drc->drc_resumable = resumable; 1779264835Sdelphij drc->drc_cred = CRED(); 1780168404Spjd 1781289362Smav if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { 1782248571Smm drc->drc_byteswap = B_TRUE; 1783321610Smav (void) fletcher_4_incremental_byteswap(drr_begin, 1784248571Smm sizeof (dmu_replay_record_t), &drc->drc_cksum); 1785289362Smav byteswap_record(drr_begin); 1786289362Smav } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { 1787321610Smav (void) fletcher_4_incremental_native(drr_begin, 1788289362Smav sizeof (dmu_replay_record_t), &drc->drc_cksum); 1789248571Smm } else { 1790289362Smav return (SET_ERROR(EINVAL)); 1791248571Smm } 1792219089Spjd 1793248571Smm drba.drba_origin = origin; 1794248571Smm drba.drba_cookie = drc; 1795248571Smm drba.drba_cred = CRED(); 1796219089Spjd 1797289362Smav if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & 1798289362Smav DMU_BACKUP_FEATURE_RESUMING) { 1799289362Smav return (dsl_sync_task(tofs, 1800289362Smav dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, 1801289362Smav &drba, 5, ZFS_SPACE_CHECK_NORMAL)); 1802289362Smav } else { 1803289362Smav return (dsl_sync_task(tofs, 1804289362Smav dmu_recv_begin_check, dmu_recv_begin_sync, 1805289362Smav &drba, 5, ZFS_SPACE_CHECK_NORMAL)); 1806289362Smav } 1807168404Spjd} 1808168404Spjd 1809286705Smavstruct receive_record_arg { 1810286705Smav dmu_replay_record_t header; 1811286705Smav void *payload; /* Pointer to a buffer containing the payload */ 1812286705Smav /* 1813286705Smav * If the record is a write, pointer to the arc_buf_t containing the 1814286705Smav * payload. 1815286705Smav */ 1816286705Smav arc_buf_t *write_buf; 1817286705Smav int payload_size; 1818289362Smav uint64_t bytes_read; /* bytes read from stream when record created */ 1819286705Smav boolean_t eos_marker; /* Marks the end of the stream */ 1820286705Smav bqueue_node_t node; 1821286705Smav}; 1822286705Smav 1823286705Smavstruct receive_writer_arg { 1824286587Smav objset_t *os; 1825286705Smav boolean_t byteswap; 1826286705Smav bqueue_t q; 1827289362Smav 1828286705Smav /* 1829286705Smav * These three args are used to signal to the main thread that we're 1830286705Smav * done. 1831286705Smav */ 1832286705Smav kmutex_t mutex; 1833286705Smav kcondvar_t cv; 1834286705Smav boolean_t done; 1835289362Smav 1836185029Spjd int err; 1837286705Smav /* A map from guid to dataset to help handle dedup'd streams. */ 1838286705Smav avl_tree_t *guid_to_ds_map; 1839289362Smav boolean_t resumable; 1840289362Smav uint64_t last_object, last_offset; 1841289362Smav uint64_t bytes_read; /* bytes read when current record created */ 1842286705Smav}; 1843286705Smav 1844294815Smavstruct objlist { 1845294815Smav list_t list; /* List of struct receive_objnode. */ 1846294815Smav /* 1847294815Smav * Last object looked up. Used to assert that objects are being looked 1848294815Smav * up in ascending order. 1849294815Smav */ 1850294815Smav uint64_t last_lookup; 1851294815Smav}; 1852294815Smav 1853294815Smavstruct receive_objnode { 1854294815Smav list_node_t node; 1855294815Smav uint64_t object; 1856294815Smav}; 1857294815Smav 1858321535Smavstruct receive_arg { 1859286705Smav objset_t *os; 1860185029Spjd kthread_t *td; 1861185029Spjd struct file *fp; 1862286705Smav uint64_t voff; /* The current offset in the stream */ 1863289362Smav uint64_t bytes_read; 1864286705Smav /* 1865286705Smav * A record that has had its payload read in, but hasn't yet been handed 1866286705Smav * off to the worker thread. 1867286705Smav */ 1868286705Smav struct receive_record_arg *rrd; 1869286705Smav /* A record that has had its header read in, but not its payload. */ 1870286705Smav struct receive_record_arg *next_rrd; 1871185029Spjd zio_cksum_t cksum; 1872286587Smav zio_cksum_t prev_cksum; 1873286705Smav int err; 1874286705Smav boolean_t byteswap; 1875286705Smav /* Sorted list of objects not to issue prefetches for. */ 1876294815Smav struct objlist ignore_objlist; 1877286705Smav}; 1878286587Smav 1879219089Spjdtypedef struct guid_map_entry { 1880219089Spjd uint64_t guid; 1881219089Spjd dsl_dataset_t *gme_ds; 1882219089Spjd avl_node_t avlnode; 1883219089Spjd} guid_map_entry_t; 1884219089Spjd 1885168404Spjdstatic int 1886219089Spjdguid_compare(const void *arg1, const void *arg2) 1887168404Spjd{ 1888219089Spjd const guid_map_entry_t *gmep1 = arg1; 1889219089Spjd const guid_map_entry_t *gmep2 = arg2; 1890219089Spjd 1891219089Spjd if (gmep1->guid < gmep2->guid) 1892219089Spjd return (-1); 1893219089Spjd else if (gmep1->guid > gmep2->guid) 1894219089Spjd return (1); 1895219089Spjd return (0); 1896219089Spjd} 1897219089Spjd 1898219089Spjdstatic void 1899219089Spjdfree_guid_map_onexit(void *arg) 1900219089Spjd{ 1901219089Spjd avl_tree_t *ca = arg; 1902219089Spjd void *cookie = NULL; 1903219089Spjd guid_map_entry_t *gmep; 1904219089Spjd 1905219089Spjd while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 1906248571Smm dsl_dataset_long_rele(gmep->gme_ds, gmep); 1907249196Smm dsl_dataset_rele(gmep->gme_ds, gmep); 1908219089Spjd kmem_free(gmep, sizeof (guid_map_entry_t)); 1909219089Spjd } 1910219089Spjd avl_destroy(ca); 1911219089Spjd kmem_free(ca, sizeof (avl_tree_t)); 1912219089Spjd} 1913219089Spjd 1914219089Spjdstatic int 1915286705Smavrestore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid) 1916219089Spjd{ 1917168404Spjd struct uio auio; 1918168404Spjd struct iovec aiov; 1919168404Spjd int error; 1920168404Spjd 1921168404Spjd aiov.iov_base = buf; 1922168404Spjd aiov.iov_len = len; 1923168404Spjd auio.uio_iov = &aiov; 1924168404Spjd auio.uio_iovcnt = 1; 1925168404Spjd auio.uio_resid = len; 1926169170Spjd auio.uio_segflg = UIO_SYSSPACE; 1927168404Spjd auio.uio_rw = UIO_READ; 1928168404Spjd auio.uio_offset = off; 1929168404Spjd auio.uio_td = ra->td; 1930168404Spjd#ifdef _KERNEL 1931168404Spjd error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); 1932168404Spjd#else 1933168404Spjd fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 1934168404Spjd error = EOPNOTSUPP; 1935168404Spjd#endif 1936168404Spjd *resid = auio.uio_resid; 1937168404Spjd return (error); 1938168404Spjd} 1939168404Spjd 1940286587Smavstatic int 1941286705Smavreceive_read(struct receive_arg *ra, int len, void *buf) 1942168404Spjd{ 1943185029Spjd int done = 0; 1944168404Spjd 1945297509Smav /* 1946297509Smav * The code doesn't rely on this (lengths being multiples of 8). See 1947297509Smav * comment in dump_bytes. 1948297509Smav */ 1949240415Smm ASSERT0(len % 8); 1950168404Spjd 1951185029Spjd while (done < len) { 1952219089Spjd ssize_t resid; 1953168404Spjd 1954272601Sdelphij ra->err = restore_bytes(ra, buf + done, 1955185029Spjd len - done, ra->voff, &resid); 1956168404Spjd 1957289362Smav if (resid == len - done) { 1958289362Smav /* 1959289362Smav * Note: ECKSUM indicates that the receive 1960289362Smav * was interrupted and can potentially be resumed. 1961289362Smav */ 1962289362Smav ra->err = SET_ERROR(ECKSUM); 1963289362Smav } 1964185029Spjd ra->voff += len - done - resid; 1965185029Spjd done = len - resid; 1966248571Smm if (ra->err != 0) 1967286587Smav return (ra->err); 1968168404Spjd } 1969168404Spjd 1970289362Smav ra->bytes_read += len; 1971289362Smav 1972185029Spjd ASSERT3U(done, ==, len); 1973286587Smav return (0); 1974168404Spjd} 1975168404Spjd 1976168404Spjdstatic void 1977286587Smavbyteswap_record(dmu_replay_record_t *drr) 1978168404Spjd{ 1979168404Spjd#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 1980168404Spjd#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 1981168404Spjd drr->drr_type = BSWAP_32(drr->drr_type); 1982185029Spjd drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 1983286587Smav 1984168404Spjd switch (drr->drr_type) { 1985168404Spjd case DRR_BEGIN: 1986168404Spjd DO64(drr_begin.drr_magic); 1987219089Spjd DO64(drr_begin.drr_versioninfo); 1988168404Spjd DO64(drr_begin.drr_creation_time); 1989168404Spjd DO32(drr_begin.drr_type); 1990185029Spjd DO32(drr_begin.drr_flags); 1991168404Spjd DO64(drr_begin.drr_toguid); 1992168404Spjd DO64(drr_begin.drr_fromguid); 1993168404Spjd break; 1994168404Spjd case DRR_OBJECT: 1995168404Spjd DO64(drr_object.drr_object); 1996168404Spjd DO32(drr_object.drr_type); 1997168404Spjd DO32(drr_object.drr_bonustype); 1998168404Spjd DO32(drr_object.drr_blksz); 1999168404Spjd DO32(drr_object.drr_bonuslen); 2000219089Spjd DO64(drr_object.drr_toguid); 2001168404Spjd break; 2002168404Spjd case DRR_FREEOBJECTS: 2003168404Spjd DO64(drr_freeobjects.drr_firstobj); 2004168404Spjd DO64(drr_freeobjects.drr_numobjs); 2005219089Spjd DO64(drr_freeobjects.drr_toguid); 2006168404Spjd break; 2007168404Spjd case DRR_WRITE: 2008168404Spjd DO64(drr_write.drr_object); 2009168404Spjd DO32(drr_write.drr_type); 2010168404Spjd DO64(drr_write.drr_offset); 2011321535Smav DO64(drr_write.drr_logical_size); 2012219089Spjd DO64(drr_write.drr_toguid); 2013286587Smav ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); 2014219089Spjd DO64(drr_write.drr_key.ddk_prop); 2015321535Smav DO64(drr_write.drr_compressed_size); 2016168404Spjd break; 2017219089Spjd case DRR_WRITE_BYREF: 2018219089Spjd DO64(drr_write_byref.drr_object); 2019219089Spjd DO64(drr_write_byref.drr_offset); 2020219089Spjd DO64(drr_write_byref.drr_length); 2021219089Spjd DO64(drr_write_byref.drr_toguid); 2022219089Spjd DO64(drr_write_byref.drr_refguid); 2023219089Spjd DO64(drr_write_byref.drr_refobject); 2024219089Spjd DO64(drr_write_byref.drr_refoffset); 2025286587Smav ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. 2026286587Smav drr_key.ddk_cksum); 2027219089Spjd DO64(drr_write_byref.drr_key.ddk_prop); 2028219089Spjd break; 2029268075Sdelphij case DRR_WRITE_EMBEDDED: 2030268075Sdelphij DO64(drr_write_embedded.drr_object); 2031268075Sdelphij DO64(drr_write_embedded.drr_offset); 2032268075Sdelphij DO64(drr_write_embedded.drr_length); 2033268075Sdelphij DO64(drr_write_embedded.drr_toguid); 2034268075Sdelphij DO32(drr_write_embedded.drr_lsize); 2035268075Sdelphij DO32(drr_write_embedded.drr_psize); 2036268075Sdelphij break; 2037168404Spjd case DRR_FREE: 2038168404Spjd DO64(drr_free.drr_object); 2039168404Spjd DO64(drr_free.drr_offset); 2040168404Spjd DO64(drr_free.drr_length); 2041219089Spjd DO64(drr_free.drr_toguid); 2042168404Spjd break; 2043219089Spjd case DRR_SPILL: 2044219089Spjd DO64(drr_spill.drr_object); 2045219089Spjd DO64(drr_spill.drr_length); 2046219089Spjd DO64(drr_spill.drr_toguid); 2047219089Spjd break; 2048168404Spjd case DRR_END: 2049219089Spjd DO64(drr_end.drr_toguid); 2050286587Smav ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); 2051168404Spjd break; 2052168404Spjd } 2053286587Smav 2054286587Smav if (drr->drr_type != DRR_BEGIN) { 2055286587Smav ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); 2056286587Smav } 2057286587Smav 2058168404Spjd#undef DO64 2059168404Spjd#undef DO32 2060168404Spjd} 2061168404Spjd 2062272810Sdelphijstatic inline uint8_t 2063272810Sdelphijdeduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) 2064272810Sdelphij{ 2065272810Sdelphij if (bonus_type == DMU_OT_SA) { 2066272810Sdelphij return (1); 2067272810Sdelphij } else { 2068272810Sdelphij return (1 + 2069272810Sdelphij ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); 2070272810Sdelphij } 2071272810Sdelphij} 2072272810Sdelphij 2073289362Smavstatic void 2074289362Smavsave_resume_state(struct receive_writer_arg *rwa, 2075289362Smav uint64_t object, uint64_t offset, dmu_tx_t *tx) 2076289362Smav{ 2077289362Smav int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 2078289362Smav 2079289362Smav if (!rwa->resumable) 2080289362Smav return; 2081289362Smav 2082289362Smav /* 2083289362Smav * We use ds_resume_bytes[] != 0 to indicate that we need to 2084289362Smav * update this on disk, so it must not be 0. 2085289362Smav */ 2086289362Smav ASSERT(rwa->bytes_read != 0); 2087289362Smav 2088289362Smav /* 2089289362Smav * We only resume from write records, which have a valid 2090289362Smav * (non-meta-dnode) object number. 2091289362Smav */ 2092289362Smav ASSERT(object != 0); 2093289362Smav 2094289362Smav /* 2095289362Smav * For resuming to work correctly, we must receive records in order, 2096289362Smav * sorted by object,offset. This is checked by the callers, but 2097289362Smav * assert it here for good measure. 2098289362Smav */ 2099289362Smav ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); 2100289362Smav ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || 2101289362Smav offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); 2102289362Smav ASSERT3U(rwa->bytes_read, >=, 2103289362Smav rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); 2104289362Smav 2105289362Smav rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; 2106289362Smav rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; 2107289362Smav rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; 2108289362Smav} 2109289362Smav 2110168404Spjdstatic int 2111286705Smavreceive_object(struct receive_writer_arg *rwa, struct drr_object *drro, 2112286705Smav void *data) 2113168404Spjd{ 2114272810Sdelphij dmu_object_info_t doi; 2115168404Spjd dmu_tx_t *tx; 2116272810Sdelphij uint64_t object; 2117272810Sdelphij int err; 2118168404Spjd 2119168404Spjd if (drro->drr_type == DMU_OT_NONE || 2120236884Smm !DMU_OT_IS_VALID(drro->drr_type) || 2121236884Smm !DMU_OT_IS_VALID(drro->drr_bonustype) || 2122219089Spjd drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 2123168404Spjd drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 2124168404Spjd P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 2125168404Spjd drro->drr_blksz < SPA_MINBLOCKSIZE || 2126286705Smav drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || 2127168404Spjd drro->drr_bonuslen > DN_MAX_BONUSLEN) { 2128249195Smm return (SET_ERROR(EINVAL)); 2129168404Spjd } 2130168404Spjd 2131286705Smav err = dmu_object_info(rwa->os, drro->drr_object, &doi); 2132168404Spjd 2133200726Sdelphij if (err != 0 && err != ENOENT) 2134249195Smm return (SET_ERROR(EINVAL)); 2135272810Sdelphij object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; 2136200726Sdelphij 2137272810Sdelphij /* 2138272810Sdelphij * If we are losing blkptrs or changing the block size this must 2139272810Sdelphij * be a new file instance. We must clear out the previous file 2140272810Sdelphij * contents before we can change this type of metadata in the dnode. 2141272810Sdelphij */ 2142272810Sdelphij if (err == 0) { 2143272810Sdelphij int nblkptr; 2144272810Sdelphij 2145272810Sdelphij nblkptr = deduce_nblkptr(drro->drr_bonustype, 2146272810Sdelphij drro->drr_bonuslen); 2147272810Sdelphij 2148272810Sdelphij if (drro->drr_blksz != doi.doi_data_block_size || 2149272810Sdelphij nblkptr < doi.doi_nblkptr) { 2150286705Smav err = dmu_free_long_range(rwa->os, drro->drr_object, 2151272810Sdelphij 0, DMU_OBJECT_END); 2152272810Sdelphij if (err != 0) 2153272810Sdelphij return (SET_ERROR(EINVAL)); 2154272810Sdelphij } 2155272810Sdelphij } 2156272810Sdelphij 2157286705Smav tx = dmu_tx_create(rwa->os); 2158272810Sdelphij dmu_tx_hold_bonus(tx, object); 2159272810Sdelphij err = dmu_tx_assign(tx, TXG_WAIT); 2160272810Sdelphij if (err != 0) { 2161272810Sdelphij dmu_tx_abort(tx); 2162272810Sdelphij return (err); 2163272810Sdelphij } 2164272810Sdelphij 2165272810Sdelphij if (object == DMU_NEW_OBJECT) { 2166168404Spjd /* currently free, want to be allocated */ 2167286705Smav err = dmu_object_claim(rwa->os, drro->drr_object, 2168168404Spjd drro->drr_type, drro->drr_blksz, 2169168404Spjd drro->drr_bonustype, drro->drr_bonuslen, tx); 2170272810Sdelphij } else if (drro->drr_type != doi.doi_type || 2171272810Sdelphij drro->drr_blksz != doi.doi_data_block_size || 2172272810Sdelphij drro->drr_bonustype != doi.doi_bonus_type || 2173272810Sdelphij drro->drr_bonuslen != doi.doi_bonus_size) { 2174272810Sdelphij /* currently allocated, but with different properties */ 2175286705Smav err = dmu_object_reclaim(rwa->os, drro->drr_object, 2176168404Spjd drro->drr_type, drro->drr_blksz, 2177272810Sdelphij drro->drr_bonustype, drro->drr_bonuslen, tx); 2178168404Spjd } 2179248571Smm if (err != 0) { 2180272810Sdelphij dmu_tx_commit(tx); 2181249195Smm return (SET_ERROR(EINVAL)); 2182219089Spjd } 2183200726Sdelphij 2184286705Smav dmu_object_set_checksum(rwa->os, drro->drr_object, 2185286587Smav drro->drr_checksumtype, tx); 2186286705Smav dmu_object_set_compress(rwa->os, drro->drr_object, 2187286587Smav drro->drr_compress, tx); 2188168404Spjd 2189200727Sdelphij if (data != NULL) { 2190168404Spjd dmu_buf_t *db; 2191200727Sdelphij 2192286705Smav VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); 2193168404Spjd dmu_buf_will_dirty(db, tx); 2194168404Spjd 2195185029Spjd ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 2196185029Spjd bcopy(data, db->db_data, drro->drr_bonuslen); 2197286705Smav if (rwa->byteswap) { 2198236884Smm dmu_object_byteswap_t byteswap = 2199236884Smm DMU_OT_BYTESWAP(drro->drr_bonustype); 2200236884Smm dmu_ot_byteswap[byteswap].ob_func(db->db_data, 2201168404Spjd drro->drr_bonuslen); 2202168404Spjd } 2203168404Spjd dmu_buf_rele(db, FTAG); 2204168404Spjd } 2205168404Spjd dmu_tx_commit(tx); 2206289362Smav 2207168404Spjd return (0); 2208168404Spjd} 2209168404Spjd 2210168404Spjd/* ARGSUSED */ 2211168404Spjdstatic int 2212286705Smavreceive_freeobjects(struct receive_writer_arg *rwa, 2213168404Spjd struct drr_freeobjects *drrfo) 2214168404Spjd{ 2215168404Spjd uint64_t obj; 2216294815Smav int next_err = 0; 2217168404Spjd 2218168404Spjd if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 2219249195Smm return (SET_ERROR(EINVAL)); 2220168404Spjd 2221168404Spjd for (obj = drrfo->drr_firstobj; 2222294815Smav obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; 2223294815Smav next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { 2224168404Spjd int err; 2225168404Spjd 2226286705Smav if (dmu_object_info(rwa->os, obj, NULL) != 0) 2227168404Spjd continue; 2228168404Spjd 2229286705Smav err = dmu_free_long_object(rwa->os, obj); 2230248571Smm if (err != 0) 2231168404Spjd return (err); 2232168404Spjd } 2233294815Smav if (next_err != ESRCH) 2234294815Smav return (next_err); 2235168404Spjd return (0); 2236168404Spjd} 2237168404Spjd 2238168404Spjdstatic int 2239286705Smavreceive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, 2240286705Smav arc_buf_t *abuf) 2241168404Spjd{ 2242168404Spjd dmu_tx_t *tx; 2243168404Spjd int err; 2244168404Spjd 2245321535Smav if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || 2246236884Smm !DMU_OT_IS_VALID(drrw->drr_type)) 2247249195Smm return (SET_ERROR(EINVAL)); 2248168404Spjd 2249289362Smav /* 2250289362Smav * For resuming to work, records must be in increasing order 2251289362Smav * by (object, offset). 2252289362Smav */ 2253289362Smav if (drrw->drr_object < rwa->last_object || 2254289362Smav (drrw->drr_object == rwa->last_object && 2255289362Smav drrw->drr_offset < rwa->last_offset)) { 2256289362Smav return (SET_ERROR(EINVAL)); 2257289362Smav } 2258289362Smav rwa->last_object = drrw->drr_object; 2259289362Smav rwa->last_offset = drrw->drr_offset; 2260289362Smav 2261286705Smav if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) 2262249195Smm return (SET_ERROR(EINVAL)); 2263168404Spjd 2264286705Smav tx = dmu_tx_create(rwa->os); 2265272601Sdelphij 2266168404Spjd dmu_tx_hold_write(tx, drrw->drr_object, 2267321535Smav drrw->drr_offset, drrw->drr_logical_size); 2268168404Spjd err = dmu_tx_assign(tx, TXG_WAIT); 2269248571Smm if (err != 0) { 2270168404Spjd dmu_tx_abort(tx); 2271168404Spjd return (err); 2272168404Spjd } 2273286705Smav if (rwa->byteswap) { 2274236884Smm dmu_object_byteswap_t byteswap = 2275236884Smm DMU_OT_BYTESWAP(drrw->drr_type); 2276286587Smav dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, 2277321535Smav DRR_WRITE_PAYLOAD_SIZE(drrw)); 2278236884Smm } 2279286587Smav 2280321535Smav /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ 2281286587Smav dmu_buf_t *bonus; 2282286705Smav if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) 2283286587Smav return (SET_ERROR(EINVAL)); 2284272601Sdelphij dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); 2285289362Smav 2286289362Smav /* 2287289362Smav * Note: If the receive fails, we want the resume stream to start 2288289362Smav * with the same record that we last successfully received (as opposed 2289289362Smav * to the next record), so that we can verify that we are 2290289362Smav * resuming from the correct location. 2291289362Smav */ 2292289362Smav save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); 2293168404Spjd dmu_tx_commit(tx); 2294272601Sdelphij dmu_buf_rele(bonus, FTAG); 2295289362Smav 2296168404Spjd return (0); 2297168404Spjd} 2298168404Spjd 2299219089Spjd/* 2300219089Spjd * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 2301219089Spjd * streams to refer to a copy of the data that is already on the 2302219089Spjd * system because it came in earlier in the stream. This function 2303219089Spjd * finds the earlier copy of the data, and uses that copy instead of 2304219089Spjd * data from the stream to fulfill this write. 2305219089Spjd */ 2306219089Spjdstatic int 2307286705Smavreceive_write_byref(struct receive_writer_arg *rwa, 2308286705Smav struct drr_write_byref *drrwbr) 2309219089Spjd{ 2310219089Spjd dmu_tx_t *tx; 2311219089Spjd int err; 2312219089Spjd guid_map_entry_t gmesrch; 2313219089Spjd guid_map_entry_t *gmep; 2314268075Sdelphij avl_index_t where; 2315219089Spjd objset_t *ref_os = NULL; 2316219089Spjd dmu_buf_t *dbp; 2317219089Spjd 2318219089Spjd if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 2319249195Smm return (SET_ERROR(EINVAL)); 2320219089Spjd 2321219089Spjd /* 2322219089Spjd * If the GUID of the referenced dataset is different from the 2323219089Spjd * GUID of the target dataset, find the referenced dataset. 2324219089Spjd */ 2325219089Spjd if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 2326219089Spjd gmesrch.guid = drrwbr->drr_refguid; 2327286705Smav if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, 2328219089Spjd &where)) == NULL) { 2329249195Smm return (SET_ERROR(EINVAL)); 2330219089Spjd } 2331219089Spjd if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 2332249195Smm return (SET_ERROR(EINVAL)); 2333219089Spjd } else { 2334286705Smav ref_os = rwa->os; 2335219089Spjd } 2336219089Spjd 2337268075Sdelphij err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 2338268075Sdelphij drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); 2339268075Sdelphij if (err != 0) 2340219089Spjd return (err); 2341219089Spjd 2342286705Smav tx = dmu_tx_create(rwa->os); 2343219089Spjd 2344219089Spjd dmu_tx_hold_write(tx, drrwbr->drr_object, 2345219089Spjd drrwbr->drr_offset, drrwbr->drr_length); 2346219089Spjd err = dmu_tx_assign(tx, TXG_WAIT); 2347248571Smm if (err != 0) { 2348219089Spjd dmu_tx_abort(tx); 2349219089Spjd return (err); 2350219089Spjd } 2351286705Smav dmu_write(rwa->os, drrwbr->drr_object, 2352219089Spjd drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 2353219089Spjd dmu_buf_rele(dbp, FTAG); 2354289362Smav 2355289362Smav /* See comment in restore_write. */ 2356289362Smav save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); 2357219089Spjd dmu_tx_commit(tx); 2358219089Spjd return (0); 2359219089Spjd} 2360219089Spjd 2361219089Spjdstatic int 2362286705Smavreceive_write_embedded(struct receive_writer_arg *rwa, 2363289362Smav struct drr_write_embedded *drrwe, void *data) 2364268075Sdelphij{ 2365268075Sdelphij dmu_tx_t *tx; 2366268075Sdelphij int err; 2367268075Sdelphij 2368289362Smav if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) 2369268075Sdelphij return (EINVAL); 2370268075Sdelphij 2371289362Smav if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) 2372268075Sdelphij return (EINVAL); 2373268075Sdelphij 2374289362Smav if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) 2375268075Sdelphij return (EINVAL); 2376289362Smav if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) 2377268075Sdelphij return (EINVAL); 2378268075Sdelphij 2379286705Smav tx = dmu_tx_create(rwa->os); 2380268075Sdelphij 2381289362Smav dmu_tx_hold_write(tx, drrwe->drr_object, 2382289362Smav drrwe->drr_offset, drrwe->drr_length); 2383268075Sdelphij err = dmu_tx_assign(tx, TXG_WAIT); 2384268075Sdelphij if (err != 0) { 2385268075Sdelphij dmu_tx_abort(tx); 2386268075Sdelphij return (err); 2387268075Sdelphij } 2388268075Sdelphij 2389289362Smav dmu_write_embedded(rwa->os, drrwe->drr_object, 2390289362Smav drrwe->drr_offset, data, drrwe->drr_etype, 2391289362Smav drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, 2392286705Smav rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); 2393268075Sdelphij 2394289362Smav /* See comment in restore_write. */ 2395289362Smav save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); 2396268075Sdelphij dmu_tx_commit(tx); 2397268075Sdelphij return (0); 2398268075Sdelphij} 2399268075Sdelphij 2400268075Sdelphijstatic int 2401286705Smavreceive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, 2402286705Smav void *data) 2403219089Spjd{ 2404219089Spjd dmu_tx_t *tx; 2405219089Spjd dmu_buf_t *db, *db_spill; 2406219089Spjd int err; 2407219089Spjd 2408219089Spjd if (drrs->drr_length < SPA_MINBLOCKSIZE || 2409286705Smav drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) 2410249195Smm return (SET_ERROR(EINVAL)); 2411219089Spjd 2412286705Smav if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) 2413249195Smm return (SET_ERROR(EINVAL)); 2414219089Spjd 2415286705Smav VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); 2416219089Spjd if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 2417219089Spjd dmu_buf_rele(db, FTAG); 2418219089Spjd return (err); 2419219089Spjd } 2420219089Spjd 2421286705Smav tx = dmu_tx_create(rwa->os); 2422219089Spjd 2423219089Spjd dmu_tx_hold_spill(tx, db->db_object); 2424219089Spjd 2425219089Spjd err = dmu_tx_assign(tx, TXG_WAIT); 2426248571Smm if (err != 0) { 2427219089Spjd dmu_buf_rele(db, FTAG); 2428219089Spjd dmu_buf_rele(db_spill, FTAG); 2429219089Spjd dmu_tx_abort(tx); 2430219089Spjd return (err); 2431219089Spjd } 2432219089Spjd dmu_buf_will_dirty(db_spill, tx); 2433219089Spjd 2434219089Spjd if (db_spill->db_size < drrs->drr_length) 2435219089Spjd VERIFY(0 == dbuf_spill_set_blksz(db_spill, 2436219089Spjd drrs->drr_length, tx)); 2437219089Spjd bcopy(data, db_spill->db_data, drrs->drr_length); 2438219089Spjd 2439219089Spjd dmu_buf_rele(db, FTAG); 2440219089Spjd dmu_buf_rele(db_spill, FTAG); 2441219089Spjd 2442219089Spjd dmu_tx_commit(tx); 2443219089Spjd return (0); 2444219089Spjd} 2445219089Spjd 2446168404Spjd/* ARGSUSED */ 2447168404Spjdstatic int 2448286705Smavreceive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) 2449168404Spjd{ 2450168404Spjd int err; 2451168404Spjd 2452168404Spjd if (drrf->drr_length != -1ULL && 2453168404Spjd drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 2454249195Smm return (SET_ERROR(EINVAL)); 2455168404Spjd 2456286705Smav if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) 2457249195Smm return (SET_ERROR(EINVAL)); 2458168404Spjd 2459286705Smav err = dmu_free_long_range(rwa->os, drrf->drr_object, 2460168404Spjd drrf->drr_offset, drrf->drr_length); 2461286705Smav 2462168404Spjd return (err); 2463168404Spjd} 2464168404Spjd 2465248571Smm/* used to destroy the drc_ds on error */ 2466248571Smmstatic void 2467248571Smmdmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) 2468248571Smm{ 2469289362Smav if (drc->drc_resumable) { 2470289362Smav /* wait for our resume state to be written to disk */ 2471289362Smav txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); 2472289362Smav dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 2473289362Smav } else { 2474307108Smav char name[ZFS_MAX_DATASET_NAME_LEN]; 2475289362Smav dsl_dataset_name(drc->drc_ds, name); 2476289362Smav dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 2477289362Smav (void) dsl_destroy_head(name); 2478289362Smav } 2479248571Smm} 2480248571Smm 2481286587Smavstatic void 2482286705Smavreceive_cksum(struct receive_arg *ra, int len, void *buf) 2483286587Smav{ 2484286587Smav if (ra->byteswap) { 2485321610Smav (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); 2486286587Smav } else { 2487321610Smav (void) fletcher_4_incremental_native(buf, len, &ra->cksum); 2488286587Smav } 2489286587Smav} 2490286587Smav 2491185029Spjd/* 2492286705Smav * Read the payload into a buffer of size len, and update the current record's 2493286705Smav * payload field. 2494286705Smav * Allocate ra->next_rrd and read the next record's header into 2495286705Smav * ra->next_rrd->header. 2496286587Smav * Verify checksum of payload and next record. 2497286587Smav */ 2498286587Smavstatic int 2499286705Smavreceive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) 2500286587Smav{ 2501286587Smav int err; 2502286587Smav 2503286587Smav if (len != 0) { 2504286705Smav ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); 2505289362Smav err = receive_read(ra, len, buf); 2506286587Smav if (err != 0) 2507286587Smav return (err); 2508289362Smav receive_cksum(ra, len, buf); 2509289362Smav 2510289362Smav /* note: rrd is NULL when reading the begin record's payload */ 2511289362Smav if (ra->rrd != NULL) { 2512289362Smav ra->rrd->payload = buf; 2513289362Smav ra->rrd->payload_size = len; 2514289362Smav ra->rrd->bytes_read = ra->bytes_read; 2515289362Smav } 2516286587Smav } 2517286587Smav 2518286587Smav ra->prev_cksum = ra->cksum; 2519286587Smav 2520286705Smav ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); 2521286705Smav err = receive_read(ra, sizeof (ra->next_rrd->header), 2522286705Smav &ra->next_rrd->header); 2523289362Smav ra->next_rrd->bytes_read = ra->bytes_read; 2524286705Smav if (err != 0) { 2525286705Smav kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); 2526286705Smav ra->next_rrd = NULL; 2527286587Smav return (err); 2528286705Smav } 2529286705Smav if (ra->next_rrd->header.drr_type == DRR_BEGIN) { 2530286705Smav kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); 2531286705Smav ra->next_rrd = NULL; 2532286587Smav return (SET_ERROR(EINVAL)); 2533286705Smav } 2534286587Smav 2535286587Smav /* 2536286587Smav * Note: checksum is of everything up to but not including the 2537286587Smav * checksum itself. 2538286587Smav */ 2539286587Smav ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 2540286587Smav ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 2541286705Smav receive_cksum(ra, 2542286587Smav offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 2543286705Smav &ra->next_rrd->header); 2544286587Smav 2545286705Smav zio_cksum_t cksum_orig = 2546286705Smav ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; 2547286705Smav zio_cksum_t *cksump = 2548286705Smav &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; 2549286587Smav 2550286587Smav if (ra->byteswap) 2551286705Smav byteswap_record(&ra->next_rrd->header); 2552286587Smav 2553286587Smav if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && 2554286705Smav !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { 2555286705Smav kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); 2556286705Smav ra->next_rrd = NULL; 2557286587Smav return (SET_ERROR(ECKSUM)); 2558286705Smav } 2559286587Smav 2560286705Smav receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); 2561286587Smav 2562286587Smav return (0); 2563286587Smav} 2564286587Smav 2565294815Smavstatic void 2566294815Smavobjlist_create(struct objlist *list) 2567294815Smav{ 2568294815Smav list_create(&list->list, sizeof (struct receive_objnode), 2569294815Smav offsetof(struct receive_objnode, node)); 2570294815Smav list->last_lookup = 0; 2571294815Smav} 2572294815Smav 2573294815Smavstatic void 2574294815Smavobjlist_destroy(struct objlist *list) 2575294815Smav{ 2576294815Smav for (struct receive_objnode *n = list_remove_head(&list->list); 2577294815Smav n != NULL; n = list_remove_head(&list->list)) { 2578294815Smav kmem_free(n, sizeof (*n)); 2579294815Smav } 2580294815Smav list_destroy(&list->list); 2581294815Smav} 2582294815Smav 2583286705Smav/* 2584294815Smav * This function looks through the objlist to see if the specified object number 2585294815Smav * is contained in the objlist. In the process, it will remove all object 2586294815Smav * numbers in the list that are smaller than the specified object number. Thus, 2587294815Smav * any lookup of an object number smaller than a previously looked up object 2588294815Smav * number will always return false; therefore, all lookups should be done in 2589294815Smav * ascending order. 2590294815Smav */ 2591294815Smavstatic boolean_t 2592294815Smavobjlist_exists(struct objlist *list, uint64_t object) 2593294815Smav{ 2594294815Smav struct receive_objnode *node = list_head(&list->list); 2595294815Smav ASSERT3U(object, >=, list->last_lookup); 2596294815Smav list->last_lookup = object; 2597294815Smav while (node != NULL && node->object < object) { 2598294815Smav VERIFY3P(node, ==, list_remove_head(&list->list)); 2599294815Smav kmem_free(node, sizeof (*node)); 2600294815Smav node = list_head(&list->list); 2601294815Smav } 2602294815Smav return (node != NULL && node->object == object); 2603294815Smav} 2604294815Smav 2605294815Smav/* 2606294815Smav * The objlist is a list of object numbers stored in ascending order. However, 2607294815Smav * the insertion of new object numbers does not seek out the correct location to 2608294815Smav * store a new object number; instead, it appends it to the list for simplicity. 2609294815Smav * Thus, any users must take care to only insert new object numbers in ascending 2610294815Smav * order. 2611294815Smav */ 2612294815Smavstatic void 2613294815Smavobjlist_insert(struct objlist *list, uint64_t object) 2614294815Smav{ 2615294815Smav struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); 2616294815Smav node->object = object; 2617294815Smav#ifdef ZFS_DEBUG 2618294815Smav struct receive_objnode *last_object = list_tail(&list->list); 2619294815Smav uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); 2620294815Smav ASSERT3U(node->object, >, last_objnum); 2621294815Smav#endif 2622294815Smav list_insert_tail(&list->list, node); 2623294815Smav} 2624294815Smav 2625294815Smav/* 2626286705Smav * Issue the prefetch reads for any necessary indirect blocks. 2627286705Smav * 2628286705Smav * We use the object ignore list to tell us whether or not to issue prefetches 2629286705Smav * for a given object. We do this for both correctness (in case the blocksize 2630286705Smav * of an object has changed) and performance (if the object doesn't exist, don't 2631286705Smav * needlessly try to issue prefetches). We also trim the list as we go through 2632286705Smav * the stream to prevent it from growing to an unbounded size. 2633286705Smav * 2634286705Smav * The object numbers within will always be in sorted order, and any write 2635286705Smav * records we see will also be in sorted order, but they're not sorted with 2636286705Smav * respect to each other (i.e. we can get several object records before 2637286705Smav * receiving each object's write records). As a result, once we've reached a 2638286705Smav * given object number, we can safely remove any reference to lower object 2639286705Smav * numbers in the ignore list. In practice, we receive up to 32 object records 2640286705Smav * before receiving write records, so the list can have up to 32 nodes in it. 2641286705Smav */ 2642286705Smav/* ARGSUSED */ 2643286705Smavstatic void 2644286705Smavreceive_read_prefetch(struct receive_arg *ra, 2645286705Smav uint64_t object, uint64_t offset, uint64_t length) 2646286705Smav{ 2647294815Smav if (!objlist_exists(&ra->ignore_objlist, object)) { 2648286705Smav dmu_prefetch(ra->os, object, 1, offset, length, 2649286705Smav ZIO_PRIORITY_SYNC_READ); 2650286705Smav } 2651286705Smav} 2652286705Smav 2653286705Smav/* 2654286705Smav * Read records off the stream, issuing any necessary prefetches. 2655286705Smav */ 2656286587Smavstatic int 2657286705Smavreceive_read_record(struct receive_arg *ra) 2658286587Smav{ 2659286587Smav int err; 2660286587Smav 2661286705Smav switch (ra->rrd->header.drr_type) { 2662286587Smav case DRR_OBJECT: 2663286587Smav { 2664286705Smav struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; 2665286705Smav uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); 2666286705Smav void *buf = kmem_zalloc(size, KM_SLEEP); 2667286705Smav dmu_object_info_t doi; 2668286705Smav err = receive_read_payload_and_next_header(ra, size, buf); 2669286705Smav if (err != 0) { 2670286705Smav kmem_free(buf, size); 2671286587Smav return (err); 2672286705Smav } 2673286705Smav err = dmu_object_info(ra->os, drro->drr_object, &doi); 2674286705Smav /* 2675286705Smav * See receive_read_prefetch for an explanation why we're 2676286705Smav * storing this object in the ignore_obj_list. 2677286705Smav */ 2678286705Smav if (err == ENOENT || 2679286705Smav (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { 2680294815Smav objlist_insert(&ra->ignore_objlist, drro->drr_object); 2681286705Smav err = 0; 2682286705Smav } 2683286705Smav return (err); 2684286587Smav } 2685286587Smav case DRR_FREEOBJECTS: 2686286587Smav { 2687286705Smav err = receive_read_payload_and_next_header(ra, 0, NULL); 2688286705Smav return (err); 2689286587Smav } 2690286587Smav case DRR_WRITE: 2691286587Smav { 2692286705Smav struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; 2693321535Smav arc_buf_t *abuf; 2694321535Smav boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); 2695321535Smav if (DRR_WRITE_COMPRESSED(drrw)) { 2696321535Smav ASSERT3U(drrw->drr_compressed_size, >, 0); 2697321535Smav ASSERT3U(drrw->drr_logical_size, >=, 2698321535Smav drrw->drr_compressed_size); 2699321535Smav ASSERT(!is_meta); 2700321535Smav abuf = arc_loan_compressed_buf( 2701321535Smav dmu_objset_spa(ra->os), 2702321535Smav drrw->drr_compressed_size, drrw->drr_logical_size, 2703321535Smav drrw->drr_compressiontype); 2704321535Smav } else { 2705321535Smav abuf = arc_loan_buf(dmu_objset_spa(ra->os), 2706321535Smav is_meta, drrw->drr_logical_size); 2707321535Smav } 2708286587Smav 2709286705Smav err = receive_read_payload_and_next_header(ra, 2710321535Smav DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); 2711286705Smav if (err != 0) { 2712286705Smav dmu_return_arcbuf(abuf); 2713286587Smav return (err); 2714286705Smav } 2715286705Smav ra->rrd->write_buf = abuf; 2716286705Smav receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, 2717321535Smav drrw->drr_logical_size); 2718286587Smav return (err); 2719286587Smav } 2720286587Smav case DRR_WRITE_BYREF: 2721286587Smav { 2722286705Smav struct drr_write_byref *drrwb = 2723286705Smav &ra->rrd->header.drr_u.drr_write_byref; 2724286705Smav err = receive_read_payload_and_next_header(ra, 0, NULL); 2725286705Smav receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, 2726286705Smav drrwb->drr_length); 2727286705Smav return (err); 2728286587Smav } 2729286587Smav case DRR_WRITE_EMBEDDED: 2730286587Smav { 2731286587Smav struct drr_write_embedded *drrwe = 2732286705Smav &ra->rrd->header.drr_u.drr_write_embedded; 2733286705Smav uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); 2734286705Smav void *buf = kmem_zalloc(size, KM_SLEEP); 2735286705Smav 2736286705Smav err = receive_read_payload_and_next_header(ra, size, buf); 2737286705Smav if (err != 0) { 2738286705Smav kmem_free(buf, size); 2739286587Smav return (err); 2740286705Smav } 2741286705Smav 2742286705Smav receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, 2743286705Smav drrwe->drr_length); 2744286705Smav return (err); 2745286587Smav } 2746286587Smav case DRR_FREE: 2747286587Smav { 2748286705Smav /* 2749286705Smav * It might be beneficial to prefetch indirect blocks here, but 2750286705Smav * we don't really have the data to decide for sure. 2751286705Smav */ 2752286705Smav err = receive_read_payload_and_next_header(ra, 0, NULL); 2753286705Smav return (err); 2754286587Smav } 2755286587Smav case DRR_END: 2756286587Smav { 2757286705Smav struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; 2758286587Smav if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) 2759289362Smav return (SET_ERROR(ECKSUM)); 2760286587Smav return (0); 2761286587Smav } 2762286587Smav case DRR_SPILL: 2763286587Smav { 2764286705Smav struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; 2765286705Smav void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); 2766286705Smav err = receive_read_payload_and_next_header(ra, drrs->drr_length, 2767286705Smav buf); 2768286587Smav if (err != 0) 2769286705Smav kmem_free(buf, drrs->drr_length); 2770286705Smav return (err); 2771286587Smav } 2772286587Smav default: 2773286587Smav return (SET_ERROR(EINVAL)); 2774286587Smav } 2775286587Smav} 2776286587Smav 2777286587Smav/* 2778286705Smav * Commit the records to the pool. 2779286705Smav */ 2780286705Smavstatic int 2781286705Smavreceive_process_record(struct receive_writer_arg *rwa, 2782286705Smav struct receive_record_arg *rrd) 2783286705Smav{ 2784286705Smav int err; 2785286705Smav 2786289362Smav /* Processing in order, therefore bytes_read should be increasing. */ 2787289362Smav ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); 2788289362Smav rwa->bytes_read = rrd->bytes_read; 2789289362Smav 2790286705Smav switch (rrd->header.drr_type) { 2791286705Smav case DRR_OBJECT: 2792286705Smav { 2793286705Smav struct drr_object *drro = &rrd->header.drr_u.drr_object; 2794286705Smav err = receive_object(rwa, drro, rrd->payload); 2795286705Smav kmem_free(rrd->payload, rrd->payload_size); 2796286705Smav rrd->payload = NULL; 2797286705Smav return (err); 2798286705Smav } 2799286705Smav case DRR_FREEOBJECTS: 2800286705Smav { 2801286705Smav struct drr_freeobjects *drrfo = 2802286705Smav &rrd->header.drr_u.drr_freeobjects; 2803286705Smav return (receive_freeobjects(rwa, drrfo)); 2804286705Smav } 2805286705Smav case DRR_WRITE: 2806286705Smav { 2807286705Smav struct drr_write *drrw = &rrd->header.drr_u.drr_write; 2808286705Smav err = receive_write(rwa, drrw, rrd->write_buf); 2809286705Smav /* if receive_write() is successful, it consumes the arc_buf */ 2810286705Smav if (err != 0) 2811286705Smav dmu_return_arcbuf(rrd->write_buf); 2812286705Smav rrd->write_buf = NULL; 2813286705Smav rrd->payload = NULL; 2814286705Smav return (err); 2815286705Smav } 2816286705Smav case DRR_WRITE_BYREF: 2817286705Smav { 2818286705Smav struct drr_write_byref *drrwbr = 2819286705Smav &rrd->header.drr_u.drr_write_byref; 2820286705Smav return (receive_write_byref(rwa, drrwbr)); 2821286705Smav } 2822286705Smav case DRR_WRITE_EMBEDDED: 2823286705Smav { 2824286705Smav struct drr_write_embedded *drrwe = 2825286705Smav &rrd->header.drr_u.drr_write_embedded; 2826286705Smav err = receive_write_embedded(rwa, drrwe, rrd->payload); 2827286705Smav kmem_free(rrd->payload, rrd->payload_size); 2828286705Smav rrd->payload = NULL; 2829286705Smav return (err); 2830286705Smav } 2831286705Smav case DRR_FREE: 2832286705Smav { 2833286705Smav struct drr_free *drrf = &rrd->header.drr_u.drr_free; 2834286705Smav return (receive_free(rwa, drrf)); 2835286705Smav } 2836286705Smav case DRR_SPILL: 2837286705Smav { 2838286705Smav struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; 2839286705Smav err = receive_spill(rwa, drrs, rrd->payload); 2840286705Smav kmem_free(rrd->payload, rrd->payload_size); 2841286705Smav rrd->payload = NULL; 2842286705Smav return (err); 2843286705Smav } 2844286705Smav default: 2845286705Smav return (SET_ERROR(EINVAL)); 2846286705Smav } 2847286705Smav} 2848286705Smav 2849286705Smav/* 2850286705Smav * dmu_recv_stream's worker thread; pull records off the queue, and then call 2851286705Smav * receive_process_record When we're done, signal the main thread and exit. 2852286705Smav */ 2853286705Smavstatic void 2854286705Smavreceive_writer_thread(void *arg) 2855286705Smav{ 2856286705Smav struct receive_writer_arg *rwa = arg; 2857286705Smav struct receive_record_arg *rrd; 2858286705Smav for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; 2859286705Smav rrd = bqueue_dequeue(&rwa->q)) { 2860286705Smav /* 2861286705Smav * If there's an error, the main thread will stop putting things 2862286705Smav * on the queue, but we need to clear everything in it before we 2863286705Smav * can exit. 2864286705Smav */ 2865286705Smav if (rwa->err == 0) { 2866286705Smav rwa->err = receive_process_record(rwa, rrd); 2867286705Smav } else if (rrd->write_buf != NULL) { 2868286705Smav dmu_return_arcbuf(rrd->write_buf); 2869286705Smav rrd->write_buf = NULL; 2870286705Smav rrd->payload = NULL; 2871286705Smav } else if (rrd->payload != NULL) { 2872286705Smav kmem_free(rrd->payload, rrd->payload_size); 2873286705Smav rrd->payload = NULL; 2874286705Smav } 2875286705Smav kmem_free(rrd, sizeof (*rrd)); 2876286705Smav } 2877286705Smav kmem_free(rrd, sizeof (*rrd)); 2878286705Smav mutex_enter(&rwa->mutex); 2879286705Smav rwa->done = B_TRUE; 2880286705Smav cv_signal(&rwa->cv); 2881286705Smav mutex_exit(&rwa->mutex); 2882286705Smav thread_exit(); 2883286705Smav} 2884286705Smav 2885289362Smavstatic int 2886289362Smavresume_check(struct receive_arg *ra, nvlist_t *begin_nvl) 2887289362Smav{ 2888289362Smav uint64_t val; 2889289362Smav objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; 2890289362Smav uint64_t dsobj = dmu_objset_id(ra->os); 2891289362Smav uint64_t resume_obj, resume_off; 2892289362Smav 2893289362Smav if (nvlist_lookup_uint64(begin_nvl, 2894289362Smav "resume_object", &resume_obj) != 0 || 2895289362Smav nvlist_lookup_uint64(begin_nvl, 2896289362Smav "resume_offset", &resume_off) != 0) { 2897289362Smav return (SET_ERROR(EINVAL)); 2898289362Smav } 2899289362Smav VERIFY0(zap_lookup(mos, dsobj, 2900289362Smav DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); 2901289362Smav if (resume_obj != val) 2902289362Smav return (SET_ERROR(EINVAL)); 2903289362Smav VERIFY0(zap_lookup(mos, dsobj, 2904289362Smav DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); 2905289362Smav if (resume_off != val) 2906289362Smav return (SET_ERROR(EINVAL)); 2907289362Smav 2908289362Smav return (0); 2909289362Smav} 2910289362Smav 2911286705Smav/* 2912286705Smav * Read in the stream's records, one by one, and apply them to the pool. There 2913286705Smav * are two threads involved; the thread that calls this function will spin up a 2914286705Smav * worker thread, read the records off the stream one by one, and issue 2915286705Smav * prefetches for any necessary indirect blocks. It will then push the records 2916286705Smav * onto an internal blocking queue. The worker thread will pull the records off 2917286705Smav * the queue, and actually write the data into the DMU. This way, the worker 2918286705Smav * thread doesn't have to wait for reads to complete, since everything it needs 2919286705Smav * (the indirect blocks) will be prefetched. 2920286705Smav * 2921185029Spjd * NB: callers *must* call dmu_recv_end() if this succeeds. 2922185029Spjd */ 2923168404Spjdint 2924219089Spjddmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, 2925219089Spjd int cleanup_fd, uint64_t *action_handlep) 2926168404Spjd{ 2927286587Smav int err = 0; 2928286705Smav struct receive_arg ra = { 0 }; 2929286705Smav struct receive_writer_arg rwa = { 0 }; 2930219089Spjd int featureflags; 2931289362Smav nvlist_t *begin_nvl = NULL; 2932168404Spjd 2933248571Smm ra.byteswap = drc->drc_byteswap; 2934248571Smm ra.cksum = drc->drc_cksum; 2935219089Spjd ra.td = curthread; 2936185029Spjd ra.fp = fp; 2937185029Spjd ra.voff = *voffp; 2938289362Smav 2939289362Smav if (dsl_dataset_is_zapified(drc->drc_ds)) { 2940289362Smav (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, 2941289362Smav drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, 2942289362Smav sizeof (ra.bytes_read), 1, &ra.bytes_read); 2943289362Smav } 2944289362Smav 2945294815Smav objlist_create(&ra.ignore_objlist); 2946168404Spjd 2947185029Spjd /* these were verified in dmu_recv_begin */ 2948248571Smm ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, 2949219089Spjd DMU_SUBSTREAM); 2950248571Smm ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); 2951168404Spjd 2952168404Spjd /* 2953168404Spjd * Open the objset we are modifying. 2954168404Spjd */ 2955286587Smav VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os)); 2956168404Spjd 2957275782Sdelphij ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); 2958168404Spjd 2959219089Spjd featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); 2960219089Spjd 2961219089Spjd /* if this stream is dedup'ed, set up the avl tree for guid mapping */ 2962219089Spjd if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { 2963219089Spjd minor_t minor; 2964219089Spjd 2965219089Spjd if (cleanup_fd == -1) { 2966249195Smm ra.err = SET_ERROR(EBADF); 2967219089Spjd goto out; 2968219089Spjd } 2969219089Spjd ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); 2970248571Smm if (ra.err != 0) { 2971219089Spjd cleanup_fd = -1; 2972219089Spjd goto out; 2973219089Spjd } 2974219089Spjd 2975219089Spjd if (*action_handlep == 0) { 2976286705Smav rwa.guid_to_ds_map = 2977219089Spjd kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 2978286705Smav avl_create(rwa.guid_to_ds_map, guid_compare, 2979219089Spjd sizeof (guid_map_entry_t), 2980219089Spjd offsetof(guid_map_entry_t, avlnode)); 2981286587Smav err = zfs_onexit_add_cb(minor, 2982286705Smav free_guid_map_onexit, rwa.guid_to_ds_map, 2983219089Spjd action_handlep); 2984248571Smm if (ra.err != 0) 2985219089Spjd goto out; 2986219089Spjd } else { 2987286587Smav err = zfs_onexit_cb_data(minor, *action_handlep, 2988286705Smav (void **)&rwa.guid_to_ds_map); 2989248571Smm if (ra.err != 0) 2990219089Spjd goto out; 2991219089Spjd } 2992221263Smm 2993286705Smav drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; 2994219089Spjd } 2995219089Spjd 2996289362Smav uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; 2997289362Smav void *payload = NULL; 2998289362Smav if (payloadlen != 0) 2999289362Smav payload = kmem_alloc(payloadlen, KM_SLEEP); 3000289362Smav 3001289362Smav err = receive_read_payload_and_next_header(&ra, payloadlen, payload); 3002289362Smav if (err != 0) { 3003289362Smav if (payloadlen != 0) 3004289362Smav kmem_free(payload, payloadlen); 3005286587Smav goto out; 3006289362Smav } 3007289362Smav if (payloadlen != 0) { 3008289362Smav err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); 3009289362Smav kmem_free(payload, payloadlen); 3010289362Smav if (err != 0) 3011289362Smav goto out; 3012289362Smav } 3013286587Smav 3014289362Smav if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { 3015289362Smav err = resume_check(&ra, begin_nvl); 3016289362Smav if (err != 0) 3017289362Smav goto out; 3018289362Smav } 3019289362Smav 3020286705Smav (void) bqueue_init(&rwa.q, zfs_recv_queue_length, 3021286705Smav offsetof(struct receive_record_arg, node)); 3022286705Smav cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL); 3023286705Smav mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); 3024286705Smav rwa.os = ra.os; 3025286705Smav rwa.byteswap = drc->drc_byteswap; 3026289362Smav rwa.resumable = drc->drc_resumable; 3027286705Smav 3028287280Sdelphij (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0, 3029286705Smav TS_RUN, minclsyspri); 3030286705Smav /* 3031286705Smav * We're reading rwa.err without locks, which is safe since we are the 3032286705Smav * only reader, and the worker thread is the only writer. It's ok if we 3033286705Smav * miss a write for an iteration or two of the loop, since the writer 3034286705Smav * thread will keep freeing records we send it until we send it an eos 3035286705Smav * marker. 3036286705Smav * 3037286705Smav * We can leave this loop in 3 ways: First, if rwa.err is 3038286705Smav * non-zero. In that case, the writer thread will free the rrd we just 3039286705Smav * pushed. Second, if we're interrupted; in that case, either it's the 3040286705Smav * first loop and ra.rrd was never allocated, or it's later, and ra.rrd 3041286705Smav * has been handed off to the writer thread who will free it. Finally, 3042286705Smav * if receive_read_record fails or we're at the end of the stream, then 3043286705Smav * we free ra.rrd and exit. 3044286705Smav */ 3045286705Smav while (rwa.err == 0) { 3046185029Spjd if (issig(JUSTLOOKING) && issig(FORREAL)) { 3047286587Smav err = SET_ERROR(EINTR); 3048286587Smav break; 3049168404Spjd } 3050168404Spjd 3051286705Smav ASSERT3P(ra.rrd, ==, NULL); 3052286705Smav ra.rrd = ra.next_rrd; 3053286705Smav ra.next_rrd = NULL; 3054286705Smav /* Allocates and loads header into ra.next_rrd */ 3055286705Smav err = receive_read_record(&ra); 3056168404Spjd 3057286705Smav if (ra.rrd->header.drr_type == DRR_END || err != 0) { 3058286705Smav kmem_free(ra.rrd, sizeof (*ra.rrd)); 3059286705Smav ra.rrd = NULL; 3060168404Spjd break; 3061286705Smav } 3062286705Smav 3063286705Smav bqueue_enqueue(&rwa.q, ra.rrd, 3064286705Smav sizeof (struct receive_record_arg) + ra.rrd->payload_size); 3065286705Smav ra.rrd = NULL; 3066168404Spjd } 3067286705Smav if (ra.next_rrd == NULL) 3068286705Smav ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); 3069286705Smav ra.next_rrd->eos_marker = B_TRUE; 3070286705Smav bqueue_enqueue(&rwa.q, ra.next_rrd, 1); 3071168404Spjd 3072286705Smav mutex_enter(&rwa.mutex); 3073286705Smav while (!rwa.done) { 3074286705Smav cv_wait(&rwa.cv, &rwa.mutex); 3075286705Smav } 3076286705Smav mutex_exit(&rwa.mutex); 3077286705Smav 3078286705Smav cv_destroy(&rwa.cv); 3079286705Smav mutex_destroy(&rwa.mutex); 3080286705Smav bqueue_destroy(&rwa.q); 3081286705Smav if (err == 0) 3082286705Smav err = rwa.err; 3083286705Smav 3084168404Spjdout: 3085289362Smav nvlist_free(begin_nvl); 3086219089Spjd if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) 3087219089Spjd zfs_onexit_fd_rele(cleanup_fd); 3088168404Spjd 3089286587Smav if (err != 0) { 3090168404Spjd /* 3091289362Smav * Clean up references. If receive is not resumable, 3092289362Smav * destroy what we created, so we don't leave it in 3093289362Smav * the inconsistent state. 3094168404Spjd */ 3095248571Smm dmu_recv_cleanup_ds(drc); 3096168404Spjd } 3097168404Spjd 3098185029Spjd *voffp = ra.voff; 3099294815Smav objlist_destroy(&ra.ignore_objlist); 3100286587Smav return (err); 3101168404Spjd} 3102185029Spjd 3103185029Spjdstatic int 3104248571Smmdmu_recv_end_check(void *arg, dmu_tx_t *tx) 3105185029Spjd{ 3106248571Smm dmu_recv_cookie_t *drc = arg; 3107248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 3108248571Smm int error; 3109185029Spjd 3110248571Smm ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); 3111248571Smm 3112248571Smm if (!drc->drc_newfs) { 3113248571Smm dsl_dataset_t *origin_head; 3114248571Smm 3115248571Smm error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); 3116248571Smm if (error != 0) 3117248571Smm return (error); 3118253820Sdelphij if (drc->drc_force) { 3119253820Sdelphij /* 3120253820Sdelphij * We will destroy any snapshots in tofs (i.e. before 3121253820Sdelphij * origin_head) that are after the origin (which is 3122253820Sdelphij * the snap before drc_ds, because drc_ds can not 3123253820Sdelphij * have any snaps of its own). 3124253820Sdelphij */ 3125275782Sdelphij uint64_t obj; 3126275782Sdelphij 3127275782Sdelphij obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; 3128275782Sdelphij while (obj != 3129275782Sdelphij dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { 3130253820Sdelphij dsl_dataset_t *snap; 3131253820Sdelphij error = dsl_dataset_hold_obj(dp, obj, FTAG, 3132253820Sdelphij &snap); 3133253820Sdelphij if (error != 0) 3134282473Savg break; 3135253820Sdelphij if (snap->ds_dir != origin_head->ds_dir) 3136253820Sdelphij error = SET_ERROR(EINVAL); 3137253820Sdelphij if (error == 0) { 3138253820Sdelphij error = dsl_destroy_snapshot_check_impl( 3139253820Sdelphij snap, B_FALSE); 3140253820Sdelphij } 3141275782Sdelphij obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; 3142253820Sdelphij dsl_dataset_rele(snap, FTAG); 3143253820Sdelphij if (error != 0) 3144282473Savg break; 3145253820Sdelphij } 3146282473Savg if (error != 0) { 3147282473Savg dsl_dataset_rele(origin_head, FTAG); 3148282473Savg return (error); 3149282473Savg } 3150253820Sdelphij } 3151248571Smm error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, 3152253816Sdelphij origin_head, drc->drc_force, drc->drc_owner, tx); 3153248571Smm if (error != 0) { 3154248571Smm dsl_dataset_rele(origin_head, FTAG); 3155248571Smm return (error); 3156248571Smm } 3157248571Smm error = dsl_dataset_snapshot_check_impl(origin_head, 3158264835Sdelphij drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 3159248571Smm dsl_dataset_rele(origin_head, FTAG); 3160248571Smm if (error != 0) 3161248571Smm return (error); 3162248571Smm 3163248571Smm error = dsl_destroy_head_check_impl(drc->drc_ds, 1); 3164248571Smm } else { 3165248571Smm error = dsl_dataset_snapshot_check_impl(drc->drc_ds, 3166264835Sdelphij drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 3167248571Smm } 3168248571Smm return (error); 3169185029Spjd} 3170185029Spjd 3171185029Spjdstatic void 3172248571Smmdmu_recv_end_sync(void *arg, dmu_tx_t *tx) 3173185029Spjd{ 3174248571Smm dmu_recv_cookie_t *drc = arg; 3175248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 3176185029Spjd 3177248571Smm spa_history_log_internal_ds(drc->drc_ds, "finish receiving", 3178248571Smm tx, "snap=%s", drc->drc_tosnap); 3179185029Spjd 3180248571Smm if (!drc->drc_newfs) { 3181248571Smm dsl_dataset_t *origin_head; 3182185029Spjd 3183248571Smm VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, 3184248571Smm &origin_head)); 3185253820Sdelphij 3186253820Sdelphij if (drc->drc_force) { 3187253820Sdelphij /* 3188253820Sdelphij * Destroy any snapshots of drc_tofs (origin_head) 3189253820Sdelphij * after the origin (the snap before drc_ds). 3190253820Sdelphij */ 3191275782Sdelphij uint64_t obj; 3192275782Sdelphij 3193275782Sdelphij obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; 3194275782Sdelphij while (obj != 3195275782Sdelphij dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { 3196253820Sdelphij dsl_dataset_t *snap; 3197253820Sdelphij VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, 3198253820Sdelphij &snap)); 3199253820Sdelphij ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); 3200275782Sdelphij obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; 3201253820Sdelphij dsl_destroy_snapshot_sync_impl(snap, 3202253820Sdelphij B_FALSE, tx); 3203253820Sdelphij dsl_dataset_rele(snap, FTAG); 3204253820Sdelphij } 3205253820Sdelphij } 3206253820Sdelphij VERIFY3P(drc->drc_ds->ds_prev, ==, 3207253820Sdelphij origin_head->ds_prev); 3208253820Sdelphij 3209248571Smm dsl_dataset_clone_swap_sync_impl(drc->drc_ds, 3210248571Smm origin_head, tx); 3211248571Smm dsl_dataset_snapshot_sync_impl(origin_head, 3212248571Smm drc->drc_tosnap, tx); 3213248571Smm 3214248571Smm /* set snapshot's creation time and guid */ 3215248571Smm dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); 3216275782Sdelphij dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = 3217248571Smm drc->drc_drrb->drr_creation_time; 3218275782Sdelphij dsl_dataset_phys(origin_head->ds_prev)->ds_guid = 3219248571Smm drc->drc_drrb->drr_toguid; 3220275782Sdelphij dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= 3221248571Smm ~DS_FLAG_INCONSISTENT; 3222248571Smm 3223248571Smm dmu_buf_will_dirty(origin_head->ds_dbuf, tx); 3224275782Sdelphij dsl_dataset_phys(origin_head)->ds_flags &= 3225275782Sdelphij ~DS_FLAG_INCONSISTENT; 3226248571Smm 3227307288Smav drc->drc_newsnapobj = 3228307288Smav dsl_dataset_phys(origin_head)->ds_prev_snap_obj; 3229307288Smav 3230248571Smm dsl_dataset_rele(origin_head, FTAG); 3231248571Smm dsl_destroy_head_sync_impl(drc->drc_ds, tx); 3232253816Sdelphij 3233253816Sdelphij if (drc->drc_owner != NULL) 3234253816Sdelphij VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); 3235248571Smm } else { 3236248571Smm dsl_dataset_t *ds = drc->drc_ds; 3237248571Smm 3238248571Smm dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); 3239248571Smm 3240248571Smm /* set snapshot's creation time and guid */ 3241248571Smm dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 3242275782Sdelphij dsl_dataset_phys(ds->ds_prev)->ds_creation_time = 3243248571Smm drc->drc_drrb->drr_creation_time; 3244275782Sdelphij dsl_dataset_phys(ds->ds_prev)->ds_guid = 3245275782Sdelphij drc->drc_drrb->drr_toguid; 3246275782Sdelphij dsl_dataset_phys(ds->ds_prev)->ds_flags &= 3247275782Sdelphij ~DS_FLAG_INCONSISTENT; 3248248571Smm 3249248571Smm dmu_buf_will_dirty(ds->ds_dbuf, tx); 3250275782Sdelphij dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; 3251289362Smav if (dsl_dataset_has_resume_receive_state(ds)) { 3252289362Smav (void) zap_remove(dp->dp_meta_objset, ds->ds_object, 3253289362Smav DS_FIELD_RESUME_FROMGUID, tx); 3254289362Smav (void) zap_remove(dp->dp_meta_objset, ds->ds_object, 3255289362Smav DS_FIELD_RESUME_OBJECT, tx); 3256289362Smav (void) zap_remove(dp->dp_meta_objset, ds->ds_object, 3257289362Smav DS_FIELD_RESUME_OFFSET, tx); 3258289362Smav (void) zap_remove(dp->dp_meta_objset, ds->ds_object, 3259289362Smav DS_FIELD_RESUME_BYTES, tx); 3260289362Smav (void) zap_remove(dp->dp_meta_objset, ds->ds_object, 3261289362Smav DS_FIELD_RESUME_TOGUID, tx); 3262289362Smav (void) zap_remove(dp->dp_meta_objset, ds->ds_object, 3263289362Smav DS_FIELD_RESUME_TONAME, tx); 3264289362Smav } 3265307288Smav drc->drc_newsnapobj = 3266307288Smav dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; 3267248571Smm } 3268248571Smm /* 3269248571Smm * Release the hold from dmu_recv_begin. This must be done before 3270248571Smm * we return to open context, so that when we free the dataset's dnode, 3271248571Smm * we can evict its bonus buffer. 3272248571Smm */ 3273248571Smm dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 3274248571Smm drc->drc_ds = NULL; 3275185029Spjd} 3276185029Spjd 3277219089Spjdstatic int 3278248571Smmadd_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) 3279221263Smm{ 3280248571Smm dsl_pool_t *dp; 3281221263Smm dsl_dataset_t *snapds; 3282221263Smm guid_map_entry_t *gmep; 3283221263Smm int err; 3284221263Smm 3285221263Smm ASSERT(guid_map != NULL); 3286221263Smm 3287248571Smm err = dsl_pool_hold(name, FTAG, &dp); 3288248571Smm if (err != 0) 3289248571Smm return (err); 3290249356Smm gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); 3291249196Smm err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); 3292221263Smm if (err == 0) { 3293275782Sdelphij gmep->guid = dsl_dataset_phys(snapds)->ds_guid; 3294221263Smm gmep->gme_ds = snapds; 3295221263Smm avl_add(guid_map, gmep); 3296248571Smm dsl_dataset_long_hold(snapds, gmep); 3297249196Smm } else 3298249356Smm kmem_free(gmep, sizeof (*gmep)); 3299221263Smm 3300248571Smm dsl_pool_rele(dp, FTAG); 3301221263Smm return (err); 3302221263Smm} 3303221263Smm 3304248571Smmstatic int dmu_recv_end_modified_blocks = 3; 3305248571Smm 3306221263Smmstatic int 3307219089Spjddmu_recv_existing_end(dmu_recv_cookie_t *drc) 3308185029Spjd{ 3309248571Smm#ifdef _KERNEL 3310248571Smm /* 3311248571Smm * We will be destroying the ds; make sure its origin is unmounted if 3312248571Smm * necessary. 3313248571Smm */ 3314307108Smav char name[ZFS_MAX_DATASET_NAME_LEN]; 3315248571Smm dsl_dataset_name(drc->drc_ds, name); 3316248571Smm zfs_destroy_unmount_origin(name); 3317248571Smm#endif 3318185029Spjd 3319307288Smav return (dsl_sync_task(drc->drc_tofs, 3320248571Smm dmu_recv_end_check, dmu_recv_end_sync, drc, 3321307288Smav dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); 3322185029Spjd} 3323219089Spjd 3324219089Spjdstatic int 3325219089Spjddmu_recv_new_end(dmu_recv_cookie_t *drc) 3326219089Spjd{ 3327307288Smav return (dsl_sync_task(drc->drc_tofs, 3328307288Smav dmu_recv_end_check, dmu_recv_end_sync, drc, 3329307288Smav dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); 3330307288Smav} 3331307288Smav 3332307288Smavint 3333307288Smavdmu_recv_end(dmu_recv_cookie_t *drc, void *owner) 3334307288Smav{ 3335248571Smm int error; 3336219089Spjd 3337307288Smav drc->drc_owner = owner; 3338219089Spjd 3339307288Smav if (drc->drc_newfs) 3340307288Smav error = dmu_recv_new_end(drc); 3341307288Smav else 3342307288Smav error = dmu_recv_existing_end(drc); 3343307288Smav 3344248571Smm if (error != 0) { 3345248571Smm dmu_recv_cleanup_ds(drc); 3346248571Smm } else if (drc->drc_guid_to_ds_map != NULL) { 3347248571Smm (void) add_ds_to_guidmap(drc->drc_tofs, 3348248571Smm drc->drc_guid_to_ds_map, 3349248571Smm drc->drc_newsnapobj); 3350219089Spjd } 3351248571Smm return (error); 3352219089Spjd} 3353219089Spjd 3354253821Sdelphij/* 3355253821Sdelphij * Return TRUE if this objset is currently being received into. 3356253821Sdelphij */ 3357253821Sdelphijboolean_t 3358253821Sdelphijdmu_objset_is_receiving(objset_t *os) 3359253821Sdelphij{ 3360253821Sdelphij return (os->os_dsl_dataset != NULL && 3361253821Sdelphij os->os_dsl_dataset->ds_owner == dmu_recv_tag); 3362253821Sdelphij} 3363