/* dmu_send.c revision 236884 */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2012 by Delphix. All rights reserved. 25 * Copyright (c) 2012, Joyent, Inc. All rights reserved. 26 * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. 
27 */ 28 29#include <sys/dmu.h> 30#include <sys/dmu_impl.h> 31#include <sys/dmu_tx.h> 32#include <sys/dbuf.h> 33#include <sys/dnode.h> 34#include <sys/zfs_context.h> 35#include <sys/dmu_objset.h> 36#include <sys/dmu_traverse.h> 37#include <sys/dsl_dataset.h> 38#include <sys/dsl_dir.h> 39#include <sys/dsl_prop.h> 40#include <sys/dsl_pool.h> 41#include <sys/dsl_synctask.h> 42#include <sys/zfs_ioctl.h> 43#include <sys/zap.h> 44#include <sys/zio_checksum.h> 45#include <sys/zfs_znode.h> 46#include <zfs_fletcher.h> 47#include <sys/avl.h> 48#include <sys/ddt.h> 49#include <sys/zfs_onexit.h> 50 51/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 52int zfs_send_corrupt_data = B_FALSE; 53 54static char *dmu_recv_tag = "dmu_recv_tag"; 55 56static int 57dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 58{ 59 dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; 60 struct uio auio; 61 struct iovec aiov; 62 ASSERT3U(len % 8, ==, 0); 63 64 fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); 65 aiov.iov_base = buf; 66 aiov.iov_len = len; 67 auio.uio_iov = &aiov; 68 auio.uio_iovcnt = 1; 69 auio.uio_resid = len; 70 auio.uio_segflg = UIO_SYSSPACE; 71 auio.uio_rw = UIO_WRITE; 72 auio.uio_offset = (off_t)-1; 73 auio.uio_td = dsp->dsa_td; 74#ifdef _KERNEL 75 if (dsp->dsa_fp->f_type == DTYPE_VNODE) 76 bwillwrite(); 77 dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, 78 dsp->dsa_td); 79#else 80 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 81 dsp->dsa_err = EOPNOTSUPP; 82#endif 83 mutex_enter(&ds->ds_sendstream_lock); 84 *dsp->dsa_off += len; 85 mutex_exit(&ds->ds_sendstream_lock); 86 87 return (dsp->dsa_err); 88} 89 90static int 91dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 92 uint64_t length) 93{ 94 struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); 95 96 /* 97 * If there is a pending op, but it's not PENDING_FREE, push it out, 98 * since free block aggregation can only be done for blocks of the 99 * same 
type (i.e., DRR_FREE records can only be aggregated with 100 * other DRR_FREE records. DRR_FREEOBJECTS records can only be 101 * aggregated with other DRR_FREEOBJECTS records. 102 */ 103 if (dsp->dsa_pending_op != PENDING_NONE && 104 dsp->dsa_pending_op != PENDING_FREE) { 105 if (dump_bytes(dsp, dsp->dsa_drr, 106 sizeof (dmu_replay_record_t)) != 0) 107 return (EINTR); 108 dsp->dsa_pending_op = PENDING_NONE; 109 } 110 111 if (dsp->dsa_pending_op == PENDING_FREE) { 112 /* 113 * There should never be a PENDING_FREE if length is -1 114 * (because dump_dnode is the only place where this 115 * function is called with a -1, and only after flushing 116 * any pending record). 117 */ 118 ASSERT(length != -1ULL); 119 /* 120 * Check to see whether this free block can be aggregated 121 * with pending one. 122 */ 123 if (drrf->drr_object == object && drrf->drr_offset + 124 drrf->drr_length == offset) { 125 drrf->drr_length += length; 126 return (0); 127 } else { 128 /* not a continuation. Push out pending record */ 129 if (dump_bytes(dsp, dsp->dsa_drr, 130 sizeof (dmu_replay_record_t)) != 0) 131 return (EINTR); 132 dsp->dsa_pending_op = PENDING_NONE; 133 } 134 } 135 /* create a FREE record and make it pending */ 136 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 137 dsp->dsa_drr->drr_type = DRR_FREE; 138 drrf->drr_object = object; 139 drrf->drr_offset = offset; 140 drrf->drr_length = length; 141 drrf->drr_toguid = dsp->dsa_toguid; 142 if (length == -1ULL) { 143 if (dump_bytes(dsp, dsp->dsa_drr, 144 sizeof (dmu_replay_record_t)) != 0) 145 return (EINTR); 146 } else { 147 dsp->dsa_pending_op = PENDING_FREE; 148 } 149 150 return (0); 151} 152 153static int 154dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, 155 uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) 156{ 157 struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); 158 159 160 /* 161 * If there is any kind of pending aggregation (currently either 162 * a grouping of free objects or 
free blocks), push it out to 163 * the stream, since aggregation can't be done across operations 164 * of different types. 165 */ 166 if (dsp->dsa_pending_op != PENDING_NONE) { 167 if (dump_bytes(dsp, dsp->dsa_drr, 168 sizeof (dmu_replay_record_t)) != 0) 169 return (EINTR); 170 dsp->dsa_pending_op = PENDING_NONE; 171 } 172 /* write a DATA record */ 173 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 174 dsp->dsa_drr->drr_type = DRR_WRITE; 175 drrw->drr_object = object; 176 drrw->drr_type = type; 177 drrw->drr_offset = offset; 178 drrw->drr_length = blksz; 179 drrw->drr_toguid = dsp->dsa_toguid; 180 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 181 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 182 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 183 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 184 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 185 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 186 drrw->drr_key.ddk_cksum = bp->blk_cksum; 187 188 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 189 return (EINTR); 190 if (dump_bytes(dsp, data, blksz) != 0) 191 return (EINTR); 192 return (0); 193} 194 195static int 196dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) 197{ 198 struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); 199 200 if (dsp->dsa_pending_op != PENDING_NONE) { 201 if (dump_bytes(dsp, dsp->dsa_drr, 202 sizeof (dmu_replay_record_t)) != 0) 203 return (EINTR); 204 dsp->dsa_pending_op = PENDING_NONE; 205 } 206 207 /* write a SPILL record */ 208 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 209 dsp->dsa_drr->drr_type = DRR_SPILL; 210 drrs->drr_object = object; 211 drrs->drr_length = blksz; 212 drrs->drr_toguid = dsp->dsa_toguid; 213 214 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) 215 return (EINTR); 216 if (dump_bytes(dsp, data, blksz)) 217 return (EINTR); 218 return (0); 219} 220 221static int 222dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, 
uint64_t numobjs) 223{ 224 struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); 225 226 /* 227 * If there is a pending op, but it's not PENDING_FREEOBJECTS, 228 * push it out, since free block aggregation can only be done for 229 * blocks of the same type (i.e., DRR_FREE records can only be 230 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 231 * can only be aggregated with other DRR_FREEOBJECTS records. 232 */ 233 if (dsp->dsa_pending_op != PENDING_NONE && 234 dsp->dsa_pending_op != PENDING_FREEOBJECTS) { 235 if (dump_bytes(dsp, dsp->dsa_drr, 236 sizeof (dmu_replay_record_t)) != 0) 237 return (EINTR); 238 dsp->dsa_pending_op = PENDING_NONE; 239 } 240 if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { 241 /* 242 * See whether this free object array can be aggregated 243 * with pending one 244 */ 245 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 246 drrfo->drr_numobjs += numobjs; 247 return (0); 248 } else { 249 /* can't be aggregated. Push out pending record */ 250 if (dump_bytes(dsp, dsp->dsa_drr, 251 sizeof (dmu_replay_record_t)) != 0) 252 return (EINTR); 253 dsp->dsa_pending_op = PENDING_NONE; 254 } 255 } 256 257 /* write a FREEOBJECTS record */ 258 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 259 dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 260 drrfo->drr_firstobj = firstobj; 261 drrfo->drr_numobjs = numobjs; 262 drrfo->drr_toguid = dsp->dsa_toguid; 263 264 dsp->dsa_pending_op = PENDING_FREEOBJECTS; 265 266 return (0); 267} 268 269static int 270dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 271{ 272 struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 273 274 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 275 return (dump_freeobjects(dsp, object, 1)); 276 277 if (dsp->dsa_pending_op != PENDING_NONE) { 278 if (dump_bytes(dsp, dsp->dsa_drr, 279 sizeof (dmu_replay_record_t)) != 0) 280 return (EINTR); 281 dsp->dsa_pending_op = PENDING_NONE; 282 } 283 284 /* write an OBJECT record */ 
285 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 286 dsp->dsa_drr->drr_type = DRR_OBJECT; 287 drro->drr_object = object; 288 drro->drr_type = dnp->dn_type; 289 drro->drr_bonustype = dnp->dn_bonustype; 290 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 291 drro->drr_bonuslen = dnp->dn_bonuslen; 292 drro->drr_checksumtype = dnp->dn_checksum; 293 drro->drr_compress = dnp->dn_compress; 294 drro->drr_toguid = dsp->dsa_toguid; 295 296 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 297 return (EINTR); 298 299 if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) 300 return (EINTR); 301 302 /* free anything past the end of the file */ 303 if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 304 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) 305 return (EINTR); 306 if (dsp->dsa_err) 307 return (EINTR); 308 return (0); 309} 310 311#define BP_SPAN(dnp, level) \ 312 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 313 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 314 315/* ARGSUSED */ 316static int 317backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, 318 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 319{ 320 dmu_sendarg_t *dsp = arg; 321 dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; 322 int err = 0; 323 324 if (issig(JUSTLOOKING) && issig(FORREAL)) 325 return (EINTR); 326 327 if (zb->zb_object != DMU_META_DNODE_OBJECT && 328 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 329 return (0); 330 } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { 331 uint64_t span = BP_SPAN(dnp, zb->zb_level); 332 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 333 err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); 334 } else if (bp == NULL) { 335 uint64_t span = BP_SPAN(dnp, zb->zb_level); 336 err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); 337 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 338 return (0); 339 } else if (type == DMU_OT_DNODE) { 340 dnode_phys_t *blk; 341 int i; 342 int blksz = BP_GET_LSIZE(bp); 343 uint32_t aflags = ARC_WAIT; 344 arc_buf_t *abuf; 345 346 if (dsl_read(NULL, spa, bp, pbuf, 347 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 348 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) 349 return (EIO); 350 351 blk = abuf->b_data; 352 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 353 uint64_t dnobj = (zb->zb_blkid << 354 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 355 err = dump_dnode(dsp, dnobj, blk+i); 356 if (err) 357 break; 358 } 359 (void) arc_buf_remove_ref(abuf, &abuf); 360 } else if (type == DMU_OT_SA) { 361 uint32_t aflags = ARC_WAIT; 362 arc_buf_t *abuf; 363 int blksz = BP_GET_LSIZE(bp); 364 365 if (arc_read_nolock(NULL, spa, bp, 366 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 367 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) 368 return (EIO); 369 370 err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); 371 (void) arc_buf_remove_ref(abuf, &abuf); 372 } else { /* it's a level-0 block of a regular object */ 373 uint32_t aflags = ARC_WAIT; 374 arc_buf_t *abuf; 375 int blksz = BP_GET_LSIZE(bp); 376 377 if (dsl_read(NULL, spa, bp, pbuf, 378 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 379 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { 380 if (zfs_send_corrupt_data) { 381 /* Send 
a block filled with 0x"zfs badd bloc" */ 382 abuf = arc_buf_alloc(spa, blksz, &abuf, 383 ARC_BUFC_DATA); 384 uint64_t *ptr; 385 for (ptr = abuf->b_data; 386 (char *)ptr < (char *)abuf->b_data + blksz; 387 ptr++) 388 *ptr = 0x2f5baddb10c; 389 } else { 390 return (EIO); 391 } 392 } 393 394 err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz, 395 blksz, bp, abuf->b_data); 396 (void) arc_buf_remove_ref(abuf, &abuf); 397 } 398 399 ASSERT(err == 0 || err == EINTR); 400 return (err); 401} 402 403int 404dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, 405 int outfd, struct file *fp, offset_t *off) 406{ 407 dsl_dataset_t *ds = tosnap->os_dsl_dataset; 408 dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; 409 dmu_replay_record_t *drr; 410 dmu_sendarg_t *dsp; 411 int err; 412 uint64_t fromtxg = 0; 413 414 /* tosnap must be a snapshot */ 415 if (ds->ds_phys->ds_next_snap_obj == 0) 416 return (EINVAL); 417 418 /* fromsnap must be an earlier snapshot from the same fs as tosnap */ 419 if (fromds && (ds->ds_dir != fromds->ds_dir || 420 fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) 421 return (EXDEV); 422 423 if (fromorigin) { 424 dsl_pool_t *dp = ds->ds_dir->dd_pool; 425 426 if (fromsnap) 427 return (EINVAL); 428 429 if (dsl_dir_is_clone(ds->ds_dir)) { 430 rw_enter(&dp->dp_config_rwlock, RW_READER); 431 err = dsl_dataset_hold_obj(dp, 432 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); 433 rw_exit(&dp->dp_config_rwlock); 434 if (err) 435 return (err); 436 } else { 437 fromorigin = B_FALSE; 438 } 439 } 440 441 442 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 443 drr->drr_type = DRR_BEGIN; 444 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 445 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 446 DMU_SUBSTREAM); 447 448#ifdef _KERNEL 449 if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { 450 uint64_t version; 451 if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) { 452 
kmem_free(drr, sizeof (dmu_replay_record_t)); 453 return (EINVAL); 454 } 455 if (version == ZPL_VERSION_SA) { 456 DMU_SET_FEATUREFLAGS( 457 drr->drr_u.drr_begin.drr_versioninfo, 458 DMU_BACKUP_FEATURE_SA_SPILL); 459 } 460 } 461#endif 462 463 drr->drr_u.drr_begin.drr_creation_time = 464 ds->ds_phys->ds_creation_time; 465 drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; 466 if (fromorigin) 467 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 468 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; 469 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 470 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 471 472 if (fromds) 473 drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; 474 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 475 476 if (fromds) 477 fromtxg = fromds->ds_phys->ds_creation_txg; 478 if (fromorigin) 479 dsl_dataset_rele(fromds, FTAG); 480 481 dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 482 483 dsp->dsa_drr = drr; 484 dsp->dsa_outfd = outfd; 485 dsp->dsa_proc = curproc; 486 dsp->dsa_td = curthread; 487 dsp->dsa_fp = fp; 488 dsp->dsa_os = tosnap; 489 dsp->dsa_off = off; 490 dsp->dsa_toguid = ds->ds_phys->ds_guid; 491 ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); 492 dsp->dsa_pending_op = PENDING_NONE; 493 494 mutex_enter(&ds->ds_sendstream_lock); 495 list_insert_head(&ds->ds_sendstreams, dsp); 496 mutex_exit(&ds->ds_sendstream_lock); 497 498 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 499 err = dsp->dsa_err; 500 goto out; 501 } 502 503 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 504 backup_cb, dsp); 505 506 if (dsp->dsa_pending_op != PENDING_NONE) 507 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) 508 err = EINTR; 509 510 if (err) { 511 if (err == EINTR && dsp->dsa_err) 512 err = dsp->dsa_err; 513 goto out; 514 } 515 516 bzero(drr, sizeof (dmu_replay_record_t)); 517 drr->drr_type = DRR_END; 518 drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 519 
drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 520 521 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 522 err = dsp->dsa_err; 523 goto out; 524 } 525 526out: 527 mutex_enter(&ds->ds_sendstream_lock); 528 list_remove(&ds->ds_sendstreams, dsp); 529 mutex_exit(&ds->ds_sendstream_lock); 530 531 kmem_free(drr, sizeof (dmu_replay_record_t)); 532 kmem_free(dsp, sizeof (dmu_sendarg_t)); 533 534 return (err); 535} 536 537int 538dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, 539 uint64_t *sizep) 540{ 541 dsl_dataset_t *ds = tosnap->os_dsl_dataset; 542 dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; 543 dsl_pool_t *dp = ds->ds_dir->dd_pool; 544 int err; 545 uint64_t size; 546 547 /* tosnap must be a snapshot */ 548 if (ds->ds_phys->ds_next_snap_obj == 0) 549 return (EINVAL); 550 551 /* fromsnap must be an earlier snapshot from the same fs as tosnap */ 552 if (fromds && (ds->ds_dir != fromds->ds_dir || 553 fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) 554 return (EXDEV); 555 556 if (fromorigin) { 557 if (fromsnap) 558 return (EINVAL); 559 560 if (dsl_dir_is_clone(ds->ds_dir)) { 561 rw_enter(&dp->dp_config_rwlock, RW_READER); 562 err = dsl_dataset_hold_obj(dp, 563 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); 564 rw_exit(&dp->dp_config_rwlock); 565 if (err) 566 return (err); 567 } else { 568 fromorigin = B_FALSE; 569 } 570 } 571 572 /* Get uncompressed size estimate of changed data. */ 573 if (fromds == NULL) { 574 size = ds->ds_phys->ds_uncompressed_bytes; 575 } else { 576 uint64_t used, comp; 577 err = dsl_dataset_space_written(fromds, ds, 578 &used, &comp, &size); 579 if (fromorigin) 580 dsl_dataset_rele(fromds, FTAG); 581 if (err) 582 return (err); 583 } 584 585 /* 586 * Assume that space (both on-disk and in-stream) is dominated by 587 * data. We will adjust for indirect blocks and the copies property, 588 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 
589 */ 590 591 /* 592 * Subtract out approximate space used by indirect blocks. 593 * Assume most space is used by data blocks (non-indirect, non-dnode). 594 * Assume all blocks are recordsize. Assume ditto blocks and 595 * internal fragmentation counter out compression. 596 * 597 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 598 * block, which we observe in practice. 599 */ 600 uint64_t recordsize; 601 rw_enter(&dp->dp_config_rwlock, RW_READER); 602 err = dsl_prop_get_ds(ds, "recordsize", 603 sizeof (recordsize), 1, &recordsize, NULL); 604 rw_exit(&dp->dp_config_rwlock); 605 if (err) 606 return (err); 607 size -= size / recordsize * sizeof (blkptr_t); 608 609 /* Add in the space for the record associated with each block. */ 610 size += size / recordsize * sizeof (dmu_replay_record_t); 611 612 *sizep = size; 613 614 return (0); 615} 616 617struct recvbeginsyncarg { 618 const char *tofs; 619 const char *tosnap; 620 dsl_dataset_t *origin; 621 uint64_t fromguid; 622 dmu_objset_type_t type; 623 void *tag; 624 boolean_t force; 625 uint64_t dsflags; 626 char clonelastname[MAXNAMELEN]; 627 dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ 628 cred_t *cr; 629}; 630 631/* ARGSUSED */ 632static int 633recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) 634{ 635 dsl_dir_t *dd = arg1; 636 struct recvbeginsyncarg *rbsa = arg2; 637 objset_t *mos = dd->dd_pool->dp_meta_objset; 638 uint64_t val; 639 int err; 640 641 err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, 642 strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); 643 644 if (err != ENOENT) 645 return (err ? 
err : EEXIST); 646 647 if (rbsa->origin) { 648 /* make sure it's a snap in the same pool */ 649 if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) 650 return (EXDEV); 651 if (!dsl_dataset_is_snapshot(rbsa->origin)) 652 return (EINVAL); 653 if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) 654 return (ENODEV); 655 } 656 657 return (0); 658} 659 660static void 661recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) 662{ 663 dsl_dir_t *dd = arg1; 664 struct recvbeginsyncarg *rbsa = arg2; 665 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; 666 uint64_t dsobj; 667 668 /* Create and open new dataset. */ 669 dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, 670 rbsa->origin, flags, rbsa->cr, tx); 671 VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, 672 B_TRUE, dmu_recv_tag, &rbsa->ds)); 673 674 if (rbsa->origin == NULL) { 675 (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, 676 rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); 677 } 678 679 spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, 680 dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); 681} 682 683/* ARGSUSED */ 684static int 685recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) 686{ 687 dsl_dataset_t *ds = arg1; 688 struct recvbeginsyncarg *rbsa = arg2; 689 int err; 690 uint64_t val; 691 692 /* must not have any changes since most recent snapshot */ 693 if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) 694 return (ETXTBSY); 695 696 /* new snapshot name must not exist */ 697 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, 698 ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); 699 if (err == 0) 700 return (EEXIST); 701 if (err != ENOENT) 702 return (err); 703 704 if (rbsa->fromguid) { 705 /* if incremental, most recent snapshot must match fromguid */ 706 if (ds->ds_prev == NULL) 707 return (ENODEV); 708 709 /* 710 * most recent snapshot must match fromguid, or there are no 711 * changes since the fromguid one 712 */ 713 if 
(ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { 714 uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; 715 uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; 716 while (obj != 0) { 717 dsl_dataset_t *snap; 718 err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 719 obj, FTAG, &snap); 720 if (err) 721 return (ENODEV); 722 if (snap->ds_phys->ds_creation_txg < birth) { 723 dsl_dataset_rele(snap, FTAG); 724 return (ENODEV); 725 } 726 if (snap->ds_phys->ds_guid == rbsa->fromguid) { 727 dsl_dataset_rele(snap, FTAG); 728 break; /* it's ok */ 729 } 730 obj = snap->ds_phys->ds_prev_snap_obj; 731 dsl_dataset_rele(snap, FTAG); 732 } 733 if (obj == 0) 734 return (ENODEV); 735 } 736 } else { 737 /* if full, most recent snapshot must be $ORIGIN */ 738 if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) 739 return (ENODEV); 740 } 741 742 /* temporary clone name must not exist */ 743 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, 744 ds->ds_dir->dd_phys->dd_child_dir_zapobj, 745 rbsa->clonelastname, 8, 1, &val); 746 if (err == 0) 747 return (EEXIST); 748 if (err != ENOENT) 749 return (err); 750 751 return (0); 752} 753 754/* ARGSUSED */ 755static void 756recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) 757{ 758 dsl_dataset_t *ohds = arg1; 759 struct recvbeginsyncarg *rbsa = arg2; 760 dsl_pool_t *dp = ohds->ds_dir->dd_pool; 761 dsl_dataset_t *cds; 762 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; 763 uint64_t dsobj; 764 765 /* create and open the temporary clone */ 766 dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, 767 ohds->ds_prev, flags, rbsa->cr, tx); 768 VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); 769 770 /* 771 * If we actually created a non-clone, we need to create the 772 * objset in our new dataset. 
773 */ 774 if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { 775 (void) dmu_objset_create_impl(dp->dp_spa, 776 cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); 777 } 778 779 rbsa->ds = cds; 780 781 spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, 782 dp->dp_spa, tx, "dataset = %lld", dsobj); 783} 784 785static boolean_t 786dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) 787{ 788 int featureflags; 789 790 featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 791 792 /* Verify pool version supports SA if SA_SPILL feature set */ 793 return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 794 (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); 795} 796 797/* 798 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 799 * succeeds; otherwise we will leak the holds on the datasets. 800 */ 801int 802dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, 803 boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) 804{ 805 int err = 0; 806 boolean_t byteswap; 807 struct recvbeginsyncarg rbsa = { 0 }; 808 uint64_t versioninfo; 809 int flags; 810 dsl_dataset_t *ds; 811 812 if (drrb->drr_magic == DMU_BACKUP_MAGIC) 813 byteswap = FALSE; 814 else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 815 byteswap = TRUE; 816 else 817 return (EINVAL); 818 819 rbsa.tofs = tofs; 820 rbsa.tosnap = tosnap; 821 rbsa.origin = origin ? 
origin->os_dsl_dataset : NULL; 822 rbsa.fromguid = drrb->drr_fromguid; 823 rbsa.type = drrb->drr_type; 824 rbsa.tag = FTAG; 825 rbsa.dsflags = 0; 826 rbsa.cr = CRED(); 827 versioninfo = drrb->drr_versioninfo; 828 flags = drrb->drr_flags; 829 830 if (byteswap) { 831 rbsa.type = BSWAP_32(rbsa.type); 832 rbsa.fromguid = BSWAP_64(rbsa.fromguid); 833 versioninfo = BSWAP_64(versioninfo); 834 flags = BSWAP_32(flags); 835 } 836 837 if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || 838 rbsa.type >= DMU_OST_NUMTYPES || 839 ((flags & DRR_FLAG_CLONE) && origin == NULL)) 840 return (EINVAL); 841 842 if (flags & DRR_FLAG_CI_DATA) 843 rbsa.dsflags = DS_FLAG_CI_DATASET; 844 845 bzero(drc, sizeof (dmu_recv_cookie_t)); 846 drc->drc_drrb = drrb; 847 drc->drc_tosnap = tosnap; 848 drc->drc_top_ds = top_ds; 849 drc->drc_force = force; 850 851 /* 852 * Process the begin in syncing context. 853 */ 854 855 /* open the dataset we are logically receiving into */ 856 err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); 857 if (err == 0) { 858 if (dmu_recv_verify_features(ds, drrb)) { 859 dsl_dataset_rele(ds, dmu_recv_tag); 860 return (ENOTSUP); 861 } 862 /* target fs already exists; recv into temp clone */ 863 864 /* Can't recv a clone into an existing fs */ 865 if (flags & DRR_FLAG_CLONE) { 866 dsl_dataset_rele(ds, dmu_recv_tag); 867 return (EINVAL); 868 } 869 870 /* must not have an incremental recv already in progress */ 871 if (!mutex_tryenter(&ds->ds_recvlock)) { 872 dsl_dataset_rele(ds, dmu_recv_tag); 873 return (EBUSY); 874 } 875 876 /* tmp clone name is: tofs/%tosnap" */ 877 (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), 878 "%%%s", tosnap); 879 rbsa.force = force; 880 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 881 recv_existing_check, recv_existing_sync, ds, &rbsa, 5); 882 if (err) { 883 mutex_exit(&ds->ds_recvlock); 884 dsl_dataset_rele(ds, dmu_recv_tag); 885 return (err); 886 } 887 drc->drc_logical_ds = ds; 888 drc->drc_real_ds = rbsa.ds; 889 } else if 
(err == ENOENT) { 890 /* target fs does not exist; must be a full backup or clone */ 891 char *cp; 892 893 /* 894 * If it's a non-clone incremental, we are missing the 895 * target fs, so fail the recv. 896 */ 897 if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) 898 return (ENOENT); 899 900 /* Open the parent of tofs */ 901 cp = strrchr(tofs, '/'); 902 *cp = '\0'; 903 err = dsl_dataset_hold(tofs, FTAG, &ds); 904 *cp = '/'; 905 if (err) 906 return (err); 907 908 if (dmu_recv_verify_features(ds, drrb)) { 909 dsl_dataset_rele(ds, FTAG); 910 return (ENOTSUP); 911 } 912 913 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 914 recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); 915 dsl_dataset_rele(ds, FTAG); 916 if (err) 917 return (err); 918 drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; 919 drc->drc_newfs = B_TRUE; 920 } 921 922 return (err); 923} 924 925struct restorearg { 926 int err; 927 int byteswap; 928 kthread_t *td; 929 struct file *fp; 930 char *buf; 931 uint64_t voff; 932 int bufsize; /* amount of memory allocated for buf */ 933 zio_cksum_t cksum; 934 avl_tree_t *guid_to_ds_map; 935}; 936 937typedef struct guid_map_entry { 938 uint64_t guid; 939 dsl_dataset_t *gme_ds; 940 avl_node_t avlnode; 941} guid_map_entry_t; 942 943static int 944guid_compare(const void *arg1, const void *arg2) 945{ 946 const guid_map_entry_t *gmep1 = arg1; 947 const guid_map_entry_t *gmep2 = arg2; 948 949 if (gmep1->guid < gmep2->guid) 950 return (-1); 951 else if (gmep1->guid > gmep2->guid) 952 return (1); 953 return (0); 954} 955 956static void 957free_guid_map_onexit(void *arg) 958{ 959 avl_tree_t *ca = arg; 960 void *cookie = NULL; 961 guid_map_entry_t *gmep; 962 963 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 964 dsl_dataset_rele(gmep->gme_ds, ca); 965 kmem_free(gmep, sizeof (guid_map_entry_t)); 966 } 967 avl_destroy(ca); 968 kmem_free(ca, sizeof (avl_tree_t)); 969} 970 971static int 972restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) 
973{ 974 struct uio auio; 975 struct iovec aiov; 976 int error; 977 978 aiov.iov_base = buf; 979 aiov.iov_len = len; 980 auio.uio_iov = &aiov; 981 auio.uio_iovcnt = 1; 982 auio.uio_resid = len; 983 auio.uio_segflg = UIO_SYSSPACE; 984 auio.uio_rw = UIO_READ; 985 auio.uio_offset = off; 986 auio.uio_td = ra->td; 987#ifdef _KERNEL 988 error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); 989#else 990 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 991 error = EOPNOTSUPP; 992#endif 993 *resid = auio.uio_resid; 994 return (error); 995} 996 997static void * 998restore_read(struct restorearg *ra, int len) 999{ 1000 void *rv; 1001 int done = 0; 1002 1003 /* some things will require 8-byte alignment, so everything must */ 1004 ASSERT3U(len % 8, ==, 0); 1005 1006 while (done < len) { 1007 ssize_t resid; 1008 1009 ra->err = restore_bytes(ra, (caddr_t)ra->buf + done, 1010 len - done, ra->voff, &resid); 1011 1012 if (resid == len - done) 1013 ra->err = EINVAL; 1014 ra->voff += len - done - resid; 1015 done = len - resid; 1016 if (ra->err) 1017 return (NULL); 1018 } 1019 1020 ASSERT3U(done, ==, len); 1021 rv = ra->buf; 1022 if (ra->byteswap) 1023 fletcher_4_incremental_byteswap(rv, len, &ra->cksum); 1024 else 1025 fletcher_4_incremental_native(rv, len, &ra->cksum); 1026 return (rv); 1027} 1028 1029static void 1030backup_byteswap(dmu_replay_record_t *drr) 1031{ 1032#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 1033#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 1034 drr->drr_type = BSWAP_32(drr->drr_type); 1035 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 1036 switch (drr->drr_type) { 1037 case DRR_BEGIN: 1038 DO64(drr_begin.drr_magic); 1039 DO64(drr_begin.drr_versioninfo); 1040 DO64(drr_begin.drr_creation_time); 1041 DO32(drr_begin.drr_type); 1042 DO32(drr_begin.drr_flags); 1043 DO64(drr_begin.drr_toguid); 1044 DO64(drr_begin.drr_fromguid); 1045 break; 1046 case DRR_OBJECT: 1047 DO64(drr_object.drr_object); 1048 /* 
DO64(drr_object.drr_allocation_txg); */ 1049 DO32(drr_object.drr_type); 1050 DO32(drr_object.drr_bonustype); 1051 DO32(drr_object.drr_blksz); 1052 DO32(drr_object.drr_bonuslen); 1053 DO64(drr_object.drr_toguid); 1054 break; 1055 case DRR_FREEOBJECTS: 1056 DO64(drr_freeobjects.drr_firstobj); 1057 DO64(drr_freeobjects.drr_numobjs); 1058 DO64(drr_freeobjects.drr_toguid); 1059 break; 1060 case DRR_WRITE: 1061 DO64(drr_write.drr_object); 1062 DO32(drr_write.drr_type); 1063 DO64(drr_write.drr_offset); 1064 DO64(drr_write.drr_length); 1065 DO64(drr_write.drr_toguid); 1066 DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); 1067 DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); 1068 DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); 1069 DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); 1070 DO64(drr_write.drr_key.ddk_prop); 1071 break; 1072 case DRR_WRITE_BYREF: 1073 DO64(drr_write_byref.drr_object); 1074 DO64(drr_write_byref.drr_offset); 1075 DO64(drr_write_byref.drr_length); 1076 DO64(drr_write_byref.drr_toguid); 1077 DO64(drr_write_byref.drr_refguid); 1078 DO64(drr_write_byref.drr_refobject); 1079 DO64(drr_write_byref.drr_refoffset); 1080 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); 1081 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); 1082 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); 1083 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); 1084 DO64(drr_write_byref.drr_key.ddk_prop); 1085 break; 1086 case DRR_FREE: 1087 DO64(drr_free.drr_object); 1088 DO64(drr_free.drr_offset); 1089 DO64(drr_free.drr_length); 1090 DO64(drr_free.drr_toguid); 1091 break; 1092 case DRR_SPILL: 1093 DO64(drr_spill.drr_object); 1094 DO64(drr_spill.drr_length); 1095 DO64(drr_spill.drr_toguid); 1096 break; 1097 case DRR_END: 1098 DO64(drr_end.drr_checksum.zc_word[0]); 1099 DO64(drr_end.drr_checksum.zc_word[1]); 1100 DO64(drr_end.drr_checksum.zc_word[2]); 1101 DO64(drr_end.drr_checksum.zc_word[3]); 1102 DO64(drr_end.drr_toguid); 1103 break; 1104 } 1105#undef DO64 1106#undef DO32 1107} 1108 
1109static int 1110restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) 1111{ 1112 int err; 1113 dmu_tx_t *tx; 1114 void *data = NULL; 1115 1116 if (drro->drr_type == DMU_OT_NONE || 1117 !DMU_OT_IS_VALID(drro->drr_type) || 1118 !DMU_OT_IS_VALID(drro->drr_bonustype) || 1119 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1120 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1121 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1122 drro->drr_blksz < SPA_MINBLOCKSIZE || 1123 drro->drr_blksz > SPA_MAXBLOCKSIZE || 1124 drro->drr_bonuslen > DN_MAX_BONUSLEN) { 1125 return (EINVAL); 1126 } 1127 1128 err = dmu_object_info(os, drro->drr_object, NULL); 1129 1130 if (err != 0 && err != ENOENT) 1131 return (EINVAL); 1132 1133 if (drro->drr_bonuslen) { 1134 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); 1135 if (ra->err) 1136 return (ra->err); 1137 } 1138 1139 if (err == ENOENT) { 1140 /* currently free, want to be allocated */ 1141 tx = dmu_tx_create(os); 1142 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1143 err = dmu_tx_assign(tx, TXG_WAIT); 1144 if (err) { 1145 dmu_tx_abort(tx); 1146 return (err); 1147 } 1148 err = dmu_object_claim(os, drro->drr_object, 1149 drro->drr_type, drro->drr_blksz, 1150 drro->drr_bonustype, drro->drr_bonuslen, tx); 1151 dmu_tx_commit(tx); 1152 } else { 1153 /* currently allocated, want to be allocated */ 1154 err = dmu_object_reclaim(os, drro->drr_object, 1155 drro->drr_type, drro->drr_blksz, 1156 drro->drr_bonustype, drro->drr_bonuslen); 1157 } 1158 if (err) { 1159 return (EINVAL); 1160 } 1161 1162 tx = dmu_tx_create(os); 1163 dmu_tx_hold_bonus(tx, drro->drr_object); 1164 err = dmu_tx_assign(tx, TXG_WAIT); 1165 if (err) { 1166 dmu_tx_abort(tx); 1167 return (err); 1168 } 1169 1170 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, 1171 tx); 1172 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); 1173 1174 if (data != NULL) { 1175 dmu_buf_t *db; 1176 1177 VERIFY(0 == 
dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); 1178 dmu_buf_will_dirty(db, tx); 1179 1180 ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 1181 bcopy(data, db->db_data, drro->drr_bonuslen); 1182 if (ra->byteswap) { 1183 dmu_object_byteswap_t byteswap = 1184 DMU_OT_BYTESWAP(drro->drr_bonustype); 1185 dmu_ot_byteswap[byteswap].ob_func(db->db_data, 1186 drro->drr_bonuslen); 1187 } 1188 dmu_buf_rele(db, FTAG); 1189 } 1190 dmu_tx_commit(tx); 1191 return (0); 1192} 1193 1194/* ARGSUSED */ 1195static int 1196restore_freeobjects(struct restorearg *ra, objset_t *os, 1197 struct drr_freeobjects *drrfo) 1198{ 1199 uint64_t obj; 1200 1201 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1202 return (EINVAL); 1203 1204 for (obj = drrfo->drr_firstobj; 1205 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1206 (void) dmu_object_next(os, &obj, FALSE, 0)) { 1207 int err; 1208 1209 if (dmu_object_info(os, obj, NULL) != 0) 1210 continue; 1211 1212 err = dmu_free_object(os, obj); 1213 if (err) 1214 return (err); 1215 } 1216 return (0); 1217} 1218 1219static int 1220restore_write(struct restorearg *ra, objset_t *os, 1221 struct drr_write *drrw) 1222{ 1223 dmu_tx_t *tx; 1224 void *data; 1225 int err; 1226 1227 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 1228 !DMU_OT_IS_VALID(drrw->drr_type)) 1229 return (EINVAL); 1230 1231 data = restore_read(ra, drrw->drr_length); 1232 if (data == NULL) 1233 return (ra->err); 1234 1235 if (dmu_object_info(os, drrw->drr_object, NULL) != 0) 1236 return (EINVAL); 1237 1238 tx = dmu_tx_create(os); 1239 1240 dmu_tx_hold_write(tx, drrw->drr_object, 1241 drrw->drr_offset, drrw->drr_length); 1242 err = dmu_tx_assign(tx, TXG_WAIT); 1243 if (err) { 1244 dmu_tx_abort(tx); 1245 return (err); 1246 } 1247 if (ra->byteswap) { 1248 dmu_object_byteswap_t byteswap = 1249 DMU_OT_BYTESWAP(drrw->drr_type); 1250 dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); 1251 } 1252 dmu_write(os, drrw->drr_object, 1253 drrw->drr_offset, 
drrw->drr_length, data, tx); 1254 dmu_tx_commit(tx); 1255 return (0); 1256} 1257 1258/* 1259 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 1260 * streams to refer to a copy of the data that is already on the 1261 * system because it came in earlier in the stream. This function 1262 * finds the earlier copy of the data, and uses that copy instead of 1263 * data from the stream to fulfill this write. 1264 */ 1265static int 1266restore_write_byref(struct restorearg *ra, objset_t *os, 1267 struct drr_write_byref *drrwbr) 1268{ 1269 dmu_tx_t *tx; 1270 int err; 1271 guid_map_entry_t gmesrch; 1272 guid_map_entry_t *gmep; 1273 avl_index_t where; 1274 objset_t *ref_os = NULL; 1275 dmu_buf_t *dbp; 1276 1277 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1278 return (EINVAL); 1279 1280 /* 1281 * If the GUID of the referenced dataset is different from the 1282 * GUID of the target dataset, find the referenced dataset. 1283 */ 1284 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1285 gmesrch.guid = drrwbr->drr_refguid; 1286 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1287 &where)) == NULL) { 1288 return (EINVAL); 1289 } 1290 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1291 return (EINVAL); 1292 } else { 1293 ref_os = os; 1294 } 1295 1296 if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1297 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH)) 1298 return (err); 1299 1300 tx = dmu_tx_create(os); 1301 1302 dmu_tx_hold_write(tx, drrwbr->drr_object, 1303 drrwbr->drr_offset, drrwbr->drr_length); 1304 err = dmu_tx_assign(tx, TXG_WAIT); 1305 if (err) { 1306 dmu_tx_abort(tx); 1307 return (err); 1308 } 1309 dmu_write(os, drrwbr->drr_object, 1310 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1311 dmu_buf_rele(dbp, FTAG); 1312 dmu_tx_commit(tx); 1313 return (0); 1314} 1315 1316static int 1317restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) 1318{ 1319 dmu_tx_t *tx; 1320 void *data; 1321 
dmu_buf_t *db, *db_spill; 1322 int err; 1323 1324 if (drrs->drr_length < SPA_MINBLOCKSIZE || 1325 drrs->drr_length > SPA_MAXBLOCKSIZE) 1326 return (EINVAL); 1327 1328 data = restore_read(ra, drrs->drr_length); 1329 if (data == NULL) 1330 return (ra->err); 1331 1332 if (dmu_object_info(os, drrs->drr_object, NULL) != 0) 1333 return (EINVAL); 1334 1335 VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); 1336 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1337 dmu_buf_rele(db, FTAG); 1338 return (err); 1339 } 1340 1341 tx = dmu_tx_create(os); 1342 1343 dmu_tx_hold_spill(tx, db->db_object); 1344 1345 err = dmu_tx_assign(tx, TXG_WAIT); 1346 if (err) { 1347 dmu_buf_rele(db, FTAG); 1348 dmu_buf_rele(db_spill, FTAG); 1349 dmu_tx_abort(tx); 1350 return (err); 1351 } 1352 dmu_buf_will_dirty(db_spill, tx); 1353 1354 if (db_spill->db_size < drrs->drr_length) 1355 VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1356 drrs->drr_length, tx)); 1357 bcopy(data, db_spill->db_data, drrs->drr_length); 1358 1359 dmu_buf_rele(db, FTAG); 1360 dmu_buf_rele(db_spill, FTAG); 1361 1362 dmu_tx_commit(tx); 1363 return (0); 1364} 1365 1366/* ARGSUSED */ 1367static int 1368restore_free(struct restorearg *ra, objset_t *os, 1369 struct drr_free *drrf) 1370{ 1371 int err; 1372 1373 if (drrf->drr_length != -1ULL && 1374 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 1375 return (EINVAL); 1376 1377 if (dmu_object_info(os, drrf->drr_object, NULL) != 0) 1378 return (EINVAL); 1379 1380 err = dmu_free_long_range(os, drrf->drr_object, 1381 drrf->drr_offset, drrf->drr_length); 1382 return (err); 1383} 1384 1385/* 1386 * NB: callers *must* call dmu_recv_end() if this succeeds. 
1387 */ 1388int 1389dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, 1390 int cleanup_fd, uint64_t *action_handlep) 1391{ 1392 struct restorearg ra = { 0 }; 1393 dmu_replay_record_t *drr; 1394 objset_t *os; 1395 zio_cksum_t pcksum; 1396 int featureflags; 1397 1398 if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1399 ra.byteswap = TRUE; 1400 1401 { 1402 /* compute checksum of drr_begin record */ 1403 dmu_replay_record_t *drr; 1404 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1405 1406 drr->drr_type = DRR_BEGIN; 1407 drr->drr_u.drr_begin = *drc->drc_drrb; 1408 if (ra.byteswap) { 1409 fletcher_4_incremental_byteswap(drr, 1410 sizeof (dmu_replay_record_t), &ra.cksum); 1411 } else { 1412 fletcher_4_incremental_native(drr, 1413 sizeof (dmu_replay_record_t), &ra.cksum); 1414 } 1415 kmem_free(drr, sizeof (dmu_replay_record_t)); 1416 } 1417 1418 if (ra.byteswap) { 1419 struct drr_begin *drrb = drc->drc_drrb; 1420 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1421 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1422 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1423 drrb->drr_type = BSWAP_32(drrb->drr_type); 1424 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1425 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1426 } 1427 1428 ra.td = curthread; 1429 ra.fp = fp; 1430 ra.voff = *voffp; 1431 ra.bufsize = 1<<20; 1432 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); 1433 1434 /* these were verified in dmu_recv_begin */ 1435 ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == 1436 DMU_SUBSTREAM); 1437 ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); 1438 1439 /* 1440 * Open the objset we are modifying. 
1441 */ 1442 VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); 1443 1444 ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); 1445 1446 featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); 1447 1448 /* if this stream is dedup'ed, set up the avl tree for guid mapping */ 1449 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { 1450 minor_t minor; 1451 1452 if (cleanup_fd == -1) { 1453 ra.err = EBADF; 1454 goto out; 1455 } 1456 ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); 1457 if (ra.err) { 1458 cleanup_fd = -1; 1459 goto out; 1460 } 1461 1462 if (*action_handlep == 0) { 1463 ra.guid_to_ds_map = 1464 kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 1465 avl_create(ra.guid_to_ds_map, guid_compare, 1466 sizeof (guid_map_entry_t), 1467 offsetof(guid_map_entry_t, avlnode)); 1468 ra.err = zfs_onexit_add_cb(minor, 1469 free_guid_map_onexit, ra.guid_to_ds_map, 1470 action_handlep); 1471 if (ra.err) 1472 goto out; 1473 } else { 1474 ra.err = zfs_onexit_cb_data(minor, *action_handlep, 1475 (void **)&ra.guid_to_ds_map); 1476 if (ra.err) 1477 goto out; 1478 } 1479 1480 drc->drc_guid_to_ds_map = ra.guid_to_ds_map; 1481 } 1482 1483 /* 1484 * Read records and process them. 1485 */ 1486 pcksum = ra.cksum; 1487 while (ra.err == 0 && 1488 NULL != (drr = restore_read(&ra, sizeof (*drr)))) { 1489 if (issig(JUSTLOOKING) && issig(FORREAL)) { 1490 ra.err = EINTR; 1491 goto out; 1492 } 1493 1494 if (ra.byteswap) 1495 backup_byteswap(drr); 1496 1497 switch (drr->drr_type) { 1498 case DRR_OBJECT: 1499 { 1500 /* 1501 * We need to make a copy of the record header, 1502 * because restore_{object,write} may need to 1503 * restore_read(), which will invalidate drr. 
1504 */ 1505 struct drr_object drro = drr->drr_u.drr_object; 1506 ra.err = restore_object(&ra, os, &drro); 1507 break; 1508 } 1509 case DRR_FREEOBJECTS: 1510 { 1511 struct drr_freeobjects drrfo = 1512 drr->drr_u.drr_freeobjects; 1513 ra.err = restore_freeobjects(&ra, os, &drrfo); 1514 break; 1515 } 1516 case DRR_WRITE: 1517 { 1518 struct drr_write drrw = drr->drr_u.drr_write; 1519 ra.err = restore_write(&ra, os, &drrw); 1520 break; 1521 } 1522 case DRR_WRITE_BYREF: 1523 { 1524 struct drr_write_byref drrwbr = 1525 drr->drr_u.drr_write_byref; 1526 ra.err = restore_write_byref(&ra, os, &drrwbr); 1527 break; 1528 } 1529 case DRR_FREE: 1530 { 1531 struct drr_free drrf = drr->drr_u.drr_free; 1532 ra.err = restore_free(&ra, os, &drrf); 1533 break; 1534 } 1535 case DRR_END: 1536 { 1537 struct drr_end drre = drr->drr_u.drr_end; 1538 /* 1539 * We compare against the *previous* checksum 1540 * value, because the stored checksum is of 1541 * everything before the DRR_END record. 1542 */ 1543 if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) 1544 ra.err = ECKSUM; 1545 goto out; 1546 } 1547 case DRR_SPILL: 1548 { 1549 struct drr_spill drrs = drr->drr_u.drr_spill; 1550 ra.err = restore_spill(&ra, os, &drrs); 1551 break; 1552 } 1553 default: 1554 ra.err = EINVAL; 1555 goto out; 1556 } 1557 pcksum = ra.cksum; 1558 } 1559 ASSERT(ra.err != 0); 1560 1561out: 1562 if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) 1563 zfs_onexit_fd_rele(cleanup_fd); 1564 1565 if (ra.err != 0) { 1566 /* 1567 * destroy what we created, so we don't leave it in the 1568 * inconsistent restoring state. 
1569 */ 1570 txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); 1571 1572 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, 1573 B_FALSE); 1574 if (drc->drc_real_ds != drc->drc_logical_ds) { 1575 mutex_exit(&drc->drc_logical_ds->ds_recvlock); 1576 dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); 1577 } 1578 } 1579 1580 kmem_free(ra.buf, ra.bufsize); 1581 *voffp = ra.voff; 1582 return (ra.err); 1583} 1584 1585struct recvendsyncarg { 1586 char *tosnap; 1587 uint64_t creation_time; 1588 uint64_t toguid; 1589}; 1590 1591static int 1592recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) 1593{ 1594 dsl_dataset_t *ds = arg1; 1595 struct recvendsyncarg *resa = arg2; 1596 1597 return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); 1598} 1599 1600static void 1601recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) 1602{ 1603 dsl_dataset_t *ds = arg1; 1604 struct recvendsyncarg *resa = arg2; 1605 1606 dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); 1607 1608 /* set snapshot's creation time and guid */ 1609 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 1610 ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; 1611 ds->ds_prev->ds_phys->ds_guid = resa->toguid; 1612 ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 1613 1614 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1615 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 1616} 1617 1618static int 1619add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) 1620{ 1621 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1622 uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; 1623 dsl_dataset_t *snapds; 1624 guid_map_entry_t *gmep; 1625 int err; 1626 1627 ASSERT(guid_map != NULL); 1628 1629 rw_enter(&dp->dp_config_rwlock, RW_READER); 1630 err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); 1631 if (err == 0) { 1632 gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); 1633 gmep->guid = snapds->ds_phys->ds_guid; 1634 gmep->gme_ds = snapds; 1635 avl_add(guid_map, gmep); 1636 } 1637 1638 
rw_exit(&dp->dp_config_rwlock); 1639 return (err); 1640} 1641 1642static int 1643dmu_recv_existing_end(dmu_recv_cookie_t *drc) 1644{ 1645 struct recvendsyncarg resa; 1646 dsl_dataset_t *ds = drc->drc_logical_ds; 1647 int err, myerr; 1648 1649 /* 1650 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() 1651 * expects it to have a ds_user_ptr (and zil), but clone_swap() 1652 * can close it. 1653 */ 1654 txg_wait_synced(ds->ds_dir->dd_pool, 0); 1655 1656 if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { 1657 err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, 1658 drc->drc_force); 1659 if (err) 1660 goto out; 1661 } else { 1662 mutex_exit(&ds->ds_recvlock); 1663 dsl_dataset_rele(ds, dmu_recv_tag); 1664 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, 1665 B_FALSE); 1666 return (EBUSY); 1667 } 1668 1669 resa.creation_time = drc->drc_drrb->drr_creation_time; 1670 resa.toguid = drc->drc_drrb->drr_toguid; 1671 resa.tosnap = drc->drc_tosnap; 1672 1673 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1674 recv_end_check, recv_end_sync, ds, &resa, 3); 1675 if (err) { 1676 /* swap back */ 1677 (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); 1678 } 1679 1680out: 1681 mutex_exit(&ds->ds_recvlock); 1682 if (err == 0 && drc->drc_guid_to_ds_map != NULL) 1683 (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); 1684 dsl_dataset_disown(ds, dmu_recv_tag); 1685 myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); 1686 ASSERT3U(myerr, ==, 0); 1687 return (err); 1688} 1689 1690static int 1691dmu_recv_new_end(dmu_recv_cookie_t *drc) 1692{ 1693 struct recvendsyncarg resa; 1694 dsl_dataset_t *ds = drc->drc_logical_ds; 1695 int err; 1696 1697 /* 1698 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() 1699 * expects it to have a ds_user_ptr (and zil), but clone_swap() 1700 * can close it. 
1701 */ 1702 txg_wait_synced(ds->ds_dir->dd_pool, 0); 1703 1704 resa.creation_time = drc->drc_drrb->drr_creation_time; 1705 resa.toguid = drc->drc_drrb->drr_toguid; 1706 resa.tosnap = drc->drc_tosnap; 1707 1708 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1709 recv_end_check, recv_end_sync, ds, &resa, 3); 1710 if (err) { 1711 /* clean up the fs we just recv'd into */ 1712 (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); 1713 } else { 1714 if (drc->drc_guid_to_ds_map != NULL) 1715 (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); 1716 /* release the hold from dmu_recv_begin */ 1717 dsl_dataset_disown(ds, dmu_recv_tag); 1718 } 1719 return (err); 1720} 1721 1722int 1723dmu_recv_end(dmu_recv_cookie_t *drc) 1724{ 1725 if (drc->drc_logical_ds != drc->drc_real_ds) 1726 return (dmu_recv_existing_end(drc)); 1727 else 1728 return (dmu_recv_new_end(drc)); 1729} 1730