/* dmu_send.c, revision 221263 */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24/* 25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 26 */ 27 28#include <sys/dmu.h> 29#include <sys/dmu_impl.h> 30#include <sys/dmu_tx.h> 31#include <sys/dbuf.h> 32#include <sys/dnode.h> 33#include <sys/zfs_context.h> 34#include <sys/dmu_objset.h> 35#include <sys/dmu_traverse.h> 36#include <sys/dsl_dataset.h> 37#include <sys/dsl_dir.h> 38#include <sys/dsl_prop.h> 39#include <sys/dsl_pool.h> 40#include <sys/dsl_synctask.h> 41#include <sys/zfs_ioctl.h> 42#include <sys/zap.h> 43#include <sys/zio_checksum.h> 44#include <sys/zfs_znode.h> 45#include <zfs_fletcher.h> 46#include <sys/avl.h> 47#include <sys/ddt.h> 48#include <sys/zfs_onexit.h> 49 50static char *dmu_recv_tag = "dmu_recv_tag"; 51 52/* 53 * The list of data whose inclusion in a send stream can be pending from 54 * one call to backup_cb to another. Multiple calls to dump_free() and 55 * dump_freeobjects() can be aggregated into a single DRR_FREE or 56 * DRR_FREEOBJECTS replay record. 
57 */ 58typedef enum { 59 PENDING_NONE, 60 PENDING_FREE, 61 PENDING_FREEOBJECTS 62} pendop_t; 63 64struct backuparg { 65 dmu_replay_record_t *drr; 66 kthread_t *td; 67 struct file *fp; 68 offset_t *off; 69 objset_t *os; 70 zio_cksum_t zc; 71 uint64_t toguid; 72 int err; 73 pendop_t pending_op; 74}; 75 76static int 77dump_bytes(struct backuparg *ba, void *buf, int len) 78{ 79 struct uio auio; 80 struct iovec aiov; 81 ASSERT3U(len % 8, ==, 0); 82 83 fletcher_4_incremental_native(buf, len, &ba->zc); 84 aiov.iov_base = buf; 85 aiov.iov_len = len; 86 auio.uio_iov = &aiov; 87 auio.uio_iovcnt = 1; 88 auio.uio_resid = len; 89 auio.uio_segflg = UIO_SYSSPACE; 90 auio.uio_rw = UIO_WRITE; 91 auio.uio_offset = (off_t)-1; 92 auio.uio_td = ba->td; 93#ifdef _KERNEL 94 if (ba->fp->f_type == DTYPE_VNODE) 95 bwillwrite(); 96 ba->err = fo_write(ba->fp, &auio, ba->td->td_ucred, 0, ba->td); 97#else 98 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 99 ba->err = EOPNOTSUPP; 100#endif 101 *ba->off += len; 102 return (ba->err); 103} 104 105static int 106dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, 107 uint64_t length) 108{ 109 struct drr_free *drrf = &(ba->drr->drr_u.drr_free); 110 111 /* 112 * If there is a pending op, but it's not PENDING_FREE, push it out, 113 * since free block aggregation can only be done for blocks of the 114 * same type (i.e., DRR_FREE records can only be aggregated with 115 * other DRR_FREE records. DRR_FREEOBJECTS records can only be 116 * aggregated with other DRR_FREEOBJECTS records. 
117 */ 118 if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) { 119 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 120 return (EINTR); 121 ba->pending_op = PENDING_NONE; 122 } 123 124 if (ba->pending_op == PENDING_FREE) { 125 /* 126 * There should never be a PENDING_FREE if length is -1 127 * (because dump_dnode is the only place where this 128 * function is called with a -1, and only after flushing 129 * any pending record). 130 */ 131 ASSERT(length != -1ULL); 132 /* 133 * Check to see whether this free block can be aggregated 134 * with pending one. 135 */ 136 if (drrf->drr_object == object && drrf->drr_offset + 137 drrf->drr_length == offset) { 138 drrf->drr_length += length; 139 return (0); 140 } else { 141 /* not a continuation. Push out pending record */ 142 if (dump_bytes(ba, ba->drr, 143 sizeof (dmu_replay_record_t)) != 0) 144 return (EINTR); 145 ba->pending_op = PENDING_NONE; 146 } 147 } 148 /* create a FREE record and make it pending */ 149 bzero(ba->drr, sizeof (dmu_replay_record_t)); 150 ba->drr->drr_type = DRR_FREE; 151 drrf->drr_object = object; 152 drrf->drr_offset = offset; 153 drrf->drr_length = length; 154 drrf->drr_toguid = ba->toguid; 155 if (length == -1ULL) { 156 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 157 return (EINTR); 158 } else { 159 ba->pending_op = PENDING_FREE; 160 } 161 162 return (0); 163} 164 165static int 166dump_data(struct backuparg *ba, dmu_object_type_t type, 167 uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) 168{ 169 struct drr_write *drrw = &(ba->drr->drr_u.drr_write); 170 171 172 /* 173 * If there is any kind of pending aggregation (currently either 174 * a grouping of free objects or free blocks), push it out to 175 * the stream, since aggregation can't be done across operations 176 * of different types. 
177 */ 178 if (ba->pending_op != PENDING_NONE) { 179 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 180 return (EINTR); 181 ba->pending_op = PENDING_NONE; 182 } 183 /* write a DATA record */ 184 bzero(ba->drr, sizeof (dmu_replay_record_t)); 185 ba->drr->drr_type = DRR_WRITE; 186 drrw->drr_object = object; 187 drrw->drr_type = type; 188 drrw->drr_offset = offset; 189 drrw->drr_length = blksz; 190 drrw->drr_toguid = ba->toguid; 191 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 192 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 193 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 194 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 195 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 196 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 197 drrw->drr_key.ddk_cksum = bp->blk_cksum; 198 199 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 200 return (EINTR); 201 if (dump_bytes(ba, data, blksz) != 0) 202 return (EINTR); 203 return (0); 204} 205 206static int 207dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data) 208{ 209 struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill); 210 211 if (ba->pending_op != PENDING_NONE) { 212 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 213 return (EINTR); 214 ba->pending_op = PENDING_NONE; 215 } 216 217 /* write a SPILL record */ 218 bzero(ba->drr, sizeof (dmu_replay_record_t)); 219 ba->drr->drr_type = DRR_SPILL; 220 drrs->drr_object = object; 221 drrs->drr_length = blksz; 222 drrs->drr_toguid = ba->toguid; 223 224 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) 225 return (EINTR); 226 if (dump_bytes(ba, data, blksz)) 227 return (EINTR); 228 return (0); 229} 230 231static int 232dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) 233{ 234 struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); 235 236 /* 237 * If there is a pending op, but it's not PENDING_FREEOBJECTS, 238 * push it out, since free block 
aggregation can only be done for 239 * blocks of the same type (i.e., DRR_FREE records can only be 240 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 241 * can only be aggregated with other DRR_FREEOBJECTS records. 242 */ 243 if (ba->pending_op != PENDING_NONE && 244 ba->pending_op != PENDING_FREEOBJECTS) { 245 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 246 return (EINTR); 247 ba->pending_op = PENDING_NONE; 248 } 249 if (ba->pending_op == PENDING_FREEOBJECTS) { 250 /* 251 * See whether this free object array can be aggregated 252 * with pending one 253 */ 254 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 255 drrfo->drr_numobjs += numobjs; 256 return (0); 257 } else { 258 /* can't be aggregated. Push out pending record */ 259 if (dump_bytes(ba, ba->drr, 260 sizeof (dmu_replay_record_t)) != 0) 261 return (EINTR); 262 ba->pending_op = PENDING_NONE; 263 } 264 } 265 266 /* write a FREEOBJECTS record */ 267 bzero(ba->drr, sizeof (dmu_replay_record_t)); 268 ba->drr->drr_type = DRR_FREEOBJECTS; 269 drrfo->drr_firstobj = firstobj; 270 drrfo->drr_numobjs = numobjs; 271 drrfo->drr_toguid = ba->toguid; 272 273 ba->pending_op = PENDING_FREEOBJECTS; 274 275 return (0); 276} 277 278static int 279dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) 280{ 281 struct drr_object *drro = &(ba->drr->drr_u.drr_object); 282 283 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 284 return (dump_freeobjects(ba, object, 1)); 285 286 if (ba->pending_op != PENDING_NONE) { 287 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 288 return (EINTR); 289 ba->pending_op = PENDING_NONE; 290 } 291 292 /* write an OBJECT record */ 293 bzero(ba->drr, sizeof (dmu_replay_record_t)); 294 ba->drr->drr_type = DRR_OBJECT; 295 drro->drr_object = object; 296 drro->drr_type = dnp->dn_type; 297 drro->drr_bonustype = dnp->dn_bonustype; 298 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 299 drro->drr_bonuslen = 
dnp->dn_bonuslen; 300 drro->drr_checksumtype = dnp->dn_checksum; 301 drro->drr_compress = dnp->dn_compress; 302 drro->drr_toguid = ba->toguid; 303 304 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 305 return (EINTR); 306 307 if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) 308 return (EINTR); 309 310 /* free anything past the end of the file */ 311 if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * 312 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) 313 return (EINTR); 314 if (ba->err) 315 return (EINTR); 316 return (0); 317} 318 319#define BP_SPAN(dnp, level) \ 320 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 321 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 322 323/* ARGSUSED */ 324static int 325backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, 326 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 327{ 328 struct backuparg *ba = arg; 329 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 330 int err = 0; 331 332 if (issig(JUSTLOOKING) && issig(FORREAL)) 333 return (EINTR); 334 335 if (zb->zb_object != DMU_META_DNODE_OBJECT && 336 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 337 return (0); 338 } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { 339 uint64_t span = BP_SPAN(dnp, zb->zb_level); 340 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 341 err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); 342 } else if (bp == NULL) { 343 uint64_t span = BP_SPAN(dnp, zb->zb_level); 344 err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span); 345 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 346 return (0); 347 } else if (type == DMU_OT_DNODE) { 348 dnode_phys_t *blk; 349 int i; 350 int blksz = BP_GET_LSIZE(bp); 351 uint32_t aflags = ARC_WAIT; 352 arc_buf_t *abuf; 353 354 if (dsl_read(NULL, spa, bp, pbuf, 355 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 356 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) 357 return (EIO); 358 359 blk 
= abuf->b_data; 360 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 361 uint64_t dnobj = (zb->zb_blkid << 362 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 363 err = dump_dnode(ba, dnobj, blk+i); 364 if (err) 365 break; 366 } 367 (void) arc_buf_remove_ref(abuf, &abuf); 368 } else if (type == DMU_OT_SA) { 369 uint32_t aflags = ARC_WAIT; 370 arc_buf_t *abuf; 371 int blksz = BP_GET_LSIZE(bp); 372 373 if (arc_read_nolock(NULL, spa, bp, 374 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 375 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) 376 return (EIO); 377 378 err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data); 379 (void) arc_buf_remove_ref(abuf, &abuf); 380 } else { /* it's a level-0 block of a regular object */ 381 uint32_t aflags = ARC_WAIT; 382 arc_buf_t *abuf; 383 int blksz = BP_GET_LSIZE(bp); 384 385 if (dsl_read(NULL, spa, bp, pbuf, 386 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 387 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) 388 return (EIO); 389 390 err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, 391 blksz, bp, abuf->b_data); 392 (void) arc_buf_remove_ref(abuf, &abuf); 393 } 394 395 ASSERT(err == 0 || err == EINTR); 396 return (err); 397} 398 399int 400dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, 401 struct file *fp, offset_t *off) 402{ 403 dsl_dataset_t *ds = tosnap->os_dsl_dataset; 404 dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; 405 dmu_replay_record_t *drr; 406 struct backuparg ba; 407 int err; 408 uint64_t fromtxg = 0; 409 410 /* tosnap must be a snapshot */ 411 if (ds->ds_phys->ds_next_snap_obj == 0) 412 return (EINVAL); 413 414 /* fromsnap must be an earlier snapshot from the same fs as tosnap */ 415 if (fromds && (ds->ds_dir != fromds->ds_dir || 416 fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) 417 return (EXDEV); 418 419 if (fromorigin) { 420 dsl_pool_t *dp = ds->ds_dir->dd_pool; 421 422 if (fromsnap) 423 return (EINVAL); 424 425 if (dsl_dir_is_clone(ds->ds_dir)) { 426 rw_enter(&dp->dp_config_rwlock, RW_READER); 427 err = dsl_dataset_hold_obj(dp, 428 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); 429 rw_exit(&dp->dp_config_rwlock); 430 if (err) 431 return (err); 432 } else { 433 fromorigin = B_FALSE; 434 } 435 } 436 437 438 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 439 drr->drr_type = DRR_BEGIN; 440 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 441 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 442 DMU_SUBSTREAM); 443 444#ifdef _KERNEL 445 if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { 446 uint64_t version; 447 if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) 448 return (EINVAL); 449 if (version == ZPL_VERSION_SA) { 450 DMU_SET_FEATUREFLAGS( 451 drr->drr_u.drr_begin.drr_versioninfo, 452 DMU_BACKUP_FEATURE_SA_SPILL); 453 } 454 } 455#endif 456 457 drr->drr_u.drr_begin.drr_creation_time = 458 ds->ds_phys->ds_creation_time; 459 drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; 460 if (fromorigin) 461 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 462 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; 463 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 464 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 465 466 if (fromds) 467 drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; 468 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 469 470 if 
(fromds) 471 fromtxg = fromds->ds_phys->ds_creation_txg; 472 if (fromorigin) 473 dsl_dataset_rele(fromds, FTAG); 474 475 ba.drr = drr; 476 ba.td = curthread; 477 ba.fp = fp; 478 ba.os = tosnap; 479 ba.off = off; 480 ba.toguid = ds->ds_phys->ds_guid; 481 ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); 482 ba.pending_op = PENDING_NONE; 483 484 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { 485 kmem_free(drr, sizeof (dmu_replay_record_t)); 486 return (ba.err); 487 } 488 489 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 490 backup_cb, &ba); 491 492 if (ba.pending_op != PENDING_NONE) 493 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) 494 err = EINTR; 495 496 if (err) { 497 if (err == EINTR && ba.err) 498 err = ba.err; 499 kmem_free(drr, sizeof (dmu_replay_record_t)); 500 return (err); 501 } 502 503 bzero(drr, sizeof (dmu_replay_record_t)); 504 drr->drr_type = DRR_END; 505 drr->drr_u.drr_end.drr_checksum = ba.zc; 506 drr->drr_u.drr_end.drr_toguid = ba.toguid; 507 508 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { 509 kmem_free(drr, sizeof (dmu_replay_record_t)); 510 return (ba.err); 511 } 512 513 kmem_free(drr, sizeof (dmu_replay_record_t)); 514 515 return (0); 516} 517 518struct recvbeginsyncarg { 519 const char *tofs; 520 const char *tosnap; 521 dsl_dataset_t *origin; 522 uint64_t fromguid; 523 dmu_objset_type_t type; 524 void *tag; 525 boolean_t force; 526 uint64_t dsflags; 527 char clonelastname[MAXNAMELEN]; 528 dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ 529 cred_t *cr; 530}; 531 532/* ARGSUSED */ 533static int 534recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) 535{ 536 dsl_dir_t *dd = arg1; 537 struct recvbeginsyncarg *rbsa = arg2; 538 objset_t *mos = dd->dd_pool->dp_meta_objset; 539 uint64_t val; 540 int err; 541 542 err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, 543 strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); 544 545 if (err != ENOENT) 546 return 
(err ? err : EEXIST); 547 548 if (rbsa->origin) { 549 /* make sure it's a snap in the same pool */ 550 if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) 551 return (EXDEV); 552 if (!dsl_dataset_is_snapshot(rbsa->origin)) 553 return (EINVAL); 554 if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) 555 return (ENODEV); 556 } 557 558 return (0); 559} 560 561static void 562recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) 563{ 564 dsl_dir_t *dd = arg1; 565 struct recvbeginsyncarg *rbsa = arg2; 566 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; 567 uint64_t dsobj; 568 569 /* Create and open new dataset. */ 570 dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, 571 rbsa->origin, flags, rbsa->cr, tx); 572 VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, 573 B_TRUE, dmu_recv_tag, &rbsa->ds)); 574 575 if (rbsa->origin == NULL) { 576 (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, 577 rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); 578 } 579 580 spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, 581 dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); 582} 583 584/* ARGSUSED */ 585static int 586recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) 587{ 588 dsl_dataset_t *ds = arg1; 589 struct recvbeginsyncarg *rbsa = arg2; 590 int err; 591 uint64_t val; 592 593 /* must not have any changes since most recent snapshot */ 594 if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) 595 return (ETXTBSY); 596 597 /* new snapshot name must not exist */ 598 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, 599 ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); 600 if (err == 0) 601 return (EEXIST); 602 if (err != ENOENT) 603 return (err); 604 605 if (rbsa->fromguid) { 606 /* if incremental, most recent snapshot must match fromguid */ 607 if (ds->ds_prev == NULL) 608 return (ENODEV); 609 610 /* 611 * most recent snapshot must match fromguid, or there are no 612 * changes since the fromguid one 613 */ 614 if 
(ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { 615 uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; 616 uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; 617 while (obj != 0) { 618 dsl_dataset_t *snap; 619 err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 620 obj, FTAG, &snap); 621 if (err) 622 return (ENODEV); 623 if (snap->ds_phys->ds_creation_txg < birth) { 624 dsl_dataset_rele(snap, FTAG); 625 return (ENODEV); 626 } 627 if (snap->ds_phys->ds_guid == rbsa->fromguid) { 628 dsl_dataset_rele(snap, FTAG); 629 break; /* it's ok */ 630 } 631 obj = snap->ds_phys->ds_prev_snap_obj; 632 dsl_dataset_rele(snap, FTAG); 633 } 634 if (obj == 0) 635 return (ENODEV); 636 } 637 } else { 638 /* if full, most recent snapshot must be $ORIGIN */ 639 if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) 640 return (ENODEV); 641 } 642 643 /* temporary clone name must not exist */ 644 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, 645 ds->ds_dir->dd_phys->dd_child_dir_zapobj, 646 rbsa->clonelastname, 8, 1, &val); 647 if (err == 0) 648 return (EEXIST); 649 if (err != ENOENT) 650 return (err); 651 652 return (0); 653} 654 655/* ARGSUSED */ 656static void 657recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) 658{ 659 dsl_dataset_t *ohds = arg1; 660 struct recvbeginsyncarg *rbsa = arg2; 661 dsl_pool_t *dp = ohds->ds_dir->dd_pool; 662 dsl_dataset_t *cds; 663 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; 664 uint64_t dsobj; 665 666 /* create and open the temporary clone */ 667 dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, 668 ohds->ds_prev, flags, rbsa->cr, tx); 669 VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); 670 671 /* 672 * If we actually created a non-clone, we need to create the 673 * objset in our new dataset. 
674 */ 675 if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { 676 (void) dmu_objset_create_impl(dp->dp_spa, 677 cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); 678 } 679 680 rbsa->ds = cds; 681 682 spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, 683 dp->dp_spa, tx, "dataset = %lld", dsobj); 684} 685 686static boolean_t 687dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) 688{ 689 int featureflags; 690 691 featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 692 693 /* Verify pool version supports SA if SA_SPILL feature set */ 694 return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 695 (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); 696} 697 698/* 699 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 700 * succeeds; otherwise we will leak the holds on the datasets. 701 */ 702int 703dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, 704 boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) 705{ 706 int err = 0; 707 boolean_t byteswap; 708 struct recvbeginsyncarg rbsa = { 0 }; 709 uint64_t versioninfo; 710 int flags; 711 dsl_dataset_t *ds; 712 713 if (drrb->drr_magic == DMU_BACKUP_MAGIC) 714 byteswap = FALSE; 715 else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 716 byteswap = TRUE; 717 else 718 return (EINVAL); 719 720 rbsa.tofs = tofs; 721 rbsa.tosnap = tosnap; 722 rbsa.origin = origin ? 
origin->os_dsl_dataset : NULL; 723 rbsa.fromguid = drrb->drr_fromguid; 724 rbsa.type = drrb->drr_type; 725 rbsa.tag = FTAG; 726 rbsa.dsflags = 0; 727 rbsa.cr = CRED(); 728 versioninfo = drrb->drr_versioninfo; 729 flags = drrb->drr_flags; 730 731 if (byteswap) { 732 rbsa.type = BSWAP_32(rbsa.type); 733 rbsa.fromguid = BSWAP_64(rbsa.fromguid); 734 versioninfo = BSWAP_64(versioninfo); 735 flags = BSWAP_32(flags); 736 } 737 738 if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || 739 rbsa.type >= DMU_OST_NUMTYPES || 740 ((flags & DRR_FLAG_CLONE) && origin == NULL)) 741 return (EINVAL); 742 743 if (flags & DRR_FLAG_CI_DATA) 744 rbsa.dsflags = DS_FLAG_CI_DATASET; 745 746 bzero(drc, sizeof (dmu_recv_cookie_t)); 747 drc->drc_drrb = drrb; 748 drc->drc_tosnap = tosnap; 749 drc->drc_top_ds = top_ds; 750 drc->drc_force = force; 751 752 /* 753 * Process the begin in syncing context. 754 */ 755 756 /* open the dataset we are logically receiving into */ 757 err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); 758 if (err == 0) { 759 if (dmu_recv_verify_features(ds, drrb)) { 760 dsl_dataset_rele(ds, dmu_recv_tag); 761 return (ENOTSUP); 762 } 763 /* target fs already exists; recv into temp clone */ 764 765 /* Can't recv a clone into an existing fs */ 766 if (flags & DRR_FLAG_CLONE) { 767 dsl_dataset_rele(ds, dmu_recv_tag); 768 return (EINVAL); 769 } 770 771 /* must not have an incremental recv already in progress */ 772 if (!mutex_tryenter(&ds->ds_recvlock)) { 773 dsl_dataset_rele(ds, dmu_recv_tag); 774 return (EBUSY); 775 } 776 777 /* tmp clone name is: tofs/%tosnap" */ 778 (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), 779 "%%%s", tosnap); 780 rbsa.force = force; 781 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 782 recv_existing_check, recv_existing_sync, ds, &rbsa, 5); 783 if (err) { 784 mutex_exit(&ds->ds_recvlock); 785 dsl_dataset_rele(ds, dmu_recv_tag); 786 return (err); 787 } 788 drc->drc_logical_ds = ds; 789 drc->drc_real_ds = rbsa.ds; 790 } else if 
(err == ENOENT) { 791 /* target fs does not exist; must be a full backup or clone */ 792 char *cp; 793 794 /* 795 * If it's a non-clone incremental, we are missing the 796 * target fs, so fail the recv. 797 */ 798 if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) 799 return (ENOENT); 800 801 /* Open the parent of tofs */ 802 cp = strrchr(tofs, '/'); 803 *cp = '\0'; 804 err = dsl_dataset_hold(tofs, FTAG, &ds); 805 *cp = '/'; 806 if (err) 807 return (err); 808 809 if (dmu_recv_verify_features(ds, drrb)) { 810 dsl_dataset_rele(ds, FTAG); 811 return (ENOTSUP); 812 } 813 814 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 815 recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); 816 dsl_dataset_rele(ds, FTAG); 817 if (err) 818 return (err); 819 drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; 820 drc->drc_newfs = B_TRUE; 821 } 822 823 return (err); 824} 825 826struct restorearg { 827 int err; 828 int byteswap; 829 kthread_t *td; 830 struct file *fp; 831 char *buf; 832 uint64_t voff; 833 int bufsize; /* amount of memory allocated for buf */ 834 zio_cksum_t cksum; 835 avl_tree_t *guid_to_ds_map; 836}; 837 838typedef struct guid_map_entry { 839 uint64_t guid; 840 dsl_dataset_t *gme_ds; 841 avl_node_t avlnode; 842} guid_map_entry_t; 843 844static int 845guid_compare(const void *arg1, const void *arg2) 846{ 847 const guid_map_entry_t *gmep1 = arg1; 848 const guid_map_entry_t *gmep2 = arg2; 849 850 if (gmep1->guid < gmep2->guid) 851 return (-1); 852 else if (gmep1->guid > gmep2->guid) 853 return (1); 854 return (0); 855} 856 857static void 858free_guid_map_onexit(void *arg) 859{ 860 avl_tree_t *ca = arg; 861 void *cookie = NULL; 862 guid_map_entry_t *gmep; 863 864 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 865 dsl_dataset_rele(gmep->gme_ds, ca); 866 kmem_free(gmep, sizeof (guid_map_entry_t)); 867 } 868 avl_destroy(ca); 869 kmem_free(ca, sizeof (avl_tree_t)); 870} 871 872static int 873restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) 
874{ 875 struct uio auio; 876 struct iovec aiov; 877 int error; 878 879 aiov.iov_base = buf; 880 aiov.iov_len = len; 881 auio.uio_iov = &aiov; 882 auio.uio_iovcnt = 1; 883 auio.uio_resid = len; 884 auio.uio_segflg = UIO_SYSSPACE; 885 auio.uio_rw = UIO_READ; 886 auio.uio_offset = off; 887 auio.uio_td = ra->td; 888#ifdef _KERNEL 889 error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); 890#else 891 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 892 error = EOPNOTSUPP; 893#endif 894 *resid = auio.uio_resid; 895 return (error); 896} 897 898static void * 899restore_read(struct restorearg *ra, int len) 900{ 901 void *rv; 902 int done = 0; 903 904 /* some things will require 8-byte alignment, so everything must */ 905 ASSERT3U(len % 8, ==, 0); 906 907 while (done < len) { 908 ssize_t resid; 909 910 ra->err = restore_bytes(ra, (caddr_t)ra->buf + done, 911 len - done, ra->voff, &resid); 912 913 if (resid == len - done) 914 ra->err = EINVAL; 915 ra->voff += len - done - resid; 916 done = len - resid; 917 if (ra->err) 918 return (NULL); 919 } 920 921 ASSERT3U(done, ==, len); 922 rv = ra->buf; 923 if (ra->byteswap) 924 fletcher_4_incremental_byteswap(rv, len, &ra->cksum); 925 else 926 fletcher_4_incremental_native(rv, len, &ra->cksum); 927 return (rv); 928} 929 930static void 931backup_byteswap(dmu_replay_record_t *drr) 932{ 933#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 934#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 935 drr->drr_type = BSWAP_32(drr->drr_type); 936 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 937 switch (drr->drr_type) { 938 case DRR_BEGIN: 939 DO64(drr_begin.drr_magic); 940 DO64(drr_begin.drr_versioninfo); 941 DO64(drr_begin.drr_creation_time); 942 DO32(drr_begin.drr_type); 943 DO32(drr_begin.drr_flags); 944 DO64(drr_begin.drr_toguid); 945 DO64(drr_begin.drr_fromguid); 946 break; 947 case DRR_OBJECT: 948 DO64(drr_object.drr_object); 949 /* DO64(drr_object.drr_allocation_txg); */ 950 
DO32(drr_object.drr_type); 951 DO32(drr_object.drr_bonustype); 952 DO32(drr_object.drr_blksz); 953 DO32(drr_object.drr_bonuslen); 954 DO64(drr_object.drr_toguid); 955 break; 956 case DRR_FREEOBJECTS: 957 DO64(drr_freeobjects.drr_firstobj); 958 DO64(drr_freeobjects.drr_numobjs); 959 DO64(drr_freeobjects.drr_toguid); 960 break; 961 case DRR_WRITE: 962 DO64(drr_write.drr_object); 963 DO32(drr_write.drr_type); 964 DO64(drr_write.drr_offset); 965 DO64(drr_write.drr_length); 966 DO64(drr_write.drr_toguid); 967 DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); 968 DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); 969 DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); 970 DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); 971 DO64(drr_write.drr_key.ddk_prop); 972 break; 973 case DRR_WRITE_BYREF: 974 DO64(drr_write_byref.drr_object); 975 DO64(drr_write_byref.drr_offset); 976 DO64(drr_write_byref.drr_length); 977 DO64(drr_write_byref.drr_toguid); 978 DO64(drr_write_byref.drr_refguid); 979 DO64(drr_write_byref.drr_refobject); 980 DO64(drr_write_byref.drr_refoffset); 981 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); 982 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); 983 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); 984 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); 985 DO64(drr_write_byref.drr_key.ddk_prop); 986 break; 987 case DRR_FREE: 988 DO64(drr_free.drr_object); 989 DO64(drr_free.drr_offset); 990 DO64(drr_free.drr_length); 991 DO64(drr_free.drr_toguid); 992 break; 993 case DRR_SPILL: 994 DO64(drr_spill.drr_object); 995 DO64(drr_spill.drr_length); 996 DO64(drr_spill.drr_toguid); 997 break; 998 case DRR_END: 999 DO64(drr_end.drr_checksum.zc_word[0]); 1000 DO64(drr_end.drr_checksum.zc_word[1]); 1001 DO64(drr_end.drr_checksum.zc_word[2]); 1002 DO64(drr_end.drr_checksum.zc_word[3]); 1003 DO64(drr_end.drr_toguid); 1004 break; 1005 } 1006#undef DO64 1007#undef DO32 1008} 1009 1010static int 1011restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) 
1012{ 1013 int err; 1014 dmu_tx_t *tx; 1015 void *data = NULL; 1016 1017 if (drro->drr_type == DMU_OT_NONE || 1018 drro->drr_type >= DMU_OT_NUMTYPES || 1019 drro->drr_bonustype >= DMU_OT_NUMTYPES || 1020 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1021 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1022 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1023 drro->drr_blksz < SPA_MINBLOCKSIZE || 1024 drro->drr_blksz > SPA_MAXBLOCKSIZE || 1025 drro->drr_bonuslen > DN_MAX_BONUSLEN) { 1026 return (EINVAL); 1027 } 1028 1029 err = dmu_object_info(os, drro->drr_object, NULL); 1030 1031 if (err != 0 && err != ENOENT) 1032 return (EINVAL); 1033 1034 if (drro->drr_bonuslen) { 1035 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); 1036 if (ra->err) 1037 return (ra->err); 1038 } 1039 1040 if (err == ENOENT) { 1041 /* currently free, want to be allocated */ 1042 tx = dmu_tx_create(os); 1043 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1044 err = dmu_tx_assign(tx, TXG_WAIT); 1045 if (err) { 1046 dmu_tx_abort(tx); 1047 return (err); 1048 } 1049 err = dmu_object_claim(os, drro->drr_object, 1050 drro->drr_type, drro->drr_blksz, 1051 drro->drr_bonustype, drro->drr_bonuslen, tx); 1052 dmu_tx_commit(tx); 1053 } else { 1054 /* currently allocated, want to be allocated */ 1055 err = dmu_object_reclaim(os, drro->drr_object, 1056 drro->drr_type, drro->drr_blksz, 1057 drro->drr_bonustype, drro->drr_bonuslen); 1058 } 1059 if (err) { 1060 return (EINVAL); 1061 } 1062 1063 tx = dmu_tx_create(os); 1064 dmu_tx_hold_bonus(tx, drro->drr_object); 1065 err = dmu_tx_assign(tx, TXG_WAIT); 1066 if (err) { 1067 dmu_tx_abort(tx); 1068 return (err); 1069 } 1070 1071 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, 1072 tx); 1073 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); 1074 1075 if (data != NULL) { 1076 dmu_buf_t *db; 1077 1078 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); 1079 dmu_buf_will_dirty(db, tx); 1080 1081 
ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 1082 bcopy(data, db->db_data, drro->drr_bonuslen); 1083 if (ra->byteswap) { 1084 dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, 1085 drro->drr_bonuslen); 1086 } 1087 dmu_buf_rele(db, FTAG); 1088 } 1089 dmu_tx_commit(tx); 1090 return (0); 1091} 1092 1093/* ARGSUSED */ 1094static int 1095restore_freeobjects(struct restorearg *ra, objset_t *os, 1096 struct drr_freeobjects *drrfo) 1097{ 1098 uint64_t obj; 1099 1100 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1101 return (EINVAL); 1102 1103 for (obj = drrfo->drr_firstobj; 1104 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1105 (void) dmu_object_next(os, &obj, FALSE, 0)) { 1106 int err; 1107 1108 if (dmu_object_info(os, obj, NULL) != 0) 1109 continue; 1110 1111 err = dmu_free_object(os, obj); 1112 if (err) 1113 return (err); 1114 } 1115 return (0); 1116} 1117 1118static int 1119restore_write(struct restorearg *ra, objset_t *os, 1120 struct drr_write *drrw) 1121{ 1122 dmu_tx_t *tx; 1123 void *data; 1124 int err; 1125 1126 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 1127 drrw->drr_type >= DMU_OT_NUMTYPES) 1128 return (EINVAL); 1129 1130 data = restore_read(ra, drrw->drr_length); 1131 if (data == NULL) 1132 return (ra->err); 1133 1134 if (dmu_object_info(os, drrw->drr_object, NULL) != 0) 1135 return (EINVAL); 1136 1137 tx = dmu_tx_create(os); 1138 1139 dmu_tx_hold_write(tx, drrw->drr_object, 1140 drrw->drr_offset, drrw->drr_length); 1141 err = dmu_tx_assign(tx, TXG_WAIT); 1142 if (err) { 1143 dmu_tx_abort(tx); 1144 return (err); 1145 } 1146 if (ra->byteswap) 1147 dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); 1148 dmu_write(os, drrw->drr_object, 1149 drrw->drr_offset, drrw->drr_length, data, tx); 1150 dmu_tx_commit(tx); 1151 return (0); 1152} 1153 1154/* 1155 * Handle a DRR_WRITE_BYREF record. 
This record is used in dedup'ed 1156 * streams to refer to a copy of the data that is already on the 1157 * system because it came in earlier in the stream. This function 1158 * finds the earlier copy of the data, and uses that copy instead of 1159 * data from the stream to fulfill this write. 1160 */ 1161static int 1162restore_write_byref(struct restorearg *ra, objset_t *os, 1163 struct drr_write_byref *drrwbr) 1164{ 1165 dmu_tx_t *tx; 1166 int err; 1167 guid_map_entry_t gmesrch; 1168 guid_map_entry_t *gmep; 1169 avl_index_t where; 1170 objset_t *ref_os = NULL; 1171 dmu_buf_t *dbp; 1172 1173 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1174 return (EINVAL); 1175 1176 /* 1177 * If the GUID of the referenced dataset is different from the 1178 * GUID of the target dataset, find the referenced dataset. 1179 */ 1180 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1181 gmesrch.guid = drrwbr->drr_refguid; 1182 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1183 &where)) == NULL) { 1184 return (EINVAL); 1185 } 1186 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1187 return (EINVAL); 1188 } else { 1189 ref_os = os; 1190 } 1191 1192 if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1193 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH)) 1194 return (err); 1195 1196 tx = dmu_tx_create(os); 1197 1198 dmu_tx_hold_write(tx, drrwbr->drr_object, 1199 drrwbr->drr_offset, drrwbr->drr_length); 1200 err = dmu_tx_assign(tx, TXG_WAIT); 1201 if (err) { 1202 dmu_tx_abort(tx); 1203 return (err); 1204 } 1205 dmu_write(os, drrwbr->drr_object, 1206 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1207 dmu_buf_rele(dbp, FTAG); 1208 dmu_tx_commit(tx); 1209 return (0); 1210} 1211 1212static int 1213restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) 1214{ 1215 dmu_tx_t *tx; 1216 void *data; 1217 dmu_buf_t *db, *db_spill; 1218 int err; 1219 1220 if (drrs->drr_length < SPA_MINBLOCKSIZE || 1221 drrs->drr_length > SPA_MAXBLOCKSIZE) 
1222 return (EINVAL); 1223 1224 data = restore_read(ra, drrs->drr_length); 1225 if (data == NULL) 1226 return (ra->err); 1227 1228 if (dmu_object_info(os, drrs->drr_object, NULL) != 0) 1229 return (EINVAL); 1230 1231 VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); 1232 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1233 dmu_buf_rele(db, FTAG); 1234 return (err); 1235 } 1236 1237 tx = dmu_tx_create(os); 1238 1239 dmu_tx_hold_spill(tx, db->db_object); 1240 1241 err = dmu_tx_assign(tx, TXG_WAIT); 1242 if (err) { 1243 dmu_buf_rele(db, FTAG); 1244 dmu_buf_rele(db_spill, FTAG); 1245 dmu_tx_abort(tx); 1246 return (err); 1247 } 1248 dmu_buf_will_dirty(db_spill, tx); 1249 1250 if (db_spill->db_size < drrs->drr_length) 1251 VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1252 drrs->drr_length, tx)); 1253 bcopy(data, db_spill->db_data, drrs->drr_length); 1254 1255 dmu_buf_rele(db, FTAG); 1256 dmu_buf_rele(db_spill, FTAG); 1257 1258 dmu_tx_commit(tx); 1259 return (0); 1260} 1261 1262/* ARGSUSED */ 1263static int 1264restore_free(struct restorearg *ra, objset_t *os, 1265 struct drr_free *drrf) 1266{ 1267 int err; 1268 1269 if (drrf->drr_length != -1ULL && 1270 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 1271 return (EINVAL); 1272 1273 if (dmu_object_info(os, drrf->drr_object, NULL) != 0) 1274 return (EINVAL); 1275 1276 err = dmu_free_long_range(os, drrf->drr_object, 1277 drrf->drr_offset, drrf->drr_length); 1278 return (err); 1279} 1280 1281/* 1282 * NB: callers *must* call dmu_recv_end() if this succeeds. 
1283 */ 1284int 1285dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, 1286 int cleanup_fd, uint64_t *action_handlep) 1287{ 1288 struct restorearg ra = { 0 }; 1289 dmu_replay_record_t *drr; 1290 objset_t *os; 1291 zio_cksum_t pcksum; 1292 int featureflags; 1293 1294 if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1295 ra.byteswap = TRUE; 1296 1297 { 1298 /* compute checksum of drr_begin record */ 1299 dmu_replay_record_t *drr; 1300 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1301 1302 drr->drr_type = DRR_BEGIN; 1303 drr->drr_u.drr_begin = *drc->drc_drrb; 1304 if (ra.byteswap) { 1305 fletcher_4_incremental_byteswap(drr, 1306 sizeof (dmu_replay_record_t), &ra.cksum); 1307 } else { 1308 fletcher_4_incremental_native(drr, 1309 sizeof (dmu_replay_record_t), &ra.cksum); 1310 } 1311 kmem_free(drr, sizeof (dmu_replay_record_t)); 1312 } 1313 1314 if (ra.byteswap) { 1315 struct drr_begin *drrb = drc->drc_drrb; 1316 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1317 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1318 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1319 drrb->drr_type = BSWAP_32(drrb->drr_type); 1320 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1321 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1322 } 1323 1324 ra.td = curthread; 1325 ra.fp = fp; 1326 ra.voff = *voffp; 1327 ra.bufsize = 1<<20; 1328 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); 1329 1330 /* these were verified in dmu_recv_begin */ 1331 ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == 1332 DMU_SUBSTREAM); 1333 ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); 1334 1335 /* 1336 * Open the objset we are modifying. 
1337 */ 1338 VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); 1339 1340 ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); 1341 1342 featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); 1343 1344 /* if this stream is dedup'ed, set up the avl tree for guid mapping */ 1345 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { 1346 minor_t minor; 1347 1348 if (cleanup_fd == -1) { 1349 ra.err = EBADF; 1350 goto out; 1351 } 1352 ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); 1353 if (ra.err) { 1354 cleanup_fd = -1; 1355 goto out; 1356 } 1357 1358 if (*action_handlep == 0) { 1359 ra.guid_to_ds_map = 1360 kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 1361 avl_create(ra.guid_to_ds_map, guid_compare, 1362 sizeof (guid_map_entry_t), 1363 offsetof(guid_map_entry_t, avlnode)); 1364 ra.err = zfs_onexit_add_cb(minor, 1365 free_guid_map_onexit, ra.guid_to_ds_map, 1366 action_handlep); 1367 if (ra.err) 1368 goto out; 1369 } else { 1370 ra.err = zfs_onexit_cb_data(minor, *action_handlep, 1371 (void **)&ra.guid_to_ds_map); 1372 if (ra.err) 1373 goto out; 1374 } 1375 1376 drc->drc_guid_to_ds_map = ra.guid_to_ds_map; 1377 } 1378 1379 /* 1380 * Read records and process them. 1381 */ 1382 pcksum = ra.cksum; 1383 while (ra.err == 0 && 1384 NULL != (drr = restore_read(&ra, sizeof (*drr)))) { 1385 if (issig(JUSTLOOKING) && issig(FORREAL)) { 1386 ra.err = EINTR; 1387 goto out; 1388 } 1389 1390 if (ra.byteswap) 1391 backup_byteswap(drr); 1392 1393 switch (drr->drr_type) { 1394 case DRR_OBJECT: 1395 { 1396 /* 1397 * We need to make a copy of the record header, 1398 * because restore_{object,write} may need to 1399 * restore_read(), which will invalidate drr. 
1400 */ 1401 struct drr_object drro = drr->drr_u.drr_object; 1402 ra.err = restore_object(&ra, os, &drro); 1403 break; 1404 } 1405 case DRR_FREEOBJECTS: 1406 { 1407 struct drr_freeobjects drrfo = 1408 drr->drr_u.drr_freeobjects; 1409 ra.err = restore_freeobjects(&ra, os, &drrfo); 1410 break; 1411 } 1412 case DRR_WRITE: 1413 { 1414 struct drr_write drrw = drr->drr_u.drr_write; 1415 ra.err = restore_write(&ra, os, &drrw); 1416 break; 1417 } 1418 case DRR_WRITE_BYREF: 1419 { 1420 struct drr_write_byref drrwbr = 1421 drr->drr_u.drr_write_byref; 1422 ra.err = restore_write_byref(&ra, os, &drrwbr); 1423 break; 1424 } 1425 case DRR_FREE: 1426 { 1427 struct drr_free drrf = drr->drr_u.drr_free; 1428 ra.err = restore_free(&ra, os, &drrf); 1429 break; 1430 } 1431 case DRR_END: 1432 { 1433 struct drr_end drre = drr->drr_u.drr_end; 1434 /* 1435 * We compare against the *previous* checksum 1436 * value, because the stored checksum is of 1437 * everything before the DRR_END record. 1438 */ 1439 if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) 1440 ra.err = ECKSUM; 1441 goto out; 1442 } 1443 case DRR_SPILL: 1444 { 1445 struct drr_spill drrs = drr->drr_u.drr_spill; 1446 ra.err = restore_spill(&ra, os, &drrs); 1447 break; 1448 } 1449 default: 1450 ra.err = EINVAL; 1451 goto out; 1452 } 1453 pcksum = ra.cksum; 1454 } 1455 ASSERT(ra.err != 0); 1456 1457out: 1458 if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) 1459 zfs_onexit_fd_rele(cleanup_fd); 1460 1461 if (ra.err != 0) { 1462 /* 1463 * destroy what we created, so we don't leave it in the 1464 * inconsistent restoring state. 
1465 */ 1466 txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); 1467 1468 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, 1469 B_FALSE); 1470 if (drc->drc_real_ds != drc->drc_logical_ds) { 1471 mutex_exit(&drc->drc_logical_ds->ds_recvlock); 1472 dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); 1473 } 1474 } 1475 1476 kmem_free(ra.buf, ra.bufsize); 1477 *voffp = ra.voff; 1478 return (ra.err); 1479} 1480 1481struct recvendsyncarg { 1482 char *tosnap; 1483 uint64_t creation_time; 1484 uint64_t toguid; 1485}; 1486 1487static int 1488recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) 1489{ 1490 dsl_dataset_t *ds = arg1; 1491 struct recvendsyncarg *resa = arg2; 1492 1493 return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); 1494} 1495 1496static void 1497recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) 1498{ 1499 dsl_dataset_t *ds = arg1; 1500 struct recvendsyncarg *resa = arg2; 1501 1502 dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); 1503 1504 /* set snapshot's creation time and guid */ 1505 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 1506 ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; 1507 ds->ds_prev->ds_phys->ds_guid = resa->toguid; 1508 ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 1509 1510 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1511 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 1512} 1513 1514static int 1515add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) 1516{ 1517 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1518 uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; 1519 dsl_dataset_t *snapds; 1520 guid_map_entry_t *gmep; 1521 int err; 1522 1523 ASSERT(guid_map != NULL); 1524 1525 rw_enter(&dp->dp_config_rwlock, RW_READER); 1526 err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); 1527 if (err == 0) { 1528 gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); 1529 gmep->guid = snapds->ds_phys->ds_guid; 1530 gmep->gme_ds = snapds; 1531 avl_add(guid_map, gmep); 1532 } 1533 1534 
rw_exit(&dp->dp_config_rwlock); 1535 return (err); 1536} 1537 1538static int 1539dmu_recv_existing_end(dmu_recv_cookie_t *drc) 1540{ 1541 struct recvendsyncarg resa; 1542 dsl_dataset_t *ds = drc->drc_logical_ds; 1543 int err; 1544 1545 /* 1546 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() 1547 * expects it to have a ds_user_ptr (and zil), but clone_swap() 1548 * can close it. 1549 */ 1550 txg_wait_synced(ds->ds_dir->dd_pool, 0); 1551 1552 if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { 1553 err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, 1554 drc->drc_force); 1555 if (err) 1556 goto out; 1557 } else { 1558 mutex_exit(&ds->ds_recvlock); 1559 dsl_dataset_rele(ds, dmu_recv_tag); 1560 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, 1561 B_FALSE); 1562 return (EBUSY); 1563 } 1564 1565 resa.creation_time = drc->drc_drrb->drr_creation_time; 1566 resa.toguid = drc->drc_drrb->drr_toguid; 1567 resa.tosnap = drc->drc_tosnap; 1568 1569 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1570 recv_end_check, recv_end_sync, ds, &resa, 3); 1571 if (err) { 1572 /* swap back */ 1573 (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); 1574 } 1575 1576out: 1577 mutex_exit(&ds->ds_recvlock); 1578 if (err == 0 && drc->drc_guid_to_ds_map != NULL) 1579 (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); 1580 dsl_dataset_disown(ds, dmu_recv_tag); 1581 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); 1582 return (err); 1583} 1584 1585static int 1586dmu_recv_new_end(dmu_recv_cookie_t *drc) 1587{ 1588 struct recvendsyncarg resa; 1589 dsl_dataset_t *ds = drc->drc_logical_ds; 1590 int err; 1591 1592 /* 1593 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() 1594 * expects it to have a ds_user_ptr (and zil), but clone_swap() 1595 * can close it. 
1596 */ 1597 txg_wait_synced(ds->ds_dir->dd_pool, 0); 1598 1599 resa.creation_time = drc->drc_drrb->drr_creation_time; 1600 resa.toguid = drc->drc_drrb->drr_toguid; 1601 resa.tosnap = drc->drc_tosnap; 1602 1603 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1604 recv_end_check, recv_end_sync, ds, &resa, 3); 1605 if (err) { 1606 /* clean up the fs we just recv'd into */ 1607 (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); 1608 } else { 1609 if (drc->drc_guid_to_ds_map != NULL) 1610 (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); 1611 /* release the hold from dmu_recv_begin */ 1612 dsl_dataset_disown(ds, dmu_recv_tag); 1613 } 1614 return (err); 1615} 1616 1617int 1618dmu_recv_end(dmu_recv_cookie_t *drc) 1619{ 1620 if (drc->drc_logical_ds != drc->drc_real_ds) 1621 return (dmu_recv_existing_end(drc)); 1622 else 1623 return (dmu_recv_new_end(drc)); 1624} 1625