/* dmu.c — FreeBSD ZFS port, revision 214378 */
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22209962Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 
24168404Spjd */ 25168404Spjd 26168404Spjd#include <sys/dmu.h> 27168404Spjd#include <sys/dmu_impl.h> 28168404Spjd#include <sys/dmu_tx.h> 29168404Spjd#include <sys/dbuf.h> 30168404Spjd#include <sys/dnode.h> 31168404Spjd#include <sys/zfs_context.h> 32168404Spjd#include <sys/dmu_objset.h> 33168404Spjd#include <sys/dmu_traverse.h> 34168404Spjd#include <sys/dsl_dataset.h> 35168404Spjd#include <sys/dsl_dir.h> 36168404Spjd#include <sys/dsl_pool.h> 37168404Spjd#include <sys/dsl_synctask.h> 38168404Spjd#include <sys/dsl_prop.h> 39168404Spjd#include <sys/dmu_zfetch.h> 40168404Spjd#include <sys/zfs_ioctl.h> 41168404Spjd#include <sys/zap.h> 42168404Spjd#include <sys/zio_checksum.h> 43185029Spjd#include <sys/zfs_znode.h> 44168404Spjd 45168404Spjdconst dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 46168404Spjd { byteswap_uint8_array, TRUE, "unallocated" }, 47168404Spjd { zap_byteswap, TRUE, "object directory" }, 48168404Spjd { byteswap_uint64_array, TRUE, "object array" }, 49168404Spjd { byteswap_uint8_array, TRUE, "packed nvlist" }, 50168404Spjd { byteswap_uint64_array, TRUE, "packed nvlist size" }, 51168404Spjd { byteswap_uint64_array, TRUE, "bplist" }, 52168404Spjd { byteswap_uint64_array, TRUE, "bplist header" }, 53168404Spjd { byteswap_uint64_array, TRUE, "SPA space map header" }, 54168404Spjd { byteswap_uint64_array, TRUE, "SPA space map" }, 55168404Spjd { byteswap_uint64_array, TRUE, "ZIL intent log" }, 56168404Spjd { dnode_buf_byteswap, TRUE, "DMU dnode" }, 57168404Spjd { dmu_objset_byteswap, TRUE, "DMU objset" }, 58168404Spjd { byteswap_uint64_array, TRUE, "DSL directory" }, 59168404Spjd { zap_byteswap, TRUE, "DSL directory child map"}, 60168404Spjd { zap_byteswap, TRUE, "DSL dataset snap map" }, 61168404Spjd { zap_byteswap, TRUE, "DSL props" }, 62168404Spjd { byteswap_uint64_array, TRUE, "DSL dataset" }, 63168404Spjd { zfs_znode_byteswap, TRUE, "ZFS znode" }, 64185029Spjd { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, 65168404Spjd { byteswap_uint8_array, FALSE, "ZFS 
plain file" }, 66168404Spjd { zap_byteswap, TRUE, "ZFS directory" }, 67168404Spjd { zap_byteswap, TRUE, "ZFS master node" }, 68168404Spjd { zap_byteswap, TRUE, "ZFS delete queue" }, 69168404Spjd { byteswap_uint8_array, FALSE, "zvol object" }, 70168404Spjd { zap_byteswap, TRUE, "zvol prop" }, 71168404Spjd { byteswap_uint8_array, FALSE, "other uint8[]" }, 72168404Spjd { byteswap_uint64_array, FALSE, "other uint64[]" }, 73168404Spjd { zap_byteswap, TRUE, "other ZAP" }, 74168404Spjd { zap_byteswap, TRUE, "persistent error log" }, 75168404Spjd { byteswap_uint8_array, TRUE, "SPA history" }, 76168404Spjd { byteswap_uint64_array, TRUE, "SPA history offsets" }, 77185029Spjd { zap_byteswap, TRUE, "Pool properties" }, 78185029Spjd { zap_byteswap, TRUE, "DSL permissions" }, 79185029Spjd { zfs_acl_byteswap, TRUE, "ZFS ACL" }, 80185029Spjd { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, 81185029Spjd { byteswap_uint8_array, TRUE, "FUID table" }, 82185029Spjd { byteswap_uint64_array, TRUE, "FUID table size" }, 83185029Spjd { zap_byteswap, TRUE, "DSL dataset next clones"}, 84185029Spjd { zap_byteswap, TRUE, "scrub work queue" }, 85209962Smm { zap_byteswap, TRUE, "ZFS user/group used" }, 86209962Smm { zap_byteswap, TRUE, "ZFS user/group quota" }, 87168404Spjd}; 88168404Spjd 89168404Spjdint 90168404Spjddmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, 91168404Spjd void *tag, dmu_buf_t **dbp) 92168404Spjd{ 93168404Spjd dnode_t *dn; 94168404Spjd uint64_t blkid; 95168404Spjd dmu_buf_impl_t *db; 96168404Spjd int err; 97168404Spjd 98168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 99168404Spjd if (err) 100168404Spjd return (err); 101168404Spjd blkid = dbuf_whichblock(dn, offset); 102168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 103168404Spjd db = dbuf_hold(dn, blkid, tag); 104168404Spjd rw_exit(&dn->dn_struct_rwlock); 105168404Spjd if (db == NULL) { 106168404Spjd err = EIO; 107168404Spjd } else { 108168404Spjd err = dbuf_read(db, NULL, DB_RF_CANFAIL); 
109168404Spjd if (err) { 110168404Spjd dbuf_rele(db, tag); 111168404Spjd db = NULL; 112168404Spjd } 113168404Spjd } 114168404Spjd 115168404Spjd dnode_rele(dn, FTAG); 116168404Spjd *dbp = &db->db; 117168404Spjd return (err); 118168404Spjd} 119168404Spjd 120168404Spjdint 121168404Spjddmu_bonus_max(void) 122168404Spjd{ 123168404Spjd return (DN_MAX_BONUSLEN); 124168404Spjd} 125168404Spjd 126185029Spjdint 127185029Spjddmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) 128185029Spjd{ 129185029Spjd dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 130185029Spjd 131185029Spjd if (dn->dn_bonus != (dmu_buf_impl_t *)db) 132185029Spjd return (EINVAL); 133185029Spjd if (newsize < 0 || newsize > db->db_size) 134185029Spjd return (EINVAL); 135185029Spjd dnode_setbonuslen(dn, newsize, tx); 136185029Spjd return (0); 137185029Spjd} 138185029Spjd 139168404Spjd/* 140168404Spjd * returns ENOENT, EIO, or 0. 141168404Spjd */ 142168404Spjdint 143168404Spjddmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) 144168404Spjd{ 145168404Spjd dnode_t *dn; 146168404Spjd dmu_buf_impl_t *db; 147185029Spjd int error; 148168404Spjd 149185029Spjd error = dnode_hold(os->os, object, FTAG, &dn); 150185029Spjd if (error) 151185029Spjd return (error); 152168404Spjd 153168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 154168404Spjd if (dn->dn_bonus == NULL) { 155168404Spjd rw_exit(&dn->dn_struct_rwlock); 156168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 157168404Spjd if (dn->dn_bonus == NULL) 158185029Spjd dbuf_create_bonus(dn); 159168404Spjd } 160168404Spjd db = dn->dn_bonus; 161168404Spjd rw_exit(&dn->dn_struct_rwlock); 162185029Spjd 163185029Spjd /* as long as the bonus buf is held, the dnode will be held */ 164185029Spjd if (refcount_add(&db->db_holds, tag) == 1) 165185029Spjd VERIFY(dnode_add_ref(dn, db)); 166185029Spjd 167168404Spjd dnode_rele(dn, FTAG); 168168404Spjd 169168404Spjd VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); 170168404Spjd 171168404Spjd 
*dbp = &db->db; 172168404Spjd return (0); 173168404Spjd} 174168404Spjd 175168404Spjd/* 176168404Spjd * Note: longer-term, we should modify all of the dmu_buf_*() interfaces 177168404Spjd * to take a held dnode rather than <os, object> -- the lookup is wasteful, 178168404Spjd * and can induce severe lock contention when writing to several files 179168404Spjd * whose dnodes are in the same block. 180168404Spjd */ 181168404Spjdstatic int 182209962Smmdmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, 183209962Smm int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) 184168404Spjd{ 185185029Spjd dsl_pool_t *dp = NULL; 186168404Spjd dmu_buf_t **dbp; 187168404Spjd uint64_t blkid, nblks, i; 188209962Smm uint32_t dbuf_flags; 189168404Spjd int err; 190168404Spjd zio_t *zio; 191185029Spjd hrtime_t start; 192168404Spjd 193168404Spjd ASSERT(length <= DMU_MAX_ACCESS); 194168404Spjd 195214378Smm dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; 196209962Smm if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) 197209962Smm dbuf_flags |= DB_RF_NOPREFETCH; 198168404Spjd 199168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 200168404Spjd if (dn->dn_datablkshift) { 201168404Spjd int blkshift = dn->dn_datablkshift; 202168404Spjd nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - 203168404Spjd P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; 204168404Spjd } else { 205168404Spjd if (offset + length > dn->dn_datablksz) { 206168404Spjd zfs_panic_recover("zfs: accessing past end of object " 207168404Spjd "%llx/%llx (size=%u access=%llu+%llu)", 208168404Spjd (longlong_t)dn->dn_objset-> 209168404Spjd os_dsl_dataset->ds_object, 210168404Spjd (longlong_t)dn->dn_object, dn->dn_datablksz, 211168404Spjd (longlong_t)offset, (longlong_t)length); 212214378Smm rw_exit(&dn->dn_struct_rwlock); 213168404Spjd return (EIO); 214168404Spjd } 215168404Spjd nblks = 1; 216168404Spjd } 217168404Spjd dbp = kmem_zalloc(sizeof (dmu_buf_t *) * 
nblks, KM_SLEEP); 218168404Spjd 219185029Spjd if (dn->dn_objset->os_dsl_dataset) 220185029Spjd dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; 221185029Spjd if (dp && dsl_pool_sync_context(dp)) 222185029Spjd start = gethrtime(); 223185029Spjd zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); 224168404Spjd blkid = dbuf_whichblock(dn, offset); 225168404Spjd for (i = 0; i < nblks; i++) { 226168404Spjd dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); 227168404Spjd if (db == NULL) { 228168404Spjd rw_exit(&dn->dn_struct_rwlock); 229168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 230168404Spjd zio_nowait(zio); 231168404Spjd return (EIO); 232168404Spjd } 233168404Spjd /* initiate async i/o */ 234168404Spjd if (read) { 235209962Smm (void) dbuf_read(db, zio, dbuf_flags); 236168404Spjd } 237168404Spjd dbp[i] = &db->db; 238168404Spjd } 239168404Spjd rw_exit(&dn->dn_struct_rwlock); 240168404Spjd 241168404Spjd /* wait for async i/o */ 242168404Spjd err = zio_wait(zio); 243185029Spjd /* track read overhead when we are in sync context */ 244185029Spjd if (dp && dsl_pool_sync_context(dp)) 245185029Spjd dp->dp_read_overhead += gethrtime() - start; 246168404Spjd if (err) { 247168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 248168404Spjd return (err); 249168404Spjd } 250168404Spjd 251168404Spjd /* wait for other io to complete */ 252168404Spjd if (read) { 253168404Spjd for (i = 0; i < nblks; i++) { 254168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; 255168404Spjd mutex_enter(&db->db_mtx); 256168404Spjd while (db->db_state == DB_READ || 257168404Spjd db->db_state == DB_FILL) 258168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 259168404Spjd if (db->db_state == DB_UNCACHED) 260168404Spjd err = EIO; 261168404Spjd mutex_exit(&db->db_mtx); 262168404Spjd if (err) { 263168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 264168404Spjd return (err); 265168404Spjd } 266168404Spjd } 267168404Spjd } 268168404Spjd 269168404Spjd *numbufsp = nblks; 270168404Spjd *dbpp = 
dbp; 271168404Spjd return (0); 272168404Spjd} 273168404Spjd 274168404Spjdstatic int 275168404Spjddmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, 276168404Spjd uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 277168404Spjd{ 278168404Spjd dnode_t *dn; 279168404Spjd int err; 280168404Spjd 281168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 282168404Spjd if (err) 283168404Spjd return (err); 284168404Spjd 285168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 286209962Smm numbufsp, dbpp, DMU_READ_PREFETCH); 287168404Spjd 288168404Spjd dnode_rele(dn, FTAG); 289168404Spjd 290168404Spjd return (err); 291168404Spjd} 292168404Spjd 293168404Spjdint 294168404Spjddmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, 295168404Spjd uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 296168404Spjd{ 297168404Spjd dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 298168404Spjd int err; 299168404Spjd 300168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 301209962Smm numbufsp, dbpp, DMU_READ_PREFETCH); 302168404Spjd 303168404Spjd return (err); 304168404Spjd} 305168404Spjd 306168404Spjdvoid 307168404Spjddmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) 308168404Spjd{ 309168404Spjd int i; 310168404Spjd dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; 311168404Spjd 312168404Spjd if (numbufs == 0) 313168404Spjd return; 314168404Spjd 315168404Spjd for (i = 0; i < numbufs; i++) { 316168404Spjd if (dbp[i]) 317168404Spjd dbuf_rele(dbp[i], tag); 318168404Spjd } 319168404Spjd 320168404Spjd kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); 321168404Spjd} 322168404Spjd 323168404Spjdvoid 324168404Spjddmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) 325168404Spjd{ 326168404Spjd dnode_t *dn; 327168404Spjd uint64_t blkid; 328168404Spjd int nblks, i, err; 329168404Spjd 330194043Skmacy if (zfs_prefetch_disable) 331168404Spjd return; 
332168404Spjd 333168404Spjd if (len == 0) { /* they're interested in the bonus buffer */ 334168404Spjd dn = os->os->os_meta_dnode; 335168404Spjd 336168404Spjd if (object == 0 || object >= DN_MAX_OBJECT) 337168404Spjd return; 338168404Spjd 339168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 340168404Spjd blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); 341168404Spjd dbuf_prefetch(dn, blkid); 342168404Spjd rw_exit(&dn->dn_struct_rwlock); 343168404Spjd return; 344168404Spjd } 345168404Spjd 346168404Spjd /* 347168404Spjd * XXX - Note, if the dnode for the requested object is not 348168404Spjd * already cached, we will do a *synchronous* read in the 349168404Spjd * dnode_hold() call. The same is true for any indirects. 350168404Spjd */ 351168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 352168404Spjd if (err != 0) 353168404Spjd return; 354168404Spjd 355168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 356168404Spjd if (dn->dn_datablkshift) { 357168404Spjd int blkshift = dn->dn_datablkshift; 358168404Spjd nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - 359168404Spjd P2ALIGN(offset, 1<<blkshift)) >> blkshift; 360168404Spjd } else { 361168404Spjd nblks = (offset < dn->dn_datablksz); 362168404Spjd } 363168404Spjd 364168404Spjd if (nblks != 0) { 365168404Spjd blkid = dbuf_whichblock(dn, offset); 366168404Spjd for (i = 0; i < nblks; i++) 367168404Spjd dbuf_prefetch(dn, blkid+i); 368168404Spjd } 369168404Spjd 370168404Spjd rw_exit(&dn->dn_struct_rwlock); 371168404Spjd 372168404Spjd dnode_rele(dn, FTAG); 373168404Spjd} 374168404Spjd 375208775Smm/* 376208775Smm * Get the next "chunk" of file data to free. We traverse the file from 377208775Smm * the end so that the file gets shorter over time (if we crashes in the 378208775Smm * middle, this will leave us in a better state). We find allocated file 379208775Smm * data by simply searching the allocated level 1 indirects. 
380208775Smm */ 381185029Spjdstatic int 382208775Smmget_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) 383185029Spjd{ 384208775Smm uint64_t len = *start - limit; 385208775Smm uint64_t blkcnt = 0; 386208775Smm uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); 387208775Smm uint64_t iblkrange = 388185029Spjd dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); 389185029Spjd 390208775Smm ASSERT(limit <= *start); 391185029Spjd 392208775Smm if (len <= iblkrange * maxblks) { 393208775Smm *start = limit; 394185029Spjd return (0); 395185029Spjd } 396208775Smm ASSERT(ISP2(iblkrange)); 397185029Spjd 398208775Smm while (*start > limit && blkcnt < maxblks) { 399185029Spjd int err; 400185029Spjd 401208775Smm /* find next allocated L1 indirect */ 402185029Spjd err = dnode_next_offset(dn, 403208775Smm DNODE_FIND_BACKWARDS, start, 2, 1, 0); 404185029Spjd 405208775Smm /* if there are no more, then we are done */ 406208775Smm if (err == ESRCH) { 407208775Smm *start = limit; 408185029Spjd return (0); 409208775Smm } else if (err) { 410208775Smm return (err); 411185029Spjd } 412208775Smm blkcnt += 1; 413185029Spjd 414208775Smm /* reset offset to end of "next" block back */ 415208775Smm *start = P2ALIGN(*start, iblkrange); 416208775Smm if (*start <= limit) 417208775Smm *start = limit; 418208775Smm else 419208775Smm *start -= 1; 420185029Spjd } 421185029Spjd return (0); 422185029Spjd} 423185029Spjd 424185029Spjdstatic int 425185029Spjddmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, 426185029Spjd uint64_t length, boolean_t free_dnode) 427185029Spjd{ 428185029Spjd dmu_tx_t *tx; 429185029Spjd uint64_t object_size, start, end, len; 430185029Spjd boolean_t trunc = (length == DMU_OBJECT_END); 431185029Spjd int align, err; 432185029Spjd 433185029Spjd align = 1 << dn->dn_datablkshift; 434185029Spjd ASSERT(align > 0); 435185029Spjd object_size = align == 1 ? 
dn->dn_datablksz : 436185029Spjd (dn->dn_maxblkid + 1) << dn->dn_datablkshift; 437185029Spjd 438209962Smm end = offset + length; 439209962Smm if (trunc || end > object_size) 440185029Spjd end = object_size; 441185029Spjd if (end <= offset) 442185029Spjd return (0); 443185029Spjd length = end - offset; 444185029Spjd 445185029Spjd while (length) { 446185029Spjd start = end; 447209962Smm /* assert(offset <= start) */ 448185029Spjd err = get_next_chunk(dn, &start, offset); 449185029Spjd if (err) 450185029Spjd return (err); 451185029Spjd len = trunc ? DMU_OBJECT_END : end - start; 452185029Spjd 453185029Spjd tx = dmu_tx_create(os); 454185029Spjd dmu_tx_hold_free(tx, dn->dn_object, start, len); 455185029Spjd err = dmu_tx_assign(tx, TXG_WAIT); 456185029Spjd if (err) { 457185029Spjd dmu_tx_abort(tx); 458185029Spjd return (err); 459185029Spjd } 460185029Spjd 461185029Spjd dnode_free_range(dn, start, trunc ? -1 : len, tx); 462185029Spjd 463185029Spjd if (start == 0 && free_dnode) { 464185029Spjd ASSERT(trunc); 465185029Spjd dnode_free(dn, tx); 466185029Spjd } 467185029Spjd 468185029Spjd length -= end - start; 469185029Spjd 470185029Spjd dmu_tx_commit(tx); 471185029Spjd end = start; 472185029Spjd } 473185029Spjd return (0); 474185029Spjd} 475185029Spjd 476168404Spjdint 477185029Spjddmu_free_long_range(objset_t *os, uint64_t object, 478185029Spjd uint64_t offset, uint64_t length) 479185029Spjd{ 480185029Spjd dnode_t *dn; 481185029Spjd int err; 482185029Spjd 483185029Spjd err = dnode_hold(os->os, object, FTAG, &dn); 484185029Spjd if (err != 0) 485185029Spjd return (err); 486185029Spjd err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); 487185029Spjd dnode_rele(dn, FTAG); 488185029Spjd return (err); 489185029Spjd} 490185029Spjd 491185029Spjdint 492185029Spjddmu_free_object(objset_t *os, uint64_t object) 493185029Spjd{ 494185029Spjd dnode_t *dn; 495185029Spjd dmu_tx_t *tx; 496185029Spjd int err; 497185029Spjd 498185029Spjd err = dnode_hold_impl(os->os, object, 
DNODE_MUST_BE_ALLOCATED, 499185029Spjd FTAG, &dn); 500185029Spjd if (err != 0) 501185029Spjd return (err); 502185029Spjd if (dn->dn_nlevels == 1) { 503185029Spjd tx = dmu_tx_create(os); 504185029Spjd dmu_tx_hold_bonus(tx, object); 505185029Spjd dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); 506185029Spjd err = dmu_tx_assign(tx, TXG_WAIT); 507185029Spjd if (err == 0) { 508185029Spjd dnode_free_range(dn, 0, DMU_OBJECT_END, tx); 509185029Spjd dnode_free(dn, tx); 510185029Spjd dmu_tx_commit(tx); 511185029Spjd } else { 512185029Spjd dmu_tx_abort(tx); 513185029Spjd } 514185029Spjd } else { 515185029Spjd err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); 516185029Spjd } 517185029Spjd dnode_rele(dn, FTAG); 518185029Spjd return (err); 519185029Spjd} 520185029Spjd 521185029Spjdint 522168404Spjddmu_free_range(objset_t *os, uint64_t object, uint64_t offset, 523168404Spjd uint64_t size, dmu_tx_t *tx) 524168404Spjd{ 525168404Spjd dnode_t *dn; 526168404Spjd int err = dnode_hold(os->os, object, FTAG, &dn); 527168404Spjd if (err) 528168404Spjd return (err); 529168404Spjd ASSERT(offset < UINT64_MAX); 530168404Spjd ASSERT(size == -1ULL || size <= UINT64_MAX - offset); 531168404Spjd dnode_free_range(dn, offset, size, tx); 532168404Spjd dnode_rele(dn, FTAG); 533168404Spjd return (0); 534168404Spjd} 535168404Spjd 536168404Spjdint 537168404Spjddmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 538209962Smm void *buf, uint32_t flags) 539168404Spjd{ 540168404Spjd dnode_t *dn; 541168404Spjd dmu_buf_t **dbp; 542214378Smm int numbufs, err; 543168404Spjd 544168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 545168404Spjd if (err) 546168404Spjd return (err); 547168404Spjd 548168404Spjd /* 549168404Spjd * Deal with odd block sizes, where there can't be data past the first 550168404Spjd * block. If we ever do the tail block optimization, we will need to 551168404Spjd * handle that here as well. 
552168404Spjd */ 553214378Smm if (dn->dn_maxblkid == 0) { 554168404Spjd int newsz = offset > dn->dn_datablksz ? 0 : 555168404Spjd MIN(size, dn->dn_datablksz - offset); 556168404Spjd bzero((char *)buf + newsz, size - newsz); 557168404Spjd size = newsz; 558168404Spjd } 559168404Spjd 560168404Spjd while (size > 0) { 561168404Spjd uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); 562214378Smm int i; 563168404Spjd 564168404Spjd /* 565168404Spjd * NB: we could do this block-at-a-time, but it's nice 566168404Spjd * to be reading in parallel. 567168404Spjd */ 568168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, 569209962Smm TRUE, FTAG, &numbufs, &dbp, flags); 570168404Spjd if (err) 571185029Spjd break; 572168404Spjd 573168404Spjd for (i = 0; i < numbufs; i++) { 574168404Spjd int tocpy; 575168404Spjd int bufoff; 576168404Spjd dmu_buf_t *db = dbp[i]; 577168404Spjd 578168404Spjd ASSERT(size > 0); 579168404Spjd 580168404Spjd bufoff = offset - db->db_offset; 581168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 582168404Spjd 583168404Spjd bcopy((char *)db->db_data + bufoff, buf, tocpy); 584168404Spjd 585168404Spjd offset += tocpy; 586168404Spjd size -= tocpy; 587168404Spjd buf = (char *)buf + tocpy; 588168404Spjd } 589168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 590168404Spjd } 591168404Spjd dnode_rele(dn, FTAG); 592185029Spjd return (err); 593168404Spjd} 594168404Spjd 595168404Spjdvoid 596168404Spjddmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 597168404Spjd const void *buf, dmu_tx_t *tx) 598168404Spjd{ 599168404Spjd dmu_buf_t **dbp; 600168404Spjd int numbufs, i; 601168404Spjd 602168404Spjd if (size == 0) 603168404Spjd return; 604168404Spjd 605168404Spjd VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 606168404Spjd FALSE, FTAG, &numbufs, &dbp)); 607168404Spjd 608168404Spjd for (i = 0; i < numbufs; i++) { 609168404Spjd int tocpy; 610168404Spjd int bufoff; 611168404Spjd dmu_buf_t *db = dbp[i]; 612168404Spjd 613168404Spjd 
ASSERT(size > 0); 614168404Spjd 615168404Spjd bufoff = offset - db->db_offset; 616168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 617168404Spjd 618168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 619168404Spjd 620168404Spjd if (tocpy == db->db_size) 621168404Spjd dmu_buf_will_fill(db, tx); 622168404Spjd else 623168404Spjd dmu_buf_will_dirty(db, tx); 624168404Spjd 625168404Spjd bcopy(buf, (char *)db->db_data + bufoff, tocpy); 626168404Spjd 627168404Spjd if (tocpy == db->db_size) 628168404Spjd dmu_buf_fill_done(db, tx); 629168404Spjd 630168404Spjd offset += tocpy; 631168404Spjd size -= tocpy; 632168404Spjd buf = (char *)buf + tocpy; 633168404Spjd } 634168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 635168404Spjd} 636168404Spjd 637168404Spjd#ifdef _KERNEL 638168404Spjdint 639168404Spjddmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) 640168404Spjd{ 641168404Spjd dmu_buf_t **dbp; 642168404Spjd int numbufs, i, err; 643168404Spjd 644168404Spjd /* 645168404Spjd * NB: we could do this block-at-a-time, but it's nice 646168404Spjd * to be reading in parallel. 
647168404Spjd */ 648168404Spjd err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, 649168404Spjd &numbufs, &dbp); 650168404Spjd if (err) 651168404Spjd return (err); 652168404Spjd 653168404Spjd for (i = 0; i < numbufs; i++) { 654168404Spjd int tocpy; 655168404Spjd int bufoff; 656168404Spjd dmu_buf_t *db = dbp[i]; 657168404Spjd 658168404Spjd ASSERT(size > 0); 659168404Spjd 660168404Spjd bufoff = uio->uio_loffset - db->db_offset; 661168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 662168404Spjd 663168404Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 664168404Spjd UIO_READ, uio); 665168404Spjd if (err) 666168404Spjd break; 667168404Spjd 668168404Spjd size -= tocpy; 669168404Spjd } 670168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 671168404Spjd 672168404Spjd return (err); 673168404Spjd} 674168404Spjd 675168404Spjdint 676168404Spjddmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, 677168404Spjd dmu_tx_t *tx) 678168404Spjd{ 679168404Spjd dmu_buf_t **dbp; 680168404Spjd int numbufs, i; 681168404Spjd int err = 0; 682168404Spjd 683168404Spjd if (size == 0) 684168404Spjd return (0); 685168404Spjd 686168404Spjd err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, 687168404Spjd FALSE, FTAG, &numbufs, &dbp); 688168404Spjd if (err) 689168404Spjd return (err); 690168404Spjd 691168404Spjd for (i = 0; i < numbufs; i++) { 692168404Spjd int tocpy; 693168404Spjd int bufoff; 694168404Spjd dmu_buf_t *db = dbp[i]; 695168404Spjd 696168404Spjd ASSERT(size > 0); 697168404Spjd 698168404Spjd bufoff = uio->uio_loffset - db->db_offset; 699168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 700168404Spjd 701168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 702168404Spjd 703168404Spjd if (tocpy == db->db_size) 704168404Spjd dmu_buf_will_fill(db, tx); 705168404Spjd else 706168404Spjd dmu_buf_will_dirty(db, tx); 707168404Spjd 708168404Spjd /* 709168404Spjd * XXX uiomove could block forever (eg. 
nfs-backed 710168404Spjd * pages). There needs to be a uiolockdown() function 711168404Spjd * to lock the pages in memory, so that uiomove won't 712168404Spjd * block. 713168404Spjd */ 714168404Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 715168404Spjd UIO_WRITE, uio); 716168404Spjd 717168404Spjd if (tocpy == db->db_size) 718168404Spjd dmu_buf_fill_done(db, tx); 719168404Spjd 720168404Spjd if (err) 721168404Spjd break; 722168404Spjd 723168404Spjd size -= tocpy; 724168404Spjd } 725168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 726168404Spjd return (err); 727168404Spjd} 728168404Spjd 729168404Spjd#ifndef __FreeBSD__ 730168404Spjdint 731168404Spjddmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 732168404Spjd page_t *pp, dmu_tx_t *tx) 733168404Spjd{ 734168404Spjd dmu_buf_t **dbp; 735168404Spjd int numbufs, i; 736168404Spjd int err; 737168404Spjd 738168404Spjd if (size == 0) 739168404Spjd return (0); 740168404Spjd 741168404Spjd err = dmu_buf_hold_array(os, object, offset, size, 742168404Spjd FALSE, FTAG, &numbufs, &dbp); 743168404Spjd if (err) 744168404Spjd return (err); 745168404Spjd 746168404Spjd for (i = 0; i < numbufs; i++) { 747168404Spjd int tocpy, copied, thiscpy; 748168404Spjd int bufoff; 749168404Spjd dmu_buf_t *db = dbp[i]; 750168404Spjd caddr_t va; 751168404Spjd 752168404Spjd ASSERT(size > 0); 753168404Spjd ASSERT3U(db->db_size, >=, PAGESIZE); 754168404Spjd 755168404Spjd bufoff = offset - db->db_offset; 756168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 757168404Spjd 758168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 759168404Spjd 760168404Spjd if (tocpy == db->db_size) 761168404Spjd dmu_buf_will_fill(db, tx); 762168404Spjd else 763168404Spjd dmu_buf_will_dirty(db, tx); 764168404Spjd 765168404Spjd for (copied = 0; copied < tocpy; copied += PAGESIZE) { 766168404Spjd ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); 767168404Spjd thiscpy = MIN(PAGESIZE, tocpy - copied); 768185029Spjd 
va = zfs_map_page(pp, S_READ); 769168404Spjd bcopy(va, (char *)db->db_data + bufoff, thiscpy); 770185029Spjd zfs_unmap_page(pp, va); 771168404Spjd pp = pp->p_next; 772168404Spjd bufoff += PAGESIZE; 773168404Spjd } 774168404Spjd 775168404Spjd if (tocpy == db->db_size) 776168404Spjd dmu_buf_fill_done(db, tx); 777168404Spjd 778168404Spjd offset += tocpy; 779168404Spjd size -= tocpy; 780168404Spjd } 781168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 782168404Spjd return (err); 783168404Spjd} 784168404Spjd#endif /* !__FreeBSD__ */ 785168404Spjd#endif /* _KERNEL */ 786168404Spjd 787209962Smm/* 788209962Smm * Allocate a loaned anonymous arc buffer. 789209962Smm */ 790209962Smmarc_buf_t * 791209962Smmdmu_request_arcbuf(dmu_buf_t *handle, int size) 792209962Smm{ 793209962Smm dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; 794209962Smm 795209962Smm return (arc_loan_buf(dn->dn_objset->os_spa, size)); 796209962Smm} 797209962Smm 798209962Smm/* 799209962Smm * Free a loaned arc buffer. 800209962Smm */ 801209962Smmvoid 802209962Smmdmu_return_arcbuf(arc_buf_t *buf) 803209962Smm{ 804209962Smm arc_return_buf(buf, FTAG); 805209962Smm VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); 806209962Smm} 807209962Smm 808209962Smm/* 809209962Smm * When possible directly assign passed loaned arc buffer to a dbuf. 810209962Smm * If this is not possible copy the contents of passed arc buf via 811209962Smm * dmu_write(). 
812209962Smm */ 813209962Smmvoid 814209962Smmdmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, 815209962Smm dmu_tx_t *tx) 816209962Smm{ 817209962Smm dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; 818209962Smm dmu_buf_impl_t *db; 819209962Smm uint32_t blksz = (uint32_t)arc_buf_size(buf); 820209962Smm uint64_t blkid; 821209962Smm 822209962Smm rw_enter(&dn->dn_struct_rwlock, RW_READER); 823209962Smm blkid = dbuf_whichblock(dn, offset); 824209962Smm VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); 825209962Smm rw_exit(&dn->dn_struct_rwlock); 826209962Smm 827209962Smm if (offset == db->db.db_offset && blksz == db->db.db_size) { 828209962Smm dbuf_assign_arcbuf(db, buf, tx); 829209962Smm dbuf_rele(db, FTAG); 830209962Smm } else { 831209962Smm dbuf_rele(db, FTAG); 832209962Smm ASSERT(dn->dn_objset->os.os == dn->dn_objset); 833209962Smm dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz, 834209962Smm buf->b_data, tx); 835209962Smm dmu_return_arcbuf(buf); 836209962Smm } 837209962Smm} 838209962Smm 839168404Spjdtypedef struct { 840168404Spjd dbuf_dirty_record_t *dr; 841168404Spjd dmu_sync_cb_t *done; 842168404Spjd void *arg; 843168404Spjd} dmu_sync_arg_t; 844168404Spjd 845168404Spjd/* ARGSUSED */ 846168404Spjdstatic void 847185029Spjddmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) 848185029Spjd{ 849185029Spjd blkptr_t *bp = zio->io_bp; 850209962Smm dmu_sync_arg_t *in = varg; 851209962Smm dbuf_dirty_record_t *dr = in->dr; 852209962Smm dmu_buf_impl_t *db = dr->dr_dbuf; 853185029Spjd 854185029Spjd if (!BP_IS_HOLE(bp)) { 855185029Spjd ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type); 856185029Spjd ASSERT(BP_GET_LEVEL(bp) == 0); 857185029Spjd bp->blk_fill = 1; 858209962Smm } else { 859209962Smm /* 860209962Smm * dmu_sync() can compress a block of zeros to a null blkptr 861209962Smm * but the block size still needs to be passed through to replay 862209962Smm */ 863209962Smm BP_SET_LSIZE(bp, db->db.db_size); 864185029Spjd } 865185029Spjd} 
866185029Spjd
/*
 * dmu_sync_done(): zio completion callback for dmu_sync().  Under db_mtx,
 * copies the just-written blkptr into the dirty record's dr_overridden_by
 * (normalizing a hole bp to all-zero via BP_ZERO), flips the override
 * state DR_IN_DMU_SYNC -> DR_OVERRIDDEN, and wakes waiters on db_changed.
 * Then runs the caller's `done` callback (if any) and frees the
 * dmu_sync_arg_t allocated in dmu_sync().
 */
867185029Spjd/* ARGSUSED */ 868185029Spjdstatic void 869168404Spjddmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) 870168404Spjd{ 871168404Spjd dmu_sync_arg_t *in = varg; 872168404Spjd dbuf_dirty_record_t *dr = in->dr; 873168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 874168404Spjd dmu_sync_cb_t *done = in->done; 875168404Spjd 876168404Spjd mutex_enter(&db->db_mtx); 877168404Spjd ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); 878168404Spjd dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ 879209962Smm if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) 880209962Smm BP_ZERO(&dr->dt.dl.dr_overridden_by); 881168404Spjd dr->dt.dl.dr_override_state = DR_OVERRIDDEN; 882168404Spjd cv_broadcast(&db->db_changed); 883168404Spjd mutex_exit(&db->db_mtx); 884168404Spjd 885168404Spjd if (done) 886168404Spjd done(&(db->db), in->arg); 887168404Spjd 888168404Spjd kmem_free(in, sizeof (dmu_sync_arg_t)); 889168404Spjd} 890168404Spjd 891168404Spjd/* 892168404Spjd * Intent log support: sync the block associated with db to disk. 893168404Spjd * N.B. and XXX: the caller is responsible for making sure that the 894168404Spjd * data isn't changing while dmu_sync() is writing it. 895168404Spjd * 896168404Spjd * Return values: 897168404Spjd * 898168404Spjd * EEXIST: this txg has already been synced, so there's nothing to to. 899168404Spjd * The caller should not log the write. 900168404Spjd * 901168404Spjd * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 902168404Spjd * The caller should not log the write. 903168404Spjd * 904168404Spjd * EALREADY: this block is already in the process of being synced. 905168404Spjd * The caller should track its progress (somehow). 906168404Spjd * 907168404Spjd * EINPROGRESS: the IO has been initiated. 908168404Spjd * The caller should log this blkptr in the callback. 909168404Spjd * 910168404Spjd * 0: completed. Sets *bp to the blkptr just written. 911168404Spjd * The caller should log this blkptr immediately. 912168404Spjd */ 913168404Spjdint 914168404Spjddmu_sync(zio_t *pio, dmu_buf_t *db_fake, 915168404Spjd blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) 916168404Spjd{ 917168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 918168404Spjd objset_impl_t *os = db->db_objset; 919168404Spjd dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; 920168404Spjd tx_state_t *tx = &dp->dp_tx; 921168404Spjd dbuf_dirty_record_t *dr; 922168404Spjd dmu_sync_arg_t *in; 923168404Spjd zbookmark_t zb; 924185029Spjd writeprops_t wp = { 0 }; 925168404Spjd zio_t *zio; 926168404Spjd int err; 927168404Spjd 928168404Spjd ASSERT(BP_IS_HOLE(bp)); 929168404Spjd ASSERT(txg != 0); 930168404Spjd 931168404Spjd dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", 932168404Spjd txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); 933168404Spjd 934168404Spjd /* 935168404Spjd * XXX - would be nice if we could do this without suspending... 936168404Spjd */ 937168404Spjd txg_suspend(dp); 938168404Spjd 939168404Spjd /* 940168404Spjd * If this txg already synced, there's nothing to do. 941168404Spjd */ 942168404Spjd if (txg <= tx->tx_synced_txg) { 943168404Spjd txg_resume(dp); 944168404Spjd /* 945168404Spjd * If we're running ziltest, we need the blkptr regardless. 946168404Spjd */ 947168404Spjd if (txg > spa_freeze_txg(dp->dp_spa)) { 948168404Spjd /* if db_blkptr == NULL, this was an empty write */ 949168404Spjd if (db->db_blkptr) 950168404Spjd *bp = *db->db_blkptr; /* structure assignment */ 951168404Spjd return (0); 952168404Spjd } 953168404Spjd return (EEXIST); 954168404Spjd } 955168404Spjd 956168404Spjd mutex_enter(&db->db_mtx); 957168404Spjd 958168404Spjd if (txg == tx->tx_syncing_txg) { 959168404Spjd while (db->db_data_pending) { 960168404Spjd /* 961168404Spjd * IO is in-progress. Wait for it to finish. 962168404Spjd * XXX - would be nice to be able to somehow "attach" 963168404Spjd * this zio to the parent zio passed in. 964168404Spjd */ 965168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 966168404Spjd if (!db->db_data_pending && 967168404Spjd db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { 968168404Spjd /* 969168404Spjd * IO was compressed away 970168404Spjd */ 971168404Spjd *bp = *db->db_blkptr; /* structure assignment */ 972168404Spjd mutex_exit(&db->db_mtx); 973168404Spjd txg_resume(dp); 974168404Spjd return (0); 975168404Spjd } 976168404Spjd ASSERT(db->db_data_pending || 977168404Spjd (db->db_blkptr && db->db_blkptr->blk_birth == txg)); 978168404Spjd } 979168404Spjd 980168404Spjd if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { 981168404Spjd /* 982168404Spjd * IO is already completed. 983168404Spjd */ 984168404Spjd *bp = *db->db_blkptr; /* structure assignment */ 985168404Spjd mutex_exit(&db->db_mtx); 986168404Spjd txg_resume(dp); 987168404Spjd return (0); 988168404Spjd } 989168404Spjd } 990168404Spjd
/* Walk db_last_dirty to find the dirty record for the requested txg. */
991168404Spjd dr = db->db_last_dirty; 992168404Spjd while (dr && dr->dr_txg > txg) 993168404Spjd dr = dr->dr_next; 994168404Spjd if (dr == NULL || dr->dr_txg < txg) { 995168404Spjd /* 996168404Spjd * This dbuf isn't dirty, must have been free_range'd. 997168404Spjd * There's no need to log writes to freed blocks, so we're done. 998168404Spjd */ 999168404Spjd mutex_exit(&db->db_mtx); 1000168404Spjd txg_resume(dp); 1001168404Spjd return (ENOENT); 1002168404Spjd } 1003168404Spjd 1004168404Spjd ASSERT(dr->dr_txg == txg); 1005168404Spjd if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 1006168404Spjd /* 1007168404Spjd * We have already issued a sync write for this buffer. 1008168404Spjd */ 1009168404Spjd mutex_exit(&db->db_mtx); 1010168404Spjd txg_resume(dp); 1011168404Spjd return (EALREADY); 1012168404Spjd } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 1013168404Spjd /* 1014168404Spjd * This buffer has already been synced. It could not 1015168404Spjd * have been dirtied since, or we would have cleared the state. 1016168404Spjd */ 1017168404Spjd *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ 1018168404Spjd mutex_exit(&db->db_mtx); 1019168404Spjd txg_resume(dp); 1020168404Spjd return (0); 1021168404Spjd } 1022168404Spjd 1023168404Spjd dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; 1024168404Spjd in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 1025168404Spjd in->dr = dr; 1026168404Spjd in->done = done; 1027168404Spjd in->arg = arg; 1028168404Spjd mutex_exit(&db->db_mtx); 1029168404Spjd txg_resume(dp); 1030168404Spjd 1031168404Spjd zb.zb_objset = os->os_dsl_dataset->ds_object; 1032168404Spjd zb.zb_object = db->db.db_object; 1033168404Spjd zb.zb_level = db->db_level; 1034168404Spjd zb.zb_blkid = db->db_blkid; 1035168404Spjd
/* Build the write policy from the dnode's and objset's settings. */
1036185029Spjd wp.wp_type = db->db_dnode->dn_type; 1037185029Spjd wp.wp_level = db->db_level; 1038185029Spjd wp.wp_copies = os->os_copies; 1039185029Spjd wp.wp_dnchecksum = db->db_dnode->dn_checksum; 1040185029Spjd wp.wp_oschecksum = os->os_checksum; 1041185029Spjd wp.wp_dncompress = db->db_dnode->dn_compress; 1042185029Spjd wp.wp_oscompress = os->os_compress; 1043185029Spjd 1044185029Spjd ASSERT(BP_IS_HOLE(bp)); 1045185029Spjd 1046185029Spjd zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db), 1047185029Spjd txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in, 1048185029Spjd ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 1049185029Spjd
/*
 * With a parent zio the write proceeds asynchronously (EINPROGRESS,
 * callback will see the blkptr); otherwise wait for it here.
 */
1050168404Spjd if (pio) { 1051168404Spjd zio_nowait(zio); 1052168404Spjd err = EINPROGRESS; 1053168404Spjd } else { 1054168404Spjd err = zio_wait(zio); 1055168404Spjd ASSERT(err == 0); 1056168404Spjd } 1057168404Spjd return (err); 1058168404Spjd} 1059168404Spjd
/*
 * dmu_object_set_blocksize(): change an object's data block size and/or
 * indirect block shift; thin wrapper around dnode_set_blksz().
 */
1060168404Spjdint 1061168404Spjddmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, 1062168404Spjd dmu_tx_t *tx) 1063168404Spjd{ 1064168404Spjd dnode_t *dn; 1065168404Spjd int err; 1066168404Spjd
1067168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 1068168404Spjd if (err) 1069168404Spjd return (err); 1070168404Spjd err = dnode_set_blksz(dn, size, ibs, tx); 1071168404Spjd dnode_rele(dn, FTAG); 1072168404Spjd return (err); 1073168404Spjd} 1074168404Spjd
/*
 * dmu_object_set_checksum(): set the per-dnode checksum function and mark
 * the dnode dirty in `tx`.  Note the existing XXX: the dnode_hold() error
 * is ignored here.
 */
1075168404Spjdvoid 1076168404Spjddmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, 1077168404Spjd dmu_tx_t *tx) 1078168404Spjd{ 1079168404Spjd dnode_t *dn; 1080168404Spjd 1081168404Spjd /* XXX assumes dnode_hold will not get an i/o error */ 1082168404Spjd (void) dnode_hold(os->os, object, FTAG, &dn); 1083168404Spjd ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); 1084168404Spjd dn->dn_checksum = checksum; 1085168404Spjd dnode_setdirty(dn, tx); 1086168404Spjd dnode_rele(dn, FTAG); 1087168404Spjd} 1088168404Spjd
/*
 * dmu_object_set_compress(): set the per-dnode compression function and
 * mark the dnode dirty; mirrors dmu_object_set_checksum() above,
 * including the ignored dnode_hold() error.
 */
1089168404Spjdvoid 1090168404Spjddmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, 1091168404Spjd dmu_tx_t *tx) 1092168404Spjd{ 1093168404Spjd dnode_t *dn; 1094168404Spjd 1095168404Spjd /* XXX assumes dnode_hold will not get an i/o error */ 1096168404Spjd (void) dnode_hold(os->os, object, FTAG, &dn); 1097168404Spjd ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); 1098168404Spjd dn->dn_compress = compress; 1099168404Spjd dnode_setdirty(dn, tx); 1100168404Spjd dnode_rele(dn, FTAG); 1101168404Spjd} 1102168404Spjd
/*
 * dmu_offset_next(): find the next hole (if `hole`) or data region at or
 * after *off via dnode_next_offset().  If the dnode has dirty records in
 * any open txg it first waits for a sync so block pointers are current.
 */
1103168404Spjdint 1104168404Spjddmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) 1105168404Spjd{ 1106168404Spjd dnode_t *dn; 1107168404Spjd int i, err; 1108168404Spjd 1109168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 1110168404Spjd if (err) 1111168404Spjd return (err); 1112168404Spjd /* 1113168404Spjd * Sync any current changes before 1114168404Spjd * we go trundling through the block pointers.
1115168404Spjd */ 1116168404Spjd for (i = 0; i < TXG_SIZE; i++) { 1117168404Spjd if (list_link_active(&dn->dn_dirty_link[i])) 1118168404Spjd break; 1119168404Spjd } 1120168404Spjd if (i != TXG_SIZE) { 1121168404Spjd dnode_rele(dn, FTAG); 1122168404Spjd txg_wait_synced(dmu_objset_pool(os), 0); 1123168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 1124168404Spjd if (err) 1125168404Spjd return (err); 1126168404Spjd } 1127168404Spjd 1128185029Spjd err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); 1129168404Spjd dnode_rele(dn, FTAG); 1130168404Spjd 1131168404Spjd return (err); 1132168404Spjd} 1133168404Spjd
/*
 * dmu_object_info_from_dnode(): fill *doi from the dnode, holding
 * dn_struct_rwlock (reader) and dn_mtx while reading.  Physical block
 * count is DN_USED_BYTES rounded to 512-byte (SPA_MINBLOCKSIZE) units.
 */
1134168404Spjdvoid 1135168404Spjddmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) 1136168404Spjd{ 1137168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 1138168404Spjd mutex_enter(&dn->dn_mtx); 1139168404Spjd 1140168404Spjd doi->doi_data_block_size = dn->dn_datablksz; 1141168404Spjd doi->doi_metadata_block_size = dn->dn_indblkshift ? 1142168404Spjd 1ULL << dn->dn_indblkshift : 0; 1143168404Spjd doi->doi_indirection = dn->dn_nlevels; 1144168404Spjd doi->doi_checksum = dn->dn_checksum; 1145168404Spjd doi->doi_compress = dn->dn_compress; 1146168404Spjd doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + 1147168404Spjd SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; 1148168404Spjd doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; 1149168404Spjd doi->doi_type = dn->dn_type; 1150168404Spjd doi->doi_bonus_size = dn->dn_bonuslen; 1151168404Spjd doi->doi_bonus_type = dn->dn_bonustype; 1152168404Spjd 1153168404Spjd mutex_exit(&dn->dn_mtx); 1154168404Spjd rw_exit(&dn->dn_struct_rwlock); 1155168404Spjd} 1156168404Spjd 1157168404Spjd/* 1158168404Spjd * Get information on a DMU object. 1159168404Spjd * If doi is NULL, just indicates whether the object exists.
1160168404Spjd */ 1161168404Spjdint 1162168404Spjddmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) 1163168404Spjd{ 1164168404Spjd dnode_t *dn; 1165168404Spjd int err = dnode_hold(os->os, object, FTAG, &dn); 1166168404Spjd 1167168404Spjd if (err) 1168168404Spjd return (err); 1169168404Spjd 1170168404Spjd if (doi != NULL) 1171168404Spjd dmu_object_info_from_dnode(dn, doi); 1172168404Spjd 1173168404Spjd dnode_rele(dn, FTAG); 1174168404Spjd return (0); 1175168404Spjd} 1176168404Spjd 1177168404Spjd/* 1178168404Spjd * As above, but faster; can be used when you have a held dbuf in hand. 1179168404Spjd */ 1180168404Spjdvoid 1181168404Spjddmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) 1182168404Spjd{ 1183168404Spjd dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); 1184168404Spjd} 1185168404Spjd 1186168404Spjd/* 1187168404Spjd * Faster still when you only care about the size. 1188168404Spjd * This is specifically optimized for zfs_getattr(). 1189168404Spjd */ 1190168404Spjdvoid 1191168404Spjddmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
/*
 * NOTE(review): dn_datablksz / DN_USED_BYTES are read here without taking
 * dn_mtx (unlike dmu_object_info_from_dnode above) — presumably safe
 * because the caller holds the dbuf; confirm against callers.
 */
1192168404Spjd{ 1193168404Spjd dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 1194168404Spjd 1195168404Spjd *blksize = dn->dn_datablksz; 1196168404Spjd /* add 1 for dnode space */ 1197168404Spjd *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> 1198168404Spjd SPA_MINBLOCKSHIFT) + 1; 1199168404Spjd} 1200168404Spjd
/*
 * byteswap_uint64_array(): in-place 64-bit byte swap of `size` bytes;
 * size must be a multiple of 8 (asserted).
 */
1201168404Spjdvoid 1202168404Spjdbyteswap_uint64_array(void *vbuf, size_t size) 1203168404Spjd{ 1204168404Spjd uint64_t *buf = vbuf; 1205168404Spjd size_t count = size >> 3; 1206168404Spjd int i; 1207168404Spjd 1208168404Spjd ASSERT((size & 7) == 0); 1209168404Spjd 1210168404Spjd for (i = 0; i < count; i++) 1211168404Spjd buf[i] = BSWAP_64(buf[i]); 1212168404Spjd} 1213168404Spjd
/*
 * byteswap_uint32_array(): in-place 32-bit byte swap; size must be a
 * multiple of 4 (asserted).
 */
1214168404Spjdvoid 1215168404Spjdbyteswap_uint32_array(void *vbuf, size_t size) 1216168404Spjd{ 1217168404Spjd uint32_t *buf = vbuf; 1218168404Spjd
size_t count = size >> 2; 1219168404Spjd int i; 1220168404Spjd 1221168404Spjd ASSERT((size & 3) == 0); 1222168404Spjd 1223168404Spjd for (i = 0; i < count; i++) 1224168404Spjd buf[i] = BSWAP_32(buf[i]); 1225168404Spjd} 1226168404Spjd
/*
 * byteswap_uint16_array(): in-place 16-bit byte swap; size must be even
 * (asserted).
 */
1227168404Spjdvoid 1228168404Spjdbyteswap_uint16_array(void *vbuf, size_t size) 1229168404Spjd{ 1230168404Spjd uint16_t *buf = vbuf; 1231168404Spjd size_t count = size >> 1; 1232168404Spjd int i; 1233168404Spjd 1234168404Spjd ASSERT((size & 1) == 0); 1235168404Spjd 1236168404Spjd for (i = 0; i < count; i++) 1237168404Spjd buf[i] = BSWAP_16(buf[i]); 1238168404Spjd} 1239168404Spjd
/* Single-byte data needs no swapping: deliberate no-op. */
1240168404Spjd/* ARGSUSED */ 1241168404Spjdvoid 1242168404Spjdbyteswap_uint8_array(void *vbuf, size_t size) 1243168404Spjd{ 1244168404Spjd} 1245168404Spjd
/* Module init: bring up dbuf, dnode, zfetch, ARC and L2ARC subsystems. */
1246168404Spjdvoid 1247168404Spjddmu_init(void) 1248168404Spjd{ 1249168404Spjd dbuf_init(); 1250168404Spjd dnode_init(); 1251208130Smm zfetch_init(); 1252168404Spjd arc_init(); 1253185029Spjd l2arc_init(); 1254168404Spjd}
/*
 * Module teardown.  NOTE(review): l2arc_fini() runs last rather than in
 * strict reverse of dmu_init()'s arc_init()/l2arc_init() order — verify
 * against upstream whether this ordering is intentional.
 */
1255168404Spjd 1256168404Spjdvoid 1257168404Spjddmu_fini(void) 1258168404Spjd{ 1259168404Spjd arc_fini(); 1260208130Smm zfetch_fini(); 1261168404Spjd dnode_fini(); 1262168404Spjd dbuf_fini(); 1263185029Spjd l2arc_fini(); 1264168404Spjd} 1265