/* dmu.c — revision 168404 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
24168404Spjd */ 25168404Spjd 26168404Spjd#pragma ident "%Z%%M% %I% %E% SMI" 27168404Spjd 28168404Spjd#include <sys/dmu.h> 29168404Spjd#include <sys/dmu_impl.h> 30168404Spjd#include <sys/dmu_tx.h> 31168404Spjd#include <sys/dbuf.h> 32168404Spjd#include <sys/dnode.h> 33168404Spjd#include <sys/zfs_context.h> 34168404Spjd#include <sys/dmu_objset.h> 35168404Spjd#include <sys/dmu_traverse.h> 36168404Spjd#include <sys/dsl_dataset.h> 37168404Spjd#include <sys/dsl_dir.h> 38168404Spjd#include <sys/dsl_pool.h> 39168404Spjd#include <sys/dsl_synctask.h> 40168404Spjd#include <sys/dsl_prop.h> 41168404Spjd#include <sys/dmu_zfetch.h> 42168404Spjd#include <sys/zfs_ioctl.h> 43168404Spjd#include <sys/zap.h> 44168404Spjd#include <sys/zio_checksum.h> 45168404Spjd 46168404Spjdconst dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 47168404Spjd { byteswap_uint8_array, TRUE, "unallocated" }, 48168404Spjd { zap_byteswap, TRUE, "object directory" }, 49168404Spjd { byteswap_uint64_array, TRUE, "object array" }, 50168404Spjd { byteswap_uint8_array, TRUE, "packed nvlist" }, 51168404Spjd { byteswap_uint64_array, TRUE, "packed nvlist size" }, 52168404Spjd { byteswap_uint64_array, TRUE, "bplist" }, 53168404Spjd { byteswap_uint64_array, TRUE, "bplist header" }, 54168404Spjd { byteswap_uint64_array, TRUE, "SPA space map header" }, 55168404Spjd { byteswap_uint64_array, TRUE, "SPA space map" }, 56168404Spjd { byteswap_uint64_array, TRUE, "ZIL intent log" }, 57168404Spjd { dnode_buf_byteswap, TRUE, "DMU dnode" }, 58168404Spjd { dmu_objset_byteswap, TRUE, "DMU objset" }, 59168404Spjd { byteswap_uint64_array, TRUE, "DSL directory" }, 60168404Spjd { zap_byteswap, TRUE, "DSL directory child map"}, 61168404Spjd { zap_byteswap, TRUE, "DSL dataset snap map" }, 62168404Spjd { zap_byteswap, TRUE, "DSL props" }, 63168404Spjd { byteswap_uint64_array, TRUE, "DSL dataset" }, 64168404Spjd { zfs_znode_byteswap, TRUE, "ZFS znode" }, 65168404Spjd { zfs_acl_byteswap, TRUE, "ZFS ACL" }, 66168404Spjd { 
byteswap_uint8_array, FALSE, "ZFS plain file" }, 67168404Spjd { zap_byteswap, TRUE, "ZFS directory" }, 68168404Spjd { zap_byteswap, TRUE, "ZFS master node" }, 69168404Spjd { zap_byteswap, TRUE, "ZFS delete queue" }, 70168404Spjd { byteswap_uint8_array, FALSE, "zvol object" }, 71168404Spjd { zap_byteswap, TRUE, "zvol prop" }, 72168404Spjd { byteswap_uint8_array, FALSE, "other uint8[]" }, 73168404Spjd { byteswap_uint64_array, FALSE, "other uint64[]" }, 74168404Spjd { zap_byteswap, TRUE, "other ZAP" }, 75168404Spjd { zap_byteswap, TRUE, "persistent error log" }, 76168404Spjd { byteswap_uint8_array, TRUE, "SPA history" }, 77168404Spjd { byteswap_uint64_array, TRUE, "SPA history offsets" }, 78168404Spjd { zap_byteswap, TRUE, "Pool properties" }, 79168404Spjd}; 80168404Spjd 81168404Spjdint 82168404Spjddmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, 83168404Spjd void *tag, dmu_buf_t **dbp) 84168404Spjd{ 85168404Spjd dnode_t *dn; 86168404Spjd uint64_t blkid; 87168404Spjd dmu_buf_impl_t *db; 88168404Spjd int err; 89168404Spjd 90168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 91168404Spjd if (err) 92168404Spjd return (err); 93168404Spjd blkid = dbuf_whichblock(dn, offset); 94168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 95168404Spjd db = dbuf_hold(dn, blkid, tag); 96168404Spjd rw_exit(&dn->dn_struct_rwlock); 97168404Spjd if (db == NULL) { 98168404Spjd err = EIO; 99168404Spjd } else { 100168404Spjd err = dbuf_read(db, NULL, DB_RF_CANFAIL); 101168404Spjd if (err) { 102168404Spjd dbuf_rele(db, tag); 103168404Spjd db = NULL; 104168404Spjd } 105168404Spjd } 106168404Spjd 107168404Spjd dnode_rele(dn, FTAG); 108168404Spjd *dbp = &db->db; 109168404Spjd return (err); 110168404Spjd} 111168404Spjd 112168404Spjdint 113168404Spjddmu_bonus_max(void) 114168404Spjd{ 115168404Spjd return (DN_MAX_BONUSLEN); 116168404Spjd} 117168404Spjd 118168404Spjd/* 119168404Spjd * returns ENOENT, EIO, or 0. 
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	int err, count;
	dmu_buf_impl_t *db;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);

	/*
	 * Lazily create the bonus dbuf: take the reader lock first,
	 * and only upgrade to the writer lock (with a re-check) when
	 * creation is actually needed.
	 */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_bonus == NULL) {
		rw_exit(&dn->dn_struct_rwlock);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		if (dn->dn_bonus == NULL)
			dn->dn_bonus = dbuf_create_bonus(dn);
	}
	db = dn->dn_bonus;
	rw_exit(&dn->dn_struct_rwlock);
	mutex_enter(&db->db_mtx);
	count = refcount_add(&db->db_holds, tag);
	mutex_exit(&db->db_mtx);
	/* The first hold on the bonus dbuf pins the dnode as well. */
	if (count == 1)
		dnode_add_ref(dn, db);
	dnode_rele(dn, FTAG);

	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));

	*dbp = &db->db;
	return (0);
}

/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
/*
 * Hold (and, if `read', read in) every data block of `dn' overlapping
 * [offset, offset+length).  On success, *dbpp is a kmem-allocated array
 * of *numbufsp held buffers; release with dmu_buf_rele_array().
 * Returns 0, EIO, or an I/O error.
 */
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
	/* Very large reads would thrash the prefetch stream; skip it. */
	if (length > zfetch_array_rd_sz)
		flags |= DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		/* Power-of-two block size: count the spanned blocks. */
		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
			P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	} else {
		/* Odd block size: all data must live in the first block. */
		if (offset + length > dn->dn_datablksz) {
			zfs_panic_recover("zfs: accessing past end of object "
			    "%llx/%llx (size=%u access=%llu+%llu)",
			    (longlong_t)dn->dn_objset->
			    os_dsl_dataset->ds_object,
			    (longlong_t)dn->dn_object, dn->dn_datablksz,
			    (longlong_t)offset, (longlong_t)length);
			return (EIO);
		}
		nblks = 1;
	}
	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

	/* Parent zio under which all the per-block reads are issued. */
	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
	blkid = dbuf_whichblock(dn, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
		if (db == NULL) {
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			zio_nowait(zio);
			return (EIO);
		}
		/* initiate async i/o */
		if (read) {
			rw_exit(&dn->dn_struct_rwlock);
			(void) dbuf_read(db, zio, flags);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
		}
		dbp[i] = &db->db;
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
		dmu_buf_rele_array(dbp, nblks, tag);
		return (err);
	}

	/* wait for other io to complete */
	if (read) {
		for (i = 0; i < nblks; i++) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
			mutex_enter(&db->db_mtx);
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL)
				cv_wait(&db->db_changed, &db->db_mtx);
			if (db->db_state == DB_UNCACHED)
				err = EIO;
			mutex_exit(&db->db_mtx);
			if (err) {
				dmu_buf_rele_array(dbp, nblks, tag);
				return (err);
			}
		}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
}

/* As above, but look the dnode up from <os, object> first. */
static int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp);

	dnode_rele(dn, FTAG);

	return (err);
}

/* As above, but use the dnode already pinned by a held bonus buffer. */
int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
	int err;

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp);

	return (err);
}

/*
 * Release every (non-NULL) buffer in an array obtained from
 * dmu_buf_hold_array*(), then free the array itself.
 */
void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
	int i;
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

	if (numbufs == 0)
		return;

	for (i = 0; i < numbufs; i++) {
		if (dbp[i])
			dbuf_rele(dbp[i], tag);
	}

	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}

/*
 * Issue prefetch I/O for [offset, offset+len) of `object'.
 * len == 0 requests a prefetch of the dnode block itself (for the
 * bonus buffer).  Best-effort: errors are silently ignored.
 */
void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, i, err;

	if (zfs_prefetch_disable)
		return;

	if (len == 0) {	/* they're interested in the bonus buffer */
		dn = os->os->os_meta_dnode;

		if (object == 0 || object >= DN_MAX_OBJECT)
			return;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
		dbuf_prefetch(dn, blkid);
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}

	/*
	 * XXX - Note, if the dnode for the requested object is not
	 * already cached, we will do a *synchronous* read in the
	 * dnode_hold() call.  The same is true for any indirects.
321168404Spjd */ 322168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 323168404Spjd if (err != 0) 324168404Spjd return; 325168404Spjd 326168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 327168404Spjd if (dn->dn_datablkshift) { 328168404Spjd int blkshift = dn->dn_datablkshift; 329168404Spjd nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - 330168404Spjd P2ALIGN(offset, 1<<blkshift)) >> blkshift; 331168404Spjd } else { 332168404Spjd nblks = (offset < dn->dn_datablksz); 333168404Spjd } 334168404Spjd 335168404Spjd if (nblks != 0) { 336168404Spjd blkid = dbuf_whichblock(dn, offset); 337168404Spjd for (i = 0; i < nblks; i++) 338168404Spjd dbuf_prefetch(dn, blkid+i); 339168404Spjd } 340168404Spjd 341168404Spjd rw_exit(&dn->dn_struct_rwlock); 342168404Spjd 343168404Spjd dnode_rele(dn, FTAG); 344168404Spjd} 345168404Spjd 346168404Spjdint 347168404Spjddmu_free_range(objset_t *os, uint64_t object, uint64_t offset, 348168404Spjd uint64_t size, dmu_tx_t *tx) 349168404Spjd{ 350168404Spjd dnode_t *dn; 351168404Spjd int err = dnode_hold(os->os, object, FTAG, &dn); 352168404Spjd if (err) 353168404Spjd return (err); 354168404Spjd ASSERT(offset < UINT64_MAX); 355168404Spjd ASSERT(size == -1ULL || size <= UINT64_MAX - offset); 356168404Spjd dnode_free_range(dn, offset, size, tx); 357168404Spjd dnode_rele(dn, FTAG); 358168404Spjd return (0); 359168404Spjd} 360168404Spjd 361168404Spjdint 362168404Spjddmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 363168404Spjd void *buf) 364168404Spjd{ 365168404Spjd dnode_t *dn; 366168404Spjd dmu_buf_t **dbp; 367168404Spjd int numbufs, i, err; 368168404Spjd 369168404Spjd err = dnode_hold(os->os, object, FTAG, &dn); 370168404Spjd if (err) 371168404Spjd return (err); 372168404Spjd 373168404Spjd /* 374168404Spjd * Deal with odd block sizes, where there can't be data past the first 375168404Spjd * block. If we ever do the tail block optimization, we will need to 376168404Spjd * handle that here as well. 
377168404Spjd */ 378168404Spjd if (dn->dn_datablkshift == 0) { 379168404Spjd int newsz = offset > dn->dn_datablksz ? 0 : 380168404Spjd MIN(size, dn->dn_datablksz - offset); 381168404Spjd bzero((char *)buf + newsz, size - newsz); 382168404Spjd size = newsz; 383168404Spjd } 384168404Spjd 385168404Spjd while (size > 0) { 386168404Spjd uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); 387168404Spjd int err; 388168404Spjd 389168404Spjd /* 390168404Spjd * NB: we could do this block-at-a-time, but it's nice 391168404Spjd * to be reading in parallel. 392168404Spjd */ 393168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, 394168404Spjd TRUE, FTAG, &numbufs, &dbp); 395168404Spjd if (err) 396168404Spjd return (err); 397168404Spjd 398168404Spjd for (i = 0; i < numbufs; i++) { 399168404Spjd int tocpy; 400168404Spjd int bufoff; 401168404Spjd dmu_buf_t *db = dbp[i]; 402168404Spjd 403168404Spjd ASSERT(size > 0); 404168404Spjd 405168404Spjd bufoff = offset - db->db_offset; 406168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 407168404Spjd 408168404Spjd bcopy((char *)db->db_data + bufoff, buf, tocpy); 409168404Spjd 410168404Spjd offset += tocpy; 411168404Spjd size -= tocpy; 412168404Spjd buf = (char *)buf + tocpy; 413168404Spjd } 414168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 415168404Spjd } 416168404Spjd dnode_rele(dn, FTAG); 417168404Spjd return (0); 418168404Spjd} 419168404Spjd 420168404Spjdvoid 421168404Spjddmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 422168404Spjd const void *buf, dmu_tx_t *tx) 423168404Spjd{ 424168404Spjd dmu_buf_t **dbp; 425168404Spjd int numbufs, i; 426168404Spjd 427168404Spjd if (size == 0) 428168404Spjd return; 429168404Spjd 430168404Spjd VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 431168404Spjd FALSE, FTAG, &numbufs, &dbp)); 432168404Spjd 433168404Spjd for (i = 0; i < numbufs; i++) { 434168404Spjd int tocpy; 435168404Spjd int bufoff; 436168404Spjd dmu_buf_t *db = dbp[i]; 437168404Spjd 
		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		/* Only the first and last buffers may be partial. */
		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		/* A whole-block write can skip the read-modify-write. */
		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		bcopy(buf, (char *)db->db_data + bufoff, tocpy);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
		buf = (char *)buf + tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

#ifdef _KERNEL
/*
 * Copy `size' bytes starting at uio->uio_loffset of `object' into the
 * caller's uio.  Returns 0 or an error from the hold or uiomove().
 */
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;

	/*
	 * NB: we could do this block-at-a-time, but it's nice
	 * to be reading in parallel.
	 */
	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
	    &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		/* uiomove() advances uio_loffset for us. */
		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_READ, uio);
		if (err)
			break;

		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}

/*
 * Write `size' bytes from the caller's uio starting at uio->uio_loffset
 * of `object' in transaction `tx'.  Returns 0 or an error.
 */
int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err = 0;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		/*
		 * XXX uiomove could block forever (eg.
nfs-backed
		 * pages).  There needs to be a uiolockdown() function
		 * to lock the pages in memory, so that uiomove won't
		 * block.
		 */
		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_WRITE, uio);

		/* Complete the fill even if uiomove() failed partway. */
		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}

#ifndef __FreeBSD__
/*
 * Write `size' bytes from the page list `pp' to `offset' of `object'
 * in transaction `tx'.  Pages are mapped one at a time with ppmapin().
 */
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    page_t *pp, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy, copied, thiscpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];
		caddr_t va;

		ASSERT(size > 0);
		ASSERT3U(db->db_size, >=, PAGESIZE);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		/* Copy page by page; pages must be contiguous in offset. */
		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
			thiscpy = MIN(PAGESIZE, tocpy - copied);
va = ppmapin(pp, PROT_READ, (caddr_t)-1);
			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
			ppmapout(va);
			pp = pp->p_next;
			bufoff += PAGESIZE;
		}

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}
#endif	/* !__FreeBSD__ */
#endif	/* _KERNEL */

/* State passed from dmu_sync() to its arc_write() completion callback. */
typedef struct {
	dbuf_dirty_record_t	*dr;
	dmu_sync_cb_t		*done;
	void			*arg;
} dmu_sync_arg_t;

/*
 * arc_write() completion callback for dmu_sync(): record the block
 * pointer we just wrote in the dirty record, mark it overridden, wake
 * any waiters, and invoke the caller's callback.
 */
/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
	dmu_sync_arg_t *in = varg;
	dbuf_dirty_record_t *dr = in->dr;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dmu_sync_cb_t *done = in->done;

	if (!BP_IS_HOLE(zio->io_bp)) {
		zio->io_bp->blk_fill = 1;
		BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
		BP_SET_LEVEL(zio->io_bp, 0);
	}

	mutex_enter(&db->db_mtx);
	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
	dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
	dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);

	if (done)
		done(&(db->db), in->arg);

	kmem_free(in, sizeof (dmu_sync_arg_t));
}

/*
 * Intent log support: sync the block associated with db to disk.
 * N.B.
and XXX: the caller is responsible for making sure that the
 * data isn't changing while dmu_sync() is writing it.
 *
 * Return values:
 *
 * EEXIST: this txg has already been synced, so there's nothing to do.
 *	The caller should not log the write.
 *
 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
 *	The caller should not log the write.
 *
 * EALREADY: this block is already in the process of being synced.
 *	The caller should track its progress (somehow).
 *
 * EINPROGRESS: the IO has been initiated.
 *	The caller should log this blkptr in the callback.
 *
 * 0: completed.  Sets *bp to the blkptr just written.
 *	The caller should log this blkptr immediately.
 */
int
dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
    blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	objset_impl_t *os = db->db_objset;
	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
	tx_state_t *tx = &dp->dp_tx;
	dbuf_dirty_record_t *dr;
	dmu_sync_arg_t *in;
	zbookmark_t zb;
	zio_t *zio;
	int zio_flags;
	int err;

	ASSERT(BP_IS_HOLE(bp));
	ASSERT(txg != 0);


	dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
	    txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);

	/*
	 * XXX - would be nice if we could do this without suspending...
	 */
	txg_suspend(dp);

	/*
	 * If this txg already synced, there's nothing to do.
	 */
	if (txg <= tx->tx_synced_txg) {
		txg_resume(dp);
		/*
		 * If we're running ziltest, we need the blkptr regardless.
		 */
		if (txg > spa_freeze_txg(dp->dp_spa)) {
			/* if db_blkptr == NULL, this was an empty write */
			if (db->db_blkptr)
				*bp = *db->db_blkptr; /* structure assignment */
			return (0);
		}
		return (EEXIST);
	}

	mutex_enter(&db->db_mtx);

	if (txg == tx->tx_syncing_txg) {
		while (db->db_data_pending) {
			/*
			 * IO is in-progress.  Wait for it to finish.
			 * XXX - would be nice to be able to somehow "attach"
			 * this zio to the parent zio passed in.
			 */
			cv_wait(&db->db_changed, &db->db_mtx);
			if (!db->db_data_pending &&
			    db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
				/*
				 * IO was compressed away
				 */
				*bp = *db->db_blkptr; /* structure assignment */
				mutex_exit(&db->db_mtx);
				txg_resume(dp);
				return (0);
			}
			ASSERT(db->db_data_pending ||
			    (db->db_blkptr && db->db_blkptr->blk_birth == txg));
		}

		if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
			/*
			 * IO is already completed.
			 */
			*bp = *db->db_blkptr; /* structure assignment */
			mutex_exit(&db->db_mtx);
			txg_resume(dp);
			return (0);
		}
	}

	/* Find the dirty record for exactly this txg. */
	dr = db->db_last_dirty;
	while (dr && dr->dr_txg > txg)
		dr = dr->dr_next;
	if (dr == NULL || dr->dr_txg < txg) {
		/*
		 * This dbuf isn't dirty, must have been free_range'd.
		 * There's no need to log writes to freed blocks, so we're done.
		 */
		mutex_exit(&db->db_mtx);
		txg_resume(dp);
		return (ENOENT);
	}

	ASSERT(dr->dr_txg == txg);
	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		/*
		 * We have already issued a sync write for this buffer.
		 */
		mutex_exit(&db->db_mtx);
		txg_resume(dp);
		return (EALREADY);
	} else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * This buffer has already been synced.  It could not
		 * have been dirtied since, or we would have cleared the state.
		 */
		*bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
		mutex_exit(&db->db_mtx);
		txg_resume(dp);
		return (0);
	}

	/* Mark the record in-sync and issue the write via the ARC. */
	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
	in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	in->dr = dr;
	in->done = done;
	in->arg = arg;
	mutex_exit(&db->db_mtx);
	txg_resume(dp);

	zb.zb_objset = os->os_dsl_dataset->ds_object;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;
	zio_flags = ZIO_FLAG_MUSTSUCCEED;
	if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
		zio_flags |= ZIO_FLAG_METADATA;
	zio = arc_write(pio, os->os_spa,
	    zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
	    zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
	    dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
	    txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
	    ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);

	/* With a parent zio the write is async; otherwise wait here. */
	if (pio) {
		zio_nowait(zio);
		err = EINPROGRESS;
	} else {
		err = zio_wait(zio);
		ASSERT(err == 0);
	}
	return (err);
}

/* Change the data block size (and indirect block shift) of an object. */
int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	err = dnode_set_blksz(dn, size, ibs, tx);
	dnode_rele(dn, FTAG);
	return (err);
}

/* Set the checksum algorithm used for future writes to an object. */
void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/* XXX assumes dnode_hold will not get an i/o error */
	(void) dnode_hold(os->os, object, FTAG, &dn);
	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
	dn->dn_checksum = checksum;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

/* Set the compression algorithm used for future writes to an object. */
void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/* XXX assumes dnode_hold will not get an i/o error */
	(void) dnode_hold(os->os, object, FTAG, &dn);
	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
	dn->dn_compress = compress;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

/*
 * Compute the number of copies to write for a block: the objset's
 * copies setting, plus one for metadata, capped at the pool maximum.
 */
int
dmu_get_replication_level(objset_impl_t *os,
    zbookmark_t *zb, dmu_object_type_t ot)
{
	int ncopies = os->os_copies;

	/* If it's the mos, it should have max copies set.
 */
	ASSERT(zb->zb_objset != 0 ||
	    ncopies == spa_max_replication(os->os_spa));

	if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
		ncopies++;
	return (MIN(ncopies, spa_max_replication(os->os_spa)));
}

/*
 * Find the next hole (hole == B_TRUE) or data region at or after *off
 * in `object', storing the result back in *off.  Forces a txg sync
 * first if the dnode has pending dirty state, so the block pointers
 * are up to date.  Returns 0, ESRCH/error from dnode_next_offset(),
 * or an error from dnode_hold().
 */
int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
	dnode_t *dn;
	int i, err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	/*
	 * Sync any current changes before
	 * we go trundling through the block pointers.
	 */
	for (i = 0; i < TXG_SIZE; i++) {
		if (list_link_active(&dn->dn_dirty_link[i]))
			break;
	}
	if (i != TXG_SIZE) {
		/* Drop the hold across the sync, then re-acquire. */
		dnode_rele(dn, FTAG);
		txg_wait_synced(dmu_objset_pool(os), 0);
		err = dnode_hold(os->os, object, FTAG, &dn);
		if (err)
			return (err);
	}

	err = dnode_next_offset(dn, hole, off, 1, 1, 0);
	dnode_rele(dn, FTAG);

	return (err);
}

/* Fill in a dmu_object_info_t from a held dnode. */
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	mutex_enter(&dn->dn_mtx);

	doi->doi_data_block_size = dn->dn_datablksz;
	doi->doi_metadata_block_size = dn->dn_indblkshift ?
911168404Spjd 1ULL << dn->dn_indblkshift : 0; 912168404Spjd doi->doi_indirection = dn->dn_nlevels; 913168404Spjd doi->doi_checksum = dn->dn_checksum; 914168404Spjd doi->doi_compress = dn->dn_compress; 915168404Spjd doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + 916168404Spjd SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; 917168404Spjd doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; 918168404Spjd doi->doi_type = dn->dn_type; 919168404Spjd doi->doi_bonus_size = dn->dn_bonuslen; 920168404Spjd doi->doi_bonus_type = dn->dn_bonustype; 921168404Spjd 922168404Spjd mutex_exit(&dn->dn_mtx); 923168404Spjd rw_exit(&dn->dn_struct_rwlock); 924168404Spjd} 925168404Spjd 926168404Spjd/* 927168404Spjd * Get information on a DMU object. 928168404Spjd * If doi is NULL, just indicates whether the object exists. 929168404Spjd */ 930168404Spjdint 931168404Spjddmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) 932168404Spjd{ 933168404Spjd dnode_t *dn; 934168404Spjd int err = dnode_hold(os->os, object, FTAG, &dn); 935168404Spjd 936168404Spjd if (err) 937168404Spjd return (err); 938168404Spjd 939168404Spjd if (doi != NULL) 940168404Spjd dmu_object_info_from_dnode(dn, doi); 941168404Spjd 942168404Spjd dnode_rele(dn, FTAG); 943168404Spjd return (0); 944168404Spjd} 945168404Spjd 946168404Spjd/* 947168404Spjd * As above, but faster; can be used when you have a held dbuf in hand. 948168404Spjd */ 949168404Spjdvoid 950168404Spjddmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) 951168404Spjd{ 952168404Spjd dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); 953168404Spjd} 954168404Spjd 955168404Spjd/* 956168404Spjd * Faster still when you only care about the size. 957168404Spjd * This is specifically optimized for zfs_getattr(). 
958168404Spjd */ 959168404Spjdvoid 960168404Spjddmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) 961168404Spjd{ 962168404Spjd dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 963168404Spjd 964168404Spjd *blksize = dn->dn_datablksz; 965168404Spjd /* add 1 for dnode space */ 966168404Spjd *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> 967168404Spjd SPA_MINBLOCKSHIFT) + 1; 968168404Spjd} 969168404Spjd 970168404Spjdvoid 971168404Spjdbyteswap_uint64_array(void *vbuf, size_t size) 972168404Spjd{ 973168404Spjd uint64_t *buf = vbuf; 974168404Spjd size_t count = size >> 3; 975168404Spjd int i; 976168404Spjd 977168404Spjd ASSERT((size & 7) == 0); 978168404Spjd 979168404Spjd for (i = 0; i < count; i++) 980168404Spjd buf[i] = BSWAP_64(buf[i]); 981168404Spjd} 982168404Spjd 983168404Spjdvoid 984168404Spjdbyteswap_uint32_array(void *vbuf, size_t size) 985168404Spjd{ 986168404Spjd uint32_t *buf = vbuf; 987168404Spjd size_t count = size >> 2; 988168404Spjd int i; 989168404Spjd 990168404Spjd ASSERT((size & 3) == 0); 991168404Spjd 992168404Spjd for (i = 0; i < count; i++) 993168404Spjd buf[i] = BSWAP_32(buf[i]); 994168404Spjd} 995168404Spjd 996168404Spjdvoid 997168404Spjdbyteswap_uint16_array(void *vbuf, size_t size) 998168404Spjd{ 999168404Spjd uint16_t *buf = vbuf; 1000168404Spjd size_t count = size >> 1; 1001168404Spjd int i; 1002168404Spjd 1003168404Spjd ASSERT((size & 1) == 0); 1004168404Spjd 1005168404Spjd for (i = 0; i < count; i++) 1006168404Spjd buf[i] = BSWAP_16(buf[i]); 1007168404Spjd} 1008168404Spjd 1009168404Spjd/* ARGSUSED */ 1010168404Spjdvoid 1011168404Spjdbyteswap_uint8_array(void *vbuf, size_t size) 1012168404Spjd{ 1013168404Spjd} 1014168404Spjd 1015168404Spjdvoid 1016168404Spjddmu_init(void) 1017168404Spjd{ 1018168404Spjd dbuf_init(); 1019168404Spjd dnode_init(); 1020168404Spjd arc_init(); 1021168404Spjd} 1022168404Spjd 1023168404Spjdvoid 1024168404Spjddmu_fini(void) 1025168404Spjd{ 1026168404Spjd arc_fini(); 
1027168404Spjd dnode_fini(); 1028168404Spjd dbuf_fini(); 1029168404Spjd} 1030