dmu.c revision 321549
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23304138Savg * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24168404Spjd */ 25251478Sdelphij/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ 26255750Sdelphij/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ 27268126Sdelphij/* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */ 28251478Sdelphij 29168404Spjd#include <sys/dmu.h> 30168404Spjd#include <sys/dmu_impl.h> 31168404Spjd#include <sys/dmu_tx.h> 32168404Spjd#include <sys/dbuf.h> 33168404Spjd#include <sys/dnode.h> 34168404Spjd#include <sys/zfs_context.h> 35168404Spjd#include <sys/dmu_objset.h> 36168404Spjd#include <sys/dmu_traverse.h> 37168404Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/dsl_dir.h> 39168404Spjd#include <sys/dsl_pool.h> 40168404Spjd#include <sys/dsl_synctask.h> 41168404Spjd#include <sys/dsl_prop.h> 42168404Spjd#include <sys/dmu_zfetch.h> 43168404Spjd#include <sys/zfs_ioctl.h> 44168404Spjd#include <sys/zap.h> 45168404Spjd#include <sys/zio_checksum.h> 46243524Smm#include <sys/zio_compress.h> 47219089Spjd#include <sys/sa.h> 48268126Sdelphij#include <sys/zfeature.h> 49219089Spjd#ifdef _KERNEL 50297633Strasz#include <sys/racct.h> 51258745Savg#include <sys/vm.h> 52185029Spjd#include <sys/zfs_znode.h> 53219089Spjd#endif 54168404Spjd 55243524Smm/* 56243524Smm * Enable/disable nopwrite feature. 57243524Smm */ 58243524Smmint zfs_nopwrite_enabled = 1; 59243525SmmSYSCTL_DECL(_vfs_zfs); 60243525SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, 61243525Smm &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); 62243524Smm 63321523Smav/* 64321523Smav * Tunable to control percentage of dirtied blocks from frees in one TXG. 65321523Smav * After this threshold is crossed, additional dirty blocks from frees 66321523Smav * wait until the next TXG. 67321523Smav * A value of zero will disable this throttle. 
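 *
 * For example (illustrative numbers only): with the default value of 30
 * and assuming zfs_dirty_data_max is 4 GB, dmu_free_long_range_impl()
 * below computes
 *
 *	dirty_frees_threshold =
 *	    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 *
 * i.e. roughly 1.2 GB of freed-and-dirtied data per TXG; once that much
 * has accumulated, further frees wait in txg_wait_open() for the next
 * TXG. The value can be changed at runtime through the
 * vfs.zfs.per_txg_dirty_frees_percent sysctl declared below.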
68321523Smav */ 69321523Smavuint32_t zfs_per_txg_dirty_frees_percent = 30; 70321523SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, 71321523Smav &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg"); 72321523Smav 73168404Spjdconst dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 74236884Smm { DMU_BSWAP_UINT8, TRUE, "unallocated" }, 75236884Smm { DMU_BSWAP_ZAP, TRUE, "object directory" }, 76236884Smm { DMU_BSWAP_UINT64, TRUE, "object array" }, 77236884Smm { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, 78236884Smm { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, 79236884Smm { DMU_BSWAP_UINT64, TRUE, "bpobj" }, 80236884Smm { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, 81236884Smm { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, 82236884Smm { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, 83236884Smm { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, 84236884Smm { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, 85236884Smm { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, 86236884Smm { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, 87236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, 88236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, 89236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL props" }, 90236884Smm { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, 91236884Smm { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, 92236884Smm { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, 93236884Smm { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, 94236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, 95236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, 96236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, 97236884Smm { DMU_BSWAP_UINT8, FALSE, "zvol object" }, 98236884Smm { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, 99236884Smm { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, 100236884Smm { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, 101236884Smm { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, 102236884Smm { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, 103236884Smm { DMU_BSWAP_UINT8, TRUE, "SPA history" }, 104236884Smm { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, 105236884Smm { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, 106236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, 107236884Smm { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, 108236884Smm { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, 109236884Smm { DMU_BSWAP_UINT8, TRUE, "FUID table" }, 110236884Smm { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, 111236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, 112236884Smm { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, 113236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, 114236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, 115236884Smm { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, 116236884Smm { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, 117236884Smm { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, 118236884Smm { DMU_BSWAP_UINT8, TRUE, "System attributes" }, 119236884Smm { DMU_BSWAP_ZAP, TRUE, "SA master node" }, 120236884Smm { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, 121236884Smm { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, 122236884Smm { DMU_BSWAP_ZAP, TRUE, "scan translations" }, 123236884Smm { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, 124236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, 125236884Smm { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, 126236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, 127236884Smm { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } 128168404Spjd}; 129168404Spjd 130236884Smmconst dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = 
{ 131236884Smm { byteswap_uint8_array, "uint8" }, 132236884Smm { byteswap_uint16_array, "uint16" }, 133236884Smm { byteswap_uint32_array, "uint32" }, 134236884Smm { byteswap_uint64_array, "uint64" }, 135236884Smm { zap_byteswap, "zap" }, 136236884Smm { dnode_buf_byteswap, "dnode" }, 137236884Smm { dmu_objset_byteswap, "objset" }, 138236884Smm { zfs_znode_byteswap, "znode" }, 139236884Smm { zfs_oldacl_byteswap, "oldacl" }, 140236884Smm { zfs_acl_byteswap, "acl" } 141236884Smm}; 142236884Smm 143168404Spjdint 144307290Smavdmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, 145307290Smav void *tag, dmu_buf_t **dbp) 146307290Smav{ 147307290Smav uint64_t blkid; 148307290Smav dmu_buf_impl_t *db; 149307290Smav 150307290Smav blkid = dbuf_whichblock(dn, 0, offset); 151307290Smav rw_enter(&dn->dn_struct_rwlock, RW_READER); 152307290Smav db = dbuf_hold(dn, blkid, tag); 153307290Smav rw_exit(&dn->dn_struct_rwlock); 154307290Smav 155307290Smav if (db == NULL) { 156307290Smav *dbp = NULL; 157307290Smav return (SET_ERROR(EIO)); 158307290Smav } 159307290Smav 160307290Smav *dbp = &db->db; 161307290Smav return (0); 162307290Smav} 163307290Smavint 164268075Sdelphijdmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, 165268075Sdelphij void *tag, dmu_buf_t **dbp) 166168404Spjd{ 167168404Spjd dnode_t *dn; 168168404Spjd uint64_t blkid; 169168404Spjd dmu_buf_impl_t *db; 170168404Spjd int err; 171168404Spjd 172219089Spjd err = dnode_hold(os, object, FTAG, &dn); 173168404Spjd if (err) 174168404Spjd return (err); 175286705Smav blkid = dbuf_whichblock(dn, 0, offset); 176168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 177168404Spjd db = dbuf_hold(dn, blkid, tag); 178168404Spjd rw_exit(&dn->dn_struct_rwlock); 179268075Sdelphij dnode_rele(dn, FTAG); 180268075Sdelphij 181168404Spjd if (db == NULL) { 182268075Sdelphij *dbp = NULL; 183268075Sdelphij return (SET_ERROR(EIO)); 184268075Sdelphij } 185268075Sdelphij 186268075Sdelphij *dbp = &db->db; 187268075Sdelphij return (err); 188268075Sdelphij} 189268075Sdelphij 190268075Sdelphijint 191307290Smavdmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, 192307290Smav void *tag, dmu_buf_t **dbp, int flags) 193307290Smav{ 194307290Smav int err; 195307290Smav int db_flags = DB_RF_CANFAIL; 196307290Smav 197307290Smav if (flags & DMU_READ_NO_PREFETCH) 198307290Smav db_flags |= DB_RF_NOPREFETCH; 199307290Smav 200307290Smav err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); 201307290Smav if (err == 0) { 202307290Smav dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); 203307290Smav err = dbuf_read(db, NULL, db_flags); 204307290Smav if (err != 0) { 205307290Smav dbuf_rele(db, tag); 206307290Smav *dbp = NULL; 207307290Smav } 208307290Smav } 209307290Smav 210307290Smav return (err); 211307290Smav} 212307290Smav 213307290Smavint 214268075Sdelphijdmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, 215268075Sdelphij void *tag, dmu_buf_t **dbp, int flags) 216268075Sdelphij{ 217268075Sdelphij int err; 218268075Sdelphij int db_flags = DB_RF_CANFAIL; 219268075Sdelphij 220268075Sdelphij if (flags & DMU_READ_NO_PREFETCH) 221268075Sdelphij db_flags |= DB_RF_NOPREFETCH; 222268075Sdelphij 223268075Sdelphij err = dmu_buf_hold_noread(os, object, offset, tag, dbp); 224268075Sdelphij if (err == 0) { 225268075Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); 226219089Spjd err = dbuf_read(db, NULL, db_flags); 227268075Sdelphij if (err != 0) { 228168404Spjd dbuf_rele(db, tag); 229268075Sdelphij *dbp = NULL; 230168404Spjd } 231168404Spjd } 232168404Spjd 
233168404Spjd return (err); 234168404Spjd} 235168404Spjd 236168404Spjdint 237168404Spjddmu_bonus_max(void) 238168404Spjd{ 239168404Spjd return (DN_MAX_BONUSLEN); 240168404Spjd} 241168404Spjd 242185029Spjdint 243219089Spjddmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) 244185029Spjd{ 245219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 246219089Spjd dnode_t *dn; 247219089Spjd int error; 248185029Spjd 249219089Spjd DB_DNODE_ENTER(db); 250219089Spjd dn = DB_DNODE(db); 251219089Spjd 252219089Spjd if (dn->dn_bonus != db) { 253249195Smm error = SET_ERROR(EINVAL); 254219089Spjd } else if (newsize < 0 || newsize > db_fake->db_size) { 255249195Smm error = SET_ERROR(EINVAL); 256219089Spjd } else { 257219089Spjd dnode_setbonuslen(dn, newsize, tx); 258219089Spjd error = 0; 259219089Spjd } 260219089Spjd 261219089Spjd DB_DNODE_EXIT(db); 262219089Spjd return (error); 263185029Spjd} 264185029Spjd 265219089Spjdint 266219089Spjddmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) 267219089Spjd{ 268219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 269219089Spjd dnode_t *dn; 270219089Spjd int error; 271219089Spjd 272219089Spjd DB_DNODE_ENTER(db); 273219089Spjd dn = DB_DNODE(db); 274219089Spjd 275236884Smm if (!DMU_OT_IS_VALID(type)) { 276249195Smm error = SET_ERROR(EINVAL); 277219089Spjd } else if (dn->dn_bonus != db) { 278249195Smm error = SET_ERROR(EINVAL); 279219089Spjd } else { 280219089Spjd dnode_setbonus_type(dn, type, tx); 281219089Spjd error = 0; 282219089Spjd } 283219089Spjd 284219089Spjd DB_DNODE_EXIT(db); 285219089Spjd return (error); 286219089Spjd} 287219089Spjd 288219089Spjddmu_object_type_t 289219089Spjddmu_get_bonustype(dmu_buf_t *db_fake) 290219089Spjd{ 291219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 292219089Spjd dnode_t *dn; 293219089Spjd dmu_object_type_t type; 294219089Spjd 295219089Spjd DB_DNODE_ENTER(db); 296219089Spjd dn = DB_DNODE(db); 297219089Spjd type = dn->dn_bonustype; 298219089Spjd DB_DNODE_EXIT(db); 299219089Spjd 300219089Spjd return (type); 301219089Spjd} 302219089Spjd 303219089Spjdint 304219089Spjddmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) 305219089Spjd{ 306219089Spjd dnode_t *dn; 307219089Spjd int error; 308219089Spjd 309219089Spjd error = dnode_hold(os, object, FTAG, &dn); 310219089Spjd dbuf_rm_spill(dn, tx); 311219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 312219089Spjd dnode_rm_spill(dn, tx); 313219089Spjd rw_exit(&dn->dn_struct_rwlock); 314219089Spjd dnode_rele(dn, FTAG); 315219089Spjd return (error); 316219089Spjd} 317219089Spjd 318168404Spjd/* 319168404Spjd * returns ENOENT, EIO, or 0. 
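 *
 * A minimal usage sketch (illustrative, assuming the caller already has
 * the objset and object number):
 *
 *	dmu_buf_t *bonus;
 *	error = dmu_bonus_hold(os, object, FTAG, &bonus);
 *	if (error == 0) {
 *		... read bonus->db_data, or call dmu_buf_will_dirty()
 *		    before modifying it under an assigned tx ...
 *		dmu_buf_rele(bonus, FTAG);
 *	}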
320168404Spjd */ 321168404Spjdint 322168404Spjddmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) 323168404Spjd{ 324168404Spjd dnode_t *dn; 325168404Spjd dmu_buf_impl_t *db; 326185029Spjd int error; 327168404Spjd 328219089Spjd error = dnode_hold(os, object, FTAG, &dn); 329185029Spjd if (error) 330185029Spjd return (error); 331168404Spjd 332168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 333168404Spjd if (dn->dn_bonus == NULL) { 334168404Spjd rw_exit(&dn->dn_struct_rwlock); 335168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 336168404Spjd if (dn->dn_bonus == NULL) 337185029Spjd dbuf_create_bonus(dn); 338168404Spjd } 339168404Spjd db = dn->dn_bonus; 340185029Spjd 341185029Spjd /* as long as the bonus buf is held, the dnode will be held */ 342219089Spjd if (refcount_add(&db->db_holds, tag) == 1) { 343185029Spjd VERIFY(dnode_add_ref(dn, db)); 344270248Sdelphij atomic_inc_32(&dn->dn_dbufs_count); 345219089Spjd } 346185029Spjd 347219089Spjd /* 348219089Spjd * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's 349219089Spjd * hold and incrementing the dbuf count to ensure that dnode_move() sees 350219089Spjd * a dnode hold for every dbuf. 351219089Spjd */ 352219089Spjd rw_exit(&dn->dn_struct_rwlock); 353219089Spjd 354168404Spjd dnode_rele(dn, FTAG); 355168404Spjd 356219089Spjd VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); 357168404Spjd 358168404Spjd *dbp = &db->db; 359168404Spjd return (0); 360168404Spjd} 361168404Spjd 362168404Spjd/* 363219089Spjd * returns ENOENT, EIO, or 0. 364219089Spjd * 365219089Spjd * This interface will allocate a blank spill dbuf when a spill blk 366219089Spjd * doesn't already exist on the dnode. 367219089Spjd * 368219089Spjd * if you only want to find an already existing spill db, then 369219089Spjd * dmu_spill_hold_existing() should be used. 
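 *
 * Illustrative sketch: a caller holding an object's bonus buffer that
 * only wants an existing spill block might do
 *
 *	dmu_buf_t *spill;
 *	error = dmu_spill_hold_existing(bonus, FTAG, &spill);
 *	if (error == 0)
 *		dmu_buf_rele(spill, FTAG);
 *
 * where ENOENT indicates the dnode has no spill block and EINVAL
 * indicates the pool predates SPA_VERSION_SA.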
370219089Spjd */ 371219089Spjdint 372219089Spjddmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) 373219089Spjd{ 374219089Spjd dmu_buf_impl_t *db = NULL; 375219089Spjd int err; 376219089Spjd 377219089Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 378219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 379219089Spjd 380219089Spjd db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); 381219089Spjd 382219089Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 383219089Spjd rw_exit(&dn->dn_struct_rwlock); 384219089Spjd 385219089Spjd ASSERT(db != NULL); 386219089Spjd err = dbuf_read(db, NULL, flags); 387219089Spjd if (err == 0) 388219089Spjd *dbp = &db->db; 389219089Spjd else 390219089Spjd dbuf_rele(db, tag); 391219089Spjd return (err); 392219089Spjd} 393219089Spjd 394219089Spjdint 395219089Spjddmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) 396219089Spjd{ 397219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; 398219089Spjd dnode_t *dn; 399219089Spjd int err; 400219089Spjd 401219089Spjd DB_DNODE_ENTER(db); 402219089Spjd dn = DB_DNODE(db); 403219089Spjd 404219089Spjd if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { 405249195Smm err = SET_ERROR(EINVAL); 406219089Spjd } else { 407219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 408219089Spjd 409219089Spjd if (!dn->dn_have_spill) { 410249195Smm err = SET_ERROR(ENOENT); 411219089Spjd } else { 412219089Spjd err = dmu_spill_hold_by_dnode(dn, 413219089Spjd DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); 414219089Spjd } 415219089Spjd 416219089Spjd rw_exit(&dn->dn_struct_rwlock); 417219089Spjd } 418219089Spjd 419219089Spjd DB_DNODE_EXIT(db); 420219089Spjd return (err); 421219089Spjd} 422219089Spjd 423219089Spjdint 424219089Spjddmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) 425219089Spjd{ 426219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; 427219089Spjd dnode_t *dn; 428219089Spjd int err; 429219089Spjd 430219089Spjd DB_DNODE_ENTER(db); 431219089Spjd dn = DB_DNODE(db); 432219089Spjd err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); 433219089Spjd DB_DNODE_EXIT(db); 434219089Spjd 435219089Spjd return (err); 436219089Spjd} 437219089Spjd 438219089Spjd/* 439168404Spjd * Note: longer-term, we should modify all of the dmu_buf_*() interfaces 440168404Spjd * to take a held dnode rather than <os, object> -- the lookup is wasteful, 441168404Spjd * and can induce severe lock contention when writing to several files 442168404Spjd * whose dnodes are in the same block. 443168404Spjd */ 444168404Spjdstatic int 445209962Smmdmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, 446287702Sdelphij boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) 447168404Spjd{ 448168404Spjd dmu_buf_t **dbp; 449168404Spjd uint64_t blkid, nblks, i; 450209962Smm uint32_t dbuf_flags; 451168404Spjd int err; 452168404Spjd zio_t *zio; 453168404Spjd 454168404Spjd ASSERT(length <= DMU_MAX_ACCESS); 455168404Spjd 456287702Sdelphij /* 457287702Sdelphij * Note: We directly notify the prefetch code of this read, so that 458287702Sdelphij * we can tell it about the multi-block read. dbuf_read() only knows 459287702Sdelphij * about the one block it is accessing. 
460287702Sdelphij */ 461287702Sdelphij dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | 462287702Sdelphij DB_RF_NOPREFETCH; 463168404Spjd 464168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 465168404Spjd if (dn->dn_datablkshift) { 466168404Spjd int blkshift = dn->dn_datablkshift; 467287702Sdelphij nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - 468287702Sdelphij P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; 469168404Spjd } else { 470168404Spjd if (offset + length > dn->dn_datablksz) { 471168404Spjd zfs_panic_recover("zfs: accessing past end of object " 472168404Spjd "%llx/%llx (size=%u access=%llu+%llu)", 473168404Spjd (longlong_t)dn->dn_objset-> 474168404Spjd os_dsl_dataset->ds_object, 475168404Spjd (longlong_t)dn->dn_object, dn->dn_datablksz, 476168404Spjd (longlong_t)offset, (longlong_t)length); 477214378Smm rw_exit(&dn->dn_struct_rwlock); 478249195Smm return (SET_ERROR(EIO)); 479168404Spjd } 480168404Spjd nblks = 1; 481168404Spjd } 482168404Spjd dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); 483168404Spjd 484297633Strasz#if defined(_KERNEL) && defined(RACCT) 485297633Strasz if (racct_enable && !read) { 486297633Strasz PROC_LOCK(curproc); 487297633Strasz racct_add_force(curproc, RACCT_WRITEBPS, length); 488297633Strasz racct_add_force(curproc, RACCT_WRITEIOPS, nblks); 489297633Strasz PROC_UNLOCK(curproc); 490297633Strasz } 491297633Strasz#endif 492297633Strasz 493185029Spjd zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); 494286705Smav blkid = dbuf_whichblock(dn, 0, offset); 495168404Spjd for (i = 0; i < nblks; i++) { 496287702Sdelphij dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); 497168404Spjd if (db == NULL) { 498168404Spjd rw_exit(&dn->dn_struct_rwlock); 499168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 500168404Spjd zio_nowait(zio); 501249195Smm return (SET_ERROR(EIO)); 502168404Spjd } 503287702Sdelphij 504168404Spjd /* initiate async i/o */ 505226620Spjd if (read) 506209962Smm (void) dbuf_read(db, zio, dbuf_flags); 507226620Spjd#ifdef _KERNEL 508226620Spjd else 509226620Spjd curthread->td_ru.ru_oublock++; 510226620Spjd#endif 511168404Spjd dbp[i] = &db->db; 512168404Spjd } 513287702Sdelphij 514297832Smav if ((flags & DMU_READ_NO_PREFETCH) == 0 && 515297832Smav DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { 516297832Smav dmu_zfetch(&dn->dn_zfetch, blkid, nblks, 517297832Smav read && DNODE_IS_CACHEABLE(dn)); 518287702Sdelphij } 519168404Spjd rw_exit(&dn->dn_struct_rwlock); 520168404Spjd 521168404Spjd /* wait for async i/o */ 522168404Spjd err = zio_wait(zio); 523168404Spjd if (err) { 524168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 525168404Spjd return (err); 526168404Spjd } 527168404Spjd 528168404Spjd /* wait for other io to complete */ 529168404Spjd if (read) { 530168404Spjd for (i = 0; i < nblks; i++) { 531168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; 532168404Spjd mutex_enter(&db->db_mtx); 533168404Spjd while (db->db_state == DB_READ || 534168404Spjd db->db_state == DB_FILL) 535168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 536168404Spjd if (db->db_state == DB_UNCACHED) 537249195Smm err = SET_ERROR(EIO); 538168404Spjd mutex_exit(&db->db_mtx); 539168404Spjd if (err) { 540168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 541168404Spjd return (err); 542168404Spjd } 543168404Spjd } 544168404Spjd } 545168404Spjd 546168404Spjd *numbufsp = nblks; 547168404Spjd *dbpp = dbp; 548168404Spjd return (0); 549168404Spjd} 550168404Spjd 551168404Spjdstatic int 
552168404Spjddmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
553168404Spjd    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
554168404Spjd{
555168404Spjd	dnode_t *dn;
556168404Spjd	int err;
557168404Spjd
558219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
559168404Spjd	if (err)
560168404Spjd		return (err);
561168404Spjd
562168404Spjd	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
563209962Smm	    numbufsp, dbpp, DMU_READ_PREFETCH);
564168404Spjd
565168404Spjd	dnode_rele(dn, FTAG);
566168404Spjd
567168404Spjd	return (err);
568168404Spjd}
569168404Spjd
570168404Spjdint
571219089Spjddmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
572287702Sdelphij    uint64_t length, boolean_t read, void *tag, int *numbufsp,
573287702Sdelphij    dmu_buf_t ***dbpp)
574168404Spjd{
575219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
576219089Spjd	dnode_t *dn;
577168404Spjd	int err;
578168404Spjd
579219089Spjd	DB_DNODE_ENTER(db);
580219089Spjd	dn = DB_DNODE(db);
581168404Spjd	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
582209962Smm	    numbufsp, dbpp, DMU_READ_PREFETCH);
583219089Spjd	DB_DNODE_EXIT(db);
584168404Spjd
585168404Spjd	return (err);
586168404Spjd}
587168404Spjd
588168404Spjdvoid
589168404Spjddmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
590168404Spjd{
591168404Spjd	int i;
592168404Spjd	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
593168404Spjd
594168404Spjd	if (numbufs == 0)
595168404Spjd		return;
596168404Spjd
597168404Spjd	for (i = 0; i < numbufs; i++) {
598168404Spjd		if (dbp[i])
599168404Spjd			dbuf_rele(dbp[i], tag);
600168404Spjd	}
601168404Spjd
602168404Spjd	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
603168404Spjd}
604168404Spjd
605258632Savg/*
606286705Smav * Issue prefetch i/os for the given blocks. If level is greater than 0, the
607286705Smav * indirect blocks prefetched will be those that point to the blocks containing
608286705Smav * the data starting at offset, and continuing to offset + len.
609258632Savg *
610286705Smav * Note that if the indirect blocks above the blocks being prefetched are not in
611286705Smav * cache, they will be asynchronously read in.
612258632Savg */
613168404Spjdvoid
614286705Smavdmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
615286705Smav    uint64_t len, zio_priority_t pri)
616168404Spjd{
617168404Spjd	dnode_t *dn;
618168404Spjd	uint64_t blkid;
619258632Savg	int nblks, err;
620168404Spjd
621168404Spjd	if (len == 0) {  /* they're interested in the bonus buffer */
622219089Spjd		dn = DMU_META_DNODE(os);
623168404Spjd
624168404Spjd		if (object == 0 || object >= DN_MAX_OBJECT)
625168404Spjd			return;
626168404Spjd
627168404Spjd		rw_enter(&dn->dn_struct_rwlock, RW_READER);
628286705Smav		blkid = dbuf_whichblock(dn, level,
629286705Smav		    object * sizeof (dnode_phys_t));
630286705Smav		dbuf_prefetch(dn, level, blkid, pri, 0);
631168404Spjd		rw_exit(&dn->dn_struct_rwlock);
632168404Spjd		return;
633168404Spjd	}
634168404Spjd
635168404Spjd	/*
636168404Spjd	 * XXX - Note, if the dnode for the requested object is not
637168404Spjd	 * already cached, we will do a *synchronous* read in the
638168404Spjd	 * dnode_hold() call. The same is true for any indirects.
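	 *
	 * A typical call (illustrative) prefetches level-0 data blocks
	 * ahead of a sequential reader:
	 *
	 *	dmu_prefetch(os, object, 0, offset, len,
	 *	    ZIO_PRIORITY_ASYNC_READ);
	 *
	 * Passing a level greater than 0 instead prefetches the indirect
	 * blocks covering the same byte range.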
639168404Spjd	 */
640219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
641168404Spjd	if (err != 0)
642168404Spjd		return;
643168404Spjd
644168404Spjd	rw_enter(&dn->dn_struct_rwlock, RW_READER);
645286705Smav	/*
646286705Smav	 * offset + len - 1 is the last byte we want to prefetch for, and offset
647286705Smav	 * is the first. Then dbuf_whichblock(dn, level, off + len - 1) is the
648286705Smav	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
649286705Smav	 * offset) is the first. Then the number we need to prefetch is the
650286705Smav	 * last - first + 1.
651286705Smav	 */
652286705Smav	if (level > 0 || dn->dn_datablkshift != 0) {
653286705Smav		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
654286705Smav		    dbuf_whichblock(dn, level, offset) + 1;
655168404Spjd	} else {
656168404Spjd		nblks = (offset < dn->dn_datablksz);
657168404Spjd	}
658168404Spjd
659168404Spjd	if (nblks != 0) {
660286705Smav		blkid = dbuf_whichblock(dn, level, offset);
661258632Savg		for (int i = 0; i < nblks; i++)
662286705Smav			dbuf_prefetch(dn, level, blkid + i, pri, 0);
663168404Spjd	}
664168404Spjd
665168404Spjd	rw_exit(&dn->dn_struct_rwlock);
666168404Spjd
667168404Spjd	dnode_rele(dn, FTAG);
668168404Spjd}
669168404Spjd
670208775Smm/*
671208775Smm * Get the next "chunk" of file data to free. We traverse the file from
672208775Smm * the end so that the file gets shorter over time (if we crash in the
673208775Smm * middle, this will leave us in a better state). We find allocated file
674208775Smm * data by simply searching the allocated level 1 indirects.
675254753Sdelphij *
676254753Sdelphij * On input, *start should be the first offset that does not need to be
677254753Sdelphij * freed (e.g. "offset + length"). On return, *start will be the first
678254753Sdelphij * offset that should be freed.
679208775Smm */
680185029Spjdstatic int
681254753Sdelphijget_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
682185029Spjd{
683254753Sdelphij	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
684254753Sdelphij	/* bytes of data covered by a level-1 indirect block */
685208775Smm	uint64_t iblkrange =
686185029Spjd	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
687185029Spjd
688254753Sdelphij	ASSERT3U(minimum, <=, *start);
689185029Spjd
690254753Sdelphij	if (*start - minimum <= iblkrange * maxblks) {
691254753Sdelphij		*start = minimum;
692185029Spjd		return (0);
693185029Spjd	}
694208775Smm	ASSERT(ISP2(iblkrange));
695185029Spjd
696254753Sdelphij	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
697185029Spjd		int err;
698185029Spjd
699254753Sdelphij		/*
700254753Sdelphij		 * dnode_next_offset(BACKWARDS) will find an allocated L1
701254753Sdelphij		 * indirect block at or before the input offset. We must
702254753Sdelphij		 * decrement *start so that it is at the end of the region
703254753Sdelphij		 * to search.
704254753Sdelphij */ 705254753Sdelphij (*start)--; 706185029Spjd err = dnode_next_offset(dn, 707208775Smm DNODE_FIND_BACKWARDS, start, 2, 1, 0); 708185029Spjd 709254753Sdelphij /* if there are no indirect blocks before start, we are done */ 710208775Smm if (err == ESRCH) { 711254753Sdelphij *start = minimum; 712254753Sdelphij break; 713254753Sdelphij } else if (err != 0) { 714208775Smm return (err); 715185029Spjd } 716185029Spjd 717254753Sdelphij /* set start to the beginning of this L1 indirect */ 718208775Smm *start = P2ALIGN(*start, iblkrange); 719185029Spjd } 720254753Sdelphij if (*start < minimum) 721254753Sdelphij *start = minimum; 722185029Spjd return (0); 723185029Spjd} 724185029Spjd 725185029Spjdstatic int 726185029Spjddmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, 727254753Sdelphij uint64_t length) 728185029Spjd{ 729254753Sdelphij uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; 730254753Sdelphij int err; 731321523Smav uint64_t dirty_frees_threshold; 732321523Smav dsl_pool_t *dp = dmu_objset_pool(os); 733185029Spjd 734254753Sdelphij if (offset >= object_size) 735185029Spjd return (0); 736185029Spjd 737321523Smav if (zfs_per_txg_dirty_frees_percent <= 100) 738321523Smav dirty_frees_threshold = 739321523Smav zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; 740321523Smav else 741321523Smav dirty_frees_threshold = zfs_dirty_data_max / 4; 742321523Smav 743254753Sdelphij if (length == DMU_OBJECT_END || offset + length > object_size) 744254753Sdelphij length = object_size - offset; 745254753Sdelphij 746254753Sdelphij while (length != 0) { 747321523Smav uint64_t chunk_end, chunk_begin, chunk_len; 748321523Smav uint64_t long_free_dirty_all_txgs = 0; 749321523Smav dmu_tx_t *tx; 750254753Sdelphij 751254753Sdelphij chunk_end = chunk_begin = offset + length; 752254753Sdelphij 753254753Sdelphij /* move chunk_begin backwards to the beginning of this chunk */ 754254753Sdelphij err = get_next_chunk(dn, &chunk_begin, offset); 755185029Spjd if (err) 756185029Spjd return (err); 757254753Sdelphij ASSERT3U(chunk_begin, >=, offset); 758254753Sdelphij ASSERT3U(chunk_begin, <=, chunk_end); 759185029Spjd 760321523Smav chunk_len = chunk_end - chunk_begin; 761268464Sdelphij 762321523Smav mutex_enter(&dp->dp_lock); 763321523Smav for (int t = 0; t < TXG_SIZE; t++) { 764321523Smav long_free_dirty_all_txgs += 765321523Smav dp->dp_long_free_dirty_pertxg[t]; 766321523Smav } 767321523Smav mutex_exit(&dp->dp_lock); 768321523Smav 769268464Sdelphij /* 770321523Smav * To avoid filling up a TXG with just frees wait for 771321523Smav * the next TXG to open before freeing more chunks if 772321523Smav * we have reached the threshold of frees 773321523Smav */ 774321523Smav if (dirty_frees_threshold != 0 && 775321523Smav long_free_dirty_all_txgs >= dirty_frees_threshold) { 776321523Smav txg_wait_open(dp, 0); 777321523Smav continue; 778321523Smav } 779321523Smav 780321523Smav tx = dmu_tx_create(os); 781321523Smav dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); 782321523Smav 783321523Smav /* 784268464Sdelphij * Mark this transaction as typically resulting in a net 785268464Sdelphij * reduction in space used. 
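		 *
		 * (Marking the transaction net-free is what lets the
		 * dmu_tx_assign() below proceed even when the pool or a
		 * refquota is nearly exhausted, since the transaction is
		 * expected to free more space than it consumes.)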
786268464Sdelphij */ 787268464Sdelphij dmu_tx_mark_netfree(tx); 788185029Spjd err = dmu_tx_assign(tx, TXG_WAIT); 789185029Spjd if (err) { 790185029Spjd dmu_tx_abort(tx); 791185029Spjd return (err); 792185029Spjd } 793321523Smav 794321523Smav mutex_enter(&dp->dp_lock); 795321523Smav dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += 796321523Smav chunk_len; 797321523Smav mutex_exit(&dp->dp_lock); 798321523Smav DTRACE_PROBE3(free__long__range, 799321523Smav uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, 800321523Smav uint64_t, dmu_tx_get_txg(tx)); 801321523Smav dnode_free_range(dn, chunk_begin, chunk_len, tx); 802254753Sdelphij dmu_tx_commit(tx); 803185029Spjd 804321523Smav length -= chunk_len; 805185029Spjd } 806185029Spjd return (0); 807185029Spjd} 808185029Spjd 809168404Spjdint 810185029Spjddmu_free_long_range(objset_t *os, uint64_t object, 811185029Spjd uint64_t offset, uint64_t length) 812185029Spjd{ 813185029Spjd dnode_t *dn; 814185029Spjd int err; 815185029Spjd 816219089Spjd err = dnode_hold(os, object, FTAG, &dn); 817185029Spjd if (err != 0) 818185029Spjd return (err); 819254753Sdelphij err = dmu_free_long_range_impl(os, dn, offset, length); 820256259Savg 821256259Savg /* 822256259Savg * It is important to zero out the maxblkid when freeing the entire 823256259Savg * file, so that (a) subsequent calls to dmu_free_long_range_impl() 824256259Savg * will take the fast path, and (b) dnode_reallocate() can verify 825256259Savg * that the entire file has been freed. 826256259Savg */ 827260150Sdelphij if (err == 0 && offset == 0 && length == DMU_OBJECT_END) 828256259Savg dn->dn_maxblkid = 0; 829256259Savg 830185029Spjd dnode_rele(dn, FTAG); 831185029Spjd return (err); 832185029Spjd} 833185029Spjd 834185029Spjdint 835254753Sdelphijdmu_free_long_object(objset_t *os, uint64_t object) 836185029Spjd{ 837185029Spjd dmu_tx_t *tx; 838185029Spjd int err; 839185029Spjd 840254753Sdelphij err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); 841185029Spjd if (err != 0) 842185029Spjd return (err); 843254753Sdelphij 844254753Sdelphij tx = dmu_tx_create(os); 845254753Sdelphij dmu_tx_hold_bonus(tx, object); 846254753Sdelphij dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 847268464Sdelphij dmu_tx_mark_netfree(tx); 848254753Sdelphij err = dmu_tx_assign(tx, TXG_WAIT); 849254753Sdelphij if (err == 0) { 850254753Sdelphij err = dmu_object_free(os, object, tx); 851254753Sdelphij dmu_tx_commit(tx); 852185029Spjd } else { 853254753Sdelphij dmu_tx_abort(tx); 854185029Spjd } 855254753Sdelphij 856185029Spjd return (err); 857185029Spjd} 858185029Spjd 859185029Spjdint 860168404Spjddmu_free_range(objset_t *os, uint64_t object, uint64_t offset, 861168404Spjd uint64_t size, dmu_tx_t *tx) 862168404Spjd{ 863168404Spjd dnode_t *dn; 864219089Spjd int err = dnode_hold(os, object, FTAG, &dn); 865168404Spjd if (err) 866168404Spjd return (err); 867168404Spjd ASSERT(offset < UINT64_MAX); 868168404Spjd ASSERT(size == -1ULL || size <= UINT64_MAX - offset); 869168404Spjd dnode_free_range(dn, offset, size, tx); 870168404Spjd dnode_rele(dn, FTAG); 871168404Spjd return (0); 872168404Spjd} 873168404Spjd 874321549Smavstatic int 875321549Smavdmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, 876209962Smm void *buf, uint32_t flags) 877168404Spjd{ 878168404Spjd dmu_buf_t **dbp; 879321549Smav int numbufs, err = 0; 880168404Spjd 881168404Spjd /* 882168404Spjd * Deal with odd block sizes, where there can't be data past the first 883168404Spjd * block. 
If we ever do the tail block optimization, we will need to 884168404Spjd * handle that here as well. 885168404Spjd */ 886214378Smm if (dn->dn_maxblkid == 0) { 887168404Spjd int newsz = offset > dn->dn_datablksz ? 0 : 888168404Spjd MIN(size, dn->dn_datablksz - offset); 889168404Spjd bzero((char *)buf + newsz, size - newsz); 890168404Spjd size = newsz; 891168404Spjd } 892168404Spjd 893168404Spjd while (size > 0) { 894168404Spjd uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); 895214378Smm int i; 896168404Spjd 897168404Spjd /* 898168404Spjd * NB: we could do this block-at-a-time, but it's nice 899168404Spjd * to be reading in parallel. 900168404Spjd */ 901168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, 902209962Smm TRUE, FTAG, &numbufs, &dbp, flags); 903168404Spjd if (err) 904185029Spjd break; 905168404Spjd 906168404Spjd for (i = 0; i < numbufs; i++) { 907168404Spjd int tocpy; 908168404Spjd int bufoff; 909168404Spjd dmu_buf_t *db = dbp[i]; 910168404Spjd 911168404Spjd ASSERT(size > 0); 912168404Spjd 913168404Spjd bufoff = offset - db->db_offset; 914168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 915168404Spjd 916168404Spjd bcopy((char *)db->db_data + bufoff, buf, tocpy); 917168404Spjd 918168404Spjd offset += tocpy; 919168404Spjd size -= tocpy; 920168404Spjd buf = (char *)buf + tocpy; 921168404Spjd } 922168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 923168404Spjd } 924321549Smav return (err); 925321549Smav} 926321549Smav 927321549Smavint 928321549Smavdmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 929321549Smav void *buf, uint32_t flags) 930321549Smav{ 931321549Smav dnode_t *dn; 932321549Smav int err; 933321549Smav 934321549Smav err = dnode_hold(os, object, FTAG, &dn); 935321549Smav if (err != 0) 936321549Smav return (err); 937321549Smav 938321549Smav err = dmu_read_impl(dn, offset, size, buf, flags); 939168404Spjd dnode_rele(dn, FTAG); 940185029Spjd return (err); 941168404Spjd} 942168404Spjd 943321549Smavint 944321549Smavdmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, 945321549Smav uint32_t flags) 946321549Smav{ 947321549Smav return (dmu_read_impl(dn, offset, size, buf, flags)); 948321549Smav} 949321549Smav 950321549Smavstatic void 951321549Smavdmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, 952168404Spjd const void *buf, dmu_tx_t *tx) 953168404Spjd{ 954321549Smav int i; 955168404Spjd 956168404Spjd for (i = 0; i < numbufs; i++) { 957168404Spjd int tocpy; 958168404Spjd int bufoff; 959168404Spjd dmu_buf_t *db = dbp[i]; 960168404Spjd 961168404Spjd ASSERT(size > 0); 962168404Spjd 963168404Spjd bufoff = offset - db->db_offset; 964168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 965168404Spjd 966168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 967168404Spjd 968168404Spjd if (tocpy == db->db_size) 969168404Spjd dmu_buf_will_fill(db, tx); 970168404Spjd else 971168404Spjd dmu_buf_will_dirty(db, tx); 972168404Spjd 973168404Spjd bcopy(buf, (char *)db->db_data + bufoff, tocpy); 974168404Spjd 975168404Spjd if (tocpy == db->db_size) 976168404Spjd dmu_buf_fill_done(db, tx); 977168404Spjd 978168404Spjd offset += tocpy; 979168404Spjd size -= tocpy; 980168404Spjd buf = (char *)buf + tocpy; 981168404Spjd } 982321549Smav} 983321549Smav 984321549Smavvoid 985321549Smavdmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 986321549Smav const void *buf, dmu_tx_t *tx) 987321549Smav{ 988321549Smav dmu_buf_t **dbp; 989321549Smav int numbufs; 990321549Smav 991321549Smav 
if (size == 0) 992321549Smav return; 993321549Smav 994321549Smav VERIFY0(dmu_buf_hold_array(os, object, offset, size, 995321549Smav FALSE, FTAG, &numbufs, &dbp)); 996321549Smav dmu_write_impl(dbp, numbufs, offset, size, buf, tx); 997168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 998168404Spjd} 999168404Spjd 1000219089Spjdvoid 1001321549Smavdmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, 1002321549Smav const void *buf, dmu_tx_t *tx) 1003321549Smav{ 1004321549Smav dmu_buf_t **dbp; 1005321549Smav int numbufs; 1006321549Smav 1007321549Smav if (size == 0) 1008321549Smav return; 1009321549Smav 1010321549Smav VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, 1011321549Smav FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); 1012321549Smav dmu_write_impl(dbp, numbufs, offset, size, buf, tx); 1013321549Smav dmu_buf_rele_array(dbp, numbufs, FTAG); 1014321549Smav} 1015321549Smav 1016321549Smavvoid 1017219089Spjddmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1018219089Spjd dmu_tx_t *tx) 1019219089Spjd{ 1020219089Spjd dmu_buf_t **dbp; 1021219089Spjd int numbufs, i; 1022219089Spjd 1023219089Spjd if (size == 0) 1024219089Spjd return; 1025219089Spjd 1026219089Spjd VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 1027219089Spjd FALSE, FTAG, &numbufs, &dbp)); 1028219089Spjd 1029219089Spjd for (i = 0; i < numbufs; i++) { 1030219089Spjd dmu_buf_t *db = dbp[i]; 1031219089Spjd 1032219089Spjd dmu_buf_will_not_fill(db, tx); 1033219089Spjd } 1034219089Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1035219089Spjd} 1036219089Spjd 1037268075Sdelphijvoid 1038268075Sdelphijdmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, 1039268075Sdelphij void *data, uint8_t etype, uint8_t comp, int uncompressed_size, 1040268075Sdelphij int compressed_size, int byteorder, dmu_tx_t *tx) 1041268075Sdelphij{ 1042268075Sdelphij dmu_buf_t *db; 1043268075Sdelphij 1044268075Sdelphij ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); 1045268075Sdelphij ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); 1046268075Sdelphij VERIFY0(dmu_buf_hold_noread(os, object, offset, 1047268075Sdelphij FTAG, &db)); 1048268075Sdelphij 1049268075Sdelphij dmu_buf_write_embedded(db, 1050268075Sdelphij data, (bp_embedded_type_t)etype, (enum zio_compress)comp, 1051268075Sdelphij uncompressed_size, compressed_size, byteorder, tx); 1052268075Sdelphij 1053268075Sdelphij dmu_buf_rele(db, FTAG); 1054268075Sdelphij} 1055268075Sdelphij 1056219089Spjd/* 1057219089Spjd * DMU support for xuio 1058219089Spjd */ 1059219089Spjdkstat_t *xuio_ksp = NULL; 1060219089Spjd 1061219089Spjdint 1062219089Spjddmu_xuio_init(xuio_t *xuio, int nblk) 1063219089Spjd{ 1064219089Spjd dmu_xuio_t *priv; 1065219089Spjd uio_t *uio = &xuio->xu_uio; 1066219089Spjd 1067219089Spjd uio->uio_iovcnt = nblk; 1068219089Spjd uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); 1069219089Spjd 1070219089Spjd priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); 1071219089Spjd priv->cnt = nblk; 1072219089Spjd priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); 1073219089Spjd priv->iovp = uio->uio_iov; 1074219089Spjd XUIO_XUZC_PRIV(xuio) = priv; 1075219089Spjd 1076219089Spjd if (XUIO_XUZC_RW(xuio) == UIO_READ) 1077219089Spjd XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); 1078219089Spjd else 1079219089Spjd XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); 1080219089Spjd 1081219089Spjd return (0); 1082219089Spjd} 1083219089Spjd 1084219089Spjdvoid 1085219089Spjddmu_xuio_fini(xuio_t *xuio) 1086219089Spjd{ 1087219089Spjd dmu_xuio_t *priv = 
XUIO_XUZC_PRIV(xuio); 1088219089Spjd int nblk = priv->cnt; 1089219089Spjd 1090219089Spjd kmem_free(priv->iovp, nblk * sizeof (iovec_t)); 1091219089Spjd kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); 1092219089Spjd kmem_free(priv, sizeof (dmu_xuio_t)); 1093219089Spjd 1094219089Spjd if (XUIO_XUZC_RW(xuio) == UIO_READ) 1095219089Spjd XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); 1096219089Spjd else 1097219089Spjd XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); 1098219089Spjd} 1099219089Spjd 1100219089Spjd/* 1101219089Spjd * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } 1102219089Spjd * and increase priv->next by 1. 1103219089Spjd */ 1104219089Spjdint 1105219089Spjddmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) 1106219089Spjd{ 1107219089Spjd struct iovec *iov; 1108219089Spjd uio_t *uio = &xuio->xu_uio; 1109219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 1110219089Spjd int i = priv->next++; 1111219089Spjd 1112219089Spjd ASSERT(i < priv->cnt); 1113321535Smav ASSERT(off + n <= arc_buf_lsize(abuf)); 1114219089Spjd iov = uio->uio_iov + i; 1115219089Spjd iov->iov_base = (char *)abuf->b_data + off; 1116219089Spjd iov->iov_len = n; 1117219089Spjd priv->bufs[i] = abuf; 1118219089Spjd return (0); 1119219089Spjd} 1120219089Spjd 1121219089Spjdint 1122219089Spjddmu_xuio_cnt(xuio_t *xuio) 1123219089Spjd{ 1124219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 1125219089Spjd return (priv->cnt); 1126219089Spjd} 1127219089Spjd 1128219089Spjdarc_buf_t * 1129219089Spjddmu_xuio_arcbuf(xuio_t *xuio, int i) 1130219089Spjd{ 1131219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 1132219089Spjd 1133219089Spjd ASSERT(i < priv->cnt); 1134219089Spjd return (priv->bufs[i]); 1135219089Spjd} 1136219089Spjd 1137219089Spjdvoid 1138219089Spjddmu_xuio_clear(xuio_t *xuio, int i) 1139219089Spjd{ 1140219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 1141219089Spjd 1142219089Spjd ASSERT(i < priv->cnt); 1143219089Spjd priv->bufs[i] = NULL; 1144219089Spjd} 1145219089Spjd 1146219089Spjdstatic void 1147219089Spjdxuio_stat_init(void) 1148219089Spjd{ 1149219089Spjd xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", 1150219089Spjd KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), 1151219089Spjd KSTAT_FLAG_VIRTUAL); 1152219089Spjd if (xuio_ksp != NULL) { 1153219089Spjd xuio_ksp->ks_data = &xuio_stats; 1154219089Spjd kstat_install(xuio_ksp); 1155219089Spjd } 1156219089Spjd} 1157219089Spjd 1158219089Spjdstatic void 1159219089Spjdxuio_stat_fini(void) 1160219089Spjd{ 1161219089Spjd if (xuio_ksp != NULL) { 1162219089Spjd kstat_delete(xuio_ksp); 1163219089Spjd xuio_ksp = NULL; 1164219089Spjd } 1165219089Spjd} 1166219089Spjd 1167219089Spjdvoid 1168321530Smavxuio_stat_wbuf_copied(void) 1169219089Spjd{ 1170219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_copied); 1171219089Spjd} 1172219089Spjd 1173219089Spjdvoid 1174321530Smavxuio_stat_wbuf_nocopy(void) 1175219089Spjd{ 1176219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); 1177219089Spjd} 1178219089Spjd 1179168404Spjd#ifdef _KERNEL 1180272809Sdelphijstatic int 1181272809Sdelphijdmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) 1182168404Spjd{ 1183168404Spjd dmu_buf_t **dbp; 1184168404Spjd int numbufs, i, err; 1185219089Spjd xuio_t *xuio = NULL; 1186168404Spjd 1187168404Spjd /* 1188168404Spjd * NB: we could do this block-at-a-time, but it's nice 1189168404Spjd * to be reading in parallel. 
1190168404Spjd */ 1191272809Sdelphij err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, 1192272809Sdelphij TRUE, FTAG, &numbufs, &dbp, 0); 1193168404Spjd if (err) 1194168404Spjd return (err); 1195168404Spjd 1196219089Spjd#ifdef UIO_XUIO 1197219089Spjd if (uio->uio_extflg == UIO_XUIO) 1198219089Spjd xuio = (xuio_t *)uio; 1199219089Spjd#endif 1200219089Spjd 1201168404Spjd for (i = 0; i < numbufs; i++) { 1202168404Spjd int tocpy; 1203168404Spjd int bufoff; 1204168404Spjd dmu_buf_t *db = dbp[i]; 1205168404Spjd 1206168404Spjd ASSERT(size > 0); 1207168404Spjd 1208168404Spjd bufoff = uio->uio_loffset - db->db_offset; 1209168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1210168404Spjd 1211219089Spjd if (xuio) { 1212219089Spjd dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 1213219089Spjd arc_buf_t *dbuf_abuf = dbi->db_buf; 1214219089Spjd arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); 1215219089Spjd err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); 1216219089Spjd if (!err) { 1217219089Spjd uio->uio_resid -= tocpy; 1218219089Spjd uio->uio_loffset += tocpy; 1219219089Spjd } 1220219089Spjd 1221219089Spjd if (abuf == dbuf_abuf) 1222219089Spjd XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); 1223219089Spjd else 1224219089Spjd XUIOSTAT_BUMP(xuiostat_rbuf_copied); 1225219089Spjd } else { 1226298105Savg#ifdef illumos 1227219089Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 1228219089Spjd UIO_READ, uio); 1229298105Savg#else 1230298105Savg err = vn_io_fault_uiomove((char *)db->db_data + bufoff, 1231298105Savg tocpy, uio); 1232298105Savg#endif 1233219089Spjd } 1234168404Spjd if (err) 1235168404Spjd break; 1236168404Spjd 1237168404Spjd size -= tocpy; 1238168404Spjd } 1239168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1240168404Spjd 1241168404Spjd return (err); 1242168404Spjd} 1243168404Spjd 1244272809Sdelphij/* 1245272809Sdelphij * Read 'size' bytes into the uio buffer. 1246272809Sdelphij * From object zdb->db_object. 1247272809Sdelphij * Starting at offset uio->uio_loffset. 1248272809Sdelphij * 1249272809Sdelphij * If the caller already has a dbuf in the target object 1250272809Sdelphij * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), 1251272809Sdelphij * because we don't have to find the dnode_t for the object. 1252272809Sdelphij */ 1253272809Sdelphijint 1254272809Sdelphijdmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) 1255272809Sdelphij{ 1256272809Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; 1257272809Sdelphij dnode_t *dn; 1258272809Sdelphij int err; 1259272809Sdelphij 1260272809Sdelphij if (size == 0) 1261272809Sdelphij return (0); 1262272809Sdelphij 1263272809Sdelphij DB_DNODE_ENTER(db); 1264272809Sdelphij dn = DB_DNODE(db); 1265272809Sdelphij err = dmu_read_uio_dnode(dn, uio, size); 1266272809Sdelphij DB_DNODE_EXIT(db); 1267272809Sdelphij 1268272809Sdelphij return (err); 1269272809Sdelphij} 1270272809Sdelphij 1271272809Sdelphij/* 1272272809Sdelphij * Read 'size' bytes into the uio buffer. 1273272809Sdelphij * From the specified object 1274272809Sdelphij * Starting at offset uio->uio_loffset. 
1275272809Sdelphij */ 1276272809Sdelphijint 1277272809Sdelphijdmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) 1278272809Sdelphij{ 1279272809Sdelphij dnode_t *dn; 1280272809Sdelphij int err; 1281272809Sdelphij 1282272809Sdelphij if (size == 0) 1283272809Sdelphij return (0); 1284272809Sdelphij 1285272809Sdelphij err = dnode_hold(os, object, FTAG, &dn); 1286272809Sdelphij if (err) 1287272809Sdelphij return (err); 1288272809Sdelphij 1289272809Sdelphij err = dmu_read_uio_dnode(dn, uio, size); 1290272809Sdelphij 1291272809Sdelphij dnode_rele(dn, FTAG); 1292272809Sdelphij 1293272809Sdelphij return (err); 1294272809Sdelphij} 1295272809Sdelphij 1296219089Spjdstatic int 1297219089Spjddmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) 1298168404Spjd{ 1299168404Spjd dmu_buf_t **dbp; 1300219089Spjd int numbufs; 1301168404Spjd int err = 0; 1302219089Spjd int i; 1303168404Spjd 1304219089Spjd err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, 1305219089Spjd FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); 1306168404Spjd if (err) 1307168404Spjd return (err); 1308168404Spjd 1309168404Spjd for (i = 0; i < numbufs; i++) { 1310168404Spjd int tocpy; 1311168404Spjd int bufoff; 1312168404Spjd dmu_buf_t *db = dbp[i]; 1313168404Spjd 1314168404Spjd ASSERT(size > 0); 1315168404Spjd 1316168404Spjd bufoff = uio->uio_loffset - db->db_offset; 1317168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1318168404Spjd 1319168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1320168404Spjd 1321168404Spjd if (tocpy == db->db_size) 1322168404Spjd dmu_buf_will_fill(db, tx); 1323168404Spjd else 1324168404Spjd dmu_buf_will_dirty(db, tx); 1325168404Spjd 1326298105Savg#ifdef illumos 1327168404Spjd /* 1328168404Spjd * XXX uiomove could block forever (eg. nfs-backed 1329168404Spjd * pages). There needs to be a uiolockdown() function 1330168404Spjd * to lock the pages in memory, so that uiomove won't 1331168404Spjd * block. 1332168404Spjd */ 1333168404Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 1334168404Spjd UIO_WRITE, uio); 1335298105Savg#else 1336298105Savg err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, 1337298105Savg uio); 1338298105Savg#endif 1339168404Spjd 1340168404Spjd if (tocpy == db->db_size) 1341168404Spjd dmu_buf_fill_done(db, tx); 1342168404Spjd 1343168404Spjd if (err) 1344168404Spjd break; 1345168404Spjd 1346168404Spjd size -= tocpy; 1347168404Spjd } 1348219089Spjd 1349168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1350168404Spjd return (err); 1351168404Spjd} 1352168404Spjd 1353272809Sdelphij/* 1354272809Sdelphij * Write 'size' bytes from the uio buffer. 1355272809Sdelphij * To object zdb->db_object. 1356272809Sdelphij * Starting at offset uio->uio_loffset. 1357272809Sdelphij * 1358272809Sdelphij * If the caller already has a dbuf in the target object 1359272809Sdelphij * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), 1360272809Sdelphij * because we don't have to find the dnode_t for the object. 
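 *
 * For instance (illustrative), zfs_write() passes the znode's SA dbuf
 * instead of an <objset, object> pair:
 *
 *	error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
 *	    nbytes, tx);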
1361272809Sdelphij */ 1362168404Spjdint 1363219089Spjddmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, 1364219089Spjd dmu_tx_t *tx) 1365219089Spjd{ 1366219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; 1367219089Spjd dnode_t *dn; 1368219089Spjd int err; 1369219089Spjd 1370219089Spjd if (size == 0) 1371219089Spjd return (0); 1372219089Spjd 1373219089Spjd DB_DNODE_ENTER(db); 1374219089Spjd dn = DB_DNODE(db); 1375219089Spjd err = dmu_write_uio_dnode(dn, uio, size, tx); 1376219089Spjd DB_DNODE_EXIT(db); 1377219089Spjd 1378219089Spjd return (err); 1379219089Spjd} 1380219089Spjd 1381272809Sdelphij/* 1382272809Sdelphij * Write 'size' bytes from the uio buffer. 1383272809Sdelphij * To the specified object. 1384272809Sdelphij * Starting at offset uio->uio_loffset. 1385272809Sdelphij */ 1386219089Spjdint 1387219089Spjddmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, 1388219089Spjd dmu_tx_t *tx) 1389219089Spjd{ 1390219089Spjd dnode_t *dn; 1391219089Spjd int err; 1392219089Spjd 1393219089Spjd if (size == 0) 1394219089Spjd return (0); 1395219089Spjd 1396219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1397219089Spjd if (err) 1398219089Spjd return (err); 1399219089Spjd 1400219089Spjd err = dmu_write_uio_dnode(dn, uio, size, tx); 1401219089Spjd 1402219089Spjd dnode_rele(dn, FTAG); 1403219089Spjd 1404219089Spjd return (err); 1405219089Spjd} 1406219089Spjd 1407277300Ssmh#ifdef illumos 1408219089Spjdint 1409168404Spjddmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1410168404Spjd page_t *pp, dmu_tx_t *tx) 1411168404Spjd{ 1412168404Spjd dmu_buf_t **dbp; 1413168404Spjd int numbufs, i; 1414168404Spjd int err; 1415168404Spjd 1416168404Spjd if (size == 0) 1417168404Spjd return (0); 1418168404Spjd 1419168404Spjd err = dmu_buf_hold_array(os, object, offset, size, 1420168404Spjd FALSE, FTAG, &numbufs, &dbp); 1421168404Spjd if (err) 1422168404Spjd return (err); 1423168404Spjd 1424168404Spjd for (i = 0; i < numbufs; i++) { 1425168404Spjd int tocpy, copied, thiscpy; 1426168404Spjd int bufoff; 1427168404Spjd dmu_buf_t *db = dbp[i]; 1428168404Spjd caddr_t va; 1429168404Spjd 1430168404Spjd ASSERT(size > 0); 1431168404Spjd ASSERT3U(db->db_size, >=, PAGESIZE); 1432168404Spjd 1433168404Spjd bufoff = offset - db->db_offset; 1434168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1435168404Spjd 1436168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1437168404Spjd 1438168404Spjd if (tocpy == db->db_size) 1439168404Spjd dmu_buf_will_fill(db, tx); 1440168404Spjd else 1441168404Spjd dmu_buf_will_dirty(db, tx); 1442168404Spjd 1443168404Spjd for (copied = 0; copied < tocpy; copied += PAGESIZE) { 1444168404Spjd ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); 1445168404Spjd thiscpy = MIN(PAGESIZE, tocpy - copied); 1446185029Spjd va = zfs_map_page(pp, S_READ); 1447168404Spjd bcopy(va, (char *)db->db_data + bufoff, thiscpy); 1448185029Spjd zfs_unmap_page(pp, va); 1449168404Spjd pp = pp->p_next; 1450168404Spjd bufoff += PAGESIZE; 1451168404Spjd } 1452168404Spjd 1453168404Spjd if (tocpy == db->db_size) 1454168404Spjd dmu_buf_fill_done(db, tx); 1455168404Spjd 1456168404Spjd offset += tocpy; 1457168404Spjd size -= tocpy; 1458168404Spjd } 1459168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1460168404Spjd return (err); 1461168404Spjd} 1462258745Savg 1463277300Ssmh#else /* !illumos */ 1464258745Savg 1465258745Savgint 1466258745Savgdmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1467258745Savg vm_page_t *ma, 
dmu_tx_t *tx) 1468258745Savg{ 1469258745Savg dmu_buf_t **dbp; 1470258745Savg struct sf_buf *sf; 1471258745Savg int numbufs, i; 1472258745Savg int err; 1473258745Savg 1474258745Savg if (size == 0) 1475258745Savg return (0); 1476258745Savg 1477258745Savg err = dmu_buf_hold_array(os, object, offset, size, 1478258745Savg FALSE, FTAG, &numbufs, &dbp); 1479258745Savg if (err) 1480258745Savg return (err); 1481258745Savg 1482258745Savg for (i = 0; i < numbufs; i++) { 1483258745Savg int tocpy, copied, thiscpy; 1484258745Savg int bufoff; 1485258745Savg dmu_buf_t *db = dbp[i]; 1486258745Savg caddr_t va; 1487258745Savg 1488258745Savg ASSERT(size > 0); 1489258745Savg ASSERT3U(db->db_size, >=, PAGESIZE); 1490258745Savg 1491258745Savg bufoff = offset - db->db_offset; 1492258745Savg tocpy = (int)MIN(db->db_size - bufoff, size); 1493258745Savg 1494258745Savg ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1495258745Savg 1496258745Savg if (tocpy == db->db_size) 1497258745Savg dmu_buf_will_fill(db, tx); 1498258745Savg else 1499258745Savg dmu_buf_will_dirty(db, tx); 1500258745Savg 1501258745Savg for (copied = 0; copied < tocpy; copied += PAGESIZE) { 1502258745Savg ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); 1503258745Savg thiscpy = MIN(PAGESIZE, tocpy - copied); 1504258745Savg va = zfs_map_page(*ma, &sf); 1505258745Savg bcopy(va, (char *)db->db_data + bufoff, thiscpy); 1506258745Savg zfs_unmap_page(sf); 1507258745Savg ma += 1; 1508258745Savg bufoff += PAGESIZE; 1509258745Savg } 1510258745Savg 1511258745Savg if (tocpy == db->db_size) 1512258745Savg dmu_buf_fill_done(db, tx); 1513258745Savg 1514258745Savg offset += tocpy; 1515258745Savg size -= tocpy; 1516258745Savg } 1517258745Savg dmu_buf_rele_array(dbp, numbufs, FTAG); 1518258745Savg return (err); 1519258745Savg} 1520277300Ssmh#endif /* illumos */ 1521277300Ssmh#endif /* _KERNEL */ 1522168404Spjd 1523209962Smm/* 1524209962Smm * Allocate a loaned anonymous arc buffer. 1525209962Smm */ 1526209962Smmarc_buf_t * 1527209962Smmdmu_request_arcbuf(dmu_buf_t *handle, int size) 1528209962Smm{ 1529219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; 1530209962Smm 1531321535Smav return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); 1532209962Smm} 1533209962Smm 1534209962Smm/* 1535209962Smm * Free a loaned arc buffer. 1536209962Smm */ 1537209962Smmvoid 1538209962Smmdmu_return_arcbuf(arc_buf_t *buf) 1539209962Smm{ 1540209962Smm arc_return_buf(buf, FTAG); 1541307265Smav arc_buf_destroy(buf, FTAG); 1542209962Smm} 1543209962Smm 1544209962Smm/* 1545209962Smm * When possible directly assign passed loaned arc buffer to a dbuf. 1546209962Smm * If this is not possible copy the contents of passed arc buf via 1547209962Smm * dmu_write(). 
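 *
 * The usual loaned-buffer pattern is (sketch, assuming a dbuf handle
 * for the target object, e.g. a bonus or SA dbuf):
 *
 *	arc_buf_t *abuf = dmu_request_arcbuf(db_handle, blksz);
 *	... fill abuf->b_data with the data to be written ...
 *	dmu_assign_arcbuf(db_handle, offset, abuf, tx);
 *
 * On the copy path dmu_assign_arcbuf() calls dmu_return_arcbuf()
 * itself, so the caller must not free the buffer after the call.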
1548209962Smm */ 1549209962Smmvoid 1550209962Smmdmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, 1551209962Smm dmu_tx_t *tx) 1552209962Smm{ 1553219089Spjd dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; 1554219089Spjd dnode_t *dn; 1555209962Smm dmu_buf_impl_t *db; 1556321535Smav uint32_t blksz = (uint32_t)arc_buf_lsize(buf); 1557209962Smm uint64_t blkid; 1558209962Smm 1559219089Spjd DB_DNODE_ENTER(dbuf); 1560219089Spjd dn = DB_DNODE(dbuf); 1561209962Smm rw_enter(&dn->dn_struct_rwlock, RW_READER); 1562286705Smav blkid = dbuf_whichblock(dn, 0, offset); 1563209962Smm VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); 1564209962Smm rw_exit(&dn->dn_struct_rwlock); 1565219089Spjd DB_DNODE_EXIT(dbuf); 1566209962Smm 1567272601Sdelphij /* 1568272601Sdelphij * We can only assign if the offset is aligned, the arc buf is the 1569321535Smav * same size as the dbuf, and the dbuf is not metadata. 1570272601Sdelphij */ 1571321535Smav if (offset == db->db.db_offset && blksz == db->db.db_size) { 1572294625Strasz#ifdef _KERNEL 1573294625Strasz curthread->td_ru.ru_oublock++; 1574297633Strasz#ifdef RACCT 1575297633Strasz if (racct_enable) { 1576297633Strasz PROC_LOCK(curproc); 1577297633Strasz racct_add_force(curproc, RACCT_WRITEBPS, blksz); 1578297633Strasz racct_add_force(curproc, RACCT_WRITEIOPS, 1); 1579297633Strasz PROC_UNLOCK(curproc); 1580297633Strasz } 1581297633Strasz#endif /* RACCT */ 1582297633Strasz#endif /* _KERNEL */ 1583209962Smm dbuf_assign_arcbuf(db, buf, tx); 1584209962Smm dbuf_rele(db, FTAG); 1585209962Smm } else { 1586219089Spjd objset_t *os; 1587219089Spjd uint64_t object; 1588219089Spjd 1589321535Smav /* compressed bufs must always be assignable to their dbuf */ 1590321535Smav ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); 1591321535Smav ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); 1592321535Smav 1593219089Spjd DB_DNODE_ENTER(dbuf); 1594219089Spjd dn = DB_DNODE(dbuf); 1595219089Spjd os = dn->dn_objset; 1596219089Spjd object = dn->dn_object; 1597219089Spjd DB_DNODE_EXIT(dbuf); 1598219089Spjd 1599209962Smm dbuf_rele(db, FTAG); 1600219089Spjd dmu_write(os, object, offset, blksz, buf->b_data, tx); 1601209962Smm dmu_return_arcbuf(buf); 1602219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_copied); 1603209962Smm } 1604209962Smm} 1605209962Smm 1606168404Spjdtypedef struct { 1607219089Spjd dbuf_dirty_record_t *dsa_dr; 1608219089Spjd dmu_sync_cb_t *dsa_done; 1609219089Spjd zgd_t *dsa_zgd; 1610219089Spjd dmu_tx_t *dsa_tx; 1611168404Spjd} dmu_sync_arg_t; 1612168404Spjd 1613168404Spjd/* ARGSUSED */ 1614168404Spjdstatic void 1615185029Spjddmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) 1616185029Spjd{ 1617219089Spjd dmu_sync_arg_t *dsa = varg; 1618219089Spjd dmu_buf_t *db = dsa->dsa_zgd->zgd_db; 1619185029Spjd blkptr_t *bp = zio->io_bp; 1620185029Spjd 1621219089Spjd if (zio->io_error == 0) { 1622219089Spjd if (BP_IS_HOLE(bp)) { 1623219089Spjd /* 1624219089Spjd * A block of zeros may compress to a hole, but the 1625219089Spjd * block size still needs to be known for replay. 
1626219089Spjd */ 1627219089Spjd BP_SET_LSIZE(bp, db->db_size); 1628268075Sdelphij } else if (!BP_IS_EMBEDDED(bp)) { 1629219089Spjd ASSERT(BP_GET_LEVEL(bp) == 0); 1630219089Spjd bp->blk_fill = 1; 1631219089Spjd } 1632185029Spjd } 1633185029Spjd} 1634185029Spjd 1635219089Spjdstatic void 1636219089Spjddmu_sync_late_arrival_ready(zio_t *zio) 1637219089Spjd{ 1638219089Spjd dmu_sync_ready(zio, NULL, zio->io_private); 1639219089Spjd} 1640219089Spjd 1641185029Spjd/* ARGSUSED */ 1642185029Spjdstatic void 1643168404Spjddmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) 1644168404Spjd{ 1645219089Spjd dmu_sync_arg_t *dsa = varg; 1646219089Spjd dbuf_dirty_record_t *dr = dsa->dsa_dr; 1647168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 1648168404Spjd 1649168404Spjd mutex_enter(&db->db_mtx); 1650168404Spjd ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); 1651219089Spjd if (zio->io_error == 0) { 1652243524Smm dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); 1653243524Smm if (dr->dt.dl.dr_nopwrite) { 1654243524Smm blkptr_t *bp = zio->io_bp; 1655243524Smm blkptr_t *bp_orig = &zio->io_bp_orig; 1656243524Smm uint8_t chksum = BP_GET_CHECKSUM(bp_orig); 1657243524Smm 1658243524Smm ASSERT(BP_EQUAL(bp, bp_orig)); 1659243524Smm ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); 1660289422Smav ASSERT(zio_checksum_table[chksum].ci_flags & 1661289422Smav ZCHECKSUM_FLAG_NOPWRITE); 1662243524Smm } 1663219089Spjd dr->dt.dl.dr_overridden_by = *zio->io_bp; 1664219089Spjd dr->dt.dl.dr_override_state = DR_OVERRIDDEN; 1665219089Spjd dr->dt.dl.dr_copies = zio->io_prop.zp_copies; 1666286677Smav 1667286677Smav /* 1668286677Smav * Old style holes are filled with all zeros, whereas 1669286677Smav * new-style holes maintain their lsize, type, level, 1670286677Smav * and birth time (see zio_write_compress). While we 1671286677Smav * need to reset the BP_SET_LSIZE() call that happened 1672286677Smav * in dmu_sync_ready for old style holes, we do *not* 1673286677Smav * want to wipe out the information contained in new 1674286677Smav * style holes. Thus, only zero out the block pointer if 1675286677Smav * it's an old style hole. 1676286677Smav */ 1677286677Smav if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && 1678286677Smav dr->dt.dl.dr_overridden_by.blk_birth == 0) 1679219089Spjd BP_ZERO(&dr->dt.dl.dr_overridden_by); 1680219089Spjd } else { 1681219089Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 1682219089Spjd } 1683168404Spjd cv_broadcast(&db->db_changed); 1684168404Spjd mutex_exit(&db->db_mtx); 1685168404Spjd 1686219089Spjd dsa->dsa_done(dsa->dsa_zgd, zio->io_error); 1687168404Spjd 1688219089Spjd kmem_free(dsa, sizeof (*dsa)); 1689168404Spjd} 1690168404Spjd 1691219089Spjdstatic void 1692219089Spjddmu_sync_late_arrival_done(zio_t *zio) 1693219089Spjd{ 1694219089Spjd blkptr_t *bp = zio->io_bp; 1695219089Spjd dmu_sync_arg_t *dsa = zio->io_private; 1696243524Smm blkptr_t *bp_orig = &zio->io_bp_orig; 1697219089Spjd 1698219089Spjd if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { 1699243524Smm /* 1700243524Smm * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) 1701243524Smm * then there is nothing to do here. Otherwise, free the 1702243524Smm * newly allocated block in this txg. 
1703243524Smm */
1704243524Smm if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
1705243524Smm ASSERT(BP_EQUAL(bp, bp_orig));
1706243524Smm } else {
1707243524Smm ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
1708243524Smm ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1709243524Smm ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1710243524Smm zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
1711243524Smm }
1712219089Spjd }
1713219089Spjd
1714219089Spjd dmu_tx_commit(dsa->dsa_tx);
1715219089Spjd
1716219089Spjd dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1717219089Spjd
1718219089Spjd kmem_free(dsa, sizeof (*dsa));
1719219089Spjd}
1720219089Spjd
1721219089Spjdstatic int
1722219089Spjddmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
1723268123Sdelphij zio_prop_t *zp, zbookmark_phys_t *zb)
1724219089Spjd{
1725219089Spjd dmu_sync_arg_t *dsa;
1726219089Spjd dmu_tx_t *tx;
1727219089Spjd
1728219089Spjd tx = dmu_tx_create(os);
1729219089Spjd dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1730219089Spjd if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1731219089Spjd dmu_tx_abort(tx);
1732249195Smm /* Make zl_get_data do txg_wait_synced() */
1733249195Smm return (SET_ERROR(EIO));
1734219089Spjd }
1735219089Spjd
1736219089Spjd dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1737219089Spjd dsa->dsa_dr = NULL;
1738219089Spjd dsa->dsa_done = done;
1739219089Spjd dsa->dsa_zgd = zgd;
1740219089Spjd dsa->dsa_tx = tx;
1741219089Spjd
1742321535Smav zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1743321535Smav zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size,
1744304138Savg zp, dmu_sync_late_arrival_ready, NULL,
1745304138Savg NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
1746304138Savg ZIO_FLAG_CANFAIL, zb));
1747219089Spjd
1748219089Spjd return (0);
1749219089Spjd}
1750219089Spjd
1751168404Spjd/*
1752168404Spjd * Intent log support: sync the block associated with db to disk.
1753168404Spjd * N.B. and XXX: the caller is responsible for making sure that the
1754168404Spjd * data isn't changing while dmu_sync() is writing it.
1755168404Spjd *
1756168404Spjd * Return values:
1757168404Spjd *
1758243524Smm * EEXIST: this txg has already been synced, so there's nothing to do.
1759168404Spjd * The caller should not log the write.
1760168404Spjd *
1761168404Spjd * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1762168404Spjd * The caller should not log the write.
1763168404Spjd *
1764168404Spjd * EALREADY: this block is already in the process of being synced.
1765168404Spjd * The caller should track its progress (somehow).
1766168404Spjd *
1767219089Spjd * EIO: could not do the I/O.
1768219089Spjd * The caller should do a txg_wait_synced().
1769168404Spjd *
1770219089Spjd * 0: the I/O has been initiated.
1771219089Spjd * The caller should log this blkptr in the done callback.
1772219089Spjd * It is possible that the I/O will fail, in which case
1773219089Spjd * the error will be reported to the done callback and
1774219089Spjd * propagated to pio from zio_done().
1775168404Spjd */ 1776168404Spjdint 1777219089Spjddmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) 1778168404Spjd{ 1779219089Spjd blkptr_t *bp = zgd->zgd_bp; 1780219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; 1781219089Spjd objset_t *os = db->db_objset; 1782219089Spjd dsl_dataset_t *ds = os->os_dsl_dataset; 1783168404Spjd dbuf_dirty_record_t *dr; 1784219089Spjd dmu_sync_arg_t *dsa; 1785268123Sdelphij zbookmark_phys_t zb; 1786219089Spjd zio_prop_t zp; 1787219089Spjd dnode_t *dn; 1788168404Spjd 1789219089Spjd ASSERT(pio != NULL); 1790168404Spjd ASSERT(txg != 0); 1791168404Spjd 1792219089Spjd SET_BOOKMARK(&zb, ds->ds_object, 1793219089Spjd db->db.db_object, db->db_level, db->db_blkid); 1794168404Spjd 1795219089Spjd DB_DNODE_ENTER(db); 1796219089Spjd dn = DB_DNODE(db); 1797321535Smav dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, 1798321535Smav ZIO_COMPRESS_INHERIT, &zp); 1799219089Spjd DB_DNODE_EXIT(db); 1800219089Spjd 1801168404Spjd /* 1802219089Spjd * If we're frozen (running ziltest), we always need to generate a bp. 1803168404Spjd */ 1804219089Spjd if (txg > spa_freeze_txg(os->os_spa)) 1805219089Spjd return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); 1806168404Spjd 1807168404Spjd /* 1808219089Spjd * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() 1809219089Spjd * and us. If we determine that this txg is not yet syncing, 1810219089Spjd * but it begins to sync a moment later, that's OK because the 1811219089Spjd * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. 1812168404Spjd */ 1813219089Spjd mutex_enter(&db->db_mtx); 1814219089Spjd 1815219089Spjd if (txg <= spa_last_synced_txg(os->os_spa)) { 1816168404Spjd /* 1817219089Spjd * This txg has already synced. There's nothing to do. 1818168404Spjd */ 1819219089Spjd mutex_exit(&db->db_mtx); 1820249195Smm return (SET_ERROR(EEXIST)); 1821168404Spjd } 1822168404Spjd 1823219089Spjd if (txg <= spa_syncing_txg(os->os_spa)) { 1824219089Spjd /* 1825219089Spjd * This txg is currently syncing, so we can't mess with 1826219089Spjd * the dirty record anymore; just write a new log block. 1827219089Spjd */ 1828219089Spjd mutex_exit(&db->db_mtx); 1829219089Spjd return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); 1830168404Spjd } 1831168404Spjd 1832168404Spjd dr = db->db_last_dirty; 1833219089Spjd while (dr && dr->dr_txg != txg) 1834168404Spjd dr = dr->dr_next; 1835219089Spjd 1836219089Spjd if (dr == NULL) { 1837168404Spjd /* 1838219089Spjd * There's no dr for this dbuf, so it must have been freed. 1839168404Spjd * There's no need to log writes to freed blocks, so we're done. 1840168404Spjd */ 1841168404Spjd mutex_exit(&db->db_mtx); 1842249195Smm return (SET_ERROR(ENOENT)); 1843168404Spjd } 1844168404Spjd 1845243524Smm ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); 1846243524Smm 1847243524Smm /* 1848286589Smav * Assume the on-disk data is X, the current syncing data (in 1849286589Smav * txg - 1) is Y, and the current in-memory data is Z (currently 1850286589Smav * in dmu_sync). 1851286589Smav * 1852286589Smav * We usually want to perform a nopwrite if X and Z are the 1853286589Smav * same. However, if Y is different (i.e. the BP is going to 1854286589Smav * change before this write takes effect), then a nopwrite will 1855286589Smav * be incorrect - we would override with X, which could have 1856286589Smav * been freed when Y was written. 
1857286589Smav * 1858286589Smav * (Note that this is not a concern when we are nop-writing from 1859286589Smav * syncing context, because X and Y must be identical, because 1860286589Smav * all previous txgs have been synced.) 1861286589Smav * 1862286589Smav * Therefore, we disable nopwrite if the current BP could change 1863286589Smav * before this TXG. There are two ways it could change: by 1864286589Smav * being dirty (dr_next is non-NULL), or by being freed 1865286589Smav * (dnode_block_freed()). This behavior is verified by 1866286589Smav * zio_done(), which VERIFYs that the override BP is identical 1867286589Smav * to the on-disk BP. 1868243524Smm */ 1869286589Smav DB_DNODE_ENTER(db); 1870286589Smav dn = DB_DNODE(db); 1871286589Smav if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) 1872243524Smm zp.zp_nopwrite = B_FALSE; 1873286589Smav DB_DNODE_EXIT(db); 1874243524Smm 1875168404Spjd ASSERT(dr->dr_txg == txg); 1876219089Spjd if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || 1877219089Spjd dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 1878168404Spjd /* 1879219089Spjd * We have already issued a sync write for this buffer, 1880219089Spjd * or this buffer has already been synced. It could not 1881219089Spjd * have been dirtied since, or we would have cleared the state. 1882168404Spjd */ 1883168404Spjd mutex_exit(&db->db_mtx); 1884249195Smm return (SET_ERROR(EALREADY)); 1885168404Spjd } 1886168404Spjd 1887219089Spjd ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 1888168404Spjd dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; 1889168404Spjd mutex_exit(&db->db_mtx); 1890168404Spjd 1891219089Spjd dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 1892219089Spjd dsa->dsa_dr = dr; 1893219089Spjd dsa->dsa_done = done; 1894219089Spjd dsa->dsa_zgd = zgd; 1895219089Spjd dsa->dsa_tx = NULL; 1896168404Spjd 1897219089Spjd zio_nowait(arc_write(pio, os->os_spa, txg, 1898251478Sdelphij bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), 1899307265Smav &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, 1900304138Savg ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); 1901185029Spjd 1902219089Spjd return (0); 1903168404Spjd} 1904168404Spjd 1905168404Spjdint 1906168404Spjddmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, 1907289562Smav dmu_tx_t *tx) 1908168404Spjd{ 1909168404Spjd dnode_t *dn; 1910168404Spjd int err; 1911168404Spjd 1912219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1913168404Spjd if (err) 1914168404Spjd return (err); 1915168404Spjd err = dnode_set_blksz(dn, size, ibs, tx); 1916168404Spjd dnode_rele(dn, FTAG); 1917168404Spjd return (err); 1918168404Spjd} 1919168404Spjd 1920168404Spjdvoid 1921168404Spjddmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, 1922289562Smav dmu_tx_t *tx) 1923168404Spjd{ 1924168404Spjd dnode_t *dn; 1925168404Spjd 1926268075Sdelphij /* 1927268075Sdelphij * Send streams include each object's checksum function. This 1928268075Sdelphij * check ensures that the receiving system can understand the 1929268075Sdelphij * checksum function transmitted. 
1930268075Sdelphij */ 1931268075Sdelphij ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); 1932268075Sdelphij 1933268075Sdelphij VERIFY0(dnode_hold(os, object, FTAG, &dn)); 1934268075Sdelphij ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); 1935168404Spjd dn->dn_checksum = checksum; 1936168404Spjd dnode_setdirty(dn, tx); 1937168404Spjd dnode_rele(dn, FTAG); 1938168404Spjd} 1939168404Spjd 1940168404Spjdvoid 1941168404Spjddmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, 1942289562Smav dmu_tx_t *tx) 1943168404Spjd{ 1944168404Spjd dnode_t *dn; 1945168404Spjd 1946268075Sdelphij /* 1947268075Sdelphij * Send streams include each object's compression function. This 1948268075Sdelphij * check ensures that the receiving system can understand the 1949268075Sdelphij * compression function transmitted. 1950268075Sdelphij */ 1951268075Sdelphij ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); 1952268075Sdelphij 1953268075Sdelphij VERIFY0(dnode_hold(os, object, FTAG, &dn)); 1954168404Spjd dn->dn_compress = compress; 1955168404Spjd dnode_setdirty(dn, tx); 1956168404Spjd dnode_rele(dn, FTAG); 1957168404Spjd} 1958168404Spjd 1959219089Spjdint zfs_mdcomp_disable = 0; 1960267992ShselaskySYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, 1961219089Spjd &zfs_mdcomp_disable, 0, "Disable metadata compression"); 1962219089Spjd 1963266771Sdelphij/* 1964266771Sdelphij * When the "redundant_metadata" property is set to "most", only indirect 1965266771Sdelphij * blocks of this level and higher will have an additional ditto block. 1966266771Sdelphij */ 1967266771Sdelphijint zfs_redundant_metadata_most_ditto_level = 2; 1968266771Sdelphij 1969219089Spjdvoid 1970321535Smavdmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, 1971321535Smav enum zio_compress override_compress, zio_prop_t *zp) 1972219089Spjd{ 1973219089Spjd dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; 1974236884Smm boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || 1975219089Spjd (wp & WP_SPILL)); 1976219089Spjd enum zio_checksum checksum = os->os_checksum; 1977219089Spjd enum zio_compress compress = os->os_compress; 1978219089Spjd enum zio_checksum dedup_checksum = os->os_dedup_checksum; 1979243524Smm boolean_t dedup = B_FALSE; 1980243524Smm boolean_t nopwrite = B_FALSE; 1981219089Spjd boolean_t dedup_verify = os->os_dedup_verify; 1982219089Spjd int copies = os->os_copies; 1983321535Smav boolean_t lz4_ac = spa_feature_is_active(os->os_spa, 1984321535Smav SPA_FEATURE_LZ4_COMPRESS); 1985219089Spjd 1986321535Smav IMPLY(override_compress == ZIO_COMPRESS_LZ4, lz4_ac); 1987321535Smav 1988219089Spjd /* 1989243524Smm * We maintain different write policies for each of the following 1990243524Smm * types of data: 1991243524Smm * 1. metadata 1992243524Smm * 2. preallocated blocks (i.e. level-0 blocks of a dump device) 1993243524Smm * 3. all other level 0 blocks 1994219089Spjd */ 1995219089Spjd if (ismd) { 1996268126Sdelphij if (zfs_mdcomp_disable) { 1997268126Sdelphij compress = ZIO_COMPRESS_EMPTY; 1998268126Sdelphij } else { 1999286547Smav /* 2000286547Smav * XXX -- we should design a compression algorithm 2001286547Smav * that specializes in arrays of bps. 2002286547Smav */ 2003286547Smav compress = zio_compress_select(os->os_spa, 2004286547Smav ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); 2005268126Sdelphij } 2006268126Sdelphij 2007243524Smm /* 2008219089Spjd * Metadata always gets checksummed. 
If the data 2009219089Spjd * checksum is multi-bit correctable, and it's not a 2010219089Spjd * ZBT-style checksum, then it's suitable for metadata 2011219089Spjd * as well. Otherwise, the metadata checksum defaults 2012219089Spjd * to fletcher4. 2013219089Spjd */ 2014289422Smav if (!(zio_checksum_table[checksum].ci_flags & 2015289422Smav ZCHECKSUM_FLAG_METADATA) || 2016289422Smav (zio_checksum_table[checksum].ci_flags & 2017289422Smav ZCHECKSUM_FLAG_EMBEDDED)) 2018219089Spjd checksum = ZIO_CHECKSUM_FLETCHER_4; 2019266771Sdelphij 2020266771Sdelphij if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || 2021266771Sdelphij (os->os_redundant_metadata == 2022266771Sdelphij ZFS_REDUNDANT_METADATA_MOST && 2023266771Sdelphij (level >= zfs_redundant_metadata_most_ditto_level || 2024266771Sdelphij DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) 2025266771Sdelphij copies++; 2026243524Smm } else if (wp & WP_NOFILL) { 2027243524Smm ASSERT(level == 0); 2028219089Spjd 2029219089Spjd /* 2030243524Smm * If we're writing preallocated blocks, we aren't actually 2031243524Smm * writing them so don't set any policy properties. These 2032243524Smm * blocks are currently only used by an external subsystem 2033243524Smm * outside of zfs (i.e. dump) and not written by the zio 2034243524Smm * pipeline. 2035219089Spjd */ 2036243524Smm compress = ZIO_COMPRESS_OFF; 2037255750Sdelphij checksum = ZIO_CHECKSUM_NOPARITY; 2038219089Spjd } else { 2039286547Smav compress = zio_compress_select(os->os_spa, dn->dn_compress, 2040286547Smav compress); 2041219089Spjd 2042243524Smm checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? 2043243524Smm zio_checksum_select(dn->dn_checksum, checksum) : 2044243524Smm dedup_checksum; 2045219089Spjd 2046243524Smm /* 2047243524Smm * Determine dedup setting. If we are in dmu_sync(), 2048243524Smm * we won't actually dedup now because that's all 2049243524Smm * done in syncing context; but we do want to use the 2050243524Smm * dedup checkum. If the checksum is not strong 2051243524Smm * enough to ensure unique signatures, force 2052243524Smm * dedup_verify. 2053243524Smm */ 2054243524Smm if (dedup_checksum != ZIO_CHECKSUM_OFF) { 2055243524Smm dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; 2056289422Smav if (!(zio_checksum_table[checksum].ci_flags & 2057289422Smav ZCHECKSUM_FLAG_DEDUP)) 2058243524Smm dedup_verify = B_TRUE; 2059243524Smm } 2060219089Spjd 2061243524Smm /* 2062289422Smav * Enable nopwrite if we have secure enough checksum 2063289422Smav * algorithm (see comment in zio_nop_write) and 2064289422Smav * compression is enabled. We don't enable nopwrite if 2065289422Smav * dedup is enabled as the two features are mutually 2066289422Smav * exclusive. 2067243524Smm */ 2068289422Smav nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & 2069289422Smav ZCHECKSUM_FLAG_NOPWRITE) && 2070243524Smm compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); 2071219089Spjd } 2072219089Spjd 2073219089Spjd zp->zp_checksum = checksum; 2074321535Smav 2075321535Smav /* 2076321535Smav * If we're writing a pre-compressed buffer, the compression type we use 2077321535Smav * must match the data. If it hasn't been compressed yet, then we should 2078321535Smav * use the value dictated by the policies above. 2079321535Smav */ 2080321535Smav zp->zp_compress = override_compress != ZIO_COMPRESS_INHERIT 2081321535Smav ? override_compress : compress; 2082321535Smav ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); 2083321535Smav 2084219089Spjd zp->zp_type = (wp & WP_SPILL) ? 
dn->dn_bonustype : type; 2085219089Spjd zp->zp_level = level; 2086266771Sdelphij zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); 2087219089Spjd zp->zp_dedup = dedup; 2088219089Spjd zp->zp_dedup_verify = dedup && dedup_verify; 2089243524Smm zp->zp_nopwrite = nopwrite; 2090219089Spjd} 2091219089Spjd 2092168404Spjdint 2093168404Spjddmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) 2094168404Spjd{ 2095168404Spjd dnode_t *dn; 2096287103Savg int err; 2097168404Spjd 2098168404Spjd /* 2099168404Spjd * Sync any current changes before 2100168404Spjd * we go trundling through the block pointers. 2101168404Spjd */ 2102287103Savg err = dmu_object_wait_synced(os, object); 2103287103Savg if (err) { 2104287103Savg return (err); 2105168404Spjd } 2106287103Savg 2107287103Savg err = dnode_hold(os, object, FTAG, &dn); 2108287103Savg if (err) { 2109287103Savg return (err); 2110168404Spjd } 2111168404Spjd 2112185029Spjd err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); 2113168404Spjd dnode_rele(dn, FTAG); 2114168404Spjd 2115168404Spjd return (err); 2116168404Spjd} 2117168404Spjd 2118287103Savg/* 2119287103Savg * Given the ZFS object, if it contains any dirty nodes 2120287103Savg * this function flushes all dirty blocks to disk. This 2121287103Savg * ensures the DMU object info is updated. A more efficient 2122287103Savg * future version might just find the TXG with the maximum 2123287103Savg * ID and wait for that to be synced. 2124287103Savg */ 2125287103Savgint 2126289562Smavdmu_object_wait_synced(objset_t *os, uint64_t object) 2127289562Smav{ 2128287103Savg dnode_t *dn; 2129287103Savg int error, i; 2130287103Savg 2131287103Savg error = dnode_hold(os, object, FTAG, &dn); 2132287103Savg if (error) { 2133287103Savg return (error); 2134287103Savg } 2135287103Savg 2136287103Savg for (i = 0; i < TXG_SIZE; i++) { 2137287103Savg if (list_link_active(&dn->dn_dirty_link[i])) { 2138287103Savg break; 2139287103Savg } 2140287103Savg } 2141287103Savg dnode_rele(dn, FTAG); 2142287103Savg if (i != TXG_SIZE) { 2143287103Savg txg_wait_synced(dmu_objset_pool(os), 0); 2144287103Savg } 2145287103Savg 2146287103Savg return (0); 2147287103Savg} 2148287103Savg 2149168404Spjdvoid 2150168404Spjddmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) 2151168404Spjd{ 2152219089Spjd dnode_phys_t *dnp; 2153219089Spjd 2154168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 2155168404Spjd mutex_enter(&dn->dn_mtx); 2156168404Spjd 2157219089Spjd dnp = dn->dn_phys; 2158219089Spjd 2159168404Spjd doi->doi_data_block_size = dn->dn_datablksz; 2160168404Spjd doi->doi_metadata_block_size = dn->dn_indblkshift ? 
2161168404Spjd 1ULL << dn->dn_indblkshift : 0; 2162219089Spjd doi->doi_type = dn->dn_type; 2163219089Spjd doi->doi_bonus_type = dn->dn_bonustype; 2164219089Spjd doi->doi_bonus_size = dn->dn_bonuslen; 2165168404Spjd doi->doi_indirection = dn->dn_nlevels; 2166168404Spjd doi->doi_checksum = dn->dn_checksum; 2167168404Spjd doi->doi_compress = dn->dn_compress; 2168272810Sdelphij doi->doi_nblkptr = dn->dn_nblkptr; 2169219089Spjd doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; 2170247852Smm doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; 2171219089Spjd doi->doi_fill_count = 0; 2172219089Spjd for (int i = 0; i < dnp->dn_nblkptr; i++) 2173268075Sdelphij doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); 2174168404Spjd 2175168404Spjd mutex_exit(&dn->dn_mtx); 2176168404Spjd rw_exit(&dn->dn_struct_rwlock); 2177168404Spjd} 2178168404Spjd 2179168404Spjd/* 2180168404Spjd * Get information on a DMU object. 2181168404Spjd * If doi is NULL, just indicates whether the object exists. 2182168404Spjd */ 2183168404Spjdint 2184168404Spjddmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) 2185168404Spjd{ 2186168404Spjd dnode_t *dn; 2187219089Spjd int err = dnode_hold(os, object, FTAG, &dn); 2188168404Spjd 2189168404Spjd if (err) 2190168404Spjd return (err); 2191168404Spjd 2192168404Spjd if (doi != NULL) 2193168404Spjd dmu_object_info_from_dnode(dn, doi); 2194168404Spjd 2195168404Spjd dnode_rele(dn, FTAG); 2196168404Spjd return (0); 2197168404Spjd} 2198168404Spjd 2199168404Spjd/* 2200168404Spjd * As above, but faster; can be used when you have a held dbuf in hand. 2201168404Spjd */ 2202168404Spjdvoid 2203219089Spjddmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) 2204168404Spjd{ 2205219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2206219089Spjd 2207219089Spjd DB_DNODE_ENTER(db); 2208219089Spjd dmu_object_info_from_dnode(DB_DNODE(db), doi); 2209219089Spjd DB_DNODE_EXIT(db); 2210168404Spjd} 2211168404Spjd 2212168404Spjd/* 2213168404Spjd * Faster still when you only care about the size. 2214168404Spjd * This is specifically optimized for zfs_getattr(). 
2215168404Spjd */ 2216168404Spjdvoid 2217219089Spjddmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, 2218219089Spjd u_longlong_t *nblk512) 2219168404Spjd{ 2220219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2221219089Spjd dnode_t *dn; 2222168404Spjd 2223219089Spjd DB_DNODE_ENTER(db); 2224219089Spjd dn = DB_DNODE(db); 2225219089Spjd 2226168404Spjd *blksize = dn->dn_datablksz; 2227168404Spjd /* add 1 for dnode space */ 2228168404Spjd *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> 2229168404Spjd SPA_MINBLOCKSHIFT) + 1; 2230219089Spjd DB_DNODE_EXIT(db); 2231168404Spjd} 2232168404Spjd 2233168404Spjdvoid 2234168404Spjdbyteswap_uint64_array(void *vbuf, size_t size) 2235168404Spjd{ 2236168404Spjd uint64_t *buf = vbuf; 2237168404Spjd size_t count = size >> 3; 2238168404Spjd int i; 2239168404Spjd 2240168404Spjd ASSERT((size & 7) == 0); 2241168404Spjd 2242168404Spjd for (i = 0; i < count; i++) 2243168404Spjd buf[i] = BSWAP_64(buf[i]); 2244168404Spjd} 2245168404Spjd 2246168404Spjdvoid 2247168404Spjdbyteswap_uint32_array(void *vbuf, size_t size) 2248168404Spjd{ 2249168404Spjd uint32_t *buf = vbuf; 2250168404Spjd size_t count = size >> 2; 2251168404Spjd int i; 2252168404Spjd 2253168404Spjd ASSERT((size & 3) == 0); 2254168404Spjd 2255168404Spjd for (i = 0; i < count; i++) 2256168404Spjd buf[i] = BSWAP_32(buf[i]); 2257168404Spjd} 2258168404Spjd 2259168404Spjdvoid 2260168404Spjdbyteswap_uint16_array(void *vbuf, size_t size) 2261168404Spjd{ 2262168404Spjd uint16_t *buf = vbuf; 2263168404Spjd size_t count = size >> 1; 2264168404Spjd int i; 2265168404Spjd 2266168404Spjd ASSERT((size & 1) == 0); 2267168404Spjd 2268168404Spjd for (i = 0; i < count; i++) 2269168404Spjd buf[i] = BSWAP_16(buf[i]); 2270168404Spjd} 2271168404Spjd 2272168404Spjd/* ARGSUSED */ 2273168404Spjdvoid 2274168404Spjdbyteswap_uint8_array(void *vbuf, size_t size) 2275168404Spjd{ 2276168404Spjd} 2277168404Spjd 2278168404Spjdvoid 2279168404Spjddmu_init(void) 2280168404Spjd{ 2281219089Spjd zfs_dbgmsg_init(); 2282219089Spjd sa_cache_init(); 2283219089Spjd xuio_stat_init(); 2284219089Spjd dmu_objset_init(); 2285219089Spjd dnode_init(); 2286208130Smm zfetch_init(); 2287254608Sgibbs zio_compress_init(); 2288239620Smm l2arc_init(); 2289168404Spjd arc_init(); 2290307265Smav dbuf_init(); 2291168404Spjd} 2292168404Spjd 2293168404Spjdvoid 2294168404Spjddmu_fini(void) 2295168404Spjd{ 2296251629Sdelphij arc_fini(); /* arc depends on l2arc, so arc must go first */ 2297219089Spjd l2arc_fini(); 2298208130Smm zfetch_fini(); 2299254608Sgibbs zio_compress_fini(); 2300219089Spjd dbuf_fini(); 2301168404Spjd dnode_fini(); 2302219089Spjd dmu_objset_fini(); 2303219089Spjd xuio_stat_fini(); 2304219089Spjd sa_cache_fini(); 2305219089Spjd zfs_dbgmsg_fini(); 2306168404Spjd} 2307
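/*
 * Illustrative sketch (not part of the listing above): how a hypothetical
 * consumer might use dmu_write_uio() to push 'size' bytes from a uio into
 * an object.  The objset, object number and uio are assumed to be supplied
 * by the caller, and error handling is reduced to the minimum;
 * dmu_tx_hold_write() is the usual way to reserve the dirtied range before
 * assigning the transaction.
 */
#if 0	/* example only, never compiled */
static int
example_uio_write(objset_t *os, uint64_t object, uio_t *uio, int size)
{
	dmu_tx_t *tx;
	int error;

	/* Reserve the byte range we are about to dirty. */
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, uio->uio_loffset, size);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}

	/* Copy the data out of the uio into dirty dbufs. */
	error = dmu_write_uio(os, object, uio, size, tx);

	dmu_tx_commit(tx);
	return (error);
}
#endif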
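/*
 * Illustrative sketch (hypothetical caller, not part of the listing above):
 * the loaned-ARC-buffer write path described at dmu_request_arcbuf() and
 * dmu_assign_arcbuf().  The bonus dbuf handle, object number and data are
 * assumed to be provided by the caller, and 'blksz' is assumed to match the
 * object's block size so that direct assignment is possible.
 */
#if 0	/* example only, never compiled */
static int
example_loaned_arcbuf_write(objset_t *os, dmu_buf_t *bonus_db,
    uint64_t object, uint64_t offset, int blksz, const void *data)
{
	arc_buf_t *abuf;
	dmu_tx_t *tx;
	int error;

	/* Borrow an anonymous ARC buffer sized like the target block. */
	abuf = dmu_request_arcbuf(bonus_db, blksz);
	bcopy(data, abuf->b_data, blksz);

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, offset, blksz);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		/* A loan that is never assigned must be returned. */
		dmu_return_arcbuf(abuf);
		return (error);
	}

	/*
	 * dmu_assign_arcbuf() consumes the buffer: it either attaches it
	 * directly to the dbuf or falls back to a dmu_write() copy and
	 * returns the loan itself.
	 */
	dmu_assign_arcbuf(bonus_db, offset, abuf, tx);
	dmu_tx_commit(tx);
	return (0);
}
#endif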
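/*
 * Illustrative sketch (hypothetical helper, not part of the listing above):
 * how a ZIL get-data style caller typically interprets the dmu_sync()
 * return values documented at that function.  The parent zio, zgd and done
 * callback are assumed to be set up by the caller.
 */
#if 0	/* example only, never compiled */
static int
example_handle_dmu_sync(zio_t *pio, objset_t *os, uint64_t txg,
    dmu_sync_cb_t *done, zgd_t *zgd)
{
	int error = dmu_sync(pio, txg, done, zgd);

	switch (error) {
	case 0:
		/* I/O initiated; 'done' will log the resulting blkptr. */
		break;
	case EEXIST:
	case ENOENT:
		/* Already synced, or freed: nothing to log. */
		break;
	case EALREADY:
		/* A sync write for this block is already in flight. */
		break;
	case EIO:
		/* Could not issue the I/O; wait for the txg instead. */
		txg_wait_synced(dmu_objset_pool(os), txg);
		break;
	}
	return (error);
}
#endif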
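/*
 * Illustrative sketch (hypothetical wrapper, not part of the listing above):
 * dmu_offset_next() as used for SEEK_HOLE/SEEK_DATA style lookups.  It first
 * forces any dirty blocks of the object to sync (via
 * dmu_object_wait_synced()) so the on-disk block pointers are current, then
 * walks them; when nothing of the requested kind remains, callers typically
 * see ESRCH from dnode_next_offset() and map it to ENXIO.
 */
#if 0	/* example only, never compiled */
static int
example_seek(objset_t *os, uint64_t object, boolean_t find_hole,
    uint64_t *offp)
{
	int error = dmu_offset_next(os, object, find_hole, offp);

	if (error == ESRCH)
		return (ENXIO);	/* no further hole/data past *offp */
	return (error);
}
#endif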
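/*
 * Illustrative sketch (hypothetical helper, not part of the listing above):
 * using dmu_object_info() to test for existence and fetch the data block
 * size of an object.  Passing a NULL dmu_object_info_t would only check
 * that the object exists.
 */
#if 0	/* example only, never compiled */
static int
example_object_blocksize(objset_t *os, uint64_t object, uint32_t *blksizep)
{
	dmu_object_info_t doi;
	int error;

	error = dmu_object_info(os, object, &doi);
	if (error != 0)
		return (error);	/* e.g. ENOENT if the object is unallocated */

	*blksizep = doi.doi_data_block_size;
	return (0);
}
#endif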