1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23321573Smav * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 24168404Spjd */ 25251478Sdelphij/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ 26255750Sdelphij/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ 27331384Smav/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ 28251478Sdelphij 29168404Spjd#include <sys/dmu.h> 30168404Spjd#include <sys/dmu_impl.h> 31168404Spjd#include <sys/dmu_tx.h> 32168404Spjd#include <sys/dbuf.h> 33168404Spjd#include <sys/dnode.h> 34168404Spjd#include <sys/zfs_context.h> 35168404Spjd#include <sys/dmu_objset.h> 36168404Spjd#include <sys/dmu_traverse.h> 37168404Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/dsl_dir.h> 39168404Spjd#include <sys/dsl_pool.h> 40168404Spjd#include <sys/dsl_synctask.h> 41168404Spjd#include <sys/dsl_prop.h> 42168404Spjd#include <sys/dmu_zfetch.h> 43168404Spjd#include <sys/zfs_ioctl.h> 44168404Spjd#include <sys/zap.h> 45168404Spjd#include <sys/zio_checksum.h> 46243524Smm#include <sys/zio_compress.h> 47219089Spjd#include <sys/sa.h> 48268126Sdelphij#include <sys/zfeature.h> 49321610Smav#include <sys/abd.h> 50219089Spjd#ifdef _KERNEL 51297633Strasz#include <sys/racct.h> 52258745Savg#include <sys/vm.h> 53185029Spjd#include <sys/zfs_znode.h> 54219089Spjd#endif 55168404Spjd 56243524Smm/* 57243524Smm * Enable/disable nopwrite feature. 58243524Smm */ 59243524Smmint zfs_nopwrite_enabled = 1; 60243525SmmSYSCTL_DECL(_vfs_zfs); 61243525SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, 62243525Smm &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); 63243524Smm 64321523Smav/* 65321523Smav * Tunable to control percentage of dirtied blocks from frees in one TXG. 66321523Smav * After this threshold is crossed, additional dirty blocks from frees 67321523Smav * wait until the next TXG. 68321523Smav * A value of zero will disable this throttle. 69321523Smav */ 70321523Smavuint32_t zfs_per_txg_dirty_frees_percent = 30; 71321523SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, 72321523Smav &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg"); 73321523Smav 74332525Smav/* 75332525Smav * This can be used for testing, to ensure that certain actions happen 76332525Smav * while in the middle of a remap (which might otherwise complete too 77332525Smav * quickly). 78332525Smav */ 79332525Smavint zfs_object_remap_one_indirect_delay_ticks = 0; 80332525Smav 81168404Spjdconst dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 82339109Smav { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, 83339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" }, 84339109Smav { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, 85339109Smav { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, 86339109Smav { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, 87339109Smav { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, 88339109Smav { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, 89339109Smav { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, 90339109Smav { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, 91339109Smav { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, 92339109Smav { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, 93339109Smav { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, 94339109Smav { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, 95339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, 96339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, 97339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, 98339109Smav { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, 99339109Smav { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, 100339109Smav { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, 101339109Smav { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, 102339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, 103339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, 104339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, 105339109Smav { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, 106339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, 107339109Smav { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, 108339109Smav { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, 109339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, 110339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, 111339109Smav { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, 112339109Smav { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, 113339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, 114339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, 115339109Smav { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, 116339109Smav { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, 117339109Smav { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, 118339109Smav { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, 119339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, 120339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, 121339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, 122339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, 123339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, 124339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, 125339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, 126339109Smav { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" }, 127339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, 128339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, 129339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, 130339109Smav { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, 131339109Smav { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, 132339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, 133339109Smav { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, 134339109Smav { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, 135339109Smav { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } 136168404Spjd}; 137168404Spjd 138236884Smmconst dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { 139236884Smm { byteswap_uint8_array, "uint8" }, 140236884Smm { byteswap_uint16_array, "uint16" }, 141236884Smm { byteswap_uint32_array, "uint32" }, 142236884Smm { byteswap_uint64_array, "uint64" }, 143236884Smm { zap_byteswap, "zap" }, 144236884Smm { dnode_buf_byteswap, "dnode" }, 145236884Smm { dmu_objset_byteswap, "objset" }, 146236884Smm { zfs_znode_byteswap, "znode" }, 147236884Smm { zfs_oldacl_byteswap, "oldacl" }, 148236884Smm { zfs_acl_byteswap, "acl" } 149236884Smm}; 150236884Smm 151168404Spjdint 152307290Smavdmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, 153307290Smav void *tag, dmu_buf_t **dbp) 154307290Smav{ 155307290Smav uint64_t blkid; 156307290Smav dmu_buf_impl_t *db; 157307290Smav 158307290Smav blkid = dbuf_whichblock(dn, 0, offset); 159307290Smav rw_enter(&dn->dn_struct_rwlock, RW_READER); 160307290Smav db = dbuf_hold(dn, blkid, tag); 161307290Smav rw_exit(&dn->dn_struct_rwlock); 162307290Smav 163307290Smav if (db == NULL) { 164307290Smav *dbp = NULL; 165307290Smav return (SET_ERROR(EIO)); 166307290Smav } 167307290Smav 168307290Smav *dbp = &db->db; 169307290Smav return (0); 170307290Smav} 171307290Smavint 172268075Sdelphijdmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, 173268075Sdelphij void *tag, dmu_buf_t **dbp) 174168404Spjd{ 175168404Spjd dnode_t *dn; 176168404Spjd uint64_t blkid; 177168404Spjd dmu_buf_impl_t *db; 178168404Spjd int err; 179168404Spjd 180219089Spjd err = dnode_hold(os, object, FTAG, &dn); 181168404Spjd if (err) 182168404Spjd return (err); 183286705Smav blkid = dbuf_whichblock(dn, 0, offset); 184168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 185168404Spjd db = dbuf_hold(dn, blkid, tag); 186168404Spjd rw_exit(&dn->dn_struct_rwlock); 187268075Sdelphij dnode_rele(dn, FTAG); 188268075Sdelphij 189168404Spjd if (db == NULL) { 190268075Sdelphij *dbp = NULL; 191268075Sdelphij return (SET_ERROR(EIO)); 192268075Sdelphij } 193268075Sdelphij 194268075Sdelphij *dbp = &db->db; 195268075Sdelphij return (err); 196268075Sdelphij} 197268075Sdelphij 198268075Sdelphijint 199307290Smavdmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, 200307290Smav void *tag, dmu_buf_t **dbp, int flags) 201307290Smav{ 202307290Smav int err; 203307290Smav int db_flags = DB_RF_CANFAIL; 204307290Smav 205307290Smav if (flags & DMU_READ_NO_PREFETCH) 206307290Smav db_flags |= DB_RF_NOPREFETCH; 207307290Smav 208307290Smav err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); 209307290Smav if (err == 0) { 210307290Smav dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); 211307290Smav err = dbuf_read(db, NULL, db_flags); 212307290Smav if (err != 0) { 213307290Smav dbuf_rele(db, tag); 214307290Smav *dbp = NULL; 215307290Smav } 216307290Smav } 217307290Smav 218307290Smav return (err); 219307290Smav} 220307290Smav 221307290Smavint 222268075Sdelphijdmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, 223268075Sdelphij void *tag, dmu_buf_t **dbp, int flags) 224268075Sdelphij{ 225268075Sdelphij int err; 226268075Sdelphij int db_flags = DB_RF_CANFAIL; 227268075Sdelphij 228268075Sdelphij if (flags & DMU_READ_NO_PREFETCH) 229268075Sdelphij db_flags |= DB_RF_NOPREFETCH; 230268075Sdelphij 231268075Sdelphij err = dmu_buf_hold_noread(os, object, offset, tag, dbp); 232268075Sdelphij if (err == 0) { 233268075Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); 234219089Spjd err = dbuf_read(db, NULL, db_flags); 235268075Sdelphij if (err != 0) { 236168404Spjd dbuf_rele(db, tag); 237268075Sdelphij *dbp = NULL; 238168404Spjd } 239168404Spjd } 240168404Spjd 241168404Spjd return (err); 242168404Spjd} 243168404Spjd 244168404Spjdint 245168404Spjddmu_bonus_max(void) 246168404Spjd{ 247168404Spjd return (DN_MAX_BONUSLEN); 248168404Spjd} 249168404Spjd 250185029Spjdint 251219089Spjddmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) 252185029Spjd{ 253219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 254219089Spjd dnode_t *dn; 255219089Spjd int error; 256185029Spjd 257219089Spjd DB_DNODE_ENTER(db); 258219089Spjd dn = DB_DNODE(db); 259219089Spjd 260219089Spjd if (dn->dn_bonus != db) { 261249195Smm error = SET_ERROR(EINVAL); 262219089Spjd } else if (newsize < 0 || newsize > db_fake->db_size) { 263249195Smm error = SET_ERROR(EINVAL); 264219089Spjd } else { 265219089Spjd dnode_setbonuslen(dn, newsize, tx); 266219089Spjd error = 0; 267219089Spjd } 268219089Spjd 269219089Spjd DB_DNODE_EXIT(db); 270219089Spjd return (error); 271185029Spjd} 272185029Spjd 273219089Spjdint 274219089Spjddmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) 275219089Spjd{ 276219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 277219089Spjd dnode_t *dn; 278219089Spjd int error; 279219089Spjd 280219089Spjd DB_DNODE_ENTER(db); 281219089Spjd dn = DB_DNODE(db); 282219089Spjd 283236884Smm if (!DMU_OT_IS_VALID(type)) { 284249195Smm error = SET_ERROR(EINVAL); 285219089Spjd } else if (dn->dn_bonus != db) { 286249195Smm error = SET_ERROR(EINVAL); 287219089Spjd } else { 288219089Spjd dnode_setbonus_type(dn, type, tx); 289219089Spjd error = 0; 290219089Spjd } 291219089Spjd 292219089Spjd DB_DNODE_EXIT(db); 293219089Spjd return (error); 294219089Spjd} 295219089Spjd 296219089Spjddmu_object_type_t 297219089Spjddmu_get_bonustype(dmu_buf_t *db_fake) 298219089Spjd{ 299219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 300219089Spjd dnode_t *dn; 301219089Spjd dmu_object_type_t type; 302219089Spjd 303219089Spjd DB_DNODE_ENTER(db); 304219089Spjd dn = DB_DNODE(db); 305219089Spjd type = dn->dn_bonustype; 306219089Spjd DB_DNODE_EXIT(db); 307219089Spjd 308219089Spjd return (type); 309219089Spjd} 310219089Spjd 311219089Spjdint 312219089Spjddmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) 313219089Spjd{ 314219089Spjd dnode_t *dn; 315219089Spjd int error; 316219089Spjd 317219089Spjd error = dnode_hold(os, object, FTAG, &dn); 318219089Spjd dbuf_rm_spill(dn, tx); 319219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 320219089Spjd dnode_rm_spill(dn, tx); 321219089Spjd rw_exit(&dn->dn_struct_rwlock); 322219089Spjd dnode_rele(dn, FTAG); 323219089Spjd return (error); 324219089Spjd} 325219089Spjd 326168404Spjd/* 327168404Spjd * returns ENOENT, EIO, or 0. 328168404Spjd */ 329168404Spjdint 330168404Spjddmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) 331168404Spjd{ 332168404Spjd dnode_t *dn; 333168404Spjd dmu_buf_impl_t *db; 334185029Spjd int error; 335168404Spjd 336219089Spjd error = dnode_hold(os, object, FTAG, &dn); 337185029Spjd if (error) 338185029Spjd return (error); 339168404Spjd 340168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 341168404Spjd if (dn->dn_bonus == NULL) { 342168404Spjd rw_exit(&dn->dn_struct_rwlock); 343168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 344168404Spjd if (dn->dn_bonus == NULL) 345185029Spjd dbuf_create_bonus(dn); 346168404Spjd } 347168404Spjd db = dn->dn_bonus; 348185029Spjd 349185029Spjd /* as long as the bonus buf is held, the dnode will be held */ 350219089Spjd if (refcount_add(&db->db_holds, tag) == 1) { 351185029Spjd VERIFY(dnode_add_ref(dn, db)); 352270248Sdelphij atomic_inc_32(&dn->dn_dbufs_count); 353219089Spjd } 354185029Spjd 355219089Spjd /* 356219089Spjd * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's 357219089Spjd * hold and incrementing the dbuf count to ensure that dnode_move() sees 358219089Spjd * a dnode hold for every dbuf. 359219089Spjd */ 360219089Spjd rw_exit(&dn->dn_struct_rwlock); 361219089Spjd 362168404Spjd dnode_rele(dn, FTAG); 363168404Spjd 364219089Spjd VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); 365168404Spjd 366168404Spjd *dbp = &db->db; 367168404Spjd return (0); 368168404Spjd} 369168404Spjd 370168404Spjd/* 371219089Spjd * returns ENOENT, EIO, or 0. 372219089Spjd * 373219089Spjd * This interface will allocate a blank spill dbuf when a spill blk 374219089Spjd * doesn't already exist on the dnode. 375219089Spjd * 376219089Spjd * if you only want to find an already existing spill db, then 377219089Spjd * dmu_spill_hold_existing() should be used. 378219089Spjd */ 379219089Spjdint 380219089Spjddmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) 381219089Spjd{ 382219089Spjd dmu_buf_impl_t *db = NULL; 383219089Spjd int err; 384219089Spjd 385219089Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 386219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 387219089Spjd 388219089Spjd db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); 389219089Spjd 390219089Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 391219089Spjd rw_exit(&dn->dn_struct_rwlock); 392219089Spjd 393219089Spjd ASSERT(db != NULL); 394219089Spjd err = dbuf_read(db, NULL, flags); 395219089Spjd if (err == 0) 396219089Spjd *dbp = &db->db; 397219089Spjd else 398219089Spjd dbuf_rele(db, tag); 399219089Spjd return (err); 400219089Spjd} 401219089Spjd 402219089Spjdint 403219089Spjddmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) 404219089Spjd{ 405219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; 406219089Spjd dnode_t *dn; 407219089Spjd int err; 408219089Spjd 409219089Spjd DB_DNODE_ENTER(db); 410219089Spjd dn = DB_DNODE(db); 411219089Spjd 412219089Spjd if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { 413249195Smm err = SET_ERROR(EINVAL); 414219089Spjd } else { 415219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 416219089Spjd 417219089Spjd if (!dn->dn_have_spill) { 418249195Smm err = SET_ERROR(ENOENT); 419219089Spjd } else { 420219089Spjd err = dmu_spill_hold_by_dnode(dn, 421219089Spjd DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); 422219089Spjd } 423219089Spjd 424219089Spjd rw_exit(&dn->dn_struct_rwlock); 425219089Spjd } 426219089Spjd 427219089Spjd DB_DNODE_EXIT(db); 428219089Spjd return (err); 429219089Spjd} 430219089Spjd 431219089Spjdint 432219089Spjddmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) 433219089Spjd{ 434219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; 435219089Spjd dnode_t *dn; 436219089Spjd int err; 437219089Spjd 438219089Spjd DB_DNODE_ENTER(db); 439219089Spjd dn = DB_DNODE(db); 440219089Spjd err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); 441219089Spjd DB_DNODE_EXIT(db); 442219089Spjd 443219089Spjd return (err); 444219089Spjd} 445219089Spjd 446219089Spjd/* 447168404Spjd * Note: longer-term, we should modify all of the dmu_buf_*() interfaces 448168404Spjd * to take a held dnode rather than <os, object> -- the lookup is wasteful, 449168404Spjd * and can induce severe lock contention when writing to several files 450168404Spjd * whose dnodes are in the same block. 451168404Spjd */ 452339128Smavint 453209962Smmdmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, 454287702Sdelphij boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) 455168404Spjd{ 456168404Spjd dmu_buf_t **dbp; 457168404Spjd uint64_t blkid, nblks, i; 458209962Smm uint32_t dbuf_flags; 459168404Spjd int err; 460168404Spjd zio_t *zio; 461168404Spjd 462168404Spjd ASSERT(length <= DMU_MAX_ACCESS); 463168404Spjd 464287702Sdelphij /* 465287702Sdelphij * Note: We directly notify the prefetch code of this read, so that 466287702Sdelphij * we can tell it about the multi-block read. dbuf_read() only knows 467287702Sdelphij * about the one block it is accessing. 468287702Sdelphij */ 469287702Sdelphij dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | 470287702Sdelphij DB_RF_NOPREFETCH; 471168404Spjd 472168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 473168404Spjd if (dn->dn_datablkshift) { 474168404Spjd int blkshift = dn->dn_datablkshift; 475287702Sdelphij nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - 476287702Sdelphij P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; 477168404Spjd } else { 478168404Spjd if (offset + length > dn->dn_datablksz) { 479168404Spjd zfs_panic_recover("zfs: accessing past end of object " 480168404Spjd "%llx/%llx (size=%u access=%llu+%llu)", 481168404Spjd (longlong_t)dn->dn_objset-> 482168404Spjd os_dsl_dataset->ds_object, 483168404Spjd (longlong_t)dn->dn_object, dn->dn_datablksz, 484168404Spjd (longlong_t)offset, (longlong_t)length); 485214378Smm rw_exit(&dn->dn_struct_rwlock); 486249195Smm return (SET_ERROR(EIO)); 487168404Spjd } 488168404Spjd nblks = 1; 489168404Spjd } 490168404Spjd dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); 491168404Spjd 492297633Strasz#if defined(_KERNEL) && defined(RACCT) 493297633Strasz if (racct_enable && !read) { 494297633Strasz PROC_LOCK(curproc); 495297633Strasz racct_add_force(curproc, RACCT_WRITEBPS, length); 496297633Strasz racct_add_force(curproc, RACCT_WRITEIOPS, nblks); 497297633Strasz PROC_UNLOCK(curproc); 498297633Strasz } 499297633Strasz#endif 500297633Strasz 501185029Spjd zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); 502286705Smav blkid = dbuf_whichblock(dn, 0, offset); 503168404Spjd for (i = 0; i < nblks; i++) { 504287702Sdelphij dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); 505168404Spjd if (db == NULL) { 506168404Spjd rw_exit(&dn->dn_struct_rwlock); 507168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 508168404Spjd zio_nowait(zio); 509249195Smm return (SET_ERROR(EIO)); 510168404Spjd } 511287702Sdelphij 512168404Spjd /* initiate async i/o */ 513226620Spjd if (read) 514209962Smm (void) dbuf_read(db, zio, dbuf_flags); 515226620Spjd#ifdef _KERNEL 516226620Spjd else 517226620Spjd curthread->td_ru.ru_oublock++; 518226620Spjd#endif 519168404Spjd dbp[i] = &db->db; 520168404Spjd } 521287702Sdelphij 522297832Smav if ((flags & DMU_READ_NO_PREFETCH) == 0 && 523297832Smav DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { 524297832Smav dmu_zfetch(&dn->dn_zfetch, blkid, nblks, 525297832Smav read && DNODE_IS_CACHEABLE(dn)); 526287702Sdelphij } 527168404Spjd rw_exit(&dn->dn_struct_rwlock); 528168404Spjd 529168404Spjd /* wait for async i/o */ 530168404Spjd err = zio_wait(zio); 531168404Spjd if (err) { 532168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 533168404Spjd return (err); 534168404Spjd } 535168404Spjd 536168404Spjd /* wait for other io to complete */ 537168404Spjd if (read) { 538168404Spjd for (i = 0; i < nblks; i++) { 539168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; 540168404Spjd mutex_enter(&db->db_mtx); 541168404Spjd while (db->db_state == DB_READ || 542168404Spjd db->db_state == DB_FILL) 543168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 544168404Spjd if (db->db_state == DB_UNCACHED) 545249195Smm err = SET_ERROR(EIO); 546168404Spjd mutex_exit(&db->db_mtx); 547168404Spjd if (err) { 548168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 549168404Spjd return (err); 550168404Spjd } 551168404Spjd } 552168404Spjd } 553168404Spjd 554168404Spjd *numbufsp = nblks; 555168404Spjd *dbpp = dbp; 556168404Spjd return (0); 557168404Spjd} 558168404Spjd 559168404Spjdstatic int 560168404Spjddmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, 561168404Spjd uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 562168404Spjd{ 563168404Spjd dnode_t *dn; 564168404Spjd int err; 565168404Spjd 566219089Spjd err = dnode_hold(os, object, FTAG, &dn); 567168404Spjd if (err) 568168404Spjd return (err); 569168404Spjd 570168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 571209962Smm numbufsp, dbpp, DMU_READ_PREFETCH); 572168404Spjd 573168404Spjd dnode_rele(dn, FTAG); 574168404Spjd 575168404Spjd return (err); 576168404Spjd} 577168404Spjd 578168404Spjdint 579219089Spjddmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, 580287702Sdelphij uint64_t length, boolean_t read, void *tag, int *numbufsp, 581287702Sdelphij dmu_buf_t ***dbpp) 582168404Spjd{ 583219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 584219089Spjd dnode_t *dn; 585168404Spjd int err; 586168404Spjd 587219089Spjd DB_DNODE_ENTER(db); 588219089Spjd dn = DB_DNODE(db); 589168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 590209962Smm numbufsp, dbpp, DMU_READ_PREFETCH); 591219089Spjd DB_DNODE_EXIT(db); 592168404Spjd 593168404Spjd return (err); 594168404Spjd} 595168404Spjd 596168404Spjdvoid 597168404Spjddmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) 598168404Spjd{ 599168404Spjd int i; 600168404Spjd dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; 601168404Spjd 602168404Spjd if (numbufs == 0) 603168404Spjd return; 604168404Spjd 605168404Spjd for (i = 0; i < numbufs; i++) { 606168404Spjd if (dbp[i]) 607168404Spjd dbuf_rele(dbp[i], tag); 608168404Spjd } 609168404Spjd 610168404Spjd kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); 611168404Spjd} 612168404Spjd 613258632Savg/* 614286705Smav * Issue prefetch i/os for the given blocks. If level is greater than 0, the 615286705Smav * indirect blocks prefeteched will be those that point to the blocks containing 616286705Smav * the data starting at offset, and continuing to offset + len. 617258632Savg * 618286705Smav * Note that if the indirect blocks above the blocks being prefetched are not in 619286705Smav * cache, they will be asychronously read in. 620258632Savg */ 621168404Spjdvoid 622286705Smavdmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, 623286705Smav uint64_t len, zio_priority_t pri) 624168404Spjd{ 625168404Spjd dnode_t *dn; 626168404Spjd uint64_t blkid; 627258632Savg int nblks, err; 628168404Spjd 629168404Spjd if (len == 0) { /* they're interested in the bonus buffer */ 630219089Spjd dn = DMU_META_DNODE(os); 631168404Spjd 632168404Spjd if (object == 0 || object >= DN_MAX_OBJECT) 633168404Spjd return; 634168404Spjd 635168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 636286705Smav blkid = dbuf_whichblock(dn, level, 637286705Smav object * sizeof (dnode_phys_t)); 638286705Smav dbuf_prefetch(dn, level, blkid, pri, 0); 639168404Spjd rw_exit(&dn->dn_struct_rwlock); 640168404Spjd return; 641168404Spjd } 642168404Spjd 643168404Spjd /* 644168404Spjd * XXX - Note, if the dnode for the requested object is not 645168404Spjd * already cached, we will do a *synchronous* read in the 646168404Spjd * dnode_hold() call. The same is true for any indirects. 647168404Spjd */ 648219089Spjd err = dnode_hold(os, object, FTAG, &dn); 649168404Spjd if (err != 0) 650168404Spjd return; 651168404Spjd 652168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 653286705Smav /* 654286705Smav * offset + len - 1 is the last byte we want to prefetch for, and offset 655286705Smav * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the 656286705Smav * last block we want to prefetch, and dbuf_whichblock(dn, level, 657286705Smav * offset) is the first. Then the number we need to prefetch is the 658286705Smav * last - first + 1. 659286705Smav */ 660286705Smav if (level > 0 || dn->dn_datablkshift != 0) { 661286705Smav nblks = dbuf_whichblock(dn, level, offset + len - 1) - 662286705Smav dbuf_whichblock(dn, level, offset) + 1; 663168404Spjd } else { 664168404Spjd nblks = (offset < dn->dn_datablksz); 665168404Spjd } 666168404Spjd 667168404Spjd if (nblks != 0) { 668286705Smav blkid = dbuf_whichblock(dn, level, offset); 669258632Savg for (int i = 0; i < nblks; i++) 670286705Smav dbuf_prefetch(dn, level, blkid + i, pri, 0); 671168404Spjd } 672168404Spjd 673168404Spjd rw_exit(&dn->dn_struct_rwlock); 674168404Spjd 675168404Spjd dnode_rele(dn, FTAG); 676168404Spjd} 677168404Spjd 678208775Smm/* 679208775Smm * Get the next "chunk" of file data to free. We traverse the file from 680208775Smm * the end so that the file gets shorter over time (if we crashes in the 681208775Smm * middle, this will leave us in a better state). We find allocated file 682208775Smm * data by simply searching the allocated level 1 indirects. 683254753Sdelphij * 684254753Sdelphij * On input, *start should be the first offset that does not need to be 685254753Sdelphij * freed (e.g. "offset + length"). On return, *start will be the first 686254753Sdelphij * offset that should be freed. 687208775Smm */ 688185029Spjdstatic int 689254753Sdelphijget_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) 690185029Spjd{ 691254753Sdelphij uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); 692254753Sdelphij /* bytes of data covered by a level-1 indirect block */ 693208775Smm uint64_t iblkrange = 694185029Spjd dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); 695185029Spjd 696254753Sdelphij ASSERT3U(minimum, <=, *start); 697185029Spjd 698254753Sdelphij if (*start - minimum <= iblkrange * maxblks) { 699254753Sdelphij *start = minimum; 700185029Spjd return (0); 701185029Spjd } 702208775Smm ASSERT(ISP2(iblkrange)); 703185029Spjd 704254753Sdelphij for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { 705185029Spjd int err; 706185029Spjd 707254753Sdelphij /* 708254753Sdelphij * dnode_next_offset(BACKWARDS) will find an allocated L1 709254753Sdelphij * indirect block at or before the input offset. We must 710254753Sdelphij * decrement *start so that it is at the end of the region 711254753Sdelphij * to search. 712254753Sdelphij */ 713254753Sdelphij (*start)--; 714185029Spjd err = dnode_next_offset(dn, 715208775Smm DNODE_FIND_BACKWARDS, start, 2, 1, 0); 716185029Spjd 717254753Sdelphij /* if there are no indirect blocks before start, we are done */ 718208775Smm if (err == ESRCH) { 719254753Sdelphij *start = minimum; 720254753Sdelphij break; 721254753Sdelphij } else if (err != 0) { 722208775Smm return (err); 723185029Spjd } 724185029Spjd 725254753Sdelphij /* set start to the beginning of this L1 indirect */ 726208775Smm *start = P2ALIGN(*start, iblkrange); 727185029Spjd } 728254753Sdelphij if (*start < minimum) 729254753Sdelphij *start = minimum; 730185029Spjd return (0); 731185029Spjd} 732185029Spjd 733331384Smav/* 734331384Smav * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, 735331384Smav * otherwise return false. 736331384Smav * Used below in dmu_free_long_range_impl() to enable abort when unmounting 737331384Smav */ 738331384Smav/*ARGSUSED*/ 739331384Smavstatic boolean_t 740331384Smavdmu_objset_zfs_unmounting(objset_t *os) 741331384Smav{ 742331384Smav#ifdef _KERNEL 743331384Smav if (dmu_objset_type(os) == DMU_OST_ZFS) 744331384Smav return (zfs_get_vfs_flag_unmounted(os)); 745331384Smav#endif 746331384Smav return (B_FALSE); 747331384Smav} 748331384Smav 749185029Spjdstatic int 750185029Spjddmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, 751254753Sdelphij uint64_t length) 752185029Spjd{ 753254753Sdelphij uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; 754254753Sdelphij int err; 755321523Smav uint64_t dirty_frees_threshold; 756321523Smav dsl_pool_t *dp = dmu_objset_pool(os); 757185029Spjd 758254753Sdelphij if (offset >= object_size) 759185029Spjd return (0); 760185029Spjd 761321523Smav if (zfs_per_txg_dirty_frees_percent <= 100) 762321523Smav dirty_frees_threshold = 763321523Smav zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; 764321523Smav else 765321523Smav dirty_frees_threshold = zfs_dirty_data_max / 4; 766321523Smav 767254753Sdelphij if (length == DMU_OBJECT_END || offset + length > object_size) 768254753Sdelphij length = object_size - offset; 769254753Sdelphij 770254753Sdelphij while (length != 0) { 771321523Smav uint64_t chunk_end, chunk_begin, chunk_len; 772321523Smav uint64_t long_free_dirty_all_txgs = 0; 773321523Smav dmu_tx_t *tx; 774254753Sdelphij 775331384Smav if (dmu_objset_zfs_unmounting(dn->dn_objset)) 776331384Smav return (SET_ERROR(EINTR)); 777331384Smav 778254753Sdelphij chunk_end = chunk_begin = offset + length; 779254753Sdelphij 780254753Sdelphij /* move chunk_begin backwards to the beginning of this chunk */ 781254753Sdelphij err = get_next_chunk(dn, &chunk_begin, offset); 782185029Spjd if (err) 783185029Spjd return (err); 784254753Sdelphij ASSERT3U(chunk_begin, >=, offset); 785254753Sdelphij ASSERT3U(chunk_begin, <=, chunk_end); 786185029Spjd 787321523Smav chunk_len = chunk_end - chunk_begin; 788268464Sdelphij 789321523Smav mutex_enter(&dp->dp_lock); 790321523Smav for (int t = 0; t < TXG_SIZE; t++) { 791321523Smav long_free_dirty_all_txgs += 792321523Smav dp->dp_long_free_dirty_pertxg[t]; 793321523Smav } 794321523Smav mutex_exit(&dp->dp_lock); 795321523Smav 796268464Sdelphij /* 797321523Smav * To avoid filling up a TXG with just frees wait for 798321523Smav * the next TXG to open before freeing more chunks if 799321523Smav * we have reached the threshold of frees 800321523Smav */ 801321523Smav if (dirty_frees_threshold != 0 && 802321523Smav long_free_dirty_all_txgs >= dirty_frees_threshold) { 803321523Smav txg_wait_open(dp, 0); 804321523Smav continue; 805321523Smav } 806321523Smav 807321523Smav tx = dmu_tx_create(os); 808321523Smav dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); 809321523Smav 810321523Smav /* 811268464Sdelphij * Mark this transaction as typically resulting in a net 812268464Sdelphij * reduction in space used. 813268464Sdelphij */ 814268464Sdelphij dmu_tx_mark_netfree(tx); 815185029Spjd err = dmu_tx_assign(tx, TXG_WAIT); 816185029Spjd if (err) { 817185029Spjd dmu_tx_abort(tx); 818185029Spjd return (err); 819185029Spjd } 820321523Smav 821321523Smav mutex_enter(&dp->dp_lock); 822321523Smav dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += 823321523Smav chunk_len; 824321523Smav mutex_exit(&dp->dp_lock); 825321523Smav DTRACE_PROBE3(free__long__range, 826321523Smav uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, 827321523Smav uint64_t, dmu_tx_get_txg(tx)); 828321523Smav dnode_free_range(dn, chunk_begin, chunk_len, tx); 829254753Sdelphij dmu_tx_commit(tx); 830185029Spjd 831321523Smav length -= chunk_len; 832185029Spjd } 833185029Spjd return (0); 834185029Spjd} 835185029Spjd 836168404Spjdint 837185029Spjddmu_free_long_range(objset_t *os, uint64_t object, 838185029Spjd uint64_t offset, uint64_t length) 839185029Spjd{ 840185029Spjd dnode_t *dn; 841185029Spjd int err; 842185029Spjd 843219089Spjd err = dnode_hold(os, object, FTAG, &dn); 844185029Spjd if (err != 0) 845185029Spjd return (err); 846254753Sdelphij err = dmu_free_long_range_impl(os, dn, offset, length); 847256259Savg 848256259Savg /* 849256259Savg * It is important to zero out the maxblkid when freeing the entire 850256259Savg * file, so that (a) subsequent calls to dmu_free_long_range_impl() 851256259Savg * will take the fast path, and (b) dnode_reallocate() can verify 852256259Savg * that the entire file has been freed. 853256259Savg */ 854260150Sdelphij if (err == 0 && offset == 0 && length == DMU_OBJECT_END) 855256259Savg dn->dn_maxblkid = 0; 856256259Savg 857185029Spjd dnode_rele(dn, FTAG); 858185029Spjd return (err); 859185029Spjd} 860185029Spjd 861185029Spjdint 862254753Sdelphijdmu_free_long_object(objset_t *os, uint64_t object) 863185029Spjd{ 864185029Spjd dmu_tx_t *tx; 865185029Spjd int err; 866185029Spjd 867254753Sdelphij err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); 868185029Spjd if (err != 0) 869185029Spjd return (err); 870254753Sdelphij 871254753Sdelphij tx = dmu_tx_create(os); 872254753Sdelphij dmu_tx_hold_bonus(tx, object); 873254753Sdelphij dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 874268464Sdelphij dmu_tx_mark_netfree(tx); 875254753Sdelphij err = dmu_tx_assign(tx, TXG_WAIT); 876254753Sdelphij if (err == 0) { 877254753Sdelphij err = dmu_object_free(os, object, tx); 878254753Sdelphij dmu_tx_commit(tx); 879185029Spjd } else { 880254753Sdelphij dmu_tx_abort(tx); 881185029Spjd } 882254753Sdelphij 883185029Spjd return (err); 884185029Spjd} 885185029Spjd 886185029Spjdint 887168404Spjddmu_free_range(objset_t *os, uint64_t object, uint64_t offset, 888168404Spjd uint64_t size, dmu_tx_t *tx) 889168404Spjd{ 890168404Spjd dnode_t *dn; 891219089Spjd int err = dnode_hold(os, object, FTAG, &dn); 892168404Spjd if (err) 893168404Spjd return (err); 894168404Spjd ASSERT(offset < UINT64_MAX); 895168404Spjd ASSERT(size == -1ULL || size <= UINT64_MAX - offset); 896168404Spjd dnode_free_range(dn, offset, size, tx); 897168404Spjd dnode_rele(dn, FTAG); 898168404Spjd return (0); 899168404Spjd} 900168404Spjd 901321549Smavstatic int 902321549Smavdmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, 903209962Smm void *buf, uint32_t flags) 904168404Spjd{ 905168404Spjd dmu_buf_t **dbp; 906321549Smav int numbufs, err = 0; 907168404Spjd 908168404Spjd /* 909168404Spjd * Deal with odd block sizes, where there can't be data past the first 910168404Spjd * block. If we ever do the tail block optimization, we will need to 911168404Spjd * handle that here as well. 912168404Spjd */ 913214378Smm if (dn->dn_maxblkid == 0) { 914168404Spjd int newsz = offset > dn->dn_datablksz ? 0 : 915168404Spjd MIN(size, dn->dn_datablksz - offset); 916168404Spjd bzero((char *)buf + newsz, size - newsz); 917168404Spjd size = newsz; 918168404Spjd } 919168404Spjd 920168404Spjd while (size > 0) { 921168404Spjd uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); 922214378Smm int i; 923168404Spjd 924168404Spjd /* 925168404Spjd * NB: we could do this block-at-a-time, but it's nice 926168404Spjd * to be reading in parallel. 927168404Spjd */ 928168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, 929209962Smm TRUE, FTAG, &numbufs, &dbp, flags); 930168404Spjd if (err) 931185029Spjd break; 932168404Spjd 933168404Spjd for (i = 0; i < numbufs; i++) { 934168404Spjd int tocpy; 935168404Spjd int bufoff; 936168404Spjd dmu_buf_t *db = dbp[i]; 937168404Spjd 938168404Spjd ASSERT(size > 0); 939168404Spjd 940168404Spjd bufoff = offset - db->db_offset; 941168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 942168404Spjd 943168404Spjd bcopy((char *)db->db_data + bufoff, buf, tocpy); 944168404Spjd 945168404Spjd offset += tocpy; 946168404Spjd size -= tocpy; 947168404Spjd buf = (char *)buf + tocpy; 948168404Spjd } 949168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 950168404Spjd } 951321549Smav return (err); 952321549Smav} 953321549Smav 954321549Smavint 955321549Smavdmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 956321549Smav void *buf, uint32_t flags) 957321549Smav{ 958321549Smav dnode_t *dn; 959321549Smav int err; 960321549Smav 961321549Smav err = dnode_hold(os, object, FTAG, &dn); 962321549Smav if (err != 0) 963321549Smav return (err); 964321549Smav 965321549Smav err = dmu_read_impl(dn, offset, size, buf, flags); 966168404Spjd dnode_rele(dn, FTAG); 967185029Spjd return (err); 968168404Spjd} 969168404Spjd 970321549Smavint 971321549Smavdmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, 972321549Smav uint32_t flags) 973321549Smav{ 974321549Smav return (dmu_read_impl(dn, offset, size, buf, flags)); 975321549Smav} 976321549Smav 977321549Smavstatic void 978321549Smavdmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, 979168404Spjd const void *buf, dmu_tx_t *tx) 980168404Spjd{ 981321549Smav int i; 982168404Spjd 983168404Spjd for (i = 0; i < numbufs; i++) { 984168404Spjd int tocpy; 985168404Spjd int bufoff; 986168404Spjd dmu_buf_t *db = dbp[i]; 987168404Spjd 988168404Spjd ASSERT(size > 0); 989168404Spjd 990168404Spjd bufoff = offset - db->db_offset; 991168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 992168404Spjd 993168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 994168404Spjd 995168404Spjd if (tocpy == db->db_size) 996168404Spjd dmu_buf_will_fill(db, tx); 997168404Spjd else 998168404Spjd dmu_buf_will_dirty(db, tx); 999168404Spjd 1000168404Spjd bcopy(buf, (char *)db->db_data + bufoff, tocpy); 1001168404Spjd 1002168404Spjd if (tocpy == db->db_size) 1003168404Spjd dmu_buf_fill_done(db, tx); 1004168404Spjd 1005168404Spjd offset += tocpy; 1006168404Spjd size -= tocpy; 1007168404Spjd buf = (char *)buf + tocpy; 1008168404Spjd } 1009321549Smav} 1010321549Smav 1011321549Smavvoid 1012321549Smavdmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1013321549Smav const void *buf, dmu_tx_t *tx) 1014321549Smav{ 1015321549Smav dmu_buf_t **dbp; 1016321549Smav int numbufs; 1017321549Smav 1018321549Smav if (size == 0) 1019321549Smav return; 1020321549Smav 1021321549Smav VERIFY0(dmu_buf_hold_array(os, object, offset, size, 1022321549Smav FALSE, FTAG, &numbufs, &dbp)); 1023321549Smav dmu_write_impl(dbp, numbufs, offset, size, buf, tx); 1024168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1025168404Spjd} 1026168404Spjd 1027219089Spjdvoid 1028321549Smavdmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, 1029321549Smav const void *buf, dmu_tx_t *tx) 1030321549Smav{ 1031321549Smav dmu_buf_t **dbp; 1032321549Smav int numbufs; 1033321549Smav 1034321549Smav if (size == 0) 1035321549Smav return; 1036321549Smav 1037321549Smav VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, 1038321549Smav FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); 1039321549Smav dmu_write_impl(dbp, numbufs, offset, size, buf, tx); 1040321549Smav dmu_buf_rele_array(dbp, numbufs, FTAG); 1041321549Smav} 1042321549Smav 1043332525Smavstatic int 1044332525Smavdmu_object_remap_one_indirect(objset_t *os, dnode_t *dn, 1045332525Smav uint64_t last_removal_txg, uint64_t offset) 1046332525Smav{ 1047332525Smav uint64_t l1blkid = dbuf_whichblock(dn, 1, offset); 1048332525Smav int err = 0; 1049332525Smav 1050332525Smav rw_enter(&dn->dn_struct_rwlock, RW_READER); 1051332525Smav dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG); 1052332525Smav ASSERT3P(dbuf, !=, NULL); 1053332525Smav 1054332525Smav /* 1055332525Smav * If the block hasn't been written yet, this default will ensure 1056332525Smav * we don't try to remap it. 1057332525Smav */ 1058332525Smav uint64_t birth = UINT64_MAX; 1059332525Smav ASSERT3U(last_removal_txg, !=, UINT64_MAX); 1060332525Smav if (dbuf->db_blkptr != NULL) 1061332525Smav birth = dbuf->db_blkptr->blk_birth; 1062332525Smav rw_exit(&dn->dn_struct_rwlock); 1063332525Smav 1064332525Smav /* 1065332525Smav * If this L1 was already written after the last removal, then we've 1066332525Smav * already tried to remap it. 1067332525Smav */ 1068332525Smav if (birth <= last_removal_txg && 1069332525Smav dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 && 1070332525Smav dbuf_can_remap(dbuf)) { 1071332525Smav dmu_tx_t *tx = dmu_tx_create(os); 1072332525Smav dmu_tx_hold_remap_l1indirect(tx, dn->dn_object); 1073332525Smav err = dmu_tx_assign(tx, TXG_WAIT); 1074332525Smav if (err == 0) { 1075332525Smav (void) dbuf_dirty(dbuf, tx); 1076332525Smav dmu_tx_commit(tx); 1077332525Smav } else { 1078332525Smav dmu_tx_abort(tx); 1079332525Smav } 1080332525Smav } 1081332525Smav 1082332525Smav dbuf_rele(dbuf, FTAG); 1083332525Smav 1084332525Smav delay(zfs_object_remap_one_indirect_delay_ticks); 1085332525Smav 1086332525Smav return (err); 1087332525Smav} 1088332525Smav 1089332525Smav/* 1090332525Smav * Remap all blockpointers in the object, if possible, so that they reference 1091332525Smav * only concrete vdevs. 1092332525Smav * 1093332525Smav * To do this, iterate over the L0 blockpointers and remap any that reference 1094332525Smav * an indirect vdev. Note that we only examine L0 blockpointers; since we 1095332525Smav * cannot guarantee that we can remap all blockpointer anyways (due to split 1096332525Smav * blocks), we do not want to make the code unnecessarily complicated to 1097332525Smav * catch the unlikely case that there is an L1 block on an indirect vdev that 1098332525Smav * contains no indirect blockpointers. 1099332525Smav */ 1100332525Smavint 1101332525Smavdmu_object_remap_indirects(objset_t *os, uint64_t object, 1102332525Smav uint64_t last_removal_txg) 1103332525Smav{ 1104332525Smav uint64_t offset, l1span; 1105332525Smav int err; 1106332525Smav dnode_t *dn; 1107332525Smav 1108332525Smav err = dnode_hold(os, object, FTAG, &dn); 1109332525Smav if (err != 0) { 1110332525Smav return (err); 1111332525Smav } 1112332525Smav 1113332525Smav if (dn->dn_nlevels <= 1) { 1114332525Smav if (issig(JUSTLOOKING) && issig(FORREAL)) { 1115332525Smav err = SET_ERROR(EINTR); 1116332525Smav } 1117332525Smav 1118332525Smav /* 1119332525Smav * If the dnode has no indirect blocks, we cannot dirty them. 1120332525Smav * We still want to remap the blkptr(s) in the dnode if 1121332525Smav * appropriate, so mark it as dirty. 1122332525Smav */ 1123332525Smav if (err == 0 && dnode_needs_remap(dn)) { 1124332525Smav dmu_tx_t *tx = dmu_tx_create(os); 1125332525Smav dmu_tx_hold_bonus(tx, dn->dn_object); 1126332525Smav if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) { 1127332525Smav dnode_setdirty(dn, tx); 1128332525Smav dmu_tx_commit(tx); 1129332525Smav } else { 1130332525Smav dmu_tx_abort(tx); 1131332525Smav } 1132332525Smav } 1133332525Smav 1134332525Smav dnode_rele(dn, FTAG); 1135332525Smav return (err); 1136332525Smav } 1137332525Smav 1138332525Smav offset = 0; 1139332525Smav l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT + 1140332525Smav dn->dn_datablkshift); 1141332525Smav /* 1142332525Smav * Find the next L1 indirect that is not a hole. 1143332525Smav */ 1144332525Smav while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) { 1145332525Smav if (issig(JUSTLOOKING) && issig(FORREAL)) { 1146332525Smav err = SET_ERROR(EINTR); 1147332525Smav break; 1148332525Smav } 1149332525Smav if ((err = dmu_object_remap_one_indirect(os, dn, 1150332525Smav last_removal_txg, offset)) != 0) { 1151332525Smav break; 1152332525Smav } 1153332525Smav offset += l1span; 1154332525Smav } 1155332525Smav 1156332525Smav dnode_rele(dn, FTAG); 1157332525Smav return (err); 1158332525Smav} 1159332525Smav 1160321549Smavvoid 1161219089Spjddmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1162219089Spjd dmu_tx_t *tx) 1163219089Spjd{ 1164219089Spjd dmu_buf_t **dbp; 1165219089Spjd int numbufs, i; 1166219089Spjd 1167219089Spjd if (size == 0) 1168219089Spjd return; 1169219089Spjd 1170219089Spjd VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 1171219089Spjd FALSE, FTAG, &numbufs, &dbp)); 1172219089Spjd 1173219089Spjd for (i = 0; i < numbufs; i++) { 1174219089Spjd dmu_buf_t *db = dbp[i]; 1175219089Spjd 1176219089Spjd dmu_buf_will_not_fill(db, tx); 1177219089Spjd } 1178219089Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1179219089Spjd} 1180219089Spjd 1181268075Sdelphijvoid 1182268075Sdelphijdmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, 1183268075Sdelphij void *data, uint8_t etype, uint8_t comp, int uncompressed_size, 1184268075Sdelphij int compressed_size, int byteorder, dmu_tx_t *tx) 1185268075Sdelphij{ 1186268075Sdelphij dmu_buf_t *db; 1187268075Sdelphij 1188268075Sdelphij ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); 1189268075Sdelphij ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); 1190268075Sdelphij VERIFY0(dmu_buf_hold_noread(os, object, offset, 1191268075Sdelphij FTAG, &db)); 1192268075Sdelphij 1193268075Sdelphij dmu_buf_write_embedded(db, 1194268075Sdelphij data, (bp_embedded_type_t)etype, (enum zio_compress)comp, 1195268075Sdelphij uncompressed_size, compressed_size, byteorder, tx); 1196268075Sdelphij 1197268075Sdelphij dmu_buf_rele(db, FTAG); 1198268075Sdelphij} 1199268075Sdelphij 1200219089Spjd/* 1201219089Spjd * DMU support for xuio 1202219089Spjd */ 1203219089Spjdkstat_t *xuio_ksp = NULL; 1204219089Spjd 1205219089Spjdint 1206219089Spjddmu_xuio_init(xuio_t *xuio, int nblk) 1207219089Spjd{ 1208219089Spjd dmu_xuio_t *priv; 1209219089Spjd uio_t *uio = &xuio->xu_uio; 1210219089Spjd 1211219089Spjd uio->uio_iovcnt = nblk; 1212219089Spjd uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); 1213219089Spjd 1214219089Spjd priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); 1215219089Spjd priv->cnt = nblk; 1216219089Spjd priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); 1217219089Spjd priv->iovp = uio->uio_iov; 1218219089Spjd XUIO_XUZC_PRIV(xuio) = priv; 1219219089Spjd 1220219089Spjd if (XUIO_XUZC_RW(xuio) == UIO_READ) 1221219089Spjd XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); 1222219089Spjd else 1223219089Spjd XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); 1224219089Spjd 1225219089Spjd return (0); 1226219089Spjd} 1227219089Spjd 1228219089Spjdvoid 1229219089Spjddmu_xuio_fini(xuio_t *xuio) 1230219089Spjd{ 1231219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 1232219089Spjd int nblk = priv->cnt; 1233219089Spjd 1234219089Spjd kmem_free(priv->iovp, nblk * sizeof (iovec_t)); 1235219089Spjd kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); 1236219089Spjd kmem_free(priv, sizeof (dmu_xuio_t)); 1237219089Spjd 1238219089Spjd if (XUIO_XUZC_RW(xuio) == UIO_READ) 1239219089Spjd XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); 1240219089Spjd else 1241219089Spjd XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); 1242219089Spjd} 1243219089Spjd 1244219089Spjd/* 1245219089Spjd * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } 1246219089Spjd * and increase priv->next by 1. 1247219089Spjd */ 1248219089Spjdint 1249219089Spjddmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) 1250219089Spjd{ 1251219089Spjd struct iovec *iov; 1252219089Spjd uio_t *uio = &xuio->xu_uio; 1253219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 1254219089Spjd int i = priv->next++; 1255219089Spjd 1256219089Spjd ASSERT(i < priv->cnt); 1257321535Smav ASSERT(off + n <= arc_buf_lsize(abuf)); 1258219089Spjd iov = uio->uio_iov + i; 1259219089Spjd iov->iov_base = (char *)abuf->b_data + off; 1260219089Spjd iov->iov_len = n; 1261219089Spjd priv->bufs[i] = abuf; 1262219089Spjd return (0); 1263219089Spjd} 1264219089Spjd 1265219089Spjdint 1266219089Spjddmu_xuio_cnt(xuio_t *xuio) 1267219089Spjd{ 1268219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 1269219089Spjd return (priv->cnt); 1270219089Spjd} 1271219089Spjd 1272219089Spjdarc_buf_t * 1273219089Spjddmu_xuio_arcbuf(xuio_t *xuio, int i) 1274219089Spjd{ 1275219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 1276219089Spjd 1277219089Spjd ASSERT(i < priv->cnt); 1278219089Spjd return (priv->bufs[i]); 1279219089Spjd} 1280219089Spjd 1281219089Spjdvoid 1282219089Spjddmu_xuio_clear(xuio_t *xuio, int i) 1283219089Spjd{ 1284219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 1285219089Spjd 1286219089Spjd ASSERT(i < priv->cnt); 1287219089Spjd priv->bufs[i] = NULL; 1288219089Spjd} 1289219089Spjd 1290219089Spjdstatic void 1291219089Spjdxuio_stat_init(void) 1292219089Spjd{ 1293219089Spjd xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", 1294219089Spjd KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), 1295219089Spjd KSTAT_FLAG_VIRTUAL); 1296219089Spjd if (xuio_ksp != NULL) { 1297219089Spjd xuio_ksp->ks_data = &xuio_stats; 1298219089Spjd kstat_install(xuio_ksp); 1299219089Spjd } 1300219089Spjd} 1301219089Spjd 1302219089Spjdstatic void 1303219089Spjdxuio_stat_fini(void) 1304219089Spjd{ 1305219089Spjd if (xuio_ksp != NULL) { 1306219089Spjd kstat_delete(xuio_ksp); 1307219089Spjd xuio_ksp = NULL; 1308219089Spjd } 1309219089Spjd} 1310219089Spjd 1311219089Spjdvoid 1312321530Smavxuio_stat_wbuf_copied(void) 1313219089Spjd{ 1314219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_copied); 1315219089Spjd} 1316219089Spjd 1317219089Spjdvoid 1318321530Smavxuio_stat_wbuf_nocopy(void) 1319219089Spjd{ 1320219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); 1321219089Spjd} 1322219089Spjd 1323168404Spjd#ifdef _KERNEL 1324339128Smavint 1325272809Sdelphijdmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) 1326168404Spjd{ 1327168404Spjd dmu_buf_t **dbp; 1328168404Spjd int numbufs, i, err; 1329219089Spjd xuio_t *xuio = NULL; 1330168404Spjd 1331168404Spjd /* 1332168404Spjd * NB: we could do this block-at-a-time, but it's nice 1333168404Spjd * to be reading in parallel. 1334168404Spjd */ 1335272809Sdelphij err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, 1336272809Sdelphij TRUE, FTAG, &numbufs, &dbp, 0); 1337168404Spjd if (err) 1338168404Spjd return (err); 1339168404Spjd 1340219089Spjd#ifdef UIO_XUIO 1341219089Spjd if (uio->uio_extflg == UIO_XUIO) 1342219089Spjd xuio = (xuio_t *)uio; 1343219089Spjd#endif 1344219089Spjd 1345168404Spjd for (i = 0; i < numbufs; i++) { 1346168404Spjd int tocpy; 1347168404Spjd int bufoff; 1348168404Spjd dmu_buf_t *db = dbp[i]; 1349168404Spjd 1350168404Spjd ASSERT(size > 0); 1351168404Spjd 1352168404Spjd bufoff = uio->uio_loffset - db->db_offset; 1353168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1354168404Spjd 1355219089Spjd if (xuio) { 1356219089Spjd dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 1357219089Spjd arc_buf_t *dbuf_abuf = dbi->db_buf; 1358219089Spjd arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); 1359219089Spjd err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); 1360219089Spjd if (!err) { 1361219089Spjd uio->uio_resid -= tocpy; 1362219089Spjd uio->uio_loffset += tocpy; 1363219089Spjd } 1364219089Spjd 1365219089Spjd if (abuf == dbuf_abuf) 1366219089Spjd XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); 1367219089Spjd else 1368219089Spjd XUIOSTAT_BUMP(xuiostat_rbuf_copied); 1369219089Spjd } else { 1370298105Savg#ifdef illumos 1371219089Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 1372219089Spjd UIO_READ, uio); 1373298105Savg#else 1374298105Savg err = vn_io_fault_uiomove((char *)db->db_data + bufoff, 1375298105Savg tocpy, uio); 1376298105Savg#endif 1377219089Spjd } 1378168404Spjd if (err) 1379168404Spjd break; 1380168404Spjd 1381168404Spjd size -= tocpy; 1382168404Spjd } 1383168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1384168404Spjd 1385168404Spjd return (err); 1386168404Spjd} 1387168404Spjd 1388272809Sdelphij/* 1389272809Sdelphij * Read 'size' bytes into the uio buffer. 1390272809Sdelphij * From object zdb->db_object. 1391272809Sdelphij * Starting at offset uio->uio_loffset. 1392272809Sdelphij * 1393272809Sdelphij * If the caller already has a dbuf in the target object 1394272809Sdelphij * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), 1395272809Sdelphij * because we don't have to find the dnode_t for the object. 1396272809Sdelphij */ 1397272809Sdelphijint 1398272809Sdelphijdmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) 1399272809Sdelphij{ 1400272809Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; 1401272809Sdelphij dnode_t *dn; 1402272809Sdelphij int err; 1403272809Sdelphij 1404272809Sdelphij if (size == 0) 1405272809Sdelphij return (0); 1406272809Sdelphij 1407272809Sdelphij DB_DNODE_ENTER(db); 1408272809Sdelphij dn = DB_DNODE(db); 1409272809Sdelphij err = dmu_read_uio_dnode(dn, uio, size); 1410272809Sdelphij DB_DNODE_EXIT(db); 1411272809Sdelphij 1412272809Sdelphij return (err); 1413272809Sdelphij} 1414272809Sdelphij 1415272809Sdelphij/* 1416272809Sdelphij * Read 'size' bytes into the uio buffer. 1417272809Sdelphij * From the specified object 1418272809Sdelphij * Starting at offset uio->uio_loffset. 1419272809Sdelphij */ 1420272809Sdelphijint 1421272809Sdelphijdmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) 1422272809Sdelphij{ 1423272809Sdelphij dnode_t *dn; 1424272809Sdelphij int err; 1425272809Sdelphij 1426272809Sdelphij if (size == 0) 1427272809Sdelphij return (0); 1428272809Sdelphij 1429272809Sdelphij err = dnode_hold(os, object, FTAG, &dn); 1430272809Sdelphij if (err) 1431272809Sdelphij return (err); 1432272809Sdelphij 1433272809Sdelphij err = dmu_read_uio_dnode(dn, uio, size); 1434272809Sdelphij 1435272809Sdelphij dnode_rele(dn, FTAG); 1436272809Sdelphij 1437272809Sdelphij return (err); 1438272809Sdelphij} 1439272809Sdelphij 1440339128Smavint 1441219089Spjddmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) 1442168404Spjd{ 1443168404Spjd dmu_buf_t **dbp; 1444219089Spjd int numbufs; 1445168404Spjd int err = 0; 1446219089Spjd int i; 1447168404Spjd 1448219089Spjd err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, 1449219089Spjd FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); 1450168404Spjd if (err) 1451168404Spjd return (err); 1452168404Spjd 1453168404Spjd for (i = 0; i < numbufs; i++) { 1454168404Spjd int tocpy; 1455168404Spjd int bufoff; 1456168404Spjd dmu_buf_t *db = dbp[i]; 1457168404Spjd 1458168404Spjd ASSERT(size > 0); 1459168404Spjd 1460168404Spjd bufoff = uio->uio_loffset - db->db_offset; 1461168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1462168404Spjd 1463168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1464168404Spjd 1465168404Spjd if (tocpy == db->db_size) 1466168404Spjd dmu_buf_will_fill(db, tx); 1467168404Spjd else 1468168404Spjd dmu_buf_will_dirty(db, tx); 1469168404Spjd 1470298105Savg#ifdef illumos 1471168404Spjd /* 1472168404Spjd * XXX uiomove could block forever (eg. nfs-backed 1473168404Spjd * pages). There needs to be a uiolockdown() function 1474168404Spjd * to lock the pages in memory, so that uiomove won't 1475168404Spjd * block. 1476168404Spjd */ 1477168404Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 1478168404Spjd UIO_WRITE, uio); 1479298105Savg#else 1480298105Savg err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, 1481298105Savg uio); 1482298105Savg#endif 1483168404Spjd 1484168404Spjd if (tocpy == db->db_size) 1485168404Spjd dmu_buf_fill_done(db, tx); 1486168404Spjd 1487168404Spjd if (err) 1488168404Spjd break; 1489168404Spjd 1490168404Spjd size -= tocpy; 1491168404Spjd } 1492219089Spjd 1493168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1494168404Spjd return (err); 1495168404Spjd} 1496168404Spjd 1497272809Sdelphij/* 1498272809Sdelphij * Write 'size' bytes from the uio buffer. 1499272809Sdelphij * To object zdb->db_object. 1500272809Sdelphij * Starting at offset uio->uio_loffset. 1501272809Sdelphij * 1502272809Sdelphij * If the caller already has a dbuf in the target object 1503272809Sdelphij * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), 1504272809Sdelphij * because we don't have to find the dnode_t for the object. 1505272809Sdelphij */ 1506168404Spjdint 1507219089Spjddmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, 1508219089Spjd dmu_tx_t *tx) 1509219089Spjd{ 1510219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; 1511219089Spjd dnode_t *dn; 1512219089Spjd int err; 1513219089Spjd 1514219089Spjd if (size == 0) 1515219089Spjd return (0); 1516219089Spjd 1517219089Spjd DB_DNODE_ENTER(db); 1518219089Spjd dn = DB_DNODE(db); 1519219089Spjd err = dmu_write_uio_dnode(dn, uio, size, tx); 1520219089Spjd DB_DNODE_EXIT(db); 1521219089Spjd 1522219089Spjd return (err); 1523219089Spjd} 1524219089Spjd 1525272809Sdelphij/* 1526272809Sdelphij * Write 'size' bytes from the uio buffer. 1527272809Sdelphij * To the specified object. 1528272809Sdelphij * Starting at offset uio->uio_loffset. 1529272809Sdelphij */ 1530219089Spjdint 1531219089Spjddmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, 1532219089Spjd dmu_tx_t *tx) 1533219089Spjd{ 1534219089Spjd dnode_t *dn; 1535219089Spjd int err; 1536219089Spjd 1537219089Spjd if (size == 0) 1538219089Spjd return (0); 1539219089Spjd 1540219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1541219089Spjd if (err) 1542219089Spjd return (err); 1543219089Spjd 1544219089Spjd err = dmu_write_uio_dnode(dn, uio, size, tx); 1545219089Spjd 1546219089Spjd dnode_rele(dn, FTAG); 1547219089Spjd 1548219089Spjd return (err); 1549219089Spjd} 1550219089Spjd 1551277300Ssmh#ifdef illumos 1552219089Spjdint 1553168404Spjddmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1554168404Spjd page_t *pp, dmu_tx_t *tx) 1555168404Spjd{ 1556168404Spjd dmu_buf_t **dbp; 1557168404Spjd int numbufs, i; 1558168404Spjd int err; 1559168404Spjd 1560168404Spjd if (size == 0) 1561168404Spjd return (0); 1562168404Spjd 1563168404Spjd err = dmu_buf_hold_array(os, object, offset, size, 1564168404Spjd FALSE, FTAG, &numbufs, &dbp); 1565168404Spjd if (err) 1566168404Spjd return (err); 1567168404Spjd 1568168404Spjd for (i = 0; i < numbufs; i++) { 1569168404Spjd int tocpy, copied, thiscpy; 1570168404Spjd int bufoff; 1571168404Spjd dmu_buf_t *db = dbp[i]; 1572168404Spjd caddr_t va; 1573168404Spjd 1574168404Spjd ASSERT(size > 0); 1575168404Spjd ASSERT3U(db->db_size, >=, PAGESIZE); 1576168404Spjd 1577168404Spjd bufoff = offset - db->db_offset; 1578168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1579168404Spjd 1580168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1581168404Spjd 1582168404Spjd if (tocpy == db->db_size) 1583168404Spjd dmu_buf_will_fill(db, tx); 1584168404Spjd else 1585168404Spjd dmu_buf_will_dirty(db, tx); 1586168404Spjd 1587168404Spjd for (copied = 0; copied < tocpy; copied += PAGESIZE) { 1588168404Spjd ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); 1589168404Spjd thiscpy = MIN(PAGESIZE, tocpy - copied); 1590185029Spjd va = zfs_map_page(pp, S_READ); 1591168404Spjd bcopy(va, (char *)db->db_data + bufoff, thiscpy); 1592185029Spjd zfs_unmap_page(pp, va); 1593168404Spjd pp = pp->p_next; 1594168404Spjd bufoff += PAGESIZE; 1595168404Spjd } 1596168404Spjd 1597168404Spjd if (tocpy == db->db_size) 1598168404Spjd dmu_buf_fill_done(db, tx); 1599168404Spjd 1600168404Spjd offset += tocpy; 1601168404Spjd size -= tocpy; 1602168404Spjd } 1603168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1604168404Spjd return (err); 1605168404Spjd} 1606258745Savg 1607277300Ssmh#else /* !illumos */ 1608258745Savg 1609258745Savgint 1610258745Savgdmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1611258745Savg vm_page_t *ma, dmu_tx_t *tx) 1612258745Savg{ 1613258745Savg dmu_buf_t **dbp; 1614258745Savg struct sf_buf *sf; 1615258745Savg int numbufs, i; 1616258745Savg int err; 1617258745Savg 1618258745Savg if (size == 0) 1619258745Savg return (0); 1620258745Savg 1621258745Savg err = dmu_buf_hold_array(os, object, offset, size, 1622258745Savg FALSE, FTAG, &numbufs, &dbp); 1623258745Savg if (err) 1624258745Savg return (err); 1625258745Savg 1626258745Savg for (i = 0; i < numbufs; i++) { 1627258745Savg int tocpy, copied, thiscpy; 1628258745Savg int bufoff; 1629258745Savg dmu_buf_t *db = dbp[i]; 1630258745Savg caddr_t va; 1631258745Savg 1632258745Savg ASSERT(size > 0); 1633258745Savg ASSERT3U(db->db_size, >=, PAGESIZE); 1634258745Savg 1635258745Savg bufoff = offset - db->db_offset; 1636258745Savg tocpy = (int)MIN(db->db_size - bufoff, size); 1637258745Savg 1638258745Savg ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1639258745Savg 1640258745Savg if (tocpy == db->db_size) 1641258745Savg dmu_buf_will_fill(db, tx); 1642258745Savg else 1643258745Savg dmu_buf_will_dirty(db, tx); 1644258745Savg 1645258745Savg for (copied = 0; copied < tocpy; copied += PAGESIZE) { 1646258745Savg ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); 1647258745Savg thiscpy = MIN(PAGESIZE, tocpy - copied); 1648258745Savg va = zfs_map_page(*ma, &sf); 1649258745Savg bcopy(va, (char *)db->db_data + bufoff, thiscpy); 1650258745Savg zfs_unmap_page(sf); 1651258745Savg ma += 1; 1652258745Savg bufoff += PAGESIZE; 1653258745Savg } 1654258745Savg 1655258745Savg if (tocpy == db->db_size) 1656258745Savg dmu_buf_fill_done(db, tx); 1657258745Savg 1658258745Savg offset += tocpy; 1659258745Savg size -= tocpy; 1660258745Savg } 1661258745Savg dmu_buf_rele_array(dbp, numbufs, FTAG); 1662258745Savg return (err); 1663258745Savg} 1664330991Savg 1665330991Savgint 1666330991Savgdmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, 1667330991Savg int *rbehind, int *rahead, int last_size) 1668330991Savg{ 1669330991Savg struct sf_buf *sf; 1670330991Savg vm_object_t vmobj; 1671330991Savg vm_page_t m; 1672330991Savg dmu_buf_t **dbp; 1673330991Savg dmu_buf_t *db; 1674330991Savg caddr_t va; 1675330991Savg int numbufs, i; 1676330991Savg int bufoff, pgoff, tocpy; 1677330991Savg int mi, di; 1678330991Savg int err; 1679330991Savg 1680330991Savg ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); 1681330991Savg ASSERT(last_size <= PAGE_SIZE); 1682330991Savg 1683330991Savg err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), 1684330991Savg IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); 1685330991Savg if (err != 0) 1686330991Savg return (err); 1687330991Savg 1688330991Savg#ifdef DEBUG 1689330991Savg IMPLY(last_size < PAGE_SIZE, *rahead == 0); 1690330991Savg if (dbp[0]->db_offset != 0 || numbufs > 1) { 1691330991Savg for (i = 0; i < numbufs; i++) { 1692330991Savg ASSERT(ISP2(dbp[i]->db_size)); 1693330991Savg ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0); 1694330991Savg ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); 1695330991Savg } 1696330991Savg } 1697330991Savg#endif 1698330991Savg 1699330991Savg vmobj = ma[0]->object; 1700330991Savg zfs_vmobject_wlock(vmobj); 1701330991Savg 1702330991Savg db = dbp[0]; 1703330991Savg for (i = 0; i < *rbehind; i++) { 1704330991Savg m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i, 1705330991Savg VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY); 1706330991Savg if (m == NULL) 1707330991Savg break; 1708330991Savg if (m->valid != 0) { 1709330991Savg ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); 1710330991Savg break; 1711330991Savg } 1712330991Savg ASSERT(m->dirty == 0); 1713330991Savg ASSERT(!pmap_page_is_mapped(m)); 1714330991Savg 1715330991Savg ASSERT(db->db_size > PAGE_SIZE); 1716330991Savg bufoff = IDX_TO_OFF(m->pindex) % db->db_size; 1717330991Savg va = zfs_map_page(m, &sf); 1718330991Savg bcopy((char *)db->db_data + bufoff, va, PAGESIZE); 1719330991Savg zfs_unmap_page(sf); 1720330991Savg m->valid = VM_PAGE_BITS_ALL; 1721330991Savg vm_page_lock(m); 1722330991Savg if ((m->busy_lock & VPB_BIT_WAITERS) != 0) 1723330991Savg vm_page_activate(m); 1724330991Savg else 1725330991Savg vm_page_deactivate(m); 1726330991Savg vm_page_unlock(m); 1727330991Savg } 1728330991Savg *rbehind = i; 1729330991Savg 1730330991Savg bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; 1731330991Savg pgoff = 0; 1732330991Savg for (mi = 0, di = 0; mi < count && di < numbufs; ) { 1733330991Savg if (pgoff == 0) { 1734330991Savg m = ma[mi]; 1735330991Savg vm_page_assert_xbusied(m); 1736330991Savg ASSERT(m->valid == 0); 1737330991Savg ASSERT(m->dirty == 0); 1738330991Savg ASSERT(!pmap_page_is_mapped(m)); 1739330991Savg va = zfs_map_page(m, &sf); 1740330991Savg } 1741330991Savg if (bufoff == 0) 1742330991Savg db = dbp[di]; 1743330991Savg 1744330991Savg ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, 1745330991Savg db->db_offset + bufoff); 1746330991Savg 1747330991Savg /* 1748330991Savg * We do not need to clamp the copy size by the file 1749330991Savg * size as the last block is zero-filled beyond the 1750330991Savg * end of file anyway. 1751330991Savg */ 1752330991Savg tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); 1753330991Savg bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy); 1754330991Savg 1755330991Savg pgoff += tocpy; 1756330991Savg ASSERT(pgoff <= PAGESIZE); 1757330991Savg if (pgoff == PAGESIZE) { 1758330991Savg zfs_unmap_page(sf); 1759330991Savg m->valid = VM_PAGE_BITS_ALL; 1760330991Savg ASSERT(mi < count); 1761330991Savg mi++; 1762330991Savg pgoff = 0; 1763330991Savg } 1764330991Savg 1765330991Savg bufoff += tocpy; 1766330991Savg ASSERT(bufoff <= db->db_size); 1767330991Savg if (bufoff == db->db_size) { 1768330991Savg ASSERT(di < numbufs); 1769330991Savg di++; 1770330991Savg bufoff = 0; 1771330991Savg } 1772330991Savg } 1773330991Savg 1774330991Savg#ifdef DEBUG 1775330991Savg /* 1776330991Savg * Three possibilities: 1777330991Savg * - last requested page ends at a buffer boundary and , thus, 1778330991Savg * all pages and buffers have been iterated; 1779330991Savg * - all requested pages are filled, but the last buffer 1780330991Savg * has not been exhausted; 1781330991Savg * the read-ahead is possible only in this case; 1782330991Savg * - all buffers have been read, but the last page has not been 1783330991Savg * fully filled; 1784330991Savg * this is only possible if the file has only a single buffer 1785330991Savg * with a size that is not a multiple of the page size. 1786330991Savg */ 1787330991Savg if (mi == count) { 1788330991Savg ASSERT(di >= numbufs - 1); 1789330991Savg IMPLY(*rahead != 0, di == numbufs - 1); 1790330991Savg IMPLY(*rahead != 0, bufoff != 0); 1791330991Savg ASSERT(pgoff == 0); 1792330991Savg } 1793330991Savg if (di == numbufs) { 1794330991Savg ASSERT(mi >= count - 1); 1795330991Savg ASSERT(*rahead == 0); 1796330991Savg IMPLY(pgoff == 0, mi == count); 1797330991Savg if (pgoff != 0) { 1798330991Savg ASSERT(mi == count - 1); 1799330991Savg ASSERT((dbp[0]->db_size & PAGE_MASK) != 0); 1800330991Savg } 1801330991Savg } 1802330991Savg#endif 1803330991Savg if (pgoff != 0) { 1804330991Savg bzero(va + pgoff, PAGESIZE - pgoff); 1805330991Savg zfs_unmap_page(sf); 1806330991Savg m->valid = VM_PAGE_BITS_ALL; 1807330991Savg } 1808330991Savg 1809330991Savg for (i = 0; i < *rahead; i++) { 1810330991Savg m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i, 1811330991Savg VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY); 1812330991Savg if (m == NULL) 1813330991Savg break; 1814330991Savg if (m->valid != 0) { 1815330991Savg ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); 1816330991Savg break; 1817330991Savg } 1818330991Savg ASSERT(m->dirty == 0); 1819330991Savg ASSERT(!pmap_page_is_mapped(m)); 1820330991Savg 1821330991Savg ASSERT(db->db_size > PAGE_SIZE); 1822330991Savg bufoff = IDX_TO_OFF(m->pindex) % db->db_size; 1823330991Savg tocpy = MIN(db->db_size - bufoff, PAGESIZE); 1824330991Savg va = zfs_map_page(m, &sf); 1825330991Savg bcopy((char *)db->db_data + bufoff, va, tocpy); 1826330991Savg if (tocpy < PAGESIZE) { 1827330991Savg ASSERT(i == *rahead - 1); 1828330991Savg ASSERT((db->db_size & PAGE_MASK) != 0); 1829330991Savg bzero(va + tocpy, PAGESIZE - tocpy); 1830330991Savg } 1831330991Savg zfs_unmap_page(sf); 1832330991Savg m->valid = VM_PAGE_BITS_ALL; 1833330991Savg vm_page_lock(m); 1834330991Savg if ((m->busy_lock & VPB_BIT_WAITERS) != 0) 1835330991Savg vm_page_activate(m); 1836330991Savg else 1837330991Savg vm_page_deactivate(m); 1838330991Savg vm_page_unlock(m); 1839330991Savg } 1840330991Savg *rahead = i; 1841330991Savg zfs_vmobject_wunlock(vmobj); 1842330991Savg 1843330991Savg dmu_buf_rele_array(dbp, numbufs, FTAG); 1844330991Savg return (0); 1845330991Savg} 1846277300Ssmh#endif /* illumos */ 1847277300Ssmh#endif /* _KERNEL */ 1848168404Spjd 1849209962Smm/* 1850209962Smm * Allocate a loaned anonymous arc buffer. 1851209962Smm */ 1852209962Smmarc_buf_t * 1853209962Smmdmu_request_arcbuf(dmu_buf_t *handle, int size) 1854209962Smm{ 1855219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; 1856209962Smm 1857321535Smav return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); 1858209962Smm} 1859209962Smm 1860209962Smm/* 1861209962Smm * Free a loaned arc buffer. 1862209962Smm */ 1863209962Smmvoid 1864209962Smmdmu_return_arcbuf(arc_buf_t *buf) 1865209962Smm{ 1866209962Smm arc_return_buf(buf, FTAG); 1867307265Smav arc_buf_destroy(buf, FTAG); 1868209962Smm} 1869209962Smm 1870209962Smm/* 1871209962Smm * When possible directly assign passed loaned arc buffer to a dbuf. 1872209962Smm * If this is not possible copy the contents of passed arc buf via 1873209962Smm * dmu_write(). 1874209962Smm */ 1875209962Smmvoid 1876339128Smavdmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, 1877209962Smm dmu_tx_t *tx) 1878209962Smm{ 1879209962Smm dmu_buf_impl_t *db; 1880321535Smav uint32_t blksz = (uint32_t)arc_buf_lsize(buf); 1881209962Smm uint64_t blkid; 1882209962Smm 1883209962Smm rw_enter(&dn->dn_struct_rwlock, RW_READER); 1884286705Smav blkid = dbuf_whichblock(dn, 0, offset); 1885209962Smm VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); 1886209962Smm rw_exit(&dn->dn_struct_rwlock); 1887209962Smm 1888272601Sdelphij /* 1889272601Sdelphij * We can only assign if the offset is aligned, the arc buf is the 1890321535Smav * same size as the dbuf, and the dbuf is not metadata. 1891272601Sdelphij */ 1892321535Smav if (offset == db->db.db_offset && blksz == db->db.db_size) { 1893294625Strasz#ifdef _KERNEL 1894294625Strasz curthread->td_ru.ru_oublock++; 1895297633Strasz#ifdef RACCT 1896297633Strasz if (racct_enable) { 1897297633Strasz PROC_LOCK(curproc); 1898297633Strasz racct_add_force(curproc, RACCT_WRITEBPS, blksz); 1899297633Strasz racct_add_force(curproc, RACCT_WRITEIOPS, 1); 1900297633Strasz PROC_UNLOCK(curproc); 1901297633Strasz } 1902297633Strasz#endif /* RACCT */ 1903297633Strasz#endif /* _KERNEL */ 1904209962Smm dbuf_assign_arcbuf(db, buf, tx); 1905209962Smm dbuf_rele(db, FTAG); 1906209962Smm } else { 1907219089Spjd objset_t *os; 1908219089Spjd uint64_t object; 1909219089Spjd 1910321535Smav /* compressed bufs must always be assignable to their dbuf */ 1911321535Smav ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); 1912321535Smav ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); 1913321535Smav 1914219089Spjd os = dn->dn_objset; 1915219089Spjd object = dn->dn_object; 1916219089Spjd 1917209962Smm dbuf_rele(db, FTAG); 1918219089Spjd dmu_write(os, object, offset, blksz, buf->b_data, tx); 1919209962Smm dmu_return_arcbuf(buf); 1920219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_copied); 1921209962Smm } 1922209962Smm} 1923209962Smm 1924339128Smavvoid 1925339128Smavdmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, 1926339128Smav dmu_tx_t *tx) 1927339128Smav{ 1928339128Smav dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; 1929339128Smav 1930339128Smav DB_DNODE_ENTER(dbuf); 1931339128Smav dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); 1932339128Smav DB_DNODE_EXIT(dbuf); 1933339128Smav} 1934339128Smav 1935168404Spjdtypedef struct { 1936219089Spjd dbuf_dirty_record_t *dsa_dr; 1937219089Spjd dmu_sync_cb_t *dsa_done; 1938219089Spjd zgd_t *dsa_zgd; 1939219089Spjd dmu_tx_t *dsa_tx; 1940168404Spjd} dmu_sync_arg_t; 1941168404Spjd 1942168404Spjd/* ARGSUSED */ 1943168404Spjdstatic void 1944185029Spjddmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) 1945185029Spjd{ 1946219089Spjd dmu_sync_arg_t *dsa = varg; 1947219089Spjd dmu_buf_t *db = dsa->dsa_zgd->zgd_db; 1948185029Spjd blkptr_t *bp = zio->io_bp; 1949185029Spjd 1950219089Spjd if (zio->io_error == 0) { 1951219089Spjd if (BP_IS_HOLE(bp)) { 1952219089Spjd /* 1953219089Spjd * A block of zeros may compress to a hole, but the 1954219089Spjd * block size still needs to be known for replay. 1955219089Spjd */ 1956219089Spjd BP_SET_LSIZE(bp, db->db_size); 1957268075Sdelphij } else if (!BP_IS_EMBEDDED(bp)) { 1958219089Spjd ASSERT(BP_GET_LEVEL(bp) == 0); 1959219089Spjd bp->blk_fill = 1; 1960219089Spjd } 1961185029Spjd } 1962185029Spjd} 1963185029Spjd 1964219089Spjdstatic void 1965219089Spjddmu_sync_late_arrival_ready(zio_t *zio) 1966219089Spjd{ 1967219089Spjd dmu_sync_ready(zio, NULL, zio->io_private); 1968219089Spjd} 1969219089Spjd 1970185029Spjd/* ARGSUSED */ 1971185029Spjdstatic void 1972168404Spjddmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) 1973168404Spjd{ 1974219089Spjd dmu_sync_arg_t *dsa = varg; 1975219089Spjd dbuf_dirty_record_t *dr = dsa->dsa_dr; 1976168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 1977168404Spjd 1978168404Spjd mutex_enter(&db->db_mtx); 1979168404Spjd ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); 1980219089Spjd if (zio->io_error == 0) { 1981243524Smm dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); 1982243524Smm if (dr->dt.dl.dr_nopwrite) { 1983243524Smm blkptr_t *bp = zio->io_bp; 1984243524Smm blkptr_t *bp_orig = &zio->io_bp_orig; 1985243524Smm uint8_t chksum = BP_GET_CHECKSUM(bp_orig); 1986243524Smm 1987243524Smm ASSERT(BP_EQUAL(bp, bp_orig)); 1988323748Savg VERIFY(BP_EQUAL(bp, db->db_blkptr)); 1989243524Smm ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); 1990289422Smav ASSERT(zio_checksum_table[chksum].ci_flags & 1991289422Smav ZCHECKSUM_FLAG_NOPWRITE); 1992243524Smm } 1993219089Spjd dr->dt.dl.dr_overridden_by = *zio->io_bp; 1994219089Spjd dr->dt.dl.dr_override_state = DR_OVERRIDDEN; 1995219089Spjd dr->dt.dl.dr_copies = zio->io_prop.zp_copies; 1996286677Smav 1997286677Smav /* 1998286677Smav * Old style holes are filled with all zeros, whereas 1999286677Smav * new-style holes maintain their lsize, type, level, 2000286677Smav * and birth time (see zio_write_compress). While we 2001286677Smav * need to reset the BP_SET_LSIZE() call that happened 2002286677Smav * in dmu_sync_ready for old style holes, we do *not* 2003286677Smav * want to wipe out the information contained in new 2004286677Smav * style holes. Thus, only zero out the block pointer if 2005286677Smav * it's an old style hole. 2006286677Smav */ 2007286677Smav if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && 2008286677Smav dr->dt.dl.dr_overridden_by.blk_birth == 0) 2009219089Spjd BP_ZERO(&dr->dt.dl.dr_overridden_by); 2010219089Spjd } else { 2011219089Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2012219089Spjd } 2013168404Spjd cv_broadcast(&db->db_changed); 2014168404Spjd mutex_exit(&db->db_mtx); 2015168404Spjd 2016219089Spjd dsa->dsa_done(dsa->dsa_zgd, zio->io_error); 2017168404Spjd 2018219089Spjd kmem_free(dsa, sizeof (*dsa)); 2019168404Spjd} 2020168404Spjd 2021219089Spjdstatic void 2022219089Spjddmu_sync_late_arrival_done(zio_t *zio) 2023219089Spjd{ 2024219089Spjd blkptr_t *bp = zio->io_bp; 2025219089Spjd dmu_sync_arg_t *dsa = zio->io_private; 2026243524Smm blkptr_t *bp_orig = &zio->io_bp_orig; 2027219089Spjd 2028219089Spjd if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { 2029323748Savg ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); 2030323748Savg ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); 2031323748Savg ASSERT(zio->io_bp->blk_birth == zio->io_txg); 2032323748Savg ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); 2033323748Savg zio_free(zio->io_spa, zio->io_txg, zio->io_bp); 2034219089Spjd } 2035219089Spjd 2036219089Spjd dmu_tx_commit(dsa->dsa_tx); 2037219089Spjd 2038219089Spjd dsa->dsa_done(dsa->dsa_zgd, zio->io_error); 2039219089Spjd 2040321610Smav abd_put(zio->io_abd); 2041219089Spjd kmem_free(dsa, sizeof (*dsa)); 2042219089Spjd} 2043219089Spjd 2044219089Spjdstatic int 2045219089Spjddmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, 2046268123Sdelphij zio_prop_t *zp, zbookmark_phys_t *zb) 2047219089Spjd{ 2048219089Spjd dmu_sync_arg_t *dsa; 2049219089Spjd dmu_tx_t *tx; 2050219089Spjd 2051219089Spjd tx = dmu_tx_create(os); 2052219089Spjd dmu_tx_hold_space(tx, zgd->zgd_db->db_size); 2053219089Spjd if (dmu_tx_assign(tx, TXG_WAIT) != 0) { 2054219089Spjd dmu_tx_abort(tx); 2055249195Smm /* Make zl_get_data do txg_waited_synced() */ 2056249195Smm return (SET_ERROR(EIO)); 2057219089Spjd } 2058219089Spjd 2059325132Savg /* 2060325132Savg * In order to prevent the zgd's lwb from being free'd prior to 2061325132Savg * dmu_sync_late_arrival_done() being called, we have to ensure 2062325132Savg * the lwb's "max txg" takes this tx's txg into account. 2063325132Savg */ 2064325132Savg zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); 2065325132Savg 2066219089Spjd dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 2067219089Spjd dsa->dsa_dr = NULL; 2068219089Spjd dsa->dsa_done = done; 2069219089Spjd dsa->dsa_zgd = zgd; 2070219089Spjd dsa->dsa_tx = tx; 2071219089Spjd 2072323748Savg /* 2073323748Savg * Since we are currently syncing this txg, it's nontrivial to 2074323748Savg * determine what BP to nopwrite against, so we disable nopwrite. 2075323748Savg * 2076323748Savg * When syncing, the db_blkptr is initially the BP of the previous 2077323748Savg * txg. We can not nopwrite against it because it will be changed 2078323748Savg * (this is similar to the non-late-arrival case where the dbuf is 2079323748Savg * dirty in a future txg). 2080323748Savg * 2081323748Savg * Then dbuf_write_ready() sets bp_blkptr to the location we will write. 2082323748Savg * We can not nopwrite against it because although the BP will not 2083323748Savg * (typically) be changed, the data has not yet been persisted to this 2084323748Savg * location. 2085323748Savg * 2086323748Savg * Finally, when dbuf_write_done() is called, it is theoretically 2087323748Savg * possible to always nopwrite, because the data that was written in 2088323748Savg * this txg is the same data that we are trying to write. However we 2089323748Savg * would need to check that this dbuf is not dirty in any future 2090323748Savg * txg's (as we do in the normal dmu_sync() path). For simplicity, we 2091323748Savg * don't nopwrite in this case. 2092323748Savg */ 2093323748Savg zp->zp_nopwrite = B_FALSE; 2094323748Savg 2095321535Smav zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, 2096321610Smav abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), 2097321610Smav zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, 2098321610Smav dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, 2099321610Smav dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); 2100219089Spjd 2101219089Spjd return (0); 2102219089Spjd} 2103219089Spjd 2104168404Spjd/* 2105168404Spjd * Intent log support: sync the block associated with db to disk. 2106168404Spjd * N.B. and XXX: the caller is responsible for making sure that the 2107168404Spjd * data isn't changing while dmu_sync() is writing it. 2108168404Spjd * 2109168404Spjd * Return values: 2110168404Spjd * 2111243524Smm * EEXIST: this txg has already been synced, so there's nothing to do. 2112168404Spjd * The caller should not log the write. 2113168404Spjd * 2114168404Spjd * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 2115168404Spjd * The caller should not log the write. 2116168404Spjd * 2117168404Spjd * EALREADY: this block is already in the process of being synced. 2118168404Spjd * The caller should track its progress (somehow). 2119168404Spjd * 2120219089Spjd * EIO: could not do the I/O. 2121219089Spjd * The caller should do a txg_wait_synced(). 2122168404Spjd * 2123219089Spjd * 0: the I/O has been initiated. 2124219089Spjd * The caller should log this blkptr in the done callback. 2125219089Spjd * It is possible that the I/O will fail, in which case 2126219089Spjd * the error will be reported to the done callback and 2127219089Spjd * propagated to pio from zio_done(). 2128168404Spjd */ 2129168404Spjdint 2130219089Spjddmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) 2131168404Spjd{ 2132219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; 2133219089Spjd objset_t *os = db->db_objset; 2134219089Spjd dsl_dataset_t *ds = os->os_dsl_dataset; 2135168404Spjd dbuf_dirty_record_t *dr; 2136219089Spjd dmu_sync_arg_t *dsa; 2137268123Sdelphij zbookmark_phys_t zb; 2138219089Spjd zio_prop_t zp; 2139219089Spjd dnode_t *dn; 2140168404Spjd 2141219089Spjd ASSERT(pio != NULL); 2142168404Spjd ASSERT(txg != 0); 2143168404Spjd 2144219089Spjd SET_BOOKMARK(&zb, ds->ds_object, 2145219089Spjd db->db.db_object, db->db_level, db->db_blkid); 2146168404Spjd 2147219089Spjd DB_DNODE_ENTER(db); 2148219089Spjd dn = DB_DNODE(db); 2149321573Smav dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); 2150219089Spjd DB_DNODE_EXIT(db); 2151219089Spjd 2152168404Spjd /* 2153219089Spjd * If we're frozen (running ziltest), we always need to generate a bp. 2154168404Spjd */ 2155219089Spjd if (txg > spa_freeze_txg(os->os_spa)) 2156219089Spjd return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); 2157168404Spjd 2158168404Spjd /* 2159219089Spjd * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() 2160219089Spjd * and us. If we determine that this txg is not yet syncing, 2161219089Spjd * but it begins to sync a moment later, that's OK because the 2162219089Spjd * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. 2163168404Spjd */ 2164219089Spjd mutex_enter(&db->db_mtx); 2165219089Spjd 2166219089Spjd if (txg <= spa_last_synced_txg(os->os_spa)) { 2167168404Spjd /* 2168219089Spjd * This txg has already synced. There's nothing to do. 2169168404Spjd */ 2170219089Spjd mutex_exit(&db->db_mtx); 2171249195Smm return (SET_ERROR(EEXIST)); 2172168404Spjd } 2173168404Spjd 2174219089Spjd if (txg <= spa_syncing_txg(os->os_spa)) { 2175219089Spjd /* 2176219089Spjd * This txg is currently syncing, so we can't mess with 2177219089Spjd * the dirty record anymore; just write a new log block. 2178219089Spjd */ 2179219089Spjd mutex_exit(&db->db_mtx); 2180219089Spjd return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); 2181168404Spjd } 2182168404Spjd 2183168404Spjd dr = db->db_last_dirty; 2184219089Spjd while (dr && dr->dr_txg != txg) 2185168404Spjd dr = dr->dr_next; 2186219089Spjd 2187219089Spjd if (dr == NULL) { 2188168404Spjd /* 2189219089Spjd * There's no dr for this dbuf, so it must have been freed. 2190168404Spjd * There's no need to log writes to freed blocks, so we're done. 2191168404Spjd */ 2192168404Spjd mutex_exit(&db->db_mtx); 2193249195Smm return (SET_ERROR(ENOENT)); 2194168404Spjd } 2195168404Spjd 2196243524Smm ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); 2197243524Smm 2198323748Savg if (db->db_blkptr != NULL) { 2199323748Savg /* 2200323748Savg * We need to fill in zgd_bp with the current blkptr so that 2201323748Savg * the nopwrite code can check if we're writing the same 2202323748Savg * data that's already on disk. We can only nopwrite if we 2203323748Savg * are sure that after making the copy, db_blkptr will not 2204323748Savg * change until our i/o completes. We ensure this by 2205323748Savg * holding the db_mtx, and only allowing nopwrite if the 2206323748Savg * block is not already dirty (see below). This is verified 2207323748Savg * by dmu_sync_done(), which VERIFYs that the db_blkptr has 2208323748Savg * not changed. 2209323748Savg */ 2210323748Savg *zgd->zgd_bp = *db->db_blkptr; 2211323748Savg } 2212323748Savg 2213243524Smm /* 2214286589Smav * Assume the on-disk data is X, the current syncing data (in 2215286589Smav * txg - 1) is Y, and the current in-memory data is Z (currently 2216286589Smav * in dmu_sync). 2217286589Smav * 2218286589Smav * We usually want to perform a nopwrite if X and Z are the 2219286589Smav * same. However, if Y is different (i.e. the BP is going to 2220286589Smav * change before this write takes effect), then a nopwrite will 2221286589Smav * be incorrect - we would override with X, which could have 2222286589Smav * been freed when Y was written. 2223286589Smav * 2224286589Smav * (Note that this is not a concern when we are nop-writing from 2225286589Smav * syncing context, because X and Y must be identical, because 2226286589Smav * all previous txgs have been synced.) 2227286589Smav * 2228286589Smav * Therefore, we disable nopwrite if the current BP could change 2229286589Smav * before this TXG. There are two ways it could change: by 2230286589Smav * being dirty (dr_next is non-NULL), or by being freed 2231286589Smav * (dnode_block_freed()). This behavior is verified by 2232286589Smav * zio_done(), which VERIFYs that the override BP is identical 2233286589Smav * to the on-disk BP. 2234243524Smm */ 2235286589Smav DB_DNODE_ENTER(db); 2236286589Smav dn = DB_DNODE(db); 2237286589Smav if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) 2238243524Smm zp.zp_nopwrite = B_FALSE; 2239286589Smav DB_DNODE_EXIT(db); 2240243524Smm 2241168404Spjd ASSERT(dr->dr_txg == txg); 2242219089Spjd if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || 2243219089Spjd dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2244168404Spjd /* 2245219089Spjd * We have already issued a sync write for this buffer, 2246219089Spjd * or this buffer has already been synced. It could not 2247219089Spjd * have been dirtied since, or we would have cleared the state. 2248168404Spjd */ 2249168404Spjd mutex_exit(&db->db_mtx); 2250249195Smm return (SET_ERROR(EALREADY)); 2251168404Spjd } 2252168404Spjd 2253219089Spjd ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2254168404Spjd dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; 2255168404Spjd mutex_exit(&db->db_mtx); 2256168404Spjd 2257219089Spjd dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 2258219089Spjd dsa->dsa_dr = dr; 2259219089Spjd dsa->dsa_done = done; 2260219089Spjd dsa->dsa_zgd = zgd; 2261219089Spjd dsa->dsa_tx = NULL; 2262168404Spjd 2263219089Spjd zio_nowait(arc_write(pio, os->os_spa, txg, 2264323748Savg zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), 2265307265Smav &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, 2266304138Savg ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); 2267185029Spjd 2268219089Spjd return (0); 2269168404Spjd} 2270168404Spjd 2271168404Spjdint 2272168404Spjddmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, 2273289562Smav dmu_tx_t *tx) 2274168404Spjd{ 2275168404Spjd dnode_t *dn; 2276168404Spjd int err; 2277168404Spjd 2278219089Spjd err = dnode_hold(os, object, FTAG, &dn); 2279168404Spjd if (err) 2280168404Spjd return (err); 2281168404Spjd err = dnode_set_blksz(dn, size, ibs, tx); 2282168404Spjd dnode_rele(dn, FTAG); 2283168404Spjd return (err); 2284168404Spjd} 2285168404Spjd 2286168404Spjdvoid 2287168404Spjddmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, 2288289562Smav dmu_tx_t *tx) 2289168404Spjd{ 2290168404Spjd dnode_t *dn; 2291168404Spjd 2292268075Sdelphij /* 2293268075Sdelphij * Send streams include each object's checksum function. This 2294268075Sdelphij * check ensures that the receiving system can understand the 2295268075Sdelphij * checksum function transmitted. 2296268075Sdelphij */ 2297268075Sdelphij ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); 2298268075Sdelphij 2299268075Sdelphij VERIFY0(dnode_hold(os, object, FTAG, &dn)); 2300268075Sdelphij ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); 2301168404Spjd dn->dn_checksum = checksum; 2302168404Spjd dnode_setdirty(dn, tx); 2303168404Spjd dnode_rele(dn, FTAG); 2304168404Spjd} 2305168404Spjd 2306168404Spjdvoid 2307168404Spjddmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, 2308289562Smav dmu_tx_t *tx) 2309168404Spjd{ 2310168404Spjd dnode_t *dn; 2311168404Spjd 2312268075Sdelphij /* 2313268075Sdelphij * Send streams include each object's compression function. This 2314268075Sdelphij * check ensures that the receiving system can understand the 2315268075Sdelphij * compression function transmitted. 2316268075Sdelphij */ 2317268075Sdelphij ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); 2318268075Sdelphij 2319268075Sdelphij VERIFY0(dnode_hold(os, object, FTAG, &dn)); 2320168404Spjd dn->dn_compress = compress; 2321168404Spjd dnode_setdirty(dn, tx); 2322168404Spjd dnode_rele(dn, FTAG); 2323168404Spjd} 2324168404Spjd 2325219089Spjdint zfs_mdcomp_disable = 0; 2326267992ShselaskySYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, 2327219089Spjd &zfs_mdcomp_disable, 0, "Disable metadata compression"); 2328219089Spjd 2329266771Sdelphij/* 2330266771Sdelphij * When the "redundant_metadata" property is set to "most", only indirect 2331266771Sdelphij * blocks of this level and higher will have an additional ditto block. 2332266771Sdelphij */ 2333266771Sdelphijint zfs_redundant_metadata_most_ditto_level = 2; 2334266771Sdelphij 2335219089Spjdvoid 2336321573Smavdmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) 2337219089Spjd{ 2338219089Spjd dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; 2339236884Smm boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || 2340219089Spjd (wp & WP_SPILL)); 2341219089Spjd enum zio_checksum checksum = os->os_checksum; 2342219089Spjd enum zio_compress compress = os->os_compress; 2343219089Spjd enum zio_checksum dedup_checksum = os->os_dedup_checksum; 2344243524Smm boolean_t dedup = B_FALSE; 2345243524Smm boolean_t nopwrite = B_FALSE; 2346219089Spjd boolean_t dedup_verify = os->os_dedup_verify; 2347219089Spjd int copies = os->os_copies; 2348219089Spjd 2349219089Spjd /* 2350243524Smm * We maintain different write policies for each of the following 2351243524Smm * types of data: 2352243524Smm * 1. metadata 2353243524Smm * 2. preallocated blocks (i.e. level-0 blocks of a dump device) 2354243524Smm * 3. all other level 0 blocks 2355219089Spjd */ 2356219089Spjd if (ismd) { 2357268126Sdelphij if (zfs_mdcomp_disable) { 2358268126Sdelphij compress = ZIO_COMPRESS_EMPTY; 2359268126Sdelphij } else { 2360286547Smav /* 2361286547Smav * XXX -- we should design a compression algorithm 2362286547Smav * that specializes in arrays of bps. 2363286547Smav */ 2364286547Smav compress = zio_compress_select(os->os_spa, 2365286547Smav ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); 2366268126Sdelphij } 2367268126Sdelphij 2368243524Smm /* 2369219089Spjd * Metadata always gets checksummed. If the data 2370219089Spjd * checksum is multi-bit correctable, and it's not a 2371219089Spjd * ZBT-style checksum, then it's suitable for metadata 2372219089Spjd * as well. Otherwise, the metadata checksum defaults 2373219089Spjd * to fletcher4. 2374219089Spjd */ 2375289422Smav if (!(zio_checksum_table[checksum].ci_flags & 2376289422Smav ZCHECKSUM_FLAG_METADATA) || 2377289422Smav (zio_checksum_table[checksum].ci_flags & 2378289422Smav ZCHECKSUM_FLAG_EMBEDDED)) 2379219089Spjd checksum = ZIO_CHECKSUM_FLETCHER_4; 2380266771Sdelphij 2381266771Sdelphij if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || 2382266771Sdelphij (os->os_redundant_metadata == 2383266771Sdelphij ZFS_REDUNDANT_METADATA_MOST && 2384266771Sdelphij (level >= zfs_redundant_metadata_most_ditto_level || 2385266771Sdelphij DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) 2386266771Sdelphij copies++; 2387243524Smm } else if (wp & WP_NOFILL) { 2388243524Smm ASSERT(level == 0); 2389219089Spjd 2390219089Spjd /* 2391243524Smm * If we're writing preallocated blocks, we aren't actually 2392243524Smm * writing them so don't set any policy properties. These 2393243524Smm * blocks are currently only used by an external subsystem 2394243524Smm * outside of zfs (i.e. dump) and not written by the zio 2395243524Smm * pipeline. 2396219089Spjd */ 2397243524Smm compress = ZIO_COMPRESS_OFF; 2398255750Sdelphij checksum = ZIO_CHECKSUM_NOPARITY; 2399219089Spjd } else { 2400286547Smav compress = zio_compress_select(os->os_spa, dn->dn_compress, 2401286547Smav compress); 2402219089Spjd 2403243524Smm checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? 2404243524Smm zio_checksum_select(dn->dn_checksum, checksum) : 2405243524Smm dedup_checksum; 2406219089Spjd 2407243524Smm /* 2408243524Smm * Determine dedup setting. If we are in dmu_sync(), 2409243524Smm * we won't actually dedup now because that's all 2410243524Smm * done in syncing context; but we do want to use the 2411243524Smm * dedup checkum. If the checksum is not strong 2412243524Smm * enough to ensure unique signatures, force 2413243524Smm * dedup_verify. 2414243524Smm */ 2415243524Smm if (dedup_checksum != ZIO_CHECKSUM_OFF) { 2416243524Smm dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; 2417289422Smav if (!(zio_checksum_table[checksum].ci_flags & 2418289422Smav ZCHECKSUM_FLAG_DEDUP)) 2419243524Smm dedup_verify = B_TRUE; 2420243524Smm } 2421219089Spjd 2422243524Smm /* 2423289422Smav * Enable nopwrite if we have secure enough checksum 2424289422Smav * algorithm (see comment in zio_nop_write) and 2425289422Smav * compression is enabled. We don't enable nopwrite if 2426289422Smav * dedup is enabled as the two features are mutually 2427289422Smav * exclusive. 2428243524Smm */ 2429289422Smav nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & 2430289422Smav ZCHECKSUM_FLAG_NOPWRITE) && 2431243524Smm compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); 2432219089Spjd } 2433219089Spjd 2434219089Spjd zp->zp_checksum = checksum; 2435321573Smav zp->zp_compress = compress; 2436321535Smav ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); 2437321535Smav 2438219089Spjd zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; 2439219089Spjd zp->zp_level = level; 2440266771Sdelphij zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); 2441219089Spjd zp->zp_dedup = dedup; 2442219089Spjd zp->zp_dedup_verify = dedup && dedup_verify; 2443243524Smm zp->zp_nopwrite = nopwrite; 2444219089Spjd} 2445219089Spjd 2446168404Spjdint 2447168404Spjddmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) 2448168404Spjd{ 2449168404Spjd dnode_t *dn; 2450287103Savg int err; 2451168404Spjd 2452168404Spjd /* 2453168404Spjd * Sync any current changes before 2454168404Spjd * we go trundling through the block pointers. 2455168404Spjd */ 2456287103Savg err = dmu_object_wait_synced(os, object); 2457287103Savg if (err) { 2458287103Savg return (err); 2459168404Spjd } 2460287103Savg 2461287103Savg err = dnode_hold(os, object, FTAG, &dn); 2462287103Savg if (err) { 2463287103Savg return (err); 2464168404Spjd } 2465168404Spjd 2466185029Spjd err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); 2467168404Spjd dnode_rele(dn, FTAG); 2468168404Spjd 2469168404Spjd return (err); 2470168404Spjd} 2471168404Spjd 2472287103Savg/* 2473287103Savg * Given the ZFS object, if it contains any dirty nodes 2474287103Savg * this function flushes all dirty blocks to disk. This 2475287103Savg * ensures the DMU object info is updated. A more efficient 2476287103Savg * future version might just find the TXG with the maximum 2477287103Savg * ID and wait for that to be synced. 2478287103Savg */ 2479287103Savgint 2480289562Smavdmu_object_wait_synced(objset_t *os, uint64_t object) 2481289562Smav{ 2482287103Savg dnode_t *dn; 2483287103Savg int error, i; 2484287103Savg 2485287103Savg error = dnode_hold(os, object, FTAG, &dn); 2486287103Savg if (error) { 2487287103Savg return (error); 2488287103Savg } 2489287103Savg 2490287103Savg for (i = 0; i < TXG_SIZE; i++) { 2491287103Savg if (list_link_active(&dn->dn_dirty_link[i])) { 2492287103Savg break; 2493287103Savg } 2494287103Savg } 2495287103Savg dnode_rele(dn, FTAG); 2496287103Savg if (i != TXG_SIZE) { 2497287103Savg txg_wait_synced(dmu_objset_pool(os), 0); 2498287103Savg } 2499287103Savg 2500287103Savg return (0); 2501287103Savg} 2502287103Savg 2503168404Spjdvoid 2504168404Spjddmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) 2505168404Spjd{ 2506219089Spjd dnode_phys_t *dnp; 2507219089Spjd 2508168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 2509168404Spjd mutex_enter(&dn->dn_mtx); 2510168404Spjd 2511219089Spjd dnp = dn->dn_phys; 2512219089Spjd 2513168404Spjd doi->doi_data_block_size = dn->dn_datablksz; 2514168404Spjd doi->doi_metadata_block_size = dn->dn_indblkshift ? 2515168404Spjd 1ULL << dn->dn_indblkshift : 0; 2516219089Spjd doi->doi_type = dn->dn_type; 2517219089Spjd doi->doi_bonus_type = dn->dn_bonustype; 2518219089Spjd doi->doi_bonus_size = dn->dn_bonuslen; 2519168404Spjd doi->doi_indirection = dn->dn_nlevels; 2520168404Spjd doi->doi_checksum = dn->dn_checksum; 2521168404Spjd doi->doi_compress = dn->dn_compress; 2522272810Sdelphij doi->doi_nblkptr = dn->dn_nblkptr; 2523219089Spjd doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; 2524247852Smm doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; 2525219089Spjd doi->doi_fill_count = 0; 2526219089Spjd for (int i = 0; i < dnp->dn_nblkptr; i++) 2527268075Sdelphij doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); 2528168404Spjd 2529168404Spjd mutex_exit(&dn->dn_mtx); 2530168404Spjd rw_exit(&dn->dn_struct_rwlock); 2531168404Spjd} 2532168404Spjd 2533168404Spjd/* 2534168404Spjd * Get information on a DMU object. 2535168404Spjd * If doi is NULL, just indicates whether the object exists. 2536168404Spjd */ 2537168404Spjdint 2538168404Spjddmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) 2539168404Spjd{ 2540168404Spjd dnode_t *dn; 2541219089Spjd int err = dnode_hold(os, object, FTAG, &dn); 2542168404Spjd 2543168404Spjd if (err) 2544168404Spjd return (err); 2545168404Spjd 2546168404Spjd if (doi != NULL) 2547168404Spjd dmu_object_info_from_dnode(dn, doi); 2548168404Spjd 2549168404Spjd dnode_rele(dn, FTAG); 2550168404Spjd return (0); 2551168404Spjd} 2552168404Spjd 2553168404Spjd/* 2554168404Spjd * As above, but faster; can be used when you have a held dbuf in hand. 2555168404Spjd */ 2556168404Spjdvoid 2557219089Spjddmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) 2558168404Spjd{ 2559219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2560219089Spjd 2561219089Spjd DB_DNODE_ENTER(db); 2562219089Spjd dmu_object_info_from_dnode(DB_DNODE(db), doi); 2563219089Spjd DB_DNODE_EXIT(db); 2564168404Spjd} 2565168404Spjd 2566168404Spjd/* 2567168404Spjd * Faster still when you only care about the size. 2568168404Spjd * This is specifically optimized for zfs_getattr(). 2569168404Spjd */ 2570168404Spjdvoid 2571219089Spjddmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, 2572219089Spjd u_longlong_t *nblk512) 2573168404Spjd{ 2574219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2575219089Spjd dnode_t *dn; 2576168404Spjd 2577219089Spjd DB_DNODE_ENTER(db); 2578219089Spjd dn = DB_DNODE(db); 2579219089Spjd 2580168404Spjd *blksize = dn->dn_datablksz; 2581168404Spjd /* add 1 for dnode space */ 2582168404Spjd *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> 2583168404Spjd SPA_MINBLOCKSHIFT) + 1; 2584219089Spjd DB_DNODE_EXIT(db); 2585168404Spjd} 2586168404Spjd 2587168404Spjdvoid 2588168404Spjdbyteswap_uint64_array(void *vbuf, size_t size) 2589168404Spjd{ 2590168404Spjd uint64_t *buf = vbuf; 2591168404Spjd size_t count = size >> 3; 2592168404Spjd int i; 2593168404Spjd 2594168404Spjd ASSERT((size & 7) == 0); 2595168404Spjd 2596168404Spjd for (i = 0; i < count; i++) 2597168404Spjd buf[i] = BSWAP_64(buf[i]); 2598168404Spjd} 2599168404Spjd 2600168404Spjdvoid 2601168404Spjdbyteswap_uint32_array(void *vbuf, size_t size) 2602168404Spjd{ 2603168404Spjd uint32_t *buf = vbuf; 2604168404Spjd size_t count = size >> 2; 2605168404Spjd int i; 2606168404Spjd 2607168404Spjd ASSERT((size & 3) == 0); 2608168404Spjd 2609168404Spjd for (i = 0; i < count; i++) 2610168404Spjd buf[i] = BSWAP_32(buf[i]); 2611168404Spjd} 2612168404Spjd 2613168404Spjdvoid 2614168404Spjdbyteswap_uint16_array(void *vbuf, size_t size) 2615168404Spjd{ 2616168404Spjd uint16_t *buf = vbuf; 2617168404Spjd size_t count = size >> 1; 2618168404Spjd int i; 2619168404Spjd 2620168404Spjd ASSERT((size & 1) == 0); 2621168404Spjd 2622168404Spjd for (i = 0; i < count; i++) 2623168404Spjd buf[i] = BSWAP_16(buf[i]); 2624168404Spjd} 2625168404Spjd 2626168404Spjd/* ARGSUSED */ 2627168404Spjdvoid 2628168404Spjdbyteswap_uint8_array(void *vbuf, size_t size) 2629168404Spjd{ 2630168404Spjd} 2631168404Spjd 2632168404Spjdvoid 2633168404Spjddmu_init(void) 2634168404Spjd{ 2635321610Smav abd_init(); 2636219089Spjd zfs_dbgmsg_init(); 2637219089Spjd sa_cache_init(); 2638219089Spjd xuio_stat_init(); 2639219089Spjd dmu_objset_init(); 2640219089Spjd dnode_init(); 2641208130Smm zfetch_init(); 2642254608Sgibbs zio_compress_init(); 2643239620Smm l2arc_init(); 2644168404Spjd arc_init(); 2645307265Smav dbuf_init(); 2646168404Spjd} 2647168404Spjd 2648168404Spjdvoid 2649168404Spjddmu_fini(void) 2650168404Spjd{ 2651251629Sdelphij arc_fini(); /* arc depends on l2arc, so arc must go first */ 2652219089Spjd l2arc_fini(); 2653208130Smm zfetch_fini(); 2654254608Sgibbs zio_compress_fini(); 2655219089Spjd dbuf_fini(); 2656168404Spjd dnode_fini(); 2657219089Spjd dmu_objset_fini(); 2658219089Spjd xuio_stat_fini(); 2659219089Spjd sa_cache_fini(); 2660219089Spjd zfs_dbgmsg_fini(); 2661321610Smav abd_fini(); 2662168404Spjd} 2663