zap_micro.c revision 185029
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
2450723Scg */ 2550723Scg 2650723Scg#pragma ident "%Z%%M% %I% %E% SMI" 2750723Scg 2850723Scg#include <sys/spa.h> 2950723Scg#include <sys/dmu.h> 3050723Scg#include <sys/zfs_context.h> 3150959Speter#include <sys/zap.h> 3229415Sjmg#include <sys/refcount.h> 3329415Sjmg#include <sys/zap_impl.h> 3453465Scg#include <sys/zap_leaf.h> 3529415Sjmg#include <sys/avl.h> 3653465Scg 3753553Stanimura#ifdef _KERNEL 3829415Sjmg#include <sys/sunddi.h> 3967803Scg#endif 4055706Scg 4155254Scgstatic int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); 4250723Scg 4367803Scg 4467803Scgstatic uint64_t 4567803Scgzap_hash(zap_t *zap, const char *normname) 4667803Scg{ 4767803Scg const uint8_t *cp; 4867803Scg uint8_t c; 4967803Scg uint64_t crc = zap->zap_salt; 5067803Scg 5167803Scg /* NB: name must already be normalized, if necessary */ 5229415Sjmg 5367803Scg ASSERT(crc != 0); 5450723Scg ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 5550723Scg for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { 5664881Scg crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 5750723Scg } 5867803Scg 5929415Sjmg /* 6067803Scg * Only use 28 bits, since we need 4 bits in the cookie for the 6164881Scg * collision differentiator. We MUST use the high bits, since 6250723Scg * those are the ones that we first pay attention to when 6364881Scg * chosing the bucket. 
6450723Scg */ 6567803Scg crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 6629415Sjmg 6764881Scg return (crc); 6864881Scg} 6964881Scg 7064881Scgstatic int 7164881Scgzap_normalize(zap_t *zap, const char *name, char *namenorm) 7264881Scg{ 7354462Scg size_t inlen, outlen; 7464881Scg int err; 7554462Scg 7650723Scg inlen = strlen(name) + 1; 7767803Scg outlen = ZAP_MAXNAMELEN; 7867803Scg 7967803Scg err = 0; 8067803Scg (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, 8167803Scg zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, 8267803Scg &err); 8367803Scg 8467803Scg return (err); 8565340Scg} 8667803Scg 8767803Scgboolean_t 8865340Scgzap_match(zap_name_t *zn, const char *matchname) 8965340Scg{ 9065340Scg if (zn->zn_matchtype == MT_FIRST) { 9165340Scg char norm[ZAP_MAXNAMELEN]; 9265340Scg 9350723Scg if (zap_normalize(zn->zn_zap, matchname, norm) != 0) 9429415Sjmg return (B_FALSE); 9550723Scg 9629415Sjmg return (strcmp(zn->zn_name_norm, norm) == 0); 9750723Scg } else { 9850723Scg /* MT_BEST or MT_EXACT */ 9950723Scg return (strcmp(zn->zn_name_orij, matchname) == 0); 10050723Scg } 10167803Scg} 10267803Scg 10350723Scgvoid 10429415Sjmgzap_name_free(zap_name_t *zn) 10550723Scg{ 10650723Scg kmem_free(zn, sizeof (zap_name_t)); 10750723Scg} 10854462Scg 10954462Scg/* XXX combine this with zap_lockdir()? 
*/ 11065644Scgzap_name_t * 11155706Scgzap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) 11229415Sjmg{ 11350723Scg zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); 11450723Scg 11567803Scg zn->zn_zap = zap; 11650723Scg zn->zn_name_orij = name; 11750723Scg zn->zn_matchtype = mt; 11850723Scg if (zap->zap_normflags) { 11950723Scg if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { 12050723Scg zap_name_free(zn); 12150723Scg return (NULL); 12250723Scg } 12367803Scg zn->zn_name_norm = zn->zn_normbuf; 12450723Scg } else { 12550723Scg if (mt != MT_EXACT) { 12650723Scg zap_name_free(zn); 12750723Scg return (NULL); 12854462Scg } 12929415Sjmg zn->zn_name_norm = zn->zn_name_orij; 13050723Scg } 13150723Scg 13267803Scg zn->zn_hash = zap_hash(zap, zn->zn_name_norm); 13367803Scg return (zn); 13467803Scg} 13550723Scg 13667803Scgstatic void 13767803Scgmzap_byteswap(mzap_phys_t *buf, size_t size) 13867803Scg{ 13965340Scg int i, max; 14067652Scg buf->mz_block_type = BSWAP_64(buf->mz_block_type); 14167803Scg buf->mz_salt = BSWAP_64(buf->mz_salt); 14267803Scg buf->mz_normflags = BSWAP_64(buf->mz_normflags); 14350723Scg max = (size / MZAP_ENT_LEN) - 1; 14450723Scg for (i = 0; i < max; i++) { 14550723Scg buf->mz_chunk[i].mze_value = 14650723Scg BSWAP_64(buf->mz_chunk[i].mze_value); 14729415Sjmg buf->mz_chunk[i].mze_cd = 14850723Scg BSWAP_32(buf->mz_chunk[i].mze_cd); 14929415Sjmg } 15050723Scg} 15150723Scg 15250723Scgvoid 15350723Scgzap_byteswap(void *buf, size_t size) 15429415Sjmg{ 15529415Sjmg uint64_t block_type; 15629415Sjmg 15750723Scg block_type = *(uint64_t *)buf; 15829415Sjmg 15967803Scg if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { 16050723Scg /* ASSERT(magic == ZAP_LEAF_MAGIC); */ 16129415Sjmg mzap_byteswap(buf, size); 16250723Scg } else { 16350723Scg fzap_byteswap(buf, size); 16450723Scg } 16567803Scg} 16629415Sjmg 16729415Sjmgstatic int 16829415Sjmgmze_compare(const void *arg1, const void *arg2) 16950723Scg{ 17029415Sjmg const 
mzap_ent_t *mze1 = arg1; 17150723Scg const mzap_ent_t *mze2 = arg2; 17250723Scg 17329415Sjmg if (mze1->mze_hash > mze2->mze_hash) 17450723Scg return (+1); 17550723Scg if (mze1->mze_hash < mze2->mze_hash) 17650723Scg return (-1); 17750723Scg if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) 17829415Sjmg return (+1); 17929415Sjmg if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) 18050723Scg return (-1); 18150723Scg return (0); 18250723Scg} 18350723Scg 18450723Scgstatic void 18529415Sjmgmze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) 18629415Sjmg{ 18750723Scg mzap_ent_t *mze; 18829415Sjmg 18950723Scg ASSERT(zap->zap_ismicro); 19029415Sjmg ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 19150723Scg ASSERT(mzep->mze_cd < ZAP_MAXCD); 19250723Scg 19350723Scg mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); 19450723Scg mze->mze_chunkid = chunkid; 19550723Scg mze->mze_hash = hash; 19650723Scg mze->mze_phys = *mzep; 19750723Scg avl_add(&zap->zap_m.zap_avl, mze); 19850723Scg} 19950723Scg 20050723Scgstatic mzap_ent_t * 20129415Sjmgmze_find(zap_name_t *zn) 20250723Scg{ 20350723Scg mzap_ent_t mze_tofind; 20450723Scg mzap_ent_t *mze; 20550723Scg avl_index_t idx; 20650723Scg avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; 20750723Scg 20850723Scg ASSERT(zn->zn_zap->zap_ismicro); 20950723Scg ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); 21029415Sjmg 21167803Scg if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) 21250723Scg return (NULL); 21350723Scg 21450723Scg mze_tofind.mze_hash = zn->zn_hash; 21550723Scg mze_tofind.mze_phys.mze_cd = 0; 21650723Scg 21750723Scgagain: 21850723Scg mze = avl_find(avl, &mze_tofind, &idx); 21950723Scg if (mze == NULL) 22050723Scg mze = avl_nearest(avl, idx, AVL_AFTER); 22150723Scg for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { 22267803Scg if (zap_match(zn, mze->mze_phys.mze_name)) 22329415Sjmg return (mze); 22450723Scg } 22550723Scg if (zn->zn_matchtype == MT_BEST) { 22650723Scg 
zn->zn_matchtype = MT_FIRST; 22750723Scg goto again; 22850723Scg } 22950723Scg return (NULL); 23050723Scg} 23150723Scg 23250723Scgstatic uint32_t 23350723Scgmze_find_unused_cd(zap_t *zap, uint64_t hash) 23450723Scg{ 23529415Sjmg mzap_ent_t mze_tofind; 23650723Scg mzap_ent_t *mze; 23750723Scg avl_index_t idx; 23850723Scg avl_tree_t *avl = &zap->zap_m.zap_avl; 23950723Scg uint32_t cd; 24050723Scg 24150723Scg ASSERT(zap->zap_ismicro); 24250723Scg ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 24350723Scg 24429415Sjmg mze_tofind.mze_hash = hash; 24550723Scg mze_tofind.mze_phys.mze_cd = 0; 24650723Scg 24750723Scg cd = 0; 24850723Scg for (mze = avl_find(avl, &mze_tofind, &idx); 24950723Scg mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { 25050723Scg if (mze->mze_phys.mze_cd != cd) 25150723Scg break; 25231361Sjmg cd++; 25350723Scg } 25450723Scg 25550723Scg return (cd); 25650723Scg} 25750723Scg 25829415Sjmgstatic void 25950723Scgmze_remove(zap_t *zap, mzap_ent_t *mze) 26050723Scg{ 26150723Scg ASSERT(zap->zap_ismicro); 26250723Scg ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 26350723Scg 26450723Scg avl_remove(&zap->zap_m.zap_avl, mze); 26529415Sjmg kmem_free(mze, sizeof (mzap_ent_t)); 26650723Scg} 26750723Scg 26829415Sjmgstatic void 26950723Scgmze_destroy(zap_t *zap) 27050723Scg{ 27150723Scg mzap_ent_t *mze; 27250723Scg void *avlcookie = NULL; 27329415Sjmg 27450723Scg while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) 27550723Scg kmem_free(mze, sizeof (mzap_ent_t)); 27650723Scg avl_destroy(&zap->zap_m.zap_avl); 27750723Scg} 27850723Scg 27950723Scgstatic zap_t * 28050723Scgmzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) 28150723Scg{ 28229415Sjmg zap_t *winner; 28350723Scg zap_t *zap; 28450723Scg int i; 28529415Sjmg 28650723Scg ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); 28750723Scg 28850723Scg zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); 28950723Scg rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0); 29050723Scg rw_enter(&zap->zap_rwlock, RW_WRITER); 
29150723Scg zap->zap_objset = os; 29250723Scg zap->zap_object = obj; 29350723Scg zap->zap_dbuf = db; 29450723Scg 29529415Sjmg if (*(uint64_t *)db->db_data != ZBT_MICRO) { 29629415Sjmg mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, 29767803Scg MUTEX_DEFAULT, 0); 29867803Scg zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; 29967803Scg } else { 30067803Scg zap->zap_ismicro = TRUE; 30167803Scg } 30267803Scg 30367803Scg /* 30467803Scg * Make sure that zap_ismicro is set before we let others see 30567803Scg * it, because zap_lockdir() checks zap_ismicro without the lock 30667803Scg * held. 30767803Scg */ 30867803Scg winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); 30967803Scg 31067803Scg if (winner != NULL) { 31167803Scg rw_exit(&zap->zap_rwlock); 31267803Scg rw_destroy(&zap->zap_rwlock); 31367803Scg if (!zap->zap_ismicro) 31467803Scg mutex_destroy(&zap->zap_f.zap_num_entries_mtx); 31567803Scg kmem_free(zap, sizeof (zap_t)); 31667803Scg return (winner); 31767803Scg } 31867803Scg 31967803Scg if (zap->zap_ismicro) { 32067803Scg zap->zap_salt = zap->zap_m.zap_phys->mz_salt; 32167803Scg zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags; 32267803Scg zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; 32367803Scg avl_create(&zap->zap_m.zap_avl, mze_compare, 32467803Scg sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); 32567803Scg 32667803Scg for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { 32767803Scg mzap_ent_phys_t *mze = 32867803Scg &zap->zap_m.zap_phys->mz_chunk[i]; 32967803Scg if (mze->mze_name[0]) { 33067803Scg zap_name_t *zn; 33167803Scg 33267803Scg zap->zap_m.zap_num_entries++; 33367803Scg zn = zap_name_alloc(zap, mze->mze_name, 33467803Scg MT_EXACT); 33567803Scg mze_insert(zap, i, zn->zn_hash, mze); 33667803Scg zap_name_free(zn); 33767803Scg } 33867803Scg } 33967803Scg } else { 34067803Scg zap->zap_salt = zap->zap_f.zap_phys->zap_salt; 34167803Scg zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags; 34267803Scg 
34367803Scg ASSERT3U(sizeof (struct zap_leaf_header), ==, 34467803Scg 2*ZAP_LEAF_CHUNKSIZE); 34567803Scg 34667803Scg /* 34767803Scg * The embedded pointer table should not overlap the 34867803Scg * other members. 34967803Scg */ 35067803Scg ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, 35167803Scg &zap->zap_f.zap_phys->zap_salt); 35267803Scg 35367803Scg /* 35467803Scg * The embedded pointer table should end at the end of 35567803Scg * the block 35668376Scg */ 35767803Scg ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 35867803Scg 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - 35967803Scg (uintptr_t)zap->zap_f.zap_phys, ==, 36067803Scg zap->zap_dbuf->db_size); 36167803Scg } 36267803Scg rw_exit(&zap->zap_rwlock); 36367803Scg return (zap); 36467803Scg} 36567803Scg 36667803Scgint 36767803Scgzap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, 36867803Scg krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) 36967803Scg{ 37067803Scg zap_t *zap; 37167803Scg dmu_buf_t *db; 37267803Scg krw_t lt; 37367803Scg int err; 37467803Scg 37567803Scg *zapp = NULL; 37667803Scg 37767803Scg err = dmu_buf_hold(os, obj, 0, NULL, &db); 37867803Scg if (err) 37967803Scg return (err); 38067803Scg 38167803Scg#ifdef ZFS_DEBUG 38267803Scg { 38367803Scg dmu_object_info_t doi; 38467803Scg dmu_object_info_from_db(db, &doi); 38567803Scg ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); 38667803Scg } 38767803Scg#endif 38867803Scg 38967803Scg zap = dmu_buf_get_user(db); 39067803Scg if (zap == NULL) 39167803Scg zap = mzap_open(os, obj, db); 39267803Scg 39367803Scg /* 39467803Scg * We're checking zap_ismicro without the lock held, in order to 39567803Scg * tell what type of lock we want. Once we have some sort of 39667803Scg * lock, see if it really is the right type. In practice this 39767803Scg * can only be different if it was upgraded from micro to fat, 39867803Scg * and micro wanted WRITER but fat only needs READER. 39967803Scg */ 40067803Scg lt = (!zap->zap_ismicro && fatreader) ? 
RW_READER : lti; 40167803Scg rw_enter(&zap->zap_rwlock, lt); 40267803Scg if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { 40329415Sjmg /* it was upgraded, now we only need reader */ 40467803Scg ASSERT(lt == RW_WRITER); 40529415Sjmg ASSERT(RW_READER == 40650723Scg (!zap->zap_ismicro && fatreader) ? RW_READER : lti); 40765644Scg rw_downgrade(&zap->zap_rwlock); 40865644Scg lt = RW_READER; 40965644Scg } 41050723Scg 41150723Scg zap->zap_objset = os; 41250723Scg 41354462Scg if (lt == RW_WRITER) 41450723Scg dmu_buf_will_dirty(db, tx); 41550723Scg 41650723Scg ASSERT3P(zap->zap_dbuf, ==, db); 41754462Scg 41850723Scg ASSERT(!zap->zap_ismicro || 41950723Scg zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); 42050723Scg if (zap->zap_ismicro && tx && adding && 42154462Scg zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { 42250723Scg uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; 42350723Scg if (newsz > MZAP_MAX_BLKSZ) { 42465644Scg dprintf("upgrading obj %llu: num_entries=%u\n", 42565644Scg obj, zap->zap_m.zap_num_entries); 42665644Scg *zapp = zap; 42765644Scg return (mzap_upgrade(zapp, tx)); 42865644Scg } 42950723Scg err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); 43029415Sjmg ASSERT3U(err, ==, 0); 43150723Scg zap->zap_m.zap_num_chunks = 43267803Scg db->db_size / MZAP_ENT_LEN - 1; 43350723Scg } 43454462Scg 43554462Scg *zapp = zap; 43654462Scg return (0); 43750723Scg} 43867803Scg 43967803Scgvoid 44054462Scgzap_unlockdir(zap_t *zap) 44150723Scg{ 44267803Scg rw_exit(&zap->zap_rwlock); 44367803Scg dmu_buf_rele(zap->zap_dbuf, NULL); 44454462Scg} 44550723Scg 44667803Scgstatic int 44767803Scgmzap_upgrade(zap_t **zapp, dmu_tx_t *tx) 44854462Scg{ 44958756Scg mzap_phys_t *mzp; 45067803Scg int i, sz, nchunks, err; 45129415Sjmg zap_t *zap = *zapp; 45250723Scg 45367803Scg ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 45455424Scg 45554462Scg sz = zap->zap_dbuf->db_size; 45655424Scg mzp = kmem_alloc(sz, KM_SLEEP); 45729415Sjmg 
bcopy(zap->zap_dbuf->db_data, mzp, sz); 45850723Scg nchunks = zap->zap_m.zap_num_chunks; 45954462Scg 46055424Scg err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 46154462Scg 1ULL << fzap_default_block_shift, 0, tx); 46250723Scg if (err) { 46350723Scg kmem_free(mzp, sz); 46429415Sjmg return (err); 46529415Sjmg } 46654462Scg 46767803Scg dprintf("upgrading obj=%llu with %u chunks\n", 46829415Sjmg zap->zap_object, nchunks); 46967803Scg /* XXX destroy the avl later, so we can use the stored hash value */ 47067803Scg mze_destroy(zap); 47129415Sjmg 47267803Scg fzap_upgrade(zap, tx); 47367803Scg 47467803Scg for (i = 0; i < nchunks; i++) { 47567803Scg int err; 47629415Sjmg mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; 47767803Scg zap_name_t *zn; 47867803Scg if (mze->mze_name[0] == 0) 47941653Sbrian continue; 48067803Scg dprintf("adding %s=%llu\n", 48167803Scg mze->mze_name, mze->mze_value); 48267803Scg zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); 48367803Scg err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); 48429415Sjmg zap = zn->zn_zap; /* fzap_add_cd() may change zap */ 48567803Scg zap_name_free(zn); 48667803Scg if (err) 48767803Scg break; 48867803Scg } 48967803Scg kmem_free(mzp, sz); 49067803Scg *zapp = zap; 49129415Sjmg return (err); 49267803Scg} 49367803Scg 49467803Scgstatic void 49567803Scgmzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) 49667803Scg{ 49750723Scg dmu_buf_t *db; 49850723Scg mzap_phys_t *zp; 49950723Scg 50050723Scg VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); 50167803Scg 50250723Scg#ifdef ZFS_DEBUG 50367803Scg { 50467803Scg dmu_object_info_t doi; 50550723Scg dmu_object_info_from_db(db, &doi); 50667803Scg ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); 50755706Scg } 50855706Scg#endif 50967803Scg 51055706Scg dmu_buf_will_dirty(db, tx); 51155706Scg zp = db->db_data; 51250723Scg zp->mz_block_type = ZBT_MICRO; 51331361Sjmg zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; 
51450723Scg zp->mz_normflags = normflags; 51567803Scg dmu_buf_rele(db, FTAG); 51650723Scg} 51767803Scg 51867803Scgint 51967803Scgzap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, 52029415Sjmg dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 52167803Scg{ 52267803Scg return (zap_create_claim_norm(os, obj, 52367803Scg 0, ot, bonustype, bonuslen, tx)); 52467803Scg} 52567803Scg 52629415Sjmgint 52767803Scgzap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, 52829415Sjmg dmu_object_type_t ot, 52967803Scg dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 53067803Scg{ 53167803Scg int err; 53267803Scg 53367803Scg err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); 53467803Scg if (err != 0) 53567803Scg return (err); 53667803Scg mzap_create_impl(os, obj, normflags, tx); 53767803Scg return (0); 53867803Scg} 53967803Scg 54067803Scguint64_t 54167803Scgzap_create(objset_t *os, dmu_object_type_t ot, 54267803Scg dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 54367803Scg{ 54467803Scg return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); 54550723Scg} 54667803Scg 54729415Sjmguint64_t 54867803Scgzap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, 54967803Scg dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 55029415Sjmg{ 55167803Scg uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); 55267803Scg 55329415Sjmg mzap_create_impl(os, obj, normflags, tx); 55467803Scg return (obj); 55567803Scg} 55667803Scg 55767803Scgint 55855706Scgzap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) 55967803Scg{ 56067803Scg /* 56167803Scg * dmu_object_free will free the object number and free the 56267803Scg * data. Freeing the data will cause our pageout function to be 56367803Scg * called, which will destroy our data (zap_leaf_t's and zap_t). 
56467803Scg */ 56555706Scg 56667803Scg return (dmu_object_free(os, zapobj, tx)); 56767803Scg} 56867803Scg 56967803Scg_NOTE(ARGSUSED(0)) 57067803Scgvoid 57167803Scgzap_evict(dmu_buf_t *db, void *vzap) 57267803Scg{ 57367803Scg zap_t *zap = vzap; 57467803Scg 57567803Scg rw_destroy(&zap->zap_rwlock); 57667803Scg 57767803Scg if (zap->zap_ismicro) 57867803Scg mze_destroy(zap); 57967803Scg else 58067803Scg mutex_destroy(&zap->zap_f.zap_num_entries_mtx); 58167803Scg 58250723Scg kmem_free(zap, sizeof (zap_t)); 58350723Scg} 58467803Scg 58567803Scgint 58667803Scgzap_count(objset_t *os, uint64_t zapobj, uint64_t *count) 58767803Scg{ 58867803Scg zap_t *zap; 58967803Scg int err; 59029415Sjmg 59167803Scg err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); 59267803Scg if (err) 59367803Scg return (err); 59467803Scg if (!zap->zap_ismicro) { 59567803Scg err = fzap_count(zap, count); 59667803Scg } else { 59767803Scg *count = zap->zap_m.zap_num_entries; 59867803Scg } 59967803Scg zap_unlockdir(zap); 60067803Scg return (err); 60167803Scg} 60267803Scg 60367803Scg/* 60467803Scg * zn may be NULL; if not specified, it will be computed if needed. 60567803Scg * See also the comment above zap_entry_normalization_conflict(). 
60667803Scg */ 60729415Sjmgstatic boolean_t 60867803Scgmzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) 60967803Scg{ 61050723Scg mzap_ent_t *other; 61129415Sjmg int direction = AVL_BEFORE; 61255706Scg boolean_t allocdzn = B_FALSE; 61355706Scg 61467803Scg if (zap->zap_normflags == 0) 61555706Scg return (B_FALSE); 61655706Scg 61755706Scgagain: 61855706Scg for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); 61955706Scg other && other->mze_hash == mze->mze_hash; 62055706Scg other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { 62155706Scg 62267803Scg if (zn == NULL) { 62367803Scg zn = zap_name_alloc(zap, mze->mze_phys.mze_name, 62467803Scg MT_FIRST); 62555706Scg allocdzn = B_TRUE; 62655706Scg } 62767803Scg if (zap_match(zn, other->mze_phys.mze_name)) { 62855706Scg if (allocdzn) 62955706Scg zap_name_free(zn); 63055706Scg return (B_TRUE); 63155706Scg } 63267803Scg } 63355706Scg 63455706Scg if (direction == AVL_BEFORE) { 63567803Scg direction = AVL_AFTER; 63655706Scg goto again; 63767803Scg } 63867803Scg 63967803Scg if (allocdzn) 64067803Scg zap_name_free(zn); 64155706Scg return (B_FALSE); 64255706Scg} 64355706Scg 64455706Scg/* 64567803Scg * Routines for manipulating attributes. 
64655706Scg */ 64755706Scg 64855706Scgint 64967803Scgzap_lookup(objset_t *os, uint64_t zapobj, const char *name, 65067803Scg uint64_t integer_size, uint64_t num_integers, void *buf) 65155706Scg{ 65255706Scg return (zap_lookup_norm(os, zapobj, name, integer_size, 65355706Scg num_integers, buf, MT_EXACT, NULL, 0, NULL)); 65467803Scg} 65555706Scg 65655706Scgint 65755706Scgzap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, 65867803Scg uint64_t integer_size, uint64_t num_integers, void *buf, 65955706Scg matchtype_t mt, char *realname, int rn_len, 66055706Scg boolean_t *ncp) 66155706Scg{ 66255706Scg zap_t *zap; 66367803Scg int err; 66455706Scg mzap_ent_t *mze; 66555706Scg zap_name_t *zn; 66667803Scg 66755706Scg err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); 66860958Scg if (err) 66955706Scg return (err); 67055706Scg zn = zap_name_alloc(zap, name, mt); 67155706Scg if (zn == NULL) { 67267803Scg zap_unlockdir(zap); 67355706Scg return (ENOTSUP); 67467803Scg } 67568376Scg 67667803Scg if (!zap->zap_ismicro) { 67767803Scg err = fzap_lookup(zn, integer_size, num_integers, buf, 67855706Scg realname, rn_len, ncp); 67955706Scg } else { 68055706Scg mze = mze_find(zn); 68155706Scg if (mze == NULL) { 68267803Scg err = ENOENT; 68355706Scg } else { 68455706Scg if (num_integers < 1) { 68555706Scg err = EOVERFLOW; 68655706Scg } else if (integer_size != 8) { 68755706Scg err = EINVAL; 68855706Scg } else { 68955706Scg *(uint64_t *)buf = mze->mze_phys.mze_value; 69067803Scg (void) strlcpy(realname, 69155706Scg mze->mze_phys.mze_name, rn_len); 69255706Scg if (ncp) { 69367803Scg *ncp = mzap_normalization_conflict(zap, 69455706Scg zn, mze); 69567803Scg } 69655706Scg } 69755706Scg } 69867803Scg } 69955706Scg zap_name_free(zn); 70055706Scg zap_unlockdir(zap); 70129415Sjmg return (err); 70267803Scg} 70329415Sjmg 70467803Scgint 70567803Scgzap_length(objset_t *os, uint64_t zapobj, const char *name, 70667803Scg uint64_t *integer_size, uint64_t *num_integers) 
70767803Scg{ 70867803Scg zap_t *zap; 70950723Scg int err; 71031361Sjmg mzap_ent_t *mze; 71150723Scg zap_name_t *zn; 71267803Scg 71350723Scg err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); 71467803Scg if (err) 71567803Scg return (err); 71629415Sjmg zn = zap_name_alloc(zap, name, MT_EXACT); 71767803Scg if (zn == NULL) { 71829415Sjmg zap_unlockdir(zap); 71967803Scg return (ENOTSUP); 72029415Sjmg } 72129415Sjmg if (!zap->zap_ismicro) { 72267803Scg err = fzap_length(zn, integer_size, num_integers); 72329415Sjmg } else { 72429415Sjmg mze = mze_find(zn); 72567803Scg if (mze == NULL) { 72653553Stanimura err = ENOENT; 72754462Scg } else { 72855092Sdfr if (integer_size) 72953553Stanimura *integer_size = 8; 73053553Stanimura if (num_integers) 73154462Scg *num_integers = 1; 73254462Scg } 73353553Stanimura } 73453553Stanimura zap_name_free(zn); 73554462Scg zap_unlockdir(zap); 73654791Scg return (err); 73754462Scg} 73867803Scg 73967803Scgstatic void 74067803Scgmzap_addent(zap_name_t *zn, uint64_t value) 74167803Scg{ 74267803Scg int i; 74367803Scg zap_t *zap = zn->zn_zap; 74458756Scg int start = zap->zap_m.zap_alloc_next; 74553553Stanimura uint32_t cd; 74653553Stanimura 74753553Stanimura dprintf("obj=%llu %s=%llu\n", zap->zap_object, 74867803Scg zn->zn_name_orij, value); 74953553Stanimura ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 75053553Stanimura 75155092Sdfr#ifdef ZFS_DEBUG 75267803Scg for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { 75367803Scg mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; 75453553Stanimura ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); 75553553Stanimura } 75655706Scg#endif 75755706Scg 75853553Stanimura cd = mze_find_unused_cd(zap, zn->zn_hash); 75953553Stanimura /* given the limited size of the microzap, this can't happen */ 76054462Scg ASSERT(cd != ZAP_MAXCD); 76154462Scg 76254462Scgagain: 76354462Scg for (i = start; i < zap->zap_m.zap_num_chunks; i++) { 76467803Scg mzap_ent_phys_t *mze = 
&zap->zap_m.zap_phys->mz_chunk[i]; 76567803Scg if (mze->mze_name[0] == 0) { 76667803Scg mze->mze_value = value; 76767803Scg mze->mze_cd = cd; 76867803Scg (void) strcpy(mze->mze_name, zn->zn_name_orij); 76967803Scg zap->zap_m.zap_num_entries++; 77067803Scg zap->zap_m.zap_alloc_next = i+1; 77167803Scg if (zap->zap_m.zap_alloc_next == 77267803Scg zap->zap_m.zap_num_chunks) 77367803Scg zap->zap_m.zap_alloc_next = 0; 77467803Scg mze_insert(zap, i, zn->zn_hash, mze); 77567803Scg return; 77667803Scg } 77767803Scg } 77867803Scg if (start != 0) { 77967803Scg start = 0; 78067803Scg goto again; 78167803Scg } 78267803Scg ASSERT(!"out of entries!"); 78367803Scg} 78467803Scg 78567803Scgint 78667803Scgzap_add(objset_t *os, uint64_t zapobj, const char *name, 78767803Scg int integer_size, uint64_t num_integers, 78867803Scg const void *val, dmu_tx_t *tx) 78967803Scg{ 79067803Scg zap_t *zap; 79167803Scg int err; 79267803Scg mzap_ent_t *mze; 79367803Scg const uint64_t *intval = val; 79467803Scg zap_name_t *zn; 79567803Scg 79667803Scg err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); 79767803Scg if (err) 79867803Scg return (err); 79967803Scg zn = zap_name_alloc(zap, name, MT_EXACT); 80067803Scg if (zn == NULL) { 80167803Scg zap_unlockdir(zap); 80267803Scg return (ENOTSUP); 80367803Scg } 80467803Scg if (!zap->zap_ismicro) { 80567803Scg err = fzap_add(zn, integer_size, num_integers, val, tx); 80667803Scg zap = zn->zn_zap; /* fzap_add() may change zap */ 80767803Scg } else if (integer_size != 8 || num_integers != 1 || 80867803Scg strlen(name) >= MZAP_NAME_LEN) { 80967803Scg dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", 81053553Stanimura zapobj, integer_size, num_integers, name); 81153553Stanimura err = mzap_upgrade(&zn->zn_zap, tx); 81267803Scg if (err == 0) 81367803Scg err = fzap_add(zn, integer_size, num_integers, val, tx); 81467803Scg zap = zn->zn_zap; /* fzap_add() may change zap */ 81567803Scg } else { 81667803Scg mze = mze_find(zn); 81767803Scg if (mze 
!= NULL) { 81867803Scg err = EEXIST; 81967803Scg } else { 82067803Scg mzap_addent(zn, *intval); 82167803Scg } 82267803Scg } 82367803Scg ASSERT(zap == zn->zn_zap); 82467803Scg zap_name_free(zn); 82567803Scg if (zap != NULL) /* may be NULL if fzap_add() failed */ 82667803Scg zap_unlockdir(zap); 82767803Scg return (err); 82853553Stanimura} 82967803Scg 83067803Scgint 83167803Scgzap_update(objset_t *os, uint64_t zapobj, const char *name, 83253553Stanimura int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) 83353553Stanimura{ 83453553Stanimura zap_t *zap; 83553553Stanimura mzap_ent_t *mze; 83667803Scg const uint64_t *intval = val; 83753553Stanimura zap_name_t *zn; 83867803Scg int err; 83953553Stanimura 84053553Stanimura err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); 84153553Stanimura if (err) 84267803Scg return (err); 84367803Scg zn = zap_name_alloc(zap, name, MT_EXACT); 84467803Scg if (zn == NULL) { 84553553Stanimura zap_unlockdir(zap); 84654462Scg return (ENOTSUP); 84754462Scg } 84862483Scg if (!zap->zap_ismicro) { 849 err = fzap_update(zn, integer_size, num_integers, val, tx); 850 zap = zn->zn_zap; /* fzap_update() may change zap */ 851 } else if (integer_size != 8 || num_integers != 1 || 852 strlen(name) >= MZAP_NAME_LEN) { 853 dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", 854 zapobj, integer_size, num_integers, name); 855 err = mzap_upgrade(&zn->zn_zap, tx); 856 if (err == 0) 857 err = fzap_update(zn, integer_size, num_integers, 858 val, tx); 859 zap = zn->zn_zap; /* fzap_update() may change zap */ 860 } else { 861 mze = mze_find(zn); 862 if (mze != NULL) { 863 mze->mze_phys.mze_value = *intval; 864 zap->zap_m.zap_phys->mz_chunk 865 [mze->mze_chunkid].mze_value = *intval; 866 } else { 867 mzap_addent(zn, *intval); 868 } 869 } 870 ASSERT(zap == zn->zn_zap); 871 zap_name_free(zn); 872 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ 873 zap_unlockdir(zap); 874 return (err); 875} 876 877int 
878zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) 879{ 880 return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); 881} 882 883int 884zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, 885 matchtype_t mt, dmu_tx_t *tx) 886{ 887 zap_t *zap; 888 int err; 889 mzap_ent_t *mze; 890 zap_name_t *zn; 891 892 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); 893 if (err) 894 return (err); 895 zn = zap_name_alloc(zap, name, mt); 896 if (zn == NULL) { 897 zap_unlockdir(zap); 898 return (ENOTSUP); 899 } 900 if (!zap->zap_ismicro) { 901 err = fzap_remove(zn, tx); 902 } else { 903 mze = mze_find(zn); 904 if (mze == NULL) { 905 err = ENOENT; 906 } else { 907 zap->zap_m.zap_num_entries--; 908 bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], 909 sizeof (mzap_ent_phys_t)); 910 mze_remove(zap, mze); 911 } 912 } 913 zap_name_free(zn); 914 zap_unlockdir(zap); 915 return (err); 916} 917 918/* 919 * Routines for iterating over the attributes. 920 */ 921 922/* 923 * We want to keep the high 32 bits of the cursor zero if we can, so 924 * that 32-bit programs can access this. So use a small hash value so 925 * we can fit 4 bits of cd into the 32-bit cursor. 
926 * 927 * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] 928 */ 929void 930zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, 931 uint64_t serialized) 932{ 933 zc->zc_objset = os; 934 zc->zc_zap = NULL; 935 zc->zc_leaf = NULL; 936 zc->zc_zapobj = zapobj; 937 if (serialized == -1ULL) { 938 zc->zc_hash = -1ULL; 939 zc->zc_cd = 0; 940 } else { 941 zc->zc_hash = serialized << (64-ZAP_HASHBITS); 942 zc->zc_cd = serialized >> ZAP_HASHBITS; 943 if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ 944 zc->zc_cd = 0; 945 } 946} 947 948void 949zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) 950{ 951 zap_cursor_init_serialized(zc, os, zapobj, 0); 952} 953 954void 955zap_cursor_fini(zap_cursor_t *zc) 956{ 957 if (zc->zc_zap) { 958 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); 959 zap_unlockdir(zc->zc_zap); 960 zc->zc_zap = NULL; 961 } 962 if (zc->zc_leaf) { 963 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); 964 zap_put_leaf(zc->zc_leaf); 965 zc->zc_leaf = NULL; 966 } 967 zc->zc_objset = NULL; 968} 969 970uint64_t 971zap_cursor_serialize(zap_cursor_t *zc) 972{ 973 if (zc->zc_hash == -1ULL) 974 return (-1ULL); 975 ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); 976 ASSERT(zc->zc_cd < ZAP_MAXCD); 977 return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | 978 ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); 979} 980 981int 982zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) 983{ 984 int err; 985 avl_index_t idx; 986 mzap_ent_t mze_tofind; 987 mzap_ent_t *mze; 988 989 if (zc->zc_hash == -1ULL) 990 return (ENOENT); 991 992 if (zc->zc_zap == NULL) { 993 err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, 994 RW_READER, TRUE, FALSE, &zc->zc_zap); 995 if (err) 996 return (err); 997 } else { 998 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); 999 } 1000 if (!zc->zc_zap->zap_ismicro) { 1001 err = fzap_cursor_retrieve(zc->zc_zap, zc, za); 1002 } else { 1003 err = ENOENT; 1004 1005 mze_tofind.mze_hash = zc->zc_hash; 1006 
mze_tofind.mze_phys.mze_cd = zc->zc_cd; 1007 1008 mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); 1009 if (mze == NULL) { 1010 mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, 1011 idx, AVL_AFTER); 1012 } 1013 if (mze) { 1014 ASSERT(0 == bcmp(&mze->mze_phys, 1015 &zc->zc_zap->zap_m.zap_phys->mz_chunk 1016 [mze->mze_chunkid], sizeof (mze->mze_phys))); 1017 1018 za->za_normalization_conflict = 1019 mzap_normalization_conflict(zc->zc_zap, NULL, mze); 1020 za->za_integer_length = 8; 1021 za->za_num_integers = 1; 1022 za->za_first_integer = mze->mze_phys.mze_value; 1023 (void) strcpy(za->za_name, mze->mze_phys.mze_name); 1024 zc->zc_hash = mze->mze_hash; 1025 zc->zc_cd = mze->mze_phys.mze_cd; 1026 err = 0; 1027 } else { 1028 zc->zc_hash = -1ULL; 1029 } 1030 } 1031 rw_exit(&zc->zc_zap->zap_rwlock); 1032 return (err); 1033} 1034 1035void 1036zap_cursor_advance(zap_cursor_t *zc) 1037{ 1038 if (zc->zc_hash == -1ULL) 1039 return; 1040 zc->zc_cd++; 1041 if (zc->zc_cd >= ZAP_MAXCD) { 1042 zc->zc_cd = 0; 1043 zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); 1044 if (zc->zc_hash == 0) /* EOF */ 1045 zc->zc_hash = -1ULL; 1046 } 1047} 1048 1049int 1050zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) 1051{ 1052 int err; 1053 zap_t *zap; 1054 1055 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); 1056 if (err) 1057 return (err); 1058 1059 bzero(zs, sizeof (zap_stats_t)); 1060 1061 if (zap->zap_ismicro) { 1062 zs->zs_blocksize = zap->zap_dbuf->db_size; 1063 zs->zs_num_entries = zap->zap_m.zap_num_entries; 1064 zs->zs_num_blocks = 1; 1065 } else { 1066 fzap_get_stats(zap, zs); 1067 } 1068 zap_unlockdir(zap); 1069 return (0); 1070} 1071