Deleted Added
full compact
dmu.c (239620) dmu.c (243524)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 26 unchanged lines hidden (view full) ---

35#include <sys/dsl_dir.h>
36#include <sys/dsl_pool.h>
37#include <sys/dsl_synctask.h>
38#include <sys/dsl_prop.h>
39#include <sys/dmu_zfetch.h>
40#include <sys/zfs_ioctl.h>
41#include <sys/zap.h>
42#include <sys/zio_checksum.h>
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 26 unchanged lines hidden (view full) ---

35#include <sys/dsl_dir.h>
36#include <sys/dsl_pool.h>
37#include <sys/dsl_synctask.h>
38#include <sys/dsl_prop.h>
39#include <sys/dmu_zfetch.h>
40#include <sys/zfs_ioctl.h>
41#include <sys/zap.h>
42#include <sys/zio_checksum.h>
43#include <sys/zio_compress.h>
43#include <sys/sa.h>
44#ifdef _KERNEL
45#include <sys/zfs_znode.h>
46#endif
47
44#include <sys/sa.h>
45#ifdef _KERNEL
46#include <sys/zfs_znode.h>
47#endif
48
49/*
50 * Enable/disable nopwrite feature.
51 */
52int zfs_nopwrite_enabled = 1;
53
48const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
49 { DMU_BSWAP_UINT8, TRUE, "unallocated" },
50 { DMU_BSWAP_ZAP, TRUE, "object directory" },
51 { DMU_BSWAP_UINT64, TRUE, "object array" },
52 { DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
53 { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
54 { DMU_BSWAP_UINT64, TRUE, "bpobj" },
55 { DMU_BSWAP_UINT64, TRUE, "bpobj header" },

--- 1226 unchanged lines hidden (view full) ---

1282{
1283 dmu_sync_arg_t *dsa = varg;
1284 dbuf_dirty_record_t *dr = dsa->dsa_dr;
1285 dmu_buf_impl_t *db = dr->dr_dbuf;
1286
1287 mutex_enter(&db->db_mtx);
1288 ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1289 if (zio->io_error == 0) {
54const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
55 { DMU_BSWAP_UINT8, TRUE, "unallocated" },
56 { DMU_BSWAP_ZAP, TRUE, "object directory" },
57 { DMU_BSWAP_UINT64, TRUE, "object array" },
58 { DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
59 { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
60 { DMU_BSWAP_UINT64, TRUE, "bpobj" },
61 { DMU_BSWAP_UINT64, TRUE, "bpobj header" },

--- 1226 unchanged lines hidden (view full) ---

1288{
1289 dmu_sync_arg_t *dsa = varg;
1290 dbuf_dirty_record_t *dr = dsa->dsa_dr;
1291 dmu_buf_impl_t *db = dr->dr_dbuf;
1292
1293 mutex_enter(&db->db_mtx);
1294 ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1295 if (zio->io_error == 0) {
1296 dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
1297 if (dr->dt.dl.dr_nopwrite) {
1298 blkptr_t *bp = zio->io_bp;
1299 blkptr_t *bp_orig = &zio->io_bp_orig;
1300 uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
1301
1302 ASSERT(BP_EQUAL(bp, bp_orig));
1303 ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
1304 ASSERT(zio_checksum_table[chksum].ci_dedup);
1305 }
1290 dr->dt.dl.dr_overridden_by = *zio->io_bp;
1291 dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1292 dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1293 if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
1294 BP_ZERO(&dr->dt.dl.dr_overridden_by);
1295 } else {
1296 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1297 }

--- 5 unchanged lines hidden (view full) ---

1303 kmem_free(dsa, sizeof (*dsa));
1304}
1305
1306static void
1307dmu_sync_late_arrival_done(zio_t *zio)
1308{
1309 blkptr_t *bp = zio->io_bp;
1310 dmu_sync_arg_t *dsa = zio->io_private;
1306 dr->dt.dl.dr_overridden_by = *zio->io_bp;
1307 dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1308 dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1309 if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
1310 BP_ZERO(&dr->dt.dl.dr_overridden_by);
1311 } else {
1312 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1313 }

--- 5 unchanged lines hidden (view full) ---

1319 kmem_free(dsa, sizeof (*dsa));
1320}
1321
1322static void
1323dmu_sync_late_arrival_done(zio_t *zio)
1324{
1325 blkptr_t *bp = zio->io_bp;
1326 dmu_sync_arg_t *dsa = zio->io_private;
1327 blkptr_t *bp_orig = &zio->io_bp_orig;
1311
1312 if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1328
1329 if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1313 ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1314 ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1315 zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
1330 /*
1331 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
1332 * then there is nothing to do here. Otherwise, free the
1333 * newly allocated block in this txg.
1334 */
1335 if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
1336 ASSERT(BP_EQUAL(bp, bp_orig));
1337 } else {
1338 ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
1339 ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1340 ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1341 zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
1342 }
1316 }
1317
1318 dmu_tx_commit(dsa->dsa_tx);
1319
1320 dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1321
1322 kmem_free(dsa, sizeof (*dsa));
1323}

--- 28 unchanged lines hidden (view full) ---

1352
1353/*
1354 * Intent log support: sync the block associated with db to disk.
1355 * N.B. and XXX: the caller is responsible for making sure that the
1356 * data isn't changing while dmu_sync() is writing it.
1357 *
1358 * Return values:
1359 *
1343 }
1344
1345 dmu_tx_commit(dsa->dsa_tx);
1346
1347 dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1348
1349 kmem_free(dsa, sizeof (*dsa));
1350}

--- 28 unchanged lines hidden (view full) ---

1379
1380/*
1381 * Intent log support: sync the block associated with db to disk.
1382 * N.B. and XXX: the caller is responsible for making sure that the
1383 * data isn't changing while dmu_sync() is writing it.
1384 *
1385 * Return values:
1386 *
1360 * EEXIST: this txg has already been synced, so there's nothing to to.
1387 * EEXIST: this txg has already been synced, so there's nothing to do.
1361 * The caller should not log the write.
1362 *
1363 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1364 * The caller should not log the write.
1365 *
1366 * EALREADY: this block is already in the process of being synced.
1367 * The caller should track its progress (somehow).
1368 *

--- 15 unchanged lines hidden (view full) ---

1384 dsl_dataset_t *ds = os->os_dsl_dataset;
1385 dbuf_dirty_record_t *dr;
1386 dmu_sync_arg_t *dsa;
1387 zbookmark_t zb;
1388 zio_prop_t zp;
1389 dnode_t *dn;
1390
1391 ASSERT(pio != NULL);
1388 * The caller should not log the write.
1389 *
1390 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1391 * The caller should not log the write.
1392 *
1393 * EALREADY: this block is already in the process of being synced.
1394 * The caller should track its progress (somehow).
1395 *

--- 15 unchanged lines hidden (view full) ---

1411 dsl_dataset_t *ds = os->os_dsl_dataset;
1412 dbuf_dirty_record_t *dr;
1413 dmu_sync_arg_t *dsa;
1414 zbookmark_t zb;
1415 zio_prop_t zp;
1416 dnode_t *dn;
1417
1418 ASSERT(pio != NULL);
1392 ASSERT(BP_IS_HOLE(bp));
1393 ASSERT(txg != 0);
1394
1395 SET_BOOKMARK(&zb, ds->ds_object,
1396 db->db.db_object, db->db_level, db->db_blkid);
1397
1398 DB_DNODE_ENTER(db);
1399 dn = DB_DNODE(db);
1400 dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);

--- 38 unchanged lines hidden (view full) ---

1439 /*
1440 * There's no dr for this dbuf, so it must have been freed.
1441 * There's no need to log writes to freed blocks, so we're done.
1442 */
1443 mutex_exit(&db->db_mtx);
1444 return (ENOENT);
1445 }
1446
1419 ASSERT(txg != 0);
1420
1421 SET_BOOKMARK(&zb, ds->ds_object,
1422 db->db.db_object, db->db_level, db->db_blkid);
1423
1424 DB_DNODE_ENTER(db);
1425 dn = DB_DNODE(db);
1426 dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);

--- 38 unchanged lines hidden (view full) ---

1465 /*
1466 * There's no dr for this dbuf, so it must have been freed.
1467 * There's no need to log writes to freed blocks, so we're done.
1468 */
1469 mutex_exit(&db->db_mtx);
1470 return (ENOENT);
1471 }
1472
1473 ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
1474
1475 /*
1476 * Assume the on-disk data is X, the current syncing data is Y,
1477 * and the current in-memory data is Z (currently in dmu_sync).
1478 * X and Z are identical but Y has been modified. Normally,
1479 * when X and Z are the same we will perform a nopwrite but if Y
1480 * is different we must disable nopwrite since the resulting write
1481 * of Y to disk can free the block containing X. If we allowed a
1482 * nopwrite to occur the block pointing to Z would reference a freed
1483 * block. Since this is a rare case we simplify this by disabling
1484 * nopwrite if the current dmu_sync-ing dbuf has been modified in
1485 * a previous transaction.
1486 */
1487 if (dr->dr_next)
1488 zp.zp_nopwrite = B_FALSE;
1489
1447 ASSERT(dr->dr_txg == txg);
1448 if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
1449 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
1450 /*
1451 * We have already issued a sync write for this buffer,
1452 * or this buffer has already been synced. It could not
1453 * have been dirtied since, or we would have cleared the state.
1454 */

--- 72 unchanged lines hidden (view full) ---

1527dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
1528{
1529 dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
1530 boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
1531 (wp & WP_SPILL));
1532 enum zio_checksum checksum = os->os_checksum;
1533 enum zio_compress compress = os->os_compress;
1534 enum zio_checksum dedup_checksum = os->os_dedup_checksum;
1490 ASSERT(dr->dr_txg == txg);
1491 if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
1492 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
1493 /*
1494 * We have already issued a sync write for this buffer,
1495 * or this buffer has already been synced. It could not
1496 * have been dirtied since, or we would have cleared the state.
1497 */

--- 72 unchanged lines hidden (view full) ---

1570dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
1571{
1572 dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
1573 boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
1574 (wp & WP_SPILL));
1575 enum zio_checksum checksum = os->os_checksum;
1576 enum zio_compress compress = os->os_compress;
1577 enum zio_checksum dedup_checksum = os->os_dedup_checksum;
1535 boolean_t dedup;
1578 boolean_t dedup = B_FALSE;
1579 boolean_t nopwrite = B_FALSE;
1536 boolean_t dedup_verify = os->os_dedup_verify;
1537 int copies = os->os_copies;
1538
1539 /*
1580 boolean_t dedup_verify = os->os_dedup_verify;
1581 int copies = os->os_copies;
1582
1583 /*
1540 * Determine checksum setting.
1584 * We maintain different write policies for each of the following
1585 * types of data:
1586 * 1. metadata
1587 * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
1588 * 3. all other level 0 blocks
1541 */
1542 if (ismd) {
1543 /*
1589 */
1590 if (ismd) {
1591 /*
1592 * XXX -- we should design a compression algorithm
1593 * that specializes in arrays of bps.
1594 */
1595 compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
1596 ZIO_COMPRESS_LZJB;
1597
1598 /*
1544 * Metadata always gets checksummed. If the data
1545 * checksum is multi-bit correctable, and it's not a
1546 * ZBT-style checksum, then it's suitable for metadata
1547 * as well. Otherwise, the metadata checksum defaults
1548 * to fletcher4.
1549 */
1550 if (zio_checksum_table[checksum].ci_correctable < 1 ||
1551 zio_checksum_table[checksum].ci_eck)
1552 checksum = ZIO_CHECKSUM_FLETCHER_4;
1599 * Metadata always gets checksummed. If the data
1600 * checksum is multi-bit correctable, and it's not a
1601 * ZBT-style checksum, then it's suitable for metadata
1602 * as well. Otherwise, the metadata checksum defaults
1603 * to fletcher4.
1604 */
1605 if (zio_checksum_table[checksum].ci_correctable < 1 ||
1606 zio_checksum_table[checksum].ci_eck)
1607 checksum = ZIO_CHECKSUM_FLETCHER_4;
1553 } else {
1554 checksum = zio_checksum_select(dn->dn_checksum, checksum);
1555 }
1608 } else if (wp & WP_NOFILL) {
1609 ASSERT(level == 0);
1556
1610
1557 /*
1558 * Determine compression setting.
1559 */
1560 if (ismd) {
1561 /*
1611 /*
1562 * XXX -- we should design a compression algorithm
1563 * that specializes in arrays of bps.
1612 * If we're writing preallocated blocks, we aren't actually
1613 * writing them so don't set any policy properties. These
1614 * blocks are currently only used by an external subsystem
1615 * outside of zfs (i.e. dump) and not written by the zio
1616 * pipeline.
1564 */
1617 */
1565 compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
1566 ZIO_COMPRESS_LZJB;
1618 compress = ZIO_COMPRESS_OFF;
1619 checksum = ZIO_CHECKSUM_OFF;
1567 } else {
1568 compress = zio_compress_select(dn->dn_compress, compress);
1620 } else {
1621 compress = zio_compress_select(dn->dn_compress, compress);
1569 }
1570
1622
1571 /*
1572 * Determine dedup setting. If we are in dmu_sync(), we won't
1573 * actually dedup now because that's all done in syncing context;
1574 * but we do want to use the dedup checksum. If the checksum is not
1575 * strong enough to ensure unique signatures, force dedup_verify.
1576 */
1577 dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
1578 if (dedup) {
1579 checksum = dedup_checksum;
1580 if (!zio_checksum_table[checksum].ci_dedup)
1581 dedup_verify = 1;
1582 }
1623 checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
1624 zio_checksum_select(dn->dn_checksum, checksum) :
1625 dedup_checksum;
1583
1626
1584 if (wp & WP_DMU_SYNC)
1585 dedup = 0;
1627 /*
1628 * Determine dedup setting. If we are in dmu_sync(),
1629 * we won't actually dedup now because that's all
1630 * done in syncing context; but we do want to use the
1631 * dedup checksum. If the checksum is not strong
1632 * enough to ensure unique signatures, force
1633 * dedup_verify.
1634 */
1635 if (dedup_checksum != ZIO_CHECKSUM_OFF) {
1636 dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
1637 if (!zio_checksum_table[checksum].ci_dedup)
1638 dedup_verify = B_TRUE;
1639 }
1586
1640
1587 if (wp & WP_NOFILL) {
1588 ASSERT(!ismd && level == 0);
1589 checksum = ZIO_CHECKSUM_OFF;
1590 compress = ZIO_COMPRESS_OFF;
1591 dedup = B_FALSE;
1641 /*
1642 * Enable nopwrite if we have a cryptographically secure
1643 * checksum that has no known collisions (i.e. SHA-256)
1644 * and compression is enabled. We don't enable nopwrite if
1645 * dedup is enabled as the two features are mutually exclusive.
1646 */
1647 nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
1648 compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
1592 }
1593
1594 zp->zp_checksum = checksum;
1595 zp->zp_compress = compress;
1596 zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
1597 zp->zp_level = level;
1598 zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
1599 zp->zp_dedup = dedup;
1600 zp->zp_dedup_verify = dedup && dedup_verify;
1649 }
1650
1651 zp->zp_checksum = checksum;
1652 zp->zp_compress = compress;
1653 zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
1654 zp->zp_level = level;
1655 zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
1656 zp->zp_dedup = dedup;
1657 zp->zp_dedup_verify = dedup && dedup_verify;
1658 zp->zp_nopwrite = nopwrite;
1601}
1602
1603int
1604dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
1605{
1606 dnode_t *dn;
1607 int i, err;
1608

--- 180 unchanged lines hidden ---
1659}
1660
1661int
1662dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
1663{
1664 dnode_t *dn;
1665 int i, err;
1666

--- 180 unchanged lines hidden ---