dmu.c (239620) | dmu.c (243524) |
---|---|
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 26 unchanged lines hidden (view full) --- 35#include <sys/dsl_dir.h> 36#include <sys/dsl_pool.h> 37#include <sys/dsl_synctask.h> 38#include <sys/dsl_prop.h> 39#include <sys/dmu_zfetch.h> 40#include <sys/zfs_ioctl.h> 41#include <sys/zap.h> 42#include <sys/zio_checksum.h> | 1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 26 unchanged lines hidden (view full) --- 35#include <sys/dsl_dir.h> 36#include <sys/dsl_pool.h> 37#include <sys/dsl_synctask.h> 38#include <sys/dsl_prop.h> 39#include <sys/dmu_zfetch.h> 40#include <sys/zfs_ioctl.h> 41#include <sys/zap.h> 42#include <sys/zio_checksum.h> |
43#include <sys/zio_compress.h> |
|
43#include <sys/sa.h> 44#ifdef _KERNEL 45#include <sys/zfs_znode.h> 46#endif 47 | 44#include <sys/sa.h> 45#ifdef _KERNEL 46#include <sys/zfs_znode.h> 47#endif 48 |
49/* 50 * Enable/disable nopwrite feature. 51 */ 52int zfs_nopwrite_enabled = 1; 53 |
|
48const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 49 { DMU_BSWAP_UINT8, TRUE, "unallocated" }, 50 { DMU_BSWAP_ZAP, TRUE, "object directory" }, 51 { DMU_BSWAP_UINT64, TRUE, "object array" }, 52 { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, 53 { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, 54 { DMU_BSWAP_UINT64, TRUE, "bpobj" }, 55 { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, --- 1226 unchanged lines hidden (view full) --- 1282{ 1283 dmu_sync_arg_t *dsa = varg; 1284 dbuf_dirty_record_t *dr = dsa->dsa_dr; 1285 dmu_buf_impl_t *db = dr->dr_dbuf; 1286 1287 mutex_enter(&db->db_mtx); 1288 ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); 1289 if (zio->io_error == 0) { | 54const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 55 { DMU_BSWAP_UINT8, TRUE, "unallocated" }, 56 { DMU_BSWAP_ZAP, TRUE, "object directory" }, 57 { DMU_BSWAP_UINT64, TRUE, "object array" }, 58 { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, 59 { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, 60 { DMU_BSWAP_UINT64, TRUE, "bpobj" }, 61 { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, --- 1226 unchanged lines hidden (view full) --- 1288{ 1289 dmu_sync_arg_t *dsa = varg; 1290 dbuf_dirty_record_t *dr = dsa->dsa_dr; 1291 dmu_buf_impl_t *db = dr->dr_dbuf; 1292 1293 mutex_enter(&db->db_mtx); 1294 ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); 1295 if (zio->io_error == 0) { |
1296 dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); 1297 if (dr->dt.dl.dr_nopwrite) { 1298 blkptr_t *bp = zio->io_bp; 1299 blkptr_t *bp_orig = &zio->io_bp_orig; 1300 uint8_t chksum = BP_GET_CHECKSUM(bp_orig); 1301 1302 ASSERT(BP_EQUAL(bp, bp_orig)); 1303 ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); 1304 ASSERT(zio_checksum_table[chksum].ci_dedup); 1305 } |
|
1290 dr->dt.dl.dr_overridden_by = *zio->io_bp; 1291 dr->dt.dl.dr_override_state = DR_OVERRIDDEN; 1292 dr->dt.dl.dr_copies = zio->io_prop.zp_copies; 1293 if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) 1294 BP_ZERO(&dr->dt.dl.dr_overridden_by); 1295 } else { 1296 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 1297 } --- 5 unchanged lines hidden (view full) --- 1303 kmem_free(dsa, sizeof (*dsa)); 1304} 1305 1306static void 1307dmu_sync_late_arrival_done(zio_t *zio) 1308{ 1309 blkptr_t *bp = zio->io_bp; 1310 dmu_sync_arg_t *dsa = zio->io_private; | 1306 dr->dt.dl.dr_overridden_by = *zio->io_bp; 1307 dr->dt.dl.dr_override_state = DR_OVERRIDDEN; 1308 dr->dt.dl.dr_copies = zio->io_prop.zp_copies; 1309 if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) 1310 BP_ZERO(&dr->dt.dl.dr_overridden_by); 1311 } else { 1312 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 1313 } --- 5 unchanged lines hidden (view full) --- 1319 kmem_free(dsa, sizeof (*dsa)); 1320} 1321 1322static void 1323dmu_sync_late_arrival_done(zio_t *zio) 1324{ 1325 blkptr_t *bp = zio->io_bp; 1326 dmu_sync_arg_t *dsa = zio->io_private; |
1327 blkptr_t *bp_orig = &zio->io_bp_orig; |
|
1311 1312 if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { | 1328 1329 if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { |
1313 ASSERT(zio->io_bp->blk_birth == zio->io_txg); 1314 ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); 1315 zio_free(zio->io_spa, zio->io_txg, zio->io_bp); | 1330 /* 1331 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) 1332 * then there is nothing to do here. Otherwise, free the 1333 * newly allocated block in this txg. 1334 */ 1335 if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 1336 ASSERT(BP_EQUAL(bp, bp_orig)); 1337 } else { 1338 ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); 1339 ASSERT(zio->io_bp->blk_birth == zio->io_txg); 1340 ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); 1341 zio_free(zio->io_spa, zio->io_txg, zio->io_bp); 1342 } |
1316 } 1317 1318 dmu_tx_commit(dsa->dsa_tx); 1319 1320 dsa->dsa_done(dsa->dsa_zgd, zio->io_error); 1321 1322 kmem_free(dsa, sizeof (*dsa)); 1323} --- 28 unchanged lines hidden (view full) --- 1352 1353/* 1354 * Intent log support: sync the block associated with db to disk. 1355 * N.B. and XXX: the caller is responsible for making sure that the 1356 * data isn't changing while dmu_sync() is writing it. 1357 * 1358 * Return values: 1359 * | 1343 } 1344 1345 dmu_tx_commit(dsa->dsa_tx); 1346 1347 dsa->dsa_done(dsa->dsa_zgd, zio->io_error); 1348 1349 kmem_free(dsa, sizeof (*dsa)); 1350} --- 28 unchanged lines hidden (view full) --- 1379 1380/* 1381 * Intent log support: sync the block associated with db to disk. 1382 * N.B. and XXX: the caller is responsible for making sure that the 1383 * data isn't changing while dmu_sync() is writing it. 1384 * 1385 * Return values: 1386 * |
1360 * EEXIST: this txg has already been synced, so there's nothing to to. | 1387 * EEXIST: this txg has already been synced, so there's nothing to do. |
1361 * The caller should not log the write. 1362 * 1363 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 1364 * The caller should not log the write. 1365 * 1366 * EALREADY: this block is already in the process of being synced. 1367 * The caller should track its progress (somehow). 1368 * --- 15 unchanged lines hidden (view full) --- 1384 dsl_dataset_t *ds = os->os_dsl_dataset; 1385 dbuf_dirty_record_t *dr; 1386 dmu_sync_arg_t *dsa; 1387 zbookmark_t zb; 1388 zio_prop_t zp; 1389 dnode_t *dn; 1390 1391 ASSERT(pio != NULL); | 1388 * The caller should not log the write. 1389 * 1390 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 1391 * The caller should not log the write. 1392 * 1393 * EALREADY: this block is already in the process of being synced. 1394 * The caller should track its progress (somehow). 1395 * --- 15 unchanged lines hidden (view full) --- 1411 dsl_dataset_t *ds = os->os_dsl_dataset; 1412 dbuf_dirty_record_t *dr; 1413 dmu_sync_arg_t *dsa; 1414 zbookmark_t zb; 1415 zio_prop_t zp; 1416 dnode_t *dn; 1417 1418 ASSERT(pio != NULL); |
1392 ASSERT(BP_IS_HOLE(bp)); | |
1393 ASSERT(txg != 0); 1394 1395 SET_BOOKMARK(&zb, ds->ds_object, 1396 db->db.db_object, db->db_level, db->db_blkid); 1397 1398 DB_DNODE_ENTER(db); 1399 dn = DB_DNODE(db); 1400 dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); --- 38 unchanged lines hidden (view full) --- 1439 /* 1440 * There's no dr for this dbuf, so it must have been freed. 1441 * There's no need to log writes to freed blocks, so we're done. 1442 */ 1443 mutex_exit(&db->db_mtx); 1444 return (ENOENT); 1445 } 1446 | 1419 ASSERT(txg != 0); 1420 1421 SET_BOOKMARK(&zb, ds->ds_object, 1422 db->db.db_object, db->db_level, db->db_blkid); 1423 1424 DB_DNODE_ENTER(db); 1425 dn = DB_DNODE(db); 1426 dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); --- 38 unchanged lines hidden (view full) --- 1465 /* 1466 * There's no dr for this dbuf, so it must have been freed. 1467 * There's no need to log writes to freed blocks, so we're done. 1468 */ 1469 mutex_exit(&db->db_mtx); 1470 return (ENOENT); 1471 } 1472 |
1473 ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); 1474 1475 /* 1476 * Assume the on-disk data is X, the current syncing data is Y, 1477 * and the current in-memory data is Z (currently in dmu_sync). 1478 * X and Z are identical but Y has been modified. Normally, 1479 * when X and Z are the same we will perform a nopwrite but if Y 1480 * is different we must disable nopwrite since the resulting write 1481 * of Y to disk can free the block containing X. If we allowed a 1482 * nopwrite to occur the block pointing to Z would reference a freed 1483 * block. Since this is a rare case we simplify this by disabling 1484 * nopwrite if the current dmu_sync-ing dbuf has been modified in 1485 * a previous transaction. 1486 */ 1487 if (dr->dr_next) 1488 zp.zp_nopwrite = B_FALSE; 1489 |
|
1447 ASSERT(dr->dr_txg == txg); 1448 if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || 1449 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 1450 /* 1451 * We have already issued a sync write for this buffer, 1452 * or this buffer has already been synced. It could not 1453 * have been dirtied since, or we would have cleared the state. 1454 */ --- 72 unchanged lines hidden (view full) --- 1527dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) 1528{ 1529 dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; 1530 boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || 1531 (wp & WP_SPILL)); 1532 enum zio_checksum checksum = os->os_checksum; 1533 enum zio_compress compress = os->os_compress; 1534 enum zio_checksum dedup_checksum = os->os_dedup_checksum; | 1490 ASSERT(dr->dr_txg == txg); 1491 if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || 1492 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 1493 /* 1494 * We have already issued a sync write for this buffer, 1495 * or this buffer has already been synced. It could not 1496 * have been dirtied since, or we would have cleared the state. 1497 */ --- 72 unchanged lines hidden (view full) --- 1570dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) 1571{ 1572 dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; 1573 boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || 1574 (wp & WP_SPILL)); 1575 enum zio_checksum checksum = os->os_checksum; 1576 enum zio_compress compress = os->os_compress; 1577 enum zio_checksum dedup_checksum = os->os_dedup_checksum; |
1535 boolean_t dedup; | 1578 boolean_t dedup = B_FALSE; 1579 boolean_t nopwrite = B_FALSE; |
1536 boolean_t dedup_verify = os->os_dedup_verify; 1537 int copies = os->os_copies; 1538 1539 /* | 1580 boolean_t dedup_verify = os->os_dedup_verify; 1581 int copies = os->os_copies; 1582 1583 /* |
1540 * Determine checksum setting. | 1584 * We maintain different write policies for each of the following 1585 * types of data: 1586 * 1. metadata 1587 * 2. preallocated blocks (i.e. level-0 blocks of a dump device) 1588 * 3. all other level 0 blocks |
1541 */ 1542 if (ismd) { 1543 /* | 1589 */ 1590 if (ismd) { 1591 /* |
1592 * XXX -- we should design a compression algorithm 1593 * that specializes in arrays of bps. 1594 */ 1595 compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : 1596 ZIO_COMPRESS_LZJB; 1597 1598 /* |
|
1544 * Metadata always gets checksummed. If the data 1545 * checksum is multi-bit correctable, and it's not a 1546 * ZBT-style checksum, then it's suitable for metadata 1547 * as well. Otherwise, the metadata checksum defaults 1548 * to fletcher4. 1549 */ 1550 if (zio_checksum_table[checksum].ci_correctable < 1 || 1551 zio_checksum_table[checksum].ci_eck) 1552 checksum = ZIO_CHECKSUM_FLETCHER_4; | 1599 * Metadata always gets checksummed. If the data 1600 * checksum is multi-bit correctable, and it's not a 1601 * ZBT-style checksum, then it's suitable for metadata 1602 * as well. Otherwise, the metadata checksum defaults 1603 * to fletcher4. 1604 */ 1605 if (zio_checksum_table[checksum].ci_correctable < 1 || 1606 zio_checksum_table[checksum].ci_eck) 1607 checksum = ZIO_CHECKSUM_FLETCHER_4; |
1553 } else { 1554 checksum = zio_checksum_select(dn->dn_checksum, checksum); 1555 } | 1608 } else if (wp & WP_NOFILL) { 1609 ASSERT(level == 0); |
1556 | 1610 |
1557 /* 1558 * Determine compression setting. 1559 */ 1560 if (ismd) { | |
1561 /* | 1611 /* |
1562 * XXX -- we should design a compression algorithm 1563 * that specializes in arrays of bps. | 1612 * If we're writing preallocated blocks, we aren't actually 1613 * writing them so don't set any policy properties. These 1614 * blocks are currently only used by an external subsystem 1615 * outside of zfs (i.e. dump) and not written by the zio 1616 * pipeline. |
1564 */ | 1617 */ |
1565 compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : 1566 ZIO_COMPRESS_LZJB; | 1618 compress = ZIO_COMPRESS_OFF; 1619 checksum = ZIO_CHECKSUM_OFF; |
1567 } else { 1568 compress = zio_compress_select(dn->dn_compress, compress); | 1620 } else { 1621 compress = zio_compress_select(dn->dn_compress, compress); |
1569 } | |
1570 | 1622 |
1571 /* 1572 * Determine dedup setting. If we are in dmu_sync(), we won't 1573 * actually dedup now because that's all done in syncing context; 1574 * but we do want to use the dedup checksum. If the checksum is not 1575 * strong enough to ensure unique signatures, force dedup_verify. 1576 */ 1577 dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); 1578 if (dedup) { 1579 checksum = dedup_checksum; 1580 if (!zio_checksum_table[checksum].ci_dedup) 1581 dedup_verify = 1; 1582 } | 1623 checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? 1624 zio_checksum_select(dn->dn_checksum, checksum) : 1625 dedup_checksum; |
1583 | 1626 |
1584 if (wp & WP_DMU_SYNC) 1585 dedup = 0; | 1627 /* 1628 * Determine dedup setting. If we are in dmu_sync(), 1629 * we won't actually dedup now because that's all 1630 * done in syncing context; but we do want to use the 1631 * dedup checksum. If the checksum is not strong 1632 * enough to ensure unique signatures, force 1633 * dedup_verify. 1634 */ 1635 if (dedup_checksum != ZIO_CHECKSUM_OFF) { 1636 dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; 1637 if (!zio_checksum_table[checksum].ci_dedup) 1638 dedup_verify = B_TRUE; 1639 } |
1586 | 1640 |
1587 if (wp & WP_NOFILL) { 1588 ASSERT(!ismd && level == 0); 1589 checksum = ZIO_CHECKSUM_OFF; 1590 compress = ZIO_COMPRESS_OFF; 1591 dedup = B_FALSE; | 1641 /* 1642 * Enable nopwrite if we have a cryptographically secure 1643 * checksum that has no known collisions (i.e. SHA-256) 1644 * and compression is enabled. We don't enable nopwrite if 1645 * dedup is enabled as the two features are mutually exclusive. 1646 */ 1647 nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && 1648 compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); |
1592 } 1593 1594 zp->zp_checksum = checksum; 1595 zp->zp_compress = compress; 1596 zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; 1597 zp->zp_level = level; 1598 zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); 1599 zp->zp_dedup = dedup; 1600 zp->zp_dedup_verify = dedup && dedup_verify; | 1649 } 1650 1651 zp->zp_checksum = checksum; 1652 zp->zp_compress = compress; 1653 zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; 1654 zp->zp_level = level; 1655 zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); 1656 zp->zp_dedup = dedup; 1657 zp->zp_dedup_verify = dedup && dedup_verify; |
1658 zp->zp_nopwrite = nopwrite; |
|
1601} 1602 1603int 1604dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) 1605{ 1606 dnode_t *dn; 1607 int i, err; 1608 --- 180 unchanged lines hidden --- | 1659} 1660 1661int 1662dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) 1663{ 1664 dnode_t *dn; 1665 int i, err; 1666 --- 180 unchanged lines hidden --- |