Deleted Added
full compact
zio.c (267992) zio.c (268075)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 23 unchanged lines hidden (view full) ---

32#include <sys/vdev_impl.h>
33#include <sys/zio_impl.h>
34#include <sys/zio_compress.h>
35#include <sys/zio_checksum.h>
36#include <sys/dmu_objset.h>
37#include <sys/arc.h>
38#include <sys/ddt.h>
39#include <sys/trim_map.h>
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 23 unchanged lines hidden (view full) ---

32#include <sys/vdev_impl.h>
33#include <sys/zio_impl.h>
34#include <sys/zio_compress.h>
35#include <sys/zio_checksum.h>
36#include <sys/dmu_objset.h>
37#include <sys/arc.h>
38#include <sys/ddt.h>
39#include <sys/trim_map.h>
40#include <sys/blkptr.h>
40#include <sys/zfeature.h>
41
42SYSCTL_DECL(_vfs_zfs);
43SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
44#if defined(__amd64__)
45static int zio_use_uma = 1;
46#else
47static int zio_use_uma = 0;

--- 210 unchanged lines hidden (view full) ---

258 * excess / transient data in-core during a crashdump.
259 */
260void *
261zio_buf_alloc(size_t size)
262{
263 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
264 int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
265
41#include <sys/zfeature.h>
42
43SYSCTL_DECL(_vfs_zfs);
44SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
45#if defined(__amd64__)
46static int zio_use_uma = 1;
47#else
48static int zio_use_uma = 0;

--- 210 unchanged lines hidden (view full) ---

259 * excess / transient data in-core during a crashdump.
260 */
261void *
262zio_buf_alloc(size_t size)
263{
264 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
265 int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
266
266 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
267 ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
267
268 if (zio_use_uma)
269 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
270 else
271 return (kmem_alloc(size, KM_SLEEP|flags));
272}
273
274/*

--- 418 unchanged lines hidden (view full) ---

693 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
694 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
695 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
696
697 zio->io_ready = ready;
698 zio->io_physdone = physdone;
699 zio->io_prop = *zp;
700
268
269 if (zio_use_uma)
270 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
271 else
272 return (kmem_alloc(size, KM_SLEEP|flags));
273}
274
275/*

--- 418 unchanged lines hidden (view full) ---

694 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
695 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
696 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
697
698 zio->io_ready = ready;
699 zio->io_physdone = physdone;
700 zio->io_prop = *zp;
701
702 /*
703 * Data can be NULL if we are going to call zio_write_override() to
704 * provide the already-allocated BP. But we may need the data to
705 * verify a dedup hit (if requested). In this case, don't try to
706 * dedup (just take the already-allocated BP verbatim).
707 */
708 if (data == NULL && zio->io_prop.zp_dedup_verify) {
709 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
710 }
711
701 return (zio);
702}
703
704zio_t *
705zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
706 uint64_t size, zio_done_func_t *done, void *private,
707 zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
708{

--- 23 unchanged lines hidden (view full) ---

732 zio->io_prop.zp_nopwrite = nopwrite;
733 zio->io_prop.zp_copies = copies;
734 zio->io_bp_override = bp;
735}
736
737void
738zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
739{
712 return (zio);
713}
714
715zio_t *
716zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
717 uint64_t size, zio_done_func_t *done, void *private,
718 zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
719{

--- 23 unchanged lines hidden (view full) ---

743 zio->io_prop.zp_nopwrite = nopwrite;
744 zio->io_prop.zp_copies = copies;
745 zio->io_bp_override = bp;
746}
747
748void
749zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
750{
751
752 /*
753 * The check for EMBEDDED is a performance optimization. We
754 * process the free here (by ignoring it) rather than
755 * putting it on the list and then processing it in zio_free_sync().
756 */
757 if (BP_IS_EMBEDDED(bp))
758 return;
740 metaslab_check_free(spa, bp);
741
742 /*
743 * Frees that are for the currently-syncing txg, are not going to be
744 * deferred, and which will not need to do a read (i.e. not GANG or
745 * DEDUP), can be processed immediately. Otherwise, put them on the
746 * in-memory list for later processing.
747 */

--- 9 unchanged lines hidden (view full) ---

757
758zio_t *
759zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
760 uint64_t size, enum zio_flag flags)
761{
762 zio_t *zio;
763 enum zio_stage stage = ZIO_FREE_PIPELINE;
764
759 metaslab_check_free(spa, bp);
760
761 /*
762 * Frees that are for the currently-syncing txg, are not going to be
763 * deferred, and which will not need to do a read (i.e. not GANG or
764 * DEDUP), can be processed immediately. Otherwise, put them on the
765 * in-memory list for later processing.
766 */

--- 9 unchanged lines hidden (view full) ---

776
777zio_t *
778zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
779 uint64_t size, enum zio_flag flags)
780{
781 zio_t *zio;
782 enum zio_stage stage = ZIO_FREE_PIPELINE;
783
765 dprintf_bp(bp, "freeing in txg %llu, pass %u",
766 (longlong_t)txg, spa->spa_sync_pass);
767
768 ASSERT(!BP_IS_HOLE(bp));
769 ASSERT(spa_syncing_txg(spa) == txg);
770 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
771
784 ASSERT(!BP_IS_HOLE(bp));
785 ASSERT(spa_syncing_txg(spa) == txg);
786 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
787
788 if (BP_IS_EMBEDDED(bp))
789 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
790
772 metaslab_check_free(spa, bp);
773 arc_freed(spa, bp);
774
775 if (zfs_trim_enabled)
776 stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
777 ZIO_STAGE_VDEV_IO_ASSESS;
778 /*
779 * GANG and DEDUP blocks can induce a read (for the gang block header,

--- 13 unchanged lines hidden (view full) ---

793}
794
795zio_t *
796zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
797 zio_done_func_t *done, void *private, enum zio_flag flags)
798{
799 zio_t *zio;
800
791 metaslab_check_free(spa, bp);
792 arc_freed(spa, bp);
793
794 if (zfs_trim_enabled)
795 stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
796 ZIO_STAGE_VDEV_IO_ASSESS;
797 /*
798 * GANG and DEDUP blocks can induce a read (for the gang block header,

--- 13 unchanged lines hidden (view full) ---

812}
813
814zio_t *
815zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
816 zio_done_func_t *done, void *private, enum zio_flag flags)
817{
818 zio_t *zio;
819
820 dprintf_bp(bp, "claiming in txg %llu", txg);
821
822 if (BP_IS_EMBEDDED(bp))
823 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
824
801 /*
802 * A claim is an allocation of a specific block. Claims are needed
803 * to support immediate writes in the intent log. The issue is that
804 * immediate writes contain committed data, but in a txg that was
805 * *not* committed. Upon opening the pool after an unclean shutdown,
806 * the intent log claims all blocks that contain immediate write data
807 * so that the SPA knows they're in use.
808 *

--- 208 unchanged lines hidden (view full) ---

1017zio_read_bp_init(zio_t **ziop)
1018{
1019 zio_t *zio = *ziop;
1020 blkptr_t *bp = zio->io_bp;
1021
1022 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1023 zio->io_child_type == ZIO_CHILD_LOGICAL &&
1024 !(zio->io_flags & ZIO_FLAG_RAW)) {
825 /*
826 * A claim is an allocation of a specific block. Claims are needed
827 * to support immediate writes in the intent log. The issue is that
828 * immediate writes contain committed data, but in a txg that was
829 * *not* committed. Upon opening the pool after an unclean shutdown,
830 * the intent log claims all blocks that contain immediate write data
831 * so that the SPA knows they're in use.
832 *

--- 208 unchanged lines hidden (view full) ---

1041zio_read_bp_init(zio_t **ziop)
1042{
1043 zio_t *zio = *ziop;
1044 blkptr_t *bp = zio->io_bp;
1045
1046 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1047 zio->io_child_type == ZIO_CHILD_LOGICAL &&
1048 !(zio->io_flags & ZIO_FLAG_RAW)) {
1025 uint64_t psize = BP_GET_PSIZE(bp);
1049 uint64_t psize =
1050 BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
1026 void *cbuf = zio_buf_alloc(psize);
1027
1028 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1029 }
1030
1051 void *cbuf = zio_buf_alloc(psize);
1052
1053 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1054 }
1055
1056 if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1057 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1058 decode_embedded_bp_compressed(bp, zio->io_data);
1059 } else {
1060 ASSERT(!BP_IS_EMBEDDED(bp));
1061 }
1062
1031 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1032 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1033
1034 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1035 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1036
1037 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1038 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

--- 28 unchanged lines hidden (view full) ---

1067
1068 if (zio->io_bp_override) {
1069 ASSERT(bp->blk_birth != zio->io_txg);
1070 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1071
1072 *bp = *zio->io_bp_override;
1073 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1074
1063 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1064 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1065
1066 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1067 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1068
1069 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1070 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

--- 28 unchanged lines hidden (view full) ---

1099
1100 if (zio->io_bp_override) {
1101 ASSERT(bp->blk_birth != zio->io_txg);
1102 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1103
1104 *bp = *zio->io_bp_override;
1105 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1106
1107 if (BP_IS_EMBEDDED(bp))
1108 return (ZIO_PIPELINE_CONTINUE);
1109
1075 /*
1076 * If we've been overridden and nopwrite is set then
1077 * set the flag accordingly to indicate that a nopwrite
1078 * has already occurred.
1079 */
1080 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1081 ASSERT(!zp->zp_dedup);
1082 zio->io_flags |= ZIO_FLAG_NOPWRITE;

--- 32 unchanged lines hidden (view full) ---

1115 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1116 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1117 ASSERT(!BP_GET_DEDUP(bp));
1118
1119 if (pass >= zfs_sync_pass_dont_compress)
1120 compress = ZIO_COMPRESS_OFF;
1121
1122 /* Make sure someone doesn't change their mind on overwrites */
1110 /*
1111 * If we've been overridden and nopwrite is set then
1112 * set the flag accordingly to indicate that a nopwrite
1113 * has already occurred.
1114 */
1115 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1116 ASSERT(!zp->zp_dedup);
1117 zio->io_flags |= ZIO_FLAG_NOPWRITE;

--- 32 unchanged lines hidden (view full) ---

1150 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1151 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1152 ASSERT(!BP_GET_DEDUP(bp));
1153
1154 if (pass >= zfs_sync_pass_dont_compress)
1155 compress = ZIO_COMPRESS_OFF;
1156
1157 /* Make sure someone doesn't change their mind on overwrites */
1123 ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
1158 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1124 spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1125 }
1126
1127 if (compress != ZIO_COMPRESS_OFF) {
1128 metaslab_class_t *mc = spa_normal_class(spa);
1129 void *cbuf = zio_buf_alloc(lsize);
1130 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize,
1131 (size_t)metaslab_class_get_minblocksize(mc));
1132 if (psize == 0 || psize == lsize) {
1133 compress = ZIO_COMPRESS_OFF;
1134 zio_buf_free(cbuf, lsize);
1159 spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1160 }
1161
1162 if (compress != ZIO_COMPRESS_OFF) {
1163 metaslab_class_t *mc = spa_normal_class(spa);
1164 void *cbuf = zio_buf_alloc(lsize);
1165 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize,
1166 (size_t)metaslab_class_get_minblocksize(mc));
1167 if (psize == 0 || psize == lsize) {
1168 compress = ZIO_COMPRESS_OFF;
1169 zio_buf_free(cbuf, lsize);
1170 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1171 zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1172 spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1173 encode_embedded_bp_compressed(bp,
1174 cbuf, compress, lsize, psize);
1175 BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1176 BP_SET_TYPE(bp, zio->io_prop.zp_type);
1177 BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1178 zio_buf_free(cbuf, lsize);
1179 bp->blk_birth = zio->io_txg;
1180 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1181 ASSERT(spa_feature_is_active(spa,
1182 SPA_FEATURE_EMBEDDED_DATA));
1183 return (ZIO_PIPELINE_CONTINUE);
1135 } else {
1184 } else {
1136 ASSERT(psize < lsize);
1137 zio_push_transform(zio, cbuf, psize, lsize, NULL);
1185 /*
1186 * Round up compressed size to MINBLOCKSIZE and
1187 * zero the tail.
1188 */
1189 size_t rounded =
1190 P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
1191 if (rounded > psize) {
1192 bzero((char *)cbuf + psize, rounded - psize);
1193 psize = rounded;
1194 }
1195 if (psize == lsize) {
1196 compress = ZIO_COMPRESS_OFF;
1197 zio_buf_free(cbuf, lsize);
1198 } else {
1199 zio_push_transform(zio, cbuf,
1200 psize, lsize, NULL);
1201 }
1138 }
1139 }
1140
1141 /*
1142 * The final pass of spa_sync() must be all rewrites, but the first
1143 * few passes offer a trade-off: allocating blocks defers convergence,
1144 * but newly allocated blocks are sequential, so they can be written
1145 * to disk faster. Therefore, we allow the first few passes of

--- 1757 unchanged lines hidden (view full) ---

2903zio_checksum_verified(zio_t *zio)
2904{
2905 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2906}
2907
2908/*
2909 * ==========================================================================
2910 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
1202 }
1203 }
1204
1205 /*
1206 * The final pass of spa_sync() must be all rewrites, but the first
1207 * few passes offer a trade-off: allocating blocks defers convergence,
1208 * but newly allocated blocks are sequential, so they can be written
1209 * to disk faster. Therefore, we allow the first few passes of

--- 1757 unchanged lines hidden (view full) ---

2967zio_checksum_verified(zio_t *zio)
2968{
2969 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2970}
2971
2972/*
2973 * ==========================================================================
2974 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2911 * An error of 0 indictes success. ENXIO indicates whole-device failure,
2975 * An error of 0 indicates success. ENXIO indicates whole-device failure,
2912 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
2913 * indicate errors that are specific to one I/O, and most likely permanent.
2914 * Any other error is presumed to be worse because we weren't expecting it.
2915 * ==========================================================================
2916 */
2917int
2918zio_worst_error(int e1, int e2)
2919{

--- 95 unchanged lines hidden (view full) ---

3015 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3016 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3017 return (ZIO_PIPELINE_STOP);
3018
3019 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
3020 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
3021 ASSERT(zio->io_children[c][w] == 0);
3022
2976 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
2977 * indicate errors that are specific to one I/O, and most likely permanent.
2978 * Any other error is presumed to be worse because we weren't expecting it.
2979 * ==========================================================================
2980 */
2981int
2982zio_worst_error(int e1, int e2)
2983{

--- 95 unchanged lines hidden (view full) ---

3079 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3080 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3081 return (ZIO_PIPELINE_STOP);
3082
3083 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
3084 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
3085 ASSERT(zio->io_children[c][w] == 0);
3086
3023 if (bp != NULL) {
3087 if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
3024 ASSERT(bp->blk_pad[0] == 0);
3025 ASSERT(bp->blk_pad[1] == 0);
3026 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
3027 (bp == zio_unique_parent(zio)->io_bp));
3028 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
3029 zio->io_bp_override == NULL &&
3030 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3031 ASSERT(!BP_SHOULD_BYTESWAP(bp));

--- 316 unchanged lines hidden ---
3088 ASSERT(bp->blk_pad[0] == 0);
3089 ASSERT(bp->blk_pad[1] == 0);
3090 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
3091 (bp == zio_unique_parent(zio)->io_bp));
3092 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
3093 zio->io_bp_override == NULL &&
3094 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3095 ASSERT(!BP_SHOULD_BYTESWAP(bp));

--- 316 unchanged lines hidden ---