Deleted Added
full compact
vdev.c (254112) vdev.c (254591)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 38 unchanged lines hidden (view full) ---

47
48SYSCTL_DECL(_vfs_zfs);
49SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
50
51/*
52 * Virtual device management.
53 */
54
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 38 unchanged lines hidden (view full) ---

47
48SYSCTL_DECL(_vfs_zfs);
49SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
50
51/*
52 * Virtual device management.
53 */
54
55/**
56 * The limit for ZFS to automatically increase a top-level vdev's ashift
57 * from logical ashift to physical ashift.
58 *
59 * Example: one or more 512B emulation child vdevs
60 * child->vdev_ashift = 9 (512 bytes)
61 * child->vdev_physical_ashift = 12 (4096 bytes)
62 * zfs_max_auto_ashift = 11 (2048 bytes)
63 *
64 * On pool creation or the addition of a new top-level vdev, ZFS will
65 * bump the ashift of the top-level vdev to 11 (2048 bytes).
66 *
67 * Example: one or more 512B emulation child vdevs
68 * child->vdev_ashift = 9 (512 bytes)
69 * child->vdev_physical_ashift = 12 (4096 bytes)
70 * zfs_max_auto_ashift = 13 (8192 bytes)
71 *
72 * On pool creation or the addition of a new top-level vdev, ZFS will
73 * bump the ashift of the top-level vdev to 12 (4096 bytes).
74 */
75static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
76
/*
 * Sysctl handler for the zfs_max_auto_ashift tunable.
 *
 * Passes the current value through sysctl_handle_64() and, when a new
 * value is supplied, stores it in zfs_max_auto_ashift after limiting it
 * to SPA_MAXASHIFT.  Returns the error from sysctl_handle_64(), or 0
 * once the new value has been stored.
 */
77static int
78sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
79{
80 uint64_t val;
81 int err;
82
83 val = zfs_max_auto_ashift; /* local copy; the global is only written after the range check */
84 err = sysctl_handle_64(oidp, &val, 0, req); /* NOTE(review): presumably reports val and fetches any new value -- confirm against sysctl(9) */
85 if (err != 0 || req->newptr == NULL) /* handler error, or (presumably) a read-only request with no new value */
86 return (err);
87
88 if (val > SPA_MAXASHIFT) /* an oversized request is silently clamped rather than rejected */
89 val = SPA_MAXASHIFT;
90
91 zfs_max_auto_ashift = val;
92
93 return (0);
94}
95SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
96 CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
97 sysctl_vfs_zfs_max_auto_ashift, "QU",
98 "Cap on logical -> physical ashift adjustment on new top-level vdevs."); /* registers the handler above as the read-write U64 node max_auto_ashift under _vfs_zfs */
99
55static vdev_ops_t *vdev_ops_table[] = {
56 &vdev_root_ops,
57 &vdev_raidz_ops,
58 &vdev_mirror_ops,
59 &vdev_replacing_ops,
60 &vdev_spare_ops,
61#ifdef _KERNEL
62 &vdev_geom_ops,

--- 678 unchanged lines hidden (view full) ---

741 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
742
743 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
744
745 mvd->vdev_asize = cvd->vdev_asize;
746 mvd->vdev_min_asize = cvd->vdev_min_asize;
747 mvd->vdev_max_asize = cvd->vdev_max_asize;
748 mvd->vdev_ashift = cvd->vdev_ashift;
100static vdev_ops_t *vdev_ops_table[] = {
101 &vdev_root_ops,
102 &vdev_raidz_ops,
103 &vdev_mirror_ops,
104 &vdev_replacing_ops,
105 &vdev_spare_ops,
106#ifdef _KERNEL
107 &vdev_geom_ops,

--- 678 unchanged lines hidden (view full) ---

786 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
787
788 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
789
790 mvd->vdev_asize = cvd->vdev_asize;
791 mvd->vdev_min_asize = cvd->vdev_min_asize;
792 mvd->vdev_max_asize = cvd->vdev_max_asize;
793 mvd->vdev_ashift = cvd->vdev_ashift;
794 mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
795 mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
749 mvd->vdev_state = cvd->vdev_state;
750 mvd->vdev_crtxg = cvd->vdev_crtxg;
751
752 vdev_remove_child(pvd, cvd);
753 vdev_add_child(pvd, mvd);
754 cvd->vdev_id = mvd->vdev_children;
755 vdev_add_child(mvd, cvd);
756 vdev_top_update(cvd->vdev_top, cvd->vdev_top);

--- 15 unchanged lines hidden (view full) ---

772
773 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
774
775 ASSERT(mvd->vdev_children == 1);
776 ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
777 mvd->vdev_ops == &vdev_replacing_ops ||
778 mvd->vdev_ops == &vdev_spare_ops);
779 cvd->vdev_ashift = mvd->vdev_ashift;
796 mvd->vdev_state = cvd->vdev_state;
797 mvd->vdev_crtxg = cvd->vdev_crtxg;
798
799 vdev_remove_child(pvd, cvd);
800 vdev_add_child(pvd, mvd);
801 cvd->vdev_id = mvd->vdev_children;
802 vdev_add_child(mvd, cvd);
803 vdev_top_update(cvd->vdev_top, cvd->vdev_top);

--- 15 unchanged lines hidden (view full) ---

819
820 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
821
822 ASSERT(mvd->vdev_children == 1);
823 ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
824 mvd->vdev_ops == &vdev_replacing_ops ||
825 mvd->vdev_ops == &vdev_spare_ops);
826 cvd->vdev_ashift = mvd->vdev_ashift;
827 cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
828 cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
780
781 vdev_remove_child(mvd, cvd);
782 vdev_remove_child(pvd, mvd);
783
784 /*
785 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
786 * Otherwise, we could have detached an offline device, and when we
787 * go to import the pool we'll think we have two top-level vdevs,

--- 327 unchanged lines hidden (view full) ---

1115int
1116vdev_open(vdev_t *vd)
1117{
1118 spa_t *spa = vd->vdev_spa;
1119 int error;
1120 uint64_t osize = 0;
1121 uint64_t max_osize = 0;
1122 uint64_t asize, max_asize, psize;
829
830 vdev_remove_child(mvd, cvd);
831 vdev_remove_child(pvd, mvd);
832
833 /*
834 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
835 * Otherwise, we could have detached an offline device, and when we
836 * go to import the pool we'll think we have two top-level vdevs,

--- 327 unchanged lines hidden (view full) ---

1164int
1165vdev_open(vdev_t *vd)
1166{
1167 spa_t *spa = vd->vdev_spa;
1168 int error;
1169 uint64_t osize = 0;
1170 uint64_t max_osize = 0;
1171 uint64_t asize, max_asize, psize;
1123 uint64_t ashift = 0;
1172 uint64_t logical_ashift = 0;
1173 uint64_t physical_ashift = 0;
1124
1125 ASSERT(vd->vdev_open_thread == curthread ||
1126 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1127 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1128 vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1129 vd->vdev_state == VDEV_STATE_OFFLINE);
1130
1131 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

--- 13 unchanged lines hidden (view full) ---

1145 vd->vdev_label_aux);
1146 return (SET_ERROR(ENXIO));
1147 } else if (vd->vdev_offline) {
1148 ASSERT(vd->vdev_children == 0);
1149 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1150 return (SET_ERROR(ENXIO));
1151 }
1152
1174
1175 ASSERT(vd->vdev_open_thread == curthread ||
1176 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1177 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1178 vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1179 vd->vdev_state == VDEV_STATE_OFFLINE);
1180
1181 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

--- 13 unchanged lines hidden (view full) ---

1195 vd->vdev_label_aux);
1196 return (SET_ERROR(ENXIO));
1197 } else if (vd->vdev_offline) {
1198 ASSERT(vd->vdev_children == 0);
1199 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1200 return (SET_ERROR(ENXIO));
1201 }
1202
1153 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
1203 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
1204 &logical_ashift, &physical_ashift);
1154
1155 /*
1156 * Reset the vdev_reopening flag so that we actually close
1157 * the vdev on error.
1158 */
1159 vd->vdev_reopening = B_FALSE;
1160 if (zio_injection_enabled && error == 0)
1161 error = zio_handle_device_injection(vd, NULL, ENXIO);

--- 81 unchanged lines hidden (view full) ---

1243 * Make sure the allocatable size hasn't shrunk.
1244 */
1245 if (asize < vd->vdev_min_asize) {
1246 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1247 VDEV_AUX_BAD_LABEL);
1248 return (SET_ERROR(EINVAL));
1249 }
1250
1205
1206 /*
1207 * Reset the vdev_reopening flag so that we actually close
1208 * the vdev on error.
1209 */
1210 vd->vdev_reopening = B_FALSE;
1211 if (zio_injection_enabled && error == 0)
1212 error = zio_handle_device_injection(vd, NULL, ENXIO);

--- 81 unchanged lines hidden (view full) ---

1294 * Make sure the allocatable size hasn't shrunk.
1295 */
1296 if (asize < vd->vdev_min_asize) {
1297 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1298 VDEV_AUX_BAD_LABEL);
1299 return (SET_ERROR(EINVAL));
1300 }
1301
1302 vd->vdev_physical_ashift =
1303 MAX(physical_ashift, vd->vdev_physical_ashift);
1304 vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
1305 vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
1306
1307 if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
1308 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1309 VDEV_AUX_ASHIFT_TOO_BIG);
1310 return (EINVAL);
1311 }
1312
1251 if (vd->vdev_asize == 0) {
1252 /*
1253 * This is the first-ever open, so use the computed values.
1254 * For testing purposes, a higher ashift can be requested.
1255 */
1256 vd->vdev_asize = asize;
1257 vd->vdev_max_asize = max_asize;
1313 if (vd->vdev_asize == 0) {
1314 /*
1315 * This is the first-ever open, so use the computed values.
1316 * For testing purposes, a higher ashift can be requested.
1317 */
1318 vd->vdev_asize = asize;
1319 vd->vdev_max_asize = max_asize;
1258 vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
1259 } else {
1260 /*
1320 } else {
1321 /*
1261 * Detect if the alignment requirement has increased.
1262 * We don't want to make the pool unavailable, just
1263 * issue a warning instead.
1322 * Make sure the alignment requirement hasn't increased.
1264 */
1323 */
1265 if (ashift > vd->vdev_top->vdev_ashift &&
1324 if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
1266 vd->vdev_ops->vdev_op_leaf) {
1325 vd->vdev_ops->vdev_op_leaf) {
1267 cmn_err(CE_WARN,
1268 "Disk, '%s', has a block alignment that is "
1269 "larger than the pool's alignment\n",
1270 vd->vdev_path);
1326 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1327 VDEV_AUX_BAD_LABEL);
1328 return (EINVAL);
1271 }
1272 vd->vdev_max_asize = max_asize;
1273 }
1274
1275 /*
1276 * If all children are healthy and the asize has increased,
1277 * then we've experienced dynamic LUN growth. If automatic
1278 * expansion is enabled then use the additional space.

--- 293 unchanged lines hidden (view full) ---

1572{
1573 /*
1574 * Aim for roughly 200 metaslabs per vdev.
1575 */
1576 vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
1577 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1578}
1579
1329 }
1330 vd->vdev_max_asize = max_asize;
1331 }
1332
1333 /*
1334 * If all children are healthy and the asize has increased,
1335 * then we've experienced dynamic LUN growth. If automatic
1336 * expansion is enabled then use the additional space.

--- 293 unchanged lines hidden (view full) ---

1630{
1631 /*
1632 * Aim for roughly 200 metaslabs per vdev.
1633 */
1634 vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
1635 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1636}
1637
1638/*
1639 * Maximize performance by inflating the configured ashift for
1640 * top level vdevs to be as close to the physical ashift as
1641 * possible without exceeding the administrator specified
1642 * limit.
1643 */
1580void
1644void
1645vdev_ashift_optimize(vdev_t *vd)
1646{
1647 if (vd == vd->vdev_top && /* only a vdev that is its own top-level vdev is adjusted */
1648 (vd->vdev_ashift < vd->vdev_physical_ashift) && /* configured ashift is below the physical ashift */
1649 (vd->vdev_ashift < zfs_max_auto_ashift)) { /* and below the zfs_max_auto_ashift cap */
1650 vd->vdev_ashift = MIN(zfs_max_auto_ashift, /* both operands exceed the current ashift here, so this only ever raises it */
1651 vd->vdev_physical_ashift);
1652 }
1653}
1654
1655void
1581vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1582{
1583 ASSERT(vd == vd->vdev_top);
1584 ASSERT(!vd->vdev_ishole);
1585 ASSERT(ISP2(flags));
1586 ASSERT(spa_writeable(vd->vdev_spa));
1587
1588 if (flags & VDD_METASLAB)

--- 1001 unchanged lines hidden (view full) ---

2590 mutex_enter(&vd->vdev_stat_lock);
2591 bcopy(&vd->vdev_stat, vs, sizeof (*vs));
2592 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2593 vs->vs_state = vd->vdev_state;
2594 vs->vs_rsize = vdev_get_min_asize(vd);
2595 if (vd->vdev_ops->vdev_op_leaf)
2596 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
2597 vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
1656vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1657{
1658 ASSERT(vd == vd->vdev_top);
1659 ASSERT(!vd->vdev_ishole);
1660 ASSERT(ISP2(flags));
1661 ASSERT(spa_writeable(vd->vdev_spa));
1662
1663 if (flags & VDD_METASLAB)

--- 1001 unchanged lines hidden (view full) ---

2665 mutex_enter(&vd->vdev_stat_lock);
2666 bcopy(&vd->vdev_stat, vs, sizeof (*vs));
2667 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2668 vs->vs_state = vd->vdev_state;
2669 vs->vs_rsize = vdev_get_min_asize(vd);
2670 if (vd->vdev_ops->vdev_op_leaf)
2671 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
2672 vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
2673 vs->vs_configured_ashift = vd->vdev_top != NULL
2674 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
2675 vs->vs_logical_ashift = vd->vdev_logical_ashift;
2676 vs->vs_physical_ashift = vd->vdev_physical_ashift;
2598 mutex_exit(&vd->vdev_stat_lock);
2599
2600 /*
2601 * If we're getting stats on the root vdev, aggregate the I/O counts
2602 * over all top-level vdevs (i.e. the direct children of the root).
2603 */
2604 if (vd == rvd) {
2605 for (int c = 0; c < rvd->vdev_children; c++) {

--- 714 unchanged lines hidden ---
2677 mutex_exit(&vd->vdev_stat_lock);
2678
2679 /*
2680 * If we're getting stats on the root vdev, aggregate the I/O counts
2681 * over all top-level vdevs (i.e. the direct children of the root).
2682 */
2683 if (vd == rvd) {
2684 for (int c = 0; c < rvd->vdev_children; c++) {

--- 714 unchanged lines hidden ---