metaslab.c: diff of revision 177698 against revision 185029. Deleted lines (177698 numbering) appear first in each changed hunk; added lines (185029 numbering) follow. Unchanged context is shown once, with 185029 line numbers.
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 5 unchanged lines hidden ---

14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident "%Z%%M% %I% %E% SMI"
27
26#include <sys/zfs_context.h>
27#include <sys/spa_impl.h>
28#include <sys/dmu.h>
29#include <sys/dmu_tx.h>
30#include <sys/space_map.h>
31#include <sys/metaslab_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio.h>
34
35uint64_t metaslab_aliquot = 512ULL << 10;
36uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
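A minimal userland sketch of how the new metaslab_gang_bang knob behaves, assuming LBOLT is simply a clock-tick counter (the _model names below are illustrative, not part of the source): any allocation at or above the threshold is failed with ENOSPC one time in four, which forces the caller down the gang-block path.

#include <errno.h>
#include <stdint.h>

static uint64_t gang_bang_model = UINT64_MAX;	/* off by default */
static uint64_t lbolt_model;			/* stands in for LBOLT */

static int
gang_bang_check_model(uint64_t psize)
{
	/* Fail ~25% of oversized allocations to exercise gang blocks. */
	if (psize >= gang_bang_model && (lbolt_model++ & 3) == 0)
		return (ENOSPC);
	return (0);
}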
37
38/*
39 * ==========================================================================
40 * Metaslab classes
41 * ==========================================================================
42 */
43metaslab_class_t *
44metaslab_class_create(void)

--- 290 unchanged lines hidden ---

335
336void
337metaslab_fini(metaslab_t *msp)
338{
339 metaslab_group_t *mg = msp->ms_group;
340 int t;
341
342 vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
344 -msp->ms_smo.smo_alloc);
343 -msp->ms_smo.smo_alloc, B_TRUE);
344
345 metaslab_group_remove(mg, msp);
346
347 mutex_enter(&msp->ms_lock);
348
349 space_map_unload(&msp->ms_map);
350 space_map_destroy(&msp->ms_map);
351

--- 176 unchanged lines hidden ---

528
529 space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
530 space_map_sync(freemap, SM_FREE, smo, mos, tx);
531
532 mutex_exit(&msp->ms_lock);
533
534 VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
535 dmu_buf_will_dirty(db, tx);
537 ASSERT3U(db->db_size, ==, sizeof (*smo));
538 bcopy(smo, db->db_data, db->db_size);
536 ASSERT3U(db->db_size, >=, sizeof (*smo));
537 bcopy(smo, db->db_data, sizeof (*smo));
538 dmu_buf_rele(db, FTAG);
539
540 dmu_tx_commit(tx);
541}
542
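The hunk above loosens metaslab_sync()'s check on the bonus buffer: the buffer only has to be at least as large as the space-map object, and exactly sizeof (*smo) bytes are copied into it. A small standalone sketch of that invariant, with an assumed three-field object:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct smo_model {
	uint64_t	smo_object;
	uint64_t	smo_objsize;
	uint64_t	smo_alloc;
} smo_model_t;

static void
smo_copy_model(const smo_model_t *smo, void *bonus, size_t bonus_size)
{
	assert(bonus_size >= sizeof (*smo));	/* was: == sizeof (*smo) */
	memcpy(bonus, smo, sizeof (*smo));	/* was: all bonus_size bytes */
}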
543/*
544 * Called after a transaction group has completely synced to mark
545 * all of the metaslab's free space as usable.

--- 17 unchanged lines hidden ---

563 */
564 if (freed_map->sm_size == 0) {
565 for (t = 0; t < TXG_SIZE; t++) {
566 space_map_create(&msp->ms_allocmap[t], sm->sm_start,
567 sm->sm_size, sm->sm_shift, sm->sm_lock);
568 space_map_create(&msp->ms_freemap[t], sm->sm_start,
569 sm->sm_size, sm->sm_shift, sm->sm_lock);
570 }
572 vdev_space_update(vd, sm->sm_size, 0);
571 vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
572 }
573
575 vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc);
574 vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
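vdev_space_update() gains a trailing boolean in the new revision (here and in metaslab_fini() above). Judging from the call sites it appears to control whether the space accounting is also propagated to the root vdev, but that reading is an assumption; a reduced model under it:

#include <stddef.h>
#include <stdint.h>

typedef struct vd_model {
	int64_t		vd_space;	/* total space */
	int64_t		vd_alloc;	/* allocated space */
	struct vd_model	*vd_root;	/* root of the vdev tree */
} vd_model_t;

static void
vdev_space_update_model(vd_model_t *vd, int64_t space_delta,
    int64_t alloc_delta, int update_root)
{
	vd->vd_space += space_delta;
	vd->vd_alloc += alloc_delta;
	if (update_root && vd->vd_root != NULL) {	/* assumed semantics */
		vd->vd_root->vd_space += space_delta;
		vd->vd_root->vd_alloc += alloc_delta;
	}
}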
575
576 ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
577 ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
578
579 /*
580 * If there's a space_map_load() in progress, wait for it to complete
581 * so that we have a consistent view of the in-core space map.
582 * Then, add everything we freed in this txg to the map.

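The hidden lines below do what the comment describes: take ms_lock, wait out any space_map_load() still in flight, then merge the txg's freed ranges. A rough pthreads model of the wait, assuming the kernel sleeps on a condition variable tied to the map's lock:

#include <pthread.h>

typedef struct sm_model {
	pthread_mutex_t	sm_lock;
	pthread_cond_t	sm_load_cv;
	int		sm_loading;	/* nonzero while a load is in flight */
} sm_model_t;

/* Called with sm_lock held; returns once any pending load has finished. */
static void
sm_wait_load_model(sm_model_t *sm)
{
	while (sm->sm_loading)
		pthread_cond_wait(&sm->sm_load_cv, &sm->sm_lock);
}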
--- 125 unchanged lines hidden ---

708
709 return (offset);
710}
711
712/*
713 * Allocate a block for the specified i/o.
714 */
715static int
717metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
718 dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
716metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
717 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
718{
719 metaslab_group_t *mg, *rotor;
721 metaslab_class_t *mc;
720 vdev_t *vd;
721 int dshift = 3;
722 int all_zero;
723 uint64_t offset = -1ULL;
724 uint64_t asize;
725 uint64_t distance;
726
727 ASSERT(!DVA_IS_VALID(&dva[d]));
728
731 mc = spa_metaslab_class_select(spa);
729 /*
730 * For testing, make some blocks above a certain size be gang blocks.
731 */
732 if (psize >= metaslab_gang_bang && (LBOLT & 3) == 0)
733 return (ENOSPC);
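The rewritten prototype replaces the old hintdva_avoid boolean with a flags word that the body tests below (METASLAB_HINTBP_AVOID, METASLAB_GANG_HEADER). The names are taken from the new code; the values here are illustrative assumptions, shown only to make the bit tests readable:

#define	METASLAB_HINTBP_AVOID	0x1	/* steer away from the hint's vdev */
#define	METASLAB_GANG_HEADER	0x2	/* allocating a gang block header */

A caller can OR these together, which is what makes a flags argument roomier than the single boolean it replaces.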
734
735 /*
736 * Start at the rotor and loop through all mgs until we find something.
737 * Note that there's no locking on mc_rotor or mc_allocated because
738 * nothing actually breaks if we miss a few updates -- we just won't
739 * allocate quite as evenly. It all balances out over time.
740 *
741 * If we are doing ditto or log blocks, try to spread them across

--- 9 unchanged lines hidden ---

751 *
752 * If we are doing gang blocks (hintdva is non-NULL), try to keep
753 * ourselves on the same vdev as our gang block header. That
754 * way, we can hope for locality in vdev_cache, plus it makes our
755 * fault domains something tractable.
756 */
757 if (hintdva) {
758 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
757 if (hintdva_avoid)
759 if (flags & METASLAB_HINTBP_AVOID)
760 mg = vd->vdev_mg->mg_next;
761 else
762 mg = vd->vdev_mg;
763 } else if (d != 0) {
764 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
765 mg = vd->vdev_mg->mg_next;
766 } else {
767 mg = mc->mc_rotor;
768 }
767 rotor = mg;
768
769
770 /*
771 * If the hint put us into the wrong class, just follow the rotor.
772 */
773 if (mg->mg_class != mc)
774 mg = mc->mc_rotor;
775
776 rotor = mg;
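A compact userland model of the rotor walk described in the comment earlier in this function: metaslab groups form a circular list, and the deliberately unlocked mc_allocated counter tolerates races. A C11 atomic stands in for the kernel's atomic_add_64_nv() here, which is an assumption of this sketch (the _model names are illustrative):

#include <stdatomic.h>
#include <stdint.h>

typedef struct mg_model {
	struct mg_model	*mg_next;	/* circular list of groups */
	uint64_t	mg_aliquot;	/* bytes to take before rotating */
} mg_model_t;

typedef struct mc_model {
	mg_model_t	*mc_rotor;	/* where the next search starts */
	_Atomic uint64_t mc_allocated;	/* bytes taken from current group */
} mc_model_t;

static void
mc_rotate_model(mc_model_t *mc, mg_model_t *mg, uint64_t asize)
{
	/* Once this group has absorbed its aliquot, advance the rotor. */
	if (atomic_fetch_add(&mc->mc_allocated, asize) + asize >=
	    mg->mg_aliquot) {
		mc->mc_rotor = mg->mg_next;
		atomic_store(&mc->mc_allocated, 0);
	}
}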
777top:
778 all_zero = B_TRUE;
779 do {
780 vd = mg->mg_vd;
781 /*
782 * Don't allocate from faulted devices.
783 */
784 if (!vdev_writeable(vd))
785 goto next;
786 /*
787 * Avoid writing single-copy data to a failing vdev
788 */
789 if ((vd->vdev_stat.vs_write_errors > 0 ||
790 vd->vdev_state < VDEV_STATE_HEALTHY) &&
791 d == 0 && dshift == 3) {
792 all_zero = B_FALSE;
793 goto next;
794 }
795
796 ASSERT(mg->mg_class == mc);
797
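The checks added above skip unwritable vdevs outright and, on the first pass only, keep the first copy of data off devices that look unhealthy. A condensed model (the _model names are illustrative; first_pass mirrors the dshift == 3 test, since later passes relax the policy instead of failing the allocation):

#include <stdbool.h>
#include <stdint.h>

typedef struct vdev_model {
	bool		vd_writeable;
	uint64_t	vd_write_errors;
	bool		vd_healthy;
} vdev_model_t;

/* d is the DVA index; d == 0 means this is the first (maybe only) copy. */
static bool
vdev_allocatable_model(const vdev_model_t *vd, int d, bool first_pass)
{
	if (!vd->vd_writeable)
		return (false);
	if ((vd->vd_write_errors > 0 || !vd->vd_healthy) &&
	    d == 0 && first_pass)
		return (false);
	return (true);
}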
798 distance = vd->vdev_asize >> dshift;
799 if (distance <= (1ULL << vd->vdev_ms_shift))
800 distance = 0;
801 else
802 all_zero = B_FALSE;
803
804 asize = vdev_psize_to_asize(vd, psize);
805 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

--- 31 unchanged lines hidden ---

837 if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
838 mg->mg_aliquot + mg->mg_bias) {
839 mc->mc_rotor = mg->mg_next;
840 mc->mc_allocated = 0;
841 }
842
843 DVA_SET_VDEV(&dva[d], vd->vdev_id);
844 DVA_SET_OFFSET(&dva[d], offset);
821 DVA_SET_GANG(&dva[d], 0);
845 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
846 DVA_SET_ASIZE(&dva[d], asize);
847
848 return (0);
849 }
850next:
851 mc->mc_rotor = mg->mg_next;
852 mc->mc_allocated = 0;
853 } while ((mg = mg->mg_next) != rotor);
854
855 if (!all_zero) {
856 dshift++;
857 ASSERT(dshift < 64);
858 goto top;

--- 40 unchanged lines hidden ---

899 if (now) {
900 space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
901 offset, size);
902 space_map_free(&msp->ms_map, offset, size);
903 } else {
904 if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
905 vdev_dirty(vd, VDD_METASLAB, msp, txg);
906 space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
882
883 /*
884 * verify that this region is actually allocated in
885 * either a ms_allocmap or the ms_map
886 */
887 if (msp->ms_map.sm_loaded) {
888 boolean_t allocd = B_FALSE;
889 int i;
890
891 if (!space_map_contains(&msp->ms_map, offset, size)) {
892 allocd = B_TRUE;
893 } else {
894 for (i = 0; i < TXG_CONCURRENT_STATES; i++) {
895 space_map_t *sm = &msp->ms_allocmap
896 [(txg - i) & TXG_MASK];
897 if (space_map_contains(sm,
898 offset, size)) {
899 allocd = B_TRUE;
900 break;
901 }
902 }
903 }
904
905 if (!allocd) {
906 zfs_panic_recover("freeing free segment "
907 "(vdev=%llu offset=%llx size=%llx)",
908 (longlong_t)vdev, (longlong_t)offset,
909 (longlong_t)size);
910 }
911 }
912
913
907 }
908
909 mutex_exit(&msp->ms_lock);
910}
911
912/*
913 * Intent log support: upon opening the pool after a crash, notify the SPA
914 * of blocks that the intent log has allocated for immediate write, but

--- 19 unchanged lines hidden ---

934 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
935
936 if (DVA_GET_GANG(dva))
937 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
938
939 mutex_enter(&msp->ms_lock);
940
941 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
949 if (error) {
942 if (error || txg == 0) { /* txg == 0 indicates dry run */
943 mutex_exit(&msp->ms_lock);
944 return (error);
945 }
946
954 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
955 vdev_dirty(vd, VDD_METASLAB, msp, txg);
956
947 space_map_claim(&msp->ms_map, offset, size);
958 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
959
948
949 if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */
950 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
951 vdev_dirty(vd, VDD_METASLAB, msp, txg);
952 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
953 }
954
955 mutex_exit(&msp->ms_lock);
956
957 return (0);
958}
959
960int
966metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
967 uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
961metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
962 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
963{
964 dva_t *dva = bp->blk_dva;
965 dva_t *hintdva = hintbp->blk_dva;
971 int d;
966 int error = 0;
967
968 ASSERT(bp->blk_birth == 0);
969
970 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
971
972 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
973 spa_config_exit(spa, SCL_ALLOC, FTAG);
974 return (ENOSPC);
975 }
976
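metaslab_alloc() now brackets its DVA loop with the SCL_ALLOC config lock and can bail out early when the class has no rotor, so every path out of the function must drop the lock. A reduced control-flow model with stubbed locking (all names hypothetical):

#include <errno.h>

static void lock_enter_model(void) { }	/* spa_config_enter(SCL_ALLOC) */
static void lock_exit_model(void) { }	/* spa_config_exit(SCL_ALLOC) */

static int
alloc_model(int have_rotor, int ndvas, int (*alloc_one)(int))
{
	int error = 0;

	lock_enter_model();
	if (!have_rotor) {
		lock_exit_model();	/* no vdevs in this class */
		return (ENOSPC);
	}
	for (int d = 0; d < ndvas; d++)
		if ((error = alloc_one(d)) != 0)
			break;
	lock_exit_model();		/* dropped on success and failure */
	return (error);
}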
977 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
978 ASSERT(BP_GET_NDVAS(bp) == 0);
979 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
980
978 for (d = 0; d < ndvas; d++) {
979 error = metaslab_alloc_dva(spa, psize, dva, d, hintdva,
980 txg, hintbp_avoid);
981 for (int d = 0; d < ndvas; d++) {
982 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
983 txg, flags);
984 if (error) {
985 for (d--; d >= 0; d--) {
986 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
987 bzero(&dva[d], sizeof (dva_t));
988 }
989 spa_config_exit(spa, SCL_ALLOC, FTAG);
990 return (error);
991 }
992 }
993 ASSERT(error == 0);
994 ASSERT(BP_GET_NDVAS(bp) == ndvas);
995
996 spa_config_exit(spa, SCL_ALLOC, FTAG);
997
998 bp->blk_birth = txg;
999
1000 return (0);
1001}
1002
1003void
1004metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1005{
1006 const dva_t *dva = bp->blk_dva;
1007 int ndvas = BP_GET_NDVAS(bp);
1000 int d;
1008
1009 ASSERT(!BP_IS_HOLE(bp));
1010 ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
1011
1004 for (d = 0; d < ndvas; d++)
1012 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1013
1014 for (int d = 0; d < ndvas; d++)
1015 metaslab_free_dva(spa, &dva[d], txg, now);
1016
1017 spa_config_exit(spa, SCL_FREE, FTAG);
1018}
1019
1020int
1021metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1022{
1023 const dva_t *dva = bp->blk_dva;
1024 int ndvas = BP_GET_NDVAS(bp);
1013 int d, error;
1014 int last_error = 0;
1025 int error = 0;
1026
1027 ASSERT(!BP_IS_HOLE(bp));
1028
1018 for (d = 0; d < ndvas; d++)
1029 if (txg != 0) {
1030 /*
1031 * First do a dry run to make sure all DVAs are claimable,
1032 * so we don't have to unwind from partial failures below.
1033 */
1034 if ((error = metaslab_claim(spa, bp, 0)) != 0)
1035 return (error);
1036 }
1037
1038 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1039
1040 for (int d = 0; d < ndvas; d++)
1041 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1020 last_error = error;
1042 break;
1043
1022 return (last_error);
1044 spa_config_exit(spa, SCL_ALLOC, FTAG);
1045
1046 ASSERT(error == 0 || txg == 0);
1047
1048 return (error);
1049}
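The rewritten metaslab_claim() re-enters itself with txg == 0 as a dry run, so every DVA is validated before any state changes and a partial failure never needs unwinding; the old last_error accumulation disappears. A standalone model of the pattern:

#include <stdint.h>

static int
claim_all_model(int ndvas, int (*claim_one)(int, uint64_t), uint64_t txg)
{
	int error = 0;

	/* txg == 0 means: check claimability only, change nothing. */
	if (txg != 0 &&
	    (error = claim_all_model(ndvas, claim_one, 0)) != 0)
		return (error);

	for (int d = 0; d < ndvas; d++)
		if ((error = claim_one(d, txg)) != 0)
			break;

	return (error);
}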