Deleted Added
full compact
vdev_raidz.c (254591) vdev_raidz.c (255750)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 8 unchanged lines hidden (view full) ---

17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 8 unchanged lines hidden (view full) ---

17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/spa.h>
29#include <sys/vdev_impl.h>
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/spa.h>
30#include <sys/vdev_impl.h>
31#ifdef illumos
32#include <sys/vdev_disk.h>
33#endif
34#include <sys/vdev_file.h>
35#include <sys/vdev_raidz.h>
30#include <sys/zio.h>
31#include <sys/zio_checksum.h>
32#include <sys/fs/zfs.h>
33#include <sys/fm/fs/zfs.h>
36#include <sys/zio.h>
37#include <sys/zio_checksum.h>
38#include <sys/fs/zfs.h>
39#include <sys/fm/fs/zfs.h>
40#include <sys/bio.h>
34
35/*
36 * Virtual device vector for RAID-Z.
37 *
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by

--- 107 unchanged lines hidden (view full) ---

149}
150
151#define VDEV_RAIDZ_64MUL_4(x, mask) \
152{ \
153 VDEV_RAIDZ_64MUL_2((x), mask); \
154 VDEV_RAIDZ_64MUL_2((x), mask); \
155}
156
41
42/*
43 * Virtual device vector for RAID-Z.
44 *
45 * This vdev supports single, double, and triple parity. For single parity,
46 * we use a simple XOR of all the data columns. For double or triple parity,
47 * we use a special case of Reed-Solomon coding. This extends the
48 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by

--- 107 unchanged lines hidden (view full) ---

156}
157
/*
 * Applies VDEV_RAIDZ_64MUL_2 to (x) twice in succession, handing the same
 * `mask` scratch argument to both invocations.  The expansion is a bare brace
 * block containing two statements, so it is only safe where a compound
 * statement is valid.
 * NOTE(review): presumably this multiplies x by 4 in the Galois field used
 * for parity -- confirm against the definition of VDEV_RAIDZ_64MUL_2.
 */
158#define VDEV_RAIDZ_64MUL_4(x, mask) \
159{ \
160 VDEV_RAIDZ_64MUL_2((x), mask); \
161 VDEV_RAIDZ_64MUL_2((x), mask); \
162}
163
/*
 * Convert an offset within a child vdev's range of offsets into the offset
 * handed to the device by adding VDEV_LABEL_START_SIZE (the child vdev has a
 * vdev label at the start of its range).  The argument is parenthesized so
 * that expressions containing operators of lower precedence than '+'
 * (e.g. ?:, &, |, shifts) expand as intended.
 */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
165
157/*
158 * Force reconstruction to use the general purpose method.
159 */
160int vdev_raidz_default_to_general;
161
162/* Powers of 2 in the Galois field defined above. */
163static const uint8_t vdev_raidz_pow2[256] = {
164 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,

--- 267 unchanged lines hidden (view full) ---

432 vdev_raidz_cksum_report
433};
434
435/*
436 * Divides the IO evenly across all child vdevs; usually, dcols is
437 * the number of children in the target vdev.
438 */
439static raidz_map_t *
166/*
167 * Force reconstruction to use the general purpose method.
168 */
169int vdev_raidz_default_to_general;
170
171/* Powers of 2 in the Galois field defined above. */
172static const uint8_t vdev_raidz_pow2[256] = {
173 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,

--- 267 unchanged lines hidden (view full) ---

441 vdev_raidz_cksum_report
442};
443
444/*
445 * Divides the IO evenly across all child vdevs; usually, dcols is
446 * the number of children in the target vdev.
447 */
448static raidz_map_t *
440vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
441 uint64_t nparity)
449vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
450 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
442{
443 raidz_map_t *rm;
444 /* The starting RAIDZ (parent) vdev sector of the block. */
451{
452 raidz_map_t *rm;
453 /* The starting RAIDZ (parent) vdev sector of the block. */
445 uint64_t b = zio->io_offset >> unit_shift;
454 uint64_t b = offset >> unit_shift;
446 /* The zio's size in units of the vdev's minimum sector size. */
455 /* The I/O size in units of the vdev's minimum sector size. */
447 uint64_t s = zio->io_size >> unit_shift;
456 uint64_t s = size >> unit_shift;
448 /* The first column for this stripe. */
449 uint64_t f = b % dcols;
450 /* The starting byte offset on each child vdev. */
451 uint64_t o = (b / dcols) << unit_shift;
452 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
453
454 /*
455 * "Quotient": The number of data sectors for this stripe on all but

--- 71 unchanged lines hidden (view full) ---

527 }
528
529 ASSERT3U(asize, ==, tot << unit_shift);
530 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
531 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
532 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
533 ASSERT3U(rm->rm_nskip, <=, nparity);
534
457 /* The first column for this stripe. */
458 uint64_t f = b % dcols;
459 /* The starting byte offset on each child vdev. */
460 uint64_t o = (b / dcols) << unit_shift;
461 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
462
463 /*
464 * "Quotient": The number of data sectors for this stripe on all but

--- 71 unchanged lines hidden (view full) ---

536 }
537
538 ASSERT3U(asize, ==, tot << unit_shift);
539 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
540 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
541 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
542 ASSERT3U(rm->rm_nskip, <=, nparity);
543
535 if (zio->io_type != ZIO_TYPE_FREE) {
544 if (!dofree) {
536 for (c = 0; c < rm->rm_firstdatacol; c++) {
537 rm->rm_col[c].rc_data =
538 zio_buf_alloc(rm->rm_col[c].rc_size);
539 }
540
545 for (c = 0; c < rm->rm_firstdatacol; c++) {
546 rm->rm_col[c].rc_data =
547 zio_buf_alloc(rm->rm_col[c].rc_size);
548 }
549
541 rm->rm_col[c].rc_data = zio->io_data;
550 rm->rm_col[c].rc_data = data;
542
543 for (c = c + 1; c < acols; c++) {
544 rm->rm_col[c].rc_data =
545 (char *)rm->rm_col[c - 1].rc_data +
546 rm->rm_col[c - 1].rc_size;
547 }
548 }
549

--- 15 unchanged lines hidden (view full) ---

565 * If we intend to skip a sector in the zeroth column for padding
566 * we must make sure to note this swap. We will never intend to
567 * skip the first column since at least one data and one parity
568 * column must appear in each row.
569 */
570 ASSERT(rm->rm_cols >= 2);
571 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
572
551
552 for (c = c + 1; c < acols; c++) {
553 rm->rm_col[c].rc_data =
554 (char *)rm->rm_col[c - 1].rc_data +
555 rm->rm_col[c - 1].rc_size;
556 }
557 }
558

--- 15 unchanged lines hidden (view full) ---

574 * If we intend to skip a sector in the zeroth column for padding
575 * we must make sure to note this swap. We will never intend to
576 * skip the first column since at least one data and one parity
577 * column must appear in each row.
578 */
579 ASSERT(rm->rm_cols >= 2);
580 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
581
573 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
582 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
574 devidx = rm->rm_col[0].rc_devidx;
575 o = rm->rm_col[0].rc_offset;
576 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
577 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
578 rm->rm_col[1].rc_devidx = devidx;
579 rm->rm_col[1].rc_offset = o;
580
581 if (rm->rm_skipstart == 0)
582 rm->rm_skipstart = 1;
583 }
584
583 devidx = rm->rm_col[0].rc_devidx;
584 o = rm->rm_col[0].rc_offset;
585 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
586 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
587 rm->rm_col[1].rc_devidx = devidx;
588 rm->rm_col[1].rc_offset = o;
589
590 if (rm->rm_skipstart == 0)
591 rm->rm_skipstart = 1;
592 }
593
585 zio->io_vsd = rm;
586 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
587 return (rm);
588}
589
590static void
591vdev_raidz_generate_parity_p(raidz_map_t *rm)
592{
593 uint64_t *p, *src, pcount, ccount, i;
594 int c;

--- 393 unchanged lines hidden (view full) ---

988 * | 0 0 0 1 0 0 0 0 |
989 * | 0 0 0 0 1 0 0 0 |
990 * | 0 0 0 0 0 1 0 0 |
991 * | 0 0 0 0 0 0 1 0 |
992 * | 0 0 0 0 0 0 0 1 |
993 * ~~ ~~
994 * __ __
995 * | 1 1 1 1 1 1 1 1 |
594 return (rm);
595}
596
597static void
598vdev_raidz_generate_parity_p(raidz_map_t *rm)
599{
600 uint64_t *p, *src, pcount, ccount, i;
601 int c;

--- 393 unchanged lines hidden (view full) ---

995 * | 0 0 0 1 0 0 0 0 |
996 * | 0 0 0 0 1 0 0 0 |
997 * | 0 0 0 0 0 1 0 0 |
998 * | 0 0 0 0 0 0 1 0 |
999 * | 0 0 0 0 0 0 0 1 |
1000 * ~~ ~~
1001 * __ __
1002 * | 1 1 1 1 1 1 1 1 |
996 * | 128 64 32 16 8 4 2 1 |
997 * | 19 205 116 29 64 16 4 1 |
998 * | 1 0 0 0 0 0 0 0 |
1003 * | 19 205 116 29 64 16 4 1 |
1004 * | 1 0 0 0 0 0 0 0 |
999 * | 0 1 0 0 0 0 0 0 |
1000 * (V|I)' = | 0 0 1 0 0 0 0 0 |
1001 * | 0 0 0 1 0 0 0 0 |
1005 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1002 * | 0 0 0 0 1 0 0 0 |
1003 * | 0 0 0 0 0 1 0 0 |
1004 * | 0 0 0 0 0 0 1 0 |
1005 * | 0 0 0 0 0 0 0 1 |
1006 * ~~ ~~
1007 *
1008 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1009 * have carefully chosen the seed values 1, 2, and 4 to ensure that this

--- 517 unchanged lines hidden (view full) ---

1527vdev_raidz_close(vdev_t *vd)
1528{
1529 int c;
1530
1531 for (c = 0; c < vd->vdev_children; c++)
1532 vdev_close(vd->vdev_child[c]);
1533}
1534
1006 * | 0 0 0 0 1 0 0 0 |
1007 * | 0 0 0 0 0 1 0 0 |
1008 * | 0 0 0 0 0 0 1 0 |
1009 * | 0 0 0 0 0 0 0 1 |
1010 * ~~ ~~
1011 *
1012 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1013 * have carefully chosen the seed values 1, 2, and 4 to ensure that this

--- 517 unchanged lines hidden (view full) ---

1531vdev_raidz_close(vdev_t *vd)
1532{
1533 int c;
1534
1535 for (c = 0; c < vd->vdev_children; c++)
1536 vdev_close(vd->vdev_child[c]);
1537}
1538
1539#ifdef illumos
1540/*
1541 * Handle a read or write I/O to a RAID-Z dump device.
1542 *
1543 * The dump device is in a unique situation compared to other ZFS datasets:
1544 * writing to this device should be as simple and fast as possible. In
1545 * addition, durability matters much less since the dump will be extracted
1546 * once the machine reboots. For that reason, this function eschews parity for
1547 * performance and simplicity. The dump device uses the checksum setting
1548 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1549 * dataset.
1550 *
1551 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1552 * 128 KB will not fill an entire block; in addition, they may not be properly
1553 * aligned. In that case, this function uses the preallocated 128 KB block and
1554 * omits reading or writing any "empty" portions of that block, as opposed to
1555 * allocating a fresh appropriately-sized block.
1556 *
1557 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1558 *
1559 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1560 *
1561 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1562 * allocated which spans all five child vdevs. 8 KB of data would be written to
1563 * each of four vdevs, with the fifth containing the parity bits.
1564 *
1565 * parity data data data data
1566 * | PP | XX | XX | XX | XX |
1567 * ^ ^ ^ ^ ^
1568 * | | | | |
1569 * 8 KB parity ------8 KB data blocks------
1570 *
1571 * However, when writing to the dump device, the behavior is different:
1572 *
1573 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1574 *
1575 * Unlike the normal RAID-Z case in which the block is allocated based on the
1576 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1577 * I/O size is less than 128 KB, only the actual portions of data are written.
1578 * In this example the data is written to the third data vdev since that vdev
1579 * contains the offset [64 KB, 96 KB).
1580 *
1581 * parity data data data data
1582 * | | | | XX | |
1583 * ^
1584 * |
1585 * 32 KB data block
1586 *
1587 * As a result, an individual I/O may not span all child vdevs; moreover, a
1588 * small I/O may only operate on a single child vdev.
1589 *
1590 * Note that since there are no parity bits calculated or written, this format
1591 * remains the same no matter how many parity bits are used in a normal RAID-Z
1592 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1593 * would look like:
1594 *
1595 * parity parity parity data data data data
1596 * | | | | | | XX | |
1597 * ^
1598 * |
1599 * 32 KB data block
1600 */
1601int
1602vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1603 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1604{
1605 vdev_t *tvd = vd->vdev_top;
1606 vdev_t *cvd;
1607 raidz_map_t *rm;
1608 raidz_col_t *rc;
1609 int c, err = 0;
1610
1611 uint64_t start, end, colstart, colend;
1612 uint64_t coloffset, colsize, colskip;
1613
1614 int flags = doread ? BIO_READ : BIO_WRITE;
1615
1616#ifdef _KERNEL
1617
1618 /*
1619 * Don't write past the end of the block
1620 */
1621 VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
1622
1623 start = offset;
1624 end = start + size;
1625
1626 /*
1627 * Allocate a RAID-Z map for this block. Note that this block starts
1628 * from the "original" offset, this is, the offset of the extent which
1629 * contains the requisite offset of the data being read or written.
1630 *
1631 * Even if this I/O operation doesn't span the full block size, let's
1632 * treat the on-disk format as if the only blocks are the complete 128
1633 * KB size.
1634 */
1635 rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1636 SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
1637 vd->vdev_nparity);
1638
1639 coloffset = origoffset;
1640
1641 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1642 c++, coloffset += rc->rc_size) {
1643 rc = &rm->rm_col[c];
1644 cvd = vd->vdev_child[rc->rc_devidx];
1645
1646 /*
1647 * Find the start and end of this column in the RAID-Z map,
1648 * keeping in mind that the stated size and offset of the
1649 * operation may not fill the entire column for this vdev.
1650 *
1651 * If any portion of the data spans this column, issue the
1652 * appropriate operation to the vdev.
1653 */
1654 if (coloffset + rc->rc_size <= start)
1655 continue;
1656 if (coloffset >= end)
1657 continue;
1658
1659 colstart = MAX(coloffset, start);
1660 colend = MIN(end, coloffset + rc->rc_size);
1661 colsize = colend - colstart;
1662 colskip = colstart - coloffset;
1663
1664 VERIFY3U(colsize, <=, rc->rc_size);
1665 VERIFY3U(colskip, <=, rc->rc_size);
1666
1667 /*
1668 * Note that the child vdev will have a vdev label at the start
1669 * of its range of offsets, hence the need for
1670 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1671 * example of why this calculation is needed.
1672 */
1673 if ((err = vdev_disk_physio(cvd,
1674 ((char *)rc->rc_data) + colskip, colsize,
1675 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1676 flags, isdump)) != 0)
1677 break;
1678 }
1679
1680 vdev_raidz_map_free(rm);
1681#endif /* KERNEL */
1682
1683 return (err);
1684}
1685#endif
1686
1535static uint64_t
1536vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1537{
1538 uint64_t asize;
1539 uint64_t ashift = vd->vdev_top->vdev_ashift;
1540 uint64_t cols = vd->vdev_children;
1541 uint64_t nparity = vd->vdev_nparity;
1542

--- 36 unchanged lines hidden (view full) ---

1579{
1580 vdev_t *vd = zio->io_vd;
1581 vdev_t *tvd = vd->vdev_top;
1582 vdev_t *cvd;
1583 raidz_map_t *rm;
1584 raidz_col_t *rc;
1585 int c, i;
1586
1687static uint64_t
1688vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1689{
1690 uint64_t asize;
1691 uint64_t ashift = vd->vdev_top->vdev_ashift;
1692 uint64_t cols = vd->vdev_children;
1693 uint64_t nparity = vd->vdev_nparity;
1694

--- 36 unchanged lines hidden (view full) ---

1731{
1732 vdev_t *vd = zio->io_vd;
1733 vdev_t *tvd = vd->vdev_top;
1734 vdev_t *cvd;
1735 raidz_map_t *rm;
1736 raidz_col_t *rc;
1737 int c, i;
1738
1587 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1739 rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1740 zio->io_type == ZIO_TYPE_FREE,
1741 tvd->vdev_ashift, vd->vdev_children,
1588 vd->vdev_nparity);
1589
1742 vd->vdev_nparity);
1743
1744 zio->io_vsd = rm;
1745 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1746
1590 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1591
1592 if (zio->io_type == ZIO_TYPE_FREE) {
1593 for (c = 0; c < rm->rm_cols; c++) {
1594 rc = &rm->rm_col[c];
1595 cvd = vd->vdev_child[rc->rc_devidx];
1596 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1597 rc->rc_offset, rc->rc_data, rc->rc_size,

--- 126 unchanged lines hidden (view full) ---

1724 */
1725static int
1726raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1727{
1728 void *orig[VDEV_RAIDZ_MAXPARITY];
1729 int c, ret = 0;
1730 raidz_col_t *rc;
1731
1747 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1748
1749 if (zio->io_type == ZIO_TYPE_FREE) {
1750 for (c = 0; c < rm->rm_cols; c++) {
1751 rc = &rm->rm_col[c];
1752 cvd = vd->vdev_child[rc->rc_devidx];
1753 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1754 rc->rc_offset, rc->rc_data, rc->rc_size,

--- 126 unchanged lines hidden (view full) ---

1881 */
1882static int
1883raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1884{
1885 void *orig[VDEV_RAIDZ_MAXPARITY];
1886 int c, ret = 0;
1887 raidz_col_t *rc;
1888
1889 blkptr_t *bp = zio->io_bp;
1890 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1891 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1892
1893 if (checksum == ZIO_CHECKSUM_NOPARITY)
1894 return (ret);
1895
1732 for (c = 0; c < rm->rm_firstdatacol; c++) {
1733 rc = &rm->rm_col[c];
1734 if (!rc->rc_tried || rc->rc_error != 0)
1735 continue;
1736 orig[c] = zio_buf_alloc(rc->rc_size);
1737 bcopy(rc->rc_data, orig[c], rc->rc_size);
1738 }
1739

--- 500 unchanged lines hidden ---
1896 for (c = 0; c < rm->rm_firstdatacol; c++) {
1897 rc = &rm->rm_col[c];
1898 if (!rc->rc_tried || rc->rc_error != 0)
1899 continue;
1900 orig[c] = zio_buf_alloc(rc->rc_size);
1901 bcopy(rc->rc_data, orig[c], rc->rc_size);
1902 }
1903

--- 500 unchanged lines hidden ---