Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vdev_disk: try harder to ensure IO alignment rules #16687

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 91 additions & 63 deletions module/os/linux/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -801,24 +801,13 @@ vbio_completion(struct bio *bio)
bio_put(bio);

/*
* If we copied the ABD before issuing it, clean up and return the copy
* to the ADB, with changes if appropriate.
* We're likely in an interrupt context so we can't do ABD/memory work
* here; instead we stash vbio on the zio and take care of it in the
* done callback.
*/
if (vbio->vbio_abd != NULL) {
void *buf = abd_to_buf(vbio->vbio_abd);
abd_free(vbio->vbio_abd);
vbio->vbio_abd = NULL;

if (zio->io_type == ZIO_TYPE_READ)
abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
else
abd_return_buf(zio->io_abd, buf, zio->io_size);
}

/* Final cleanup */
kmem_free(vbio, sizeof (vbio_t));
ASSERT3P(zio->io_bio, ==, NULL);
zio->io_bio = vbio;

/* All done, submit for processing */
zio_delay_interrupt(zio);
}

Expand All @@ -834,38 +823,61 @@ vbio_completion(struct bio *bio)
* NOTE: if you change this function, change the copy in
* tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
* data there to validate the change you're making.
*
*/
typedef struct {
uint_t bmask;
uint_t npages;
uint_t end;
} vdev_disk_check_pages_t;
size_t blocksize;
int seen_first;
int seen_last;
} vdev_disk_check_alignment_t;

static int
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len,
void *priv)
{
(void) page;
vdev_disk_check_pages_t *s = priv;
vdev_disk_check_alignment_t *s = priv;

/*
* If we didn't finish on a block size boundary last time, then there
* would be a gap if we tried to use this ABD as-is, so abort.
* The cardinal rule: a single on-disk block must never cross an
* physical (order-0) page boundary, as the kernel expects to be able
* to split at both LBS and page boundaries.
*
* This implies various alignment rules for the blocks in this
* (possibly compound) page, which we can check for.
*/
if (s->end != 0)
return (1);

/*
* Note if we're taking less than a full block, so we can check it
* above on the next call.
* If the previous page did not end on a page boundary, then we
* can't proceed without creating a hole.
*/
s->end = (off+len) & s->bmask;
if (s->seen_last)
return (1);

/* All blocks after the first must start on a block size boundary. */
if (s->npages != 0 && (off & s->bmask) != 0)
/* This page must contain only whole LBS-sized blocks. */
if (!IS_P2ALIGNED(len, s->blocksize))
return (1);

s->npages++;
/*
* If this is not the first page in the ABD, then the data must start
* on a page-aligned boundary (so the kernel can split on page
* boundaries without having to deal with a hole). If it is, then
* it can start on LBS-alignment.
*/
if (s->seen_first) {
if (!IS_P2ALIGNED(off, PAGESIZE))
return (1);
} else {
if (!IS_P2ALIGNED(off, s->blocksize))
return (1);
s->seen_first = 1;
}

/*
* If this data does not end on a page-aligned boundary, then this
* must be the last page in the ABD, for the same reason.
*/
s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE);

return (0);
}

Expand All @@ -874,15 +886,14 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
* the number of pages, or 0 if it can't be submitted like this.
*/
static boolean_t
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev)
{
vdev_disk_check_pages_t s = {
.bmask = bdev_logical_block_size(bdev)-1,
.npages = 0,
.end = 0,
vdev_disk_check_alignment_t s = {
.blocksize = bdev_logical_block_size(bdev),
};

if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
if (abd_iterate_page_func(abd, 0, size,
vdev_disk_check_alignment_cb, &s))
return (B_FALSE);

return (B_TRUE);
Expand Down Expand Up @@ -916,37 +927,32 @@ vdev_disk_io_rw(zio_t *zio)

/*
* Check alignment of the incoming ABD. If any part of it would require
* submitting a page that is not aligned to the logical block size,
* then we take a copy into a linear buffer and submit that instead.
* This should be impossible on a 512b LBS, and fairly rare on 4K,
* usually requiring abnormally-small data blocks (eg gang blocks)
* mixed into the same ABD as larger ones (eg aggregated).
* submitting a page that is not aligned to both the logical block size
* and the page size, then we take a copy into a new memory region with
* correct alignment. This should be impossible on a 512b LBS. On
* larger blocks, this can happen at least when a small number of
* blocks (usually 1) are allocated from a shared slab, or when
* abnormally-small data regions (eg gang headers) are mixed into the
* same ABD as larger allocations (eg aggregations).
*/
abd_t *abd = zio->io_abd;
if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
void *buf;
if (zio->io_type == ZIO_TYPE_READ)
buf = abd_borrow_buf(zio->io_abd, zio->io_size);
else
buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) {
/* Allocate a new memory region with guaranteed alignment */
abd = abd_alloc_for_io(zio->io_size,
zio->io_abd->abd_flags & ABD_FLAG_META);

/*
* Wrap the copy in an abd_t, so we can use the same iterators
* to count and fill the vbio later.
*/
abd = abd_get_from_buf(buf, zio->io_size);
/* If we're writing copy our data into it */
if (zio->io_type == ZIO_TYPE_WRITE)
abd_copy(abd, zio->io_abd, zio->io_size);

/*
* False here would mean the borrowed copy has an invalid
* alignment too, which would mean we've somehow been passed a
* linear ABD with an interior page that has a non-zero offset
* or a size not a multiple of PAGE_SIZE. This is not possible.
* It would mean either zio_buf_alloc() or its underlying
* allocators have done something extremely strange, or our
* math in vdev_disk_check_pages() is wrong. In either case,
* False here would mean the new allocation has an invalid
* alignment too, which would mean that abd_alloc() is not
* guaranteeing this, or our logic in
* vdev_disk_check_alignment() is wrong. In either case,
* something in seriously wrong and its not safe to continue.
*/
VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev));
}

/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
Expand Down Expand Up @@ -1437,6 +1443,28 @@ vdev_disk_io_start(zio_t *zio)
static void
vdev_disk_io_done(zio_t *zio)
{
/* If this was a read or write, we need to clean up the vbio */
if (zio->io_bio != NULL) {
vbio_t *vbio = zio->io_bio;
zio->io_bio = NULL;

/*
* If we copied the ABD before issuing it, clean up and return
* the copy to the ADB, with changes if appropriate.
*/
if (vbio->vbio_abd != NULL) {
if (zio->io_type == ZIO_TYPE_READ)
abd_copy(zio->io_abd, vbio->vbio_abd,
zio->io_size);

abd_free(vbio->vbio_abd);
vbio->vbio_abd = NULL;
}

/* Final cleanup */
kmem_free(vbio, sizeof (vbio_t));
}

/*
* If the device returned EIO, we revalidate the media. If it is
* determined the media has changed this triggers the asynchronous
Expand Down
14 changes: 0 additions & 14 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -187,20 +187,6 @@ zio_init(void)
continue;
#endif

#if defined(__linux__) && defined(_KERNEL)
/*
* Workaround issue of Linux vdev_disk.c, in some cases not
* linearizing buffers with disk sector crossing a page
* boundary. It is fine for hardware, but somehow required by
* LUKS. It is not typical for ZFS to produce such buffers, but
* it may happen if 6KB block is compressed to 4KB, while still
* having 2KB alignment. Banning the 6KB buffers helps vdevs
* with ashifh=12.
*/
if (size > PAGESIZE && !IS_P2ALIGNED(size, PAGESIZE))
continue;
#endif

if (IS_P2ALIGNED(size, PAGESIZE))
align = PAGESIZE;
else
Expand Down
Loading