Add vdev property to bypass vdev queue #16591

Open · wants to merge 1 commit into master
1 change: 1 addition & 0 deletions include/sys/vdev_impl.h
@@ -151,6 +151,7 @@ struct vdev_queue {
uint32_t vq_ia_active; /* Active interactive I/Os. */
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
list_t vq_active_list; /* List of active I/Os. */
kmutex_t vq_active_list_lock; /* Protects vq_active_list. */
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
zio_t vq_io_search; /* used as local for stack reduction */
19 changes: 19 additions & 0 deletions include/sys/zio.h
@@ -249,6 +249,24 @@ typedef uint64_t zio_flag_t;
#define ZIO_CHILD_BIT(x) (1U << (x))
#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1U << (x)))

Contributor:

Might want to add a comment here so it's understood why these flags are called out:

/*
 * ZIOs that are ZIO_FLAG_IMPORTANT are always queued so that they never get
 * starved out.  This allows us to bypass the queue for "normal" reads and
 * writes when the queues are low for better IOPS.  If the queues get too high
 * then we go back to queuing the "normal" reads/writes so as not to starve
 * out more important IOs like scrub/resilver/retry.  See
 * zfs_vdev_queue_bypass_pct for details.
 */

Contributor Author:

Added.


/*
* ZIOs that are ZIO_FLAG_IMPORTANT are always queued so that they never get
* starved out. This allows us to bypass the queue for "normal" reads and
* writes when the queues are low for better IOPS. If the queues get too high
* then we go back to queuing the "normal" reads/writes so as not to starve
* out more important IOs like scrub/resilver/retry. See
* zfs_vdev_queue_bypass_pct for details.
*/

#define ZIO_FLAG_IMPORTANT \
ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL | \
ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB | \
ZIO_FLAG_IO_RETRY | ZIO_FLAG_NODATA

#define ZIO_IS_NORMAL(zio) \
!((zio)->io_flags & (ZIO_FLAG_IMPORTANT))

enum zio_child {
ZIO_CHILD_VDEV = 0,
ZIO_CHILD_GANG,
@@ -449,6 +467,7 @@ enum zio_qstate {
ZIO_QS_NONE = 0,
ZIO_QS_QUEUED,
ZIO_QS_ACTIVE,
ZIO_QS_BYPASS,
};

struct zio {
7 changes: 7 additions & 0 deletions man/man4/zfs.4
@@ -1528,6 +1528,13 @@ Default queue depth for each vdev IO allocator.
Higher values allow for better coalescing of sequential writes before sending
them to the disk, but can increase transaction commit times.
.
.It Sy zfs_vdev_queue_bypass_pct Ns = Ns Sy 10 Pq uint
Allow bypassing the vdev's queue if the vdev queue is less than
zfs_vdev_queue_bypass_pct percent full.
This only applies to SSDs (non-rotational drives).
Only "normal" (read/write) zios can bypass the queue.
You can use 0 to always queue IOs and 100 to never queue IOs.
.
.It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint
Defines if the driver should retire on a given error type.
The following options may be bitwise-ored together:
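As a rough illustration of the zfs_vdev_queue_bypass_pct entry above: a minimal userspace sketch of the threshold arithmetic, assuming the formula used in the patch (outstanding I/Os < zfs_vdev_max_active * zfs_vdev_queue_bypass_pct / 100). The tunable names and defaults come from this diff; would_bypass() and outstanding_ios are hypothetical names used only for this example.

/*
 * Illustrative only: computes the same bypass threshold the patch applies
 * in vdev_queue_io().  Not part of the patch.
 */
#include <stdio.h>
#include <stdbool.h>

static unsigned int zfs_vdev_max_active = 1000;		/* default tunable value */
static unsigned int zfs_vdev_queue_bypass_pct = 10;	/* default tunable value */

/* Hypothetical helper: bypass while outstanding I/Os are below the threshold. */
static bool
would_bypass(unsigned int outstanding_ios)
{
	unsigned int threshold =
	    (zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100;
	return (outstanding_ios < threshold);
}

int
main(void)
{
	/* With the defaults the threshold is 1000 * 10 / 100 = 100 outstanding I/Os. */
	printf("outstanding=50  bypass=%d\n", would_bypass(50));	/* prints 1 */
	printf("outstanding=150 bypass=%d\n", would_bypass(150));	/* prints 0 */

	/* With a SATA-like max_active of 32 the threshold drops to 3. */
	zfs_vdev_max_active = 32;
	printf("outstanding=2   bypass=%d\n", would_bypass(2));	/* prints 1 */
	printf("outstanding=5   bypass=%d\n", would_bypass(5));	/* prints 0 */
	return (0);
}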
2 changes: 2 additions & 0 deletions module/zfs/vdev.c
@@ -5634,7 +5634,9 @@ vdev_deadman(vdev_t *vd, const char *tag)
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime invoke the deadman logic.
*/
mutex_enter(&vq->vq_active_list_lock);
fio = list_head(&vq->vq_active_list);
mutex_exit(&vq->vq_active_list_lock);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);
47 changes: 47 additions & 0 deletions module/zfs/vdev_queue.c
@@ -228,6 +228,12 @@ uint_t zfs_vdev_queue_depth_pct = 300;
*/
uint_t zfs_vdev_def_queue_depth = 32;

/*
* Allow io to bypass the queue depending on how full the queue is.
* 0 = never bypass, 100 = always bypass.
*/
uint_t zfs_vdev_queue_bypass_pct = 10;

static int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
@@ -502,6 +508,7 @@ vdev_queue_init(vdev_t *vd)
list_create(&vq->vq_active_list, sizeof (struct zio),
offsetof(struct zio, io_queue_node.l));
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vq->vq_active_list_lock, NULL, MUTEX_DEFAULT, NULL);
}

void
@@ -520,6 +527,7 @@ vdev_queue_fini(vdev_t *vd)

list_destroy(&vq->vq_active_list);
mutex_destroy(&vq->vq_lock);
mutex_destroy(&vq->vq_active_list_lock);
}

static void
@@ -572,7 +580,9 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit--;
}
zio->io_queue_state = ZIO_QS_ACTIVE;
mutex_enter(&vq->vq_active_list_lock);
list_insert_tail(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
}

static void
@@ -589,7 +599,9 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit = zfs_vdev_nia_credit;
} else if (vq->vq_ia_active == 0)
vq->vq_nia_credit++;
mutex_enter(&vq->vq_active_list_lock);
list_remove(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
zio->io_queue_state = ZIO_QS_NONE;
}

@@ -946,6 +958,30 @@ vdev_queue_io(zio_t *zio)
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();

/*
* Bypass queue if certain conditions are met. Queue bypassing requires
* a non-rotational device. Reads / writes will attempt to bypass the queue,
* depending on how full the queue is. Other operations will always
* queue. Bypassing the queue can lead to a 2x IOPS speed-up on some
* benchmarks. If the queue is too full (due to a scrub or resilver)
* then go back to queuing normal reads/writes so as not to starve out
* the more important IOs.
*/
if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {

int bypass = vdev_queue_length(vq->vq_vdev) <
(zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100
Member:

zfs_vdev_max_active was supposed to be equal to the number of commands the drive can handle. Its current limit of 1000 effectively disables the functionality. Taking 10% of it makes no sense. At the same time, if you reduce it to 32, which is reasonable for SATA, then 10% of it becomes 3 requests, which you reach immediately in most cases.

At the same time, I don't see vq_active being incremented for bypass commands, which makes me wonder how you expect the threshold to be reached until we magically get something that we won't bypass.

Also, to avoid lock contention on one per-vdev lock you introduce another per-vdev lock, adding even more atomic operations, which makes me wonder whether this is an optimization at all.

Contributor:

@amotin you're right that we will need separate counters for both the number of normal queued IOs and the total number of active IOs (which include bypass IOs). I think this will work:

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 3be570afb..10e6a1692 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -147,7 +147,19 @@ struct vdev_queue {
        zio_priority_t  vq_last_prio;   /* Last sent I/O priority. */
        uint32_t        vq_cqueued;     /* Classes with queued I/Os. */
        uint32_t        vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
+
+       /*
+        * Number of active I/Os.  This includes I/Os that were previously
+        * queued and are now active, plus all the 'bypass' I/Os that bypassed
+        * the queue.
+        */
        uint32_t        vq_active;      /* Number of active I/Os. */
+
+       /*
+        * Number of active I/Os that were previously queued.  This is a subset
+        * of vq_active.
+        */
+       uint32_t        vq_queued_active;
        uint32_t        vq_ia_active;   /* Active interactive I/Os. */
        uint32_t        vq_nia_credit;  /* Non-interactive I/Os credit. */
        list_t          vq_active_list; /* List of active I/Os. */
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 91093233e..e9c0fc0cf 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -572,7 +572,6 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
        ASSERT(MUTEX_HELD(&vq->vq_lock));
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
        vq->vq_cactive[zio->io_priority]++;
-       vq->vq_active++;
        if (vdev_queue_is_interactive(zio->io_priority)) {
                if (++vq->vq_ia_active == 1)
                        vq->vq_nia_credit = 1;
@@ -581,6 +580,8 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
        }
        zio->io_queue_state = ZIO_QS_ACTIVE;
        mutex_enter(&vq->vq_active_list_lock);
+       vq->vq_active++;
+       vq->vq_queued_active++;
        list_insert_tail(&vq->vq_active_list, zio);
        mutex_exit(&vq->vq_active_list_lock);
 }
@@ -591,7 +592,6 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
        ASSERT(MUTEX_HELD(&vq->vq_lock));
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
        vq->vq_cactive[zio->io_priority]--;
-       vq->vq_active--;
        if (vdev_queue_is_interactive(zio->io_priority)) {
                if (--vq->vq_ia_active == 0)
                        vq->vq_nia_credit = 0;
@@ -600,6 +600,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
        } else if (vq->vq_ia_active == 0)
                vq->vq_nia_credit++;
        mutex_enter(&vq->vq_active_list_lock);
+       vq->vq_active--;
+       vq->vq_queued_active--;
        list_remove(&vq->vq_active_list, zio);
        mutex_exit(&vq->vq_active_list_lock);
        zio->io_queue_state = ZIO_QS_NONE;
@@ -969,13 +971,14 @@ vdev_queue_io(zio_t *zio)
         */
        if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {
 
-               int bypass = vdev_queue_length(vq->vq_vdev) <
+               boolean_t is_bypass = vq->vq_queued_active <
                    (zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100
                    ? 1 : 0;
 
-               if (bypass) {
+               if (is_bypass) {
                        zio->io_queue_state = ZIO_QS_BYPASS;
                        mutex_enter(&vq->vq_active_list_lock);
+                       vq->vq_active++;
                        list_insert_tail(&vq->vq_active_list, zio);
                        mutex_exit(&vq->vq_active_list_lock);
                        return (zio);
@@ -1016,6 +1019,7 @@ vdev_queue_io_done(zio_t *zio)
 
        if (zio->io_queue_state == ZIO_QS_BYPASS) {
                mutex_enter(&vq->vq_active_list_lock);
+               vq->vq_active--;
                list_remove(&vq->vq_active_list, zio);
                mutex_exit(&vq->vq_active_list_lock);
                zio->io_queue_state = ZIO_QS_NONE;

The Direct IO IOPS speedup is still 2x even with the introduction of the new vq_active_list_lock. That lock is only held during the list insertion/removal, so it's very low overhead.

Regarding the queue size - keep in mind that the bypass code is only active on non-rotational disks, and the 10% number is only counting traditional queued IOs and not bypass IOs.

Member:

I haven't looked closely, but as a quick thought: if we obtain the lock and do the accounting anyway, can we instead keep the code as-is, but, once we match some criteria, bypass only the priority handling and aggregation code, which as I understand takes most of the time? I wonder if that could be cleaner but no less efficient.

? 1 : 0;

if (bypass) {
zio->io_queue_state = ZIO_QS_BYPASS;
mutex_enter(&vq->vq_active_list_lock);
list_insert_tail(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
return (zio);
}
}

mutex_enter(&vq->vq_lock);
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq);
@@ -978,6 +1014,14 @@ vdev_queue_io_done(zio_t *zio)
vq->vq_io_complete_ts = now;
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;

if (zio->io_queue_state == ZIO_QS_BYPASS) {
mutex_enter(&vq->vq_active_list_lock);
list_remove(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
zio->io_queue_state = ZIO_QS_NONE;
return;
}

mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);

@@ -1163,3 +1207,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
"Default queue depth for each allocator");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_bypass_pct, UINT, ZMOD_RW,
"Queue bypass percentage per vdev");
Comment on lines +1211 to +1212
Contributor:

You'll want to add an entry for zfs_vdev_queue_bypass_pct to man/man4/zfs.4, like:

     zfs_vdev_queue_bypass_pct=10 (uint)
             Allow bypassing the vdev's queue if the vdev queue is less than
             zfs_vdev_queue_bypass_pct percent full.  This only applies to
             SSDs (non-rotational drives).  You can use 0 to always queue IOs
             and 100 to never queue IOs.

Contributor Author:

Added.
