Add vdev property to bypass vdev queue #16591
base: master
module/zfs/vdev_queue.c

```diff
@@ -228,6 +228,12 @@ uint_t zfs_vdev_queue_depth_pct = 300;
  */
 uint_t zfs_vdev_def_queue_depth = 32;
 
+/*
+ * Allow io to bypass the queue depending on how full the queue is.
+ * 0 = never bypass, 100 = always bypass.
+ */
+uint_t zfs_vdev_queue_bypass_pct = 10;
+
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
```
```diff
@@ -502,6 +508,7 @@ vdev_queue_init(vdev_t *vd)
         list_create(&vq->vq_active_list, sizeof (struct zio),
             offsetof(struct zio, io_queue_node.l));
         mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+        mutex_init(&vq->vq_active_list_lock, NULL, MUTEX_DEFAULT, NULL);
 }
 
 void
```
```diff
@@ -520,6 +527,7 @@ vdev_queue_fini(vdev_t *vd)
 
         list_destroy(&vq->vq_active_list);
         mutex_destroy(&vq->vq_lock);
+        mutex_destroy(&vq->vq_active_list_lock);
 }
 
 static void
```
```diff
@@ -572,7 +580,9 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
                 vq->vq_nia_credit--;
         }
         zio->io_queue_state = ZIO_QS_ACTIVE;
+        mutex_enter(&vq->vq_active_list_lock);
         list_insert_tail(&vq->vq_active_list, zio);
+        mutex_exit(&vq->vq_active_list_lock);
 }
 
 static void
```
```diff
@@ -589,7 +599,9 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
                         vq->vq_nia_credit = zfs_vdev_nia_credit;
         } else if (vq->vq_ia_active == 0)
                 vq->vq_nia_credit++;
+        mutex_enter(&vq->vq_active_list_lock);
         list_remove(&vq->vq_active_list, zio);
+        mutex_exit(&vq->vq_active_list_lock);
         zio->io_queue_state = ZIO_QS_NONE;
 }
```
```diff
@@ -946,6 +958,30 @@ vdev_queue_io(zio_t *zio)
         zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
         zio->io_timestamp = gethrtime();
 
+        /*
+         * Bypass queue if certain conditions are met. Queue bypassing requires
+         * a non-rotational device. Reads / writes will attempt to bypass queue,
+         * depending on how full the queue is. Other operations will always
+         * queue. Bypassing the queue can lead to a 2x IOPS speed-ups on some
+         * benchmarks. If the queue is too full (due to a scrub or resilver)
+         * then go back to queuing normal reads/writes so as not to starve out
+         * the more important IOs.
+         */
+        if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {
+
+                int bypass = vdev_queue_length(vq->vq_vdev) <
+                    (zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100
```
zfs_vdev_max_active was supposed to be equal to the number of commands the drive can handle. Its current limit of 1000 effectively disables the functionality, and taking 10% of it makes no sense. At the same time, if you reduce it to 32, which is reasonable for SATA, then 10% of it becomes 3 requests, which you reach immediately in most cases. At the same time, I don't see ...

Also, to avoid lock contention on one per-vdev lock you introduce another per-vdev lock, adding even more atomic operations, which makes me wonder whether this is an optimization at all.

@amotin you're right that we will need separate counters for both the number of normal queued IOs and the total number of active IOs (which include bypass IOs). I think this will work:

```diff
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 3be570afb..10e6a1692 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -147,7 +147,19 @@ struct vdev_queue {
zio_priority_t vq_last_prio; /* Last sent I/O priority. */
uint32_t vq_cqueued; /* Classes with queued I/Os. */
uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
+
+ /*
+ * Number of active I/Os. This includes I/Os that were previously
+ * queued and are now active, plus all the 'bypass' I/Os that bypassed
+ * the queue.
+ */
uint32_t vq_active; /* Number of active I/Os. */
+
+ /*
+ * Number of active I/Os that were previously queued. This is a subset
+ * of vq_active.
+ */
+ uint32_t vq_queued_active;
uint32_t vq_ia_active; /* Active interactive I/Os. */
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
list_t vq_active_list; /* List of active I/Os. */
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 91093233e..e9c0fc0cf 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -572,7 +572,6 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_cactive[zio->io_priority]++;
- vq->vq_active++;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (++vq->vq_ia_active == 1)
vq->vq_nia_credit = 1;
@@ -581,6 +580,8 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
}
zio->io_queue_state = ZIO_QS_ACTIVE;
mutex_enter(&vq->vq_active_list_lock);
+ vq->vq_active++;
+ vq->vq_queued_active++;
list_insert_tail(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
}
@@ -591,7 +592,6 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_cactive[zio->io_priority]--;
- vq->vq_active--;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (--vq->vq_ia_active == 0)
vq->vq_nia_credit = 0;
@@ -600,6 +600,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
} else if (vq->vq_ia_active == 0)
vq->vq_nia_credit++;
mutex_enter(&vq->vq_active_list_lock);
+ vq->vq_active--;
+ vq->vq_queued_active--;
list_remove(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
zio->io_queue_state = ZIO_QS_NONE;
@@ -969,13 +971,14 @@ vdev_queue_io(zio_t *zio)
*/
if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {
- int bypass = vdev_queue_length(vq->vq_vdev) <
+ boolean_t is_bypass = vq->vq_queued_active <
(zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100
? 1 : 0;
- if (bypass) {
+ if (is_bypass) {
zio->io_queue_state = ZIO_QS_BYPASS;
mutex_enter(&vq->vq_active_list_lock);
+ vq->vq_active++;
list_insert_tail(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
return (zio);
@@ -1016,6 +1019,7 @@ vdev_queue_io_done(zio_t *zio)
if (zio->io_queue_state == ZIO_QS_BYPASS) {
mutex_enter(&vq->vq_active_list_lock);
+ vq->vq_active--;
list_remove(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
zio->io_queue_state = ZIO_QS_NONE;
```

The Direct IO IOPS speedup is still 2x even with the introduction of the new ...

Regarding the queue size - keep in mind that the bypass code is only active on non-rotational disks, and the 10% number is only counting traditional queued IOs, not bypass IOs.

I haven't looked closely, but as a quick thought: if we obtain the lock and do the accounting anyway, can we instead keep the code as-is, but, once we match some criteria, bypass the priority handling and aggregation code in the queue, which take most of the time, as I understand it? I wonder if it could be cleaner but no less efficient.
```diff
+                    ? 1 : 0;
+
+                if (bypass) {
+                        zio->io_queue_state = ZIO_QS_BYPASS;
+                        mutex_enter(&vq->vq_active_list_lock);
+                        list_insert_tail(&vq->vq_active_list, zio);
+                        mutex_exit(&vq->vq_active_list_lock);
+                        return (zio);
+                }
+        }
+
         mutex_enter(&vq->vq_lock);
         vdev_queue_io_add(vq, zio);
         nio = vdev_queue_io_to_issue(vq);
```
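To make the numbers in the review thread above concrete, here is a small standalone sketch of the bypass threshold arithmetic. It only mirrors the expression `(zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100` from the hunk; the helper name `bypass_threshold` is hypothetical, and the value 32 is the SATA-sized limit mentioned in the comment, not a patch default.

```c
#include <stdio.h>

/* Hypothetical helper mirroring the patch's threshold expression. */
static unsigned int
bypass_threshold(unsigned int max_active, unsigned int bypass_pct)
{
	return (max_active * bypass_pct / 100);
}

int
main(void)
{
	/* Default zfs_vdev_max_active = 1000: bypass while < 100 I/Os queued. */
	printf("threshold @ 1000: %u\n", bypass_threshold(1000, 10));
	/* SATA-sized limit of 32 (per the review comment): bypass only while < 3 queued. */
	printf("threshold @ 32:   %u\n", bypass_threshold(32, 10));
	return (0);
}
```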
```diff
@@ -978,6 +1014,14 @@ vdev_queue_io_done(zio_t *zio)
         vq->vq_io_complete_ts = now;
         vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
 
+        if (zio->io_queue_state == ZIO_QS_BYPASS) {
+                mutex_enter(&vq->vq_active_list_lock);
+                list_remove(&vq->vq_active_list, zio);
+                mutex_exit(&vq->vq_active_list_lock);
+                zio->io_queue_state = ZIO_QS_NONE;
+                return;
+        }
+
         mutex_enter(&vq->vq_lock);
         vdev_queue_pending_remove(vq, zio);
```
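The reply in the thread above proposes splitting the accounting into vq_active (all in-flight I/Os, including bypass ones) and vq_queued_active (only those that went through the queue). Below is a minimal userland model of that bookkeeping, just to show the intended invariant; the struct and the helpers `queued_issue`/`bypass_issue` are hypothetical and not part of the patch.

```c
#include <assert.h>
#include <stdio.h>

/* Userland stand-in for the two counters proposed in the review reply. */
struct vq_counters {
	unsigned int vq_active;		/* all in-flight I/Os */
	unsigned int vq_queued_active;	/* in-flight I/Os that were queued */
};

static void
queued_issue(struct vq_counters *vq)
{
	vq->vq_active++;
	vq->vq_queued_active++;
}

static void
bypass_issue(struct vq_counters *vq)
{
	vq->vq_active++;	/* bypass I/Os are active but never queued */
}

int
main(void)
{
	struct vq_counters vq = { 0, 0 };

	queued_issue(&vq);
	bypass_issue(&vq);
	bypass_issue(&vq);

	/* Invariant: queued-active I/Os are a subset of active I/Os. */
	assert(vq.vq_queued_active <= vq.vq_active);
	printf("active=%u queued_active=%u bypass=%u\n",
	    vq.vq_active, vq.vq_queued_active,
	    vq.vq_active - vq.vq_queued_active);
	return (0);
}
```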
```diff
@@ -1163,3 +1207,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
         "Default queue depth for each allocator");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_bypass_pct, UINT, ZMOD_RW,
+        "Queue bypass percentage per vdev");
```
Comment on lines +1211 to +1212:

You'll want to add an entry for ...

Added.
Might want to add a comment here so it's understood why these flags are called out: ...

Added.