1c1cb2cd8Shaad /*
2c1cb2cd8Shaad * CDDL HEADER START
3c1cb2cd8Shaad *
4c1cb2cd8Shaad * The contents of this file are subject to the terms of the
5c1cb2cd8Shaad * Common Development and Distribution License (the "License").
6c1cb2cd8Shaad * You may not use this file except in compliance with the License.
7c1cb2cd8Shaad *
8c1cb2cd8Shaad * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9c1cb2cd8Shaad * or http://www.opensolaris.org/os/licensing.
10c1cb2cd8Shaad * See the License for the specific language governing permissions
11c1cb2cd8Shaad * and limitations under the License.
12c1cb2cd8Shaad *
13c1cb2cd8Shaad * When distributing Covered Code, include this CDDL HEADER in each
14c1cb2cd8Shaad * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15c1cb2cd8Shaad * If applicable, add the following below this CDDL HEADER, with the
16c1cb2cd8Shaad * fields enclosed by brackets "[]" replaced with your own identifying
17c1cb2cd8Shaad * information: Portions Copyright [yyyy] [name of copyright owner]
18c1cb2cd8Shaad *
19c1cb2cd8Shaad * CDDL HEADER END
20c1cb2cd8Shaad */
21c1cb2cd8Shaad /*
22f59c7639Shaad * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23c1cb2cd8Shaad * Use is subject to license terms.
24c1cb2cd8Shaad */
25c1cb2cd8Shaad
263227e6cfSchs /*
273227e6cfSchs * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
283227e6cfSchs * Copyright (c) 2014 Integros [integros.com]
293227e6cfSchs */
303227e6cfSchs
31c1cb2cd8Shaad #include <sys/zfs_context.h>
32c1cb2cd8Shaad #include <sys/vdev_impl.h>
333227e6cfSchs #include <sys/spa_impl.h>
34c1cb2cd8Shaad #include <sys/zio.h>
35c1cb2cd8Shaad #include <sys/avl.h>
363227e6cfSchs #include <sys/dsl_pool.h>
373227e6cfSchs #include <sys/metaslab_impl.h>
38c1cb2cd8Shaad
39c1cb2cd8Shaad /*
403227e6cfSchs * ZFS I/O Scheduler
413227e6cfSchs * ---------------
423227e6cfSchs *
433227e6cfSchs * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
443227e6cfSchs * I/O scheduler determines when and in what order those operations are
453227e6cfSchs * issued. The I/O scheduler divides operations into six I/O classes
463227e6cfSchs * prioritized in the following order: sync read, sync write, async read,
473227e6cfSchs * async write, scrub/resilver and trim. Each queue defines the minimum and
483227e6cfSchs * maximum number of concurrent operations that may be issued to the device.
493227e6cfSchs * In addition, the device has an aggregate maximum. Note that the sum of the
503227e6cfSchs * per-queue minimums must not exceed the aggregate maximum, and if the
513227e6cfSchs * aggregate maximum is equal to or greater than the sum of the per-queue
523227e6cfSchs * maximums, the per-queue minimum has no effect.
533227e6cfSchs *
543227e6cfSchs * For many physical devices, throughput increases with the number of
553227e6cfSchs * concurrent operations, but latency typically suffers. Further, physical
563227e6cfSchs * devices typically have a limit at which more concurrent operations have no
573227e6cfSchs * effect on throughput or can actually cause it to decrease.
583227e6cfSchs *
593227e6cfSchs * The scheduler selects the next operation to issue by first looking for an
603227e6cfSchs * I/O class whose minimum has not been satisfied. Once all are satisfied and
613227e6cfSchs * the aggregate maximum has not been hit, the scheduler looks for classes
623227e6cfSchs * whose maximum has not been satisfied. Iteration through the I/O classes is
633227e6cfSchs * done in the order specified above. No further operations are issued if the
643227e6cfSchs * aggregate maximum number of concurrent operations has been hit or if there
653227e6cfSchs * are no operations queued for an I/O class that has not hit its maximum.
663227e6cfSchs * Every time an I/O is queued or an operation completes, the I/O scheduler
673227e6cfSchs * looks for new operations to issue.
683227e6cfSchs *
693227e6cfSchs * All I/O classes have a fixed maximum number of outstanding operations
703227e6cfSchs * except for the async write class. Asynchronous writes represent the data
713227e6cfSchs * that is committed to stable storage during the syncing stage for
723227e6cfSchs * transaction groups (see txg.c). Transaction groups enter the syncing state
733227e6cfSchs * periodically so the number of queued async writes will quickly burst up and
743227e6cfSchs * then bleed down to zero. Rather than servicing them as quickly as possible,
753227e6cfSchs * the I/O scheduler changes the maximum number of active async write I/Os
763227e6cfSchs * according to the amount of dirty data in the pool (see dsl_pool.c). Since
773227e6cfSchs * both throughput and latency typically increase with the number of
783227e6cfSchs * concurrent operations issued to physical devices, reducing the burstiness
793227e6cfSchs * in the number of concurrent operations also stabilizes the response time of
803227e6cfSchs * operations from other -- and in particular synchronous -- queues. In broad
813227e6cfSchs * strokes, the I/O scheduler will issue more concurrent operations from the
823227e6cfSchs * async write queue as there's more dirty data in the pool.
833227e6cfSchs *
843227e6cfSchs * Async Writes
853227e6cfSchs *
863227e6cfSchs * The number of concurrent operations issued for the async write I/O class
873227e6cfSchs * follows a piece-wise linear function defined by a few adjustable points.
883227e6cfSchs *
893227e6cfSchs * | o---------| <-- zfs_vdev_async_write_max_active
903227e6cfSchs * ^ | /^ |
913227e6cfSchs * | | / | |
923227e6cfSchs * active | / | |
933227e6cfSchs * I/O | / | |
943227e6cfSchs * count | / | |
953227e6cfSchs * | / | |
963227e6cfSchs * |------------o | | <-- zfs_vdev_async_write_min_active
973227e6cfSchs * 0|____________^______|_________|
983227e6cfSchs * 0% | | 100% of zfs_dirty_data_max
993227e6cfSchs * | |
1003227e6cfSchs * | `-- zfs_vdev_async_write_active_max_dirty_percent
1013227e6cfSchs * `--------- zfs_vdev_async_write_active_min_dirty_percent
1023227e6cfSchs *
1033227e6cfSchs * Until the amount of dirty data exceeds a minimum percentage of the dirty
1043227e6cfSchs * data allowed in the pool, the I/O scheduler will limit the number of
1053227e6cfSchs * concurrent operations to the minimum. As that threshold is crossed, the
1063227e6cfSchs * number of concurrent operations issued increases linearly to the maximum at
1073227e6cfSchs * the specified maximum percentage of the dirty data allowed in the pool.
1083227e6cfSchs *
1093227e6cfSchs * Ideally, the amount of dirty data on a busy pool will stay in the sloped
1103227e6cfSchs * part of the function between zfs_vdev_async_write_active_min_dirty_percent
1113227e6cfSchs * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
1123227e6cfSchs * maximum percentage, this indicates that the rate of incoming data is
1133227e6cfSchs * greater than the rate that the backend storage can handle. In this case, we
1143227e6cfSchs * must further throttle incoming writes (see dmu_tx_delay() for details).
115c1cb2cd8Shaad */
1163227e6cfSchs
/*
 * The maximum number of I/Os active to each device.  Ideally, this will be >=
 * the sum of each queue's max_active.  It must be at least the sum of each
 * queue's min_active.
 */
uint32_t zfs_vdev_max_active = 1000;

/*
 * Per-queue limits on the number of I/Os active to each device.  If the
 * sum of the queue's max_active is < zfs_vdev_max_active, then the
 * min_active comes into play.  We will send min_active from each queue,
 * and then select from queues in the order defined by zio_priority_t.
 *
 * In general, smaller max_active's will lead to lower latency of synchronous
 * operations.  Larger max_active's may lead to higher overall throughput,
 * depending on underlying storage.
 *
 * The ratio of the queues' max_actives determines the balance of performance
 * between reads, writes, and scrubs.  E.g., increasing
 * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
 * more quickly, but reads and writes to have higher latency and lower
 * throughput.
 */
uint32_t zfs_vdev_sync_read_min_active = 10;
uint32_t zfs_vdev_sync_read_max_active = 10;
uint32_t zfs_vdev_sync_write_min_active = 10;
uint32_t zfs_vdev_sync_write_max_active = 10;
uint32_t zfs_vdev_async_read_min_active = 1;
uint32_t zfs_vdev_async_read_max_active = 3;
uint32_t zfs_vdev_async_write_min_active = 1;
uint32_t zfs_vdev_async_write_max_active = 10;
uint32_t zfs_vdev_scrub_min_active = 1;
uint32_t zfs_vdev_scrub_max_active = 2;
uint32_t zfs_vdev_trim_min_active = 1;
/*
 * TRIM max active is large in comparison to the other values due to the fact
 * that TRIM IOs are coalesced at the device layer.  This value is set such
 * that a typical SSD can process the queued IOs in a single request.
 */
uint32_t zfs_vdev_trim_max_active = 64;


/*
 * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
 * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
 * zfs_vdev_async_write_active_max_dirty_percent, use
 * zfs_vdev_async_write_max_active.  The value is linearly interpolated
 * between min and max.
 */
int zfs_vdev_async_write_active_min_dirty_percent = 30;
int zfs_vdev_async_write_active_max_dirty_percent = 60;

/*
 * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
 * For read I/Os, we also aggregate across small adjacency gaps; for writes
 * we include spans of optional I/Os to aid aggregation at the disk even when
 * they aren't able to help us aggregate at this level.
 */
int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;

/*
 * Define the queue depth percentage for each top-level.  This percentage is
 * used in conjunction with zfs_vdev_async_write_max_active to determine how
 * many allocations a specific top-level vdev should handle.  Once the queue
 * depth reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active /
 * 100 then allocator will stop allocating blocks on that top-level device.
 * The default kernel setting is 1000% which will yield 100 allocations per
 * device.  For userland testing, the default setting is 300% which equates
 * to 30 allocations per device.
 */
#ifdef _KERNEL
int zfs_vdev_queue_depth_pct = 1000;
#else
int zfs_vdev_queue_depth_pct = 300;
#endif
1943227e6cfSchs
1953227e6cfSchs
1963227e6cfSchs #ifdef __FreeBSD__
1973227e6cfSchs #ifdef _KERNEL
1983227e6cfSchs SYSCTL_DECL(_vfs_zfs_vdev);
1993227e6cfSchs
2003227e6cfSchs static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
2013227e6cfSchs SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
2023227e6cfSchs CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
2033227e6cfSchs sysctl_zfs_async_write_active_min_dirty_percent, "I",
2043227e6cfSchs "Percentage of async write dirty data below which "
2053227e6cfSchs "async_write_min_active is used.");
2063227e6cfSchs
2073227e6cfSchs static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
2083227e6cfSchs SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
2093227e6cfSchs CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
2103227e6cfSchs sysctl_zfs_async_write_active_max_dirty_percent, "I",
2113227e6cfSchs "Percentage of async write dirty data above which "
2123227e6cfSchs "async_write_max_active is used.");
2133227e6cfSchs
2143227e6cfSchs SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
2153227e6cfSchs &zfs_vdev_max_active, 0,
2163227e6cfSchs "The maximum number of I/Os of all types active for each device.");
2173227e6cfSchs
2183227e6cfSchs #define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
2193227e6cfSchs SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
2203227e6cfSchs &zfs_vdev_ ## name ## _min_active, 0, \
2213227e6cfSchs "Initial number of I/O requests of type " #name \
2223227e6cfSchs " active for each device");
2233227e6cfSchs
2243227e6cfSchs #define ZFS_VDEV_QUEUE_KNOB_MAX(name) \
2253227e6cfSchs SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\
2263227e6cfSchs &zfs_vdev_ ## name ## _max_active, 0, \
2273227e6cfSchs "Maximum number of I/O requests of type " #name \
2283227e6cfSchs " active for each device");
2293227e6cfSchs
2303227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
2313227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
2323227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
2333227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
2343227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
2353227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
2363227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
2373227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
2383227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
2393227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
2403227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MIN(trim);
2413227e6cfSchs ZFS_VDEV_QUEUE_KNOB_MAX(trim);
2423227e6cfSchs
2433227e6cfSchs #undef ZFS_VDEV_QUEUE_KNOB
2443227e6cfSchs
2453227e6cfSchs SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
2463227e6cfSchs &zfs_vdev_aggregation_limit, 0,
2473227e6cfSchs "I/O requests are aggregated up to this size");
2483227e6cfSchs SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
2493227e6cfSchs &zfs_vdev_read_gap_limit, 0,
2503227e6cfSchs "Acceptable gap between two reads being aggregated");
2513227e6cfSchs SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
2523227e6cfSchs &zfs_vdev_write_gap_limit, 0,
2533227e6cfSchs "Acceptable gap between two writes being aggregated");
2543227e6cfSchs SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
2553227e6cfSchs &zfs_vdev_queue_depth_pct, 0,
2563227e6cfSchs "Queue depth percentage for each top-level");
2573227e6cfSchs
2583227e6cfSchs static int
sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)2593227e6cfSchs sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
260c1cb2cd8Shaad {
2613227e6cfSchs int val, err;
262c1cb2cd8Shaad
2633227e6cfSchs val = zfs_vdev_async_write_active_min_dirty_percent;
2643227e6cfSchs err = sysctl_handle_int(oidp, &val, 0, req);
2653227e6cfSchs if (err != 0 || req->newptr == NULL)
2663227e6cfSchs return (err);
267c1cb2cd8Shaad
2683227e6cfSchs if (val < 0 || val > 100 ||
2693227e6cfSchs val >= zfs_vdev_async_write_active_max_dirty_percent)
2703227e6cfSchs return (EINVAL);
271c1cb2cd8Shaad
2723227e6cfSchs zfs_vdev_async_write_active_min_dirty_percent = val;
273c1cb2cd8Shaad
274c1cb2cd8Shaad return (0);
275c1cb2cd8Shaad }
276c1cb2cd8Shaad
2773227e6cfSchs static int
sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)2783227e6cfSchs sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
2793227e6cfSchs {
2803227e6cfSchs int val, err;
2813227e6cfSchs
2823227e6cfSchs val = zfs_vdev_async_write_active_max_dirty_percent;
2833227e6cfSchs err = sysctl_handle_int(oidp, &val, 0, req);
2843227e6cfSchs if (err != 0 || req->newptr == NULL)
2853227e6cfSchs return (err);
2863227e6cfSchs
2873227e6cfSchs if (val < 0 || val > 100 ||
2883227e6cfSchs val <= zfs_vdev_async_write_active_min_dirty_percent)
2893227e6cfSchs return (EINVAL);
2903227e6cfSchs
2913227e6cfSchs zfs_vdev_async_write_active_max_dirty_percent = val;
2923227e6cfSchs
2933227e6cfSchs return (0);
2943227e6cfSchs }
2953227e6cfSchs #endif
2963227e6cfSchs #endif
2973227e6cfSchs
298c1cb2cd8Shaad int
vdev_queue_offset_compare(const void * x1,const void * x2)299c1cb2cd8Shaad vdev_queue_offset_compare(const void *x1, const void *x2)
300c1cb2cd8Shaad {
301c1cb2cd8Shaad const zio_t *z1 = x1;
302c1cb2cd8Shaad const zio_t *z2 = x2;
303c1cb2cd8Shaad
304c1cb2cd8Shaad if (z1->io_offset < z2->io_offset)
305c1cb2cd8Shaad return (-1);
306c1cb2cd8Shaad if (z1->io_offset > z2->io_offset)
307c1cb2cd8Shaad return (1);
308c1cb2cd8Shaad
309c1cb2cd8Shaad if (z1 < z2)
310c1cb2cd8Shaad return (-1);
311c1cb2cd8Shaad if (z1 > z2)
312c1cb2cd8Shaad return (1);
313c1cb2cd8Shaad
314c1cb2cd8Shaad return (0);
315c1cb2cd8Shaad }
316c1cb2cd8Shaad
/*
 * Return the AVL tree holding the queued (not yet issued) i/os of the
 * given priority class.
 */
static inline avl_tree_t *
vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
{
	return (&vq->vq_class[p].vqc_queued_tree);
}
3223227e6cfSchs
3233227e6cfSchs static inline avl_tree_t *
vdev_queue_type_tree(vdev_queue_t * vq,zio_type_t t)3243227e6cfSchs vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
3253227e6cfSchs {
3263227e6cfSchs if (t == ZIO_TYPE_READ)
3273227e6cfSchs return (&vq->vq_read_offset_tree);
3283227e6cfSchs else if (t == ZIO_TYPE_WRITE)
3293227e6cfSchs return (&vq->vq_write_offset_tree);
3303227e6cfSchs else
3313227e6cfSchs return (NULL);
3323227e6cfSchs }
3333227e6cfSchs
3343227e6cfSchs int
vdev_queue_timestamp_compare(const void * x1,const void * x2)3353227e6cfSchs vdev_queue_timestamp_compare(const void *x1, const void *x2)
3363227e6cfSchs {
3373227e6cfSchs const zio_t *z1 = x1;
3383227e6cfSchs const zio_t *z2 = x2;
3393227e6cfSchs
3403227e6cfSchs if (z1->io_timestamp < z2->io_timestamp)
3413227e6cfSchs return (-1);
3423227e6cfSchs if (z1->io_timestamp > z2->io_timestamp)
3433227e6cfSchs return (1);
3443227e6cfSchs
3453227e6cfSchs if (z1->io_offset < z2->io_offset)
3463227e6cfSchs return (-1);
3473227e6cfSchs if (z1->io_offset > z2->io_offset)
3483227e6cfSchs return (1);
3493227e6cfSchs
3503227e6cfSchs if (z1 < z2)
3513227e6cfSchs return (-1);
3523227e6cfSchs if (z1 > z2)
3533227e6cfSchs return (1);
3543227e6cfSchs
3553227e6cfSchs return (0);
3563227e6cfSchs }
3573227e6cfSchs
/*
 * Initialize the per-vdev i/o queue: its lock, the tree of active
 * (issued) i/os, the per-type offset trees used for aggregation, and
 * one queued-i/o tree per priority class.
 */
void
vdev_queue_init(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
	vq->vq_vdev = vd;

	/* Active (issued) i/os, kept in offset order. */
	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
	/* Offset-sorted views of queued reads and writes (for aggregation). */
	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
	    vdev_queue_offset_compare, sizeof (zio_t),
	    offsetof(struct zio, io_offset_node));
	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
	    vdev_queue_offset_compare, sizeof (zio_t),
	    offsetof(struct zio, io_offset_node));

	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
		int (*compfn) (const void *, const void *);

		/*
		 * The synchronous i/o queues are dispatched in FIFO rather
		 * than LBA order.  This provides more consistent latency for
		 * these i/os.
		 */
		if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
			compfn = vdev_queue_timestamp_compare;
		else
			compfn = vdev_queue_offset_compare;

		avl_create(vdev_queue_class_tree(vq, p), compfn,
		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
	}

	vq->vq_lastoffset = 0;
}
394c1cb2cd8Shaad
/*
 * Tear down the per-vdev i/o queue created by vdev_queue_init():
 * destroy every per-class tree, the active tree, the per-type offset
 * trees, and finally the queue lock.
 */
void
vdev_queue_fini(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
		avl_destroy(vdev_queue_class_tree(vq, p));
	avl_destroy(&vq->vq_active_tree);
	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));

	mutex_destroy(&vq->vq_lock);
}
408c1cb2cd8Shaad
/*
 * Insert a zio into its priority-class tree and, for reads and writes,
 * into the per-type offset tree as well.  Caller must hold vq_lock.
 * On illumos this also bumps the per-spa wait-queue kstats.
 */
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	avl_tree_t *qtt;

	ASSERT(MUTEX_HELD(&vq->vq_lock));
	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
	/* Only reads and writes have an offset tree (NULL otherwise). */
	qtt = vdev_queue_type_tree(vq, zio->io_type);
	if (qtt)
		avl_add(qtt, zio);

#ifdef illumos
	mutex_enter(&spa->spa_iokstat_lock);
	spa->spa_queue_stats[zio->io_priority].spa_queued++;
	if (spa->spa_iokstat != NULL)
		kstat_waitq_enter(spa->spa_iokstat->ks_data);
	mutex_exit(&spa->spa_iokstat_lock);
#endif
}
430c1cb2cd8Shaad
/*
 * Remove a zio from its priority-class tree and, for reads and writes,
 * from the per-type offset tree.  Caller must hold vq_lock.  Inverse of
 * vdev_queue_io_add(); on illumos it also decrements the wait-queue kstats.
 */
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	avl_tree_t *qtt;

	ASSERT(MUTEX_HELD(&vq->vq_lock));
	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
	/* Only reads and writes have an offset tree (NULL otherwise). */
	qtt = vdev_queue_type_tree(vq, zio->io_type);
	if (qtt)
		avl_remove(qtt, zio);

#ifdef illumos
	mutex_enter(&spa->spa_iokstat_lock);
	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
	spa->spa_queue_stats[zio->io_priority].spa_queued--;
	if (spa->spa_iokstat != NULL)
		kstat_waitq_exit(spa->spa_iokstat->ks_data);
	mutex_exit(&spa->spa_iokstat_lock);
#endif
}
4533227e6cfSchs
/*
 * Account a zio as issued (active): bump the class's active count and
 * insert it into the active tree.  Caller must hold vq_lock.  On illumos
 * this also moves the i/o from the wait queue to the run queue kstats.
 */
static void
vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	ASSERT(MUTEX_HELD(&vq->vq_lock));
	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	vq->vq_class[zio->io_priority].vqc_active++;
	avl_add(&vq->vq_active_tree, zio);

#ifdef illumos
	mutex_enter(&spa->spa_iokstat_lock);
	spa->spa_queue_stats[zio->io_priority].spa_active++;
	if (spa->spa_iokstat != NULL)
		kstat_runq_enter(spa->spa_iokstat->ks_data);
	mutex_exit(&spa->spa_iokstat_lock);
#endif
}
4713227e6cfSchs
/*
 * Account a zio as no longer active: drop the class's active count and
 * remove it from the active tree.  Caller must hold vq_lock.  On illumos
 * this also exits the run queue kstat and records the completed read or
 * write operation and byte count.
 */
static void
vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	ASSERT(MUTEX_HELD(&vq->vq_lock));
	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	vq->vq_class[zio->io_priority].vqc_active--;
	avl_remove(&vq->vq_active_tree, zio);

#ifdef illumos
	mutex_enter(&spa->spa_iokstat_lock);
	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
	spa->spa_queue_stats[zio->io_priority].spa_active--;
	if (spa->spa_iokstat != NULL) {
		kstat_io_t *ksio = spa->spa_iokstat->ks_data;

		kstat_runq_exit(spa->spa_iokstat->ks_data);
		if (zio->io_type == ZIO_TYPE_READ) {
			ksio->reads++;
			ksio->nread += zio->io_size;
		} else if (zio->io_type == ZIO_TYPE_WRITE) {
			ksio->writes++;
			ksio->nwritten += zio->io_size;
		}
	}
	mutex_exit(&spa->spa_iokstat_lock);
#endif
}
500c1cb2cd8Shaad
/*
 * Done callback for an aggregated i/o.  For an aggregated read, copy the
 * relevant slice of the aggregate buffer back into each parent zio's own
 * buffer; in all cases, free the aggregate buffer.
 */
static void
vdev_queue_agg_io_done(zio_t *aio)
{
	if (aio->io_type == ZIO_TYPE_READ) {
		zio_t *pio;
		zio_link_t *zl = NULL;
		while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
			/* Copy this parent's portion out of the aggregate. */
			bcopy((char *)aio->io_data + (pio->io_offset -
			    aio->io_offset), pio->io_data, pio->io_size);
		}
	}

	zio_buf_free(aio->io_data, aio->io_size);
}
515c1cb2cd8Shaad
5163227e6cfSchs static int
vdev_queue_class_min_active(zio_priority_t p)5173227e6cfSchs vdev_queue_class_min_active(zio_priority_t p)
5183227e6cfSchs {
5193227e6cfSchs switch (p) {
5203227e6cfSchs case ZIO_PRIORITY_SYNC_READ:
5213227e6cfSchs return (zfs_vdev_sync_read_min_active);
5223227e6cfSchs case ZIO_PRIORITY_SYNC_WRITE:
5233227e6cfSchs return (zfs_vdev_sync_write_min_active);
5243227e6cfSchs case ZIO_PRIORITY_ASYNC_READ:
5253227e6cfSchs return (zfs_vdev_async_read_min_active);
5263227e6cfSchs case ZIO_PRIORITY_ASYNC_WRITE:
5273227e6cfSchs return (zfs_vdev_async_write_min_active);
5283227e6cfSchs case ZIO_PRIORITY_SCRUB:
5293227e6cfSchs return (zfs_vdev_scrub_min_active);
5303227e6cfSchs case ZIO_PRIORITY_TRIM:
5313227e6cfSchs return (zfs_vdev_trim_min_active);
5323227e6cfSchs default:
5333227e6cfSchs panic("invalid priority %u", p);
5343227e6cfSchs return (0);
5353227e6cfSchs }
5363227e6cfSchs }
5373227e6cfSchs
/*
 * Compute the current maximum number of active async-write i/os for the
 * pool.  Below the min dirty threshold this is the min_active tunable,
 * above the max dirty threshold it is max_active, and in between it is
 * linearly interpolated on the pool's dirty-data total (see the "Async
 * Writes" discussion at the top of this file).
 */
static __noinline int
vdev_queue_max_async_writes(spa_t *spa)
{
	int writes;
	uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
	/* Thresholds as absolute byte counts of zfs_dirty_data_max. */
	uint64_t min_bytes = zfs_dirty_data_max *
	    zfs_vdev_async_write_active_min_dirty_percent / 100;
	uint64_t max_bytes = zfs_dirty_data_max *
	    zfs_vdev_async_write_active_max_dirty_percent / 100;

	/*
	 * Sync tasks correspond to interactive user actions. To reduce the
	 * execution time of those actions we push data out as fast as possible.
	 */
	if (spa_has_pending_synctask(spa)) {
		return (zfs_vdev_async_write_max_active);
	}

	if (dirty < min_bytes)
		return (zfs_vdev_async_write_min_active);
	if (dirty > max_bytes)
		return (zfs_vdev_async_write_max_active);

	/*
	 * linear interpolation:
	 * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
	 * move right by min_bytes
	 * move up by min_writes
	 */
	writes = (dirty - min_bytes) *
	    (zfs_vdev_async_write_max_active -
	    zfs_vdev_async_write_min_active) /
	    (max_bytes - min_bytes) +
	    zfs_vdev_async_write_min_active;
	ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
	ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
	return (writes);
}
5763227e6cfSchs
5773227e6cfSchs static int
vdev_queue_class_max_active(spa_t * spa,zio_priority_t p)5783227e6cfSchs vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
5793227e6cfSchs {
5803227e6cfSchs switch (p) {
5813227e6cfSchs case ZIO_PRIORITY_SYNC_READ:
5823227e6cfSchs return (zfs_vdev_sync_read_max_active);
5833227e6cfSchs case ZIO_PRIORITY_SYNC_WRITE:
5843227e6cfSchs return (zfs_vdev_sync_write_max_active);
5853227e6cfSchs case ZIO_PRIORITY_ASYNC_READ:
5863227e6cfSchs return (zfs_vdev_async_read_max_active);
5873227e6cfSchs case ZIO_PRIORITY_ASYNC_WRITE:
5883227e6cfSchs return (vdev_queue_max_async_writes(spa));
5893227e6cfSchs case ZIO_PRIORITY_SCRUB:
5903227e6cfSchs return (zfs_vdev_scrub_max_active);
5913227e6cfSchs case ZIO_PRIORITY_TRIM:
5923227e6cfSchs return (zfs_vdev_trim_max_active);
5933227e6cfSchs default:
5943227e6cfSchs panic("invalid priority %u", p);
5953227e6cfSchs return (0);
5963227e6cfSchs }
5973227e6cfSchs }
5983227e6cfSchs
/*
 * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
 * there is no eligible class.  Classes below their per-class minimum are
 * preferred; otherwise the first class (in zio_priority_t order) that is
 * both non-empty and below its maximum is chosen.  Caller must hold
 * vq_lock.
 */
static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
	spa_t *spa = vq->vq_vdev->vdev_spa;
	zio_priority_t p;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	/* Respect the aggregate per-device limit first. */
	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
		return (ZIO_PRIORITY_NUM_QUEUEABLE);

	/* find a queue that has not reached its minimum # outstanding i/os */
	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
		    vq->vq_class[p].vqc_active <
		    vdev_queue_class_min_active(p))
			return (p);
	}

	/*
	 * If we haven't found a queue, look for one that hasn't reached its
	 * maximum # outstanding i/os.
	 */
	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
		    vq->vq_class[p].vqc_active <
		    vdev_queue_class_max_active(spa, p))
			return (p);
	}

	/* No eligible queued i/os */
	return (ZIO_PRIORITY_NUM_QUEUEABLE);
}
6363227e6cfSchs
/*
 * Compute the range spanned by two i/os, which is the endpoint of the last
 * (lio->io_offset + lio->io_size) minus the start of the first
 * (fio->io_offset).  Conveniently, the gap between fio and lio is given by
 * -IO_SPAN(lio, fio); thus fio and lio are adjacent if and only if
 * IO_SPAN(lio, fio) == 0.
 */
#define	IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
645c1cb2cd8Shaad
646c1cb2cd8Shaad static zio_t *
vdev_queue_aggregate(vdev_queue_t * vq,zio_t * zio)6473227e6cfSchs vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
648c1cb2cd8Shaad {
6493227e6cfSchs zio_t *first, *last, *aio, *dio, *mandatory, *nio;
6503227e6cfSchs void *abuf;
6513227e6cfSchs uint64_t maxgap = 0;
6523227e6cfSchs uint64_t size;
6533227e6cfSchs boolean_t stretch;
654f59c7639Shaad avl_tree_t *t;
6553227e6cfSchs enum zio_flag flags;
656c1cb2cd8Shaad
657c1cb2cd8Shaad ASSERT(MUTEX_HELD(&vq->vq_lock));
658c1cb2cd8Shaad
6593227e6cfSchs if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
660c1cb2cd8Shaad return (NULL);
661c1cb2cd8Shaad
6623227e6cfSchs first = last = zio;
663c1cb2cd8Shaad
6643227e6cfSchs if (zio->io_type == ZIO_TYPE_READ)
6653227e6cfSchs maxgap = zfs_vdev_read_gap_limit;
666c1cb2cd8Shaad
667f59c7639Shaad /*
668f59c7639Shaad * We can aggregate I/Os that are sufficiently adjacent and of
669f59c7639Shaad * the same flavor, as expressed by the AGG_INHERIT flags.
670f59c7639Shaad * The latter requirement is necessary so that certain
671f59c7639Shaad * attributes of the I/O, such as whether it's a normal I/O
672f59c7639Shaad * or a scrub/resilver, can be preserved in the aggregate.
673f59c7639Shaad * We can include optional I/Os, but don't allow them
674f59c7639Shaad * to begin a range as they add no benefit in that situation.
675f59c7639Shaad */
676f59c7639Shaad
677f59c7639Shaad /*
678f59c7639Shaad * We keep track of the last non-optional I/O.
679f59c7639Shaad */
6803227e6cfSchs mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
681f59c7639Shaad
682f59c7639Shaad /*
683f59c7639Shaad * Walk backwards through sufficiently contiguous I/Os
684f59c7639Shaad * recording the last non-option I/O.
685f59c7639Shaad */
6863227e6cfSchs flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
6873227e6cfSchs t = vdev_queue_type_tree(vq, zio->io_type);
6883227e6cfSchs while (t != NULL && (dio = AVL_PREV(t, first)) != NULL &&
689f59c7639Shaad (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
6903227e6cfSchs IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
6913227e6cfSchs IO_GAP(dio, first) <= maxgap) {
6923227e6cfSchs first = dio;
6933227e6cfSchs if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
6943227e6cfSchs mandatory = first;
695c1cb2cd8Shaad }
696c1cb2cd8Shaad
697f59c7639Shaad /*
698f59c7639Shaad * Skip any initial optional I/Os.
699f59c7639Shaad */
7003227e6cfSchs while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
7013227e6cfSchs first = AVL_NEXT(t, first);
7023227e6cfSchs ASSERT(first != NULL);
703f59c7639Shaad }
704f59c7639Shaad
705f59c7639Shaad /*
706f59c7639Shaad * Walk forward through sufficiently contiguous I/Os.
707f59c7639Shaad */
7083227e6cfSchs while ((dio = AVL_NEXT(t, last)) != NULL &&
709f59c7639Shaad (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
7103227e6cfSchs IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
7113227e6cfSchs IO_GAP(last, dio) <= maxgap) {
7123227e6cfSchs last = dio;
7133227e6cfSchs if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
7143227e6cfSchs mandatory = last;
715f59c7639Shaad }
716f59c7639Shaad
717f59c7639Shaad /*
718f59c7639Shaad * Now that we've established the range of the I/O aggregation
719f59c7639Shaad * we must decide what to do with trailing optional I/Os.
720f59c7639Shaad * For reads, there's nothing to do. While we are unable to
721f59c7639Shaad * aggregate further, it's possible that a trailing optional
722f59c7639Shaad * I/O would allow the underlying device to aggregate with
723f59c7639Shaad * subsequent I/Os. We must therefore determine if the next
724f59c7639Shaad * non-optional I/O is close enough to make aggregation
725f59c7639Shaad * worthwhile.
726f59c7639Shaad */
727f59c7639Shaad stretch = B_FALSE;
7283227e6cfSchs if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
7293227e6cfSchs zio_t *nio = last;
730f59c7639Shaad while ((dio = AVL_NEXT(t, nio)) != NULL &&
731f59c7639Shaad IO_GAP(nio, dio) == 0 &&
7323227e6cfSchs IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
733f59c7639Shaad nio = dio;
734f59c7639Shaad if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
735f59c7639Shaad stretch = B_TRUE;
736f59c7639Shaad break;
737f59c7639Shaad }
738f59c7639Shaad }
739f59c7639Shaad }
740f59c7639Shaad
741f59c7639Shaad if (stretch) {
742f59c7639Shaad /* This may be a no-op. */
7433227e6cfSchs dio = AVL_NEXT(t, last);
744f59c7639Shaad dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
745f59c7639Shaad } else {
7463227e6cfSchs while (last != mandatory && last != first) {
7473227e6cfSchs ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
7483227e6cfSchs last = AVL_PREV(t, last);
7493227e6cfSchs ASSERT(last != NULL);
750f59c7639Shaad }
751c1cb2cd8Shaad }
752c1cb2cd8Shaad
7533227e6cfSchs if (first == last)
7543227e6cfSchs return (NULL);
755c1cb2cd8Shaad
7563227e6cfSchs size = IO_SPAN(first, last);
7573227e6cfSchs ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
7583227e6cfSchs
7593227e6cfSchs abuf = zio_buf_alloc_nowait(size);
7603227e6cfSchs if (abuf == NULL)
7613227e6cfSchs return (NULL);
7623227e6cfSchs
7633227e6cfSchs aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
7643227e6cfSchs abuf, size, first->io_type, zio->io_priority,
765f59c7639Shaad flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
766c1cb2cd8Shaad vdev_queue_agg_io_done, NULL);
7673227e6cfSchs aio->io_timestamp = first->io_timestamp;
768c1cb2cd8Shaad
7693227e6cfSchs nio = first;
770f59c7639Shaad do {
771f59c7639Shaad dio = nio;
772f59c7639Shaad nio = AVL_NEXT(t, dio);
7733227e6cfSchs ASSERT3U(dio->io_type, ==, aio->io_type);
774f59c7639Shaad
775f59c7639Shaad if (dio->io_flags & ZIO_FLAG_NODATA) {
7763227e6cfSchs ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
777f59c7639Shaad bzero((char *)aio->io_data + (dio->io_offset -
778f59c7639Shaad aio->io_offset), dio->io_size);
779f59c7639Shaad } else if (dio->io_type == ZIO_TYPE_WRITE) {
780f59c7639Shaad bcopy(dio->io_data, (char *)aio->io_data +
781f59c7639Shaad (dio->io_offset - aio->io_offset),
782f59c7639Shaad dio->io_size);
783c1cb2cd8Shaad }
784c1cb2cd8Shaad
785f59c7639Shaad zio_add_child(dio, aio);
786f59c7639Shaad vdev_queue_io_remove(vq, dio);
787f59c7639Shaad zio_vdev_io_bypass(dio);
788f59c7639Shaad zio_execute(dio);
7893227e6cfSchs } while (dio != last);
790c1cb2cd8Shaad
791c1cb2cd8Shaad return (aio);
792c1cb2cd8Shaad }
793c1cb2cd8Shaad
/*
 * Select and dequeue the next i/o to issue from this vdev queue, or return
 * NULL if no class is currently eligible.  May return an aggregate i/o
 * built by vdev_queue_aggregate().  Caller must hold vq->vq_lock; the lock
 * is dropped and re-taken internally when discarding NODATA i/os.
 */
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq)
{
	zio_t *zio, *aio;
	zio_priority_t p;
	avl_index_t idx;
	avl_tree_t *tree;
	zio_t *search;

again:
	ASSERT(MUTEX_HELD(&vq->vq_lock));

	p = vdev_queue_class_to_issue(vq);

	if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
		/* No eligible queued i/os */
		return (NULL);
	}

	/*
	 * For LBA-ordered queues (async / scrub), issue the i/o which follows
	 * the most recently issued i/o in LBA (offset) order.
	 *
	 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
	 */
	tree = vdev_queue_class_tree(vq, p);
	/*
	 * A zeroed search key (timestamp 0, offset just past the last issued
	 * i/o) is heap-allocated rather than stack-allocated because zio_t is
	 * too large for a kernel stack frame.  KM_NOSLEEP: we hold vq_lock.
	 */
	search = kmem_zalloc(sizeof (*search), KM_NOSLEEP);
	if (search) {
		search->io_offset = vq->vq_last_offset + 1;
		/* The key cannot match a real queued zio (timestamp 0). */
		VERIFY3P(avl_find(tree, search, &idx), ==, NULL);
		kmem_free(search, sizeof (*search));
		zio = avl_nearest(tree, idx, AVL_AFTER);
	} else {
		/* Can't find nearest, fallback to first */
		zio = NULL;
	}
	if (zio == NULL)
		zio = avl_first(tree);
	ASSERT3U(zio->io_priority, ==, p);

	aio = vdev_queue_aggregate(vq, zio);
	if (aio != NULL)
		zio = aio;
	else
		vdev_queue_io_remove(vq, zio);

	/*
	 * If the I/O is or was optional and therefore has no data, we need to
	 * simply discard it. We need to drop the vdev queue's lock to avoid a
	 * deadlock that we could encounter since this I/O will complete
	 * immediately.
	 */
	if (zio->io_flags & ZIO_FLAG_NODATA) {
		mutex_exit(&vq->vq_lock);
		zio_vdev_io_bypass(zio);
		zio_execute(zio);
		mutex_enter(&vq->vq_lock);
		goto again;
	}

	vdev_queue_pending_add(vq, zio);
	vq->vq_last_offset = zio->io_offset;

	return (zio);
}
859c1cb2cd8Shaad
860c1cb2cd8Shaad zio_t *
vdev_queue_io(zio_t * zio)861c1cb2cd8Shaad vdev_queue_io(zio_t *zio)
862c1cb2cd8Shaad {
863c1cb2cd8Shaad vdev_queue_t *vq = &zio->io_vd->vdev_queue;
864c1cb2cd8Shaad zio_t *nio;
865c1cb2cd8Shaad
866c1cb2cd8Shaad if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
867c1cb2cd8Shaad return (zio);
868c1cb2cd8Shaad
8693227e6cfSchs /*
8703227e6cfSchs * Children i/os inherent their parent's priority, which might
8713227e6cfSchs * not match the child's i/o type. Fix it up here.
8723227e6cfSchs */
8733227e6cfSchs if (zio->io_type == ZIO_TYPE_READ) {
8743227e6cfSchs if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
8753227e6cfSchs zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
8763227e6cfSchs zio->io_priority != ZIO_PRIORITY_SCRUB)
8773227e6cfSchs zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
8783227e6cfSchs } else if (zio->io_type == ZIO_TYPE_WRITE) {
8793227e6cfSchs if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
8803227e6cfSchs zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
8813227e6cfSchs zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
8823227e6cfSchs } else {
8833227e6cfSchs ASSERT(zio->io_type == ZIO_TYPE_FREE);
8843227e6cfSchs zio->io_priority = ZIO_PRIORITY_TRIM;
8853227e6cfSchs }
8863227e6cfSchs
887c1cb2cd8Shaad zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
888c1cb2cd8Shaad
889c1cb2cd8Shaad mutex_enter(&vq->vq_lock);
8903227e6cfSchs zio->io_timestamp = gethrtime();
891c1cb2cd8Shaad vdev_queue_io_add(vq, zio);
8923227e6cfSchs nio = vdev_queue_io_to_issue(vq);
893c1cb2cd8Shaad mutex_exit(&vq->vq_lock);
894c1cb2cd8Shaad
895c1cb2cd8Shaad if (nio == NULL)
896c1cb2cd8Shaad return (NULL);
897c1cb2cd8Shaad
898c1cb2cd8Shaad if (nio->io_done == vdev_queue_agg_io_done) {
899c1cb2cd8Shaad zio_nowait(nio);
900c1cb2cd8Shaad return (NULL);
901c1cb2cd8Shaad }
902c1cb2cd8Shaad
903c1cb2cd8Shaad return (nio);
904c1cb2cd8Shaad }
905c1cb2cd8Shaad
906c1cb2cd8Shaad void
vdev_queue_io_done(zio_t * zio)907c1cb2cd8Shaad vdev_queue_io_done(zio_t *zio)
908c1cb2cd8Shaad {
909c1cb2cd8Shaad vdev_queue_t *vq = &zio->io_vd->vdev_queue;
9103227e6cfSchs zio_t *nio;
911c1cb2cd8Shaad
912c1cb2cd8Shaad mutex_enter(&vq->vq_lock);
913c1cb2cd8Shaad
9143227e6cfSchs vdev_queue_pending_remove(vq, zio);
915c1cb2cd8Shaad
9163227e6cfSchs vq->vq_io_complete_ts = gethrtime();
9173227e6cfSchs
9183227e6cfSchs while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
919c1cb2cd8Shaad mutex_exit(&vq->vq_lock);
920c1cb2cd8Shaad if (nio->io_done == vdev_queue_agg_io_done) {
921c1cb2cd8Shaad zio_nowait(nio);
922c1cb2cd8Shaad } else {
923c1cb2cd8Shaad zio_vdev_io_reissue(nio);
924c1cb2cd8Shaad zio_execute(nio);
925c1cb2cd8Shaad }
926c1cb2cd8Shaad mutex_enter(&vq->vq_lock);
927c1cb2cd8Shaad }
928c1cb2cd8Shaad
929c1cb2cd8Shaad mutex_exit(&vq->vq_lock);
930c1cb2cd8Shaad }
9313227e6cfSchs
/*
 * As these three methods are only used for load calculations we're not
 * concerned if we get an incorrect value on 32bit platforms due to lack of
 * vq_lock mutex use here, instead we prefer to keep it lock free for
 * performance.
 */

/* Return the number of i/os currently active (issued, not yet done). */
int
vdev_queue_length(vdev_t *vd)
{
	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
}
9423227e6cfSchs
/*
 * Return the end offset of the most recently registered i/o; read lock-free
 * (see the load-calculation note above vdev_queue_length()).
 */
uint64_t
vdev_queue_lastoffset(vdev_t *vd)
{
	return (vd->vdev_queue.vq_lastoffset);
}
9483227e6cfSchs
/*
 * Record the end offset (offset + size) of 'zio' as the vdev's last offset,
 * for subsequent load/locality calculations; written lock-free.
 */
void
vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
{
	vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
}
954