From e949c5e91d6a986541a6dd39467183aa9b04141c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 18 Oct 2023 15:06:56 +1100 Subject: [PATCH] vdev_disk: be smarter about sizing the bio array With the max segments for the bio now generally much smaller, and varying depending on the device/driver characteristics, the intial array size of 16 is totally inadequate. The main reason for getting this right is so we don't have to throw away all the bios built so far and restart. However there's no particular reason to do it this way; the bios constructed so far are perfectly fine; we just ran out of room to track them. So this commit changes it so if we run out of room in the bio array, instead of throwing away the contents entirely, we just create a larger one and copy the existing bios into it, and keep going. Finally, to decide the right initial size for the bio array, we take a guess at the likely size by dividing the io size by the page size by the max device segments. This is likely right most of the time, and if turns out to not be enough, we simply expand it and keep going. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. --- module/os/linux/zfs/vdev_disk.c | 95 ++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 30 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 80195f95bb93..2b82c6880f19 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -448,10 +448,12 @@ vdev_disk_close(vdev_t *v) } static dio_request_t * -vdev_disk_dio_alloc(int bio_count) +vdev_disk_dio_alloc(zio_t *zio, int bio_count) { dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); + + dr->dr_zio = zio; atomic_set(&dr->dr_ref, 0); dr->dr_bio_count = bio_count; dr->dr_error = 0; @@ -462,6 +464,26 @@ vdev_disk_dio_alloc(int bio_count) return (dr); } +static dio_request_t * +vdev_disk_dio_expand(dio_request_t *dr) +{ + dio_request_t *ndr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count * 2, KM_SLEEP); + + ndr->dr_zio = dr->dr_zio; + atomic_set(&ndr->dr_ref, atomic_read(&dr->dr_ref)); + ndr->dr_bio_count = dr->dr_bio_count * 2; + ndr->dr_error = 0; + + for (int i = 0; i < ndr->dr_bio_count; i++) + ndr->dr_bio[i] = (i < dr->dr_bio_count) ? dr->dr_bio[i] : NULL; + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); + + return (ndr); +} + static void vdev_disk_dio_free(dio_request_t *dr) { @@ -705,7 +727,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, uint64_t abd_offset; uint64_t bio_offset; int bio_size; - int bio_count = 16; + int bio_count; int error = 0; struct blk_plug plug; unsigned short nr_vecs; @@ -722,27 +744,45 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, return (SET_ERROR(EIO)); } -retry: - dr = vdev_disk_dio_alloc(bio_count); - if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && zio->io_vd->vdev_failfast == B_TRUE) { bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); } - dr->dr_zio = zio; - /* - * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which - * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio - * can cover at least 128KB and at most 1MB. When the required number - * of iovec's exceeds this, we are forced to break the IO in multiple - * bio's and wait for them all to complete. This is likely if the - * recordsize property is increased beyond 1MB. The default - * bio_count=16 should typically accommodate the maximum-size zio of - * 16MB. + * We create each bio with the maximum number of iovecs the + * driver/device can support, up to the kernel maximum + * (bio_max_segs(N)=256). This varies by device type, but 128 is common + * for SCSI disks, and 64 for NVMe devices. + * + * Each iovec can carry at least 512 and at most PAGESIZE bytes + * (typically 4K). This means that a single bio can cover at most 1MB + * of data, but usually significantly less. + * + * If the kernel is passed a bio that has more iovecs than the + * driver/device can support, it will split it into multiple bios. Due + * to complications in the way we allocated IO memory and load it into + * the bio, splits can cause a bio to be not properly aligned to the + * block size, which some devices reject. We try to avoid this + * happening by never passing a bio with more iovecs than the + * driver/device can handle. + * + * All this means that in many cases, a single zio will require + * multiple bios to service. Each bio adds a tiny bit of overhead to + * the overall zio, as we have to wait for them all to complete. + * + * We track the set of bios for this zio in a single dio_request_t, + * which we use to count the the bios as they complete, and so know + * when we have them all and so can finish the operation and complete + * the zio. We have to allocate enough room to keep the bio pointers. + * We guess how many bios we will need by simple division, but if there + * are short blocks, or split pages, we may end up needing more. If + * this happens, we just expand the array and keep going. */ + nr_vecs = vdev_bio_max_segs(bdev); + bio_count = ((((io_size / PAGESIZE)+1) / nr_vecs)+1); + dr = vdev_disk_dio_alloc(zio, bio_count); abd_offset = 0; bio_offset = io_offset; @@ -753,17 +793,10 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, if (bio_size <= 0) break; - /* - * If additional bio's are required, we have to retry, but - * this should be rare - see the comment above. - */ - if (dr->dr_bio_count == i) { - vdev_disk_dio_free(dr); - bio_count *= 2; - goto retry; - } + /* If additional bio's are required, expand tracking space */ + if (dr->dr_bio_count == i) + dr = vdev_disk_dio_expand(dr); - nr_vecs = vdev_bio_max_segs(bdev); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); @@ -774,9 +807,6 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, vdev_disk_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; - dr->dr_bio[i]->bi_private = dr; - bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, @@ -795,8 +825,13 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, /* Submit all bio's associated with this dio */ for (int i = 0; i < dr->dr_bio_count; i++) { - if (dr->dr_bio[i]) - vdev_submit_bio(dr->dr_bio[i]); + struct bio *bio = dr->dr_bio[i]; + if (bio) { + bio->bi_end_io = vdev_disk_physio_completion; + bio->bi_private = dr; + bio_set_op_attrs(bio, rw, flags); + vdev_submit_bio(bio); + } } if (dr->dr_bio_count > 1)