diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index 6c0e46460835..1d7ef509a4f4 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -13,6 +13,21 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ error = fault_in_iov_iter_readable(&iter, size); ]) + ZFS_LINUX_TEST_SRC([iov_iter_extract_user_pages], [ + #include <linux/uio.h> + ], [ + struct iov_iter iter = { 0 }; + struct page **pages = NULL; + size_t maxsize = 4096; + unsigned maxpages = 1; + iov_iter_extraction_t extraction_flags = 0; + size_t offset; + size_t ret __attribute__ ((unused)); + + ret = iov_iter_extract_pages(&iter, &pages, maxsize, + maxpages, extraction_flags, &offset); + ]) + ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [ #include <linux/uio.h> ], [ @@ -70,17 +85,33 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ ]) dnl # - dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_page_pages2(). + dnl # Kernel 6.3 provides iov_iter_extract_user_pages(), which calls + dnl # pin_user_pages_fast(). pin_user_pages should be used for Direct + dnl # I/O requests. dnl # - AC_MSG_CHECKING([whether iov_iter_get_pages2() is available]) - ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [ + AC_MSG_CHECKING([whether iov_iter_extract_user_pages() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_extract_user_pages], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1, - [iov_iter_get_pages2() is available]) + AC_DEFINE(HAVE_IOV_ITER_EXTRACT_USER_PAGES, 1, + [iov_iter_extract_user_pages() is available]) ],[ AC_MSG_RESULT(no) + + dnl # + dnl # Kernel 6.0 changed iov_iter_get_pages() to + dnl # iov_iter_get_pages2(). + dnl # + AC_MSG_CHECKING([whether iov_iter_get_pages2() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1, + [iov_iter_get_pages2() is available]) + ],[ + AC_MSG_RESULT(no) + ]) ]) + dnl # dnl # This checks for iov_iter_type() in linux/uio.h.
It is not dnl # required, however, and the module will compiled without it diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index ed11f8b63fbf..4ce6433953cb 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -446,6 +446,15 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) { ASSERT3P(uio->uio_dio.pages, !=, NULL); +#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES) + /* + * If user pages were pinned through iov_iter_extract_user_pages(), + * then unpin_user_pages() will take care of correctly handling the + * zero page references. + */ + return; +#endif + for (long i = 0; i < uio->uio_dio.npages; i++) { struct page *p = uio->uio_dio.pages[i]; lock_page(p); @@ -453,9 +462,12 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) if (IS_ZERO_PAGE(p)) { /* * If the user page points the kernels ZERO_PAGE() a - * new zero filled page will just be allocated so the - * contents of the page can not be changed by the user - * while a Direct I/O write is taking place. + * new zero filled page will just be allocated. This is + * required as calling put_page() on the zero page is + * not allowed. This also has the side effect of + * protecting the contents of the page from changing + * during I/O if the user is manipulating the page + * contents.
*/ gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO | __GFP_ZERO | GFP_KERNEL; @@ -480,6 +492,9 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) ASSERT(uio->uio_extflg & UIO_DIRECT); ASSERT3P(uio->uio_dio.pages, !=, NULL); +#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES) + unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages); +#else for (long i = 0; i < uio->uio_dio.npages; i++) { struct page *p = uio->uio_dio.pages[i]; @@ -491,13 +506,55 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) put_page(p); } - +#endif vmem_free(uio->uio_dio.pages, uio->uio_dio.npages * sizeof (struct page *)); } +#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES) +static int +zfs_uio_extract_pages(zfs_uio_t *uio) +{ + size_t wanted = uio->uio_resid - uio->uio_skip; + ssize_t rollback = 0; + unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); + + ASSERT3B(iov_iter_extract_will_pin(uio->uio_iter), ==, B_TRUE); + while (wanted) { + ssize_t len; + size_t offset; + + struct page **pages = &uio->uio_dio.pages[uio->uio_dio.npages]; + /* + * Currently just passing 0 for the iov_iter_extraction_t flag. + * This could at some point be leveraged though for P2PDMA by + * passing ITER_ALLOW_P2PDMA instead. + */ + len = iov_iter_extract_pages(uio->uio_iter, &pages, wanted, + maxpages, 0, &offset); + if (len < 0) { + iov_iter_revert(uio->uio_iter, rollback); + return (SET_ERROR(-len)); + } + /* + * All Direct I/O requests are page aligned, so the offset into + * the first page must be zero.
+ */ + ASSERT0(offset); + uio->uio_dio.npages += DIV_ROUND_UP(len, PAGE_SIZE); + maxpages -= DIV_ROUND_UP(len, PAGE_SIZE); + rollback += len; + wanted -= len; + } + ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip); + iov_iter_revert(uio->uio_iter, rollback); + + return (0); +} +#endif + static int -zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) +zfs_uio_get_pages(zfs_uio_t *uio) { size_t skip = uio->uio_skip; size_t wanted = uio->uio_resid - uio->uio_skip; @@ -525,7 +582,8 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) skip = 0; #if !defined(HAVE_IOV_ITER_GET_PAGES2) /* - * iov_iter_get_pages2() advances the iov_iter on success. + * iov_iter_extract_user_pages() and iov_iter_get_pages2() + * advance the iov_iter on success. */ iov_iter_advance(uio->uio_iter, cnt); #endif @@ -552,7 +610,12 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) if (uio->uio_segflg == UIO_ITER) { uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); +#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES) + error = zfs_uio_extract_pages(uio); +#else + error = zfs_uio_get_pages(uio); +#endif + } else { return (SET_ERROR(EOPNOTSUPP)); } @@ -560,8 +623,12 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) ASSERT3S(uio->uio_dio.npages, >=, 0); if (error) { +#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES) + unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages); +#else for (long i = 0; i < uio->uio_dio.npages; i++) put_page(uio->uio_dio.pages[i]); +#endif vmem_free(uio->uio_dio.pages, size); return (error); } else {