Skip to content

Commit

Permalink
Adding iov_iter_extract_user_pages
Browse files Browse the repository at this point in the history
Starting with Linux kernel 6.3, the iov_iter API added
iov_iter_extract_user_pages(). According to the kernel documentation,
Direct I/O requests should use the pin_user_pages* interfaces. Since the
pages for Direct I/O are pinned through the iov_iter API in the UIO code,
the pin_user_pages* interfaces can now be leveraged by calling
iov_iter_extract_user_pages(). The Linux UIO code was updated to
leverage these new interfaces, which keeps the Direct I/O code up to date
with the latest kernel APIs.

Signed-off-by: Brian Atkinson <[email protected]>
  • Loading branch information
bwatkinson committed Dec 3, 2024
1 parent 45f796b commit c21806d
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 12 deletions.
41 changes: 36 additions & 5 deletions config/kernel-vfs-iov_iter.m4
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,21 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [
error = fault_in_iov_iter_readable(&iter, size);
])
dnl #
dnl # Compile probe for iov_iter_extract_user_pages(). The probe only has
dnl # to build; the result is consumed by the matching
dnl # ZFS_LINUX_TEST_RESULT([iov_iter_extract_user_pages]) check. The
dnl # function name must be spelled exactly, otherwise the probe always
dnl # fails and HAVE_IOV_ITER_EXTRACT_USER_PAGES is never defined.
dnl #
ZFS_LINUX_TEST_SRC([iov_iter_extract_user_pages], [
	#include <linux/uio.h>
], [
	struct iov_iter iter = { 0 };
	struct page **pages = NULL;
	size_t maxsize = 4096;
	unsigned maxpages = 1;
	iov_iter_extraction_t extraction_flags = 0;
	size_t offset;
	size_t ret __attribute__ ((unused));
	ret = iov_iter_extract_user_pages(&iter, &pages, maxsize,
	    maxpages, extraction_flags, &offset);
])
ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [
#include <linux/uio.h>
], [
Expand Down Expand Up @@ -70,17 +85,33 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [
])
dnl #
dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_get_pages2().
dnl # Kernel 6.3 provides iov_iter_extract_user_pages(), which calls
dnl # pin_user_pages_fast(). pin_user_pages should be used for Direct
dnl # I/O requests.
dnl #
AC_MSG_CHECKING([whether iov_iter_get_pages2() is available])
ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [
AC_MSG_CHECKING([whether iov_iter_extract_user_pages() is available])
ZFS_LINUX_TEST_RESULT([iov_iter_extract_user_pages], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1,
[iov_iter_get_pages2() is available])
AC_DEFINE(HAVE_IOV_ITER_EXTRACT_USER_PAGES, 1,
[iov_iter_extract_user_pages() is available])
],[
AC_MSG_RESULT(no)
dnl #
dnl # Kernel 6.0 changed iov_iter_get_pages() to
dnl # iov_iter_get_pages2().
dnl #
AC_MSG_CHECKING([whether iov_iter_get_pages2() is available])
ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1,
[iov_iter_get_pages2() is available])
],[
AC_MSG_RESULT(no)
])
])
dnl #
dnl # This checks for iov_iter_type() in linux/uio.h. It is not
dnl # required, however, and the module will compiled without it
Expand Down
81 changes: 74 additions & 7 deletions module/os/linux/zfs/zfs_uio.c
Original file line number Diff line number Diff line change
Expand Up @@ -446,16 +446,28 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
ASSERT3P(uio->uio_dio.pages, !=, NULL);

#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES)
/*
* If user pages were pinned through iov_iter_extract_user_pages(),
* then unpin_user_pages() will take care of correctly handling the
* zero page references.
*/
return;
#endif

for (long i = 0; i < uio->uio_dio.npages; i++) {
struct page *p = uio->uio_dio.pages[i];
lock_page(p);

if (IS_ZERO_PAGE(p)) {
/*
* If the user page points the kernels ZERO_PAGE() a
* new zero filled page will just be allocated so the
* contents of the page can not be changed by the user
* while a Direct I/O write is taking place.
* new zero filled page will just be allocated. This is
* required as calling put_page() on the zero page is
* not allowed. This also has the side effect of
* protecting the contents of the page from changing
* during I/O if the user is manipulating the page
* contents.
*/
gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
__GFP_ZERO | GFP_KERNEL;
Expand All @@ -480,6 +492,9 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
ASSERT(uio->uio_extflg & UIO_DIRECT);
ASSERT3P(uio->uio_dio.pages, !=, NULL);

#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES)
unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#else
for (long i = 0; i < uio->uio_dio.npages; i++) {
struct page *p = uio->uio_dio.pages[i];

Expand All @@ -491,13 +506,55 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)

put_page(p);
}

#endif
vmem_free(uio->uio_dio.pages,
uio->uio_dio.npages * sizeof (struct page *));
}

#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES)
/*
 * Pin the user pages backing a Direct I/O request through
 * iov_iter_extract_user_pages() and store them in uio->uio_dio.pages,
 * appending at index uio->uio_dio.npages.
 *
 * Extraction repeats until (uio_resid - uio_skip) bytes are covered. The
 * bytes consumed from uio->uio_iter are tracked in "rollback" and the
 * iterator is reverted by that amount before returning, on both the
 * success and the error path.
 *
 * Returns 0 on success or a positive errno. On failure uio_dio.npages
 * still counts the pages pinned before the error so that the caller can
 * unpin them.
 *
 * NOTE(review): confirm that iov_iter_extract_user_pages() is declared
 * in <linux/uio.h> on the targeted kernels; the configure probe for
 * HAVE_IOV_ITER_EXTRACT_USER_PAGES depends on it.
 */
static int
zfs_uio_extract_pages(zfs_uio_t *uio)
{
	size_t wanted = uio->uio_resid - uio->uio_skip;
	ssize_t rollback = 0;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	ASSERT3B(iov_iter_extract_will_pin(uio->uio_iter), ==, B_TRUE);
	while (wanted) {
		ssize_t len;
		size_t offset;
		unsigned nr_pages;
		struct page **pages = &uio->uio_dio.pages[uio->uio_dio.npages];

		/*
		 * Currently just passing 0 for the iov_iter_extraction_t flag.
		 * This could at some point be leveraged though for P2PDMA by
		 * passing ITER_ALLOW_P2PDMA instead.
		 */
		len = iov_iter_extract_user_pages(uio->uio_iter, &pages,
		    wanted, maxpages, 0, &offset);
		if (len <= 0) {
			/*
			 * A negative value is an errno. A zero length means
			 * no forward progress was made; treat it as a fault
			 * rather than spinning in this loop forever.
			 */
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(len < 0 ? -len : EFAULT));
		}
		/*
		 * All Direct I/O request are page aligned, so the offset into
		 * the first page must be zero.
		 */
		ASSERT0(offset);

		/*
		 * Only the pages pinned by this call reduce the remaining
		 * page budget; uio_dio.npages is the running total.
		 */
		nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
		uio->uio_dio.npages += nr_pages;
		maxpages -= nr_pages;
		rollback += len;
		wanted -= len;
	}
	ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}
#endif

static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
zfs_uio_get_pages(zfs_uio_t *uio)
{
size_t skip = uio->uio_skip;
size_t wanted = uio->uio_resid - uio->uio_skip;
Expand Down Expand Up @@ -525,7 +582,8 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
skip = 0;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
/*
* iov_iter_get_pages2() advances the iov_iter on success.
* iov_iter_extract_user_pages() and iov_iter_get_pages2()
* advance the iov_iter on success.
*/
iov_iter_advance(uio->uio_iter, cnt);
#endif
Expand All @@ -552,16 +610,25 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)

if (uio->uio_segflg == UIO_ITER) {
uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES)
error = zfs_uio_extract_pages(uio);
#else
error = zfs_uio_get_pages(uio);
#endif

} else {
return (SET_ERROR(EOPNOTSUPP));
}

ASSERT3S(uio->uio_dio.npages, >=, 0);

if (error) {
#if defined(HAVE_IOV_ITER_EXTRACT_USER_PAGES)
unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#else
for (long i = 0; i < uio->uio_dio.npages; i++)
put_page(uio->uio_dio.pages[i]);
#endif
vmem_free(uio->uio_dio.pages, size);
return (error);
} else {
Expand Down

0 comments on commit c21806d

Please sign in to comment.