Skip to content

Commit

Permalink
Use DNODE_FIND_DIRTY for SEEK_HOLE/SEEK_DATA.
Browse files Browse the repository at this point in the history
There is no need to worry about txg syncs anymore, so remove
dmu_offset_next_sync and dnode_is_dirty too.

This should be transparent to userland except that holes created by
compression do not become visible until after the next txg sync.

mmap_seek_001_pos is updated to force a sync to match this case.

Signed-off-by: Robert Evans <[email protected]>
  • Loading branch information
rrevans committed Jan 8, 2025
1 parent a765b1d commit 0027d2a
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 83 deletions.
1 change: 0 additions & 1 deletion include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,6 @@ boolean_t dnode_add_ref(dnode_t *dn, const void *ref);
void dnode_rele(dnode_t *dn, const void *ref);
void dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting);
int dnode_try_claim(objset_t *os, uint64_t object, int slots);
boolean_t dnode_is_dirty(dnode_t *dn);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
Expand Down
49 changes: 4 additions & 45 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,6 @@ static int zfs_nopwrite_enabled = 1;
*/
static uint_t zfs_per_txg_dirty_frees_percent = 30;

/*
* Enable/disable forcing txg sync when dirty checking for holes with lseek().
* By default this is enabled to ensure accurate hole reporting, but it can result
* in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads.
* Disabling this option will result in holes never being reported in dirty
* files which is always safe.
*/
static int zfs_dmu_offset_next_sync = 1;

/*
* Limit the amount we can prefetch with one call to this amount. This
* helps to limit the amount of memory that can be used by prefetching.
Expand Down Expand Up @@ -2496,45 +2487,16 @@ int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
int restarted = 0, err;
int err;

restart:
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);

rw_enter(&dn->dn_struct_rwlock, RW_READER);

if (dnode_is_dirty(dn)) {
/*
* If the zfs_dmu_offset_next_sync module option is enabled
* then hole reporting has been requested. Dirty dnodes
* must be synced to disk to accurately report holes.
*
* Provided a RL_READER rangelock spanning 0-UINT64_MAX is
* held by the caller only a single restart will be required.
* We tolerate callers which do not hold the rangelock by
* returning EBUSY and not reporting holes after one restart.
*/
if (zfs_dmu_offset_next_sync) {
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
err = dnode_next_offset(dn,
DNODE_FIND_DIRTY | (hole ? DNODE_FIND_HOLE : 0),
off, 1, 1, 0);

if (restarted)
return (SET_ERROR(EBUSY));

txg_wait_synced(dmu_objset_pool(os), 0);
restarted = 1;
goto restart;
}

err = SET_ERROR(EBUSY);
} else {
err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK |
(hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
}

rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);

return (err);
Expand Down Expand Up @@ -2939,9 +2901,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW,
"Percentage of dirtied blocks from frees in one TXG");

ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
"Enable forcing txg sync to find holes");

ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
"Limit one prefetch call to this size");

Expand Down
28 changes: 0 additions & 28 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1815,34 +1815,6 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
slots, NULL, NULL));
}

/*
 * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
 * It is important to check both conditions, as some operations (eg appending
 * to a file) can dirty both as a single logical unit, but they are not synced
 * out atomically, so checking one and not the other can result in an object
 * appearing to be clean mid-way through a commit.
 *
 * Do not change this lightly! If you get it wrong, dmu_offset_next() can
 * detect a hole where there is really data, leading to silent corruption.
 */
boolean_t
dnode_is_dirty(dnode_t *dn)
{
/* dn_mtx protects both the per-txg dirty links and the record lists. */
mutex_enter(&dn->dn_mtx);

/*
 * Scan every open txg slot: the dnode is dirty if it is linked on any
 * objset dirty list OR has any pending dirty records in that txg.
 */
for (int i = 0; i < TXG_SIZE; i++) {
if (multilist_link_active(&dn->dn_dirty_link[i]) ||
!list_is_empty(&dn->dn_dirty_records[i])) {
/* Drop the lock before returning on the first dirty slot found. */
mutex_exit(&dn->dn_mtx);
return (B_TRUE);
}
}

mutex_exit(&dn->dn_mtx);

/* No dirty state found in any txg slot. */
return (B_FALSE);
}

void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
Expand Down
3 changes: 3 additions & 0 deletions tests/zfs-tests/cmd/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree


scripts_zfs_tests_bin_PROGRAMS += %D%/mmap_exec %D%/mmap_seek %D%/mmap_sync %D%/mmapwrite %D%/readmmap
%C%_mmap_seek_LDADD = \
libzfs_core.la \
libnvpair.la
%C%_mmapwrite_LDADD = -lpthread

if WANT_MMAP_LIBAIO
Expand Down
27 changes: 25 additions & 2 deletions tests/zfs-tests/cmd/mmap_seek.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#ifdef __linux__
#include <linux/fs.h>
#endif
#include <libnvpair.h>
#include <libzfs_core.h>

/* some older uClibc's lack the defines, so we'll manually define them */
#ifdef __UCLIBC__
Expand Down Expand Up @@ -67,6 +69,18 @@ seek_hole(int fd, off_t offset, off_t expected)
}
}

/*
 * Force a txg sync of the named pool via lzc_sync() so that recently
 * written data is committed (e.g. so holes produced by compression become
 * visible to SEEK_HOLE/SEEK_DATA).  Exits with status 2 on failure.
 */
static void
sync_pool(const char *pool_name)
{
	nvlist_t *innvl = fnvlist_alloc();
	int err;

	/* "force" requests a sync even when there is no dirty data. */
	fnvlist_add_boolean_value(innvl, "force", B_TRUE);
	err = lzc_sync(pool_name, innvl, NULL);
	nvlist_free(innvl);
	if (err != 0) {
		/*
		 * lzc_sync() returns an errno-style code directly; it does
		 * not reliably set errno, so perror() would print the wrong
		 * message here.  Report the returned code instead.
		 */
		(void) fprintf(stderr, "lzc_sync: %s\n", strerror(err));
		exit(2);
	}
}

int
main(int argc, char **argv)
{
Expand All @@ -75,12 +89,14 @@ main(int argc, char **argv)
char *buf = NULL;
int err;

if (argc != 4) {
if (argc != 5) {
(void) printf("usage: %s <file name> <file size> "
"<block size>\n", argv[0]);
"<block size> <pool name>\n", argv[0]);
exit(1);
}

(void) libzfs_core_init();

int fd = open(file_path, O_RDWR | O_CREAT, 0666);
if (fd == -1) {
(void) fprintf(stderr, "%s: %s: ", execname, file_path);
Expand All @@ -90,6 +106,7 @@ main(int argc, char **argv)

off_t file_size = atoi(argv[2]);
off_t block_size = atoi(argv[3]);
const char *pool_name = argv[4];

if (block_size * 2 > file_size) {
(void) fprintf(stderr, "file size must be at least "
Expand Down Expand Up @@ -143,6 +160,12 @@ main(int argc, char **argv)

/* Punch a hole (requires compression to be enabled). */
memset(buf + block_size, 0, block_size);
err = msync(buf, file_size, MS_SYNC);
if (err == -1) {
perror("msync");
exit(2);
}
sync_pool(pool_name);
seek_data(fd, 0, 0);
seek_data(fd, block_size, 2 * block_size);
seek_hole(fd, 0, block_size);
Expand Down
1 change: 0 additions & 1 deletion tests/zfs-tests/include/tunables.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode
DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms
DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms
DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check
DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync
EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms
INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size
INITIALIZE_VALUE initialize_value zfs_initialize_value
Expand Down
7 changes: 1 addition & 6 deletions tests/zfs-tests/tests/functional/mmap/mmap_seek_001_pos.ksh
Original file line number Diff line number Diff line change
Expand Up @@ -43,24 +43,19 @@ function cleanup
log_must zfs set compression=off $TESTPOOL/$TESTFS
log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
log_must rm -f $TESTDIR/test-mmap-file
log_must set_tunable64 DMU_OFFSET_NEXT_SYNC $dmu_offset_next_sync
}

log_assert "lseek() data/holes for an mmap()'d file."

log_onexit cleanup

# Enable hole reporting for dirty files.
typeset dmu_offset_next_sync=$(get_tunable DMU_OFFSET_NEXT_SYNC)
log_must set_tunable64 DMU_OFFSET_NEXT_SYNC 1

# Compression must be enabled to convert zero'd blocks to holes.
# This behavior is checked by the mmap_seek test.
log_must zfs set compression=on $TESTPOOL/$TESTFS

for bs in 4096 8192 16384 32768 65536 131072; do
log_must zfs set recordsize=$bs $TESTPOOL/$TESTFS
log_must mmap_seek $TESTDIR/test-mmap-file $((1024*1024)) $bs
log_must mmap_seek $TESTDIR/test-mmap-file $((1024*1024)) $bs $TESTPOOL
log_must rm $TESTDIR/test-mmap-file
done

Expand Down

0 comments on commit 0027d2a

Please sign in to comment.