From caca3ebbb0902d8b17f8a27e006d3dda2f06237b Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 18 Jan 2024 13:52:19 -0700 Subject: [PATCH] Fixing failed ASSERT in dbuf_unoverride() 995734e added a test for block cloning with mmap files. As a result I began hitting a panic in that test in dbuf_unoverride(). The was if the dirty record was from a cloned block, then the dr_data must be set to NULL. This ASSERT was added in 86e115e. The point of that commit was to make sure that if a cloned block is read before it is synced out, then the associated ARC buffer is set in the dirty record. This became an issue with the O_DIRECT code, because dr_data was set to the ARC buf in dbuf_set_data() after the read. This is the incorrect logic though for the cloned block. In order to fix this issue, I refined how to determine if the dirty record is in fact from a O_DIRECT write by make sure that dr_brtwrite is false. I created the function dbuf_dirty_is_direct_write() to perform the proper check. As part of this, I also cleaned up other code that did the exact same check for an O_DIRECT write to make sure the proper check is taking place everywhere. The trace of the ASSERT that was being tripped before this change is below: [3649972.811039] VERIFY0P(dr->dt.dl.dr_data) failed (NULL == ffff8d58e8183c80) [3649972.817999] PANIC at dbuf.c:1999:dbuf_unoverride() [3649972.822968] Showing stack for process 2365657 [3649972.827502] CPU: 0 PID: 2365657 Comm: clone_mmap_writ Kdump: loaded Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [3649972.839749] Hardware name: GIGABYTE R272-Z32-00/MZ32-AR0-00, BIOS R21 10/08/2020 [3649972.847315] Call Trace: [3649972.849935] dump_stack+0x41/0x60 [3649972.853428] spl_panic+0xd0/0xe8 [spl] [3649972.857370] ? cityhash4+0x75/0x90 [zfs] [3649972.861649] ? _cond_resched+0x15/0x30 [3649972.865577] ? spl_kmem_alloc_impl+0xce/0xf0 [spl] [3649972.870548] ? __kmalloc_node+0x10d/0x300 [3649972.874735] ? spl_kmem_alloc_impl+0xce/0xf0 [spl] [3649972.879702] ? __list_add+0x12/0x30 [zfs] [3649972.884061] dbuf_unoverride+0x1c1/0x1d0 [zfs] [3649972.888856] dbuf_redirty+0x3b/0xd0 [zfs] [3649972.893204] dbuf_dirty+0xeb1/0x1330 [zfs] [3649972.897643] ? _cond_resched+0x15/0x30 [3649972.901569] ? mutex_lock+0xe/0x30 [3649972.905148] ? dbuf_noread+0x117/0x240 [zfs] [3649972.909760] dmu_write_uio_dnode+0x1d2/0x320 [zfs] [3649972.914900] dmu_write_uio_dbuf+0x47/0x60 [zfs] [3649972.919777] zfs_write+0x57d/0xe00 [zfs] [3649972.924076] ? alloc_set_pte+0xb8/0x3e0 [3649972.928088] zpl_iter_write_buffered+0xb2/0x120 [zfs] [3649972.933507] ? rrw_exit+0xc6/0x200 [zfs] [3649972.937796] zpl_iter_write+0xba/0x110 [zfs] [3649972.942433] new_sync_write+0x112/0x160 [3649972.946445] vfs_write+0xa5/0x1a0 [3649972.949935] ksys_pwrite64+0x61/0xa0 [3649972.953681] do_syscall_64+0x5b/0x1a0 [3649972.957519] entry_SYSCALL_64_after_hwframe+0x65/0xca [3649972.962745] RIP: 0033:0x7f610616f01b Signed-off-by: Brian Atkinson --- include/sys/dbuf.h | 14 ++++++++++++++ module/zfs/dbuf.c | 12 ++++++++---- module/zfs/dmu_direct.c | 4 +--- module/zfs/zfs_vnops.c | 3 +-- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index da223e9bbb2e..703ba5da1c8e 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -476,6 +476,20 @@ dbuf_get_dirty_direct(dmu_buf_impl_t *db) return (list_head(&db->db_dirty_records)); } +static inline boolean_t +dbuf_dirty_is_direct_write(dbuf_dirty_record_t *dr) +{ + boolean_t ret = B_FALSE; + + if (dr != NULL && dr->dr_dbuf->db_level == 0 && + !dr->dt.dl.dr_brtwrite && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN && + dr->dt.dl.dr_data == NULL) { + ret = B_TRUE; + } + return (ret); +} + #define DBUF_GET_BUFC_TYPE(_db) \ (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index a9fe1ded82ad..2b44ad6126df 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1292,9 +1292,7 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) * If there is a Direct I/O, set its data too. Then its state * will be the same as if we did a ZIL dmu_sync(). */ - if (dr_dio != NULL && db->db_level == 0 && - dr_dio->dt.dl.dr_override_state == DR_OVERRIDDEN && - dr_dio->dt.dl.dr_data == NULL) { + if (dbuf_dirty_is_direct_write(dr_dio)) { dr_dio->dt.dl.dr_data = db->db_buf; } @@ -2218,9 +2216,15 @@ dbuf_redirty(dbuf_dirty_record_t *dr) } /* * If initial dirty was via Direct I/O, may not have a dr_data. + * + * If the dirty record was associated with cloned block then + * the call above to dbuf_unoverride() will have reset + * dr->dt.dl.dr_data and it will not be NULL here. */ - if (dr->dt.dl.dr_data == NULL) + if (dr->dt.dl.dr_data == NULL) { + ASSERT3B(dbuf_dirty_is_direct_write(dr), ==, B_TRUE); dr->dt.dl.dr_data = db->db_buf; + } } } diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index 584ec8f33598..d618d27d18fc 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -194,9 +194,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) */ mutex_enter(&db->db_mtx); dr_head = dbuf_find_dirty_eq(db, dmu_tx_get_txg(tx)); - if (dr_head && dr_head->dt.dl.dr_override_state == DR_OVERRIDDEN && - dr_head->dt.dl.dr_data == NULL && - !dr_head->dt.dl.dr_brtwrite) { + if (dbuf_dirty_is_direct_write(dr_head)) { dmu_buf_undirty(db, tx); } mutex_exit(&db->db_mtx); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index ade2b927a3ee..2e154a1759d9 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1172,8 +1172,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * All Direct I/O writes will have already completed and the * block pointer can be immediately stored in the log record. */ - if (dr != NULL && dr->dt.dl.dr_data == NULL && - dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + if (dbuf_dirty_is_direct_write(dr)) { lr->lr_blkptr = dr->dt.dl.dr_overridden_by; zfs_get_done(zgd, 0); return (0);