HansKristian-Work · HansKristian-Work · Aug 20, 2024 · Aug 20, 2024 · Aug 21, 2024 · Aug 21, 2024
diff --git a/include/vkd3d.h b/include/vkd3d.h
@@ -106,6 +106,7 @@ extern "C" {
 #define VKD3D_CONFIG_FLAG_DRIVER_VERSION_SENSITIVE_SHADERS (1ull << 48)
 #define VKD3D_CONFIG_FLAG_SMALL_VRAM_REBAR (1ull << 49)
 #define VKD3D_CONFIG_FLAG_STAGGERED_SUBMIT (1ull << 50)
+#define VKD3D_CONFIG_FLAG_ASYNC_PRESENT (1ull << 51)
 
 struct vkd3d_instance;
 

diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
@@ -5076,6 +5076,15 @@ static void d3d12_command_list_track_resource_usage(struct d3d12_command_list *l
         transition.resource.perform_initial_transition = perform_initial_transition;
         d3d12_command_list_add_transition(list, &transition);
     }
+
+    /* We're guaranteed to observe first use and last use of a swapchain image.
+     * COMMON/PRESENT state must be transitioned out of, and swapchain image must transition into PRESENT.
+     * We also track any case with rendering. UAVs are banned, so the only potential case is sampling bindless
+     * straight out of COMMON, but it's unclear if that is legal in D3D12, since DXGI swapchains have implicit
+     * sync semantics to begin with. It would also be a moot scenario from a sync PoV, since it would be read-after-read,
+     * so there is no hazard to worry about. */
+    if (resource->flags & VKD3D_RESOURCE_SWAP_CHAIN_IMPLICIT_SYNC)
+        list->implicit_sync_mask |= 1u << resource->swap_chain_implicit_sync_index;
 }
 
 static void d3d12_command_list_track_query_heap(struct d3d12_command_list *list,
@@ -5674,6 +5683,7 @@ static void d3d12_command_list_reset_internal_state(struct d3d12_command_list *l
     list->wbi_batch.batch_len = 0;
     list->query_resolve_count = 0;
     list->submit_allocator = NULL;
+    list->implicit_sync_mask = 0;
 
     d3d12_command_list_clear_rtas_batch(list);
 }
@@ -17292,6 +17302,7 @@ static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12Comm
 
     sub.execute.debug_capture = false;
     sub.execute.split_submission = false;
+    sub.execute.implicit_sync_mask = 0;
 
     num_transitions = 0;
 
@@ -17355,6 +17366,8 @@ static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12Comm
             sub.execute.split_submission = true;
         }
 
+        sub.execute.implicit_sync_mask |= cmd_list->implicit_sync_mask;
+
 #ifdef VKD3D_ENABLE_BREADCRUMBS
         if (breadcrumb_indices)
             breadcrumb_indices[i] = cmd_list->breadcrumb_context_index;
@@ -17705,9 +17718,12 @@ static void d3d12_command_queue_signal(struct d3d12_command_queue *command_queue
     VkSemaphoreSubmitInfo signal_semaphore_info;
     struct vkd3d_queue *vkd3d_queue;
     struct d3d12_device *device;
+    VkSemaphoreSignalInfo sig;
     VkSubmitInfo2 submit_info;
+    uint64_t completed_value;
     uint64_t physical_value;
     uint64_t signal_value;
+    bool early_signal;
     VkQueue vk_queue;
     VkResult vr;
     HRESULT hr;
@@ -17744,7 +17760,25 @@ static void d3d12_command_queue_signal(struct d3d12_command_queue *command_queue
         return;
     }
 
-    vr = VK_CALL(vkQueueSubmit2(vk_queue, 1, &submit_info, VK_NULL_HANDLE));
+    early_signal = false;
+
+    /* If there is no meaningful work in the queue, there's no reason to submit anything,
+     * just advance the timeline from the host instead.
+     * We want to ignore any incidental work submitted to the queue which is outside the scope of D3D12,
+     * e.g. swapchain blits, so we cannot rely on the driver eliding these submissions. */
+    if (VK_CALL(vkGetSemaphoreCounterValue(command_queue->device->vk_device,
+            vkd3d_queue->submission_timeline, &completed_value)) == VK_SUCCESS &&
+            completed_value == vkd3d_queue->submission_timeline_count)
+    {
+        sig.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO;
+        sig.pNext = NULL;
+        sig.semaphore = signal_semaphore_info.semaphore;
+        sig.value = signal_semaphore_info.value;
+        vr = VK_CALL(vkSignalSemaphore(command_queue->device->vk_device, &sig));
+        early_signal = true;
+    }
+    else
+        vr = VK_CALL(vkQueueSubmit2(vk_queue, 1, &submit_info, VK_NULL_HANDLE));
 
     if (vr == VK_SUCCESS)
         d3d12_fence_update_pending_value_locked(fence);
@@ -17760,13 +17794,21 @@ static void d3d12_command_queue_signal(struct d3d12_command_queue *command_queue
 
     VKD3D_DEVICE_REPORT_FAULT_AND_BREADCRUMB_IF(command_queue->device, vr == VK_ERROR_DEVICE_LOST);
 
-    cookie = vkd3d_queue_timeline_trace_register_signal(&command_queue->device->queue_timeline_trace,
-            &fence->ID3D12Fence_iface, value);
-
-    if (FAILED(hr = vkd3d_enqueue_timeline_semaphore(&command_queue->fence_worker, &fence->ID3D12Fence_iface,
-            fence->timeline_semaphore, physical_value, true, NULL, 0, &cookie)))
+    if (early_signal)
     {
-        ERR("Failed to enqueue timeline semaphore, hr #%x.\n", hr);
+        if (FAILED(hr = d3d12_fence_signal(fence, &command_queue->fence_worker, physical_value)))
+            ERR("Failed to signal D3D12 fence, hr %#x.\n", hr);
+    }
+    else
+    {
+        cookie = vkd3d_queue_timeline_trace_register_signal(&command_queue->device->queue_timeline_trace,
+                &fence->ID3D12Fence_iface, value);
+
+        if (FAILED(hr = vkd3d_enqueue_timeline_semaphore(&command_queue->fence_worker, &fence->ID3D12Fence_iface,
+                fence->timeline_semaphore, physical_value, true, NULL, 0, &cookie)))
+        {
+            ERR("Failed to enqueue timeline semaphore, hr #%x.\n", hr);
+        }
     }
 
     /* We should probably trigger DEVICE_REMOVED if we hit any errors in the submission thread. */
@@ -18208,15 +18250,19 @@ static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queu
         const VkSemaphoreSubmitInfo *transition_semaphore,
         struct d3d12_command_allocator **command_allocators, size_t num_command_allocators,
         struct vkd3d_queue_timeline_trace_cookie timeline_cookie,
-        uint64_t low_latency_frame_id, bool debug_capture, bool split_submissions)
+        uint64_t low_latency_frame_id, bool debug_capture, bool split_submissions,
+        uint32_t implicit_sync_mask)
 {
+    VkSemaphoreSubmitInfo implicit_sync_signal_info[VKD3D_IMPLICIT_SYNC_NUM_TIMELINES + 1];
+    VkSemaphoreSubmitInfo implicit_sync_wait_info[VKD3D_IMPLICIT_SYNC_NUM_TIMELINES + 1];
     const struct vkd3d_vk_device_procs *vk_procs = &command_queue->device->vk_procs;
     struct vkd3d_queue *vkd3d_queue = command_queue->vkd3d_queue;
     VkLatencySubmissionPresentIdNV latency_submit_present_info;
     struct dxgi_vk_swap_chain *low_latency_swapchain;
     VkSemaphoreSubmitInfo signal_semaphore_info;
     VkSemaphoreSubmitInfo binary_semaphore_info;
-    VkSubmitInfo2 submit_desc[4], *submit;
+    VkSubmitInfo2 submit_desc[5], *submit;
+    VkSubmitInfo2 implicit_sync_signal;
     uint32_t num_submits, split_count;
     uint64_t consumed_present_id;
     bool stagger_submissions;
@@ -18314,6 +18360,28 @@ static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queu
             submit->pWaitSemaphoreInfos = transition_semaphore;
         }
 
+        if (implicit_sync_mask)
+        {
+            if (j == 0)
+            {
+                memset(&implicit_sync_signal, 0, sizeof(implicit_sync_signal));
+                implicit_sync_signal.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2;
+
+                /* We hold the queue lock while this is called, and until we have made the first submission,
+                 * so once we have committed a wait/signal value, we're guaranteed forward progress. */
+                vkd3d_device_swapchain_patch_implicit_sync_semaphores(&command_queue->device->swapchain_info,
+                        submit, &implicit_sync_signal,
+                        implicit_sync_wait_info, implicit_sync_signal_info,
+                        implicit_sync_mask);
+            }
+
+            if (j + 1 == split_count)
+            {
+                submit = &submit_desc[num_submits++];
+                *submit = implicit_sync_signal;
+            }
+        }
+
         /* Prefer binary semaphore since timeline signal -> wait pair can cause scheduling bubbles.
          * Binary semaphores tend to be more well-behaved here since they can lower to kernel primitives more easily. */
         if (!command_queue->vkd3d_queue->barrier_command_buffer && j + 1 == split_count)
@@ -18338,11 +18406,11 @@ static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queu
 
         if (command_queue->device->vk_info.NV_low_latency2)
         {
-            spinlock_acquire(&command_queue->device->low_latency_swapchain_spinlock);
+            spinlock_acquire(&command_queue->device->swapchain_info.spinlock);
             if ((low_latency_swapchain = command_queue->device->swapchain_info.low_latency_swapchain))
                 dxgi_vk_swap_chain_incref(low_latency_swapchain);
             consumed_present_id = command_queue->device->frame_markers.consumed_present_id;
-            spinlock_release(&command_queue->device->low_latency_swapchain_spinlock);
+            spinlock_release(&command_queue->device->swapchain_info.spinlock);
 
             /* If we have submitted a swapchain blit to Vulkan,
              * it is not possible for a present ID to keep contributing to the frame's completion.
@@ -18879,7 +18947,8 @@ static void *d3d12_command_queue_submission_worker_main(void *userdata)
                     submission.execute.timeline_cookie,
                     submission.execute.low_latency_frame_id,
                     submission.execute.debug_capture,
-                    submission.execute.split_submission);
+                    submission.execute.split_submission,
+                    submission.execute.implicit_sync_mask);
 
             /* command_queue_execute takes ownership of the
              * outstanding_submission_counters and queue_timeline_indices allocations.

diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
@@ -940,6 +940,7 @@ static const struct vkd3d_debug_option vkd3d_config_options[] =
     {"app_debug_marker_only", VKD3D_CONFIG_FLAG_APP_DEBUG_MARKER_ONLY},
     {"small_vram_rebar", VKD3D_CONFIG_FLAG_SMALL_VRAM_REBAR},
     {"staggered_submit", VKD3D_CONFIG_FLAG_STAGGERED_SUBMIT},
+    {"async_present", VKD3D_CONFIG_FLAG_ASYNC_PRESENT},
 };
 
 static void vkd3d_config_flags_init_once(void)
@@ -3538,6 +3539,7 @@ static void d3d12_device_destroy(struct d3d12_device *device)
     if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_BREADCRUMBS)
         vkd3d_breadcrumb_tracer_cleanup(&device->breadcrumb_tracer, device);
 #endif
+    vkd3d_device_swapchain_info_cleanup(&device->swapchain_info, device);
     vkd3d_pipeline_library_flush_disk_cache(&device->disk_cache);
     vkd3d_sampler_state_cleanup(&device->sampler_state, device);
     vkd3d_view_map_destroy(&device->sampler_map, device);
@@ -8784,8 +8786,6 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
         goto out_free_mutex;
     }
 
-    spinlock_init(&device->low_latency_swapchain_spinlock);
-
     device->ID3D12DeviceExt_iface.lpVtbl = &d3d12_device_vkd3d_ext_vtbl;
     device->ID3D12DXVKInteropDevice_iface.lpVtbl = &d3d12_dxvk_interop_device_vtbl;
     device->ID3DLowLatencyDevice_iface.lpVtbl = &d3d_low_latency_device_vtbl;
@@ -8887,6 +8887,9 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
     if (FAILED(hr = vkd3d_pipeline_library_init_disk_cache(&device->disk_cache, device)))
         goto out_cleanup_descriptor_qa_global_info;
 
+    if (FAILED(hr = vkd3d_device_swapchain_info_init(&device->swapchain_info, device)))
+        goto out_cleanup_disk_cache;
+
     d3d12_device_replace_vtable(device);
 
 #ifdef VKD3D_ENABLE_RENDERDOC
@@ -8896,6 +8899,8 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
 
     return S_OK;
 
+out_cleanup_disk_cache:
+    vkd3d_pipeline_library_flush_disk_cache(&device->disk_cache);
 out_cleanup_descriptor_qa_global_info:
     vkd3d_descriptor_debug_free_global_info(device->descriptor_qa_global_info, device);
 out_cleanup_breadcrumb_tracer:

diff --git a/libs/vkd3d/device_vkd3d_ext.c b/libs/vkd3d/device_vkd3d_ext.c
@@ -587,10 +587,10 @@ static HRESULT STDMETHODCALLTYPE d3d12_low_latency_device_LatencySleep(d3d_low_l
     if (!device->vk_info.NV_low_latency2)
         return E_NOTIMPL;
 
-    spinlock_acquire(&device->low_latency_swapchain_spinlock);
+    spinlock_acquire(&device->swapchain_info.spinlock);
     if ((low_latency_swapchain = device->swapchain_info.low_latency_swapchain))
         dxgi_vk_swap_chain_incref(low_latency_swapchain);
-    spinlock_release(&device->low_latency_swapchain_spinlock);
+    spinlock_release(&device->swapchain_info.spinlock);
 
     if (low_latency_swapchain)
     {
@@ -613,13 +613,13 @@ static HRESULT STDMETHODCALLTYPE d3d12_low_latency_device_SetLatencySleepMode(d3
     if (!device->vk_info.NV_low_latency2)
         return E_NOTIMPL;
 
-    spinlock_acquire(&device->low_latency_swapchain_spinlock);
+    spinlock_acquire(&device->swapchain_info.spinlock);
     device->swapchain_info.mode = low_latency_mode;
     device->swapchain_info.boost = low_latency_boost;
     device->swapchain_info.minimum_us = minimum_interval_us;
     if ((low_latency_swapchain = device->swapchain_info.low_latency_swapchain))
         dxgi_vk_swap_chain_incref(low_latency_swapchain);
-    spinlock_release(&device->low_latency_swapchain_spinlock);
+    spinlock_release(&device->swapchain_info.spinlock);
 
     if (low_latency_swapchain)
     {
@@ -693,10 +693,10 @@ static HRESULT STDMETHODCALLTYPE d3d12_low_latency_device_SetLatencyMarker(d3d_l
             break;
     }
 
-    spinlock_acquire(&device->low_latency_swapchain_spinlock);
+    spinlock_acquire(&device->swapchain_info.spinlock);
     if ((low_latency_swapchain = device->swapchain_info.low_latency_swapchain))
         dxgi_vk_swap_chain_incref(low_latency_swapchain);
-    spinlock_release(&device->low_latency_swapchain_spinlock);
+    spinlock_release(&device->swapchain_info.spinlock);
 
     if (low_latency_swapchain)
     {
@@ -717,10 +717,10 @@ static HRESULT STDMETHODCALLTYPE d3d12_low_latency_device_GetLatencyInfo(d3d_low
     if (!device->vk_info.NV_low_latency2)
         return E_NOTIMPL;
 
-    spinlock_acquire(&device->low_latency_swapchain_spinlock);
+    spinlock_acquire(&device->swapchain_info.spinlock);
     if ((low_latency_swapchain = device->swapchain_info.low_latency_swapchain))
         dxgi_vk_swap_chain_incref(low_latency_swapchain);
-    spinlock_release(&device->low_latency_swapchain_spinlock);
+    spinlock_release(&device->swapchain_info.spinlock);
 
     if (low_latency_swapchain)
     {

diff --git a/libs/vkd3d/meson.build b/libs/vkd3d/meson.build
@@ -31,6 +31,7 @@ vkd3d_shaders =[
 
   'shaders/vs_swapchain_fullscreen.vert',
   'shaders/fs_swapchain_fullscreen.frag',
+  'shaders/cs_swapchain_fullscreen.comp',
   'shaders/cs_execute_indirect_patch.comp',
   'shaders/cs_execute_indirect_patch_debug_ring.comp',
   'shaders/cs_execute_indirect_multi_dispatch.comp',

diff --git a/libs/vkd3d/meta.c b/libs/vkd3d/meta.c
@@ -643,11 +643,23 @@ static HRESULT vkd3d_meta_create_swapchain_pipeline(struct vkd3d_meta_ops *meta_
     struct vkd3d_swapchain_ops *meta_swapchain_ops = &meta_ops->swapchain;
     VkResult vr;
 
-    if ((vr = vkd3d_meta_create_graphics_pipeline(meta_ops,
-            meta_swapchain_ops->vk_pipeline_layouts[key->filter], key->format, VK_FORMAT_UNDEFINED, VK_IMAGE_ASPECT_COLOR_BIT,
-            meta_swapchain_ops->vk_vs_module, meta_swapchain_ops->vk_fs_module, 1,
-            NULL, 0, NULL, NULL, false, &pipeline->vk_pipeline)) < 0)
-        return hresult_from_vk_result(vr);
+    if (key->bind_point == VK_PIPELINE_BIND_POINT_COMPUTE)
+    {
+        if ((vr = vkd3d_meta_create_compute_pipeline(meta_ops->device,
+                sizeof(cs_swapchain_fullscreen), cs_swapchain_fullscreen,
+                meta_swapchain_ops->vk_pipeline_layouts[key->filter],
+                NULL, false, &pipeline->vk_pipeline)) < 0)
+            return hresult_from_vk_result(vr);
+    }
+    else
+    {
+        if ((vr = vkd3d_meta_create_graphics_pipeline(meta_ops,
+                meta_swapchain_ops->vk_pipeline_layouts[key->filter], key->format, VK_FORMAT_UNDEFINED,
+                VK_IMAGE_ASPECT_COLOR_BIT,
+                meta_swapchain_ops->vk_vs_module, meta_swapchain_ops->vk_fs_module, 1,
+                NULL, 0, NULL, NULL, false, &pipeline->vk_pipeline)) < 0)
+            return hresult_from_vk_result(vr);
+    }
 
     pipeline->key = *key;
     return S_OK;
@@ -1314,7 +1326,8 @@ static void vkd3d_swapchain_ops_cleanup(struct vkd3d_swapchain_ops *meta_swapcha
 
 static HRESULT vkd3d_swapchain_ops_init(struct vkd3d_swapchain_ops *meta_swapchain_ops, struct d3d12_device *device)
 {
-    VkDescriptorSetLayoutBinding set_binding;
+    VkDescriptorSetLayoutBinding set_binding[2];
+    VkPushConstantRange push_range;
     unsigned int i;
     VkResult vr;
     int rc;
@@ -1327,10 +1340,19 @@ static HRESULT vkd3d_swapchain_ops_init(struct vkd3d_swapchain_ops *meta_swapcha
         return hresult_from_errno(rc);
     }
 
-    set_binding.binding = 0;
-    set_binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
-    set_binding.descriptorCount = 1;
-    set_binding.stageFlags = VK_SHADER_STAGE_ALL; /* Could be compute or graphics, so just use ALL. */
+    set_binding[0].binding = 0;
+    set_binding[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+    set_binding[0].descriptorCount = 1;
+    set_binding[0].stageFlags = VK_SHADER_STAGE_ALL; /* Could be compute or graphics, so just use ALL. */
+
+    set_binding[1].binding = 1;
+    set_binding[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
+    set_binding[1].descriptorCount = 1;
+    set_binding[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    push_range.offset = 0;
+    push_range.size = sizeof(float) * 2;
+    push_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
 
     for (i = 0; i < 2; i++)
     {
@@ -1340,16 +1362,16 @@ static HRESULT vkd3d_swapchain_ops_init(struct vkd3d_swapchain_ops *meta_swapcha
             goto fail;
         }
 
-        set_binding.pImmutableSamplers = &meta_swapchain_ops->vk_samplers[i];
-        if ((vr = vkd3d_meta_create_descriptor_set_layout(device, 1, &set_binding,
+        set_binding[0].pImmutableSamplers = &meta_swapchain_ops->vk_samplers[i];
+        if ((vr = vkd3d_meta_create_descriptor_set_layout(device, ARRAY_SIZE(set_binding), set_binding,
                 false, &meta_swapchain_ops->vk_set_layouts[i])) < 0)
         {
             ERR("Failed to create descriptor set layout, vr %d.\n", vr);
             goto fail;
         }
 
         if ((vr = vkd3d_meta_create_pipeline_layout(device, 1, &meta_swapchain_ops->vk_set_layouts[i],
-                0, NULL, &meta_swapchain_ops->vk_pipeline_layouts[i])))
+                1, &push_range, &meta_swapchain_ops->vk_pipeline_layouts[i])))
         {
             ERR("Failed to create pipeline layout, vr %d.\n", vr);
             goto fail;