Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Async compute present - attempt to squeeze a little perf out of AMD #2075

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/vkd3d.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ extern "C" {
#define VKD3D_CONFIG_FLAG_DRIVER_VERSION_SENSITIVE_SHADERS (1ull << 48)
#define VKD3D_CONFIG_FLAG_SMALL_VRAM_REBAR (1ull << 49)
#define VKD3D_CONFIG_FLAG_STAGGERED_SUBMIT (1ull << 50)
#define VKD3D_CONFIG_FLAG_ASYNC_PRESENT (1ull << 51)

struct vkd3d_instance;

Expand Down
93 changes: 81 additions & 12 deletions libs/vkd3d/command.c
Original file line number Diff line number Diff line change
Expand Up @@ -5076,6 +5076,15 @@ static void d3d12_command_list_track_resource_usage(struct d3d12_command_list *l
transition.resource.perform_initial_transition = perform_initial_transition;
d3d12_command_list_add_transition(list, &transition);
}

/* We're guaranteed to observe first use and last use of a swapchain image.
* COMMON/PRESENT state must be transitioned out of, and swapchain image must transition into PRESENT.
* We also track any case with rendering. UAVs are banned, so the only potential case is sampling bindless
* straight out of COMMON, but it's unclear if that is legal in D3D12, since DXGI swapchains have implicit
* sync semantics to begin with. It would also be a moot scenario from a sync PoV, since it would be read-after-read,
* so there is no hazard to worry about. */
if (resource->flags & VKD3D_RESOURCE_SWAP_CHAIN_IMPLICIT_SYNC)
list->implicit_sync_mask |= 1u << resource->swap_chain_implicit_sync_index;
}

static void d3d12_command_list_track_query_heap(struct d3d12_command_list *list,
Expand Down Expand Up @@ -5674,6 +5683,7 @@ static void d3d12_command_list_reset_internal_state(struct d3d12_command_list *l
list->wbi_batch.batch_len = 0;
list->query_resolve_count = 0;
list->submit_allocator = NULL;
list->implicit_sync_mask = 0;

d3d12_command_list_clear_rtas_batch(list);
}
Expand Down Expand Up @@ -17292,6 +17302,7 @@ static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12Comm

sub.execute.debug_capture = false;
sub.execute.split_submission = false;
sub.execute.implicit_sync_mask = 0;

num_transitions = 0;

Expand Down Expand Up @@ -17355,6 +17366,8 @@ static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12Comm
sub.execute.split_submission = true;
}

sub.execute.implicit_sync_mask |= cmd_list->implicit_sync_mask;

#ifdef VKD3D_ENABLE_BREADCRUMBS
if (breadcrumb_indices)
breadcrumb_indices[i] = cmd_list->breadcrumb_context_index;
Expand Down Expand Up @@ -17705,9 +17718,12 @@ static void d3d12_command_queue_signal(struct d3d12_command_queue *command_queue
VkSemaphoreSubmitInfo signal_semaphore_info;
struct vkd3d_queue *vkd3d_queue;
struct d3d12_device *device;
VkSemaphoreSignalInfo sig;
VkSubmitInfo2 submit_info;
uint64_t completed_value;
uint64_t physical_value;
uint64_t signal_value;
bool early_signal;
VkQueue vk_queue;
VkResult vr;
HRESULT hr;
Expand Down Expand Up @@ -17744,7 +17760,25 @@ static void d3d12_command_queue_signal(struct d3d12_command_queue *command_queue
return;
}

vr = VK_CALL(vkQueueSubmit2(vk_queue, 1, &submit_info, VK_NULL_HANDLE));
early_signal = false;

/* If there is no meaningful work in the queue, there's no reason to submit anything,
* just bump the timeline forward.
* We want to ignore any incidental work submitted to the queue which is outside the scope of D3D12,
* e.g. swapchain blits, so we cannot rely on the driver eliding these submissions. */
if (VK_CALL(vkGetSemaphoreCounterValue(command_queue->device->vk_device,
vkd3d_queue->submission_timeline, &completed_value)) == VK_SUCCESS &&
completed_value == vkd3d_queue->submission_timeline_count)
{
sig.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO;
sig.pNext = NULL;
sig.semaphore = signal_semaphore_info.semaphore;
sig.value = signal_semaphore_info.value;
vr = VK_CALL(vkSignalSemaphore(command_queue->device->vk_device, &sig));
early_signal = true;
}
else
vr = VK_CALL(vkQueueSubmit2(vk_queue, 1, &submit_info, VK_NULL_HANDLE));

if (vr == VK_SUCCESS)
d3d12_fence_update_pending_value_locked(fence);
Expand All @@ -17760,13 +17794,21 @@ static void d3d12_command_queue_signal(struct d3d12_command_queue *command_queue

VKD3D_DEVICE_REPORT_FAULT_AND_BREADCRUMB_IF(command_queue->device, vr == VK_ERROR_DEVICE_LOST);

cookie = vkd3d_queue_timeline_trace_register_signal(&command_queue->device->queue_timeline_trace,
&fence->ID3D12Fence_iface, value);

if (FAILED(hr = vkd3d_enqueue_timeline_semaphore(&command_queue->fence_worker, &fence->ID3D12Fence_iface,
fence->timeline_semaphore, physical_value, true, NULL, 0, &cookie)))
if (early_signal)
{
ERR("Failed to enqueue timeline semaphore, hr #%x.\n", hr);
if (FAILED(hr = d3d12_fence_signal(fence, &command_queue->fence_worker, physical_value)))
ERR("Failed to signal D3D12 fence, hr %#x.\n", hr);
}
else
{
cookie = vkd3d_queue_timeline_trace_register_signal(&command_queue->device->queue_timeline_trace,
&fence->ID3D12Fence_iface, value);

if (FAILED(hr = vkd3d_enqueue_timeline_semaphore(&command_queue->fence_worker, &fence->ID3D12Fence_iface,
fence->timeline_semaphore, physical_value, true, NULL, 0, &cookie)))
{
ERR("Failed to enqueue timeline semaphore, hr #%x.\n", hr);
}
}

/* We should probably trigger DEVICE_REMOVED if we hit any errors in the submission thread. */
Expand Down Expand Up @@ -18208,15 +18250,19 @@ static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queu
const VkSemaphoreSubmitInfo *transition_semaphore,
struct d3d12_command_allocator **command_allocators, size_t num_command_allocators,
struct vkd3d_queue_timeline_trace_cookie timeline_cookie,
uint64_t low_latency_frame_id, bool debug_capture, bool split_submissions)
uint64_t low_latency_frame_id, bool debug_capture, bool split_submissions,
uint32_t implicit_sync_mask)
{
VkSemaphoreSubmitInfo implicit_sync_signal_info[VKD3D_IMPLICIT_SYNC_NUM_TIMELINES + 1];
VkSemaphoreSubmitInfo implicit_sync_wait_info[VKD3D_IMPLICIT_SYNC_NUM_TIMELINES + 1];
const struct vkd3d_vk_device_procs *vk_procs = &command_queue->device->vk_procs;
struct vkd3d_queue *vkd3d_queue = command_queue->vkd3d_queue;
VkLatencySubmissionPresentIdNV latency_submit_present_info;
struct dxgi_vk_swap_chain *low_latency_swapchain;
VkSemaphoreSubmitInfo signal_semaphore_info;
VkSemaphoreSubmitInfo binary_semaphore_info;
VkSubmitInfo2 submit_desc[4], *submit;
VkSubmitInfo2 submit_desc[5], *submit;
VkSubmitInfo2 implicit_sync_signal;
uint32_t num_submits, split_count;
uint64_t consumed_present_id;
bool stagger_submissions;
Expand Down Expand Up @@ -18314,6 +18360,28 @@ static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queu
submit->pWaitSemaphoreInfos = transition_semaphore;
}

if (implicit_sync_mask)
{
if (j == 0)
{
memset(&implicit_sync_signal, 0, sizeof(implicit_sync_signal));
implicit_sync_signal.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2;

/* We hold the queue lock while this is called, until we have made the first submission,
* so once we have committed a wait/signal value, we're guaranteed forward progress. */
vkd3d_device_swapchain_patch_implicit_sync_semaphores(&command_queue->device->swapchain_info,
submit, &implicit_sync_signal,
implicit_sync_wait_info, implicit_sync_signal_info,
implicit_sync_mask);
}

if (j + 1 == split_count)
{
submit = &submit_desc[num_submits++];
*submit = implicit_sync_signal;
}
}

/* Prefer binary semaphore since timeline signal -> wait pair can cause scheduling bubbles.
* Binary semaphores tend to be more well-behaved here since they can lower to kernel primitives more easily. */
if (!command_queue->vkd3d_queue->barrier_command_buffer && j + 1 == split_count)
Expand All @@ -18338,11 +18406,11 @@ static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queu

if (command_queue->device->vk_info.NV_low_latency2)
{
spinlock_acquire(&command_queue->device->low_latency_swapchain_spinlock);
spinlock_acquire(&command_queue->device->swapchain_info.spinlock);
if ((low_latency_swapchain = command_queue->device->swapchain_info.low_latency_swapchain))
dxgi_vk_swap_chain_incref(low_latency_swapchain);
consumed_present_id = command_queue->device->frame_markers.consumed_present_id;
spinlock_release(&command_queue->device->low_latency_swapchain_spinlock);
spinlock_release(&command_queue->device->swapchain_info.spinlock);

/* If we have submitted a swapchain blit to Vulkan,
* it is not possible for a present ID to keep contributing to the frame's completion.
Expand Down Expand Up @@ -18879,7 +18947,8 @@ static void *d3d12_command_queue_submission_worker_main(void *userdata)
submission.execute.timeline_cookie,
submission.execute.low_latency_frame_id,
submission.execute.debug_capture,
submission.execute.split_submission);
submission.execute.split_submission,
submission.execute.implicit_sync_mask);

/* command_queue_execute takes ownership of the
* outstanding_submission_counters and queue_timeline_indices allocations.
Expand Down
9 changes: 7 additions & 2 deletions libs/vkd3d/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,7 @@ static const struct vkd3d_debug_option vkd3d_config_options[] =
{"app_debug_marker_only", VKD3D_CONFIG_FLAG_APP_DEBUG_MARKER_ONLY},
{"small_vram_rebar", VKD3D_CONFIG_FLAG_SMALL_VRAM_REBAR},
{"staggered_submit", VKD3D_CONFIG_FLAG_STAGGERED_SUBMIT},
{"async_present", VKD3D_CONFIG_FLAG_ASYNC_PRESENT},
};

static void vkd3d_config_flags_init_once(void)
Expand Down Expand Up @@ -3538,6 +3539,7 @@ static void d3d12_device_destroy(struct d3d12_device *device)
if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_BREADCRUMBS)
vkd3d_breadcrumb_tracer_cleanup(&device->breadcrumb_tracer, device);
#endif
vkd3d_device_swapchain_info_cleanup(&device->swapchain_info, device);
vkd3d_pipeline_library_flush_disk_cache(&device->disk_cache);
vkd3d_sampler_state_cleanup(&device->sampler_state, device);
vkd3d_view_map_destroy(&device->sampler_map, device);
Expand Down Expand Up @@ -8784,8 +8786,6 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
goto out_free_mutex;
}

spinlock_init(&device->low_latency_swapchain_spinlock);

device->ID3D12DeviceExt_iface.lpVtbl = &d3d12_device_vkd3d_ext_vtbl;
device->ID3D12DXVKInteropDevice_iface.lpVtbl = &d3d12_dxvk_interop_device_vtbl;
device->ID3DLowLatencyDevice_iface.lpVtbl = &d3d_low_latency_device_vtbl;
Expand Down Expand Up @@ -8887,6 +8887,9 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
if (FAILED(hr = vkd3d_pipeline_library_init_disk_cache(&device->disk_cache, device)))
goto out_cleanup_descriptor_qa_global_info;

if (FAILED(hr = vkd3d_device_swapchain_info_init(&device->swapchain_info, device)))
goto out_cleanup_disk_cache;

d3d12_device_replace_vtable(device);

#ifdef VKD3D_ENABLE_RENDERDOC
Expand All @@ -8896,6 +8899,8 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,

return S_OK;

out_cleanup_disk_cache:
vkd3d_pipeline_library_flush_disk_cache(&device->disk_cache);
out_cleanup_descriptor_qa_global_info:
vkd3d_descriptor_debug_free_global_info(device->descriptor_qa_global_info, device);
out_cleanup_breadcrumb_tracer:
Expand Down
16 changes: 8 additions & 8 deletions libs/vkd3d/device_vkd3d_ext.c
Original file line number Diff line number Diff line change
Expand Up @@ -587,10 +587,10 @@ static HRESULT STDMETHODCALLTYPE d3d12_low_latency_device_LatencySleep(d3d_low_l
if (!device->vk_info.NV_low_latency2)
return E_NOTIMPL;

spinlock_acquire(&device->low_latency_swapchain_spinlock);
spinlock_acquire(&device->swapchain_info.spinlock);
if ((low_latency_swapchain = device->swapchain_info.low_latency_swapchain))
dxgi_vk_swap_chain_incref(low_latency_swapchain);
spinlock_release(&device->low_latency_swapchain_spinlock);
spinlock_release(&device->swapchain_info.spinlock);

if (low_latency_swapchain)
{
Expand All @@ -613,13 +613,13 @@ static HRESULT STDMETHODCALLTYPE d3d12_low_latency_device_SetLatencySleepMode(d3
if (!device->vk_info.NV_low_latency2)
return E_NOTIMPL;

spinlock_acquire(&device->low_latency_swapchain_spinlock);
spinlock_acquire(&device->swapchain_info.spinlock);
device->swapchain_info.mode = low_latency_mode;
device->swapchain_info.boost = low_latency_boost;
device->swapchain_info.minimum_us = minimum_interval_us;
if ((low_latency_swapchain = device->swapchain_info.low_latency_swapchain))
dxgi_vk_swap_chain_incref(low_latency_swapchain);
spinlock_release(&device->low_latency_swapchain_spinlock);
spinlock_release(&device->swapchain_info.spinlock);

if (low_latency_swapchain)
{
Expand Down Expand Up @@ -693,10 +693,10 @@ static HRESULT STDMETHODCALLTYPE d3d12_low_latency_device_SetLatencyMarker(d3d_l
break;
}

spinlock_acquire(&device->low_latency_swapchain_spinlock);
spinlock_acquire(&device->swapchain_info.spinlock);
if ((low_latency_swapchain = device->swapchain_info.low_latency_swapchain))
dxgi_vk_swap_chain_incref(low_latency_swapchain);
spinlock_release(&device->low_latency_swapchain_spinlock);
spinlock_release(&device->swapchain_info.spinlock);

if (low_latency_swapchain)
{
Expand All @@ -717,10 +717,10 @@ static HRESULT STDMETHODCALLTYPE d3d12_low_latency_device_GetLatencyInfo(d3d_low
if (!device->vk_info.NV_low_latency2)
return E_NOTIMPL;

spinlock_acquire(&device->low_latency_swapchain_spinlock);
spinlock_acquire(&device->swapchain_info.spinlock);
if ((low_latency_swapchain = device->swapchain_info.low_latency_swapchain))
dxgi_vk_swap_chain_incref(low_latency_swapchain);
spinlock_release(&device->low_latency_swapchain_spinlock);
spinlock_release(&device->swapchain_info.spinlock);

if (low_latency_swapchain)
{
Expand Down
1 change: 1 addition & 0 deletions libs/vkd3d/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ vkd3d_shaders =[

'shaders/vs_swapchain_fullscreen.vert',
'shaders/fs_swapchain_fullscreen.frag',
'shaders/cs_swapchain_fullscreen.comp',
'shaders/cs_execute_indirect_patch.comp',
'shaders/cs_execute_indirect_patch_debug_ring.comp',
'shaders/cs_execute_indirect_multi_dispatch.comp',
Expand Down
48 changes: 35 additions & 13 deletions libs/vkd3d/meta.c
Original file line number Diff line number Diff line change
Expand Up @@ -643,11 +643,23 @@ static HRESULT vkd3d_meta_create_swapchain_pipeline(struct vkd3d_meta_ops *meta_
struct vkd3d_swapchain_ops *meta_swapchain_ops = &meta_ops->swapchain;
VkResult vr;

if ((vr = vkd3d_meta_create_graphics_pipeline(meta_ops,
meta_swapchain_ops->vk_pipeline_layouts[key->filter], key->format, VK_FORMAT_UNDEFINED, VK_IMAGE_ASPECT_COLOR_BIT,
meta_swapchain_ops->vk_vs_module, meta_swapchain_ops->vk_fs_module, 1,
NULL, 0, NULL, NULL, false, &pipeline->vk_pipeline)) < 0)
return hresult_from_vk_result(vr);
if (key->bind_point == VK_PIPELINE_BIND_POINT_COMPUTE)
{
if ((vr = vkd3d_meta_create_compute_pipeline(meta_ops->device,
sizeof(cs_swapchain_fullscreen), cs_swapchain_fullscreen,
meta_swapchain_ops->vk_pipeline_layouts[key->filter],
NULL, false, &pipeline->vk_pipeline)) < 0)
return hresult_from_vk_result(vr);
}
else
{
if ((vr = vkd3d_meta_create_graphics_pipeline(meta_ops,
meta_swapchain_ops->vk_pipeline_layouts[key->filter], key->format, VK_FORMAT_UNDEFINED,
VK_IMAGE_ASPECT_COLOR_BIT,
meta_swapchain_ops->vk_vs_module, meta_swapchain_ops->vk_fs_module, 1,
NULL, 0, NULL, NULL, false, &pipeline->vk_pipeline)) < 0)
return hresult_from_vk_result(vr);
}

pipeline->key = *key;
return S_OK;
Expand Down Expand Up @@ -1314,7 +1326,8 @@ static void vkd3d_swapchain_ops_cleanup(struct vkd3d_swapchain_ops *meta_swapcha

static HRESULT vkd3d_swapchain_ops_init(struct vkd3d_swapchain_ops *meta_swapchain_ops, struct d3d12_device *device)
{
VkDescriptorSetLayoutBinding set_binding;
VkDescriptorSetLayoutBinding set_binding[2];
VkPushConstantRange push_range;
unsigned int i;
VkResult vr;
int rc;
Expand All @@ -1327,10 +1340,19 @@ static HRESULT vkd3d_swapchain_ops_init(struct vkd3d_swapchain_ops *meta_swapcha
return hresult_from_errno(rc);
}

set_binding.binding = 0;
set_binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
set_binding.descriptorCount = 1;
set_binding.stageFlags = VK_SHADER_STAGE_ALL; /* Could be compute or graphics, so just use ALL. */
set_binding[0].binding = 0;
set_binding[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
set_binding[0].descriptorCount = 1;
set_binding[0].stageFlags = VK_SHADER_STAGE_ALL; /* Could be compute or graphics, so just use ALL. */

set_binding[1].binding = 1;
set_binding[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
set_binding[1].descriptorCount = 1;
set_binding[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;

push_range.offset = 0;
push_range.size = sizeof(float) * 2;
push_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;

for (i = 0; i < 2; i++)
{
Expand All @@ -1340,16 +1362,16 @@ static HRESULT vkd3d_swapchain_ops_init(struct vkd3d_swapchain_ops *meta_swapcha
goto fail;
}

set_binding.pImmutableSamplers = &meta_swapchain_ops->vk_samplers[i];
if ((vr = vkd3d_meta_create_descriptor_set_layout(device, 1, &set_binding,
set_binding[0].pImmutableSamplers = &meta_swapchain_ops->vk_samplers[i];
if ((vr = vkd3d_meta_create_descriptor_set_layout(device, ARRAY_SIZE(set_binding), set_binding,
false, &meta_swapchain_ops->vk_set_layouts[i])) < 0)
{
ERR("Failed to create descriptor set layout, vr %d.\n", vr);
goto fail;
}

if ((vr = vkd3d_meta_create_pipeline_layout(device, 1, &meta_swapchain_ops->vk_set_layouts[i],
0, NULL, &meta_swapchain_ops->vk_pipeline_layouts[i])))
1, &push_range, &meta_swapchain_ops->vk_pipeline_layouts[i])))
{
ERR("Failed to create pipeline layout, vr %d.\n", vr);
goto fail;
Expand Down
Loading