Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

layers: Allow using uncached buffer for Debug Printf #6128

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions layers/VkLayer_khronos_validation.json.in
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,26 @@
]
}
},
{
"key": "printf_uncached_buffer",
"label": "Printf using uncached buffer (ALPHA)",
"description": "Use VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD(from VK_AMD_device_coherent_memory) to allocate destination buffer. Slower, but useful in case of instrumenting shader causing VK_ERROR_DEVICE_LOST.",
"type": "BOOL",
"default": false,
"platforms": [
"WINDOWS",
"LINUX"
],
"dependence": {
"mode": "ALL",
"settings": [
{
"key": "validate_gpu_based",
"value": "GPU_BASED_DEBUG_PRINTF"
}
]
}
},
{
"key": "printf_verbose",
"label": "Printf verbose",
Expand Down
182 changes: 169 additions & 13 deletions layers/gpu_validation/debug_printf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,87 @@
#include <iostream>
#include "generated/layer_chassis_dispatch.h"

// Device-creation pre-hook for Debug Printf.
//
// Reads the "khronos_validation.printf_uncached_buffer" layer option. When it is "true", this makes sure the
// device create info that will actually be used (modified_ci) requests the VK_AMD_device_coherent_memory
// extension and the deviceCoherentMemory feature. If the physical device does not list the extension, a setup
// problem is reported, `aborted` is set and the create info is left untouched.
//
// gpu          Physical device the logical device is being created on.
// pCreateInfo  Create info as passed by the application (only forwarded to the base class here).
// pAllocator   Forwarded to the base class.
// pDevice      Forwarded to the base class.
// modified_ci  Pointer to a safe_VkDeviceCreateInfo that is patched in place.
void DebugPrintf::PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo *pCreateInfo,
                                            const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, void *modified_ci) {
    GpuAssistedBase::PreCallRecordCreateDevice(gpu, pCreateInfo, pAllocator, pDevice, modified_ci);

    std::string use_uncached_buffer_string = getLayerOption("khronos_validation.printf_uncached_buffer");
    vvl::ToLower(use_uncached_buffer_string);
    use_uncached_buffer = !use_uncached_buffer_string.compare("true");

    // Nothing to patch unless the uncached buffer was requested.
    if (!use_uncached_buffer) {
        return;
    }

    static const std::string dcm_ext{"VK_AMD_device_coherent_memory"};

    // Check that the physical device exposes the extension before forcing it on.
    bool dcm_supported = false;
    uint32_t property_count = 0;
    if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &property_count, nullptr) == VK_SUCCESS) {
        std::vector<VkExtensionProperties> property_list(property_count);
        if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &property_count, property_list.data()) == VK_SUCCESS) {
            for (const VkExtensionProperties &properties : property_list) {
                if (dcm_ext == properties.extensionName) {
                    dcm_supported = true;
                    break;  // one match is enough
                }
            }
        }
    }
    if (!dcm_supported) {
        // NOTE(review): `device` is the member handle; the VkDevice for this call has presumably not been
        // created yet - confirm this is the intended object to report against.
        ReportSetupProblem(
            device, "Debug Printf with uncached buffer requires VK_AMD_device_coherent_memory which is not supported");
        aborted = true;
        return;
    }

    // See CreateDevice() in chassis.cpp. modified_ci is a pointer to a safe struct stored on the stack.
    // This code follows the safe struct memory management scheme. That is, we must delete any memory
    // removed from the safe struct, and any additions must be allocated in a way that is compatible with
    // the safe struct destructor.
    auto *modified_create_info = static_cast<safe_VkDeviceCreateInfo *>(modified_ci);

    bool found_ext = false;
    for (uint32_t i = 0; i < modified_create_info->enabledExtensionCount; i++) {
        if (dcm_ext == modified_create_info->ppEnabledExtensionNames[i]) {
            found_ext = true;
            break;
        }
    }
    if (!found_ext) {
        LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF", "VK_AMD_device_coherent_memory extension not enabled but use_uncached_buffer is true. Forcing extension.");
        const uint32_t old_count = modified_create_info->enabledExtensionCount;
        const char **ext_names = new const char *[old_count + 1];
        // Copy the existing pointer table; the strings themselves stay owned by the safe struct.
        std::copy(modified_create_info->ppEnabledExtensionNames, modified_create_info->ppEnabledExtensionNames + old_count,
                  ext_names);
        // Append a heap copy of the extension name (std::string::copy does not write a terminator).
        char *dcm_ext_copy = new char[dcm_ext.size() + 1];
        dcm_ext.copy(dcm_ext_copy, dcm_ext.size());
        dcm_ext_copy[dcm_ext.size()] = '\0';
        ext_names[old_count] = dcm_ext_copy;
        // Patch up the safe struct: only the old pointer table is released.
        delete[] modified_create_info->ppEnabledExtensionNames;
        modified_create_info->ppEnabledExtensionNames = ext_names;
        modified_create_info->enabledExtensionCount = old_count + 1;
    }

    // Make sure the deviceCoherentMemory feature is requested, either by flipping it in a feature struct
    // that is already chained or by prepending a new one to the pNext chain.
    auto *dcm_features = const_cast<VkPhysicalDeviceCoherentMemoryFeaturesAMD *>(
        LvlFindInChain<VkPhysicalDeviceCoherentMemoryFeaturesAMD>(modified_create_info));
    const bool feature_already_enabled = (dcm_features != nullptr) && (dcm_features->deviceCoherentMemory == VK_TRUE);
    if (!feature_already_enabled) {
        LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF",
                "use_uncached_buffer is true, but deviceCoherentMemory feature is not enabled. Force enabling feature.");
        if (dcm_features) {
            dcm_features->deviceCoherentMemory = VK_TRUE;
        } else {
            auto new_dcm_features = LvlInitStruct<VkPhysicalDeviceCoherentMemoryFeaturesAMD>();
            new_dcm_features.deviceCoherentMemory = VK_TRUE;
            new_dcm_features.pNext = const_cast<void *>(modified_create_info->pNext);
            modified_create_info->pNext = new VkPhysicalDeviceCoherentMemoryFeaturesAMD(new_dcm_features);
        }
    }
}

// Perform initializations that can be done at Create Device time.
void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
if (enabled[gpu_validation]) {
Expand All @@ -42,7 +123,16 @@ void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
use_stdout = !stdout_string.compare("true");
if (getenv("DEBUG_PRINTF_TO_STDOUT")) use_stdout = true;

// GpuAssistedBase::CreateDevice will set up bindings
// Re-read this option here, because PreCallRecordCreateDevice ran in a separate
// DebugPrintf instance (created during VkInstance creation).
std::string use_uncached_buffer_string = getLayerOption("khronos_validation.printf_uncached_buffer");
vvl::ToLower(use_uncached_buffer_string);
use_uncached_buffer = !use_uncached_buffer_string.compare("true");
if (use_uncached_buffer) {
force_device_coherent_memory = true; // vma needs to know it.
}

// GpuAssistedBase::CreateDevice will set up bindings.
VkDescriptorSetLayoutBinding binding = {3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1,
VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT |
VK_SHADER_STAGE_TASK_BIT_EXT | VK_SHADER_STAGE_COMPUTE_BIT |
Expand Down Expand Up @@ -138,6 +228,49 @@ void DebugPrintf::PreCallRecordCreateShaderModule(VkDevice device, const VkShade
}
}

// Debug Printf's own post-submit hook, used instead of the GpuAssistedBase version so that output can still be
// processed when the submit returned VK_ERROR_DEVICE_LOST and the uncached buffer is in use.
//
// queue        Queue the work was submitted to.
// submitCount  Number of entries in pSubmits.
// pSubmits     Submit infos whose command buffers are inspected and processed.
// fence        Forwarded to the state tracker.
// result       Result of the submit call; anything other than VK_SUCCESS stops processing, except
//              VK_ERROR_DEVICE_LOST while use_uncached_buffer is set.
void DebugPrintf::PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, VkFence fence,
                                            VkResult result) {
    ValidationStateTracker::PostCallRecordQueueSubmit(queue, submitCount, pSubmits, fence, result);

    if (aborted) return;

    const bool lost = (result == VK_ERROR_DEVICE_LOST);
    // A failed submit is only worth processing when the device was lost and the uncached buffer is in use;
    // every other failure (e.g. VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY) bails out.
    const bool process_after_failure = use_uncached_buffer && lost;
    if ((result != VK_SUCCESS) && !process_after_failure) return;

    // Visits every command buffer of every submit, in submission order.
    auto for_each_command_buffer = [&](auto &&visit) {
        for (uint32_t s = 0; s < submitCount; ++s) {
            const VkSubmitInfo &info = pSubmits[s];
            for (uint32_t c = 0; c < info.commandBufferCount; ++c) {
                visit(info.pCommandBuffers[c]);
            }
        }
    };

    // Don't QueueWaitIdle if there's nothing to process
    bool any_to_process = false;
    for_each_command_buffer([&](VkCommandBuffer cb) { any_to_process |= CommandBufferNeedsProcessing(cb); });
    if (!any_to_process) return;

    if (lost) {
        assert(use_uncached_buffer);
    } else {
        SubmitBarrier(queue);

        DispatchQueueWaitIdle(queue);  /// @todo Dispatch wait idle only after SubmitBarrier() succeeded.
    }

    for_each_command_buffer([&](VkCommandBuffer cb) { ProcessCommandBuffer(queue, cb); });
}

vartype vartype_lookup(char intype) {
switch (intype) {
case 'd':
Expand Down Expand Up @@ -307,11 +440,14 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
// 8 Printf Format String Id
// 9 Printf Values Word 0 (optional)
// 10 Printf Values Word 1 (optional)
uint32_t expect = debug_output_buffer[1];
if (!expect) return;
uint32_t expect = debug_output_buffer[spvtools::kDebugOutputSizeOffset];
// The total size of all messages is written with an atomic add. Atomics on uncached memory seem to operate in caches anyway
// and are not flushed to uncached memory at the end. In that case, expect will contain zero.
// As a workaround, just parse the messages using their individual sizes (which are written correctly).
if (!expect && !use_uncached_buffer) return;

uint32_t index = spvtools::kDebugOutputDataOffset;
while (debug_output_buffer[index]) {
while ((index < output_buffer_size) && debug_output_buffer[index]) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this loop will work if the output buffer has one message but isn't full. To see this, use uncached buffer memory in the test NegativeDebugPrintf.BasicUsage and run it. This loop doesn't stop when it's supposed to.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you check in which vkQueueSubmit/AnalyzeAndGenerateMessages you observe it? BasicUsage does it 15 times.

Here is how I understand it:

In my first one, debug_output_buffer == 0x0000028b8e520000
Memory of that buffer is:

0x0000028B8E520000  00 00 00 00  MBZ
0x0000028B8E520004  0a 00 00 00  expect (size of all messages, written by atomic in shader)
0x0000028B8E520008  0a 00 00 00  DPFOutputRecord.size (size in dwords of this message)
0x0000028B8E52000C  00 00 00 00  DPFOutputRecord.shader_id
0x0000028B8E520010  70 00 00 00  DPFOutputRecord.instruction_position
0x0000028B8E520014  00 00 00 00  DPFOutputRecord.stage
0x0000028B8E520018  00 00 00 00  DPFOutputRecord.stage_word_1
0x0000028B8E52001C  00 00 00 00  DPFOutputRecord.stage_word_2
0x0000028B8E520020  00 00 00 00  DPFOutputRecord.stage_word_3
0x0000028B8E520024  29 00 00 00  DPFOutputRecord.format_string_id ("Here are two float values %f, %f")
0x0000028B8E520028  00 00 80 3f  DPFOutputRecord.values (float 1.0)
0x0000028B8E52002C  56 0e 49 40  DPFOutputRecord.values (float 3.14150000)
0x0000028B8E520030  00 00 00 00  DPFOutputRecord.size  (no more messages)
0x0000028B8E520034  00 00 00 00  DPFOutputRecord.
0x0000028B8E520038  00 00 00 00  DPFOutputRecord.

Index starts at 0x2. At the end of first loop iteration:
index += debug_record->size; // 0x2 + 0xa = 0xc
At second loop beginning
debug_output_buffer[index] == 0x0000028b8e520030 {0x0}
So the loop ends.

(index < output_buffer_size) means "while we are still in the buffer"
&& debug_output_buffer[index]) means "while next message is not empty"

Copy link
Contributor

@TonyBarbour TonyBarbour Jul 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have access to the system I was using to test this today - I'll get back to it tomorrow. The system I do have is getting VK_TIMEOUT instead of VK_ERROR_DEVICE_LOST or VK_SUCCESS at the vkQueueWaitIdle, and I had to add that to the test's expected results. You should add that to your test.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I've added VK_TIMEOUT to the asserts in the test.
I assume you used AMD for that test; it happened to me once too.
VK_TIMEOUT should not be returned from vkQueueSubmit, vkDeviceWaitIdle or vkQueueWaitIdle.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see the problem. Because debug_output_buffer[1] comes back as 0, the memset at the end of message processing doesn't clear the buffer:

memset(debug_output_buffer, 0, 4 * (debug_output_buffer[spvtools::kDebugOutputSizeOffset] + spvtools::kDebugOutputDataOffset));

And the memset clears 4 * (0 + 2) bytes which doesn't cover all of the records. Then on the next printf, the loop depends on debug_output_buffer[index] being zero when the loop is supposed to stop, but previous printfs may have left values that didn't get memset to 0. I suppose one answer would be to memset the whole buffer.

But I'm seeing another problem. If I modify your test to do two debug printfs:

          if (gl_VertexIndex == 0) {
                debugPrintfEXT("Here are three float values %f, %f, %f", 1.0, myfloat, gl_Position.x);
                debugPrintfEXT("Here's another debug printf");
                float x = constants.x[0];
                while(x > -1.f) { // infinite loop
                    x += 0.000001f;
                }
                debugPrintfEXT("Here is a value that should not be printed %f", x);
            }

I only get the second printf back, as if the first one was overwritten in the shader. Could that be a result of the atomic operation problem and debug_output_buffer[1] not getting written?
(This is all on an AMD RX6600 with driver 23.7.1)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The atomic add is performed by the instrumentation added to the shader via spirv-tools code, which I'm not very familiar with.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok so after a bit of digging, this extension was enabled before we had the MALL cache in hardware, and nobody ever went back and changed the interface to bypass MALL, so there IS caching enabled on this memory type for newer hardware. Whoops. I've filed an internal bug to fix this and to add a more comprehensive test to make sure this actually works on the off chance we make another big change to the caching hierarchy...

You can go look at the public driver source if for some reason you want to see where this is happening 🙃

I'll let you know when it's fixed and subsequently released.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Tobski I assume the fix is released now?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A fix has been implemented but I'm not sure on release status, I'll take a look.

std::stringstream shader_message;
VkShaderModule shader_module_handle = VK_NULL_HANDLE;
VkPipeline pipeline_handle = VK_NULL_HANDLE;
Expand Down Expand Up @@ -412,11 +548,19 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
}
index += debug_record->size;
}
if ((index - spvtools::kDebugOutputDataOffset) != expect) {
if ((use_uncached_buffer && (index >= output_buffer_size)) ||
(!use_uncached_buffer && (index - spvtools::kDebugOutputDataOffset) != expect)) {
LogWarning(device, "UNASSIGNED-DEBUG-PRINTF",
"WARNING - Debug Printf message was truncated, likely due to a buffer size that was too small for the message");
}
memset(debug_output_buffer, 0, 4 * (debug_output_buffer[spvtools::kDebugOutputSizeOffset] + spvtools::kDebugOutputDataOffset));

if (use_uncached_buffer) {
// WA for atomics.
memset(debug_output_buffer, 0, output_buffer_size);
} else {
// Clear only written memory.
memset(debug_output_buffer, 0, sizeof(uint32_t) * (expect + spvtools::kDebugOutputDataOffset));
}
}

// For the given command buffer, map its debug data buffers and read their contents for analysis.
Expand All @@ -429,7 +573,6 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) {
uint32_t ray_trace_index = 0;

for (auto &buffer_info : gpu_buffer_list) {
char *data;

uint32_t operation_index = 0;
if (buffer_info.pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
Expand All @@ -445,10 +588,16 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) {
assert(false);
}

VkResult result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation, (void **)&data);
VkResult result = VK_SUCCESS;
if (buffer_info.output_mem_block.data == nullptr) {
result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation,
(void **)&buffer_info.output_mem_block.data);
}
if (result == VK_SUCCESS) {
device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index, (uint32_t *)data);
device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index,
(uint32_t *)buffer_info.output_mem_block.data);
vmaUnmapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation);
buffer_info.output_mem_block.data = nullptr;
}
}
}
Expand Down Expand Up @@ -672,6 +821,9 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer,
buffer_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
VmaAllocationCreateInfo alloc_info = {};
alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
if (use_uncached_buffer) {
alloc_info.requiredFlags |= VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD | VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably don't want to require UNCACHED here, since theoretically in future you might get COHERENT and CACHED memory. Uncached is meant to be informative, not something you generally rely upon unless you're doing some very specific performance tuning.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't quite understand. Here I want device-uncached memory so that it can still be read after the device is lost.

}
result = vmaCreateBuffer(vmaAllocator, &buffer_info, &alloc_info, &output_block.buffer, &output_block.allocation, nullptr);
if (result != VK_SUCCESS) {
ReportSetupProblem(device, "Unable to allocate device memory. Device could become unstable.");
Expand All @@ -680,11 +832,15 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer,
}

// Clear the output block to zeros so that only printf values from the gpu will be present
uint32_t *data;
result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&data));
result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&output_block.data));
if (result == VK_SUCCESS) {
memset(data, 0, output_buffer_size);
vmaUnmapMemory(vmaAllocator, output_block.allocation);
memset(output_block.data, 0, output_buffer_size);
// Mapping may fail after DEVICE_LOST, so keep the buffer mapped when the uncached buffer is in use.
// In that case it is unmapped in debug_printf_state::CommandBuffer::Process.
if (!use_uncached_buffer) {
vmaUnmapMemory(vmaAllocator, output_block.allocation);
output_block.data = nullptr;
}
}

auto desc_writes = LvlInitStruct<VkWriteDescriptorSet>();
Expand Down
6 changes: 6 additions & 0 deletions layers/gpu_validation/debug_printf.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class DebugPrintf;
// Buffer, its VMA allocation and (optionally) its host mapping for one Debug Printf output block.
struct DPFDeviceMemoryBlock {
    VkBuffer buffer{};
    VmaAllocation allocation{};
    // Host pointer to the mapped allocation, or nullptr while it is not mapped. Command buffer processing maps
    // the allocation only when this is nullptr, so it must never be left uninitialized. With the uncached
    // buffer option the mapping is kept from allocation time, because mapping may fail after the device is lost.
    uint32_t* data = nullptr;
};

struct DPFBufferInfo {
Expand Down Expand Up @@ -86,12 +87,16 @@ class DebugPrintf : public GpuAssistedBase {
desired_features.fragmentStoresAndAtomics = true;
}

void PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, void* modified_ci) override;
void CreateDevice(const VkDeviceCreateInfo* pCreateInfo) override;
bool InstrumentShader(const vvl::span<const uint32_t>& input, std::vector<uint32_t>& new_pgm,
uint32_t* unique_shader_id) override;
void PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkShaderModule* pShaderModule,
void* csm_state_data) override;
void PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits,
VkFence fence, VkResult result) override;
std::vector<DPFSubstring> ParseFormatString(const std::string& format_string);
std::string FindFormatString(vvl::span<const uint32_t> pgm, uint32_t string_id);
void AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, DPFBufferInfo& buffer_info,
Expand Down Expand Up @@ -172,6 +177,7 @@ class DebugPrintf : public GpuAssistedBase {
void DestroyBuffer(DPFBufferInfo& buffer_info);

private:
bool use_uncached_buffer = false;
bool verbose = false;
bool use_stdout = false;
};
10 changes: 8 additions & 2 deletions layers/gpu_validation/gpu_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,8 @@ static VKAPI_ATTR void VKAPI_CALL gpuVkCmdCopyBuffer(VkCommandBuffer commandBuff
DispatchCmdCopyBuffer(commandBuffer, srcBuffer, dstBuffer, regionCount, pRegions);
}

VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address, VmaAllocator *pAllocator) {
VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address,
bool use_device_coherent_memory, VmaAllocator *pAllocator) {
VmaVulkanFunctions functions;
VmaAllocatorCreateInfo allocator_info = {};
allocator_info.instance = instance;
Expand All @@ -203,6 +204,10 @@ VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device
allocator_info.flags |= VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT;
}

if (use_device_coherent_memory) {
allocator_info.flags |= VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT;
}

functions.vkGetInstanceProcAddr = static_cast<PFN_vkGetInstanceProcAddr>(gpuVkGetInstanceProcAddr);
functions.vkGetDeviceProcAddr = static_cast<PFN_vkGetDeviceProcAddr>(gpuVkGetDeviceProcAddr);
functions.vkGetPhysicalDeviceProperties = static_cast<PFN_vkGetPhysicalDeviceProperties>(gpuVkGetPhysicalDeviceProperties);
Expand Down Expand Up @@ -373,7 +378,8 @@ void GpuAssistedBase::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
}
desc_set_bind_index = adjusted_max_desc_sets - 1;

VkResult result1 = UtilInitializeVma(instance, physical_device, device, force_buffer_device_address, &vmaAllocator);
VkResult result1 = UtilInitializeVma(instance, physical_device, device, force_buffer_device_address,
force_device_coherent_memory, &vmaAllocator);
assert(result1 == VK_SUCCESS);
desc_set_manager = std::make_unique<UtilDescriptorSetManager>(device, static_cast<uint32_t>(bindings_.size()));

Expand Down
3 changes: 2 additions & 1 deletion layers/gpu_validation/gpu_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ VALSTATETRACK_DERIVED_STATE_OBJECT(VkQueue, gpu_utils_state::Queue, QUEUE_STATE)
VALSTATETRACK_DERIVED_STATE_OBJECT(VkCommandBuffer, gpu_utils_state::CommandBuffer, CMD_BUFFER_STATE)

VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address,
VmaAllocator *pAllocator);
bool use_device_coherent_memory, VmaAllocator *pAllocator);

void UtilGenerateStageMessage(const uint32_t *debug_record, std::string &msg);
void UtilGenerateCommonMessage(const debug_report_data *report_data, const VkCommandBuffer commandBuffer,
Expand Down Expand Up @@ -216,6 +216,7 @@ class GpuAssistedBase : public ValidationStateTracker {
public:
bool aborted = false;
bool force_buffer_device_address;
bool force_device_coherent_memory = false;
PFN_vkSetDeviceLoaderData vkSetDeviceLoaderData;
const char *setup_vuid;
VkPhysicalDeviceFeatures supported_features{};
Expand Down
Loading