Skip to content

Commit

Permalink
layers: Allow using uncached buffer for Debug Printf
Browse files Browse the repository at this point in the history
Adds option to force using AMD_DEVICE_COHERENT_MEMORY for debug printf buffer
to print messages even if VK_ERROR_DEVICE_LOST is encountered.

* Option added to layer json to be visible in vkconfig,
* Forcing extension and device feature if not enabled by application,
* Added workaround for atomic operations on uncached memory
being performed in cache anyway (and not flushed to memory),
* Added workaround to failing MapMemory after
DEVICE_LOST (occurs on AMD): When using uncached buffer,
do not unmap buffer until messages are analyzed.
  • Loading branch information
dorian-apanel-intel committed Jul 12, 2023
1 parent 886d29e commit af6ca10
Show file tree
Hide file tree
Showing 5 changed files with 191 additions and 14 deletions.
20 changes: 20 additions & 0 deletions layers/VkLayer_khronos_validation.json.in
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,26 @@
]
}
},
{
"key": "printf_uncached_buffer",
"label": "Printf using uncached buffer (ALPHA)",
"description": "Use VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD (from VK_AMD_device_coherent_memory) to allocate the destination buffer. Slower, but useful when instrumenting a shader that causes VK_ERROR_DEVICE_LOST.",
"type": "BOOL",
"default": false,
"platforms": [
"WINDOWS",
"LINUX"
],
"dependence": {
"mode": "ALL",
"settings": [
{
"key": "validate_gpu_based",
"value": "GPU_BASED_DEBUG_PRINTF"
}
]
}
},
{
"key": "printf_verbose",
"label": "Printf verbose",
Expand Down
166 changes: 155 additions & 11 deletions layers/gpu_validation/debug_printf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,87 @@
#include <iostream>
#include "generated/layer_chassis_dispatch.h"

// Pre-create-device hook for Debug Printf.
//
// Runs the base class hook first, then reads the "khronos_validation.printf_uncached_buffer" layer option.
// When the option is "true" (compared after lower-casing), this:
//   1. checks that the physical device advertises VK_AMD_device_coherent_memory and reports a setup problem
//      (setting `aborted`) if it does not;
//   2. appends the extension to the enabled extension list of the modifiable create info if it is missing;
//   3. makes sure a VkPhysicalDeviceCoherentMemoryFeaturesAMD with deviceCoherentMemory == VK_TRUE is present
//      in the create info's pNext chain.
//
// gpu         - physical device that is queried for its supported device extensions.
// pCreateInfo - application create info; only forwarded to the base class hook here.
// pAllocator  - only forwarded to the base class hook here.
// pDevice     - only forwarded to the base class hook here.
// modified_ci - cast to safe_VkDeviceCreateInfo* and patched in place.
void DebugPrintf::PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo *pCreateInfo,
                                            const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, void *modified_ci) {
    GpuAssistedBase::PreCallRecordCreateDevice(gpu, pCreateInfo, pAllocator, pDevice, modified_ci);

    std::string option_value = getLayerOption("khronos_validation.printf_uncached_buffer");
    vvl::ToLower(option_value);
    use_uncached_buffer = (option_value.compare("true") == 0);

    // Nothing to patch unless the uncached buffer was requested.
    if (!use_uncached_buffer) {
        return;
    }

    static const std::string dcm_ext{"VK_AMD_device_coherent_memory"};

    // Collect the extensions exposed by the physical device. The list stays empty if either query fails.
    std::vector<VkExtensionProperties> available_exts;
    uint32_t available_count = 0;
    if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &available_count, nullptr) == VK_SUCCESS) {
        available_exts.resize(available_count);
        if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &available_count, available_exts.data()) != VK_SUCCESS) {
            available_exts.clear();
        }
    }

    bool dcm_supported = false;
    for (const VkExtensionProperties &ext : available_exts) {
        if (dcm_ext == ext.extensionName) {
            dcm_supported = true;
        }
    }

    if (!dcm_supported) {
        ReportSetupProblem(
            device, "Debug Printf with uncached buffer requires VK_AMD_device_coherent_memory which is not supported");
        aborted = true;
        return;
    }

    // See CreateDevice() in chassis.cpp. modified_ci is a pointer to a safe struct stored on the stack.
    // This code follows the safe struct memory management scheme. That is, we must delete any memory
    // removed from the safe struct, and any additions must be allocated in a way that is compatible with
    // the safe struct destructor.
    auto *modified_create_info = static_cast<safe_VkDeviceCreateInfo *>(modified_ci);

    // Has the application already enabled the extension?
    bool already_enabled = false;
    for (uint32_t ext_idx = 0; !already_enabled && (ext_idx < modified_create_info->enabledExtensionCount); ++ext_idx) {
        already_enabled = (dcm_ext == modified_create_info->ppEnabledExtensionNames[ext_idx]);
    }

    if (!already_enabled) {
        LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF", "VK_AMD_device_coherent_memory extension not enabled but use_uncached_buffer is true. Forcing extension.");

        const uint32_t previous_count = modified_create_info->enabledExtensionCount;

        // New pointer table: the existing entries followed by one extra slot.
        const char **grown_names = new const char *[previous_count + 1];
        std::copy(modified_create_info->ppEnabledExtensionNames, modified_create_info->ppEnabledExtensionNames + previous_count,
                  grown_names);

        // Heap copy of the extension name; the zero-initialized array supplies the terminating '\0'.
        char *name_copy = new char[dcm_ext.size() + 1]{};
        std::copy(dcm_ext.begin(), dcm_ext.end(), name_copy);
        grown_names[previous_count] = name_copy;

        // Swap the new table into the safe struct, releasing the old table (but not the strings it pointed to).
        delete[] modified_create_info->ppEnabledExtensionNames;
        modified_create_info->ppEnabledExtensionNames = grown_names;
        modified_create_info->enabledExtensionCount = previous_count + 1;
    }

    auto *dcm_features = const_cast<VkPhysicalDeviceCoherentMemoryFeaturesAMD *>(
        LvlFindInChain<VkPhysicalDeviceCoherentMemoryFeaturesAMD>(modified_create_info));

    if (dcm_features == nullptr) {
        // Feature struct is absent from the chain: prepend a new one with the feature switched on.
        LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF",
                "use_uncached_buffer is true, but deviceCoherentMemory feature is not enabled. Force enabling feature.");
        auto *added_features =
            new VkPhysicalDeviceCoherentMemoryFeaturesAMD(LvlInitStruct<VkPhysicalDeviceCoherentMemoryFeaturesAMD>());
        added_features->deviceCoherentMemory = VK_TRUE;
        added_features->pNext = const_cast<void *>(modified_create_info->pNext);
        modified_create_info->pNext = added_features;
    } else if (dcm_features->deviceCoherentMemory != VK_TRUE) {
        // Feature struct is present but the feature is off: switch it on in place.
        LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF",
                "use_uncached_buffer is true, but deviceCoherentMemory feature is not enabled. Force enabling feature.");
        dcm_features->deviceCoherentMemory = VK_TRUE;
    }
}

// Perform initializations that can be done at Create Device time.
void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
if (enabled[gpu_validation]) {
Expand All @@ -42,7 +123,16 @@ void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
use_stdout = !stdout_string.compare("true");
if (getenv("DEBUG_PRINTF_TO_STDOUT")) use_stdout = true;

// GpuAssistedBase::CreateDevice will set up bindings
// Need to get this option again, because PreCallRecordCreateDevice was done
// in separate DebugPrintf instance (during VkInstance creation).
std::string use_uncached_buffer_string = getLayerOption("khronos_validation.printf_uncached_buffer");
vvl::ToLower(use_uncached_buffer_string);
use_uncached_buffer = !use_uncached_buffer_string.compare("true");
if (use_uncached_buffer) {
force_device_coherent_memory = true; // vma needs to know it.
}

// GpuAssistedBase::CreateDevice will set up bindings.
VkDescriptorSetLayoutBinding binding = {3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1,
VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT |
VK_SHADER_STAGE_TASK_BIT_EXT | VK_SHADER_STAGE_COMPUTE_BIT |
Expand Down Expand Up @@ -138,6 +228,42 @@ void DebugPrintf::PreCallRecordCreateShaderModule(VkDevice device, const VkShade
}
}

// Override GpuAssistedBase version to allow processing in case of VK_ERROR_DEVICE_LOST when using uncached buffer.
// Post-submit hook for Debug Printf. Replaces the GpuAssistedBase version so that printf output can still be
// processed when vkQueueSubmit returned VK_ERROR_DEVICE_LOST and the uncached buffer is in use.
//
// queue       - queue the work was submitted to.
// submitCount - number of entries in pSubmits.
// pSubmits    - submit infos whose command buffers are inspected and processed.
// fence       - only forwarded to the state tracker hook here.
// result      - result of the submit; processing happens on VK_SUCCESS, or on VK_ERROR_DEVICE_LOST when
//               use_uncached_buffer is set.
void DebugPrintf::PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, VkFence fence,
                                            VkResult result) {
    ValidationStateTracker::PostCallRecordQueueSubmit(queue, submitCount, pSubmits, fence, result);

    if (aborted) return;

    const bool submit_succeeded = (result == VK_SUCCESS);
    const bool lost_but_readable = (result == VK_ERROR_DEVICE_LOST) && use_uncached_buffer;
    if (!submit_succeeded && !lost_but_readable) return;

    const VkSubmitInfo *const submits_end = pSubmits + submitCount;

    // Don't QueueWaitIdle if there's nothing to process
    bool anything_to_process = false;
    for (const VkSubmitInfo *submit = pSubmits; submit != submits_end; ++submit) {
        for (uint32_t cb_idx = 0; cb_idx < submit->commandBufferCount; ++cb_idx) {
            // Every command buffer is queried, even after the first hit.
            if (CommandBufferNeedsProcessing(submit->pCommandBuffers[cb_idx])) {
                anything_to_process = true;
            }
        }
    }
    if (!anything_to_process) return;

    if (submit_succeeded) {
        SubmitBarrier(queue);

        DispatchQueueWaitIdle(queue);  /// @todo Dispatch wait idle only after SubmitBarrier() succeeded.
    } else {
        // Only reachable for VK_ERROR_DEVICE_LOST, which passed the filter above solely because of the uncached buffer.
        assert(use_uncached_buffer);
    }

    for (const VkSubmitInfo *submit = pSubmits; submit != submits_end; ++submit) {
        for (uint32_t cb_idx = 0; cb_idx < submit->commandBufferCount; ++cb_idx) {
            ProcessCommandBuffer(queue, submit->pCommandBuffers[cb_idx]);
        }
    }
}

vartype vartype_lookup(char intype) {
switch (intype) {
case 'd':
Expand Down Expand Up @@ -308,10 +434,14 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
// 9 Printf Values Word 0 (optional)
// 10 Printf Values Word 1 (optional)
uint32_t expect = debug_output_buffer[1];
if (!expect) return;
// The total size of all messages is written by AtomicAdd. Atomics on uncached memory seem to be performed in caches
// anyway and are not flushed to uncached memory at the end. In that case, expect will contain zero.
// As a workaround, just parse the messages using their individual sizes (which are written correctly).
// Can the messages be overwritten because of those atomics?
if (!expect && !use_uncached_buffer) return;

uint32_t index = spvtools::kDebugOutputDataOffset;
while (debug_output_buffer[index]) {
while (debug_output_buffer[index] && (index < output_buffer_size)) {
std::stringstream shader_message;
VkShaderModule shader_module_handle = VK_NULL_HANDLE;
VkPipeline pipeline_handle = VK_NULL_HANDLE;
Expand Down Expand Up @@ -412,7 +542,8 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
}
index += debug_record->size;
}
if ((index - spvtools::kDebugOutputDataOffset) != expect) {
if ((use_uncached_buffer && (index >= output_buffer_size)) ||
!use_uncached_buffer && (index - spvtools::kDebugOutputDataOffset) != expect) {
LogWarning(device, "UNASSIGNED-DEBUG-PRINTF",
"WARNING - Debug Printf message was truncated, likely due to a buffer size that was too small for the message");
}
Expand All @@ -429,7 +560,6 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) {
uint32_t ray_trace_index = 0;

for (auto &buffer_info : gpu_buffer_list) {
char *data;

uint32_t operation_index = 0;
if (buffer_info.pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
Expand All @@ -445,10 +575,16 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) {
assert(false);
}

VkResult result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation, (void **)&data);
VkResult result = VK_SUCCESS;
if (buffer_info.output_mem_block.data == nullptr) {
result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation,
(void **)&buffer_info.output_mem_block.data);
}
if (result == VK_SUCCESS) {
device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index, (uint32_t *)data);
device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index,
(uint32_t *)buffer_info.output_mem_block.data);
vmaUnmapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation);
buffer_info.output_mem_block.data = nullptr;
}
}
}
Expand Down Expand Up @@ -672,6 +808,9 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer,
buffer_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
VmaAllocationCreateInfo alloc_info = {};
alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
if (use_uncached_buffer) {
alloc_info.requiredFlags |= VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD;
}
result = vmaCreateBuffer(vmaAllocator, &buffer_info, &alloc_info, &output_block.buffer, &output_block.allocation, nullptr);
if (result != VK_SUCCESS) {
ReportSetupProblem(device, "Unable to allocate device memory. Device could become unstable.");
Expand All @@ -680,11 +819,16 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer,
}

// Clear the output block to zeros so that only printf values from the gpu will be present
uint32_t *data;
result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&data));
result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&output_block.data));
if (result == VK_SUCCESS) {
memset(data, 0, output_buffer_size);
vmaUnmapMemory(vmaAllocator, output_block.allocation);
memset(output_block.data, 0, output_buffer_size);
if (!use_uncached_buffer) {
vmaUnmapMemory(vmaAllocator, output_block.allocation);
output_block.data = nullptr;
} else {
// Mapping may fail after DEVICE_LOST. Keep it mapped for now.
// Will be unmapped in debug_printf_state::CommandBuffer::Process
}
}

auto desc_writes = LvlInitStruct<VkWriteDescriptorSet>();
Expand Down
6 changes: 6 additions & 0 deletions layers/gpu_validation/debug_printf.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class DebugPrintf;
// Buffer and backing allocation that receive the Debug Printf output, plus an optional host mapping of it.
struct DPFDeviceMemoryBlock {
    VkBuffer buffer;
    VmaAllocation allocation;
    // Host pointer to the mapped allocation, or nullptr while the allocation is not mapped.
    // When using the uncached buffer the mapping made at allocation time is kept until the messages are
    // processed, because mapping may fail after device is lost. Defaults to nullptr because the processing
    // code tests this pointer against nullptr to decide whether it still has to map the allocation.
    uint32_t* data = nullptr;
};

struct DPFBufferInfo {
Expand Down Expand Up @@ -86,12 +87,16 @@ class DebugPrintf : public GpuAssistedBase {
desired_features.fragmentStoresAndAtomics = true;
}

void PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, void* modified_ci) override;
void CreateDevice(const VkDeviceCreateInfo* pCreateInfo) override;
bool InstrumentShader(const vvl::span<const uint32_t>& input, std::vector<uint32_t>& new_pgm,
uint32_t* unique_shader_id) override;
void PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkShaderModule* pShaderModule,
void* csm_state_data) override;
void PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits,
VkFence fence, VkResult result) override;
std::vector<DPFSubstring> ParseFormatString(const std::string& format_string);
std::string FindFormatString(vvl::span<const uint32_t> pgm, uint32_t string_id);
void AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, DPFBufferInfo& buffer_info,
Expand Down Expand Up @@ -172,6 +177,7 @@ class DebugPrintf : public GpuAssistedBase {
void DestroyBuffer(DPFBufferInfo& buffer_info);

private:
bool use_uncached_buffer = false;
bool verbose = false;
bool use_stdout = false;
};
10 changes: 8 additions & 2 deletions layers/gpu_validation/gpu_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,8 @@ static VKAPI_ATTR void VKAPI_CALL gpuVkCmdCopyBuffer(VkCommandBuffer commandBuff
DispatchCmdCopyBuffer(commandBuffer, srcBuffer, dstBuffer, regionCount, pRegions);
}

VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address, VmaAllocator *pAllocator) {
VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address,
bool use_device_coherent_memory, VmaAllocator *pAllocator) {
VmaVulkanFunctions functions;
VmaAllocatorCreateInfo allocator_info = {};
allocator_info.instance = instance;
Expand All @@ -203,6 +204,10 @@ VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device
allocator_info.flags |= VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT;
}

if (use_device_coherent_memory) {
allocator_info.flags |= VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT;
}

functions.vkGetInstanceProcAddr = static_cast<PFN_vkGetInstanceProcAddr>(gpuVkGetInstanceProcAddr);
functions.vkGetDeviceProcAddr = static_cast<PFN_vkGetDeviceProcAddr>(gpuVkGetDeviceProcAddr);
functions.vkGetPhysicalDeviceProperties = static_cast<PFN_vkGetPhysicalDeviceProperties>(gpuVkGetPhysicalDeviceProperties);
Expand Down Expand Up @@ -373,7 +378,8 @@ void GpuAssistedBase::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
}
desc_set_bind_index = adjusted_max_desc_sets - 1;

VkResult result1 = UtilInitializeVma(instance, physical_device, device, force_buffer_device_address, &vmaAllocator);
VkResult result1 = UtilInitializeVma(instance, physical_device, device, force_buffer_device_address,
force_device_coherent_memory, &vmaAllocator);
assert(result1 == VK_SUCCESS);
desc_set_manager = std::make_unique<UtilDescriptorSetManager>(device, static_cast<uint32_t>(bindings_.size()));

Expand Down
3 changes: 2 additions & 1 deletion layers/gpu_validation/gpu_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ VALSTATETRACK_DERIVED_STATE_OBJECT(VkQueue, gpu_utils_state::Queue, QUEUE_STATE)
VALSTATETRACK_DERIVED_STATE_OBJECT(VkCommandBuffer, gpu_utils_state::CommandBuffer, CMD_BUFFER_STATE)

VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address,
VmaAllocator *pAllocator);
bool use_device_coherent_memory, VmaAllocator *pAllocator);

void UtilGenerateStageMessage(const uint32_t *debug_record, std::string &msg);
void UtilGenerateCommonMessage(const debug_report_data *report_data, const VkCommandBuffer commandBuffer,
Expand Down Expand Up @@ -216,6 +216,7 @@ class GpuAssistedBase : public ValidationStateTracker {
public:
bool aborted = false;
bool force_buffer_device_address;
bool force_device_coherent_memory = false;
PFN_vkSetDeviceLoaderData vkSetDeviceLoaderData;
const char *setup_vuid;
VkPhysicalDeviceFeatures supported_features{};
Expand Down

0 comments on commit af6ca10

Please sign in to comment.