-
This discussion began in the wgpu-users chat room. I wrote some code to find the max of an array using a GPU compute shader. It produces correct results when the source buffer's bind-group layout entry is marked `readonly: true`, but sometimes fails to find the max when that entry is writable — which is counterintuitive, as I would expect a *stronger* barrier to be inserted when both buffers are writable.
use std::convert::TryInto;
use wgpu::util::DeviceExt;
// Dependencies
/*
[dependencies]
wgpu = "0.6"
futures = "0.3"
rand = "0.8"
bytemuck = "1.5.0"
*/
// max.comp src
/*
#version 450
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(set = 0, binding = 0) buffer SrcValues {
uint value[];
} srcValues;
layout(set = 0, binding = 1) buffer DstValues {
uint value[];
} dstValues;
shared uint localValues[64];
void main() {
uint group_size = gl_WorkGroupSize.x;
uint local_id = gl_LocalInvocationID.x;
// Store the values locally
localValues[local_id] = srcValues.value[gl_GlobalInvocationID.x];
for (uint stride = group_size / 2; stride > 0; stride /= 2) {
memoryBarrierShared();
barrier();
if (local_id < stride) {
localValues[local_id] = max(localValues[local_id], localValues[local_id + stride]);
}
}
memoryBarrierShared();
barrier();
// Commit the final value
if (gl_LocalInvocationID.x == 0) {
dstValues.value[gl_WorkGroupID.x] = localValues[0];
}
memoryBarrierBuffer();
}
*/
/// Stress-test harness: repeatedly fills a large buffer with random values,
/// computes the maximum on the CPU and on the GPU, and asserts they agree.
async fn setup() {
    // 64^3 elements so the 64-wide reduction collapses in exactly 3 passes.
    let mut values = vec![0u32; 64 * 64 * 64];
    for round in 0..1_000 {
        // Regenerate the input and track the expected (CPU-side) maximum.
        let mut expected = 0u32;
        for slot in values.iter_mut() {
            *slot = rand::random::<u32>();
            expected = expected.max(*slot);
        }
        let gpu_max = gpu_exec(&values).await;
        println!("{}: cpu: {} gpu: {}", round, expected, gpu_max);
        assert_eq!(expected, gpu_max);
    }
}
/// Computes the maximum of `values` on the GPU with a reduction compute
/// shader (workgroup size 64) and returns it.
///
/// Each dispatch reduces every 64-element group of the source buffer into
/// one element of the destination buffer; passes ping-pong between two
/// storage buffers (a -> b, b -> a, ...) until a single value remains.
/// `values.len()` is assumed to be a power of 64 — TODO confirm with callers.
/// The final result is copied into a small mappable buffer and read back.
async fn gpu_exec(values: &[u32]) -> u32 {
    // Instantiates instance of WebGPU
    let instance = wgpu::Instance::new(wgpu::BackendBit::PRIMARY);
    // `request_adapter` instantiates the general connection to the GPU
    let adapter = instance
        .request_adapter(&wgpu::RequestAdapterOptions::default())
        .await
        .unwrap();
    // `request_device` instantiates the feature specific connection to the GPU, defining some parameters,
    // `features` being the available features.
    let (device, queue) = adapter
        .request_device(
            &wgpu::DeviceDescriptor {
                features: wgpu::Features::empty(),
                limits: wgpu::Limits::default(),
                shader_validation: false,
            },
            None,
        )
        .await
        .unwrap();
    // Loads the shader from the pre-compiled SPIR-V file (source in the
    // comment at the top of this file).
    let cs_module = device.create_shader_module(wgpu::include_spirv!("shaders/max.comp.spv"));
    // Gets the size in bytes of the input buffer.
    let slice_size = values.len() * std::mem::size_of::<u32>();
    let size = slice_size as wgpu::BufferAddress;
    // Ping-pong buffer A, seeded with the input values.
    let buffer_a = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
        label: None,
        contents: bytemuck::cast_slice(&values),
        usage: wgpu::BufferUsage::STORAGE | wgpu::BufferUsage::COPY_SRC,
    });
    // Ping-pong buffer B, scratch destination for the first pass.
    let buffer_b = device.create_buffer(&wgpu::BufferDescriptor {
        label: None,
        size,
        usage: wgpu::BufferUsage::STORAGE | wgpu::BufferUsage::COPY_SRC,
        mapped_at_creation: false,
    });
    // Small CPU-mappable readback buffer. The elements are u32, so the size
    // is computed with size_of::<u32>(); the original used size_of::<f32>(),
    // which only worked because both types happen to be 4 bytes.
    let out_buffer = device.create_buffer(&wgpu::BufferDescriptor {
        label: None,
        size: (64 * std::mem::size_of::<u32>()) as u64,
        usage: wgpu::BufferUsage::COPY_DST | wgpu::BufferUsage::MAP_READ,
        mapped_at_creation: false,
    });
    // Here we specify the layout of the bind group: binding 0 is the read
    // source, binding 1 the written destination of a reduction pass.
    let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
        label: None,
        entries: &[
            wgpu::BindGroupLayoutEntry {
                binding: 0, // The location
                visibility: wgpu::ShaderStage::COMPUTE, // Which shader type in the pipeline this buffer is available to.
                ty: wgpu::BindingType::StorageBuffer {
                    dynamic: false,
                    min_binding_size: std::num::NonZeroU64::new(
                        (64 * std::mem::size_of::<u32>()) as u64,
                    ),
                    readonly: true, // If this is `false` then this shader fails to find the max sometimes
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 1, // The location
                visibility: wgpu::ShaderStage::COMPUTE, // Which shader type in the pipeline this buffer is available to.
                ty: wgpu::BindingType::StorageBuffer {
                    dynamic: false,
                    min_binding_size: std::num::NonZeroU64::new(
                        (64 * std::mem::size_of::<u32>()) as u64,
                    ),
                    readonly: false,
                },
                count: None,
            },
        ],
    });
    // Instantiates the bind groups, one for each ping-pong direction:
    // group A reads buffer_a and writes buffer_b ...
    let bind_group_a = device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: None,
        layout: &bind_group_layout,
        entries: &[
            wgpu::BindGroupEntry {
                binding: 0,
                resource: wgpu::BindingResource::Buffer(buffer_a.slice(..)),
            },
            wgpu::BindGroupEntry {
                binding: 1,
                resource: wgpu::BindingResource::Buffer(buffer_b.slice(..)),
            },
        ],
    });
    // ... and group B reads buffer_b and writes buffer_a.
    let bind_group_b = device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: None,
        layout: &bind_group_layout,
        entries: &[
            wgpu::BindGroupEntry {
                binding: 0,
                resource: wgpu::BindingResource::Buffer(buffer_b.slice(..)),
            },
            wgpu::BindGroupEntry {
                binding: 1,
                resource: wgpu::BindingResource::Buffer(buffer_a.slice(..)),
            },
        ],
    });
    let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
        label: None,
        bind_group_layouts: &[&bind_group_layout],
        push_constant_ranges: &[],
    });
    let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
        label: None,
        layout: Some(&pipeline_layout),
        compute_stage: wgpu::ProgrammableStageDescriptor {
            module: &cs_module,
            entry_point: "main",
        },
    });
    let bind_groups: [_; 2] = [bind_group_a, bind_group_b];
    // One workgroup per 64 input elements; each pass divides the remaining
    // element count by 64 until a single workgroup (one value) is left.
    let mut dispatch_size = values.len() / 64;
    let mut dispatch_idx = 0;
    // A command encoder executes one or many pipelines.
    // It is to WebGPU what a command buffer is to Vulkan.
    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    {
        let mut cpass = encoder.begin_compute_pass();
        cpass.set_pipeline(&compute_pipeline);
        loop {
            // Alternate the ping-pong direction each pass via dispatch_idx.
            // println!("Dispatch {}, binding: {}", dispatch_size, dispatch_idx % 2);
            cpass.set_bind_group(0, &bind_groups[dispatch_idx % 2], &[]);
            cpass.dispatch(dispatch_size as u32, 1, 1);
            if dispatch_size == 1 {
                break;
            }
            dispatch_size /= 64;
            dispatch_idx += 1;
        }
    }
    // An even final dispatch_idx means the last pass wrote buffer_b
    // (a -> b direction); odd means it wrote buffer_a.
    if dispatch_idx % 2 == 0 {
        // Copy the data we wanted out to the transfer buffer
        encoder.copy_buffer_to_buffer(
            &buffer_b,
            0,
            &out_buffer,
            0,
            (64 * std::mem::size_of::<u32>()) as u64,
        );
    } else {
        // Copy the data we wanted out to the transfer buffer
        encoder.copy_buffer_to_buffer(
            &buffer_a,
            0,
            &out_buffer,
            0,
            (64 * std::mem::size_of::<u32>()) as u64,
        );
    }
    queue.submit(Some(encoder.finish()));
    let buffer_slice = out_buffer.slice(0..((std::mem::size_of::<u32>() * 64) as u64));
    // Gets the future representing when `out_buffer` can be read from
    let buffer_future = buffer_slice.map_async(wgpu::MapMode::Read);
    // Poll the device in a blocking manner so that our future resolves.
    // In an actual application, `device.poll(...)` should
    // be called in an event loop or on another thread.
    device.poll(wgpu::Maintain::Wait);
    if let Ok(()) = buffer_future.await {
        // Gets contents of buffer
        let data = buffer_slice.get_mapped_range();
        // Since contents are got in bytes, this converts these bytes back to u32
        let result = data
            .chunks_exact(4)
            .map(|b| u32::from_ne_bytes(b.try_into().unwrap()))
            .collect::<Vec<_>>();
        // With the current interface, we have to make sure all mapped views are
        // dropped before we unmap the buffer.
        drop(data);
        out_buffer.unmap(); // Unmaps buffer from memory
        // Only element 0 holds the reduced maximum; the remaining copied
        // elements are residue from the final 64-wide pass.
        // for d in result.chunks_exact(4) {
        //     println!("{:010} {:010} {:010} {:010}", d[0], d[1], d[2], d[3]);
        // }
        result[0]
    } else {
        panic!("failed to run compute on gpu!")
    }
}
fn main() {
futures::executor::block_on(setup());
} |
Beta Was this translation helpful? Give feedback.
Replies: 0 comments 8 replies
-
Some additional information for reproduction steps:
|
Beta Was this translation helpful? Give feedback.
-
This won't change a thing, but so I don't forget to mention it later: the item referenced here (its quote was lost in this export — likely the `shader_validation` flag or the trailing `memoryBarrierBuffer()` call) is actually no longer necessary.
Beta Was this translation helpful? Give feedback.
-
Huh. Eyeballed it somewhat carefully but I'm at a loss so far. Looks all correct to me 🤔 |
Beta Was this translation helpful? Give feedback.
-
We need to find a way to mark this as resolved :) |
Beta Was this translation helpful? Give feedback.
Huh. Eyeballed it somewhat carefully but I'm at a loss so far. Looks all correct to me 🤔