Kernel stops running when I click on another window.
jafioti opened this issue · 1 comments
jafioti commented
Hi,
I'm working on writing matrix multiplication kernels and I have them running in a loop. I notice when I click off the window, or three finger swipe to another tab, each kernel run starts taking 0 ms. Why is this? Here's all the code needed to reproduce. I'm running it on a M1 Pro.
Edit: I've also noticed it just happen seemingly randomly as well. Haven't been able to charactarize it well.
Cargo.toml:
[dependencies]
metal = "0.26.0"
rand = "0.8.5"
Code:
use metal::{
objc::rc::autoreleasepool, Buffer, BufferRef, CommandBuffer, CommandBufferRef, CommandQueue,
CompileOptions, ComputePassDescriptor, ComputePassDescriptorRef, ComputePipelineDescriptor,
ComputePipelineState, CounterSampleBuffer, CounterSampleBufferRef, Device,
MTLCounterSamplingPoint, MTLResourceOptions, MTLSize, NSRange,
};
use rand::{rngs::StdRng, Rng, SeedableRng};
const NUM_SAMPLES: u64 = 2;
#[inline]
#[allow(clippy::too_many_arguments)]
fn run_naive(
a_buffer: &Buffer,
b_buffer: &Buffer,
c_buffer: &Buffer,
shader: &ComputePipelineState,
dev: &Device,
command_queue: &CommandQueue,
counter_sample_buffer: &CounterSampleBufferRef,
mat_size: usize,
) {
let mut cpu_start = 0;
let mut gpu_start = 0;
dev.sample_timestamps(&mut cpu_start, &mut gpu_start);
let destination_buffer = dev.new_buffer(
(std::mem::size_of::<u64>() * NUM_SAMPLES as usize) as u64,
MTLResourceOptions::StorageModeShared,
);
let counter_sampling_point = MTLCounterSamplingPoint::AtStageBoundary;
assert!(dev.supports_counter_sampling(counter_sampling_point));
let command_buffer = command_queue.new_command_buffer();
let compute_pass_descriptor = ComputePassDescriptor::new();
handle_compute_pass_sample_buffer_attachment(compute_pass_descriptor, counter_sample_buffer);
let encoder = command_buffer.compute_command_encoder_with_descriptor(compute_pass_descriptor);
encoder.set_compute_pipeline_state(shader);
encoder.set_buffer(0, Some(a_buffer), 0);
encoder.set_buffer(1, Some(b_buffer), 0);
encoder.set_buffer(2, Some(c_buffer), 0);
encoder.set_bytes(
3,
std::mem::size_of::<u32>() as u64,
&(mat_size as u32) as *const u32 as *const _,
);
encoder.set_bytes(
4,
std::mem::size_of::<u32>() as u64,
&(mat_size as u32) as *const u32 as *const _,
);
encoder.set_bytes(
5,
std::mem::size_of::<u32>() as u64,
&(mat_size as u32) as *const u32 as *const _,
);
let thread_block_size = 32;
encoder.set_threadgroup_memory_length(
0,
thread_block_size * thread_block_size * 2 * std::mem::size_of::<f32>() as u64,
);
encoder.dispatch_thread_groups(
MTLSize {
width: (mat_size as u64 + thread_block_size - 1) / thread_block_size,
height: (mat_size as u64 + thread_block_size - 1) / thread_block_size,
depth: 1,
},
MTLSize {
width: thread_block_size,
height: thread_block_size,
depth: 1,
},
);
encoder.end_encoding();
resolve_samples_into_buffer(command_buffer, counter_sample_buffer, &destination_buffer);
command_buffer.commit();
command_buffer.wait_until_completed();
let mut cpu_end = 0;
let mut gpu_end = 0;
dev.sample_timestamps(&mut cpu_end, &mut gpu_end);
handle_timestamps(&destination_buffer, cpu_start, cpu_end, gpu_start, gpu_end);
}
fn main() {
autoreleasepool(|| {
let mat_size = 2048;
let iters = 1000;
let mut rng = StdRng::seed_from_u64(0);
let a_data: Vec<f32> = (0..(mat_size * mat_size))
.map(|_| rng.gen_range(-0.5..0.5))
.collect();
let b_data: Vec<f32> = (0..(mat_size * mat_size))
.map(|_| rng.gen_range(-0.5..0.5))
.collect();
let shader = "#include <metal_stdlib>
using namespace metal;
kernel void matmul(
device float *A [[buffer(0)]],
device float *B [[buffer(1)]],
device float *C [[buffer(2)]],
device uint& M [[buffer(3)]],
device uint& N [[buffer(4)]],
device uint& K [[buffer(5)]],
uint3 tid [[thread_position_in_grid]]
) {
uint row = tid.y;
uint column = tid.x;
if(row < M && column < N) {
float value = 0.0f;
for(int i = 0; i < K; ++i) {
value = fma(A[row * K + i], B[i * N + column], value);
}
C[row * N + column] = value;
}
}
";
let dev = Device::system_default().unwrap();
let c_buffer = dev.new_buffer(
(mat_size * std::mem::size_of::<f32>()) as u64,
MTLResourceOptions::StorageModeManaged,
);
let a_buffer = dev.new_buffer_with_data(
unsafe { std::mem::transmute(a_data.as_ptr()) },
std::mem::size_of_val(&a_data) as u64,
MTLResourceOptions::StorageModeManaged,
);
let b_buffer = dev.new_buffer_with_data(
unsafe { std::mem::transmute(b_data.as_ptr()) },
std::mem::size_of_val(&b_data) as u64,
MTLResourceOptions::StorageModeManaged,
);
let counter_sample_buffer = create_counter_sample_buffer(&dev);
let shader = compile_function("matmul", shader, &dev);
let command_queue = dev.new_command_queue();
for _ in 0..iters {
run_naive(
&a_buffer,
&b_buffer,
&c_buffer,
&shader,
&dev,
&command_queue,
&counter_sample_buffer,
mat_size,
);
}
})
}
fn compile_function(name: &str, code: &str, device: &Device) -> ComputePipelineState {
let library = device
.new_library_with_source(code, &CompileOptions::new())
.unwrap();
let pipeline_state_descriptor = ComputePipelineDescriptor::new();
pipeline_state_descriptor
.set_compute_function(Some(&library.get_function(name, None).unwrap()));
device
.new_compute_pipeline_state_with_function(
pipeline_state_descriptor.compute_function().unwrap(),
)
.unwrap()
}
fn handle_compute_pass_sample_buffer_attachment(
compute_pass_descriptor: &ComputePassDescriptorRef,
counter_sample_buffer: &CounterSampleBufferRef,
) {
let sample_buffer_attachment_descriptor = compute_pass_descriptor
.sample_buffer_attachments()
.object_at(0)
.unwrap();
sample_buffer_attachment_descriptor.set_sample_buffer(counter_sample_buffer);
sample_buffer_attachment_descriptor.set_start_of_encoder_sample_index(0);
sample_buffer_attachment_descriptor.set_end_of_encoder_sample_index(1);
}
fn resolve_samples_into_buffer(
command_buffer: &CommandBufferRef,
counter_sample_buffer: &CounterSampleBufferRef,
destination_buffer: &BufferRef,
) {
let blit_encoder = command_buffer.new_blit_command_encoder();
blit_encoder.resolve_counters(
counter_sample_buffer,
NSRange::new(0_u64, NUM_SAMPLES),
destination_buffer,
0_u64,
);
blit_encoder.end_encoding();
}
fn handle_timestamps(
resolved_sample_buffer: &BufferRef,
cpu_start: u64,
cpu_end: u64,
gpu_start: u64,
gpu_end: u64,
) {
let samples = unsafe {
std::slice::from_raw_parts(
resolved_sample_buffer.contents() as *const u64,
NUM_SAMPLES as usize,
)
};
let pass_start = samples[0];
let pass_end = samples[1];
let cpu_time_span = cpu_end - cpu_start;
let gpu_time_span = gpu_end - gpu_start;
let millis = milliseconds_between_begin(pass_start, pass_end, gpu_time_span, cpu_time_span);
println!("Compute pass duration: {millis} ms");
}
fn milliseconds_between_begin(begin: u64, end: u64, gpu_time_span: u64, cpu_time_span: u64) -> f64 {
let time_span = (end as f64) - (begin as f64);
let nanoseconds = time_span / (gpu_time_span as f64) * (cpu_time_span as f64);
nanoseconds / 1_000_000.0
}
fn create_counter_sample_buffer(device: &Device) -> CounterSampleBuffer {
let counter_sample_buffer_desc = metal::CounterSampleBufferDescriptor::new();
counter_sample_buffer_desc.set_storage_mode(metal::MTLStorageMode::Shared);
counter_sample_buffer_desc.set_sample_count(NUM_SAMPLES);
let counter_sets = device.counter_sets();
let timestamp_counter = counter_sets.iter().find(|cs| cs.name() == "timestamp");
counter_sample_buffer_desc
.set_counter_set(timestamp_counter.expect("No timestamp counter found"));
device
.new_counter_sample_buffer_with_descriptor(&counter_sample_buffer_desc)
.unwrap()
}
jafioti commented
I've also noticed that when the kernel outputs incorrect results, it's always outputting 0 after the same index. So there's some determinism here.