`copy_buffer_to_texture()` writes to the wrong location — on Apple Silicon only

Question

`copy_buffer_to_texture()` writes to the wrong location — on Apple Silicon only

Opened this issue 7 days ago · 1 comment

Description
I use copy_buffer_to_texture to copy data from a buffer to single texels of a 3D texture. On my new Apple Silicon machine, and no others, data is apparently written to incorrect portions of the texture. The same problem does not occur when the entire texture is copied.

Repro steps

Run these tests:

[dependencies]
bytemuck = "1.21.0"
itertools = "0.13.0"
pollster = "0.4.0"
wgpu = { git = "https://github.com/gfx-rs/wgpu/" } # or release 23.0.1

use std::sync::Arc;

/// Runs [`run_test`] with `use_many_writes = false`, i.e. the `single_write` path.
#[test]
fn test_single_write() {
    let use_many_writes = false;
    pollster::block_on(run_test(use_many_writes));
}

/// Runs [`run_test`] with `use_many_writes = true`, i.e. the `many_writes` path.
#[test]
fn test_scatter() {
    let use_many_writes = true;
    pollster::block_on(run_test(use_many_writes));
}

/// Shared body of both tests.
///
/// Creates a 4×4×4 `Rgba8Uint` 3D texture, fills it with either `many_writes`
/// (when `use_many_writes` is true) or `single_write`, copies the texture back
/// into a mappable buffer, and compares every texel against `texel_for_cube`.
///
/// # Panics
///
/// Panics if no adapter or device can be obtained, if mapping the readback
/// buffer fails, or if any texel read back differs from the expected value.
/// Each mismatch is printed as `(position, expected, actual)` before the final
/// assertion.
async fn run_test(use_many_writes: bool) {
    // Acquire an adapter (honoring environment overrides, per the helper's name)
    // and request a device with default limits and features.
    let (device, queue) = {
        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::default());
        let adapter = wgpu::util::initialize_adapter_from_env_or_default(&instance, None)
            .await
            .expect("no adapter");
        // Print the adapter so test output identifies the GPU and backend in use.
        dbg!(adapter.get_info());
        let (device, queue) = adapter
            .request_device(&wgpu::DeviceDescriptor::default(), None)
            .await
            .expect("failed to request_device");
        let device = Arc::new(device);
        (device, queue)
    };

    // Texture under test: 4×4×4 texels of 4 bytes each.
    let size = wgpu::Extent3d {
        width: 4,
        height: 4,
        depth_or_array_layers: 4,
    };
    let texture = {
        device.create_texture(&wgpu::TextureDescriptor {
            size,
            mip_level_count: 1,
            sample_count: 1,
            dimension: wgpu::TextureDimension::D3,
            format: wgpu::TextureFormat::Rgba8Uint,
            view_formats: &[],
            // COPY_DST for the writes under test, COPY_SRC for the readback below.
            usage: wgpu::TextureUsages::TEXTURE_BINDING
                | wgpu::TextureUsages::COPY_DST
                | wgpu::TextureUsages::COPY_SRC,
            label: None,
        })
    };

    // Fill the texture using the strategy selected by the caller.
    if use_many_writes {
        many_writes(&texture, &device, &queue);
    } else {
        single_write(&texture, &queue);
    }

    // Read the whole texture back as one `[u8; 4]` per texel.
    let light_texels: Vec<[u8; 4]> = {
        let tc = TextureCopyParameters::from_texture(&texture);
        let temp_buffer = tc.copy_texture_to_new_buffer(&device, &queue, &texture);

        // The mapping result is delivered through a callback; stash it in a
        // shared cell so it can be inspected after polling.
        let result_cell =
            Arc::new(std::sync::OnceLock::<Result<(), wgpu::BufferAsyncError>>::new());
        temp_buffer.slice(..).map_async(wgpu::MapMode::Read, {
            let result_cell = result_cell.clone();
            move |result| result_cell.set(result).unwrap()
        });
        // The code below expects the callback to have run by the time this
        // returns; otherwise it panics with "cell not set".
        device.poll(wgpu::Maintain::Wait);
        result_cell
            .get()
            .as_ref()
            .expect("cell not set")
            .as_ref()
            .expect("mapping failed");

        // The element type is `[u8; 4]`, so each texel is 1 component of 4 bytes.
        tc.copy_mapped_to_vec(1, &temp_buffer)
    };

    // Compare in `texel_iter` order (z outermost, x innermost); this relies on
    // the readback vector being laid out in that same order.
    let mut wrong_texels = Vec::new();
    for (zyx_index, cube) in texel_iter(&texture).enumerate() {
        // NOTE(review): the statement below contains no cast, so this `allow`
        // appears to be unnecessary.
        #[allow(clippy::cast_possible_wrap)]
        let expected = texel_for_cube(cube);
        let actual = light_texels[zyx_index];
        if expected != actual {
            println!("{:?}", (cube, expected, actual));
            wrong_texels.push((cube, expected, actual));
        }
    }

    let volume = size.width * size.height * size.depth_or_array_layers;
    assert!(
        wrong_texels.is_empty(),
        "out of {volume}, {len} were wrong",
        len = wrong_texels.len(),
    );
}

// -------------------------------------------------------------------------------------------------

/// Raw bytes of one texel: `COMPONENTS` values of type `u8`.
type Texel = [u8; COMPONENTS];

/// Returns the test pattern texel for the texture position `point` (`[x, y, z]`).
///
/// Each of the first three components is `10` plus the corresponding coordinate
/// (truncated to `u8`); the fourth component is always `10`.
pub fn texel_for_cube(point: [u32; 3]) -> [u8; 4] {
    let [red, green, blue] = point.map(|coordinate| 10 + (coordinate as u8));
    [red, green, blue, 10]
}

/// Number of `u8` components in a `Texel`; also used as the byte size of one
/// texel when computing buffer sizes, offsets, and row strides.
const COMPONENTS: usize = 4;

fn texel_iter(texture: &wgpu::Texture) -> impl Iterator<Item = [u32; 3]> {
    itertools::iproduct!(
        0..texture.depth_or_array_layers(),
        0..texture.height(),
        0..texture.width()
    )
    .map(|(z, y, x)| [x, y, z])
}

fn compute_data(texture: &wgpu::Texture) -> Vec<Texel> {
    let mut data = Vec::new();
    for point in texel_iter(texture) {
        data.push(texel_for_cube(point));
    }
    data
}

/// Fills the whole of `texture` with the test pattern using a single
/// `Queue::write_texture` call covering the full extent.
pub fn single_write(texture: &wgpu::Texture, queue: &wgpu::Queue) {
    let texels = compute_data(texture);

    // Destination: mip level 0, starting at the texture origin.
    let destination = wgpu::ImageCopyTexture {
        texture,
        mip_level: 0,
        origin: wgpu::Origin3d::ZERO,
        aspect: wgpu::TextureAspect::All,
    };

    // Source layout: rows and images are tightly packed, with no padding.
    let packed_layout = wgpu::ImageDataLayout {
        offset: 0,
        bytes_per_row: Some(texture.width() * COMPONENTS as u32),
        rows_per_image: Some(texture.height()),
    };

    queue.write_texture(
        destination,
        texels.as_flattened(),
        packed_layout,
        texture.size(),
    );
}

/// Fills `texture` with the test pattern using one `copy_buffer_to_texture`
/// command per texel.
///
/// The pattern is uploaded to a staging buffer in `texel_iter` order, and each
/// copy command reads one texel from the matching buffer offset and writes it to
/// the matching texture position. All copies are recorded in a single command
/// encoder; the staging buffer is written via the queue just before that
/// encoder is submitted.
pub fn many_writes(texture: &wgpu::Texture, device: &wgpu::Device, queue: &wgpu::Queue) {
    let texels: Vec<Texel> = compute_data(texture);
    let staging_byte_len = u64::try_from(texels.len() * COMPONENTS).unwrap();

    let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
        label: None,
        size: staging_byte_len,
        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC,
        mapped_at_creation: false,
    });

    let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());

    // Every copy covers exactly one texel.
    let single_texel = wgpu::Extent3d {
        width: 1,
        height: 1,
        depth_or_array_layers: 1,
    };

    for (texel_index, [x, y, z]) in texel_iter(texture).enumerate() {
        // Source: the texel at this index within the staging buffer. No row or
        // image stride is given because the copy is a single texel.
        let source = wgpu::ImageCopyBuffer {
            buffer: &staging_buffer,
            layout: wgpu::ImageDataLayout {
                offset: (texel_index * COMPONENTS) as u64,
                bytes_per_row: None,
                rows_per_image: None,
            },
        };
        // Destination: the matching position in the texture.
        let destination = wgpu::ImageCopyTexture {
            texture,
            mip_level: 0,
            origin: wgpu::Origin3d { x, y, z },
            aspect: wgpu::TextureAspect::All,
        };
        encoder.copy_buffer_to_texture(source, destination, single_texel);
    }

    // Upload the pattern, then submit the recorded copies.
    queue.write_buffer(&staging_buffer, 0, texels.as_flattened());
    queue.submit([encoder.finish()]);
}

// -------------------------------------------------------------------------------------------------

/// Elements of GPU-to-CPU copying
///
/// Holds the texture extent and per-texel byte size needed to size a readback
/// buffer, issue the texture-to-buffer copy, and strip row padding afterwards.
#[derive(Clone, Copy, Debug)]
struct TextureCopyParameters {
    /// Extent of the texture, as returned by `Texture::size()`.
    pub size: wgpu::Extent3d,
    /// Byte size of one texel, taken from the format's block copy size.
    pub byte_size_of_texel: u32,
}
impl TextureCopyParameters {
    /// Derives the copy parameters from `texture`'s size and format.
    ///
    /// # Panics
    ///
    /// Panics if the format's block dimensions are not `(1, 1)` or if the format
    /// reports no block copy size.
    pub fn from_texture(texture: &wgpu::Texture) -> Self {
        let format = texture.format();
        assert_eq!(
            format.block_dimensions(),
            (1, 1),
            "compressed texture format {format:?} not supported",
        );

        // `Option::expect` takes a plain message rather than a format string, so
        // the format has to be interpolated via `panic!` to appear in the message.
        let Some(byte_size_of_texel) = format.block_copy_size(None) else {
            panic!("non-color texture format {format:?} not supported");
        };

        Self {
            size: texture.size(),
            byte_size_of_texel,
        }
    }

    /// Number of bytes occupied by one row of texels with no padding.
    pub fn dense_bytes_per_row(&self) -> u32 {
        self.size.width * self.byte_size_of_texel
    }

    /// [`Self::dense_bytes_per_row`] rounded up to the next multiple of
    /// `wgpu::COPY_BYTES_PER_ROW_ALIGNMENT`.
    pub fn padded_bytes_per_row(&self) -> u32 {
        self.dense_bytes_per_row()
            .div_ceil(wgpu::COPY_BYTES_PER_ROW_ALIGNMENT)
            * wgpu::COPY_BYTES_PER_ROW_ALIGNMENT
    }

    /// Creates a `MAP_READ` buffer large enough for the padded texture contents
    /// and submits a command that copies all of `texture` into it.
    ///
    /// The returned buffer is not mapped; the caller must map it before reading.
    #[track_caller]
    pub fn copy_texture_to_new_buffer(
        &self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        texture: &wgpu::Texture,
    ) -> wgpu::Buffer {
        let padded_bytes_per_row = self.padded_bytes_per_row();

        // One padded row per texel row, for every row of every depth slice.
        let temp_buffer = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("GPU-to-CPU image copy buffer"),
            size: u64::from(padded_bytes_per_row)
                * u64::from(self.size.height)
                * u64::from(self.size.depth_or_array_layers),
            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
            mapped_at_creation: false,
        });

        {
            let mut encoder =
                device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
            encoder.copy_texture_to_buffer(
                texture.as_image_copy(),
                wgpu::ImageCopyBuffer {
                    buffer: &temp_buffer,
                    layout: wgpu::ImageDataLayout {
                        offset: 0,
                        bytes_per_row: Some(padded_bytes_per_row),
                        rows_per_image: Some(self.size.height),
                    },
                },
                texture.size(),
            );
            queue.submit(Some(encoder.finish()));
        }

        temp_buffer
    }

    /// Given a mapped buffer, make a [`Vec<C>`] of it.
    ///
    /// `size_of::<C>() * components` must be equal to the byte size of a texel.
    ///
    /// The buffer is read as `row_count()` rows spaced
    /// [`Self::padded_bytes_per_row`] bytes apart; only the first
    /// [`Self::dense_bytes_per_row`] bytes of each row are kept.
    ///
    /// # Panics
    ///
    /// Panics if `size_of::<C>() * components` does not equal
    /// `byte_size_of_texel`, or if the mapped range is too short for the rows
    /// being read. `buffer` is expected to be mapped already.
    pub fn copy_mapped_to_vec<C>(&self, components: usize, buffer: &wgpu::Buffer) -> Vec<C>
    where
        C: bytemuck::AnyBitPattern,
    {
        assert_eq!(
            u32::try_from(components * size_of::<C>()).ok(),
            Some(self.byte_size_of_texel),
            "Texture format does not match requested format",
        );

        // Loop-invariant sizes, computed once.
        let padded_row_len = self.padded_bytes_per_row() as usize;
        let dense_row_len = self.dense_bytes_per_row() as usize;
        let row_count = self.row_count();

        // Copy the mapped buffer data into a Rust vector, removing row padding if present
        // by copying it one row at a time.
        // Given the assertion above, each row yields `width * components` elements.
        let mut texel_vector: Vec<C> =
            Vec::with_capacity(row_count * self.size.width as usize * components);
        {
            let mapped: &[u8] = &buffer.slice(..).get_mapped_range();
            for row in 0..row_count {
                let byte_start_of_row = padded_row_len * row;
                texel_vector.extend(bytemuck::cast_slice::<u8, C>(
                    &mapped[byte_start_of_row..][..dense_row_len],
                ));
            }
        }

        texel_vector
    }

    /// Total number of texel rows: height times depth (or array layer count).
    fn row_count(&self) -> usize {
        self.size.height as usize * self.size.depth_or_array_layers as usize
    }
}

Expected vs observed behavior
The two tests should both pass, indicating identical data was written to the texture. Instead I get:

[tests/main.rs:19:9] adapter.get_info() = AdapterInfo {
    name: "Apple M4 Max",
    vendor: 0,
    device: 0,
    device_type: IntegratedGpu,
    driver: "",
    driver_info: "",
    backend: Metal,
}
([0, 1, 0], [10, 11, 10, 10], [10, 10, 10, 10])
([0, 2, 0], [10, 12, 10, 10], [10, 10, 10, 10])
([0, 3, 0], [10, 13, 10, 10], [10, 10, 10, 10])
([0, 1, 1], [10, 11, 11, 10], [10, 10, 11, 10])
([0, 2, 1], [10, 12, 11, 10], [10, 10, 11, 10])
([0, 3, 1], [10, 13, 11, 10], [10, 10, 11, 10])
([0, 1, 2], [10, 11, 12, 10], [10, 10, 12, 10])
([0, 2, 2], [10, 12, 12, 10], [10, 10, 12, 10])
([0, 3, 2], [10, 13, 12, 10], [10, 10, 12, 10])
([0, 1, 3], [10, 11, 13, 10], [10, 10, 13, 10])
([0, 2, 3], [10, 12, 13, 10], [10, 10, 13, 10])
([0, 3, 3], [10, 13, 13, 10], [10, 10, 13, 10])
thread 'test_scatter' panicked at tests/main.rs:87:5:
out of 64, 12 were wrong

which indicates that every texel with x=0 received the data belonging to y=0 instead of the data from the correct y position. However, changing the test data size suggests that the problem has more to do with a 16-byte stride than with the coordinates.

In my actual application, where the single-texel writes are to arbitrary locations in a bigger texture, the incorrect writes seem to be deterministic but not following any simple pattern, unlike the output of this test. None of them has ever written garbage data; only valid data to the wrong texel.

Platform

MacBook Pro
macOS Sequoia 15.1
Apple M4 Max CPU
target: aarch64-apple-darwin
rustc 1.83.0
wgpu 23.0.1 or git 7c75ac7 (same results)

The same bug does not appear on my other macOS machine with x86 CPU and macOS 14.7, nor does it appear with WebGPU.

Answer 1 · 2024-12-28T05:09:41.000Z

Further observation: if I make sure that the small copies are each a multiple of 16 texels (64 bytes) long, then the problem apparently does not occur.