LukasBanana/LLGL

Big memory leak problem

Opened this issue · 9 comments

On macOS with the Metal renderer, memory leaks every frame.
In 3 minutes memory usage of a simple app reached 6 GB.

image

Minimal repro:

quad.metal:

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct VertexIn
{
    float2 position [[attribute(0)]];
    float2 uv       [[attribute(1)]];
};

struct VertexOut
{
    float4 position [[position]];
    float2 uv;
};

vertex VertexOut VS(VertexIn inp [[stage_in]])
{
    VertexOut outp;

    outp.position = float4(inp.position, 0.0, 1.0);
    outp.uv    = inp.uv;

    return outp;
}

fragment float4 PS(VertexOut inp [[stage_in]])
{
    return float4(inp.uv, 1.0, 1.0);
}

main.mm:

#include <stdio.h>
#include <LLGL/LLGL.h>
#include <LLGL/Platform/NativeHandle.h>
#include <LLGL/Utils/VertexFormat.h>
#include <LLGL/Utils/TypeNames.h>
#include <memory>

#define GLFW_EXPOSE_NATIVE_COCOA

#include <GLFW/glfw3.h>
#include <GLFW/glfw3native.h>
#include <LLGL/LLGL.h>

class CustomSurface : public LLGL::Surface {
public:
    CustomSurface(GLFWwindow *window, const LLGL::Extent2D& size) : m_size(size), m_wnd(window) {}
    ~CustomSurface();

    bool GetNativeHandle(void* nativeHandle, std::size_t nativeHandleSize) override;
    LLGL::Extent2D GetContentSize() const override { return m_size; };
    bool AdaptForVideoMode(LLGL::Extent2D* resolution, bool* fullscreen) override;
    void ResetPixelFormat() override {};
    LLGL::Display* FindResidentDisplay() const override { return LLGL::Display::GetPrimary(); };

    bool ProcessEvents();
private:
    LLGL::Extent2D m_size;
    GLFWwindow* m_wnd = nullptr;
};

CustomSurface::~CustomSurface() {
    glfwDestroyWindow(m_wnd);
}

bool CustomSurface::GetNativeHandle(void* nativeHandle, std::size_t nativeHandleSize) {
    auto handle = reinterpret_cast<LLGL::NativeHandle*>(nativeHandle);
    handle->responder = glfwGetCocoaWindow(m_wnd);
    return true;
}

bool CustomSurface::AdaptForVideoMode(LLGL::Extent2D *resolution, bool *fullscreen) {
    m_size = *resolution;
    glfwSetWindowSize(m_wnd, m_size.width, m_size.height);
    return true;
}

bool CustomSurface::ProcessEvents() {
    glfwPollEvents();
    return !glfwWindowShouldClose(m_wnd);
}

int main(void) {
    LLGL::Log::RegisterCallbackStd();

    if (!glfwInit()) return -1;

    GLFWwindow *window = glfwCreateWindow(1280, 720, "AAA", nullptr, nullptr);

    LLGL::Report report;
    auto context = LLGL::RenderSystem::Load("Metal", &report);

    const LLGL::Display* display = LLGL::Display::GetPrimary();
    const std::uint32_t resScale = (display != nullptr ? static_cast<std::uint32_t>(display->GetScale()) : 1u);

    const auto resolution = LLGL::Extent2D(1280 * resScale, 720 * resScale);

    LLGL::SwapChainDescriptor swapChainDesc;
    swapChainDesc.resolution = resolution;

    auto surface = std::make_shared<CustomSurface>(window, resolution);

    auto swapChain = context->CreateSwapChain(swapChainDesc, surface);
    swapChain->SetVsyncInterval(0);

    auto commands = context->CreateCommandBuffer(LLGL::CommandBufferFlags::ImmediateSubmit);

    const auto& info = context->GetRendererInfo();

    float vertices[] = {
        -0.5f, -0.5f, 0.0f, 0.0f,
        0.5f,  -0.5f, 1.0f, 0.0f,
        -0.5f, 0.5f,  0.0f, 1.0f,
        0.5f,  0.5f,  1.0f, 1.0f,
    };

    LLGL::VertexFormat vertexFormat;
    vertexFormat.AppendAttribute({"a_position", LLGL::Format::RG32Float});
    vertexFormat.AppendAttribute({"a_uv", LLGL::Format::RG32Float});
    vertexFormat.SetStride(sizeof(float) * 4);

    LLGL::BufferDescriptor vertexBufferDesc;
    vertexBufferDesc.size = sizeof(vertices);
    vertexBufferDesc.bindFlags = LLGL::BindFlags::VertexBuffer;
    vertexBufferDesc.vertexAttribs = vertexFormat.attributes;

    LLGL::Buffer* vertexBuffer = context->CreateBuffer(vertexBufferDesc, vertices);

    LLGL::ShaderDescriptor vertexShaderDesc, fragmentShaderDesc;
    vertexShaderDesc = { LLGL::ShaderType::Vertex,   "assets/shaders/quad.metal", "VS", "1.1" };
    fragmentShaderDesc = { LLGL::ShaderType::Fragment, "assets/shaders/quad.metal", "PS", "1.1" };

    vertexShaderDesc.vertex.inputAttribs = vertexFormat.attributes;

    LLGL::Shader* vertexShader = context->CreateShader(vertexShaderDesc);
    LLGL::Shader* fragmentShader = context->CreateShader(fragmentShaderDesc);

    LLGL::PipelineLayoutDescriptor pipelineLayoutDesc;

    LLGL::PipelineLayout* pipelineLayout = context->CreatePipelineLayout(pipelineLayoutDesc);

    LLGL::GraphicsPipelineDescriptor pipelineDesc;
    pipelineDesc.vertexShader = vertexShader;
    pipelineDesc.fragmentShader = fragmentShader;
    pipelineDesc.pipelineLayout = pipelineLayout;
    pipelineDesc.indexFormat = LLGL::Format::R32UInt;
    pipelineDesc.primitiveTopology = LLGL::PrimitiveTopology::TriangleStrip;
    pipelineDesc.renderPass = swapChain->GetRenderPass();

    LLGL::PipelineState* pipelineState = context->CreatePipelineState(pipelineDesc);
    if (const LLGL::Report* report = pipelineState->GetReport()) {
        if (report->HasErrors()) LLGL::Log::Errorf("%s", report->GetText());
        return -1;
    }

    double prevTick = glfwGetTime();
    
    while (surface->ProcessEvents()) {
        double currentTick = glfwGetTime();
        const double deltaTime = (currentTick - prevTick);
        prevTick = currentTick;

        commands->Begin();
        {
            commands->SetVertexBuffer(*vertexBuffer);

            commands->BeginRenderPass(*swapChain);
            {
                commands->Clear(LLGL::ClearFlags::Color, LLGL::ClearValue(0.0f, 0.0f, 0.0f, 1.0f));
                commands->SetViewport(swapChain->GetResolution());
                commands->SetPipelineState(*pipelineState);

                commands->Draw(4, 0);
            }
            commands->EndRenderPass();
        }
        commands->End();

        swapChain->Present();
    }

    return 0;
}

Thanks for the simple repro. I haven't done much memory analysis yet and my examples relied on the autorelease pool feature in macOS, which I think is fine to use, but it would be nicer not having to rely on it. Wrapping your main loop or at least all LLGL frame drawing into the following block fixes the problem:

while (ProcessEvents())
{
    @autoreleasepool
    {
        /* Frame rendering ... */
    }
}

ExampleBase does it this way, too, but I am concerned this will always fail on iOS as this platform doesn't support garbage collection iirc.

Well, with autoreleasepool memory leaks much slower, but it still leaks about 0.1 MB in 5–10 seconds.

When debugging with Xcode memory leaks faster: about 100 MB in 1 minute

Did you notice any of such leaks in the examples as well? My activity monitor showed a steady memory usage for the examples. What happens if you wrap the entire main function into the autoreleasepool-block?

FWIW: There are only a handful of retain calls in the Metal backend like in GetNativeHandle() which is also documented (see CommandBuffer.h:1004). I therefore don't think LLGL is causing a retain cycle, which means it might be possible to track this down with Xcode's leak detection tools (I won't have time for this until next week, though).

What happens if you wrap the entire main function into the autoreleasepool-block?

If I wrap the entire main function nothing happens, autoreleasepool just doesn't work and memory leaks with the speed of light.

Did you notice any of such leaks in the examples as well?

In the PBR example memory leaks about 0.1 MB in 5–10 seconds.
In the Instancing example memory leaks very slow, I ran the example for 5 minutes and memory usage was increased by about 0.4 MB.

I can't think of a tremendous amount of objects being allocated that LLGL wouldn't clean up. Can you test this by replacing the commandBuffer calls with commandBufferWithUnretainedReferences at these two places:

  1. MTDirectCommandBuffer.mm:70
  2. MTCommandQueue.mm:40

I think the Metal backend should already take care of maintaining the buffer live time during command encoding, so the default MTLCommandBuffer with strong references might not be necessary.

I can take a closer look tomorrow.

Can you test this by replacing the commandBuffer calls with commandBufferWithUnretainedReferences

Looks like nothing has changed.

With the example you posted I am having a hard time reproducing a memory leak after adding the @autoreleasepool block around the commands->Begin() and swapChain->Present() calls. That app stays around 68.2 MB pretty consistently. When I resize the window, the memory goes down to 54.1 MB and stays there even after several minutes (I have an Intel Iris Pro Graphics with Metal 1.4, but the M1* GPUs use shared memory as well). I'll try a few things without the autorelease pool, but I think this is pretty common practice since this also works with automatic reference counting (ARC) as opposed to garbage collection.

Try to capture the GPU workload in Xcode, memory usage should be increasing pretty fast.

You can do that by pressing this button:
image

Unfortunately, my Xcode version doesn't have that feature and my MacBook is too old to upgrade. So I'll have to stick with more oldschool debugging techniques.

As I'm testing a bit with explicitly releasing the MTLCommandBuffer objects, it looks like it's not feasible to ditch the autoreleasepool block without significant refactoring. I'll also be on vacation for a couple of days, but feel free to post any more details you can find and I'll continue debugging this issue when I'm back.