Multiple DDR banks and q.enqueueWriteBuffer

Question

Multiple DDR banks and q.enqueueWriteBuffer

RatkoFri opened this issue a year ago · 2 comments

Greetings,

I am experimenting with multiple DDR banks using the vector addition example. I am using two memory banks: one stores one input array and the output array, while the other bank stores the other input array. The code compiles successfully, but when I run hardware emulation I get this error:

[XRT] ERROR: Cannot allocate buffer at unknown memory index [XRT] ERROR: Operation failed due to earlier error 'Cannot allocate buffer at unknown memory index' [XRT] ERROR: Operation failed due to earlier error 'Cannot allocate buffer at unknown memory index'
The host code is as follows:

`

#include <iostream>
#include <fstream>
#include <iterator>
#include <vector>


// XRT includes
#include "xrt/xrt_bo.h"
#include <experimental/xrt_xclbin.h>
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

// OpenCL API 
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#include <CL/cl2.hpp>
#include <CL/cl_ext_xilinx.h>

#define DATA_SIZE 1024
#define KERNEL_CL "vadd"


using namespace std;

/**
 * Loads an entire file into memory as raw bytes.
 *
 * The file is opened in binary mode and read with a single bulk read, so
 * whitespace and NUL bytes are preserved exactly.
 *
 * @param filename Path of the file to read.
 * @return The file contents, byte for byte; an empty vector for an empty file.
 * @throws std::ios_base::failure if the file cannot be opened, its size cannot
 *         be determined, or fewer bytes than expected could be read.
 */
std::vector<unsigned char> read_binary_file(const std::string &filename)
{
    std::cout << "INFO: Reading " << filename << std::endl;

    // Open positioned at the end so tellg() directly yields the file size.
    std::ifstream file(filename, std::ios::binary | std::ios::ate);
    if (!file)
        throw std::ios_base::failure("Cannot open file: " + filename);

    // tellg() reports -1 on failure; never let that reach the allocation below.
    const std::streamoff file_size = file.tellg();
    if (file_size < 0)
        throw std::ios_base::failure("Cannot determine size of file: " + filename);
    file.seekg(0, std::ios::beg);

    std::vector<unsigned char> data(static_cast<std::size_t>(file_size));
    if (!data.empty()) {
        file.read(reinterpret_cast<char *>(data.data()),
                  static_cast<std::streamsize>(data.size()));
        // A short read means the contents would be silently truncated.
        if (file.gcount() != static_cast<std::streamsize>(data.size()))
            throw std::ios_base::failure("Short read on file: " + filename);
    }

    return data;
}

int main(int argc, char** argv) {
    
    cl_int err;

    // read arguments 
    if (argc != 3) {
        cout << "Usage: " << argv[0] << " device name kernel" << endl;
        return EXIT_FAILURE;
    }

    string binary_file = argv[2];
    cout<<binary_file<<endl;
    //***************************************************
    // STEP 0: Initialize data 
    //***************************************************
    
    vector<int> source_a(DATA_SIZE, 1); // ini
    vector<int> source_b(DATA_SIZE, 1);
    vector<int> source_c(DATA_SIZE, 0);

    //***************************************************
    // STEP 1: Get the platform 
    //***************************************************
    vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    cl::Platform platform;

    for(cl::Platform &p: platforms)
    {
        const string name = p.getInfo<CL_PLATFORM_NAME>();
        cout << "PLATFORM: " << name << endl;
        if(name == "Xilinx")
        {
            platform = p;
            break;
        }
    }

    if(platform == cl::Platform())
    {
        cout << "Xilinx platform not found!" << endl;
        exit(EXIT_FAILURE);
    }
    
    //***************************************************
    // STEP 2: Get the devices and select the desired device 
    //***************************************************

    vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
    
    cout<<"Number of devices found: " << devices.size() << endl;

    cl::Device device;
    for(cl::Device &iterDevice: devices){
        cout << "DEVICE: " << iterDevice.getInfo<CL_DEVICE_NAME>() << endl;
        if(iterDevice.getInfo<CL_DEVICE_NAME>() == argv[1])
            device = iterDevice;
    }
    
    cout << "SELECTED DEVICE: " << device.getInfo<CL_DEVICE_NAME>() << endl;
    
    //***************************************************
    // STEP 3: Create a context 
    //***************************************************
    // we create a context with the selected device using Context class 
    
    cl::Context context(device, nullptr, nullptr, nullptr, &err);
    cout << "CONTEXT ERROR: " << err << endl;

    //***************************************************
    // STEP 4: Create a command queue 
    //***************************************************
    // we create a command queue with the selected device and context using CommandQueue class 
    
    cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    cout << "COMMAND QUEUE ERROR: " << err << endl;

    //***************************************************
    // STEP 5: Create device buffers
    //***************************************************
     

    //cl::Buffer buffer_a(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, source_a.size() * sizeof(int), source_a.data(), &err);    
    //cl::Buffer buffer_b(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, source_b.size() * sizeof(int), source_b.data(), &err);

    cl::Buffer buffer_a(context, CL_MEM_READ_ONLY, source_a.size() * sizeof(int), NULL, &err);    
    cl::Buffer buffer_b(context, CL_MEM_READ_ONLY, source_b.size() * sizeof(int), NULL, &err);
    cl::Buffer buffer_res(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, source_c.size() * sizeof(int), nullptr, &err);

    //***************************************************
    // STEP 6: Create a program object for the context
    //***************************************************
    // read the kernel .xo file 

    cl::Kernel kernel;
    auto program_binary = read_binary_file(binary_file);
    cl::Program::Binaries bins{{program_binary.data(), program_binary.size()}};
    
    std::cout << "Trying to program device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    cl::Program program(context, {device}, bins, nullptr, &err);
    //***************************************************
    // STEP 6: Create the kernel object
    //***************************************************
    
    if (err != CL_SUCCESS) {
        std::cout << "Failed to program device with xclbin file!\n";
        cout << err << endl;
    } else {
        std::cout << "Device: program successful!\n";
        kernel = cl::Kernel(program, KERNEL_CL, &err);
         // we break because we found a valid device
    }
    //***************************************************
    // STEP 7: Write host data to device buffers
    //***************************************************

    
    q.enqueueWriteBuffer( buffer_a,  CL_FALSE, 0, source_a.size() * sizeof(int), source_a.data());
    q.enqueueWriteBuffer( buffer_b,  CL_FALSE, 0, source_b.size() * sizeof(int), source_b.data());
    
    //***************************************************
    // STEP 8: Set the kernel arguments
    //***************************************************

    kernel.setArg(0, buffer_res);
    kernel.setArg(1, buffer_a);
    kernel.setArg(2, buffer_b);
    kernel.setArg(3, DATA_SIZE);

    //***************************************************
    // STEP 9: Enqueue the kernel for execution
    //***************************************************

    q.enqueueTask(kernel);

    //***************************************************
    // STEP 12: Read the output buffer back to the host
    //***************************************************
    // Synchronous/blocking read of results

    vector<int> result(DATA_SIZE, 0);
    
    q.finish();

    q.enqueueReadBuffer(buffer_res, CL_TRUE, 0, result.size() * sizeof(int), result.data());


    cout << "Calculating sum of resulting array: " << endl;
    
    int sum = 0;
    for (int i = 0; i < DATA_SIZE; i++){
        sum += result[i];
    }

    cout << "SUM: " << sum << endl;
    
    return 0;
}

`

Vector kernel

#define BUFFER_SIZE 256

extern "C" {
    // Element-wise vector addition: c[k] = a[k] + b[k] for k in [0, n_elements).
    // The inputs are processed in chunks of at most BUFFER_SIZE elements, each
    // chunk being copied into a local array before the sums are written to c.
    void vadd(int* c,
        const int* a,
        const int* b,
        const int n_elements)
    {

        #pragma HLS interface m_axi port=a bundle=aximm1
        #pragma HLS interface m_axi port=b bundle=aximm2
        #pragma HLS interface m_axi port=c bundle=aximm1


        int chunk_a[BUFFER_SIZE];
        int chunk_b[BUFFER_SIZE];

    main_loop:
        for (int base = 0; base < n_elements; base += BUFFER_SIZE)
        {
            // The last chunk is shorter when n_elements is not a multiple of BUFFER_SIZE.
            const int remaining = n_elements - base;
            const int chunk_len = (remaining < BUFFER_SIZE) ? remaining : BUFFER_SIZE;

        readA:
            for (int k = 0; k < chunk_len; ++k)
                chunk_a[k] = a[base + k];

        readB:
            for (int k = 0; k < chunk_len; ++k)
                chunk_b[k] = b[base + k];

        vadd_writeC:
            for (int k = 0; k < chunk_len; ++k)
                c[base + k] = chunk_a[k] + chunk_b[k];
        }
    }
}

Config file:

# NOTE(review): presumably these enable debug information and keep intermediate build files; confirm against the v++ documentation.
debug=1
save-temps=1

[connectivity]
# Bank assignment for the arguments of vadd_1: a and c share DDR[1], b uses DDR[2].
sp=vadd_1.a:DDR[1]
sp=vadd_1.b:DDR[2]
sp=vadd_1.c:DDR[1]

[profile]
# NOTE(review): appears to request profiling data for everything ("all:all:all"); confirm the exact field meaning.
data=all:all:all

Btw, if I comment out the enqueueWriteBuffer commands and add CL_MEM_COPY_HOST_PTR when creating the buffers, everything works great.

Any suggestions?

Thanks,
Ratko

Answer 1 · 2023-12-05T05:56:35.000Z

Hi RatkoFri, if this issue is related to a tutorial, please let me know which one so that I can assign it to the author. But if it is not a tutorial-related issue, please try to ask for help from our Vitis Forum or create a support case through your sales representative.

Answer 2 · 2023-12-05T07:09:23.000Z

Thanks for the information. I will post it on Vitis Forum.