"cuda reported error 500" when compute the albedo map in "shape_from_shading" using Graph
zhangxaochen opened this issue · 1 comments
zhangxaochen commented
Dear author,
I'm trying to compute the albedo map in the example "shape_from_shading", but I get the following error cuModuleGetFunction: cuda reported error 500
:
Num Active Unknowns: 0
tval: 1
Num Active Unknowns: 0
tval: 1
Saving sfsInitDepth.ply 640x480x1
Saving targetIntensity 640x480x1
Saving initialUnknown 640x480x1
Saving maskEdgeMap 640x960x1
Using Opt v0.2.2
No unknownwise residuals for ispaces(s) U-SHIT. Creating zero-valued stand-ins.
nUnknowns = 9
nResiduals = 0 + (@parametersSym).G.N * 1 + 1 * 1
nnz = 0 + (@parametersSym).G.N * 9 + 1 * 0
E:\Github\Opt\API\src/util.t:873: cuModuleGetFunction: cuda reported error 500
stack traceback:
[C]: in function 'error'
[string "<string>"]:243: in function 'cudacompile'
E:\Github\Opt\API\src/util.t:873: in function 'makeGPUFunctions'
E:\Github\Opt\API\src/solverGPUGaussNewton.t:762: in function 'compilePlan'
E:\Github\Opt\API\src/o.t:870: in function <E:\Github\Opt\API\src/o.t:862>
[C]: in function 'xpcall'
E:\Github\Opt\API\src/o.t:862: in function <E:\Github\Opt\API\src/o.t:861>
Assertion failed: m_plan, file e:\github\opt\examples\shared\OptSolver.h, line 58
Press any key to continue . . .
I tried to ensure the validity of the variables allocated on the GPU, but still get this error... How can I debug this? could someone help me?
my source code snippet is like:
//in main.cpp:
CombinedSolverComputeLighting solverLighting(solverInputGPU, params);
printf("Solving Light Coeffs>>>>>>>>>\n");
solverLighting.solveAll();
std::vector<float> res = solverLighting.result();
for (size_t i = 0; i < res.size(); i++){
printf("res-%d: %f\n", i, res[i]);
}
printf("=======================Solved\n");
and class CombinedSolverComputeLighting:
class CombinedSolverComputeLighting : public CombinedSolverBase {
private:
//std::shared_ptr<SimpleBuffer> m_initialUnknown;
//std::shared_ptr<SimpleBuffer> m_result;
std::vector<float> m_result; //light coeffs
float *m_result_gpu;
std::vector<unsigned int> m_dims;
int *m_param_idx_gpu,
*m_data_idx_gpu;
public:
CombinedSolverComputeLighting(const SFSSolverInput& inputGPU, CombinedSolverParameters params)
{
m_combinedSolverParameters = params;
const uint ww = (uint)inputGPU.targetDepth->width(),
hh = (uint)inputGPU.targetDepth->height(),
elemNum = ww *hh,
LcoeffsNum = 9;// (for 2nd order SH)
//m_dims = { ww, hh, 1 };
m_dims = { /*ww, */elemNum, 1 };
//1, result init
m_result.resize(LcoeffsNum, 0.f); //alloc CPU memory
const size_t sizeRes9Byte = sizeof(float) * LcoeffsNum;
//float *m_result_gpu;
cudaSafeCall(cudaMalloc(&m_result_gpu, sizeRes9Byte));
cudaSafeCall(cudaMemcpy(m_result_gpu, m_result.data(), sizeRes9Byte, cudaMemcpyHostToDevice));
//2, vert idx arr, for data and params
const size_t sizeIdxMatByte = sizeof(int) * elemNum;
cudaSafeCall(cudaMalloc(&m_param_idx_gpu, sizeIdxMatByte));
cudaSafeCall(cudaMemset(m_param_idx_gpu, 0, sizeIdxMatByte)); //all 0s
cudaSafeCall(cudaMalloc(&m_data_idx_gpu, sizeIdxMatByte));
vector<int> data_idx_cpu(elemNum);
for (size_t i = 0; i < elemNum; i++){
data_idx_cpu[i] = i;
}
cudaSafeCall(cudaMemcpy(m_data_idx_gpu, data_idx_cpu.data(), sizeIdxMatByte, cudaMemcpyHostToDevice));
float3 *nmapGpu;
cudaSafeCall(cudaMalloc(&nmapGpu, sizeof(float3) * elemNum));
float fx = inputGPU.parameters.fx,
fy = inputGPU.parameters.fy,
cx = inputGPU.parameters.ux,
cy = inputGPU.parameters.uy;
computeNmap((float*)inputGPU.targetDepth->data(), nmapGpu, ww, hh, fx, fy, cx, cy);
inputGPU.setParamsComputeLighting(m_problemParams, (void *)m_result_gpu
, elemNum, (void *)m_data_idx_gpu, (void *)m_param_idx_gpu
, nmapGpu
);
addOptSolvers(m_dims, "compute_lighting.t", false);
}
virtual void combinedSolveInit() override {
m_solverParams.set("nIterations", &m_combinedSolverParameters.nonLinearIter);
m_solverParams.set("lIterations", &m_combinedSolverParameters.linearIter);
}
virtual void preSingleSolve() override {
//resetGPUMemory();
}
virtual void postSingleSolve() override {}
virtual void preNonlinearSolve(int) override {}
virtual void postNonlinearSolve(int) override {}
virtual void combinedSolveFinalize() override {
//ceresIterationComparison("Shape From Shading", m_combinedSolverParameters.optDoublePrecision);
}
std::vector<float> result() {
return m_result;
}
//void resetGPUMemory() {
// cudaSafeCall(cudaMemcpy(m_result->data(), m_initialUnknown->data(), m_dims[0] * m_dims[1] * sizeof(float), cudaMemcpyDeviceToDevice));
//}
};
and add a setter in struct SFSSolverInput, in SFSSolverInput.h:
void setParamsComputeLighting(NamedParameters &probParams, void *unknown_lvec
, int graphEdgeNum, void *ptrDataIdx, void *ptrParamIdx
, void *nmap) const
{
//0123
probParams.set("L_coeffs", unknown_lvec);
probParams.set("D_i", targetDepth->data());
probParams.set("Im", targetIntensity->data());
probParams.set("N_i", nmap);
//456 --G
probParams.set("graphEdgeNum", &graphEdgeNum);
probParams.set("ptrDataIdx", ptrDataIdx);
probParams.set("ptrParamIdx", ptrParamIdx);
}
and the computeNmap
is like:
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
__global__
void computeNmapKernel(float *dmap, float3 *nmap, int ww, int hh, float fx, float fy, float cx, float cy){
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
bool dbgPrint = false;
if (x % 100 == 0 && y % 100 == 0)
//dbgPrint = true;
;
if (x >= ww || y >= hh)
return;
int idx = y * ww + x,
idxL = y * ww + x - 1,
idxU = (y - 1)*ww + x;
if (dbgPrint){
printf("x,y:= %d, %d; idx, idxL, idxU:= %d, %d, %d\n",
x, y, idx, idxL, idxU);
}
if (x == 0 || y == 0
|| dmap[idx] == 0 || dmap[idxL] == 0 || dmap[idxU] == 0)
{
nmap[idx].x = 0;
nmap[idx].y = 0;
nmap[idx].y = 0;
if (dbgPrint){
printf("-----x,y:= %d, %d; idx, idxL, idxU:= %d, %d, %d\n",
x, y, idx, idxL, idxU);
}
return;
}
const float m2mm = 1e3; //avoid float underflow when calc nx,ny,nz
float d0 = dmap[idx] * m2mm,
dx1 = dmap[idxL] * m2mm,
dy1 = dmap[idxU] * m2mm;
float nx = dy1*(d0 - dx1) / fy,
ny = dx1*(d0 - dy1) / fx,
nz = nx*(cx - x) / fx + ny*(cy - y) / fy - dx1*dy1 / (fx*fy);
float sqLength = nx*nx + ny*ny + nz*nz;
float invLen = 1;
if (sqLength > 0)
invLen = 1.f / sqrtf(sqLength);
//normed nx,ny,nz:
float nnx = invLen*nx,
nny = invLen*ny,
nnz = invLen*nz;
if (dbgPrint){
printf("+++++x,y:= %d, %d; idx, idxL, idxU:= %d, %d, %d\n \
\ \td0, dx1, dy1: %f, %f, %f; nx, ny, nz: %f, %f, %f, ===normed-nxyz: %f, %f, %f\n",
x, y, idx, idxL, idxU,
d0, dx1, dy1, nx, ny, nz, nnx, nny, nnz);
}
nmap[idx].x = nnx;
nmap[idx].y = nny;
nmap[idx].z = nnz;
}
void computeNmap(float *dmap, float3 *nmap, int ww, int hh, float fx, float fy, float cx, float cy){
dim3 block(32, 8);
dim3 grid(divUp(ww, block.x), divUp(hh, block.y));
computeNmapKernel <<<grid, block >>>(dmap, nmap, ww, hh, fx, fy, cx, cy);
cudaSafeCall(cudaGetLastError());
}
zhangxaochen commented
forgot to say, the Opt file compute_lighting.t is like:
local N,UUU = Dim('N', 0), Dim('U-SHIT', 1)
local L_coeffs = Unknown("L_coeffs", opt_float9, {UUU}, 0) --写死9
local D_i = Array('D_i', opt_float, {N}, 1)
local Im = Array('Im', opt_float, {N}, 2)
local N_i = Array('N_i', opt_float3, {N}, 3)
--UsePreconditioner(true)
local G = Graph('G', 4,
'd', {N}, 5,
'p', {UUU}, 6
)
local function DepthValid(idx) return greater(D_i(idx),0) end
local n = N_i(G.d)
local x, y, z = n(0), n(1), n(2)
local L = L_coeffs(G.p)
local light = L(0)
+ L(1) * y
+ L(2) * z
+ L(3) * x
+ L(4) * x * y
+ L(5) * y * z
+ L(6) * (2 * z * z - x * x - y * y)
+ L(7) * z * x
+ L(8) * (x * x - y * y)
--Energy(Select(DepthValid(G.d) , light - Im(G.d), 0.0)) --ERR: invalid conversion from bool to float
--Energy(Select( abs(D_i(G.d)-0) , light - Im(G.d), 0.0)) --ERR: cuda 500
Energy(Select( D_i(G.d) , light - Im(G.d), 0.0)) --ERR: cuda 500
--Energy(light - Im(G.d))