// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html // This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory #include "precomp.hpp" #include "tsdf.hpp" #include "opencl_kernels_rgbd.hpp" namespace cv { namespace kinfu { // TODO: Optimization possible: // * volumeType can be FP16 // * weight can be int16 typedef float volumeType; struct Voxel { volumeType v; int weight; }; typedef Vec VecT; class TSDFVolumeCPU : public TSDFVolume { public: // dimension in voxels, size in meters TSDFVolumeCPU(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, float _raycastStepFactor, bool zFirstMemOrder = true); virtual void integrate(InputArray _depth, float depthFactor, cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics) override; virtual void raycast(cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics, cv::Size frameSize, cv::OutputArray points, cv::OutputArray normals) const override; virtual void fetchNormals(cv::InputArray points, cv::OutputArray _normals) const override; virtual void fetchPointsNormals(cv::OutputArray points, cv::OutputArray normals) const override; virtual void reset() override; volumeType interpolateVoxel(cv::Point3f p) const; Point3f getNormalVoxel(cv::Point3f p) const; #if USE_INTRINSICS volumeType interpolateVoxel(const v_float32x4& p) const; v_float32x4 getNormalVoxel(const v_float32x4& p) const; #endif // See zFirstMemOrder arg of parent class constructor // for the array layout info // Consist of Voxel elements Mat volume; }; TSDFVolume::TSDFVolume(Point3i _res, float _voxelSize, Affine3f _pose, float _truncDist, int _maxWeight, float _raycastStepFactor, bool zFirstMemOrder) : voxelSize(_voxelSize), voxelSizeInv(1.f/_voxelSize), volResolution(_res), maxWeight(_maxWeight), pose(_pose), raycastStepFactor(_raycastStepFactor) { // Unlike original code, this should work with any volume size // Not only when (x,y,z % 32) == 0 volSize = Point3f(volResolution) * voxelSize; truncDist = std::max(_truncDist, 2.1f * voxelSize); // (xRes*yRes*zRes) array // Depending on zFirstMemOrder arg: // &elem(x, y, z) = data + x*zRes*yRes + y*zRes + z; // &elem(x, y, z) = data + x + y*xRes + z*xRes*yRes; int xdim, ydim, zdim; if(zFirstMemOrder) { xdim = volResolution.z * volResolution.y; ydim = volResolution.z; zdim = 1; } else { xdim = 1; ydim = volResolution.x; zdim = volResolution.x * volResolution.y; } volDims = Vec4i(xdim, ydim, zdim); neighbourCoords = Vec8i( volDims.dot(Vec4i(0, 0, 0)), volDims.dot(Vec4i(0, 0, 1)), volDims.dot(Vec4i(0, 1, 0)), volDims.dot(Vec4i(0, 1, 1)), volDims.dot(Vec4i(1, 0, 0)), volDims.dot(Vec4i(1, 0, 1)), volDims.dot(Vec4i(1, 1, 0)), volDims.dot(Vec4i(1, 1, 1)) ); } // dimension in voxels, size in meters TSDFVolumeCPU::TSDFVolumeCPU(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, float _raycastStepFactor, bool zFirstMemOrder) : TSDFVolume(_res, _voxelSize, _pose, _truncDist, _maxWeight, _raycastStepFactor, zFirstMemOrder) { volume = Mat(1, volResolution.x * volResolution.y * volResolution.z, rawType()); reset(); } // zero volume, leave rest params the same void TSDFVolumeCPU::reset() { CV_TRACE_FUNCTION(); volume.forEach([](VecT& vv, const int* /* position */) { Voxel& v = reinterpret_cast(vv); v.v = 0; v.weight = 0; }); } // SIMD version of that code is manually inlined #if !USE_INTRINSICS static const bool fixMissingData = false; static inline depthType bilinearDepth(const Depth& m, cv::Point2f pt) { const depthType defaultValue = qnan; if(pt.x < 0 || pt.x >= m.cols-1 || pt.y < 0 || pt.y >= m.rows-1) return defaultValue; int xi = cvFloor(pt.x), yi = cvFloor(pt.y); const depthType* row0 = m[yi+0]; const depthType* row1 = m[yi+1]; depthType v00 = row0[xi+0]; depthType v01 = row0[xi+1]; depthType v10 = row1[xi+0]; depthType v11 = row1[xi+1]; // assume correct depth is positive bool b00 = v00 > 0; bool b01 = v01 > 0; bool b10 = v10 > 0; bool b11 = v11 > 0; if(!fixMissingData) { if(!(b00 && b01 && b10 && b11)) return defaultValue; else { float tx = pt.x - xi, ty = pt.y - yi; depthType v0 = v00 + tx*(v01 - v00); depthType v1 = v10 + tx*(v11 - v10); return v0 + ty*(v1 - v0); } } else { int nz = b00 + b01 + b10 + b11; if(nz == 0) { return defaultValue; } if(nz == 1) { if(b00) return v00; if(b01) return v01; if(b10) return v10; if(b11) return v11; } else if(nz == 2) { if(b00 && b10) v01 = v00, v11 = v10; if(b01 && b11) v00 = v01, v10 = v11; if(b00 && b01) v10 = v00, v11 = v01; if(b10 && b11) v00 = v10, v01 = v11; if(b00 && b11) v01 = v10 = (v00 + v11)*0.5f; if(b01 && b10) v00 = v11 = (v01 + v10)*0.5f; } else if(nz == 3) { if(!b00) v00 = v10 + v01 - v11; if(!b01) v01 = v00 + v11 - v10; if(!b10) v10 = v00 + v11 - v01; if(!b11) v11 = v01 + v10 - v00; } float tx = pt.x - xi, ty = pt.y - yi; depthType v0 = v00 + tx*(v01 - v00); depthType v1 = v10 + tx*(v11 - v10); return v0 + ty*(v1 - v0); } } #endif struct IntegrateInvoker : ParallelLoopBody { IntegrateInvoker(TSDFVolumeCPU& _volume, const Depth& _depth, Intr intrinsics, cv::Affine3f cameraPose, float depthFactor) : ParallelLoopBody(), volume(_volume), depth(_depth), proj(intrinsics.makeProjector()), vol2cam(cameraPose.inv() * _volume.pose), truncDistInv(1.f/_volume.truncDist), dfac(1.f/depthFactor) { volDataStart = volume.volume.ptr(); } #if USE_INTRINSICS virtual void operator() (const Range& range) const override { // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt; Point3f zStepPt = Point3f(vol2cam.matrix(0, 2), vol2cam.matrix(1, 2), vol2cam.matrix(2, 2))*volume.voxelSize; v_float32x4 zStep(zStepPt.x, zStepPt.y, zStepPt.z, 0); v_float32x4 vfxy(proj.fx, proj.fy, 0.f, 0.f), vcxy(proj.cx, proj.cy, 0.f, 0.f); const v_float32x4 upLimits = v_cvt_f32(v_int32x4(depth.cols-1, depth.rows-1, 0, 0)); for(int x = range.start; x < range.end; x++) { Voxel* volDataX = volDataStart + x*volume.volDims[0]; for(int y = 0; y < volume.volResolution.y; y++) { Voxel* volDataY = volDataX + y*volume.volDims[1]; // optimization of camSpace transformation (vector addition instead of matmul at each z) Point3f basePt = vol2cam*(Point3f((float)x, (float)y, 0)*volume.voxelSize); v_float32x4 camSpacePt(basePt.x, basePt.y, basePt.z, 0); int startZ, endZ; if(abs(zStepPt.z) > 1e-5) { int baseZ = (int)(-basePt.z / zStepPt.z); if(zStepPt.z > 0) { startZ = baseZ; endZ = volume.volResolution.z; } else { startZ = 0; endZ = baseZ; } } else { if(basePt.z > 0) { startZ = 0; endZ = volume.volResolution.z; } else { // z loop shouldn't be performed startZ = endZ = 0; } } startZ = max(0, startZ); endZ = min(volume.volResolution.z, endZ); for(int z = startZ; z < endZ; z++) { // optimization of the following: //Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f camSpacePt = vol2cam * volPt; camSpacePt += zStep; float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); if(zCamSpace <= 0.f) continue; v_float32x4 camPixVec = camSpacePt/v_setall_f32(zCamSpace); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); // leave only first 2 lanes projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); depthType v; // bilinearly interpolate depth at projected { const v_float32x4& pt = projected; // check coords >= 0 and < imgSize v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | v_reinterpret_as_u32(pt >= upLimits); limits = limits | v_rotate_right<1>(limits); if(limits.get0()) continue; // xi, yi = floor(pt) v_int32x4 ip = v_floor(pt); v_int32x4 ipshift = ip; int xi = ipshift.get0(); ipshift = v_rotate_right<1>(ipshift); int yi = ipshift.get0(); const depthType* row0 = depth[yi+0]; const depthType* row1 = depth[yi+1]; // v001 = [v(xi + 0, yi + 0), v(xi + 1, yi + 0)] v_float32x4 v001 = v_load_low(row0 + xi); // v101 = [v(xi + 0, yi + 1), v(xi + 1, yi + 1)] v_float32x4 v101 = v_load_low(row1 + xi); v_float32x4 vall = v_combine_low(v001, v101); // assume correct depth is positive // don't fix missing data if(v_check_all(vall > v_setzero_f32())) { v_float32x4 t = pt - v_cvt_f32(ip); float tx = t.get0(); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); v_float32x4 ty = v_setall_f32(t.get0()); // vx is y-interpolated between rows 0 and 1 v_float32x4 vx = v001 + ty*(v101 - v001); float v0 = vx.get0(); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); float v1 = vx.get0(); v = v0 + tx*(v1 - v0); } else continue; } // norm(camPixVec) produces double which is too slow float pixNorm = sqrt(v_reduce_sum(camPixVec*camPixVec)); // difference between distances of point and of surface to camera volumeType sdf = pixNorm*(v*dfac - zCamSpace); // possible alternative is: // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1); if(sdf >= -volume.truncDist) { volumeType tsdf = fmin(1.f, sdf * truncDistInv); Voxel& voxel = volDataY[z*volume.volDims[2]]; int& weight = voxel.weight; volumeType& value = voxel.v; // update TSDF value = (value*weight+tsdf) / (weight + 1); weight = min(weight + 1, volume.maxWeight); } } } } } #else virtual void operator() (const Range& range) const override { for(int x = range.start; x < range.end; x++) { Voxel* volDataX = volDataStart + x*volume.volDims[0]; for(int y = 0; y < volume.volResolution.y; y++) { Voxel* volDataY = volDataX+y*volume.volDims[1]; // optimization of camSpace transformation (vector addition instead of matmul at each z) Point3f basePt = vol2cam*(Point3f(x, y, 0)*volume.voxelSize); Point3f camSpacePt = basePt; // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt; Point3f zStep = Point3f(vol2cam.matrix(0, 2), vol2cam.matrix(1, 2), vol2cam.matrix(2, 2))*volume.voxelSize; int startZ, endZ; if(abs(zStep.z) > 1e-5) { int baseZ = -basePt.z / zStep.z; if(zStep.z > 0) { startZ = baseZ; endZ = volume.volResolution.z; } else { startZ = 0; endZ = baseZ; } } else { if(basePt.z > 0) { startZ = 0; endZ = volume.volResolution.z; } else { // z loop shouldn't be performed startZ = endZ = 0; } } startZ = max(0, startZ); endZ = min(volume.volResolution.z, endZ); for(int z = startZ; z < endZ; z++) { // optimization of the following: //Point3f volPt = Point3f(x, y, z)*volume.voxelSize; //Point3f camSpacePt = vol2cam * volPt; camSpacePt += zStep; if(camSpacePt.z <= 0) continue; Point3f camPixVec; Point2f projected = proj(camSpacePt, camPixVec); depthType v = bilinearDepth(depth, projected); if(v == 0) continue; // norm(camPixVec) produces double which is too slow float pixNorm = sqrt(camPixVec.dot(camPixVec)); // difference between distances of point and of surface to camera volumeType sdf = pixNorm*(v*dfac - camSpacePt.z); // possible alternative is: // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1); if(sdf >= -volume.truncDist) { volumeType tsdf = fmin(1.f, sdf * truncDistInv); Voxel& voxel = volDataY[z*volume.volDims[2]]; int& weight = voxel.weight; volumeType& value = voxel.v; // update TSDF value = (value*weight+tsdf) / (weight + 1); weight = min(weight + 1, volume.maxWeight); } } } } } #endif TSDFVolumeCPU& volume; const Depth& depth; const Intr::Projector proj; const cv::Affine3f vol2cam; const float truncDistInv; const float dfac; Voxel* volDataStart; }; // use depth instead of distance (optimization) void TSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, cv::Affine3f cameraPose, Intr intrinsics) { CV_TRACE_FUNCTION(); CV_Assert(_depth.type() == DEPTH_TYPE); Depth depth = _depth.getMat(); IntegrateInvoker ii(*this, depth, intrinsics, cameraPose, depthFactor); Range range(0, volResolution.x); parallel_for_(range, ii); } #if USE_INTRINSICS // all coordinate checks should be done in inclosing cycle inline volumeType TSDFVolumeCPU::interpolateVoxel(Point3f _p) const { v_float32x4 p(_p.x, _p.y, _p.z, 0); return interpolateVoxel(p); } inline volumeType TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const { // tx, ty, tz = floor(p) v_int32x4 ip = v_floor(p); v_float32x4 t = p - v_cvt_f32(ip); float tx = t.get0(); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); float ty = t.get0(); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); float tz = t.get0(); int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const Voxel* volData = volume.ptr(); int ix = ip.get0(); ip = v_rotate_right<1>(ip); int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iz = ip.get0(); int coordBase = ix*xdim + iy*ydim + iz*zdim; volumeType vx[8]; for(int i = 0; i < 8; i++) vx[i] = volData[neighbourCoords[i] + coordBase].v; v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]), v1357(vx[1], vx[3], vx[5], vx[7]); v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); float v0 = v0_1.get0(); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); float v1 = v0_1.get0(); return v0 + tx*(v1 - v0); } #else inline volumeType TSDFVolumeCPU::interpolateVoxel(Point3f p) const { int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; int ix = cvFloor(p.x); int iy = cvFloor(p.y); int iz = cvFloor(p.z); float tx = p.x - ix; float ty = p.y - iy; float tz = p.z - iz; int coordBase = ix*xdim + iy*ydim + iz*zdim; const Voxel* volData = volume.ptr(); volumeType vx[8]; for(int i = 0; i < 8; i++) vx[i] = volData[neighbourCoords[i] + coordBase].v; volumeType v00 = vx[0] + tz*(vx[1] - vx[0]); volumeType v01 = vx[2] + tz*(vx[3] - vx[2]); volumeType v10 = vx[4] + tz*(vx[5] - vx[4]); volumeType v11 = vx[6] + tz*(vx[7] - vx[6]); volumeType v0 = v00 + ty*(v01 - v00); volumeType v1 = v10 + ty*(v11 - v10); return v0 + tx*(v1 - v0); } #endif #if USE_INTRINSICS //gradientDeltaFactor is fixed at 1.0 of voxel size inline Point3f TSDFVolumeCPU::getNormalVoxel(Point3f _p) const { v_float32x4 p(_p.x, _p.y, _p.z, 0.f); v_float32x4 result = getNormalVoxel(p); float CV_DECL_ALIGNED(16) ares[4]; v_store_aligned(ares, result); return Point3f(ares[0], ares[1], ares[2]); } inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { if(v_check_any((p < v_float32x4(1.f, 1.f, 1.f, 0.f)) + (p >= v_float32x4((float)(volResolution.x-2), (float)(volResolution.y-2), (float)(volResolution.z-2), 1.f)) )) return nanv; v_int32x4 ip = v_floor(p); v_float32x4 t = p - v_cvt_f32(ip); float tx = t.get0(); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); float ty = t.get0(); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); float tz = t.get0(); const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const Voxel* volData = volume.ptr(); int ix = ip.get0(); ip = v_rotate_right<1>(ip); int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iz = ip.get0(); int coordBase = ix*xdim + iy*ydim + iz*zdim; float CV_DECL_ALIGNED(16) an[4]; an[0] = an[1] = an[2] = an[3] = 0.f; for(int c = 0; c < 3; c++) { const int dim = volDims[c]; float& nv = an[c]; volumeType vx[8]; for(int i = 0; i < 8; i++) vx[i] = volData[neighbourCoords[i] + coordBase + 1*dim].v - volData[neighbourCoords[i] + coordBase - 1*dim].v; v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]), v1357(vx[1], vx[3], vx[5], vx[7]); v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); float v0 = v0_1.get0(); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); float v1 = v0_1.get0(); nv = v0 + tx*(v1 - v0); } v_float32x4 n = v_load_aligned(an); v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(n*n))); return n*invNorm; } #else inline Point3f TSDFVolumeCPU::getNormalVoxel(Point3f p) const { const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const Voxel* volData = volume.ptr(); if(p.x < 1 || p.x >= volResolution.x - 2 || p.y < 1 || p.y >= volResolution.y - 2 || p.z < 1 || p.z >= volResolution.z - 2) return nan3; int ix = cvFloor(p.x); int iy = cvFloor(p.y); int iz = cvFloor(p.z); float tx = p.x - ix; float ty = p.y - iy; float tz = p.z - iz; int coordBase = ix*xdim + iy*ydim + iz*zdim; Vec3f an; for(int c = 0; c < 3; c++) { const int dim = volDims[c]; float& nv = an[c]; volumeType vx[8]; for(int i = 0; i < 8; i++) vx[i] = volData[neighbourCoords[i] + coordBase + 1*dim].v - volData[neighbourCoords[i] + coordBase - 1*dim].v; volumeType v00 = vx[0] + tz*(vx[1] - vx[0]); volumeType v01 = vx[2] + tz*(vx[3] - vx[2]); volumeType v10 = vx[4] + tz*(vx[5] - vx[4]); volumeType v11 = vx[6] + tz*(vx[7] - vx[6]); volumeType v0 = v00 + ty*(v01 - v00); volumeType v1 = v10 + ty*(v11 - v10); nv = v0 + tx*(v1 - v0); } return normalize(an); } #endif struct RaycastInvoker : ParallelLoopBody { RaycastInvoker(Points& _points, Normals& _normals, Affine3f cameraPose, Intr intrinsics, const TSDFVolumeCPU& _volume) : ParallelLoopBody(), points(_points), normals(_normals), volume(_volume), tstep(volume.truncDist * volume.raycastStepFactor), // We do subtract voxel size to minimize checks after // Note: origin of volume coordinate is placed // in the center of voxel (0,0,0), not in the corner of the voxel! boxMax(volume.volSize - Point3f(volume.voxelSize, volume.voxelSize, volume.voxelSize)), boxMin(), cam2vol(volume.pose.inv() * cameraPose), vol2cam(cameraPose.inv() * volume.pose), reproj(intrinsics.makeReprojector()) { } #if USE_INTRINSICS virtual void operator() (const Range& range) const override { const v_float32x4 vfxy(reproj.fxinv, reproj.fyinv, 0, 0); const v_float32x4 vcxy(reproj.cx, reproj.cy, 0, 0); const float (&cm)[16] = cam2vol.matrix.val; const v_float32x4 camRot0(cm[0], cm[4], cm[ 8], 0); const v_float32x4 camRot1(cm[1], cm[5], cm[ 9], 0); const v_float32x4 camRot2(cm[2], cm[6], cm[10], 0); const v_float32x4 camTrans(cm[3], cm[7], cm[11], 0); const v_float32x4 boxDown(boxMin.x, boxMin.y, boxMin.z, 0.f); const v_float32x4 boxUp(boxMax.x, boxMax.y, boxMax.z, 0.f); const v_float32x4 invVoxelSize = v_float32x4(volume.voxelSizeInv, volume.voxelSizeInv, volume.voxelSizeInv, 1.f); const float (&vm)[16] = vol2cam.matrix.val; const v_float32x4 volRot0(vm[0], vm[4], vm[ 8], 0); const v_float32x4 volRot1(vm[1], vm[5], vm[ 9], 0); const v_float32x4 volRot2(vm[2], vm[6], vm[10], 0); const v_float32x4 volTrans(vm[3], vm[7], vm[11], 0); for(int y = range.start; y < range.end; y++) { ptype* ptsRow = points[y]; ptype* nrmRow = normals[y]; for(int x = 0; x < points.cols; x++) { v_float32x4 point = nanv, normal = nanv; v_float32x4 orig = camTrans; // get direction through pixel in volume space: // 1. reproject (x, y) on projecting plane where z = 1.f v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy)*vfxy; planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); // 2. rotate to volume space planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); // 3. normalize v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed*planed))); v_float32x4 dir = planed*invNorm; // compute intersection of ray with all six bbox planes v_float32x4 rayinv = v_setall_f32(1.f)/dir; // div by zero should be eliminated by these products v_float32x4 tbottom = rayinv*(boxDown - orig); v_float32x4 ttop = rayinv*(boxUp - orig); // re-order intersections to find smallest and largest on each axis v_float32x4 minAx = v_min(ttop, tbottom); v_float32x4 maxAx = v_max(ttop, tbottom); // near clipping plane const float clip = 0.f; float tmin = max(v_reduce_max(minAx), clip); float tmax = v_reduce_min(maxAx); // precautions against getting coordinates out of bounds tmin = tmin + tstep; tmax = tmax - tstep; if(tmin < tmax) { // interpolation optimized a little orig *= invVoxelSize; dir *= invVoxelSize; int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; v_float32x4 rayStep = dir * v_setall_f32(tstep); v_float32x4 next = (orig + dir * v_setall_f32(tmin)); volumeType f = volume.interpolateVoxel(next), fnext = f; //raymarch int steps = 0; int nSteps = cvFloor((tmax - tmin)/tstep); for(; steps < nSteps; steps++) { next += rayStep; v_int32x4 ip = v_round(next); int ix = ip.get0(); ip = v_rotate_right<1>(ip); int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iz = ip.get0(); int coord = ix*xdim + iy*ydim + iz*zdim; fnext = volume.volume.at(coord).v; if(fnext != f) { fnext = volume.interpolateVoxel(next); // when ray crosses a surface if(std::signbit(f) != std::signbit(fnext)) break; f = fnext; } } // if ray penetrates a surface from outside // linearly interpolate t between two f values if(f > 0.f && fnext < 0.f) { v_float32x4 tp = next - rayStep; volumeType ft = volume.interpolateVoxel(tp); volumeType ftdt = volume.interpolateVoxel(next); // float t = tmin + steps*tstep; // float ts = t - tstep*ft/(ftdt - ft); float ts = tmin + tstep*(steps - ft/(ftdt - ft)); // avoid division by zero if(!cvIsNaN(ts) && !cvIsInf(ts)) { v_float32x4 pv = (orig + dir*v_setall_f32(ts)); v_float32x4 nv = volume.getNormalVoxel(pv); if(!isNaN(nv)) { //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little point = v_matmuladd(pv*v_float32x4(volume.voxelSize, volume.voxelSize, volume.voxelSize, 1.f), volRot0, volRot1, volRot2, volTrans); } } } } v_store((float*)(&ptsRow[x]), point); v_store((float*)(&nrmRow[x]), normal); } } } #else virtual void operator() (const Range& range) const override { const Point3f camTrans = cam2vol.translation(); const Matx33f camRot = cam2vol.rotation(); const Matx33f volRot = vol2cam.rotation(); for(int y = range.start; y < range.end; y++) { ptype* ptsRow = points[y]; ptype* nrmRow = normals[y]; for(int x = 0; x < points.cols; x++) { Point3f point = nan3, normal = nan3; Point3f orig = camTrans; // direction through pixel in volume space Point3f dir = normalize(Vec3f(camRot * reproj(Point3f(x, y, 1.f)))); // compute intersection of ray with all six bbox planes Vec3f rayinv(1.f/dir.x, 1.f/dir.y, 1.f/dir.z); Point3f tbottom = rayinv.mul(boxMin - orig); Point3f ttop = rayinv.mul(boxMax - orig); // re-order intersections to find smallest and largest on each axis Point3f minAx(min(ttop.x, tbottom.x), min(ttop.y, tbottom.y), min(ttop.z, tbottom.z)); Point3f maxAx(max(ttop.x, tbottom.x), max(ttop.y, tbottom.y), max(ttop.z, tbottom.z)); // near clipping plane const float clip = 0.f; float tmin = max(max(max(minAx.x, minAx.y), max(minAx.x, minAx.z)), clip); float tmax = min(min(maxAx.x, maxAx.y), min(maxAx.x, maxAx.z)); // precautions against getting coordinates out of bounds tmin = tmin + tstep; tmax = tmax - tstep; if(tmin < tmax) { // interpolation optimized a little orig = orig*volume.voxelSizeInv; dir = dir*volume.voxelSizeInv; Point3f rayStep = dir * tstep; Point3f next = (orig + dir * tmin); volumeType f = volume.interpolateVoxel(next), fnext = f; //raymarch int steps = 0; int nSteps = floor((tmax - tmin)/tstep); for(; steps < nSteps; steps++) { next += rayStep; int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; int ix = cvRound(next.x); int iy = cvRound(next.y); int iz = cvRound(next.z); fnext = volume.volume.at(ix*xdim + iy*ydim + iz*zdim).v; if(fnext != f) { fnext = volume.interpolateVoxel(next); // when ray crosses a surface if(std::signbit(f) != std::signbit(fnext)) break; f = fnext; } } // if ray penetrates a surface from outside // linearly interpolate t between two f values if(f > 0.f && fnext < 0.f) { Point3f tp = next - rayStep; volumeType ft = volume.interpolateVoxel(tp); volumeType ftdt = volume.interpolateVoxel(next); // float t = tmin + steps*tstep; // float ts = t - tstep*ft/(ftdt - ft); float ts = tmin + tstep*(steps - ft/(ftdt - ft)); // avoid division by zero if(!cvIsNaN(ts) && !cvIsInf(ts)) { Point3f pv = (orig + dir*ts); Point3f nv = volume.getNormalVoxel(pv); if(!isNaN(nv)) { //convert pv and nv to camera space normal = volRot * nv; // interpolation optimized a little point = vol2cam * (pv*volume.voxelSize); } } } } ptsRow[x] = toPtype(point); nrmRow[x] = toPtype(normal); } } } #endif Points& points; Normals& normals; const TSDFVolumeCPU& volume; const float tstep; const Point3f boxMax; const Point3f boxMin; const Affine3f cam2vol; const Affine3f vol2cam; const Intr::Reprojector reproj; }; void TSDFVolumeCPU::raycast(cv::Affine3f cameraPose, Intr intrinsics, Size frameSize, cv::OutputArray _points, cv::OutputArray _normals) const { CV_TRACE_FUNCTION(); CV_Assert(frameSize.area() > 0); _points.create (frameSize, POINT_TYPE); _normals.create(frameSize, POINT_TYPE); Points points = _points.getMat(); Normals normals = _normals.getMat(); RaycastInvoker ri(points, normals, cameraPose, intrinsics, *this); const int nstripes = -1; parallel_for_(Range(0, points.rows), ri, nstripes); } struct FetchPointsNormalsInvoker : ParallelLoopBody { FetchPointsNormalsInvoker(const TSDFVolumeCPU& _volume, std::vector< std::vector >& _pVecs, std::vector< std::vector >& _nVecs, bool _needNormals) : ParallelLoopBody(), vol(_volume), pVecs(_pVecs), nVecs(_nVecs), needNormals(_needNormals) { volDataStart = vol.volume.ptr(); } inline void coord(std::vector& points, std::vector& normals, int x, int y, int z, Point3f V, float v0, int axis) const { // 0 for x, 1 for y, 2 for z bool limits = false; Point3i shift; float Vc = 0.f; if(axis == 0) { shift = Point3i(1, 0, 0); limits = (x + 1 < vol.volResolution.x); Vc = V.x; } if(axis == 1) { shift = Point3i(0, 1, 0); limits = (y + 1 < vol.volResolution.y); Vc = V.y; } if(axis == 2) { shift = Point3i(0, 0, 1); limits = (z + 1 < vol.volResolution.z); Vc = V.z; } if(limits) { const Voxel& voxeld = volDataStart[(x+shift.x)*vol.volDims[0] + (y+shift.y)*vol.volDims[1] + (z+shift.z)*vol.volDims[2]]; volumeType vd = voxeld.v; if(voxeld.weight != 0 && vd != 1.f) { if((v0 > 0 && vd < 0) || (v0 < 0 && vd > 0)) { //linearly interpolate coordinate float Vn = Vc + vol.voxelSize; float dinv = 1.f/(abs(v0)+abs(vd)); float inter = (Vc*abs(vd) + Vn*abs(v0))*dinv; Point3f p(shift.x ? inter : V.x, shift.y ? inter : V.y, shift.z ? inter : V.z); { points.push_back(toPtype(vol.pose * p)); if(needNormals) normals.push_back(toPtype(vol.pose.rotation() * vol.getNormalVoxel(p*vol.voxelSizeInv))); } } } } } virtual void operator() (const Range& range) const override { std::vector points, normals; for(int x = range.start; x < range.end; x++) { const Voxel* volDataX = volDataStart + x*vol.volDims[0]; for(int y = 0; y < vol.volResolution.y; y++) { const Voxel* volDataY = volDataX + y*vol.volDims[1]; for(int z = 0; z < vol.volResolution.z; z++) { const Voxel& voxel0 = volDataY[z*vol.volDims[2]]; volumeType v0 = voxel0.v; if(voxel0.weight != 0 && v0 != 1.f) { Point3f V(Point3f((float)x + 0.5f, (float)y + 0.5f, (float)z + 0.5f)*vol.voxelSize); coord(points, normals, x, y, z, V, v0, 0); coord(points, normals, x, y, z, V, v0, 1); coord(points, normals, x, y, z, V, v0, 2); } // if voxel is not empty } } } AutoLock al(mutex); pVecs.push_back(points); nVecs.push_back(normals); } const TSDFVolumeCPU& vol; std::vector< std::vector >& pVecs; std::vector< std::vector >& nVecs; const Voxel* volDataStart; bool needNormals; mutable Mutex mutex; }; void TSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _normals) const { CV_TRACE_FUNCTION(); if(_points.needed()) { std::vector< std::vector > pVecs, nVecs; FetchPointsNormalsInvoker fi(*this, pVecs, nVecs, _normals.needed()); Range range(0, volResolution.x); const int nstripes = -1; parallel_for_(range, fi, nstripes); std::vector points, normals; for(size_t i = 0; i < pVecs.size(); i++) { points.insert(points.end(), pVecs[i].begin(), pVecs[i].end()); normals.insert(normals.end(), nVecs[i].begin(), nVecs[i].end()); } _points.create((int)points.size(), 1, POINT_TYPE); if(!points.empty()) Mat((int)points.size(), 1, POINT_TYPE, &points[0]).copyTo(_points.getMat()); if(_normals.needed()) { _normals.create((int)normals.size(), 1, POINT_TYPE); if(!normals.empty()) Mat((int)normals.size(), 1, POINT_TYPE, &normals[0]).copyTo(_normals.getMat()); } } } struct PushNormals { PushNormals(const TSDFVolumeCPU& _vol, Mat_& _nrm) : vol(_vol), normals(_nrm), invPose(vol.pose.inv()) { } void operator ()(const ptype &pp, const int * position) const { Point3f p = fromPtype(pp); Point3f n = nan3; if(!isNaN(p)) { Point3f voxPt = (invPose * p); voxPt = voxPt * vol.voxelSizeInv; n = vol.pose.rotation() * vol.getNormalVoxel(voxPt); } normals(position[0], position[1]) = toPtype(n); } const TSDFVolumeCPU& vol; Mat_& normals; Affine3f invPose; }; void TSDFVolumeCPU::fetchNormals(InputArray _points, OutputArray _normals) const { CV_TRACE_FUNCTION(); if(_normals.needed()) { Points points = _points.getMat(); CV_Assert(points.type() == POINT_TYPE); _normals.createSameSize(_points, _points.type()); Mat_ normals = _normals.getMat(); points.forEach(PushNormals(*this, normals)); } } ///////// GPU implementation ///////// #ifdef HAVE_OPENCL class TSDFVolumeGPU : public TSDFVolume { public: // dimension in voxels, size in meters TSDFVolumeGPU(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, float _raycastStepFactor); virtual void integrate(InputArray _depth, float depthFactor, cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics) override; virtual void raycast(cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics, cv::Size frameSize, cv::OutputArray _points, cv::OutputArray _normals) const override; virtual void fetchPointsNormals(cv::OutputArray points, cv::OutputArray normals) const override; virtual void fetchNormals(cv::InputArray points, cv::OutputArray normals) const override; virtual void reset() override; // See zFirstMemOrder arg of parent class constructor // for the array layout info // Array elem is CV_32FC2, read as (float, int) // TODO: optimization possible to (fp16, int16), see Voxel definition UMat volume; }; TSDFVolumeGPU::TSDFVolumeGPU(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, float _raycastStepFactor) : TSDFVolume(_res, _voxelSize, _pose, _truncDist, _maxWeight, _raycastStepFactor, false) { volume = UMat(1, volResolution.x * volResolution.y * volResolution.z, CV_32FC2); reset(); } // zero volume, leave rest params the same void TSDFVolumeGPU::reset() { CV_TRACE_FUNCTION(); volume.setTo(Scalar(0, 0)); } // use depth instead of distance (optimization) void TSDFVolumeGPU::integrate(InputArray _depth, float depthFactor, cv::Affine3f cameraPose, Intr intrinsics) { CV_TRACE_FUNCTION(); UMat depth = _depth.getUMat(); cv::String errorStr; cv::String name = "integrate"; ocl::ProgramSource source = ocl::rgbd::tsdf_oclsrc; cv::String options = "-cl-mad-enable"; ocl::Kernel k; k.create(name.c_str(), source, options, &errorStr); if(k.empty()) throw std::runtime_error("Failed to create kernel: " + errorStr); cv::Affine3f vol2cam(cameraPose.inv() * pose); float dfac = 1.f/depthFactor; Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z); Vec2f fxy(intrinsics.fx, intrinsics.fy), cxy(intrinsics.cx, intrinsics.cy); // TODO: optimization possible // Use sampler for depth (mask needed) k.args(ocl::KernelArg::ReadOnly(depth), ocl::KernelArg::PtrReadWrite(volume), ocl::KernelArg::Constant(vol2cam.matrix.val, sizeof(vol2cam.matrix.val)), voxelSize, volResGpu.val, volDims.val, fxy.val, cxy.val, dfac, truncDist, maxWeight); size_t globalSize[2]; globalSize[0] = (size_t)volResolution.x; globalSize[1] = (size_t)volResolution.y; if(!k.run(2, globalSize, NULL, true)) throw std::runtime_error("Failed to run kernel"); } void TSDFVolumeGPU::raycast(cv::Affine3f cameraPose, Intr intrinsics, Size frameSize, cv::OutputArray _points, cv::OutputArray _normals) const { CV_TRACE_FUNCTION(); CV_Assert(frameSize.area() > 0); cv::String errorStr; cv::String name = "raycast"; ocl::ProgramSource source = ocl::rgbd::tsdf_oclsrc; cv::String options = "-cl-mad-enable"; ocl::Kernel k; k.create(name.c_str(), source, options, &errorStr); if(k.empty()) throw std::runtime_error("Failed to create kernel: " + errorStr); _points.create (frameSize, CV_32FC4); _normals.create(frameSize, CV_32FC4); UMat points = _points.getUMat(); UMat normals = _normals.getUMat(); UMat vol2camGpu, cam2volGpu; Affine3f vol2cam = cameraPose.inv() * pose; Affine3f cam2vol = pose.inv() * cameraPose; Mat(cam2vol.matrix).copyTo(cam2volGpu); Mat(vol2cam.matrix).copyTo(vol2camGpu); Intr::Reprojector r = intrinsics.makeReprojector(); // We do subtract voxel size to minimize checks after // Note: origin of volume coordinate is placed // in the center of voxel (0,0,0), not in the corner of the voxel! Vec4f boxMin, boxMax(volSize.x - voxelSize, volSize.y - voxelSize, volSize.z - voxelSize); Vec2f finv(r.fxinv, r.fyinv), cxy(r.cx, r.cy); float tstep = truncDist * raycastStepFactor; Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z); k.args(ocl::KernelArg::WriteOnlyNoSize(points), ocl::KernelArg::WriteOnlyNoSize(normals), frameSize, ocl::KernelArg::PtrReadOnly(volume), ocl::KernelArg::PtrReadOnly(vol2camGpu), ocl::KernelArg::PtrReadOnly(cam2volGpu), finv.val, cxy.val, boxMin.val, boxMax.val, tstep, voxelSize, volResGpu.val, volDims.val, neighbourCoords.val); size_t globalSize[2]; globalSize[0] = (size_t)frameSize.width; globalSize[1] = (size_t)frameSize.height; if(!k.run(2, globalSize, NULL, true)) throw std::runtime_error("Failed to run kernel"); } void TSDFVolumeGPU::fetchNormals(InputArray _points, OutputArray _normals) const { CV_TRACE_FUNCTION(); if(_normals.needed()) { UMat points = _points.getUMat(); CV_Assert(points.type() == POINT_TYPE); _normals.createSameSize(_points, POINT_TYPE); UMat normals = _normals.getUMat(); cv::String errorStr; cv::String name = "getNormals"; ocl::ProgramSource source = ocl::rgbd::tsdf_oclsrc; cv::String options = "-cl-mad-enable"; ocl::Kernel k; k.create(name.c_str(), source, options, &errorStr); if(k.empty()) throw std::runtime_error("Failed to create kernel: " + errorStr); UMat volPoseGpu, invPoseGpu; Mat(pose .matrix).copyTo(volPoseGpu); Mat(pose.inv().matrix).copyTo(invPoseGpu); Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z); Size frameSize = points.size(); k.args(ocl::KernelArg::ReadOnlyNoSize(points), ocl::KernelArg::WriteOnlyNoSize(normals), frameSize, ocl::KernelArg::PtrReadOnly(volume), ocl::KernelArg::PtrReadOnly(volPoseGpu), ocl::KernelArg::PtrReadOnly(invPoseGpu), voxelSizeInv, volResGpu.val, volDims.val, neighbourCoords.val); size_t globalSize[2]; globalSize[0] = (size_t)points.cols; globalSize[1] = (size_t)points.rows; if(!k.run(2, globalSize, NULL, true)) throw std::runtime_error("Failed to run kernel"); } } void TSDFVolumeGPU::fetchPointsNormals(OutputArray points, OutputArray normals) const { CV_TRACE_FUNCTION(); if(points.needed()) { bool needNormals = normals.needed(); // 1. scan to count points in each group and allocate output arrays ocl::Kernel kscan; cv::String errorStr; ocl::ProgramSource source = ocl::rgbd::tsdf_oclsrc; cv::String options = "-cl-mad-enable"; kscan.create("scanSize", source, options, &errorStr); if(kscan.empty()) throw std::runtime_error("Failed to create kernel: " + errorStr); size_t globalSize[3]; globalSize[0] = (size_t)volResolution.x; globalSize[1] = (size_t)volResolution.y; globalSize[2] = (size_t)volResolution.z; const ocl::Device& device = ocl::Device::getDefault(); size_t wgsLimit = device.maxWorkGroupSize(); size_t memSize = device.localMemSize(); // local mem should keep a point (and a normal) for each thread in a group // use 4 float per each point and normal size_t elemSize = (sizeof(float)*4)*(needNormals ? 2 : 1); const size_t lcols = 8; const size_t lrows = 8; size_t lplanes = min(memSize/elemSize, wgsLimit)/lcols/lrows; lplanes = roundDownPow2(lplanes); size_t localSize[3] = {lcols, lrows, lplanes}; Vec3i ngroups((int)divUp(globalSize[0], (unsigned int)localSize[0]), (int)divUp(globalSize[1], (unsigned int)localSize[1]), (int)divUp(globalSize[2], (unsigned int)localSize[2])); const size_t counterSize = sizeof(int); size_t lsz = localSize[0]*localSize[1]*localSize[2]*counterSize; const int gsz[3] = {ngroups[2], ngroups[1], ngroups[0]}; UMat groupedSum(3, gsz, CV_32S, Scalar(0)); UMat volPoseGpu; Mat(pose.matrix).copyTo(volPoseGpu); Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z); kscan.args(ocl::KernelArg::PtrReadOnly(volume), volResGpu.val, volDims.val, neighbourCoords.val, ocl::KernelArg::PtrReadOnly(volPoseGpu), voxelSize, voxelSizeInv, ocl::KernelArg::Local(lsz), ocl::KernelArg::WriteOnlyNoSize(groupedSum)); if(!kscan.run(3, globalSize, localSize, true)) throw std::runtime_error("Failed to run kernel"); Mat groupedSumCpu = groupedSum.getMat(ACCESS_READ); int gpuSum = (int)cv::sum(groupedSumCpu)[0]; // should be no CPU copies when new kernel is executing groupedSumCpu.release(); // 2. fill output arrays according to per-group points count ocl::Kernel kfill; kfill.create("fillPtsNrm", source, options, &errorStr); if(kfill.empty()) throw std::runtime_error("Failed to create kernel: " + errorStr); points.create(gpuSum, 1, POINT_TYPE); UMat pts = points.getUMat(); UMat nrm; if(needNormals) { normals.create(gpuSum, 1, POINT_TYPE); nrm = normals.getUMat(); } else { // it won't access but empty args are forbidden nrm = UMat(1, 1, POINT_TYPE); } UMat atomicCtr(1, 1, CV_32S, Scalar(0)); // mem size to keep pts (and normals optionally) for all work-items in a group lsz = localSize[0]*localSize[1]*localSize[2]*elemSize; kfill.args(ocl::KernelArg::PtrReadOnly(volume), volResGpu.val, volDims.val, neighbourCoords.val, ocl::KernelArg::PtrReadOnly(volPoseGpu), voxelSize, voxelSizeInv, ((int)needNormals), ocl::KernelArg::Local(lsz), ocl::KernelArg::PtrReadWrite(atomicCtr), ocl::KernelArg::ReadOnlyNoSize(groupedSum), ocl::KernelArg::WriteOnlyNoSize(pts), ocl::KernelArg::WriteOnlyNoSize(nrm) ); if(!kfill.run(3, globalSize, localSize, true)) throw std::runtime_error("Failed to run kernel"); } } #endif cv::Ptr makeTSDFVolume(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, float _raycastStepFactor) { #ifdef HAVE_OPENCL if(cv::ocl::useOpenCL()) return cv::makePtr(_res, _voxelSize, _pose, _truncDist, _maxWeight, _raycastStepFactor); #endif return cv::makePtr(_res, _voxelSize, _pose, _truncDist, _maxWeight, _raycastStepFactor); } } // namespace kinfu } // namespace cv