#include "render_3d.hpp"

// Returns the sum of all entries of `dim` as an int (0 for an empty vector).
// NOTE(review): the element type was missing from the parameter in the text
// under review (`std::vector dim`). `int` is restored here because the
// accumulator seed and the return type are both int -- confirm against any
// declaration in render_3d.hpp.
int sum(std::vector<int> dim){
  return std::accumulate(dim.begin(), dim.end(), 0);
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// Scene3D Class
//////////////////////////////////////////////////////////////////////////////////////////////////

// Current time from gettimeofday(), expressed in microseconds
// (tv_sec * 1e6 + tv_usec).
unsigned long long get_timestamp_dss(){
  struct timeval now;
  gettimeofday(&now, NULL);
  return now.tv_usec + (unsigned long long)now.tv_sec * 1000000;
}

// Index triple: image id, object id and target id.
struct ImgObjInd{
  int ImageId;
  int ObjId;
  int TarId;
  ImgObjInd(int i, int j, int k) : ImageId(i), ObjId(j), TarId(k) {}
};

// One packed pixel: three 8-bit colour channels followed by a 16-bit depth
// value split into two bytes. The RGBDpixel overload of compute_xyzkernel
// combines them as (D_ << 8) | D, i.e. D is the low byte and D_ the high byte.
struct RGBDpixel{
  uint8_t R;
  uint8_t G;
  uint8_t B;
  uint8_t D;
  uint8_t D_;
};

// 3-D box: category id, a 9-float basis, a 3-float center and 3 coefficients.
// NOTE(review): later code in this file copies 15 floats starting at `base`
// to the device in one cudaMemcpy and reads elements 12..14 there, so
// base/center/coeff must stay contiguous and in this order.
struct Box3D{
  unsigned int category;
  float base[9];
  float center[3];
  float coeff[3];
};

// 2-D box: category id plus four floats (presumably top, bottom, left,
// right -- TODO confirm against the writer of this data).
struct Box2D{
  unsigned int category;
  float tblr[4];
};

// Regression target for a box: 6 floats for the 3-D difference, 4 floats for
// the 2-D difference and an integer orientation label.
struct Box3Ddiff{
  float diff[6];
  float diff2d[4];
  int oreintation;
};

// Back-projects a float depth image into 3-D points and rotates them by R.
//
// Launch layout: the kernel takes the column from blockIdx.x, the row from
// threadIdx.x and the image height from blockDim.x. There is no bounds
// guard, so it must be launched with one block per image column and exactly
// `height` threads per block (presumably <<<width, height>>> -- confirm at the
// call sites).
//
// XYZimage   : output, 3 floats per pixel, pixel (ix, iy) at 3*(iy + ix*height).
// Depthimage : input depth, pixel (ix, iy) at iy + ix*height.
// K          : reads entries 0, 2, 4 and 5 (looks like a row-major 3x3
//              intrinsic matrix: fx, cx, fy, cy -- TODO confirm).
// R          : 9 floats applied as a row-major 3x3 matrix to (tdx, tdy, tdz).
void __global__ compute_xyzkernel(float * XYZimage, float * Depthimage, float * K, float * R){
  const int ix = blockIdx.x;
  const int iy = threadIdx.x;
  const int height = blockDim.x;
  const int pixel = iy + ix * height;

  const float depth = Depthimage[pixel];

  // project the depth point to 3d (pixel coordinates are treated as 1-based)
  const float tdx = (float(ix + 1) - K[2]) * depth / K[0];
  const float tdz = - (float(iy + 1) - K[5]) * depth / K[4];
  const float tdy = depth;

  float * point = XYZimage + 3 * pixel;
  point[0] = R[0] * tdx + R[1] * tdy + R[2] * tdz;
  point[1] = R[3] * tdx + R[4] * tdy + R[5] * tdz;
  point[2] = R[6] * tdx + R[7] * tdy + R[8] * tdz;
}

// Overload that reads depth from packed RGBDpixel data instead of a float
// image; same launch layout as the overload above.
// NOTE: this definition continues beyond this span. The statements below are
// the original opening of its body, reformatted only.
void __global__ compute_xyzkernel(float * XYZimage, RGBDpixel * RGBDimage, float * K, float * R){
  int ix = blockIdx.x;
  int iy = threadIdx.x;
  int height = blockDim.x;
  //
  //float depth = float(*((uint16_t*)(&(RGBDimage[iy + ix * height].D))))/1000.0;
  uint16_t D = (uint16_t)RGBDimage[iy + ix * height].D;
  uint16_t D_ = (uint16_t)RGBDimage[iy + ix * height].D_;
  D_ = D_<<8;
  float depth = float(D|D_)/1000.0;
//printf("%d,%d,%f\n",RGBDimage[iy + ix * height].D,D_,depth); // project the depth point to 3d float tdx = (float(ix + 1) - K[2]) * depth / K[0]; float tdz = - (float(iy + 1) - K[5]) * depth / K[4]; float tdy = depth; XYZimage[3 * (iy + ix * height) + 0] = R[0] * tdx + R[1] * tdy + R[2] * tdz; XYZimage[3 * (iy + ix * height) + 1] = R[3] * tdx + R[4] * tdy + R[5] * tdz; XYZimage[3 * (iy + ix * height) + 2] = R[6] * tdx + R[7] * tdy + R[8] * tdz; } void __global__ fillInBeIndexFull(unsigned int* beIndexFull, unsigned int* beIndex, unsigned int* beLinIdx, unsigned int len_beLinIdx){ const int index = threadIdx.x + blockIdx.x * blockDim.x; if (index>=len_beLinIdx) { return; } else{ beIndexFull[2*beLinIdx[index]+0] = beIndex[2*index+0]; beIndexFull[2*beLinIdx[index]+1] = beIndex[2*index+1]; } } class sceneMesh{ public: std::vector objects; std::string mesh_file; }; struct mesh_meta{ // x : left right // y : height // z : depth std::string mesh_file; float base[9];// oreintation float center[3]; float coeff[3]; }; enum Scene3DType { RGBD, Render, Mesh }; class Scene3D{ public: // defined in .list file std::vector mesh_List; std::string filename; std::string seqname; float K[9]; float R[9]; unsigned int width; unsigned int height; unsigned int len_pcIndex; unsigned int len_beIndex; unsigned int len_beLinIdx; std::vector objects; std::vector objects_2d_tight; std::vector objects_2d_full; bool GPUdata; Scene3DType DataType; // defined in .data file unsigned int* grid_range; float* begin_range; float grid_delta; RGBDpixel* RGBDimage; unsigned int* beIndex; unsigned int* beLinIdx; unsigned int* pcIndex; float* XYZimage; float* K_GPU; float* R_GPU; //Scene3D(): RGBDimage(NULL), beIndex(NULL), pcIndex(NULL), beLinIdx(NULL),XYZimage(NULL), grid_range(NULL), begin_range(NULL),K_GPU(NULL),R_GPU(NULL),GPUdata(false),isMesh(false){}; Scene3D(){ RGBDimage = NULL; beIndex = NULL; pcIndex = NULL; beLinIdx = NULL; XYZimage = NULL; grid_range = NULL; begin_range = NULL; K_GPU = NULL; 
R_GPU = NULL; GPUdata = false; DataType = RGBD; }; void compute_xyz() { XYZimage = new float[width*height*3]; //printf("scene->K:%f,%f,%f\n%f,%f,%f\n%f,%f,%f\n",K[0],K[1],K[2],K[3],K[4],K[5],K[6],K[7],K[8]); for (int ix = 0; ix < width; ix++){ for (int iy = 0; iy < height; iy++){ float depth = float(*((uint16_t*)(&(RGBDimage[iy + ix * height].D))))/1000.0; //printf("%d,%f\n",RGBDimage[iy + ix * height].D,RGBDimage[iy + ix * height].D_,depth); // project the depth point to 3d float tdx = (float(ix + 1) - K[2]) * depth / K[0]; float tdz = - (float(iy + 1) - K[5]) * depth / K[4]; float tdy = depth; XYZimage[3 * (iy + ix * height) + 0] = R[0] * tdx + R[1] * tdy + R[2] * tdz; XYZimage[3 * (iy + ix * height) + 1] = R[3] * tdx + R[4] * tdy + R[5] * tdz; XYZimage[3 * (iy + ix * height) + 2] = R[6] * tdx + R[7] * tdy + R[8] * tdz; } } }; void compute_xyzGPU() { if (!GPUdata){ std::cout<< "Data is not at GPU cannot compute_xyz at GPU"<>>(XYZimage,RGBDimage,K_GPU,R_GPU); } void loadData2XYZimage(){ //enum Scene3DType { RGBD, Render, Mesh }; switch(DataType){ case RGBD: { this ->load(); this -> cpu2gpu(); this -> compute_xyzGPU(); } break; case Mesh: { this ->loadMesh2XYZimage(); } break; case Render: { this ->loadrender2XYZimage(); } break; } }; void loadMesh2XYZimage(){ std::vector mesh_models(mesh_List.size()); for (int i = 0 ;i < mesh_List.size();++i){ mesh_models[i] = new Mesh3D(mesh_List[i].mesh_file); // scale and rotate and move to its center mesh_models[i]->zeroCenter(); float scale_ratio = mesh_models[i]->scaleMesh(mesh_List[i].coeff); mesh_models[i]->roateMesh(R); mesh_models[i]->translate(mesh_List[i].center); } float camRT[12] ={0}; for (int i = 0; i<3; ++i){ for (int j = 0; j<3; ++j){ camRT[i*4+j] = R[j*3+i]; } } float P[12]; getProjectionMatrix(P,K, camRT); float* depth = renderDepth(mesh_models, P, width, height); // copy to GPU checkCUDA(__LINE__, cudaMemcpy(K_GPU, (float*)K, sizeof(float)*9, cudaMemcpyHostToDevice)); checkCUDA(__LINE__, cudaMemcpy(R_GPU, 
(float*)R, sizeof(float)*9, cudaMemcpyHostToDevice)); float * depth_GPU; checkCUDA(__LINE__, cudaMalloc(&depth_GPU, sizeof(float)*width*height)); checkCUDA(__LINE__, cudaMemcpy(depth_GPU, (float*)depth, sizeof(float)*width*height, cudaMemcpyHostToDevice)); // compute XYZimage checkCUDA(__LINE__, cudaMalloc(&XYZimage, sizeof(float)*width*height*3)); compute_xyzkernel<<>>(XYZimage,depth_GPU,K_GPU,R_GPU); // free memory checkCUDA(__LINE__, cudaFree(depth_GPU)); delete[] depth; for (int i = 0 ;i * depth = new Tensor(filename); std::vector*> depthRender = readTensors(filename); float* depth = new float[width*height]; for(int ix=0; ixCPUmem[ix + iy * width] ; } //pbufferD[i] = float( m_near / (1.0 - double(pDepthBuffer[i])/double(4294967296)) ); } float * depth_GPU; //checkCUDA(__LINE__,cudaDeviceSynchronize()); checkCUDA(__LINE__, cudaMalloc(&K_GPU, sizeof(float)*9)); checkCUDA(__LINE__, cudaMemcpy(K_GPU, (float*)K, sizeof(float)*9, cudaMemcpyHostToDevice)); checkCUDA(__LINE__, cudaMalloc(&R_GPU, sizeof(float)*9)); checkCUDA(__LINE__, cudaMemcpy(R_GPU, (float*)R, sizeof(float)*9, cudaMemcpyHostToDevice)); checkCUDA(__LINE__, cudaMalloc(&depth_GPU, sizeof(float)*width*height)); checkCUDA(__LINE__, cudaMemcpy(depth_GPU, (float*)depth, sizeof(float)*width*height, cudaMemcpyHostToDevice)); //checkCUDA(__LINE__,cudaDeviceSynchronize()); checkCUDA(__LINE__, cudaMalloc(&XYZimage, sizeof(float)*width*height*3)); //checkCUDA(__LINE__,cudaDeviceSynchronize()); compute_xyzkernel<<>>(XYZimage,depth_GPU,K_GPU,R_GPU); //checkCUDA(__LINE__,cudaDeviceSynchronize()); checkCUDA(__LINE__, cudaFree(depth_GPU)); for (int i = 0 ;i >>(beIndexFull,beIndex,beLinIdx,len_beLinIdx); checkCUDA(__LINE__,cudaGetLastError()); checkCUDA(__LINE__, cudaFree(beIndex)); beIndex = NULL; checkCUDA(__LINE__, cudaFree(beLinIdx)); beLinIdx = NULL; beIndex = beIndexFull; if (pcIndex!=NULL){ unsigned int* pcIndexCPU = pcIndex; checkCUDA(__LINE__, cudaMalloc(&pcIndex, sizeof(unsigned int)*len_pcIndex)); 
checkCUDA(__LINE__, cudaMemcpy(pcIndex, pcIndexCPU,sizeof(unsigned int)*len_pcIndex, cudaMemcpyHostToDevice)); delete [] pcIndexCPU; } else{ std::cout << "pcIndexCPU is NULL"< volume_size) return; float delta_x = 2 * bb3d_data[12] / float(tsdf_size); float delta_y = 2 * bb3d_data[13] / float(tsdf_size1); float delta_z = 2 * bb3d_data[14] / float(tsdf_size2); float surface_thick = 0.1; const float MaxDis = surface_thick + 20; //printf("delta_x:%f,%f,%f\n",R_data[0],R_data[1],R_data[2]); // caculate tsdf for this box /* float x = float(index % tsdf_size); float y = float((index / tsdf_size) % tsdf_size); float z = float((index / tsdf_size / tsdf_size) % tsdf_size); */ float x = float((index / (tsdf_size1*tsdf_size2))%tsdf_size) ; float y = float((index / tsdf_size2) % tsdf_size1); float z = float(index % tsdf_size2); for (int i =0;i= im_w || iy < 0 || iy >= im_h || zz < 0.0001){ return; } // find the most nearby point float disTosurfaceMin = MaxDis; int idx_min = 0; int x_grid = floor((x-range[0])/grid_delta); int y_grid = floor((y-range[1])/grid_delta); int z_grid = floor((z-range[2])/grid_delta); //grid_range = [w,d,h]; linearInd =x(i)*d*h+y(i)*h+z(i); //if (x_grid < 0 || x_grid >= grid_range[0] || y_grid < 0 || y_grid >= grid_range[1] || z_grid < 0 || z_grid >= grid_range[2]){ if (x_grid < 0 || x_grid > grid_range[0] || y_grid < 0 || y_grid > grid_range[1] || z_grid < 0 || z_grid > grid_range[2]){ return; } int linearInd =x_grid*grid_range[1]*grid_range[2]+y_grid*grid_range[2]+z_grid; int search_region =1; if (star_end_indx_data[2*linearInd+0]>0){ search_region =0; } int find_close_point = -1; while(find_close_point<0&&search_region<3){ for (int iix = max(0,x_grid-search_region); iix < min((int)grid_range[0],x_grid+search_region+1); iix++){ for (int iiy = max(0,y_grid-search_region); iiy < min((int)grid_range[1],y_grid+search_region+1); iiy++){ for (int iiz = max(0,z_grid-search_region); iiz < min((int)grid_range[2],z_grid+search_region+1); iiz++){ unsigned int 
iilinearInd = iix*grid_range[1]*grid_range[2] + iiy*grid_range[2] + iiz; for (int pid = star_end_indx_data[2*iilinearInd+0]-1; pid < star_end_indx_data[2*iilinearInd+1]-1;pid++){ //printf("%d-%d\n",star_end_indx_data[2*iilinearInd+0],star_end_indx_data[2*iilinearInd+1]); unsigned int p_idx_lin = pc_lin_indx_data[pid]; float xp = XYZimage[3*p_idx_lin+0]; float yp = XYZimage[3*p_idx_lin+1]; float zp = XYZimage[3*p_idx_lin+2]; // distance float xd = abs(x - xp); float yd = abs(y - yp); float zd = abs(z - zp); if (xd < 2.0 * delta_x||yd < 2.0 * delta_x|| zd < 2.0 * delta_x){ float disTosurface = sqrt(xd * xd + yd * yd + zd * zd); if (disTosurface < disTosurfaceMin){ disTosurfaceMin = disTosurface; idx_min = p_idx_lin; find_close_point = 1; //printf("x:%f,%f,%f,xp,%f,%f,%f,xd%f,%f,%f,%f\n",x,y,z,xp,yp,zp,xd,yd,zd,disTosurfaceMin); } } } // for all points in this grid } } } search_region ++; }//while float tsdf_x = MaxDis; float tsdf_y = MaxDis; float tsdf_z = MaxDis; float color_b =0; float color_g =0; float color_r =0; float xnear = 0; float ynear = 0; float znear = 0; if (find_close_point>0){ xnear = XYZimage[3*idx_min+0]; ynear = XYZimage[3*idx_min+1]; znear = XYZimage[3*idx_min+2]; tsdf_x = abs(x - xnear); tsdf_y = abs(y - ynear); tsdf_z = abs(z - znear); color_b = float(RGBDimage[idx_min].B)/255.0; color_g = float(RGBDimage[idx_min].G)/255.0; color_r = float(RGBDimage[idx_min].R)/255.0; //printf("x:%f,tsdf_x:%f,%f,%f\n",disTosurfaceMin,tsdf_x,tsdf_y,tsdf_z); } //printf("before : %f,%f,%f\n",tsdf_x,tsdf_y,tsdf_z); disTosurfaceMin = min(disTosurfaceMin/surface_thick,float(1.0)); float ratio = 1.0 - disTosurfaceMin; float second_ratio =0; if (ratio > 0.5) { second_ratio = 1 - ratio; } else{ second_ratio = ratio; } if (disTosurfaceMin > 0.999){ tsdf_x = MaxDis; tsdf_y = MaxDis; tsdf_z = MaxDis; } if (encode_type == 101){ tsdf_x = min(tsdf_x, surface_thick); tsdf_y = min(tsdf_y, surface_thick); tsdf_z = min(tsdf_z, surface_thick); } else{ tsdf_x = min(tsdf_x, float(2.0 
* delta_x)); tsdf_y = min(tsdf_y, float(2.0 * delta_y)); tsdf_z = min(tsdf_z, float(2.0 * delta_z)); } float depth_project = XYZimage[3*(ix * im_h + iy)+1]; if (zz > depth_project) { tsdf_x = - tsdf_x; tsdf_y = - tsdf_y; tsdf_z = - tsdf_z; disTosurfaceMin = - disTosurfaceMin; second_ratio = - second_ratio; } // encode_type if (encode_type == 100||encode_type == 101){ tsdf_data[index + 0 * volume_size] = GPUCompute2StorageT(tsdf_x); tsdf_data[index + 1 * volume_size] = GPUCompute2StorageT(tsdf_y); tsdf_data[index + 2 * volume_size] = GPUCompute2StorageT(tsdf_z); } else if(encode_type == 102){ tsdf_data[index + 0 * volume_size] = GPUCompute2StorageT(tsdf_x); tsdf_data[index + 1 * volume_size] = GPUCompute2StorageT(tsdf_y); tsdf_data[index + 2 * volume_size] = GPUCompute2StorageT(tsdf_z); tsdf_data[index + 3 * volume_size] = GPUCompute2StorageT(color_b/scale); tsdf_data[index + 4 * volume_size] = GPUCompute2StorageT(color_g/scale); tsdf_data[index + 5 * volume_size] = GPUCompute2StorageT(color_r/scale); } else if(encode_type == 103){ tsdf_data[index + 0 * volume_size] = GPUCompute2StorageT(ratio); } // scale feature for (int i =0;i volume_size) return; float delta_x = 2 * bb3d_data[12] / float(tsdf_size); float delta_y = 2 * bb3d_data[13] / float(tsdf_size1); float delta_z = 2 * bb3d_data[14] / float(tsdf_size2); float surface_thick = 0.1; const float MaxDis = surface_thick + 20; float x = float((index / (tsdf_size1*tsdf_size2))%tsdf_size) ; float y = float((index / tsdf_size2) % tsdf_size1); float z = float(index % tsdf_size2); for (int i =0;i= im_w || iy < 0 || iy >= im_h || zz < 0.0001){ return; } float x_project = XYZimage[3*(ix * im_h + iy)+0]; float y_project = XYZimage[3*(ix * im_h + iy)+1]; float z_project = XYZimage[3*(ix * im_h + iy)+2]; float tsdf_x = abs(x - x_project); float tsdf_y = abs(y - y_project); float tsdf_z = abs(z - z_project); float color_b = 0; float color_g = 0; float color_r = 0; if (RGBDimage!=NULL){ color_b = float(RGBDimage[(ix * im_h + 
iy)].B)/255.0; color_g = float(RGBDimage[(ix * im_h + iy)].G)/255.0; color_r = float(RGBDimage[(ix * im_h + iy)].R)/255.0; } float disTosurfaceMin = sqrt(tsdf_x * tsdf_x + tsdf_y * tsdf_y + tsdf_z * tsdf_z); disTosurfaceMin = min(disTosurfaceMin/surface_thick,float(1.0)); float ratio = 1.0 - disTosurfaceMin; float second_ratio =0; if (ratio > 0.5) { second_ratio = 1 - ratio; } else{ second_ratio = ratio; } if (disTosurfaceMin > 0.999){ tsdf_x = MaxDis; tsdf_y = MaxDis; tsdf_z = MaxDis; } tsdf_x = min(tsdf_x, float(2.0 * delta_x)); tsdf_y = min(tsdf_y, float(2.0 * delta_y)); tsdf_z = min(tsdf_z, float(2.0 * delta_z)); if (zz > y_project) { tsdf_x = - tsdf_x; tsdf_y = - tsdf_y; tsdf_z = - tsdf_z; disTosurfaceMin = - disTosurfaceMin; second_ratio = - second_ratio; } // encode_type if (encode_type == 0){ tsdf_data[index + 0 * volume_size] = GPUCompute2StorageT(tsdf_x); tsdf_data[index + 1 * volume_size] = GPUCompute2StorageT(tsdf_y); tsdf_data[index + 2 * volume_size] = GPUCompute2StorageT(tsdf_z); } if (encode_type == 2){ tsdf_data[index + 0 * volume_size] = GPUCompute2StorageT(tsdf_x); tsdf_data[index + 1 * volume_size] = GPUCompute2StorageT(tsdf_y); tsdf_data[index + 2 * volume_size] = GPUCompute2StorageT(tsdf_z); tsdf_data[index + 3 * volume_size] = GPUCompute2StorageT(color_b/scale); tsdf_data[index + 4 * volume_size] = GPUCompute2StorageT(color_g/scale); tsdf_data[index + 5 * volume_size] = GPUCompute2StorageT(color_r/scale); } // scale feature for (int i =0;i grid_size, int encode_type, float scale){ scene->loadData2XYZimage(); float* bb3d_data; checkCUDA(__LINE__, cudaMalloc(&bb3d_data, sizeof(float)*15)); checkCUDA(__LINE__, cudaMemcpy(bb3d_data , SpaceBox.base, sizeof(float)*15, cudaMemcpyHostToDevice)); unsigned int * grid_range = scene->grid_range; float* R_data = scene->R_GPU; float* K_data = scene->K_GPU; float* range = scene->begin_range; RGBDpixel* RGBDimage = scene->RGBDimage; unsigned int* star_end_indx_data = scene->beIndex; unsigned int* 
pc_lin_indx_data = scene->pcIndex; float* XYZimage = scene->XYZimage; int THREADS_NUM = 1024; int BLOCK_NUM = int((grid_size[1]*grid_size[2]*grid_size[3] + size_t(THREADS_NUM) - 1) / THREADS_NUM); compute_TSDFGPUbox<<>>(tsdf_data_GPU, R_data, K_data, range, scene->grid_delta, grid_range, RGBDimage, star_end_indx_data, pc_lin_indx_data, XYZimage, bb3d_data, grid_size[1],grid_size[2],grid_size[3],grid_size[0], scene->width, scene->height, encode_type, scale); scene-> free(); checkCUDA(__LINE__,cudaGetLastError()); checkCUDA(__LINE__, cudaFree(bb3d_data)); } void compute_TSDF (std::vector *chosen_scenes_ptr, std::vector *chosen_box_id, StorageT* datamem, std::vector grid_size, int encode_type, float scale) { // for each scene int totalcounter = 0; float tsdf_size = grid_size[1]; if (grid_size[1]!=grid_size[2]||grid_size[1]!=grid_size[3]){ std::cerr << "grid_size[1]!=grid_size[2]||grid_size[1]!=grid_size[3]" <>>(tsdf_data, R_data, K_data, RGBDimage, XYZimage, bb3d_data, grid_size[1],grid_size[2],grid_size[3], grid_size[0], scene->width, scene->height, encode_type, scale); } checkCUDA(__LINE__,cudaDeviceSynchronize()); checkCUDA(__LINE__,cudaGetLastError()); //time4 = get_timestamp_dss(); // ++totalcounter; scene_prev = scene; //loadtime += time1-time0; //copygputime += time2-time1; //transformtime += time4-time3; } checkCUDA(__LINE__, cudaFree(bb3d_data)); // free the loaded images for (int sceneId = 0;sceneId<(*chosen_scenes_ptr).size();sceneId++){ (*chosen_scenes_ptr)[sceneId]->free(); } //std::cout << "compute_TSDF: read disk " << loadtime/1000 << " ms, " << "copygputime " //<< copygputime/1000 << "transform " << transformtime/1000 << " ms" < 0){ float context_scale = float(tsdf_size) / (float(tsdf_size) - 2*context_pad); box.coeff[0] = box.coeff[0] * context_scale; box.coeff[1] = box.coeff[1] * context_scale; box.coeff[2] = box.coeff[2] * context_scale; } // change the oreintation if (box.base[1]<0){ box.base[0] = -1*box.base[0]; box.base[1] = -1*box.base[1]; 
box.base[2] = -1*box.base[2]; } if (box.base[4]<0){ box.base[3] = -1*box.base[3]; box.base[4] = -1*box.base[4]; box.base[5] = -1*box.base[5]; } if(box.base[1] scenes; //int count = 0; int object_count = 0; float scale =100; float context_pad =3; std::vector grid_size {3,30,30,30}; int encode_type =100; std::cout<<"loading file "<filename.resize(len); if (len>0) fread((void*)(scene->filename.data()), sizeof(char), len, fp); scene->filename = data_root+scene->filename+".bin"; fread((void*)(scene->R), sizeof(float), 9, fp); fread((void*)(scene->K), sizeof(float), 9, fp); fread((void*)(&scene->height), sizeof(unsigned int), 1, fp); fread((void*)(&scene->width), sizeof(unsigned int), 1, fp); fread((void*)(&len), sizeof(unsigned int), 1, fp); scene->objects.resize(len); if (len>0){ for (int i=0;iobjects[i]=box; object_count++; //num_categories = max(num_categories, box.category); //printf("category:%d\n",box.category); //printf("box.base:%f,%f,%f,%f,%f,%f\n",box.base[0],box.base[1],box.base[2],box.base[3],box.base[4],box.base[5]); //printf("box.base:%f,%f,%f,%f,%f,%f\n",box.base[0],box.base[1],box.base[2],box.base[3],box.base[4],box.base[5]); //printf("box.center:%f,%f,%f\n",box.center[0],box.center[1],box.center[2]); //printf("box.coeff:%f,%f,%f\n",box.coeff[0],box.coeff[1],box.coeff[2]); } } scenes.push_back(scene); } fclose(fp); std::vector chosen_scenes; std::vector chosen_box_id; for (int i = 0;iobjects.size();++j){ chosen_scenes.push_back(scenes[i]); chosen_box_id.push_back(j); } } std::cout<<"object_count:" <* labelCPU; StorageT* dataGPU; Tensor* bb_tar_diff; Tensor* bb_loss_weights; Tensor* hha_fea; Tensor* img_fea; Tensor* bb_2d_diff; Tensor* bb_2d_weights; Tensor* oreintation_label; Tensor* oreintation_label_w; std::future lock; std::vector batch_size; std::vector bb_param_weight; std::vector scenes; std::vector target_boxes; std::vector target_2dboxes; std::vector imgobj_pos; std::vector imgobj_neg; int counter_neg; int counter_pos; //std::vector> 
imgobj_pos_struct; //std::vector> imgobj_neg_struct; std::vector> imgobj_pos_cates; std::vector counter_pos_cates; int num_percate; std::string file_list; std::string data_root; std::vector grid_size; int encode_type; float scale; float context_pad; unsigned int num_categories; bool box_reg; bool is_render; bool is_combineimg; bool is_combinehha; int imgfea_dim; std::string img_fea_folder; bool box_2dreg; bool orein_cls; int num_oreintation; int numofitems(){ int total_number = 0; for (int i=0;iobjects.size(); } return total_number; }; int numofitemsTruncated(){ return sum(batch_size) * floor(double(numofitems())/double(sum(batch_size))); }; void shuffle(){ std::cout<< "!!!!!!!!!!! Should Call shuffle() with input !!!!!!!!!!" <DataType = Render;} //std::vector boxlist_pos; //std::vector boxlist_neg; unsigned int len = 0; file_size += fread((void*)(&len), sizeof(unsigned int), 1, fp); if (len==0) break; scene->filename.resize(len); if (len>0) file_size += fread((void*)(scene->filename.data()), sizeof(char), len, fp); if (is_render) { scene->DataType = Render; scene->filename = data_root+scene->filename; } else{ scene->filename = data_root+scene->filename+".bin"; } //std::cout<filename<R), sizeof(float), 9, fp); file_size += fread((void*)(scene->K), sizeof(float), 9, fp); file_size += fread((void*)(&scene->height), sizeof(unsigned int), 1, fp); file_size += fread((void*)(&scene->width), sizeof(unsigned int), 1, fp); file_size += fread((void*)(&len), sizeof(unsigned int), 1, fp); scene->objects.resize(len); if (len>0){ for (int i=0;iobjects[i] = box; //num_categories = max(num_categories, box.category); // read target box if exist int tarId = -1; if (box_reg&&(phase==Training||is_render)){ uint8_t hasTarget = 0; file_size += fread((void*)(&hasTarget), sizeof(uint8_t), 1, fp); if (hasTarget>0){ Box3Ddiff box_tar_diff; file_size += fread((void*)(box_tar_diff.diff), sizeof(float), 6, fp); if (box_2dreg&&phase==Training){ file_size += fread((void*)(box_tar_diff.diff2d), 
sizeof(float), 4, fp); float diff2d_full[4]; file_size += fread((void*)(diff2d_full), sizeof(float), 4, fp); } if (orein_cls&&phase==Training){ file_size += fread((void*)(&box_tar_diff.oreintation), sizeof(int), 1, fp); //std::cout< data_dim; std::vector label_dim; std::vector bb_tar_diff_dim; data_dim.push_back(sum(batch_size)); data_dim.insert( data_dim.end(), grid_size.begin(), grid_size.end() ); label_dim.push_back(sum(batch_size)); label_dim.push_back(1); label_dim.push_back(1); label_dim.push_back(1); label_dim.push_back(1); bb_tar_diff_dim.push_back(sum(batch_size)); bb_tar_diff_dim.push_back(bb_param_weight.size()*num_categories); bb_tar_diff_dim.push_back(1); bb_tar_diff_dim.push_back(1); bb_tar_diff_dim.push_back(1); std::vector img_fea_dim(5,1); img_fea_dim[0] = sum(batch_size); img_fea_dim[1] = imgfea_dim; labelCPU = new Tensor(label_dim); bb_tar_diff = new Tensor(bb_tar_diff_dim); bb_loss_weights = new Tensor(bb_tar_diff_dim); img_fea = new Tensor(img_fea_dim); hha_fea = new Tensor(img_fea_dim); out[0]->need_diff = false; out[0]->receptive_field.resize(data_dim.size()-2); fill_n(out[0]->receptive_field.begin(),data_dim.size()-2,1); out[0]->receptive_gap.resize(data_dim.size()-2); fill_n(out[0]->receptive_gap.begin(), data_dim.size()-2,1); out[0]->receptive_offset.resize(data_dim.size()-2); fill_n(out[0]->receptive_offset.begin(), data_dim.size()-2,0); memoryBytes += out[0]->Malloc(data_dim); out[1]->need_diff = false; memoryBytes += out[1]->Malloc(label_dim); out[2]->need_diff = false; memoryBytes += out[2]->Malloc(bb_tar_diff_dim); out[3]->need_diff = false; memoryBytes += out[3]->Malloc(bb_tar_diff_dim); if (is_combineimg){ out[4]->need_diff = false; out[4]->receptive_field.resize(img_fea_dim.size()-2); fill_n(out[4]->receptive_field.begin(), img_fea_dim.size()-2,1); out[4]->receptive_gap.resize(img_fea_dim.size()-2); fill_n(out[4]->receptive_gap.begin(), img_fea_dim.size()-2,1); out[4]->receptive_offset.resize(img_fea_dim.size()-2); 
fill_n(out[4]->receptive_offset.begin(), img_fea_dim.size()-2,0); memoryBytes += out[4]->Malloc(img_fea_dim); } if (is_combinehha){ out[5]->need_diff = false; out[5]->receptive_field.resize(img_fea_dim.size()-2); fill_n(out[5]->receptive_field.begin(), img_fea_dim.size()-2,1); out[5]->receptive_gap.resize(img_fea_dim.size()-2); fill_n(out[5]->receptive_gap.begin(), img_fea_dim.size()-2,1); out[5]->receptive_offset.resize(img_fea_dim.size()-2); fill_n(out[5]->receptive_offset.begin(), img_fea_dim.size()-2,0); memoryBytes += out[5]->Malloc(img_fea_dim); } // if (box_2dreg){ std::vector bb_2d_tar_diff; bb_2d_tar_diff.push_back(sum(batch_size)); bb_2d_tar_diff.push_back(4*num_categories); bb_2d_tar_diff.push_back(1); bb_2d_tar_diff.push_back(1); bb_2d_tar_diff.push_back(1); bb_2d_diff = new Tensor(bb_2d_tar_diff); bb_2d_weights = new Tensor(bb_2d_tar_diff); out[6]->need_diff = false; memoryBytes += out[6]->Malloc(bb_2d_tar_diff); out[7]->need_diff = false; memoryBytes += out[7]->Malloc(bb_2d_tar_diff); } if (orein_cls){ std::vector oreintation_label_dim(5,1); std::vector oreintation_label_w_dim(5,1); oreintation_label_dim[0] = sum(batch_size); oreintation_label_dim[1] = 1; oreintation_label_dim[2] = num_categories; oreintation_label_w_dim[0] = sum(batch_size); oreintation_label_w_dim[1] = num_oreintation; oreintation_label_w_dim[2] = num_categories; oreintation_label = new Tensor(oreintation_label_dim); oreintation_label_w = new Tensor(oreintation_label_w_dim); out[8]->need_diff = false; memoryBytes += out[8]->Malloc(oreintation_label_dim); out[9]->need_diff = false; memoryBytes += out[9]->Malloc(oreintation_label_w_dim); } checkCUDA(__LINE__, cudaMalloc(&dataGPU, numel(data_dim) * sizeofStorageT) ); lock = std::async(std::launch::async,&Scene3DDataLayer::prefetch,this); //prefetch(); return memoryBytes; }; void prefetch(){ checkCUDA(__LINE__,cudaSetDevice(GPU)); //return; //int tmpD; cudaGetDevice(&tmpD); std::cout<<"GPU at prefetch LINE: "<<__LINE__<<" = "< 
chosen_scenes; std::vector chosen_box_id; std::vector choose_img_id(sum(batch_size)); std::vector category_count(num_categories,0); int totalpos, totalneg, total_percate; int count_batch_box = 0; if (batch_size.size()==1||phase==Testing) { totalneg = sum(batch_size); totalpos = 0; total_percate =0; } else{ totalneg = batch_size[0]; total_percate = num_percate*num_categories; totalpos = batch_size[1]-total_percate; } // set bb_tar_diff, bb_loss_weights to zeros if (box_reg){ memset(bb_tar_diff->CPUmem, 0, bb_tar_diff->numBytes()); memset(bb_loss_weights->CPUmem, 0, bb_loss_weights->numBytes()); } if (box_2dreg){ memset(bb_2d_diff->CPUmem, 0, bb_2d_diff->numBytes()); memset(bb_2d_weights->CPUmem, 0, bb_2d_weights->numBytes()); } if(orein_cls){ memset(oreintation_label->CPUmem, 0, oreintation_label->numBytes()); memset(oreintation_label_w->CPUmem,0, oreintation_label_w->numBytes()); } // get the negtive boxes for (int i =0; i< totalneg; ++i){ int imgId = imgobj_neg[counter_neg].ImageId; int objId = imgobj_neg[counter_neg].ObjId; choose_img_id[count_batch_box] = imgId; chosen_scenes.push_back(scenes[imgId]); chosen_box_id.push_back(objId); labelCPU->CPUmem[count_batch_box] = CPUCompute2StorageT(ComputeT(scenes[imgId]->objects[objId].category)); ++counter_neg; ++count_batch_box; category_count[0]++; if (counter_neg >= imgobj_neg.size()){ counter_neg = 0; ++epoch_prefetch; if(phase!=Testing){ shuffle(true,false); } } } /* std::cout<<"totalneg: "<CPUmem[i]<<" , "; } std::cout<0){ for (int i = 1; i< num_categories; ++i){ for (int j = 0; j< num_percate; ++j){ int imgId = imgobj_pos_cates[i][j].ImageId; int objId = imgobj_pos_cates[i][j].ObjId; int tarId = imgobj_pos_cates[i][j].TarId; choose_img_id[count_batch_box] = imgId; chosen_scenes.push_back(scenes[imgId]); chosen_box_id.push_back(objId); labelCPU->CPUmem[count_batch_box] = CPUCompute2StorageT(ComputeT(scenes[imgId]->objects[objId].category)); if (box_reg){ int idx = 
count_batch_box*bb_param_weight.size()*num_categories + scenes[imgId]->objects[objId].category*bb_param_weight.size(); for (int cid = 0;cid<6;cid++){ bb_tar_diff->CPUmem[idx+cid] = CPUCompute2StorageT(target_boxes[tarId].diff[cid]); } for (int cid = 0;cidCPUmem[idx+cid] = CPUCompute2StorageT(bb_param_weight[cid]); } } if (box_2dreg){ int idx_2d = count_batch_box*4*num_categories + scenes[imgId]->objects[objId].category*4; for (int cid = 0;cid<4;cid++){ bb_2d_diff ->CPUmem[idx_2d+cid] = CPUCompute2StorageT(target_boxes[tarId].diff2d[cid]); } for (int cid = 0;cid<4;cid++){ bb_2d_weights ->CPUmem[idx_2d+cid] = CPUCompute2StorageT(1); } } if (orein_cls){ if(target_boxes[tarId].oreintation >= 0){ oreintation_label ->CPUmem[count_batch_box*num_categories+scenes[imgId]->objects[objId].category] = CPUCompute2StorageT(ComputeT(target_boxes[tarId].oreintation)); for (int cid = 0;cidCPUmem[count_batch_box*num_oreintation*num_categories+cid*num_categories+scenes[imgId]->objects[objId].category] = CPUCompute2StorageT(1); } } } ++count_batch_box; category_count[i]++; counter_pos_cates[i]++; if (counter_pos_cates[i] >= imgobj_pos_cates[i].size()){ counter_pos_cates[i] = 0; if(phase!=Testing){ shuffle(i); } } } } } // get the positive boxes for (int i =0; i< totalpos; ++i){ int imgId = imgobj_pos[counter_pos].ImageId; int objId = imgobj_pos[counter_pos].ObjId; int tarId = imgobj_pos[counter_pos].TarId; choose_img_id[count_batch_box] = imgId; chosen_scenes.push_back(scenes[imgId]); chosen_box_id.push_back(objId); labelCPU->CPUmem[count_batch_box] = CPUCompute2StorageT(ComputeT(scenes[imgId]->objects[objId].category)); if (box_reg){ int idx = count_batch_box*bb_param_weight.size()*num_categories + scenes[imgId]->objects[objId].category*bb_param_weight.size(); for (int cid = 0;cid<6;cid++){ bb_tar_diff->CPUmem[idx+cid] = CPUCompute2StorageT(target_boxes[tarId].diff[cid]); } for (int cid = 0;cidCPUmem[idx+cid] = CPUCompute2StorageT(bb_param_weight[cid]); } } if (box_2dreg){ int idx_2d 
= count_batch_box*4*num_categories + scenes[imgId]->objects[objId].category*4; for (int cid = 0;cid<4;cid++){ bb_2d_diff -> CPUmem[idx_2d+cid] = CPUCompute2StorageT(target_boxes[tarId].diff2d[cid]); } for (int cid = 0;cid<4;cid++){ bb_2d_weights -> CPUmem[idx_2d+cid] = CPUCompute2StorageT(1); } } if (orein_cls){ if(target_boxes[tarId].oreintation >= 0){ oreintation_label ->CPUmem[count_batch_box*num_categories+scenes[imgId]->objects[objId].category] = CPUCompute2StorageT(ComputeT(target_boxes[tarId].oreintation)); //std::cout << target_boxes[tarId].oreintation<<":"<CPUmem[count_batch_box]<<","; for (int cid = 0;cidCPUmem[count_batch_box*num_oreintation*num_categories+cid*num_categories+scenes[imgId]->objects[objId].category] = CPUCompute2StorageT(1); //std::cout << oreintation_label_w ->CPUmem[count_batch_box*num_oreintation+cid]<<","; } } } ++counter_pos; ++count_batch_box; category_count[scenes[imgId]->objects[objId].category]++; if (counter_pos >= imgobj_pos.size()){ counter_pos = 0; std::cout< * feaTensor; for (int i =0; i < chosen_scenes.size(); i++){ if (i == 0){ feaTensor = new Tensor(img_fea_folder +"fc7_"+ std::to_string(choose_img_id[i]) + ".tensor"); } else if(choose_img_id[i]!=choose_img_id[i-1]){ delete feaTensor; feaTensor = new Tensor(img_fea_folder +"fc7_"+ std::to_string(choose_img_id[i]) + ".tensor"); } //std::cout<<"i = "<(img_fea_folder +"da_fc7_"+ std::to_string(choose_img_id[i]) +".tensor"); } memcpy(hha_fea->CPUmem + i*imgfea_dim, feaTensor->CPUmem + chosen_box_id[i]*imgfea_dim, sizeofStorageT*imgfea_dim); //std::cout<filename<CPUmem,sizeof(float),(sum(batch_size))*3*30*30*30,fid); fclose(fid); std::cout<< labelCPU->CPUmem[0]<<","<< labelCPU->CPUmem[1]<<","<CPUmem[2]<CPUmem = float [sum(batch_size)* 6 * 30 * 30 * 30]; //labelCPU->CPUmem = float [sum(batch_size)* 6 * 1 * 1 * 1]; //std::cout<<"prefetch GPU="<write("DSS/debug/oreintation_label.tensor"); oreintation_label_w->write("DSS/debug/oreintation_label_w.tensor"); std::cout<<"finish 
writing"<dataGPU,dataGPU); checkCUDA(__LINE__, cudaMemcpy(out[1]->dataGPU, labelCPU->CPUmem, labelCPU->numBytes(), cudaMemcpyHostToDevice) ); if (box_reg){ checkCUDA(__LINE__, cudaMemcpy(out[2]->dataGPU, bb_tar_diff->CPUmem, bb_tar_diff->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[3]->dataGPU, bb_loss_weights->CPUmem, bb_loss_weights->numBytes(), cudaMemcpyHostToDevice) ); } if (is_combineimg){ checkCUDA(__LINE__, cudaMemcpy(out[4]->dataGPU, img_fea->CPUmem, img_fea->numBytes(), cudaMemcpyHostToDevice) ); } if (is_combinehha){ checkCUDA(__LINE__, cudaMemcpy(out[5]->dataGPU, hha_fea->CPUmem, hha_fea->numBytes(), cudaMemcpyHostToDevice) ); } if (box_2dreg){ checkCUDA(__LINE__, cudaMemcpy(out[6]->dataGPU, bb_2d_diff->CPUmem, bb_2d_diff->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[7]->dataGPU, bb_2d_weights->CPUmem, bb_2d_weights->numBytes(), cudaMemcpyHostToDevice) ); } if(orein_cls){ //std::cout<<"copied"<dataGPU, oreintation_label->CPUmem, oreintation_label->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[9]->dataGPU, oreintation_label_w->CPUmem, oreintation_label_w->numBytes(), cudaMemcpyHostToDevice) ); } epoch = epoch_prefetch; lock = std::async(std::launch::async,&Scene3DDataLayer::prefetch,this); //prefetch(); //checkCUDA(__LINE__, cudaMemcpy(out[0]->dataGPU, dataCPU->CPUmem, dataCPU->numBytes() , cudaMemcpyHostToDevice) ); //checkCUDA(__LINE__, cudaMemcpy(out[0]->dataGPU, dataGPU, numel(data_dim) * sizeof(float) , cudaMemcpyDeviceToDevice) ); /* { stringstream sstm; sstm << "/n/fs/modelnet/deepDetect/code/caffe_gpu2/debug/" << epoch << "_" << "bottom_0_conv1.tensor"; std::string fname = sstm.str(); Tensor* dataCPU = new Tensor(fname); std::cout<<"data ="; veciPrint(dataCPU->dim); std::cout<dataGPU, dataCPU->CPUmem, dataCPU->numBytes() , cudaMemcpyHostToDevice) ); delete dataCPU; } { stringstream sstm; sstm << "/n/fs/modelnet/deepDetect/code/caffe_gpu2/debug/" << epoch << "_" 
<< "bottom_1_loss.tensor"; std::string fname = sstm.str(); Tensor* labelCPU = new Tensor(fname); std::cout<<"label="; veciPrint(labelCPU->dim); std::cout<dataGPU, labelCPU->CPUmem, labelCPU->numBytes() , cudaMemcpyHostToDevice) ); delete labelCPU; } epoch++; */ }; }; class RPNDataLayer : public DataLayer { public: int epoch_prefetch; //Tensor* dataCPU; StorageT* dataGPU; Tensor* labelCPU; Tensor* label_weights; Tensor* bb_tar_diff; Tensor* bb_loss_weights; Tensor* labelCPU_1; Tensor* label_weights_1; Tensor* bb_tar_diff_1; Tensor* bb_loss_weights_1; Box3D SpaceBox; std::vector grid_size; std::vector label_size; std::vector batch_size; std::vector bb_param_weight; std::vector size_group; std::vector num_size_group; std::vector size_group_map; std::vector scenes; // store the scenes without box std::string file_list; std::string data_root;// to store the depth and image std::string rpn_data_root; int encode_type; float scale; float context_pad; unsigned int num_categories; unsigned int num_anchors; std::vector pos_overlap; float neg_overlap; float pos_weight; float neg_weight; std::future lock; void shuffle(){ std::shuffle(scenes.begin(),scenes.end(), rng ); }; int numofitems(){ return scenes.size(); }; /* int numofitemsTruncated(){ return sum(batch_size) * floor(double(numofitems())/double(sum(batch_size))); }; */ void init(){ epoch_prefetch = 0; counter = 0; train_me = false; if (phase==Testing){ batch_size[0] = sum(batch_size); batch_size.resize(1); } SpaceBox.category = 0; float base[9] = {1,0,0,0,1,0,0,0,1}; for (int i=0;i<9;i++){ SpaceBox.base[i] = base[i]; } SpaceBox.center[0] = 0; SpaceBox.center[1] = 3.0; SpaceBox.center[2] = -0.25; //{0.0 , 3.0 , -0.25}; SpaceBox.coeff[0] = 2.6; SpaceBox.coeff[1] = 2.6; SpaceBox.coeff[2] = 1.25; //{2.6 , 2.6 , 1.25}; // get size_group if (size_group.size()!=num_anchors){ std::cout<<"size_group.size()!=num_anchors"<=num_size_group.size()){ num_size_group.resize(size_group[i]+1); } size_group_map[i] = 
num_size_group[size_group[i]]; num_size_group[size_group[i]]++; } std::cout<<"num_size_group: "; veciPrint(num_size_group); std::cout<filename.resize(len); if (len>0) file_size += fread((void*)(scene->filename.data()), sizeof(char), len, fp); scene->seqname = scene->filename; scene->filename = data_root+scene->filename+".bin"; file_size += fread((void*)(scene->R), sizeof(float), 9, fp); file_size += fread((void*)(scene->K), sizeof(float), 9, fp); file_size += fread((void*)(&scene->height), sizeof(unsigned int), 1, fp); file_size += fread((void*)(&scene->width), sizeof(unsigned int), 1, fp); scenes.push_back(scene); //std::cout<seqname< data_dim; std::vector label_dim; std::vector bb_tar_diff_dim; std::vector label_weights_dim; // allocate data data_dim.push_back(sum(batch_size)); data_dim.insert( data_dim.end(), grid_size.begin(), grid_size.end() ); memoryBytes += numel(data_dim) * sizeof(float)/4; checkCUDA(__LINE__, cudaMalloc(&dataGPU, numel(data_dim) * sizeofStorageT) ); out[0]->need_diff = false; out[0]->receptive_field.resize(data_dim.size()-2); fill_n(out[0]->receptive_field.begin(),data_dim.size()-2,0.025); out[0]->receptive_gap.resize(data_dim.size()-2); fill_n(out[0]->receptive_gap.begin(), data_dim.size()-2,0.025); out[0]->receptive_offset.resize(data_dim.size()-2); fill_n(out[0]->receptive_offset.begin(), data_dim.size()-2,0); memoryBytes += out[0]->Malloc(data_dim); // allocate label label_dim.push_back(sum(batch_size)); label_dim.push_back(1); label_dim.push_back(num_size_group[0]); label_dim.insert(label_dim.end(), label_size.begin(), label_size.end() ); label_weights_dim.push_back(sum(batch_size)); label_weights_dim.push_back(2); label_weights_dim.push_back(num_size_group[0]); label_weights_dim.insert(label_weights_dim.end(), label_size.begin(), label_size.end() ); bb_tar_diff_dim.push_back(sum(batch_size)); bb_tar_diff_dim.push_back(bb_param_weight.size()); bb_tar_diff_dim.push_back(num_size_group[0]); bb_tar_diff_dim.insert(bb_tar_diff_dim.end(), 
label_size.begin(), label_size.end() ); labelCPU = new Tensor(label_dim); label_weights = new Tensor(label_weights_dim); bb_tar_diff = new Tensor(bb_tar_diff_dim); bb_loss_weights = new Tensor(bb_tar_diff_dim); out[1]->need_diff = false; memoryBytes += out[1]->Malloc(label_dim); out[2]->need_diff = false; memoryBytes += out[2]->Malloc(label_weights_dim); out[3]->need_diff = false; memoryBytes += out[3]->Malloc(bb_tar_diff_dim); out[4]->need_diff = false; memoryBytes += out[4]->Malloc(bb_tar_diff_dim); if (num_size_group.size()>1){ label_dim[2] = num_size_group[1]; label_weights_dim[2] = num_size_group[1]; bb_tar_diff_dim[2] = num_size_group[1]; labelCPU_1 = new Tensor(label_dim); label_weights_1 = new Tensor(label_weights_dim); bb_tar_diff_1 = new Tensor(bb_tar_diff_dim); bb_loss_weights_1 = new Tensor(bb_tar_diff_dim); out[5]->need_diff = false; memoryBytes += out[5]->Malloc(label_dim); out[6]->need_diff = false; memoryBytes += out[6]->Malloc(label_weights_dim); out[7]->need_diff = false; memoryBytes += out[7]->Malloc(bb_tar_diff_dim); out[8]->need_diff = false; memoryBytes += out[8]->Malloc(bb_tar_diff_dim); } lock = std::async(std::launch::async,&RPNDataLayer::prefetch,this); //prefetch(); return memoryBytes; }; void prefetch(){ checkCUDA(__LINE__,cudaSetDevice(GPU)); // set mem to be zeros memset(labelCPU ->CPUmem, 0, labelCPU->numBytes()); memset(label_weights ->CPUmem, 0, label_weights->numBytes()); memset(bb_tar_diff ->CPUmem, 0, bb_tar_diff->numBytes()); memset(bb_loss_weights ->CPUmem, 0, bb_loss_weights->numBytes()); if (num_size_group.size()>0){ memset(labelCPU_1 ->CPUmem, 0, labelCPU_1->numBytes()); memset(label_weights_1 ->CPUmem, 0, label_weights_1->numBytes()); memset(bb_tar_diff_1 ->CPUmem, 0, bb_tar_diff_1->numBytes()); memset(bb_loss_weights_1 ->CPUmem, 0, bb_loss_weights_1->numBytes()); } // compute TSDF for (int batch_id =0; batch_id < sum(batch_size); ++batch_id){ StorageT * tsdf_data_GPU = 
&dataGPU[batch_id*grid_size[0]*grid_size[1]*grid_size[2]*grid_size[3]]; compute_TSDF_Space(scenes[counter], SpaceBox, tsdf_data_GPU, grid_size, encode_type, scale); std::string spacefile = rpn_data_root + scenes[counter]->seqname + ".bin"; FILE* fp = fopen(spacefile.c_str(),"rb"); if (fp==NULL) { std::cout<<"fail to open file: "<0){ vol1 = num_size_group[1] *label_size[0]*label_size[1]*label_size[2]; } //std::cout<<"vol"< neg_list; int poscount = 0; int negcount = 0; std::vector posnegcount={0,0,0,0}; for(int idx = 0; idx= min(pos_overlap[0],pos_overlap[1])){ // check in which btach and caculate idex int groupid = size_group[anchor_Idx[idx]]; int linearInd = size_group_map[anchor_Idx[idx]]*label_size[0]*label_size[1]*label_size[2] + x_Idx[idx]*label_size[1]*label_size[2] + y_Idx[idx]*label_size[2] + z_Idx[idx]; if (groupid == 0&& ov >= pos_overlap[0]){ labelCPU->CPUmem[batch_id*vol+linearInd] = CPUCompute2StorageT(ComputeT(1)); for (int pid = 0; pidCPUmem[batch_id*bb_param_weight.size()*vol+pid*vol+linearInd] = CPUCompute2StorageT(diff[6*idx+pid]); bb_loss_weights->CPUmem[batch_id*bb_param_weight.size()*vol+pid*vol+linearInd] = CPUCompute2StorageT(bb_param_weight[pid]); } label_weights->CPUmem[batch_id*2*vol + 0*vol + linearInd] = CPUCompute2StorageT(pos_weight); label_weights->CPUmem[batch_id*2*vol + 1*vol + linearInd] = CPUCompute2StorageT(pos_weight); posnegcount[0]++; } else if(groupid == 1&& ov >= pos_overlap[1]){ labelCPU_1->CPUmem[batch_id*vol1+linearInd] = CPUCompute2StorageT(ComputeT(1)); for (int pid = 0; pidCPUmem[batch_id*bb_param_weight.size()*vol1+pid*vol1+linearInd] = CPUCompute2StorageT(diff[6*idx+pid]); bb_loss_weights_1 ->CPUmem[batch_id*bb_param_weight.size()*vol1+pid*vol1+linearInd] = CPUCompute2StorageT(bb_param_weight[pid]); } //std::cout<CPUmem[batch_id*2*vol1 + 0*vol1 + linearInd] = CPUCompute2StorageT(pos_weight); label_weights_1 ->CPUmem[batch_id*2*vol1 + 1*vol1 + linearInd] = CPUCompute2StorageT(pos_weight); posnegcount[1]++; } 
poscount++; } else if(ov < neg_overlap){ neg_list.push_back(idx); } } int numneg = max(256-poscount,poscount); std::uniform_int_distribution distribution(0,neg_list.size()-1); while (negcountCPUmem[batch_id*2*vol + 0*vol + linearInd]) < neg_weight){ label_weights->CPUmem[batch_id*2*vol + 0*vol + linearInd] = CPUCompute2StorageT(neg_weight); label_weights->CPUmem[batch_id*2*vol + 1*vol + linearInd] = CPUCompute2StorageT(neg_weight); posnegcount[2]++; } } else{ if (CPUStorage2ComputeT(label_weights_1->CPUmem[batch_id*2*vol1 + 0*vol1 + linearInd]) < neg_weight){ label_weights_1->CPUmem[batch_id*2*vol1 + 0*vol1 + linearInd] = CPUCompute2StorageT(neg_weight); label_weights_1->CPUmem[batch_id*2*vol1 + 1*vol1 + linearInd] = CPUCompute2StorageT(neg_weight); posnegcount[3]++; } } negcount++; } //std::cout << "poscount : "<< poscount <<" numneg"<= scenes.size()){ counter = 0; ++epoch_prefetch; shuffle(); } /* if (counter==1&&phase!=Testing){ std::cout << " poscount : "<< poscount << "." <write("DSS/debug/labelCPU.tensor"); label_weights->write("DSS/debug/label_weights.tensor"); bb_tar_diff->write("DSS/debug/bb_tar_diff.tensor"); bb_loss_weights->write("DSS/debug/bb_loss_weights.tensor"); labelCPU_1->write("DSS/debug/labelCPU_1.tensor"); label_weights_1->write("DSS/debug/label_weights_1.tensor"); bb_tar_diff_1->write("DSS/debug/bb_tar_diff_1.tensor"); bb_loss_weights_1->write("DSS/debug/bb_loss_weights_1.tensor"); std::cout<<"finish writing"<dataGPU,dataGPU); checkCUDA(__LINE__, cudaMemcpy(out[1]->dataGPU, labelCPU->CPUmem, labelCPU->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[2]->dataGPU, label_weights->CPUmem, label_weights->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[3]->dataGPU, bb_tar_diff->CPUmem, bb_tar_diff->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[4]->dataGPU, bb_loss_weights->CPUmem, bb_loss_weights->numBytes(), cudaMemcpyHostToDevice) ); if (num_size_group.size()>0){ 
checkCUDA(__LINE__, cudaMemcpy(out[5]->dataGPU, labelCPU_1->CPUmem, labelCPU_1->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[6]->dataGPU, label_weights_1->CPUmem, label_weights_1->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[7]->dataGPU, bb_tar_diff_1->CPUmem, bb_tar_diff_1->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[8]->dataGPU, bb_loss_weights_1->CPUmem, bb_loss_weights_1->numBytes(), cudaMemcpyHostToDevice) ); } epoch = epoch_prefetch; lock = std::async(std::launch::async,&RPNDataLayer::prefetch,this); //prefetch(); }; }; class RenderMeshDataLayer : public DataLayer { public: int epoch_prefetch; Tensor* labelCPU; StorageT* dataGPU; std::future lock; std::vector batch_size; std::string file_list; int context_pad; std::vector grid_size; int encode_type; float scale; unsigned int num_categories; std::string off_root; std::vector heigth_dis; std::vector sceneMeshList; Tensor* oreintation_label; Tensor* oreintation_label_w; float* K_GPU; float* R_GPU; float* depth_GPU; float* XYZimage_GPU; float* bb3d_GPU; int num_oreintation; bool orein_cls; int im_w; int im_h; float camK[9]; bool shuffle_data; int numofitems(){ return sceneMeshList.size(); }; int numofitemsTruncated(){ return sum(batch_size) * floor(double(numofitems())/double(sum(batch_size))); }; void shuffle(){ std::shuffle(sceneMeshList.begin(),sceneMeshList.end(), rng ); return; }; void init(){ epoch_prefetch = 0; train_me = false; counter =0; std::cout<<"loading file "<0) file_size += fread((void*)(height_dis_file.data()), sizeof(char), len, fp); std::ifstream infile; infile.open(height_dis_file); int numcls; infile >> numcls; heigth_dis.resize(2*numcls); std::vector classname(numcls); for (int i =0;i> classname[i]; infile >> heigth_dis[2*i] ; infile >> heigth_dis[2*i+1]; } infile.close(); std::cout<<"numcls "<mesh_file.resize(len); if (len>0) file_size += fread((void*)(scene->mesh_file.data()), sizeof(char), len, fp); 
// Every mesh entry of the list carries one Box3D (category, base[9], center[3], coeff[3]); the
// box is passed through processbox(box, context_pad, grid_size[1]) before it is stored. The
// allocation code that follows creates the label tensor, the optional orientation tensors
// (only when orein_cls is set) and the device buffers for K, R, depth, box and XYZ image.
//mesh_file = off_root+mesh_file; //scene->meshdata.readOFF(mesh_file); scene->objects.resize(1); Box3D box; file_size += fread((void*)(&(box.category)), sizeof(unsigned int), 1, fp); file_size += fread((void*)(box.base), sizeof(float), 9, fp); file_size += fread((void*)(box.center), sizeof(float), 3, fp); file_size += fread((void*)(box.coeff), sizeof(float), 3, fp); box = processbox (box, context_pad, grid_size[1]); scene->objects[0] = box; sceneMeshList.push_back(scene); } fclose(fp); std::cout<< "sceneMeshList length: "<data_dim,label_dim; data_dim.push_back(sum(batch_size)); data_dim.insert( data_dim.end(), grid_size.begin(), grid_size.end() ); label_dim.push_back(sum(batch_size)); label_dim.push_back(1); label_dim.push_back(1); label_dim.push_back(1); label_dim.push_back(1); labelCPU = new Tensor(label_dim); out[0]->need_diff = false; memoryBytes += out[0]->Malloc(data_dim); //std::cout<<"data_dim="; veciPrint(data_dim); std::cout<need_diff = false; memoryBytes += out[1]->Malloc(label_dim); //std::cout<<"label_dim="; veciPrint(label_dim); std::cout< oreintation_label_dim(5,1); std::vector oreintation_label_w_dim(5,1); oreintation_label_dim[0] = sum(batch_size); oreintation_label_dim[1] = 1; oreintation_label_dim[2] = num_categories; oreintation_label_w_dim[0] = sum(batch_size); oreintation_label_w_dim[1] = num_oreintation; oreintation_label_w_dim[2] = num_categories; oreintation_label = new Tensor(oreintation_label_dim); oreintation_label_w = new Tensor(oreintation_label_w_dim); out[2]->need_diff = false; memoryBytes += out[2]->Malloc(oreintation_label_dim); out[3]->need_diff = false; memoryBytes += out[3]->Malloc(oreintation_label_w_dim); } checkCUDA(__LINE__, cudaMalloc(&dataGPU, numel(data_dim) * sizeofStorageT) ); checkCUDA(__LINE__, cudaMalloc(&K_GPU, sizeof(float)*9)); checkCUDA(__LINE__, cudaMalloc(&R_GPU, sizeof(float)*9)); checkCUDA(__LINE__, cudaMalloc(&depth_GPU, sizeof(float)*im_w*im_h)); checkCUDA(__LINE__, cudaMalloc(&bb3d_GPU, 
// prefetch() renders one randomly posed mesh per batch entry. oreintation_count is indexed by
// floor((360-angle_yaw)/18) with angle_yaw drawn from xzRot_dis(0,360), i.e. by a bin in 0..20,
// so it is sized to hold at least 360/18 + 1 bins instead of only num_categories entries
// (which allowed an out-of-bounds write whenever the bin index reached num_categories).
sizeof(float)*15)); checkCUDA(__LINE__, cudaMalloc(&XYZimage_GPU, sizeof(float)*im_w*im_h*3)); //lock = std::async(std::launch::async,&RenderMeshDataLayer::prefetch,this); prefetch(); return memoryBytes; }; void prefetch(){ //checkCUDA(__LINE__,cudaSetDevice(GPU)); if(orein_cls){ memset(oreintation_label->CPUmem, 0, oreintation_label->numBytes()); memset(oreintation_label_w->CPUmem,0, oreintation_label_w->numBytes()); } //int tmpD; cudaGetDevice(&tmpD); std::cout<<"GPU at LINE "<<__LINE__<<" = "< xzRot_dis(0,360); std::uniform_real_distribution cam_tilt_dis(-5,0); std::uniform_real_distribution objz_dis(1.500,5.000);//depth std::uniform_real_distribution objx_dis(-0.250,0.250);//left right int batch_id = 0; std::vector category_count(num_categories,0); std::vector oreintation_count(std::max<unsigned int>(num_categories, 360/18 + 1),0); unsigned long long time0,time1,time2,time3; unsigned long long T1 = 0; unsigned long long T2 = 0; unsigned long long T3 = 0; while(batch_id < sum(batch_size)){ time0 = get_timestamp_dss(); Box3D objbox = sceneMeshList[counter]->objects[0]; Mesh3D model(sceneMeshList[counter]->mesh_file); time1 = get_timestamp_dss(); model.zeroCenter(); float Ryzswi[9] = {1, 0, 0, 0, 0, 1, 0, -1, 0}; model.roateMesh(Ryzswi); std::normal_distribution objh_dis (heigth_dis[2*objbox.category],heigth_dis[2*objbox.category+1]); float objh = objh_dis(rng); float sizeTarget[3] = {-1,objh,-1};// only scale height float scale_ratio = model.scaleMesh(sizeTarget); float angle_yaw = xzRot_dis(rng); float* R = genRotMat(angle_yaw); model.roateMesh(R); Point3D newcenter; newcenter.z = objz_dis(rng); newcenter.x = objx_dis(rng)*newcenter.z; newcenter.y = 0.5*objh-1; //newcenter.x = 0; newcenter.y = -0.4;//1.24267; newcenter.z = 2.65842; model.translate(newcenter); time2 = get_timestamp_dss(); float tilt_angle = cam_tilt_dis(rng); float* Rtilt = genTiltMat(tilt_angle); float camRT[12] = {0}; for (int i = 0; i<3; ++i){ for (int j = 0; j<3; ++j){ camRT[i*4+j] = Rtilt[i*3+j]; } } float* depth = 
// The box is scaled by scale_ratio, re-centred on the transformed mesh and its base rotated by
// -angle_yaw; R_data is the transpose of Rtilt. Pixels that fail the depth test (upper bound 8)
// are set to 10 and not counted; the rendered view is used only when numValid > 500. The 15
// floats copied to bb3d_GPU start at objbox.base and run on through center and coeff.
renderCameraView(&model, camK, camRT,im_w, im_h); /************************ box in point could coordinate ************************/ //transform the box for (int i =0;i<3;++i){ objbox.coeff[i] = scale_ratio*objbox.coeff[i]; } Point3D bbcenter = model.getBoxCenter(); objbox.center[0] = -1*bbcenter.x; objbox.center[1] = bbcenter.z; objbox.center[2] = bbcenter.y; float Rotate_boxbase[4]; float angle_yaw_rad = -1*angle_yaw*3.14159265/180; Rotate_boxbase[0] = cos(angle_yaw_rad)*objbox.base[0]-sin(angle_yaw_rad)*objbox.base[1]; Rotate_boxbase[1] = cos(angle_yaw_rad)*objbox.base[3]-sin(angle_yaw_rad)*objbox.base[4]; Rotate_boxbase[2] = sin(angle_yaw_rad)*objbox.base[0]+cos(angle_yaw_rad)*objbox.base[1]; Rotate_boxbase[3] = sin(angle_yaw_rad)*objbox.base[3]+cos(angle_yaw_rad)*objbox.base[4]; objbox.base[0] = Rotate_boxbase[0]; objbox.base[1] = Rotate_boxbase[2]; objbox.base[3] = Rotate_boxbase[1]; objbox.base[4] = Rotate_boxbase[3]; float R_data[9]; for (int i = 0; i<3; ++i){ for (int j = 0; j<3; ++j){ R_data[i*3+j] = Rtilt[j*3+i]; } } int numValid = 0; for (int i=1;i0&depth[i]<8) {numValid++;} else {depth[i] = 10;} } if (numValid>500){ // copy data to GPU RGBDpixel * RGBDimage = NULL; checkCUDA(__LINE__, cudaMemcpy(K_GPU, (float*)camK, sizeof(float)*9, cudaMemcpyHostToDevice)); checkCUDA(__LINE__, cudaMemcpy(R_GPU, (float*)R_data, sizeof(float)*9, cudaMemcpyHostToDevice)); checkCUDA(__LINE__, cudaMemcpy(depth_GPU, (float*)depth, sizeof(float)*im_w*im_h, cudaMemcpyHostToDevice)); checkCUDA(__LINE__, cudaMemcpy(bb3d_GPU, objbox.base, sizeof(float)*15, cudaMemcpyHostToDevice)); compute_xyzkernel<<>>(XYZimage_GPU,depth_GPU,K_GPU,R_GPU); StorageT * tsdf_data_GPU = &dataGPU[batch_id*grid_size[0]*grid_size[1]*grid_size[2]*grid_size[3]]; int THREADS_NUM = 1024; int BLOCK_NUM = int((grid_size[1]*grid_size[2]*grid_size[3] + size_t(THREADS_NUM) - 1) / THREADS_NUM); //std::cout<<"BLOCK_NUM"<>>(tsdf_data_GPU, R_GPU, K_GPU, RGBDimage, XYZimage_GPU, bb3d_GPU, 
// The category goes to labelCPU and, when orein_cls is set, the orientation bin to
// oreintation_label. NOTE(review): the bin is 20 when angle_yaw is exactly 0 - confirm that
// num_oreintation covers that label value. The remainder of this line closes
// RenderMeshDataLayer and opens Scene2DDataLayer (members and the start of its init()).
grid_size[1],grid_size[2],grid_size[3], grid_size[0], im_w, im_h, encode_type, scale); labelCPU->CPUmem[batch_id] = CPUCompute2StorageT(ComputeT(objbox.category)); int oreintation = floor((360-angle_yaw)/18); if (orein_cls){ oreintation_label -> CPUmem[batch_id*num_categories + objbox.category] = CPUCompute2StorageT(ComputeT(oreintation)); for (int cid = 0;cid CPUmem[batch_id*num_oreintation*num_categories + cid*num_categories + objbox.category] = CPUCompute2StorageT(1); } } oreintation_count[oreintation]++; category_count[objbox.category]++; batch_id++; /* for (int h=0; h<480; h=h+8){ for (int w=0; w<640; w = w+4) if (depth[h+w*480]<8) std::cout<<"."; else std::cout<<" "; std::cout<mesh_file<= sceneMeshList.size()){ counter = 0; ++epoch_prefetch; shuffle(); } //std::cout<dataGPU,dataGPU); checkCUDA(__LINE__, cudaMemcpy(out[1]->dataGPU, labelCPU->CPUmem, labelCPU->numBytes(), cudaMemcpyHostToDevice) ); if(orein_cls){ //std::cout<<"copied"<dataGPU, oreintation_label->CPUmem, oreintation_label->numBytes(), cudaMemcpyHostToDevice) ); checkCUDA(__LINE__, cudaMemcpy(out[3]->dataGPU, oreintation_label_w->CPUmem, oreintation_label_w->numBytes(), cudaMemcpyHostToDevice) ); } epoch = epoch_prefetch; //lock = std::async(std::launch::async,&RenderMeshDataLayer::prefetch,this); prefetch(); }; };//end of RenderMeshDataLayer class Scene2DDataLayer : public DataLayer { public: int epoch_prefetch; Tensor* rois; StorageT* HHAdata_GPU; StorageT* IMAdata_GPU; std::vector imagesize; int batch_size; int numBoxperImage; std::vector scenes; std::string file_list; std::string hha_root; std::string image_root; std::future lock; //float up_scale; int box_type; //int up_size; // int numofitems(){ int total_number = scenes.size()*numBoxperImage; return total_number; }; void shuffle(){ return; }; void init(){ epoch_prefetch = 0; train_me = false; counter = 0; std::cout<<"loading file "<filename.resize(len); if (len>0) file_size += fread((void*)(scene->filename.data()), sizeof(char), len, fp); 
// Per scene the list stores R[9], K[9], height, width and an object count, followed by the
// tight and the full 2D box (tblr[4]) of each object; each box is followed by a hasTarget byte
// that is expected to be 0 (a message is printed otherwise). The allocation code creates the
// rois tensor with batch_size*numBoxperImage rows of 5 values and two device image buffers.
file_size += fread((void*)(scene->R), sizeof(float), 9, fp); file_size += fread((void*)(scene->K), sizeof(float), 9, fp); file_size += fread((void*)(&scene->height), sizeof(unsigned int), 1, fp); file_size += fread((void*)(&scene->width), sizeof(unsigned int), 1, fp); file_size += fread((void*)(&len), sizeof(unsigned int), 1, fp); scene->objects.resize(len); //std::cout<filename <objects_2d_tight.push_back(box); uint8_t hasTarget = 0; file_size += fread((void*)(&hasTarget), sizeof(uint8_t), 1, fp); if (hasTarget>0){ std::cout<<" sth wrong in line " << __LINE__ << std::endl; } file_size += fread((void*)(box.tblr), sizeof(float), 4, fp); scene->objects_2d_full.push_back(box); file_size += fread((void*)(&hasTarget), sizeof(uint8_t), 1, fp); if (hasTarget>0){ std::cout<<" sth wrong in line " << __LINE__ << std::endl; } } scenes.push_back(scene); } fclose(fp); //std::cout< roi_data_dim(4,1); std::vector image_data_dim(4,1); roi_data_dim[0] = batch_size*numBoxperImage; roi_data_dim[1] = 5; image_data_dim[0] = batch_size; image_data_dim[1] = 3; image_data_dim[2] = imagesize[0]; image_data_dim[3] = imagesize[1]; rois = new Tensor(roi_data_dim); checkCUDA(__LINE__, cudaMalloc(&IMAdata_GPU, sizeof(float)*batch_size*3*imagesize[0]*imagesize[1])); checkCUDA(__LINE__, cudaMalloc(&HHAdata_GPU, sizeof(float)*batch_size*3*imagesize[0]*imagesize[1])); out[0]->need_diff = false; memoryBytes += out[0]->Malloc(roi_data_dim); veciPrint(out[0]->dim); std::cout<need_diff = false; memoryBytes += out[1]->Malloc(image_data_dim); out[2]->need_diff = false; memoryBytes += out[2]->Malloc(image_data_dim); //lock = std::async(std::launch::async,&Scene2DDataLayer::prefetch,this); prefetch(); return memoryBytes; }; void prefetch(){ checkCUDA(__LINE__,cudaSetDevice(GPU)); memset(rois->CPUmem, 0, rois->numBytes()); float up_scale = std::min(float(imagesize[0])/float(scenes[counter]->height),float(imagesize[1])/float(scenes[counter]->width)); for (int batch_id = 0;batch_id < batch_size;batch_id++){ 
// Each ROI row holds 5 values: the batch index followed by the four box coordinates scaled by
// up_scale (minus 1). The row of box box_id in image batch_id is batch_id*numBoxperImage+box_id,
// so its first element sits at 5*(batch_id*numBoxperImage+box_id). The image and HHA tensors
// loaded from disk are released as soon as they have been written to the device buffers.
// forward() then uploads rois, swaps the image buffers into out[1]/out[2] and prefetches again.
for(int box_id = 0; box_id < scenes[counter]->objects_2d_tight.size();box_id++){ rois->CPUmem[5*(batch_id*numBoxperImage+box_id)+0] = CPUCompute2StorageT((float)batch_id); for (int i =0;i<4;i++){ if (box_type==0){ rois->CPUmem[5*(batch_id*numBoxperImage+box_id)+i+1] = CPUCompute2StorageT((float)(scenes[counter]->objects_2d_tight[box_id].tblr[i]*up_scale-1)); } else if(box_type==1){ rois->CPUmem[5*(batch_id*numBoxperImage+box_id)+i+1] = CPUCompute2StorageT((float)(scenes[counter]->objects_2d_full[box_id].tblr[i]*up_scale-1)); } else{ FatalError(__LINE__); } } } //size_t numel_batch_all = 3*imagesize[0]*imagesize[1]; Tensor* IMA = new Tensor(image_root+ scenes[counter]->filename + ".tensor"); IMA->writeGPU(&IMAdata_GPU[batch_id*3*imagesize[0]*imagesize[1]]); delete IMA; // read HH image Tensor* HHA = new Tensor(hha_root + scenes[counter]->filename + ".tensor"); HHA->writeGPU(&HHAdata_GPU[batch_id*3*imagesize[0]*imagesize[1]]); delete HHA; counter++; } if (counter >= scenes.size()){ counter = 0; epoch_prefetch++; } } void forward(Phase phase_){ //lock.wait(); checkCUDA(__LINE__,cudaDeviceSynchronize()); checkCUDA(__LINE__, cudaMemcpy(out[0]->dataGPU, rois->CPUmem, rois->numBytes(), cudaMemcpyHostToDevice) ); std::swap(out[1]->dataGPU,IMAdata_GPU); std::swap(out[2]->dataGPU,HHAdata_GPU); epoch = epoch_prefetch; prefetch(); //lock = std::async(std::launch::async,&Scene2DDataLayer::prefetch,this); }; };//Scene2DDataLayer