/*
 * ----------------------------------------------------------------------------
 * Marvin: N-Dimensional ConvNets on GPUs
 * Copyright (C) 2015 Princeton Vision Group
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * Princeton Vision Group wrote this file. As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.
 * ----------------------------------------------------------------------------
 */

// Please choose a data type to compile
#define DATATYPE 1

#if DATATYPE==0
    #pragma message "Compiling using StorageT=half ComputeT=float"
    #define StorageT half
    #define ComputeT float
    #define sizeofStorageT 2
    #define sizeofComputeT 4
    #define CUDNNStorageT CUDNN_DATA_HALF
    #define CPUStorage2ComputeT(x) (cpu_half2float(x))
    #define CPUCompute2StorageT(x) (cpu_float2half(x))
    #define GPUStorage2ComputeT(x) (__half2float(x))
    #define GPUCompute2StorageT(x) (__float2half(x))
    #define GPUgemm Hgemm
    #define GPUasum Hasum
    #define ISNAN(x) (ishnan(x))
    #define ComputeT_MIN FLT_MIN
    #include <cuda_fp16.h>
#elif DATATYPE==1
    #pragma message "Compiling using StorageT=float ComputeT=float"
    #define StorageT float
    #define ComputeT float
    #define sizeofStorageT 4
    #define sizeofComputeT 4
    #define CUDNNStorageT CUDNN_DATA_FLOAT
    #define CPUStorage2ComputeT(x) (x)
    #define CPUCompute2StorageT(x) (x)
    #define GPUStorage2ComputeT(x) (x)
    #define GPUCompute2StorageT(x) (x)
    #define GPUgemm cublasSgemm
    #define GPUasum cublasSasum
    #define ISNAN(x) (std::isnan(x))
    #define ComputeT_MIN FLT_MIN
#elif DATATYPE==2
    #pragma message "Compiling using StorageT=double ComputeT=double"
    #define StorageT double
    #define ComputeT double
    #define sizeofStorageT 8
    #define sizeofComputeT 8
    #define CUDNNStorageT CUDNN_DATA_DOUBLE
    #define CPUStorage2ComputeT(x) (x)
    #define CPUCompute2StorageT(x) (x)
    #define GPUStorage2ComputeT(x) (x)
    #define GPUCompute2StorageT(x) (x)
    #define GPUgemm cublasDgemm
    #define GPUasum cublasDasum
    #define ISNAN(x) (std::isnan(x))
    #define ComputeT_MIN DBL_MIN
#endif

//////////////////////////////////////////////////////////////////////////////////////////////////
// Includes
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <cstdlib>
#include <cstdio>
#include <cstdarg>
#include <cstring>
#include <cctype>
#include <cmath>
#include <cfloat>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <random>
#include <typeinfo>
#include <typeindex>
#include <thread>
#include <chrono>
#include <future>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include <curand.h>

namespace marvin {

//////////////////////////////////////////////////////////////////////////////////////////////////
// Type definition
//////////////////////////////////////////////////////////////////////////////////////////////////

enum Filler { Xavier, Gaussian, Constant };
enum Pool { Max, Average, Sum };
enum LossObjective { MultinomialLogistic_StableSoftmax, MultinomialLogistic, SmoothL1, Contrastive, EuclideanSSE, HingeL1, HingeL2, SigmoidCrossEntropy, Infogain };
enum Phase { Training, Testing, TrainingTesting };
enum LRPolicy { LR_fixed, LR_step, LR_exp, LR_inv, LR_multistep, LR_poly, LR_sigmoid, LR_cyclical };
enum SolverAlgorithm { SGD, AdaGrad, NAG };
enum Regularizer { L2, L1 };
enum LRN { CrossChannel, DivisiveNormalization };
enum ElementWiseOp { ElementWise_EQL, ElementWise_MUL, ElementWise_SUM, ElementWise_MAX };

ComputeT anyval;
ComputeT oneval = 1;
ComputeT zeroval = 0;
const void* one = static_cast<const void*>(&oneval);
const void* zero = static_cast<const void*>(&zeroval);
const ComputeT* oneComputeT = &oneval;
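// These one/zero scalars are kept around as the host-side alpha/beta scaling
// factors that cuDNN/cuBLAS routines take (alpha=1, beta=0 overwrites the
// output, beta=1 accumulates into it). The *Storage2ComputeT /
// *Compute2StorageT macros selected by DATATYPE convert between the precision
// used for GPU storage (StorageT) and the precision used for arithmetic
// (ComputeT). A minimal round-trip sketch (illustrative only):
//
//     StorageT s = CPUCompute2StorageT(0.5f);  // compute -> storage precision
//     ComputeT c = CPUStorage2ComputeT(s);     // storage -> compute precision
//     // DATATYPE==1: both macros are identity; DATATYPE==0: they call
//     // cpu_float2half / cpu_half2float defined later in this file.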
const ComputeT* zeroComputeT = &zeroval; ////////////////////////////////////////////////////////////////////////////////////////////////// // debugging utility ////////////////////////////////////////////////////////////////////////////////////////////////// void FatalError(const int lineNumber=0) { std::cerr << "FatalError"; if (lineNumber!=0) std::cerr<<" at LINE "< 0x7f800000) { ret.x = 0x7fffU; return ret; } sign = ((x >> 16) & 0x8000); // Get rid of +Inf/-Inf, +0/-0. if (u > 0x477fefff) { ret.x = sign | 0x7c00U; return ret; } if (u < 0x33000001) { ret.x = (sign | 0x0000); return ret; } exponent = ((u >> 23) & 0xff); mantissa = (u & 0x7fffff); if (exponent > 0x70) { shift = 13; exponent -= 0x70; } else { shift = 0x7e - exponent; exponent = 0; mantissa |= 0x800000; } lsb = (1 << shift); lsb_s1 = (lsb >> 1); lsb_m1 = (lsb - 1); // Round to nearest even. remainder = (mantissa & lsb_m1); mantissa >>= shift; if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { ++mantissa; if (!(mantissa & 0x3ff)) { ++exponent; mantissa = 0; } } ret.x = (sign | (exponent << 10) | mantissa); return ret; } float cpu_half2float(half h) { unsigned sign = ((h.x >> 15) & 1); unsigned exponent = ((h.x >> 10) & 0x1f); unsigned mantissa = ((h.x & 0x3ff) << 13); if (exponent == 0x1f) { /* NaN or Inf */ mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); exponent = 0xff; } else if (!exponent) { /* Denorm or Zero */ if (mantissa) { unsigned int msb; exponent = 0x71; do { msb = (mantissa & 0x400000); mantissa <<= 1; /* normalize */ --exponent; } while (!msb); mantissa &= 0x7fffff; /* 1.mantissa is implicit */ } } else { exponent += 0x70; } int temp = ((sign << 31) | (exponent << 23) | mantissa); return *((float*)((void*)&temp)); } bool operator <(const half& x, const half& y) { return cpu_half2float(x) < cpu_half2float(y); } std::ostream& operator<< (std::ostream& stream, const half& x) { stream << cpu_half2float(x); return stream; } ////////////////////////////////////////////////////////////////////////////////////////////////// // JSON parser ////////////////////////////////////////////////////////////////////////////////////////////////// enum JSONType { JSON_String, JSON_Bool, JSON_Null, JSON_Number, JSON_Object, JSON_ObjectArray}; // plain object class JSON{ public: JSONType type; std::vector array; std::map member; ~JSON(){ for (int i=0;i::iterator it = member.begin(); it != member.end(); it++ ){ if (it->second != NULL) delete it->second; } }; std::string returnString(){ if (type!=JSON_String) FatalError(__LINE__); return *((std::string*)(array[0])); }; bool returnBool(){ if (type!=JSON_Bool) FatalError(__LINE__); return *((bool*)(array[0])); }; ComputeT returnReal(){ if (type!=JSON_Number) FatalError(__LINE__); return *((ComputeT*)(array[0])); }; std::vector returnIntVector(){ if (type!=JSON_Number) FatalError(__LINE__); std::vector v(array.size()); for (int i=0;i returnRealVector(){ if (type!=JSON_Number) FatalError(__LINE__); std::vector v(array.size()); for (int i=0;i returnStringVector(){ if (type!=JSON_String) FatalError(__LINE__); std::vector v(array.size()); for (int i=0;imember.find(name) == this->member.end()){ FatalError(__LINE__); } else variable = (unsigned int)this->member[name]->returnReal(); }; void setOrDie(std::string name, bool &variable){ if (this->member.find(name) == this->member.end()){ FatalError(__LINE__); } else variable = this->member[name]->returnBool(); }; void setOrDie(std::string name, std::vector &variable){ if (this->member.find(name) == this->member.end()) 
FatalError(__LINE__); else variable = this->member[name]->returnRealVector(); }; void set(std::string name, bool &variable, bool default_value){ if (this->member.find(name) == this->member.end()) variable = default_value; else variable = this->member[name]->returnBool(); }; void set(std::string name, ComputeT &variable, ComputeT default_value){ if (this->member.find(name) == this->member.end()) variable = default_value; else variable = (ComputeT)(this->member[name]->returnReal()); }; void setOrDie(std::string name, ComputeT &variable){ if (this->member.find(name) == this->member.end()) FatalError(__LINE__); else variable = (ComputeT)(this->member[name]->returnReal()); }; void set(std::string name, int &variable, int default_value){ if (this->member.find(name) == this->member.end()) variable = default_value; else variable = (int)(this->member[name]->returnReal()); }; void set(std::string name, unsigned int &variable, unsigned int default_value){ if (this->member.find(name) == this->member.end()) variable = default_value; else variable = (unsigned int)(this->member[name]->returnReal()); }; void setOrDie(std::string name, int &variable){ if (this->member.find(name) == this->member.end()) FatalError(__LINE__); else variable = (int)(this->member[name]->returnReal()); }; void set(std::string name, std::vector &variable, std::vector default_value){ if (this->member.find(name) == this->member.end()) variable = default_value; else variable = this->member[name]->returnIntVector(); }; void set(std::string name, std::vector &variable, std::vector default_value){ if (this->member.find(name) == this->member.end()) variable = default_value; else variable = this->member[name]->returnRealVector(); }; void set(std::string name, std::vector &variable, std::vector default_value){ if (this->member.find(name) == this->member.end()) variable = default_value; else variable = this->member[name]->returnStringVector(); }; void setOrDie(std::string name, std::vector &variable){ if (this->member.find(name) == this->member.end()) FatalError(__LINE__); else variable = this->member[name]->returnStringVector(); }; void setOrDie(std::string name, std::vector &variable){ if (this->member.find(name) == this->member.end()) {std::cout<<"not find "<member[name]->returnIntVector(); }; void set(std::string name, std::string &variable, std::string default_value){ if (this->member.find(name) == this->member.end()) variable = default_value; else variable = this->member[name]->returnString(); }; void setOrDie(std::string name, std::string &variable){ if (this->member.find(name) == this->member.end()) FatalError(__LINE__); else variable = this->member[name]->returnString(); }; // enum ElementWiseOp { ElementWise_EQL, ElementWise_MUL, ElementWise_SUM, ElementWise_MAX }; void setOrDie(std::string name, ElementWiseOp &variable){ if (this->member.find(name) == this->member.end()) FatalError(__LINE__); else if (0 == this->member[name]->returnString().compare("ElementWise_EQL")) variable = ElementWise_EQL; else if (0 == this->member[name]->returnString().compare("ElementWise_MUL")) variable = ElementWise_MUL; else if (0 == this->member[name]->returnString().compare("ElementWise_SUM")) variable = ElementWise_SUM; else if (0 == this->member[name]->returnString().compare("ElementWise_MAX")) variable = ElementWise_MAX; else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; else if (0 == this->member[name]->returnString().compare("Xavier")) variable = Xavier; else if (0 == 
this->member[name]->returnString().compare("Gaussian")) variable = Gaussian; else if (0 == this->member[name]->returnString().compare("Constant")) variable = Constant; else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; else if (0 == this->member[name]->returnString().compare("Max")) variable = Max; else if (0 == this->member[name]->returnString().compare("Average")) variable = Average; else if (0 == this->member[name]->returnString().compare("Sum")) variable = Sum; else{ std::cout<<"Pool Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) FatalError(__LINE__); else if (0 == this->member[name]->returnString().compare("MultinomialLogistic_StableSoftmax")) variable = MultinomialLogistic_StableSoftmax; else if (0 == this->member[name]->returnString().compare("MultinomialLogistic")) variable = MultinomialLogistic; else if (0 == this->member[name]->returnString().compare("SmoothL1")) variable = SmoothL1; else if (0 == this->member[name]->returnString().compare("Contrastive")) variable = Contrastive; else if (0 == this->member[name]->returnString().compare("EuclideanSSE")) variable = EuclideanSSE; else if (0 == this->member[name]->returnString().compare("HingeL1")) variable = HingeL1; else if (0 == this->member[name]->returnString().compare("HingeL2")) variable = HingeL2; else if (0 == this->member[name]->returnString().compare("SigmoidCrossEntropy")) variable = SigmoidCrossEntropy; else if (0 == this->member[name]->returnString().compare("Infogain")) variable = Infogain; else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; else if (0 == this->member[name]->returnString().compare("Training")) variable = Training; else if (0 == this->member[name]->returnString().compare("Testing")) variable = Testing; else if (0 == this->member[name]->returnString().compare("TrainingTesting")) variable = TrainingTesting; else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; else if (0 == this->member[name]->returnString().compare("LR_fixed")) variable = LR_fixed; else if (0 == this->member[name]->returnString().compare("LR_step")) variable = LR_step; else if (0 == this->member[name]->returnString().compare("LR_exp")) variable = LR_exp; else if (0 == this->member[name]->returnString().compare("LR_inv")) variable = LR_inv; else if (0 == this->member[name]->returnString().compare("LR_multistep")) variable = LR_multistep; else if (0 == this->member[name]->returnString().compare("LR_poly")) variable = LR_poly; else if (0 == this->member[name]->returnString().compare("LR_sigmoid")) variable = LR_sigmoid; else if (0 == this->member[name]->returnString().compare("LR_cyclical")) variable = LR_cyclical; else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; else if (0 == this->member[name]->returnString().compare("SGD")) variable = SGD; else if (0 == this->member[name]->returnString().compare("AdaGrad")) variable = AdaGrad; else if (0 == this->member[name]->returnString().compare("NAG")) variable = NAG; else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; else if (0 == this->member[name]->returnString().compare("L2")) variable = L2; else if (0 == this->member[name]->returnString().compare("L1")) variable = L1; else{ 
std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; else if (0 == this->member[name]->returnString().compare("CrossChannel")) variable = CrossChannel; else if (0 == this->member[name]->returnString().compare("DivisiveNormalization")) variable = DivisiveNormalization; else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; else if (0 == this->member[name]->returnString().compare("max")) variable = CUDNN_POOLING_MAX; else if (0 == this->member[name]->returnString().compare("average_include")) variable = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; else if (0 == this->member[name]->returnString().compare("average_exclude")) variable = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; else if (0 == this->member[name]->returnString().compare("sigmoid")) variable = CUDNN_ACTIVATION_SIGMOID; else if (0 == this->member[name]->returnString().compare("ReLU")) variable = CUDNN_ACTIVATION_RELU; else if (0 == this->member[name]->returnString().compare("TanH")) variable = CUDNN_ACTIVATION_TANH; else{ std::cout<<"Unsupported "<member[name]->returnString()<1) std::cout<<"["; for (int i=0;i0) std::cout<< ","; std::cout << "\"" << *((std::string*)(array[i])) << "\"" ; } if (array.size()>1) std::cout<<"]"; std::cout<1) std::cout<<"["; for (int i=0;i0) std::cout<< ","; std::cout << ((*((bool*)(array[i])))? "true": "false"); } if (array.size()>1) std::cout<<"]"; std::cout<1) std::cout<<"["; for (int i=0;i0) std::cout<< ","; std::cout << "null"; } if (array.size()>1) std::cout<<"]"; std::cout<1) std::cout<<"["; for (int i=0;i0) std::cout<< ","; std::cout << *((ComputeT*)(array[i])); } if (array.size()>1) std::cout<<"]"; std::cout<::iterator it = member.begin(); it != member.end(); it++ ){ std::cout << "\t" << it->first << ": "; it->second->print(); } std::cout<<"}"; break; case JSON_ObjectArray: std::cout<<"["<print(); if (i0){ int e = input.find(","); if (e==std::string::npos){ e = input.size(); } std::string first = input.substr(0,e); if (first[0]=='\"'){ type = JSON_String; std::string* p = new std::string(first.substr(1,first.size()-2)); array.push_back((void*)p); }else if (first[0]=='t'){ type = JSON_Bool; bool* p = new bool(true); array.push_back((void*)p); }else if (first[0]=='f'){ type = JSON_Bool; bool* p = new bool(false); array.push_back((void*)p); }else if (first[0]=='n'){ type = JSON_Null; void* p = NULL; array.push_back((void*)p); }else{ type = JSON_Number; ComputeT* p = new ComputeT(stof(first)); array.push_back((void*)p); } if(e+1parseNumberOrTextArray(input.substr(0,e+1)); this->member[name] = p; if (e+2parseNumberOrTextArray(input.substr(0,e)); this->member[name] = p; if (e+1parseNumberOrTextArray(input.substr(0,e)); this->member[name] = p; if (e+10){ int e = input.find("}")+1; if (e==std::string::npos){ e = input.size(); } std::string first = input.substr(0,e); JSON* pObj = new JSON; pObj->parseObject(first); array.push_back((void*)pObj); if(e+1set(#attribute,attribute,value); #define SetOrDie(obj,attribute) obj->setOrDie(#attribute,attribute); void parseNetworkJSON(std::string filename, JSON* train_obj, JSON* test_obj, JSON* architecture_obj){ //std::string filename = "lenet.json"; std::ifstream t(filename); std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); str.erase(remove_if(str.begin(), str.end(), 
(int(*)(int))isspace), str.end()); std::string input = str; int b,e; b = input.find("\"train\""); std::string train_str = input.substr(b+7); b = train_str.find("{"); e = train_str.find("}"); train_str=train_str.substr(b,e-b+1); if (train_obj!=NULL) train_obj->parseObject(train_str); b = input.find("\"test\""); std::string test_str = input.substr(b+6); b = test_str.find("{"); e = test_str.find("}"); test_str=test_str.substr(b,e-b+1); if (test_obj!=NULL) test_obj->parseObject(test_str); b=input.find("\"layers\""); input = input.substr(b+9); e=input.find("}]"); if (architecture_obj!=NULL) architecture_obj->parseObjectArray(input); } ////////////////////////////////////////////////////////////////////////////////////////////////// // Utility ////////////////////////////////////////////////////////////////////////////////////////////////// bool is_file_exist(const std::string& fileName){ std::ifstream infile(fileName); return infile.good(); } void memorySizePrint(size_t bytes){ if (bytes<512){ std::cout<& v){ std::cout<<"["<0) std::cout<1){ for (int i=1;i& v){ std::cout<<"["; if (v.size()>0) std::cout<1){ for (int i=1;i veci(int n, ...){ std::vector v; if (n==0) return v; va_list ap; va_start(ap, n); for(int i = 0; i < n; i++) { v.push_back(va_arg(ap, int)); } va_end(ap); return v; } std::vector vecs(int n, ...){ std::vector v; if (n==0) return v; va_list ap; va_start(ap, n); for(int i = 0; i < n; i++) { v.push_back(std::string(va_arg(ap, char*))); } va_end(ap); return v; } std::vector getStringVector(std::string input){ std::vector ret; while (input.size()>0){ int e = input.find(","); if (e==std::string::npos){ e = input.size(); } std::string first = input.substr(0,e); ret.push_back(first); if(e+1 > getIntVectorVector(std::string input){ //remove all space input.erase(remove_if(input.begin(), input.end(), (int(*)(int))isspace), input.end()); std::vector > ret; while (input.size()>0){ int e; if (input[0]=='['){ ret.resize(ret.size()+1); e=0; }else if (input[0]==','){ e=0; }else if (input[0]==']'){ e=0; }else{ e = input.find(","); if (e==std::string::npos){ e = input.size(); } int f = input.find("]"); if (f==std::string::npos){ f = input.size(); } e = min(e,f); std::string first = input.substr(0,e); ret[ret.size()-1].push_back(stoi(first)); } if(e+1& dim){ //return accumulate(dim.begin(), dim.end(), 1, multiplies()); size_t res = 1; for (int i=0;i& dim){ //return accumulate(dim.begin()+1, dim.end(), 1, multiplies()); size_t res = 1; for (int i=1;i& dim){ //return accumulate(dim.begin()+1, dim.end(), 1, multiplies()); size_t res = 1; for (int i=2;i& dimA, const std::vector& dimB){ if (dimA.size()!=dimB.size()) return false; for (int i=0;i& dimA, const std::vector& dimB){ if (dimA.size()!=dimB.size()) return false; if (dimA[0]!=dimB[0]) return false; for (int i=2;i0){ std::cout<0) FatalError(); return countNaN; } std::vector randperm(size_t n, std::mt19937& rng){ std::vector v(n); for (size_t i=0;i std::vector sort_indexes(const std::vector &v) { // initialize original index locations std::vector idx(v.size()); for (size_t i = 0; i != idx.size(); ++i) idx[i] = i; // sort indexes based on comparing values in v std::sort(idx.begin(), idx.end(), [&v](size_t i1, size_t i2) {return v[i1] < v[i2];}); return idx; } ////////////////////////////////////////////////////////////////////////////////////////////////// // CUDA kernels ////////////////////////////////////////////////////////////////////////////////////////////////// #define CUDA_NUM_THREADS 512 #define MAX_NUM_BLOCKS 2880 inline int 
CUDA_GET_BLOCKS(const size_t N) { return min(MAX_NUM_BLOCKS, int((N + size_t(CUDA_NUM_THREADS) - 1) / CUDA_NUM_THREADS)); } inline size_t CUDA_GET_LOOPS(const size_t N) { size_t total_threads = CUDA_GET_BLOCKS(N)*CUDA_NUM_THREADS; return (N + total_threads -1)/ total_threads; } __global__ void Accuracy_MultinomialLogistic(size_t CUDA_NUM_LOOPS, size_t N, int C, int M, const StorageT* pred, const StorageT* label, const StorageT* weight, StorageT* loss){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ int l = int(GPUStorage2ComputeT(label[idx])); int baseID = (idx/M) * C * M + idx % M; int elementID = baseID + l * M ; ComputeT prob = GPUStorage2ComputeT(pred[elementID]); loss[idx] = GPUCompute2StorageT(1); for(int d=0;dprob){ loss[idx]= GPUCompute2StorageT(0); } } if (weight!=NULL){ loss[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(loss[idx]) * GPUStorage2ComputeT(weight[l]) ); } } } __global__ void Loss_MultinomialLogistic(size_t CUDA_NUM_LOOPS, size_t N, int C, int M, const StorageT* pred, const StorageT* label, const StorageT* weight, const StorageT* weightTensor, StorageT* loss){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ int l = int(GPUStorage2ComputeT(label[idx])); int offset = l * M + (idx % M); int elementID = (idx/M) * C * M + offset; ComputeT prob = max( GPUStorage2ComputeT(pred[elementID]), ComputeT_MIN); ComputeT res = log(prob); //if (GPUStorage2ComputeT(weightTensor[offset])>0){ // printf("prob=%f res = %f * weightTensor[%d] \n",prob, res, offset); //,GPUStorage2ComputeT(weightTensor[offset])); //} if (weight!=NULL) res *= GPUStorage2ComputeT(weight[l]); if (weightTensor!=NULL) res *= GPUStorage2ComputeT(weightTensor[elementID]); loss[idx] = GPUCompute2StorageT( res ); } } __global__ void LossGrad_MultinomialLogistic(size_t CUDA_NUM_LOOPS, size_t N, int C, int M, ComputeT scale, const StorageT* pred, const StorageT* label, const StorageT* weight, const StorageT* weightTensor, StorageT* diff){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ int l = int(GPUStorage2ComputeT(label[idx])); int offset = l * M + (idx % M); int elementID = (idx/M) * C * M + offset; ComputeT prob = max( GPUStorage2ComputeT(pred[elementID]), ComputeT_MIN); if (weight!=NULL) scale *= GPUStorage2ComputeT(weight[l]); if (weightTensor!=NULL) scale *= GPUStorage2ComputeT(weightTensor[elementID]); diff[elementID] = GPUCompute2StorageT( GPUStorage2ComputeT(diff[elementID]) + scale / prob ); } } // for numerical stability: http://freemind.pluskid.org/machine-learning/softmax-vs-softmax-loss-numerical-stability/ __global__ void LossGrad_MultinomialLogistic_StableSoftmax(size_t CUDA_NUM_LOOPS, size_t N, int C, int M, ComputeT scale, const StorageT* pred, const StorageT* label, const StorageT* weight, const StorageT* weightTensor, StorageT* diff){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ int l = int(GPUStorage2ComputeT(label[idx])); 
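        // l is the ground-truth class index for this sample/position. For the
        // fused softmax + multinomial-logistic loss, the gradient with respect
        // to the pre-softmax activation of class d reduces to
        //     scale * (p_d - [d == l]),
        // so, unlike LossGrad_MultinomialLogistic above, this stable variant
        // never divides by the predicted probability (see the numerical
        // stability note above this kernel).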
int modM = idx % M; int baseID = (idx/M) * C * M + modM; int elementID = baseID + l * M ; if (weight!=NULL){ scale *= GPUStorage2ComputeT(weight[l]); } if (weightTensor==NULL){ for(int d=0;d= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ ComputeT val = GPUStorage2ComputeT(pred[idx]) - GPUStorage2ComputeT(target[idx]); if (weight != NULL) val *= GPUStorage2ComputeT(weight[idx]); ComputeT abs_val = abs(val); if (abs_val < 1) { loss[idx] = GPUCompute2StorageT( 0.5 * val * val ); } else { loss[idx] = GPUCompute2StorageT( abs_val - 0.5 ); } } } __global__ void LossGrad_SmoothL1(size_t CUDA_NUM_LOOPS, size_t N, ComputeT scale, const StorageT* pred, const StorageT* target, const StorageT* weight, StorageT* diff){ // diff = scale * f'( weight * (pred - target) ) // f'(x) = x if |x| < 1 // = sign(x) otherwise const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ ComputeT val = GPUStorage2ComputeT(pred[idx]) - GPUStorage2ComputeT(target[idx]); if (weight != NULL) val *= GPUStorage2ComputeT(weight[idx]); ComputeT abs_val = abs(val); if (abs_val < 1) { diff[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(diff[idx]) + scale * val ); } else { diff[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(diff[idx]) + scale * ( (ComputeT(0) < val) - (val < ComputeT(0)) ) ); } } } // The first implementation of this cost did not exactly match the cost of // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2. // legacy_version = false (the default) uses (margin - d)^2 as proposed in the // Hadsell paper. New models should probably use this version. // legacy_version = true uses (margin - d^2). 
This is kept to support / // reproduce existing models and results // optional bool legacy_version = 2 [default = false]; __global__ void Loss_Contrastive(size_t CUDA_NUM_LOOPS, size_t N, int C, ComputeT margin, const StorageT* a, const StorageT* b, const StorageT* y, StorageT* loss){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ ComputeT d = 0.0; for (int c=0;c= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ if ((int)(GPUStorage2ComputeT(y[idx]))) { for (int c=0;c 0.0) { for (int c=0;c= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(__half2float(pIn[idx])) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(__half2float(pIn[idx])) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const float* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const double* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const uint8_t* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const uint16_t* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - 
GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const uint32_t* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const uint64_t* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const int8_t* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const int16_t* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const int32_t* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const int64_t* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) 
for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const char* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const bool* pIn, const StorageT* pMean, StorageT* pOut) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); } __global__ void Kernel_set_value(size_t CUDA_NUM_LOOPS, size_t N, StorageT* GPUdst, StorageT value){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ GPUdst[idx] = value; } } void GPU_set_value(size_t N, StorageT* GPUdst, StorageT value){ Kernel_set_value<<>>(CUDA_GET_LOOPS(N),N,GPUdst,value); checkCUDA(__LINE__,cudaGetLastError()); } void GPU_set_ones(size_t N, StorageT* GPUdst){ GPU_set_value(N, GPUdst, CPUCompute2StorageT(1)); } __global__ void Kernel_elementwise_multiplication(size_t CUDA_NUM_LOOPS, size_t N, StorageT* GPUdst, const StorageT* GPUsrcA, const StorageT* GPUsrcB){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ GPUdst[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(GPUsrcA[idx]) * GPUStorage2ComputeT(GPUsrcB[idx])); } } void GPU_elementwise_multiplication(size_t N, StorageT* GPUdst, const StorageT* GPUsrcA, const StorageT* GPUsrcB){ Kernel_elementwise_multiplication<<>>(CUDA_GET_LOOPS(N),N,GPUdst,GPUsrcA,GPUsrcB); checkCUDA(__LINE__,cudaGetLastError()); } __global__ void Kernel_elementwise_comparison(size_t CUDA_NUM_LOOPS, size_t N, StorageT* GPUdst, const StorageT* GPUsrcA, const StorageT* GPUsrcB){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ GPUdst[idx] = GPUCompute2StorageT(ComputeT(bool(GPUStorage2ComputeT(GPUdst[idx])) && (GPUStorage2ComputeT(GPUsrcA[idx]) == GPUStorage2ComputeT(GPUsrcB[idx])))); } } void GPU_elementwise_comparison(size_t N, StorageT* GPUdst, const StorageT* GPUsrcA, const StorageT* GPUsrcB){ 
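    // Host-side launcher. The kernel above ANDs an element-wise equality test
    // into GPUdst: GPUdst[idx] stays non-zero only if it was already non-zero
    // and GPUsrcA[idx] == GPUsrcB[idx], so repeated calls accumulate an
    // "all elements equal so far" mask. Work is partitioned like every other
    // launcher in this file: CUDA_GET_BLOCKS(N) blocks of CUDA_NUM_THREADS
    // threads, each thread covering CUDA_GET_LOOPS(N) consecutive indices.
    // For example, N = 1,000,000 gives min(2880, ceil(1e6/512)) = 1954 blocks
    // and 1 loop iteration per thread.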
Kernel_elementwise_comparison<<>>(CUDA_GET_LOOPS(N),N,GPUdst,GPUsrcA,GPUsrcB); //checkCUDA(__LINE__,cudaGetLastError()); } __global__ void Kernel_copyGPUforward(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* in, StorageT* out, int sizeofitem_in, int sizeofitem_out, int offset){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ int out_base = idx*sizeofitem_out+offset; int in_base = idx*sizeofitem_in; for(int i=0;i>>(CUDA_GET_LOOPS(N),N,in,out,sizeofitem_in,sizeofitem_out,offset); } __global__ void Kernel_copyGPUbackward(size_t CUDA_NUM_LOOPS, size_t N, StorageT* in, const StorageT* out, int sizeofitem_in, int sizeofitem_out, int offset){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ int in_base = idx*sizeofitem_in; int out_base = idx*sizeofitem_out+offset; for(int i=0;i>>(CUDA_GET_LOOPS(N),N,in,out,sizeofitem_in,sizeofitem_out,offset); } __global__ void Kernel_elementwise_acc(size_t CUDA_NUM_LOOPS, size_t N, StorageT* GPUdst, const StorageT* GPUsrc){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ GPUdst[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(GPUdst[idx]) + GPUStorage2ComputeT(GPUsrc[idx]) ); } } __global__ void Kernel_ROIforward_2D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* out, const StorageT* in, const StorageT* start, int od1, int od2, int od3, int id1, int id2, int id3){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ int n = (o / (od1*od2*od3)); int o1 = (o / ( od2*od3)) % od1; int o2 = (o / od3 ) % od2; int o3 = (o ) % od3; int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*3+0]))); int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*3+1]))); int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*3+2]))); int i = i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3; out[o] = in[i]; } } __global__ void Kernel_ROIforward_3D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* out, const StorageT* in, const StorageT* start, int od1, int od2, int od3, int od4, int id1, int id2, int id3, int id4){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ int n = (o / (od1*od2*od3*od4)); int o1 = (o / ( od2*od3*od4)) % od1; int o2 = (o / ( od3*od4)) % od2; int o3 = (o / ( od4)) % od3; int o4 = (o ) % od4; int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*4+0]))); int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*4+1]))); int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*4+2]))); int i4 = o4 + ((int)(GPUStorage2ComputeT(start[n*4+3]))); int i = i4 + (i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3 ) * id4; out[o] = in[i]; } } __global__ void Kernel_ROIforward_4D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* out, const StorageT* in, const StorageT* start, int od1, int od2, int od3, int od4, int od5, int id1, int id2, int id3, int id4, int id5){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * 
(size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ int n = (o / (od1*od2*od3*od4*od5)); int o1 = (o / ( od2*od3*od4*od5)) % od1; int o2 = (o / ( od3*od4*od5)) % od2; int o3 = (o / ( od4*od5)) % od3; int o4 = (o / ( od5)) % od4; int o5 = (o ) % od5; int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*5+0]))); int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*5+1]))); int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*5+2]))); int i4 = o4 + ((int)(GPUStorage2ComputeT(start[n*5+3]))); int i5 = o5 + ((int)(GPUStorage2ComputeT(start[n*5+4]))); int i = i5 + (i4 + (i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3 ) * id4) * id5; out[o] = in[i]; } } __global__ void Kernel_ROIbackward_2D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* out, StorageT* in, const StorageT* start, int od1, int od2, int od3, int id1, int id2, int id3){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ int n = (o / (od1*od2*od3)); int o1 = (o / ( od2*od3)) % od1; int o2 = (o / od3 ) % od2; int o3 = (o ) % od3; int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*3+0]))); int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*3+1]))); int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*3+2]))); int i = i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3; in[i] = GPUCompute2StorageT( GPUStorage2ComputeT(in[i]) + GPUStorage2ComputeT(out[o]) ); } } __global__ void Kernel_ROIbackward_3D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* out, StorageT* in, const StorageT* start, int od1, int od2, int od3, int od4, int id1, int id2, int id3, int id4){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ int n = (o / (od1*od2*od3*od4)); int o1 = (o / ( od2*od3*od4)) % od1; int o2 = (o / ( od3*od4)) % od2; int o3 = (o / ( od4)) % od3; int o4 = (o ) % od4; int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*4+0]))); int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*4+1]))); int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*4+2]))); int i4 = o4 + ((int)(GPUStorage2ComputeT(start[n*4+3]))); int i = i4 + (i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3 ) * id4; in[i] = GPUCompute2StorageT( GPUStorage2ComputeT(in[i]) + GPUStorage2ComputeT(out[o]) ); } } __global__ void Kernel_ROIbackward_4D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* out, StorageT* in, const StorageT* start, int od1, int od2, int od3, int od4, int od5, int id1, int id2, int id3, int id4, int id5){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ int n = (o / (od1*od2*od3*od4*od5)); int o1 = (o / ( od2*od3*od4*od5)) % od1; int o2 = (o / ( od3*od4*od5)) % od2; int o3 = (o / ( od4*od5)) % od3; int o4 = (o / ( od5)) % od4; int o5 = (o ) % od5; int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*5+0]))); int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*5+1]))); int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*5+2]))); int i4 = o4 + ((int)(GPUStorage2ComputeT(start[n*5+3]))); int i5 = o5 + ((int)(GPUStorage2ComputeT(start[n*5+4]))); int i = i5 + (i4 + (i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3 ) * id4) * id5; in[i] = 
GPUCompute2StorageT( GPUStorage2ComputeT(in[i]) + GPUStorage2ComputeT(out[o]) ); } } __global__ void Kernel_ROIPoolForward_2D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* in_data, const StorageT* in_rois, StorageT* out_data, size_t* argmax_data, const ComputeT spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; //printf("N=%d\n",N); for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ // (n, c, ph, pw) is an element in the pooled output int pw = (index) % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = (index / pooled_width / pooled_height / channels); //in_rois += n * 5; int roi_5n = n*5; int roi_batch_ind = GPUStorage2ComputeT(in_rois[roi_5n+0]); int roi_start_h = round(GPUStorage2ComputeT(in_rois[roi_5n+1]) * spatial_scale); int roi_end_h = round(GPUStorage2ComputeT(in_rois[roi_5n+2]) * spatial_scale); int roi_start_w = round(GPUStorage2ComputeT(in_rois[roi_5n+3]) * spatial_scale); int roi_end_w = round(GPUStorage2ComputeT(in_rois[roi_5n+4]) * spatial_scale); // Force malformed ROIs to be 1x1 int roi_width = max(roi_end_w - roi_start_w + 1, 1); int roi_height = max(roi_end_h - roi_start_h + 1, 1); ComputeT bin_size_h = static_cast(roi_height) / static_cast(pooled_height); ComputeT bin_size_w = static_cast(roi_width) / static_cast(pooled_width); int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); // Add roi offsets and clip to input boundaries hstart = min(max(hstart + roi_start_h, 0), height); hend = min(max(hend + roi_start_h, 0), height); wstart = min(max(wstart + roi_start_w, 0), width); wend = min(max(wend + roi_start_w, 0), width); bool is_empty = (hend <= hstart) || (wend <= wstart); // Define an empty pooling region to be zero ComputeT maxval = is_empty ? 
0 : -FLT_MAX; // If nothing is pooled, argmax = -1 causes nothing to be backprop'd size_t maxidx = SIZE_MAX; //in_data += (roi_batch_ind * channels + c) * height * width; size_t in_offset = (roi_batch_ind * channels + c) * height * width; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { size_t in_index = in_offset + h * width + w; ComputeT v = GPUStorage2ComputeT(in_data[in_index]); if (v > maxval) { maxval = v; maxidx = in_index; } } } out_data[index] = GPUCompute2StorageT(maxval); if (argmax_data!=NULL) argmax_data[index] = maxidx; } } __global__ void Kernel_ROIPoolForward_3D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* in_data, const StorageT* in_rois, StorageT* out_data, size_t* argmax_data, const ComputeT spatial_scale, const int channels, const int depth, const int height, const int width, const int pooled_depth, const int pooled_height, const int pooled_width){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ // (n, c, pd, ph, pw) is an element in the pooled output int pw = (index) % pooled_width; int ph = (index / pooled_width) % pooled_height; int pd = (index / pooled_width / pooled_height) % pooled_depth; int c = (index / pooled_width / pooled_height / pooled_depth ) % channels; int n = (index / pooled_width / pooled_height / pooled_depth / channels); int roi_7n = n * 7; int roi_batch_ind = GPUStorage2ComputeT(in_rois[roi_7n+0]); int roi_start_d = round(GPUStorage2ComputeT(in_rois[roi_7n+1]) * spatial_scale); int roi_end_d = round(GPUStorage2ComputeT(in_rois[roi_7n+2]) * spatial_scale); int roi_start_h = round(GPUStorage2ComputeT(in_rois[roi_7n+3]) * spatial_scale); int roi_end_h = round(GPUStorage2ComputeT(in_rois[roi_7n+4]) * spatial_scale); int roi_start_w = round(GPUStorage2ComputeT(in_rois[roi_7n+5]) * spatial_scale); int roi_end_w = round(GPUStorage2ComputeT(in_rois[roi_7n+6]) * spatial_scale); // Force malformed ROIs to be 1x1 int roi_depth = max(roi_end_d - roi_start_d + 1, 1); int roi_width = max(roi_end_w - roi_start_w + 1, 1); int roi_height = max(roi_end_h - roi_start_h + 1, 1); ComputeT bin_size_d = static_cast(roi_depth) / static_cast(pooled_depth); ComputeT bin_size_h = static_cast(roi_height) / static_cast(pooled_height); ComputeT bin_size_w = static_cast(roi_width) / static_cast(pooled_width); int dstart = static_cast(floor(static_cast(pd) * bin_size_d)); int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); int dend = static_cast(ceil(static_cast(pd + 1) * bin_size_d)); int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); // Add roi offsets and clip to input boundaries dstart = min(max(dstart + roi_start_d, 0), depth); dend = min(max(dend + roi_start_d, 0), depth); hstart = min(max(hstart + roi_start_h, 0), height); hend = min(max(hend + roi_start_h, 0), height); wstart = min(max(wstart + roi_start_w, 0), width); wend = min(max(wend + roi_start_w, 0), width); bool is_empty = (dend <= dstart) || (hend <= hstart) || (wend <= wstart); // Define an empty pooling region to be zero ComputeT maxval = is_empty ? 
0 : -FLT_MAX; // If nothing is pooled, argmax = -1 causes nothing to be backprop'd size_t maxidx = SIZE_MAX; size_t in_offset = (roi_batch_ind * channels + c) * depth * height * width; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { size_t in_index = in_offset + d * height * width + h * width + w; ComputeT v = GPUStorage2ComputeT(in_data[in_index]); if (v > maxval) { maxval = v; maxidx = in_index; } } } } out_data[index] = GPUCompute2StorageT(maxval); if (argmax_data!=NULL) argmax_data[index] = maxidx; } } __global__ void Kernel_ROIPoolBackward_2D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* in_diff, const StorageT* in_rois, const StorageT* out_diff, const size_t* argmax_data, const ComputeT spatial_scale, const int num_rois, const int channels, const int height, const int width, const int pooled_height, const int pooled_width) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ // (n, c, h, w) coords in in data int w = index % width; int h = (index / width) % height; int c = (index / width / height) % channels; int n = index / width / height / channels; ComputeT gradient = GPUStorage2ComputeT(in_diff[index]); // Accumulate gradient over all ROIs that pooled this element for (int roi_n = 0; roi_n < num_rois; ++roi_n) { int roi_5n = roi_n*5; int roi_batch_ind = (int)(GPUStorage2ComputeT(in_rois[roi_5n+0])); // Skip if ROI's batch index doesn't match n if (n != roi_batch_ind) { continue; } int roi_start_h = round(GPUStorage2ComputeT(in_rois[roi_5n+1]) * spatial_scale); int roi_end_h = round(GPUStorage2ComputeT(in_rois[roi_5n+2]) * spatial_scale); int roi_start_w = round(GPUStorage2ComputeT(in_rois[roi_5n+3]) * spatial_scale); int roi_end_w = round(GPUStorage2ComputeT(in_rois[roi_5n+4]) * spatial_scale); // Skip if ROI doesn't include (h, w) const bool in_roi = (w >= roi_start_w && w <= roi_end_w && h >= roi_start_h && h <= roi_end_h); if (!in_roi) { continue; } size_t offset = (roi_n * channels + c) * pooled_height * pooled_width; // Compute feasible set of pooled units that could have pooled // this in unit // Force malformed ROIs to be 1x1 int roi_width = max(roi_end_w - roi_start_w + 1, 1); int roi_height = max(roi_end_h - roi_start_h + 1, 1); ComputeT bin_size_h = static_cast(roi_height) / static_cast(pooled_height); ComputeT bin_size_w = static_cast(roi_width) / static_cast(pooled_width); int phstart = floor(static_cast(h - roi_start_h) / bin_size_h); int phend = ceil(static_cast(h - roi_start_h + 1) / bin_size_h); int pwstart = floor(static_cast(w - roi_start_w) / bin_size_w); int pwend = ceil(static_cast(w - roi_start_w + 1) / bin_size_w); phstart = min(max(phstart, 0), pooled_height); phend = min(max(phend, 0), pooled_height); pwstart = min(max(pwstart, 0), pooled_width); pwend = min(max(pwend, 0), pooled_width); for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { size_t out_index = ph * pooled_width + pw; if (argmax_data[offset + out_index] == (h * width + w)) { gradient += GPUStorage2ComputeT(out_diff[offset + out_index]); } } } } in_diff[index] = GPUCompute2StorageT(gradient); } } __global__ void Kernel_ROIPoolBackward_3D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* in_diff, const StorageT* in_rois, const StorageT* out_diff, const size_t* argmax_data, const ComputeT spatial_scale, const int num_rois, const int 
channels, const int depth, const int height, const int width, const int pooled_depth, const int pooled_height, const int pooled_width) { const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ // (n, c, h, w) coords in in data int w = index % width; int h = (index / width) % height; int d = (index / width / height) % depth; int c = (index / width / height / depth) % channels; int n = index / width / height / depth / channels; ComputeT gradient = GPUStorage2ComputeT(in_diff[index]); // Accumulate gradient over all ROIs that pooled this element for (int roi_n = 0; roi_n < num_rois; ++roi_n) { int roi_7n = roi_n*7; int roi_batch_ind = (int)(GPUStorage2ComputeT(in_rois[roi_7n+0])); // Skip if ROI's batch index doesn't match n if (n != roi_batch_ind) { continue; } int roi_start_d = round(GPUStorage2ComputeT(in_rois[roi_7n+1]) * spatial_scale); int roi_end_d = round(GPUStorage2ComputeT(in_rois[roi_7n+2]) * spatial_scale); int roi_start_h = round(GPUStorage2ComputeT(in_rois[roi_7n+3]) * spatial_scale); int roi_end_h = round(GPUStorage2ComputeT(in_rois[roi_7n+4]) * spatial_scale); int roi_start_w = round(GPUStorage2ComputeT(in_rois[roi_7n+5]) * spatial_scale); int roi_end_w = round(GPUStorage2ComputeT(in_rois[roi_7n+6]) * spatial_scale); // Skip if ROI doesn't include (h, w) const bool in_roi = (w >= roi_start_w && w <= roi_end_w && h >= roi_start_h && h <= roi_end_h && d >= roi_start_d && d <= roi_end_d); if (!in_roi) { continue; } size_t offset = (roi_n * channels + c) * pooled_depth * pooled_height * pooled_width; // Compute feasible set of pooled units that could have pooled // this in unit // Force malformed ROIs to be 1x1 int roi_width = max(roi_end_w - roi_start_w + 1, 1); int roi_height = max(roi_end_h - roi_start_h + 1, 1); int roi_depth = max(roi_end_d - roi_start_d + 1, 1); ComputeT bin_size_d = static_cast(roi_depth) / static_cast(pooled_depth); ComputeT bin_size_h = static_cast(roi_height) / static_cast(pooled_height); ComputeT bin_size_w = static_cast(roi_width) / static_cast(pooled_width); int pdstart = floor(static_cast(d - roi_start_d) / bin_size_d); int pdend = ceil(static_cast(d - roi_start_d + 1) / bin_size_d); int phstart = floor(static_cast(h - roi_start_h) / bin_size_h); int phend = ceil(static_cast(h - roi_start_h + 1) / bin_size_h); int pwstart = floor(static_cast(w - roi_start_w) / bin_size_w); int pwend = ceil(static_cast(w - roi_start_w + 1) / bin_size_w); pdstart = min(max(pdstart, 0), pooled_depth); pdend = min(max(pdend, 0), pooled_depth); phstart = min(max(phstart, 0), pooled_height); phend = min(max(phend, 0), pooled_height); pwstart = min(max(pwstart, 0), pooled_width); pwend = min(max(pwend, 0), pooled_width); for (int pd = pdstart; pd < pdend; ++pd) { for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { size_t out_index = (pd * pooled_height + ph) * pooled_width + pw; if (argmax_data[offset + out_index] == ((d * height + h) * width + w)) { gradient += GPUStorage2ComputeT(out_diff[offset+out_index]); } } } } } in_diff[index] = GPUCompute2StorageT(gradient); } } __global__ void Kernel_bsa2b(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* a, StorageT* b){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx 
){ b[idx] = GPUCompute2StorageT(GPUStorage2ComputeT(b[idx]) - GPUStorage2ComputeT(a[idx])); } } void bsa2b(size_t N, const StorageT* a, StorageT* b){ //checkCUDA(__LINE__,cudaGetLastError()); Kernel_bsa2b<<>>(CUDA_GET_LOOPS(N),N,a,b); //checkCUDA(__LINE__,cudaGetLastError()); } __global__ void Kernel_update_SGDL2(size_t CUDA_NUM_LOOPS, size_t N, int nNets, ComputeT decay, ComputeT momentum, ComputeT lr, const StorageT* weights, StorageT* gradients){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ ComputeT w = GPUStorage2ComputeT(weights[idx]); ComputeT h = GPUStorage2ComputeT(gradients[idx]); ComputeT g = decay * w; // L2 regularization for (int k=1; k>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,lr,weights,gradients); checkCUDA(__LINE__,cudaGetLastError()); } __global__ void Kernel_xpy(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* x, StorageT* y){ const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ y[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(y[idx]) + GPUStorage2ComputeT(x[idx])); } } void xpy(size_t N, const StorageT* x, StorageT* y){ Kernel_xpy<<>>(CUDA_GET_LOOPS(N),N,x,y); checkCUDA(__LINE__,cudaGetLastError()); } __global__ void Kernel_Hasum(size_t N, const half *x, int incx, float *result){ const int i = CUDA_NUM_THREADS * blockIdx.x + threadIdx.x; if (i > 0) return; float r = 0; for (int i=0;i>>(n, x, incx, answer); cudaMemcpy(result, answer, sizeof(float), cudaMemcpyDeviceToHost); cudaFree(answer); return CUBLAS_STATUS_SUCCESS; } cublasStatus_t Hgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const half *A, int lda, const half *B, int ldb, const float *beta, half *C, int ldc){ return cublasSgemmEx(handle, transa, transb, m, n, k, alpha, A, CUBLAS_DATA_HALF, lda, B, CUBLAS_DATA_HALF, ldb, beta, C, CUBLAS_DATA_HALF, ldc); } ////////////////////////////////////////////////////////////////////////////////////////////////// // File format ////////////////////////////////////////////////////////////////////////////////////////////////// uint8_t typeID(std::type_index t){ if (t==typeid(half)) return uint8_t(0); if (t==typeid(float)) return uint8_t(1); if (t==typeid(double)) return uint8_t(2); if (t==typeid(uint8_t)) return uint8_t(3); if (t==typeid(uint16_t)) return uint8_t(4); if (t==typeid(uint32_t)) return uint8_t(5); if (t==typeid(uint64_t)) return uint8_t(6); if (t==typeid(int8_t)) return uint8_t(7); if (t==typeid(int16_t)) return uint8_t(8); if (t==typeid(int32_t)) return uint8_t(9); if (t==typeid(int64_t)) return uint8_t(10); if (t==typeid(char)) return uint8_t(11); if (t==typeid(bool)) return uint8_t(12); FatalError(__LINE__); return uint8_t(255); } uint8_t readTypeID(std::string filename){ FILE* fp = fopen(filename.c_str(),"rb"); while (fp==NULL) { std::cerr<<"readTypeID: fail to open file "< class Tensor{ public: std::vector dim; T* CPUmem; std::string name; // compile will check if your time is not correct for writeGPU and readGPU void writeGPU(T* GPUmem){ cudaMemcpy(GPUmem, CPUmem, numel()*sizeof(T), cudaMemcpyHostToDevice); }; void readGPU(T* GPUmem){ cudaMemcpy(CPUmem, GPUmem, numel()*sizeof(T), cudaMemcpyDeviceToHost); }; Tensor(): CPUmem(NULL){}; size_t numel(){ 
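// Taken together, Kernel_update_SGDL2 and bsa2b implement momentum SGD with L2
// weight decay in two steps. Scalar sketch of the update (h is the history stored
// in the gradient buffer, g_k the gradient computed by replica k; the kernel writes
// the new history in place, and Layer::update()/bsa2b later applies it):
/*
    g = decay * w + g_1 + ... + g_nNets;   // L2 term plus gradients summed over nets
    h = momentum * h + lr * g;             // written back into the gradient buffer
    w = w - h;                             // done later by bsa2b(weight_hist, weight_data)
*/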
return marvin::numel(dim); }; size_t numBytes(){ return sizeof(T)*numel(); }; int numofitems(){ return dim[0]; }; size_t sizeofitem(){ return marvin::sizeofitem(dim); }; ~Tensor(){ if (CPUmem!=NULL) delete[] CPUmem; }; void initialize(T val){ for (size_t i=0;i0){ read_cnt = fread((void*)(name.data()), sizeof(char), lenName, fp); if (read_cnt!=lenName) { std::cerr<<"Error at Tensor::readHeader: wrong data type. "<0){ read_cnt = fread((void*)(&dim[0]), sizeof(int), nbDims, fp); if (read_cnt!=nbDims) { std::cerr<<"Error at Tensor::readHeader: wrong data type. "<* floatTensor = new Tensor(fp); this->dim = floatTensor->dim ; this->name = floatTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i]); memcpy(((half*)(CPUmem))+i,&v,sizeof(half)); } delete floatTensor; }else if (myTypeid==typeID(typeid(float)) && fpTypeid==typeID(typeid(half))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* halfTensor = new Tensor(fp); this->dim = halfTensor->dim ; this->name = halfTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i]); memcpy(((float*)(CPUmem))+i,&v,sizeof(float)); } delete halfTensor; }else if (myTypeid==typeID(typeid(double)) && fpTypeid==typeID(typeid(float))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* floatTensor = new Tensor(fp); this->dim = floatTensor->dim ; this->name = floatTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i]); memcpy(((double*)(CPUmem))+i,&v,sizeof(double)); } delete floatTensor; }else if (myTypeid==typeID(typeid(float)) && fpTypeid==typeID(typeid(double))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* doubleTensor = new Tensor(fp); this->dim = doubleTensor->dim ; this->name = doubleTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i]); memcpy(((float*)(CPUmem))+i,&v,sizeof(float)); } delete doubleTensor; }else if (myTypeid==typeID(typeid(half)) && fpTypeid==typeID(typeid(double))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* doubleTensor = new Tensor(fp); this->dim = doubleTensor->dim ; this->name = doubleTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i])); memcpy(((half*)(CPUmem))+i,&v,sizeof(half)); } delete doubleTensor; }else if (myTypeid==typeID(typeid(float)) && fpTypeid==typeID(typeid(half))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* halfTensor = new Tensor(fp); this->dim = halfTensor->dim ; this->name = halfTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i])); memcpy(((double*)(CPUmem))+i,&v,sizeof(double)); } delete halfTensor; }else{ std::cerr<<"Tensor conversion is not supported: from Type "<0){ read_cnt = fread((void*)(name.data()), sizeof(char), lenName, fp); if (read_cnt!=lenName) return NULL; } int nbDims; read_cnt = fread((void*)(&nbDims), sizeof(int), 1, fp); if (read_cnt!=1) return NULL; dim.resize(nbDims); if (nbDims>0){ read_cnt = fread((void*)(&dim[0]), sizeof(int), nbDims, fp); if (read_cnt!=nbDims) return NULL; } size_t n = numel(); Malloc(batch_size); read_cnt = fread((void*)(CPUmem), sizeof(T), n, fp); if (read_cnt!=n){ delete [] CPUmem; CPUmem = NULL; return NULL; } } return CPUmem; }; void Malloc(int batch_size){ size_t n = numel(); //std::cout<<" "; memorySizePrint(n*sizeof(T)); std::cout< dim2write){ uint8_t myTypeid = typeID(typeid(T)); fwrite((void*)(&myTypeid), sizeof(uint8_t), 1, fp); uint32_t typesizeof = uint32_t(sizeof(T)); fwrite((void*)(&typesizeof), sizeof(uint32_t), 1, fp); int lenName = name.size(); fwrite((void*)(&lenName), sizeof(int), 1, fp); if (lenName>0) 
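// On-disk record layout written here (and read back by readHeader()/read()):
//   uint8_t  type id               (see typeID(): 0=half, 1=float, 2=double, 3=uint8_t, ...)
//   uint32_t sizeof(T)
//   int      length of the name
//   char[]   name bytes            (not zero-terminated)
//   int      number of dimensions
//   int[]    the dimensions
// followed by numel() raw values of type T from writeData(). Records are simply
// concatenated back to back in one file, and read() converts between half, float
// and double on the fly when the stored type differs from T.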
fwrite((void*)(name.data()), sizeof(char), lenName, fp); int nbDims = dim2write.size(); fwrite((void*)(&nbDims), sizeof(int), 1, fp); if (nbDims>0) fwrite((void*)(&dim2write[0]), sizeof(int), nbDims, fp); if (ferror (fp)){ std::cerr << "disk writing failed"<0){ fwrite((void*)(CPUmem), sizeof(T), n, fp); if (ferror (fp)){ std::cerr << "disk writing failed" << std::endl; FatalError(); } } }; //support continuous write across many NdTensors //write with header void write(FILE* fp){ writeHeader(fp,dim); writeData(fp); }; void write(std::string filename){ FILE* fp = fopen(filename.c_str(),"wb"); while (fp==NULL) { std::cerr<<"Tensor::write: fail to open file "< dim_): dim(dim_){ CPUmem = new T [numel()]; }; Tensor(std::vector dim_, T initValue): dim(dim_){ int n = numel(); CPUmem = new T [n]; if (initValue == T(0)) memset(CPUmem, 0, n*sizeof(T)); else for (int i=0;i dim_): name(name_),dim(dim_){ CPUmem = new T [numel()]; }; void permute(std::vector v){ size_t nbItems = numofitems(); size_t sizeofitem_ = sizeofitem(); size_t nbBytes = sizeofitem_ * sizeof(T); T* CPUmemNew = new T[numel()]; memcpy(CPUmemNew, CPUmem, nbItems * nbBytes); for (size_t i=0;i display_dim){ std::cout<<" name:"<0.0f)<<" "; std::cout< std::vector*> readTensors(std::string filename, size_t max_count = SIZE_MAX){ FILE* fp = fopen(filename.c_str(),"rb"); while (fp==NULL) { std::cerr<<"readTensors: fail to open file "<*> tensors; size_t count = 0; while (feof(fp)==0) { tensors.push_back(new Tensor(fp)); count++; if (count>=max_count) break; } fclose(fp); if (tensors[tensors.size()-1]->CPUmem == NULL) tensors.resize(tensors.size()-1); return tensors; } template void writeTensors(std::string filename, std::vector*> tensors){ FILE* fp = fopen(filename.c_str(),"wb"); while (fp==NULL) { std::cerr<<"writeTensors: fail to open file "<write(fp); } fclose(fp); } ////////////////////////////////////////////////////////////////////////////////////////////////// // Response and Layer ////////////////////////////////////////////////////////////////////////////////////////////////// class Response{ public: std::string name; cudnnTensorDescriptor_t desc; cublasHandle_t cublasHandle; std::vector desc_group; std::vector number_group; StorageT* dataGPU; StorageT* diffGPU; bool need_diff; std::vector dim; std::vector stride; std::vector receptive_field; std::vector receptive_gap; std::vector receptive_offset; size_t sizeofitem(){ return marvin::sizeofitem(dim); }; size_t numBytes(){ return sizeofStorageT*(marvin::numel(dim)); }; Response(std::string name_): name(name_), dataGPU(NULL), diffGPU(NULL), need_diff(false){ checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&desc)); }; size_t Malloc(std::vector dim_){ size_t memoryBytes = 0; if (dataGPU==NULL){ // two layers (one for training, one for testing) may output to the same response and Malloc twice, ignore the second time dim = dim_; stride.resize(dim.size()); //std::cout<<"dim"; veciPrint(dim); //std::cout<<"strideA"; veciPrint(strideA); //FatalError(); /* if (receptive_field.empty()){ receptive_field.resize(dim.size()-2); fill_n(receptive_field.begin(),dim.size()-2,1); } if (receptive_gap.empty()){ receptive_gap.resize(dim.size()-2); fill_n(receptive_gap.begin(),dim.size()-2,1); } */ stride[dim.size()-1] = 1; for (int d=dim.size()-2;d>=0;--d){ stride[d] = stride[d+1] * dim[d+1]; } //std::cout<<"dim"; veciPrint(dim); //std::cout<<"strideA"; veciPrint(strideA); checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(desc, CUDNNStorageT, dim.size(), &dim[0], &stride[0]) ); //std::cout<<"debug LINE 
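// The strides computed above are the usual fully packed row-major strides:
// stride[last] = 1 and stride[d] = stride[d+1] * dim[d+1]. For example
// dim = {64, 3, 32, 32} gives stride = {3072, 1024, 32, 1}, and this (dim, stride)
// pair is exactly what cudnnSetTensorNdDescriptor receives for the response.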
"<<__LINE__< dim_new = dim; dim_new[1] = dim[1]/group; checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(desc_new, CUDNNStorageT, dim_new.size(), &dim_new[0], &stride[0]) ); desc_group.push_back(desc_new); return desc_new; } ~Response(){ checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(desc)); for (int i=0; i display_dim, bool printData=true){ if (!printData && diffGPU==NULL) return; Tensor* feature = new Tensor(dim); feature->readGPU((printData? dataGPU: diffGPU)); feature->print(display_dim); delete feature; }; int checkNaN(){ return marvin::checkNaN(dataGPU, numel(dim)); }; int checkNaNdiff(){ return marvin::checkNaN(diffGPU, numel(dim)); }; ComputeT ameanData(){ if (dataGPU!=NULL){ std::cout<<"Response ["< in; std::vector out; std::mt19937 rng; cudnnHandle_t cudnnHandle; cublasHandle_t cublasHandle; // parameters: int GPU; std::string name; Phase phase; bool train_me; // user specify whehter they want to tune this layer ComputeT weight_lr_mult; Filler weight_filler; ComputeT weight_filler_param; std::vector weight_dim; size_t weight_numel; ComputeT weight_decay_mult; ComputeT bias_lr_mult; Filler bias_filler; ComputeT bias_filler_param; std::vector bias_dim; size_t bias_numel; ComputeT bias_decay_mult; Layer(): phase(TrainingTesting), train_me(false), weight_dataGPU(NULL), weight_diffGPU(NULL), weight_histGPU(NULL), bias_dataGPU(NULL), bias_diffGPU(NULL), bias_histGPU(NULL), weight_numel(0), bias_numel(0), weight_decay_mult(ComputeT(1)),bias_decay_mult(ComputeT(1)){ std::random_device rd; rng.seed(rd()); }; Layer(std::string name_): name(name_), phase(TrainingTesting), train_me(false), weight_dataGPU(NULL), weight_diffGPU(NULL), weight_histGPU(NULL), bias_dataGPU(NULL), bias_diffGPU(NULL), bias_histGPU(NULL), weight_numel(0), bias_numel(0), weight_decay_mult(ComputeT(1)),bias_decay_mult(ComputeT(1)){ std::random_device rd; rng.seed(rd()); }; virtual ~Layer(){ if (weight_dataGPU!=NULL) checkCUDA(__LINE__, cudaFree(weight_dataGPU)); if (bias_dataGPU!=NULL) checkCUDA(__LINE__, cudaFree(bias_dataGPU)); }; ComputeT ameanWeightData(){ if (weight_dataGPU==NULL) return -1; ComputeT result; size_t n = numel(weight_dim); checkCUBLAS(__LINE__, GPUasum(cublasHandle, n, weight_dataGPU, 1, &result)); result /= ComputeT(n); return result; }; ComputeT ameanWeightDiff(){ if (weight_diffGPU==NULL) return -1; ComputeT result; size_t n = numel(weight_dim); checkCUBLAS(__LINE__, GPUasum(cublasHandle, n, weight_diffGPU, 1, &result)); result /= ComputeT(n); return result; }; ComputeT ameanBiasData(){ if (bias_dataGPU==NULL) return -1; ComputeT result; size_t n = numel(bias_dim); checkCUBLAS(__LINE__, GPUasum(cublasHandle, n, bias_dataGPU, 1, &result)); result /= ComputeT(n); return result; }; ComputeT ameanBiasDiff(){ if (bias_diffGPU==NULL) return -1; ComputeT result; size_t n = numel(bias_dim); checkCUBLAS(__LINE__, GPUasum(cublasHandle, n, bias_diffGPU, 1, &result)); result /= ComputeT(n); return result; }; void addIn (Response* r){ in.push_back(r); }; void addOut(Response* r){ out.push_back(r); }; virtual size_t Malloc(Phase phase_){ // by default, do nothing std::cout<< (train_me? 
"* " : " "); std::cout< dim, Filler filler, ComputeT param=0){ int n = numel(dim); StorageT* CPUbuf = new StorageT[n]; switch(filler){ case Xavier: { int fan_in = ComputeT(n/dim[0]); ComputeT scale = sqrt(ComputeT(3) / fan_in); //default_random_engine generator; std::uniform_real_distribution distribution(-scale,scale); for (StorageT* p=CPUbuf;p != CPUbuf+n;++p){ *p = CPUCompute2StorageT(distribution(rng)); } } break; case Gaussian: { std::normal_distribution distribution(0,param); for (StorageT* p=CPUbuf;p != CPUbuf+n;++p){ *p = CPUCompute2StorageT(distribution(rng)); } } break; case Constant: { StorageT paramStorageT = CPUCompute2StorageT(param); for (StorageT* p=CPUbuf;p != CPUbuf+n;++p){ *p = paramStorageT; } } break; } checkCUDA(__LINE__, cudaMemcpy(GPUmem, CPUbuf, n*sizeofStorageT, cudaMemcpyHostToDevice) ); /* { for (int i=0;i "<0){ std::cout<<"fillGPU has NaN. Maybe compiler optimization level is too high."<dim)){ if (!same_dim(weight_dim,weights[i]->dim)){ std::cout<<"[Warning] "<< name << ".weight is loaded with mismatched dimensions "; std::cout<<"need"; veciPrint(weight_dim); std::cout<<" vs. file"; veciPrint(weights[i]->dim); std::cout<dim); std::cout << " is set."<writeGPU(weight_dataGPU); }else{ std::cout<<"[Warning] "<< name << ".weight is found but not loaded because the numels are mismatched: "; std::cout<<"need"; veciPrint(weight_dim); std::cout<<" vs. file"; veciPrint(weights[i]->dim); std::cout<name == name + ".bias"){ if (numel(bias_dim)==numel(weights[i]->dim)){ if (!same_dim(bias_dim,weights[i]->dim)){ std::cout<<"[Warning] "<< name << ".bias is loaded with mismatched dimensions "; std::cout<<"need"; veciPrint(bias_dim); std::cout<<" vs. file"; veciPrint(weights[i]->dim); std::cout<dim); std::cout << " is set."<writeGPU(bias_dataGPU); }else{ std::cout<<"[Warning] "<< name << ".bias is found but not loaded because the numels are mismatched: "; std::cout<<"need"; veciPrint(bias_dim); std::cout<<" vs. 
file"; veciPrint(weights[i]->dim); std::cout<* t = new Tensor(name+ ".weight",weight_dim); t->readGPU(weight_dataGPU); t->write(fp); delete t; } if (bias_dataGPU!=NULL){ Tensor* t = new Tensor(name+ ".bias",bias_dim); t->readGPU(bias_dataGPU); t->write(fp); delete t; } }; void printWeights(std::vector display_weight, std::vector display_bias){ if (weight_dataGPU!=NULL){ Tensor* t = new Tensor(name+ ".weight",weight_dim); t->readGPU(weight_dataGPU); t->print(display_weight); delete t; } if (bias_dataGPU!=NULL){ Tensor* t = new Tensor(name+ ".bias",bias_dim); t->readGPU(bias_dataGPU); t->print(display_bias); delete t; } }; void setDiffs(std::vector*> weights){ for (int i=0;iname == name + ".weight_diff"){ std::cout<< " " << name << ".weight_diff"; veciPrint(weights[i]->dim); std::cout << " is set."<writeGPU(weight_diffGPU); } if (bias_diffGPU!=NULL && weights[i]->name == name + ".bias_diff"){ std::cout<< " " << name << ".bias_diff"; veciPrint(weights[i]->dim); std::cout << " is set."<writeGPU(bias_diffGPU); } } }; void saveDiffs(FILE* fp){ if (weight_diffGPU!=NULL){ Tensor* t = new Tensor(name+ ".weight_diff",weight_dim); t->readGPU(weight_diffGPU); t->write(fp); delete t; } if (bias_diffGPU!=NULL){ Tensor* t = new Tensor(name+ ".bias_diff",bias_dim); t->readGPU(bias_diffGPU); t->write(fp); delete t; } }; void printDiffs(std::vector display_weight, std::vector display_bias){ if (weight_diffGPU!=NULL){ Tensor* t = new Tensor(name+ ".weight_diff",weight_dim); t->readGPU(weight_diffGPU); t->print(display_weight); delete t; } if (bias_diffGPU!=NULL){ Tensor* t = new Tensor(name+ ".bias_diff",bias_dim); t->readGPU(bias_diffGPU); t->print(display_bias); delete t; } }; void update(){ if (train_me){ if (weight_numel>0 && weight_histGPU !=NULL) bsa2b(weight_numel, weight_histGPU, weight_dataGPU); if (bias_numel>0 && bias_histGPU != NULL) bsa2b( bias_numel, bias_histGPU, bias_dataGPU); } }; }; ////////////////////////////////////////////////////////////////////////////////////////////////// // Layers ////////////////////////////////////////////////////////////////////////////////////////////////// class DataLayer : public Layer { public: // parameters: int counter; int epoch; bool isDataLayer(){ return true; }; DataLayer(): counter(0), epoch(0){}; DataLayer(std::string name_): Layer(name_), counter(0), epoch(0){}; virtual int numofitems() = 0; virtual void shuffle() = 0; }; class TensorLayer: public DataLayer { StorageT* tensorGPU; public: std::vector files; std::vector > dim; TensorLayer(std::string name_): DataLayer(name_), tensorGPU(NULL){ train_me = false; }; TensorLayer(JSON* json): tensorGPU(NULL){ SetOrDie(json, name) SetValue(json, phase, TrainingTesting) SetOrDie(json, files ) train_me = false; }; ~TensorLayer(){ if (tensorGPU!=NULL) checkCUDA(__LINE__, cudaFree(tensorGPU)); }; int numofitems(){ return dim[0][0]; }; void shuffle(){ }; void forward(Phase phase_){ ++epoch; }; size_t Malloc(Phase phase_){ std::cout<< (train_me? 
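// Model snapshots are just concatenations of the Tensor records described earlier:
// saveWeights() above emits one tensor named "<layer>.weight" and one named
// "<layer>.bias" per layer, and setWeights() matches them back by name, warning on
// dimension mismatches. Minimal sketch (file name illustrative):
/*
    std::vector<Tensor<StorageT>*> snapshot = readTensors<StorageT>("model.marvin");
    // snapshot[k]->name is e.g. "conv1.weight", "conv1.bias", "fc8.weight", ...
    // each layer's setWeights(snapshot) copies the matching entries onto its
    // weight_dataGPU / bias_dataGPU buffers.
*/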
"* " : " "); std::cout<* tensorCPU = new Tensor(files[i]); dim[i] = tensorCPU->dim; out[i]->need_diff = false; std::cout<<"tensorCPU->dim="; veciPrint(tensorCPU->dim); std::cout<Malloc(tensorCPU->dim); checkCUDA(__LINE__, cudaMemcpy(out[i]->dataGPU, tensorCPU->CPUmem, tensorCPU-> numBytes(), cudaMemcpyHostToDevice) ); delete tensorCPU; } return memoryBytes; }; }; class MemoryDataLayer : public DataLayer { std::vector*> dataCPU; public: std::vector file_data; std::vector file_mean; std::vector scale; std::vector mean; int batch_size; int numofitems(){ return dataCPU[0]->dim[0]; }; void init(){ train_me = false; std::cout<<"MemoryDataLayer "< (file_data[i],batch_size); dataCPU[i]->print(veci(0)); } if (file_mean.size()>0){ for (int i =0;i* meanCPU = new Tensor(file_mean[i],batch_size); meanCPU->print(veci(0)); if (meanCPU->numel() != dataCPU[i]->sizeofitem()){ std::cerr<<"mean tensor file size error: "<dim); std::cerr<dim); std::cerr<CPUmem; StorageT* dE = dataCPU[i]->CPUmem + dataCPU[i]->numel(); StorageT* m = meanCPU->CPUmem; StorageT* mE = meanCPU->CPUmem + meanCPU->numel(); while(d!=dE){ *d = CPUCompute2StorageT( CPUStorage2ComputeT(*d) - CPUStorage2ComputeT(*m) ); ++m; if (m==mE) m = meanCPU->CPUmem; ++d; } delete meanCPU; } } for (int i =0;iCPUmem + dataCPU[i]->numel(); for(StorageT* d = dataCPU[i]->CPUmem; d!=dE; ++d){ *d = CPUCompute2StorageT( CPUStorage2ComputeT(*d) * scale[i] ); } } } for (int i =0;iCPUmem + dataCPU[i]->numel(); for(StorageT* d = dataCPU[i]->CPUmem; d!=dE; ++d){ *d = CPUCompute2StorageT( CPUStorage2ComputeT(*d) - mean[i] ); } } } if (phase!=Testing) shuffle(); } MemoryDataLayer(std::string name_, Phase phase_, std::vector file_data_, int batch_size_): DataLayer(name_), batch_size(batch_size_), file_data(file_data_){ phase = phase_; init(); }; MemoryDataLayer(JSON* json){ SetOrDie(json, name) SetValue(json, phase, Training) SetOrDie(json, file_data ) SetValue(json, file_mean, std::vector(0)) SetValue(json, batch_size, 64) SetValue(json, scale, std::vector(0)) SetValue(json, mean, std::vector(0)) init(); }; ~MemoryDataLayer(){ for (int i =0; ineed_diff = false; std::vector data_dim = dataCPU[i]->dim; data_dim[0] = batch_size; out[i]->receptive_field.resize(data_dim.size()-2); fill_n(out[i]->receptive_field.begin(), data_dim.size()-2,1); out[i]->receptive_gap.resize(data_dim.size()-2); fill_n(out[i]->receptive_gap.begin(), data_dim.size()-2,1); out[i]->receptive_offset.resize(data_dim.size()-2); fill_n(out[i]->receptive_offset.begin(),data_dim.size()-2,0); memoryBytes += out[i]->Malloc(data_dim); } return memoryBytes; } void shuffle(){ std::vector v = randperm(dataCPU[0]->numofitems(), rng); for(int i =0; i permute(v); } }; void forward(Phase phase_){ if (counter + batch_size >= dataCPU[0]->numofitems() ){ ++epoch; if(phase!=Testing){ shuffle(); counter = 0; } } for(int i =0; i dataGPU, dataCPU[i]->CPUmem + (size_t(counter) * size_t( dataCPU[i]->sizeofitem())), batch_size * dataCPU[i]->sizeofitem() * sizeofStorageT, cudaMemcpyHostToDevice) ); } counter+=batch_size; if (counter >= dataCPU[0]->numofitems()) counter = 0; }; }; template class DiskDataLayer : public DataLayer { std::future lock; std::vector ordering; std::bernoulli_distribution* distribution_bernoulli; std::vector*> distribution_uniform; std::vector dataFILE; std::vector dataCPU; std::vector dataGPU; std::vector item_raw; Tensor* labelCPUall; Tensor* labelCPU; std::vector mean_data_GPU; StorageT* labelGPU; size_t numel_per_channel_crop ; size_t numel_all_channel_crop ; size_t numel_per_channel_orgi ; 
size_t numel_batch_all_channel_crop ; int epoch_prefetch; size_t bytes_per_item; size_t headerBytes; std::vector size_data; public: bool mirror; std::vector size_crop; std::vector file_data; std::vector file_mean; std::string file_label; int batch_size; int numofitems(){ return labelCPUall->numofitems(); }; void init(){ epoch_prefetch = 0; distribution_bernoulli = new std::bernoulli_distribution(0.5); //dataFILE = NULL; //dataCPU = NULL; //dataGPU = NULL; labelCPU = NULL; labelGPU = NULL; labelCPUall = NULL; train_me = false; std::cout<<"DiskDataLayer "<* meanCPU = new Tensor(file_mean[i],batch_size); meanCPU->print(veci(0)); checkCUDA(__LINE__, cudaMalloc(&mean_data_GPU[i], meanCPU->numBytes()) ); meanCPU->writeGPU(mean_data_GPU[i]); delete meanCPU; } Tensor tensor; headerBytes = tensor.readHeader(dataFILE[0]); size_data.insert( size_data.end(), tensor.dim.begin()+1, tensor.dim.end() ); numel_per_channel_crop = numel(size_crop); numel_all_channel_crop = size_data[0] * numel_per_channel_crop; numel_per_channel_orgi = sizeofitem(size_data); numel_batch_all_channel_crop = batch_size*numel_all_channel_crop; bytes_per_item = sizeof(T)* numel(size_data); std::vector data_dim; data_dim.push_back(batch_size); data_dim.push_back(size_data[0]); data_dim.insert( data_dim.end(), size_crop.begin(), size_crop.end() ); for (int i = 0;i(file_label); labelCPUall -> print(veci(0)); std::cout<<" "; labelCPUall->printRange(); while (labelCPUall->dim.size()dim.push_back(1); std::vector label_dim = labelCPUall->dim; label_dim[0] = batch_size; labelCPU = new Tensor(label_dim); distribution_uniform.resize(size_crop.size()); for (int d=0; d(0,size_data[d+1] - size_crop[d]); } if (phase!=Testing){ shuffle(); }else{ ordering.resize(numofitems()); for (int i=0;i size_data_, std::vector size_crop_, std::vector file_data_, std::string file_label_, int batch_size_): DataLayer(name_), mirror(mirror_), size_data(size_data_), size_crop(size_crop_), file_data(file_data_), file_label(file_label_), batch_size(batch_size_){ phase = phase_; init(); }; DiskDataLayer(JSON* json){ SetOrDie(json, name) SetValue(json, phase, Training) SetValue(json, mirror, false) SetOrDie(json, file_data ) SetValue(json, file_mean, std::vector(0)) SetValue(json, file_label,"") SetOrDie(json, batch_size ) SetOrDie(json, size_crop ) init(); }; ~DiskDataLayer(){ if (lock.valid()) lock.wait(); delete distribution_bernoulli; for (int i=0;inumofitems(), rng); } }; void prefetch(){ checkCUDA(__LINE__,cudaSetDevice(GPU)); std::vector begin_coor(size_crop.size()); for (size_t i=0;isizeofitem(); memcpy(labelCPU->CPUmem+i*labelSizeOfItem, labelCPUall->CPUmem+image_i*labelSizeOfItem, labelSizeOfItem*sizeofStorageT); // mirror bool mirror_this = false; if (mirror) mirror_this = ((*distribution_bernoulli)(rng)); if (numel_per_channel_orgi != numel_per_channel_crop || mirror_this){ for (int d=0;d= ordering.size()){ if (phase!=Testing) shuffle(); counter = 0; ++epoch_prefetch; } }//end for (size_t i=0;iwriteGPU(labelGPU); }; void forward(Phase phase_){ lock.wait(); epoch = epoch_prefetch; for (int data_i = 0; data_i>>(CUDA_GET_LOOPS(numel_batch_all_channel_crop), numel_batch_all_channel_crop, numel_all_channel_crop, dataGPU[data_i], mean_data, out[data_i]->dataGPU); } std::swap(out[file_data.size()]->dataGPU,labelGPU); lock = std::async(std::launch::async,&DiskDataLayer::prefetch,this); }; size_t Malloc(Phase phase_){ if (phase == Training && phase_==Testing) return 0; if (out.size()!=file_data.size()+1){ std::cout<<"DiskDataLayer: # of out's should match the # 
of in-1"< data_dim; data_dim.push_back(batch_size); data_dim.push_back(size_data[0]); data_dim.insert( data_dim.end(), size_crop.begin(), size_crop.end() ); for (int data_i = 0; data_ineed_diff = false; out[data_i]->receptive_field.resize(data_dim.size()-2); fill_n(out[data_i]->receptive_field.begin(),data_dim.size()-2,1); out[data_i]->receptive_gap.resize(data_dim.size()-2); fill_n(out[data_i]->receptive_gap.begin(),data_dim.size()-2,1); out[data_i]->receptive_offset.resize(data_dim.size()-2);fill_n(out[data_i]->receptive_offset.begin(),data_dim.size()-2,0); memoryBytes += out[data_i]->Malloc(data_dim); } out[file_data.size()]->need_diff = false; memoryBytes += out[file_data.size()]->Malloc(labelCPU->dim); checkCUDA(__LINE__, cudaMalloc(&labelGPU, labelCPU->numBytes()) ); memoryBytes += labelCPU->numBytes(); for (int data_i = 0; data_i::prefetch,this); return memoryBytes; }; }; class ConvolutionLayer : public Layer { cudnnFilterDescriptor_t filter_desc; cudnnTensorDescriptor_t bias_desc; cudnnConvolutionDescriptor_t conv_desc; //std::vector algos; public: int num_output; std::vector window; std::vector stride; std::vector padding; std::vector upscale; int group; void init(){ weight_dim.push_back(num_output); weight_dim.push_back(0); // need the channel size from the input weight_dim.insert( weight_dim.end(), window.begin(), window.end() ); bias_dim.resize(weight_dim.size(), 1); bias_dim[1] = num_output; }; ConvolutionLayer(JSON* json){ SetOrDie(json, name) SetValue(json, phase, TrainingTesting) SetValue(json, train_me, true) SetOrDie(json, num_output ) SetOrDie(json, window ) SetValue(json, weight_lr_mult, 1.0) SetValue(json, weight_filler, Xavier) SetValue(json, weight_filler_param, 0.0) SetValue(json, bias_lr_mult, 2.0) SetValue(json, bias_filler, Constant) SetValue(json, bias_filler_param, 0.0) SetValue(json, weight_decay_mult, 1.0) SetValue(json, bias_decay_mult, 1.0) SetValue(json, group, 1) std::vector ones = std::vector(window.size(),1); std::vector zeros = std::vector(window.size(),0); SetValue(json, padding, zeros) SetValue(json, stride, ones) SetValue(json, upscale, ones) init(); }; ConvolutionLayer(std::string name_, int num_output_, std::vector window_, std::vector padding_, std::vector stride_, std::vector upscale_, ComputeT weight_lr_mult_, Filler weight_filler_, ComputeT weight_filler_param_, ComputeT bias_lr_mult_, Filler bias_filler_, ComputeT bias_filler_param_): Layer(name_), num_output(num_output_), window(window_), stride(stride_), padding(padding_), upscale(upscale_){ weight_lr_mult = weight_lr_mult_; weight_filler = weight_filler_; weight_filler_param = weight_filler_param_; bias_lr_mult = bias_lr_mult_; bias_filler = bias_filler_; bias_filler_param = bias_filler_param_; init(); }; size_t Malloc(Phase phase_){ size_t memoryBytes = 0; train_me = train_me && phase_ != Testing; std::cout<< (train_me? 
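// Sketch of the JSON fragment this constructor consumes; only "name", "num_output"
// and "window" are mandatory (SetOrDie), everything else falls back to the defaults
// above. The "type"/"in"/"out" keys are handled by the network-level parser
// elsewhere in this file and are shown only for context; all values are illustrative.
/*
    {
        "type": "Convolution", "name": "conv1", "in": ["data"], "out": ["conv1"],
        "num_output": 96, "window": [11,11], "stride": [4,4], "padding": [0,0],
        "group": 1, "weight_filler": "Xavier", "bias_filler": "Constant"
    }
*/
// With these values a 227x227 input map gives 1 + (227 + 2*0 - 11)/4 = 55, i.e.
// 96 output maps of 55x55 -- the same out = 1 + (in + 2*pad - window)/stride rule
// that Malloc() below obtains from cudnnGetConvolutionNdForwardOutputDim.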
"* " : " "); std::cout<1) std::cout<<" ("<dim[1]/group; // create descriptor checkCUDNN(__LINE__,cudnnCreateFilterDescriptor(&filter_desc) ); checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&bias_desc) ); checkCUDNN(__LINE__,cudnnCreateConvolutionDescriptor(&conv_desc) ); // set descriptor // set the parameters for convolution //#ifdef ND_CONV std::vector weight_dim_group = weight_dim; weight_dim_group[0] = weight_dim[0]/group; checkCUDNN(__LINE__,cudnnSetFilterNdDescriptor(filter_desc, CUDNNStorageT, weight_dim.size(), &weight_dim_group[0]) ); checkCUDNN(__LINE__,cudnnSetConvolutionNdDescriptor(conv_desc, padding.size(), &padding[0], &stride[0], &upscale[0], CUDNN_CROSS_CORRELATION) ); /* #else checkCUDNN(__LINE__,cudnnSetFilter4dDescriptor(filter_desc, CUDNNStorageT, weight_dim[0], weight_dim[1], weight_dim[2], weight_dim[3]) ); checkCUDNN(__LINE__,cudnnSetConvolution2dDescriptor(conv_desc, padding[0], padding[1], stride[0], stride[1], upscale[0], upscale[1], CUDNN_CROSS_CORRELATION) ); #endif */ std::vector bias_stride(bias_dim.size()); bias_stride[bias_dim.size()-1] = 1; for (int d=bias_dim.size()-2;d>=0;--d){ bias_stride[d] = bias_stride[d+1] * bias_dim[d+1]; } //std::cout<<"bias_dim"; veciPrint(bias_dim); //std::cout<<"bias_stride"; veciPrint(bias_stride); FatalError(); checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(bias_desc, CUDNNStorageT, bias_dim.size(), &bias_dim[0], &bias_stride[0]) ); weight_numel = numel(weight_dim); bias_numel = numel(bias_dim); if (weight_numel>0){ std::cout<<" weight"; veciPrint(weight_dim); checkCUDA(__LINE__, cudaMalloc( &weight_dataGPU, weight_numel * sizeofStorageT) ); memoryBytes += weight_numel * sizeofStorageT; //if (train_me){ // checkCUDA(__LINE__, cudaMalloc(&weight_diffGPU[0], weight_numel * sizeofStorageT) ); // checkCUDA(__LINE__, cudaMalloc(&weight_diffGPU[1], weight_numel * sizeofStorageT) ); // memoryBytes += 2 * weight_numel * sizeofStorageT; //} } if (bias_numel>0){ std::cout<<" bias"; veciPrint(bias_dim); checkCUDA(__LINE__, cudaMalloc( &bias_dataGPU, bias_numel * sizeofStorageT) ); memoryBytes += bias_numel * sizeofStorageT; //if (train_me){ // checkCUDA(__LINE__, cudaMalloc(&bias_diffGPU[0], bias_numel * sizeofStorageT) ); // checkCUDA(__LINE__, cudaMalloc(&bias_diffGPU[1], bias_numel * sizeofStorageT) ); // memoryBytes += 2 * bias_numel * sizeofStorageT; //} } std::cout<need_diff = train_me || in[i]->need_diff; // if one of them need the grad //std::cout<<"debug LINE "<<__LINE__<dim"; veciPrint(in[i]->dim); //std::cout<<"weight_dim"; veciPrint(weight_dim); std::vector dimOut; dimOut.resize(in[i]->dim.size()); /* dimOut[0] = in[i]->dim[0]; dimOut[1] = weight_dim[0]; for (int d=2;ddim[d] + 2*padding[d-2] - weight_dim[d])/stride[d-2]; } */ // TODO: Fix this. 
I failed to make it work /* std::cout<<"input"; veciPrint(in[i]->dim); std::cout<<"conv_desc:"<1){ /* dimOut[0] = in[i]->dim[0]; dimOut[1] = num_output; for (int d=2;ddim[d] + 2*padding[d-2] - weight_dim[d])/stride[d-2]; } */ checkCUDNN(__LINE__,cudnnGetConvolutionNdForwardOutputDim(conv_desc, in[i]->getDesc(group), filter_desc, dimOut.size(), &dimOut[0] )); dimOut[1] *= group; /* }else{ checkCUDNN(__LINE__,cudnnGetConvolutionNdForwardOutputDim(conv_desc, in[i]->desc, filter_desc, in[i]->dim.size(), &dimOut[0] )); } */ /* #else checkCUDNN(__LINE__,cudnnGetConvolution2dForwardOutputDim(conv_desc, in[i]->desc, filter_desc, &dimOut[0], &dimOut[1], &dimOut[2], &dimOut[3] )); #endif */ //std::cout<<"debug LINE "<<__LINE__<receptive_field.size(); out[i]->receptive_field .resize(dall); out[i]->receptive_gap .resize(dall); out[i]->receptive_offset.resize(dall); for(size_t d=0;dreceptive_field[d] = in[i]->receptive_field[d] + ComputeT(window[d]-1) * in[i]->receptive_gap[d]; out[i]->receptive_gap[d] = stride[d] * in[i]->receptive_gap[d]; out[i]->receptive_offset[d] = in[i]->receptive_offset[d] - ComputeT(padding[d]) * in[i]->receptive_gap[d]; } memoryBytes += out[i]->Malloc(dimOut); /* size_t workspace_limit_bytes = this->kernel_h_ * this->kernel_w_ * this->channels_ * sizeof(int) + 1; // pick the convolution algorithm CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, in[i]->desc, filter_desc, conv_desc, out[i]->desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_limit_bytes, // memoryLimitInBytes, &algo[i])); // get minimum size of the workspace needed for the desired algorithm size_t workspaceSizeInBytes_temp = 0; CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, in[i]->desc, filter_desc, conv_desc, out[i]->desc, algo[i], &workspaceSizeInBytes_temp)); if (workspaceSizeInBytes_temp > workspaceSizeInBytes) { workspaceSizeInBytes = workspaceSizeInBytes_temp; // free the existing workspace and allocate a new (larger) one cudaFree(this->workspace); cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes); if (err != cudaSuccess) { // force zero memory path algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; workspace = NULL; workspaceSizeInBytes = 0; } } */ } return memoryBytes; }; void forward(Phase phase_){ for (int i=0;idim); std::cout<<"filter"; veciPrint(weight_dim); std::cout<<"output"; veciPrint(out[i]->dim); std::cout<<"stride"; veciPrint(stride); std::cout<<"padding"; veciPrint(padding); std::cout<<"upscale"; veciPrint(upscale); */ //std::cout<<"pooling in[i]: "; //in[i]->checkNaN(); //in[i]->print(veci(2,28,28)); //fillGPU(weight_dataGPU, weight_dim, Constant, 1); //fillGPU(out[i]->dataGPU, out[i]->dim, Increase); //std::cout<<"pooling out[i]: "; //out[i]->checkNaN(); //out[i]->print(veci(2,24,24)); checkCUDNN(__LINE__,cudnnConvolutionForward(cudnnHandle, one, in[i]->getDesc(group), in[i]->dataGPU + (g * in[i]->sizeofitem() / group), filter_desc, weight_dataGPU + (g * weight_numel / group), conv_desc, //CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, // CUDNN For 3-d convolutions, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is supported; support is provided for any format for srcDesc and destDesc as well as support for all data type configurations. 
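// Receptive-field bookkeeping above, worked through for two stacked 3x3, stride-1
// convolutions applied to raw pixels (field = 1, gap = 1, offset = 0 at the input):
// after the first layer field = 1 + (3-1)*1 = 3, after the second
// field = 3 + (3-1)*1 = 5, so each output unit sees a 5x5 input window. A stride-2
// layer doubles `gap`, making every later layer's field grow twice as fast, while
// padding only shifts `receptive_offset`, not the field size.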
NULL, 0, zero, out[i]->getDesc(group), out[i]->dataGPU + (g * out[i]->sizeofitem() / group) ) ); //std::cout<<"pooling out[i]: "; //out[i]->checkNaN(); //out[i]->print(veci(2,24,24)); //out[i]->print(); //FatalError(); } // CUDNN bug //std::cout<<"remove LINE "<<__LINE__<<" for debugging CUDNN bug"<desc, out[i]->dataGPU) ); }else{ std::vector bias_dim_bug; bias_dim_bug.push_back(bias_dim[0]); bias_dim_bug.push_back(bias_dim[1]); bias_dim_bug.push_back(bias_dim[2]); bias_dim_bug.push_back(1); for (int d=3;d bias_stride(bias_dim_bug.size()); bias_stride[bias_dim_bug.size()-1] = 1; for (int d=bias_dim_bug.size()-2;d>=0;--d){ bias_stride[d] = bias_stride[d+1] * bias_dim_bug[d+1]; } cudnnTensorDescriptor_t bias_desc_bug; checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&bias_desc_bug) ); checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(bias_desc_bug, CUDNNStorageT, bias_dim_bug.size(), &bias_dim_bug[0], &bias_stride[0]) ); std::vector out_dim_bug; out_dim_bug.push_back(out[i]->dim[0]); out_dim_bug.push_back(out[i]->dim[1]); out_dim_bug.push_back(out[i]->dim[2]); out_dim_bug.push_back(1); for (int d=3;ddim.size();++d) out_dim_bug[3] *= out[i]->dim[d]; std::vector strideA(out_dim_bug.size()); strideA[out_dim_bug.size()-1] = 1; for (int d=out_dim_bug.size()-2;d>=0;--d) strideA[d] = strideA[d+1] * out_dim_bug[d+1]; cudnnTensorDescriptor_t out_desc_bug; checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&out_desc_bug)); checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(out_desc_bug, CUDNNStorageT, out_dim_bug.size(), &out_dim_bug[0], &strideA[0]) ); checkCUDNN(__LINE__,cudnnAddTensor(cudnnHandle, CUDNN_ADD_SAME_C, one, bias_desc_bug, bias_dataGPU, one, out_desc_bug, out[i]->dataGPU) ); checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(bias_desc_bug) ); checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(out_desc_bug) ); } } }; void backward(Phase phase_){ for (int i=0;ineed_diff){ for (int g = 0; g < group; g++) { checkCUDNN(__LINE__,cudnnConvolutionBackwardData(cudnnHandle, one, filter_desc, weight_dataGPU + (g * weight_numel / group), out[i]->getDesc(group), out[i]->diffGPU + (g * out[i]->sizeofitem() / group), conv_desc, one, //zero, //one, //bbb in[i]->getDesc(group), in[i]->diffGPU + (g * in[i]->sizeofitem() / group))); } } } // compute in->diff first because the next layer need to use it immediate, and because weight_diff needs to write to another GPU for (int i=0;i0){ for (int g = 0; g < group; g++) { /* std::cout<<"in[i]->dim="; veciPrint(in[i]->dim); std::cout<<"out[i]->dim="; veciPrint(out[i]->dim); std::cout<<"conv_desc:"<getDesc(group), in[i]->dataGPU + (g * in[i]->sizeofitem() / group), out[i]->getDesc(group), out[i]->diffGPU + (g * out[i]->sizeofitem() / group), conv_desc, &beta, filter_desc, weight_diffGPU + (g * weight_numel / group))); } } if (bias_numel>0){ checkCUDNN(__LINE__,cudnnConvolutionBackwardBias(cudnnHandle, one, out[i]->desc, out[i]->diffGPU, &beta, bias_desc, bias_diffGPU)); } } } }; ~ConvolutionLayer(){ // destory the descriptor checkCUDNN(__LINE__,cudnnDestroyFilterDescriptor(filter_desc) ); checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(bias_desc) ); checkCUDNN(__LINE__,cudnnDestroyConvolutionDescriptor(conv_desc) ); }; }; class InnerProductLayer : public Layer { int num_input; int num_items; public: int num_output; StorageT* bias_multGPU; // a std::vector with size # of mini-batch training example InnerProductLayer(std::string name_, int num_output_, ComputeT weight_lr_mult_, Filler weight_filler_, ComputeT weight_filler_param_, ComputeT bias_lr_mult_, Filler bias_filler_, 
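// The fallback branch in forward() above works around a cudnnAddTensor limitation
// on tensors with more than 4 dimensions: the bias and the output are re-described
// as 4-D by folding every dimension beyond the third into the last one, which keeps
// the memory layout (and therefore the per-channel broadcast) unchanged.
// E.g. for a 3-D convolution with 96 outputs (sizes illustrative):
//     bias [1, 96, 1, 1, 1]      -> [1, 96, 1, 1]
//     out  [64, 96, 13, 13, 13]  -> [64, 96, 13, 169]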
ComputeT bias_filler_param_): Layer(name_),num_output(num_output_), bias_multGPU(NULL){ weight_filler = weight_filler_; weight_filler_param = weight_filler_param_; bias_filler = bias_filler_; bias_filler_param = bias_filler_param_; weight_lr_mult = weight_lr_mult_; bias_lr_mult = bias_lr_mult_; }; InnerProductLayer(JSON* json){ SetOrDie(json, name) SetValue(json, phase, TrainingTesting) SetValue(json, train_me, true) SetValue(json, weight_lr_mult, 1.0) SetValue(json, weight_filler, Xavier) SetValue(json, weight_filler_param, 0.0) SetValue(json, bias_lr_mult, 2.0) SetValue(json, bias_filler, Constant) SetValue(json, bias_filler_param, 0.0) SetValue(json, weight_decay_mult, 1.0) SetValue(json, bias_decay_mult, 1.0) SetOrDie(json, num_output ) }; size_t Malloc(Phase phase_){ size_t memoryBytes = 0; train_me = train_me && phase_ != Testing; std::cout<< (train_me? "* " : " "); std::cout<dim); num_items = in[0]->dim[0]; weight_dim.resize(2); weight_dim[0] = num_output; weight_dim[1] = num_input; bias_dim.resize(1); bias_dim[0] = num_output; weight_numel = numel(weight_dim); bias_numel = numel(bias_dim); if (weight_numel>0){ std::cout<<" weight"; veciPrint(weight_dim); checkCUDA(__LINE__, cudaMalloc(&weight_dataGPU, weight_numel * sizeofStorageT) ); memoryBytes += weight_numel * sizeofStorageT; //if (train_me){ // checkCUDA(__LINE__, cudaMalloc(&weight_diffGPU[0], weight_numel * sizeofStorageT) ); // checkCUDA(__LINE__, cudaMalloc(&weight_diffGPU[1], weight_numel * sizeofStorageT) ); // memoryBytes += 2* weight_numel * sizeofStorageT; //} } if (bias_numel>0){ std::cout<<" bias"; veciPrint(bias_dim); checkCUDA(__LINE__, cudaMalloc(&bias_dataGPU, bias_numel * sizeofStorageT) ); memoryBytes += bias_numel * sizeofStorageT; //if (train_me){ // checkCUDA(__LINE__, cudaMalloc(&bias_diffGPU[0], bias_numel * sizeofStorageT) ); // checkCUDA(__LINE__, cudaMalloc(&bias_diffGPU[1], bias_numel * sizeofStorageT) ); // memoryBytes += 2*bias_numel * sizeofStorageT; //} checkCUDA(__LINE__, cudaMalloc(&bias_multGPU, num_items * sizeofStorageT) ); Kernel_set_value<<>>(CUDA_GET_LOOPS(num_items), num_items, bias_multGPU, CPUCompute2StorageT(1)); memoryBytes += num_items * sizeofStorageT; } std::cout<need_diff = train_me || in[i]->need_diff; // if one of them need the grad std::vector dimOut(in[i]->dim.size()); dimOut[0] = in[i]->dim[0]; dimOut[1] = num_output; for (int d=2;ddim.size();++d) dimOut[d] = 1; //std::cout<<"dimOut"; veciPrint(dimOut); size_t dall = in[i]->receptive_field.size(); out[i]->receptive_field .resize(dall); out[i]->receptive_gap .resize(dall); out[i]->receptive_offset.resize(dall); for(size_t d=0;dreceptive_field[d] = in[i]->receptive_field[d] + ComputeT(in[i]->dim[d+2]-1) * in[i]->receptive_gap[d]; out[i]->receptive_gap[d] = 0; out[i]->receptive_offset[d] = 0; } memoryBytes += out[i]->Malloc(dimOut); } return memoryBytes; }; void forward(Phase phase_){ for (int i=0;idataGPU, weight_dataGPU, zero, out[i]->dataGPU); if (bias_numel>0) gemm(CUBLAS_OP_N, CUBLAS_OP_N, num_items, num_output, 1, one, bias_multGPU, bias_dataGPU, one, out[i]->dataGPU); */ checkCUBLAS(__LINE__, GPUgemm(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, num_output, num_items, num_input, oneComputeT, weight_dataGPU, num_input, in[i]->dataGPU, num_input, zeroComputeT, out[i]->dataGPU, num_output) ); /* { std::cout<<"1="; Tensor* weightCPU = new Tensor("",weight_dim); checkCUDA(__LINE__, cudaMemcpy(weightCPU->CPUmem, weight_dataGPU, weightCPU->numBytes(), cudaMemcpyDeviceToHost) ); weightCPU->print(veci(1,5)); std::cout<dim); 
checkCUDA(__LINE__, cudaMemcpy(weightCPU->CPUmem, in[i]->dataGPU, weightCPU->numBytes(), cudaMemcpyDeviceToHost) ); weightCPU->print(veci(1,5)); std::cout<dim); checkCUDA(__LINE__, cudaMemcpy(weightCPU->CPUmem, out[i]->dataGPU, weightCPU->numBytes(), cudaMemcpyDeviceToHost) ); weightCPU->print(veci(1,5)); std::cout<0) checkCUBLAS(__LINE__, GPUgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, num_output, num_items, 1, oneComputeT, bias_dataGPU, num_output, bias_multGPU, 1, oneComputeT, out[i]->dataGPU, num_output) ); } }; /* void gemm(cuTransA, cuTransB, M, N, K, alpha, A, B, beta, C) { int lda = (cuTransA == CUBLAS_OP_N) ? K : M; int ldb = (cuTransB == CUBLAS_OP_N) ? N : K; checkCUDA(__LINE__, GPUgemm( cuTransB, cuTransA, N, M, K, alpha, B, ldb, A, lda, beta, C, N) ); } */ void backward(Phase phase_){ for (int i=0;ineed_diff){ // gemm(CUBLAS_OP_N, CUBLAS_OP_N, num_items, num_input, num_output, one, out[i]->diffGPU, bias_dataGPU, zero, in[i]->diffGPU); //std::cout<<"====== "<dim"; veciPrint(in[i]->dim); //std::cout<<"out[i]->dim"; veciPrint(out[i]->dim); checkCUBLAS(__LINE__, GPUgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, num_input, num_items, num_output, oneComputeT, weight_dataGPU, num_input, out[i]->diffGPU, num_output, oneComputeT, in[i]->diffGPU, num_input) ); } } for (int i=0;i0){ //gemm(CUBLAS_OP_T, CUBLAS_OP_N, num_output, num_input, num_items, one, out[i]->diffGPU, in[i]->dataGPU, one, weight_diffGPU); checkCUBLAS(__LINE__, GPUgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, num_input, num_output, num_items, oneComputeT, in[i]->dataGPU, num_input, out[i]->diffGPU, num_output, &beta, weight_diffGPU, num_input) ); } if (bias_numel>0){ //checkCUBLAS(__LINE__, GPUgemv(cublasHandle, CUBLAS_OP_N, num_output, num_items, oneComputeT, out[i]->diffGPU, num_output, bias_multGPU, 1, &beta, bias_diffGPU, 1)); checkCUBLAS(__LINE__, GPUgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, num_output, 1, num_items, oneComputeT, out[i]->diffGPU, num_output, bias_multGPU, num_items, &beta, bias_diffGPU, num_output) ); } } } }; ~InnerProductLayer(){ if (bias_multGPU!=NULL) checkCUDA(__LINE__, cudaFree(bias_multGPU)); }; }; class DropoutLayer: public Layer{ ComputeT scale; std::bernoulli_distribution* distribution; std::future lock; bool current_mask; std::vector< StorageT* > GPUmask[2]; std::vector< StorageT* > CPUmask; std::vector BYTESmask; std::vector SIZEmask; public: ComputeT dropout_rate; void init(){ current_mask = true; distribution = new std::bernoulli_distribution(dropout_rate); scale = 1. / (1. 
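// How the InnerProduct GEMMs above map onto (column-major) cuBLAS: the input batch
// is viewed as a num_input x num_items matrix with one column per item, and the
// weights -- stored as a row-major num_output x num_input block -- are passed with
// CUBLAS_OP_T so that cuBLAS effectively multiplies by W itself:
//     out (num_output x num_items)  = W * in
//     out                          += bias (num_output x 1) * ones (1 x num_items)
// The second, rank-1 GEMM broadcasts the bias to every column via the bias_multGPU
// vector of ones allocated in Malloc(); the backward pass reuses the same trick
// with operands swapped to get W^T * out_diff and out_diff * in^T.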
- dropout_rate); }; DropoutLayer(std::string name_, ComputeT dropout_rate_): Layer(name_), dropout_rate(dropout_rate_){ init(); }; DropoutLayer(JSON* json){ SetOrDie(json, name) SetValue(json, phase, TrainingTesting) SetValue(json, dropout_rate, 0.5) init(); }; void generateMask(){ checkCUDA(__LINE__,cudaSetDevice(GPU)); StorageT zeroStorageT = CPUCompute2StorageT(ComputeT(0)); StorageT scaleStorageT = CPUCompute2StorageT(scale); for (int i=0;ineed_diff = in[i]->need_diff; out[i]->receptive_field = in[i]->receptive_field; out[i]->receptive_gap = in[i]->receptive_gap; out[i]->receptive_offset = in[i]->receptive_offset; memoryBytes += out[i]->Malloc(in[i]->dim); } return memoryBytes; }; void forward(Phase phase_){ for (int i=0;idesc, in[i]->dataGPU, zero, out[i]->desc, out[i]->dataGPU)); } }; void backward(Phase phase_){ for (int i=0;ineed_diff){ //debug //fillGPU(in[i]->diffGPU, in[i]->dim, Constant, 0.0); //tic(); if (stable_gradient){ if (in[i]->diffGPU != out[i]->diffGPU){ //checkCUDA(__LINE__, cudaMemcpy(in[i]->diffGPU, out[i]->diffGPU, numel(in[i]->dim)*sizeofStorageT, cudaMemcpyDeviceToDevice) ); //checkCUBLAS(__LINE__, GPUaxpy(cublasHandle, numel(in[i]->dim), oneComputeT, out[i]->diffGPU, 1, in[i]->diffGPU, 1) ); xpy(numel(in[i]->dim), out[i]->diffGPU, in[i]->diffGPU); } }else{ checkCUDNN(__LINE__,cudnnSoftmaxBackward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, //CUDNN_SOFTMAX_MODE_CHANNEL, one, out[i]->desc, out[i]->dataGPU, out[i]->desc, out[i]->diffGPU, zero, //one, //bbb in[i]->desc, in[i]->diffGPU)); } // hack softmax error /* */ //toc(); /* { std::cout<<"out[i]->dataGPU:"<* cpu = new Tensor("",out[i]->dim); checkCUDA(__LINE__, cudaMemcpy(cpu->CPUmem, out[i]->dataGPU, cpu->numBytes(), cudaMemcpyDeviceToHost) ); cpu->print(veci(2,64,10)); std::cout<diffGPU:"<* cpu = new Tensor("",out[i]->dim); checkCUDA(__LINE__, cudaMemcpy(cpu->CPUmem, out[i]->diffGPU, cpu->numBytes(), cudaMemcpyDeviceToHost) ); cpu->print(veci(2,64,10)); std::cout<diffGPU:"<* cpu = new Tensor("",in[i]->dim); checkCUDA(__LINE__, cudaMemcpy(cpu->CPUmem, in[i]->diffGPU, cpu->numBytes(), cudaMemcpyDeviceToHost) ); cpu->print(veci(2,64,10)); std::cout<diffGPU, numel(in[i]->dim))>0){ std::cout<<"out[i]->dataGPU= "; checkNaN(out[i]->dataGPU, numel(out[i]->dim)); std::cout<diffGPU= "; checkNaN(out[i]->diffGPU, numel(out[i]->dim)); std::cout<need_diff = in[i]->need_diff; out[i]->receptive_field = in[i]->receptive_field; out[i]->receptive_gap = in[i]->receptive_gap; out[i]->receptive_offset = in[i]->receptive_offset; memoryBytes += out[i]->Malloc(in[i]->dim); } return memoryBytes; }; void forward(Phase phase_){ for (int i=0;idesc, in[i]->dataGPU, zero, out[i]->desc, out[i]->dataGPU)); } }; void backward(Phase phase_){ for (int i=0;ineed_diff){ checkCUDNN(__LINE__,cudnnActivationBackward(cudnnHandle, mode, one, out[i]->desc, out[i]->dataGPU, out[i]->desc, out[i]->diffGPU, in[i]->desc, in[i]->dataGPU, zero, //one, //bbb in[i]->desc, in[i]->diffGPU)); } } }; }; class PoolingLayer : public Layer { cudnnPoolingDescriptor_t desc; public: cudnnPoolingMode_t mode; std::vector window; std::vector padding; std::vector stride; void init(){ //std::cout<<"window"; veciPrint(window); //std::cout<<"padding"; veciPrint(padding); //std::cout<<"stride"; veciPrint(stride); checkCUDNN(__LINE__,cudnnCreatePoolingDescriptor(&desc) ); //#ifdef ND_CONV checkCUDNN(__LINE__,cudnnSetPoolingNdDescriptor(desc, mode, window.size(), &window[0], &padding[0], &stride[0])); /* #else 
checkCUDNN(__LINE__,cudnnSetPooling2dDescriptor(desc, mode, window[0], window[1], padding[0], padding[1], stride[0], stride[1])); #endif */ }; PoolingLayer(std::string name_, cudnnPoolingMode_t mode_, std::vector window_, std::vector padding_, std::vector stride_): Layer(name_), mode(mode_), window(window_), padding(padding_), stride(stride_){ init(); }; PoolingLayer(JSON* json){ SetOrDie(json, name) SetValue(json, phase, TrainingTesting) SetValue(json, mode, CUDNN_POOLING_MAX) SetOrDie(json, window ) std::vector zeros = std::vector(window.size(),0); SetValue(json, padding, zeros) SetValue(json, stride, window) init(); }; size_t Malloc(Phase phase_){ size_t memoryBytes=0; std::cout<< (train_me? "* " : " "); std::cout<need_diff = in[i]->need_diff; // compute the size to allocate memory std::vector dimOut(in[i]->dim.size()); dimOut[0] = in[i]->dim[0]; // size of mini-bath dimOut[1] = in[i]->dim[1]; // channels for (int d=2;ddim.size();++d){ //dimOut[d] = 1 + (in[i]->dim[d] + 2*padding[d-2] - window[d-2])/stride[d-2]; dimOut[d] = 1 + static_cast(ceil(static_cast(in[i]->dim[d] + 2*padding[d-2] - window[d-2])/stride[d-2])); } size_t dall = in[i]->receptive_field.size(); out[i]->receptive_field .resize(dall); out[i]->receptive_gap .resize(dall); out[i]->receptive_offset.resize(dall); for(size_t d=0;dreceptive_field[d] = in[i]->receptive_field[d] + ComputeT(window[d]-1) * in[i]->receptive_gap[d]; out[i]->receptive_gap[d] = stride[d] * in[i]->receptive_gap[d]; out[i]->receptive_offset[d] = in[i]->receptive_offset[d] - ComputeT(padding[d]) * in[i]->receptive_gap[d]; } memoryBytes += out[i]->Malloc(dimOut); } return memoryBytes; }; void forward(Phase phase_){ for (int i=0;icheckNaN(); //in[i]->print(24*24*2); //std::cout<<"pooling out[i]: "; //out[i]->checkNaN(); //out[i]->print(12*12*2); // CUDNN bug checkCUDNN(__LINE__,cudnnPoolingForward(cudnnHandle, desc, one, in[i]->desc, in[i]->dataGPU, zero, out[i]->desc, out[i]->dataGPU)); //std::cout<<"pooling out[i]: "; //out[i]->checkNaN(); //out[i]->print(12*12*2); //FatalError(); } }; void backward(Phase phase_){ for (int i=0;ineed_diff){ checkCUDNN(__LINE__,cudnnPoolingBackward(cudnnHandle, desc, one, out[i]->desc, out[i]->dataGPU, out[i]->desc, out[i]->diffGPU, in[i]->desc, in[i]->dataGPU, one, //zero, //one, //bbb in[i]->desc, in[i]->diffGPU)); } } }; ~PoolingLayer(){ checkCUDNN(__LINE__,cudnnDestroyPoolingDescriptor(desc) ); }; }; class LRNLayer : public Layer { cudnnLRNDescriptor_t desc; public: LRN mode; unsigned int local_size; ComputeT alpha; ComputeT beta; ComputeT k; void init(){ if (local_sizeCUDNN_LRN_MAX_N){ std::cout<<"LRN local_size out of range ["<< CUDNN_LRN_MIN_N <<","<< CUDNN_LRN_MAX_N <<"]: local_size="<need_diff = in[i]->need_diff; out[i]->receptive_field = in[i]->receptive_field; out[i]->receptive_gap = in[i]->receptive_gap; out[i]->receptive_offset = in[i]->receptive_offset; memoryBytes += out[i]->Malloc(in[i]->dim); } return memoryBytes; }; void forward(Phase phase_){ for (int i=0;idesc, in[i]->dataGPU, zero, out[i]->desc, out[i]->dataGPU)); break; case DivisiveNormalization: #ifdef CUDNN_DivisiveNormalization // What is the Best Multi-Stage Architecture for Object Recognition? 
// http://yann.lecun.com/exdb/publis/pdf/jarrett-iccv-09.pdf std::cout<<"Not implemented yet"<desc, in[i]->dataGPU, srcMeansData, tempData, tempData2, zero, out[i]->desc, out[i]->dataGPU)); #endif break; } } }; void backward(Phase phase_){ for (int i=0;ineed_diff){ switch(mode){ case CrossChannel: checkCUDNN(__LINE__,cudnnLRNCrossChannelBackward(cudnnHandle, desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, one, out[i]->desc /*srcDesc*/, out[i]->dataGPU /*srcData*/, out[i]->desc /*srcDiffDesc*/, out[i]->diffGPU /*srcDiffData*/, in[i]->desc /*destDesc*/, in[i]->dataGPU /*destData*/, zero, //one, //bbb in[i]->desc /*destDiffDesc*/, in[i]->diffGPU /*destDiffData*/)); break; case DivisiveNormalization: #ifdef CUDNN_DivisiveNormalization std::cout<<"Not implemented yet"<desc /*srcDesc*/, out[i]->dataGPU /*srcData*/, srcMeansData /*srcMeansData*/, out[i]->diffGPU /*srcDiffData*/, tempData /*tempData*/, tempData2 /*tempData2*/, zero, //one, //bbb in[i]->desc /*destDataDesc*/, in[i]->diffGPU /*destDataDiff*/, destMeansDiff /*destMeansDiff*/)); #endif break; } } } }; }; class ReshapeLayer: public Layer { public: std::vector shape; ReshapeLayer(std::string name_, Phase phase_): Layer(name_){ phase = phase_; }; ReshapeLayer(JSON* json){ SetOrDie(json, name) SetValue(json, phase, TrainingTesting) SetOrDie(json, shape) bool remainExist = false; for(int d=0;dneed_diff = in[i]->need_diff; std::vector dim; for(int d=0;ddim[d]); }else if (shape[d]==-1){ dim.push_back(-1); }else{ dim.push_back(shape[d]); } } int remain = numel(in[i]->dim)/numel(dim); if (remain!=1){ remain = -remain; for(int d=0;dreceptive_field = in[i]->receptive_field; out[i]->receptive_gap = in[i]->receptive_gap; memoryBytes += out[i]->Malloc(dim); } return memoryBytes; }; void forward(Phase phase_){ for (int i=0;idataGPU, in[i]->dataGPU, in[i]->numBytes(), cudaMemcpyDeviceToDevice)); } }; void backward(Phase phase_){ for(int i=0;ineed_diff){ size_t N = numel(in[i]->dim); Kernel_elementwise_acc<<>>(CUDA_GET_LOOPS(N), N, in[i]->diffGPU, out[i]->diffGPU); } } }; }; class ROILayer: public Layer { public: std::vector shape; ROILayer(std::string name_, Phase phase_): Layer(name_){ phase = phase_; }; ROILayer(JSON* json){ SetOrDie(json, name) SetValue(json, phase, TrainingTesting) SetOrDie(json, shape) }; size_t Malloc(Phase phase_){ size_t memoryBytes = 0; std::cout<< (train_me? "* " : " "); std::cout<dim.size() != shape.size()+1) { std::cout<need_diff = in[i*2]->need_diff; if (! 
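// Reshape semantics implemented above: an entry of 0 keeps the corresponding input
// dimension, and a single -1 absorbs whatever is needed to preserve numel. E.g.
// shape = [0, -1] turns an input of dim [64, 256, 6, 6] into [64, 9216]
// (9216 = 256*6*6), the usual flattening in front of an InnerProduct layer.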
(in[i*2+1]->dim[0] == in[i*2]->dim[0] && sizeofitem(in[i*2+1]->dim) == shape.size())){ std::cout<dim is wrong"< dim; dim.push_back(in[i*2]->dim[0]); for(int d=0;ddim[d+1]); }else{ dim.push_back(shape[d]); } } out[i]->receptive_field = in[i*2]->receptive_field; out[i]->receptive_gap = in[i*2]->receptive_gap; out[i]->receptive_offset = in[i*2]->receptive_offset; memoryBytes += out[i]->Malloc(dim); } return memoryBytes; }; void forward(Phase phase_){ for (int i=0;idim); switch(shape.size()){ case 3: Kernel_ROIforward_2D<<>>(CUDA_GET_LOOPS(N), N, out[i]->dataGPU, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3]); break; case 4: Kernel_ROIforward_3D<<>>(CUDA_GET_LOOPS(N), N, out[i]->dataGPU, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], out[i]->dim[4], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4]); break; case 5: Kernel_ROIforward_4D<<>>(CUDA_GET_LOOPS(N), N, out[i]->dataGPU, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], out[i]->dim[4], out[i]->dim[5], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4], in[i*2]->dim[5]); break; default: std::cerr<<"Haven't implemented yet"<need_diff){ size_t N = numel(out[i]->dim); switch(shape.size()){ case 3: Kernel_ROIbackward_2D<<>>(CUDA_GET_LOOPS(N), N, out[i]->diffGPU, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3]); break; case 4: Kernel_ROIbackward_3D<<>>(CUDA_GET_LOOPS(N), N, out[i]->diffGPU, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], out[i]->dim[4], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4]); break; case 5: Kernel_ROIbackward_4D<<>>(CUDA_GET_LOOPS(N), N, out[i]->diffGPU, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], out[i]->dim[4], out[i]->dim[5], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4], in[i*2]->dim[5]); break; default: std::cerr<<"Haven't implemented yet"< GPUIndex; public: ComputeT spatial_scale; std::vector shape; ROIPoolingLayer(std::string name_, Phase phase_): Layer(name_){ phase = phase_; }; ROIPoolingLayer(JSON* json){ SetOrDie(json, name) SetValue(json, phase, TrainingTesting) SetOrDie(json, shape) SetOrDie(json, spatial_scale) }; ~ROIPoolingLayer(){ for (int i=0;idim.size() != shape.size()+2) { std::cout<need_diff = in[i*2]->need_diff; if ( sizeofitem(in[i*2+1]->dim) != 1 + 2 * shape.size() ){ std::cout<dim is wrong"< dim; dim.push_back(in[i*2+1]->dim[0]); // number of boxes dim.push_back(in[i*2]->dim[1]); // number of channels from convolutions for(int d=0;dMalloc(dim); if (in[i*2]->need_diff){ checkCUDA(__LINE__, cudaMalloc(&GPUIndex[i], numel(out[i]->dim) * sizeof(size_t)) ); memoryBytes += numel(out[i]->dim) * sizeof(size_t); } } return memoryBytes; }; void forward(Phase phase_){ for (int i=0;idim); switch(shape.size()){ case 2: Kernel_ROIPoolForward_2D<<>>(CUDA_GET_LOOPS(N), N, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dataGPU, GPUIndex[i], spatial_scale, in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], shape[0], shape[1]); break; case 3: Kernel_ROIPoolForward_3D<<>>(CUDA_GET_LOOPS(N), N, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dataGPU, GPUIndex[i], spatial_scale, in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4], shape[0], shape[1], shape[2]); break; default: std::cerr<<"Haven't 
implemented yet"<need_diff){ size_t N = numel(in[i*2]->dim); switch(shape.size()){ case 2: Kernel_ROIPoolBackward_2D<<>>(CUDA_GET_LOOPS(N), N, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->diffGPU, GPUIndex[i], spatial_scale, in[i*2+1]->dim[0], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], shape[0], shape[1]); break; case 3: Kernel_ROIPoolBackward_3D<<>>(CUDA_GET_LOOPS(N), N, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->diffGPU, GPUIndex[i], spatial_scale, in[i*2+1]->dim[0], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4], shape[0], shape[1], shape[2]); break; default: std::cerr<<"Haven't implemented yet"<need_diff = false; for(int i=j*in_group; i<(j+1)*in_group;i++){ if (in[i]->need_diff){ out[j]->need_diff = true; break; } } out[j]->receptive_field = in[j*in_group]->receptive_field; out[j]->receptive_gap = in[j*in_group]->receptive_gap; for(int i=j*in_group+1; i<(j+1)*in_group;i++){ for(size_t d=0; dreceptive_field.size();++d){ out[j]->receptive_field[d] = max(out[j]->receptive_field[d],in[i]->receptive_field[d]); out[j]->receptive_gap [d] = max(out[j]->receptive_gap [d],in[i]->receptive_gap [d]); } } memoryBytes += out[j]->Malloc(in[j*in_group]->dim); } return memoryBytes; }; void forward(Phase phase_){ switch(mode){ case ElementWise_EQL: for (int i=0;idim); GPU_set_ones(n, out[i]->dataGPU); for (int j=i*in_group+1; j<(i+1)*in_group; ++j){ GPU_elementwise_comparison(n, out[i]->dataGPU, in[i*in_group]->dataGPU, in[j]->dataGPU); } } break; case ElementWise_MUL: std::cout<<"Not implemented yet"<need_diff){ switch(mode){ case ElementWise_EQL: std::cout<<"ElementWise_EQL cannot backprop"<need_diff = false; for(int i=j*in_group; i<(j+1)*in_group;i++){ if (in[i]->need_diff){ out[j]->need_diff = true; break; } } std::vector dim = in[j*in_group]->dim; for(int i=j*in_group+1; i<(j+1)*in_group;i++){ dim[1] += in[i]->dim[1]; } out[j]->receptive_field = in[j*in_group]->receptive_field; out[j]->receptive_gap = in[j*in_group]->receptive_gap; out[j]->receptive_offset = in[j*in_group]->receptive_offset; for(int i=j*in_group+1; i<(j+1)*in_group;i++){ for(size_t d=0; dreceptive_field.size();++d){ out[j]->receptive_field [d] = max(out[j]->receptive_field [d],in[i]->receptive_field [d]); out[j]->receptive_gap [d] = max(out[j]->receptive_gap [d],in[i]->receptive_gap [d]); out[j]->receptive_offset[d] = min(out[j]->receptive_offset[d],in[i]->receptive_offset[d]); } } memoryBytes += out[j]->Malloc(dim); } return memoryBytes; }; void forward(Phase phase_){ for(int j=0;jdim[0]; for(int i=j*in_group; i<(j+1)*in_group;i++){ copyGPUforward (numofitems, in[i]->dataGPU, out[j]->dataGPU, sizeofitem(in[i]->dim), sizeofitem(out[j]->dim), offset); offset += sizeofitem(in[i]->dim); } } }; void backward(Phase phase_){ for(int j=0;jdim[0]; for(int i=j*in_group; i<(j+1)*in_group;i++){ if (in[i]->need_diff){ copyGPUbackward(numofitems, in[i]->diffGPU, out[j]->diffGPU, sizeofitem(in[i]->dim), sizeofitem(out[j]->dim), offset); } offset += sizeofitem(in[i]->dim); } } }; }; class LossLayer : public Layer { StorageT* loss_values; StorageT* loss_weightsGPU; size_t loss_numel; int numExamples; ComputeT scale; public: ComputeT result; ComputeT loss; LossObjective mode; ComputeT loss_weight; std::vector loss_weights; ComputeT margin; LossLayer(std::string name_, LossObjective mode_, ComputeT loss_weight_): Layer(name_), mode(mode_), loss_weight(loss_weight_), loss_values(NULL), loss_weightsGPU(NULL){ train_me = false; }; LossLayer(JSON* json): loss_values(NULL), loss_weightsGPU(NULL){ SetOrDie(json, name) 
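// Usage sketch (illustrative): besides being instantiated from the network
// JSON, a loss layer can be constructed programmatically with the first
// constructor above, e.g.
//
//   LossLayer* loss = new LossLayer("loss",
//                                   MultinomialLogistic_StableSoftmax,
//                                   /*loss_weight=*/1);
//   // wire it up with addIn()/addOut() before the network's Malloc() runs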
SetValue(json, phase, TrainingTesting) SetOrDie(json, mode) SetValue(json, loss_weight, 1) SetValue(json, margin, 1) SetValue(json, loss_weights, std::vector()) train_me = false; }; ~LossLayer(){ if (loss_values!=NULL) checkCUDA(__LINE__, cudaFree(loss_values)); if (loss_weightsGPU!=NULL) checkCUDA(__LINE__, cudaFree(loss_weightsGPU)); }; size_t Malloc(Phase phase_){ std::cout<< (train_me? "* " : " "); std::cout<dim[0]; switch(mode){ case MultinomialLogistic_StableSoftmax: case MultinomialLogistic: if (!(in.size()==2 || in.size()==3)) { std::cout<<"LossLayer: MultinomialLogistic should have 2 or 3 ins"<dim,in[1]->dim)){ std::cout<<"LossLayer: MultinomialLogistic should have the same dimensions except channels"<dim[1]!=1) { std::cout<<"LossLayer: MultinomialLogistic in[1] should have only 1 channel"<dim); break; case SmoothL1: if (!(in.size()==2 || in.size()==3)) { std::cout<<"LossLayer: SmoothL1 should have 2 or 3 ins"<dim,in[1]->dim)){ std::cout<<"LossLayer: SmoothL1 should have the same dimensions"<dim,in[2]->dim)){ std::cout<<"LossLayer: SmoothL1 should have the same dimensions"<dim); break; case Contrastive: loss_numel = numExamples; break; case EuclideanSSE: break; case HingeL1: break; case HingeL2: break; case SigmoidCrossEntropy: break; case Infogain: break; } scale = loss_weight / loss_numel; memoryBytes += loss_numel * sizeofStorageT; checkCUDA(__LINE__, cudaMalloc( &loss_values, memoryBytes) ); if (loss_weights.size()>0){ size_t newBytes = loss_weights.size() * sizeofStorageT; checkCUDA(__LINE__, cudaMalloc( &loss_weightsGPU, newBytes) ); memoryBytes += newBytes; StorageT* CPUram = new StorageT[loss_weights.size()]; for (int i=0;i>>(CUDA_GET_LOOPS(loss_numel),loss_numel,in[0]->dim[1],numspel(in[0]->dim),in[0]->dataGPU,in[1]->dataGPU,loss_weightsGPU,loss_values); checkCUBLAS(__LINE__, GPUasum(cublasHandle, loss_numel, loss_values, 1, &resultSum)); result += resultSum/loss_numel; Loss_MultinomialLogistic<<>>(CUDA_GET_LOOPS(loss_numel),loss_numel,in[0]->dim[1],numspel(in[0]->dim),in[0]->dataGPU,in[1]->dataGPU,loss_weightsGPU,(in.size()==3 ? in[2]->dataGPU : NULL),loss_values); break; case SmoothL1: Loss_SmoothL1<<>>(CUDA_GET_LOOPS(loss_numel),loss_numel,in[0]->dataGPU,in[1]->dataGPU,(in.size()==3 ? in[2]->dataGPU : NULL),loss_values); break; case Contrastive: Loss_Contrastive<<>>(CUDA_GET_LOOPS(loss_numel),loss_numel,in[0]->dim[1],margin,in[0]->dataGPU,in[1]->dataGPU,in[2]->dataGPU,loss_values); break; case EuclideanSSE: break; case HingeL1: break; case HingeL2: break; case SigmoidCrossEntropy: break; case Infogain: break; } ComputeT lossSum; checkCUBLAS(__LINE__, GPUasum(cublasHandle, loss_numel, loss_values, 1, &lossSum)); loss += lossSum/loss_numel; }; void backward(Phase phase_){ //return; // either write this in Cuda or get both the prediction and ground truth to CPU and do the computation and write the diff back to GPU if (in[0]->need_diff){ switch(mode){ case MultinomialLogistic_StableSoftmax: LossGrad_MultinomialLogistic_StableSoftmax<<>>(CUDA_GET_LOOPS(loss_numel),loss_numel,in[0]->dim[1],numspel(in[0]->dim),scale,in[0]->dataGPU,in[1]->dataGPU,loss_weightsGPU,(in.size()==3 ? in[2]->dataGPU : NULL),in[0]->diffGPU); break; case MultinomialLogistic: //checkCUDA(__LINE__, cudaMemset(in[0]->diffGPU, 0, sizeofStorageT * numel(in[0]->dim))); // it has been zero out in Net::backward() LossGrad_MultinomialLogistic<<>>(CUDA_GET_LOOPS(loss_numel),loss_numel,in[0]->dim[1],numspel(in[0]->dim),scale,in[0]->dataGPU,in[1]->dataGPU,loss_weightsGPU,(in.size()==3 ? 
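// Bookkeeping used by this layer: the kernels fill loss_values with one
// value per example (or per spatial location), GPUasum reduces them, and the
// mean is accumulated into `result` / `loss`. Malloc() sets
//
//   //   scale = loss_weight / loss_numel
//
// and backward() multiplies every gradient element by that scale, so the
// gradient magnitude is independent of batch size and loss_weight trades off
// multiple loss layers. An optional third input (in[2]) provides per-instance
// weights and is forwarded to the kernels as shown, or NULL when only two
// inputs are connected.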
in[2]->dataGPU : NULL),in[0]->diffGPU); break; case SmoothL1: LossGrad_SmoothL1<<>>(CUDA_GET_LOOPS(loss_numel),loss_numel,scale,in[0]->dataGPU,in[1]->dataGPU,(in.size()==3 ? in[2]->dataGPU : NULL),in[0]->diffGPU); break; case Contrastive: LossGrad_Contrastive<<>>(CUDA_GET_LOOPS(loss_numel),loss_numel,in[0]->dim[1],margin,scale,in[0]->dataGPU,in[1]->dataGPU,in[2]->dataGPU,in[0]->diffGPU,in[1]->diffGPU); break; case EuclideanSSE: break; case HingeL1: break; case HingeL2: break; case SigmoidCrossEntropy: break; case Infogain: break; } } }; }; ////////////////////////////////////////////////////////////////////////////////////////////////// // Add your new layers here ////////////////////////////////////////////////////////////////////////////////////////////////// #include "dss.hpp" ////////////////////////////////////////////////////////////////////////////////////////////////// // Net ////////////////////////////////////////////////////////////////////////////////////////////////// class Net{ public: Phase phase; std::vector layers; std::vector responses; std::vector loss_layers; int GPU; bool debug_mode; int train_iter; int test_iter; cudnnHandle_t cudnnHandle; cublasHandle_t cublasHandle; void init(JSON* architecture_obj){ checkCUDA(__LINE__,cudaSetDevice(GPU)); checkCUDNN(__LINE__,cudnnCreate(&cudnnHandle) ); checkCUBLAS(__LINE__, cublasCreate(&cublasHandle) ); for (int l=0;larray.size();++l){ JSON* p = (JSON*)(architecture_obj->array[l]); std::string type = p->member["type"]->returnString(); Layer* pLayer; Response* pResponse; if (0==type.compare("MemoryData")) pLayer = new MemoryDataLayer(p); else if (0==type.compare("DiskData")){ uint8_t fpTypeid = readTypeID(p->member["file_data"]->returnString()); //std::vector file_data = p->member["file_data"]->returnStringVector(); //uint8_t fpTypeid = readTypeID(file_data[0]); if (fpTypeid==typeID(typeid(half))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(float))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(double))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(uint8_t))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(uint16_t))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(uint32_t))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(uint64_t))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(int8_t))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(int16_t))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(int32_t))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(int64_t))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(char))) pLayer = new DiskDataLayer(p); else if (fpTypeid==typeID(typeid(bool))) pLayer = new DiskDataLayer(p); } else if (0==type.compare("ElementWise")) pLayer = new ElementWiseLayer(p); else if (0==type.compare("Concat")) pLayer = new ConcatLayer(p); else if (0==type.compare("Convolution")) pLayer = new ConvolutionLayer(p); else if (0==type.compare("Reshape")) pLayer = new ReshapeLayer(p); else if (0==type.compare("InnerProduct")) pLayer = new InnerProductLayer(p); else if (0==type.compare("Pooling")) pLayer = new PoolingLayer(p); else if (0==type.compare("Dropout")) pLayer = new DropoutLayer(p); else if (0==type.compare("Activation")) pLayer = new ActivationLayer(p); else if (0==type.compare("LRN")) pLayer = new LRNLayer(p); else if (0==type.compare("Softmax")) pLayer = new SoftmaxLayer(p); else if (0==type.compare("ROI")) pLayer = new 
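// To register a new layer type (for example one defined in dss.hpp above),
// extend this dispatch chain with one more string comparison; illustrative
// pattern, "MyLayer" is a placeholder name:
//
//   else if (0==type.compare("MyLayer")) pLayer = new MyLayer(p);
//
// The JSON constructor of the new class is expected to read its parameters
// with the same SetValue / SetOrDie macros used by the layers above.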
ROILayer(p); else if (0==type.compare("ROIPooling")) pLayer = new ROIPoolingLayer(p); else if (0==type.compare("Tensor")) pLayer = new TensorLayer(p); else if (0==type.compare("Loss")) { pLayer = new LossLayer(p); loss_layers.push_back((LossLayer*)pLayer); } else if (0==type.compare("Scene3DData")) { pLayer = new Scene3DDataLayer(p); } else if (0==type.compare("Scene2DData")) { pLayer = new Scene2DDataLayer(p); } else if (0==type.compare("RPNDataLayer")) { pLayer = new RPNDataLayer(p); } else if (0==type.compare("RenderMeshDataLayer")) { pLayer = new RenderMeshDataLayer(p); } else { std::cout<<"ERROR: recognizable layer in JSON file: "<cudnnHandle = cudnnHandle; pLayer->cublasHandle = cublasHandle; pLayer->GPU = GPU; addLayer(pLayer); if (p->member.find("out") != p->member.end()){ std::vector out = p->member["out"]->returnStringVector(); for (int i=0;icublasHandle = cublasHandle; } pLayer->addOut(pResponse); } } if (p->member.find("in") != p->member.end()){ std::vector in = p->member["in"]->returnStringVector(); for (int i=0;icublasHandle = cublasHandle; } pLayer->addIn(pResponse); } } } }; Net(std::string filename){ JSON* test_obj = new JSON; JSON* architecture_obj = new JSON; parseNetworkJSON(filename, NULL, test_obj, architecture_obj); SetValue(test_obj, GPU, 0) SetValue(test_obj, debug_mode, false) init(architecture_obj); delete test_obj; delete architecture_obj; }; Net(JSON* architecture_obj, int GPU_ = 0): GPU(GPU_){ init(architecture_obj); }; ~Net(){ checkCUDA(__LINE__,cudaSetDevice(GPU)); for (int i=0;iname == name){ return layers[l]; } } return NULL; }; Response* getResponse(std::string name){ for (int r=0; rname == name){ return responses[r]; } } return NULL; }; Layer* addLayer(Layer* pLayer){ layers.push_back(pLayer); return pLayer; }; Response* addResponse(Response* pResponse){ responses.push_back(pResponse); return pResponse; }; void randInit(){ checkCUDA(__LINE__,cudaSetDevice(GPU)); for (int l=0; lrandInit(); } }; void loadWeights(std::vector*> weights, bool diff=false){ checkCUDA(__LINE__,cudaSetDevice(GPU)); // let the layers find their weights based on their names for (int l=0; lsetWeights(weights); if (diff) layers[l]->setDiffs(weights); } }; void loadWeights(std::string filename, bool diff=false){ std::cout<< "====================================================================================================================================="<*> weights = readTensors(filename); loadWeights(weights, diff); // release memory for the weights for (int i=0; isaveWeights(fp); if (diff) layers[l]->saveDiffs(fp); } fclose(fp); }; size_t Malloc(Phase phase_ = Testing){ checkCUDA(__LINE__,cudaSetDevice(GPU)); phase = phase_; std::cout<< "====================================================================================================================================="<Malloc(phase); } std::cout<< "====================================================================================================================================="<> Total GPU memory: " << memoryBytes <<" Bytes = " << memoryBytes/(1024.0*1024.0*1024.0) << " GB on GPU #" << GPU <phase == phase || layers[l]->phase == TrainingTesting){ if (debug_mode){ std::cout<<"[Forward] Layer["<name; ComputeT avg; avg = layers[l]->ameanWeightData(); if (avg!=-1) std::cout<<" weight.data: "<< avg; avg = layers[l]->ameanBiasData(); if (avg!=-1) std::cout<<" bias.data: "<< avg; tic(); if (l==21){ if (layers[l]->in.size()>0){ for (size_t o=0; oin.size();++o){ ComputeT avg = layers[l]->in[o]->ameanData(); if (avg!=-1) std::cout<<" 
in[" << o << "].data: " << avg; layers[l]->in[o]->checkNaN(); } } std::cout<forward(phase); if (debug_mode){ checkCUDA(__LINE__,cudaDeviceSynchronize()); checkCUDA(__LINE__,cudaGetLastError()); if (l==21){ if (layers[l]->in.size()>0){ for (size_t o=0; oin.size();++o){ ComputeT avg = layers[l]->in[o]->ameanData(); if (avg!=-1) std::cout<<" in[" << o << "].data: " << avg; layers[l]->in[o]->checkNaN(); } } std::cout<out.size()>0){ for (size_t o=0; oout.size();++o){ ComputeT avg = layers[l]->out[o]->ameanData(); if (avg!=-1) std::cout<<" out[" << o << "].data: " << avg; layers[l]->out[o]->checkNaN(); } } std::cout<clearDiff(); } //std::cout<<"============ the line end ==========" <=0; --l){ if (layers[l]->phase == phase || layers[l]->phase == TrainingTesting){ if (debug_mode){ std::cout<<"[Backward] Layer["<name; tic(); } layers[l]->backward(phase); if (debug_mode){ checkCUDA(__LINE__,cudaDeviceSynchronize()); checkCUDA(__LINE__,cudaGetLastError()); ComputeT avg; avg = layers[l]->ameanWeightDiff(); if (avg!=-1) std::cout<<" weight.diff: "<ameanBiasDiff(); if (avg!=-1) std::cout<<" bias.diff: "<in.size()>0){ for (size_t i=0;iin.size();++i){ avg = layers[l]->in[i]->ameanDiff(); if (avg!=-1) std::cout<<" in[" << i << "].diff: "<update(); } }; void resetLoss(){ for (int l=0; lresult = ComputeT(0); loss_layers[l]->loss = ComputeT(0); //std::cout<<"Layer["<result="<result<loss="<loss<phase == phase || loss_layers[l]->phase == TrainingTesting) loss_layers[l]->eval(); } if(sync) checkCUDA(__LINE__,cudaDeviceSynchronize()); }; void stepTest(bool sync){ checkCUDA(__LINE__,cudaSetDevice(GPU)); resetLoss(); for (int i=0; i < test_iter; ++i){ forward(); eval(false); } for (int l=0; lresult /= test_iter; loss_layers[l]->loss /= test_iter; } if(sync) checkCUDA(__LINE__,cudaDeviceSynchronize()); }; void stepTrain(bool sync){ checkCUDA(__LINE__,cudaSetDevice(GPU)); update(); resetLoss(); for (int l=0; lclearDiff(); } //std::cout<<"DEBUG LINE "<<__LINE__<<" train_iter = "<result /= train_iter; loss_layers[l]->loss /= train_iter; } if(sync) checkCUDA(__LINE__,cudaDeviceSynchronize()); }; void getTopActivations(std::string dataResponseName, std::vector responseNames, std::vector > responseChannels, std::string saveFilePrefix, int topK, int maxIterations){ phase = Training; DataLayer* pDataLayer = NULL; for (int l=0; lphase == phase || layers[l]->phase == TrainingTesting){ if (layers[l]->isDataLayer()){ pDataLayer = (DataLayer*) layers[l]; break; } } } if (pDataLayer==NULL) { std::cerr<<"No data layer."<*> > > data; std::vector > > scores; std::vector > scoresLowest; data.resize(responseChannels.size()); scores.resize(responseChannels.size()); scoresLowest.resize(responseChannels.size()); for(int i=0;idim[1]; Tensor* rdataTensor = new Tensor(rData->dim); int iter = 0; while(pDataLayer->epoch == 0 && iter < maxIterations){ resetLoss(); forward(); eval(false); // display std::cout << "Iteration " << iter << " "; for (int i=0;iphase == phase || loss_layers[i]->phase == TrainingTesting){ loss_layers[i]->display(); } } std::cout << std::endl; rdataTensor->readGPU(rData->dataGPU); for(int i=0;i* features = new Tensor(r->dim); features->readGPU(r->dataGPU); size_t spel = numspel(r->dim); std::vector receptive_field; receptive_field.resize(r->receptive_field.size()); std::vector receptive_offset; receptive_offset.resize(r->receptive_field.size()); std::vector receptive_gap; receptive_gap.resize(r->receptive_field.size()); for (int d=0;dreceptive_field.size();++d){ receptive_field [d] = r->receptive_field[d] 
/rData->receptive_field[d]; receptive_offset[d] = r->receptive_offset[d]/rData->receptive_field[d]; receptive_gap [d] = r->receptive_gap[d] /rData->receptive_field[d]; } std::vector dataDim; dataDim.push_back(dataChannels); dataDim.insert( dataDim.end(), receptive_field.begin(), receptive_field.end() ); for (int j=0;j= r->dim[1]){ std::cerr<<"Channel exceeds maximal channel: Indexing Channel "<dim[1]<<" channels in "<dim[0]; ++n){ for(size_t k=0; kdim[1] + c) * spel + k; ComputeT val = CPUStorage2ComputeT(features->CPUmem[idx]); if (scores[i][j].size()* toSave = new Tensor(dataDim); toSave->initialize(CPUCompute2StorageT(0)); // copy the n-th, all channesl, all spatal if (dataDim.size()==3){ // convert k to sx and sy int sx = receptive_offset[0] + k / features->dim[3] * receptive_gap[0]; int sy = receptive_offset[1] + k % features->dim[3] * receptive_gap[1]; for (int ic=0;icdim[2] && 0<=sy+y && sy+y< rData->dim[3] ) { size_t idxData = ((n * rData->dim[1] + ic) * rData->dim[2] + (sx+x)) * rData->dim[3] + (sy+y); size_t idxWrite = (ic * dataDim[1] + x) * dataDim[2] + y; toSave->CPUmem[idxWrite] = rdataTensor->CPUmem[idxData]; } } } } }else if (dataDim.size()==4){ // convert k to sx and sy int sx = receptive_offset[0] + k / (features->dim[3] * features->dim[4]) * receptive_gap[0]; int sy = receptive_offset[1] + (k / features->dim[4]) % features->dim[3] * receptive_gap[1];; int sz = receptive_offset[2] + k % features->dim[4]* receptive_gap[2]; for (int ic=0;icdim[2] && 0<=sy+y && sy+y< rData->dim[3] && 0<=sz+z && sz+z< rData->dim[4]) { size_t idxData = (((n * rData->dim[1] + ic) * rData->dim[2] + (sx+x)) * rData->dim[3] + (sy+y)) * rData->dim[4] + (sz+z); size_t idxWrite = ((ic * dataDim[1] + x) * dataDim[2] + y) * dataDim[3] + z; toSave->CPUmem[idxWrite] = rdataTensor->CPUmem[idxData]; } } } } } }else{ std::cerr<<"No implemented."<::iterator minEl = min_element(scores[i][j].begin(), scores[i][j].end()); scoresLowest[i][j]= *minEl; } }else{ //toSave->print(veci(2,5,5)); std::vector::iterator minEl = min_element(scores[i][j].begin(), scores[i][j].end()); int minID = std::distance(scores[i][j].begin(), minEl); scores[i][j][minID]=val; delete data[i][j][minID]; data[i][j][minID] = toSave; minEl = min_element(scores[i][j].begin(), scores[i][j].end()); scoresLowest[i][j]= *minEl; } } } } } delete features; } ++iter; } delete rdataTensor; // saving for(int i=0;i indices = sort_indexes(scores[i][j]); std::vector*> toWrite; toWrite.resize(indices.size()); std::cout<(Fname, toWrite); for(size_t k=0;k test(std::vector responseNames = std::vector(), std::vector saveFilenames = std::vector(), int itersPerSave=0){ phase = Testing; std::vector result(loss_layers.size(), 0); std::vector*> features(responseNames.size(),NULL); //std::vector > lock(responseNames.size()); std::vector files(responseNames.size(),NULL); DataLayer* pDataLayer = NULL; for (int l=0; lphase == phase || layers[l]->phase == TrainingTesting){ if (layers[l]->isDataLayer()){ pDataLayer = (DataLayer*) layers[l]; break; } } } if (pDataLayer==NULL) { std::cerr<<"No data layer for Testing."< total_size(responseNames.size()); for(int i=0;i dim = r->dim; dim[0] = pDataLayer->numofitems(); total_size[i] = numel(dim); } std::vector file_counter(responseNames.size(),0); std::cout<< "====================================================================================================================================="<epoch == 0){ resetLoss(); forward(); eval(false); // display std::cout << "Iteration " << iter << " "; for (int i=0;iphase == phase || 
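// Index arithmetic used above when cropping patches (illustrative, 2-D
// case): for a response of spatial size H x W stored row-major, the k-th
// spatial location projects back onto the data tensor as
//
//   //   sx = receptive_offset[0] + (k / W) * receptive_gap[0];   // row
//   //   sy = receptive_offset[1] + (k % W) * receptive_gap[1];   // column
//
// and the saved patch is the receptive_field-sized window starting at
// (sx, sy), clipped against the bounds of the data tensor.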
loss_layers[i]->phase == TrainingTesting){ loss_layers[i]->display(); result[i] += loss_layers[i]->result; } } std::cout << std::endl; // get features if (responseNames.size()>0){ for(int i=0;i(r->dim); files[i] = fopen(Fname.c_str(),"wb"); while (files[i]==NULL){ std::cerr<<"Open file "< dim = r->dim; if (itersPerSave==0){ dim[0] = pDataLayer->numofitems(); }else{ int samplesPerFile = r->dim[0]*itersPerSave; int samplesSaved = samplesPerFile * file_counter[i]; if (samplesSaved + samplesPerFile <= pDataLayer->numofitems()){ dim[0] = r->dim[0]*itersPerSave; }else{ dim[0] = pDataLayer->numofitems() - samplesSaved; } } features[i]->writeHeader(files[i],dim); file_counter[i]++; } //if (iter!=0) lock[i].wait(); features[i]->readGPU(r-> dataGPU); features[i]->writeData(files[i], total_size[i]); //lock[i] = async(launch::async,&Tensor::writeData, features[i], files[i], total_size[i]); total_size[i] -= features[i]->numel(); if (itersPerSave !=0 && iter % itersPerSave == itersPerSave-1){ fclose(files[i]); files[i] = NULL; } } } ++iter; } for(int i=0;iphase == phase || loss_layers[i]->phase == TrainingTesting){ std::cout << " eval = " << result[i]; std::cout << " "; } } std::cout << std::endl; //for(int i=0;i nets; std::vector threads; std::string path; int iter; int current_step; int train_iter; std::vector GPU; int GPU_solver; // machine learning paramters SolverAlgorithm solver; Regularizer regularizer; ComputeT momentum; ComputeT weight_decay; ComputeT base_lr; LRPolicy lr_policy; ComputeT lr_gamma; ComputeT lr_power; int lr_stepsize; std::vector stepvalue; int max_iter; // maximum number of iterations int snapshot_iter; // snapshot every N iterations int display_iter; // Display every 100 iterations int test_iter; // how many forward passes the test should carry out int test_interval; // Carry out testing every 500 training iterations bool debug_mode; Solver(std::string filename=std::string()){ // construct the network from the file in JSON JSON* train_obj = new JSON; JSON* architecture_obj = new JSON; parseNetworkJSON(filename, train_obj, NULL, architecture_obj); SetValue(train_obj, solver, SGD) SetValue(train_obj, regularizer, L2) SetValue(train_obj, momentum, 0.9) SetValue(train_obj, weight_decay, 0.0005) SetValue(train_obj, base_lr, 0.01) SetValue(train_obj, lr_policy, LR_inv) SetValue(train_obj, lr_gamma, 0.0001) SetValue(train_obj, lr_power, 0.75) SetValue(train_obj, lr_stepsize, 100000) SetValue(train_obj, train_iter, 1) SetValue(train_obj, max_iter, 10000) SetValue(train_obj, snapshot_iter, 5000) SetValue(train_obj, display_iter, 100) SetValue(train_obj, test_iter, 100) SetValue(train_obj, test_interval, 500) SetValue(train_obj, debug_mode, false) SetValue(train_obj, GPU, veci(1,0)) SetOrDie(train_obj, path ) SetValue(train_obj, GPU_solver, -1) if (GPU_solver==-1) GPU_solver=GPU[0]; singleGPU = GPU.size()==1 && GPU_solver==GPU[0]; int nGPUs = 0; checkCUDA(__LINE__, cudaGetDeviceCount(&nGPUs)); if (nGPUs==0){ std::cerr<<"There is no NVIDIA GPU available in this machine."<::iterator largestGPUid = max_element(GPU.begin(), GPU.end()); if (*largestGPUid>=nGPUs){ std::cerr<<"Largest GPU ID request for GPU #"<<*largestGPUid<<" exceeds the number of available GPUs"<debug_mode = debug_mode; nets[n]->train_iter = train_iter; nets[n]->test_iter = test_iter; } delete train_obj; delete architecture_obj; if (GPU.size()>1){ for (int n=0;n= stepvalue[current_step] ) { current_step++; std::cout << "MultiStep Status: Iteration " << iter << ", step = " << current_step << std::endl; } rate = base_lr * 
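// Learning-rate schedules handled by this switch (lr_gamma, lr_power,
// lr_stepsize and stepvalue come from the solver JSON); the cases visible
// here compute, schematically:
//
//   //   LR_multistep : rate = base_lr * lr_gamma ^ current_step
//   //                  (current_step advances whenever iter passes the next
//   //                   entry of stepvalue)
//   //   LR_poly      : rate = base_lr * (1 - iter/max_iter) ^ lr_power
//   //   LR_sigmoid   : rate = base_lr / (1 + exp(-lr_gamma*(iter - lr_stepsize)))
//   //   LR_cyclical  : placeholder only (see arXiv:1506.01186)
//
// The remaining LRPolicy values (LR_fixed, LR_step, LR_exp, LR_inv) are
// covered by the earlier cases of this switch.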
pow(lr_gamma, current_step); break; case LR_poly: rate = base_lr * pow(ComputeT(1) - (ComputeT(iter) / ComputeT(max_iter)), lr_power); break; case LR_sigmoid: rate = base_lr * (ComputeT(1) / (ComputeT(1) + exp(-lr_gamma * (ComputeT(iter) - ComputeT(lr_stepsize))))); break; case LR_cyclical: // from http://arxiv.org/abs/1506.01186 rate = 0; // TODO: place holder for now break; } return rate; }; size_t Malloc(Phase phase_ = Training){ phase = phase_; int nGPUs = 0; checkCUDA(__LINE__, cudaGetDeviceCount(&nGPUs)); std::vector memoryBytes(nGPUs,0); for (int n=0;nMalloc(phase); } if (phase == Training || phase == TrainingTesting){ checkCUDA(__LINE__,cudaSetDevice(GPU_solver)); for (int l=0; llayers.size(); ++l){ if (nets[0]->layers[l]->train_me){ size_t weight_numel = nets[0]->layers[l]->weight_numel; size_t bias_numel = nets[0]->layers[l]->bias_numel; if (weight_numel>0){ size_t weight_bytes = (1 + nets.size()) * weight_numel * sizeofStorageT; checkCUDA(__LINE__, cudaMalloc(&(nets[0]->layers[l]->weight_histGPU), weight_bytes) ); memoryBytes[GPU_solver] += weight_bytes; for (int n=0;nlayers[l]->weight_histGPU = nets[0]->layers[l]->weight_histGPU; nets[n]->layers[l]->weight_diffGPU = nets[0]->layers[l]->weight_histGPU + weight_numel * (n+1); } } if (bias_numel>0){ size_t bias_bytes = (1 + nets.size()) * bias_numel * sizeofStorageT; checkCUDA(__LINE__, cudaMalloc(&(nets[0]->layers[l]->bias_histGPU), bias_bytes) ); memoryBytes[GPU_solver] += bias_bytes; for (int n=0;nlayers[l]->bias_histGPU = nets[0]->layers[l]->bias_histGPU; nets[n]->layers[l]->bias_diffGPU = nets[0]->layers[l]->bias_histGPU + bias_numel * (n+1); } } nets[0]->layers[l]->clearHist(); } } } std::cout<< "====================================================================================================================================="<0){ std::cout<< "GPU " << n << ": Total GPU memory: "; memorySizePrint(memoryBytes[n]); std::cout<layers.size(); ++l){ if (nets[0]->layers[l]->train_me){ if (nets[0]->layers[l]->weight_numel>0){ if (nets[0]->layers[l]->weight_histGPU!=NULL) checkCUDA(__LINE__, cudaFree(nets[0]->layers[l]->weight_histGPU)); } if (nets[0]->layers[l]->bias_numel>0){ if (nets[0]->layers[l]->bias_histGPU!=NULL) checkCUDA(__LINE__, cudaFree(nets[0]->layers[l]->bias_histGPU)); } } } }; void randInit(){ nets[0]->randInit(); for (int n=1;nlayers.size(); ++l){ if (nets[0]->layers[l]->weight_numel>0){ checkCUDA(__LINE__, cudaMemcpy(nets[n]->layers[l]->weight_dataGPU, nets[0]->layers[l]->weight_dataGPU, nets[0]->layers[l]->weight_numel*sizeofStorageT, cudaMemcpyDeviceToDevice) ); } if (nets[0]->layers[l]->bias_numel>0){ checkCUDA(__LINE__, cudaMemcpy(nets[n]->layers[l]->bias_dataGPU, nets[0]->layers[l]->bias_dataGPU, nets[0]->layers[l]->bias_numel*sizeofStorageT, cudaMemcpyDeviceToDevice) ); } } } }; void update(ComputeT learning_rate){ checkCUDA(__LINE__,cudaSetDevice(GPU_solver)); for (int l=0; llayers.size(); ++l){ if (nets[0]->layers[l]->train_me){ if (nets[0]->layers[l]->weight_numel>0){ if (solver==SGD && regularizer==L2){ update_SGDL2(nets[0]->layers[l]->weight_numel, nets.size(), weight_decay * nets[0]->layers[l]->weight_decay_mult, momentum, learning_rate * nets[0]->layers[l]->weight_lr_mult, nets[0]->layers[l]->weight_dataGPU, nets[0]->layers[l]->weight_histGPU); } } if (nets[0]->layers[l]->bias_numel>0){ if (solver==SGD && regularizer==L2){ update_SGDL2(nets[0]->layers[l]->bias_numel, nets.size(), weight_decay * nets[0]->layers[l]->bias_decay_mult, momentum, learning_rate * nets[0]->layers[l]->bias_lr_mult, 
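// update_SGDL2 above applies the momentum-SGD step with L2 weight decay;
// schematically, per parameter w, with gradients averaged over the
// nets.size() GPU-local copies and decay / lr already scaled by the
// per-layer *_decay_mult / *_lr_mult at the call sites:
//
//   //   g    = mean_over_GPUs(dL/dw) + decay * w
//   //   hist = momentum * hist + lr * g
//   //   w    = w - hist
//
// hist lives in slot 0 of the (1 + nets.size()) * numel buffer allocated in
// Solver::Malloc; slot n+1 holds the gradient produced on GPU n.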
nets[0]->layers[l]->bias_dataGPU, nets[0]->layers[l]->bias_histGPU); } } } } }; void loadWeights(std::string filename, bool diff=false){ std::cout<< "====================================================================================================================================="<*> weights = readTensors(filename); for (int i=0;iloadWeights(weights, diff); } // release memory for the weights for (int i=0; isaveWeights(filename, diff); }; void train(int iter_begin = 0){ checkCUDA(__LINE__,cudaSetDevice(GPU_solver)); phase = Training; current_step = 0; std::cout<< "====================================================================================================================================="<phase = Testing; nets[0]->stepTest(false); nets[0]->phase = Training; }else{ for (int t=0; tphase = Testing; threads[t] = std::thread(&Net::stepTest, nets[t], true); //nets[t]->stepTest(); } for (int t=0; tphase = Training; } } for (int l=0; lloss_layers.size();++l){ if (nets[0]->loss_layers[l]->phase == phase || nets[0]->loss_layers[l]->phase == TrainingTesting){ for (int t=1;tloss_layers[l]->result += nets[t]->loss_layers[l]->result; nets[0]->loss_layers[l]->loss += nets[t]->loss_layers[l]->loss; } nets[0]->loss_layers[l]->result /= nets.size(); nets[0]->loss_layers[l]->loss /= nets.size(); nets[0]->loss_layers[l]->display(); } } std::cout << std::endl; } if (singleGPU){ nets[0]->stepTrain(false); }else{ for (int t=0; tstepTrain(); } for (int t=0; t
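// Multi-GPU training in this loop is synchronous data parallelism: with more
// than one GPU, each Net runs stepTrain() on its own device in a worker
// thread and writes its gradients into its slot of the shared diff/history
// buffer allocated in Solver::Malloc; once the threads are joined, the
// per-GPU gradient slots are combined by Solver::update() on GPU_solver into
// a single SGD step at the scheduled learning rate.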