Android - GPU vs CPU programming: inconsistencies in processing times
I'm working on image tracking: using a camera, I'm tracking finger touches that interact with the Android system. The image processing is done on the GPU with OpenCL: I convert the camera output to black-and-white frames in order to get the spots in white. The processing time is 65 ms with this method. Since my objective is to make the program smoother, I've performed the same operations on the CPU with the OpenCV method. This gives a processing time of 115 ms. The problem is that the program feels more reactive and faster with the OpenCV method, and I don't understand how the processing time can be longer in that case: it seems contradictory to me. For the measurement, I proceed like this:
start= clock(); finish = clock(); double time =((double)finish -start)/clocks_per_sec; std::cout<<"process time : "<< time<<std::endl;
Here is the code:
static cv::mat original_right,binary_right; static cv::mat original_left, binary_left; int width, height; clock_t start,finish; double time = 0.0; width = (int) this->camera_right.getcapture().get(cv::cap_prop_frame_width); height = (int) this->camera_right.getcapture().get(cv::cap_prop_frame_height); original_right.create(height, width, cv_8uc3); //--------------------------- camera 2 --------------------------------- int width_2 = (int) this->camera_left.getcapture().get(cv::cap_prop_frame_width); int height_2 = (int) this->camera_left.getcapture().get(cv::cap_prop_frame_height); original_left.create(height_2, width_2, cv_8uc3); binary_right.create(height, width, cv_32f); // gpu binary_left.create(height_2, width_2, cv_32f); // gpu //binary_right.create(height, width, cv_8uc1); // cpu //binary_left.create(height_2, width_2, cv_8uc1); // cpu core::running_ = true; //------------------------------------ set gpu ----------------------------------------- cl_context context; cl_context_properties properties [3]; cl_kernel kernel; cl_command_queue command_queue; cl_program program; cl_int err; cl_uint num_of_platforms=0; cl_platform_id platform_id; cl_device_id device_id; cl_uint num_of_devices=0; cl_mem input, output; size_t global; int data_size =height*width*3; //load opencl source file *fp; char filename[] = "./hellotedkrissv2.cl"; char *source_str; //load source code containing kernel fp = fopen(filename, "r"); if (!fp) { fprintf(stderr, "failed load kernel.\n"); exit(1); } source_str = (char*)malloc(max_source_size); global = fread(source_str, 1, max_source_size, fp); fclose(fp); //retreives list of platforms available if(clgetplatformids(1,&platform_id, &num_of_platforms)!=cl_success){ std::cout<<"unable platform_id"<<std::endl; }; // supported gpu device if(clgetdeviceids(platform_id,cl_device_type_gpu,1,&device_id, &num_of_devices)!= cl_success){ std::cout<<"unable device_id"<<std::endl; }; //context properties list - must terminated 0 
properties[0]=cl_context_platform; properties[1]=(cl_context_properties) platform_id; properties[2]=0; // create context gpu device context = clcreatecontext(properties,1,&device_id,null,null,&err); //create command queue using context , device command_queue = clcreatecommandqueue(context,device_id,0,&err); //create program kernel source code program= clcreateprogramwithsource(context,1,(const char **) &source_str, null,&err); // compile program if(clbuildprogram(program,0,null,null,null,null)!=cl_success){ size_t length; std::cout<<"error building program"<<std::endl; char buffer[4096]; clgetprogrambuildinfo(program,device_id,cl_program_build_log, sizeof(buffer),buffer,&length); std::cout<< buffer <<std::endl; } //specify kernel program execute kernel = clcreatekernel(program,"imageprocessing",&err); while (this->isrunning() == true) { start= clock(); //--------------------- start---------------------- //----------------------frame--------------------- this->camera_right.readframe(original_right); if (original_right.empty() == true ) { std::cerr << "[core/error] original frame empty." << std::endl; break; } this->camera_left.readframe(original_left); if (original_left.empty() == true ) { std::cerr << "[core/error] original 2 frame empty." 
<< std::endl; break; } //----------------------frame--------------------- //------------------------------------------------imp gpu ------------------------------------------------------ input = clcreatebuffer(context, cl_mem_read_write | cl_mem_alloc_host_ptr , sizeof(unsigned char)*data_size,null,null); output =clcreatebuffer(context,cl_mem_read_write | cl_mem_alloc_host_ptr, sizeof(float)*data_size/3,null,null); if(clenqueuewritebuffer(command_queue,input,cl_true,0,sizeof(unsigned char)*data_size, original_right.data ,0,null,null )!= cl_success){}; //set argument list kernel command clsetkernelarg(kernel,0,sizeof(cl_mem), &input); clsetkernelarg(kernel,1,sizeof(cl_mem), &output); global = data_size ; //enqueue kernel command execution clenqueuendrangekernel(command_queue, kernel, 1, null, &global, null,0,null,null); clfinish(command_queue); //copy results out of output buffer if(clenqueuereadbuffer(command_queue,output,cl_true ,0,sizeof(float)*data_size/3,binary_right.data,0,null,null )!= cl_success){}; clreleasememobject(input); clreleasememobject(output); //------------------------------------------------imp gpu ------------------------------------------------------ input = clcreatebuffer(context, cl_mem_read_write | cl_mem_alloc_host_ptr , sizeof(unsigned char)*data_size,null,null); output =clcreatebuffer(context,cl_mem_read_write | cl_mem_alloc_host_ptr, sizeof(float)*data_size/3,null,null); if(clenqueuewritebuffer(command_queue,input,cl_true,0,sizeof(unsigned char)*data_size, original_left.data ,0,null,null )!= cl_success){}; //set argument list kernel command clsetkernelarg(kernel,0,sizeof(cl_mem), &input); clsetkernelarg(kernel,1,sizeof(cl_mem), &output); global = data_size ; //enqueue kernel command execution clenqueuendrangekernel(command_queue, kernel, 1, null, &global, null,0,null,null); clfinish(command_queue); //copy results out of output buffer if(clenqueuereadbuffer(command_queue,output,cl_true 
,0,sizeof(float)*data_size/3,binary_left.data,0,null,null )!= cl_success){}; clreleasememobject(input); clreleasememobject(output); //------------------------------------------------imp gpu ------------------------------------------------------ // cpu method // adok::processing::doimageprocessing(original_right, binary_right); // adok::processing::doimageprocessing(original_left, binary_left); //-------------------------------------------------------------- tracking ------------------------------------------------------ adok::tracking::dofingercontourstracking(binary_right,binary_left, this->fingercontours, this->perspective_right,this->perspective_left, this->distortion_right,this->distortion_left, this); //------------------------------------------- tracking ----------------------------------------- //------------------------------send coordinates android board-------------------- if (getsideright() && !getsideleft() ) { std::cout<<"right : "<<std::endl; this->uart_.sendall(this->fingercontours, this->perspective_right.getperspectivematrix(), right); }else if (!getsideright() && getsideleft() ){ std::cout<<"left : "<<std::endl; this->uart_.sendall(this->fingercontours, this->perspective_left.getperspectivematrix(), left); }else if (getsideright() && getsideleft() ){ std::cout<<"right & left : "<<std::endl; this->uart_.sendall(this->fingercontours, this->perspective_right.getperspectivematrix(), this->perspective_left.getperspectivematrix()); } this->setsideright(0); this->setsideleft(0); finish = clock(); time =(double)(finish - start)/clocks_per_sec; std::cout << "time: " << time << std::endl; // ------------end----------- } clreleasecommandqueue(command_queue); clreleaseprogram(program); clreleasekernel(kernel); clreleasecontext(context); this->stop();
}
There is something strange: when I'm on the CPU, the time for grabbing a frame is 5 ms, while on the GPU it's 15 ms, and I don't know why it increases.
And I'm working on an Android XU4.
A GPU calculation may sometimes take more time than a CPU calculation. This is because, in a GPU calculation, the main process sends the data to GPU memory, and after the mathematical calculation the GPU sends the data back to the CPU. So the data transfer to and from the CPU takes time. If the calculated buffer size is bigger, the transfer time is bigger too, and the GPU calculation can take more time. The cuDNN library along with a GPU processor makes it many times faster. So, if your program is not using cuDNN, it may be slower.
Comments
Post a Comment