android - GPU vs CPU programming: inconsistencies in processing times -


I'm working on image tracking: with a camera, I'm tracking finger touches that interact with the Android system. The image processing is done on the GPU with OpenCL: I convert the camera output to black-and-white frames in order to get the spots in white. The processing time is 65 ms with this method. Since my objective is to make the program smoother, I've performed the same operations on the CPU with the OpenCV method. This gives a processing time of 115 ms. The problem is that the program feels more reactive and faster with the OpenCV method, and I don't understand how the processing time can be longer in that case: it seems contradictory to me. For the measurement, I proceed like this:

start= clock(); finish = clock(); double time =((double)finish -start)/clocks_per_sec; std::cout<<"process time : "<< time<<std::endl; 

Here is the code:

static cv::mat              original_right,binary_right; static cv::mat              original_left, binary_left; int                 width, height; clock_t                 start,finish; double time = 0.0;  width = (int) this->camera_right.getcapture().get(cv::cap_prop_frame_width); height = (int) this->camera_right.getcapture().get(cv::cap_prop_frame_height); original_right.create(height, width, cv_8uc3);   //--------------------------- camera 2 --------------------------------- int width_2 = (int) this->camera_left.getcapture().get(cv::cap_prop_frame_width); int height_2 = (int) this->camera_left.getcapture().get(cv::cap_prop_frame_height); original_left.create(height_2, width_2, cv_8uc3);   binary_right.create(height, width, cv_32f); // gpu binary_left.create(height_2, width_2, cv_32f); // gpu //binary_right.create(height, width, cv_8uc1); // cpu //binary_left.create(height_2, width_2, cv_8uc1); // cpu  core::running_ = true;   //------------------------------------ set gpu ----------------------------------------- cl_context              context; cl_context_properties   properties [3]; cl_kernel               kernel; cl_command_queue        command_queue; cl_program              program; cl_int                  err; cl_uint                 num_of_platforms=0; cl_platform_id          platform_id; cl_device_id            device_id; cl_uint                 num_of_devices=0; cl_mem                  input, output;  size_t                  global;  int                     data_size =height*width*3;   //load opencl source file *fp; char filename[] = "./hellotedkrissv2.cl"; char *source_str;   //load source code containing kernel fp = fopen(filename, "r"); if (!fp) { fprintf(stderr, "failed load kernel.\n"); exit(1); } source_str = (char*)malloc(max_source_size); global = fread(source_str, 1, max_source_size, fp); fclose(fp);   //retreives list of platforms available if(clgetplatformids(1,&platform_id, &num_of_platforms)!=cl_success){     std::cout<<"unable 
platform_id"<<std::endl; };  // supported gpu device if(clgetdeviceids(platform_id,cl_device_type_gpu,1,&device_id, &num_of_devices)!= cl_success){     std::cout<<"unable device_id"<<std::endl;       };  //context properties list - must terminated 0 properties[0]=cl_context_platform; properties[1]=(cl_context_properties) platform_id; properties[2]=0;  // create context gpu device context = clcreatecontext(properties,1,&device_id,null,null,&err);  //create command queue using context , device command_queue = clcreatecommandqueue(context,device_id,0,&err);  //create program kernel source code program= clcreateprogramwithsource(context,1,(const char **) &source_str, null,&err);  // compile program if(clbuildprogram(program,0,null,null,null,null)!=cl_success){     size_t length;     std::cout<<"error building program"<<std::endl;     char buffer[4096];     clgetprogrambuildinfo(program,device_id,cl_program_build_log, sizeof(buffer),buffer,&length);     std::cout<< buffer <<std::endl; }  //specify kernel program execute kernel = clcreatekernel(program,"imageprocessing",&err);     while (this->isrunning() == true) {       start= clock(); //--------------------- start----------------------      //----------------------frame---------------------     this->camera_right.readframe(original_right);     if (original_right.empty() == true ) {         std::cerr << "[core/error] original  frame empty." << std::endl;         break;     }      this->camera_left.readframe(original_left);     if (original_left.empty() == true ) {         std::cerr << "[core/error] original 2  frame empty." 
<< std::endl;         break;     }     //----------------------frame---------------------      //------------------------------------------------imp gpu ------------------------------------------------------      input = clcreatebuffer(context, cl_mem_read_write | cl_mem_alloc_host_ptr  , sizeof(unsigned char)*data_size,null,null);     output =clcreatebuffer(context,cl_mem_read_write   | cl_mem_alloc_host_ptr, sizeof(float)*data_size/3,null,null);     if(clenqueuewritebuffer(command_queue,input,cl_true,0,sizeof(unsigned char)*data_size, original_right.data ,0,null,null )!= cl_success){};      //set argument list kernel command     clsetkernelarg(kernel,0,sizeof(cl_mem), &input);     clsetkernelarg(kernel,1,sizeof(cl_mem), &output);     global = data_size  ;     //enqueue kernel command execution     clenqueuendrangekernel(command_queue, kernel, 1, null, &global, null,0,null,null);     clfinish(command_queue);     //copy results out of  output buffer     if(clenqueuereadbuffer(command_queue,output,cl_true ,0,sizeof(float)*data_size/3,binary_right.data,0,null,null )!= cl_success){};      clreleasememobject(input);     clreleasememobject(output);      //------------------------------------------------imp gpu ------------------------------------------------------      input = clcreatebuffer(context, cl_mem_read_write | cl_mem_alloc_host_ptr  , sizeof(unsigned char)*data_size,null,null);     output =clcreatebuffer(context,cl_mem_read_write   | cl_mem_alloc_host_ptr, sizeof(float)*data_size/3,null,null);     if(clenqueuewritebuffer(command_queue,input,cl_true,0,sizeof(unsigned char)*data_size, original_left.data ,0,null,null )!= cl_success){};      //set argument list kernel command     clsetkernelarg(kernel,0,sizeof(cl_mem), &input);     clsetkernelarg(kernel,1,sizeof(cl_mem), &output);     global = data_size  ;     //enqueue kernel command execution     clenqueuendrangekernel(command_queue, kernel, 1, null, &global, null,0,null,null);     clfinish(command_queue);     
//copy results out of  output buffer     if(clenqueuereadbuffer(command_queue,output,cl_true ,0,sizeof(float)*data_size/3,binary_left.data,0,null,null )!= cl_success){};     clreleasememobject(input);    clreleasememobject(output);      //------------------------------------------------imp gpu ------------------------------------------------------    // cpu method   // adok::processing::doimageprocessing(original_right, binary_right);   // adok::processing::doimageprocessing(original_left, binary_left);      //-------------------------------------------------------------- tracking ------------------------------------------------------  adok::tracking::dofingercontourstracking(binary_right,binary_left, this->fingercontours, this->perspective_right,this->perspective_left, this->distortion_right,this->distortion_left, this);      //------------------------------------------- tracking -----------------------------------------   //------------------------------send coordinates android board-------------------- if (getsideright() && !getsideleft() ) {         std::cout<<"right : "<<std::endl;         this->uart_.sendall(this->fingercontours, this->perspective_right.getperspectivematrix(), right);     }else if (!getsideright() && getsideleft() ){         std::cout<<"left : "<<std::endl;         this->uart_.sendall(this->fingercontours, this->perspective_left.getperspectivematrix(), left);     }else if (getsideright() && getsideleft() ){         std::cout<<"right & left : "<<std::endl;         this->uart_.sendall(this->fingercontours, this->perspective_right.getperspectivematrix(), this->perspective_left.getperspectivematrix());      }  this->setsideright(0); this->setsideleft(0);  finish = clock(); time =(double)(finish - start)/clocks_per_sec; std::cout << "time: " << time << std::endl; // ------------end-----------  } clreleasecommandqueue(command_queue); clreleaseprogram(program); clreleasekernel(kernel); clreleasecontext(context); this->stop(); 

}

There is also something strange: when I'm on the CPU, the time for grabbing a frame is 5 ms, while on the GPU it's 15 ms, and I don't know why it increases.

And I'm working on an Android XU4.

A GPU calculation can sometimes take more time than the same calculation on the CPU. This is because, for a GPU calculation, the main process has to send the data to GPU memory, and after the mathematical calculation the GPU sends the data back to the CPU. So transferring the data to the GPU and receiving it back on the CPU takes time. If the buffer being processed is bigger, the transfer time is bigger too, and the GPU calculation can end up taking more time. The cuDNN library, along with a GPU processor, makes it many times faster. So, if the program is not using cuDNN, it may be slower.


Comments

Popular posts from this blog

python - How to insert QWidgets in the middle of a Layout? -

python - serve multiple gunicorn django instances under nginx ubuntu -

module - Prestashop displayPaymentReturn hook url -