/Test_Misc/test1/xxx-2.cl

http://oclbenchmarks.codeplex.com · OpenCL · 48 lines · 44 code · 4 blank · 0 comment · 0 complexity · 9697e51f5d5196e70721da04baae479c MD5 · raw file

  1. #define FPTYPE float
  2. __kernel void
  3. innerProductKer_1( __global const FPTYPE * restrict vec_1,
  4. __global const FPTYPE * restrict vec_2,
  5. __global FPTYPE * restrict reduction_buffer,
  6. const unsigned long realSize,
  7. __local volatile FPTYPE * sum)
  8. {
  9. unsigned long t = get_local_id(0);
  10. unsigned long wsz = get_local_size(0);
  11. // unsigned long ng = get_num_groups(0);
  12. unsigned long gid = get_group_id(0);
  13. unsigned long gsz = get_global_size(0);
  14. unsigned long initid = get_global_id(0);
  15. reduction_buffer[initid] = 0;
  16. // __local volatile FPTYPE sum[get_local_size(0)];
  17. // in case of half of a work group is enough for realSize
  18. // sum[t]=0;
  19. unsigned long n = (gid*wsz)+t;
  20. FPTYPE sumTemp = 0;
  21. // calculate locally
  22. while (n< realSize) {
  23. sumTemp+= vec_1[n] * vec_2[n];
  24. n+=gsz;
  25. }
  26. sum[t]=sumTemp;
  27. // sum[t]=gid;
  28. //barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);
  29. // locally reduction
  30. unsigned long kf = wsz;
  31. while (kf > 1) {
  32. // for (unsigned long kf=wsz/2; kf >0; kf/=2) {
  33. kf /=2;
  34. // barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);
  35. barrier(CLK_LOCAL_MEM_FENCE);
  36. if(t<kf) sum[t] +=sum[t+kf];
  37. }
  38. // barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);
  39. // if (t == 0) reduction_buffer[gid]=sum[0];
  40. if (t == 0) reduction_buffer[gid]=gid;
  41. // reduction_buffer[gid]=sum[0];
  42. // mem_fence(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);
  43. barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);
  44. }