

这个方法对于一般的GPU更有用些。由于CPU往往拥有L1 Data Cache,当寄存器不够用的时候,编译器会将不太常用的数据放到栈中,而栈在此时往往能获得高命中率的Cache访问,因此性能不会受到太大影响。而在GPU端,当寄存器不够用时,编译器往往会将不常用的数据直接存放到VRAM中,而对外部VRAM的访问往往是比较慢的,因此,如果临时变量太多,导致频繁访问外部存储器,就会使整体计算性能大幅下降。当然,现在不少GPU也有了L1 Cache,但是空间也十分有限。因此,这里用“猜”这个词,呵呵~


// Register-pressure probe kernel, four-variable version.
// Each work-item reads four ints (i0..i3) from pInOut, mixes them in a loop with a
// circular chain of shift-and-add updates (i1 from i0, i2 from i1, i3 from i2, i0 from i3,
// first with left shifts and then with three rounds of right shifts), and finally writes
// all four values back to pInOut. Because the updates form a cycle, all four variables
// are needed throughout the loop.
//
// pInOut: global int buffer; read before the loop and overwritten after it.
//
// NOTE(review): in this listing the braces and every numeric literal (the get_global_id
// argument, the index multipliers/offsets, the loop bounds and the shift amounts) are
// missing, apparently lost during extraction. Restore them from the original source
// before compiling; the code below is left exactly as found.
__kernel void QueryRegisterCount(__global int *pInOut)
int index = get_global_id(); int i0 = pInOut[(index * + ) * ];
int i1 = pInOut[(index * + ) * ];
int i2 = pInOut[(index * + ) * ];
int i3 = pInOut[(index * + ) * ]; for(int i = ; i < ; i++)
// Round 1: left-shift chain.
i1 += i0 << ;
i2 += i1 << ;
i3 += i2 << ;
// Rounds 2-4: right-shift chains; each round starts on the same line that ends the previous one.
i0 += i3 << ; i1 += i0 >> ;
i2 += i1 >> ;
i3 += i2 >> ;
i0 += i3 >> ; i1 += i0 >> ;
i2 += i1 >> ;
i3 += i2 >> ;
i0 += i3 >> ; i1 += i0 >> ;
i2 += i1 >> ;
i3 += i2 >> ;
i0 += i3 >> ;
// Store the results back to pInOut after the loop.
} pInOut[(index * + ) * ] = i0;
pInOut[(index * + ) * ] = i1;
pInOut[(index * + ) * ] = i2;
pInOut[(index * + ) * ] = i3;


// Register-pressure probe kernel, eight-variable version.
// Same structure as a four-variable probe, but with eight ints (i0..i7): each work-item
// reads eight values from pInOut, mixes them in a loop with a circular chain of
// shift-and-add updates (i1 from i0, ..., i7 from i6, i0 from i7; one left-shift round
// followed by three right-shift rounds), and writes all eight values back to pInOut.
//
// pInOut: global int buffer; read before the loop and overwritten after it.
//
// NOTE(review): in this listing the braces and every numeric literal (the get_global_id
// argument, the index multipliers/offsets, the loop bounds and the shift amounts) are
// missing, apparently lost during extraction. Restore them from the original source
// before compiling; the code below is left exactly as found.
__kernel void QueryRegisterCount(__global int *pInOut)
int index = get_global_id(); int i0 = pInOut[(index * + ) * ];
int i1 = pInOut[(index * + ) * ];
int i2 = pInOut[(index * + ) * ];
int i3 = pInOut[(index * + ) * ];
int i4 = pInOut[(index * + ) * ];
int i5 = pInOut[(index * + ) * ];
int i6 = pInOut[(index * + ) * ];
int i7 = pInOut[(index * + ) * ]; for(int i = ; i < ; i++)
// Round 1: left-shift chain.
i1 += i0 << ;
i2 += i1 << ;
i3 += i2 << ;
i4 += i3 << ;
i5 += i4 << ;
i6 += i5 << ;
i7 += i6 << ;
// Rounds 2-4: right-shift chains; each round starts on the same line that ends the previous one.
i0 += i7 << ; i1 += i0 >> ;
i2 += i1 >> ;
i3 += i2 >> ;
i4 += i3 >> ;
i5 += i4 >> ;
i6 += i5 >> ;
i7 += i6 >> ;
i0 += i7 >> ; i1 += i0 >> ;
i2 += i1 >> ;
i3 += i2 >> ;
i4 += i3 >> ;
i5 += i4 >> ;
i6 += i5 >> ;
i7 += i6 >> ;
i0 += i7 >> ; i1 += i0 >> ;
i2 += i1 >> ;
i3 += i2 >> ;
i4 += i3 >> ;
i5 += i4 >> ;
i6 += i5 >> ;
i7 += i6 >> ;
i0 += i7 >> ;
// Store the results back to pInOut after the loop.
} pInOut[(index * + ) * ] = i0;
pInOut[(index * + ) * ] = i1;
pInOut[(index * + ) * ] = i2;
pInOut[(index * + ) * ] = i3;
pInOut[(index * + ) * ] = i4;
pInOut[(index * + ) * ] = i5;
pInOut[(index * + ) * ] = i6;
pInOut[(index * + ) * ] = i7;



    /** Prepare for running an OpenCL kernel program to get register count */
    /* Host-side driver: builds the kernel source shipped as the bundle resource "reg.ocl",
       binds one read/write buffer to the "QueryRegisterCount" kernel, launches it in a
       loop while logging the elapsed time of each launch call, then releases resources.
       NOTE(review): this excerpt is incomplete. Braces and all numeric literals are
       missing (apparently lost during extraction), and `context`, `device`,
       `global_work_size`, `local_work_size` and the variables assigned below are
       declared outside the visible code. The statements are left exactly as found. */

    /*Step 4: Creating command queue associate with the context.*/
// The queue is created with profiling enabled, but the launch below passes NULL for its
// event argument, so no profiling event is captured in this excerpt.
// NOTE(review): the error-code out-parameter is NULL, so a failed creation goes unnoticed.
commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL); /*Step 5: Create program object */
// Read the kernel code to the buffer
kernelPath = [[NSBundle mainBundle] pathForResource:@"reg" ofType:@"ocl"];
// NOTE(review): `error:nil` discards any read failure; if the string comes back nil,
// aSource is presumably NULL and the strlen() call below would be invalid — confirm and guard.
aSource = [[NSString stringWithContentsOfFile:kernelPath encoding:NSUTF8StringEncoding error:nil] UTF8String];
kernelLength = strlen(aSource);
program = clCreateProgramWithSource(context, , &aSource, &kernelLength, NULL); /*Step 6: Build program. */
// `status` starts with the build result; later calls OR their results into it.
status = clBuildProgram(program, , &device, NULL, NULL, NULL); /*Step 7: Initial inputs and output for the host and create memory objects for the kernel*/
// Buffer size is derived from global_work_size (the multipliers are missing in this listing).
const size_t memSize = global_work_size[] * * * ;
cl_int *orgBufer = (cl_int*)malloc(memSize);
// Fill the host buffer with a constant byte value (the value is missing in this listing).
memset(orgBufer, , memSize);
// CL_MEM_USE_HOST_PTR: the buffer object is backed by orgBufer itself rather than a copy.
outputMemObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, memSize, orgBufer, NULL); /*Step 8: Create kernel object */
kernel = clCreateKernel(program, "QueryRegisterCount", NULL); /*Step 9: Sets Kernel arguments.*/
status |= clSetKernelArg(kernel, , sizeof(outputMemObj), &outputMemObj); /*Step 10: Running the kernel.*/
// Launch the kernel repeatedly, logging the wall-clock time around each enqueue call.
// NOTE(review): the interval brackets only clEnqueueNDRangeKernel, and no clFinish or
// event wait is visible before endTime is taken. Enqueueing does not wait for the kernel
// to complete, so the logged time may not include the kernel's execution — confirm and
// add a wait before reading endTime.
for(int i = ; i < ; i++)
NSTimeInterval beginTime = [[NSProcessInfo processInfo] systemUptime];
status |= clEnqueueNDRangeKernel(commandQueue, kernel, , NULL, global_work_size, local_work_size, , NULL, NULL);
NSTimeInterval endTime = [[NSProcessInfo processInfo] systemUptime]; NSLog(@"Time spent: %f", endTime - beginTime);
// NOTE(review): orgBufer is freed here while outputMemObj (created with
// CL_MEM_USE_HOST_PTR on that pointer) is still alive and is only released further down;
// the buffer object should be released before its host storage is freed.
// NOTE(review): `status` accumulates the build, set-argument and enqueue results, so the
// message below is logged for any of those failures, not only a failed build.
} free(orgBufer); if(status != CL_SUCCESS)
NSLog(@"Program built failed!");
// Releases the buffer, queue and context. NOTE(review): `kernel` and `program` are not
// released anywhere in this excerpt.
} clReleaseMemObject(outputMemObj);
clReleaseCommandQueue(commandQueue); clReleaseContext(context);

以上代码是在OS X下开发的,因此直接用Objective-C做文件读写更方便些。不过其中大部分都是C代码,很容易读懂。


在2013年款MacBook Air的Intel HD Graphics 5000上的测试结果为:





很显然,我们可以猜得,Intel HD Graphics 5000至少可以为每个work-item分配16个寄存器。



