/// <summary>
/// Computes how many blocks of <paramref name="blockSize"/> threads fit on one
/// multiprocessor when only the number of warp slots is considered.
/// </summary>
/// <param name="gcConfig">
/// Partitioned global caching setting. Any value other than
/// <c>cudaOccPartitionedGCConfig.Off</c> makes the computation work per half SM.
/// </param>
/// <param name="properties">
/// Device properties; this method reads <c>maxThreadsPerBlock</c>,
/// <c>maxThreadsPerMultiProcessor</c> and <c>warpSize</c>.
/// </param>
/// <param name="attributes">Not read by this method.</param>
/// <param name="blockSize">Number of threads per block.</param>
/// <returns>
/// The warp-slot-imposed limit on blocks per SM, or 0 when
/// <paramref name="blockSize"/> exceeds <c>properties.maxThreadsPerBlock</c>.
/// </returns>
/// <remarks>
/// NOTE(review): a non-positive <paramref name="blockSize"/> presumably makes
/// <c>__occDivideRoundUp</c> return 0, which would lead to an integer division
/// by zero below - confirm that callers validate the block size.
/// </remarks>
private static int cudaOccMaxBlocksPerSMWarpsLimit(
    cudaOccPartitionedGCConfig gcConfig,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize)
{
    // A block bigger than the per-block thread limit can never be resident.
    if (blockSize > properties.maxThreadsPerBlock)
    {
        return 0;
    }

    // Warp slots offered by a whole SM, and warps consumed by a single block
    // (a partially filled warp still occupies a full slot, hence the round-up).
    int warpSlotsPerSm = properties.maxThreadsPerMultiProcessor / properties.warpSize;
    int warpsPerBlock = __occDivideRoundUp(blockSize, properties.warpSize);

    // Without partitioned global caching a block may use any warp slot of the SM.
    if (gcConfig == cudaOccPartitionedGCConfig.Off)
    {
        return warpSlotsPerSm / warpsPerBlock;
    }

    // With partitioned global caching a block is confined to one SM partition
    // (half an SM) and therefore to half of the SM's warp slots. Blocks are
    // counted per partition and then doubled for the two partitions.
    //
    // On hardware that supports partitioned global caching, each half SM is
    // guaranteed to provide at least 32 warps (the maximum number of warps of
    // a block), so caching does not cause 0 occupancy due to insufficient
    // warp slots.
    int warpSlotsPerPartition = warpSlotsPerSm / 2;
    int blocksPerPartition = warpSlotsPerPartition / warpsPerBlock;
    return blocksPerPartition * 2;
}