/// <summary>
/// Computes the maximum number of resident blocks per multiprocessor as
/// limited by shared memory usage, for the given block size and dynamic
/// shared memory request.
/// </summary>
/// <param name="result">Receives the rounded per-block shared memory allocation.</param>
/// <param name="properties">Device properties (limits and granularities).</param>
/// <param name="attributes">Kernel function attributes (static shared memory size).</param>
/// <param name="state">Device state (cache configuration preference).</param>
/// <param name="blockSize">Threads per block (unused by the shared-memory limit).</param>
/// <param name="dynamicSmemSize">Dynamic shared memory requested at launch, in bytes.</param>
/// <returns>
/// Blocks per SM allowed by shared memory: 0 when a single CTA already
/// exceeds the per-block limit, int.MaxValue when no shared memory is used.
/// </returns>
private static int cudaOccMaxBlocksPerSMSmemLimit(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    // Per-CTA footprint = static shared memory + dynamic request, rounded
    // up to the hardware allocation granularity.
    int granularity = cudaOccSMemAllocationGranularity(properties);
    SizeT requestedPerCTA = attributes.sharedSizeBytes + dynamicSmemSize;
    SizeT allocatedPerCTA = __occRoundUp((int)requestedPerCTA, (int)granularity);

    // User-preferred per-SM shared memory size derived from the cache
    // configuration. This preference is ignored if the kernel requests
    // more shared memory than the preference would provide.
    SizeT preferredPerSM = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

    int maxBlocks;
    if (allocatedPerCTA > properties.sharedMemPerBlock)
    {
        // Even one CTA exceeds the per-block shared memory limit: the
        // kernel cannot launch at this configuration.
        maxBlocks = 0;
    }
    else
    {
        // Honor the user preference only while at least one CTA still
        // fits under it; otherwise fall back to the full hardware
        // per-SM shared memory size.
        SizeT limitPerSM;
        if (preferredPerSM >= allocatedPerCTA)
        {
            limitPerSM = preferredPerSM;
        }
        else
        {
            limitPerSM = properties.sharedMemPerMultiprocessor;
        }

        // A kernel that uses no shared memory is not limited by it.
        maxBlocks = (allocatedPerCTA > 0)
            ? (int)(limitPerSM / allocatedPerCTA)
            : int.MaxValue;
    }

    // Publish the rounded per-block allocation regardless of outcome.
    result.AllocatedSharedMemPerBlock = allocatedPerCTA;
    return maxBlocks;
}