/// <summary>
/// Computes how many blocks (CTAs) of the given size can be resident on one
/// multiprocessor when register usage is the only constraint considered.
/// </summary>
/// <param name="gcConfig">
/// Partitioned global caching mode. Read to decide whether the half-SM
/// calculation applies; written back as <c>Off</c> whenever the full-SM
/// calculation ends up being the one used.
/// </param>
/// <param name="result">
/// Receives the per-block register allocation in
/// <c>AllocatedRegistersPerBlock</c>.
/// </param>
/// <param name="properties">Device properties used for the limits.</param>
/// <param name="attributes">Function attributes; only <c>numRegs</c> is read here.</param>
/// <param name="blockSize">Number of threads per block.</param>
/// <returns>
/// 0 if the block exceeds the per-block register limit,
/// <c>int.MaxValue</c> if the rounded per-warp register allocation is not
/// positive, otherwise the number of blocks that fit.
/// </returns>
private static int cudaOccMaxBlocksPerSMRegsLimit(
    ref cudaOccPartitionedGCConfig gcConfig,
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize)
{
    // Fermi requires special handling of certain register usage, which is
    // why the granularity lookup also receives the register count.
    int granularity = cudaOccRegAllocationGranularity(properties, attributes.numRegs);
    int subPartitionCount = cudaOccSubPartitionsPerMultiprocessor(properties);
    int warpsPerBlock = __occDivideRoundUp(blockSize, properties.warpSize);

    // Registers are handed out per warp (compute capability 2.x and up):
    // regs per thread x warp size, rounded up to the allocation granularity.
    int rawRegsPerWarp = attributes.numRegs * properties.warpSize;
    int regsPerWarpRounded = __occRoundUp(rawRegsPerWarp, granularity);
    int regsPerBlock = regsPerWarpRounded * warpsPerBlock;

    // The hardware launch check assumes, for historical reasons, that
    // registers are allocated to all partitions at once. Mimic it by
    // rounding the warp count up to a multiple of the partition count.
    int regsPerBlockHwView = regsPerWarpRounded * __occRoundUp(warpsPerBlock, subPartitionCount);

    bool passesHardwareCheck = !(properties.regsPerBlock < regsPerBlockHwView);
    bool passesSoftwareCheck = !(properties.regsPerBlock < regsPerBlock);

    int blockLimit;
    if (!passesHardwareCheck || !passesSoftwareCheck)
    {
        // The block alone already exceeds the per-block register limit.
        blockLimit = 0;
    }
    else if (!(regsPerWarpRounded > 0))
    {
        // No registers allocated per warp: registers impose no limit.
        blockLimit = int.MaxValue;
    }
    else
    {
        // Registers live in the sub-partitions, so the warp capacity of the
        // SM is (warps per sub-partition) x (number of sub-partitions).
        int regsPerSubPartition = properties.regsPerMultiprocessor / subPartitionCount;
        int warpsPerSubPartition = regsPerSubPartition / regsPerWarpRounded;

        blockLimit = 0;
        if (gcConfig != cudaOccPartitionedGCConfig.Off)
        {
            // With partitioned global caching a CTA is confined to half of
            // the SM and therefore to half of its registers.
            int subPartitionsPerHalf = subPartitionCount / 2;
            int warpsPerHalf = warpsPerSubPartition * subPartitionsPerHalf;
            int blocksPerHalf = warpsPerHalf / warpsPerBlock;
            blockLimit = blocksPerHalf * 2;
        }

        // Use the whole-SM calculation when caching is off, or when the CTA
        // does not fit with caching on. In the latter case the device turns
        // caching off automatically unless it is forced, either by the
        // device itself or by the caller through the strict setting.
        bool cachingIsOff = (gcConfig == cudaOccPartitionedGCConfig.Off);
        bool nothingFits = (blockLimit == 0);
        bool cachingMandatory = (gcConfig == cudaOccPartitionedGCConfig.OnStrict ||
                                 cudaOccPartitionedGCForced(properties));

        if (cachingIsOff || (nothingFits && !cachingMandatory))
        {
            gcConfig = cudaOccPartitionedGCConfig.Off;
            int warpsPerSM = warpsPerSubPartition * subPartitionCount;
            blockLimit = warpsPerSM / warpsPerBlock;
        }
    }

    result.AllocatedRegistersPerBlock = regsPerBlock;
    return blockLimit;
}