/// <summary>
/// Computes how many blocks of a kernel can be resident on one multiprocessor
/// and writes the outcome into <paramref name="result"/>.
/// Four per-SM block limits are evaluated (registers, warps, blocks, shared
/// memory); the final occupancy is the smallest of them, and every limit that
/// equals that smallest value is flagged as a limiting factor.
/// </summary>
/// <param name="result">Receives the individual limits, the limiting-factor flags,
/// the partitioned global caching configuration and the final block count. It is
/// also handed to the register and shared-memory helpers.</param>
/// <param name="properties">Device properties forwarded to the helper routines.</param>
/// <param name="attributes">Kernel function attributes forwarded to the helper routines.</param>
/// <param name="state">Device state forwarded to the input check and the shared-memory limit.</param>
/// <param name="blockSize">Block size forwarded to the register, warp and shared-memory limits.</param>
/// <param name="dynamicSmemSize">Dynamic shared memory size forwarded to the shared-memory limit.</param>
public static void cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    // NOTE(review): the null / "blockSize <= 0" guard that used to return
    // CUDA_OCC_ERROR_INVALID_INPUT is disabled in this version. blockSize is not
    // passed to cudaOccInputCheck, so it is not validated in this method.

    // ---- Validate user input ------------------------------------------------
    // How a failed check is reported is up to cudaOccInputCheck (presumably an
    // exception, since no status is returned here -- confirm against the helper).
    cudaOccInputCheck(properties, attributes, state);

    // ---- Initialization -----------------------------------------------------
    // Start from the partitioned global caching mode expected for this
    // device/kernel pair.
    cudaOccPartitionedGCConfig gcMode = cudaOccPartitionedGCExpected(properties, attributes);

    // ---- Per-resource limits ------------------------------------------------
    // Registers per SM. gcMode is passed by reference because this step also
    // decides whether partitioned global caching has to be turned off.
    int regLimit = cudaOccMaxBlocksPerSMRegsLimit(ref gcMode, result, properties, attributes, blockSize);

    // Warps per SM, evaluated with the caching mode settled by the register step.
    int warpLimit = cudaOccMaxBlocksPerSMWarpsLimit(gcMode, properties, attributes, blockSize);

    // Blocks per SM.
    int blockLimit = cudaOccMaxBlocksPerMultiprocessor(properties);

    // Shared memory per SM.
    int smemLimit = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

    // ---- Overall occupancy --------------------------------------------------
    // The overall limit is the minimum over all four limits.
    int warpsOrBlocks = __occMin(warpLimit, blockLimit);
    int smemOrLower = __occMin(smemLimit, warpsOrBlocks);
    int overall = __occMin(regLimit, smemOrLower);

    // ---- Limiting factors ---------------------------------------------------
    // Several resources can tie for the minimum; each one that does is flagged.
    cudaOccLimitingFactors factors = 0;
    if (overall == warpLimit)
    {
        factors |= cudaOccLimitingFactors.Warps;
    }
    if (overall == regLimit)
    {
        factors |= cudaOccLimitingFactors.Registers;
    }
    if (overall == smemLimit)
    {
        factors |= cudaOccLimitingFactors.SharedMemory;
    }
    if (overall == blockLimit)
    {
        factors |= cudaOccLimitingFactors.Blocks;
    }

    // ---- Publish results ----------------------------------------------------
    result.LimitingFactors = factors;
    result.BlockLimitRegs = regLimit;
    result.BlockLimitSharedMem = smemLimit;
    result.BlockLimitWarps = warpLimit;
    result.BlockLimitBlocks = blockLimit;
    result.partitionedGCConfig = gcMode;

    // Final occupancy.
    result.ActiveBlocksPerMultiProcessor = overall;
}