ManagedCuda.CudaOccupancy.cudaOccMaxActiveBlocksPerMultiprocessor C# (CSharp) Method

cudaOccMaxActiveBlocksPerMultiprocessor() public static method

Determine the maximum number of CTAs that can be run simultaneously per SM. This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
public static cudaOccMaxActiveBlocksPerMultiprocessor ( cudaOccResult result, cudaOccDeviceProp properties, cudaOccFuncAttributes attributes, cudaOccDeviceState state, int blockSize, ManagedCuda.BasicTypes.SizeT dynamicSmemSize ) : void
result cudaOccResult
properties cudaOccDeviceProp
attributes cudaOccFuncAttributes
state cudaOccDeviceState
blockSize int
dynamicSmemSize ManagedCuda.BasicTypes.SizeT
return void
        public static void cudaOccMaxActiveBlocksPerMultiprocessor(
			cudaOccResult               result,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState    state,
			int                   blockSize,
			SizeT                 dynamicSmemSize)
        {
            int          ctaLimitWarps   = 0;
            int          ctaLimitBlocks  = 0;
            int          ctaLimitSMem    = 0;
            int          ctaLimitRegs    = 0;
            int          ctaLimit        = 0;
            cudaOccLimitingFactors limitingFactors = 0;

            cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCConfig.Off;

            //if (!result || !properties || !attributes || !state || blockSize <= 0) {
            //	return CUDA_OCC_ERROR_INVALID_INPUT;
            //}

            ///////////////////////////
            // Check user input
            ///////////////////////////

            cudaOccInputCheck(properties, attributes, state);

            ///////////////////////////
            // Initialization
            ///////////////////////////

            gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

            ///////////////////////////
            // Compute occupancy
            ///////////////////////////

            // Limits due to registers/SM
            // Also compute if partitioned global caching has to be turned off
            //
            ctaLimitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);

            // Limits due to warps/SM
            //
            ctaLimitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);

            // Limits due to blocks/SM
            //
            ctaLimitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Limits due to shared memory/SM
            //
            ctaLimitSMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

            ///////////////////////////
            // Overall occupancy
            ///////////////////////////

            // Overall limit is min() of limits due to above reasons
            //
            ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

            // Fill in the return values
            //
            // Determine occupancy limiting factors
            //
            if (ctaLimit == ctaLimitWarps) {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs) {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem) {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks) {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }
            result.LimitingFactors = limitingFactors;

            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;
            result.partitionedGCConfig = gcConfig;

            // Final occupancy
            result.ActiveBlocksPerMultiProcessor = ctaLimit;
        }