ManagedCuda.CudaOccupancy.cudaOccMaxBlocksPerSMRegsLimit C# (CSharp) Method

cudaOccMaxBlocksPerSMRegsLimit() private static method

private static cudaOccMaxBlocksPerSMRegsLimit ( cudaOccPartitionedGCConfig &gcConfig, cudaOccResult result, cudaOccDeviceProp properties, cudaOccFuncAttributes attributes, int blockSize ) : int
gcConfig cudaOccPartitionedGCConfig
result cudaOccResult
properties cudaOccDeviceProp
attributes cudaOccFuncAttributes
blockSize int
return int
        /// <summary>
        /// Determines how many blocks of <paramref name="blockSize"/> threads fit on one
        /// multiprocessor when only register usage is considered.
        /// </summary>
        /// <param name="gcConfig">
        /// Partitioned global caching mode. It is reset to
        /// <c>cudaOccPartitionedGCConfig.Off</c> when the limit ends up being computed
        /// against the whole SM (caching already off, or nothing fits with caching on and
        /// caching is not forced).
        /// </param>
        /// <param name="result">Receives the per-block register allocation in <c>AllocatedRegistersPerBlock</c>.</param>
        /// <param name="properties">Device properties (warp size, per-block and per-SM register counts).</param>
        /// <param name="attributes">Kernel attributes; only <c>numRegs</c> is read here.</param>
        /// <param name="blockSize">Threads per block.</param>
        /// <returns>
        /// 0 if a block exceeds the per-block register limit, <c>int.MaxValue</c> if the
        /// rounded per-warp register allocation is not positive, otherwise the number of
        /// blocks whose warps fit into the SM's registers.
        /// </returns>
        private static int cudaOccMaxBlocksPerSMRegsLimit(
			ref cudaOccPartitionedGCConfig  gcConfig,
			cudaOccResult         result,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			int                   blockSize)
        {
            // The granularity lookup takes the register count as well; per the
            // original note, Fermi needs special handling of certain register usage.
            int granularity       = cudaOccRegAllocationGranularity(properties, attributes.numRegs);
            int subPartitionCount = cudaOccSubPartitionsPerMultiprocessor(properties);
            int warpsPerBlock     = __occDivideRoundUp(blockSize, properties.warpSize);

            // Registers are handed out per warp (compute capability 2.x and up):
            // regs per thread x warp size, rounded up to the allocation granularity.
            int rawWarpRegs = attributes.numRegs * properties.warpSize;
            int warpRegs    = __occRoundUp(rawWarpRegs, granularity);
            int blockRegs   = warpRegs * warpsPerBlock;

            // The hardware launch check assumes, for historical reasons, that
            // registers are allocated to all partitions simultaneously, so it is
            // simulated with the warp count rounded up to the number of partitions.
            int hwCheckedBlockRegs = warpRegs * __occRoundUp(warpsPerBlock, subPartitionCount);

            bool overPerBlockLimit =
                properties.regsPerBlock < hwCheckedBlockRegs ||   // hardware check
                properties.regsPerBlock < blockRegs;              // software check

            int blockLimit;

            if (overPerBlockLimit)
            {
                blockLimit = 0;
            }
            else if (warpRegs <= 0)
            {
                // No registers allocated per warp: registers impose no limit.
                blockLimit = int.MaxValue;
            }
            else
            {
                // Registers live in the sub-partitions, so the SM-wide warp
                // capacity is warps per sub-partition x number of sub-partitions.
                int regsPerSubPartition  = properties.regsPerMultiprocessor / subPartitionCount;
                int warpsPerSubPartition = regsPerSubPartition / warpRegs;

                cudaOccPartitionedGCConfig requestedConfig = gcConfig;

                blockLimit = 0;

                if (requestedConfig != cudaOccPartitionedGCConfig.Off)
                {
                    // With partitioned global caching a block is confined to half
                    // of the SM and therefore to half of its registers.
                    int subPartitionsPerHalf = subPartitionCount / 2;
                    int warpsPerHalf         = warpsPerSubPartition * subPartitionsPerHalf;
                    int blocksPerHalf        = warpsPerHalf / warpsPerBlock;
                    blockLimit               = blocksPerHalf * 2;
                }

                // Use the whole SM when caching is off, or when nothing fits with
                // caching on and neither the device nor OnStrict forces it (the
                // device would then turn caching off by itself).
                bool cachingOff       = (requestedConfig == cudaOccPartitionedGCConfig.Off);
                bool nothingFits      = (blockLimit == 0);
                bool cachingMandatory = (requestedConfig == cudaOccPartitionedGCConfig.OnStrict ||
                                         cudaOccPartitionedGCForced(properties));

                if (cachingOff || (nothingFits && !cachingMandatory))
                {
                    gcConfig = cudaOccPartitionedGCConfig.Off;

                    int warpsPerSm = warpsPerSubPartition * subPartitionCount;
                    blockLimit     = warpsPerSm / warpsPerBlock;
                }
            }

            // Reported on every path, including the "does not fit" one.
            result.AllocatedRegistersPerBlock = blockRegs;

            return blockLimit;
        }