ManagedCuda.CudaOccupancy.cudaOccMaxBlocksPerSMWarpsLimit C# (CSharp) Method

CudaOccupancy Class Documentation Show file Open project: kunzmi/managedCuda

cudaOccMaxBlocksPerSMWarpsLimit() private static method

private static cudaOccMaxBlocksPerSMWarpsLimit ( cudaOccPartitionedGCConfig gcConfig, cudaOccDeviceProp properties, cudaOccFuncAttributes attributes, int blockSize ) : int
gcConfig	cudaOccPartitionedGCConfig
properties	cudaOccDeviceProp
attributes	cudaOccFuncAttributes
blockSize	int
return	int

        /// <summary>
        /// Computes how many blocks of <paramref name="blockSize"/> threads fit on one SM
        /// when only the SM's warp slots are considered.
        /// </summary>
        /// <param name="gcConfig">Partitioned global caching setting; any value other than
        /// <c>Off</c> makes the calculation work per half-SM partition.</param>
        /// <param name="properties">Device properties; this method reads
        /// <c>maxThreadsPerBlock</c>, <c>maxThreadsPerMultiProcessor</c> and <c>warpSize</c>.</param>
        /// <param name="attributes">Function attributes (not read by this method).</param>
        /// <param name="blockSize">Number of threads per block.</param>
        /// <returns>The warp-slot-limited block count per SM, or 0 when
        /// <paramref name="blockSize"/> exceeds <c>properties.maxThreadsPerBlock</c>.</returns>
        private static int cudaOccMaxBlocksPerSMWarpsLimit(
            cudaOccPartitionedGCConfig gcConfig,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize)
        {
            // A block larger than the per-block thread limit yields a limit of 0.
            if (blockSize > properties.maxThreadsPerBlock)
            {
                return 0;
            }

            // Warp slots available on a whole SM, and the warps one block occupies
            // (via __occDivideRoundUp - presumably a ceiling division; confirm against
            // its definition).
            int warpSlotsPerSm = properties.maxThreadsPerMultiProcessor / properties.warpSize;
            int warpsPerBlock = __occDivideRoundUp(blockSize, properties.warpSize);

            if (gcConfig == cudaOccPartitionedGCConfig.Off)
            {
                // Without partitioned global caching the whole SM's warp slots are usable.
                return warpSlotsPerSm / warpsPerBlock;
            }

            // With partitioned global caching on, a CTA can only use one SM partition
            // (a half SM), and therefore only half of the SM's warp slots. On hardware
            // that supports partitioned global caching each half SM is guaranteed to
            // hold at least 32 warps (the maximum number of warps of a CTA), so caching
            // will not cause 0 occupancy due to insufficient warp allocation slots.
            int warpSlotsPerPartition = warpSlotsPerSm / 2;
            int blocksPerPartition = warpSlotsPerPartition / warpsPerBlock;

            // Two partitions per SM.
            return blocksPerPartition * 2;
        }

CudaOccupancy

__occDivideRoundUp

__occMin

__occRoundUp

cudaOccDevicePropCheck

cudaOccDeviceStateCheck

cudaOccFuncAttributesCheck

cudaOccInputCheck

cudaOccMaxActiveBlocksPerMultiprocessor

cudaOccMaxBlocksPerMultiprocessor

cudaOccMaxBlocksPerSMRegsLimit

cudaOccMaxBlocksPerSMSmemLimit