/// <summary>
/// Selects the amount of shared memory per multiprocessor that applies to
/// the requested cache configuration on the given device.
/// </summary>
/// <param name="properties">Device description; this method reads
/// <c>sharedMemPerMultiprocessor</c> and <c>computeMajor</c>.</param>
/// <param name="cacheConfig">Requested L1 / shared memory preference.</param>
/// <returns>The shared memory size chosen for the device generation and
/// cache configuration.</returns>
/// <exception cref="CudaOccupancyException">Thrown with
/// <c>cudaOccError.ErrorUnknownDevice</c> when <c>computeMajor</c> is not
/// 2, 3, 5 or 6.</exception>
private static SizeT cudaOccSMemPerMultiprocessor(cudaOccDeviceProp properties, cudaOccCacheConfig cacheConfig)
{
    // Largest shared memory partition: the value reported by the device.
    SizeT smemHigh = (int)properties.sharedMemPerMultiprocessor;

    // On Fermi and Kepler, L1 cache and shared memory are carved out of one
    // pool, and the cache configuration trades one for the other. The
    // smallest shared memory partition is what is left of that pool once
    // the L1 cache takes its maximum size instead of its minimum size.
    // It is computed up front, before the device generation is examined.
    //
    SizeT smallestL1 = 16384;
    SizeT largestL1 = 49152;
    SizeT poolSize = smemHigh + smallestL1;
    SizeT smemLow = poolSize - largestL1;

    switch (properties.computeMajor)
    {
        case 2:
            // Fermi: shared / L1 is either 48KB / 16KB or 16KB / 48KB, so
            // only an explicit L1 preference yields the small partition.
            //
            if (cacheConfig == cudaOccCacheConfig.PreferL1)
            {
                return smemLow;
            }
            return smemHigh;

        case 3:
            // Kepler: L1 takes 16KB, 32KB or 48KB; shared memory gets the
            // remainder.
            //
            if (cacheConfig == cudaOccCacheConfig.PreferL1)
            {
                return smemLow;
            }
            if (cacheConfig == cudaOccCacheConfig.PreferEqual)
            {
                // Half-way between the two extremes, which should work out
                // to the low partition plus 16KB.
                //
                return (smemHigh + smemLow) / 2;
            }
            return smemHigh;

        case 5:
        case 6:
            // Maxwell and Pascal: shared memory is dedicated, so the cache
            // configuration does not change the result.
            //
            return smemHigh;

        default:
            throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
    }
}