【CUDA Programming】【8】【3.Programming Interface】【3.2.CUDA Runtime】【3.2.5.Distributed Shared Memory】

Distributed Shared Memory

Posted by x-jeff on November 3, 2024

The 【CUDA Programming】 series of blog posts is based on the official NVIDIA document "CUDA C++ Programming Guide (v12.6)".
This is an original article. Reproduction without my permission is prohibited; please credit the source when reposting.

1.Distributed Shared Memory

See: Thread Block Clusters

Below is an example that uses distributed shared memory to compute a histogram.

#include <cooperative_groups.h>

// Distributed shared memory histogram kernel
// Parameters:
// bins: 1D array in global memory that stores the count of each histogram bin.
// nbins: number of histogram bins.
// bins_per_block: number of bins handled by each block.
// input: input data array.
// array_size: size of the input array.
__global__ void clusterHist_kernel(int *bins, const int nbins, const int bins_per_block, const int *__restrict__ input,
                                   size_t array_size)
{
  // Dynamically allocated shared memory; its size is specified at kernel launch.
  // This is the shared memory of the block that the executing thread belongs to;
  // every block in the cluster has its own smem.
  extern __shared__ int smem[];
  namespace cg = cooperative_groups;
  // tid is the thread's unique ID within the grid,
  // equivalent to: int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int tid = cg::this_grid().thread_rank();

  // Cluster initialization, size and calculating local bin offsets.
  cg::cluster_group cluster = cg::this_cluster(); // the cluster the current thread belongs to
  unsigned int clusterBlockRank = cluster.block_rank(); // rank of the current block within the cluster, starting at 0; blocks are 1D in this example
  int cluster_size = cluster.dim_blocks().x; // the cluster is 1D as well

  // Initialize smem
  for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x)
  {
    smem[i] = 0; // Initialize shared memory histogram to zeros
  }

  // cluster synchronization ensures that shared memory is initialized to zero in
  // all thread blocks in the cluster. It also ensures that all thread blocks
  // have started executing and they exist concurrently.
  // The histogram is computed within one cluster: this synchronizes all threads
  // in the cluster, i.e. every element of smem has been initialized past this point.
  cluster.sync();

  // This loop follows the same logic as the smem initialization:
  // a single thread may process more than one input element.
  for (int i = tid; i < array_size; i += blockDim.x * gridDim.x)
  {
    int ldata = input[i];

    // Find the right histogram bin.
    int binid = ldata;
    if (ldata < 0)
      binid = 0;
    else if (ldata >= nbins)
      binid = nbins - 1;

    // Find destination block rank and offset for computing
    // distributed shared memory histogram
    int dst_block_rank = (int)(binid / bins_per_block);
    int dst_offset = binid % bins_per_block;

    // Pointer to target block shared memory
    // dst_smem points to the shared memory of the destination block
    // (which may be a different block in the cluster)
    int *dst_smem = cluster.map_shared_rank(smem, dst_block_rank);

    // Perform atomic update of the histogram bin
    // The atomic add keeps concurrent updates to the shared data safe
    atomicAdd(dst_smem + dst_offset, 1);
  }

  // cluster synchronization is required to ensure all distributed shared
  // memory operations are completed and no thread block exits while
  // other thread blocks are still accessing distributed shared memory
  // After this synchronization the histogram is complete and stored in smem.
  cluster.sync();

  // Perform global memory histogram, using the local distributed memory histogram
  int *lbins = bins + cluster.block_rank() * bins_per_block;
  for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x)
  {
    atomicAdd(&lbins[i], smem[i]); // accumulate the result into bins in global memory
  }
}

Let's take a closer look at the initialization of smem:

for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x)
{
    smem[i] = 0; //Initialize shared memory histogram to zeros
}

Each block is responsible only for initializing its own bins. Suppose bins_per_block is 20 and blockDim.x is 16. For the thread with threadIdx.x equal to 0, i starts at 0, so it sets smem[0] to zero; then i becomes 16 and it sets smem[16], so this one thread covers two elements. Likewise, the thread with threadIdx.x equal to 1 initializes smem[1] and then smem[17], again covering two elements. The thread with threadIdx.x equal to 5, however, only gets i equal to 5, so it initializes smem[5] and is done.
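To make this stride pattern concrete, here is a minimal host-side C++ sketch (not from the original example; bins_per_block and blockDim.x take the assumed values 20 and 16 from the paragraph above) that replays the loop for every thread in the block and prints which smem indices each one initializes:

#include <cstdio>

int main()
{
  const int bins_per_block = 20; // assumed value from the example above
  const int block_dim_x = 16;    // assumed value of blockDim.x from the example above

  // Replay the kernel's initialization loop for each thread t in the block:
  // for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x) smem[i] = 0;
  for (int t = 0; t < block_dim_x; ++t)
  {
    printf("threadIdx.x = %2d initializes:", t);
    for (int i = t; i < bins_per_block; i += block_dim_x)
      printf(" smem[%d]", i);
    printf("\n");
  }
  return 0;
}

Running it shows that threads 0 through 3 each cover two indices (for example smem[0] and smem[16]) while threads 4 through 15 cover one each. The final accumulation into global memory at the end of the kernel reuses the same pattern.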

The above kernel can be launched at runtime with different cluster sizes, depending on how much distributed shared memory is required. If the histogram is small enough to fit in the shared memory of a single block, the kernel can be launched with a cluster size of 1. The code below shows how to launch a cluster kernel dynamically based on the shared memory requirement.

// Launch via extensible launch
{
  cudaLaunchConfig_t config = {0};
  config.gridDim = array_size / threads_per_block; // number of blocks in the grid (assumes array_size is divisible by threads_per_block)
  config.blockDim = threads_per_block; // number of threads per block

  // cluster_size depends on the histogram size.
  // ( cluster_size == 1 ) implies no distributed shared memory, just thread block local shared memory
  // number of blocks per cluster; the histogram is computed within one cluster
  int cluster_size = 2; // size 2 is an example here
  int nbins_per_block = nbins / cluster_size; // number of bins assigned to each block

  // dynamic shared memory size is per block.
  // Distributed shared memory size =  cluster_size * nbins_per_block * sizeof(int)
  // Dynamic shared memory allocated per block; sizeof(int) because the histogram is an int array.
  config.dynamicSmemBytes = nbins_per_block * sizeof(int);

  // cudaFuncAttributeMaxDynamicSharedMemorySize is the maximum dynamic shared memory a block may use.
  // This call opts the kernel in to that amount, so the per-block request above stays within the limit.
  CUDA_CHECK(::cudaFuncSetAttribute((void *)clusterHist_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, config.dynamicSmemBytes));

  // Set the cluster dimensions; the cluster is 1D in this example.
  cudaLaunchAttribute attribute[1];
  attribute[0].id = cudaLaunchAttributeClusterDimension;
  attribute[0].val.clusterDim.x = cluster_size;
  attribute[0].val.clusterDim.y = 1;
  attribute[0].val.clusterDim.z = 1;

  config.numAttrs = 1; // one attribute to configure, namely the cluster dimension
  config.attrs = attribute;

  // Launch the kernel with cudaLaunchKernelEx
  cudaLaunchKernelEx(&config, clusterHist_kernel, bins, nbins, nbins_per_block, input, array_size);
}
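The launch snippet above leaves out the surrounding host code. Below is a minimal sketch of that scaffolding, under my own assumptions: the CUDA_CHECK macro is a simple error-checking helper (the original does not define it), and array_size, nbins, and threads_per_block are arbitrary example values chosen so that array_size is divisible by threads_per_block. Note that thread block clusters require a device of compute capability 9.0 or higher.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Minimal error-checking helper; the launch snippet assumes a macro with this name.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                          \
              cudaGetErrorString(err_), __FILE__, __LINE__);               \
      exit(EXIT_FAILURE);                                                  \
    }                                                                      \
  } while (0)

int main()
{
  // Example sizes (hypothetical values).
  const size_t array_size = 1 << 20;
  const int nbins = 1024;
  const int threads_per_block = 256;

  int *input = nullptr, *bins = nullptr;
  CUDA_CHECK(cudaMalloc(&input, array_size * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&bins, nbins * sizeof(int)));
  CUDA_CHECK(cudaMemset(bins, 0, nbins * sizeof(int)));
  // ... fill input, e.g. via cudaMemcpy from a host array ...

  // The extensible-launch block shown above goes here.

  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaFree(input));
  CUDA_CHECK(cudaFree(bins));
  return 0;
}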