// gemm_gpu_1thread.cu - from the interestingLSY/CUDA-From-Correctness-To-Performance-Code repository (master branch)
#include "gemm_gpu_1thread.h"

#include <cstdio>

#include <cuda_runtime_api.h>

// gemm_gpu_1thread - GEMM on GPU, using only one thread
//
// Computes C = A * B for row-major int matrices with a plain triple loop:
//   C[i, j] = sum over l of A[i, l] * B[l, j]
//
// Launch layout: the kernel never reads threadIdx/blockIdx, so every launched
// thread executes the complete loop nest. It is meant to be launched as
// <<<1, 1>>>; launching more threads only repeats the same work.
//
// Parameters:
//   C - output matrix,  [n, m], row-major, fully overwritten
//   A - left operand,   [n, k], row-major, read-only
//   B - right operand,  [k, m], row-major, read-only
//   n, m, k - matrix dimensions; a non-positive n or m writes nothing, and a
//             non-positive k stores 0 into every element of C
__global__
void gemm_gpu_1thread_kernel(
	int* __restrict__ C,		// [n, m], on gpu
	const int* __restrict__ A,	// [n, k], on gpu
	const int* __restrict__ B,	// [k, m], on gpu
	const int n,
	const int m,
	const int k
) {
	for (int i = 0; i < n; ++i) {
		// Flat offsets are computed in size_t: with plain int arithmetic
		// i * k and i * m overflow once a matrix holds more than INT_MAX
		// elements. The row offsets are also loop-invariant for j and l.
		const size_t a_row = static_cast<size_t>(i) * k;
		const size_t c_row = static_cast<size_t>(i) * m;
		for (int j = 0; j < m; ++j) {
			int res = 0;
			for (int l = 0; l < k; ++l) {
				res += A[a_row + l] * B[static_cast<size_t>(l) * m + j];
			}
			C[c_row + j] = res;
		}
	}
}

// Host-side launcher for gemm_gpu_1thread_kernel.
//
// Enqueues the kernel with a single block of a single thread; no stream
// argument is given, so the launch goes to the default stream. The function
// does not synchronize: the result in C is only complete once the caller
// synchronizes or performs a blocking copy.
//
// Parameters:
//   C - output matrix,  [n, m], device pointer
//   A - left operand,   [n, k], device pointer
//   B - right operand,  [k, m], device pointer
//   n, m, k - matrix dimensions, forwarded unchanged to the kernel
void gemm_gpu_1thread(
	int* __restrict__ C,		// [n, m], on gpu
	const int* __restrict__ A,	// [n, k], on gpu
	const int* __restrict__ B,	// [k, m], on gpu
	const int n,
	const int m,
	const int k
) {
	gemm_gpu_1thread_kernel<<<1, 1>>>(C, A, B, n, m, k);

	// A kernel launch returns no status of its own, so query the runtime.
	// cudaPeekAtLastError() does not clear the error, which keeps it visible
	// to any error check the caller performs afterwards.
	const cudaError_t launch_status = cudaPeekAtLastError();
	if (launch_status != cudaSuccess) {
		std::fprintf(stderr, "gemm_gpu_1thread: CUDA error after kernel launch: %s\n",
			cudaGetErrorString(launch_status));
	}
}