-
Notifications
You must be signed in to change notification settings - Fork 25
Expand file tree
/
Copy pathgemm_gpu_1thread.cu
More file actions
34 lines (31 loc) · 780 Bytes
/
gemm_gpu_1thread.cu
File metadata and controls
34 lines (31 loc) · 780 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#include "gemm_gpu_1thread.h"
#include <cstddef>
#include <cstdio>
#include <cuda_runtime_api.h>
// gemm_gpu_1thread - GEMM on GPU, using only one thread
// Kernel: computes C = A * B for row-major int matrices with a plain
// triple loop.
//
// The body never reads threadIdx or blockIdx, so every launched thread
// walks the whole output on its own; launch with <<<1, 1>>> to get a
// single pass.
//
//   C  [n, m] output, on gpu; every element is overwritten
//   A  [n, k] input, on gpu
//   B  [k, m] input, on gpu
//
// If n <= 0 or m <= 0 nothing is written. If k <= 0 (with n, m > 0) each
// element of C is set to 0.
__global__
void gemm_gpu_1thread_kernel(
    int* __restrict__ C,       // [n, m], on gpu
    const int* __restrict__ A, // [n, k], on gpu
    const int* __restrict__ B, // [k, m], on gpu
    const int n,
    const int m,
    const int k
) {
    // Flat offsets are formed in size_t: a product such as i * k evaluated
    // in 32-bit int overflows once a matrix holds more than 2^31 - 1
    // elements. These widened values are only used for indexing inside
    // loops that run when the corresponding dimension is positive.
    const size_t cols = static_cast<size_t>(m);
    const size_t inner = static_cast<size_t>(k);

    for (int i = 0; i < n; ++i) {
        const size_t a_base = static_cast<size_t>(i) * inner; // row i of A
        const size_t c_base = static_cast<size_t>(i) * cols;  // row i of C
        for (int j = 0; j < m; ++j) {
            int res = 0;
            for (int l = 0; l < k; ++l) {
                res += A[a_base + l] * B[static_cast<size_t>(l) * cols + j];
            }
            C[c_base + j] = res;
        }
    }
}
// Host entry point: launches gemm_gpu_1thread_kernel with one block of one
// thread to compute C = A * B.
//
//   C  [n, m] output, on gpu
//   A  [n, k] input, on gpu
//   B  [k, m] input, on gpu
//
// This function does not synchronize with the device after the launch; the
// caller must synchronize (or issue a blocking copy) before reading C.
// A pending CUDA error after the launch is reported on stderr. The error
// state is left intact so the caller's own checks still observe it.
void gemm_gpu_1thread(
    int* __restrict__ C,       // [n, m], on gpu
    const int* __restrict__ A, // [n, k], on gpu
    const int* __restrict__ B, // [k, m], on gpu
    const int n,
    const int m,
    const int k
) {
    gemm_gpu_1thread_kernel<<<1, 1>>>(C, A, B, n, m, k);

    // A kernel launch yields no return value, so query the runtime for it.
    // Peek rather than Get: Get would reset the error and hide it from the
    // caller.
    const cudaError_t status = cudaPeekAtLastError();
    if (status != cudaSuccess) {
        std::fprintf(stderr,
                     "gemm_gpu_1thread: CUDA error after kernel launch: %s\n",
                     cudaGetErrorString(status));
    }
}