-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdot_scalar.cpp
More file actions
104 lines (89 loc) · 4.13 KB
/
dot_scalar.cpp
File metadata and controls
104 lines (89 loc) · 4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#include <chrono> // clocks and durations for timing
#include <cstddef> // std::size_t
#include <exception> // std::exception for reporting argument/allocation failures
#include <iostream> // std::cout / std::cerr
#include <random> // std::mt19937 and distributions
#include <string> // std::stoull / std::stoi
#include <vector> // std::vector for contiguous float buffers
// Force key functions to remain as separate call frames in profiling tools.
// GCC/Clang use the noinline attribute; MSVC uses the equivalent __declspec.
// On any other compiler the macro expands to nothing and inlining is left to
// the optimizer.
#if defined(__GNUC__) || defined(__clang__)
#define NOINLINE __attribute__((noinline))
#elif defined(_MSC_VER)
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE
#endif
// -----------------------------------------------------------------------------
// dot_product (SCALAR VERSION)
// -----------------------------------------------------------------------------
// This is the unoptimized baseline implementation.
// It computes:
//   sum = a[0]*b[0] + a[1]*b[1] + ... + a[n-1]*b[n-1]
// using one element at a time in a simple loop, accumulating in a float in
// index order.
//
// Parameters:
//   a, b - input vectors. They are expected to have the same length; if they
//          differ, only the common prefix (n = the shorter length) is used so
//          that neither vector is ever read out of bounds.
// Returns:
//   The accumulated float sum; 0.0f when either vector is empty.
NOINLINE float dot_product(const std::vector<float>& a, const std::vector<float>& b) {
    // Bound the loop by the shorter vector: indexing b[i] up to a.size() would
    // be an out-of-bounds read whenever b is shorter than a.
    const std::size_t count = (a.size() < b.size()) ? a.size() : b.size();
    // Running accumulator for the final dot-product value.
    float sum = 0.0f;
    // Visit every element in order.
    for (std::size_t i = 0; i < count; ++i) {
        // Scalar multiply + scalar add (one float at a time).
        sum += a[i] * b[i];
    }
    return sum;
}
// -----------------------------------------------------------------------------
// run_benchmark
// -----------------------------------------------------------------------------
// Calls dot_product(a, b) `iterations` times back to back so the total runtime
// is long enough to time and profile, and returns the sum of all the results
// so the work has an observable output.
//
// Parameters:
//   a, b       - input vectors, forwarded unchanged to dot_product.
//   iterations - number of repetitions; a value <= 0 performs no calls and
//                yields 0.0f.
NOINLINE float run_benchmark(const std::vector<float>& a, const std::vector<float>& b, int iterations) {
    // The accumulator is volatile so every read and write of it must actually
    // be performed; the compiler cannot collapse or drop the repeated updates.
    volatile float accumulated = 0.0f;
    // Count down the remaining repetitions of the identical computation.
    int remaining = iterations;
    while (remaining > 0) {
        // One volatile read, one add, one volatile write per repetition.
        accumulated = accumulated + dot_product(a, b);
        --remaining;
    }
    return accumulated;
}
int main(int argc, char** argv) {
    // ---------------------------------------------------------------------------
    // Parse command-line inputs (optional).
    //   argv[1] = vector length (number of float elements)
    //   argv[2] = number of benchmark repetitions
    //
    // Defaults are intentionally large to make scalar vs NEON differences easier
    // to see in timing/profiling. The default length is 640 Mi elements per
    // vector, and two such vectors are allocated below.
    //
    // std::stoull / std::stoi throw std::invalid_argument or std::out_of_range
    // on malformed or oversized input; report that as a usage error instead of
    // letting the exception terminate the process.
    // ---------------------------------------------------------------------------
    std::size_t n = static_cast<std::size_t>(640ull * 1024ull * 1024ull);
    int iterations = 100;
    try {
        if (argc > 1) {
            n = static_cast<std::size_t>(std::stoull(argv[1]));
        }
        if (argc > 2) {
            iterations = std::stoi(argv[2]);
        }
    } catch (const std::exception& parse_error) {
        std::cerr << "invalid argument (" << parse_error.what() << ")\n"
                  << "usage: <program> [vector_length] [iterations]\n";
        return 1;
    }
    // Guard against invalid repetition counts.
    if (iterations <= 0) {
        std::cerr << "iterations must be > 0\n";
        return 1;
    }
    // ---------------------------------------------------------------------------
    // Allocate input vectors.
    // std::vector gives contiguous storage, which is ideal for this workload.
    // A length the allocator cannot satisfy makes resize() throw
    // (std::length_error or std::bad_alloc); report it and exit cleanly.
    // ---------------------------------------------------------------------------
    std::vector<float> a;
    std::vector<float> b;
    try {
        a.resize(n);
        b.resize(n);
    } catch (const std::exception& alloc_error) {
        std::cerr << "failed to allocate two vectors of " << n
                  << " floats (" << alloc_error.what() << ")\n";
        return 1;
    }
    // ---------------------------------------------------------------------------
    // Fill both vectors with deterministic pseudo-random values.
    // Fixed seed means runs are reproducible across executions.
    // The a[i]-then-b[i] draw order is kept so the generated data is unchanged.
    // ---------------------------------------------------------------------------
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
    for (std::size_t i = 0; i < n; ++i) {
        a[i] = dist(rng);
        b[i] = dist(rng);
    }
    // ---------------------------------------------------------------------------
    // Measure only the benchmarked region.
    // We start timing immediately before repeated dot_product calls
    // and stop immediately after.
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // wall-clock adjustments made while the benchmark is running.
    // ---------------------------------------------------------------------------
    const auto start = std::chrono::steady_clock::now();
    const float result = run_benchmark(a, b, iterations);
    const auto end = std::chrono::steady_clock::now();
    // Convert raw clock duration to seconds for readable output.
    const std::chrono::duration<double> elapsed = end - start;
    // Print both elapsed time and result value.
    // Including result helps confirm both implementations compute the same thing.
    std::cout << "time=" << elapsed.count() << "s result=" << result << "\n";
    return 0;
}