performance-analysis-example/dot_neon.cpp at main · ArmDeveloperEcosystem/performance-analysis-example · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include <arm_neon.h> // NEON intrinsics (vector operations on ARM)

#include <chrono>    // high_resolution_clock for timing
#include <cstddef>   // std::size_t
#include <exception> // std::exception (reporting bad arguments / allocation failure)
#include <iostream>  // std::cout / std::cerr
#include <random>    // std::mt19937 and distributions
#include <string>    // std::stoull / std::stoi
#include <vector>    // std::vector for contiguous float buffers

// Force key functions to remain as separate call frames in profiling tools.
//   - GCC / Clang (including clang-cl): __attribute__((noinline))
//   - MSVC:                             __declspec(noinline)
//   - anything else:                    expands to nothing, so inlining is left
//                                       to the optimizer.
#if defined(__GNUC__) || defined(__clang__)
#define NOINLINE __attribute__((noinline))
#elif defined(_MSC_VER)
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE
#endif

// -----------------------------------------------------------------------------
// dot_product (NEON VERSION)
// -----------------------------------------------------------------------------
// Computes sum(a[i] * b[i]) with NEON: four floats are multiplied and
// accumulated per vector iteration, then a scalar loop handles the 0-3
// elements that do not fill a whole vector.
//
// Parameters:
//   a, b - input buffers. Only the first min(a.size(), b.size()) elements of
//          each are read, so mismatched lengths never cause an out-of-bounds
//          load.
// Returns:
//   The dot product as a float (0.0f when either input is empty).
//
// Note: the four lanes are accumulated independently and only combined at the
// end, so rounding may differ slightly from a strictly sequential scalar sum.
NOINLINE float dot_product(const std::vector<float>& a, const std::vector<float>& b) {
  // Clamp to the shorter input so neither buffer is over-read.
  const std::size_t count = (a.size() < b.size()) ? a.size() : b.size();

  // Largest multiple of 4 that fits in count: the end of the vectorized region.
  const std::size_t vec_end = count - (count % 4);

  const float* const pa = a.data();
  const float* const pb = b.data();

  // Vector accumulator containing 4 partial sums.
  float32x4_t acc = vdupq_n_f32(0.0f);

  // Main vector loop: handle 4 elements per iteration (full chunks only).
  std::size_t i = 0;
  for (; i < vec_end; i += 4) {
    // Load 4 floats from each input.
    const float32x4_t va = vld1q_f32(pa + i);
    const float32x4_t vb = vld1q_f32(pb + i);

    // acc += va * vb, lane by lane.
#if defined(__aarch64__)
    // Fused multiply-add on AArch64.
    acc = vfmaq_f32(acc, va, vb);
#else
    // Multiply-accumulate fallback for other ARM targets.
    acc = vmlaq_f32(acc, va, vb);
#endif
  }

  // Horizontal reduction: sum the 4 lanes in acc into one scalar.
#if defined(__aarch64__)
  float sum = vaddvq_f32(acc);
#else
  // vaddvq_f32 is an A64-only intrinsic; on 32-bit ARM reduce with pairwise
  // adds instead: [l0+l1, l2+l3] -> [(l0+l1)+(l2+l3), ...].
  float32x2_t pair = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
  pair = vpadd_f32(pair, pair);
  float sum = vget_lane_f32(pair, 0);
#endif

  // Tail loop: process any leftover elements (count not a multiple of 4).
  for (; i < count; ++i) {
    sum += pa[i] * pb[i];
  }

  return sum;
}

// -----------------------------------------------------------------------------
// run_benchmark
// -----------------------------------------------------------------------------
// Invokes dot_product(a, b) `iterations` times and returns the running sum of
// every call's result. Repeating the work stretches the runtime so it is easy
// to measure, and returning the sum keeps the work observable to the caller.
// A non-positive `iterations` performs no calls and yields 0.0f.
NOINLINE float run_benchmark(const std::vector<float>& a, const std::vector<float>& b, int iterations) {
  // The accumulator is volatile so that each update is a real load and store
  // that the compiler may not drop.
  volatile float accumulated = 0.0f;

  // Count down the requested number of repetitions.
  int remaining = iterations;
  while (remaining > 0) {
    accumulated = accumulated + dot_product(a, b);
    --remaining;
  }

  return accumulated;
}

// Entry point: builds two pseudo-random float vectors, times repeated
// dot_product calls over them, and prints the elapsed time and the result.
//
// Usage: <program> [vector_length] [iterations]
// Returns 0 on success, 1 when an argument is invalid or the input buffers
// cannot be allocated.
int main(int argc, char** argv) {
  // ---------------------------------------------------------------------------
  // Parse command-line inputs (optional).
  // argv[1] = vector length (number of float elements)
  // argv[2] = number of benchmark repetitions
  //
  // Defaults are intentionally large to make scalar vs NEON differences easier
  // to see in timing/profiling.
  // ---------------------------------------------------------------------------
  std::size_t n = 640ull * 1024ull * 1024ull;
  int iterations = 100;

  // std::stoull / std::stoi throw std::invalid_argument or std::out_of_range on
  // malformed input; report that instead of terminating on an uncaught
  // exception.
  try {
    if (argc > 1) {
      n = std::stoull(argv[1]);
    }
    if (argc > 2) {
      iterations = std::stoi(argv[2]);
    }
  } catch (const std::exception& e) {
    std::cerr << "invalid argument (" << e.what() << ")\n"
              << "usage: " << argv[0] << " [vector_length] [iterations]\n";
    return 1;
  }

  // Guard against invalid repetition counts.
  if (iterations <= 0) {
    std::cerr << "iterations must be > 0\n";
    return 1;
  }

  // ---------------------------------------------------------------------------
  // Allocate input vectors.
  // std::vector gives contiguous storage, which is ideal for this workload.
  // Very large lengths can throw std::bad_alloc or std::length_error; turn that
  // into a readable error rather than an abort.
  // ---------------------------------------------------------------------------
  std::vector<float> a;
  std::vector<float> b;
  try {
    a.resize(n);
    b.resize(n);
  } catch (const std::exception& e) {
    std::cerr << "cannot allocate two vectors of " << n << " floats (" << e.what() << ")\n";
    return 1;
  }

  // ---------------------------------------------------------------------------
  // Fill both vectors with deterministic pseudo-random values.
  // Fixed seed means runs are reproducible across executions.
  // The a[i]-then-b[i] draw order is part of that reproducibility.
  // ---------------------------------------------------------------------------
  std::mt19937 rng(123);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);

  for (std::size_t i = 0; i < n; ++i) {
    a[i] = dist(rng);
    b[i] = dist(rng);
  }

  // ---------------------------------------------------------------------------
  // Measure only the benchmarked region.
  // We start timing immediately before repeated dot_product calls
  // and stop immediately after.
  // ---------------------------------------------------------------------------
  const auto start = std::chrono::high_resolution_clock::now();
  const float result = run_benchmark(a, b, iterations);
  const auto end = std::chrono::high_resolution_clock::now();

  // Convert raw clock duration to seconds for readable output.
  const std::chrono::duration<double> elapsed = end - start;

  // Print both elapsed time and result value.
  // Including result helps confirm both implementations compute the same thing.
  std::cout << "time=" << elapsed.count() << "s  result=" << result << "\n";
  return 0;
}