To establish a baseline, it is always a good idea to test the most straightforward implementation first:

#include <cstddef>

template<class T>
T dotproduct_naive(size_t n, T* a, T* b) {
    T sum = T(0);
    for (size_t i = 0; i < n; i++) {
        sum += (*a) * (*b); // accumulate the product of the current element pair
        a++;
        b++;
    }
    return sum;
}
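
A harness along the following lines can produce such timings (a sketch, not the exact setup used for the measurements below; the helper name time_dotproduct_ms is ours, and in practice one would repeat each measurement and average, especially for small n):

#include <chrono>
#include <cstddef>
#include <vector>

// Times a single call to dotproduct_naive and returns milliseconds.
template<class T>
double time_dotproduct_ms(size_t n) {
    std::vector<T> a(n, T(1)), b(n, T(1)); // placeholder input data
    const auto start = std::chrono::steady_clock::now();
    volatile T sum = dotproduct_naive(n, a.data(), b.data()); // volatile keeps the call from being optimized away
    const auto stop = std::chrono::steady_clock::now();
    (void)sum;
    return std::chrono::duration<double, std::milli>(stop - start).count();
}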

This leads to the following running times for different values of n:

{ "data": [ { "line": { "color": "gray", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "naive (double)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[4.39453125e-05,0.00010602678571428571,0.0002232142857142857,0.00053125,0.0010498046875,0.002134488512936172,0.00433175029832576,0.008544921875,0.01688069339750043,0.0337605578471813,0.06801060267857142,0.14124648453194052,0.304813401187447,0.6138392857142857,1.2555803571428572,2.455357142857143,5.0,10.009765625,19.04296875,39.52205882352941,79.86111111111111,165.625] } ], "layout": { "title": "Running Time naive (double)", "xaxis":{ "title":"n" }, "yaxis":{ "title":"Running time [ms]" } }, "frames": [] }

This is almost a perfect straight line: the Pearson correlation coefficient between n and the running time is close to 1, indicating a nearly perfect positive linear relationship. This is to be expected, since the algorithm iterates through the arrays in a single loop and therefore performs \(O(n)\) operations.
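
For reference, the coefficient can be computed directly from the measured (n, time) pairs (a small sketch; the helper name pearson is ours):

#include <cmath>
#include <cstddef>
#include <vector>

// Pearson correlation coefficient r of two equally sized samples.
double pearson(const std::vector<double>& x, const std::vector<double>& y) {
    const double n = static_cast<double>(x.size());
    double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;
    for (size_t i = 0; i < x.size(); i++) {
        sx  += x[i];
        sy  += y[i];
        sxx += x[i] * x[i];
        syy += y[i] * y[i];
        sxy += x[i] * y[i];
    }
    return (n * sxy - sx * sy) / std::sqrt((n * sxx - sx * sx) * (n * syy - sy * sy));
}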

Normalizing these running times by the total number of bytes processed converts them into memory throughput:

{ "data": [ { "line": { "color": "gray", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "naive (double)", "type": "scatter", "x":[128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728,268435456], "y":[23.30168888888889, 19.315873684210526, 18.350080000000002, 15.420235294117644, 15.606712558139535, 15.351687208156864, 15.129219250086956, 15.339168914285715, 15.529219909818183, 15.529601210181818, 15.41783131897436, 14.847463332977778, 13.760234896695653, 13.665805032727274, 13.362120476444444, 13.665805032727274, 13.421772800000001, 13.408678387512197, 14.096302920205128, 13.58408261060465, 13.445115013565218, 12.965939006792455] } ], "layout": { "title": "Throughput naive (double)", "xaxis":{ "title":"2n" }, "yaxis":{ "title":"GB/s" } }, "frames": [] }
Note: We use 2n for the throughput calculation since there are two arrays a, b, each of size n, that must be loaded from memory.
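
Concretely, the conversion used for the plot boils down to the following (a sketch; the helper name throughput_gbs is ours, and the measured time is assumed to be in seconds):

#include <cstddef>

// Converts a measured running time into effective memory throughput.
// 2 * n elements of type T are read in total (arrays a and b).
template<class T>
double throughput_gbs(size_t n, double seconds) {
    const double bytes = 2.0 * static_cast<double>(n) * sizeof(T);
    return bytes / seconds / 1e9; // decimal gigabytes per second
}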

Interesting behaviour can be observed when the graph is cropped to smaller sizes of 2n:

{ "data": [ { "line": { "color": "gray", "shape": "linear", "width": 3 }, "mode": "lines+markers", "type": "scatter", "x":[128,256,512,1024,2048,4096,8192,16384], "y":[23.30168888888889, 19.315873684210526, 18.350080000000002, 15.420235294117644, 15.606712558139535, 15.351687208156864, 15.129219250086956, 15.339168914285715] } ], "layout": { "title": "Throughput naive (double) cropped", "xaxis":{ "title":"2n" }, "yaxis":{ "title":"GB/s" } }, "frames": [] }

This could be explained by smaller arrays fitting entirely into the L1 cache. It is therefore worthwhile to study what happens when dotproduct_naive is run with the smaller float datatype.

{ "data": [ { "line": { "color": "gray", "shape": "linear", "width": 3 }, "marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}}, "mode": "lines+markers", "name": "naive (float)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[4.58984375e-05,0.00010498046875,0.00023995535714285715,0.0005022321428571428,0.001025390625,0.002099609375,0.004237581813579547,0.008370498345989527,0.01708984375,0.0337605578471813,0.068359375,0.13392857142857142,0.26993772599437527,0.546875,1.1474609375,2.29933110367893,4.718959731543624,9.583333333333334,18.673780487804876,37.00657894736842,74.65277777777777,152.34375] } ], "layout": { "title": "Running Time naive (float)", "xaxis":{ "title":"n" }, "yaxis":{ "title":"Running time [ms]" } }, "frames": [] }
The float version runs at around the same speed as the double version. Since a float occupies half the bytes of a double, the same \(n\) produces only half the memory traffic, which cuts the measured bandwidth in half: at \(n = 2^{24}\), for example, the double version moves 256 MiB in about 19.0 ms (roughly 14 GB/s), while the float version moves 128 MiB in about 18.7 ms (roughly 7 GB/s). On another computer with a different processor, the differences between the two datatypes were more pronounced.

On GCC 14.2 (with the -O3 and -mavx2 flags), the compiler successfully autovectorizes the double instantiation of dotproduct_naive, as the compiled assembly shows:

        test    rdi, rdi
        je      .L9                             // n == 0: return 0
        lea     rax, [rdi-1]
        cmp     rax, 2
        jbe     .L10                            // n < 4: too short for SIMD, scalar tail only
        mov     r8, rdi
        xor     eax, eax
        vxorpd  xmm0, xmm0, xmm0                // zeroes the accumulator register
        xor     ecx, ecx
        shr     r8, 2                           // divides the array length n by 4
.L4:                                            // main loop: processes 32 bytes (4 doubles) of each array per iteration via SIMD
        vmovupd ymm1, YMMWORD PTR [rdx+rax]     // unaligned load into a 256-bit AVX register
        vmulpd  ymm1, ymm1, YMMWORD PTR [rsi+rax] // multiplies 4 pairs of elements at once
        add     rcx, 1
        add     rax, 32                         // advances the byte offset by 32 bytes (4 double elements)
        vaddsd  xmm0, xmm0, xmm1                // adds product 0 to the accumulator
        vunpckhpd       xmm2, xmm1, xmm1        // extracts product 1 from the lower lane
        vextractf128    xmm1, ymm1, 0x1         // extracts the upper 128 bits (products 2 and 3)
        vaddsd  xmm0, xmm0, xmm2                // adds product 1
        vaddsd  xmm0, xmm0, xmm1                // adds product 2
        vunpckhpd       xmm1, xmm1, xmm1        // extracts product 3
        vaddsd  xmm0, xmm0, xmm1                // adds product 3
        cmp     r8, rcx
        jne     .L4                             // repeats the loop until the iterator equals floor(n/4)
        test    dil, 3                          // any remaining elements (n mod 4)?
        je      .L21
        mov     rcx, rdi
        and     rcx, -4
        lea     rax, [0+rcx*8]
        lea     r8, [rdx+rax]
        add     rax, rsi
        vzeroupper
.L3:                                            // scalar tail: handles the remaining 1-3 elements
        sub     rdi, rcx
        cmp     rdi, 1
        je      .L7
        vmovupd xmm1, XMMWORD PTR [rdx+rcx*8]   // processes 2 remaining elements with a 128-bit register
        vmulpd  xmm1, xmm1, XMMWORD PTR [rsi+rcx*8]
        vaddsd  xmm0, xmm0, xmm1
        vunpckhpd       xmm1, xmm1, xmm1
        vaddsd  xmm0, xmm1, xmm0
        test    dil, 1
        je      .L1
        and     rdi, -2
        sal     rdi, 3
        add     r8, rdi
        add     rax, rdi
.L7:                                            // handles the final single element
        vmovsd  xmm1, QWORD PTR [rax]
        vmulsd  xmm1, xmm1, QWORD PTR [r8]
        vaddsd  xmm0, xmm0, xmm1
        ret
.L21:                                           // n divisible by 4: nothing left to do
        vzeroupper
.L1:
        ret
.L9:
        vxorpd  xmm0, xmm0, xmm0                // n == 0: return 0
        ret
.L10:
        mov     r8, rdx
        mov     rax, rsi
        vxorpd  xmm0, xmm0, xmm0
        xor     ecx, ecx
        jmp     .L3
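
Note the chain of vaddsd instructions inside the main loop: without -ffast-math the compiler may not reassociate floating-point additions, so it vectorizes the multiplication but folds the four products into the scalar accumulator one at a time. In C++ terms, the main loop is roughly equivalent to this rewrite of the loop body of dotproduct_naive (a sketch of the compiler's strategy, not a literal translation):

for (size_t i = 0; i + 4 <= n; i += 4) {
    // one vmulpd: four products computed in parallel
    double p0 = a[i]     * b[i];
    double p1 = a[i + 1] * b[i + 1];
    double p2 = a[i + 2] * b[i + 2];
    double p3 = a[i + 3] * b[i + 3];
    // four vaddsd: products accumulated sequentially to preserve the source's rounding order
    sum += p0;
    sum += p1;
    sum += p2;
    sum += p3;
}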

The Microsoft Visual C++ compiler (v19.40, Visual Studio 17.10) produces similarly vectorized instructions. While this is a great start, the performance can be increased by adding more register accumulators, as seen in the next section.