The previous section showed that modern compilers can already transform some constructs, such
as simple sum loops, into vectorized versions. The optimization potential is not yet exhausted, however, as in this specific case we might do better by adding more AVX register accumulators.
Singlethreaded AVX
#include <immintrin.h>
/**
 * Computes the dot product of two double arrays with AVX intrinsics.
 *
 * The input is consumed in three stages:
 *   1. blocks of 16 elements, using four independent 256-bit accumulators,
 *   2. blocks of 8 elements, using a single 256-bit accumulator,
 *   3. a scalar loop for whatever is left.
 * Only unaligned loads are issued, so a and b need no particular alignment.
 *
 * The partial sums are combined in a different order than a plain
 * left-to-right loop would use, so the result can differ from a scalar dot
 * product by floating-point rounding.
 *
 * @param n number of elements read from each array
 * @param a first input array, at least n elements
 * @param b second input array, at least n elements
 * @return the sum of a[k]*b[k] for k in [0, n); 0 when n is 0
 */
inline double dotproduct_avx(size_t n, double* a, double* b){
    // Adds up the four lanes of a 256-bit register.
    const auto horizontal_sum = [](__m256d v) -> double {
        const __m128d low = _mm256_castpd256_pd128(v);
        const __m128d high = _mm256_extractf128_pd(v, 1);
        const __m128d pair = _mm_add_pd(low, high);
        // Swap the two lanes so that lane 0 of the final add holds pair[0]+pair[1].
        const __m128d swapped = _mm_shuffle_pd(pair, pair, 0b01);
        return _mm_cvtsd_f64(_mm_add_pd(pair, swapped));
    };

    double acc = 0;
    size_t i = 0;

    // Stage 1: 16 elements per iteration. Four separate accumulators keep the
    // additions independent of each other. This stage must also run when n is
    // a multiple of 16; skipping it in that case would send the whole input
    // through the scalar loop at the end.
    const size_t end16 = n - (n & 15);
    __m256d sum1 = _mm256_setzero_pd();
    __m256d sum2 = _mm256_setzero_pd();
    __m256d sum3 = _mm256_setzero_pd();
    __m256d sum4 = _mm256_setzero_pd();
    for (; i < end16; i += 16){
        const __m256d prod1 = _mm256_mul_pd(_mm256_loadu_pd(a + i),      _mm256_loadu_pd(b + i));
        const __m256d prod2 = _mm256_mul_pd(_mm256_loadu_pd(a + i + 4),  _mm256_loadu_pd(b + i + 4));
        const __m256d prod3 = _mm256_mul_pd(_mm256_loadu_pd(a + i + 8),  _mm256_loadu_pd(b + i + 8));
        const __m256d prod4 = _mm256_mul_pd(_mm256_loadu_pd(a + i + 12), _mm256_loadu_pd(b + i + 12));
        sum1 = _mm256_add_pd(sum1, prod1);
        sum2 = _mm256_add_pd(sum2, prod2);
        sum3 = _mm256_add_pd(sum3, prod3);
        sum4 = _mm256_add_pd(sum4, prod4);
    }
    const __m256d sum1_plus_sum2 = _mm256_add_pd(sum1, sum2);
    const __m256d sum3_plus_sum4 = _mm256_add_pd(sum3, sum4);
    acc += horizontal_sum(_mm256_add_pd(sum1_plus_sum2, sum3_plus_sum4));

    // Stage 2: 8 elements per iteration. After stage 1 fewer than 16 elements
    // remain, so this loop body executes at most once.
    const size_t end8 = n - (n & 7);
    __m256d sum8 = _mm256_setzero_pd();
    for (; i < end8; i += 8){
        const __m256d prod_low  = _mm256_mul_pd(_mm256_loadu_pd(a + i),     _mm256_loadu_pd(b + i));
        const __m256d prod_high = _mm256_mul_pd(_mm256_loadu_pd(a + i + 4), _mm256_loadu_pd(b + i + 4));
        sum8 = _mm256_add_pd(sum8, _mm256_add_pd(prod_low, prod_high));
    }
    acc += horizontal_sum(sum8);

    // Stage 3: scalar tail for the last (n mod 8) elements.
    for (; i < n; ++i){
        acc += a[i] * b[i];
    }
    return acc;
}
This leads to the following result:
{
"data": [
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[1.949637302896928e-05,2.4065289909985603e-05,3.076171923065186e-05,5e-05,8.579798724116129e-05,0.0001649693190827669,0.0006626674107142857,0.0013113839285714285,0.0026785714285714286,0.00816123588733979,0.01708984375,0.03278459821428571,0.068359375,0.22529686174724342,0.5929129464285714,1.4997209821428572,2.5820974576271185,5.440848214285714,10.25390625,22.235576923076923,42.96875,87.05357142857143]
},
{
"line": {
"color": "blue",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[1.6113281142098564e-05,1.9042969005039764e-05,2.566964285714286e-05,3.8609095034151686e-05,7.114955357142857e-05,0.0001224190793562911,0.00027273996753303425,0.0006103515625,0.0013950892857142857,0.002825054542386365,0.008579760804639265,0.017996812471539926,0.03606241406403457,0.08196149553571429,0.22879464285714285,0.6138392857142857,1.5373995983935742,2.913135593220339,5.301339285714286,11.25,22.916666666666668,46.875]
}
],
"layout": {
"title": "Running Time AVX",
"xaxis":{
"title":"n"
},
"yaxis":{
"title":"Running time [ms]"
}
},
"frames": []
}
This is around half the running time of the
previous version. Again, it is worthwhile to calculate the throughput:
{
"data": [
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (double)",
"type": "scatter",
"x":[128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728,268435456],
"y":[52.52258963646511, 85.10182123965218, 133.152505856, 163.84, 190.96019063882926, 198.6308738024186, 98.89727326315789, 99.94937191489362, 97.86709333333333, 64.24125061907692, 61.35667565714286, 63.96759802553192, 61.35667565714286, 37.23357677929412, 28.29625512658824, 22.373783123348836, 25.990058509128207, 24.668530110358972, 26.17884828038095, 24.14468101535135, 24.98890063127273, 24.668530110358972]
},
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"opacity":0.5,
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (float)",
"type": "scatter",
"x":[128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728,268435456],
"y":[31.775030515809522, 53.77312748495238, 79.78295652173912, 106.08899266809757, 115.13775686274509, 133.83534728533334, 120.14374092800001, 107.37418240000001, 93.95240960000001, 92.79254473386668, 61.107531076682925, 58.26453999330232, 58.153400276425536, 51.17407842042553, 36.66435496585366, 27.331610065454548, 21.825446055183676, 23.036642769454545, 25.31770195536842, 23.860929422222224, 23.427094341818183, 22.906492245333332]
}
],
"layout": {
"title": "Throughput AVX",
"xaxis":{
"title":"2n"
},
"yaxis":{
"title":"GB/s"
}
},
"frames": []
}
And cropped:
{
"data": [
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (double)",
"type": "scatter",
"x":[128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152],
"y":[52.52258963646511, 85.10182123965218, 133.152505856, 163.84, 190.96019063882926, 198.6308738024186, 98.89727326315789, 99.94937191489362, 97.86709333333333, 64.24125061907692, 61.35667565714286, 63.96759802553192, 61.35667565714286, 37.23357677929412, 28.29625512658824, 22.373783123348836, 25.990058509128207, 24.668530110358972, 26.17884828038095, 24.14468101535135, 24.98890063127273, 24.668530110358972]
},
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"opacity":0.5,
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (float)",
"type": "scatter",
"x":[128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152],
"y":[31.775030515809522, 53.77312748495238, 79.78295652173912, 106.08899266809757, 115.13775686274509, 133.83534728533334, 120.14374092800001, 107.37418240000001, 93.95240960000001, 92.79254473386668, 61.107531076682925, 58.26453999330232, 58.153400276425536, 51.17407842042553, 36.66435496585366, 27.331610065454548, 21.825446055183676, 23.036642769454545, 25.31770195536842, 23.860929422222224, 23.427094341818183, 22.906492245333332]
}
],
"layout": {
"title": "Throughput AVX cropped",
"xaxis":{
"title":"2n"
},
"yaxis":{
"title":"GB/s"
}
},
"frames": []
}
At 2 million elements the throughput for double-precision numbers is 28.29 GB/s, more than twice that of the naive version, which ran at 13.36 GB/s for this size.
In fact, it is even higher than the previous overall peak of 23.3 GB/s. The difference becomes even more stark for smaller sizes:
{
"data": [
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (double)",
"type": "scatter",
"x":[128,256,512,1024,2048,4096,8192,16384,32768,65536],
"y":[52.52258963646511, 85.10182123965218, 133.152505856, 163.84, 190.96019063882926, 198.6308738024186, 98.89727326315789, 99.94937191489362, 97.86709333333333, 64.24125061907692]
},
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"opacity":0.5,
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (float)",
"type": "scatter",
"x":[128,256,512,1024,2048,4096,8192,16384,32768,65536],
"y":[31.775030515809522, 53.77312748495238, 79.78295652173912, 106.08899266809757, 115.13775686274509, 133.83534728533334, 120.14374092800001, 107.37418240000001, 93.95240960000001, 92.79254473386668]
}
],
"layout": {
"title": "Throughput AVX cropped",
"xaxis":{
"title":"2n"
},
"yaxis":{
"title":"GB/s"
}
},
"frames": []
}
For double-precision floating-point numbers, 198.63 GB/s is achieved at 4096 elements overall! As the L1D cache of this system is 32 KiB large, it can be assumed that all 4096 8-byte elements fit into L1D. A throughput of
198.63 GB/s is close to the theoretical L1D bandwidth limit of 256 GB/s calculated previously. Vectorized operations lend themselves well to utilizing as much of the available cache throughput as possible.
For example, a VMOVUPD
instruction, which is what the unaligned load _mm256_loadu_pd
maps to, can load multiple elements in a single cycle.
Multithreaded AVX
The AVX dotproduct can be parallelized further by using multiple cores:
/**
 * Splits a dot product over several threads and adds up the partial results.
 *
 * The input is divided into blocks of block_size elements (the last block may
 * be shorter). The blocks are distributed as evenly as possible over at most
 * n_threads threads; if there are fewer blocks than threads, one thread per
 * block is used. All but the last share run on newly started threads, the
 * last share (which contains the possibly shorter final block) runs on the
 * calling thread. The calling thread's result is taken first and the worker
 * results are added to it in launch order.
 *
 * @param n_threads  upper bound on the number of threads used, including the
 *                   calling thread; 0 is treated as 1
 * @param block_size granularity of the work distribution in elements;
 *                   0 is treated as 1
 * @param n          number of elements in x and y
 * @param f          kernel computing the dot product of one contiguous range;
 *                   it is called concurrently from several threads. An
 *                   exception escaping f on a worker thread terminates the
 *                   program.
 * @param x          first input array, at least n elements
 * @param y          second input array, at least n elements
 * @return the sum of the partial results, or T(0) when n is 0
 */
template<class T>
T dotproduct_parallel(size_t n_threads, size_t block_size, size_t n, T (*f)(size_t, T*, T*), T* x, T* y){
    if (n == 0){
        return T(0);
    }
    // Both values are used as divisors below; a zero would divide by zero.
    if (n_threads == 0){
        n_threads = 1;
    }
    if (block_size == 0){
        block_size = 1;
    }

    // ceil(n / block_size), written without n + block_size - 1 because that
    // sum wraps around for very large block sizes.
    const size_t blocks_used = n/block_size + ((n%block_size != 0) ? 1 : 0);
    const size_t blocks_per_thread = blocks_used/n_threads;
    // The first extra_blocks shares receive one additional block each.
    size_t extra_blocks = blocks_used%n_threads;
    // Fewer blocks than threads: use exactly one thread per block.
    const size_t threads_used = (blocks_per_thread == 0) ? extra_blocks : n_threads;
    // The calling thread processes the final share itself.
    const size_t n_workers = threads_used - 1;

    std::vector<T> partial(n_workers);
    std::vector<std::thread> workers;
    workers.reserve(n_workers);

    T sum = T(0);
    try {
        for (size_t i = 0; i < n_workers; ++i){
            size_t share_blocks = blocks_per_thread;
            if (extra_blocks > 0){
                ++share_blocks;
                --extra_blocks;
            }
            // Worker shares never include the last block, so step <= remaining n.
            const size_t step = block_size*share_blocks;
            T* const out = &partial[i];
            workers.emplace_back([f, out, step, x, y]{ *out = f(step, x, y); });
            x += step;
            y += step;
            n -= step;
        }
        sum = f(n, x, y);
    } catch (...) {
        // Destroying a joinable std::thread calls std::terminate, so wait for
        // the workers that were already started before passing the error on.
        for (std::thread& worker : workers){
            worker.join();
        }
        throw;
    }

    for (std::thread& worker : workers){
        worker.join();
    }
    for (const T& part : partial){
        sum += part;
    }
    return sum;
}
This adds two new parameters that affect performance: the number of utilized threads n_threads
and the minimum number of elements block_size
each thread works on. For ease
of comparison, a fixed value is used: block_size = 50000
. In a more elaborate performance optimization this value would be chosen more carefully to balance the setup cost of each thread and to minimize false sharing at the margins of a thread's computation range.
Now the optimal number of threads for this system can be found:
{
"data": [
{
"line": {
"color": "green",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (40 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.271487588391556e-05,0.00013822776105019475,0.00026914859296932087,0.0005323888034827205,0.0010591796925548877,0.002124891458697054,0.004236272190061523,0.00851838832351596,0.01694467451966173,0.03389904082332105,0.1524541528879493,0.19703806414824848,0.35132031635258365,0.6372972552588743,1.2317983516331612,2.3329674656190336,2.652317063332284,4.743289781129327,9.129194666941961,18.0449794846563,33.59427368023286,67.79060999397188]
},
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (2 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.447550180997315e-05,0.0001379624799275601,0.0002709494178492576,0.0005358373690030239,0.0010608918229387647,0.002123033487859581,0.004229794152445515,0.008467182957924226,0.016910834571107272,0.033775084549474865,0.15162964183362407,0.20368509745082888,0.2741872102472413,0.4919963396325522,0.8989141042802241,1.5531765863592828,2.93561841008569,5.7042808697113525,10.89219259382743,22.93221613076786,44.353312499879394,89.00738750526216]
},
{
"line": {
"color": "red",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (3 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.116077034561467e-05,0.00013753770290247233,0.0002769580968368999,0.0005455645042154761,0.0010882794339335921,0.002128034257539853,0.00422813170287632,0.008463186361849426,0.016961668003501223,0.03384082025287133,0.15319223822859623,0.19616602404632733,0.25598458454804357,0.394032037750234,0.654403046807183,1.2711733717906692,2.4759583916007117,4.831588571791404,9.92185416624933,20.593913889671157,40.51394000028571,83.96412221352674]
},
{
"line": {
"color": "green",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (4 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.595968955649377e-05,0.00013912179010711366,0.000269185308658608,0.000533078140549063,0.0010887257972547815,0.0021376322142733916,0.004209988687041789,0.008455138846103088,0.017193472218890758,0.034075246574722336,0.15240848087323983,0.19726975818309048,0.27790614282121673,0.3768042941196305,0.5811207215271539,1.190213293681616,2.4172086642705417,4.690310457459608,9.00091351374883,17.93744857090392,37.15378333840312,74.86507777745526]
},
{
"line": {
"color": "blue",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (5 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.296779817561047e-05,0.00013839927716106088,0.00027773955132505835,0.000549007739621939,0.0010875294955908755,0.0021992816173152753,0.00439414173517782,0.008660403550089438,0.017393440648233144,0.034730181690372425,0.160853176755308,0.20959480033282601,0.30774863077082437,0.39066513002647557,0.6722231161289387,1.2358834000770003,2.5162378090759576,4.674005442533363,9.301768830529861,19.532875757607997,37.31512999511324,71.40033332527511]
},
{
"line": {
"color": "#17becf",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (6 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.861030756945898e-05,0.0001420940793864,0.00028016673127448464,0.0005537112433743328,0.0010874314671940602,0.0022258737379456986,0.004373437421661935,0.008730228606676643,0.017326186535335786,0.03385894793166394,0.15601590755806316,0.19840189328976082,0.34088269328963283,0.41768712989051804,0.623433461609798,1.1999411992566584,2.3124739862003088,4.467150993608185,9.084432467503788,18.325307896654856,34.65606842042976,69.1190000041388]
},
{
"line": {
"color": "yellow",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (7 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.531862792356786e-05,0.00014116765885857475,0.0002724830957772748,0.0005394688951769208,0.0010828714384449778,0.0021269004629498665,0.004294519650693686,0.008538672579991071,0.017035384541373346,0.03469674546191848,0.15804207785857774,0.1991886028557658,0.34654676436076526,0.4392760545367759,0.657842461777787,1.1922140728716202,2.3630059646900023,4.664486486502495,9.012212821592888,17.891684846924335,37.389185000211,70.78936666301969]
},
{
"line": {
"color": "purple",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (8 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.43515137055223e-05,0.00013973961160072444,0.0002795723390339725,0.0005518897075474308,0.0010653604025282464,0.002129653199974066,0.004237007312414207,0.008533011136375338,0.01695725222620299,0.03387210926915797,0.15932770721834558,0.20422747577894987,0.3422397217751202,0.4860280847626708,0.6676610010288602,1.2732489680168944,2.343447000021115,4.392943589780957,8.945141024947262,17.73906874950626,37.134114996297285,72.43826000485569]
},
{
"line": {
"color": "blue",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (9 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.25063051482957e-05,0.0001393148314351734,0.00027121944496019196,0.0005349733578627887,0.0010725003899037416,0.002126066838891975,0.004278974107747945,0.008583665193205407,0.017012347893894792,0.03426818839679129,0.1539781366719882,0.2004174666439025,0.3474718843612742,0.5417024596113937,0.6858441699618334,1.2841059730759676,2.603055094368756,5.0933977770101695,9.914974284557891,19.741968571075372,38.60369444333224,74.194290000014]
},
{
"line": {
"color": "#9467bd",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (10 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.345330971457533e-05,0.00014132727782226005,0.0002735032199936989,0.000537497673491867,0.0010690759288776567,0.002178548651438258,0.004285901192994893,0.008467326477358785,0.017077787833614324,0.03398881997010457,0.15283788046291624,0.19777536397795065,0.34204511951926014,0.586888093157407,0.6912174692777927,1.2527679434917387,2.520434999938256,4.74662608736991,9.236508107278496,17.93225714271622,39.21251579212319,72.24953999975696]
},
{
"line": {
"color": "#9167bd",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (11 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.501213500660577e-05,0.0001394733266918083,0.00027411349143465543,0.0005361213280767881,0.0010654616780280688,0.002119538484171784,0.004249666096356249,0.008791300090157614,0.017362876138124393,0.03487186174846981,0.15152671773809964,0.19704817295479723,0.34679699026825794,0.6643658191037562,0.7528513952883956,1.455091100853521,2.485793385136481,4.622906993527512,9.298913157851386,18.391754285299353,37.60640526582536,70.51468999125063]
},
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (12 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.23418906161151e-05,0.00013821661925554033,0.0002791487913946083,0.000535335836949355,0.0011141985704205138,0.002178994079226594,0.004308794539464287,0.008658391935269953,0.017332263687932087,0.034638879969467753,0.15259551240016728,0.19729549674405839,0.3407587828243659,0.6434382105468385,0.7592327333620097,1.261864232260399,2.38585071430342,4.474367785436295,9.003935065785688,17.937246151865484,36.184959998354316,67.70612999098375]
},
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (20 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.534769225350245e-05,0.0001383309724821705,0.0002713367635307779,0.0005335449094135558,0.0010621929831400713,0.0021113965692554047,0.004227263447441641,0.008454349069403894,0.0169742222678221,0.03371277151177071,0.1523271068774057,0.1969638423029525,0.34051803121076385,0.6458522598899326,1.1834141652651042,1.316817796501031,2.433856159188798,4.57035100646942,8.961021333622435,17.031239474339312,36.120150005444884,69.97020998969674]
},
{
"line": {
"color": "green",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (40 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.329174340692232e-05,0.0001388385273581635,0.00027279632978710665,0.0005347428704489033,0.0010701517014004834,0.002128110451795406,0.004220366218929466,0.008497363781726448,0.01708868295348329,0.034681987576647616,0.15516268027525223,0.20085049699092625,0.3421172858185885,0.644367023331298,1.238985461057395,2.34596040966021,2.806991902896418,4.904358954439913,9.206171999685466,18.026430768748888,34.370109997689724,68.16053636033426]
},
{
"line": {
"color": "#17becf",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (60 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.290037748232034e-05,0.0001391916969758417,0.00027159550251471025,0.0005338624023792294,0.0010590533013063825,0.002117661132453418,0.0042533944440281465,0.008453735837140498,0.01709127957608184,0.0340481966593474,0.15405424507803547,0.19768529157131295,0.3464199108343319,0.6464298861366352,1.238909927835614,2.4493321169670805,3.7739052912762396,5.433365000256647,10.417498530039344,18.722584617371933,35.47476999810897,68.37642000755295]
},
{
"line": {
"color": "purple",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (80 threads) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[7.354598900844257e-05,0.0001383035843658785,0.0002703884404915811,0.000534547603809233,0.001061822413950734,0.002116490937964621,0.004216066203146666,0.008427127012921616,0.016945408394253923,0.03390504091763947,0.15671164659610362,0.19864209441352734,0.3418592391480423,0.636183333302721,1.2414279857022876,2.4475358159888985,4.746997793999446,5.646201980180375,9.359279451317677,18.203799997886215,34.66702104962774,68.51051999256015]
}
],
"layout": {
"title": "Running Time AVX (multithreaded)",
"xaxis":{
"title":"n"
},
"yaxis":{
"title":"Running time [ms]"
}
},
"frames": []
}
It must be kept in mind that with the current block_size
a thread is only launched if it can be fed with a minimum of 50000 elements. For example, to utilize 40 threads a minimum size of 2 million elements is needed.
The plot shows that all the results are close. The nonconstant overhead of launching threads contributes to the variation observed.
{
"data": [
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (2 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.6169245531275917e-05,2.8601983665557976e-05,3.5721453221084454e-05,5.1360832177131515e-05,8.070514933823096e-05,0.00013869601732202058,0.00027753480231546603,0.0006762094866866764,0.0013694544473247475,0.0027954719551144306,0.10893554376232785,0.1162720880283796,0.12546290847937494,0.15280140908378745,0.26952668796939,0.6143143372207114,1.2373375679330796,2.327538028188889,4.6681108108935625,9.858731945213448,18.590936841639248,36.31524210428133]
},
{
"line": {
"color": "red",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (3 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.572520859776509e-05,2.886633376574882e-05,3.607349507801198e-05,5.1173184433656145e-05,8.177000035150397e-05,0.00014541219227109795,0.00027776137513041063,0.0006872912522146637,0.0013859120115826003,0.002789610935016209,0.11211523720426962,0.16585472971550538,0.1696766970543287,0.18386204768139425,0.25537116609319604,0.5035426311208855,1.133792072131827,2.3465130719344063,4.628613245032037,9.138917333136002,17.382851282421214,34.39147999743]
},
{
"line": {
"color": "green",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (4 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.5904304399778077e-05,2.8682060050155323e-05,3.578214552570864e-05,5.0703831272336284e-05,8.080007507964581e-05,0.00013924398558915488,0.0002759537921778195,0.0006843579395228586,0.0013706589231799898,0.0027847615149729004,0.10488089611683797,0.16071314704713235,0.21755597624812364,0.2230269958391432,0.2717380692878555,0.5527818479361234,1.0609052538702137,2.2167645902449236,4.1675224994833115,8.17848915649376,16.936025000177324,32.851309526623005]
},
{
"line": {
"color": "blue",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (5 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.5625802600137436e-05,2.862340499270607e-05,3.5605045862537125e-05,5.0818357293678666e-05,8.034918995946467e-05,0.0001395330209765532,0.0002732497649195739,0.000682368824157644,0.0013710108347749727,0.0027897151959028342,0.10508965152734602,0.1637334749861106,0.2725963448579462,0.2788047332296464,0.3028115646199845,0.5686723511557045,1.137496092353433,2.274163879522661,4.391518239160332,8.868280000751838,16.751945946944524,34.34173809364438]
},
{
"line": {
"color": "#17becf",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (6 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.499230418793224e-05,2.8927260572805855e-05,3.584365363642168e-05,5.080118936121756e-05,7.98256952826651e-05,0.00013890002760277232,0.0002722825146976473,0.0006855120479763115,0.0013689185669673593,0.0028146283876897796,0.10592092811176596,0.1611571920768577,0.32435389195744546,0.33749301988501473,0.3440527680871121,0.6023674087063181,1.1159104869739338,2.241071717349518,4.340595757645188,8.67012317048204,17.15386578947992,33.147785718375374]
},
{
"line": {
"color": "yellow",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (7 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.4804911625060862e-05,2.8589800448116345e-05,3.5686789382178306e-05,5.1161416651348846e-05,8.037297436934055e-05,0.000138599353779782,0.00027539622061219276,0.0006829910320473438,0.0014024966612119637,0.0027965169869547143,0.1053861980322954,0.1610168466970352,0.32349802754443446,0.3905762359761539,0.3939497679890092,0.5784714966257759,1.1220890971129123,2.195839721462868,4.248368125263369,8.618212658676166,17.01607500144746,32.98713636203584]
},
{
"line": {
"color": "purple",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (8 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.508548948132523e-05,2.9032686458505886e-05,3.596109629218744e-05,5.0633739571342184e-05,8.11728705232222e-05,0.00014320776870649875,0.0002738786183150105,0.0006783062816799984,0.0013982995863522454,0.0028218554142827677,0.10464491768956878,0.16736430220362727,0.3203605018754327,0.4404363352158867,0.45413879642475313,0.6032935088386008,1.1231949626304794,2.201789137594902,4.230653703281725,8.57948148076963,17.08706249773968,33.43774285721814]
},
{
"line": {
"color": "blue",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (9 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.526174400290239e-05,2.859872630636565e-05,3.5971680147320685e-05,5.0612948381366524e-05,8.004061527965222e-05,0.00014000757253871743,0.0002742738975768157,0.0006834325881326349,0.0013555153099329479,0.002802464558118242,0.10439360108635186,0.16011286535492153,0.3184270169184059,0.49590215616648303,0.5155462605962006,0.635721548521238,1.1406023932907443,2.2920963085637798,4.447206579099753,8.802117105867518,17.19787073173992,34.81305999448523]
},
{
"line": {
"color": "#9467bd",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (10 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.553562600540724e-05,2.8739238760865256e-05,3.587155559924309e-05,5.1390611911340995e-05,8.184444155828616e-05,0.0001413714756241849,0.00027480324721016467,0.0007045181071407668,0.001341624935235117,0.002786923639826425,0.10529914959252515,0.16169486113573872,0.3247858785956467,0.5403225181610763,0.5739124166818025,0.6418371320359855,1.1902158620269518,2.233735517325715,4.517405659967811,8.628808973858563,16.714679489198787,33.473409997532144]
},
{
"line": {
"color": "#9167bd",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (11 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.6526642357215145e-05,2.86311145046723e-05,3.6326395242294484e-05,5.092127268478263e-05,8.064682369039051e-05,0.00013915468652117775,0.0002760397693055264,0.0006980537234806364,0.001350929658382658,0.0027993412642989294,0.10506310287434138,0.1612631990965477,0.3204815981688497,0.5995288028521757,0.6263209899544279,0.6743242740241701,1.1620059035042025,2.303593286457916,4.5100700638144255,8.929283117766602,16.744207692308684,33.48244761582464]
},
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (12 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.5420970344506098e-05,2.867206681034504e-05,3.5859124521095234e-05,5.103538742931054e-05,7.985770114772834e-05,0.00014273456739125818,0.0002737346941015183,0.0006836917210436769,0.001376620710622528,0.00277739871362473,0.10423911533657021,0.16042158425156353,0.32178897024888875,0.6023831859198028,0.6771087944424774,0.7284782506551937,1.1883648149065535,2.257228000089526,4.457274341563645,8.589246153580742,16.851400000242784,32.805628570107125]
},
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (20 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.499687130069029e-05,2.8773130251799052e-05,3.5647182797731155e-05,5.0912422376397987e-05,8.030489955161196e-05,0.00013936155000650422,0.0002759341395607887,0.0006839886695954352,0.0014080289999135601,0.0027841723710656293,0.10530124337238014,0.1624867413649518,0.31820719590593916,0.6086215314975528,1.1343622735135344,1.1129267311290554,1.3199237355118045,2.323218835406133,4.401130215666867,8.857360000547487,17.371981080646652,34.42541499971412]
},
{
"line": {
"color": "green",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (40 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.5578665069123974e-05,2.8849624702707564e-05,3.5920514451223376e-05,5.078794809355062e-05,8.048568892754088e-05,0.00013861294492312838,0.00027229196440635444,0.0007048510400108702,0.0014574511183396534,0.0031308870504324665,0.10572568322632422,0.15986238825816934,0.32314274530645026,0.5991410301315068,1.1909922414282659,2.362022297126493,2.239771383893588,2.6856902041187394,4.545555263144993,8.851719736795578,16.87737749889493,33.271238091401756]
},
{
"line": {
"color": "#17becf",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (60 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.4925700585447764e-05,2.8547963625300075e-05,3.570083006375702e-05,5.050128881388961e-05,8.126372431653466e-05,0.00013877367182470632,0.00027828676594277105,0.0006854514250483735,0.0013527713402891341,0.002843461696092688,0.10515926711104454,0.16095879363825472,0.32063542452271815,0.600289450405192,1.1995621761451885,2.4618373664255087,3.5131405941026266,3.5476624342322194,4.973739814616878,8.967606756659979,18.328102497616783,33.627680951862466]
},
{
"line": {
"color": "purple",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (80 threads) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[2.5257645949679405e-05,2.860006862218213e-05,3.96954652729883e-05,5.088837981244243e-05,7.991138629619787e-05,0.0001390593184520973,0.0002708205341581401,0.0007114435046414013,0.0013513832181905811,0.002798686819980638,0.10295567849920916,0.15998016395445966,0.321908651600822,0.606404458642094,1.2302073944379694,2.467808741066683,4.779308029096981,4.514512820885732,5.478944791927158,9.282402740472493,18.083270732881275,33.26653809996233]
}
],
"layout": {
"title": "Running Time AVX (multithreaded)",
"xaxis":{
"title":"n"
},
"yaxis":{
"title":"Running time [ms]"
}
},
"frames": []
}
In the float benchmark, all the results are well within the margin of variation, too.
{
"data": [
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (12 threads) (double)",
"type": "scatter",
"x":[128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728,268435456],
"y":[14.155007441454547, 14.817320891155477, 14.673178341688903, 15.302543626973732, 14.704739742949458, 15.038131728944661, 15.209822468849515, 15.13814585663168, 15.124625653054347, 15.135824266319545, 6.871604436506684, 10.62949755371523, 12.308718693134406, 13.037161707991787, 22.097593086783387, 26.59115865412349, 28.127855442788384, 29.99702626969286, 29.81312659839536, 29.93050925736252, 29.673704877629643, 31.717713717886028]
},
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"opacity":0.5,
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (12 threads) (float)",
"type": "scatter",
"x":[128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728,268435456],
"y":[20.140851944727277, 35.71420249448272, 57.11238150265495, 80.25803675289812, 102.58246709162917, 114.78649005246815, 119.7071496821208, 95.85606784876265, 95.21286363672921, 94.38472003102532, 5.029666630488604, 6.536377289203715, 6.517165577110833, 6.962850388321431, 12.388862866427647, 23.03049677174379, 28.235800638912835, 29.730653703276026, 30.112063497737367, 31.252504725119895, 31.859128143196717, 32.73041459045251]
} ],
"layout": {
"title": "Throughput AVX (multithreaded)",
"xaxis":{
"title":"2n"
},
"yaxis":{
"title":"GB/s"
}
},
"frames": []
}
While the throughput for smaller array sizes remains qualitatively similar, albeit with different peaks, large arrays benefit from multithreading, reaching 31.71 GB/s at 268M double-precision numbers. This is roughly 1.29 times the 24.66 GB/s that was reached in a single-threaded
environment. Nevertheless, the program is memory-bandwidth starved, and the theoretical wall of 42.656 GB/s on this computer cannot be exceeded for large arrays.
The AVX code could be optimized even further via loop unrolling techniques (benefit questionable when tested), 256-bit-wide fused multiply-add instructions (no benefit when tested), or even AVX-512 (which this processor does not support).
Special Case Optimization: x = y
If both input vectors $x$ and $y$ are the same (as often happens, e.g., during normalization operations), all registers can be dedicated to reducing a single vector:
// Computes the sum of squares of a[0..n), i.e. the dot product of the vector
// with itself, using AVX intrinsics.
//
// The bulk of the input is consumed 32 doubles per iteration with eight
// independent 256-bit accumulators (four doubles each); the remaining n % 32
// elements are added afterwards by a scalar loop.
//
// n: number of doubles to read starting at a.
// a: pointer to the first element. Only unaligned loads are used, so no
//    particular alignment is required.
// Returns the accumulated sum (0.0 when n == 0).
inline double dotproduct_sum_of_squares<0,0>(size_t n, double* a){
    // Largest multiple of 32 that does not exceed n.
    const size_t blocks_end = n - (n & 31);
    double acc = 0.0;
    size_t i = 0;

    // Enter the vector path whenever at least one full 32-element block
    // exists. Guarding on `blocks_end != n` instead would skip it for every n
    // that is an exact multiple of 32 (e.g. all power-of-two sizes >= 32) and
    // leave the whole array to the scalar loop below.
    if (blocks_end != 0){
        __m256d sum1 = _mm256_setzero_pd();
        __m256d sum2 = _mm256_setzero_pd();
        __m256d sum3 = _mm256_setzero_pd();
        __m256d sum4 = _mm256_setzero_pd();
        __m256d sum5 = _mm256_setzero_pd();
        __m256d sum6 = _mm256_setzero_pd();
        __m256d sum7 = _mm256_setzero_pd();
        __m256d sum8 = _mm256_setzero_pd();

        for (; i < blocks_end; i += 32){
            // Eight unaligned loads covering a[0..32).
            const __m256d x = _mm256_loadu_pd(a);
            const __m256d y = _mm256_loadu_pd(a + 4);
            const __m256d z = _mm256_loadu_pd(a + 8);
            const __m256d w = _mm256_loadu_pd(a + 12);
            const __m256d k = _mm256_loadu_pd(a + 16);
            const __m256d r = _mm256_loadu_pd(a + 20);
            const __m256d p = _mm256_loadu_pd(a + 24);
            const __m256d m = _mm256_loadu_pd(a + 28);

            // Square each lane and add it to its own accumulator, so the
            // eight additions do not depend on each other.
            sum1 = _mm256_add_pd(sum1, _mm256_mul_pd(x, x));
            sum2 = _mm256_add_pd(sum2, _mm256_mul_pd(y, y));
            sum3 = _mm256_add_pd(sum3, _mm256_mul_pd(z, z));
            sum4 = _mm256_add_pd(sum4, _mm256_mul_pd(w, w));
            sum5 = _mm256_add_pd(sum5, _mm256_mul_pd(k, k));
            sum6 = _mm256_add_pd(sum6, _mm256_mul_pd(r, r));
            sum7 = _mm256_add_pd(sum7, _mm256_mul_pd(p, p));
            sum8 = _mm256_add_pd(sum8, _mm256_mul_pd(m, m));

            a += 32;
        }

        // Pairwise tree reduction of the eight accumulators into one.
        const __m256d sum12 = _mm256_add_pd(sum1, sum2);
        const __m256d sum34 = _mm256_add_pd(sum3, sum4);
        const __m256d sum56 = _mm256_add_pd(sum5, sum6);
        const __m256d sum78 = _mm256_add_pd(sum7, sum8);
        const __m256d sum1234 = _mm256_add_pd(sum12, sum34);
        const __m256d sum5678 = _mm256_add_pd(sum56, sum78);
        const __m256d total = _mm256_add_pd(sum1234, sum5678);

        // Horizontal sum: fold the upper 128 bits onto the lower 128 bits,
        // then add the two remaining doubles to each other.
        const __m128d total_low = _mm256_castpd256_pd128(total);
        const __m128d total_high = _mm256_extractf128_pd(total, 1);
        const __m128d pair = _mm_add_pd(total_low, total_high);
        const __m128d swapped = _mm_shuffle_pd(pair, pair, 0b01);
        const __m128d result = _mm_add_pd(pair, swapped); // same value in both halves
        acc += _mm_cvtsd_f64(result);
    }

    // Scalar tail: the remaining n % 32 elements (a already points at them).
    while (i < n){
        acc += (*a) * (*a);
        a++;
        i++;
    }
    return acc;
}
{
"data": [
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[1.949637302896928e-05,2.4065289909985603e-05,3.076171923065186e-05,5e-05,8.579798724116129e-05,0.0001649693190827669,0.0006626674107142857,0.0013113839285714285,0.0026785714285714286,0.00816123588733979,0.01708984375,0.03278459821428571,0.068359375,0.22529686174724342,0.5929129464285714,1.4997209821428572,2.5820974576271185,5.440848214285714,10.25390625,22.235576923076923,42.96875,87.05357142857143]
},
{
"line": {
"color": "blue",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[1.6113281142098564e-05,1.9042969005039764e-05,2.566964285714286e-05,3.8609095034151686e-05,7.114955357142857e-05,0.0001224190793562911,0.00027273996753303425,0.0006103515625,0.0013950892857142857,0.002825054542386365,0.008579760804639265,0.017996812471539926,0.03606241406403457,0.08196149553571429,0.22879464285714285,0.6138392857142857,1.5373995983935742,2.913135593220339,5.301339285714286,11.25,22.916666666666668,46.875]
},
{
"line": {
"color": "green",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (square) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[1.7264229795105605e-05,1.8589564981110244e-05,2.668108276795368e-05,5e-05,8.998325491146183e-05,0.00017647880646063437,0.0003529575262575846,0.0007149832589285714,0.0014125272711931825,0.0029157424646612013,0.008719308035714286,0.016497041274829965,0.033691932686448225,0.068359375,0.2431214421252372,0.6417410714285714,1.4118975903614457,2.979343220338983,6.25,13.392857142857142,26.442307692307693,53.125]
},
{
"line": {
"color": "green",
"shape": "linear",
"width": 3
},
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (square) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[1.799665194639868e-05,1.9252232314752076e-05,2.3542131433681568e-05,4.0806362153656634e-05,6.696428571428572e-05,0.00011997767857142857,0.0002301897064520417,0.0004551477575497078,0.0009626111774057244,0.0019670776491764724,0.003923688616071429,0.01025390625,0.02085630549590854,0.04185193121551401,0.07742399892861926,0.2849342663273961,0.7393973214285714,1.5694754464285714,3.374413145539906,6.8359375,13.671875,27.5]
}
],
"layout": {
"title": "Running Time AVX Sum of Squares",
"xaxis":{
"title":"n"
},
"yaxis":{
"title":"Running time [ms]"
}
},
"frames": []
}
{
"data": [
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"mode": "lines+markers",
"name": "AVX (square) (double)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[29.656695148088893, 55.08466717970732, 76.75850406115556, 81.92, 91.03916065339536, 92.83834319026087, 92.8383659853913, 91.66088741463416, 92.79254473386668, 89.90643144145454, 60.129542144, 63.56145823553489, 62.24493024834782, 61.35667565714286, 34.503776905365854, 26.143279193043483, 23.765485704533333, 22.524717374577776, 21.47483648, 20.04318071466667, 20.30348176290909, 20.211610804705884] },
{
"line": {
"color": "gray",
"shape": "linear",
"width": 3
},
"opacity":0.5,
"marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}},
"mode": "lines+markers",
"name": "AVX (square) (float)",
"type": "scatter",
"x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728],
"y":[14.224868090046511, 26.594318603130432, 43.49648641137778, 50.1882523192889, 61.16693333333333, 68.27936744186046, 71.17607582254546, 71.99420288568889, 68.0814866253913, 66.63285511625533, 66.81060238222221, 51.13056304761904, 50.276210242782604, 50.10884657152, 54.173177025729736, 29.440502569674422, 22.690393261886797, 21.379392762311113, 19.88756595756522, 19.634136210285714, 19.634136210285714, 19.52257861818182]}
],
"layout": {
"title": "Throughput AVX (square)",
"xaxis":{
"title":"n"
},
"yaxis":{
"title":"GB/s"
}
},
"frames": []
}
At first glance, while the running time is indeed lower, it has not consistently halved, which leads to a slight decrease in throughput as well. This shows that looking only at throughput is not enough to judge an optimization's benefit, as this optimization is still well worth it.