On the surface, calculating a dot product is almost embarrassingly parallel. But there are nuances to consider. Because the per-element operation is only a single multiply-add, the cost of transferring the data from the CPU host to the GPU device can quickly become prohibitive.

Blocked Multistream Version

On this system the host-device transfers go through a PCI-Express 3.0 x16 interface, which has a theoretical bandwidth of 15.5 GB/s. The maximum that has been achieved on this system in NVIDIA's publicly available bandwidthtest.cu test is 12.6 GB/s for host-to-device and 11.8 GB/s for device-to-host transfers.

Optimizing memory transfers is therefore paramount for the GPU dot product to even have a chance of running competitively against the CPU dot products. Since the multithreaded AVX (double) throughput is around 360 GB/s even at array sizes of 268M elements, there is no hope for the GPU version to compete there. The single-threaded AVX bandwidth is much lower, however, having already dropped to only 28 GB/s at 2M elements.
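
A quick back-of-the-envelope check with these numbers makes the gap concrete: for 268M doubles, the two input arrays together amount to 2 * 268*10^6 * 8 B ≈ 4.3 GB, which at the measured 12.6 GB/s host-to-device rate takes roughly 340 ms just to move across PCIe, whereas the multithreaded AVX version streams the same data out of RAM at ~360 GB/s in about 12 ms.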

The approach will be roughly the following:

  1. Split the input n into n = n_cpu + n_gpu, where n_gpu is the largest power of two contained in n (a minimal sketch of this split follows the list).

  2. Reduce the first n_gpu elements of the input arrays x, y in parallel on the GPU, i.e. compute their partial dot product there.

  3. Compute the dot product of the remaining n_cpu elements on the CPU via dotproduct_avx.

  4. Add the results from 2 and 3 together.
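
A minimal host-side sketch of the split in step 1, assuming a floor-log2 helper in the spirit of the dotprod::log2_int function used in the full listing further below (the names log2_floor and split_work here are illustrative only):

    // Illustrative only: split n into a power-of-two GPU part and the CPU remainder.
    inline size_t log2_floor(size_t n){      // floor(log2(n)), assumes n > 0
        size_t l = 0;
        while (n >>= 1) ++l;
        return l;
    }

    inline void split_work(size_t n, size_t& n_gpu, size_t& n_cpu){
        n_gpu = (n == 0) ? 0 : (size_t(1) << log2_floor(n)); // largest power of two <= n
        n_cpu = n - n_gpu;                                    // remainder handled by dotproduct_avx
    }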

To overlap memory transfers and computation as much as possible, asynchronous memory transfers and kernel launches via streams are helpful facilities. The computation via dotproduct_avx on the host then happens in parallel with the kernel computations on the GPU. This again introduces a number of parameters whose optimal values depend on the machine:

  1. MAX_STREAMS, which determines the maximum number of streams
  2. MIN_N_PER_STREAM_SIZE, which determines the minimum number of elements needed in order to create a new stream
  3. MAX_BLOCKS_PER_STREAM, the maximum number of blocks a kernel within a stream is launched with
  4. THREADS_PER_BLOCK, the number of threads per block each kernel is launched with

These values are set to the following:

  1. MAX_STREAMS = 64
  2. MIN_N_PER_STREAM_SIZE = 1<<12
  3. MAX_BLOCKS_PER_STREAM = 32
  4. THREADS_PER_BLOCK = 1024
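
To make these constants concrete, consider a hypothetical input of n = 2^24 doubles and follow the index arithmetic of the reduce function listed below: streams_needed = min(2^24/2^12, 64) = 64, n_per_stream = 2^24/64 = 262144, n_vectorized_per_stream = 262144/4 = 65536 double4 elements, and blocks_used = min(ceil(65536/1024), 32) = 32, so each stream launches 32 blocks of 1024 threads and every thread accumulates two double4 fused multiply-adds in its grid-stride loop (and n_cpu = 0, since 2^24 is already a power of two).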

Strictly speaking, the astute reader will immediately notice that this probably won't help much on this machine, but we will nevertheless throw the kitchen sink at the problem.

This leads to the following attempt:

    template<class T>
    struct VECTORIZED_CVT;

    template<>
    struct VECTORIZED_CVT<double>{
        using type=double4;
    };

    template<>
    struct VECTORIZED_CVT<float>{
        using type=float4;
    };

    namespace helper{



        template<class T>
        __device__ T get_zero_vector();

        template<>
        __device__ double4 get_zero_vector() { 
            return make_double4(0.0, 0.0, 0.0, 0.0); 
        }

        template<>
        __device__ float4 get_zero_vector() { 
            return make_float4(0.0, 0.0, 0.0, 0.0); 
        }

        __device__
        __forceinline__ void fma(double4* x, double4* y, double4& value){
            value.x=::fma((*x).x,(*y).x,value.x); //vectorized fma would be better
            value.y=::fma((*x).y,(*y).y,value.y); //vectorized fma would be better
            value.z=::fma((*x).z,(*y).z,value.z); //vectorized fma would be better
            value.w=::fma((*x).w,(*y).w,value.w); //vectorized fma would be better
        }

        __device__
        __forceinline__ void fma(float4* x, float4* y, float4& value){
            value.x=::fma((*x).x,(*y).x,value.x); //vectorized fma would be better
            value.y=::fma((*x).y,(*y).y,value.y); //vectorized fma would be better
            value.z=::fma((*x).z,(*y).z,value.z); //vectorized fma would be better
            value.w=::fma((*x).w,(*y).w,value.w); //vectorized fma would be better
        }

        template<int OFFSET>
        __device__
        __forceinline__ void shared_memory_add_assignment_of_vector(double4* sdata){
                sdata[threadIdx.x].x+=sdata[threadIdx.x+OFFSET].x;
                sdata[threadIdx.x].y+=sdata[threadIdx.x+OFFSET].y;
                sdata[threadIdx.x].z+=sdata[threadIdx.x+OFFSET].z;
                sdata[threadIdx.x].w+=sdata[threadIdx.x+OFFSET].w;
        }

        template<int OFFSET>
        __device__
        __forceinline__ void shared_memory_add_assignment_of_vector(float4* sdata){
                sdata[threadIdx.x].x+=sdata[threadIdx.x+OFFSET].x;
                sdata[threadIdx.x].y+=sdata[threadIdx.x+OFFSET].y;
                sdata[threadIdx.x].z+=sdata[threadIdx.x+OFFSET].z;
                sdata[threadIdx.x].w+=sdata[threadIdx.x+OFFSET].w;
        }


        // executed by a single warp: __syncwarp() (rather than __syncthreads(), which must not
        // be called from this divergent branch) keeps the warp's shared-memory accesses ordered
        template<size_t THREADS_PER_BLOCK, typename VECTORIZED_T>
        __device__
        __forceinline__ void reduce_warp(VECTORIZED_T* sdata){
            if(THREADS_PER_BLOCK>=64){
                shared_memory_add_assignment_of_vector<32>(sdata);
                __syncwarp();
            }
            if(THREADS_PER_BLOCK>=32){
                shared_memory_add_assignment_of_vector<16>(sdata);
                __syncwarp();
            }
            if(THREADS_PER_BLOCK>=16){
                shared_memory_add_assignment_of_vector<8>(sdata);
                __syncwarp();
            }
            if(THREADS_PER_BLOCK>=8){
                shared_memory_add_assignment_of_vector<4>(sdata);
                __syncwarp();
            }
            if(THREADS_PER_BLOCK>=4){
                shared_memory_add_assignment_of_vector<2>(sdata);
                __syncwarp();
            }
            if(THREADS_PER_BLOCK>=2){
                shared_memory_add_assignment_of_vector<1>(sdata);
                __syncwarp();
            }
        }

        double vectorized_type_to_scalar_sum(double4& x){
            return x.x+x.y+x.z+x.w;
        }

        float vectorized_type_to_scalar_sum(float4& x){
            return x.x+x.y+x.z+x.w;
        }
    }

    template<size_t THREADS_PER_BLOCK, typename VECTORIZED_T>
    __global__
    void reduce_k(size_t n, VECTORIZED_T* x, VECTORIZED_T* y, VECTORIZED_T* r){
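        // at least 64 shared entries so the warp-level reduction can read sdata[threadIdx.x+OFFSET] past blockDim.x for small blocks (zero-initialized below)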
        constexpr size_t memsize=(THREADS_PER_BLOCK<=64)?64:THREADS_PER_BLOCK;
        using T=decltype(x[0].x);
        static __shared__ VECTORIZED_T sdata[memsize];
        VECTORIZED_T value = helper::get_zero_vector<VECTORIZED_T>();
        sdata[threadIdx.x] = helper::get_zero_vector<VECTORIZED_T>();
        const int stride = gridDim.x*blockDim.x;

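        // grid-stride loop: every thread accumulates its own vectorized multiply-add partial sum in registers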
        for (int i=blockDim.x*blockIdx.x+threadIdx.x; i<n; i+=stride){
            helper::fma(x+i,y+i,value); 
        }
        sdata[threadIdx.x]=value;
        __syncthreads();

        if constexpr (THREADS_PER_BLOCK>=1024){
            if (threadIdx.x < 512) { 
                helper::shared_memory_add_assignment_of_vector<512>(sdata);
            }
            __syncthreads(); 
        }
        if constexpr (THREADS_PER_BLOCK>=512){
            if (threadIdx.x < 256) { 
                helper::shared_memory_add_assignment_of_vector<256>(sdata);
            }
            __syncthreads(); 
        }

        if constexpr (THREADS_PER_BLOCK>=256){
            if (threadIdx.x < 128) { 
                helper::shared_memory_add_assignment_of_vector<128>(sdata);
            }
            __syncthreads(); 
        }

        if constexpr (THREADS_PER_BLOCK>=128){
            if (threadIdx.x < 64) { 
                helper::shared_memory_add_assignment_of_vector<64>(sdata);
            }
            __syncthreads(); 
        }       
        if (threadIdx.x<32){
            helper::reduce_warp<THREADS_PER_BLOCK,VECTORIZED_T>(sdata);
            if (threadIdx.x == 0){ // the if is not strictly needed because all 32 threads write the same value
                r[blockIdx.x]=sdata[0];
            }
        }
    }

    template<size_t MAX_STREAMS, size_t MIN_N_PER_STREAM_SIZE, size_t MAX_BLOCKS_PER_STREAM, size_t THREADS_PER_BLOCK, typename T> requires dotprod::IsPositivePowerOfTwo<MAX_BLOCKS_PER_STREAM> && dotprod::IsPositivePowerOfTwo<THREADS_PER_BLOCK>
    __host__
    T reduce(size_t n, T* x_pinned, T* y_pinned){
        if (n == 0){
            return T(0);
        }
        using VECTORIZED_T=typename VECTORIZED_CVT<T>::type;
        int VECTORIZED_ELEMENTS=sizeof(VECTORIZED_T)/sizeof(T);

        T res=T(0);
        int n_power_of_two=1u<<dotprod::log2_int(n); 
        uint64_t streams_needed_no_clamp=(n_power_of_two+static_cast<long long>(MIN_N_PER_STREAM_SIZE-1LL))/MIN_N_PER_STREAM_SIZE;
        size_t streams_needed=(streams_needed_no_clamp<MAX_STREAMS)?streams_needed_no_clamp:MAX_STREAMS; // clamp to MAX_STREAMS; a power of two, so n_per_stream*streams_needed still covers the whole GPU part
        int n_per_stream=1u<<dotprod::log2_int(n_power_of_two/streams_needed);
        int n_vectorized_per_stream=n_per_stream/VECTORIZED_ELEMENTS;
        int n_gpu=n_per_stream*streams_needed;
        size_t n_cpu=n-n_gpu;
        int blocks_used=(((n_vectorized_per_stream+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK)<MAX_BLOCKS_PER_STREAM)?(n_vectorized_per_stream+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK:MAX_BLOCKS_PER_STREAM;        
        VECTORIZED_T* r_d;

        VECTORIZED_T* r_h = new VECTORIZED_T[MAX_STREAMS*MAX_BLOCKS_PER_STREAM]; 
        CHECK_CUDA_ERROR(cudaMalloc((void**)&r_d, sizeof(VECTORIZED_T)*streams_needed*blocks_used));

        cudaStream_t streams[MAX_STREAMS];
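        // each stream gets its own slice of the input; its async allocations, H2D copies and kernel launch can overlap with the other streams and with the CPU work below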
        for (size_t i=0;i<streams_needed;i++){
            size_t bytes_to_copy=sizeof(T)*n_per_stream;
            VECTORIZED_T* _x_d;
            VECTORIZED_T* _y_d;
            VECTORIZED_T* _r_d=r_d+i*blocks_used;
            cudaStreamCreate (&(streams[i]));
            CHECK_CUDA_ERROR(cudaMallocAsync((void**)&_x_d, bytes_to_copy, streams[i]));
            CHECK_CUDA_ERROR(cudaMallocAsync((void**)&_y_d, bytes_to_copy, streams[i]));
            CHECK_CUDA_ERROR(cudaMemcpyAsync (_x_d, reinterpret_cast<VECTORIZED_T*>(&(x_pinned[i*n_per_stream])), bytes_to_copy, cudaMemcpyHostToDevice, streams[i] ));
            CHECK_CUDA_ERROR(cudaMemcpyAsync (_y_d, reinterpret_cast<VECTORIZED_T*>(&(y_pinned[i*n_per_stream])), bytes_to_copy, cudaMemcpyHostToDevice, streams[i] ));
            reduce_k<THREADS_PER_BLOCK,VECTORIZED_T><<<dim3(blocks_used,1,1),dim3(THREADS_PER_BLOCK,1,1),0,streams[i]>>>(n_vectorized_per_stream,_x_d,_y_d,_r_d);
            cudaFreeAsync(_x_d,streams[i]);
            cudaFreeAsync(_y_d,streams[i]);

        }
        res+=avx::dp<0,0>(n_cpu,x_pinned+n_gpu,y_pinned+n_gpu);
        for (size_t i=0;i<streams_needed;i++){
            CHECK_CUDA_ERROR(cudaStreamSynchronize(streams[i]));
            CHECK_CUDA_ERROR(cudaStreamDestroy(streams[i]));
        }

        CHECK_CUDA_ERROR(cudaMemcpy(r_h, r_d, sizeof(VECTORIZED_T)*streams_needed*blocks_used, cudaMemcpyDeviceToHost));            
        for (int i=0;i<streams_needed*blocks_used;i++){
            res+=helper::vectorized_type_to_scalar_sum(r_h[i]);
        }
        delete[] r_h;
        cudaFree(r_d);
        return res;
    }
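
A minimal usage sketch, assuming the inputs live in pinned host memory (required for the asynchronous copies to actually overlap) and using the parameter values chosen above; the size and the fill code are placeholders:

    double *x_pinned, *y_pinned;
    size_t n = size_t(1) << 24;                      // placeholder size
    CHECK_CUDA_ERROR(cudaMallocHost((void**)&x_pinned, n*sizeof(double)));
    CHECK_CUDA_ERROR(cudaMallocHost((void**)&y_pinned, n*sizeof(double)));
    // ... fill x_pinned and y_pinned ...
    double result = reduce<64, 1<<12, 32, 1024, double>(n, x_pinned, y_pinned);
    cudaFreeHost(x_pinned);
    cudaFreeHost(y_pinned);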

While there is surely room for improvement, memory transfers are the primary bottleneck, so hyper-optimizing the computational operations is not as urgent. To ensure certain parameters are powers of two, the IsPositivePowerOfTwo template constraint described in the appendix comes in handy.
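
The exact definition lives in the appendix; for orientation, a C++20 concept along the following lines would satisfy the requires clauses used above (this sketch is an assumption, not the appendix version):

    namespace dotprod{
        template<size_t N>
        concept IsPositivePowerOfTwo = (N > 0) && ((N & (N - 1)) == 0);
    }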

{ "data": [ { "line": { "color": "gray", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "AVX (double)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[1.949637302896928e-05,2.4065289909985603e-05,3.076171923065186e-05,5e-05,8.579798724116129e-05,0.0001649693190827669,0.0006626674107142857,0.0013113839285714285,0.0026785714285714286,0.00816123588733979,0.01708984375,0.03278459821428571,0.068359375,0.22529686174724342,0.5929129464285714,1.4997209821428572,2.5820974576271185,5.440848214285714,10.25390625,22.235576923076923,42.96875,87.05357142857143] }, { "line": { "color": "blue", "shape": "linear", "width": 3 }, "marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}}, "mode": "lines+markers", "name": "AVX (float)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[1.6113281142098564e-05,1.9042969005039764e-05,2.566964285714286e-05,3.8609095034151686e-05,7.114955357142857e-05,0.0001224190793562911,0.00027273996753303425,0.0006103515625,0.0013950892857142857,0.002825054542386365,0.008579760804639265,0.017996812471539926,0.03606241406403457,0.08196149553571429,0.22879464285714285,0.6138392857142857,1.5373995983935742,2.913135593220339,5.301339285714286,11.25,22.916666666666668,46.875] }, { "line": { "color": "red", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "CUDA (double)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[2.4434156378600824,2.780104365079365,2.73213937007874,2.768647509578544,2.693645381526104,2.7936759689922477,2.7968353174603173,2.839294466403162,2.80734578313253,2.852645416666667,3.124393991416309,3.4364904040404043,4.574319607843138,4.860166666666667,4.416587837837838,5.45330081300813,8.58126582278481,13.950958333333332,24.477949999999996,46.44868666666667,90.3032875,179.03805] }, { "line": { "color": "red", "shape": "linear", "width": 3 }, "marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}}, "mode": "lines+markers", "name": "CUDA (float)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[1.9736842105263157,2.220517981072555,2.305371661237785,2.4896347826086953,2.535403157894737,2.653988235294118,2.7364905511811024,2.7392082677165353,2.856065714285714,2.965857021276596,3.135290178571428,3.3847721393034824,4.506410897435897,4.484786754966888,4.805499310344828,4.526466,5.4311,8.522645569620254,13.9450875,26.25911481481482,48.54294,91.2033125] } ], "layout": { "title": "Running Time CUDA vs. AVX", "xaxis":{ "title":"n" }, "yaxis":{ "title":"Running time [ms]" } }, "frames": [] }
{ "data": [ { "line": { "color": "red", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "PCI-Express 3.0 x16", "type": "scatter", "x":[128,268435456], "y":[15.5,15.5] }, { "line": { "color": "white", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "NVIDIA bandwidthtest.cu device-to-host", "type": "scatter", "x":[128,268435456], "y":[11.8,11.8] }, { "line": { "color": "gray", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "CUDA (double)", "type": "scatter", "x":[128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728,268435456], "y":[0.00041908547368421054, 0.00073666299212531, 0.0014991914559182803, 0.0029588454188041527, 0.00608246360577632, 0.011729348844927167, 0.023432198381816188, 0.04616358097088919, 0.09337788083500385, 0.1837901047697101, 0.3356094022971391, 0.6102598155182694, 0.9169241241491822, 1.7259918384143202, 3.7986827424252856, 6.153049895938315, 7.8203921642671705, 9.620681589974406, 10.966419001591229, 11.55836581242412, 11.890395728948407, 11.99456566914128] }, { "line": { "color": "gray", "shape": "linear", "width": 3 }, "opacity":0.5, "marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}}, "mode": "lines+markers", "name": "CUDA (float)", "type": "scatter", "x":[128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728,268435456], "y":[0.0002594133333333333, 0.00046115366267171026, 0.0008883600134567462, 0.0016452212302834712, 0.003231044330954529, 0.006173350650962591, 0.011974461225841594, 0.02392516143163961, 0.04589250147305534, 0.08838726820592492, 0.16722152341219274, 0.3097921977742867, 0.46537079013217786, 0.9352293050176401, 1.7456267201915505, 3.706471229431526, 6.178201837565134, 7.874182195164335, 9.624731863460879, 10.22256301832972, 11.05971150490679, 11.77305729986507] } ], "layout": { "title": "Throughput CUDA", "xaxis":{ "title":"2n" }, "yaxis":{ "title":"GB/s" } }, "frames": [] }

The CUDA version is even slower than the naive loop. Through clever selection of the block and stream parameters the CUDA version might come closer to parity, but due to the aforementioned bandwidth bottleneck there is little hope of it becoming significantly faster. The GPU throughput is low and clearly limited by the PCI-Express 3.0 x16 bandwidth of 15.5 GB/s. The NVIDIA bandwidthtest.cu result serves as a good practical bound, because merely copying the data to the device cannot take less time than that benchmark measures.

Note: Since this is a CPU-GPU hybrid algorithm, for some array sizes the measured throughput may be slightly greater than these limits, especially if n_cpu ≈ n_gpu.

As the double and float versions both max out at around the same throughput in GB/s, the float version can work through roughly twice as many elements in a given amount of time.

An interesting side effect is that the chosen reduction approach has different numerical error bounds than the sequential additions in the single-threaded AVX dot product from before.
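
To first order, standard summation error analysis indicates why: a strictly sequential sum of n products carries a worst-case relative error bound that grows roughly like (n-1)*u, with u the unit roundoff, whereas the bound for the blocked, tree-style reduction above grows only with the length of its longest accumulation chain (the per-thread grid-stride sums plus the log2-depth shared-memory tree), which is far shorter than n for large inputs.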

Special Case Optimization: x = y

As with the AVX version, it is possible to optimize for the sum of squares (x = y) too.

    template<size_t MAX_STREAMS, size_t MIN_N_PER_STREAM_SIZE, size_t MAX_BLOCKS_PER_STREAM, size_t THREADS_PER_BLOCK, typename T> requires dotprod::IsPositivePowerOfTwo<MAX_BLOCKS_PER_STREAM> && dotprod::IsPositivePowerOfTwo<THREADS_PER_BLOCK>
    __host__
    T reduce(size_t n, T* x_pinned){
        if (n == 0){
            return T(0);
        }
        using VECTORIZED_T=typename VECTORIZED_CVT<T>::type;
        int VECTORIZED_ELEMENTS=sizeof(VECTORIZED_T)/sizeof(T);

        T res=T(0);
        int n_power_of_two=1u<<dotprod::log2_int(n);
        uint64_t streams_needed_no_clamp=(n_power_of_two+static_cast<long long>(MIN_N_PER_STREAM_SIZE-1LL))/MIN_N_PER_STREAM_SIZE;
        size_t streams_needed=(streams_needed_no_clamp<MAX_STREAMS)?streams_needed_no_clamp:MAX_STREAMS; // clamp to MAX_STREAMS; a power of two, so n_per_stream*streams_needed still covers the whole GPU part
        int n_per_stream=1u<<dotprod::log2_int(n_power_of_two/streams_needed);
        int n_vectorized_per_stream=n_per_stream/VECTORIZED_ELEMENTS;
        int n_gpu=n_per_stream*streams_needed;
        size_t n_cpu=n-n_gpu;
        int blocks_used=(((n_vectorized_per_stream+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK)<MAX_BLOCKS_PER_STREAM)?(n_vectorized_per_stream+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK:MAX_BLOCKS_PER_STREAM;
        VECTORIZED_T* r_d;

        VECTORIZED_T* r_h = new VECTORIZED_T[MAX_STREAMS*MAX_BLOCKS_PER_STREAM];
        CHECK_CUDA_ERROR(cudaMalloc((void**)&r_d, sizeof(VECTORIZED_T)*streams_needed*blocks_used));

        cudaStream_t streams[MAX_STREAMS];
        for (size_t i=0;i<streams_needed;i++){
            size_t bytes_to_copy=sizeof(T)*n_per_stream;
            VECTORIZED_T* _x_d;
            VECTORIZED_T* _r_d=r_d+i*blocks_used;
            cudaStreamCreate (&(streams[i]));
            CHECK_CUDA_ERROR(cudaMallocAsync((void**)&_x_d, bytes_to_copy, streams[i]));
            CHECK_CUDA_ERROR(cudaMemcpyAsync (_x_d, reinterpret_cast<VECTORIZED_T*>(&(x_pinned[i*n_per_stream])), bytes_to_copy, cudaMemcpyHostToDevice, streams[i] ));
            reduce_k<THREADS_PER_BLOCK,VECTORIZED_T><<<dim3(blocks_used,1,1),dim3(THREADS_PER_BLOCK,1,1),0,streams[i]>>>(n_vectorized_per_stream,_x_d,_x_d,_r_d);
            cudaFreeAsync(_x_d,streams[i]);

        }
        res+=avx::dp_sq<0,0>(n_cpu,x_pinned+n_gpu);
        for (size_t i=0;i<streams_needed;i++){
            CHECK_CUDA_ERROR(cudaStreamSynchronize(streams[i]));
            CHECK_CUDA_ERROR(cudaStreamDestroy(streams[i]));
        }

        CHECK_CUDA_ERROR(cudaMemcpy(r_h, r_d, sizeof(VECTORIZED_T)*streams_needed*blocks_used, cudaMemcpyDeviceToHost));
        for (int i=0;i<streams_needed*blocks_used;i++){
            res+=helper::vectorized_type_to_scalar_sum(r_h[i]);
        }
        delete[] r_h;
        cudaFree(r_d);
        return res;
    }
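
As a brief usage note, this overload is what one would call to get a squared Euclidean norm; a minimal sketch, assuming x_pinned was allocated with cudaMallocHost as before and <cmath> is included:

    double sq_norm = reduce<64, 1<<12, 32, 1024, double>(n, x_pinned); // sum of x[i]*x[i]
    double norm = std::sqrt(sq_norm);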
{ "data": [ { "line": { "color": "red", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "CUDA (double)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[2.4434156378600824,2.780104365079365,2.73213937007874,2.768647509578544,2.693645381526104,2.7936759689922477,2.7968353174603173,2.839294466403162,2.80734578313253,2.852645416666667,3.124393991416309,3.4364904040404043,4.574319607843138,4.860166666666667,4.416587837837838,5.45330081300813,8.58126582278481,13.950958333333332,24.477949999999996,46.44868666666667,90.3032875,179.03805] }, { "line": { "color": "red", "shape": "linear", "width": 3 }, "marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}}, "mode": "lines+markers", "name": "CUDA (float)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[1.9736842105263157,2.220517981072555,2.305371661237785,2.4896347826086953,2.535403157894737,2.653988235294118,2.7364905511811024,2.7392082677165353,2.856065714285714,2.965857021276596,3.135290178571428,3.3847721393034824,4.506410897435897,4.484786754966888,4.805499310344828,4.526466,5.4311,8.522645569620254,13.9450875,26.25911481481482,48.54294,91.2033125] }, { "line": { "color": "#9467bd", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "CUDA (square) (double)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[2.0337301587301586,2.3721505415162456,2.2305374213836475,2.2268716981132077,2.2539651006711408,2.5065308724832214,2.230206709265176,2.2815196141479097,2.276638834951456,2.3416168316831683,2.375689491525424,2.644920610687023,3.9367004950495055,3.5748664893617024,3.998171856287425,3.954112716763006,5.31875,8.39064875,13.617239999999999,24.578664285714286,45.61686666666667,92.0157875] }, { "line": { "color": "#9467bd", "shape": "linear", "width": 3 }, "marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}}, "mode": "lines+markers", "name": "CUDA (square) (float)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[2.0448825503355703,2.253276751592357,2.363061688311688,2.578149290780142,2.506293006993007,2.5945688259109314,2.6953516728624534,2.6734120300751876,2.798325,2.8746704453441296,2.9062642553191487,3.327699528301887,3.8938863157894734,3.797780434782608,3.99015197740113,4.120310112359551,4.169154140127389,5.208818181818182,8.362764634146341,13.778024489795918,24.487157142857143,46.250186666666664] } ], "layout": { "title": "Running Time CUDA Sum of Squares", "xaxis":{ "title":"n" }, "yaxis":{ "title":"Running time [ms]" } }, "frames": [] }
{ "data": [ { "line": { "color": "red", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "PCI-Express 3.0 x16", "type": "scatter", "x":[64,134217728], "y":[15.5,15.5] }, { "line": { "color": "white", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "NVIDIA bandwidthtest.cu device-to-host", "type": "scatter", "x":[64,134217728], "y":[11.8,11.8] }, { "line": { "color": "gray", "shape": "linear", "width": 3 }, "mode": "lines+markers", "name": "CUDA (square) (double)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[0.0002517541463414635, 0.0004316758072805419, 0.0009181643761571973, 0.0018393515906059942, 0.0036344839578752795, 0.006536524317280148, 0.014692808457560705, 0.02872471470050283, 0.057572592537627865, 0.11194999816070221, 0.22068877345724003, 0.3964489504006815, 0.5327181995778492, 1.1732757048358746, 2.0981109120680457, 4.242978691243405, 6.308706368977674, 7.9980542624907285, 9.856456080674205, 10.921482668039907, 11.769131710054616, 11.669104326254883] }, { "line": { "color": "gray", "shape": "linear", "width": 3 }, "opacity":0.5, "marker": {"symbol": "x", "line": {"color": "rgb(0,0,0)"}}, "mode": "lines+markers", "name": "CUDA (square) (float)", "type": "scatter", "x":[64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728], "y":[0.0001251905641025641, 0.0002272246405765192, 0.0004333361270528687, 0.0007943682731345166, 0.0016342861702807396, 0.0031573646912696016, 0.006078613104537952, 0.012256995790910101, 0.02341972429935765, 0.04559548737570481, 0.0901996435871978, 0.15755268633509778, 0.26928777960159955, 0.5522046458486337, 1.0511639716369499, 2.0359166594856504, 4.024129460343572, 6.441851266209401, 8.024722318022093, 9.741434855149402, 10.962295640688618, 11.607972868722117] } ], "layout": { "title": "Throughput CUDA (square)", "xaxis":{ "title":"n" }, "yaxis":{ "title":"GB/s" } }, "frames": [] }

Again, the practical PCI-Express 3.0 x16 bandwidth cannot be exceeded for n_cpu ≪ n_gpu.