Atomic counter for critical section not using atomic bandwidth according to profiler

Even though there are first-thread-per-warp atomic accesses to a shared variable, the profiler shows zero bandwidth for atomics:



[profiler screenshot: atomic bandwidth reported as zero]



The most minimal reproduction example I could come up with:



#include <stdio.h>
#include <cuda_runtime.h>

#define criticalSection(T, ...) {                              \
    __shared__ int ctrBlock;                                   \
    if(threadIdx.x==0)                                         \
        ctrBlock=0;                                            \
    __syncthreads();                                           \
    while(atomicAdd(&ctrBlock,0)<(blockDim.x/32))              \
    {                                                          \
        if( atomicAdd(&ctrBlock,0) == (threadIdx.x/32) )       \
        {                                                      \
            int ctr=0;                                         \
            while(ctr<32)                                      \
            {                                                  \
                if( ctr == (threadIdx.x&31) )                  \
                {                                              \
                    {                                          \
                        T,##__VA_ARGS__;                       \
                    }                                          \
                }                                              \
                ctr++;                                         \
                __syncwarp();                                  \
            }                                                  \
            if((threadIdx.x&31) == 0) atomicAdd(&ctrBlock,1);  \
        }                                                      \
        __syncthreads();                                       \
    }                                                          \
}

__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // instead of if(i==0) C[0]=0.0f; initialization
    if(i==blockDim.x*blockIdx.x)
        C[blockDim.x*blockIdx.x]=0.0f;

    __syncthreads();
    criticalSection({
        if (i < numElements)
        {
            C[blockDim.x*blockIdx.x] += A[i] + B[i];
        }
    });
}


int main(void)
{
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = i;
        h_B[i] = 2*i;
    }

    float *d_A = NULL;
    cudaMalloc((void **)&d_A, size);

    float *d_B = NULL;
    cudaMalloc((void **)&d_B, size);

    float *d_C = NULL;
    cudaMalloc((void **)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    printf("%g\n", h_C[0]);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}


It correctly outputs the sum (1+2+…+255)*3 = 97920 (with the analogous sum at the starting element of every block) every time it runs.
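
(A quick host-side sketch of that check, assuming it is placed after the cudaMemcpy back to h_C in the code above; it accumulates in double to sidestep float rounding:)

// Hypothetical host-side verification of the per-block sums.
for (int b = 0; b < blocksPerGrid; ++b)
{
    double expected = 0.0;
    for (int t = 0; t < threadsPerBlock; ++t)
    {
        int i = b * threadsPerBlock + t;
        if (i < numElements)
            expected += 3.0 * i;            // A[i] + B[i] == i + 2*i == 3*i
    }
    // h_C[b * threadsPerBlock] should match (float)expected up to rounding
}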



Question: why would the profiler show that it is not using any atomic bandwidth, even though it works correctly?



The kernel completes (196 blocks, 256 threads per block) in under 2.4 milliseconds on a 192-core Kepler GPU. Is the GPU collecting the atomics and converting them to something more efficient at each synchronization point?



It doesn't produce any errors; I removed the error checking for readability.
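
(For reference, the removed checks were the usual kind; a minimal sketch, where gpuErrchk/gpuAssert are just illustrative names, not the exact code that was removed:)

// Minimal CUDA error-checking sketch; needs <stdio.h> and <stdlib.h>.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// Usage: gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));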



Changing the C-array element addition to:



((volatile float *) C)[blockDim.x*blockIdx.x] += A[i] + B[i];


changes neither the behavior nor the result.



Using CUDA toolkit 9.2 and driver v396, Ubuntu 16.04, Quadro K420.



Here are the compile commands:



nvcc -ccbin g++ -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o vectorAdd.o -c vectorAdd.cu
nvcc -ccbin g++ -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o vectorAdd vectorAdd.o


PTX output from cuobjdump (the SASS was more than 50k characters):



.visible .entry _Z9vectorAddPKfS0_Pfi(
.param .u64 _Z9vectorAddPKfS0_Pfi_param_0,
.param .u64 _Z9vectorAddPKfS0_Pfi_param_1,
.param .u64 _Z9vectorAddPKfS0_Pfi_param_2,
.param .u32 _Z9vectorAddPKfS0_Pfi_param_3
)
{
.reg .pred %p<32>;
.reg .f32 %f<41>;
.reg .b32 %r<35>;
.reg .b64 %rd<12>;

.shared .align 4 .u32 _ZZ9vectorAddPKfS0_PfiE8ctrBlock;

ld.param.u64 %rd5, [_Z9vectorAddPKfS0_Pfi_param_0];
ld.param.u64 %rd6, [_Z9vectorAddPKfS0_Pfi_param_1];
ld.param.u64 %rd7, [_Z9vectorAddPKfS0_Pfi_param_2];
ld.param.u32 %r13, [_Z9vectorAddPKfS0_Pfi_param_3];
cvta.to.global.u64 %rd1, %rd7;
mov.u32 %r14, %ctaid.x;
mov.u32 %r1, %ntid.x;
mul.lo.s32 %r2, %r14, %r1;
mov.u32 %r3, %tid.x;
add.s32 %r4, %r2, %r3;
setp.ne.s32 %p8, %r4, 0;
@%p8 bra BB0_2;

mov.u32 %r15, 0;
st.global.u32 [%rd1], %r15;

BB0_2:
bar.sync 0;
setp.ne.s32 %p9, %r3, 0;
@%p9 bra BB0_4;

mov.u32 %r16, 0;
st.shared.u32 [_ZZ9vectorAddPKfS0_PfiE8ctrBlock], %r16;

BB0_4:
bar.sync 0;
mov.u32 %r17, _ZZ9vectorAddPKfS0_PfiE8ctrBlock;
atom.shared.add.u32 %r18, [%r17], 0;
shr.u32 %r5, %r1, 5;
setp.ge.u32 %p10, %r18, %r5;
@%p10 bra BB0_27;

shr.u32 %r6, %r3, 5;
and.b32 %r7, %r3, 31;
cvta.to.global.u64 %rd8, %rd5;
mul.wide.s32 %rd9, %r4, 4;
add.s64 %rd2, %rd8, %rd9;
cvta.to.global.u64 %rd10, %rd6;
add.s64 %rd3, %rd10, %rd9;
mul.wide.u32 %rd11, %r2, 4;
add.s64 %rd4, %rd1, %rd11;
neg.s32 %r8, %r7;

BB0_6:
atom.shared.add.u32 %r21, [%r17], 0;
mov.u32 %r34, 0;
setp.ne.s32 %p11, %r21, %r6;
mov.u32 %r33, %r8;
@%p11 bra BB0_26;

BB0_7:
setp.eq.s32 %p12, %r33, 0;
setp.lt.s32 %p13, %r4, %r13;
and.pred %p14, %p12, %p13;
@!%p14 bra BB0_9;
bra.uni BB0_8;

BB0_8:
ld.global.f32 %f1, [%rd2];
ld.global.f32 %f2, [%rd3];
add.f32 %f3, %f1, %f2;
ld.volatile.global.f32 %f4, [%rd4];
add.f32 %f5, %f4, %f3;
st.volatile.global.f32 [%rd4], %f5;

BB0_9:
bar.warp.sync -1;
add.s32 %r22, %r34, 1;
setp.eq.s32 %p15, %r22, %r7;
and.pred %p16, %p15, %p13;
@!%p16 bra BB0_11;
bra.uni BB0_10;

BB0_10:
ld.global.f32 %f6, [%rd2];
ld.global.f32 %f7, [%rd3];
add.f32 %f8, %f6, %f7;
ld.volatile.global.f32 %f9, [%rd4];
add.f32 %f10, %f9, %f8;
st.volatile.global.f32 [%rd4], %f10;

BB0_11:
bar.warp.sync -1;
add.s32 %r23, %r34, 2;
setp.eq.s32 %p17, %r23, %r7;
and.pred %p18, %p17, %p13;
@!%p18 bra BB0_13;
bra.uni BB0_12;

BB0_12:
ld.global.f32 %f11, [%rd2];
ld.global.f32 %f12, [%rd3];
add.f32 %f13, %f11, %f12;
ld.volatile.global.f32 %f14, [%rd4];
add.f32 %f15, %f14, %f13;
st.volatile.global.f32 [%rd4], %f15;

BB0_13:
bar.warp.sync -1;
add.s32 %r24, %r34, 3;
setp.eq.s32 %p19, %r24, %r7;
and.pred %p20, %p19, %p13;
@!%p20 bra BB0_15;
bra.uni BB0_14;

BB0_14:
ld.global.f32 %f16, [%rd2];
ld.global.f32 %f17, [%rd3];
add.f32 %f18, %f16, %f17;
ld.volatile.global.f32 %f19, [%rd4];
add.f32 %f20, %f19, %f18;
st.volatile.global.f32 [%rd4], %f20;

BB0_15:
bar.warp.sync -1;
add.s32 %r25, %r34, 4;
setp.eq.s32 %p21, %r25, %r7;
and.pred %p22, %p21, %p13;
@!%p22 bra BB0_17;
bra.uni BB0_16;

BB0_16:
ld.global.f32 %f21, [%rd2];
ld.global.f32 %f22, [%rd3];
add.f32 %f23, %f21, %f22;
ld.volatile.global.f32 %f24, [%rd4];
add.f32 %f25, %f24, %f23;
st.volatile.global.f32 [%rd4], %f25;

BB0_17:
bar.warp.sync -1;
add.s32 %r26, %r34, 5;
setp.eq.s32 %p23, %r26, %r7;
and.pred %p24, %p23, %p13;
@!%p24 bra BB0_19;
bra.uni BB0_18;

BB0_18:
ld.global.f32 %f26, [%rd2];
ld.global.f32 %f27, [%rd3];
add.f32 %f28, %f26, %f27;
ld.volatile.global.f32 %f29, [%rd4];
add.f32 %f30, %f29, %f28;
st.volatile.global.f32 [%rd4], %f30;

BB0_19:
bar.warp.sync -1;
add.s32 %r27, %r34, 6;
setp.eq.s32 %p25, %r27, %r7;
and.pred %p26, %p25, %p13;
@!%p26 bra BB0_21;
bra.uni BB0_20;

BB0_20:
ld.global.f32 %f31, [%rd2];
ld.global.f32 %f32, [%rd3];
add.f32 %f33, %f31, %f32;
ld.volatile.global.f32 %f34, [%rd4];
add.f32 %f35, %f34, %f33;
st.volatile.global.f32 [%rd4], %f35;

BB0_21:
bar.warp.sync -1;
add.s32 %r28, %r34, 7;
setp.eq.s32 %p27, %r28, %r7;
and.pred %p28, %p27, %p13;
@!%p28 bra BB0_23;
bra.uni BB0_22;

BB0_22:
ld.global.f32 %f36, [%rd2];
ld.global.f32 %f37, [%rd3];
add.f32 %f38, %f36, %f37;
ld.volatile.global.f32 %f39, [%rd4];
add.f32 %f40, %f39, %f38;
st.volatile.global.f32 [%rd4], %f40;

BB0_23:
add.s32 %r34, %r34, 8;
bar.warp.sync -1;
add.s32 %r33, %r33, 8;
setp.ne.s32 %p29, %r34, 32;
@%p29 bra BB0_7;

setp.ne.s32 %p30, %r7, 0;
@%p30 bra BB0_26;

atom.shared.add.u32 %r30, [%r17], 1;

BB0_26:
bar.sync 0;
atom.shared.add.u32 %r32, [%r17], 0;
setp.lt.u32 %p31, %r32, %r5;
@%p31 bra BB0_6;

BB0_27:
ret;
}









Tags: cuda, atomic

asked Nov 15 '18 at 18:54, edited Nov 16 '18 at 16:47 – huseyin tugrul buyukisik
  • Compiling for the 3.0 architecture through 7.0, but the GPU is 3.0. I will look at the cuobjdump tool. I'm compiling on the command line with nvcc. Also, compiling within NVRTC (and using the driver API) does the same thing with just the 3.0 architecture. Do you need the parameters of the compile command?

    – huseyin tugrul buyukisik
    Nov 16 '18 at 8:02













  • @RobertCrovella that is very interesting. Consider answering as this is (I'd think) quite non-trivial for the average user.

    – Ander Biguri
    Nov 16 '18 at 10:36

















1 Answer

There are at least 2 things to be aware of here.





  1. Let's observe that your program is using atomics on shared memory locations. Also, you indicated that you are compiling for (and when profiling, running on) a Kepler architecture GPU.



    On Kepler, shared memory atomics are emulated via a software sequence. This won't be visible when inspecting the PTX code, as the conversion to the emulation sequence is done by ptxas, the tool that converts PTX to SASS code for execution on the target device.



    Since you are targeting and running on Kepler, the SASS includes no shared memory atomic instructions (instead, shared atomics are emulated with a loop that uses special hardware locks, and for example you can see LDSLK, a load-from-shared-with-lock instruction, in your SASS code).



    Since your code has no actual atomic instructions (on Kepler), it is not generating any atomic traffic that is trackable by the profiler.



    If you want to verify this, use the cuobjdump tool on your compiled binary. I recommend compiling only for the Kepler target architecture you will actually use for this sort of binary analysis. Here's an example:



    $ nvcc -o t324 t324.cu -arch=sm_30
    $ cuobjdump -sass ./t324 |grep ATOM
    $ nvcc -o t324 t324.cu -arch=sm_50
    $ cuobjdump -sass ./t324 |grep ATOM
    /*00e8*/ @P2 ATOMS.ADD R6, [RZ], RZ ; /* 0xec0000000ff2ff06 */
    /*01b8*/ @P0 ATOMS.ADD R12, [RZ], RZ ; /* 0xec0000000ff0ff0c */
    /*10f8*/ @P0 ATOMS.ADD RZ, [RZ], R12 ; /* 0xec00000000c0ffff */
    /*1138*/ @P0 ATOMS.ADD R10, [RZ], RZ ; /* 0xec0000000ff0ff0a */
    $
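
    To make the emulated read-modify-write concrete: its shape is a retry loop, the same shape commonly used to build custom atomics out of atomicCAS. The sketch below is only an analogy for the lock-based SASS sequence, not what ptxas actually emits (the hardware emulation uses lock instructions such as LDSLK rather than compare-and-swap):

    // Runnable analogy: an "emulated" add on a shared int via a retry loop.
    // (Kepler's real emulation uses hardware lock instructions, not CAS,
    // but the read / modify / conditionally-write-back / retry shape is the same.)
    __device__ int emulatedSharedAdd(int *addr, int val)
    {
        int old = *addr, assumed;
        do {
            assumed = old;
            old = atomicCAS(addr, assumed, assumed + val); // succeeds only if *addr unchanged
        } while (old != assumed);
        return old;
    }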



  2. As indicated above, on Maxwell and beyond, there is a native shared memory atomic instruction available (e.g. ATOMS) in SASS code. Therefore if you compile your code for a Maxwell architecture or beyond, you will see actual atomic instructions in the SASS.



    However, I'm not sure if or how this will be represented in the visual profiler. I suspect shared atomic reporting may be limited. This is discoverable by reviewing the available metrics and observing that for architectures of 5.0 and higher, most of the atomic metrics are specifically for global atomics, and the only metric I can find pertaining to shared atomics is:



    inst_executed_shared_atomics    Warp level shared instructions for atom and atom CAS    Multi-context


    I'm not sure that is sufficient to compute bandwidth or utilization, so I'm not sure the visual profiler intends to report much in the way of shared atomic usage, even on 5.0+ architectures. You're welcome to try it out of course.
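
    If you do try it, one plausible way to query that single metric with the command-line profiler would be (assuming the nvprof that ships with the toolkit, run on a cc 5.0+ device):

    $ nvprof --metrics inst_executed_shared_atomics ./vectorAdd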




As an aside, I would usually think that this sort of construct implies a logical defect in the code:



int i = blockDim.x * blockIdx.x + threadIdx.x;

if(i==0)
    C[0]=0.0f;
__syncthreads();


But it's not relevant to this particular inquiry, and I'm not sure of the intent of your code anyway. Keep in mind that CUDA specifies no order of block execution.
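
(As a sketch of sidestepping that construct entirely, against the question's host code: zero the output buffer before the launch. All-zero bytes are a valid IEEE-754 0.0f, so cudaMemset is safe for a float array.)

cudaMemset(d_C, 0, size);   // zero all of C on the device, before the kernel runs
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);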






answered Nov 16 '18 at 16:36 – Robert Crovella
  • Yes, I got the initialization of C[0] wrong; I should've done it on the host side, or used blockDim and blockIdx instead of 0. I meant to implement a per-block, in-block critical section. Thank you very much for the explanation of the software sequence. I guess Maxwell onwards must be much faster for core-to-core atomic communication, with the addition of native atomics on shared memory.

    – huseyin tugrul buyukisik
    Nov 16 '18 at 16:43













  • Yes, in fact a whole blog article was written about it.

    – Robert Crovella
    Nov 29 '18 at 15:20











  • 200 ms on a Titan X vs. effectively infinite on a Titan tells the story. Thank you.

    – huseyin tugrul buyukisik
    Nov 29 '18 at 16:49










