#include <stdio.h>
#include "b.h"

/*
 * foo - reverse the per-thread indices of one block through shared memory.
 *
 * Each thread stores its own threadIdx.x into the shared array `a`, waits at
 * a block-wide barrier, then writes the element at the mirrored position
 * (blockDim.x - threadIdx.x - 1) into g[threadIdx.x]. Finally it calls bar().
 *
 * Launch requirements: 1D block with blockDim.x <= N, because `a` holds
 * exactly N ints and is indexed by threadIdx.x. Static shared memory use is
 * N * sizeof(int) bytes per block.
 *
 * NOTE(review): N, g and bar() are not defined in this file; presumably they
 * come from "b.h" (g being a device-side int array of at least N elements,
 * bar() a device function) - confirm against that header.
 */
__global__ void foo(void) {
    __shared__ int a[N];

    a[threadIdx.x] = threadIdx.x;

    /* Every write to `a` must be visible before any thread reads a slot
     * that was written by a different thread. */
    __syncthreads();

    g[threadIdx.x] = a[blockDim.x - threadIdx.x - 1];
    bar();
}

/*
 * main - launch foo on a single block of N threads, copy g back to the host,
 * sum its N elements and print PASSED when the sum equals 36.
 *
 * Returns 1 when the launch, the symbol lookup or the copy fails, and 0
 * otherwise.
 *
 * NOTE(review): the exit status is 0 even when FAILED is printed; this is
 * kept as in the original since a harness may be parsing stdout - confirm.
 * NOTE(review): the expected sum of 36 depends on N and on what bar() does,
 * neither of which is visible here.
 */
int main(void) {
    int *dg, hg[N];
    int sum = 0;
    cudaError_t err;

    foo<<<1, N>>>();

    /* A kernel launch returns nothing itself; an invalid configuration is
     * only reported through cudaGetLastError(). Checking here keeps a launch
     * failure from being misreported by one of the calls below. */
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    err = cudaGetSymbolAddress((void**)&dg, g);
    if (err != cudaSuccess) {
        printf("couldn't get the symbol addr\n");
        return 1;
    }

    /* Blocking copy: it also waits for the kernel to finish, so hg holds the
     * kernel's results once it returns successfully. */
    err = cudaMemcpy(hg, dg, N * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        printf("couldn't memcpy\n");
        return 1;
    }

    for (unsigned int i = 0; i < N; i++) {
        sum += hg[i];
    }

    if (sum == 36) {
        printf("PASSED\n");
    } else {
        printf("FAILED (%d)\n", sum);
    }

    return 0;
}