CUDA C + MEAN FILTER + SYNC/ASYNC ERRORS

up vote
-1
down vote

favorite

I'm trying to produce a mean filter of sliding window 3 x 3. I'm inputting a 16 x 16 array (arrays16.txt --> int matrix) and trying to allocate 16 x 16 threads for each block (1 block right now). Using cuda-memcheck i'm receiving a number of sync/async errors and I have been going around and around. Is there something obviously wrong? I understand it is some sort of segfault and probably an issue with my pointers but I can't seem to get past it.Using cuda-memcheck with the -lineinfo flag it seems to be happening in the loop in the kernel.

Here is my code, thank you:

#include <stdio.h>

#include <time.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <stdlib.h>



#define MAXR 16

#define MAXC 16



__global__ void imagefilter(float ** intermediates_d, int ** result_d) {



    int idx =  threadIdx.x;

    int idy = threadIdx.y;



    int x,y;

    //result_d[2][2]= 5;

    //if ((idx < 15) && (idy < 15)) {

    result_d[x][y] = result_d[idx][idy];

for(x=1; x < MAXR; x++) {

    for(y=1; y < MAXC; y++) {

            result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy- 1]

            + intermediates_d[idx - 1][idy]

            + intermediates_d[idx - 1][idy + 1]

            + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]

            + intermediates_d[idx][idy + 1]

            + intermediates_d[idx + 1][idy - 1]

            + intermediates_d[idx + 1][idy]

            + intermediates_d[idx + 1][idy + 1]) / 9.0F));

            //  result_d[2][2]= 5;

            result_d[idy][idx]= result_d[y][x];

        }

    }

}







    __syncthreads();







int main(void)



{

    int i, j;

    //double cpu_time_used;

    float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating

    int matrix[MAXR][MAXC]; // This is the input matrix from file

    int result[MAXR][MAXC]={{0}}; //This is where we want to write the mean values. For now set to zeros

    float ** intermediates_d;

    //int **matrix_d;



    int ** result_d;

    int datasize_f = MAXR * MAXC * sizeof(float);

    int datasize_i = MAXR * MAXC * sizeof(int);

    //Allocate memory on the host.



    cudaMalloc((void**) &intermediates_d, datasize_f);

    //cudaMalloc((void**) &matrix_d, datasize);

    cudaMalloc((void**) &result_d, datasize_i);



    FILE *fp;

    fp = fopen("arrays16.txt", "r"); // reads in matrix

    //clock_t start =clock();

    for (i = 0; i < MAXR; i++) // this loop takes the information from .txt file and puts it into arr1 matrix

            {

        for (j = 0; j < MAXC; j++)

        {

            fscanf(fp, "%dt", &matrix[i][j]);

        }

    }



    printf("*****INPUT MATRIX*****n");

    for (i = 0; i < MAXR; i++)

    {

        printf("n");

        for (j = 0; j < MAXC; j++) {

            printf("%d ", matrix[i][j]);

        }

    }

    printf("nn");



    //This is where we convert the input matrix into floating point in intermediate matrix

    for (int y = 0; y < MAXR; y++) {



        for (int x = 0; x < MAXC; x++) {

            intermediates[y][x] = (float) matrix[y][x];

        }

    }

    printf("*******INTERMEDIATE MATRIX*******n");

    for (i = 0; i < 16; i++) {

        printf("n"); // prints out the results array to .txt file

        for (j = 0; j < 16; j++) {

            printf("%.1f ", intermediates[i][j]);

        }

    }

    printf("nn");

    // copying the data from the host array to the device array



    //cudaMemcpy(matrix_d, matrix, datasize,



    //cudaMemcpyHostToDevice);



    cudaMemcpy(intermediates_d, intermediates, datasize_f,cudaMemcpyHostToDevice);

    cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);



    // how many blocks we will allocate

    dim3 blocks(1, 1);



    //how many threads per block we will allocate

    dim3 threadsPerBlock(16, 16);



    //Launch Kernel

    imagefilter<<<blocks, threadsPerBlock,MAXR*MAXC*sizeof(float)>>>(intermediates_d,result_d);



    //Copy back Results Matrix.

    cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);



    cudaError_t errSync = cudaGetLastError();

    cudaError_t errAsync = cudaDeviceSynchronize();



    if (errSync != cudaSuccess)

        printf("Sync kernel error: %sn", cudaGetErrorString(errSync));

    if (errAsync != cudaSuccess)

        printf("Async kernel error: %sn", cudaGetErrorString(errAsync));



    FILE *file;

    file = fopen("results.txt", "w+"); // writes  matrix to file

    printf("*******RESULTS MATRIX******nn");

    for (i = 1; i < MAXR - 1; i++) { // prints out the results array to .txt file

        for (j = 1; j < MAXC - 1; j++) {

            printf("%d ", result[i][j]);

            fprintf(file, "%d ",result[i][j]);

        }

        printf("n");



        fprintf(file, "n");

    }



    fclose(file);

}

edited Nov 11 at 9:41

talonmies

58.8k17126192

asked Nov 10 at 22:20

Yeinberg

You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
– Florent DUGUET
Nov 10 at 23:42

when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
– Yeinberg
Nov 11 at 0:24

any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
– Yeinberg
Nov 11 at 5:00

1

The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
– Florent DUGUET
Nov 11 at 5:41

Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
– Yeinberg
Nov 11 at 15:18

add a comment |

up vote
-1
down vote

favorite

Here is my code, thank you:

#include <stdio.h>

#include <time.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <stdlib.h>



#define MAXR 16

#define MAXC 16



__global__ void imagefilter(float ** intermediates_d, int ** result_d) {



    int idx =  threadIdx.x;

    int idy = threadIdx.y;



    int x,y;

    //result_d[2][2]= 5;

    //if ((idx < 15) && (idy < 15)) {

    result_d[x][y] = result_d[idx][idy];

for(x=1; x < MAXR; x++) {

    for(y=1; y < MAXC; y++) {

            result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy- 1]

            + intermediates_d[idx - 1][idy]

            + intermediates_d[idx - 1][idy + 1]

            + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]

            + intermediates_d[idx][idy + 1]

            + intermediates_d[idx + 1][idy - 1]

            + intermediates_d[idx + 1][idy]

            + intermediates_d[idx + 1][idy + 1]) / 9.0F));

            //  result_d[2][2]= 5;

            result_d[idy][idx]= result_d[y][x];

        }

    }

}







    __syncthreads();







int main(void)



{

    int i, j;

    //double cpu_time_used;

    float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating

    int matrix[MAXR][MAXC]; // This is the input matrix from file

    int result[MAXR][MAXC]={{0}}; //This is where we want to write the mean values. For now set to zeros

    float ** intermediates_d;

    //int **matrix_d;



    int ** result_d;

    int datasize_f = MAXR * MAXC * sizeof(float);

    int datasize_i = MAXR * MAXC * sizeof(int);

    //Allocate memory on the host.



    cudaMalloc((void**) &intermediates_d, datasize_f);

    //cudaMalloc((void**) &matrix_d, datasize);

    cudaMalloc((void**) &result_d, datasize_i);



    FILE *fp;

    fp = fopen("arrays16.txt", "r"); // reads in matrix

    //clock_t start =clock();

    for (i = 0; i < MAXR; i++) // this loop takes the information from .txt file and puts it into arr1 matrix

            {

        for (j = 0; j < MAXC; j++)

        {

            fscanf(fp, "%dt", &matrix[i][j]);

        }

    }



    printf("*****INPUT MATRIX*****n");

    for (i = 0; i < MAXR; i++)

    {

        printf("n");

        for (j = 0; j < MAXC; j++) {

            printf("%d ", matrix[i][j]);

        }

    }

    printf("nn");



    //This is where we convert the input matrix into floating point in intermediate matrix

    for (int y = 0; y < MAXR; y++) {



        for (int x = 0; x < MAXC; x++) {

            intermediates[y][x] = (float) matrix[y][x];

        }

    }

    printf("*******INTERMEDIATE MATRIX*******n");

    for (i = 0; i < 16; i++) {

        printf("n"); // prints out the results array to .txt file

        for (j = 0; j < 16; j++) {

            printf("%.1f ", intermediates[i][j]);

        }

    }

    printf("nn");

    // copying the data from the host array to the device array



    //cudaMemcpy(matrix_d, matrix, datasize,



    //cudaMemcpyHostToDevice);



    cudaMemcpy(intermediates_d, intermediates, datasize_f,cudaMemcpyHostToDevice);

    cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);



    // how many blocks we will allocate

    dim3 blocks(1, 1);



    //how many threads per block we will allocate

    dim3 threadsPerBlock(16, 16);



    //Launch Kernel

    imagefilter<<<blocks, threadsPerBlock,MAXR*MAXC*sizeof(float)>>>(intermediates_d,result_d);



    //Copy back Results Matrix.

    cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);



    cudaError_t errSync = cudaGetLastError();

    cudaError_t errAsync = cudaDeviceSynchronize();



    if (errSync != cudaSuccess)

        printf("Sync kernel error: %sn", cudaGetErrorString(errSync));

    if (errAsync != cudaSuccess)

        printf("Async kernel error: %sn", cudaGetErrorString(errAsync));



    FILE *file;

    file = fopen("results.txt", "w+"); // writes  matrix to file

    printf("*******RESULTS MATRIX******nn");

    for (i = 1; i < MAXR - 1; i++) { // prints out the results array to .txt file

        for (j = 1; j < MAXC - 1; j++) {

            printf("%d ", result[i][j]);

            fprintf(file, "%d ",result[i][j]);

        }

        printf("n");



        fprintf(file, "n");

    }



    fclose(file);

}

edited Nov 11 at 9:41

talonmies

58.8k17126192

asked Nov 10 at 22:20

Yeinberg

You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
– Florent DUGUET
Nov 10 at 23:42

when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
– Yeinberg
Nov 11 at 0:24

any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
– Yeinberg
Nov 11 at 5:00

1

The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
– Florent DUGUET
Nov 11 at 5:41

Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
– Yeinberg
Nov 11 at 15:18

add a comment |

up vote
-1
down vote

favorite

Here is my code, thank you:

#include <stdio.h>

#include <time.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <stdlib.h>



#define MAXR 16

#define MAXC 16



__global__ void imagefilter(float ** intermediates_d, int ** result_d) {



    int idx =  threadIdx.x;

    int idy = threadIdx.y;



    int x,y;

    //result_d[2][2]= 5;

    //if ((idx < 15) && (idy < 15)) {

    result_d[x][y] = result_d[idx][idy];

for(x=1; x < MAXR; x++) {

    for(y=1; y < MAXC; y++) {

            result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy- 1]

            + intermediates_d[idx - 1][idy]

            + intermediates_d[idx - 1][idy + 1]

            + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]

            + intermediates_d[idx][idy + 1]

            + intermediates_d[idx + 1][idy - 1]

            + intermediates_d[idx + 1][idy]

            + intermediates_d[idx + 1][idy + 1]) / 9.0F));

            //  result_d[2][2]= 5;

            result_d[idy][idx]= result_d[y][x];

        }

    }

}







    __syncthreads();







int main(void)



{

    int i, j;

    //double cpu_time_used;

    float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating

    int matrix[MAXR][MAXC]; // This is the input matrix from file

    int result[MAXR][MAXC]={{0}}; //This is where we want to write the mean values. For now set to zeros

    float ** intermediates_d;

    //int **matrix_d;



    int ** result_d;

    int datasize_f = MAXR * MAXC * sizeof(float);

    int datasize_i = MAXR * MAXC * sizeof(int);

    //Allocate memory on the host.



    cudaMalloc((void**) &intermediates_d, datasize_f);

    //cudaMalloc((void**) &matrix_d, datasize);

    cudaMalloc((void**) &result_d, datasize_i);



    FILE *fp;

    fp = fopen("arrays16.txt", "r"); // reads in matrix

    //clock_t start =clock();

    for (i = 0; i < MAXR; i++) // this loop takes the information from .txt file and puts it into arr1 matrix

            {

        for (j = 0; j < MAXC; j++)

        {

            fscanf(fp, "%dt", &matrix[i][j]);

        }

    }



    printf("*****INPUT MATRIX*****n");

    for (i = 0; i < MAXR; i++)

    {

        printf("n");

        for (j = 0; j < MAXC; j++) {

            printf("%d ", matrix[i][j]);

        }

    }

    printf("nn");



    //This is where we convert the input matrix into floating point in intermediate matrix

    for (int y = 0; y < MAXR; y++) {



        for (int x = 0; x < MAXC; x++) {

            intermediates[y][x] = (float) matrix[y][x];

        }

    }

    printf("*******INTERMEDIATE MATRIX*******n");

    for (i = 0; i < 16; i++) {

        printf("n"); // prints out the results array to .txt file

        for (j = 0; j < 16; j++) {

            printf("%.1f ", intermediates[i][j]);

        }

    }

    printf("nn");

    // copying the data from the host array to the device array



    //cudaMemcpy(matrix_d, matrix, datasize,



    //cudaMemcpyHostToDevice);



    cudaMemcpy(intermediates_d, intermediates, datasize_f,cudaMemcpyHostToDevice);

    cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);



    // how many blocks we will allocate

    dim3 blocks(1, 1);



    //how many threads per block we will allocate

    dim3 threadsPerBlock(16, 16);



    //Launch Kernel

    imagefilter<<<blocks, threadsPerBlock,MAXR*MAXC*sizeof(float)>>>(intermediates_d,result_d);



    //Copy back Results Matrix.

    cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);



    cudaError_t errSync = cudaGetLastError();

    cudaError_t errAsync = cudaDeviceSynchronize();



    if (errSync != cudaSuccess)

        printf("Sync kernel error: %sn", cudaGetErrorString(errSync));

    if (errAsync != cudaSuccess)

        printf("Async kernel error: %sn", cudaGetErrorString(errAsync));



    FILE *file;

    file = fopen("results.txt", "w+"); // writes  matrix to file

    printf("*******RESULTS MATRIX******nn");

    for (i = 1; i < MAXR - 1; i++) { // prints out the results array to .txt file

        for (j = 1; j < MAXC - 1; j++) {

            printf("%d ", result[i][j]);

            fprintf(file, "%d ",result[i][j]);

        }

        printf("n");



        fprintf(file, "n");

    }



    fclose(file);

}

edited Nov 11 at 9:41

talonmies

58.8k17126192

asked Nov 10 at 22:20

Yeinberg

Here is my code, thank you:

#include <stdio.h>

#include <time.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <stdlib.h>



#define MAXR 16

#define MAXC 16



__global__ void imagefilter(float ** intermediates_d, int ** result_d) {



    int idx =  threadIdx.x;

    int idy = threadIdx.y;



    int x,y;

    //result_d[2][2]= 5;

    //if ((idx < 15) && (idy < 15)) {

    result_d[x][y] = result_d[idx][idy];

for(x=1; x < MAXR; x++) {

    for(y=1; y < MAXC; y++) {

            result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy- 1]

            + intermediates_d[idx - 1][idy]

            + intermediates_d[idx - 1][idy + 1]

            + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]

            + intermediates_d[idx][idy + 1]

            + intermediates_d[idx + 1][idy - 1]

            + intermediates_d[idx + 1][idy]

            + intermediates_d[idx + 1][idy + 1]) / 9.0F));

            //  result_d[2][2]= 5;

            result_d[idy][idx]= result_d[y][x];

        }

    }

}







    __syncthreads();







int main(void)



{

    int i, j;

    //double cpu_time_used;

    float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating

    int matrix[MAXR][MAXC]; // This is the input matrix from file

    int result[MAXR][MAXC]={{0}}; //This is where we want to write the mean values. For now set to zeros

    float ** intermediates_d;

    //int **matrix_d;



    int ** result_d;

    int datasize_f = MAXR * MAXC * sizeof(float);

    int datasize_i = MAXR * MAXC * sizeof(int);

    //Allocate memory on the host.



    cudaMalloc((void**) &intermediates_d, datasize_f);

    //cudaMalloc((void**) &matrix_d, datasize);

    cudaMalloc((void**) &result_d, datasize_i);



    FILE *fp;

    fp = fopen("arrays16.txt", "r"); // reads in matrix

    //clock_t start =clock();

    for (i = 0; i < MAXR; i++) // this loop takes the information from .txt file and puts it into arr1 matrix

            {

        for (j = 0; j < MAXC; j++)

        {

            fscanf(fp, "%dt", &matrix[i][j]);

        }

    }



    printf("*****INPUT MATRIX*****n");

    for (i = 0; i < MAXR; i++)

    {

        printf("n");

        for (j = 0; j < MAXC; j++) {

            printf("%d ", matrix[i][j]);

        }

    }

    printf("nn");



    //This is where we convert the input matrix into floating point in intermediate matrix

    for (int y = 0; y < MAXR; y++) {



        for (int x = 0; x < MAXC; x++) {

            intermediates[y][x] = (float) matrix[y][x];

        }

    }

    printf("*******INTERMEDIATE MATRIX*******n");

    for (i = 0; i < 16; i++) {

        printf("n"); // prints out the results array to .txt file

        for (j = 0; j < 16; j++) {

            printf("%.1f ", intermediates[i][j]);

        }

    }

    printf("nn");

    // copying the data from the host array to the device array



    //cudaMemcpy(matrix_d, matrix, datasize,



    //cudaMemcpyHostToDevice);



    cudaMemcpy(intermediates_d, intermediates, datasize_f,cudaMemcpyHostToDevice);

    cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);



    // how many blocks we will allocate

    dim3 blocks(1, 1);



    //how many threads per block we will allocate

    dim3 threadsPerBlock(16, 16);



    //Launch Kernel

    imagefilter<<<blocks, threadsPerBlock,MAXR*MAXC*sizeof(float)>>>(intermediates_d,result_d);



    //Copy back Results Matrix.

    cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);



    cudaError_t errSync = cudaGetLastError();

    cudaError_t errAsync = cudaDeviceSynchronize();



    if (errSync != cudaSuccess)

        printf("Sync kernel error: %sn", cudaGetErrorString(errSync));

    if (errAsync != cudaSuccess)

        printf("Async kernel error: %sn", cudaGetErrorString(errAsync));



    FILE *file;

    file = fopen("results.txt", "w+"); // writes  matrix to file

    printf("*******RESULTS MATRIX******nn");

    for (i = 1; i < MAXR - 1; i++) { // prints out the results array to .txt file

        for (j = 1; j < MAXC - 1; j++) {

            printf("%d ", result[i][j]);

            fprintf(file, "%d ",result[i][j]);

        }

        printf("n");



        fprintf(file, "n");

    }



    fclose(file);

}

filter parallel-processing cuda box

edited Nov 11 at 9:41

talonmies

58.8k17126192

asked Nov 10 at 22:20

Yeinberg

edited Nov 11 at 9:41

talonmies

58.8k17126192

asked Nov 10 at 22:20

Yeinberg

edited Nov 11 at 9:41

talonmies

58.8k17126192

edited Nov 11 at 9:41

talonmies

58.8k17126192

edited Nov 11 at 9:41

talonmies

58.8k17126192

asked Nov 10 at 22:20

Yeinberg

asked Nov 10 at 22:20

Yeinberg

asked Nov 10 at 22:20

Yeinberg

You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
– Florent DUGUET
Nov 10 at 23:42

when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
– Yeinberg
Nov 11 at 0:24

any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
– Yeinberg
Nov 11 at 5:00

1

The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
– Florent DUGUET
Nov 11 at 5:41

Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
– Yeinberg
Nov 11 at 15:18

add a comment |

You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
– Florent DUGUET
Nov 10 at 23:42

when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
– Yeinberg
Nov 11 at 0:24

any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
– Yeinberg
Nov 11 at 5:00

1

The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
– Florent DUGUET
Nov 11 at 5:41

Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
– Yeinberg
Nov 11 at 15:18

You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
– Florent DUGUET
Nov 10 at 23:42

when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
– Yeinberg
Nov 11 at 0:24

any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
– Yeinberg
Nov 11 at 5:00

The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
– Florent DUGUET
Nov 11 at 5:41

Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
– Yeinberg
Nov 11 at 15:18

add a comment |

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53244001%2fcuda-c-mean-filter-sync-async-errors%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

active

oldest

votes

draft saved

draft discarded

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Vfrdtyky