(C or C++) Turn a 1D pixel array into an SDL2 texture

I am making a CUDA application that manipulates a simple pixel array of uint8_t values, three bytes per pixel: one byte for r, one for g, one for b. I want to make an RGB24 texture out of that array so that I can render it to a window. Simply locking the texture doesn’t work: it interferes with CUDA and gives the error
CUDA error = 700 at main.cu:112 'cudaDeviceSynchronize()'

Relevant snippet from int main():

	// Excerpt from main(): allocates a 3-byte-per-pixel framebuffer with
	// cudaMallocManaged, sets up an SDL2 window/renderer/streaming texture,
	// launches the `render` kernel on the framebuffer pointer, then presents
	// the texture and tears everything down.
	int num_pixels = width*height;
	// Initial row stride in bytes (3 bytes per pixel). Its address is later
	// handed to SDL_LockTexture, which may overwrite this value.
	int pitch = width*3;
	size_t fb_size = 3*num_pixels*sizeof(char); // SDL_PIXELFORMAT_RGB24

	// allocate framebuffer
	unsigned char *fb;
	checkCudaErrors(cudaMallocManaged((void **)&fb, fb_size));


	////////////
	// init sdl

	if (SDL_Init(SDL_INIT_EVERYTHING) != 0) {
		fprintf(stderr, "SDL_Init Error: %s\n", SDL_GetError());
		return EXIT_FAILURE;
	}

	// NOTE(review): the results of SDL_CreateWindow, SDL_CreateRenderer and
	// SDL_CreateTexture below are used without a NULL check.
	SDL_Window * window = SDL_CreateWindow(
		"CUDA Renderer", 
		SDL_WINDOWPOS_UNDEFINED, 
		SDL_WINDOWPOS_UNDEFINED, 
		width, 
		height, 
		SDL_WINDOW_RESIZABLE
	);

	SDL_Renderer * renderer;
	renderer = SDL_CreateRenderer(
		window,
		-1,
		SDL_RENDERER_ACCELERATED
	);

	// Streaming RGB24 texture with the same dimensions as the framebuffer.
	SDL_Texture * buffer = SDL_CreateTexture(
		renderer,
		SDL_PIXELFORMAT_RGB24,
		SDL_TEXTUREACCESS_STREAMING, 
		width,height
	);

	// NOTE(review): &fb is passed as the output pixel pointer, so this call
	// replaces the pointer obtained from cudaMallocManaged above with whatever
	// SDL returns for the locked texture. From here on `fb` no longer refers
	// to the managed allocation; the kernel launch and the cudaFree below both
	// receive the SDL pointer instead. This is presumably the source of CUDA
	// error 700 (illegal address) reported at cudaDeviceSynchronize - confirm.
	// The return value of SDL_LockTexture is also ignored.
	SDL_LockTexture(buffer, NULL, (void **) &fb, &pitch);

	// Render our buffer
	// One extra block per axis so that a trailing partial tile is covered;
	// tx and ty are defined outside this excerpt.
	// NOTE(review): presumably the kernel bounds-checks against width/height
	// - verify in `render`.
	dim3 blocks(width/tx+1,height/ty+1);
	dim3 threads(tx,ty);
	render<<<blocks, threads>>>(fb, width, height);

	// NOTE(review): the texture is unlocked right after the launch and before
	// cudaDeviceSynchronize, i.e. before the kernel is known to have finished.
	SDL_UnlockTexture(buffer);	

	checkCudaErrors(cudaGetLastError());
	checkCudaErrors(cudaDeviceSynchronize()); // error appears here

	// Draw the texture over the whole render target and show it.
	SDL_RenderCopy(renderer, buffer, NULL, NULL);
	SDL_RenderPresent(renderer);

	// Keep the window up for 10 seconds before cleaning up.
	SDL_Delay(10000);

	// NOTE(review): `fb` was overwritten by SDL_LockTexture above, so this
	// does not free the cudaMallocManaged allocation - confirm.
	checkCudaErrors(cudaFree(fb));

	// NOTE(review): the texture `buffer` is never passed to SDL_DestroyTexture
	// in this excerpt.
	SDL_DestroyRenderer(renderer);
	SDL_DestroyWindow(window);
	SDL_Quit();

Before I tried to hook SDL into this, I wrote the image out as a PPM file, which worked without error. I have only added SDL2 code in between the existing code, so it is not the CUDA code itself that is wrong.

For now, I have found a (very ugly) hotfix, which is simply to copy the buffer:

// generate framebuffer (fb) with cuda 

// Workaround: keep the CUDA framebuffer (fb) separate from the texture memory.
// A dedicated pointer receives the locked texture pixels, so fb is left
// untouched, and the rendered frame is copied across on the host.
uint8_t *pixels; // declare before loop

SDL_LockTexture(buffer, NULL, (void **) &pixels, &pitch);
// Copies the whole frame in a single call.
// NOTE(review): assumes the pitch reported by SDL_LockTexture equals width*3
// (no row padding) - TODO confirm; otherwise rows must be copied one by one.
memcpy(pixels, fb, width*height*3);
SDL_UnlockTexture(buffer);

// rendercopy and stuff

Simply binding fb as the buffer results in a black screen but no crash. I would really like something better because this wastes 8(!!!) ms per frame…