SDL: GPU: Add SDL_CancelGPUCommandBuffer (#11316)

From b4dff42dcd3864ebb6d4ce7a2a35d0140f5b2bd2 Mon Sep 17 00:00:00 2001
From: Evan Hemsley <[EMAIL REDACTED]>
Date: Tue, 29 Oct 2024 14:43:22 -0700
Subject: [PATCH] GPU: Add SDL_CancelGPUCommandBuffer (#11316)

---------

Co-authored-by: Caleb Cornett <caleb.cornett@outlook.com>
---
 include/SDL3/SDL_gpu.h            |  27 +++++
 src/dynapi/SDL_dynapi.sym         |   1 +
 src/dynapi/SDL_dynapi_overrides.h |   1 +
 src/dynapi/SDL_dynapi_procs.h     |   1 +
 src/gpu/SDL_gpu.c                 |  32 +++++-
 src/gpu/SDL_sysgpu.h              |   5 +
 src/gpu/d3d11/SDL_gpu_d3d11.c     |  77 ++++++++-----
 src/gpu/d3d12/SDL_gpu_d3d12.c     |  54 ++++++---
 src/gpu/metal/SDL_gpu_metal.m     | 166 ++++++++++++++++++++++------
 src/gpu/vulkan/SDL_gpu_vulkan.c   | 178 +++++++++++++++++-------------
 10 files changed, 392 insertions(+), 150 deletions(-)

diff --git a/include/SDL3/SDL_gpu.h b/include/SDL3/SDL_gpu.h
index 91174e24d4b7d..899297ef07125 100644
--- a/include/SDL3/SDL_gpu.h
+++ b/include/SDL3/SDL_gpu.h
@@ -3529,6 +3529,11 @@ extern SDL_DECLSPEC SDL_GPUTextureFormat SDLCALL SDL_GetGPUSwapchainTextureForma
  * freed by the user. You MUST NOT call this function from any thread other
  * than the one that created the window.
  *
+ * When using SDL_GPU_PRESENTMODE_VSYNC, this function will block if too many frames are in flight.
+ * Otherwise, this function will fill the swapchain texture handle with NULL if too many frames are in flight.
+ * The best practice is to call SDL_CancelGPUCommandBuffer if the swapchain texture handle is NULL
+ * to avoid enqueuing needless work on the GPU.
+ *
  * \param command_buffer a command buffer.
  * \param window a window that has been claimed.
  * \param swapchain_texture a pointer filled in with a swapchain texture
@@ -3542,9 +3547,11 @@ extern SDL_DECLSPEC SDL_GPUTextureFormat SDLCALL SDL_GetGPUSwapchainTextureForma
  *
  * \since This function is available since SDL 3.1.3.
  *
+ * \sa SDL_GPUPresentMode
  * \sa SDL_ClaimWindowForGPUDevice
  * \sa SDL_SubmitGPUCommandBuffer
  * \sa SDL_SubmitGPUCommandBufferAndAcquireFence
+ * \sa SDL_CancelGPUCommandBuffer
  * \sa SDL_GetWindowSizeInPixels
  */
 extern SDL_DECLSPEC bool SDLCALL SDL_AcquireGPUSwapchainTexture(
@@ -3603,6 +3610,26 @@ extern SDL_DECLSPEC bool SDLCALL SDL_SubmitGPUCommandBuffer(
 extern SDL_DECLSPEC SDL_GPUFence *SDLCALL SDL_SubmitGPUCommandBufferAndAcquireFence(
     SDL_GPUCommandBuffer *command_buffer);
 
+/**
+ * Cancels a command buffer. None of the enqueued commands are executed.
+ *
+ * This must be called from the thread the command buffer was acquired on.
+ *
+ * You must not reference the command buffer after calling this function.
+ * It is an error to call this function after a swapchain texture has been acquired on this command buffer.
+ *
+ * \param command_buffer a command buffer.
+ * \returns true on success, false on error; call SDL_GetError() for more
+ *          information.
+ *
+ * \since This function is available since SDL 3.2.0.
+ *
+ * \sa SDL_AcquireGPUCommandBuffer
+ * \sa SDL_AcquireGPUSwapchainTexture
+ */
+extern SDL_DECLSPEC bool SDLCALL SDL_CancelGPUCommandBuffer(
+    SDL_GPUCommandBuffer *command_buffer);
+
 /**
  * Blocks the thread until the GPU is completely idle.
  *
diff --git a/src/dynapi/SDL_dynapi.sym b/src/dynapi/SDL_dynapi.sym
index e0efccc2b014f..fcdb93b05fec7 100644
--- a/src/dynapi/SDL_dynapi.sym
+++ b/src/dynapi/SDL_dynapi.sym
@@ -1183,6 +1183,7 @@ SDL3_0.0.0 {
     SDL_GetDefaultLogOutputFunction;
     SDL_RenderDebugText;
     SDL_GetSandbox;
+    SDL_CancelGPUCommandBuffer;
     # extra symbols go here (don't modify this line)
   local: *;
 };
diff --git a/src/dynapi/SDL_dynapi_overrides.h b/src/dynapi/SDL_dynapi_overrides.h
index 4f5f16d889ecf..c33efb30b8c40 100644
--- a/src/dynapi/SDL_dynapi_overrides.h
+++ b/src/dynapi/SDL_dynapi_overrides.h
@@ -1208,3 +1208,4 @@
 #define SDL_GetDefaultLogOutputFunction SDL_GetDefaultLogOutputFunction_REAL
 #define SDL_RenderDebugText SDL_RenderDebugText_REAL
 #define SDL_GetSandbox SDL_GetSandbox_REAL
+#define SDL_CancelGPUCommandBuffer SDL_CancelGPUCommandBuffer_REAL
diff --git a/src/dynapi/SDL_dynapi_procs.h b/src/dynapi/SDL_dynapi_procs.h
index dcfcef6d65c76..cebdf0206ea87 100644
--- a/src/dynapi/SDL_dynapi_procs.h
+++ b/src/dynapi/SDL_dynapi_procs.h
@@ -1214,3 +1214,4 @@ SDL_DYNAPI_PROC(bool,SDL_SetErrorV,(SDL_PRINTF_FORMAT_STRING const char *a,va_li
 SDL_DYNAPI_PROC(SDL_LogOutputFunction,SDL_GetDefaultLogOutputFunction,(void),(),return)
 SDL_DYNAPI_PROC(bool,SDL_RenderDebugText,(SDL_Renderer *a,float b,float c,const char *d),(a,b,c,d),return)
 SDL_DYNAPI_PROC(SDL_Sandbox,SDL_GetSandbox,(void),(),return)
+SDL_DYNAPI_PROC(bool,SDL_CancelGPUCommandBuffer,(SDL_GPUCommandBuffer *a),(a),return)
diff --git a/src/gpu/SDL_gpu.c b/src/gpu/SDL_gpu.c
index 1957ac5deb03e..6f1e8975ddfd0 100644
--- a/src/gpu/SDL_gpu.c
+++ b/src/gpu/SDL_gpu.c
@@ -1301,6 +1301,7 @@ SDL_GPUCommandBuffer *SDL_AcquireGPUCommandBuffer(
     commandBufferHeader->compute_pipeline_bound = false;
     commandBufferHeader->copy_pass.command_buffer = command_buffer;
     commandBufferHeader->copy_pass.in_progress = false;
+    commandBufferHeader->swapchain_texture_acquired = false;
     commandBufferHeader->submitted = false;
 
     return command_buffer;
@@ -2666,6 +2667,8 @@ bool SDL_AcquireGPUSwapchainTexture(
     Uint32 *swapchain_texture_width,
     Uint32 *swapchain_texture_height)
 {
+    CommandBufferCommonHeader *commandBufferHeader = (CommandBufferCommonHeader *)command_buffer;
+
     if (command_buffer == NULL) {
         SDL_InvalidParamError("command_buffer");
         return false;
@@ -2684,12 +2687,18 @@ bool SDL_AcquireGPUSwapchainTexture(
         CHECK_ANY_PASS_IN_PROGRESS("Cannot acquire a swapchain texture during a pass!", false)
     }
 
-    return COMMAND_BUFFER_DEVICE->AcquireSwapchainTexture(
+    bool result = COMMAND_BUFFER_DEVICE->AcquireSwapchainTexture(
         command_buffer,
         window,
         swapchain_texture,
         swapchain_texture_width,
         swapchain_texture_height);
+
+    if (*swapchain_texture != NULL){
+        commandBufferHeader->swapchain_texture_acquired = true;
+    }
+
+    return result;
 }
 
 bool SDL_SubmitGPUCommandBuffer(
@@ -2746,6 +2755,27 @@ SDL_GPUFence *SDL_SubmitGPUCommandBufferAndAcquireFence(
         command_buffer);
 }
 
+bool SDL_CancelGPUCommandBuffer(
+    SDL_GPUCommandBuffer *command_buffer)
+{
+    CommandBufferCommonHeader *commandBufferHeader = (CommandBufferCommonHeader *)command_buffer;
+    // A NULL command buffer is rejected as an invalid parameter.
+    if (command_buffer == NULL) {
+        SDL_InvalidParamError("command_buffer");
+        return false;
+    }
+    // Debug mode only: cancelling after a swapchain texture was acquired is an error.
+    if (COMMAND_BUFFER_DEVICE->debug_mode) {
+        if (commandBufferHeader->swapchain_texture_acquired) {
+            SDL_assert_release(!"Cannot cancel command buffer after a swapchain texture has been acquired!");
+            return false;
+        }
+    }
+    // Forward to the device's Cancel entry point and return its result.
+    return COMMAND_BUFFER_DEVICE->Cancel(
+        command_buffer);
+}
+
 bool SDL_WaitForGPUIdle(
     SDL_GPUDevice *device)
 {
diff --git a/src/gpu/SDL_sysgpu.h b/src/gpu/SDL_sysgpu.h
index 7f9ef6ca43324..c20531f99b9bf 100644
--- a/src/gpu/SDL_sysgpu.h
+++ b/src/gpu/SDL_sysgpu.h
@@ -40,6 +40,7 @@ typedef struct CommandBufferCommonHeader
     Pass compute_pass;
     bool compute_pipeline_bound;
     Pass copy_pass;
+    bool swapchain_texture_acquired;
     bool submitted;
 } CommandBufferCommonHeader;
 
@@ -810,6 +811,9 @@ struct SDL_GPUDevice
     SDL_GPUFence *(*SubmitAndAcquireFence)(
         SDL_GPUCommandBuffer *commandBuffer);
 
+    bool (*Cancel)(
+        SDL_GPUCommandBuffer *commandBuffer);
+
     bool (*Wait)(
         SDL_GPURenderer *driverData);
 
@@ -928,6 +932,7 @@ struct SDL_GPUDevice
     ASSIGN_DRIVER_FUNC(AcquireSwapchainTexture, name)       \
     ASSIGN_DRIVER_FUNC(Submit, name)                        \
     ASSIGN_DRIVER_FUNC(SubmitAndAcquireFence, name)         \
+    ASSIGN_DRIVER_FUNC(Cancel, name)                        \
     ASSIGN_DRIVER_FUNC(Wait, name)                          \
     ASSIGN_DRIVER_FUNC(WaitForFences, name)                 \
     ASSIGN_DRIVER_FUNC(QueryFence, name)                    \
diff --git a/src/gpu/d3d11/SDL_gpu_d3d11.c b/src/gpu/d3d11/SDL_gpu_d3d11.c
index d667efddf5ba7..f2a864bac984b 100644
--- a/src/gpu/d3d11/SDL_gpu_d3d11.c
+++ b/src/gpu/d3d11/SDL_gpu_d3d11.c
@@ -748,7 +748,7 @@ typedef struct D3D11CommandBuffer
 
     // Fences
     D3D11Fence *fence;
-    Uint8 autoReleaseFence;
+    bool autoReleaseFence;
 
     // Reference Counting
     D3D11Buffer **usedBuffers;
@@ -3280,15 +3280,10 @@ static SDL_GPUCommandBuffer *D3D11_AcquireCommandBuffer(
     SDL_zeroa(commandBuffer->computeReadWriteStorageTextureSubresources);
     SDL_zeroa(commandBuffer->computeReadWriteStorageBuffers);
 
-    bool acquireFenceResult = D3D11_INTERNAL_AcquireFence(commandBuffer);
-    commandBuffer->autoReleaseFence = 1;
+    commandBuffer->autoReleaseFence = true;
 
     SDL_UnlockMutex(renderer->acquireCommandBufferLock);
 
-    if (!acquireFenceResult) {
-        return NULL;
-    }
-
     return (SDL_GPUCommandBuffer *)commandBuffer;
 }
 
@@ -4806,7 +4801,8 @@ static bool D3D11_INTERNAL_MapAndCopyTextureDownload(
 
 static bool D3D11_INTERNAL_CleanCommandBuffer(
     D3D11Renderer *renderer,
-    D3D11CommandBuffer *commandBuffer)
+    D3D11CommandBuffer *commandBuffer,
+    bool cancel)
 {
     Uint32 i, j;
     bool result = true;
@@ -4817,17 +4813,21 @@ static bool D3D11_INTERNAL_CleanCommandBuffer(
         D3D11TransferBuffer *transferBuffer = commandBuffer->usedTransferBuffers[i];
 
         for (j = 0; j < transferBuffer->bufferDownloadCount; j += 1) {
-            result &= D3D11_INTERNAL_MapAndCopyBufferDownload(
-                renderer,
-                transferBuffer,
-                &transferBuffer->bufferDownloads[j]);
+            if (!cancel) {
+                result &= D3D11_INTERNAL_MapAndCopyBufferDownload(
+                    renderer,
+                    transferBuffer,
+                    &transferBuffer->bufferDownloads[j]);
+            }
         }
 
         for (j = 0; j < transferBuffer->textureDownloadCount; j += 1) {
-            result &= D3D11_INTERNAL_MapAndCopyTextureDownload(
-                renderer,
-                transferBuffer,
-                &transferBuffer->textureDownloads[j]);
+            if (!cancel) {
+                result &= D3D11_INTERNAL_MapAndCopyTextureDownload(
+                    renderer,
+                    transferBuffer,
+                    &transferBuffer->textureDownloads[j]);
+            }
         }
 
         transferBuffer->bufferDownloadCount = 0;
@@ -4887,10 +4887,12 @@ static bool D3D11_INTERNAL_CleanCommandBuffer(
     SDL_UnlockMutex(renderer->acquireCommandBufferLock);
 
     // Remove this command buffer from the submitted list
-    for (i = 0; i < renderer->submittedCommandBufferCount; i += 1) {
-        if (renderer->submittedCommandBuffers[i] == commandBuffer) {
-            renderer->submittedCommandBuffers[i] = renderer->submittedCommandBuffers[renderer->submittedCommandBufferCount - 1];
-            renderer->submittedCommandBufferCount -= 1;
+    if (!cancel) {
+        for (i = 0; i < renderer->submittedCommandBufferCount; i += 1) {
+            if (renderer->submittedCommandBuffers[i] == commandBuffer) {
+                renderer->submittedCommandBuffers[i] = renderer->submittedCommandBuffers[renderer->submittedCommandBufferCount - 1];
+                renderer->submittedCommandBufferCount -= 1;
+            }
         }
     }
 
@@ -5024,7 +5026,8 @@ static bool D3D11_WaitForFences(
         if (res == S_OK) {
             result &= D3D11_INTERNAL_CleanCommandBuffer(
                 renderer,
-                renderer->submittedCommandBuffers[i]);
+                renderer->submittedCommandBuffers[i],
+                false);
         }
     }
 
@@ -5696,6 +5699,11 @@ static bool D3D11_Submit(
 
     SDL_LockMutex(renderer->contextLock);
 
+    if (!D3D11_INTERNAL_AcquireFence(d3d11CommandBuffer)) {
+        SDL_UnlockMutex(renderer->contextLock);
+        return false;
+    }
+
     // Notify the command buffer completion query that we have completed recording
     ID3D11DeviceContext_End(
         renderer->immediateContext,
@@ -5778,7 +5786,8 @@ static bool D3D11_Submit(
         if (res == S_OK) {
             result &= D3D11_INTERNAL_CleanCommandBuffer(
                 renderer,
-                renderer->submittedCommandBuffers[i]);
+                renderer->submittedCommandBuffers[i],
+                false);
         }
     }
 
@@ -5793,12 +5802,26 @@ static SDL_GPUFence *D3D11_SubmitAndAcquireFence(
     SDL_GPUCommandBuffer *commandBuffer)
 {
     D3D11CommandBuffer *d3d11CommandBuffer = (D3D11CommandBuffer *)commandBuffer;
-    D3D11Fence *fence = d3d11CommandBuffer->fence;
+    d3d11CommandBuffer->autoReleaseFence = false;
+    if (!D3D11_Submit(commandBuffer)) {
+        return NULL;
+    }
+    return (SDL_GPUFence *)d3d11CommandBuffer->fence;
+}
+
+static bool D3D11_Cancel(
+    SDL_GPUCommandBuffer *commandBuffer)
+{
+    D3D11CommandBuffer *d3d11CommandBuffer = (D3D11CommandBuffer *)commandBuffer;
+    D3D11Renderer *renderer = d3d11CommandBuffer->renderer;
+    bool result;
 
-    d3d11CommandBuffer->autoReleaseFence = 0;
-    D3D11_Submit(commandBuffer);
+    d3d11CommandBuffer->autoReleaseFence = false;
+    SDL_LockMutex(renderer->contextLock);
+    result = D3D11_INTERNAL_CleanCommandBuffer(renderer, d3d11CommandBuffer, true);
+    SDL_UnlockMutex(renderer->contextLock);
 
-    return (SDL_GPUFence *)fence;
+    return result;
 }
 
 static bool D3D11_Wait(
@@ -5822,7 +5845,7 @@ static bool D3D11_Wait(
 
     for (Sint32 i = renderer->submittedCommandBufferCount - 1; i >= 0; i -= 1) {
         commandBuffer = renderer->submittedCommandBuffers[i];
-        result &= D3D11_INTERNAL_CleanCommandBuffer(renderer, commandBuffer);
+        result &= D3D11_INTERNAL_CleanCommandBuffer(renderer, commandBuffer, false);
     }
 
     D3D11_INTERNAL_PerformPendingDestroys(renderer);
diff --git a/src/gpu/d3d12/SDL_gpu_d3d12.c b/src/gpu/d3d12/SDL_gpu_d3d12.c
index a489f204b81f6..00c329312a84e 100644
--- a/src/gpu/d3d12/SDL_gpu_d3d12.c
+++ b/src/gpu/d3d12/SDL_gpu_d3d12.c
@@ -7297,18 +7297,20 @@ static bool D3D12_INTERNAL_CopyTextureDownload(
 
 static bool D3D12_INTERNAL_CleanCommandBuffer(
     D3D12Renderer *renderer,
-    D3D12CommandBuffer *commandBuffer)
+    D3D12CommandBuffer *commandBuffer,
+    bool cancel)
 {
     Uint32 i;
     HRESULT res;
     bool result = true;
 
     // Perform deferred texture data copies
-
     for (i = 0; i < commandBuffer->textureDownloadCount; i += 1) {
-        result &= D3D12_INTERNAL_CopyTextureDownload(
-            commandBuffer,
-            commandBuffer->textureDownloads[i]);
+        if (!cancel) {
+            result &= D3D12_INTERNAL_CopyTextureDownload(
+                commandBuffer,
+                commandBuffer->textureDownloads[i]);
+        }
         SDL_free(commandBuffer->textureDownloads[i]);
     }
     commandBuffer->textureDownloadCount = 0;
@@ -7401,10 +7403,12 @@ static bool D3D12_INTERNAL_CleanCommandBuffer(
     SDL_UnlockMutex(renderer->acquireCommandBufferLock);
 
     // Remove this command buffer from the submitted list
-    for (i = 0; i < renderer->submittedCommandBufferCount; i += 1) {
-        if (renderer->submittedCommandBuffers[i] == commandBuffer) {
-            renderer->submittedCommandBuffers[i] = renderer->submittedCommandBuffers[renderer->submittedCommandBufferCount - 1];
-            renderer->submittedCommandBufferCount -= 1;
+    if (!cancel) {
+        for (i = 0; i < renderer->submittedCommandBufferCount; i += 1) {
+            if (renderer->submittedCommandBuffers[i] == commandBuffer) {
+                renderer->submittedCommandBuffers[i] = renderer->submittedCommandBuffers[renderer->submittedCommandBufferCount - 1];
+                renderer->submittedCommandBufferCount -= 1;
+            }
         }
     }
 
@@ -7573,7 +7577,8 @@ static bool D3D12_Submit(
         if (fenceValue == D3D12_FENCE_SIGNAL_VALUE) {
             result &= D3D12_INTERNAL_CleanCommandBuffer(
                 renderer,
-                renderer->submittedCommandBuffers[i]);
+                renderer->submittedCommandBuffers[i],
+                false);
         }
     }
 
@@ -7589,10 +7594,32 @@ static SDL_GPUFence *D3D12_SubmitAndAcquireFence(
 {
     D3D12CommandBuffer *d3d12CommandBuffer = (D3D12CommandBuffer *)commandBuffer;
     d3d12CommandBuffer->autoReleaseFence = false;
-    D3D12_Submit(commandBuffer);
+    if (!D3D12_Submit(commandBuffer)) {
+        return NULL;
+    }
     return (SDL_GPUFence *)d3d12CommandBuffer->inFlightFence;
 }
 
+static bool D3D12_Cancel(
+    SDL_GPUCommandBuffer *commandBuffer)
+{
+    D3D12CommandBuffer *d3d12CommandBuffer = (D3D12CommandBuffer *)commandBuffer;
+    D3D12Renderer *renderer = d3d12CommandBuffer->renderer;
+    bool result;
+    HRESULT res;
+    // Cancel path: close the command list, then clean up without executing it.
+    // Notify the command buffer that we have completed recording
+    res = ID3D12GraphicsCommandList_Close(d3d12CommandBuffer->graphicsCommandList);
+    CHECK_D3D12_ERROR_AND_RETURN("Failed to close command list!", false);
+    // cancel=true makes the clean-up skip texture downloads and the submitted-list removal.
+    d3d12CommandBuffer->autoReleaseFence = false;
+    SDL_LockMutex(renderer->submitLock);
+    result = D3D12_INTERNAL_CleanCommandBuffer(renderer, d3d12CommandBuffer, true);
+    SDL_UnlockMutex(renderer->submitLock);
+
+    return result;
+}
+
 static bool D3D12_Wait(
     SDL_GPURenderer *driverData)
 {
@@ -7636,7 +7663,7 @@ static bool D3D12_Wait(
 
     // Clean up
     for (Sint32 i = renderer->submittedCommandBufferCount - 1; i >= 0; i -= 1) {
-        result &= D3D12_INTERNAL_CleanCommandBuffer(renderer, renderer->submittedCommandBuffers[i]);
+        result &= D3D12_INTERNAL_CleanCommandBuffer(renderer, renderer->submittedCommandBuffers[i], false);
     }
 
     D3D12_INTERNAL_PerformPendingDestroys(renderer);
@@ -7692,7 +7719,8 @@ static bool D3D12_WaitForFences(
         if (fenceValue == D3D12_FENCE_SIGNAL_VALUE) {
             result &= D3D12_INTERNAL_CleanCommandBuffer(
                 renderer,
-                renderer->submittedCommandBuffers[i]);
+                renderer->submittedCommandBuffers[i],
+                false);
         }
     }
 
diff --git a/src/gpu/metal/SDL_gpu_metal.m b/src/gpu/metal/SDL_gpu_metal.m
index 04a5bff59692e..6c496a5c11402 100644
--- a/src/gpu/metal/SDL_gpu_metal.m
+++ b/src/gpu/metal/SDL_gpu_metal.m
@@ -446,6 +446,7 @@ static MTLDepthClipMode SDLToMetal_DepthClipMode(
 typedef struct MetalFence
 {
     SDL_AtomicInt complete;
+    SDL_AtomicInt referenceCount;
 } MetalFence;
 
 typedef struct MetalWindowData
@@ -453,9 +454,12 @@ static MTLDepthClipMode SDLToMetal_DepthClipMode(
     SDL_Window *window;
     SDL_MetalView view;
     CAMetalLayer *layer;
+    SDL_GPUPresentMode presentMode;
     id<CAMetalDrawable> drawable;
     MetalTexture texture;
     MetalTextureContainer textureContainer;
+    SDL_GPUFence *inFlightFences[MAX_FRAMES_IN_FLIGHT];
+    Uint32 frameCounter;
 } MetalWindowData;
 
 typedef struct MetalShader
@@ -605,7 +609,7 @@ static MTLDepthClipMode SDLToMetal_DepthClipMode(
 
     // Fences
     MetalFence *fence;
-    Uint8 autoReleaseFence;
+    bool autoReleaseFence;
 
     // Reference Counting
     MetalBuffer **usedBuffers;
@@ -2019,6 +2023,7 @@ static Uint8 METAL_INTERNAL_CreateFence(
 
     fence = SDL_calloc(1, sizeof(MetalFence));
     SDL_SetAtomicInt(&fence->complete, 0);
+    SDL_SetAtomicInt(&fence->referenceCount, 0);
 
     // Add it to the available pool
     // FIXME: Should this be EXPAND_IF_NEEDED?
@@ -2036,7 +2041,7 @@ static Uint8 METAL_INTERNAL_CreateFence(
     return 1;
 }
 
-static Uint8 METAL_INTERNAL_AcquireFence(
+static bool METAL_INTERNAL_AcquireFence(
     MetalRenderer *renderer,
     MetalCommandBuffer *commandBuffer)
 {
@@ -2049,7 +2054,7 @@ static Uint8 METAL_INTERNAL_AcquireFence(
         if (!METAL_INTERNAL_CreateFence(renderer)) {
             SDL_UnlockMutex(renderer->fenceLock);
             SDL_LogError(SDL_LOG_CATEGORY_GPU, "Failed to create fence!");
-            return 0;
+            return false;
         }
     }
 
@@ -2061,8 +2066,9 @@ static Uint8 METAL_INTERNAL_AcquireFence(
     // Associate the fence with the command buffer
     commandBuffer->fence = fence;
     SDL_SetAtomicInt(&fence->complete, 0); // FIXME: Is this right?
+    (void)SDL_AtomicIncRef(&commandBuffer->fence->referenceCount);
 
-    return 1;
+    return true;
 }
 
 static SDL_GPUCommandBuffer *METAL_AcquireCommandBuffer(
@@ -2099,8 +2105,7 @@ static Uint8 METAL_INTERNAL_AcquireFence(
         commandBuffer->needComputeTextureBind = true;
         commandBuffer->needComputeUniformBind = true;
 
-        METAL_INTERNAL_AcquireFence(renderer, commandBuffer);
-        commandBuffer->autoReleaseFence = 1;
+        commandBuffer->autoReleaseFence = true;
 
         SDL_UnlockMutex(renderer->acquireCommandBufferLock);
 
@@ -3266,29 +3271,36 @@ static void METAL_ReleaseFence(
     SDL_GPURenderer *driverData,
     SDL_GPUFence *fence)
 {
-    METAL_INTERNAL_ReleaseFenceToPool(
-        (MetalRenderer *)driverData,
-        (MetalFence *)fence);
+    MetalFence *metalFence = (MetalFence *)fence;
+    if (SDL_AtomicDecRef(&metalFence->referenceCount)) {
+        METAL_INTERNAL_ReleaseFenceToPool(
+            (MetalRenderer *)driverData,
+            (MetalFence *)fence);
+    }
 }
 
 // Cleanup
 
 static void METAL_INTERNAL_CleanCommandBuffer(
     MetalRenderer *renderer,
-    MetalCommandBuffer *commandBuffer)
+    MetalCommandBuffer *commandBuffer,
+    bool cancel)
 {
     Uint32 i;
 
-    // Reference Counting
-    for (i = 0; i < commandBuffer->usedBufferCount; i += 1) {
-        (void)SDL_AtomicDecRef(&commandBuffer->usedBuffers[i]->referenceCount);
+    // End any active passes
+    if (commandBuffer->renderEncoder) {
+        [commandBuffer->renderEncoder endEncoding];
+        commandBuffer->renderEncoder = nil;
     }
-    commandBuffer->usedBufferCount = 0;
-
-    for (i = 0; i < commandBuffer->usedTextureCount; i += 1) {
-        (void)SDL_AtomicDecRef(&commandBuffer->usedTextures[i]->referenceCount);
+    if (commandBuffer->computeEncoder) {
+        [commandBuffer->computeEncoder endEncoding];
+        commandBuffer->computeEncoder = nil;
+    }
+    if (commandBuffer->blitEncoder) {
+        [commandBuffer->blitEncoder endEncoding];
+        commandBuffer->blitEncoder = nil;
     }
-    commandBuffer->usedTextureCount = 0;
 
     // Uniform buffers are now available
 
@@ -3303,6 +3315,18 @@ static void METAL_INTERNAL_CleanCommandBuffer(
 
     SDL_UnlockMutex(renderer->acquireUniformBufferLock);
 
+    // Reference Counting
+
+    for (i = 0; i < commandBuffer->usedBufferCount; i += 1) {
+        (void)SDL_AtomicDecRef(&commandBuffer->usedBuffers[i]->referenceCount);
+    }
+    commandBuffer->usedBufferCount = 0;
+
+    for (i = 0; i < commandBuffer->usedTextureCount; i += 1) {
+        (void)SDL_AtomicDecRef(&commandBuffer->usedTextures[i]->referenceCount);
+    }
+    commandBuffer->usedTextureCount = 0;
+
     // Reset presentation
     commandBuffer->windowDataCount = 0;
 
@@ -3354,10 +3378,12 @@ static void METAL_INTERNAL_CleanCommandBuffer(
     SDL_UnlockMutex(renderer->acquireCommandBufferLock);
 
     // Remove this command buffer from the submitted list
-    for (i = 0; i < renderer->submittedCommandBufferCount; i += 1) {
-        if (renderer->submittedCommandBuffers[i] == commandBuffer) {
-            renderer->submittedCommandBuffers[i] = renderer->submittedCommandBuffers[renderer->submittedCommandBufferCount - 1];
-            renderer->submittedCommandBufferCount -= 1;
+    if (!cancel) {
+        for (i = 0; i < renderer->submittedCommandBufferCount; i += 1) {
+            if (renderer->submittedCommandBuffers[i] == commandBuffer) {
+                renderer->submittedCommandBuffers[i] = renderer->submittedCommandBuffers[renderer->submittedCommandBufferCount - 1];
+                renderer->submittedCommandBufferCount -= 1;
+            }
         }
     }
 }
@@ -3483,12 +3509,19 @@ static Uint8 METAL_INTERNAL_CreateSwapchain(
 
     windowData->view = SDL_Metal_CreateView(windowData->window);
     windowData->drawable = nil;
+    windowData->presentMode = SDL_GPU_PRESENTMODE_VSYNC;
+    windowData->frameCounter = 0;
+
+    for (int i = 0; i < MAX_FRAMES_IN_FLIGHT; i += 1) {
+        windowData->inFlightFences[i] = NULL;
+    }
 
     windowData->layer = (__bridge CAMetalLayer *)(SDL_Metal_GetLayer(windowData->view));
     windowData->layer.device = renderer->device;
 #ifdef SDL_PLATFORM_MACOS
     if (@available(macOS 10.13, *)) {
         windowData->layer.displaySyncEnabled = (presentMode != SDL_GPU_PRESENTMODE_IMMEDIATE);
+        windowData->presentMode = presentMode;
     }
 #endif
     windowData->layer.pixelFormat = SDLToMetal_TextureFormat(SwapchainCompositionToFormat[swapchainComposition]);
@@ -3610,6 +3643,13 @@ static void METAL_ReleaseWindow(
 
         METAL_Wait(driverData);
         SDL_Metal_DestroyView(windowData->view);
+        for (int i = 0; i < MAX_FRAMES_IN_FLIGHT; i += 1) {
+            if (windowData->inFlightFences[i] != NULL) {
+                METAL_ReleaseFence(
+                    (SDL_GPURenderer *)renderer,
+                    windowData->inFlightFences[i]);
+            }
+        }
 
         SDL_LockMutex(renderer->windowLock);
         for (Uint32 i = 0; i < renderer->claimedWindowCount; i += 1) {
@@ -3653,10 +3693,6 @@ static bool METAL_AcquireSwapchainTexture(
             SET_STRING_ERROR_AND_RETURN("Window is not claimed by this SDL_GpuDevice", false);
         }
 
-        // Get the drawable and its underlying texture
-        windowData->drawable = [windowData->layer nextDrawable];
-        windowData->texture.handle = [windowData->drawable texture];
-
         // Update the window size
         drawableSize = windowData->layer.drawableSize;
         windowData->textureContainer.header.info.width = (Uint32)drawableSize.width;
@@ -3668,6 +3704,39 @@ static bool METAL_AcquireSwapchainTexture(
             *swapchainTextureHeight = (Uint32)drawableSize.height;
         }
 
+        if (windowData->inFlightFences[windowData->frameCounter] != NULL) {
+            if (windowData->presentMode == SDL_GPU_PRESENTMODE_VSYNC) {
+                // In VSYNC mode, block until the least recently presented frame is done
+                if (!METAL_WaitForFences(
+                    (SDL_GPURenderer *)renderer,
+                    true,
+                    &windowData->inFlightFences[windowData->frameCounter],
+                    1)) {
+                    return false;
+                }
+            } else {
+                if (!METAL_QueryFence(
+                        (SDL_GPURenderer *)metalCommandBuffer->renderer,
+                        windowData->inFlightFences[windowData->frameCounter])) {
+                    /*
+                    * In any non-VSYNC present mode, if the least recent fence is not signaled,
+                    * return true to indicate that there is no error but rendering should be skipped
+                    */
+                    return true;
+                }
+            }
+
+            METAL_ReleaseFence(
+                (SDL_GPURenderer *)metalCommandBuffer->renderer,
+                windowData->inFlightFences[windowData->frameCounter]);
+
+            windowData->inFlightFences[windowData->frameCounter] = NULL;
+        }
+
+        // Get the drawable and its underlying texture
+        windowData->drawable = [windowData->layer nextDrawable];
+        windowData->texture.handle = [windowData->drawable texture];
+
         // Set up presentation
         if (metalCommandBuffer->windowDataCount == metalCommandBuffer->windowDataCapacity) {
             metalCommandBuffer->windowDataCapacity += 1;
@@ -3723,9 +3792,12 @@ static bool METAL_SetSwapchainParameters(
 
         METAL_Wait(driverData);
 
+        windowData->presentMode = SDL_GPU_PRESENTMODE_VSYNC;
+
 #ifdef SDL_PLATFORM_MACOS
         if (@available(macOS 10.13, *)) {
             windowData->layer.displaySyncEnabled = (presentMode != SDL_GPU_PRESENTMODE_IMMEDIATE);
+            windowData->presentMode = presentMode;
         }
 #endif
         windowData->layer.pixelFormat = SDLToMetal_TextureFormat(SwapchainCompositionToFormat[swapchainComposition]);
@@ -3756,10 +3828,22 @@ static bool METAL_Submit(
 
         SDL_LockMutex(renderer->submitLock);
 
+        if (!METAL_INTERNAL_AcquireFence(renderer, metalCommandBuffer)) {
+            SDL_UnlockMutex(renderer->submitLock);
+            return false;
+        }
+
         // Enqueue present requests, if applicable
         for (Uint32 i = 0; i < metalCommandBuffer->windowDataCount; i += 1) {
-            [metalCommandBuffer->handle presentDrawable:metalCommandBuffer->windowDatas[i]->drawable];
-            metalCommandBuffer->windowDatas[i]->drawable = nil;
+            MetalWindowData *windowData = metalCommandBuffer->windowDatas[i];
+            [metalCommandBuffer->handle presentDrawable:windowData->drawable];
+            windowData->drawable = nil;
+
+            windowData->inFlightFences[windowData->frameCounter] = (SDL_GPUFence *)metalCommandBuffer->fence;
+
+            (void)SDL_AtomicIncRef(&metalCommandBuffer->fence->referenceCount);
+
+            windowData->frameCounter = (windowData->frameCounter + 1) % MAX_FRAMES_IN_FLIGHT;
         }
 
         // Notify the fence when the command buffer has completed
@@ -3787,7 +3871,8 @@ static bool METAL_Submit(
             if (SDL_GetAtomicInt(&renderer->submittedCommandBuffers[i]->fence->complete)) {
                 METAL_INTERNAL_CleanCommandBuffer(
                     renderer,
-                    renderer->submittedCommandBuffers[i]);
+                    renderer->submittedCommandBuffers[i],
+                    false);
             }
         }
 
@@ -3803,12 +3888,25 @@ static bool METAL_Submit(
     SDL_GPUCommandBuffer *commandBuffer)
 {
     MetalCommandBuffer *metalCommandBuffer = (MetalCommandBuffer *)commandBuffer;
-    MetalFence *fence = metalCommandBuffer->fence;
+    metalCommandBuffer->autoReleaseFence = false;
+    if (!METAL_Submit(commandBuffer)) {
+        return NULL;
+    }
+    return (SDL_GPUFence *)metalCommandBuffer->fence;
+}
+
+static bool METAL_Cancel(
+    SDL_GPUCommandBuffer *commandBuffer)
+{
+    MetalCommandBuffer *metalCommandBuffer = (MetalCommandBuffer *)commandBuffer;
+    MetalRenderer *renderer = metalCommandBuffer->renderer;
 
-    metalCommandBuffer->autoReleaseFence = 0;
-    METAL_Submit(commandBuffer);
+    metalCommandBuffer->autoReleaseFence = false;
+    SDL_LockMutex(renderer->submitLock);
+    METAL_INTERNAL_CleanCommandBuffer(renderer, metalCommandBuffer, true);
+    SDL_UnlockMutex(renderer->submitLock);
 
-    return (SDL_GPUFence *)fence;
+    return true;
 }
 
 static bool METAL_Wait(
@@ -3832,7 +3930,7 @@ static bool METAL_Wait(
 
         for (Sint32 i = renderer->submittedCommandBufferCount - 1; i >= 0; i -= 1) {
             commandBuffer = renderer->submittedCommandBuffers[i];
-            METAL_INTERNAL_CleanCommandBuffer(renderer, commandBuffer);
+            METAL_INTERNAL_CleanCommandBuffer(renderer, commandBuffer, false);
         }
 
         METAL_INTERNAL_PerformPendingDestroys(renderer);
diff --git a/src/gpu/vulkan/SDL_gpu_vulkan.c b/src/gpu/vulkan/SDL_gpu_vulkan.c
index ce3abd60d17b3..22983a52022b8 100644
--- a/src/gpu/vulkan/SDL_gpu_vulkan.c
+++ b/src/gpu/vulkan/SDL_gpu_vulkan.c
@@ -630,8 +630,6 @@ typedef struct VulkanTextureSubresource
     VkImageView *renderTargetViews;

(Patch may be truncated, please check the link at the top of this post.)