From 4ee896af99a71fe10a91d348869842ebd28b0004 Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Sun, 12 Apr 2026 17:10:40 +0200 Subject: [PATCH 1/6] render/vulkan: New staging buffer implementation Implement a ring-buffer that uses timeline points to track and release allocated spans. We add larger ring-buffers when it fills, and remove them when they have been unused for many collection cycles. Signed-off-by: Kenny Levinsen --- include/render/vulkan.h | 56 +++++--- render/vulkan/pass.c | 15 +- render/vulkan/renderer.c | 289 ++++++++++++++++++++++----------------- render/vulkan/texture.c | 6 +- 4 files changed, 211 insertions(+), 155 deletions(-) diff --git a/include/render/vulkan.h b/include/render/vulkan.h index 021749c27..de1195b8f 100644 --- a/include/render/vulkan.h +++ b/include/render/vulkan.h @@ -284,8 +284,6 @@ struct wlr_vk_command_buffer { uint64_t timeline_point; // Textures to destroy after the command buffer completes struct wl_list destroy_textures; // wlr_vk_texture.destroy_link - // Staging shared buffers to release after the command buffer completes - struct wl_list stage_buffers; // wlr_vk_shared_buffer.link // Color transform to unref after the command buffer completes struct wlr_color_transform *color_transform; @@ -352,7 +350,7 @@ struct wlr_vk_renderer { struct { struct wlr_vk_command_buffer *cb; uint64_t last_timeline_point; - struct wl_list buffers; // wlr_vk_shared_buffer.link + struct wl_list buffers; // wlr_vk_stage_buffer.link } stage; struct { @@ -453,14 +451,20 @@ struct wlr_vk_render_pass { struct wlr_vk_render_pass *vulkan_begin_render_pass(struct wlr_vk_renderer *renderer, struct wlr_vk_render_buffer *buffer, const struct wlr_buffer_pass_options *options); -// Suballocates a buffer span with the given size that can be mapped -// and used as staging buffer. The allocation is implicitly released when the -// stage cb has finished execution. The start of the span will be a multiple -// of the given alignment. 
+// Suballocates a buffer span with the given size from the staging ring buffer +// that is mapped for CPU access. vulkan_stage_mark_submit must be called after +// allocations are made to mark the timeline point after which the allocations +// will be released. The start of the span will be a multiple of alignment. struct wlr_vk_buffer_span vulkan_get_stage_span( struct wlr_vk_renderer *renderer, VkDeviceSize size, VkDeviceSize alignment); +// Records a watermark on all staging buffers with new allocations with the +// specified timeline point. Once the timeline point is passed, the span will +// be reclaimed by vulkan_stage_buffer_reclaim. +void vulkan_stage_mark_submit(struct wlr_vk_renderer *renderer, + uint64_t timeline_point); + // Tries to allocate a texture descriptor set. Will additionally // return the pool it was allocated from when successful (for freeing it later). struct wlr_vk_descriptor_pool *vulkan_alloc_texture_ds( @@ -544,29 +548,43 @@ struct wlr_vk_descriptor_pool { struct wl_list link; // wlr_vk_renderer.descriptor_pools }; -struct wlr_vk_allocation { - VkDeviceSize start; - VkDeviceSize size; +struct wlr_vk_stage_watermark { + VkDeviceSize head; + uint64_t timeline_point; }; -// List of suballocated staging buffers. -// Used to upload to/read from device local images. -struct wlr_vk_shared_buffer { - struct wl_list link; // wlr_vk_renderer.stage.buffers or wlr_vk_command_buffer.stage_buffers +// Ring buffer for staging transfers +struct wlr_vk_stage_buffer { + struct wl_list link; // wlr_vk_renderer.stage.buffers VkBuffer buffer; VkDeviceMemory memory; VkDeviceSize buf_size; void *cpu_mapping; - struct wl_array allocs; // struct wlr_vk_allocation - int64_t last_used_ms; + + VkDeviceSize head; + VkDeviceSize tail; + + struct wl_array watermarks; // struct wlr_vk_stage_watermark + int empty_gc_cnt; }; -// Suballocated range on a buffer. +// Suballocated range on a staging ring buffer. 
struct wlr_vk_buffer_span { - struct wlr_vk_shared_buffer *buffer; - struct wlr_vk_allocation alloc; + struct wlr_vk_stage_buffer *buffer; + VkDeviceSize offset; + VkDeviceSize size; }; +// Suballocate a span of size bytes from a staging ring buffer, with the +// returned offset rounded up to the given alignment. Returns the byte offset +// of the allocation, or (VkDeviceSize)-1 if the buffer is too full to fit it. +VkDeviceSize vulkan_stage_buffer_alloc(struct wlr_vk_stage_buffer *buf, + VkDeviceSize size, VkDeviceSize alignment); + +// Free all allocations covered by watermarks whose timeline point has been +// reached. +void vulkan_stage_buffer_reclaim(struct wlr_vk_stage_buffer *buf, + uint64_t current_point); // Prepared form for a color transform struct wlr_vk_color_transform { diff --git a/render/vulkan/pass.c b/render/vulkan/pass.c index a832a9f0a..3c909d6fd 100644 --- a/render/vulkan/pass.c +++ b/render/vulkan/pass.c @@ -595,14 +595,7 @@ static bool render_pass_submit(struct wlr_render_pass *wlr_pass) { free(render_wait); - struct wlr_vk_shared_buffer *stage_buf, *stage_buf_tmp; - wl_list_for_each_safe(stage_buf, stage_buf_tmp, &renderer->stage.buffers, link) { - if (stage_buf->allocs.size == 0) { - continue; - } - wl_list_remove(&stage_buf->link); - wl_list_insert(&stage_cb->stage_buffers, &stage_buf->link); - } + vulkan_stage_mark_submit(renderer, render_timeline_point); if (!vulkan_sync_render_pass_release(renderer, pass)) { wlr_log(WLR_ERROR, "Failed to sync render buffer"); @@ -1056,13 +1049,13 @@ static bool create_3d_lut_image(struct wlr_vk_renderer *renderer, size_t size = dim_len * dim_len * dim_len * bytes_per_block; struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, size, bytes_per_block); - if (!span.buffer || span.alloc.size != size) { + if (!span.buffer || span.size != size) { wlr_log(WLR_ERROR, "Failed to retrieve staging buffer"); goto fail_imageview; } float sample_range = 1.0f / (dim_len - 1); - char *map = (char 
*)span.buffer->cpu_mapping + span.alloc.start; + char *map = (char *)span.buffer->cpu_mapping + span.offset; float *dst = (float *)map; for (size_t b_index = 0; b_index < dim_len; b_index++) { for (size_t g_index = 0; g_index < dim_len; g_index++) { @@ -1092,7 +1085,7 @@ static bool create_3d_lut_image(struct wlr_vk_renderer *renderer, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); VkBufferImageCopy copy = { - .bufferOffset = span.alloc.start, + .bufferOffset = span.offset, .imageExtent.width = dim_len, .imageExtent.height = dim_len, .imageExtent.depth = dim_len, diff --git a/render/vulkan/renderer.c b/render/vulkan/renderer.c index a5bc97512..2c42b4096 100644 --- a/render/vulkan/renderer.c +++ b/render/vulkan/renderer.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -8,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -26,11 +26,9 @@ #include "render/vulkan/shaders/texture.frag.h" #include "render/vulkan/shaders/quad.frag.h" #include "render/vulkan/shaders/output.frag.h" -#include "types/wlr_buffer.h" -#include "util/time.h" +#include "util/array.h" // TODO: -// - simplify stage allocation, don't track allocations but use ringbuffer-like // - use a pipeline cache (not sure when to save though, after every pipeline // creation?) 
// - create pipelines as derivatives of each other @@ -187,18 +185,13 @@ static void destroy_render_format_setup(struct wlr_vk_renderer *renderer, free(setup); } -static void shared_buffer_destroy(struct wlr_vk_renderer *r, - struct wlr_vk_shared_buffer *buffer) { +static void stage_buffer_destroy(struct wlr_vk_renderer *r, + struct wlr_vk_stage_buffer *buffer) { if (!buffer) { return; } - if (buffer->allocs.size > 0) { - wlr_log(WLR_ERROR, "shared_buffer_finish: %zu allocations left", - buffer->allocs.size / sizeof(struct wlr_vk_allocation)); - } - - wl_array_release(&buffer->allocs); + wl_array_release(&buffer->watermarks); if (buffer->cpu_mapping) { vkUnmapMemory(r->dev->dev, buffer->memory); buffer->cpu_mapping = NULL; @@ -214,75 +207,12 @@ static void shared_buffer_destroy(struct wlr_vk_renderer *r, free(buffer); } -struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, - VkDeviceSize size, VkDeviceSize alignment) { - // try to find free span - // simple greedy allocation algorithm - should be enough for this usecase - // since all allocations are freed together after the frame - struct wlr_vk_shared_buffer *buf; - wl_list_for_each_reverse(buf, &r->stage.buffers, link) { - VkDeviceSize start = 0u; - if (buf->allocs.size > 0) { - const struct wlr_vk_allocation *allocs = buf->allocs.data; - size_t allocs_len = buf->allocs.size / sizeof(struct wlr_vk_allocation); - const struct wlr_vk_allocation *last = &allocs[allocs_len - 1]; - start = last->start + last->size; - } - - assert(start <= buf->buf_size); - - // ensure the proposed start is a multiple of alignment - start += alignment - 1 - ((start + alignment - 1) % alignment); - - if (buf->buf_size - start < size) { - continue; - } - - struct wlr_vk_allocation *a = wl_array_add(&buf->allocs, sizeof(*a)); - if (a == NULL) { - wlr_log_errno(WLR_ERROR, "Allocation failed"); - goto error_alloc; - } - - *a = (struct wlr_vk_allocation){ - .start = start, - .size = size, - }; - return (struct 
wlr_vk_buffer_span) { - .buffer = buf, - .alloc = *a, - }; - } - - if (size > max_stage_size) { - wlr_log(WLR_ERROR, "cannot vulkan stage buffer: " - "requested size (%zu bytes) exceeds maximum (%zu bytes)", - (size_t)size, (size_t)max_stage_size); - goto error_alloc; - } - - // we didn't find a free buffer - create one - // size = clamp(max(size * 2, prev_size * 2), min_size, max_size) - VkDeviceSize bsize = size * 2; - bsize = bsize < min_stage_size ? min_stage_size : bsize; - if (!wl_list_empty(&r->stage.buffers)) { - struct wl_list *last_link = r->stage.buffers.prev; - struct wlr_vk_shared_buffer *prev = wl_container_of( - last_link, prev, link); - VkDeviceSize last_size = 2 * prev->buf_size; - bsize = bsize < last_size ? last_size : bsize; - } - - if (bsize > max_stage_size) { - wlr_log(WLR_INFO, "vulkan stage buffers have reached max size"); - bsize = max_stage_size; - } - - // create buffer - buf = calloc(1, sizeof(*buf)); +static struct wlr_vk_stage_buffer *stage_buffer_create( + struct wlr_vk_renderer *r, VkDeviceSize bsize) { + struct wlr_vk_stage_buffer *buf = calloc(1, sizeof(*buf)); if (!buf) { wlr_log_errno(WLR_ERROR, "Allocation failed"); - goto error_alloc; + return NULL; } wl_list_init(&buf->link); @@ -319,7 +249,7 @@ struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, }; res = vkAllocateMemory(r->dev->dev, &mem_info, NULL, &buf->memory); if (res != VK_SUCCESS) { - wlr_vk_error("vkAllocatorMemory", res); + wlr_vk_error("vkAllocateMemory", res); goto error; } @@ -335,34 +265,162 @@ struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, goto error; } - struct wlr_vk_allocation *a = wl_array_add(&buf->allocs, sizeof(*a)); - if (a == NULL) { - wlr_log_errno(WLR_ERROR, "Allocation failed"); + buf->buf_size = bsize; + return buf; + +error: + stage_buffer_destroy(r, buf); + return NULL; +} + +void vulkan_stage_buffer_reclaim(struct wlr_vk_stage_buffer *buf, + uint64_t current_point) { + + size_t completed = 0; + 
struct wlr_vk_stage_watermark *mark; + wl_array_for_each(mark, &buf->watermarks) { + if (mark->timeline_point > current_point) { + break; + } + buf->tail = mark->head; + completed++; + } + + if (completed > 0) { + completed *= sizeof(struct wlr_vk_stage_watermark); + if (completed == buf->watermarks.size) { + buf->watermarks.size = 0; + } else { + array_remove_at(&buf->watermarks, 0, completed); + } + } +} + +VkDeviceSize vulkan_stage_buffer_alloc(struct wlr_vk_stage_buffer *buf, + VkDeviceSize size, VkDeviceSize alignment) { + VkDeviceSize head = buf->head; + + // Round up to the next multiple of alignment + VkDeviceSize rem = head % alignment; + if (rem != 0) { + head += alignment - rem; + } + + VkDeviceSize end = head >= buf->tail ? buf->buf_size : buf->tail; + if (head + size < end) { + // Regular allocation head till end of available space + buf->head = head + size; + return head; + } else if (size < buf->tail && head >= buf->tail) { + // First allocation after wrap-around + buf->head = size; + return 0; + } + + return (VkDeviceSize)-1; +} + +struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, + VkDeviceSize size, VkDeviceSize alignment) { + if (size >= max_stage_size) { + wlr_log(WLR_ERROR, "cannot allocate stage buffer: " + "requested size (%zu bytes) exceeds maximum (%zu bytes)", + (size_t)size, (size_t)max_stage_size-1); goto error; } - buf->buf_size = bsize; - wl_list_insert(&r->stage.buffers, &buf->link); + VkDeviceSize max_buf_size = min_stage_size / 2; + struct wlr_vk_stage_buffer *buf; + wl_list_for_each(buf, &r->stage.buffers, link) { + VkDeviceSize offset = vulkan_stage_buffer_alloc(buf, size, alignment); + if (offset != (VkDeviceSize)-1) { + return (struct wlr_vk_buffer_span) { + .buffer = buf, + .offset = offset, + .size = size, + }; + } + if (buf->buf_size > max_buf_size) { + max_buf_size = buf->buf_size; + } + } + + VkDeviceSize bsize = max_buf_size * 2; + while (size * 2 > bsize) { + bsize *= 2; + } + if (bsize > 
max_stage_size) { + wlr_log(WLR_INFO, "vulkan stage buffer has reached max size"); + bsize = max_stage_size; + } + + struct wlr_vk_stage_buffer *new_buf = stage_buffer_create(r, bsize); + if (new_buf == NULL) { + goto error; + } + + wl_list_insert(r->stage.buffers.prev, &new_buf->link); + + VkDeviceSize offset = vulkan_stage_buffer_alloc(new_buf, size, alignment); + assert(offset != (VkDeviceSize)-1); - *a = (struct wlr_vk_allocation){ - .start = 0, - .size = size, - }; return (struct wlr_vk_buffer_span) { - .buffer = buf, - .alloc = *a, + .buffer = new_buf, + .offset = offset, + .size = size, }; error: - shared_buffer_destroy(r, buf); - -error_alloc: return (struct wlr_vk_buffer_span) { .buffer = NULL, - .alloc = (struct wlr_vk_allocation) {0, 0}, + .offset = 0, + .size = 0, }; } +void vulkan_stage_mark_submit(struct wlr_vk_renderer *renderer, + uint64_t timeline_point) { + struct wlr_vk_stage_buffer *buf; + wl_list_for_each(buf, &renderer->stage.buffers, link) { + if (buf->head == buf->tail) { + continue; + } + + struct wlr_vk_stage_watermark *mark = wl_array_add( + &buf->watermarks, sizeof(*mark)); + if (mark == NULL) { + wlr_log_errno(WLR_ERROR, "Allocation failed"); + continue; + } + + *mark = (struct wlr_vk_stage_watermark){ + .head = buf->head, + .timeline_point = timeline_point, + }; + } +} + +static void stage_buffer_gc(struct wlr_vk_renderer *renderer, uint64_t current_point) { + struct wlr_vk_stage_buffer *buf, *buf_tmp; + wl_list_for_each_safe(buf, buf_tmp, &renderer->stage.buffers, link) { + if (buf->head != buf->tail) { + buf->empty_gc_cnt = 0; + vulkan_stage_buffer_reclaim(buf, current_point); + continue; + } + if (buf->buf_size <= min_stage_size) { + // We will not deallocate the first buffer + continue; + } + + buf->empty_gc_cnt++; + if (buf->empty_gc_cnt >= 1000) { + // This buffer hasn't been used for a while, so let's deallocate it + stage_buffer_destroy(renderer, buf); + } + } +} + VkCommandBuffer vulkan_record_stage_cb(struct wlr_vk_renderer 
*renderer) { if (renderer->stage.cb == NULL) { renderer->stage.cb = vulkan_acquire_command_buffer(renderer); @@ -463,16 +521,21 @@ bool vulkan_submit_stage_wait(struct wlr_vk_renderer *renderer, int wait_sync_fi submit_info.pWaitDstStageMask = &wait_stage; } + vulkan_stage_mark_submit(renderer, timeline_point); + VkResult res = vkQueueSubmit(renderer->dev->queue, 1, &submit_info, VK_NULL_HANDLE); if (res != VK_SUCCESS) { wlr_vk_error("vkQueueSubmit", res); return false; } - // NOTE: don't release stage allocations here since they may still be - // used for reading. Will be done next frame. + if (!vulkan_wait_command_buffer(cb, renderer)) { + return false; + } - return vulkan_wait_command_buffer(cb, renderer); + // We did a blocking wait so this is now the current point + stage_buffer_gc(renderer, timeline_point); + return true; } struct wlr_vk_format_props *vulkan_format_props_from_drm( @@ -506,7 +569,6 @@ static bool init_command_buffer(struct wlr_vk_command_buffer *cb, .vk = vk_cb, }; wl_list_init(&cb->destroy_textures); - wl_list_init(&cb->stage_buffers); return true; } @@ -532,7 +594,7 @@ bool vulkan_wait_command_buffer(struct wlr_vk_command_buffer *cb, } static void release_command_buffer_resources(struct wlr_vk_command_buffer *cb, - struct wlr_vk_renderer *renderer, int64_t now) { + struct wlr_vk_renderer *renderer) { struct wlr_vk_texture *texture, *texture_tmp; wl_list_for_each_safe(texture, texture_tmp, &cb->destroy_textures, destroy_link) { wl_list_remove(&texture->destroy_link); @@ -540,15 +602,6 @@ static void release_command_buffer_resources(struct wlr_vk_command_buffer *cb, wlr_texture_destroy(&texture->wlr_texture); } - struct wlr_vk_shared_buffer *buf, *buf_tmp; - wl_list_for_each_safe(buf, buf_tmp, &cb->stage_buffers, link) { - buf->allocs.size = 0; - buf->last_used_ms = now; - - wl_list_remove(&buf->link); - wl_list_insert(&renderer->stage.buffers, &buf->link); - } - if (cb->color_transform) { wlr_color_transform_unref(cb->color_transform); 
cb->color_transform = NULL; @@ -567,22 +620,14 @@ static struct wlr_vk_command_buffer *get_command_buffer( return NULL; } - - // Garbage collect any buffers that have remained unused for too long - int64_t now = get_current_time_msec(); - struct wlr_vk_shared_buffer *buf, *buf_tmp; - wl_list_for_each_safe(buf, buf_tmp, &renderer->stage.buffers, link) { - if (buf->allocs.size == 0 && buf->last_used_ms + 10000 < now) { - shared_buffer_destroy(renderer, buf); - } - } + stage_buffer_gc(renderer, current_point); // Destroy textures for completed command buffers for (size_t i = 0; i < VULKAN_COMMAND_BUFFERS_CAP; i++) { struct wlr_vk_command_buffer *cb = &renderer->command_buffers[i]; if (cb->vk != VK_NULL_HANDLE && !cb->recording && cb->timeline_point <= current_point) { - release_command_buffer_resources(cb, renderer, now); + release_command_buffer_resources(cb, renderer); } } @@ -1185,7 +1230,7 @@ static void vulkan_destroy(struct wlr_renderer *wlr_renderer) { if (cb->vk == VK_NULL_HANDLE) { continue; } - release_command_buffer_resources(cb, renderer, 0); + release_command_buffer_resources(cb, renderer); if (cb->binary_semaphore != VK_NULL_HANDLE) { vkDestroySemaphore(renderer->dev->dev, cb->binary_semaphore, NULL); } @@ -1197,9 +1242,9 @@ static void vulkan_destroy(struct wlr_renderer *wlr_renderer) { } // stage.cb automatically freed with command pool - struct wlr_vk_shared_buffer *buf, *tmp_buf; + struct wlr_vk_stage_buffer *buf, *tmp_buf; wl_list_for_each_safe(buf, tmp_buf, &renderer->stage.buffers, link) { - shared_buffer_destroy(renderer, buf); + stage_buffer_destroy(renderer, buf); } struct wlr_vk_texture *tex, *tex_tmp; diff --git a/render/vulkan/texture.c b/render/vulkan/texture.c index ec8aecd1e..deff70f11 100644 --- a/render/vulkan/texture.c +++ b/render/vulkan/texture.c @@ -72,16 +72,16 @@ static bool write_pixels(struct wlr_vk_texture *texture, // get staging buffer struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, bsize, 
format_info->bytes_per_block); - if (!span.buffer || span.alloc.size != bsize) { + if (!span.buffer || span.size != bsize) { wlr_log(WLR_ERROR, "Failed to retrieve staging buffer"); free(copies); return false; } - char *map = (char*)span.buffer->cpu_mapping + span.alloc.start; + char *map = (char*)span.buffer->cpu_mapping + span.offset; // upload data - uint32_t buf_off = span.alloc.start; + uint32_t buf_off = span.offset; for (int i = 0; i < rects_len; i++) { pixman_box32_t rect = rects[i]; uint32_t width = rect.x2 - rect.x1; From b01fdc3164d55f338bd9866cfac8f577cb570dc1 Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Sun, 12 Apr 2026 17:47:32 +0200 Subject: [PATCH 2/6] render/vulkan: Add unit-test for staging buffer Signed-off-by: Kenny Levinsen --- test/meson.build | 22 ++++ test/test_vulkan_stage_buffer.c | 200 ++++++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 test/test_vulkan_stage_buffer.c diff --git a/test/meson.build b/test/meson.build index f51b2c02c..9c622e3ef 100644 --- a/test/meson.build +++ b/test/meson.build @@ -1,8 +1,30 @@ +# Used to test internal symbols +lib_wlr_internal = static_library( + versioned_name + '-internal', + objects: lib_wlr.extract_all_objects(recursive: false), + dependencies: wlr_deps, + include_directories: [wlr_inc], + install: false, +) + test( 'box', executable('test-box', 'test_box.c', dependencies: wlroots), ) +if features.get('vulkan-renderer') + test( + 'vulkan_stage_buffer', + executable( + 'test-vulkan-stage-buffer', + 'test_vulkan_stage_buffer.c', + link_with: lib_wlr_internal, + dependencies: wlr_deps, + include_directories: wlr_inc, + ), + ) +endif + benchmark( 'scene', executable('bench-scene', 'bench_scene.c', dependencies: wlroots), diff --git a/test/test_vulkan_stage_buffer.c b/test/test_vulkan_stage_buffer.c new file mode 100644 index 000000000..74f230a44 --- /dev/null +++ b/test/test_vulkan_stage_buffer.c @@ -0,0 +1,200 @@ +#include +#include +#include + +#include 
"render/vulkan.h" + +#define BUF_SIZE 1024 +#define ALLOC_FAIL ((VkDeviceSize)-1) + +static void stage_buffer_init(struct wlr_vk_stage_buffer *buf) { + *buf = (struct wlr_vk_stage_buffer){ + .buf_size = BUF_SIZE, + }; + wl_array_init(&buf->watermarks); +} + +static void stage_buffer_finish(struct wlr_vk_stage_buffer *buf) { + wl_array_release(&buf->watermarks); +} + +static void push_watermark(struct wlr_vk_stage_buffer *buf, + uint64_t timeline_point) { + struct wlr_vk_stage_watermark *mark = wl_array_add( + &buf->watermarks, sizeof(*mark)); + assert(mark != NULL); + *mark = (struct wlr_vk_stage_watermark){ + .head = buf->head, + .timeline_point = timeline_point, + }; +} + +static size_t watermark_count(const struct wlr_vk_stage_buffer *buf) { + return buf->watermarks.size / sizeof(struct wlr_vk_stage_watermark); +} + +static void test_alloc_simple(void) { + struct wlr_vk_stage_buffer buf; + stage_buffer_init(&buf); + + assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0); + assert(buf.head == 100); + assert(vulkan_stage_buffer_alloc(&buf, 200, 1) == 100); + assert(buf.head == 300); + assert(buf.tail == 0); + + stage_buffer_finish(&buf); +} + +static void test_alloc_alignment(void) { + struct wlr_vk_stage_buffer buf; + stage_buffer_init(&buf); + + assert(vulkan_stage_buffer_alloc(&buf, 7, 1) == 0); + assert(buf.head == 7); + + assert(vulkan_stage_buffer_alloc(&buf, 4, 16) == 16); + assert(buf.head == 20); + + assert(vulkan_stage_buffer_alloc(&buf, 8, 8) == 24); + assert(buf.head == 32); + + stage_buffer_finish(&buf); +} + +static void test_alloc_limit(void) { + struct wlr_vk_stage_buffer buf; + stage_buffer_init(&buf); + + // We do not allow allocations that would cause head to equal tail + assert(vulkan_stage_buffer_alloc(&buf, BUF_SIZE, 1) == ALLOC_FAIL); + assert(buf.head == 0); + + assert(vulkan_stage_buffer_alloc(&buf, BUF_SIZE-1, 1) == 0); + assert(buf.head == BUF_SIZE-1); + + stage_buffer_finish(&buf); +} + +static void test_alloc_wrap(void) { + struct 
wlr_vk_stage_buffer buf; + stage_buffer_init(&buf); + + // Fill the first 924 bytes + assert(vulkan_stage_buffer_alloc(&buf, BUF_SIZE - 100, 1) == 0); + push_watermark(&buf, 1); + + // Fill the end of the buffer + assert(vulkan_stage_buffer_alloc(&buf, 50, 1) == 924); + push_watermark(&buf, 2); + + // First, check that we don't wrap prematurely + assert(vulkan_stage_buffer_alloc(&buf, 50, 1) == ALLOC_FAIL); + assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == ALLOC_FAIL); + + // Free the beginning of the buffer and try to wrap again + vulkan_stage_buffer_reclaim(&buf, 1); + assert(vulkan_stage_buffer_alloc(&buf, 50, 1) == 0); + assert(buf.tail == 924); + assert(buf.head == 50); + + // Check that freeing from the end of the buffer still works + vulkan_stage_buffer_reclaim(&buf, 2); + assert(buf.tail == 974); + assert(buf.head == 50); + + // Check that allocations still work + assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 50); + assert(buf.tail == 974); + assert(buf.head == 150); + + stage_buffer_finish(&buf); +} + +static void test_reclaim_empty(void) { + struct wlr_vk_stage_buffer buf; + stage_buffer_init(&buf); + + // Fresh buffer with no watermarks and head == tail == 0 is drained. + vulkan_stage_buffer_reclaim(&buf, 0); + assert(buf.head == buf.tail); + assert(buf.tail == 0); + + stage_buffer_finish(&buf); +} + +static void test_reclaim_pending_not_completed(void) { + struct wlr_vk_stage_buffer buf; + stage_buffer_init(&buf); + + assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0); + push_watermark(&buf, 1); + + // current point hasn't reached the watermark yet. 
+ vulkan_stage_buffer_reclaim(&buf, 0); + assert(buf.head != buf.tail); + assert(buf.tail == 0); + assert(watermark_count(&buf) == 1); + + stage_buffer_finish(&buf); +} + +static void test_reclaim_partial(void) { + struct wlr_vk_stage_buffer buf; + stage_buffer_init(&buf); + + assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0); + push_watermark(&buf, 1); + assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 100); + push_watermark(&buf, 2); + + // Only the first watermark is reached. + vulkan_stage_buffer_reclaim(&buf, 1); + assert(buf.head != buf.tail); + assert(buf.tail == 100); + assert(watermark_count(&buf) == 1); + + const struct wlr_vk_stage_watermark *remaining = buf.watermarks.data; + assert(remaining[0].head == 200); + assert(remaining[0].timeline_point == 2); + + stage_buffer_finish(&buf); +} + +static void test_reclaim_all(void) { + struct wlr_vk_stage_buffer buf; + stage_buffer_init(&buf); + + assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0); + push_watermark(&buf, 1); + assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 100); + push_watermark(&buf, 2); + assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 200); + push_watermark(&buf, 3); + + vulkan_stage_buffer_reclaim(&buf, 100); + assert(buf.head == buf.tail); + assert(buf.tail == 300); + assert(watermark_count(&buf) == 0); + + stage_buffer_finish(&buf); +} + +int main(void) { +#ifdef NDEBUG + fprintf(stderr, "NDEBUG must be disabled for tests\n"); + return 1; +#endif + + test_alloc_simple(); + test_alloc_alignment(); + test_alloc_limit(); + test_alloc_wrap(); + + test_reclaim_empty(); + test_reclaim_pending_not_completed(); + test_reclaim_partial(); + test_reclaim_all(); + + return 0; +} From 439258a43b4a6e4434b8014fc871f1b12020e95c Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Thu, 23 Apr 2026 15:37:17 +0200 Subject: [PATCH 3/6] render/vulkan: Intersect clip region once We are spending quite significant CPU time walking through the clip rects, taking a pixman box, converting it to a wlr box, 
intersecting it and ultimately converting it back to a pixman box before adding it to the rect union. Just intersect the clip region once ahead of time, and use pixman boxes the entire way. This also makes it easy to bail early if nothing intersects. Gives a small 7.95% reduction in CPU time for the Vulkan renderer in the grid/clip200/1024 benchmark. Signed-off-by: Kenny Levinsen --- render/vulkan/pass.c | 94 +++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 54 deletions(-) diff --git a/render/vulkan/pass.c b/render/vulkan/pass.c index 3c909d6fd..bbf672933 100644 --- a/render/vulkan/pass.c +++ b/render/vulkan/pass.c @@ -38,17 +38,6 @@ static void bind_pipeline(struct wlr_vk_render_pass *pass, VkPipeline pipeline) pass->bound_pipeline = pipeline; } -static void get_clip_region(struct wlr_vk_render_pass *pass, - const pixman_region32_t *in, pixman_region32_t *out) { - if (in != NULL) { - pixman_region32_init(out); - pixman_region32_copy(out, in); - } else { - struct wlr_buffer *buffer = pass->render_buffer->wlr_buffer; - pixman_region32_init_rect(out, 0, 0, buffer->width, buffer->height); - } -} - static void convert_pixman_box_to_vk_rect(const pixman_box32_t *box, VkRect2D *rect) { *rect = (VkRect2D){ .offset = { .x = box->x1, .y = box->y1 }, @@ -620,18 +609,11 @@ error: } static void render_pass_mark_box_updated(struct wlr_vk_render_pass *pass, - const struct wlr_box *box) { + const pixman_box32_t *box) { if (!pass->two_pass) { return; } - - pixman_box32_t pixman_box = { - .x1 = box->x, - .x2 = box->x + box->width, - .y1 = box->y, - .y2 = box->y + box->height, - }; - rect_union_add(&pass->updated_region, pixman_box); + rect_union_add(&pass->updated_region, *box); } static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, @@ -651,28 +633,30 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, options->color.a, // no conversion for alpha }; + struct wlr_box box; + wlr_render_rect_options_get_box(options, 
pass->render_buffer->wlr_buffer, &box); + pixman_region32_t clip; - get_clip_region(pass, options->clip, &clip); + if (options->clip) { + pixman_region32_init(&clip); + pixman_region32_intersect_rect(&clip, options->clip, + box.x, box.y, box.width, box.height); + } else { + pixman_region32_init_rect(&clip, + box.x, box.y, box.width, box.height); + } int clip_rects_len; const pixman_box32_t *clip_rects = pixman_region32_rectangles(&clip, &clip_rects_len); - // Record regions possibly updated for use in second subpass - for (int i = 0; i < clip_rects_len; i++) { - struct wlr_box clip_box = { - .x = clip_rects[i].x1, - .y = clip_rects[i].y1, - .width = clip_rects[i].x2 - clip_rects[i].x1, - .height = clip_rects[i].y2 - clip_rects[i].y1, - }; - struct wlr_box intersection; - if (!wlr_box_intersection(&intersection, &options->box, &clip_box)) { - continue; - } - render_pass_mark_box_updated(pass, &intersection); + if (clip_rects_len == 0) { + pixman_region32_fini(&clip); + return; } - struct wlr_box box; - wlr_render_rect_options_get_box(options, pass->render_buffer->wlr_buffer, &box); + // Record regions possibly updated for use in second subpass + for (int i = 0; i < clip_rects_len; i++) { + render_pass_mark_box_updated(pass, &clip_rects[i]); + } switch (options->blend_mode) { case WLR_RENDER_BLEND_MODE_PREMULTIPLIED:; @@ -769,6 +753,22 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass, wlr_matrix_project_box(matrix, &dst_box, options->transform, proj); wlr_matrix_multiply(matrix, pass->projection, matrix); + pixman_region32_t clip; + if (options->clip) { + pixman_region32_init(&clip); + pixman_region32_intersect_rect(&clip, options->clip, + dst_box.x, dst_box.y, dst_box.width, dst_box.height); + } else { + pixman_region32_init_rect(&clip, + dst_box.x, dst_box.y, dst_box.width, dst_box.height); + } + int clip_rects_len; + const pixman_box32_t *clip_rects = pixman_region32_rectangles(&clip, &clip_rects_len); + if (clip_rects_len == 0) { + 
pixman_region32_fini(&clip); + return; + } + struct wlr_vk_vert_pcr_data vert_pcr_data = { .uv_off = { src_box.x / options->texture->width, @@ -839,6 +839,7 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass, WLR_RENDER_BLEND_MODE_NONE : options->blend_mode, }); if (!pipe) { + pixman_region32_fini(&clip); pass->failed = true; return; } @@ -846,6 +847,7 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass, struct wlr_vk_texture_view *view = vulkan_texture_get_or_create_view(texture, pipe->layout, srgb_image_view); if (!view) { + pixman_region32_fini(&clip); pass->failed = true; return; } @@ -883,33 +885,17 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass, VK_SHADER_STAGE_FRAGMENT_BIT, sizeof(vert_pcr_data), sizeof(frag_pcr_data), &frag_pcr_data); - pixman_region32_t clip; - get_clip_region(pass, options->clip, &clip); - - int clip_rects_len; - const pixman_box32_t *clip_rects = pixman_region32_rectangles(&clip, &clip_rects_len); for (int i = 0; i < clip_rects_len; i++) { VkRect2D rect; convert_pixman_box_to_vk_rect(&clip_rects[i], &rect); vkCmdSetScissor(cb, 0, 1, &rect); vkCmdDraw(cb, 4, 1, 0, 0); - - struct wlr_box clip_box = { - .x = clip_rects[i].x1, - .y = clip_rects[i].y1, - .width = clip_rects[i].x2 - clip_rects[i].x1, - .height = clip_rects[i].y2 - clip_rects[i].y1, - }; - struct wlr_box intersection; - if (!wlr_box_intersection(&intersection, &dst_box, &clip_box)) { - continue; - } - render_pass_mark_box_updated(pass, &intersection); + render_pass_mark_box_updated(pass, &clip_rects[i]); } + pixman_region32_fini(&clip); texture->last_used_cb = pass->command_buffer; - pixman_region32_fini(&clip); if (texture->dmabuf_imported || (options != NULL && options->wait_timeline != NULL)) { struct wlr_vk_render_pass_texture *pass_texture = From 8abe53d1d2ad7ab271e1be514e5d572b751b180e Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Sun, 12 Apr 2026 17:47:40 +0200 Subject: [PATCH 4/6] render/vulkan: Use 
instanced draws instead of scissors Similar to what we have already done for gles2. To simplify things we use the staging ring buffer for the vertex buffers by extending the usage bits, rather than introducing a separate pool. Signed-off-by: Kenny Levinsen --- render/vulkan/pass.c | 115 ++++++++++++++++++++++-------- render/vulkan/renderer.c | 34 ++++++--- render/vulkan/shaders/common.vert | 3 + 3 files changed, 113 insertions(+), 39 deletions(-) diff --git a/render/vulkan/pass.c b/render/vulkan/pass.c index bbf672933..fe1b37ff0 100644 --- a/render/vulkan/pass.c +++ b/render/vulkan/pass.c @@ -2,7 +2,9 @@ #include #include #include +#include #include +#include #include #include @@ -191,11 +193,13 @@ static bool render_pass_submit(struct wlr_render_pass *wlr_pass) { int width = pass->render_buffer->wlr_buffer->width; int height = pass->render_buffer->wlr_buffer->height; - float final_matrix[9] = { - width, 0, -1, - 0, height, -1, - 0, 0, 0, - }; + struct wlr_box output_box = { 0, 0, width, height }; + float proj[9], final_matrix[9]; + wlr_matrix_identity(proj); + wlr_matrix_project_box(final_matrix, &output_box, + WL_OUTPUT_TRANSFORM_NORMAL, proj); + wlr_matrix_multiply(final_matrix, pass->projection, final_matrix); + struct wlr_vk_vert_pcr_data vert_pcr_data = { .uv_off = { 0, 0 }, .uv_size = { 1, 1 }, @@ -274,11 +278,28 @@ static bool render_pass_submit(struct wlr_render_pass *wlr_pass) { int clip_rects_len; const pixman_box32_t *clip_rects = pixman_region32_rectangles( clip, &clip_rects_len); - for (int i = 0; i < clip_rects_len; i++) { - VkRect2D rect; - convert_pixman_box_to_vk_rect(&clip_rects[i], &rect); - vkCmdSetScissor(render_cb->vk, 0, 1, &rect); - vkCmdDraw(render_cb->vk, 4, 1, 0, 0); + + if (clip_rects_len > 0) { + const VkDeviceSize instance_size = 4 * sizeof(float); + struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, + clip_rects_len * instance_size, 16); + if (!span.buffer) { + pass->failed = true; + goto error; + } + + float 
*instance_data = (float *)((char *)span.buffer->cpu_mapping + span.offset); + for (int i = 0; i < clip_rects_len; i++) { + const pixman_box32_t *b = &clip_rects[i]; + instance_data[i * 4 + 0] = (float)b->x1 / width; + instance_data[i * 4 + 1] = (float)b->y1 / height; + instance_data[i * 4 + 2] = (float)(b->x2 - b->x1) / width; + instance_data[i * 4 + 3] = (float)(b->y2 - b->y1) / height; + } + + VkDeviceSize vb_offset = span.offset; + vkCmdBindVertexBuffers(render_cb->vk, 0, 1, &span.buffer->buffer, &vb_offset); + vkCmdDraw(render_cb->vk, 4, clip_rects_len, 0, 0); } } @@ -653,11 +674,6 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, return; } - // Record regions possibly updated for use in second subpass - for (int i = 0; i < clip_rects_len; i++) { - render_pass_mark_box_updated(pass, &clip_rects[i]); - } - switch (options->blend_mode) { case WLR_RENDER_BLEND_MODE_PREMULTIPLIED:; float proj[9], matrix[9]; @@ -676,6 +692,23 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, break; } + const VkDeviceSize instance_size = 4 * sizeof(float); + struct wlr_vk_buffer_span span = vulkan_get_stage_span(pass->renderer, + clip_rects_len * instance_size, 16); + if (!span.buffer) { + pass->failed = true; + break; + } + float *instance_data = (float *)((char *)span.buffer->cpu_mapping + span.offset); + for (int i = 0; i < clip_rects_len; i++) { + const pixman_box32_t *rect = &clip_rects[i]; + render_pass_mark_box_updated(pass, rect); + instance_data[i * 4 + 0] = (float)(rect->x1 - box.x) / box.width; + instance_data[i * 4 + 1] = (float)(rect->y1 - box.y) / box.height; + instance_data[i * 4 + 2] = (float)(rect->x2 - rect->x1) / box.width; + instance_data[i * 4 + 3] = (float)(rect->y2 - rect->y1) / box.height; + } + struct wlr_vk_vert_pcr_data vert_pcr_data = { .uv_off = { 0, 0 }, .uv_size = { 1, 1 }, @@ -689,12 +722,9 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, VK_SHADER_STAGE_FRAGMENT_BIT, sizeof(vert_pcr_data), 
sizeof(float) * 4, linear_color); - for (int i = 0; i < clip_rects_len; i++) { - VkRect2D rect; - convert_pixman_box_to_vk_rect(&clip_rects[i], &rect); - vkCmdSetScissor(cb, 0, 1, &rect); - vkCmdDraw(cb, 4, 1, 0, 0); - } + VkDeviceSize vb_offset = span.offset; + vkCmdBindVertexBuffers(cb, 0, 1, &span.buffer->buffer, &vb_offset); + vkCmdDraw(cb, 4, clip_rects_len, 0, 0); break; case WLR_RENDER_BLEND_MODE_NONE:; VkClearAttachment clear_att = { @@ -711,7 +741,9 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, .layerCount = 1, }; for (int i = 0; i < clip_rects_len; i++) { - convert_pixman_box_to_vk_rect(&clip_rects[i], &clear_rect.rect); + const pixman_box32_t *rect = &clip_rects[i]; + render_pass_mark_box_updated(pass, rect); + convert_pixman_box_to_vk_rect(rect, &clear_rect.rect); vkCmdClearAttachments(cb, 1, &clear_att, 1, &clear_rect); } break; @@ -769,6 +801,15 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass, return; } + const VkDeviceSize instance_size = 4 * sizeof(float); + struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, + clip_rects_len * instance_size, 16); + if (!span.buffer) { + pixman_region32_fini(&clip); + pass->failed = true; + return; + } + struct wlr_vk_vert_pcr_data vert_pcr_data = { .uv_off = { src_box.x / options->texture->width, @@ -885,17 +926,34 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass, VK_SHADER_STAGE_FRAGMENT_BIT, sizeof(vert_pcr_data), sizeof(frag_pcr_data), &frag_pcr_data); + float *instance_data = (float *)((char *)span.buffer->cpu_mapping + span.offset); for (int i = 0; i < clip_rects_len; i++) { - VkRect2D rect; - convert_pixman_box_to_vk_rect(&clip_rects[i], &rect); - vkCmdSetScissor(cb, 0, 1, &rect); - vkCmdDraw(cb, 4, 1, 0, 0); - render_pass_mark_box_updated(pass, &clip_rects[i]); + const pixman_box32_t *rect = &clip_rects[i]; + render_pass_mark_box_updated(pass, rect); + + struct wlr_fbox norm = { + .x = (double)(rect->x1 - dst_box.x) / 
dst_box.width, + .y = (double)(rect->y1 - dst_box.y) / dst_box.height, + .width = (double)(rect->x2 - rect->x1) / dst_box.width, + .height = (double)(rect->y2 - rect->y1) / dst_box.height, + }; + + if (options->transform != WL_OUTPUT_TRANSFORM_NORMAL) { + wlr_fbox_transform(&norm, &norm, options->transform, 1.0, 1.0); + } + + instance_data[i * 4 + 0] = (float)norm.x; + instance_data[i * 4 + 1] = (float)norm.y; + instance_data[i * 4 + 2] = (float)norm.width; + instance_data[i * 4 + 3] = (float)norm.height; } pixman_region32_fini(&clip); - texture->last_used_cb = pass->command_buffer; + VkDeviceSize vb_offset = span.offset; + vkCmdBindVertexBuffers(cb, 0, 1, &span.buffer->buffer, &vb_offset); + vkCmdDraw(cb, 4, clip_rects_len, 0, 0); + texture->last_used_cb = pass->command_buffer; if (texture->dmabuf_imported || (options != NULL && options->wait_timeline != NULL)) { struct wlr_vk_render_pass_texture *pass_texture = @@ -1290,6 +1348,7 @@ struct wlr_vk_render_pass *vulkan_begin_render_pass(struct wlr_vk_renderer *rend .height = height, .maxDepth = 1, }); + vkCmdSetScissor(cb->vk, 0, 1, &rect); // matrix_projection() assumes a GL coordinate system so we need // to pass WL_OUTPUT_TRANSFORM_FLIPPED_180 to adjust it for vulkan. 
diff --git a/render/vulkan/renderer.c b/render/vulkan/renderer.c index 2c42b4096..d342dee2c 100644 --- a/render/vulkan/renderer.c +++ b/render/vulkan/renderer.c @@ -222,7 +222,8 @@ static struct wlr_vk_stage_buffer *stage_buffer_create( .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .size = bsize, .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, }; res = vkCreateBuffer(r->dev->dev, &buf_info, NULL, &buf->buffer); @@ -1881,6 +1882,25 @@ static bool pipeline_key_equals(const struct wlr_vk_pipeline_key *a, return true; } +static const VkVertexInputBindingDescription instance_vert_binding = { + .binding = 0, + .stride = sizeof(float) * 4, + .inputRate = VK_VERTEX_INPUT_RATE_INSTANCE, +}; +static const VkVertexInputAttributeDescription instance_vert_attr = { + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32B32A32_SFLOAT, + .offset = 0, +}; +static const VkPipelineVertexInputStateCreateInfo instance_vert_input = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &instance_vert_binding, + .vertexAttributeDescriptionCount = 1, + .pVertexAttributeDescriptions = &instance_vert_attr, +}; + // Initializes the pipeline for rendering textures and using the given // VkRenderPass and VkPipelineLayout. 
struct wlr_vk_pipeline *setup_get_or_create_pipeline( @@ -2012,10 +2032,6 @@ struct wlr_vk_pipeline *setup_get_or_create_pipeline( .dynamicStateCount = sizeof(dyn_states) / sizeof(dyn_states[0]), }; - VkPipelineVertexInputStateCreateInfo vertex = { - .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, - }; - VkGraphicsPipelineCreateInfo pinfo = { .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .layout = pipeline_layout->vk, @@ -2030,7 +2046,7 @@ struct wlr_vk_pipeline *setup_get_or_create_pipeline( .pMultisampleState = &multisample, .pViewportState = &viewport, .pDynamicState = &dynamic, - .pVertexInputState = &vertex, + .pVertexInputState = &instance_vert_input, }; VkPipelineCache cache = VK_NULL_HANDLE; @@ -2129,10 +2145,6 @@ static bool init_blend_to_output_pipeline(struct wlr_vk_renderer *renderer, .dynamicStateCount = sizeof(dyn_states) / sizeof(dyn_states[0]), }; - VkPipelineVertexInputStateCreateInfo vertex = { - .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, - }; - VkGraphicsPipelineCreateInfo pinfo = { .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .pNext = NULL, @@ -2147,7 +2159,7 @@ static bool init_blend_to_output_pipeline(struct wlr_vk_renderer *renderer, .pMultisampleState = &multisample, .pViewportState = &viewport, .pDynamicState = &dynamic, - .pVertexInputState = &vertex, + .pVertexInputState = &instance_vert_input, }; VkPipelineCache cache = VK_NULL_HANDLE; diff --git a/render/vulkan/shaders/common.vert b/render/vulkan/shaders/common.vert index f1579790d..82ea9658c 100644 --- a/render/vulkan/shaders/common.vert +++ b/render/vulkan/shaders/common.vert @@ -8,11 +8,14 @@ layout(push_constant, row_major) uniform UBO { vec2 uv_size; } data; +layout(location = 0) in vec4 inst_rect; + layout(location = 0) out vec2 uv; void main() { vec2 pos = vec2(float((gl_VertexIndex + 1) & 2) * 0.5f, float(gl_VertexIndex & 2) * 0.5f); + pos = inst_rect.xy + pos * inst_rect.zw; uv = data.uv_offset + pos * 
data.uv_size; gl_Position = data.proj * vec4(pos, 0.0, 1.0); } From 17c29268c94e7d222f87d58d9597c8d83883339e Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Thu, 23 Apr 2026 16:47:40 +0200 Subject: [PATCH 5/6] util/rect_union: Take pixman_box32_t by pointer rect_union_add takes a pixman_box32_t by value, and passes it along by value to internal helpers. render_pass_mark_box_updated which is the only caller receives the pixman_box32_t by reference, so just plumb it through that way. Results in a 13% improvement in CPU time when using the Vulkan renderer on the stacked/clip200/1024 benchmarks on my machine. Signed-off-by: Kenny Levinsen --- include/util/rect_union.h | 2 +- render/vulkan/pass.c | 2 +- util/rect_union.c | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/util/rect_union.h b/include/util/rect_union.h index 2d74f94d5..c4b378888 100644 --- a/include/util/rect_union.h +++ b/include/util/rect_union.h @@ -58,7 +58,7 @@ void rect_union_finish(struct rect_union *r); * * Amortized time: O(1) */ -void rect_union_add(struct rect_union *r, pixman_box32_t box); +void rect_union_add(struct rect_union *r, const pixman_box32_t *box); /** * Compute an exact cover of the rectangles added so far, and return diff --git a/render/vulkan/pass.c b/render/vulkan/pass.c index fe1b37ff0..b250543ce 100644 --- a/render/vulkan/pass.c +++ b/render/vulkan/pass.c @@ -634,7 +634,7 @@ static void render_pass_mark_box_updated(struct wlr_vk_render_pass *pass, if (!pass->two_pass) { return; } - rect_union_add(&pass->updated_region, *box); + rect_union_add(&pass->updated_region, box); } static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, diff --git a/util/rect_union.c b/util/rect_union.c index 8cd26d761..8d29798de 100644 --- a/util/rect_union.c +++ b/util/rect_union.c @@ -1,15 +1,15 @@ #include #include "util/rect_union.h" -static void box_union(pixman_box32_t *dst, pixman_box32_t box) { - dst->x1 = dst->x1 < box.x1 ? 
dst->x1 : box.x1; - dst->y1 = dst->y1 < box.y1 ? dst->y1 : box.y1; - dst->x2 = dst->x2 > box.x2 ? dst->x2 : box.x2; - dst->y2 = dst->y2 > box.y2 ? dst->y2 : box.y2; +static void box_union(pixman_box32_t *dst, const pixman_box32_t *box) { + dst->x1 = dst->x1 < box->x1 ? dst->x1 : box->x1; + dst->y1 = dst->y1 < box->y1 ? dst->y1 : box->y1; + dst->x2 = dst->x2 > box->x2 ? dst->x2 : box->x2; + dst->y2 = dst->y2 > box->y2 ? dst->y2 : box->y2; } -static bool box_empty_or_invalid(pixman_box32_t box) { - return box.x1 >= box.x2 || box.y1 >= box.y2; +static bool box_empty_or_invalid(const pixman_box32_t *box) { + return box->x1 >= box->x2 || box->y1 >= box->y2; } void rect_union_init(struct rect_union *ru) { @@ -37,7 +37,7 @@ static void handle_alloc_failure(struct rect_union *ru) { wl_array_init(&ru->unsorted); } -void rect_union_add(struct rect_union *ru, pixman_box32_t box) { +void rect_union_add(struct rect_union *ru, const pixman_box32_t *box) { if (box_empty_or_invalid(box)) { return; } @@ -47,7 +47,7 @@ void rect_union_add(struct rect_union *ru, pixman_box32_t box) { if (!ru->alloc_failure) { pixman_box32_t *entry = wl_array_add(&ru->unsorted, sizeof(*entry)); if (entry) { - *entry = box; + *entry = *box; } else { handle_alloc_failure(ru); } @@ -81,7 +81,7 @@ const pixman_region32_t *rect_union_evaluate(struct rect_union *ru) { return &ru->region; bounding_box: pixman_region32_fini(&ru->region); - if (box_empty_or_invalid(ru->bounding_box)) { + if (box_empty_or_invalid(&ru->bounding_box)) { pixman_region32_init(&ru->region); } else { pixman_region32_init_with_extents(&ru->region, &ru->bounding_box); From 57441ded02b8895ed1bb2f66c5cff6a6478307f0 Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Thu, 23 Apr 2026 17:15:28 +0200 Subject: [PATCH 6/6] util/rect_union: Limit rect_union_add to 1024 rects If a very large number of clip rects are accumulated in rect_union_add, rect_union_evaluate can end up being disproportionately expensive, and as an extreme numbers of 
clip rects are not beneficial for drawing, this is without any benefit. Limit the number of rects to 1024 in rect_union_add, switching over to bounding box mode instead when the limit is exceeded. This leads to a small 70% reduction in CPU time in the Vulkan renderer on the stacked/clip200/1024 benchmarks. Signed-off-by: Kenny Levinsen --- include/util/rect_union.h | 6 ++++-- util/rect_union.c | 22 +++++++++++++++------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/util/rect_union.h b/include/util/rect_union.h index c4b378888..297c64d43 100644 --- a/include/util/rect_union.h +++ b/include/util/rect_union.h @@ -63,8 +63,10 @@ void rect_union_add(struct rect_union *r, const pixman_box32_t *box); /** * Compute an exact cover of the rectangles added so far, and return * a pointer to a pixman_region32_t giving that cover. The pointer will - * remain valid until the next time *r is modified. If there was an allocation - * failure, this function may return a single-rectangle bounding box instead. + * remain valid until the next time *r is modified. + * + * An internal complexity limit is enforced by rect_union. If exceeded, this + * function will instead return a single-rectangle bounding box. * * This may be called multiple times and interleaved with rect_union_add(). 
* diff --git a/util/rect_union.c b/util/rect_union.c index 8d29798de..f113fe43e 100644 --- a/util/rect_union.c +++ b/util/rect_union.c @@ -44,13 +44,21 @@ void rect_union_add(struct rect_union *ru, const pixman_box32_t *box) { box_union(&ru->bounding_box, box); - if (!ru->alloc_failure) { - pixman_box32_t *entry = wl_array_add(&ru->unsorted, sizeof(*entry)); - if (entry) { - *entry = *box; - } else { - handle_alloc_failure(ru); - } + if (ru->alloc_failure) { + return; + } + + int nrects = (int)(ru->unsorted.size / sizeof(pixman_box32_t)); + if (nrects >= 1024) { + handle_alloc_failure(ru); + return; + } + + pixman_box32_t *entry = wl_array_add(&ru->unsorted, sizeof(*entry)); + if (entry) { + *entry = *box; + } else { + handle_alloc_failure(ru); } }