diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.c b/src/nouveau/vulkan/nvk_cmd_buffer.c
index 43300afe9cc..29e0d507dfb 100644
--- a/src/nouveau/vulkan/nvk_cmd_buffer.c
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.c
@@ -381,6 +381,7 @@ nvk_CmdBindPipeline(VkCommandBuffer commandBuffer,
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
    VK_FROM_HANDLE(nvk_pipeline, pipeline, _pipeline);
+   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
 
    for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
       if (!pipeline->shaders[s].bo)
@@ -388,6 +389,9 @@ nvk_CmdBindPipeline(VkCommandBuffer commandBuffer,
 
       nouveau_ws_push_ref(cmd->push, pipeline->shaders[s].bo,
                           NOUVEAU_WS_BO_RD);
+
+      if (pipeline->shaders[s].slm_size)
+         nvk_device_ensure_slm(dev, pipeline->shaders[s].slm_size);
    }
 
    switch (pipelineBindPoint) {
diff --git a/src/nouveau/vulkan/nvk_cmd_dispatch.c b/src/nouveau/vulkan/nvk_cmd_dispatch.c
index 89bf9428cfe..698dc5cbb41 100644
--- a/src/nouveau/vulkan/nvk_cmd_dispatch.c
+++ b/src/nouveau/vulkan/nvk_cmd_dispatch.c
@@ -24,33 +24,10 @@
 #define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
 #define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)
 
-static uint64_t
-calc_tls_size(struct nvk_device *device,
-              uint32_t lpos, uint32_t lneg, uint32_t cstack)
-{
-   uint64_t size = (lpos + lneg) * 32 + cstack;
-
-   assert (size < (1 << 20));
-
-   size *= 64; /* max warps */
-   size = align(size, 0x8000);
-   size *= device->pdev->dev->mp_count;
-
-   size = align(size, 1 << 17);
-   return size;
-}
-
 void
 nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd,
                              const VkCommandBufferBeginInfo *pBeginInfo)
-{
-   struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
-
-   if (dev->ctx->compute.cls < 0xa0c0)
-      return;
-
-   cmd->tls_space_needed = calc_tls_size(dev, 128 * 16, 0, 0x200);
-}
+{ }
 
 static void
 gv100_compute_setup_launch_desc(uint32_t *qmd,
diff --git a/src/nouveau/vulkan/nvk_cmd_draw.c b/src/nouveau/vulkan/nvk_cmd_draw.c
index 2301c0ada07..ca942d24766 100644
--- a/src/nouveau/vulkan/nvk_cmd_draw.c
+++ b/src/nouveau/vulkan/nvk_cmd_draw.c
@@ -171,7 +171,6 @@ void
 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
                               const VkCommandBufferBeginInfo *pBeginInfo)
 {
-   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
    struct nouveau_ws_push *p = cmd->push;
 
    P_MTHD(p, NV9097, SET_OBJECT);
@@ -215,25 +214,6 @@ nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
    /* TODO: Vertex runout */
 
    /* TODO: temp */
-   P_IMMD(p, NV9097, SET_SHADER_LOCAL_MEMORY_WINDOW, 0xff000000); /* TODO */
-
-   nvk_push_descriptor_table_ref(p, &dev->samplers);
-   uint64_t tsp_addr = nvk_descriptor_table_base_address(&dev->samplers);
-   P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
-   P_NV9097_SET_TEX_SAMPLER_POOL_A(p, tsp_addr >> 32);
-   P_NV9097_SET_TEX_SAMPLER_POOL_B(p, tsp_addr);
-   P_NV9097_SET_TEX_SAMPLER_POOL_C(p, dev->samplers.alloc - 1);
-
-   nvk_push_descriptor_table_ref(p, &dev->images);
-   uint64_t thp_addr = nvk_descriptor_table_base_address(&dev->images);
-   P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
-   P_NV9097_SET_TEX_HEADER_POOL_A(p, thp_addr >> 32);
-   P_NV9097_SET_TEX_HEADER_POOL_B(p, thp_addr & 0xffffffff);
-   P_NV9097_SET_TEX_HEADER_POOL_C(p, dev->images.alloc - 1);
-
-   /* TODO: TIC */
-   /* TODO: TSC */
-
    P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
       .mode = MODE_UPPER_LEFT,
       .flip_y = FLIP_Y_FALSE,
diff --git a/src/nouveau/vulkan/nvk_compute_pipeline.c b/src/nouveau/vulkan/nvk_compute_pipeline.c
index 69afd42d715..8bb2defbef9 100644
--- a/src/nouveau/vulkan/nvk_compute_pipeline.c
+++ b/src/nouveau/vulkan/nvk_compute_pipeline.c
@@ -40,8 +40,7 @@ gv100_compute_setup_launch_desc_template(uint32_t *qmd,
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE,
                             align(shader->cp.smem_size, 0x100));
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
-                            (shader->hdr[1] & 0xfffff0) +
-                            align(shader->cp.lmem_size, 0x10));
+                            align(shader->slm_size, 0x10));
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
    NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                             gv100_sm_config_smem_size(8 * 1024));
diff --git a/src/nouveau/vulkan/nvk_descriptor_table.h b/src/nouveau/vulkan/nvk_descriptor_table.h
index 970caefd2f5..f3197f14847 100644
--- a/src/nouveau/vulkan/nvk_descriptor_table.h
+++ b/src/nouveau/vulkan/nvk_descriptor_table.h
@@ -41,18 +41,18 @@ void nvk_descriptor_table_free(struct nvk_device *device,
                                struct nvk_descriptor_table *table,
                                uint32_t index);
 
-static inline void
-nvk_push_descriptor_table_ref(struct nouveau_ws_push *push,
-                              const struct nvk_descriptor_table *table)
+static inline struct nouveau_ws_bo *
+nvk_descriptor_table_get_bo_ref(struct nvk_descriptor_table *table,
+                                uint32_t *alloc_count_out)
 {
-   if (table->bo)
-      nouveau_ws_push_ref(push, table->bo, NOUVEAU_WS_BO_RD);
-}
+   simple_mtx_lock(&table->mutex);
+   struct nouveau_ws_bo *bo = table->bo;
+   if (bo)
+      nouveau_ws_bo_ref(bo);
+   *alloc_count_out = table->alloc;
+   simple_mtx_unlock(&table->mutex);
 
-static inline uint64_t
-nvk_descriptor_table_base_address(const struct nvk_descriptor_table *table)
-{
-   return table->bo->offset;
+   return bo;
 }
 
 #endif
diff --git a/src/nouveau/vulkan/nvk_device.c b/src/nouveau/vulkan/nvk_device.c
index 8a03f31c9ea..b78f9cd0b16 100644
--- a/src/nouveau/vulkan/nvk_device.c
+++ b/src/nouveau/vulkan/nvk_device.c
@@ -11,109 +11,309 @@
 
 #include "vulkan/wsi/wsi_common.h"
 
+#include "nvk_cl9097.h"
 #include "nvk_cl90b5.h"
 #include "nvk_cla0c0.h"
 #include "cla1c0.h"
 #include "nvk_clc3c0.h"
 
-static VkResult
-nvk_update_preamble_push(struct nvk_queue_state *qs, struct nvk_device *dev,
-                         const struct nvk_queue_alloc_info *needs)
+static void
+nvk_slm_area_init(struct nvk_slm_area *area)
 {
-   struct nouveau_ws_bo *tls_bo = qs->tls_bo;
-   VkResult result;
-   if (needs->tls_size > qs->alloc_info.tls_size) {
-      tls_bo = nouveau_ws_bo_new(dev->pdev->dev,
-                                 needs->tls_size, (1 << 17), NOUVEAU_WS_BO_LOCAL);
-      if (!tls_bo) {
-         result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
-         goto fail;
-      }
-   }
+   memset(area, 0, sizeof(*area));
+   simple_mtx_init(&area->mutex, mtx_plain);
+}
 
-   if (tls_bo != qs->tls_bo) {
-      if (qs->tls_bo)
-         nouveau_ws_bo_destroy(qs->tls_bo);
-      qs->tls_bo = tls_bo;
-   }
+static void
+nvk_slm_area_finish(struct nvk_slm_area *area)
+{
+   simple_mtx_destroy(&area->mutex);
+   if (area->bo)
+      nouveau_ws_bo_destroy(area->bo);
+}
 
-   struct nouveau_ws_push *push = nouveau_ws_push_new(dev->pdev->dev, 256);
+static struct nouveau_ws_bo *
+nvk_slm_area_get_bo_ref(struct nvk_slm_area *area,
+                        uint32_t *bytes_per_warp_out,
+                        uint32_t *bytes_per_mp_out)
+{
+   simple_mtx_lock(&area->mutex);
+   struct nouveau_ws_bo *bo = area->bo;
+   if (bo)
+      nouveau_ws_bo_ref(bo);
+   *bytes_per_warp_out = area->bytes_per_warp;
+   *bytes_per_mp_out = area->bytes_per_mp;
+   simple_mtx_unlock(&area->mutex);
 
-   nouveau_ws_push_ref(push, qs->tls_bo, NOUVEAU_WS_BO_RDWR);
-   P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(push, qs->tls_bo->offset >> 32);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(push, qs->tls_bo->offset & 0xffffffff);
-
-   nvk_push_descriptor_table_ref(push, &dev->samplers);
-   uint64_t tsp_addr = nvk_descriptor_table_base_address(&dev->samplers);
-   P_MTHD(push, NVA0C0, SET_TEX_SAMPLER_POOL_A);
-   P_NVA0C0_SET_TEX_SAMPLER_POOL_A(push, tsp_addr >> 32);
-   P_NVA0C0_SET_TEX_SAMPLER_POOL_B(push, tsp_addr & 0xffffffff);
-   P_NVA0C0_SET_TEX_SAMPLER_POOL_C(push, dev->samplers.alloc - 1);
-
-   nvk_push_descriptor_table_ref(push, &dev->images);
-   uint64_t thp_addr = nvk_descriptor_table_base_address(&dev->images);
-   P_MTHD(push, NVA0C0, SET_TEX_HEADER_POOL_A);
-   P_NVA0C0_SET_TEX_HEADER_POOL_A(push, thp_addr >> 32);
-   P_NVA0C0_SET_TEX_HEADER_POOL_B(push, thp_addr & 0xffffffff);
-   P_NVA0C0_SET_TEX_HEADER_POOL_C(push, dev->images.alloc - 1);
-
-   uint64_t temp_size = qs->tls_bo->size / dev->pdev->dev->mp_count;
-   P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(push, temp_size >> 32);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(push, temp_size & ~0x7fff);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(push, 0xff);
-
-   if (dev->ctx->compute.cls < 0xc3c0) {
-      P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(push, temp_size >> 32);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(push, temp_size & ~0x7fff);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(push, 0xff);
-
-      P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(push, 0xff << 24);
-
-      P_MTHD(push, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
-      P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(push, 0xfe << 24);
-
-      // TODO CODE_ADDRESS_HIGH
-   } else {
-      uint64_t temp = 0xfeULL << 24;
-
-      P_MTHD(push, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
-      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(push, temp >> 32);
-      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(push, temp & 0xffffffff);
-
-      temp = 0xffULL << 24;
-      P_MTHD(push, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
-      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(push, temp >> 32);
-      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(push, temp & 0xffffffff);
-   }
-
-   P_MTHD(push, NVA0C0, SET_SPA_VERSION);
-   P_NVA0C0_SET_SPA_VERSION(push, { .major = dev->ctx->compute.cls >= 0xa1c0 ? 0x4 : 0x3 });
-
-   if (qs->push)
-      nouveau_ws_push_destroy(qs->push);
-   qs->push = push;
-   return 0;
-fail:
-   return vk_error(qs, result);
+   return bo;
 }
 
 static VkResult
-nvk_update_preambles(struct nvk_queue_state *qs, struct nvk_device *device,
-                     struct vk_command_buffer *const *cmd_buffers,
-                     uint32_t cmd_buffer_count)
+nvk_slm_area_ensure(struct nvk_device *dev,
+                    struct nvk_slm_area *area,
+                    uint32_t bytes_per_thread)
 {
-   struct nvk_queue_alloc_info needs = { 0 };
-   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
-      struct nvk_cmd_buffer *cmd = (struct nvk_cmd_buffer *)cmd_buffers[i];
-      needs.tls_size = MAX2(needs.tls_size, cmd->tls_space_needed);
+   assert(bytes_per_thread < (1 << 24));
+
+   /* TODO: Volta+ doesn't use CRS */
+   const uint32_t crs_size = 0;
+
+   uint64_t bytes_per_warp = bytes_per_thread * 32 + crs_size;
+
+   /* The hardware seems to require this alignment for
+    * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP
+    */
+   bytes_per_warp = ALIGN(bytes_per_warp, 0x200);
+
+   uint64_t bytes_per_mp = bytes_per_warp * 64; /* max warps */
+
+   /* The hardware seems to require this alignment for
+    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
+    *
+    * Fortunately, this is just the alignment for bytes_per_warp multiplied
+    * by the number of warps, 64.  It might matter for real on a GPU with 48
+    * warps but we don't support any of those yet.
+    */
+   assert(bytes_per_mp == ALIGN(bytes_per_mp, 0x8000));
+
+   /* nvk_slm_area::bytes_per_mp only ever increases so we can check this
+    * outside the lock and exit early in the common case.  We only need to
+    * take the lock if we're actually going to resize.
+    *
+    * Also, we only care about bytes_per_mp and not bytes_per_warp because
+    * they are integer multiples of each other.
+    */
+   if (likely(bytes_per_mp <= area->bytes_per_mp))
+      return VK_SUCCESS;
+
+   uint64_t size = bytes_per_mp * dev->pdev->dev->mp_count;
+
+   struct nouveau_ws_bo *bo =
+      nouveau_ws_bo_new(dev->pdev->dev, size, 0, NOUVEAU_WS_BO_LOCAL);
+   if (bo == NULL)
+      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+   struct nouveau_ws_bo *unref_bo;
+   simple_mtx_lock(&area->mutex);
+   if (bytes_per_mp <= area->bytes_per_mp) {
+      /* We lost the race, throw away our BO */
+      assert(area->bytes_per_warp == bytes_per_warp);
+      unref_bo = bo;
+   } else {
+      unref_bo = area->bo;
+      area->bo = bo;
+      area->bytes_per_warp = bytes_per_warp;
+      area->bytes_per_mp = bytes_per_mp;
+   }
+   simple_mtx_unlock(&area->mutex);
+
+   if (unref_bo)
+      nouveau_ws_bo_destroy(unref_bo);
+
+   return VK_SUCCESS;
+}
+
+static void
+nvk_queue_state_init(struct nvk_queue_state *qs)
+{
+   memset(qs, 0, sizeof(*qs));
+}
+
+static void
+nvk_queue_state_finish(struct nvk_device *dev,
+                       struct nvk_queue_state *qs)
+{
+   if (qs->images.bo)
+      nouveau_ws_bo_destroy(qs->images.bo);
+   if (qs->samplers.bo)
+      nouveau_ws_bo_destroy(qs->samplers.bo);
+   if (qs->slm.bo)
+      nouveau_ws_bo_destroy(qs->slm.bo);
+   if (qs->push)
+      nouveau_ws_push_destroy(qs->push);
+}
+
+static void
+nvk_queue_state_ref(struct nouveau_ws_push *push,
+                    struct nvk_queue_state *qs)
+{
+   if (qs->images.bo)
+      nouveau_ws_push_ref(push, qs->images.bo, NOUVEAU_WS_BO_RD);
+   if (qs->samplers.bo)
+      nouveau_ws_push_ref(push, qs->samplers.bo, NOUVEAU_WS_BO_RD);
+   if (qs->slm.bo)
+      nouveau_ws_push_ref(push, qs->slm.bo, NOUVEAU_WS_BO_RDWR);
+}
+
+static VkResult
+nvk_queue_state_update(struct nvk_device *dev,
+                       struct nvk_queue_state *qs)
+{
+   struct nouveau_ws_bo *bo;
+   uint32_t alloc_count, bytes_per_warp, bytes_per_mp;
+   bool dirty = false;
+
+   bo = nvk_descriptor_table_get_bo_ref(&dev->images, &alloc_count);
+   if (qs->images.bo != bo || qs->images.alloc_count != alloc_count) {
+      if (qs->images.bo)
+         nouveau_ws_bo_destroy(qs->images.bo);
+      qs->images.bo = bo;
+      qs->images.alloc_count = alloc_count;
+      dirty = true;
+   } else {
+      /* No change */
+      if (bo)
+         nouveau_ws_bo_destroy(bo);
    }
 
-   if (needs.tls_size == qs->alloc_info.tls_size)
+   bo = nvk_descriptor_table_get_bo_ref(&dev->samplers, &alloc_count);
+   if (qs->samplers.bo != bo || qs->samplers.alloc_count != alloc_count) {
+      if (qs->samplers.bo)
+         nouveau_ws_bo_destroy(qs->samplers.bo);
+      qs->samplers.bo = bo;
+      qs->samplers.alloc_count = alloc_count;
+      dirty = true;
+   } else {
+      /* No change */
+      if (bo)
+         nouveau_ws_bo_destroy(bo);
+   }
+
+   bo = nvk_slm_area_get_bo_ref(&dev->slm, &bytes_per_warp, &bytes_per_mp);
+   if (qs->slm.bo != bo || qs->slm.bytes_per_warp != bytes_per_warp ||
+       qs->slm.bytes_per_mp != bytes_per_mp) {
+      if (qs->slm.bo)
+         nouveau_ws_bo_destroy(qs->slm.bo);
+      qs->slm.bo = bo;
+      qs->slm.bytes_per_warp = bytes_per_warp;
+      qs->slm.bytes_per_mp = bytes_per_mp;
+      dirty = true;
+   } else {
+      /* No change */
+      if (bo)
+         nouveau_ws_bo_destroy(bo);
+   }
+
+   /* TODO: We're currently depending on kernel reference counting to protect
+    * us here.  If we ever stop reference counting in the kernel, we will
+    * either need to delay destruction or hold on to our extra BO references
+    * and insert a GPU stall here if anything has changed before dropping our
+    * old references.
+    */
+
+   if (!dirty)
       return VK_SUCCESS;
-
-   return nvk_update_preamble_push(qs, device, &needs);
+
+   struct nouveau_ws_push *p = nouveau_ws_push_new(dev->pdev->dev, 256);
+   if (p == NULL)
+      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+   if (qs->images.bo) {
+      nouveau_ws_push_ref(p, qs->images.bo, NOUVEAU_WS_BO_RD);
+
+      /* Compute */
+      P_MTHD(p, NVA0C0, SET_TEX_HEADER_POOL_A);
+      P_NVA0C0_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
+      P_NVA0C0_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
+      P_NVA0C0_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
+
+      /* 3D */
+      P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
+      P_NV9097_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
+      P_NV9097_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
+      P_NV9097_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
+   }
+
+   if (qs->samplers.bo) {
+      nouveau_ws_push_ref(p, qs->samplers.bo, NOUVEAU_WS_BO_RD);
+
+      /* Compute */
+      P_MTHD(p, NVA0C0, SET_TEX_SAMPLER_POOL_A);
+      P_NVA0C0_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
+      P_NVA0C0_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
+      P_NVA0C0_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
+
+      /* 3D */
+      P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
+      P_NV9097_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
+      P_NV9097_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
+      P_NV9097_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
+   }
+
+   if (qs->slm.bo) {
+      nouveau_ws_push_ref(p, qs->slm.bo, NOUVEAU_WS_BO_RDWR);
+
+      const uint64_t slm_addr = qs->slm.bo->offset;
+      const uint64_t slm_size = qs->slm.bo->size;
+      const uint64_t slm_per_warp = qs->slm.bytes_per_warp;
+      const uint64_t slm_per_mp = qs->slm.bytes_per_mp;
+      assert(!(slm_per_mp & 0x7fff));
+
+      /* Compute */
+      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
+
+      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(p, slm_per_mp >> 32);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(p, slm_per_mp);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(p, 0xff);
+
+      if (dev->ctx->compute.cls < VOLTA_COMPUTE_A) {
+         P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
+         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(p, slm_per_mp >> 32);
+         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(p, slm_per_mp);
+         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(p, 0xff);
+      }
+
+      /* 3D */
+      P_MTHD(p, NV9097, SET_SHADER_LOCAL_MEMORY_A);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_C(p, slm_size >> 32);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_D(p, slm_size);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_E(p, slm_per_warp);
+   }
+
+   /* We set memory windows unconditionally.  Otherwise, the memory window
+    * might be in a random place and cause us to fault off into nowhere.
+    */
+   if (dev->ctx->compute.cls >= VOLTA_COMPUTE_A) {
+      uint64_t temp = 0xfeULL << 24;
+      P_MTHD(p, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
+      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(p, temp >> 32);
+      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(p, temp & 0xffffffff);
+
+      temp = 0xffULL << 24;
+      P_MTHD(p, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
+      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(p, temp >> 32);
+      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(p, temp & 0xffffffff);
+   } else {
+      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(p, 0xff << 24);
+
+      P_MTHD(p, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
+      P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(p, 0xfe << 24);
+
+      // TODO CODE_ADDRESS_HIGH
+   }
+
+   /* From nvc0_screen.c:
+    *
+    *    "Reduce likelihood of collision with real buffers by placing the
+    *    hole at the top of the 4G area. This will have to be dealt with
+    *    for real eventually by blocking off that area from the VM."
+    *
+    * Really?!?  TODO: Fix this for realz.  Annoyingly, we only have a
+    * 32-bit pointer for this in 3D rather than a full 48 like we have for
+    * compute.
+    */
+   P_IMMD(p, NV9097, SET_SHADER_LOCAL_MEMORY_WINDOW, 0xff << 24);
+
+   if (qs->push)
+      nouveau_ws_push_destroy(qs->push);
+   qs->push = p;
+
+   return VK_SUCCESS;
 }
 
 static VkResult
@@ -127,18 +327,17 @@ nvk_queue_init(struct nvk_device *dev, struct nvk_queue *queue,
    if (result != VK_SUCCESS)
       return result;
 
+   nvk_queue_state_init(&queue->state);
+
    return VK_SUCCESS;
 }
 
 static void
 nvk_queue_finish(struct nvk_device *dev, struct nvk_queue *queue)
 {
-   if (queue->state.push)
-      nouveau_ws_push_destroy(queue->state.push);
+   nvk_queue_state_finish(dev, &queue->state);
    if (queue->empty_push)
       nouveau_ws_push_destroy(queue->empty_push);
-   if (queue->state.tls_bo)
-      nouveau_ws_bo_destroy(queue->state.tls_bo);
    vk_queue_finish(&queue->vk);
 }
 
@@ -155,8 +354,8 @@ nvk_queue_submit(struct vk_queue *vkqueue, struct vk_queue_submit *submission)
       P_MTHD(queue->empty_push, NV90B5, NOP);
       P_NV90B5_NOP(queue->empty_push, 0);
    }
-   result = nvk_update_preambles(&queue->state, device, submission->command_buffers,
-                                 submission->command_buffer_count);
+
+   result = nvk_queue_state_update(device, &queue->state);
    if (result != VK_SUCCESS)
       return result;
 
@@ -184,6 +383,8 @@ nvk_queue_submit(struct vk_queue *vkqueue, struct vk_queue_submit *submission)
          nouveau_ws_push_ref(cmd->push, bo_sync->bo, NOUVEAU_WS_BO_RDWR);
       }
 
+      nvk_queue_state_ref(cmd->push, &queue->state);
+
       simple_mtx_lock(&device->memory_objects_lock);
       list_for_each_entry(struct nvk_device_memory, mem,
                           &device->memory_objects, link) {
@@ -267,10 +468,12 @@ nvk_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
      goto fail_images;
 
+   nvk_slm_area_init(&device->slm);
+
    result = nvk_queue_init(device, &device->queue,
                            &pCreateInfo->pQueueCreateInfos[0], 0);
    if (result != VK_SUCCESS)
-      goto fail_samplers;
+      goto fail_slm;
 
    if (pthread_mutex_init(&device->mutex, NULL) != 0) {
      result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
@@ -312,7 +515,8 @@ fail_mutex:
    pthread_mutex_destroy(&device->mutex);
 fail_queue:
    nvk_queue_finish(device, &device->queue);
-fail_samplers:
+fail_slm:
+   nvk_slm_area_finish(&device->slm);
    nvk_descriptor_table_finish(device, &device->samplers);
 fail_images:
    nvk_descriptor_table_finish(device, &device->images);
@@ -340,6 +544,7 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    pthread_mutex_destroy(&device->mutex);
    nvk_queue_finish(device, &device->queue);
    vk_device_finish(&device->vk);
+   nvk_slm_area_finish(&device->slm);
    nvk_descriptor_table_finish(device, &device->samplers);
    nvk_descriptor_table_finish(device, &device->images);
    assert(list_is_empty(&device->memory_objects));
@@ -347,3 +552,10 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    nouveau_ws_context_destroy(device->ctx);
    vk_free(&device->vk.alloc, device);
 }
+
+VkResult
+nvk_device_ensure_slm(struct nvk_device *dev,
+                      uint32_t bytes_per_thread)
+{
+   return nvk_slm_area_ensure(dev, &dev->slm, bytes_per_thread);
+}
diff --git a/src/nouveau/vulkan/nvk_device.h b/src/nouveau/vulkan/nvk_device.h
index 01b085eb825..144969acf24 100644
--- a/src/nouveau/vulkan/nvk_device.h
+++ b/src/nouveau/vulkan/nvk_device.h
@@ -11,13 +11,29 @@
 struct novueau_ws_context;
 struct nvk_physical_device;
 
-struct nvk_queue_alloc_info {
-   uint64_t tls_size;
+struct nvk_slm_area {
+   simple_mtx_t mutex;
+   struct nouveau_ws_bo *bo;
+   uint32_t bytes_per_warp;
+   uint32_t bytes_per_mp;
 };
 
 struct nvk_queue_state {
-   struct nvk_queue_alloc_info alloc_info;
-   struct nouveau_ws_bo *tls_bo;
+   struct {
+      struct nouveau_ws_bo *bo;
+      uint32_t alloc_count;
+   } images;
+
+   struct {
+      struct nouveau_ws_bo *bo;
+      uint32_t alloc_count;
+   } samplers;
+
+   struct {
+      struct nouveau_ws_bo *bo;
+      uint32_t bytes_per_warp;
+      uint32_t bytes_per_mp;
+   } slm;
 
    struct nouveau_ws_push *push;
 };
@@ -41,6 +57,7 @@ struct nvk_device {
 
    struct nvk_descriptor_table images;
    struct nvk_descriptor_table samplers;
+   struct nvk_slm_area slm;
 
    struct nvk_queue queue;
 
@@ -52,6 +69,9 @@ struct nvk_device {
 
 VK_DEFINE_HANDLE_CASTS(nvk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
 
+VkResult nvk_device_ensure_slm(struct nvk_device *dev,
+                               uint32_t bytes_per_thread);
+
 static struct nvk_physical_device *
 nvk_device_physical(struct nvk_device *device)
 {
diff --git a/src/nouveau/vulkan/nvk_shader.c b/src/nouveau/vulkan/nvk_shader.c
index f6ce2baa90a..3ea63ff8502 100644
--- a/src/nouveau/vulkan/nvk_shader.c
+++ b/src/nouveau/vulkan/nvk_shader.c
@@ -652,7 +652,7 @@ nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
       assert(info_out.bin.tlsSpace < (1 << 24));
       shader->hdr[0] |= 1 << 26;
       shader->hdr[1] |= align(info_out.bin.tlsSpace, 0x10); /* l[] size */
-      shader->need_tls = true;
+      shader->slm_size = info_out.bin.tlsSpace;
    }
 
    if (info_out.io.globalAccess)
diff --git a/src/nouveau/vulkan/nvk_shader.h b/src/nouveau/vulkan/nvk_shader.h
index 023e592a69d..e6fbfde30f5 100644
--- a/src/nouveau/vulkan/nvk_shader.h
+++ b/src/nouveau/vulkan/nvk_shader.h
@@ -22,9 +22,9 @@ struct nvk_shader {
    uint8_t *code_ptr;
    uint32_t code_size;
 
-   bool need_tls;
    uint8_t num_gprs;
    uint8_t num_barriers;
+   uint32_t slm_size;
 
    uint32_t hdr[NVC0_MAX_SHADER_HEADER_SIZE/4];
    uint32_t flags[2];
@@ -53,7 +53,6 @@
    } fs;
 
    struct {
-      uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
      uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
      uint32_t block_size[3];
    } cp;
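
Reviewer note (not part of the patch): the sizing math in nvk_slm_area_ensure() above, pulled out into a standalone sketch. The constants (32 threads per warp, 64 warps per MP, the 0x200 and 0x8000 alignments) come straight from the diff; align_pot(), slm_size_for(), and the 16-MP GPU in main() are hypothetical illustration only, not NVK code.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Round x up to the next multiple of pow2 (pow2 must be a power of two). */
static uint64_t
align_pot(uint64_t x, uint64_t pow2)
{
   return (x + pow2 - 1) & ~(pow2 - 1);
}

/* Mirrors nvk_slm_area_ensure(): per-thread bytes are scaled to a warp
 * (32 threads plus CRS, zero here), aligned to 0x200 as
 * SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP appears to require,
 * then scaled to the 64-warp maximum per MP, and finally to the GPU.
 */
static uint64_t
slm_size_for(uint32_t bytes_per_thread, uint32_t mp_count)
{
   assert(bytes_per_thread < (1u << 24));

   const uint32_t crs_size = 0;                 /* TODO in the diff: Volta+ */
   uint64_t bytes_per_warp = (uint64_t)bytes_per_thread * 32 + crs_size;
   bytes_per_warp = align_pot(bytes_per_warp, 0x200);

   uint64_t bytes_per_mp = bytes_per_warp * 64; /* max warps per MP */
   /* 0x200 * 64 == 0x8000, so this alignment falls out for free. */
   assert(bytes_per_mp == align_pot(bytes_per_mp, 0x8000));

   return bytes_per_mp * mp_count;
}

int
main(void)
{
   /* A shader spilling 0x10 bytes per thread on a hypothetical 16-MP GPU:
    * 0x10 * 32 = 0x200 per warp, * 64 = 0x8000 per MP, * 16 = 0x80000 total.
    */
   printf("0x%" PRIx64 "\n", slm_size_for(0x10, 16));
   return 0;
}

Because 0x200 * 64 == 0x8000, the per-MP alignment the hardware wants falls out of the per-warp alignment automatically, which is exactly what the assert in the patch relies on.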
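Reviewer note: nvk_slm_area_ensure() uses an optimistic check / lock / re-check scheme that is only sound because bytes_per_mp grows monotonically. Below is a minimal CPU-only sketch of the same pattern, assuming pthreads in place of mesa's simple_mtx, GCC/Clang __atomic builtins for the unlocked read, and malloc/free in place of BO management; all names are hypothetical.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Distilled version of the nvk_slm_area_ensure() locking scheme: a
 * grow-only allocation guarded by a mutex.  "buffer" stands in for the
 * nouveau_ws_bo; none of these names are NVK API.
 */
struct grow_only_area {
   pthread_mutex_t mutex;
   void *buffer;
   uint64_t size;   /* only ever increases, like bytes_per_mp */
};

static bool
area_ensure(struct grow_only_area *area, uint64_t needed)
{
   /* Unlocked early-out: "size" only grows, so a stale read can only
    * send us through the locked path below one extra time, never skip
    * a required resize.
    */
   if (__atomic_load_n(&area->size, __ATOMIC_RELAXED) >= needed)
      return true;

   /* Allocate outside the lock to keep the critical section short. */
   void *buf = malloc(needed);
   if (buf == NULL)
      return false;

   void *free_buf;
   pthread_mutex_lock(&area->mutex);
   if (area->size >= needed) {
      /* Lost the race with another thread; discard our allocation. */
      free_buf = buf;
   } else {
      free_buf = area->buffer;
      area->buffer = buf;
      __atomic_store_n(&area->size, needed, __ATOMIC_RELAXED);
   }
   pthread_mutex_unlock(&area->mutex);

   free(free_buf);   /* free(NULL) is a no-op */
   return true;
}

int
main(void)
{
   struct grow_only_area area = {
      .mutex = PTHREAD_MUTEX_INITIALIZER,
   };
   area_ensure(&area, 1 << 15);   /* grows */
   area_ensure(&area, 1 << 12);   /* no-op: smaller than current size */
   free(area.buffer);
   return 0;
}

The real code cannot free the old BO this eagerly while the GPU may still be reading it; as the TODO in the diff notes, it currently leans on kernel reference counting for that.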
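Reviewer note: nvk_queue_state_update() repeats one idiom three times: take a referenced snapshot of a shared table, compare it against the queue's cached copy, and consume exactly one reference either way, setting "dirty" only when the cache actually changed. A sketch of that adopt-or-drop step under the same ownership rules (buf, buf_unref, and cached_table_update are hypothetical stand-ins, not NVK code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Tiny refcounted stand-in for nouveau_ws_bo; illustrative only. */
struct buf {
   int refcount;
};

static void
buf_unref(struct buf *b)
{
   if (--b->refcount == 0)
      printf("buf %p destroyed\n", (void *)b);
}

struct cached_table {
   struct buf *bo;
   uint32_t alloc_count;
};

/* Adopt a freshly referenced snapshot (bo, alloc_count) into the cache,
 * mirroring the per-table logic in nvk_queue_state_update().  Exactly one
 * reference is consumed either way: the new snapshot when nothing changed,
 * or the stale cached one when it did.  Returns the "dirty" flag.
 */
static bool
cached_table_update(struct cached_table *c, struct buf *bo,
                    uint32_t alloc_count)
{
   if (c->bo == bo && c->alloc_count == alloc_count) {
      if (bo != NULL)
         buf_unref(bo);   /* no change: drop the snapshot reference */
      return false;
   }

   if (c->bo != NULL)
      buf_unref(c->bo);   /* changed: release the stale cached reference */
   c->bo = bo;            /* the cache now owns the snapshot reference */
   c->alloc_count = alloc_count;
   return true;
}

int
main(void)
{
   struct buf b = { .refcount = 1 };
   struct cached_table t = { 0 };
   bool dirty;

   b.refcount++;                                /* snapshot reference */
   dirty = cached_table_update(&t, &b, 16);
   printf("first update dirty: %d\n", dirty);   /* 1: new bo adopted */

   b.refcount++;                                /* second snapshot */
   dirty = cached_table_update(&t, &b, 16);
   printf("second update dirty: %d\n", dirty);  /* 0: nothing changed */
   return 0;
}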