From 2a8673f6946b90cce977f5a7f854b0fb6782d2b9 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 28 Aug 2022 20:08:09 +0200 Subject: [PATCH] nvk: rework QMD handling to support pre Turing Signed-off-by: Karol Herbst Part-of: --- src/nouveau/vulkan/nvk_cmd_dispatch.c | 59 ++++++++++--- src/nouveau/vulkan/nvk_compute_pipeline.c | 103 ++++++++++++++++++---- src/nouveau/vulkan/nvk_queue.c | 2 - 3 files changed, 133 insertions(+), 31 deletions(-) diff --git a/src/nouveau/vulkan/nvk_cmd_dispatch.c b/src/nouveau/vulkan/nvk_cmd_dispatch.c index 09ae88adf72..bdbdc16937e 100644 --- a/src/nouveau/vulkan/nvk_cmd_dispatch.c +++ b/src/nouveau/vulkan/nvk_cmd_dispatch.c @@ -13,6 +13,7 @@ #include "nvk_cl9097.h" #include "nvk_cla0c0.h" #include "cla1c0.h" +#include "clc0c0.h" #include "nvk_clc3c0.h" #include "nvk_clc597.h" @@ -34,12 +35,22 @@ nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd, { } static void -qmd_set_dispatch_size(UNUSED struct nvk_device *dev, uint32_t *qmd, - uint32_t x, uint32_t y, uint32_t z) +nva0c0_qmd_set_dispatch_size(UNUSED struct nvk_device *dev, uint32_t *qmd, + uint32_t x, uint32_t y, uint32_t z) { - NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, x); - NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, y); - NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_DEPTH, z); + NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_WIDTH, x); + NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_HEIGHT, y); + NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_DEPTH, z); +} + +static void +nvc0c0_qmd_set_dispatch_size(UNUSED struct nvk_device *dev, uint32_t *qmd, + uint32_t x, uint32_t y, uint32_t z) +{ + NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_WIDTH, x); + NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_HEIGHT, y); + /* this field is different from older QMD versions */ + NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_DEPTH, z); } static uint32_t @@ -54,8 +65,18 @@ qmd_dispatch_size_offset(struct nvk_device *dev) } static inline void -gp100_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, - uint32_t size, uint64_t address) +nva0c0_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, + uint32_t size, uint64_t address) +{ + NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address); + NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32); + NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_SIZE, index, size); + NVA0C0_QMDV00_06_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE); +} + +static inline void +nvc0c0_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, + uint32_t size, uint64_t address) { NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address); NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32); @@ -90,6 +111,7 @@ nvk_flush_compute_state(struct nvk_cmd_buffer *cmd, const struct nvk_compute_pipeline *pipeline = cmd->state.cs.pipeline; const struct nvk_shader *shader = &pipeline->base.shaders[MESA_SHADER_COMPUTE]; + const struct nvk_device *dev = nvk_cmd_buffer_device(cmd); struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors; VkResult result; @@ -112,13 +134,24 @@ nvk_flush_compute_state(struct nvk_cmd_buffer *cmd, memset(qmd, 0, sizeof(qmd)); memcpy(qmd, pipeline->qmd_template, sizeof(pipeline->qmd_template)); - qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd, - desc->root.cs.grid_size[0], - desc->root.cs.grid_size[1], - desc->root.cs.grid_size[2]); + if (dev->ctx->compute.cls >= PASCAL_COMPUTE_A) { + nvc0c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd, + desc->root.cs.grid_size[0], + desc->root.cs.grid_size[1], + desc->root.cs.grid_size[2]); - gp100_cp_launch_desc_set_cb(qmd, 0, sizeof(desc->root), root_desc_addr); - gp100_cp_launch_desc_set_cb(qmd, 1, sizeof(desc->root), root_desc_addr); + nvc0c0_cp_launch_desc_set_cb(qmd, 0, sizeof(desc->root), root_desc_addr); + nvc0c0_cp_launch_desc_set_cb(qmd, 1, sizeof(desc->root), root_desc_addr); + } else { + assert(dev->ctx->compute.cls >= KEPLER_COMPUTE_A); + nva0c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd, + desc->root.cs.grid_size[0], + desc->root.cs.grid_size[1], + desc->root.cs.grid_size[2]); + + nva0c0_cp_launch_desc_set_cb(qmd, 0, sizeof(desc->root), root_desc_addr); + nva0c0_cp_launch_desc_set_cb(qmd, 1, sizeof(desc->root), root_desc_addr); + } uint64_t qmd_addr; result = nvk_cmd_buffer_upload_data(cmd, qmd, sizeof(qmd), 256, &qmd_addr); diff --git a/src/nouveau/vulkan/nvk_compute_pipeline.c b/src/nouveau/vulkan/nvk_compute_pipeline.c index 5a1ccd4b6b5..1f63776eb09 100644 --- a/src/nouveau/vulkan/nvk_compute_pipeline.c +++ b/src/nouveau/vulkan/nvk_compute_pipeline.c @@ -1,17 +1,22 @@ #include "nvk_private.h" #include "nvk_device.h" +#include "nvk_physical_device.h" #include "nvk_pipeline.h" #include "nvk_shader.h" #include "vk_nir.h" #include "vk_pipeline.h" #include "nouveau_bo.h" +#include "nouveau_context.h" #include "compiler/spirv/nir_spirv.h" #include "drf.h" +#include "cla0c0.h" #include "cla0c0qmd.h" +#include "clc0c0.h" #include "clc0c0qmd.h" +#include "clc3c0.h" #include "clc3c0qmd.h" #define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a) #define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a) @@ -20,6 +25,11 @@ #define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a) #define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a) +#define QMD_DEF_SET(qmd, class_id, version_major, version_minor, a...) \ + NVDEF_MW_SET((qmd), NV##class_id, QMDV##version_major##_##version_minor, ##a) +#define QMD_VAL_SET(qmd, class_id, version_major, version_minor, a...) \ + NVVAL_MW_SET((qmd), NV##class_id, QMDV##version_major##_##version_minor, ##a) + static int gv100_sm_config_smem_size(uint32_t size) { @@ -31,32 +41,84 @@ gv100_sm_config_smem_size(uint32_t size) return (size / 4096) + 1; } +#define base_compute_setup_launch_desc_template(qmd, shader, class_id, version_major, version_minor) \ +do { \ + QMD_DEF_SET(qmd, class_id, version_major, version_minor, API_VISIBLE_CALL_LIMIT, NO_CHECK); \ + QMD_VAL_SET(qmd, class_id, version_major, version_minor, BARRIER_COUNT, shader->num_barriers); \ + QMD_VAL_SET(qmd, class_id, version_major, version_minor, CTA_THREAD_DIMENSION0, \ + shader->cp.block_size[0]); \ + QMD_VAL_SET(qmd, class_id, version_major, version_minor, CTA_THREAD_DIMENSION1, \ + shader->cp.block_size[1]); \ + QMD_VAL_SET(qmd, class_id, version_major, version_minor, CTA_THREAD_DIMENSION2, \ + shader->cp.block_size[2]); \ + QMD_VAL_SET(qmd, class_id, version_major, version_minor, QMD_MAJOR_VERSION, version_major); \ + QMD_VAL_SET(qmd, class_id, version_major, version_minor, QMD_VERSION, version_minor); \ + QMD_DEF_SET(qmd, class_id, version_major, version_minor, SAMPLER_INDEX, INDEPENDENTLY); \ + QMD_VAL_SET(qmd, class_id, version_major, version_minor, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0); \ + QMD_VAL_SET(qmd, class_id, version_major, version_minor, SHADER_LOCAL_MEMORY_LOW_SIZE, \ + align(shader->slm_size, 0x10)); \ + QMD_VAL_SET(qmd, class_id, version_major, version_minor, SHARED_MEMORY_SIZE, \ + align(shader->cp.smem_size, 0x100)); \ +} while (0) + static void -gv100_compute_setup_launch_desc_template(uint32_t *qmd, - struct nvk_shader *shader) +nva0c0_compute_setup_launch_desc_template(uint32_t *qmd, + struct nvk_shader *shader) { + base_compute_setup_launch_desc_template(qmd, shader, A0C0, 00, 06); + + NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_DATA_CACHE, TRUE); + NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, TRUE); + NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, TRUE); + NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, TRUE); + NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_DATA_CACHE, TRUE); + + if (shader->cp.smem_size <= (16 << 10)) + NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB); + else if (shader->cp.smem_size <= (32 << 10)) + NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB); + else if (shader->cp.smem_size <= (48 << 10)) + NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB); + else + unreachable("Invalid shared memory size"); + + uint64_t addr = nvk_shader_address(shader); + assert(addr < 0xffffffff); + NVA0C0_QMDV00_06_VAL_SET(qmd, PROGRAM_OFFSET, addr); + NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, shader->num_gprs); + NVA0C0_QMDV00_06_VAL_SET(qmd, SASS_VERSION, 0x30); +} + +static void +nvc0c0_compute_setup_launch_desc_template(uint32_t *qmd, + struct nvk_shader *shader) +{ + base_compute_setup_launch_desc_template(qmd, shader, C0C0, 02, 01); + + uint64_t addr = nvk_shader_address(shader); + assert(addr < 0xffffffff); + + NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1); + NVC0C0_QMDV02_01_VAL_SET(qmd, PROGRAM_OFFSET, addr); + NVC0C0_QMDV02_01_VAL_SET(qmd, REGISTER_COUNT, shader->num_gprs); +} + +static void +nvc3c0_compute_setup_launch_desc_template(uint32_t *qmd, + struct nvk_shader *shader) +{ + base_compute_setup_launch_desc_template(qmd, shader, C3C0, 02, 02); + NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1); - NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK); - NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY); - NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE, - align(shader->cp.smem_size, 0x100)); - NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, - align(shader->slm_size, 0x10)); - NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0); + /* those are all QMD 2.2+ */ NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE, gv100_sm_config_smem_size(8 * 1024)); NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE, gv100_sm_config_smem_size(96 * 1024)); - NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2); - NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2); NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE, gv100_sm_config_smem_size(shader->cp.smem_size)); - NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION0, shader->cp.block_size[0]); - NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION1, shader->cp.block_size[1]); - NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION2, shader->cp.block_size[2]); NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, shader->num_gprs); - NVC3C0_QMDV02_02_VAL_SET(qmd, BARRIER_COUNT, shader->num_barriers); uint64_t addr = nvk_shader_address(shader); NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, addr & 0xffffffff); @@ -114,7 +176,16 @@ nvk_compute_pipeline_create(struct nvk_device *device, if (result != VK_SUCCESS) goto fail; - gv100_compute_setup_launch_desc_template(pipeline->qmd_template, &pipeline->base.shaders[MESA_SHADER_COMPUTE]); + struct nvk_shader *shader = &pipeline->base.shaders[MESA_SHADER_COMPUTE]; + if (device->ctx->compute.cls >= VOLTA_COMPUTE_A) + nvc3c0_compute_setup_launch_desc_template(pipeline->qmd_template, shader); + else if (device->ctx->compute.cls >= PASCAL_COMPUTE_A) + nvc0c0_compute_setup_launch_desc_template(pipeline->qmd_template, shader); + else if (device->ctx->compute.cls >= KEPLER_COMPUTE_A) + nva0c0_compute_setup_launch_desc_template(pipeline->qmd_template, shader); + else + unreachable("Fermi and older not supported!"); + *pPipeline = nvk_pipeline_to_handle(&pipeline->base); return VK_SUCCESS; diff --git a/src/nouveau/vulkan/nvk_queue.c b/src/nouveau/vulkan/nvk_queue.c index c70949a9d45..7c7632ceb44 100644 --- a/src/nouveau/vulkan/nvk_queue.c +++ b/src/nouveau/vulkan/nvk_queue.c @@ -210,8 +210,6 @@ nvk_queue_state_update(struct nvk_device *dev, P_MTHD(p, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW); P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(p, 0xfe << 24); - - // TODO CODE_ADDRESS_HIGH } /* From nvc0_screen.c: