From 80649e148d3743f442926ab5efe2593664298918 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 27 Mar 2024 15:39:27 -0400 Subject: [PATCH] tu: Implement unaligned dispatches These will be used for BVH building. Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.cc | 267 ++++++++++++++++++++----- src/freedreno/vulkan/tu_cmd_buffer.h | 6 + src/freedreno/vulkan/tu_device.cc | 1 + src/freedreno/vulkan/tu_tracepoints.py | 2 + 4 files changed, 230 insertions(+), 46 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 5a83e3267b9..001f6d2e068 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -6478,8 +6478,7 @@ struct tu_dispatch_info /** * Indirect compute parameters resource. */ - struct tu_buffer *indirect; - uint64_t indirect_offset; + VkDeviceAddress indirect; }; static inline struct ir3_driver_params_cs @@ -6524,7 +6523,7 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd, return; bool direct_indirect_load = - !(info->indirect_offset & 0xf) && + !(info->indirect & 0xf) && !(info->indirect && num_consts > IR3_DP_CS(base_group_x)); uint64_t iova = 0; @@ -6545,13 +6544,13 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd, memcpy(consts.map, &driver_params, num_consts * sizeof(uint32_t)); iova = consts.iova; } else if (direct_indirect_load) { - iova = info->indirect->iova + info->indirect_offset; + iova = info->indirect; } else { /* Vulkan guarantees only 4 byte alignment for indirect_offset. * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment. */ - uint64_t indirect_iova = info->indirect->iova + info->indirect_offset; + uint64_t indirect_iova = info->indirect; /* Wait for any previous uses to finish. 
*/ tu_cs_emit_wfi(cs); @@ -6626,21 +6625,19 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd, tu_cs_emit(cs, 0); tu_cs_emit(cs, 0); tu_cs_emit_array(cs, (uint32_t *)&driver_params, num_consts); - } else if (!(info->indirect_offset & 0xf)) { + } else if (!(info->indirect & 0xf)) { tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | CP_LOAD_STATE6_0_NUM_UNIT(1)); - tu_cs_emit_qw(cs, info->indirect->iova + info->indirect_offset); + tu_cs_emit_qw(cs, info->indirect); } else { /* Vulkan guarantees only 4 byte alignment for indirect_offset. * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment. */ - uint64_t indirect_iova = info->indirect->iova + info->indirect_offset; - /* Wait for any previous uses to finish. */ tu_cs_emit_wfi(cs); @@ -6648,7 +6645,7 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5); tu_cs_emit(cs, 0); tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, i)); - tu_cs_emit_qw(cs, indirect_iova + i * 4); + tu_cs_emit_qw(cs, info->indirect + i * 4); } tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); @@ -6756,53 +6753,205 @@ tu_dispatch(struct tu_cmd_buffer *cmd, const uint16_t *local_size = shader->variant->local_size; const uint32_t *num_groups = info->blocks; - tu_cs_emit_regs(cs, - HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 3, - .localsizex = local_size[0] - 1, - .localsizey = local_size[1] - 1, - .localsizez = local_size[2] - 1), - HLSQ_CS_NDRANGE_1(CHIP, .globalsize_x = local_size[0] * num_groups[0]), - HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0), - HLSQ_CS_NDRANGE_3(CHIP, .globalsize_y = local_size[1] * num_groups[1]), - HLSQ_CS_NDRANGE_4(CHIP, .globaloff_y = 0), - HLSQ_CS_NDRANGE_5(CHIP, .globalsize_z = local_size[2] * num_groups[2]), - HLSQ_CS_NDRANGE_6(CHIP, .globaloff_z = 0)); - if (CHIP >= A7XX) { + if 
(info->unaligned) { + assert(CHIP >= A7XX); + + if (info->indirect) { + /* This path is tailored for BVH building and currently only supports + * 1-dimensional dispatches with a power-of-two local size. We use + * CP_RUN_OPENCL instead of CP_EXEC_CS in order to dynamically set + * HLSQ_CS_KERNEL_GROUP_X, which is usually set implicitly by the + * packet, to the number of workgroups. The registers for Y and Z + * dimensions should be unused because we set the kernel dimension to + * 1. + */ + assert(local_size[1] == 1 && local_size[2] == 1); + assert(util_is_power_of_two_nonzero(local_size[0])); + + tu_cs_emit_regs(cs, + HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 1, + .localsizex = local_size[0] - 1)); + + tu_cs_emit_regs(cs, HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0)); + + /* This does: + * - waits for pending cache flushes to finish + * - CP_WAIT_FOR_ME + * + * In a sequence of indirect dispatches this shouldn't wait for the + * previous dispatches to finish. + */ + tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3); + tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A7XX_HLSQ_CS_NDRANGE_1)); + tu_cs_emit_qw(cs, info->indirect); + + tu_cs_emit_pkt7(cs, CP_SCRATCH_WRITE, 2); + tu_cs_emit(cs, CP_SCRATCH_WRITE_0_SCRATCH(0)); + tu_cs_emit(cs, ~0u); + + /* CP_REG_RMW and CP_REG_TO_SCRATCH implicitly do a CP_WAIT_FOR_IDLE + * *and* CP_WAIT_FOR_ME, which is a full pipeline stall that we don't + * want, so manually wait for the CP_MEM_TO_REG write to land and + * then skip waiting below with SKIP_WAIT_FOR_ME. 
+ */ tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + + /* scratch0 = (scratch0 & CS_NDRANGE_1) + -1 + * = (~0 & CS_NDRANGE_1) + -1 + * = CS_NDRANGE_1 - 1 + */ + tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); + tu_cs_emit(cs, + CP_REG_RMW_0_DST_REG(0) | + CP_REG_RMW_0_DST_SCRATCH | + CP_REG_RMW_0_SKIP_WAIT_FOR_ME | + CP_REG_RMW_0_SRC0_IS_REG | + CP_REG_RMW_0_SRC1_ADD); + tu_cs_emit(cs, REG_A7XX_HLSQ_CS_NDRANGE_1); /* SRC0 */ + tu_cs_emit(cs, -1); /* SRC1 */ + + /* scratch0 = (scratch0 & (local_size - 1)) rot 2 + * = (scratch0 & (local_size - 1)) << 2 + */ + tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); + tu_cs_emit(cs, + CP_REG_RMW_0_DST_REG(0) | + CP_REG_RMW_0_DST_SCRATCH | + CP_REG_RMW_0_SKIP_WAIT_FOR_ME | + CP_REG_RMW_0_ROTATE(A7XX_HLSQ_CS_LAST_LOCAL_SIZE_LOCALSIZEX__SHIFT)); + tu_cs_emit(cs, local_size[0] - 1); /* SRC0 */ + tu_cs_emit(cs, 0); /* SRC1 */ + + /* write scratch0 to HLSQ_CS_LAST_LOCAL_SIZE */ + tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1); + tu_cs_emit(cs, + CP_SCRATCH_TO_REG_0_REG(REG_A7XX_HLSQ_CS_LAST_LOCAL_SIZE) | + CP_SCRATCH_TO_REG_0_SCRATCH(0)); + + tu_cs_emit_pkt7(cs, CP_SCRATCH_WRITE, 2); + tu_cs_emit(cs, CP_SCRATCH_WRITE_0_SCRATCH(0)); + tu_cs_emit(cs, ~0u); + + /* scratch0 = (scratch0 & CS_NDRANGE_1) + local_size - 1 + * = (~0u & CS_NDRANGE_1) + local_size - 1 + * = CS_NDRANGE_1 + local_size - 1 + */ + tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); + tu_cs_emit(cs, + CP_REG_RMW_0_DST_REG(0) | + CP_REG_RMW_0_DST_SCRATCH | + CP_REG_RMW_0_SKIP_WAIT_FOR_ME | + CP_REG_RMW_0_SRC0_IS_REG | + CP_REG_RMW_0_SRC1_ADD); + tu_cs_emit(cs, REG_A7XX_HLSQ_CS_NDRANGE_1); /* SRC0 */ + tu_cs_emit(cs, local_size[0] - 1); /* SRC1 */ + + unsigned local_size_log2 = util_logbase2(local_size[0]); + + /* scratch0 = (scratch0 & ~(local_size - 1)) rot (32 - log2(local_size)) + * = scratch0 >> log2(local_size) + * = scratch0 / local_size + * = (CS_NDRANGE_1 + local_size - 1) / local_size + */ + tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); + tu_cs_emit(cs, + CP_REG_RMW_0_DST_REG(0) | + CP_REG_RMW_0_DST_SCRATCH | 
+ CP_REG_RMW_0_SKIP_WAIT_FOR_ME | + CP_REG_RMW_0_ROTATE(32 - local_size_log2)); + tu_cs_emit(cs, ~(local_size[0] - 1)); /* SRC0 */ + tu_cs_emit(cs, 0); /* SRC1 */ + + /* write scratch0 to HLSQ_CS_KERNEL_GROUP_X */ + tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1); + tu_cs_emit(cs, + CP_SCRATCH_TO_REG_0_REG(REG_A7XX_HLSQ_CS_KERNEL_GROUP_X) | + CP_SCRATCH_TO_REG_0_SCRATCH(0)); + } else { + tu_cs_emit_regs(cs, + HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 3, + .localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + .localsizez = local_size[2] - 1), + HLSQ_CS_NDRANGE_1(CHIP, .globalsize_x = num_groups[0]), + HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0), + HLSQ_CS_NDRANGE_3(CHIP, .globalsize_y = num_groups[1]), + HLSQ_CS_NDRANGE_4(CHIP, .globaloff_y = 0), + HLSQ_CS_NDRANGE_5(CHIP, .globalsize_z = num_groups[2]), + HLSQ_CS_NDRANGE_6(CHIP, .globaloff_z = 0)); + uint32_t last_local_size[3]; + for (unsigned i = 0; i < 3; i++) + last_local_size[i] = ((num_groups[i] - 1) % local_size[i]) + 1; + tu_cs_emit_regs(cs, + A7XX_HLSQ_CS_LAST_LOCAL_SIZE(.localsizex = last_local_size[0] - 1, + .localsizey = last_local_size[1] - 1, + .localsizez = last_local_size[2] - 1)); + } + } else { tu_cs_emit_regs(cs, - A7XX_HLSQ_CS_LAST_LOCAL_SIZE(.localsizex = local_size[0] - 1, - .localsizey = local_size[1] - 1, - .localsizez = local_size[2] - 1)); + HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 3, + .localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + .localsizez = local_size[2] - 1), + HLSQ_CS_NDRANGE_1(CHIP, .globalsize_x = local_size[0] * num_groups[0]), + HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0), + HLSQ_CS_NDRANGE_3(CHIP, .globalsize_y = local_size[1] * num_groups[1]), + HLSQ_CS_NDRANGE_4(CHIP, .globaloff_y = 0), + HLSQ_CS_NDRANGE_5(CHIP, .globalsize_z = local_size[2] * num_groups[2]), + HLSQ_CS_NDRANGE_6(CHIP, .globaloff_z = 0)); + if (CHIP >= A7XX) { + tu_cs_emit_regs(cs, + A7XX_HLSQ_CS_LAST_LOCAL_SIZE(.localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + 
.localsizez = local_size[2] - 1)); + } } if (info->indirect) { - uint64_t iova = info->indirect->iova + info->indirect_offset; + trace_start_compute_indirect(&cmd->trace, cs, info->unaligned); - trace_start_compute_indirect(&cmd->trace, cs); + if (info->unaligned) { + tu_cs_emit_pkt7(cs, CP_RUN_OPENCL, 1); + tu_cs_emit(cs, 0x00000000); + } else { + tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit_qw(cs, info->indirect); + tu_cs_emit(cs, + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); - tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit_qw(cs, iova); - tu_cs_emit(cs, - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); + } trace_end_compute_indirect(&cmd->trace, cs, (struct u_trace_address) { - .bo = info->indirect->bo, - .offset = info->indirect_offset, + .bo = NULL, + .offset = info->indirect, }); } else { - trace_start_compute(&cmd->trace, cs, info->indirect != NULL, - local_size[0], local_size[1], local_size[2], - info->blocks[0], info->blocks[1], info->blocks[2]); + trace_start_compute(&cmd->trace, cs, info->indirect != 0, + info->unaligned, local_size[0], local_size[1], + local_size[2], info->blocks[0], info->blocks[1], + info->blocks[2]); - tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0])); - tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1])); - tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2])); + if (info->unaligned) { + tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(DIV_ROUND_UP(info->blocks[0], + local_size[0]))); + tu_cs_emit(cs, 
CP_EXEC_CS_2_NGROUPS_Y(DIV_ROUND_UP(info->blocks[1], + local_size[1]))); + tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(DIV_ROUND_UP(info->blocks[2], + local_size[2]))); + } else { + tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0])); + tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1])); + tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2])); + } trace_end_compute(&cmd->trace, cs); } @@ -6852,13 +7001,39 @@ tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VK_FROM_HANDLE(tu_buffer, buffer, _buffer); struct tu_dispatch_info info = {}; - info.indirect = buffer; - info.indirect_offset = offset; + info.indirect = buffer->iova + offset; tu_dispatch(cmd_buffer, &info); } TU_GENX(tu_CmdDispatchIndirect); +void +tu_dispatch_unaligned(VkCommandBuffer commandBuffer, + uint32_t x, uint32_t y, uint32_t z) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + struct tu_dispatch_info info = {}; + + info.unaligned = true; + info.blocks[0] = x; + info.blocks[1] = y; + info.blocks[2] = z; + TU_CALLX(cmd_buffer->device, tu_dispatch)(cmd_buffer, &info); +} + +void +tu_dispatch_unaligned_indirect(VkCommandBuffer commandBuffer, + VkDeviceAddress size_addr) +{ + VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + struct tu_dispatch_info info = {}; + + info.unaligned = true; + info.indirect = size_addr; + + TU_CALLX(cmd_buffer->device, tu_dispatch)(cmd_buffer, &info); +} + VKAPI_ATTR void VKAPI_CALL tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index 47455f12d89..e9cd849f6ff 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -683,6 +683,12 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, template void tu_cmd_render(struct tu_cmd_buffer *cmd); +void tu_dispatch_unaligned(VkCommandBuffer commandBuffer, + 
uint32_t x, uint32_t y, uint32_t z); + +void tu_dispatch_unaligned_indirect(VkCommandBuffer commandBuffer, + VkDeviceAddress size_addr); + void tu_write_buffer_cp(VkCommandBuffer commandBuffer, VkDeviceAddress addr, void *data, uint32_t size); diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 4070fbb3f85..6306c4f3e4b 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -2707,6 +2707,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, fd_rd_output_init(&device->rd_output, output_name); } + device->vk.cmd_dispatch_unaligned = tu_dispatch_unaligned; device->vk.write_buffer_cp = tu_write_buffer_cp; device->vk.flush_buffer_write_cp = tu_flush_buffer_write_cp; device->vk.cmd_fill_buffer_addr = tu_cmd_fill_buffer_addr; diff --git a/src/freedreno/vulkan/tu_tracepoints.py b/src/freedreno/vulkan/tu_tracepoints.py index be5b0475499..5143a957e94 100644 --- a/src/freedreno/vulkan/tu_tracepoints.py +++ b/src/freedreno/vulkan/tu_tracepoints.py @@ -142,6 +142,7 @@ begin_end_tp('blit', begin_end_tp('compute', args=[Arg(type='uint8_t', var='indirect', c_format='%u'), + Arg(type='uint8_t', var='unaligned', c_format='%u'), Arg(type='uint16_t', var='local_size_x', c_format='%u'), Arg(type='uint16_t', var='local_size_y', c_format='%u'), Arg(type='uint16_t', var='local_size_z', c_format='%u'), @@ -150,6 +151,7 @@ begin_end_tp('compute', Arg(type='uint16_t', var='num_groups_z', c_format='%u')]) begin_end_tp('compute_indirect', + args=[Arg(type='uint8_t', var='unaligned', c_format='%u')], end_args=[ArgStruct(type='VkDispatchIndirectCommand', var='size', is_indirect=True, c_format="%ux%ux%u", fields=['x', 'y', 'z'])])