tu: Implement unaligned dispatches

These will be used for BVH building.
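
An "unaligned" dispatch takes its size in invocations rather than workgroups, so the total need not be a multiple of the workgroup size; the hardware masks off the excess invocations in the trailing workgroup. As a rough CPU-side sketch of the mapping the driver programs below (names are illustrative, not driver code):

    #include <stdint.h>

    /* Illustrative only: relation between an unaligned dispatch (counts in
     * invocations) and the workgroup-based launch the hardware performs. */
    static void
    unaligned_to_groups(const uint32_t blocks[3], const uint16_t local_size[3],
                        uint32_t num_groups[3], uint32_t last_local[3])
    {
       for (unsigned i = 0; i < 3; i++) {
          /* DIV_ROUND_UP(blocks, local_size): enough groups to cover all
           * invocations, with the last group possibly partial. */
          num_groups[i] = (blocks[i] + local_size[i] - 1) / local_size[i];
          /* Invocations that remain live in that last group. */
          last_local[i] = ((blocks[i] - 1) % local_size[i]) + 1;
       }
    }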

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28447>
Connor Abbott 2024-03-27 15:39:27 -04:00 (committed by Marge Bot)
commit 80649e148d, parent 1bee1a9301
4 changed files with 230 additions and 46 deletions

@@ -6478,8 +6478,7 @@ struct tu_dispatch_info
    /**
     * Indirect compute parameters resource.
     */
-   struct tu_buffer *indirect;
-   uint64_t indirect_offset;
+   VkDeviceAddress indirect;
 };
 
 static inline struct ir3_driver_params_cs
@@ -6524,7 +6523,7 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
       return;
 
    bool direct_indirect_load =
-      !(info->indirect_offset & 0xf) &&
+      !(info->indirect & 0xf) &&
       !(info->indirect && num_consts > IR3_DP_CS(base_group_x));
 
    uint64_t iova = 0;
@@ -6545,13 +6544,13 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
       memcpy(consts.map, &driver_params, num_consts * sizeof(uint32_t));
       iova = consts.iova;
    } else if (direct_indirect_load) {
-      iova = info->indirect->iova + info->indirect_offset;
+      iova = info->indirect;
    } else {
       /* Vulkan guarantees only 4 byte alignment for indirect_offset.
        * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
        */
-      uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;
+      uint64_t indirect_iova = info->indirect;
 
       /* Wait for any previous uses to finish. */
       tu_cs_emit_wfi(cs);
@@ -6626,21 +6625,19 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
       tu_cs_emit(cs, 0);
       tu_cs_emit(cs, 0);
       tu_cs_emit_array(cs, (uint32_t *)&driver_params, num_consts);
-   } else if (!(info->indirect_offset & 0xf)) {
+   } else if (!(info->indirect & 0xf)) {
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(1));
-     tu_cs_emit_qw(cs, info->indirect->iova + info->indirect_offset);
+     tu_cs_emit_qw(cs, info->indirect);
    } else {
       /* Vulkan guarantees only 4 byte alignment for indirect_offset.
        * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
        */
-      uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;
-
       /* Wait for any previous uses to finish. */
       tu_cs_emit_wfi(cs);
@@ -6648,7 +6645,7 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
          tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
          tu_cs_emit(cs, 0);
          tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, i));
-         tu_cs_emit_qw(cs, indirect_iova + i * 4);
+         tu_cs_emit_qw(cs, info->indirect + i * 4);
       }
 
       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
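
Both fallback paths above handle an indirect address that fails the `& 0xf` check: Vulkan only guarantees 4-byte alignment, but CP_LOAD_STATE6's EXT_SRC_ADDR requires 16-byte alignment, so the three size words are staged through an aligned scratch with CP_MEM_TO_MEM. In CPU terms this amounts to roughly the following (a sketch; `read_gpu_u32`, `scratch`, and `scratch_addr` are illustrative stand-ins for the copies into the driver's cs_indirect_xyz global buffer):

    /* Sketch only: stage unaligned indirect parameters through a
     * 16-byte-aligned scratch so CP_LOAD_STATE6 can consume them. */
    uint64_t src = indirect_addr;
    if (src & 0xf) {                 /* EXT_SRC_ADDR needs 16-byte alignment */
       for (unsigned i = 0; i < 3; i++)
          scratch[i] = read_gpu_u32(src + i * 4);   /* CP_MEM_TO_MEM */
       src = scratch_addr;           /* aligned by construction */
    }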
@@ -6756,53 +6753,205 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
    const uint16_t *local_size = shader->variant->local_size;
    const uint32_t *num_groups = info->blocks;
 
-   tu_cs_emit_regs(cs,
-                   HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 3,
-                                     .localsizex = local_size[0] - 1,
-                                     .localsizey = local_size[1] - 1,
-                                     .localsizez = local_size[2] - 1),
-                   HLSQ_CS_NDRANGE_1(CHIP, .globalsize_x = local_size[0] * num_groups[0]),
-                   HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0),
-                   HLSQ_CS_NDRANGE_3(CHIP, .globalsize_y = local_size[1] * num_groups[1]),
-                   HLSQ_CS_NDRANGE_4(CHIP, .globaloff_y = 0),
-                   HLSQ_CS_NDRANGE_5(CHIP, .globalsize_z = local_size[2] * num_groups[2]),
-                   HLSQ_CS_NDRANGE_6(CHIP, .globaloff_z = 0));
-
-   if (CHIP >= A7XX) {
-      tu_cs_emit_regs(cs,
-                      A7XX_HLSQ_CS_LAST_LOCAL_SIZE(.localsizex = local_size[0] - 1,
-                                                   .localsizey = local_size[1] - 1,
-                                                   .localsizez = local_size[2] - 1));
-   }
+   if (info->unaligned) {
+      assert(CHIP >= A7XX);
+
+      if (info->indirect) {
+         /* This path is tailored for BVH building and currently only supports
+          * 1-dimensional dispatches with a power-of-two local size. We use
+          * CP_RUN_OPENCL instead of CP_EXEC_CS in order to dynamically set
+          * HLSQ_CS_KERNEL_GROUP_X, which is usually set implicitly by the
+          * packet, to the number of workgroups. The registers for Y and Z
+          * dimensions should be unused because we set the kernel dimension to
+          * 1.
+          */
+         assert(local_size[1] == 1 && local_size[2] == 1);
+         assert(util_is_power_of_two_nonzero(local_size[0]));
+
+         tu_cs_emit_regs(cs,
+                         HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 1,
+                                           .localsizex = local_size[0] - 1));
+         tu_cs_emit_regs(cs, HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0));
+
+         /* This does:
+          * - waits for pending cache flushes to finish
+          * - CP_WAIT_FOR_ME
+          *
+          * In a sequence of indirect dispatches this shouldn't wait for the
+          * previous dispatches to finish.
+          */
+         tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
+         tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A7XX_HLSQ_CS_NDRANGE_1));
+         tu_cs_emit_qw(cs, info->indirect);
+
+         tu_cs_emit_pkt7(cs, CP_SCRATCH_WRITE, 2);
+         tu_cs_emit(cs, CP_SCRATCH_WRITE_0_SCRATCH(0));
+         tu_cs_emit(cs, ~0u);
+
+         /* CP_REG_RMW and CP_REG_TO_SCRATCH implicitly do a CP_WAIT_FOR_IDLE
+          * *and* CP_WAIT_FOR_ME, which is a full pipeline stall that we don't
+          * want, so manually wait for the CP_MEM_TO_REG write to land and
+          * then skip waiting below with SKIP_WAIT_FOR_ME.
+          */
+         tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+
+         /* scratch0 = (scratch0 & CS_NDRANGE_1) + (-1)
+          *          = (~0 & CS_NDRANGE_1) + (-1)
+          *          = CS_NDRANGE_1 - 1
+          */
+         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
+         tu_cs_emit(cs,
+                    CP_REG_RMW_0_DST_REG(0) |
+                    CP_REG_RMW_0_DST_SCRATCH |
+                    CP_REG_RMW_0_SKIP_WAIT_FOR_ME |
+                    CP_REG_RMW_0_SRC0_IS_REG |
+                    CP_REG_RMW_0_SRC1_ADD);
+         tu_cs_emit(cs, REG_A7XX_HLSQ_CS_NDRANGE_1); /* SRC0 */
+         tu_cs_emit(cs, -1); /* SRC1 */
+
+         /* scratch0 = (scratch0 & (local_size - 1)) rot 2
+          *          = (scratch0 & (local_size - 1)) << 2
+          */
+         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
+         tu_cs_emit(cs,
+                    CP_REG_RMW_0_DST_REG(0) |
+                    CP_REG_RMW_0_DST_SCRATCH |
+                    CP_REG_RMW_0_SKIP_WAIT_FOR_ME |
+                    CP_REG_RMW_0_ROTATE(A7XX_HLSQ_CS_LAST_LOCAL_SIZE_LOCALSIZEX__SHIFT));
+         tu_cs_emit(cs, local_size[0] - 1); /* SRC0 */
+         tu_cs_emit(cs, 0); /* SRC1 */
+
+         /* write scratch0 to HLSQ_CS_LAST_LOCAL_SIZE */
+         tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
+         tu_cs_emit(cs,
+                    CP_SCRATCH_TO_REG_0_REG(REG_A7XX_HLSQ_CS_LAST_LOCAL_SIZE) |
+                    CP_SCRATCH_TO_REG_0_SCRATCH(0));
+
+         tu_cs_emit_pkt7(cs, CP_SCRATCH_WRITE, 2);
+         tu_cs_emit(cs, CP_SCRATCH_WRITE_0_SCRATCH(0));
+         tu_cs_emit(cs, ~0u);
+
+         /* scratch0 = (scratch0 & CS_NDRANGE_1) + (local_size - 1)
+          *          = (~0 & CS_NDRANGE_1) + (local_size - 1)
+          *          = CS_NDRANGE_1 + local_size - 1
+          */
+         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
+         tu_cs_emit(cs,
+                    CP_REG_RMW_0_DST_REG(0) |
+                    CP_REG_RMW_0_DST_SCRATCH |
+                    CP_REG_RMW_0_SKIP_WAIT_FOR_ME |
+                    CP_REG_RMW_0_SRC0_IS_REG |
+                    CP_REG_RMW_0_SRC1_ADD);
+         tu_cs_emit(cs, REG_A7XX_HLSQ_CS_NDRANGE_1); /* SRC0 */
+         tu_cs_emit(cs, local_size[0] - 1); /* SRC1 */
+
+         unsigned local_size_log2 = util_logbase2(local_size[0]);
+
+         /* scratch0 = (scratch0 & ~(local_size - 1)) rot (32 - log2(local_size))
+          *          = scratch0 >> log2(local_size)
+          *          = scratch0 / local_size
+          *          = (CS_NDRANGE_1 + local_size - 1) / local_size
+          */
+         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
+         tu_cs_emit(cs,
+                    CP_REG_RMW_0_DST_REG(0) |
+                    CP_REG_RMW_0_DST_SCRATCH |
+                    CP_REG_RMW_0_SKIP_WAIT_FOR_ME |
+                    CP_REG_RMW_0_ROTATE(32 - local_size_log2));
+         tu_cs_emit(cs, ~(local_size[0] - 1)); /* SRC0 */
+         tu_cs_emit(cs, 0); /* SRC1 */
+
+         /* write scratch0 to HLSQ_CS_KERNEL_GROUP_X */
+         tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
+         tu_cs_emit(cs,
+                    CP_SCRATCH_TO_REG_0_REG(REG_A7XX_HLSQ_CS_KERNEL_GROUP_X) |
+                    CP_SCRATCH_TO_REG_0_SCRATCH(0));
+      } else {
+         tu_cs_emit_regs(cs,
+                         HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 3,
+                                           .localsizex = local_size[0] - 1,
+                                           .localsizey = local_size[1] - 1,
+                                           .localsizez = local_size[2] - 1),
+                         HLSQ_CS_NDRANGE_1(CHIP, .globalsize_x = num_groups[0]),
+                         HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0),
+                         HLSQ_CS_NDRANGE_3(CHIP, .globalsize_y = num_groups[1]),
+                         HLSQ_CS_NDRANGE_4(CHIP, .globaloff_y = 0),
+                         HLSQ_CS_NDRANGE_5(CHIP, .globalsize_z = num_groups[2]),
+                         HLSQ_CS_NDRANGE_6(CHIP, .globaloff_z = 0));
+
+         uint32_t last_local_size[3];
+         for (unsigned i = 0; i < 3; i++)
+            last_local_size[i] = ((num_groups[i] - 1) % local_size[i]) + 1;
+
+         tu_cs_emit_regs(cs,
+                         A7XX_HLSQ_CS_LAST_LOCAL_SIZE(.localsizex = last_local_size[0] - 1,
+                                                      .localsizey = last_local_size[1] - 1,
+                                                      .localsizez = last_local_size[2] - 1));
+      }
+   } else {
+      tu_cs_emit_regs(cs,
+                      HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 3,
+                                        .localsizex = local_size[0] - 1,
+                                        .localsizey = local_size[1] - 1,
+                                        .localsizez = local_size[2] - 1),
+                      HLSQ_CS_NDRANGE_1(CHIP, .globalsize_x = local_size[0] * num_groups[0]),
+                      HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0),
+                      HLSQ_CS_NDRANGE_3(CHIP, .globalsize_y = local_size[1] * num_groups[1]),
+                      HLSQ_CS_NDRANGE_4(CHIP, .globaloff_y = 0),
+                      HLSQ_CS_NDRANGE_5(CHIP, .globalsize_z = local_size[2] * num_groups[2]),
+                      HLSQ_CS_NDRANGE_6(CHIP, .globaloff_z = 0));
+
+      if (CHIP >= A7XX) {
+         tu_cs_emit_regs(cs,
+                         A7XX_HLSQ_CS_LAST_LOCAL_SIZE(.localsizex = local_size[0] - 1,
+                                                      .localsizey = local_size[1] - 1,
+                                                      .localsizez = local_size[2] - 1));
+      }
+   }
 
    if (info->indirect) {
-      uint64_t iova = info->indirect->iova + info->indirect_offset;
-
-      trace_start_compute_indirect(&cmd->trace, cs);
+      trace_start_compute_indirect(&cmd->trace, cs, info->unaligned);
 
-      tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
-      tu_cs_emit(cs, 0x00000000);
-      tu_cs_emit_qw(cs, iova);
-      tu_cs_emit(cs,
-                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
-                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
-                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
+      if (info->unaligned) {
+         tu_cs_emit_pkt7(cs, CP_RUN_OPENCL, 1);
+         tu_cs_emit(cs, 0x00000000);
+      } else {
+         tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
+         tu_cs_emit(cs, 0x00000000);
+         tu_cs_emit_qw(cs, info->indirect);
+         tu_cs_emit(cs,
+                    A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
+                    A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
+                    A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
+      }
 
       trace_end_compute_indirect(&cmd->trace, cs,
                                  (struct u_trace_address) {
-                                    .bo = info->indirect->bo,
-                                    .offset = info->indirect_offset,
+                                    .bo = NULL,
+                                    .offset = info->indirect,
                                  });
    } else {
-      trace_start_compute(&cmd->trace, cs, info->indirect != NULL,
-                          local_size[0], local_size[1], local_size[2],
-                          info->blocks[0], info->blocks[1], info->blocks[2]);
+      trace_start_compute(&cmd->trace, cs, info->indirect != 0,
+                          info->unaligned, local_size[0], local_size[1],
+                          local_size[2], info->blocks[0], info->blocks[1],
+                          info->blocks[2]);
 
-      tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
-      tu_cs_emit(cs, 0x00000000);
-      tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
-      tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
-      tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
+      if (info->unaligned) {
+         tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
+         tu_cs_emit(cs, 0x00000000);
+         tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(DIV_ROUND_UP(info->blocks[0],
+                                                            local_size[0])));
+         tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(DIV_ROUND_UP(info->blocks[1],
+                                                            local_size[1])));
+         tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(DIV_ROUND_UP(info->blocks[2],
+                                                            local_size[2])));
+      } else {
+         tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
+         tu_cs_emit(cs, 0x00000000);
+         tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
+         tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
+         tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
+      }
 
       trace_end_compute(&cmd->trace, cs);
    }
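
The CP_REG_RMW sequences in the indirect unaligned path compute the same two quantities entirely on the command processor, relying on the local size being a power of two. A CPU-side re-derivation of the bit tricks (a sketch; `n` stands for the invocation count loaded into HLSQ_CS_NDRANGE_1):

    #include <assert.h>
    #include <stdint.h>

    /* CPU check of the CP_REG_RMW arithmetic above, assuming a
     * power-of-two local size. */
    static void
    check_rmw_math(uint32_t n, uint32_t local_size, unsigned log2_local)
    {
       assert(local_size == 1u << log2_local);

       /* (n - 1) & (local_size - 1) == last_local_size - 1, which is what
        * lands in the LOCALSIZEX field of HLSQ_CS_LAST_LOCAL_SIZE. */
       uint32_t last_local = ((n - 1) & (local_size - 1)) + 1;
       assert(last_local == ((n - 1) % local_size) + 1);

       /* ((n + local_size - 1) & ~(local_size - 1)) rotated right by
        * log2(local_size) is an exact shift, i.e. DIV_ROUND_UP, which is
        * what lands in HLSQ_CS_KERNEL_GROUP_X. */
       uint32_t groups = ((n + local_size - 1) & ~(local_size - 1)) >> log2_local;
       assert(groups == (n + local_size - 1) / local_size);
    }

For example, n = 70 with local_size = 32 gives 3 workgroups with 6 invocations live in the last one.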
@@ -6852,13 +7001,39 @@ tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
    VK_FROM_HANDLE(tu_buffer, buffer, _buffer);
    struct tu_dispatch_info info = {};
 
-   info.indirect = buffer;
-   info.indirect_offset = offset;
+   info.indirect = buffer->iova + offset;
 
    tu_dispatch<CHIP>(cmd_buffer, &info);
 }
 TU_GENX(tu_CmdDispatchIndirect);
 
+void
+tu_dispatch_unaligned(VkCommandBuffer commandBuffer,
+                      uint32_t x, uint32_t y, uint32_t z)
+{
+   VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+   struct tu_dispatch_info info = {};
+
+   info.unaligned = true;
+   info.blocks[0] = x;
+   info.blocks[1] = y;
+   info.blocks[2] = z;
+
+   TU_CALLX(cmd_buffer->device, tu_dispatch)(cmd_buffer, &info);
+}
+
+void
+tu_dispatch_unaligned_indirect(VkCommandBuffer commandBuffer,
+                               VkDeviceAddress size_addr)
+{
+   VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+   struct tu_dispatch_info info = {};
+
+   info.unaligned = true;
+   info.indirect = size_addr;
+
+   TU_CALLX(cmd_buffer->device, tu_dispatch)(cmd_buffer, &info);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
                      const VkSubpassEndInfo *pSubpassEndInfo)
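
These two functions are internal entry points rather than Vulkan commands; TU_CALLX selects the per-generation tu_dispatch instantiation at runtime. A hypothetical call site (illustrative only; `leaf_count` and the handle cast are assumptions, not from this commit):

    /* Launch exactly leaf_count invocations in x, without rounding up to
     * a workgroup multiple on the CPU side. */
    tu_dispatch_unaligned(tu_cmd_buffer_to_handle(cmd), leaf_count, 1, 1);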

@@ -683,6 +683,12 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
 template <chip CHIP>
 void tu_cmd_render(struct tu_cmd_buffer *cmd);
 
+void tu_dispatch_unaligned(VkCommandBuffer commandBuffer,
+                           uint32_t x, uint32_t y, uint32_t z);
+
+void tu_dispatch_unaligned_indirect(VkCommandBuffer commandBuffer,
+                                    VkDeviceAddress size_addr);
+
 void tu_write_buffer_cp(VkCommandBuffer commandBuffer,
                         VkDeviceAddress addr,
                         void *data, uint32_t size);

@@ -2707,6 +2707,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       fd_rd_output_init(&device->rd_output, output_name);
    }
 
+   device->vk.cmd_dispatch_unaligned = tu_dispatch_unaligned;
    device->vk.write_buffer_cp = tu_write_buffer_cp;
    device->vk.flush_buffer_write_cp = tu_flush_buffer_write_cp;
    device->vk.cmd_fill_buffer_addr = tu_cmd_fill_buffer_addr;
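
Hooking the function into device->vk.cmd_dispatch_unaligned is what lets driver-independent runtime code (such as the common BVH building code) issue unaligned dispatches without knowing about turnip. A sketch of a dispatch through the hook (the field access path is illustrative, not taken from this commit):

    /* Sketch: common code reaching the driver hook through vk_device. */
    struct vk_device *vk_dev = cmd->base.device;
    vk_dev->cmd_dispatch_unaligned(vk_command_buffer_to_handle(cmd),
                                   thread_count, 1, 1);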

@@ -142,6 +142,7 @@ begin_end_tp('blit',
 begin_end_tp('compute',
              args=[Arg(type='uint8_t', var='indirect', c_format='%u'),
+                   Arg(type='uint8_t', var='unaligned', c_format='%u'),
                    Arg(type='uint16_t', var='local_size_x', c_format='%u'),
                    Arg(type='uint16_t', var='local_size_y', c_format='%u'),
                    Arg(type='uint16_t', var='local_size_z', c_format='%u'),
@@ -150,6 +151,7 @@ begin_end_tp('compute',
                    Arg(type='uint16_t', var='num_groups_z', c_format='%u')])
 
 begin_end_tp('compute_indirect',
+             args=[Arg(type='uint8_t', var='unaligned', c_format='%u')],
              end_args=[ArgStruct(type='VkDispatchIndirectCommand', var='size',
                                  is_indirect=True, c_format="%ux%ux%u",
                                  fields=['x', 'y', 'z'])])
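
The extra Arg entries regenerate the trace helpers used in tu_dispatch above, so the start hooks now take the unaligned flag. Roughly the shape of the regenerated prototypes (a sketch, not the exact generator output):

    /* Approximate signatures matching the updated call sites: */
    void trace_start_compute_indirect(struct u_trace *ut, void *cs,
                                      uint8_t unaligned);
    void trace_start_compute(struct u_trace *ut, void *cs,
                             uint8_t indirect, uint8_t unaligned,
                             uint16_t local_size_x, uint16_t local_size_y,
                             uint16_t local_size_z, uint16_t num_groups_x,
                             uint16_t num_groups_y, uint16_t num_groups_z);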