diff --git a/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp b/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp
index 3b473c9fcde..76870f87ec7 100644
--- a/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp
@@ -88,6 +88,12 @@ select_rt_prolog(Program* program, ac_shader_config* config,
    PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
 
    /* Temporaries: */
+   PhysReg tmp_wg_start_x = PhysReg{num_sgprs};
+   num_sgprs++;
+   PhysReg tmp_wg_start_y = PhysReg{num_sgprs};
+   num_sgprs++;
+   PhysReg tmp_swizzle_bound_y = PhysReg{num_sgprs};
+   num_sgprs++;
    PhysReg tmp_wg_id_y;
    if (program->gfx_level >= GFX12) {
       tmp_wg_id_y = PhysReg{num_sgprs};
@@ -101,6 +107,11 @@ select_rt_prolog(Program* program, ac_shader_config* config,
    PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
    num_sgprs += 2;
 
+   PhysReg tmp_swizzled_id_x = PhysReg{256 + num_vgprs++};
+   PhysReg tmp_swizzled_id_y = PhysReg{256 + num_vgprs++};
+   PhysReg tmp_swizzled_id_shifted_x = PhysReg{256 + num_vgprs++};
+   PhysReg tmp_swizzled_id_shifted_y = PhysReg{256 + num_vgprs++};
+
    /* Confirm some assumptions about register aliasing */
    assert(in_ring_offsets == out_uniform_shader_addr);
    assert(get_arg_reg(in_args, in_args->push_constants) ==
@@ -163,10 +174,139 @@ select_rt_prolog(Program* program, ac_shader_config* config,
       bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1),
                Operand(in_wg_id_z, s1));
    }
+   /* Swizzle ray launch IDs. We dispatch a 1D 32x1/64x1 workgroup natively. Many games dispatch
+    * rays in a 2D grid and write RT results to an image indexed by the x/y launch ID.
+    * In image space, a 1D workgroup maps to a 32/64-pixel wide line, which is inefficient for two
+    * reasons:
+    * - Image data is usually arranged on a Z-order curve, so a long line makes for inefficient
+    *   memory access patterns.
+    * - Each wave working on a "line" in image space may increase divergence. It's better to trace
+    *   rays in a small square, since that makes it more likely all rays hit the same or similar
+    *   objects.
+    *
+    * It turns out arranging rays along a Z-order curve is best for both image access patterns and
+    * ray divergence. Since image data is swizzled along a Z-order curve as well, swizzling the
+    * launch ID should result in each lane accessing whole cachelines at once. For traced rays,
+    * the Z-order curve means that each quad is arranged in a 2x2 square in image space as well.
+    * Since the RT unit processes 4 lanes at a time, reducing divergence per quad may result in
+    * better RT unit utilization (for example by the RT unit being able to skip the quad entirely
+    * if all 4 lanes are inactive).
+    *
+    * To swizzle along a Z-order curve, treat the 1D lane ID as a Morton code. Then, do the
+    * inverse of Morton code generation (i.e. deinterleaving the bits) to recover the x/y
+    * coordinates on the Z-order curve.
+    */
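+
+   /* For example, lane ID 27 = 0b011011 deinterleaves to x = 0b101 = 5 (even bits) and
+    * y = 0b011 = 3 (odd bits), so lane 27 covers the pixel at (5, 3) within its tile.
+    */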
+
+   /* Deinterleave bits - even bits go to tmp_swizzled_id_x, odd ones to tmp_swizzled_id_y */
+   bld.vop2(aco_opcode::v_and_b32, Definition(tmp_swizzled_id_x, v1), Operand::c32(0x55),
+            Operand(in_local_id, v1));
+   bld.vop2(aco_opcode::v_and_b32, Definition(tmp_swizzled_id_y, v1), Operand::c32(0xaa),
+            Operand(in_local_id, v1));
+   bld.vop2(aco_opcode::v_lshrrev_b32, Definition(tmp_swizzled_id_y, v1), Operand::c32(1),
+            Operand(tmp_swizzled_id_y, v1));
+
+   /* The deinterleaved bits are currently padded with a zero between each bit, like so:
+    * 0 A 0 B 0 C 0 D
+    * Compact the deinterleaved bits by a factor of 2 to remove the padding, resulting in
+    * A B C D
+    */
+   bld.vop2(aco_opcode::v_lshrrev_b32, Definition(tmp_swizzled_id_shifted_x, v1), Operand::c32(1),
+            Operand(tmp_swizzled_id_x, v1));
+   bld.vop2(aco_opcode::v_lshrrev_b32, Definition(tmp_swizzled_id_shifted_y, v1), Operand::c32(1),
+            Operand(tmp_swizzled_id_y, v1));
+   bld.vop2(aco_opcode::v_or_b32, Definition(tmp_swizzled_id_x, v1), Operand(tmp_swizzled_id_x, v1),
+            Operand(tmp_swizzled_id_shifted_x, v1));
+   bld.vop2(aco_opcode::v_or_b32, Definition(tmp_swizzled_id_y, v1), Operand(tmp_swizzled_id_y, v1),
+            Operand(tmp_swizzled_id_shifted_y, v1));
+   bld.vop2(aco_opcode::v_and_b32, Definition(tmp_swizzled_id_x, v1), Operand::c32(0x33u),
+            Operand(tmp_swizzled_id_x, v1));
+   bld.vop2(aco_opcode::v_and_b32, Definition(tmp_swizzled_id_y, v1), Operand::c32(0x33u),
+            Operand(tmp_swizzled_id_y, v1));
+
+   bld.vop2(aco_opcode::v_lshrrev_b32, Definition(tmp_swizzled_id_shifted_x, v1), Operand::c32(2),
+            Operand(tmp_swizzled_id_x, v1));
+   bld.vop2(aco_opcode::v_lshrrev_b32, Definition(tmp_swizzled_id_shifted_y, v1), Operand::c32(2),
+            Operand(tmp_swizzled_id_y, v1));
+   bld.vop2(aco_opcode::v_or_b32, Definition(tmp_swizzled_id_x, v1), Operand(tmp_swizzled_id_x, v1),
+            Operand(tmp_swizzled_id_shifted_x, v1));
+   bld.vop2(aco_opcode::v_or_b32, Definition(tmp_swizzled_id_y, v1), Operand(tmp_swizzled_id_y, v1),
+            Operand(tmp_swizzled_id_shifted_y, v1));
+   bld.vop2(aco_opcode::v_and_b32, Definition(tmp_swizzled_id_x, v1), Operand::c32(0x0Fu),
+            Operand(tmp_swizzled_id_x, v1));
+   bld.vop2(aco_opcode::v_and_b32, Definition(tmp_swizzled_id_y, v1), Operand::c32(0x0Fu),
+            Operand(tmp_swizzled_id_y, v1));
+
+   /* Fix up the workgroup IDs after converting from 32x1/64x1 to 8x4/8x8. The X dimension of the
+    * workgroup size gets divided by 4/8, while the Y dimension gets multiplied by the same amount.
+    * Rearrange the workgroups to make up for that: round the Y component of the workgroup ID down
+    * to a multiple of 4/8, and add the remainder (times the new workgroup width of 8) to the X
+    * component of the workgroup start, so the narrower workgroups end up side by side in X.
+    */
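+   /* For example, with a 64x1 workgroup (8x8 tiles), workgroup ID (2, 13) yields
+    * wg_start_y = 8 and wg_start_x = 2 * 64 + (13 % 8) * 8 = 168, so the eight workgroups
+    * (2, 8)..(2, 15) cover one 64x8 strip side by side.
+    */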
+   uint32_t workgroup_size_log2 = util_logbase2(program->workgroup_size);
+   bld.sop2(aco_opcode::s_lshl_b32, Definition(tmp_wg_start_x, s1), Definition(scc, s1),
+            Operand(in_wg_id_x, s1), Operand::c32(workgroup_size_log2));
+
+   /* unsigned y_remainder = tmp_wg_id_y % wg_height
+    * We use tmp_wg_start_y to store y_remainder, and overwrite it later with the real wg_start_y.
+    */
+   uint32_t workgroup_width_log2 = 3u;
+   uint32_t workgroup_height_mask = program->workgroup_size == 32 ? 0x3u : 0x7u;
+   bld.sop2(aco_opcode::s_and_b32, Definition(tmp_wg_start_y, s1), Definition(scc, s1),
+            Operand(tmp_wg_id_y, s1), Operand::c32(workgroup_height_mask));
+   /* wg_start_x += y_remainder * workgroup_width (workgroup_width == 8) */
+   bld.sop2(aco_opcode::s_lshl_b32, Definition(tmp_wg_start_y, s1), Definition(scc, s1),
+            Operand(tmp_wg_start_y, s1), Operand::c32(workgroup_width_log2));
+   bld.sop2(aco_opcode::s_add_u32, Definition(tmp_wg_start_x, s1), Definition(scc, s1),
+            Operand(tmp_wg_start_x, s1), Operand(tmp_wg_start_y, s1));
+   /* wg_start_y = ROUND_DOWN_TO(tmp_wg_id_y, workgroup_height) */
+   bld.sop2(aco_opcode::s_and_b32, Definition(tmp_wg_start_y, s1), Definition(scc, s1),
+            Operand(tmp_wg_id_y, s1), Operand::c32(~workgroup_height_mask));
+
+   bld.vop2(aco_opcode::v_add_u32, Definition(tmp_swizzled_id_x, v1), Operand(tmp_wg_start_x, s1),
+            Operand(tmp_swizzled_id_x, v1));
+   bld.vop2(aco_opcode::v_add_u32, Definition(tmp_swizzled_id_y, v1), Operand(tmp_wg_start_y, s1),
+            Operand(tmp_swizzled_id_y, v1));
+
+   /* We can only swizzle launch IDs if we run a full workgroup and the resulting launch IDs
+    * don't exceed the launch size. Calculate unswizzled launch IDs here to fall back to them
+    * if the swizzled launch IDs are out of bounds.
+    */
    bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[1], v1), Operand(tmp_wg_id_y, s1));
    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
             Operand::c32(program->workgroup_size), Operand(in_local_id, v1));
 
+   /* Round the launch size down to the nearest multiple of workgroup_height. If the workgroup ID
+    * exceeds this, then the swizzled IDs' Y component will exceed the Y launch size and we have
+    * to fall back to unswizzled IDs.
+    */
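+   /* For example, with launch_size_y = 1078 and 8-row tiles, the bound is 1072, so workgroups
+    * with wg_id_y in [1072, 1078) keep the unswizzled IDs.
+    */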
+   bld.sop2(aco_opcode::s_and_b32, Definition(tmp_swizzle_bound_y, s1), Definition(scc, s1),
+            Operand(out_launch_size_y, s1), Operand::c32(~workgroup_height_mask));
+   /* If we are only running a partial workgroup, swizzling would yield a wrong result. */
+   if (program->gfx_level >= GFX8) {
+      bld.sopc(Builder::s_cmp_lg, Definition(scc, s1), Operand(exec, bld.lm),
+               Operand::c32_or_c64(-1u, program->workgroup_size == 64));
+   } else {
+      /* Write the XOR result to vcc because it's currently unused and a convenient register
+       * (always the same size as exec). We only care about the value of scc, i.e. if the result
+       * is nonzero (vcc is about to be overwritten anyway).
+       */
+      bld.sop2(Builder::s_xor, Definition(vcc, bld.lm), Definition(scc, s1), Operand(exec, bld.lm),
+               Operand::c32_or_c64(-1u, program->workgroup_size == 64));
+   }
+   bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
+            Operand::c32_or_c64(-1u, program->wave_size == 64),
+            Operand::c32_or_c64(0u, program->wave_size == 64), Operand(scc, s1));
+   bld.sopc(aco_opcode::s_cmp_ge_u32, Definition(scc, s1), Operand(tmp_wg_id_y, s1),
+            Operand(tmp_swizzle_bound_y, s1));
+   bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
+            Operand::c32_or_c64(-1u, program->wave_size == 64),
+            Operand(vcc, bld.lm), Operand(scc, s1));
+
+   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
+            Operand(tmp_swizzled_id_x, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
+   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1),
+            Operand(tmp_swizzled_id_y, v1), Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm));
+
    /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
    if (options->gfx_level < GFX9) {
       bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
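
For reference, a minimal standalone sketch (plain C++, not part of the patch) of the launch-ID
swizzling implemented above, for the wave64 case (a 64x1 workgroup remapped to an 8x8 tile).
The helper functions and variable names are illustrative assumptions, not identifiers from the
driver:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch only; mirrors the prolog's math, not driver code. */

    /* Compact the even bits of a Morton code (equivalent to the prolog's shift/mask sequence). */
    static uint32_t compact_bits(uint32_t v)
    {
       v &= 0x55;                 /* 0A0B0C0D: keep every other bit */
       v = (v | (v >> 1)) & 0x33; /* 00AB00CD */
       v = (v | (v >> 2)) & 0x0F; /* 0000ABCD */
       return v;
    }

    /* Swizzled launch ID of one lane, assuming a full 8x8 tile that fits the launch size. */
    static void swizzled_launch_id(uint32_t wg_id_x, uint32_t wg_id_y, uint32_t lane,
                                   uint32_t* launch_x, uint32_t* launch_y)
    {
       const uint32_t wg_size = 64, tile_w = 8, tile_h = 8;
       uint32_t x = compact_bits(lane);      /* even bits of the lane ID */
       uint32_t y = compact_bits(lane >> 1); /* odd bits of the lane ID */
       uint32_t wg_start_x = wg_id_x * wg_size + (wg_id_y % tile_h) * tile_w;
       uint32_t wg_start_y = wg_id_y / tile_h * tile_h;
       *launch_x = wg_start_x + x;
       *launch_y = wg_start_y + y;
    }

    int main(void)
    {
       uint32_t x, y;
       swizzled_launch_id(2, 13, 27, &x, &y);
       printf("lane 27 of workgroup (2, 13) -> launch ID (%u, %u)\n", (unsigned)x, (unsigned)y);
       return 0; /* prints (173, 11) */
    }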