diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 42d0421b292..ea4518c2943 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8143,7 +8143,15 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break; case nir_intrinsic_load_num_workgroups: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups))); + if (ctx->options->load_grid_size_from_user_sgpr) { + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.num_work_groups)); + } else { + Temp addr = get_arg(ctx, ctx->args->ac.num_work_groups); + assert(addr.regClass() == s2); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()), + bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8))); + } emit_split_vector(ctx, dst, 3); break; } diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 67d0ebfa55f..f8d1deb2fcd 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -3721,7 +3721,14 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins result = ctx->instance_id_replaced ? ctx->instance_id_replaced : ctx->abi->instance_id; break; case nir_intrinsic_load_num_workgroups: - result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups); + if (ctx->abi->load_grid_size_from_user_sgpr) { + result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups); + } else { + LLVMTypeRef ptr_type = ac_array_in_const_addr_space(ctx->ac.v3i32); + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->args->num_work_groups); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type, ""); + result = ac_build_load_invariant(&ctx->ac, ptr, ctx->ac.i32_0); + } break; case nir_intrinsic_load_local_invocation_index: result = visit_load_local_invocation_index(ctx); diff --git a/src/amd/llvm/ac_shader_abi.h b/src/amd/llvm/ac_shader_abi.h index c1269f83a02..dca3d4a7c6e 100644 --- a/src/amd/llvm/ac_shader_abi.h +++ b/src/amd/llvm/ac_shader_abi.h @@ -175,6 +175,9 @@ struct ac_shader_abi { * images. */ bool disable_aniso_single_level; + + /* Whether to inline the compute dispatch size in user sgprs. */ + bool load_grid_size_from_user_sgpr; }; #endif /* AC_SHADER_ABI_H */ diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index 3950028bf51..e7cb430b7cf 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -1906,6 +1906,9 @@ radv_CmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer, cmd_buffer->device->meta_state.accel_struct_build.copy_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + cmd_buffer->state.flush_bits |= + radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR, NULL); + radv_indirect_dispatch(cmd_buffer, src->bo, src_addr + offsetof(struct radv_accel_struct_header, copy_dispatch_size)); radv_meta_restore(&saved_state, cmd_buffer); @@ -2052,6 +2055,9 @@ radv_CmdCopyAccelerationStructureToMemoryKHR( cmd_buffer->device->meta_state.accel_struct_build.copy_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + cmd_buffer->state.flush_bits |= + radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR, NULL); + radv_indirect_dispatch(cmd_buffer, src->bo, src_addr + offsetof(struct radv_accel_struct_header, copy_dispatch_size)); radv_meta_restore(&saved_state, cmd_buffer); diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f970ae81ca9..0e78c75be45 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -3998,6 +3998,10 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2KHR dst_ { switch ((VkAccessFlags2KHR)(1 << b)) { case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR: + /* SMEM loads are used to read compute dispatch size in shaders */ + if (!cmd_buffer->device->load_grid_size_from_user_sgpr) + flush_bits |= RADV_CMD_FLAG_INV_SCACHE; + break; case VK_ACCESS_2_INDEX_READ_BIT_KHR: case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: break; @@ -7263,24 +7267,17 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel radv_cs_add_buffer(ws, cs, info->indirect); if (loc->sgpr_idx != -1) { - if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10_3) { - unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4; + unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4; + if (cmd_buffer->device->load_grid_size_from_user_sgpr) { + assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10_3); radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0)); radeon_emit(cs, info->va); radeon_emit(cs, info->va >> 32); radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); radeon_emit(cs, 3); } else { - for (unsigned i = 0; i < 3; ++i) { - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, - COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG)); - radeon_emit(cs, (info->va + 4 * i)); - radeon_emit(cs, (info->va + 4 * i) >> 32); - radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i); - radeon_emit(cs, 0); - } + radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true); } } @@ -7335,12 +7332,22 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel } if (loc->sgpr_idx != -1) { - assert(loc->num_sgprs == 3); + if (cmd_buffer->device->load_grid_size_from_user_sgpr) { + assert(loc->num_sgprs == 3); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); - radeon_emit(cs, blocks[0]); - radeon_emit(cs, blocks[1]); - radeon_emit(cs, blocks[2]); + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); + radeon_emit(cs, blocks[0]); + radeon_emit(cs, blocks[1]); + radeon_emit(cs, blocks[2]); + } else { + uint32_t offset; + if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset)) + return; + + uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset; + radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, + R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true); + } } if (offsets[0] || offsets[1] || offsets[2]) { diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 6810d1dcd2c..672cb7078bc 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -3411,6 +3411,9 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr device->physical_device->rad_info.family == CHIP_NAVY_FLOUNDER || device->physical_device->rad_info.family == CHIP_VANGOGH); + /* PKT3_LOAD_SH_REG_INDEX is supported on GFX8+, but it hangs with compute queues until GFX10.3. */ + device->load_grid_size_from_user_sgpr = device->physical_device->rad_info.chip_class >= GFX10_3; + device->keep_shader_info = keep_shader_info; result = radv_device_init_meta(device); if (result != VK_SUCCESS) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 13dcec3c1cc..32c22a4159f 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2311,6 +2311,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.abi.adjust_frag_coord_z = options->adjust_frag_coord_z; ctx.abi.robust_buffer_access = options->robust_buffer_access; ctx.abi.disable_aniso_single_level = options->disable_aniso_single_level; + ctx.abi.load_grid_size_from_user_sgpr = options->load_grid_size_from_user_sgpr; bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && info->is_ngg; if (shader_count >= 2 || is_ngg) diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 877c99b8005..831731ee313 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -835,6 +835,9 @@ struct radv_device { */ bool adjust_frag_coord_z; + /* Whether to inline the compute dispatch size in user sgprs. */ + bool load_grid_size_from_user_sgpr; + /* Whether the driver uses a global BO list. */ bool use_global_bo_list; diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 66929492461..a8eb7ebb73a 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1934,6 +1934,7 @@ shader_compile(struct radv_device *device, struct vk_shader_module *module, module && !is_meta_shader(module->nir) && options->key.ps.enable_mrt_output_nan_fixup; options->adjust_frag_coord_z = options->key.adjust_frag_coord_z; options->disable_aniso_single_level = options->key.disable_aniso_single_level; + options->load_grid_size_from_user_sgpr = device->load_grid_size_from_user_sgpr; options->has_image_load_dcc_bug = device->physical_device->rad_info.has_image_load_dcc_bug; options->debug.func = radv_compiler_debug; options->debug.private_data = &debug_data; diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index bfc1fb9feba..2282caff8ab 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -127,6 +127,7 @@ struct radv_nir_compiler_options { bool wgp_mode; bool remap_spi_ps_input; bool disable_aniso_single_level; + bool load_grid_size_from_user_sgpr; enum radeon_family family; enum chip_class chip_class; const struct radeon_info *info; diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index c3677bc6c0d..a6ff6085777 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -211,7 +211,7 @@ allocate_user_sgprs(const struct radv_nir_compiler_options *options, if (info->cs.uses_sbt) user_sgpr_count += 1; if (info->cs.uses_grid_size) - user_sgpr_count += 3; + user_sgpr_count += options->load_grid_size_from_user_sgpr ? 3 : 2; if (info->cs.uses_ray_launch_size) user_sgpr_count += 3; break; @@ -594,7 +594,10 @@ radv_declare_shader_args(const struct radv_nir_compiler_options *options, } if (info->cs.uses_grid_size) { - ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups); + if (options->load_grid_size_from_user_sgpr) + ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups); + else + ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_PTR, &args->ac.num_work_groups); } if (info->cs.uses_ray_launch_size) { @@ -819,7 +822,8 @@ radv_declare_shader_args(const struct radv_nir_compiler_options *options, set_loc_shader_ptr(args, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx); } if (args->ac.num_work_groups.used) { - set_loc_shader(args, AC_UD_CS_GRID_SIZE, &user_sgpr_idx, 3); + set_loc_shader(args, AC_UD_CS_GRID_SIZE, &user_sgpr_idx, + options->load_grid_size_from_user_sgpr ? 3 : 2); } if (args->ac.ray_launch_size.used) { set_loc_shader(args, AC_UD_CS_RAY_LAUNCH_SIZE, &user_sgpr_idx, 3); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 400d1b2230f..62d24451817 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -521,6 +521,7 @@ static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader * ctx->abi.convert_undef_to_zero = true; ctx->abi.adjust_frag_coord_z = false; ctx->abi.disable_aniso_single_level = true; + ctx->abi.load_grid_size_from_user_sgpr = true; const struct si_shader_info *info = &ctx->shader->selector->info; for (unsigned i = 0; i < info->num_outputs; i++) {