// Patch summary (free-text preamble; everything from the first "diff --git" header on is unchanged).
//
// Touches three files under src/amd/compiler/:
//
//   aco_instruction_selection.cpp
//     * passthrough_all_args(ctx, regs): walks arg_index from 0 up to ctx->args->arg_count and
//       appends get_arg_for_end(ctx, arg) for each one to `regs`. Only `used` and `arg_index` of
//       the local ac_arg are assigned before it is passed on.
//     * build_end_with_regs(ctx, regs): creates a p_end_with_regs pseudo instruction with
//       regs.size() operands and no definitions, copies `regs` into its operands in order and
//       appends it to ctx->block->instructions.
//     * create_tcs_end_for_epilog: its inline p_end_with_regs construction is removed and replaced
//       by a call to build_end_with_regs(ctx, regs).
//     * build_fast_udiv_nuw(ctx, num, multiplier, pre_shift, post_shift, increment): emits, in this
//       order, v_lshrrev_b32 by pre_shift, a nuw 32-bit add of increment, v_mul_hi_u32 by
//       multiplier, and v_lshrrev_b32 by post_shift; returns the last result.
//     * get_gl_vs_prolog_vertex_index(ctx, vinfo, input_index, instance_divisor_constbuf): tests
//       bit `input_index` of vinfo->instance_divisor_is_one and vinfo->instance_divisor_is_fetched.
//         - divisor-is-one:     index = instance_id + start_instance
//         - divisor-is-fetched: loads 4 dwords from instance_divisor_constbuf at byte offset
//                               input_index * 16, passes elements 0..3 to build_fast_udiv_nuw as
//                               (multiplier, pre_shift, post_shift, increment) together with
//                               instance_id, then adds start_instance
//         - otherwise:          index = base_vertex + vertex_id
//       The divisor-is-one test comes first, so it wins if both bits are set.
//     * select_gl_vs_prolog(program, pinfo, config, options, info, args): casts pinfo to
//       aco_gl_vs_prolog_info, sets up an isel context with SWStage::VS, emits the start program,
//       an s_setprio with immediate 0x3, and fix_ls_vgpr_init_bug() when both vinfo->as_ls and
//       options->has_ls_vgpr_init_bug are set. It then collects all arguments through
//       passthrough_all_args. When vinfo->instance_divisor_is_fetched is non-zero it loads a
//       4-dword value via s_load_dwordx4 from the 64-bit-converted internal_bindings pointer at
//       vinfo->instance_diviser_buf_offset; otherwise instance_divisor_constbuf stays
//       default-constructed. For each i < vinfo->num_inputs the computed index is bound to
//       PhysReg{256 + ctx.args->num_vgprs_used + i}. Finishes with build_end_with_regs and
//       cleanup_cfg.
//
//   aco_ir.h
//     * Adds the declaration of select_gl_vs_prolog next to select_tcs_epilog.
//
//   aco_shader_info.h
//     * Adds struct aco_gl_vs_prolog_info: two uint16_t bitmasks (instance_divisor_is_one,
//       instance_divisor_is_fetched), instance_diviser_buf_offset, num_inputs, as_ls and the
//       ac_arg internal_bindings.
//
// NOTE(review): several template argument lists appear to be missing in this copy of the patch
// (e.g. "std::vector& regs", "aco_ptr end{create_instruction(") -- presumably lost during text
// extraction; confirm against the original patch before applying.
// NOTE(review): the field is spelled "instance_diviser_buf_offset" in both the struct and its
// single use here; any rename would have to be coordinated with users outside this patch.
// NOTE(review): the uint16_t masks are indexed with (1u << input_index), which presumably limits
// num_inputs to 16 -- TODO confirm against the caller.
//
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 8bfc14c71cf..47dccb11a83 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -10933,6 +10933,28 @@ get_patch_base(isel_context* ctx) Operand::c32(pervertex_output_patch_size)); } +static void +passthrough_all_args(isel_context* ctx, std::vector& regs) +{ + struct ac_arg arg; + arg.used = true; + + for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++) + regs.emplace_back(get_arg_for_end(ctx, arg)); +} + +static void +build_end_with_regs(isel_context* ctx, std::vector& regs) +{ + aco_ptr end{create_instruction( + aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)}; + + for (unsigned i = 0; i < regs.size(); i++) + end->operands[i] = regs[i]; + + ctx->block->instructions.emplace_back(std::move(end)); +} + static void create_tcs_jump_to_epilog(isel_context* ctx) { @@ -11041,13 +11063,7 @@ create_tcs_end_for_epilog(isel_context* ctx) regs.emplace_back(Operand(tf_lds_offset, PhysReg{vgpr})); } - aco_ptr end{create_instruction( - aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)}; - - for (unsigned i = 0; i < regs.size(); i++) - end->operands[i] = regs[i]; - - ctx->block->instructions.emplace_back(std::move(end)); + build_end_with_regs(ctx, regs); } Pseudo_instruction* @@ -11671,6 +11687,55 @@ store_tess_factor_to_tess_ring(isel_context* ctx, Temp tess_ring_desc, Temp fact memory_sync_info(storage_vmem_output), true, false, false); } +Temp +build_fast_udiv_nuw(isel_context* ctx, Temp num, Temp multiplier, Temp pre_shift, Temp post_shift, + Temp increment) +{ + Builder bld(ctx->program, ctx->block); + + num = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), pre_shift, num); + num = bld.nuw().vadd32(bld.def(v1), num, increment); + num = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), num, multiplier); + return bld.vop2(aco_opcode::v_lshrrev_b32, 
bld.def(v1), post_shift, num); +} + +Temp +get_gl_vs_prolog_vertex_index(isel_context* ctx, const struct aco_gl_vs_prolog_info* vinfo, + unsigned input_index, Temp instance_divisor_constbuf) +{ + bool divisor_is_one = vinfo->instance_divisor_is_one & (1u << input_index); + bool divisor_is_fetched = vinfo->instance_divisor_is_fetched & (1u << input_index); + + Builder bld(ctx->program, ctx->block); + + Temp index; + if (divisor_is_one) { + index = get_arg(ctx, ctx->args->instance_id); + } else if (divisor_is_fetched) { + Temp instance_id = get_arg(ctx, ctx->args->instance_id); + + Temp udiv_factors = bld.smem(aco_opcode::s_buffer_load_dwordx4, bld.def(s4), + instance_divisor_constbuf, Operand::c32(input_index * 16)); + emit_split_vector(ctx, udiv_factors, 4); + + index = build_fast_udiv_nuw(ctx, instance_id, emit_extract_vector(ctx, udiv_factors, 0, s1), + emit_extract_vector(ctx, udiv_factors, 1, s1), + emit_extract_vector(ctx, udiv_factors, 2, s1), + emit_extract_vector(ctx, udiv_factors, 3, s1)); + } + + if (divisor_is_one || divisor_is_fetched) { + Temp start_instance = get_arg(ctx, ctx->args->start_instance); + index = bld.vadd32(bld.def(v1), index, start_instance); + } else { + Temp base_vertex = get_arg(ctx, ctx->args->base_vertex); + Temp vertex_id = get_arg(ctx, ctx->args->vertex_id); + index = bld.vadd32(bld.def(v1), base_vertex, vertex_id); + } + + return index; +} + } /* end namespace */ void @@ -12542,4 +12607,54 @@ select_tcs_epilog(Program* program, void* pinfo, ac_shader_config* config, cleanup_cfg(program); } +void +select_gl_vs_prolog(Program* program, void* pinfo, ac_shader_config* config, + const struct aco_compiler_options* options, const struct aco_shader_info* info, + const struct ac_shader_args* args) +{ + const struct aco_gl_vs_prolog_info* vinfo = (const struct aco_gl_vs_prolog_info*)pinfo; + isel_context ctx = + setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::VS); + + ctx.block->fp_mode = program->next_fp_mode; 
+ add_startpgm(&ctx); + append_logical_start(ctx.block); + + Builder bld(ctx.program, ctx.block); + + bld.sopp(aco_opcode::s_setprio, -1u, 0x3u); + + if (vinfo->as_ls && options->has_ls_vgpr_init_bug) + fix_ls_vgpr_init_bug(&ctx); + + std::vector regs; + passthrough_all_args(&ctx, regs); + + Temp instance_divisor_constbuf; + + if (vinfo->instance_divisor_is_fetched) { + Temp list = get_arg(&ctx, vinfo->internal_bindings); + list = convert_pointer_to_64_bit(&ctx, list); + + instance_divisor_constbuf = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list, + Operand::c32(vinfo->instance_diviser_buf_offset)); + } + + unsigned vgpr = 256 + ctx.args->num_vgprs_used; + + for (unsigned i = 0; i < vinfo->num_inputs; i++) { + Temp index = get_gl_vs_prolog_vertex_index(&ctx, vinfo, i, instance_divisor_constbuf); + regs.emplace_back(Operand(index, PhysReg{vgpr + i})); + } + + program->config->float_mode = program->blocks[0].fp_mode.val; + + append_logical_end(ctx.block); + + build_end_with_regs(&ctx, regs); + + cleanup_cfg(program); +} + } // namespace aco diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 94ae241a966..6de37e5ba1e 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2239,6 +2239,10 @@ void select_tcs_epilog(Program* program, void* pinfo, ac_shader_config* config, const struct aco_compiler_options* options, const struct aco_shader_info* info, const struct ac_shader_args* args); +void select_gl_vs_prolog(Program* program, void* pinfo, ac_shader_config* config, + const struct aco_compiler_options* options, + const struct aco_shader_info* info, const struct ac_shader_args* args); + void lower_phis(Program* program); void calc_min_waves(Program* program); void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h index 4a37e4672f1..ee965112190 100644 --- a/src/amd/compiler/aco_shader_info.h +++ 
b/src/amd/compiler/aco_shader_info.h @@ -94,6 +94,16 @@ struct aco_tcs_epilog_info { struct ac_arg tcs_offchip_layout; }; +struct aco_gl_vs_prolog_info { + uint16_t instance_divisor_is_one; + uint16_t instance_divisor_is_fetched; + unsigned instance_diviser_buf_offset; + unsigned num_inputs; + bool as_ls; + + struct ac_arg internal_bindings; +}; + struct aco_shader_info { enum ac_hw_stage hw_stage; uint8_t wave_size;