diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index eda5e6996fe..300ebdfbab9 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -52,7 +52,7 @@ struct asm_context { // TODO: keep track of branch instructions referring blocks // and, when emitting the block, correct the offset in instr asm_context(Program* program_, std::vector* symbols_) - : program(program_), gfx_level(program->gfx_level), symbols(symbols_) + : program(program_), gfx_level(program->gfx_level), symbols(symbols_) { if (gfx_level <= GFX7) opcode = &instr_info.opcode_gfx7[0]; @@ -1160,8 +1160,7 @@ emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards, emit_instruction(ctx, out, instr.get()); /* create the s_setpc_b64 to jump */ - instr.reset( - bld.sop1(aco_opcode::s_setpc_b64, Operand(def.physReg(), s2)).instr); + instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(def.physReg(), s2)).instr); emit_instruction(ctx, out, instr.get()); } @@ -1218,8 +1217,7 @@ fix_constaddrs(asm_context& ctx, std::vector& out) } unsigned -emit_program(Program* program, std::vector& code, - std::vector* symbols) +emit_program(Program* program, std::vector& code, std::vector* symbols) { asm_context ctx(program, symbols); @@ -1252,8 +1250,8 @@ emit_program(Program* program, std::vector& code, code.insert(code.end(), (uint32_t*)program->constant_data.data(), (uint32_t*)(program->constant_data.data() + program->constant_data.size())); - program->config->scratch_bytes_per_wave = align( - program->config->scratch_bytes_per_wave, program->dev.scratch_alloc_granule); + program->config->scratch_bytes_per_wave = + align(program->config->scratch_bytes_per_wave, program->dev.scratch_alloc_granule); return exec_size; } diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 8e4decd6457..049a6044f11 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -254,8 +254,7 @@ public: void join_min(const VGPRCounterMap& other) { unsigned i; - BITSET_FOREACH_SET(i, other.resident, 256) - { + BITSET_FOREACH_SET (i, other.resident, 256) { if (BITSET_TEST(resident, i)) val[i] = MIN2(val[i] + base, other.val[i] + other.base) - base; else @@ -270,8 +269,7 @@ public: return false; unsigned i; - BITSET_FOREACH_SET(i, other.resident, 256) - { + BITSET_FOREACH_SET (i, other.resident, 256) { if (!BITSET_TEST(resident, i)) return false; if (val[i] + base != other.val[i] + other.base) @@ -365,11 +363,11 @@ search_backwards_internal(State& state, GlobalState& global_state, BlockState bl return; } -PRAGMA_DIAGNOSTIC_PUSH -PRAGMA_DIAGNOSTIC_IGNORED(-Waddress) + PRAGMA_DIAGNOSTIC_PUSH + PRAGMA_DIAGNOSTIC_IGNORED(-Waddress) if (block_cb != nullptr && !block_cb(global_state, block_state, block)) return; -PRAGMA_DIAGNOSTIC_POP + PRAGMA_DIAGNOSTIC_POP for (unsigned lin_pred : block->linear_preds) { search_backwards_internal( diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index 5c48e0357cd..2008270a2d6 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -52,8 +52,7 @@ struct wqm_ctx { /* state for WQM propagation */ std::set worklist; std::vector branch_wqm; /* true if the branch condition in this block should be in wqm */ - wqm_ctx(Program* program_) - : program(program_), branch_wqm(program->blocks.size()) + wqm_ctx(Program* program_) : program(program_), branch_wqm(program->blocks.size()) { for (unsigned i = 0; i < program->blocks.size(); i++) worklist.insert(i); @@ -137,8 +136,7 @@ get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block) propagate_wqm = true; bool pred_by_exec = needs_exec_mask(instr.get()) || - instr->opcode == aco_opcode::p_logical_end || - instr->isBranch(); + instr->opcode == aco_opcode::p_logical_end || instr->isBranch(); if (needs_exact(instr)) instr_needs[i] = Exact; @@ -574,7 +572,8 @@ process_instructions(exec_ctx& ctx, Block* block, std::vectorindex].exec.resize(1); - assert(ctx.info[block->index].exec[0].second == (mask_type_exact | mask_type_global)); + assert(ctx.info[block->index].exec[0].second == + (mask_type_exact | mask_type_global)); current_exec = get_exec_op(ctx.info[block->index].exec.back().first); ctx.info[block->index].exec[0].first = Operand(bld.lm); } diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 9643a9e2f8a..985a8e1d944 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -91,9 +91,8 @@ enum vmem_type : uint8_t { vmem_bvh = 1 << 2, }; -static const uint16_t exp_events = - event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock | - event_ldsdir; +static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | + event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir; static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg; static const uint16_t vm_events = event_vmem | event_flat; static const uint16_t vs_events = event_vmem_store; @@ -580,7 +579,8 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx, } if (ctx.program->gfx_level >= GFX11) { - update_alu(ctx, false, false, false, MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles)); + update_alu(ctx, false, false, false, + MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles)); } /* remove all gprs with higher counter from map */ @@ -775,8 +775,7 @@ insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_ */ uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat; uint32_t alu_events = event_trans | event_valu | event_salu; - bool force_linear = - ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events)); + bool force_linear = ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events)); insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles, force_linear); diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index c04b1732165..b51cddf42b1 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -26,8 +26,8 @@ #include "aco_instruction_selection.h" #include "aco_builder.h" -#include "aco_ir.h" #include "aco_interface.h" +#include "aco_ir.h" #include "common/ac_nir.h" #include "common/sid.h" @@ -661,8 +661,8 @@ convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsign Operand::c32(src_bits), Operand::c32((unsigned)sign_extend)); } else { assert(src_bits < 32); - bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(), Operand::c32(src_bits), - Operand::c32((unsigned)sign_extend)); + bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(), + Operand::c32(src_bits), Operand::c32((unsigned)sign_extend)); } if (dst_bits == 64) { @@ -1894,8 +1894,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } case nir_op_uadd_sat: { if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { - Instruction* add_instr = - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst); + Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst); add_instr->valu().clamp = 1; break; } @@ -1977,8 +1976,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } case nir_op_iadd_sat: { if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { - Instruction* add_instr = - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst); + Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst); add_instr->valu().clamp = 1; break; } @@ -3316,8 +3314,8 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) exponent_large); Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent); - mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, - Operand::c64(~0llu), cond); + mantissa = + bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand::c64(~0llu), cond); Temp lower = bld.tmp(s1), upper = bld.tmp(s1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); Temp cond_small = @@ -3483,9 +3481,8 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) case nir_op_unpack_64_4x16: case nir_op_unpack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); - emit_split_vector(ctx, dst, - instr->op == nir_op_unpack_32_4x8 || - instr->op == nir_op_unpack_64_4x16 ? 4 : 2); + emit_split_vector( + ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2); break; case nir_op_pack_64_2x32_split: { Temp src0 = get_alu_src(ctx, instr->src[0]); @@ -4029,7 +4026,7 @@ struct LoadEmitInfo { unsigned num_components; unsigned component_size; Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */ - Temp idx = Temp(0, v1); /* buffer index */ + Temp idx = Temp(0, v1); /* buffer index */ unsigned component_stride = 0; unsigned const_offset = 0; unsigned align_mul = 0; @@ -4176,9 +4173,10 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi); } } - Temp aligned_offset_tmp = - aligned_offset.isTemp() ? aligned_offset.getTemp() : - aligned_offset.isConstant() ? bld.copy(bld.def(s1), aligned_offset) : Temp(0, s1); + Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() + : aligned_offset.isConstant() + ? bld.copy(bld.def(s1), aligned_offset) + : Temp(0, s1); Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align, reduced_const_offset, byte_align ? Temp() : info.dst); @@ -4508,8 +4506,7 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne mubuf->offen = offen; mubuf->idxen = idxen; mubuf->glc = info.glc; - mubuf->dlc = - info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3); + mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3); mubuf->slc = info.slc; mubuf->sync = info.sync; mubuf->offset = const_offset; @@ -4552,40 +4549,20 @@ mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, aco_opcode op = aco_opcode::num_opcodes; if (info.component_size == 2) { switch (bytes_needed) { - case 2: - op = aco_opcode::buffer_load_format_d16_x; - break; - case 4: - op = aco_opcode::buffer_load_format_d16_xy; - break; - case 6: - op = aco_opcode::buffer_load_format_d16_xyz; - break; - case 8: - op = aco_opcode::buffer_load_format_d16_xyzw; - break; - default: - unreachable("invalid buffer load format size"); - break; + case 2: op = aco_opcode::buffer_load_format_d16_x; break; + case 4: op = aco_opcode::buffer_load_format_d16_xy; break; + case 6: op = aco_opcode::buffer_load_format_d16_xyz; break; + case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break; + default: unreachable("invalid buffer load format size"); break; } } else { assert(info.component_size == 4); switch (bytes_needed) { - case 4: - op = aco_opcode::buffer_load_format_x; - break; - case 8: - op = aco_opcode::buffer_load_format_xy; - break; - case 12: - op = aco_opcode::buffer_load_format_xyz; - break; - case 16: - op = aco_opcode::buffer_load_format_xyzw; - break; - default: - unreachable("invalid buffer load format size"); - break; + case 4: op = aco_opcode::buffer_load_format_x; break; + case 8: op = aco_opcode::buffer_load_format_xy; break; + case 12: op = aco_opcode::buffer_load_format_xyz; break; + case 16: op = aco_opcode::buffer_load_format_xyzw; break; + default: unreachable("invalid buffer load format size"); break; } } @@ -4596,8 +4573,7 @@ mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, mubuf->offen = offen; mubuf->idxen = idxen; mubuf->glc = info.glc; - mubuf->dlc = - info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3); + mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3); mubuf->slc = info.slc; mubuf->sync = info.sync; mubuf->offset = const_offset; @@ -5229,9 +5205,9 @@ resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_off } void -emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp idx, Temp vdata, - unsigned const_offset, memory_sync_info sync, bool glc, bool slc, - bool swizzled) +emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp idx, + Temp vdata, unsigned const_offset, memory_sync_info sync, bool glc, + bool slc, bool swizzled) { assert(vdata.id()); assert(vdata.size() != 3 || ctx->program->gfx_level != GFX6); @@ -5256,8 +5232,8 @@ emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp s vaddr_op = Operand(idx); Builder::Result r = - bld.mubuf(op, Operand(descriptor), vaddr_op, soffset_op, Operand(vdata), const_offset, - offen, swizzled, idxen, /* addr64 */ false, /* disable_wqm */ false, glc, + bld.mubuf(op, Operand(descriptor), vaddr_op, soffset_op, Operand(vdata), const_offset, offen, + swizzled, idxen, /* addr64 */ false, /* disable_wqm */ false, glc, /* dlc*/ false, slc); r->mubuf().sync = sync; @@ -5269,7 +5245,8 @@ store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Tem bool swizzled, memory_sync_info sync, bool glc, bool slc) { Builder bld(ctx->program, ctx->block); - assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); + assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 || + elem_size_bytes == 8); assert(write_mask); write_mask = util_widen_mask(write_mask, elem_size_bytes); @@ -5282,8 +5259,8 @@ store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Tem for (unsigned i = 0; i < write_count; i++) { unsigned const_offset = offsets[i] + base_const_offset; - emit_single_mubuf_store(ctx, descriptor, voffset, soffset, idx, write_datas[i], const_offset, sync, - glc, slc, swizzled); + emit_single_mubuf_store(ctx, descriptor, voffset, soffset, idx, write_datas[i], const_offset, + sync, glc, slc, swizzled); } } @@ -5387,7 +5364,7 @@ visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr) { /* LS pass output to TCS by temp if they have same in/out patch size. */ bool ls_need_output = ctx->stage == vertex_tess_control_hs && - ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq; + ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq; bool ps_need_output = ctx->stage == fragment_fs; @@ -6331,8 +6308,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr) if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) { opcode = aco_opcode::image_load; } else { - bool level_zero = - nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; + bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip; } @@ -6391,8 +6367,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); unsigned access = nir_intrinsic_access(instr); bool glc = ctx->options->gfx_level == GFX6 || - ((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && - ctx->program->gfx_level < GFX11); + ((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && ctx->program->gfx_level < GFX11); if (dim == GLSL_SAMPLER_DIM_BUF) { Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); @@ -6463,7 +6438,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)}; uint32_t index = 0; - u_foreach_bit(bit, dmask) { + u_foreach_bit (bit, dmask) { vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc)); } data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes())); @@ -6491,9 +6466,8 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) } void -translate_buffer_image_atomic_op(const nir_atomic_op op, - aco_opcode *buf_op, aco_opcode *buf_op64, - aco_opcode *image_op) +translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64, + aco_opcode* image_op) { switch (op) { case nir_atomic_op_iadd: @@ -6571,8 +6545,7 @@ translate_buffer_image_atomic_op(const nir_atomic_op op, *buf_op64 = aco_opcode::buffer_atomic_fmax_x2; *image_op = aco_opcode::image_atomic_fmax; break; - default: - unreachable("unsupported atomic operation"); + default: unreachable("unsupported atomic operation"); } } @@ -6682,9 +6655,8 @@ visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)); memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); - bool glc = - (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) && - ctx->program->gfx_level < GFX11; + bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) && + ctx->program->gfx_level < GFX11; unsigned write_count = 0; Temp write_datas[32]; @@ -6805,7 +6777,7 @@ visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr) /* Don't expand global loads when they use MUBUF or SMEM. * Global loads don't have the bounds checking that buffer loads have that * makes this safe. - */ + */ unsigned align = nir_intrinsic_align(instr); bool byte_align_for_smem_mubuf = can_use_byte_align_for_global_load(num_components, component_size, align, false); @@ -6836,9 +6808,8 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); - bool glc = - (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) && - ctx->program->gfx_level < GFX11; + bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) && + ctx->program->gfx_level < GFX11; unsigned write_count = 0; Temp write_datas[32]; @@ -6999,8 +6970,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax; op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2; break; - default: - unreachable("unsupported atomic operation"); + default: unreachable("unsupported atomic operation"); } aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; @@ -7192,8 +7162,8 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin) memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode), written_once ? semantic_can_reorder : semantic_none); - store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, idx, const_offset, elem_size_bytes, - write_mask, swizzled, sync, glc, slc); + store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, idx, const_offset, + elem_size_bytes, write_mask, swizzled, sync, glc, slc); } void @@ -7206,8 +7176,8 @@ visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr) /* If base address is 32bit, convert to 64bit with the high 32bit part. */ if (base.bytes() == 4) { - base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), - base, Operand::c32(ctx->options->address32_hi)); + base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base, + Operand::c32(ctx->options->address32_hi)); } aco_opcode opcode = aco_opcode::s_load_dword; @@ -7535,10 +7505,10 @@ get_scratch_resource(isel_context* ctx) Builder bld(ctx->program, ctx->block); Temp scratch_addr = ctx->program->private_segment_buffer; if (!scratch_addr.bytes()) { - Temp addr_lo = bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), - Operand::c32(aco_symbol_scratch_addr_lo)); - Temp addr_hi = bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), - Operand::c32(aco_symbol_scratch_addr_hi)); + Temp addr_lo = + bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); + Temp addr_hi = + bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi)); scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); } else if (ctx->stage.hw != HWStage::CS) { scratch_addr = @@ -8093,8 +8063,7 @@ Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i); Temp lanecount_to_mask(isel_context* ctx, Temp count); Temp -get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, - enum glsl_interp_mode interp) +get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, enum glsl_interp_mode interp) { bool linear = interp == INTERP_MODE_NOPERSPECTIVE; if (intrin == nir_intrinsic_load_barycentric_pixel || @@ -8109,9 +8078,8 @@ get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, } void -ds_ordered_count_offsets(isel_context *ctx, unsigned index_operand, - unsigned wave_release, unsigned wave_done, - unsigned *offset0, unsigned *offset1) +ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release, + unsigned wave_done, unsigned* offset0, unsigned* offset1) { unsigned ordered_count_index = index_operand & 0x3f; unsigned count_dword = (index_operand >> 24) & 0xf; @@ -8189,7 +8157,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) RegClass rc = RegClass(offset.type(), 1); Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc); bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset); - Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr)); + Temp bary = get_interp_param(ctx, instr->intrinsic, + (glsl_interp_mode)nir_intrinsic_interp_mode(instr)); emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2); break; } @@ -8977,8 +8946,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); if (ctx->args->merged_wave_info.used) bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), - get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(2u), - Operand::c32(8u), Operand::zero()); + get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(2u), Operand::c32(8u), + Operand::zero()); else if (ctx->args->gs_wave_id.used) bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_wave_id)); else @@ -9025,8 +8994,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_overwrite_tes_arguments_amd: { ctx->arg_temps[ctx->args->tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa); ctx->arg_temps[ctx->args->tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa); - ctx->arg_temps[ctx->args->tes_rel_patch_id.arg_index] = - get_ssa_temp(ctx, instr->src[3].ssa); + ctx->arg_temps[ctx->args->tes_rel_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa); ctx->arg_temps[ctx->args->tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[2].ssa); break; } @@ -9036,7 +9004,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp src = ctx->arg_temps[nir_intrinsic_base(instr)]; assert(src.id()); - assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr : RegType::vgpr)); + assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr + : RegType::vgpr)); bld.copy(Definition(dst), src); emit_split_vector(ctx, dst, dst.size()); break; @@ -9048,35 +9017,34 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u)); unsigned offset0, offset1; - Instruction *ds_instr; + Instruction* ds_instr; Operand m; /* Lock a GDS mutex. */ ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1); m = bld.m0(bld.as_uniform(ordered_id)); - ds_instr = bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, - offset0, offset1, true); + ds_instr = + bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true); ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile); aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)}; unsigned write_mask = nir_intrinsic_write_mask(instr); - bool use_gds_registers = - ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl; + bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl; for (unsigned i = 0; i < instr->num_components; i++) { if (write_mask & (1 << i)) { Temp chan_counter = emit_extract_vector(ctx, counter, i, v1); if (use_gds_registers) { - ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), - Operand(), chan_counter, i * 4, 0u, true); + ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(), + chan_counter, i * 4, 0u, true); } else { m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0x100u))); - ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, bld.def(v1), - gds_base, chan_counter, m, i * 4, 0u, true); + ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, bld.def(v1), gds_base, chan_counter, m, + i * 4, 0u, true); } ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw); @@ -9092,33 +9060,32 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) /* Unlock a GDS mutex. */ ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1); m = bld.m0(bld.as_uniform(ordered_id)); - ds_instr = bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, - offset0, offset1, true); + ds_instr = + bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true); ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile); emit_split_vector(ctx, dst, instr->num_components); break; } case nir_intrinsic_xfb_counter_sub_amd: { - bool use_gds_registers = - ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl; + bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl; unsigned write_mask = nir_intrinsic_write_mask(instr); Temp counter = get_ssa_temp(ctx, instr->src[0].ssa); Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u)); - u_foreach_bit(i, write_mask) { + u_foreach_bit (i, write_mask) { Temp chan_counter = emit_extract_vector(ctx, counter, i, v1); - Instruction *ds_instr; + Instruction* ds_instr; if (use_gds_registers) { - ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), - Operand(), chan_counter, i * 4, 0u, true); + ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter, + i * 4, 0u, true); } else { Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0x100u))); - ds_instr = bld.ds(aco_opcode::ds_sub_rtn_u32, bld.def(v1), - gds_base, chan_counter, m, i * 4, 0u, true); + ds_instr = bld.ds(aco_opcode::ds_sub_rtn_u32, bld.def(v1), gds_base, chan_counter, m, + i * 4, 0u, true); } ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw); } @@ -9162,15 +9129,14 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) exp->valid_mask = false; /* Compressed export uses two bits for a channel. */ - uint32_t channel_mask = exp->compressed ? - (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0) : - write_mask; + uint32_t channel_mask = + exp->compressed ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0) : write_mask; Temp value = get_ssa_temp(ctx, instr->src[0].ssa); for (unsigned i = 0; i < 4; i++) { - exp->operands[i] = channel_mask & BITFIELD_BIT(i) ? - Operand(emit_extract_vector(ctx, value, i, v1)) : - Operand(v1); + exp->operands[i] = channel_mask & BITFIELD_BIT(i) + ? Operand(emit_extract_vector(ctx, value, i, v1)) + : Operand(v1); } ctx->block->instructions.emplace_back(std::move(exp)); @@ -9183,13 +9149,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) struct aco_export_mrt mrt0, mrt1; for (unsigned i = 0; i < 4; i++) { - mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? - Operand(emit_extract_vector(ctx, val0, i, v1)) : - Operand(v1); + mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1)) + : Operand(v1); - mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? - Operand(emit_extract_vector(ctx, val1, i, v1)) : - Operand(v1); + mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1)) + : Operand(v1); } mrt0.enabled_channels = mrt1.enabled_channels = write_mask; @@ -9383,7 +9347,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) } if (has_wqm_coord) { - assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb || instr->op == nir_texop_lod); + assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb || + instr->op == nir_texop_lod); assert(wqm_coord.regClass().is_linear_vgpr()); assert(!a16 && !g16); } @@ -9701,9 +9666,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) if (dst.regClass() == s1) { Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(), emit_extract_vector(ctx, resource, 1, s1)); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), - bld.as_uniform(tmp_dst), Operand::c32(0x76543210), - bld.scc(is_not_null)); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst), + Operand::c32(0x76543210), bld.scc(is_not_null)); } else { Temp is_not_null = bld.tmp(bld.lm); bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(), @@ -10782,10 +10746,12 @@ export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export* out, /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ if (out->enable_mrt_output_nan_fixup && !is_16bit && - (out->col_format == V_028714_SPI_SHADER_32_R || out->col_format == V_028714_SPI_SHADER_32_GR || - out->col_format == V_028714_SPI_SHADER_32_AR || out->col_format == V_028714_SPI_SHADER_32_ABGR || + (out->col_format == V_028714_SPI_SHADER_32_R || + out->col_format == V_028714_SPI_SHADER_32_GR || + out->col_format == V_028714_SPI_SHADER_32_AR || + out->col_format == V_028714_SPI_SHADER_32_ABGR || out->col_format == V_028714_SPI_SHADER_FP16_ABGR)) { - u_foreach_bit(i, out->write_mask) { + u_foreach_bit (i, out->write_mask) { Temp is_not_nan = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]); values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i], @@ -10847,7 +10813,6 @@ export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export* out, } break; - case V_028714_SPI_SHADER_SNORM16_ABGR: if (is_16bit && ctx->options->gfx_level >= GFX9) { compr_op = aco_opcode::v_cvt_pknorm_i16_f16; @@ -10862,13 +10827,13 @@ export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export* out, /* clamp */ uint32_t max_rgb = out->is_int8 ? 255 : out->is_int10 ? 1023 : 0; - u_foreach_bit(i, out->write_mask) { + u_foreach_bit (i, out->write_mask) { uint32_t max = i == 3 && out->is_int10 ? 3 : max_rgb; values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]); } } else if (is_16bit) { - u_foreach_bit(i, out->write_mask) { + u_foreach_bit (i, out->write_mask) { Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false); values[i] = Operand(tmp); } @@ -10882,7 +10847,7 @@ export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export* out, uint32_t max_rgb = out->is_int8 ? 127 : out->is_int10 ? 511 : 0; uint32_t min_rgb = out->is_int8 ? -128 : out->is_int10 ? -512 : 0; - u_foreach_bit(i, out->write_mask) { + u_foreach_bit (i, out->write_mask) { uint32_t max = i == 3 && out->is_int10 ? 1 : max_rgb; uint32_t min = i == 3 && out->is_int10 ? -2u : min_rgb; @@ -10890,7 +10855,7 @@ export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export* out, values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]); } } else if (is_16bit) { - u_foreach_bit(i, out->write_mask) { + u_foreach_bit (i, out->write_mask) { Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true); values[i] = Operand(tmp); } @@ -10996,8 +10961,7 @@ create_fs_jump_to_epilog(isel_context* ctx) } } - Temp continue_pc = - convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.ps.epilog_pc)); + Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.ps.epilog_pc)); aco_ptr jump{create_instruction( aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + color_exports.size(), 0)}; @@ -11068,12 +11032,13 @@ add_startpgm(struct isel_context* ctx) Operand scratch_offset = Operand(get_arg(ctx, ctx->args->scratch_offset)); scratch_offset.setLateKill(true); - Operand scratch_addr = ctx->args->ring_offsets.used ? - Operand(get_arg(ctx, ctx->args->ring_offsets)) : Operand(s2); + Operand scratch_addr = ctx->args->ring_offsets.used + ? Operand(get_arg(ctx, ctx->args->ring_offsets)) + : Operand(s2); Builder bld(ctx->program, ctx->block); - bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), - scratch_addr, scratch_offset); + bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr, + scratch_offset); } return startpgm; @@ -11085,9 +11050,9 @@ fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm) assert(ctx->shader->info.stage == MESA_SHADER_VERTEX); Builder bld(ctx->program, ctx->block); constexpr unsigned hs_idx = 1u; - Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->merged_wave_info), - Operand::c32((8u << 16) | (hs_idx * 8u))); + Builder::Result hs_thread_count = + bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->merged_wave_info), Operand::c32((8u << 16) | (hs_idx * 8u))); Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp()); /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */ @@ -11218,10 +11183,9 @@ merged_wave_info_to_mask(isel_context* ctx, unsigned i) Builder bld(ctx->program, ctx->block); /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */ - Temp count = i == 0 - ? get_arg(ctx, ctx->args->merged_wave_info) - : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u)); + Temp count = i == 0 ? get_arg(ctx, ctx->args->merged_wave_info) + : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u)); return lanecount_to_mask(ctx, count); } @@ -11276,10 +11240,10 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, ac_shader_config* config, const struct aco_compiler_options* options, - const struct aco_shader_info* info, - const struct ac_shader_args* args) + const struct aco_shader_info* info, const struct ac_shader_args* args) { - isel_context ctx = setup_isel_context(program, shader_count, shaders, config, options, info, args, false); + isel_context ctx = + setup_isel_context(program, shader_count, shaders, config, options, info, args, false); if (ctx.stage == raytracing_cs) return select_program_rt(ctx, shader_count, shaders, args); @@ -11391,8 +11355,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const void select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config, const struct aco_compiler_options* options, - const struct aco_shader_info* info, - const struct ac_shader_args* args) + const struct aco_shader_info* info, const struct ac_shader_args* args) { assert(options->gfx_level == GFX8); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 2c8ca8c561a..0108b2c8398 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -660,8 +660,8 @@ cleanup_context(isel_context* ctx) isel_context setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders, ac_shader_config* config, const struct aco_compiler_options* options, - const struct aco_shader_info* info, - const struct ac_shader_args* args, bool is_ps_epilog) + const struct aco_shader_info* info, const struct ac_shader_args* args, + bool is_ps_epilog) { SWStage sw_stage = SWStage::None; for (unsigned i = 0; i < shader_count; i++) { diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index 5c121b51796..45bf4aa41cf 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -80,8 +80,7 @@ validate(aco::Program* program) } static std::string -get_disasm_string(aco::Program* program, std::vector& code, - unsigned exec_size) +get_disasm_string(aco::Program* program, std::vector& code, unsigned exec_size) { std::string disasm; @@ -111,8 +110,7 @@ get_disasm_string(aco::Program* program, std::vector& code, static std::string aco_postprocess_shader(const struct aco_compiler_options* options, - const struct aco_shader_info *info, - std::unique_ptr& program) + const struct aco_shader_info* info, std::unique_ptr& program) { std::string llvm_ir; @@ -211,12 +209,9 @@ aco_postprocess_shader(const struct aco_compiler_options* options, } void -aco_compile_shader(const struct aco_compiler_options* options, - const struct aco_shader_info* info, +aco_compile_shader(const struct aco_compiler_options* options, const struct aco_shader_info* info, unsigned shader_count, struct nir_shader* const* shaders, - const struct ac_shader_args *args, - aco_callback *build_binary, - void **binary) + const struct ac_shader_args* args, aco_callback* build_binary, void** binary) { aco::init(); @@ -335,13 +330,8 @@ aco_compile_vs_prolog(const struct aco_compiler_options* options, if (get_disasm) disasm = get_disasm_string(program.get(), code, exec_size); - (*build_prolog)(binary, - config.num_sgprs, - config.num_vgprs, - code.data(), - code.size(), - disasm.data(), - disasm.size()); + (*build_prolog)(binary, config.num_sgprs, config.num_vgprs, code.data(), code.size(), + disasm.data(), disasm.size()); } void @@ -377,11 +367,6 @@ aco_compile_ps_epilog(const struct aco_compiler_options* options, if (get_disasm) disasm = get_disasm_string(program.get(), code, exec_size); - (*build_epilog)(binary, - config.num_sgprs, - config.num_vgprs, - code.data(), - code.size(), - disasm.data(), - disasm.size()); + (*build_epilog)(binary, config.num_sgprs, config.num_vgprs, code.data(), code.size(), + disasm.data(), disasm.size()); } diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h index a91d7a33d70..e28ff73191b 100644 --- a/src/amd/compiler/aco_interface.h +++ b/src/amd/compiler/aco_interface.h @@ -25,9 +25,9 @@ #ifndef ACO_INTERFACE_H #define ACO_INTERFACE_H -#include "amd_family.h" - #include "aco_shader_info.h" + +#include "amd_family.h" #ifdef __cplusplus extern "C" { #endif @@ -47,24 +47,18 @@ typedef void(aco_callback)(void** priv_ptr, const struct ac_shader_config* confi const char* llvm_ir_str, unsigned llvm_ir_size, const char* disasm_str, unsigned disasm_size, uint32_t* statistics, uint32_t stats_size, uint32_t exec_size, const uint32_t* code, uint32_t code_dw, - const struct aco_symbol *symbols, unsigned num_symbols); + const struct aco_symbol* symbols, unsigned num_symbols); -typedef void (aco_shader_part_callback)(void **priv_ptr, - uint32_t num_sgprs, - uint32_t num_vgprs, - const uint32_t *code, - uint32_t code_size, - const char *disasm_str, - uint32_t disasm_size); +typedef void(aco_shader_part_callback)(void** priv_ptr, uint32_t num_sgprs, uint32_t num_vgprs, + const uint32_t* code, uint32_t code_size, + const char* disasm_str, uint32_t disasm_size); extern const struct aco_compiler_statistic_info* aco_statistic_infos; void aco_compile_shader(const struct aco_compiler_options* options, - const struct aco_shader_info* info, - unsigned shader_count, struct nir_shader* const* shaders, - const struct ac_shader_args *args, - aco_callback *build_binary, - void **binary); + const struct aco_shader_info* info, unsigned shader_count, + struct nir_shader* const* shaders, const struct ac_shader_args* args, + aco_callback* build_binary, void** binary); void aco_compile_rt_prolog(const struct aco_compiler_options* options, const struct aco_shader_info* info, const struct ac_shader_args* in_args, diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 639b6db54e9..f552cc2f3c7 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -98,8 +98,9 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info, program->wave_size = info->wave_size; program->lane_mask = program->wave_size == 32 ? s1 : s2; - program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024 : - gfx_level >= GFX7 ? 512 : 256; + program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024 + : gfx_level >= GFX7 ? 512 + : 256; program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule; /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */ diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 8f8b5deb736..08a9a2a2017 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -140,9 +140,9 @@ enum storage_class : uint8_t { storage_buffer = 0x1, /* SSBOs and global memory */ storage_gds = 0x2, storage_image = 0x4, - storage_shared = 0x8, /* or TCS output */ - storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */ - storage_task_payload = 0x20,/* Task-Mesh payload */ + storage_shared = 0x8, /* or TCS output */ + storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */ + storage_task_payload = 0x20, /* Task-Mesh payload */ storage_scratch = 0x40, storage_vgpr_spill = 0x80, storage_count = 8, /* not counting storage_none */ @@ -823,7 +823,8 @@ public: assert(bytes() == 2 || bytes() == 4); if (opsel) { if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral()) - return int16_t(data_.i) >> 16; /* 16-bit inline integers are sign-extended, even with fp16 instrs */ + return int16_t(data_.i) >> + 16; /* 16-bit inline integers are sign-extended, even with fp16 instrs */ else return data_.i >> 16; } @@ -1418,7 +1419,8 @@ struct VINTERP_inreg_instruction : public VALU_instruction { uint8_t padding5; uint8_t padding6; }; -static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding"); +static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4, + "Unexpected padding"); /** * Data Parallel Primitives Format: @@ -1809,8 +1811,7 @@ memory_sync_info get_sync_info(const Instruction* instr); inline bool is_dead(const std::vector& uses, const Instruction* instr) { - if (instr->definitions.empty() || instr->isBranch() || - instr->opcode == aco_opcode::p_startpgm || + if (instr->definitions.empty() || instr->isBranch() || instr->opcode == aco_opcode::p_startpgm || instr->opcode == aco_opcode::p_init_scratch || instr->opcode == aco_opcode::p_dual_src_export_gfx11) return false; @@ -2216,8 +2217,7 @@ void init_program(Program* program, Stage stage, const struct aco_shader_info* i void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, ac_shader_config* config, const struct aco_compiler_options* options, - const struct aco_shader_info* info, - const struct ac_shader_args* args); + const struct aco_shader_info* info, const struct ac_shader_args* args); void select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config, const struct aco_compiler_options* options, @@ -2258,7 +2258,7 @@ bool dealloc_vgprs(Program* program); void insert_NOPs(Program* program); void form_hard_clauses(Program* program); unsigned emit_program(Program* program, std::vector& code, - std::vector *symbols); + std::vector* symbols); /** * Returns true if print_asm can disassemble the given program for the current build/runtime * configuration diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 61fe7ebb66c..f79f96aa8ef 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2181,7 +2181,7 @@ lower_image_sample(lower_context* ctx, aco_ptr& instr) instr->mimg().strict_wqm = false; if ((3 + num_vaddr) > instr->operands.size()) { - MIMG_instruction *new_instr = create_instruction( + MIMG_instruction* new_instr = create_instruction( instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size()); std::copy(instr->definitions.cbegin(), instr->definitions.cend(), new_instr->definitions.begin()); @@ -2346,8 +2346,8 @@ lower_to_hw_instr(Program* program) target = program->has_color_exports ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_MRTZ; if (program->stage == fragment_fs) - bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), - 0, target, false, true, true); + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0, + target, false, true, true); if (should_dealloc_vgprs) bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs); bld.sopp(aco_opcode::s_endpgm); @@ -2518,8 +2518,7 @@ lower_to_hw_instr(Program* program) create_bperm(bld, ext_swiz, dst, Operand::zero()); } } else { - SDWA_instruction& sdwa = - bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa(); + SDWA_instruction& sdwa = bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa(); sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext); } } @@ -2574,7 +2573,8 @@ lower_to_hw_instr(Program* program) } else { assert(dst.regClass() == v2b); bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op) - ->sdwa().sel[1] = SubdwordSel::ubyte; + ->sdwa() + .sel[1] = SubdwordSel::ubyte; } break; } diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 6462122e713..8e285709f9d 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1369,7 +1369,7 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) if (instr->isSALU() || instr->isPseudo()) { unsigned bits = get_operand_size(instr, i); if ((info.is_constant(bits) || (info.is_literal(bits) && instr->isPseudo())) && - alu_can_accept_constant(instr, i)) { + alu_can_accept_constant(instr, i)) { instr->operands[i] = get_constant_op(ctx, info, bits); continue; } @@ -2116,9 +2116,10 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) case aco_opcode::v_mbcnt_hi_u32_b32_e64: { if (instr->operands[0].constantEquals(-1) && instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_usedef()) { - Instruction *usedef_instr = ctx.info[instr->operands[1].tempId()].instr; + Instruction* usedef_instr = ctx.info[instr->operands[1].tempId()].instr; if (usedef_instr->opcode == aco_opcode::v_mbcnt_lo_u32_b32 && - usedef_instr->operands[0].constantEquals(-1) && usedef_instr->operands[1].constantEquals(0)) + usedef_instr->operands[0].constantEquals(-1) && + usedef_instr->operands[1].constantEquals(0)) ctx.info[instr->definitions[0].tempId()].set_subgroup_invocation(instr.get()); } break; @@ -2370,7 +2371,9 @@ optimize_cmp_subgroup_invocation(opt_ctx& ctx, aco_ptr& instr) return false; /* Find the constant operand or return early if there isn't one. */ - const int const_op_idx = instr->operands[0].isConstant() ? 0 : instr->operands[1].isConstant() ? 1 : -1; + const int const_op_idx = instr->operands[0].isConstant() ? 0 + : instr->operands[1].isConstant() ? 1 + : -1; if (const_op_idx == -1) return false; @@ -2413,11 +2416,10 @@ optimize_cmp_subgroup_invocation(opt_ctx& ctx, aco_ptr& instr) first_bit = val + 1; num_bits = val >= wave_size ? 0 : (wave_size - val - 1); break; - default: - return false; + default: return false; } - Instruction *cpy = NULL; + Instruction* cpy = NULL; const uint64_t mask = BITFIELD64_RANGE(first_bit, num_bits); if (wave_size == 64 && mask > 0x7fffffff && mask != -1ull) { /* Mask can't be represented as a 64-bit constant or literal, use s_bfm_b64. */ @@ -2426,7 +2428,8 @@ optimize_cmp_subgroup_invocation(opt_ctx& ctx, aco_ptr& instr) cpy->operands[1] = Operand::c32(first_bit); } else { /* Copy mask as a literal constant. */ - cpy = create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1); + cpy = + create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1); cpy->operands[0] = wave_size == 32 ? Operand::c32((uint32_t)mask) : Operand::c64(mask); } @@ -4821,10 +4824,12 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) */ if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_and_b64) { if (instr->operands[0].isTemp() && fixed_to_exec(instr->operands[1]) && - ctx.uses[instr->operands[0].tempId()] == 1 && ctx.uses[instr->definitions[1].tempId()] == 0 && + ctx.uses[instr->operands[0].tempId()] == 1 && + ctx.uses[instr->definitions[1].tempId()] == 0 && can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), instr->pass_flags)) { ctx.uses[instr->operands[0].tempId()]--; - ctx.info[instr->operands[0].tempId()].instr->definitions[0].setTemp(instr->definitions[0].getTemp()); + ctx.info[instr->operands[0].tempId()].instr->definitions[0].setTemp( + instr->definitions[0].getTemp()); instr.reset(); return; } diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 37db33230e4..c7ba8578d19 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -516,7 +516,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins if (mimg.lwe) fprintf(output, " lwe"); if (mimg.r128) - fprintf(output, " r128"); + fprintf(output, " r128"); if (mimg.a16) fprintf(output, " a16"); if (mimg.d16) diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 0fb22f18e3e..7c4535b26e2 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -460,8 +460,7 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size); /* print assignments ordered by registers */ - std::map> - regs_to_vars; /* maps to byte size and temp id */ + std::map> regs_to_vars; /* maps to byte size and temp id */ for (unsigned id : find_vars(ctx, reg_file, regs)) { const assignment& var = ctx.assignments[id]; PhysReg reg = var.reg; @@ -1088,8 +1087,8 @@ get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file, instr->operands[i].regClass() == info.rc) { assignment& op = ctx.assignments[instr->operands[i].tempId()]; /* if everything matches, create parallelcopy for the killed operand */ - if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && - op.reg != scc && reg_file.get_id(op.reg) == instr->operands[i].tempId()) { + if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc && + reg_file.get_id(op.reg) == instr->operands[i].tempId()) { Definition pc_def = Definition(reg, info.rc); parallelcopies.emplace_back(instr->operands[i], pc_def); return op.reg; @@ -1655,8 +1654,7 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, return vcc; } if (ctx.assignments[temp.id()].m0) { - if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0) && - can_write_m0(instr)) + if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0) && can_write_m0(instr)) return m0; } diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 5316878840d..02498c01195 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -587,8 +587,10 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards) /* don't move non-reorderable instructions */ if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime || instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 || - instr->opcode == aco_opcode::p_init_scratch || instr->opcode == aco_opcode::p_jump_to_epilog || - instr->opcode == aco_opcode::s_sendmsg_rtn_b32 || instr->opcode == aco_opcode::s_sendmsg_rtn_b64) + instr->opcode == aco_opcode::p_init_scratch || + instr->opcode == aco_opcode::p_jump_to_epilog || + instr->opcode == aco_opcode::s_sendmsg_rtn_b32 || + instr->opcode == aco_opcode::s_sendmsg_rtn_b64) return hazard_fail_unreorderable; memory_event_set instr_set; @@ -663,8 +665,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector& registe int16_t k = 0; /* don't move s_memtime/s_memrealtime */ - if (current->opcode == aco_opcode::s_memtime || - current->opcode == aco_opcode::s_memrealtime || + if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime || current->opcode == aco_opcode::s_sendmsg_rtn_b32 || current->opcode == aco_opcode::s_sendmsg_rtn_b64) return; diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h index 8765c9439b3..4bfe7a61fd3 100644 --- a/src/amd/compiler/aco_shader_info.h +++ b/src/amd/compiler/aco_shader_info.h @@ -35,10 +35,10 @@ extern "C" { #endif -#define ACO_MAX_SO_OUTPUTS 64 -#define ACO_MAX_SO_BUFFERS 4 +#define ACO_MAX_SO_OUTPUTS 64 +#define ACO_MAX_SO_BUFFERS 4 #define ACO_MAX_VERTEX_ATTRIBS 32 -#define ACO_MAX_VBS 32 +#define ACO_MAX_VBS 32 struct aco_vs_input_state { uint32_t instance_rate_inputs; @@ -133,8 +133,8 @@ struct aco_compiler_options { enum amd_gfx_level gfx_level; uint32_t address32_hi; struct { - void (*func)(void *private_data, enum aco_compiler_debug_level level, const char *message); - void *private_data; + void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message); + void* private_data; } debug; }; diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 8e9c0915b51..1b64d601f29 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -94,7 +94,8 @@ struct spill_ctx { spill_ctx(const RegisterDemand target_pressure_, Program* program_, std::vector> register_demand_) : target_pressure(target_pressure_), program(program_), memory(), - register_demand(std::move(register_demand_)), renames(program->blocks.size(), aco::map(memory)), + register_demand(std::move(register_demand_)), + renames(program->blocks.size(), aco::map(memory)), spills_entry(program->blocks.size(), aco::unordered_map(memory)), spills_exit(program->blocks.size(), aco::unordered_map(memory)), processed(program->blocks.size(), false), @@ -226,10 +227,11 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, uint32_t& worklist) std::pair distance{block_idx, 0}; - auto it = instr->definitions[0].isTemp() ? next_use_distances_start.find(instr->definitions[0].getTemp()) - : next_use_distances_start.end(); + auto it = instr->definitions[0].isTemp() + ? next_use_distances_start.find(instr->definitions[0].getTemp()) + : next_use_distances_start.end(); if (it != next_use_distances_start.end() && - phi_defs.insert(instr->definitions[0].getTemp()).second) { + phi_defs.insert(instr->definitions[0].getTemp()).second) { distance = it->second; } @@ -388,7 +390,7 @@ get_rematerialize_info(spill_ctx& ctx) void update_local_next_uses(spill_ctx& ctx, Block* block, - std::vector>>& local_next_uses) + std::vector>>& local_next_uses) { if (local_next_uses.size() < block->instructions.size()) { /* Allocate more next-use-maps. Note that by never reducing the vector size, we enable @@ -1006,7 +1008,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) ctx.renames[pred_idx].find(phi->operands[i].getTemp()); if (it != ctx.renames[pred_idx].end()) { phi->operands[i].setTemp(it->second); - /* prevent the defining instruction from being DCE'd if it could be rematerialized */ + /* prevent the defining instruction from being DCE'd if it could be rematerialized */ } else { auto remat_it = ctx.remat.find(phi->operands[i].getTemp()); if (remat_it != ctx.remat.end()) { @@ -1407,7 +1409,8 @@ load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, Block& block, continue; /* find p_logical_end */ - std::vector>& prev_instructions = ctx.program->blocks[block_idx].instructions; + std::vector>& prev_instructions = + ctx.program->blocks[block_idx].instructions; unsigned idx = prev_instructions.size() - 1; while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end) idx--; @@ -1422,10 +1425,10 @@ load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, Block& block, Temp private_segment_buffer = ctx.program->private_segment_buffer; if (!private_segment_buffer.bytes()) { - Temp addr_lo = bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), - Operand::c32(aco_symbol_scratch_addr_lo)); - Temp addr_hi = bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), - Operand::c32(aco_symbol_scratch_addr_hi)); + Temp addr_lo = + bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); + Temp addr_hi = + bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi)); private_segment_buffer = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); } else if (ctx.program->stage.hw != HWStage::CS) { @@ -1471,8 +1474,7 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, if (ctx.scratch_rsrc == Temp()) { int32_t saddr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size - ctx.program->dev.scratch_global_offset_min; - ctx.scratch_rsrc = - load_scratch_resource(ctx, scratch_offset, block, instructions, saddr); + ctx.scratch_rsrc = load_scratch_resource(ctx, scratch_offset, block, instructions, saddr); } } else { bool add_offset_to_sgpr = diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index afb9896a23f..506ff8039a6 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -35,8 +35,8 @@ namespace aco { static void -aco_log(Program* program, enum aco_compiler_debug_level level, const char* prefix, - const char* file, unsigned line, const char* fmt, va_list args) +aco_log(Program* program, enum aco_compiler_debug_level level, const char* prefix, const char* file, + unsigned line, const char* fmt, va_list args) { char* msg; @@ -270,8 +270,7 @@ validate_ir(Program* program) (instr->opcode == aco_opcode::p_bpermute_gfx11w64 && i == 0) || (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) || ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) || - (instr->isScratch() && i == 0) || - (instr->isDS() && i == 0) || + (instr->isScratch() && i == 0) || (instr->isDS() && i == 0) || (instr->opcode == aco_opcode::p_init_scratch && i == 0); check(can_be_undef, "Undefs can only be used in certain operands", instr.get()); } else { @@ -393,7 +392,7 @@ validate_ir(Program* program) "OPSEL_LO set for unsupported instruction format", instr.get()); check(!instr->valu().opsel_hi || instr->isVOP3P(), "OPSEL_HI set for unsupported instruction format", instr.get()); - check(!instr->valu().omod || instr->isVOP3() ||instr->isSDWA(), + check(!instr->valu().omod || instr->isVOP3() || instr->isSDWA(), "OMOD set for unsupported instruction format", instr.get()); check(!instr->valu().clamp || instr->isVOP3() || instr->isVOP3P() || instr->isSDWA() || instr->isVINTERP_INREG(), @@ -562,7 +561,8 @@ validate_ir(Program* program) instr->definitions[2].regClass().size() == 1, "Third definition of p_dual_src_export_gfx11 must be a v1", instr.get()); check(instr->definitions[3].regClass() == program->lane_mask, - "Fourth definition of p_dual_src_export_gfx11 must be a lane mask", instr.get()); + "Fourth definition of p_dual_src_export_gfx11 must be a lane mask", + instr.get()); check(instr->definitions[4].physReg() == vcc, "Fifth definition of p_dual_src_export_gfx11 must be vcc", instr.get()); check(instr->definitions[5].physReg() == scc, @@ -627,26 +627,28 @@ validate_ir(Program* program) check(instr->operands.size() < 4 || instr->operands[3].isOfType(RegType::vgpr), "VMEM write data must be vgpr", instr.get()); - const bool d16 = instr->opcode == aco_opcode::buffer_load_dword || // FIXME: used to spill subdword variables - instr->opcode == aco_opcode::buffer_load_ubyte || - instr->opcode == aco_opcode::buffer_load_sbyte || - instr->opcode == aco_opcode::buffer_load_ushort || - instr->opcode == aco_opcode::buffer_load_sshort || - instr->opcode == aco_opcode::buffer_load_ubyte_d16 || - instr->opcode == aco_opcode::buffer_load_ubyte_d16_hi || - instr->opcode == aco_opcode::buffer_load_sbyte_d16 || - instr->opcode == aco_opcode::buffer_load_sbyte_d16_hi || - instr->opcode == aco_opcode::buffer_load_short_d16 || - instr->opcode == aco_opcode::buffer_load_short_d16_hi || - instr->opcode == aco_opcode::buffer_load_format_d16_x || - instr->opcode == aco_opcode::buffer_load_format_d16_hi_x || - instr->opcode == aco_opcode::buffer_load_format_d16_xy || - instr->opcode == aco_opcode::buffer_load_format_d16_xyz || - instr->opcode == aco_opcode::buffer_load_format_d16_xyzw || - instr->opcode == aco_opcode::tbuffer_load_format_d16_x || - instr->opcode == aco_opcode::tbuffer_load_format_d16_xy || - instr->opcode == aco_opcode::tbuffer_load_format_d16_xyz || - instr->opcode == aco_opcode::tbuffer_load_format_d16_xyzw; + const bool d16 = + instr->opcode == + aco_opcode::buffer_load_dword || // FIXME: used to spill subdword variables + instr->opcode == aco_opcode::buffer_load_ubyte || + instr->opcode == aco_opcode::buffer_load_sbyte || + instr->opcode == aco_opcode::buffer_load_ushort || + instr->opcode == aco_opcode::buffer_load_sshort || + instr->opcode == aco_opcode::buffer_load_ubyte_d16 || + instr->opcode == aco_opcode::buffer_load_ubyte_d16_hi || + instr->opcode == aco_opcode::buffer_load_sbyte_d16 || + instr->opcode == aco_opcode::buffer_load_sbyte_d16_hi || + instr->opcode == aco_opcode::buffer_load_short_d16 || + instr->opcode == aco_opcode::buffer_load_short_d16_hi || + instr->opcode == aco_opcode::buffer_load_format_d16_x || + instr->opcode == aco_opcode::buffer_load_format_d16_hi_x || + instr->opcode == aco_opcode::buffer_load_format_d16_xy || + instr->opcode == aco_opcode::buffer_load_format_d16_xyz || + instr->opcode == aco_opcode::buffer_load_format_d16_xyzw || + instr->opcode == aco_opcode::tbuffer_load_format_d16_x || + instr->opcode == aco_opcode::tbuffer_load_format_d16_xy || + instr->opcode == aco_opcode::tbuffer_load_format_d16_xyz || + instr->opcode == aco_opcode::tbuffer_load_format_d16_xyzw; if (instr->definitions.size()) { check(instr->definitions[0].regClass().type() == RegType::vgpr, "VMEM definitions[0] (VDATA) must be VGPR", instr.get()); @@ -763,11 +765,14 @@ validate_ir(Program* program) break; } case Format::LDSDIR: { - check(instr->definitions.size() == 1 && instr->definitions[0].regClass() == v1, "LDSDIR must have an v1 definition", instr.get()); + check(instr->definitions.size() == 1 && instr->definitions[0].regClass() == v1, + "LDSDIR must have an v1 definition", instr.get()); check(instr->operands.size() == 1, "LDSDIR must have an operand", instr.get()); if (!instr->operands.empty()) { - check(instr->operands[0].regClass() == s1, "LDSDIR must have an s1 operand", instr.get()); - check(instr->operands[0].isFixed() && instr->operands[0].physReg() == m0, "LDSDIR must have an operand fixed to m0", instr.get()); + check(instr->operands[0].regClass() == s1, "LDSDIR must have an s1 operand", + instr.get()); + check(instr->operands[0].isFixed() && instr->operands[0].physReg() == m0, + "LDSDIR must have an operand fixed to m0", instr.get()); } break; } diff --git a/src/amd/compiler/tests/framework.h b/src/amd/compiler/tests/framework.h index 76a17e230ca..261642d40b4 100644 --- a/src/amd/compiler/tests/framework.h +++ b/src/amd/compiler/tests/framework.h @@ -35,19 +35,20 @@ #include struct TestDef { - const char *name; - const char *source_file; + const char* name; + const char* source_file; void (*func)(); }; extern std::map tests; -extern FILE *output; +extern FILE* output; -bool set_variant(const char *name); +bool set_variant(const char* name); -inline bool set_variant(amd_gfx_level cls, const char *rest="") +inline bool +set_variant(amd_gfx_level cls, const char* rest = "") { - char buf[8+strlen(rest)]; + char buf[8 + strlen(rest)]; if (cls != GFX10_3) { snprintf(buf, sizeof(buf), "gfx%d%s", cls - GFX6 + 6 - (cls > GFX10_3), rest); } else { @@ -56,18 +57,21 @@ inline bool set_variant(amd_gfx_level cls, const char *rest="") return set_variant(buf); } -void fail_test(const char *fmt, ...); -void skip_test(const char *fmt, ...); +void fail_test(const char* fmt, ...); +void skip_test(const char* fmt, ...); -#define _BEGIN_TEST(name, struct_name) static void struct_name(); static __attribute__((constructor)) void CONCAT2(add_test_, __COUNTER__)() {\ - tests[#name] = (TestDef){#name, ACO_TEST_BUILD_ROOT "/" __FILE__, &struct_name};\ - }\ - static void struct_name() {\ +#define _BEGIN_TEST(name, struct_name) \ + static void struct_name(); \ + static __attribute__((constructor)) void CONCAT2(add_test_, __COUNTER__)() \ + { \ + tests[#name] = (TestDef){#name, ACO_TEST_BUILD_ROOT "/" __FILE__, &struct_name}; \ + } \ + static void struct_name() \ + { -#define BEGIN_TEST(name) _BEGIN_TEST(name, CONCAT2(Test_, __COUNTER__)) +#define BEGIN_TEST(name) _BEGIN_TEST(name, CONCAT2(Test_, __COUNTER__)) #define BEGIN_TEST_TODO(name) _BEGIN_TEST(name, CONCAT2(Test_, __COUNTER__)) #define BEGIN_TEST_FAIL(name) _BEGIN_TEST(name, CONCAT2(Test_, __COUNTER__)) -#define END_TEST \ - } +#define END_TEST } #endif /* ACO_TEST_COMMON_H */ diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp index c5a88447329..caa8b51eeeb 100644 --- a/src/amd/compiler/tests/helpers.cpp +++ b/src/amd/compiler/tests/helpers.cpp @@ -22,19 +22,20 @@ * */ #include "helpers.h" -#include "vulkan/vk_format.h" + #include "common/amd_family.h" -#include -#include +#include "vulkan/vk_format.h" + #include + #include +#include +#include using namespace aco; extern "C" { -PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr( - VkInstance instance, - const char* pName); +PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(VkInstance instance, const char* pName); } ac_shader_config config; @@ -47,32 +48,34 @@ static VkInstance instance_cache[CHIP_LAST] = {VK_NULL_HANDLE}; static VkDevice device_cache[CHIP_LAST] = {VK_NULL_HANDLE}; static std::mutex create_device_mutex; -#define FUNCTION_LIST\ - ITEM(CreateInstance)\ - ITEM(DestroyInstance)\ - ITEM(EnumeratePhysicalDevices)\ - ITEM(GetPhysicalDeviceProperties2)\ - ITEM(CreateDevice)\ - ITEM(DestroyDevice)\ - ITEM(CreateShaderModule)\ - ITEM(DestroyShaderModule)\ - ITEM(CreateGraphicsPipelines)\ - ITEM(CreateComputePipelines)\ - ITEM(DestroyPipeline)\ - ITEM(CreateDescriptorSetLayout)\ - ITEM(DestroyDescriptorSetLayout)\ - ITEM(CreatePipelineLayout)\ - ITEM(DestroyPipelineLayout)\ - ITEM(CreateRenderPass)\ - ITEM(DestroyRenderPass)\ - ITEM(GetPipelineExecutablePropertiesKHR)\ +#define FUNCTION_LIST \ + ITEM(CreateInstance) \ + ITEM(DestroyInstance) \ + ITEM(EnumeratePhysicalDevices) \ + ITEM(GetPhysicalDeviceProperties2) \ + ITEM(CreateDevice) \ + ITEM(DestroyDevice) \ + ITEM(CreateShaderModule) \ + ITEM(DestroyShaderModule) \ + ITEM(CreateGraphicsPipelines) \ + ITEM(CreateComputePipelines) \ + ITEM(DestroyPipeline) \ + ITEM(CreateDescriptorSetLayout) \ + ITEM(DestroyDescriptorSetLayout) \ + ITEM(CreatePipelineLayout) \ + ITEM(DestroyPipelineLayout) \ + ITEM(CreateRenderPass) \ + ITEM(DestroyRenderPass) \ + ITEM(GetPipelineExecutablePropertiesKHR) \ ITEM(GetPipelineExecutableInternalRepresentationsKHR) #define ITEM(n) PFN_vk##n n; FUNCTION_LIST #undef ITEM -void create_program(enum amd_gfx_level gfx_level, Stage stage, unsigned wave_size, enum radeon_family family) +void +create_program(enum amd_gfx_level gfx_level, Stage stage, unsigned wave_size, + enum radeon_family family) { memset(&config, 0, sizeof(config)); info.wave_size = wave_size; @@ -90,7 +93,7 @@ void create_program(enum amd_gfx_level gfx_level, Stage stage, unsigned wave_siz program->debug.func = nullptr; program->debug.private_data = nullptr; - Block *block = program->create_and_insert_block(); + Block* block = program->create_and_insert_block(); block->kind = block_kind_top_level; bld = Builder(program.get(), &program->blocks[0]); @@ -98,9 +101,9 @@ void create_program(enum amd_gfx_level gfx_level, Stage stage, unsigned wave_siz config.float_mode = program->blocks[0].fp_mode.val; } -bool setup_cs(const char *input_spec, enum amd_gfx_level gfx_level, - enum radeon_family family, const char* subvariant, - unsigned wave_size) +bool +setup_cs(const char* input_spec, enum amd_gfx_level gfx_level, enum radeon_family family, + const char* subvariant, unsigned wave_size) { if (!set_variant(gfx_level, subvariant)) return false; @@ -117,7 +120,8 @@ bool setup_cs(const char *input_spec, enum amd_gfx_level gfx_level, input_classes.push_back(RegClass::get(type, size * (in_bytes ? 1 : 4))); input_spec += 2 + in_bytes; - while (input_spec[0] == ' ') input_spec++; + while (input_spec[0] == ' ') + input_spec++; } aco_ptr startpgm{create_instruction( @@ -132,7 +136,8 @@ bool setup_cs(const char *input_spec, enum amd_gfx_level gfx_level, return true; } -void finish_program(Program *prog) +void +finish_program(Program* prog) { for (Block& BB : prog->blocks) { for (unsigned idx : BB.linear_preds) @@ -149,7 +154,8 @@ void finish_program(Program *prog) } } -void finish_validator_test() +void +finish_validator_test() { finish_program(program.get()); aco_print_program(program.get(), output); @@ -160,7 +166,8 @@ void finish_validator_test() fprintf(output, "Validation failed\n"); } -void finish_opt_test() +void +finish_opt_test() { finish_program(program.get()); if (!aco::validate_ir(program.get())) { @@ -175,7 +182,8 @@ void finish_opt_test() aco_print_program(program.get(), output); } -void finish_setup_reduce_temp_test() +void +finish_setup_reduce_temp_test() { finish_program(program.get()); if (!aco::validate_ir(program.get())) { @@ -190,7 +198,8 @@ void finish_setup_reduce_temp_test() aco_print_program(program.get(), output); } -void finish_ra_test(ra_test_policy policy, bool lower) +void +finish_ra_test(ra_test_policy policy, bool lower) { finish_program(program.get()); if (!aco::validate_ir(program.get())) { @@ -215,42 +224,48 @@ void finish_ra_test(ra_test_policy policy, bool lower) aco_print_program(program.get(), output); } -void finish_optimizer_postRA_test() +void +finish_optimizer_postRA_test() { finish_program(program.get()); aco::optimize_postRA(program.get()); aco_print_program(program.get(), output); } -void finish_to_hw_instr_test() +void +finish_to_hw_instr_test() { finish_program(program.get()); aco::lower_to_hw_instr(program.get()); aco_print_program(program.get(), output); } -void finish_waitcnt_test() +void +finish_waitcnt_test() { finish_program(program.get()); aco::insert_wait_states(program.get()); aco_print_program(program.get(), output); } -void finish_insert_nops_test() +void +finish_insert_nops_test() { finish_program(program.get()); aco::insert_NOPs(program.get()); aco_print_program(program.get(), output); } -void finish_form_hard_clause_test() +void +finish_form_hard_clause_test() { finish_program(program.get()); aco::form_hard_clauses(program.get()); aco_print_program(program.get(), output); } -void finish_assembler_test() +void +finish_assembler_test() { finish_program(program.get()); std::vector binary; @@ -261,13 +276,14 @@ void finish_assembler_test() if (program->gfx_level >= GFX8) { print_asm(program.get(), binary, exec_size / 4u, output); } else { - //TODO: maybe we should use CLRX and skip this test if it's not available? + // TODO: maybe we should use CLRX and skip this test if it's not available? for (uint32_t dword : binary) fprintf(output, "%.8x\n", dword); } } -void writeout(unsigned i, Temp tmp) +void +writeout(unsigned i, Temp tmp) { if (tmp.id()) bld.pseudo(aco_opcode::p_unit_test, Operand::c32(i), tmp); @@ -275,22 +291,26 @@ void writeout(unsigned i, Temp tmp) bld.pseudo(aco_opcode::p_unit_test, Operand::c32(i)); } -void writeout(unsigned i, aco::Builder::Result res) +void +writeout(unsigned i, aco::Builder::Result res) { bld.pseudo(aco_opcode::p_unit_test, Operand::c32(i), res); } -void writeout(unsigned i, Operand op) +void +writeout(unsigned i, Operand op) { bld.pseudo(aco_opcode::p_unit_test, Operand::c32(i), op); } -void writeout(unsigned i, Operand op0, Operand op1) +void +writeout(unsigned i, Operand op0, Operand op1) { bld.pseudo(aco_opcode::p_unit_test, Operand::c32(i), op0, op1); } -Temp fneg(Temp src, Builder b) +Temp +fneg(Temp src, Builder b) { if (src.bytes() == 2) return b.vop2(aco_opcode::v_mul_f16, b.def(v2b), Operand::c16(0xbc00u), src); @@ -298,35 +318,42 @@ Temp fneg(Temp src, Builder b) return b.vop2(aco_opcode::v_mul_f32, b.def(v1), Operand::c32(0xbf800000u), src); } -Temp fabs(Temp src, Builder b) +Temp +fabs(Temp src, Builder b) { if (src.bytes() == 2) { - Builder::Result res = b.vop2_e64(aco_opcode::v_mul_f16, b.def(v2b), Operand::c16(0x3c00), src); + Builder::Result res = + b.vop2_e64(aco_opcode::v_mul_f16, b.def(v2b), Operand::c16(0x3c00), src); res->valu().abs[1] = true; return res; } else { - Builder::Result res = b.vop2_e64(aco_opcode::v_mul_f32, b.def(v1), Operand::c32(0x3f800000u), src); + Builder::Result res = + b.vop2_e64(aco_opcode::v_mul_f32, b.def(v1), Operand::c32(0x3f800000u), src); res->valu().abs[1] = true; return res; } } -Temp f2f32(Temp src, Builder b) +Temp +f2f32(Temp src, Builder b) { return b.vop1(aco_opcode::v_cvt_f32_f16, b.def(v1), src); } -Temp f2f16(Temp src, Builder b) +Temp +f2f16(Temp src, Builder b) { return b.vop1(aco_opcode::v_cvt_f16_f32, b.def(v2b), src); } -Temp u2u16(Temp src, Builder b) +Temp +u2u16(Temp src, Builder b) { return b.pseudo(aco_opcode::p_extract_vector, b.def(v2b), src, Operand::zero()); } -Temp fadd(Temp src0, Temp src1, Builder b) +Temp +fadd(Temp src0, Temp src1, Builder b) { if (src0.bytes() == 2) return b.vop2(aco_opcode::v_add_f16, b.def(v2b), src0, src1); @@ -334,7 +361,8 @@ Temp fadd(Temp src0, Temp src1, Builder b) return b.vop2(aco_opcode::v_add_f32, b.def(v1), src0, src1); } -Temp fmul(Temp src0, Temp src1, Builder b) +Temp +fmul(Temp src0, Temp src1, Builder b) { if (src0.bytes() == 2) return b.vop2(aco_opcode::v_mul_f16, b.def(v2b), src0, src1); @@ -342,7 +370,8 @@ Temp fmul(Temp src0, Temp src1, Builder b) return b.vop2(aco_opcode::v_mul_f32, b.def(v1), src0, src1); } -Temp fma(Temp src0, Temp src1, Temp src2, Builder b) +Temp +fma(Temp src0, Temp src1, Temp src2, Builder b) { if (src0.bytes() == 2) return b.vop3(aco_opcode::v_fma_f16, b.def(v2b), src0, src1, src2); @@ -350,40 +379,46 @@ Temp fma(Temp src0, Temp src1, Temp src2, Builder b) return b.vop3(aco_opcode::v_fma_f32, b.def(v1), src0, src1, src2); } -Temp fsat(Temp src, Builder b) +Temp +fsat(Temp src, Builder b) { if (src.bytes() == 2) - return b.vop3(aco_opcode::v_med3_f16, b.def(v2b), Operand::c16(0u), - Operand::c16(0x3c00u), src); + return b.vop3(aco_opcode::v_med3_f16, b.def(v2b), Operand::c16(0u), Operand::c16(0x3c00u), + src); else - return b.vop3(aco_opcode::v_med3_f32, b.def(v1), Operand::zero(), - Operand::c32(0x3f800000u), src); + return b.vop3(aco_opcode::v_med3_f32, b.def(v1), Operand::zero(), Operand::c32(0x3f800000u), + src); } -Temp fmin(Temp src0, Temp src1, Builder b) +Temp +fmin(Temp src0, Temp src1, Builder b) { return b.vop2(aco_opcode::v_min_f32, b.def(v1), src0, src1); } -Temp fmax(Temp src0, Temp src1, Builder b) +Temp +fmax(Temp src0, Temp src1, Builder b) { return b.vop2(aco_opcode::v_max_f32, b.def(v1), src0, src1); } -Temp ext_ushort(Temp src, unsigned idx, Builder b) +Temp +ext_ushort(Temp src, unsigned idx, Builder b) { return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx), Operand::c32(16u), Operand::c32(false)); } -Temp ext_ubyte(Temp src, unsigned idx, Builder b) +Temp +ext_ubyte(Temp src, unsigned idx, Builder b) { return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx), Operand::c32(8u), Operand::c32(false)); } -void emit_divergent_if_else(Program* prog, aco::Builder& b, Operand cond, std::function then, - std::function els) +void +emit_divergent_if_else(Program* prog, aco::Builder& b, Operand cond, std::function then, + std::function els) { prog->blocks.reserve(prog->blocks.size() + 6); @@ -418,8 +453,10 @@ void emit_divergent_if_else(Program* prog, aco::Builder& b, Operand cond, std::f PhysReg saved_exec_reg(84); b.reset(if_block); - Temp saved_exec = b.sop1(Builder::s_and_saveexec, b.def(b.lm, saved_exec_reg), Definition(scc, s1), Definition(exec, b.lm), cond, Operand(exec, b.lm)); - b.branch(aco_opcode::p_cbranch_nz, Definition(vcc, bld.lm), then_logical->index, then_linear->index); + Temp saved_exec = b.sop1(Builder::s_and_saveexec, b.def(b.lm, saved_exec_reg), + Definition(scc, s1), Definition(exec, b.lm), cond, Operand(exec, b.lm)); + b.branch(aco_opcode::p_cbranch_nz, Definition(vcc, bld.lm), then_logical->index, + then_linear->index); b.reset(then_logical); b.pseudo(aco_opcode::p_logical_start); @@ -431,8 +468,10 @@ void emit_divergent_if_else(Program* prog, aco::Builder& b, Operand cond, std::f b.branch(aco_opcode::p_branch, Definition(vcc, bld.lm), invert->index); b.reset(invert); - b.sop2(Builder::s_andn2, Definition(exec, bld.lm), Definition(scc, s1), Operand(saved_exec, saved_exec_reg), Operand(exec, bld.lm)); - b.branch(aco_opcode::p_cbranch_nz, Definition(vcc, bld.lm), else_logical->index, else_linear->index); + b.sop2(Builder::s_andn2, Definition(exec, bld.lm), Definition(scc, s1), + Operand(saved_exec, saved_exec_reg), Operand(exec, bld.lm)); + b.branch(aco_opcode::p_cbranch_nz, Definition(vcc, bld.lm), else_logical->index, + else_linear->index); b.reset(else_logical); b.pseudo(aco_opcode::p_logical_start); @@ -444,42 +483,29 @@ void emit_divergent_if_else(Program* prog, aco::Builder& b, Operand cond, std::f b.branch(aco_opcode::p_branch, Definition(vcc, bld.lm), endif_block->index); b.reset(endif_block); - b.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), Operand(saved_exec, saved_exec_reg)); + b.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), + Operand(saved_exec, saved_exec_reg)); } -VkDevice get_vk_device(enum amd_gfx_level gfx_level) +VkDevice +get_vk_device(enum amd_gfx_level gfx_level) { enum radeon_family family; switch (gfx_level) { - case GFX6: - family = CHIP_TAHITI; - break; - case GFX7: - family = CHIP_BONAIRE; - break; - case GFX8: - family = CHIP_POLARIS10; - break; - case GFX9: - family = CHIP_VEGA10; - break; - case GFX10: - family = CHIP_NAVI10; - break; - case GFX10_3: - family = CHIP_NAVI21; - break; - case GFX11: - family = CHIP_GFX1100; - break; - default: - family = CHIP_UNKNOWN; - break; + case GFX6: family = CHIP_TAHITI; break; + case GFX7: family = CHIP_BONAIRE; break; + case GFX8: family = CHIP_POLARIS10; break; + case GFX9: family = CHIP_VEGA10; break; + case GFX10: family = CHIP_NAVI10; break; + case GFX10_3: family = CHIP_NAVI21; break; + case GFX11: family = CHIP_GFX1100; break; + default: family = CHIP_UNKNOWN; break; } return get_vk_device(family); } -VkDevice get_vk_device(enum radeon_family family) +VkDevice +get_vk_device(enum radeon_family family) { assert(family != CHIP_UNKNOWN); @@ -496,12 +522,13 @@ VkDevice get_vk_device(enum radeon_family family) VkInstanceCreateInfo instance_create_info = {}; instance_create_info.pApplicationInfo = &app_info; instance_create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - ASSERTED VkResult result = ((PFN_vkCreateInstance)vk_icdGetInstanceProcAddr(NULL, "vkCreateInstance"))(&instance_create_info, NULL, &instance_cache[family]); + ASSERTED VkResult result = ((PFN_vkCreateInstance)vk_icdGetInstanceProcAddr( + NULL, "vkCreateInstance"))(&instance_create_info, NULL, &instance_cache[family]); assert(result == VK_SUCCESS); - #define ITEM(n) n = (PFN_vk##n)vk_icdGetInstanceProcAddr(instance_cache[family], "vk" #n); +#define ITEM(n) n = (PFN_vk##n)vk_icdGetInstanceProcAddr(instance_cache[family], "vk" #n); FUNCTION_LIST - #undef ITEM +#undef ITEM uint32_t device_count = 1; VkPhysicalDevice device = VK_NULL_HANDLE; @@ -511,7 +538,7 @@ VkDevice get_vk_device(enum radeon_family family) VkDeviceCreateInfo device_create_info = {}; device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - static const char *extensions[] = {"VK_KHR_pipeline_executable_properties"}; + static const char* extensions[] = {"VK_KHR_pipeline_executable_properties"}; device_create_info.enabledExtensionCount = sizeof(extensions) / sizeof(extensions[0]); device_create_info.ppEnabledExtensionNames = extensions; result = CreateDevice(device, &device_create_info, NULL, &device_cache[family]); @@ -520,7 +547,8 @@ VkDevice get_vk_device(enum radeon_family family) } static struct DestroyDevices { - ~DestroyDevices() { + ~DestroyDevices() + { for (unsigned i = 0; i < CHIP_LAST; i++) { if (!device_cache[i]) continue; @@ -530,8 +558,9 @@ static struct DestroyDevices { } } destroy_devices; -void print_pipeline_ir(VkDevice device, VkPipeline pipeline, VkShaderStageFlagBits stages, - const char *name, bool remove_encoding) +void +print_pipeline_ir(VkDevice device, VkPipeline pipeline, VkShaderStageFlagBits stages, + const char* name, bool remove_encoding) { uint32_t executable_count = 16; VkPipelineExecutablePropertiesKHR executables[16]; @@ -539,7 +568,8 @@ void print_pipeline_ir(VkDevice device, VkPipeline pipeline, VkShaderStageFlagBi pipeline_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR; pipeline_info.pNext = NULL; pipeline_info.pipeline = pipeline; - ASSERTED VkResult result = GetPipelineExecutablePropertiesKHR(device, &pipeline_info, &executable_count, executables); + ASSERTED VkResult result = + GetPipelineExecutablePropertiesKHR(device, &pipeline_info, &executable_count, executables); assert(result == VK_SUCCESS); uint32_t executable = 0; @@ -570,13 +600,13 @@ void print_pipeline_ir(VkDevice device, VkPipeline pipeline, VkShaderStageFlagBi } assert(requested_ir && "Could not find requested IR"); - char *data = (char*)malloc(requested_ir->dataSize); + char* data = (char*)malloc(requested_ir->dataSize); requested_ir->pData = data; result = GetPipelineExecutableInternalRepresentationsKHR(device, &exec_info, &ir_count, ir); assert(result == VK_SUCCESS); if (remove_encoding) { - for (char *c = data; *c; c++) { + for (char* c = data; *c; c++) { if (*c == ';') { for (; *c && *c != '\n'; c++) *c = ' '; @@ -588,23 +618,25 @@ void print_pipeline_ir(VkDevice device, VkPipeline pipeline, VkShaderStageFlagBi free(data); } -VkShaderModule __qoCreateShaderModule(VkDevice dev, const QoShaderModuleCreateInfo *module_info) +VkShaderModule +__qoCreateShaderModule(VkDevice dev, const QoShaderModuleCreateInfo* module_info) { - VkShaderModuleCreateInfo vk_module_info; - vk_module_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - vk_module_info.pNext = NULL; - vk_module_info.flags = 0; - vk_module_info.codeSize = module_info->spirvSize; - vk_module_info.pCode = (const uint32_t*)module_info->pSpirv; + VkShaderModuleCreateInfo vk_module_info; + vk_module_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + vk_module_info.pNext = NULL; + vk_module_info.flags = 0; + vk_module_info.codeSize = module_info->spirvSize; + vk_module_info.pCode = (const uint32_t*)module_info->pSpirv; - VkShaderModule module; - ASSERTED VkResult result = CreateShaderModule(dev, &vk_module_info, NULL, &module); - assert(result == VK_SUCCESS); + VkShaderModule module; + ASSERTED VkResult result = CreateShaderModule(dev, &vk_module_info, NULL, &module); + assert(result == VK_SUCCESS); - return module; + return module; } -PipelineBuilder::PipelineBuilder(VkDevice dev) { +PipelineBuilder::PipelineBuilder(VkDevice dev) +{ memset(this, 0, sizeof(*this)); topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; device = dev; @@ -615,7 +647,7 @@ PipelineBuilder::~PipelineBuilder() DestroyPipeline(device, pipeline, NULL); for (unsigned i = 0; i < (is_compute() ? 1 : gfx_pipeline_info.stageCount); i++) { - VkPipelineShaderStageCreateInfo *stage_info = &stages[i]; + VkPipelineShaderStageCreateInfo* stage_info = &stages[i]; if (owned_stages & stage_info->stage) DestroyShaderModule(device, stage_info->module, NULL); } @@ -628,72 +660,87 @@ PipelineBuilder::~PipelineBuilder() DestroyRenderPass(device, render_pass, NULL); } -void PipelineBuilder::add_desc_binding(VkShaderStageFlags stage_flags, uint32_t layout, - uint32_t binding, VkDescriptorType type, uint32_t count) +void +PipelineBuilder::add_desc_binding(VkShaderStageFlags stage_flags, uint32_t layout, uint32_t binding, + VkDescriptorType type, uint32_t count) { desc_layouts_used |= 1ull << layout; desc_bindings[layout][num_desc_bindings[layout]++] = {binding, type, count, stage_flags, NULL}; } -void PipelineBuilder::add_vertex_binding(uint32_t binding, uint32_t stride, VkVertexInputRate rate) +void +PipelineBuilder::add_vertex_binding(uint32_t binding, uint32_t stride, VkVertexInputRate rate) { vs_bindings[vs_input.vertexBindingDescriptionCount++] = {binding, stride, rate}; } -void PipelineBuilder::add_vertex_attribute(uint32_t location, uint32_t binding, VkFormat format, uint32_t offset) +void +PipelineBuilder::add_vertex_attribute(uint32_t location, uint32_t binding, VkFormat format, + uint32_t offset) { vs_attributes[vs_input.vertexAttributeDescriptionCount++] = {location, binding, format, offset}; } -void PipelineBuilder::add_resource_decls(QoShaderModuleCreateInfo *module) +void +PipelineBuilder::add_resource_decls(QoShaderModuleCreateInfo* module) { for (unsigned i = 0; i < module->declarationCount; i++) { - const QoShaderDecl *decl = &module->pDeclarations[i]; + const QoShaderDecl* decl = &module->pDeclarations[i]; switch (decl->decl_type) { case QoShaderDeclType_ubo: - add_desc_binding(module->stage, decl->set, decl->binding, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + add_desc_binding(module->stage, decl->set, decl->binding, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); break; case QoShaderDeclType_ssbo: - add_desc_binding(module->stage, decl->set, decl->binding, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + add_desc_binding(module->stage, decl->set, decl->binding, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); break; case QoShaderDeclType_img_buf: - add_desc_binding(module->stage, decl->set, decl->binding, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER); + add_desc_binding(module->stage, decl->set, decl->binding, + VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER); break; case QoShaderDeclType_img: - add_desc_binding(module->stage, decl->set, decl->binding, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); + add_desc_binding(module->stage, decl->set, decl->binding, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); break; case QoShaderDeclType_tex_buf: - add_desc_binding(module->stage, decl->set, decl->binding, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER); + add_desc_binding(module->stage, decl->set, decl->binding, + VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER); break; case QoShaderDeclType_combined: - add_desc_binding(module->stage, decl->set, decl->binding, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); + add_desc_binding(module->stage, decl->set, decl->binding, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); break; case QoShaderDeclType_tex: - add_desc_binding(module->stage, decl->set, decl->binding, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE); + add_desc_binding(module->stage, decl->set, decl->binding, + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE); break; case QoShaderDeclType_samp: add_desc_binding(module->stage, decl->set, decl->binding, VK_DESCRIPTOR_TYPE_SAMPLER); break; - default: - break; + default: break; } } } -void PipelineBuilder::add_io_decls(QoShaderModuleCreateInfo *module) +void +PipelineBuilder::add_io_decls(QoShaderModuleCreateInfo* module) { unsigned next_vtx_offset = 0; for (unsigned i = 0; i < module->declarationCount; i++) { - const QoShaderDecl *decl = &module->pDeclarations[i]; + const QoShaderDecl* decl = &module->pDeclarations[i]; switch (decl->decl_type) { case QoShaderDeclType_in: if (module->stage == VK_SHADER_STAGE_VERTEX_BIT) { if (!strcmp(decl->type, "float") || decl->type[0] == 'v') - add_vertex_attribute(decl->location, 0, VK_FORMAT_R32G32B32A32_SFLOAT, next_vtx_offset); + add_vertex_attribute(decl->location, 0, VK_FORMAT_R32G32B32A32_SFLOAT, + next_vtx_offset); else if (decl->type[0] == 'u') - add_vertex_attribute(decl->location, 0, VK_FORMAT_R32G32B32A32_UINT, next_vtx_offset); + add_vertex_attribute(decl->location, 0, VK_FORMAT_R32G32B32A32_UINT, + next_vtx_offset); else if (decl->type[0] == 'i') - add_vertex_attribute(decl->location, 0, VK_FORMAT_R32G32B32A32_SINT, next_vtx_offset); + add_vertex_attribute(decl->location, 0, VK_FORMAT_R32G32B32A32_SINT, + next_vtx_offset); next_vtx_offset += 16; } break; @@ -707,17 +754,17 @@ void PipelineBuilder::add_io_decls(QoShaderModuleCreateInfo *module) color_outputs[decl->location] = VK_FORMAT_R32G32B32A32_SINT; } break; - default: - break; + default: break; } } if (next_vtx_offset) add_vertex_binding(0, next_vtx_offset); } -void PipelineBuilder::add_stage(VkShaderStageFlagBits stage, VkShaderModule module, const char *name) +void +PipelineBuilder::add_stage(VkShaderStageFlagBits stage, VkShaderModule module, const char* name) { - VkPipelineShaderStageCreateInfo *stage_info; + VkPipelineShaderStageCreateInfo* stage_info; if (stage == VK_SHADER_STAGE_COMPUTE_BIT) stage_info = &stages[0]; else @@ -732,40 +779,50 @@ void PipelineBuilder::add_stage(VkShaderStageFlagBits stage, VkShaderModule modu owned_stages |= stage; } -void PipelineBuilder::add_stage(VkShaderStageFlagBits stage, QoShaderModuleCreateInfo module, const char *name) +void +PipelineBuilder::add_stage(VkShaderStageFlagBits stage, QoShaderModuleCreateInfo module, + const char* name) { add_stage(stage, __qoCreateShaderModule(device, &module), name); add_resource_decls(&module); add_io_decls(&module); } -void PipelineBuilder::add_vsfs(VkShaderModule vs, VkShaderModule fs) +void +PipelineBuilder::add_vsfs(VkShaderModule vs, VkShaderModule fs) { add_stage(VK_SHADER_STAGE_VERTEX_BIT, vs); add_stage(VK_SHADER_STAGE_FRAGMENT_BIT, fs); } -void PipelineBuilder::add_vsfs(QoShaderModuleCreateInfo vs, QoShaderModuleCreateInfo fs) +void +PipelineBuilder::add_vsfs(QoShaderModuleCreateInfo vs, QoShaderModuleCreateInfo fs) { add_stage(VK_SHADER_STAGE_VERTEX_BIT, vs); add_stage(VK_SHADER_STAGE_FRAGMENT_BIT, fs); } -void PipelineBuilder::add_cs(VkShaderModule cs) +void +PipelineBuilder::add_cs(VkShaderModule cs) { add_stage(VK_SHADER_STAGE_COMPUTE_BIT, cs); } -void PipelineBuilder::add_cs(QoShaderModuleCreateInfo cs) +void +PipelineBuilder::add_cs(QoShaderModuleCreateInfo cs) { add_stage(VK_SHADER_STAGE_COMPUTE_BIT, cs); } -bool PipelineBuilder::is_compute() { +bool +PipelineBuilder::is_compute() +{ return gfx_pipeline_info.stageCount == 0; } -void PipelineBuilder::create_compute_pipeline() { +void +PipelineBuilder::create_compute_pipeline() +{ VkComputePipelineCreateInfo create_info; create_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; create_info.pNext = NULL; @@ -775,11 +832,14 @@ void PipelineBuilder::create_compute_pipeline() { create_info.basePipelineHandle = VK_NULL_HANDLE; create_info.basePipelineIndex = 0; - ASSERTED VkResult result = CreateComputePipelines(device, VK_NULL_HANDLE, 1, &create_info, NULL, &pipeline); + ASSERTED VkResult result = + CreateComputePipelines(device, VK_NULL_HANDLE, 1, &create_info, NULL, &pipeline); assert(result == VK_SUCCESS); } -void PipelineBuilder::create_graphics_pipeline() { +void +PipelineBuilder::create_graphics_pipeline() +{ /* create the create infos */ if (!samples) samples = VK_SAMPLE_COUNT_1_BIT; @@ -792,7 +852,7 @@ void PipelineBuilder::create_graphics_pipeline() { if (color_outputs[i] == VK_FORMAT_UNDEFINED) continue; - VkAttachmentDescription *desc = &attachment_descs[num_color_attachments]; + VkAttachmentDescription* desc = &attachment_descs[num_color_attachments]; desc->flags = 0; desc->format = color_outputs[i]; desc->samples = samples; @@ -803,16 +863,14 @@ void PipelineBuilder::create_graphics_pipeline() { desc->initialLayout = VK_IMAGE_LAYOUT_GENERAL; desc->finalLayout = VK_IMAGE_LAYOUT_GENERAL; - VkAttachmentReference *ref = &color_attachments[num_color_attachments]; + VkAttachmentReference* ref = &color_attachments[num_color_attachments]; ref->attachment = num_color_attachments; ref->layout = VK_IMAGE_LAYOUT_GENERAL; - VkPipelineColorBlendAttachmentState *blend = &blend_attachment_states[num_color_attachments]; + VkPipelineColorBlendAttachmentState* blend = &blend_attachment_states[num_color_attachments]; blend->blendEnable = false; - blend->colorWriteMask = VK_COLOR_COMPONENT_R_BIT | - VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | - VK_COLOR_COMPONENT_A_BIT; + blend->colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; num_color_attachments++; } @@ -820,7 +878,7 @@ void PipelineBuilder::create_graphics_pipeline() { unsigned num_attachments = num_color_attachments; VkAttachmentReference ds_attachment; if (ds_output != VK_FORMAT_UNDEFINED) { - VkAttachmentDescription *desc = &attachment_descs[num_attachments]; + VkAttachmentDescription* desc = &attachment_descs[num_attachments]; desc->flags = 0; desc->format = ds_output; desc->samples = samples; @@ -902,8 +960,7 @@ void PipelineBuilder::create_graphics_pipeline() { ds_state.front.passOp = VK_STENCIL_OP_REPLACE; ds_state.front.depthFailOp = VK_STENCIL_OP_REPLACE; ds_state.front.compareOp = VK_COMPARE_OP_ALWAYS; - ds_state.front.compareMask = 0xffffffff, - ds_state.front.writeMask = 0; + ds_state.front.compareMask = 0xffffffff, ds_state.front.writeMask = 0; ds_state.front.reference = 0; ds_state.back = ds_state.front; @@ -915,17 +972,15 @@ void PipelineBuilder::create_graphics_pipeline() { color_blend_state.attachmentCount = num_color_attachments; color_blend_state.pAttachments = blend_attachment_states; - VkDynamicState dynamic_states[9] = { - VK_DYNAMIC_STATE_VIEWPORT, - VK_DYNAMIC_STATE_SCISSOR, - VK_DYNAMIC_STATE_LINE_WIDTH, - VK_DYNAMIC_STATE_DEPTH_BIAS, - VK_DYNAMIC_STATE_BLEND_CONSTANTS, - VK_DYNAMIC_STATE_DEPTH_BOUNDS, - VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, - VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, - VK_DYNAMIC_STATE_STENCIL_REFERENCE - }; + VkDynamicState dynamic_states[9] = {VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + VK_DYNAMIC_STATE_LINE_WIDTH, + VK_DYNAMIC_STATE_DEPTH_BIAS, + VK_DYNAMIC_STATE_BLEND_CONSTANTS, + VK_DYNAMIC_STATE_DEPTH_BOUNDS, + VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, + VK_DYNAMIC_STATE_STENCIL_REFERENCE}; VkPipelineDynamicStateCreateInfo dynamic_state; dynamic_state.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; @@ -985,7 +1040,9 @@ void PipelineBuilder::create_graphics_pipeline() { assert(result == VK_SUCCESS); } -void PipelineBuilder::create_pipeline() { +void +PipelineBuilder::create_pipeline() +{ unsigned num_desc_layouts = 0; for (unsigned i = 0; i < 64; i++) { if (!(desc_layouts_used & (1ull << i))) @@ -998,7 +1055,8 @@ void PipelineBuilder::create_pipeline() { desc_layout_info.bindingCount = num_desc_bindings[i]; desc_layout_info.pBindings = desc_bindings[i]; - ASSERTED VkResult result = CreateDescriptorSetLayout(device, &desc_layout_info, NULL, &desc_layouts[num_desc_layouts]); + ASSERTED VkResult result = CreateDescriptorSetLayout(device, &desc_layout_info, NULL, + &desc_layouts[num_desc_layouts]); assert(result == VK_SUCCESS); num_desc_layouts++; } @@ -1012,7 +1070,8 @@ void PipelineBuilder::create_pipeline() { pipeline_layout_info.setLayoutCount = num_desc_layouts; pipeline_layout_info.pSetLayouts = desc_layouts; - ASSERTED VkResult result = CreatePipelineLayout(device, &pipeline_layout_info, NULL, &pipeline_layout); + ASSERTED VkResult result = + CreatePipelineLayout(device, &pipeline_layout_info, NULL, &pipeline_layout); assert(result == VK_SUCCESS); if (is_compute()) @@ -1021,7 +1080,8 @@ void PipelineBuilder::create_pipeline() { create_graphics_pipeline(); } -void PipelineBuilder::print_ir(VkShaderStageFlagBits stage_flags, const char *name, bool remove_encoding) +void +PipelineBuilder::print_ir(VkShaderStageFlagBits stage_flags, const char* name, bool remove_encoding) { if (!pipeline) create_pipeline(); diff --git a/src/amd/compiler/tests/helpers.h b/src/amd/compiler/tests/helpers.h index c014369bf61..eb035e0ca05 100644 --- a/src/amd/compiler/tests/helpers.h +++ b/src/amd/compiler/tests/helpers.h @@ -24,8 +24,9 @@ #ifndef ACO_TEST_HELPERS_H #define ACO_TEST_HELPERS_H -#include "framework.h" #include "vulkan/vulkan.h" + +#include "framework.h" #include enum QoShaderDeclType { @@ -42,10 +43,10 @@ enum QoShaderDeclType { }; struct QoShaderDecl { - const char *name; - const char *type; + const char* name; + const char* type; QoShaderDeclType decl_type; - //TODO: array size? + // TODO: array size? unsigned location; unsigned component; unsigned binding; @@ -53,12 +54,12 @@ struct QoShaderDecl { }; struct QoShaderModuleCreateInfo { - void *pNext; - size_t spirvSize; - const void *pSpirv; - uint32_t declarationCount; - const QoShaderDecl *pDeclarations; - VkShaderStageFlagBits stage; + void* pNext; + size_t spirvSize; + const void* pSpirv; + uint32_t declarationCount; + const QoShaderDecl* pDeclarations; + VkShaderStageFlagBits stage; }; extern ac_shader_config config; @@ -71,17 +72,17 @@ namespace aco { struct ra_test_policy; } -void create_program(enum amd_gfx_level gfx_level, aco::Stage stage, - unsigned wave_size=64, enum radeon_family family=CHIP_UNKNOWN); -bool setup_cs(const char *input_spec, enum amd_gfx_level gfx_level, - enum radeon_family family=CHIP_UNKNOWN, const char* subvariant = "", - unsigned wave_size=64); +void create_program(enum amd_gfx_level gfx_level, aco::Stage stage, unsigned wave_size = 64, + enum radeon_family family = CHIP_UNKNOWN); +bool setup_cs(const char* input_spec, enum amd_gfx_level gfx_level, + enum radeon_family family = CHIP_UNKNOWN, const char* subvariant = "", + unsigned wave_size = 64); -void finish_program(aco::Program *program); +void finish_program(aco::Program* program); void finish_validator_test(); void finish_opt_test(); void finish_setup_reduce_temp_test(); -void finish_ra_test(aco::ra_test_policy, bool lower=false); +void finish_ra_test(aco::ra_test_policy, bool lower = false); void finish_optimizer_postRA_test(); void finish_to_hw_instr_test(); void finish_waitcnt_test(); @@ -89,35 +90,35 @@ void finish_insert_nops_test(); void finish_form_hard_clause_test(); void finish_assembler_test(); -void writeout(unsigned i, aco::Temp tmp=aco::Temp(0, aco::s1)); +void writeout(unsigned i, aco::Temp tmp = aco::Temp(0, aco::s1)); void writeout(unsigned i, aco::Builder::Result res); void writeout(unsigned i, aco::Operand op); void writeout(unsigned i, aco::Operand op0, aco::Operand op1); -aco::Temp fneg(aco::Temp src, aco::Builder b=bld); -aco::Temp fabs(aco::Temp src, aco::Builder b=bld); -aco::Temp f2f32(aco::Temp src, aco::Builder b=bld); -aco::Temp f2f16(aco::Temp src, aco::Builder b=bld); -aco::Temp u2u16(aco::Temp src, aco::Builder b=bld); -aco::Temp fadd(aco::Temp src0, aco::Temp src1, aco::Builder b=bld); -aco::Temp fmul(aco::Temp src0, aco::Temp src1, aco::Builder b=bld); -aco::Temp fma(aco::Temp src0, aco::Temp src1, aco::Temp src2, aco::Builder b=bld); -aco::Temp fsat(aco::Temp src, aco::Builder b=bld); -aco::Temp fmin(aco::Temp src0, aco::Temp src1, aco::Builder b=bld); -aco::Temp fmax(aco::Temp src0, aco::Temp src1, aco::Builder b=bld); -aco::Temp ext_ushort(aco::Temp src, unsigned idx, aco::Builder b=bld); -aco::Temp ext_ubyte(aco::Temp src, unsigned idx, aco::Builder b=bld); -void emit_divergent_if_else(aco::Program* prog, aco::Builder& b, aco::Operand cond, std::function then, - std::function els); +aco::Temp fneg(aco::Temp src, aco::Builder b = bld); +aco::Temp fabs(aco::Temp src, aco::Builder b = bld); +aco::Temp f2f32(aco::Temp src, aco::Builder b = bld); +aco::Temp f2f16(aco::Temp src, aco::Builder b = bld); +aco::Temp u2u16(aco::Temp src, aco::Builder b = bld); +aco::Temp fadd(aco::Temp src0, aco::Temp src1, aco::Builder b = bld); +aco::Temp fmul(aco::Temp src0, aco::Temp src1, aco::Builder b = bld); +aco::Temp fma(aco::Temp src0, aco::Temp src1, aco::Temp src2, aco::Builder b = bld); +aco::Temp fsat(aco::Temp src, aco::Builder b = bld); +aco::Temp fmin(aco::Temp src0, aco::Temp src1, aco::Builder b = bld); +aco::Temp fmax(aco::Temp src0, aco::Temp src1, aco::Builder b = bld); +aco::Temp ext_ushort(aco::Temp src, unsigned idx, aco::Builder b = bld); +aco::Temp ext_ubyte(aco::Temp src, unsigned idx, aco::Builder b = bld); +void emit_divergent_if_else(aco::Program* prog, aco::Builder& b, aco::Operand cond, + std::function then, std::function els); /* vulkan helpers */ VkDevice get_vk_device(enum amd_gfx_level gfx_level); VkDevice get_vk_device(enum radeon_family family); void print_pipeline_ir(VkDevice device, VkPipeline pipeline, VkShaderStageFlagBits stages, - const char *name, bool remove_encoding=false); + const char* name, bool remove_encoding = false); -VkShaderModule __qoCreateShaderModule(VkDevice dev, const QoShaderModuleCreateInfo *info); +VkShaderModule __qoCreateShaderModule(VkDevice dev, const QoShaderModuleCreateInfo* info); class PipelineBuilder { public: @@ -152,19 +153,21 @@ public: ~PipelineBuilder(); PipelineBuilder(const PipelineBuilder&) = delete; - PipelineBuilder& operator = (const PipelineBuilder&) = delete; + PipelineBuilder& operator=(const PipelineBuilder&) = delete; - void add_desc_binding(VkShaderStageFlags stage_flags, uint32_t layout, - uint32_t binding, VkDescriptorType type, uint32_t count=1); + void add_desc_binding(VkShaderStageFlags stage_flags, uint32_t layout, uint32_t binding, + VkDescriptorType type, uint32_t count = 1); - void add_vertex_binding(uint32_t binding, uint32_t stride, VkVertexInputRate rate=VK_VERTEX_INPUT_RATE_VERTEX); + void add_vertex_binding(uint32_t binding, uint32_t stride, + VkVertexInputRate rate = VK_VERTEX_INPUT_RATE_VERTEX); void add_vertex_attribute(uint32_t location, uint32_t binding, VkFormat format, uint32_t offset); - void add_resource_decls(QoShaderModuleCreateInfo *module); - void add_io_decls(QoShaderModuleCreateInfo *module); + void add_resource_decls(QoShaderModuleCreateInfo* module); + void add_io_decls(QoShaderModuleCreateInfo* module); - void add_stage(VkShaderStageFlagBits stage, VkShaderModule module, const char *name="main"); - void add_stage(VkShaderStageFlagBits stage, QoShaderModuleCreateInfo module, const char *name="main"); + void add_stage(VkShaderStageFlagBits stage, VkShaderModule module, const char* name = "main"); + void add_stage(VkShaderStageFlagBits stage, QoShaderModuleCreateInfo module, + const char* name = "main"); void add_vsfs(VkShaderModule vs, VkShaderModule fs); void add_vsfs(QoShaderModuleCreateInfo vs, QoShaderModuleCreateInfo fs); void add_cs(VkShaderModule cs); @@ -174,7 +177,8 @@ public: void create_pipeline(); - void print_ir(VkShaderStageFlagBits stages, const char *name, bool remove_encoding=false); + void print_ir(VkShaderStageFlagBits stages, const char* name, bool remove_encoding = false); + private: void create_compute_pipeline(); void create_graphics_pipeline(); diff --git a/src/amd/compiler/tests/main.cpp b/src/amd/compiler/tests/main.cpp index 8f5e8ea914b..a714d3a5855 100644 --- a/src/amd/compiler/tests/main.cpp +++ b/src/amd/compiler/tests/main.cpp @@ -21,20 +21,22 @@ * IN THE SOFTWARE. * */ +#include "aco_ir.h" + +#include + +#include "framework.h" +#include #include #include -#include -#include +#include #include #include -#include +#include #include -#include -#include -#include "aco_ir.h" -#include "framework.h" +#include -static const char *help_message = +static const char* help_message = "Usage: %s [-h] [-l --list] [--no-check] [TEST [TEST ...]]\n" "\n" "Run ACO unit test(s). If TEST is not provided, all tests are run.\n" @@ -50,26 +52,27 @@ static const char *help_message = " --no-check Print test output instead of checking it.\n"; std::map tests; -FILE *output = NULL; +FILE* output = NULL; static TestDef current_test; static unsigned tests_written = 0; -static FILE *checker_stdin = NULL; -static char *checker_stdin_data = NULL; +static FILE* checker_stdin = NULL; +static char* checker_stdin_data = NULL; static size_t checker_stdin_size = 0; -static char *output_data = NULL; +static char* output_data = NULL; static size_t output_size = 0; static size_t output_offset = 0; static char current_variant[64] = {0}; -static std::set *variant_filter = NULL; +static std::set* variant_filter = NULL; bool test_failed = false; bool test_skipped = false; static char fail_message[256] = {0}; -void write_test() +void +write_test() { if (!checker_stdin) { /* not entirely correct, but shouldn't matter */ @@ -81,18 +84,18 @@ void write_test() if (output_offset == output_size && !test_skipped && !test_failed) return; - char *data = output_data + output_offset; + char* data = output_data + output_offset; uint32_t size = output_size - output_offset; fwrite("test", 1, 4, checker_stdin); - fwrite(current_test.name, 1, strlen(current_test.name)+1, checker_stdin); - fwrite(current_variant, 1, strlen(current_variant)+1, checker_stdin); - fwrite(current_test.source_file, 1, strlen(current_test.source_file)+1, checker_stdin); + fwrite(current_test.name, 1, strlen(current_test.name) + 1, checker_stdin); + fwrite(current_variant, 1, strlen(current_variant) + 1, checker_stdin); + fwrite(current_test.source_file, 1, strlen(current_test.source_file) + 1, checker_stdin); if (test_failed || test_skipped) { - const char *res = test_failed ? "failed" : "skipped"; + const char* res = test_failed ? "failed" : "skipped"; fwrite("\x01", 1, 1, checker_stdin); - fwrite(res, 1, strlen(res)+1, checker_stdin); - fwrite(fail_message, 1, strlen(fail_message)+1, checker_stdin); + fwrite(res, 1, strlen(res) + 1, checker_stdin); + fwrite(fail_message, 1, strlen(fail_message) + 1, checker_stdin); } else { fwrite("\x00", 1, 1, checker_stdin); } @@ -103,7 +106,8 @@ void write_test() output_offset += size; } -bool set_variant(const char *name) +bool +set_variant(const char* name) { if (variant_filter && !variant_filter->count(name)) return false; @@ -118,7 +122,8 @@ bool set_variant(const char *name) return true; } -void fail_test(const char *fmt, ...) +void +fail_test(const char* fmt, ...) { va_list args; va_start(args, fmt); @@ -129,7 +134,8 @@ void fail_test(const char *fmt, ...) va_end(args); } -void skip_test(const char *fmt, ...) +void +skip_test(const char* fmt, ...) { va_list args; va_start(args, fmt); @@ -140,7 +146,8 @@ void skip_test(const char *fmt, ...) va_end(args); } -void run_test(TestDef def) +void +run_test(TestDef def) { current_test = def; output_data = NULL; @@ -163,7 +170,8 @@ void run_test(TestDef def) free(output_data); } -int check_output(char **argv) +int +check_output(char** argv) { fflush(stdout); fflush(stderr); @@ -183,7 +191,8 @@ int check_output(char **argv) close(stdin_pipe[0]); close(stdin_pipe[1]); - execlp(ACO_TEST_PYTHON_BIN, ACO_TEST_PYTHON_BIN, ACO_TEST_SOURCE_DIR "/check_output.py", NULL); + execlp(ACO_TEST_PYTHON_BIN, ACO_TEST_PYTHON_BIN, ACO_TEST_SOURCE_DIR "/check_output.py", + NULL); fprintf(stderr, "%s: execlp() failed: %s\n", argv[0], strerror(errno)); return 99; } else { @@ -197,7 +206,8 @@ int check_output(char **argv) } } -bool match_test(std::string name, std::string pattern) +bool +match_test(std::string name, std::string pattern) { if (name.length() < pattern.length()) return false; @@ -206,33 +216,25 @@ bool match_test(std::string name, std::string pattern) return name == pattern; } -int main(int argc, char **argv) +int +main(int argc, char** argv) { int print_help = 0; int do_list = 0; int do_check = 1; - const struct option opts[] = { - { "help", no_argument, &print_help, 1 }, - { "list", no_argument, &do_list, 1 }, - { "no-check", no_argument, &do_check, 0 }, - { NULL, 0, NULL, 0 } - }; + const struct option opts[] = {{"help", no_argument, &print_help, 1}, + {"list", no_argument, &do_list, 1}, + {"no-check", no_argument, &do_check, 0}, + {NULL, 0, NULL, 0}}; int c; while ((c = getopt_long(argc, argv, "hl", opts, NULL)) != -1) { switch (c) { - case 'h': - print_help = 1; - break; - case 'l': - do_list = 1; - break; - case 0: - break; + case 'h': print_help = 1; break; + case 'l': do_list = 1; break; + case 0: break; case '?': - default: - fprintf(stderr, "%s: Invalid argument\n", argv[0]); - return 99; + default: fprintf(stderr, "%s: Invalid argument\n", argv[0]); return 99; } } @@ -262,10 +264,10 @@ int main(int argc, char **argv) if (do_check) checker_stdin = open_memstream(&checker_stdin_data, &checker_stdin_size); - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); - LLVMInitializeAMDGPUDisassembler(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUDisassembler(); aco::init(); diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp index 5f4aac47941..7adddb3f673 100644 --- a/src/amd/compiler/tests/test_assembler.cpp +++ b/src/amd/compiler/tests/test_assembler.cpp @@ -21,11 +21,11 @@ * IN THE SOFTWARE. * */ +#include + #include "helpers.h" #include "sid.h" -#include - using namespace aco; BEGIN_TEST(assembler.s_memtime) @@ -178,7 +178,7 @@ BEGIN_TEST(assembler.long_jump.conditional_backwards) finish_assembler_test(); END_TEST -BEGIN_TEST(assembler.long_jump.3f) +BEGIN_TEST(assembler.long_jump .3f) if (!setup_cs(NULL, (amd_gfx_level)GFX10)) return; @@ -354,25 +354,31 @@ BEGIN_TEST(assembler.vopc_sdwa) //~gfx9>> v_cmp_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 86860080 //~gfx10>> v_cmp_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 86860080 - bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(vcc, s2), Operand::zero(), Operand::zero()); + bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(vcc, s2), Operand::zero(), + Operand::zero()); //~gfx9! v_cmp_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 8686ac80 //~gfx10! v_cmp_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 8686ac80 - bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(PhysReg(0x2c), s2), Operand::zero(), Operand::zero()); + bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(PhysReg(0x2c), s2), Operand::zero(), + Operand::zero()); //~gfx9! v_cmp_lt_u32_sdwa exec, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 8686fe80 //~gfx10! v_cmp_lt_u32_sdwa exec, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 8686fe80 - bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(exec, s2), Operand::zero(), Operand::zero()); + bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(exec, s2), Operand::zero(), + Operand::zero()); if (i == GFX10) { //~gfx10! v_cmpx_lt_u32_sdwa 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7da300f9 86860080 - bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(exec, s2), Operand::zero(), Operand::zero()); + bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(exec, s2), Operand::zero(), + Operand::zero()); } else { //~gfx9! v_cmpx_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7db300f9 86860080 - bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(vcc, s2), Definition(exec, s2), Operand::zero(), Operand::zero()); + bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(vcc, s2), Definition(exec, s2), + Operand::zero(), Operand::zero()); //~gfx9! v_cmpx_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7db300f9 8686ac80 - bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(PhysReg(0x2c), s2), Definition(exec, s2), Operand::zero(), Operand::zero()); + bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(PhysReg(0x2c), s2), + Definition(exec, s2), Operand::zero(), Operand::zero()); } finish_assembler_test(); @@ -452,48 +458,70 @@ BEGIN_TEST(assembler.gfx11.mubuf) bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v1, op_s1, 0, true); //! buffer_load_b32 v42, v10, s[32:35], s30 idxen ; e0500000 1e882a0a - bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v1, op_s1, 0, false)->mubuf().idxen = true; + bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v1, op_s1, 0, false)->mubuf().idxen = + true; //! buffer_load_b32 v42, v[20:21], s[32:35], s30 idxen offen ; e0500000 1ec82a14 - bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v2, op_s1, 0, true)->mubuf().idxen = true; + bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v2, op_s1, 0, true)->mubuf().idxen = + true; //! buffer_load_b32 v42, off, s[32:35], s30 offset:84 ; e0500054 1e082a80 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 84, false); /* Various flags */ //! buffer_load_b32 v42, off, s[32:35], 0 glc ; e0504000 80082a80 - bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)->mubuf().glc = true; + bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false) + ->mubuf() + .glc = true; //! buffer_load_b32 v42, off, s[32:35], 0 dlc ; e0502000 80082a80 - bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)->mubuf().dlc = true; + bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false) + ->mubuf() + .dlc = true; //! buffer_load_b32 v42, off, s[32:35], 0 slc ; e0501000 80082a80 - bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)->mubuf().slc = true; + bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false) + ->mubuf() + .slc = true; //; if llvm_ver >= 16: //; insert_pattern('buffer_load_b32 v[42:43], off, s[32:35], 0 tfe ; e0500000 80282a80') //; else: //; insert_pattern('buffer_load_b32 v42, off, s[32:35], 0 tfe ; e0500000 80282a80') - bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)->mubuf().tfe = true; + bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false) + ->mubuf() + .tfe = true; /* LDS */ //! buffer_load_lds_b32 off, s[32:35], 0 ; e0c40000 80080080 - bld.mubuf(aco_opcode::buffer_load_dword, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false)->mubuf().lds = true; + bld.mubuf(aco_opcode::buffer_load_dword, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false) + ->mubuf() + .lds = true; //! buffer_load_lds_i8 off, s[32:35], 0 ; e0b80000 80080080 - bld.mubuf(aco_opcode::buffer_load_sbyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false)->mubuf().lds = true; + bld.mubuf(aco_opcode::buffer_load_sbyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false) + ->mubuf() + .lds = true; //! buffer_load_lds_i16 off, s[32:35], 0 ; e0c00000 80080080 - bld.mubuf(aco_opcode::buffer_load_sshort, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false)->mubuf().lds = true; + bld.mubuf(aco_opcode::buffer_load_sshort, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false) + ->mubuf() + .lds = true; //! buffer_load_lds_u8 off, s[32:35], 0 ; e0b40000 80080080 - bld.mubuf(aco_opcode::buffer_load_ubyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false)->mubuf().lds = true; + bld.mubuf(aco_opcode::buffer_load_ubyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false) + ->mubuf() + .lds = true; //! buffer_load_lds_u16 off, s[32:35], 0 ; e0bc0000 80080080 - bld.mubuf(aco_opcode::buffer_load_ushort, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false)->mubuf().lds = true; + bld.mubuf(aco_opcode::buffer_load_ushort, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false) + ->mubuf() + .lds = true; //! buffer_load_lds_format_x off, s[32:35], 0 ; e0c80000 80080080 - bld.mubuf(aco_opcode::buffer_load_format_x, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false)->mubuf().lds = true; + bld.mubuf(aco_opcode::buffer_load_format_x, op_s4, Operand(v1), Operand::zero(), op_m0, 0, false) + ->mubuf() + .lds = true; /* Stores */ //! buffer_store_b32 v10, off, s[32:35], s30 ; e0680000 1e080a80 @@ -532,42 +560,62 @@ BEGIN_TEST(assembler.gfx11.mtbuf) /* Addressing */ //>> tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; e9900000 1e082a80 - bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 0, false); + bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 0, + false); //! tbuffer_load_format_x v42, off, s[32:35], 42 format:[BUF_FMT_32_32_FLOAT] ; e9900000 aa082a80 - bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::c32(42), dfmt, nfmt, 0, false); + bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::c32(42), dfmt, + nfmt, 0, false); //! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; e9900000 1e482a0a bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v1, op_s1, dfmt, nfmt, 0, true); //! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen ; e9900000 1e882a0a - bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v1, op_s1, dfmt, nfmt, 0, false)->mtbuf().idxen = true; + bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v1, op_s1, dfmt, nfmt, 0, false) + ->mtbuf() + .idxen = true; //! tbuffer_load_format_x v42, v[20:21], s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen offen ; e9900000 1ec82a14 - bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v2, op_s1, dfmt, nfmt, 0, true)->mtbuf().idxen = true; + bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v2, op_s1, dfmt, nfmt, 0, true) + ->mtbuf() + .idxen = true; //! tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offset:84 ; e9900054 1e082a80 - bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 84, false); + bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 84, + false); /* Various flags */ //! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] glc ; e9904000 80082a80 - bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, nfmt, 0, false)->mtbuf().glc = true; + bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, + nfmt, 0, false) + ->mtbuf() + .glc = true; //! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] dlc ; e9902000 80082a80 - bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, nfmt, 0, false)->mtbuf().dlc = true; + bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, + nfmt, 0, false) + ->mtbuf() + .dlc = true; //! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] slc ; e9901000 80082a80 - bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, nfmt, 0, false)->mtbuf().slc = true; + bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, + nfmt, 0, false) + ->mtbuf() + .slc = true; //; if llvm_ver >= 16: //; insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80282a80') //; else: //; insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] tfe ; e9900000 80282a80') - bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, nfmt, 0, false)->mtbuf().tfe = true; + bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, + nfmt, 0, false) + ->mtbuf() + .tfe = true; /* Stores */ //! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; e9920000 1e080a80 - bld.mtbuf(aco_opcode::tbuffer_store_format_x, op_s4, Operand(v1), op_s1, op_v1, dfmt, nfmt, 0, false); + bld.mtbuf(aco_opcode::tbuffer_store_format_x, op_s4, Operand(v1), op_s1, op_v1, dfmt, nfmt, 0, + false); //! tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; e9928000 1e48140a bld.mtbuf(aco_opcode::tbuffer_store_format_xy, op_s4, op_v1, op_s1, op_v2, dfmt, nfmt, 0, true); @@ -604,7 +652,8 @@ BEGIN_TEST(assembler.gfx11.mimg) bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1); //! image_sample v[84:87], v[20:21], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; f06c0f04 20105414 - bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v2)->mimg().dim = ac_image_2d; + bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v2)->mimg().dim = + ac_image_2d; //! image_sample v42, v10, s[64:71], s[32:35] dmask:0x1 dim:SQ_RSRC_IMG_1D ; f06c0100 20102a0a bld.mimg(aco_opcode::image_sample, dst_v1, op_s8, op_s4, Operand(v1), op_v1)->mimg().dmask = 0x1; @@ -636,14 +685,20 @@ BEGIN_TEST(assembler.gfx11.mimg) /* NSA */ //! image_sample v[84:87], [v10, v40], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; f06c0f05 2010540a 00000028 - bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1, Operand(bld.tmp(v1), PhysReg(256 + 40)))->mimg().dim = ac_image_2d; + bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1, + Operand(bld.tmp(v1), PhysReg(256 + 40))) + ->mimg() + .dim = ac_image_2d; /* Stores */ //! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D ; f0180f00 00101e0a bld.mimg(aco_opcode::image_store, op_s8, Operand(s4), op_v4, op_v1); //! image_atomic_add v10, v20, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0300f04 00100a14 - bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4), op_v1, op_v2)->mimg().dim = ac_image_2d; + bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4), + op_v1, op_v2) + ->mimg() + .dim = ac_image_2d; finish_assembler_test(); END_TEST @@ -761,13 +816,19 @@ BEGIN_TEST(assembler.gfx11.vinterp) bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, dst, op0, op1, op2, 0); //! v_interp_p10_f32 v42, -v10, v20, v30 ; cd00002a 247a290a - bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0)->vinterp_inreg().neg[0] = true; + bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0) + ->vinterp_inreg() + .neg[0] = true; //! v_interp_p10_f32 v42, v10, -v20, v30 ; cd00002a 447a290a - bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0)->vinterp_inreg().neg[1] = true; + bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0) + ->vinterp_inreg() + .neg[1] = true; //! v_interp_p10_f32 v42, v10, v20, -v30 ; cd00002a 847a290a - bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0)->vinterp_inreg().neg[2] = true; + bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0) + ->vinterp_inreg() + .neg[2] = true; //! v_interp_p10_f16_f32 v42, v10, v20, v30 op_sel:[1,0,0,0] ; cd02082a 047a290a bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, dst, op0, op1, op2, 0, 0x1); @@ -782,7 +843,9 @@ BEGIN_TEST(assembler.gfx11.vinterp) bld.vinterp_inreg(aco_opcode::v_interp_p2_rtz_f16_f32_inreg, dst, op0, op1, op2, 0, 0x8); //! v_interp_p10_f32 v42, v10, v20, v30 clamp ; cd00802a 047a290a - bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0)->vinterp_inreg().clamp = true; + bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0) + ->vinterp_inreg() + .clamp = true; finish_assembler_test(); END_TEST @@ -899,16 +962,22 @@ BEGIN_TEST(assembler.gfx11.vop12c_v128) bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, dpp_row_rr(1))->dpp16().abs[0] = true; //! v_mul_f16_e64_dpp v128, -v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350080 200204fa ff1d2101 - bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().neg[0] = true; + bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().neg[0] = + true; //! v_mul_f16_e64_dpp v128, |v1|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350180 000204fa ff2d2101 - bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().abs[0] = true; + bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().abs[0] = + true; //! v_cmp_eq_f16_e64_dpp vcc, -v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402006a 200204fa ff1d2181 - bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))->dpp16().neg[0] = true; + bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1)) + ->dpp16() + .neg[0] = true; //! v_cmp_eq_f16_e64_dpp vcc, |v129|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402016a 000204fa ff2d2181 - bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))->dpp16().abs[0] = true; + bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1)) + ->dpp16() + .abs[0] = true; finish_assembler_test(); END_TEST diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp index b0cd1dd5cab..ee0299e124b 100644 --- a/src/amd/compiler/tests/test_d3d11_derivs.cpp +++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp @@ -633,9 +633,10 @@ BEGIN_TEST(d3d11_derivs.nsa_max) //~gfx11! v4: %_:v[0-3] = image_sample_c_b_o s8: undef, s4: undef, v1: undef, %_:v[6], %_:v[7], %_:v[8], %_:v[3], %_:v[4-5] 2darray da - Instruction *instr = bld.mimg(aco_opcode::image_sample_c_b_o, Definition(reg_v0, v4), - Operand(s8), Operand(s4), Operand(v1), Operand(reg_v0, v6.as_linear()), - Operand(reg_v6, v1), Operand(reg_v7, v1), Operand(reg_v8, v1)); + Instruction* instr = + bld.mimg(aco_opcode::image_sample_c_b_o, Definition(reg_v0, v4), Operand(s8), Operand(s4), + Operand(v1), Operand(reg_v0, v6.as_linear()), Operand(reg_v6, v1), + Operand(reg_v7, v1), Operand(reg_v8, v1)); instr->mimg().dim = ac_image_2darray; instr->mimg().da = true; instr->mimg().strict_wqm = true; diff --git a/src/amd/compiler/tests/test_hard_clause.cpp b/src/amd/compiler/tests/test_hard_clause.cpp index a9eb4ee76fa..7e61b87c491 100644 --- a/src/amd/compiler/tests/test_hard_clause.cpp +++ b/src/amd/compiler/tests/test_hard_clause.cpp @@ -26,7 +26,8 @@ using namespace aco; -static void create_mubuf(Temp desc=Temp(0, s8)) +static void +create_mubuf(Temp desc = Temp(0, s8)) { Operand desc_op(desc); desc_op.setFixed(PhysReg(0)); @@ -34,13 +35,15 @@ static void create_mubuf(Temp desc=Temp(0, s8)) Operand(PhysReg(256), v1), Operand::zero(), 0, false); } -static void create_mubuf_store() +static void +create_mubuf_store() { bld.mubuf(aco_opcode::buffer_store_dword, Operand(PhysReg(0), s4), Operand(PhysReg(256), v1), Operand(PhysReg(256), v1), Operand::zero(), 0, false); } -static void create_mtbuf(Temp desc=Temp(0, s8)) +static void +create_mtbuf(Temp desc = Temp(0, s8)) { Operand desc_op(desc); desc_op.setFixed(PhysReg(0)); @@ -49,22 +52,25 @@ static void create_mtbuf(Temp desc=Temp(0, s8)) V_008F0C_BUF_NUM_FORMAT_FLOAT, 0, false); } -static void create_flat() +static void +create_flat() { - bld.flat(aco_opcode::flat_load_dword, Definition(PhysReg(256), v1), - Operand(PhysReg(256), v2), Operand(s2)); + bld.flat(aco_opcode::flat_load_dword, Definition(PhysReg(256), v1), Operand(PhysReg(256), v2), + Operand(s2)); } -static void create_global() +static void +create_global() { bld.global(aco_opcode::global_load_dword, Definition(PhysReg(256), v1), Operand(PhysReg(256), v2), Operand(s2)); } -static void create_mimg(bool nsa, Temp desc=Temp(0, s8)) +static void +create_mimg(bool nsa, Temp desc = Temp(0, s8)) { - aco_ptr mimg{create_instruction( - aco_opcode::image_sample, Format::MIMG, 5, 1)}; + aco_ptr mimg{ + create_instruction(aco_opcode::image_sample, Format::MIMG, 5, 1)}; mimg->definitions[0] = Definition(PhysReg(256), v1); mimg->operands[0] = Operand(desc); mimg->operands[0].setFixed(PhysReg(0)); @@ -78,13 +84,15 @@ static void create_mimg(bool nsa, Temp desc=Temp(0, s8)) bld.insert(std::move(mimg)); } -static void create_smem() +static void +create_smem() { bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2), Operand::zero()); } -static void create_smem_buffer(Temp desc=Temp(0, s4)) +static void +create_smem_buffer(Temp desc = Temp(0, s4)) { Operand desc_op(desc); desc_op.setFixed(PhysReg(0)); diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp index 8fc3a1877a4..1658e8d653f 100644 --- a/src/amd/compiler/tests/test_insert_nops.cpp +++ b/src/amd/compiler/tests/test_insert_nops.cpp @@ -25,22 +25,25 @@ using namespace aco; -void create_mubuf(unsigned offset, PhysReg dst=PhysReg(256), PhysReg vaddr=PhysReg(256)) +void +create_mubuf(unsigned offset, PhysReg dst = PhysReg(256), PhysReg vaddr = PhysReg(256)) { bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4), Operand(vaddr, v1), Operand::zero(), offset, true); } -void create_mubuf_store(PhysReg src=PhysReg(256)) +void +create_mubuf_store(PhysReg src = PhysReg(256)) { - bld.mubuf(aco_opcode::buffer_store_dword, Operand(PhysReg(0), s4), - Operand(src, v1), Operand::zero(), Operand(src, v1), 0, true); + bld.mubuf(aco_opcode::buffer_store_dword, Operand(PhysReg(0), s4), Operand(src, v1), + Operand::zero(), Operand(src, v1), 0, true); } -void create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords) +void +create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords) { - aco_ptr mimg{create_instruction( - aco_opcode::image_sample, Format::MIMG, 3 + addrs, 1)}; + aco_ptr mimg{ + create_instruction(aco_opcode::image_sample, Format::MIMG, 3 + addrs, 1)}; mimg->definitions[0] = Definition(PhysReg(256), v1); mimg->operands[0] = Operand(PhysReg(0), s8); mimg->operands[1] = Operand(PhysReg(0), s4); @@ -216,7 +219,8 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write) //! s_waitcnt_depctr vm_vsrc(0) //! s1: %0:m0 = s_mov_b32 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), Operand(m0, s1)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), + Operand(m0, s1)); bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); //! p_unit_test 5 @@ -224,7 +228,8 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write) //! s_waitcnt_depctr vm_vsrc(0) //! s2: %0:exec = s_mov_b64 -1 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), Operand(m0, s1)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), + Operand(m0, s1)); bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); /* no hazard: LDS */ @@ -232,7 +237,8 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write) //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0 //! s1: %0:s[0] = s_mov_b32 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), Operand(m0, s1)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), + Operand(m0, s1)); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); /* no hazard: LDS with VALU in-between */ @@ -241,7 +247,8 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write) //! v_nop //! s1: %0:m0 = s_mov_b32 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), Operand(m0, s1)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), + Operand(m0, s1)); bld.vop1(aco_opcode::v_nop); bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); @@ -269,7 +276,8 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write) //! s_waitcnt lgkmcnt(0) //! s1: %0:m0 = s_mov_b32 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), Operand(m0, s1)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), + Operand(m0, s1)); bld.sopp(aco_opcode::s_waitcnt, -1, 0xc07f); bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); @@ -300,7 +308,8 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write) //! s_waitcnt_depctr vm_vsrc(0) //! s1: %0:m0 = s_mov_b32 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), Operand(m0, s1)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), + Operand(m0, s1)); bld.sopp(aco_opcode::s_waitcnt, -1, 0x3f70); bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); @@ -932,8 +941,8 @@ BEGIN_TEST(insert_nops.valu_mask_write) //! s_waitcnt_depctr sa_sdst(0) //! s1: %0:s[2] = s_mov_b32 %0:s[1] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), - Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), + Operand::zero(), Operand(PhysReg(0), s2)); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); @@ -944,8 +953,8 @@ BEGIN_TEST(insert_nops.valu_mask_write) //! s1: %0:s[1] = s_mov_b32 0 //! s1: %0:s[2] = s_mov_b32 %0:s[1] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), - Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), + Operand::zero(), Operand(PhysReg(0), s2)); bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1)); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); @@ -957,8 +966,8 @@ BEGIN_TEST(insert_nops.valu_mask_write) //! s1: %0:s[2] = s_mov_b32 %0:s[1] //! s1: %0:s[2] = s_mov_b32 %0:s[1] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), - Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), + Operand::zero(), Operand(PhysReg(0), s2)); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); @@ -969,8 +978,8 @@ BEGIN_TEST(insert_nops.valu_mask_write) //! s_waitcnt_depctr sa_sdst(0) //! s1: %0:s[2] = s_mov_b32 %0:s[1] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), - Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), + Operand::zero(), Operand(PhysReg(0), s2)); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); @@ -982,8 +991,8 @@ BEGIN_TEST(insert_nops.valu_mask_write) //! s_waitcnt_depctr sa_sdst(0) //! s1: %0:s[2] = s_mov_b32 %0:s[1] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), - Operand(PhysReg(2), s1), Operand::zero(), Operand(PhysReg(0), s2)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand(PhysReg(2), s1), + Operand::zero(), Operand(PhysReg(0), s2)); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp index b6c2e8302c7..86d81845173 100644 --- a/src/amd/compiler/tests/test_insert_waitcnt.cpp +++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp @@ -36,15 +36,14 @@ BEGIN_TEST(insert_waitcnt.ds_ordered_count) Operand chan_counter(PhysReg(260), v1); Operand m(m0, s1); - Instruction *ds_instr; + Instruction* ds_instr; //>> ds_ordered_count %0:v[0], %0:v[3], %0:m0 offset0:3072 gds storage:gds semantics:volatile //! s_waitcnt lgkmcnt(0) ds_instr = bld.ds(aco_opcode::ds_ordered_count, def0, gds_base, m, 3072u, 0u, true); ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile); //! ds_add_rtn_u32 %0:v[1], %0:v[3], %0:v[4], %0:m0 gds storage:gds semantics:volatile,atomic,rmw - ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, def1, - gds_base, chan_counter, m, 0u, 0u, true); + ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, def1, gds_base, chan_counter, m, 0u, 0u, true); ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw); //! s_waitcnt lgkmcnt(0) diff --git a/src/amd/compiler/tests/test_isel.cpp b/src/amd/compiler/tests/test_isel.cpp index a266276b604..82aabb0550c 100644 --- a/src/amd/compiler/tests/test_isel.cpp +++ b/src/amd/compiler/tests/test_isel.cpp @@ -21,19 +21,18 @@ * IN THE SOFTWARE. * */ +#include + #include "helpers.h" #include "test_isel-spirv.h" -#include - using namespace aco; BEGIN_TEST(isel.interp.simple) QoShaderModuleCreateInfo vs = qoShaderModuleCreateInfoGLSL(VERTEX, layout(location = 0) in vec4 in_color; layout(location = 0) out vec4 out_color; - void main() { - out_color = in_color; + void main() { out_color = in_color; } ); QoShaderModuleCreateInfo fs = qoShaderModuleCreateInfoGLSL(FRAGMENT, diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index fa385a26c8c..f49ddc55506 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -61,7 +61,8 @@ BEGIN_TEST(optimize.neg) //! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1 //! p_unit_test 5, %res5 - writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1))); + writeout(5, + bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1))); //! v1: %res6 = v_subrev_f32 %a, %b //! p_unit_test 6, %res6 @@ -264,7 +265,8 @@ BEGIN_TEST(optimize.output_modifiers) finish_opt_test(); END_TEST -Temp create_subbrev_co(Operand op0, Operand op1, Operand op2) +Temp +create_subbrev_co(Operand op0, Operand op1, Operand op2) { return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.def(bld.lm), op0, op1, op2); } @@ -438,7 +440,7 @@ BEGIN_TEST(optimize.bcnt) END_TEST struct clamp_config { - const char *name; + const char* name; aco_opcode min, max, med3; Operand lb, ub; }; @@ -863,7 +865,7 @@ enum denorm_op { denorm_fnegabs = 3, }; -static const char *denorm_op_names[] = { +static const char* denorm_op_names[] = { "mul1", "fneg", "fabs", @@ -877,31 +879,27 @@ struct denorm_config { aco_opcode dest; }; -static const char *srcdest_op_name(aco_opcode op) +static const char* +srcdest_op_name(aco_opcode op) { switch (op) { - case aco_opcode::v_cndmask_b32: - return "cndmask"; - case aco_opcode::v_min_f32: - return "min"; - case aco_opcode::v_rcp_f32: - return "rcp"; - default: - return "none"; + case aco_opcode::v_cndmask_b32: return "cndmask"; + case aco_opcode::v_min_f32: return "min"; + case aco_opcode::v_rcp_f32: return "rcp"; + default: return "none"; } } -static Temp emit_denorm_srcdest(aco_opcode op, Temp val) +static Temp +emit_denorm_srcdest(aco_opcode op, Temp val) { switch (op) { case aco_opcode::v_cndmask_b32: return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]); case aco_opcode::v_min_f32: return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val); - case aco_opcode::v_rcp_f32: - return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val); - default: - return val; + case aco_opcode::v_rcp_f32: return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val); + default: return val; } } @@ -917,7 +915,8 @@ BEGIN_TEST(optimize.denorm_propagation) configs.push_back({flush, op, aco_opcode::num_opcodes, dest}); } - for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) { + for (aco_opcode src : + {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) { for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs}) configs.push_back({flush, op, src, aco_opcode::num_opcodes}); } @@ -925,18 +924,18 @@ BEGIN_TEST(optimize.denorm_propagation) for (denorm_config cfg : configs) { char subvariant[128]; - sprintf(subvariant, "_%s_%s_%s_%s", - cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src), + sprintf(subvariant, "_%s_%s_%s_%s", cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src), denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest)); if (!setup_cs("v1 s2", (amd_gfx_level)i, CHIP_UNKNOWN, subvariant)) continue; - bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) || - cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) || - !cfg.flush; + bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || + (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) || + cfg.dest == aco_opcode::v_rcp_f32 || + (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) || !cfg.flush; - fprintf(output, "src, dest, op: %s %s %s\n", - srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]); + fprintf(output, "src, dest, op: %s %s %s\n", srcdest_op_name(cfg.src), + srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]); fprintf(output, "can_propagate: %u\n", can_propagate); //! src, dest, op: $src $dest $op //! can_propagate: #can_propagate @@ -976,15 +975,9 @@ BEGIN_TEST(optimize.denorm_propagation) case denorm_mul1: val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val); break; - case denorm_fneg: - val = fneg(val); - break; - case denorm_fabs: - val = fabs(val); - break; - case denorm_fnegabs: - val = fneg(fabs(val)); - break; + case denorm_fneg: val = fneg(val); break; + case denorm_fabs: val = fabs(val); break; + case denorm_fnegabs: val = fneg(fabs(val)); break; } val = emit_denorm_srcdest(cfg.dest, val); writeout( @@ -1123,13 +1116,15 @@ BEGIN_TEST(optimize.dpp_prop) //! v1: %res2 = v_mul_f32 0x12345678, %a //! p_unit_test 2, %res2 Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u)); - writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1))); + writeout(2, + bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1))); //! v1: %literal2 = p_parallelcopy 0x12345679 //! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1 //! p_unit_test 3, %res3 Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u)); - writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1))); + writeout(3, + bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1))); //! v1: %b_v = p_parallelcopy %b //! v1: %res4 = v_mul_f32 %b, %a @@ -1171,7 +1166,9 @@ BEGIN_TEST(optimize.casts) //! v1: %res2_tmp = v_mul_f32 -1.0, %a16 //! v2b: %res2 = v_mul_f16 %res2_tmp, %a16 //! p_unit_test 2, %res2 - writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_uniform(a16))), a16)); + writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), + Operand::c32(0xbf800000u), bld.as_uniform(a16))), + a16)); //! v1: %res3_tmp = v_mul_f32 %a, %a //! v2b: %res3 = v_add_f16 %res3_tmp, 0 clamp @@ -1191,7 +1188,8 @@ BEGIN_TEST(optimize.casts) //! v2b: %res6_tmp = v_mul_f16 %a16, %a16 //! v1: %res6 = v_mul_f32 2.0, %res6_tmp //! p_unit_test 6, %res6 - writeout(6, fmul(bld.as_uniform(fmul(a16, a16)), bld.copy(bld.def(v1), Operand::c32(0x40000000)))); + writeout(6, + fmul(bld.as_uniform(fmul(a16, a16)), bld.copy(bld.def(v1), Operand::c32(0x40000000)))); //! v1: %res7_tmp = v_mul_f32 %a, %a //! v2b: %res7 = v_add_f16 %res7_tmp, %a16 @@ -1211,7 +1209,8 @@ BEGIN_TEST(optimize.casts) //! v2b: %res10_tmp = v_mul_f16 %a16, %a16 //! v1: %res10 = v_mul_f32 -1.0, %res10_tmp //! p_unit_test 10, %res10 - writeout(10, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_uniform(fmul(a16, a16)))); + writeout(10, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), + bld.as_uniform(fmul(a16, a16)))); finish_opt_test(); END_TEST @@ -1549,7 +1548,8 @@ BEGIN_TEST(optimize.mad_mix.fma.basic) //! v1: %res2_mul = v_fma_mix_f32 lo(%a16), %b, -0 //! v1: %res2 = v_add_f32 %res2_mul, %c *2 //! p_unit_test 2, %res2 - writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000), fadd(fmul(f2f32(a16), b), c))); + writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000), + fadd(fmul(f2f32(a16), b), c))); /* neg/abs modifiers */ //! v1: %res3 = v_fma_mix_f32 -lo(%a16), %b, |lo(%c16)| @@ -1730,7 +1730,8 @@ BEGIN_TEST(optimize.mad_mix.cast) } END_TEST -static void vop3p_constant(unsigned *idx, aco_opcode op, const char *swizzle, uint32_t val) +static void +vop3p_constant(unsigned* idx, aco_opcode op, const char* swizzle, uint32_t val) { uint32_t halves[2] = {val & 0xffff, val >> 16}; uint32_t expected = halves[swizzle[0] - 'x'] | (halves[swizzle[1] - 'x'] << 16); @@ -1744,7 +1745,7 @@ static void vop3p_constant(unsigned *idx, aco_opcode op, const char *swizzle, ui BEGIN_TEST(optimize.vop3p_constants) for (aco_opcode op : {aco_opcode::v_pk_add_f16, aco_opcode::v_pk_add_u16}) { - for (const char *swizzle : {"xx", "yy", "xy", "yx"}) { + for (const char* swizzle : {"xx", "yy", "xy", "yx"}) { char variant[16]; strcpy(variant, op == aco_opcode::v_pk_add_f16 ? "_f16" : "_u16"); strcat(variant, "_"); diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp index d6cc3208ecb..8913397cead 100644 --- a/src/amd/compiler/tests/test_optimizer_postRA.cpp +++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp @@ -27,310 +27,324 @@ using namespace aco; BEGIN_TEST(optimizer_postRA.vcmp) - PhysReg reg_v0(256); - PhysReg reg_s0(0); - PhysReg reg_s2(2); - PhysReg reg_s4(4); + PhysReg reg_v0(256); + PhysReg reg_s0(0); + PhysReg reg_s2(2); + PhysReg reg_s4(4); - //>> v1: %a:v[0] = p_startpgm - ASSERTED bool setup_ok = setup_cs("v1", GFX8); - assert(setup_ok); + //>> v1: %a:v[0] = p_startpgm + ASSERTED bool setup_ok = setup_cs("v1", GFX8); + assert(setup_ok); - auto &startpgm = bld.instructions->at(0); - assert(startpgm->opcode == aco_opcode::p_startpgm); - startpgm->definitions[0].setFixed(reg_v0); + auto& startpgm = bld.instructions->at(0); + assert(startpgm->opcode == aco_opcode::p_startpgm); + startpgm->definitions[0].setFixed(reg_v0); - Temp v_in = inputs[0]; + Temp v_in = inputs[0]; - { - /* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */ + { + /* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */ - //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] - //! s2: %e:s[2-3] = p_cbranch_z %b:vcc - //! p_unit_test 0, %e:s[2-3] - auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), - Operand(v_in, reg_v0)); - auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm)); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); - writeout(0, Operand(br, reg_s2)); - } + //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] + //! s2: %e:s[2-3] = p_cbranch_z %b:vcc + //! p_unit_test 0, %e:s[2-3] + auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), + Operand(v_in, reg_v0)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), + Operand(exec, bld.lm)); + auto br = + bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(0, Operand(br, reg_s2)); + } - //; del b, e + //; del b, e - { - /* When VCC is overwritten inbetween, don't optimize. */ + { + /* When VCC is overwritten inbetween, don't optimize. */ - //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] - //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec - //! s2: %f:vcc = s_mov_b64 0 - //! s2: %e:s[2-3] = p_cbranch_z %d:scc - //! p_unit_test 1, %e:s[2-3], %f:vcc - auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), - Operand(v_in, reg_v0)); - auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm)); - auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand::zero()); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); - writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc)); - } + //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] + //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec + //! s2: %f:vcc = s_mov_b64 0 + //! s2: %e:s[2-3] = p_cbranch_z %d:scc + //! p_unit_test 1, %e:s[2-3], %f:vcc + auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), + Operand(v_in, reg_v0)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), + Operand(exec, bld.lm)); + auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand::zero()); + auto br = + bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc)); + } - //; del b, c, d, e, f + //; del b, c, d, e, f - { - /* When part of VCC is overwritten inbetween, don't optimize. */ + { + /* When part of VCC is overwritten inbetween, don't optimize. */ - //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] - //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec - //! s1: %f:s[107] = s_mov_b32 0 - //! s2: %e:s[2-3] = p_cbranch_z %d:scc - //! p_unit_test 1, %e:s[2-3], %f:vcc - auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), - Operand(v_in, reg_v0)); - auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm)); - auto ovrwr = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1, vcc_hi), Operand::zero()); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); - writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc)); - } + //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] + //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec + //! s1: %f:s[107] = s_mov_b32 0 + //! s2: %e:s[2-3] = p_cbranch_z %d:scc + //! p_unit_test 1, %e:s[2-3], %f:vcc + auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), + Operand(v_in, reg_v0)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), + Operand(exec, bld.lm)); + auto ovrwr = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1, vcc_hi), Operand::zero()); + auto br = + bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc)); + } - //; del b, c, d, e, f + //; del b, c, d, e, f - { - /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */ + { + /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */ - //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0] - //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec - //! s2: %e:s[2-3] = p_cbranch_z %d:scc - //! p_unit_test 2, %e:s[2-3] - auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand::zero(), - Operand(v_in, reg_v0)); - auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(vcmp, reg_s4), Operand(exec, bld.lm)); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); - writeout(2, Operand(br, reg_s2)); - } + //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0] + //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec + //! s2: %e:s[2-3] = p_cbranch_z %d:scc + //! p_unit_test 2, %e:s[2-3] + auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand::zero(), + Operand(v_in, reg_v0)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), + Operand(vcmp, reg_s4), Operand(exec, bld.lm)); + auto br = + bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(2, Operand(br, reg_s2)); + } - //; del b, c, d, e + //; del b, c, d, e - { - /* When the VCC isn't written by VOPC, don't optimize */ + { + /* When the VCC isn't written by VOPC, don't optimize */ - //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5] - //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec - //! s2: %e:s[2-3] = p_cbranch_z %d:scc - //! p_unit_test 2, %e:s[2-3] - auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), - Operand::c32(1u), Operand(reg_s4, bld.lm)); - auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(salu, vcc), Operand(exec, bld.lm)); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); - writeout(2, Operand(br, reg_s2)); - } + //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5] + //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec + //! s2: %e:s[2-3] = p_cbranch_z %d:scc + //! p_unit_test 2, %e:s[2-3] + auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand::c32(1u), + Operand(reg_s4, bld.lm)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), + Operand(salu, vcc), Operand(exec, bld.lm)); + auto br = + bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(2, Operand(br, reg_s2)); + } - //; del b, c, d, e, f, x + //; del b, c, d, e, f, x - { - /* When EXEC is overwritten inbetween, don't optimize. */ + { + /* When EXEC is overwritten inbetween, don't optimize. */ - //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] - //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec - //! s2: %f:exec = s_mov_b64 42 - //! s2: %e:s[2-3] = p_cbranch_z %d:scc - //! p_unit_test 4, %e:s[2-3], %f:exec - auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), - Operand(v_in, reg_v0)); - auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm)); - auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand::c32(42u)); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); - writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec)); - } + //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] + //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec + //! s2: %f:exec = s_mov_b64 42 + //! s2: %e:s[2-3] = p_cbranch_z %d:scc + //! p_unit_test 4, %e:s[2-3], %f:exec + auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), + Operand(v_in, reg_v0)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), + Operand(exec, bld.lm)); + auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand::c32(42u)); + auto br = + bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec)); + } - //; del b, c, d, e, f, x + //; del b, c, d, e, f, x - finish_optimizer_postRA_test(); + finish_optimizer_postRA_test(); END_TEST BEGIN_TEST(optimizer_postRA.scc_nocmp_opt) - //>> s1: %a, s2: %y, s1: %z = p_startpgm - ASSERTED bool setup_ok = setup_cs("s1 s2 s1", GFX6); - assert(setup_ok); + //>> s1: %a, s2: %y, s1: %z = p_startpgm + ASSERTED bool setup_ok = setup_cs("s1 s2 s1", GFX6); + assert(setup_ok); - PhysReg reg_s0{0}; - PhysReg reg_s2{2}; - PhysReg reg_s3{3}; - PhysReg reg_s4{4}; - PhysReg reg_s6{6}; - PhysReg reg_s8{8}; + PhysReg reg_s0{0}; + PhysReg reg_s2{2}; + PhysReg reg_s3{3}; + PhysReg reg_s4{4}; + PhysReg reg_s6{6}; + PhysReg reg_s8{8}; - Temp in_0 = inputs[0]; - Temp in_1 = inputs[1]; - Temp in_2 = inputs[2]; - Operand op_in_0(in_0); - op_in_0.setFixed(reg_s0); - Operand op_in_1(in_1); - op_in_1.setFixed(reg_s4); - Operand op_in_2(in_2); - op_in_2.setFixed(reg_s6); + Temp in_0 = inputs[0]; + Temp in_1 = inputs[1]; + Temp in_2 = inputs[2]; + Operand op_in_0(in_0); + op_in_0.setFixed(reg_s0); + Operand op_in_1(in_1); + op_in_1.setFixed(reg_s4); + Operand op_in_2(in_2); + op_in_2.setFixed(reg_s6); - { - //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 - //! s2: %f:vcc = p_cbranch_nz %e:scc - //! p_unit_test 0, %f:vcc - auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, - Operand::c32(0x40018u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), - Operand::zero()); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); - writeout(0, Operand(br, vcc)); - } + { + //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 + //! s2: %f:vcc = p_cbranch_nz %e:scc + //! p_unit_test 0, %f:vcc + auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, + Operand::c32(0x40018u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), + Operand::zero()); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); + writeout(0, Operand(br, vcc)); + } - //; del d, e, f + //; del d, e, f - { - //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 - //! s2: %f:vcc = p_cbranch_z %e:scc - //! p_unit_test 1, %f:vcc - auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, - Operand::c32(0x40018u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), - Operand::zero()); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); - writeout(1, Operand(br, vcc)); - } + { + //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 + //! s2: %f:vcc = p_cbranch_z %e:scc + //! p_unit_test 1, %f:vcc + auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, + Operand::c32(0x40018u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), + Operand::zero()); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); + writeout(1, Operand(br, vcc)); + } - //; del d, e, f + //; del d, e, f - { - //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 - //! s2: %f:vcc = p_cbranch_z %e:scc - //! p_unit_test 2, %f:vcc - auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, - Operand::c32(0x40018u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), - Operand::zero()); - auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); - writeout(2, Operand(br, vcc)); - } + { + //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 + //! s2: %f:vcc = p_cbranch_z %e:scc + //! p_unit_test 2, %f:vcc + auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, + Operand::c32(0x40018u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), + Operand::zero()); + auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); + writeout(2, Operand(br, vcc)); + } - //; del d, e, f + //; del d, e, f - { - //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 - //! s2: %f:vcc = p_cbranch_nz %e:scc - //! p_unit_test 3, %f:vcc - auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, - Operand::c32(0x40018u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), - Operand::zero()); - auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); - writeout(3, Operand(br, vcc)); - } + { + //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 + //! s2: %f:vcc = p_cbranch_nz %e:scc + //! p_unit_test 3, %f:vcc + auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, + Operand::c32(0x40018u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), + Operand::zero()); + auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); + writeout(3, Operand(br, vcc)); + } - //; del d, e, f + //; del d, e, f - { - //! s2: %d:s[2-3], s1: %e:scc = s_and_b64 %y:s[4-5], 0x12345 - //! s2: %f:vcc = p_cbranch_z %e:scc - //! p_unit_test 4, %f:vcc - auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s2), bld.def(s1, scc), op_in_1, - Operand::c32(0x12345u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u64, bld.def(s1, scc), Operand(salu, reg_s2), - Operand::zero(8)); - auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); - writeout(4, Operand(br, vcc)); - } + { + //! s2: %d:s[2-3], s1: %e:scc = s_and_b64 %y:s[4-5], 0x12345 + //! s2: %f:vcc = p_cbranch_z %e:scc + //! p_unit_test 4, %f:vcc + auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s2), bld.def(s1, scc), op_in_1, + Operand::c32(0x12345u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u64, bld.def(s1, scc), Operand(salu, reg_s2), + Operand::zero(8)); + auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); + writeout(4, Operand(br, vcc)); + } - //; del d, e, f + //; del d, e, f - { - /* SCC is overwritten in between, don't optimize */ + { + /* SCC is overwritten in between, don't optimize */ - //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 - //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 - //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0 - //! s2: %f:vcc = p_cbranch_z %g:scc - //! p_unit_test 5, %f:vcc, %h:s[3] - auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, - Operand::c32(0x40018u)); - auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, - Operand::c32(1u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), - Operand::zero()); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); - writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); - } + //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 + //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 + //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0 + //! s2: %f:vcc = p_cbranch_z %g:scc + //! p_unit_test 5, %f:vcc, %h:s[3] + auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, + Operand::c32(0x40018u)); + auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, + Operand::c32(1u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), + Operand::zero()); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); + writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); + } - //; del d, e, f, g, h, x + //; del d, e, f, g, h, x - { - /* SCC is overwritten in between, optimize by pulling down */ + { + /* SCC is overwritten in between, optimize by pulling down */ - //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 - //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 - //! s2: %f:vcc = p_cbranch_z %g:scc - //! p_unit_test 5, %f:vcc, %h:s[3] - auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, - Operand::c32(0x40018u)); - auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, - Operand::c32(1u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), - Operand::zero()); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); - writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); - } + //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 + //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 + //! s2: %f:vcc = p_cbranch_z %g:scc + //! p_unit_test 5, %f:vcc, %h:s[3] + auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, + Operand::c32(0x40018u)); + auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, + Operand::c32(1u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), + Operand::zero()); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); + writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); + } - //; del d, e, f, g, h, x + //; del d, e, f, g, h, x - { - /* SCC is overwritten in between, optimize by pulling down */ + { + /* SCC is overwritten in between, optimize by pulling down */ - //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 - //! s2: %d:s[8-9], s1: %e:scc = s_and_b64 %b:s[4-5], 0x40018 - //! s2: %f:vcc = p_cbranch_z %g:scc - //! p_unit_test 5, %f:vcc, %h:s[3] - auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), op_in_1, - Operand::c32(0x40018u)); - auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, - Operand::c32(1u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s8), - Operand::zero()); - auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); - writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); - } + //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 + //! s2: %d:s[8-9], s1: %e:scc = s_and_b64 %b:s[4-5], 0x40018 + //! s2: %f:vcc = p_cbranch_z %g:scc + //! p_unit_test 5, %f:vcc, %h:s[3] + auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), op_in_1, + Operand::c32(0x40018u)); + auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, + Operand::c32(1u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s8), + Operand::zero()); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); + writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); + } - //; del d, e, f, g, h, x + //; del d, e, f, g, h, x - { - //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 - //! s1: %f:s[4] = s_cselect_b32 %z:s[6], %a:s[0], %e:scc - //! p_unit_test 6, %f:s[4] - auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, - Operand::c32(0x40018u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), - Operand::zero()); - auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0), Operand(op_in_2), bld.scc(scmp)); - writeout(6, Operand(br, reg_s4)); - } + { + //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 + //! s1: %f:s[4] = s_cselect_b32 %z:s[6], %a:s[0], %e:scc + //! p_unit_test 6, %f:s[4] + auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, + Operand::c32(0x40018u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), + Operand::zero()); + auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0), + Operand(op_in_2), bld.scc(scmp)); + writeout(6, Operand(br, reg_s4)); + } - //; del d, e, f + //; del d, e, f - { - /* SCC is overwritten in between, don't optimize */ + { + /* SCC is overwritten in between, don't optimize */ - //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 - //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 - //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0 - //! s1: %f:s[4] = s_cselect_b32 %a:s[0], %z:s[6], %g:scc - //! p_unit_test 7, %f:s[4], %h:s[3] - auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, - Operand::c32(0x40018u)); - auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, - Operand::c32(1u)); - auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), - Operand::zero()); - auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0), Operand(op_in_2), bld.scc(scmp)); - writeout(7, Operand(br, reg_s4), Operand(ovrw, reg_s3)); - } + //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 + //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 + //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0 + //! s1: %f:s[4] = s_cselect_b32 %a:s[0], %z:s[6], %g:scc + //! p_unit_test 7, %f:s[4], %h:s[3] + auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, + Operand::c32(0x40018u)); + auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, + Operand::c32(1u)); + auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), + Operand::zero()); + auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0), + Operand(op_in_2), bld.scc(scmp)); + writeout(7, Operand(br, reg_s4), Operand(ovrw, reg_s3)); + } - //; del d, e, f, g, h, x + //; del d, e, f, g, h, x - finish_optimizer_postRA_test(); + finish_optimizer_postRA_test(); END_TEST BEGIN_TEST(optimizer_postRA.dpp) @@ -368,7 +382,8 @@ BEGIN_TEST(optimizer_postRA.dpp) //! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1 //! p_unit_test 2, %res2:v[2] Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); - Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2), dpp_row_half_mirror); + Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2), + dpp_row_half_mirror); writeout(2, Operand(res2, reg_v2)); /* modifiers */ @@ -429,14 +444,16 @@ BEGIN_TEST(optimizer_postRA.dpp) //! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1 //! p_unit_test 8, %res8:v[2] Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); - Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c); + Temp res8 = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c); writeout(8, Operand(res8, reg_v2)); //! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 //! v1: %res9:v[2] = v_cndmask_b32 %tmp9:v[2], %b:v[1], %d:s[0-1] //! p_unit_test 9, %res9:v[2] Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); - Temp res9 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp9, reg_v2), b, d); + Temp res9 = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp9, reg_v2), b, d); writeout(9, Operand(res9, reg_v2)); /* control flow */ @@ -485,48 +502,53 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf) Operand c(inputs[2], PhysReg(258)); /* buffer store address */ Operand d(inputs[3], PhysReg(259)); /* buffer store value */ Operand e(inputs[4], PhysReg(0)); /* condition */ - PhysReg reg_v12(268); /* temporary register */ + PhysReg reg_v12(268); /* temporary register */ Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror); //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec //! s2: %0:vcc = p_cbranch_nz BB1, BB2 - emit_divergent_if_else(program.get(), bld, e, [&]() -> void { - /* --- logical then --- */ - //! BB1 - //! /* logical preds: BB0, / linear preds: BB0, / kind: */ - //! p_logical_start + emit_divergent_if_else( + program.get(), bld, e, + [&]() -> void + { + /* --- logical then --- */ + //! BB1 + //! /* logical preds: BB0, / linear preds: BB0, / kind: */ + //! p_logical_start - //! buffer_store_dword %c:v[2], 0, %d:v[3], 0 offen - bld.mubuf(aco_opcode::buffer_store_dword, c, Operand::zero(), d, Operand::zero(), 0, true); + //! buffer_store_dword %c:v[2], 0, %d:v[3], 0 offen + bld.mubuf(aco_opcode::buffer_store_dword, c, Operand::zero(), d, Operand::zero(), 0, true); - //! p_logical_end - //! s2: %0:vcc = p_branch BB3 + //! p_logical_end + //! s2: %0:vcc = p_branch BB3 - /* --- linear then --- */ - //! BB2 - //! /* logical preds: / linear preds: BB0, / kind: */ - //! s2: %0:vcc = p_branch BB3 + /* --- linear then --- */ + //! BB2 + //! /* logical preds: / linear preds: BB0, / kind: */ + //! s2: %0:vcc = p_branch BB3 - /* --- invert --- */ - //! BB3 - //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ - //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec - //! s2: %0:vcc = p_cbranch_nz BB4, BB5 - }, [&]() -> void { - /* --- logical else --- */ - //! BB4 - //! /* logical preds: BB0, / linear preds: BB3, / kind: */ - //! p_logical_start - //! p_logical_end - //! s2: %0:vcc = p_branch BB6 + /* --- invert --- */ + //! BB3 + //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ + //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec + //! s2: %0:vcc = p_cbranch_nz BB4, BB5 + }, + [&]() -> void + { + /* --- logical else --- */ + //! BB4 + //! /* logical preds: BB0, / linear preds: BB3, / kind: */ + //! p_logical_start + //! p_logical_end + //! s2: %0:vcc = p_branch BB6 - /* --- linear else --- */ - //! BB5 - //! /* logical preds: / linear preds: BB3, / kind: */ - //! s2: %0:vcc = p_branch BB6 - }); + /* --- linear else --- */ + //! BB5 + //! /* logical preds: / linear preds: BB3, / kind: */ + //! s2: %0:vcc = p_branch BB6 + }); /* --- merge block --- */ //! BB6 @@ -535,7 +557,8 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf) //! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 //! p_unit_test 10, %res10:v[12] - Temp result = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); + Temp result = + bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); writeout(10, Operand(result, reg_v12)); finish_optimizer_postRA_test(); @@ -560,7 +583,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten) Operand d(inputs[3], PhysReg(259)); /* buffer store value */ Operand e(inputs[4], PhysReg(0)); /* condition */ Operand f(inputs[5], PhysReg(2)); /* buffer store address (scalar) */ - PhysReg reg_v12(268); /* temporary register */ + PhysReg reg_v12(268); /* temporary register */ //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror); @@ -568,44 +591,50 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten) //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec //! s2: %0:vcc = p_cbranch_nz BB1, BB2 - emit_divergent_if_else(program.get(), bld, e, [&]() -> void { - /* --- logical then --- */ - //! BB1 - //! /* logical preds: BB0, / linear preds: BB0, / kind: */ - //! p_logical_start + emit_divergent_if_else( + program.get(), bld, e, + [&]() -> void + { + /* --- logical then --- */ + //! BB1 + //! /* logical preds: BB0, / linear preds: BB0, / kind: */ + //! p_logical_start - //! v1: %addr:v[0] = p_parallelcopy %f:s[2] - Temp addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), f); + //! v1: %addr:v[0] = p_parallelcopy %f:s[2] + Temp addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), f); - //! buffer_store_dword %addr:v[0], 0, %d:v[3], 0 offen - bld.mubuf(aco_opcode::buffer_store_dword, Operand(addr, a.physReg()), Operand::zero(), d, Operand::zero(), 0, true); + //! buffer_store_dword %addr:v[0], 0, %d:v[3], 0 offen + bld.mubuf(aco_opcode::buffer_store_dword, Operand(addr, a.physReg()), Operand::zero(), d, + Operand::zero(), 0, true); - //! p_logical_end - //! s2: %0:vcc = p_branch BB3 + //! p_logical_end + //! s2: %0:vcc = p_branch BB3 - /* --- linear then --- */ - //! BB2 - //! /* logical preds: / linear preds: BB0, / kind: */ - //! s2: %0:vcc = p_branch BB3 + /* --- linear then --- */ + //! BB2 + //! /* logical preds: / linear preds: BB0, / kind: */ + //! s2: %0:vcc = p_branch BB3 - /* --- invert --- */ - //! BB3 - //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ - //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec - //! s2: %0:vcc = p_cbranch_nz BB4, BB5 - }, [&]() -> void { - /* --- logical else --- */ - //! BB4 - //! /* logical preds: BB0, / linear preds: BB3, / kind: */ - //! p_logical_start - //! p_logical_end - //! s2: %0:vcc = p_branch BB6 + /* --- invert --- */ + //! BB3 + //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ + //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec + //! s2: %0:vcc = p_cbranch_nz BB4, BB5 + }, + [&]() -> void + { + /* --- logical else --- */ + //! BB4 + //! /* logical preds: BB0, / linear preds: BB3, / kind: */ + //! p_logical_start + //! p_logical_end + //! s2: %0:vcc = p_branch BB6 - /* --- linear else --- */ - //! BB5 - //! /* logical preds: / linear preds: BB3, / kind: */ - //! s2: %0:vcc = p_branch BB6 - }); + /* --- linear else --- */ + //! BB5 + //! /* logical preds: / linear preds: BB3, / kind: */ + //! s2: %0:vcc = p_branch BB6 + }); /* --- merge block --- */ //! BB6 @@ -613,7 +642,8 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten) //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] //! v1: %result:v[12] = v_add_f32 %dpp_mov_tmp:v[12], %b:v[1] - Temp result = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); + Temp result = + bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); //! p_unit_test 10, %result:v[12] writeout(10, Operand(result, reg_v12)); @@ -631,53 +661,58 @@ BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf) startpgm->definitions[2].setFixed(PhysReg(259)); startpgm->definitions[3].setFixed(PhysReg(0)); - Operand a(inputs[0], PhysReg(2)); /* source for s_and */ + Operand a(inputs[0], PhysReg(2)); /* source for s_and */ Operand c(inputs[1], PhysReg(258)); /* buffer store address */ Operand d(inputs[2], PhysReg(259)); /* buffer store value */ Operand e(inputs[3], PhysReg(0)); /* condition */ - PhysReg reg_s8(8); /* temporary register */ + PhysReg reg_s8(8); /* temporary register */ auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a, - Operand::c32(0x40018u)); + Operand::c32(0x40018u)); //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec //! s2: %0:vcc = p_cbranch_nz BB1, BB2 - emit_divergent_if_else(program.get(), bld, e, [&]() -> void { - /* --- logical then --- */ - //! BB1 - //! /* logical preds: BB0, / linear preds: BB0, / kind: */ - //! p_logical_start + emit_divergent_if_else( + program.get(), bld, e, + [&]() -> void + { + /* --- logical then --- */ + //! BB1 + //! /* logical preds: BB0, / linear preds: BB0, / kind: */ + //! p_logical_start - //! buffer_store_dword %c:v[2], 0, %d:v[3], 0 offen - bld.mubuf(aco_opcode::buffer_store_dword, c, Operand::zero(), d, Operand::zero(), 0, true); + //! buffer_store_dword %c:v[2], 0, %d:v[3], 0 offen + bld.mubuf(aco_opcode::buffer_store_dword, c, Operand::zero(), d, Operand::zero(), 0, true); - //! p_logical_end - //! s2: %0:vcc = p_branch BB3 + //! p_logical_end + //! s2: %0:vcc = p_branch BB3 - /* --- linear then --- */ - //! BB2 - //! /* logical preds: / linear preds: BB0, / kind: */ - //! s2: %0:vcc = p_branch BB3 + /* --- linear then --- */ + //! BB2 + //! /* logical preds: / linear preds: BB0, / kind: */ + //! s2: %0:vcc = p_branch BB3 - /* --- invert --- */ - //! BB3 - //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ - //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec - //! s2: %0:vcc = p_cbranch_nz BB4, BB5 - }, [&]() -> void { - /* --- logical else --- */ - //! BB4 - //! /* logical preds: BB0, / linear preds: BB3, / kind: */ - //! p_logical_start - //! p_logical_end - //! s2: %0:vcc = p_branch BB6 + /* --- invert --- */ + //! BB3 + //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ + //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec + //! s2: %0:vcc = p_cbranch_nz BB4, BB5 + }, + [&]() -> void + { + /* --- logical else --- */ + //! BB4 + //! /* logical preds: BB0, / linear preds: BB3, / kind: */ + //! p_logical_start + //! p_logical_end + //! s2: %0:vcc = p_branch BB6 - /* --- linear else --- */ - //! BB5 - //! /* logical preds: / linear preds: BB3, / kind: */ - //! s2: %0:vcc = p_branch BB6 - }); + /* --- linear else --- */ + //! BB5 + //! /* logical preds: / linear preds: BB3, / kind: */ + //! s2: %0:vcc = p_branch BB6 + }); /* --- merge block --- */ //! BB6 @@ -695,7 +730,6 @@ BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf) finish_optimizer_postRA_test(); END_TEST - BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf_partially_overwritten) //>> s2: %a:s[2-3], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s1: %f:s[4] = p_startpgm if (!setup_cs("s2 v1 v1 s2 s1", GFX10_3)) @@ -708,59 +742,65 @@ BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf_partially_overwritten) startpgm->definitions[3].setFixed(PhysReg(0)); startpgm->definitions[4].setFixed(PhysReg(4)); - Operand a(inputs[0], PhysReg(2)); /* source for s_and */ + Operand a(inputs[0], PhysReg(2)); /* source for s_and */ Operand c(inputs[1], PhysReg(258)); /* buffer store address */ Operand d(inputs[2], PhysReg(259)); /* buffer store value */ Operand e(inputs[3], PhysReg(0)); /* condition */ Operand f(inputs[4], PhysReg(4)); /* overwrite value */ - PhysReg reg_s3(3); /* temporary register */ - PhysReg reg_s8(8); /* temporary register */ + PhysReg reg_s3(3); /* temporary register */ + PhysReg reg_s8(8); /* temporary register */ //! s2: %tmp_salu:s[8-9], s1: %tmp_salu_scc:scc = s_and_b64 %a:s[2-3], 0x40018 auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a, - Operand::c32(0x40018u)); + Operand::c32(0x40018u)); //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec //! s2: %0:vcc = p_cbranch_nz BB1, BB2 - emit_divergent_if_else(program.get(), bld, e, [&]() -> void { - /* --- logical then --- */ - //! BB1 - //! /* logical preds: BB0, / linear preds: BB0, / kind: */ - //! p_logical_start + emit_divergent_if_else( + program.get(), bld, e, + [&]() -> void + { + /* --- logical then --- */ + //! BB1 + //! /* logical preds: BB0, / linear preds: BB0, / kind: */ + //! p_logical_start - //! s1: %ovrwr:s[3] = p_parallelcopy %f:s[4] - Temp s_addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1, reg_s3), f); + //! s1: %ovrwr:s[3] = p_parallelcopy %f:s[4] + Temp s_addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1, reg_s3), f); - //! buffer_store_dword %c:v[2], %ovrwr:s[3], %d:v[3], 0 offen - bld.mubuf(aco_opcode::buffer_store_dword, c, Operand(s_addr, reg_s3), d, Operand::zero(), 0, true); + //! buffer_store_dword %c:v[2], %ovrwr:s[3], %d:v[3], 0 offen + bld.mubuf(aco_opcode::buffer_store_dword, c, Operand(s_addr, reg_s3), d, Operand::zero(), + 0, true); - //! p_logical_end - //! s2: %0:vcc = p_branch BB3 + //! p_logical_end + //! s2: %0:vcc = p_branch BB3 - /* --- linear then --- */ - //! BB2 - //! /* logical preds: / linear preds: BB0, / kind: */ - //! s2: %0:vcc = p_branch BB3 + /* --- linear then --- */ + //! BB2 + //! /* logical preds: / linear preds: BB0, / kind: */ + //! s2: %0:vcc = p_branch BB3 - /* --- invert --- */ - //! BB3 - //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ - //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec - //! s2: %0:vcc = p_cbranch_nz BB4, BB5 - }, [&]() -> void { - /* --- logical else --- */ - //! BB4 - //! /* logical preds: BB0, / linear preds: BB3, / kind: */ - //! p_logical_start - //! p_logical_end - //! s2: %0:vcc = p_branch BB6 + /* --- invert --- */ + //! BB3 + //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ + //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec + //! s2: %0:vcc = p_cbranch_nz BB4, BB5 + }, + [&]() -> void + { + /* --- logical else --- */ + //! BB4 + //! /* logical preds: BB0, / linear preds: BB3, / kind: */ + //! p_logical_start + //! p_logical_end + //! s2: %0:vcc = p_branch BB6 - /* --- linear else --- */ - //! BB5 - //! /* logical preds: / linear preds: BB3, / kind: */ - //! s2: %0:vcc = p_branch BB6 - }); + /* --- linear else --- */ + //! BB5 + //! /* logical preds: / linear preds: BB3, / kind: */ + //! s2: %0:vcc = p_branch BB6 + }); /* --- merge block --- */ //! BB6 diff --git a/src/amd/compiler/tests/test_reduce_assign.cpp b/src/amd/compiler/tests/test_reduce_assign.cpp index 93b0680a152..7f44e55486f 100644 --- a/src/amd/compiler/tests/test_reduce_assign.cpp +++ b/src/amd/compiler/tests/test_reduce_assign.cpp @@ -35,22 +35,27 @@ BEGIN_TEST(setup_reduce_temp.divergent_if_phi) * use_linear_vgpr(v0) * } * ... = phi ... - */ - //TODO: fix the RA validator to spot this + */ + // TODO: fix the RA validator to spot this //>> s2: %_, v1: %a = p_startpgm if (!setup_cs("s2 v1", GFX9)) return; //>> lv1: %lv = p_start_linear_vgpr - emit_divergent_if_else(program.get(), bld, Operand(inputs[0]), [&]() -> void { - //>> s1: %_, s2: %_, s1: %_:scc = p_reduce %a, %lv, lv1: undef op:umin32 cluster_size:64 - Instruction* reduce = bld.reduction(aco_opcode::p_reduce, bld.def(s1), - bld.def(bld.lm), bld.def(s1, scc), inputs[1], - Operand(v1.as_linear()), Operand(v1.as_linear()), umin32); - reduce->reduction().cluster_size = bld.lm.bytes() * 8; - }, [&]() -> void { - /* nothing */ - }); + emit_divergent_if_else( + program.get(), bld, Operand(inputs[0]), + [&]() -> void + { + //>> s1: %_, s2: %_, s1: %_:scc = p_reduce %a, %lv, lv1: undef op:umin32 cluster_size:64 + Instruction* reduce = + bld.reduction(aco_opcode::p_reduce, bld.def(s1), bld.def(bld.lm), bld.def(s1, scc), + inputs[1], Operand(v1.as_linear()), Operand(v1.as_linear()), umin32); + reduce->reduction().cluster_size = bld.lm.bytes() * 8; + }, + [&]() -> void + { + /* nothing */ + }); bld.pseudo(aco_opcode::p_phi, bld.def(v1), Operand::c32(1), Operand::zero()); //>> /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ //! p_end_linear_vgpr %lv diff --git a/src/amd/compiler/tests/test_regalloc.cpp b/src/amd/compiler/tests/test_regalloc.cpp index 6b2b9d38d0a..456c42359d4 100644 --- a/src/amd/compiler/tests/test_regalloc.cpp +++ b/src/amd/compiler/tests/test_regalloc.cpp @@ -37,7 +37,7 @@ BEGIN_TEST(regalloc.subdword_alloc.reuse_16bit_operands) /* TODO: is this possible to do on GFX11? */ for (amd_gfx_level cc = GFX8; cc <= GFX10_3; cc = (amd_gfx_level)((unsigned)cc + 1)) { - for (bool pessimistic : { false, true }) { + for (bool pessimistic : {false, true}) { const char* subvariant = pessimistic ? "/pessimistic" : "/optimistic"; //>> v1: %_:v[#a] = p_startpgm @@ -45,7 +45,8 @@ BEGIN_TEST(regalloc.subdword_alloc.reuse_16bit_operands) return; //! v2b: %_:v[#a][0:16], v2b: %res1:v[#a][16:32] = p_split_vector %_:v[#a] - Builder::Result tmp = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]); + Builder::Result tmp = + bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]); //! v1: %_:v[#b] = v_cvt_f32_f16 %_:v[#a][16:32] dst_sel:dword src0_sel:uword1 //! v1: %_:v[#a] = v_cvt_f32_f16 %_:v[#a][0:16] @@ -55,7 +56,7 @@ BEGIN_TEST(regalloc.subdword_alloc.reuse_16bit_operands) writeout(0, result1); writeout(1, result2); - finish_ra_test(ra_test_policy { pessimistic }); + finish_ra_test(ra_test_policy{pessimistic}); } } END_TEST @@ -67,7 +68,8 @@ BEGIN_TEST(regalloc._32bit_partial_write) /* ensure high 16 bits are occupied */ //! v2b: %_:v[0][0:16], v2b: %_:v[0][16:32] = p_split_vector %_:v[0] - Temp hi = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]).def(1).getTemp(); + Temp hi = + bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]).def(1).getTemp(); /* This test checks if this instruction uses SDWA. */ //! v2b: %_:v[0][0:16] = v_not_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword @@ -168,9 +170,9 @@ BEGIN_TEST(regalloc.precolor.multiple_operands) //! v1: %tmp3_2:v[0], v1: %tmp0_2:v[1], v1: %tmp1_2:v[2], v1: %tmp2_2:v[3] = p_parallelcopy %tmp3:v[3], %tmp0:v[0], %tmp1:v[1], %tmp2:v[2] //! p_unit_test %tmp3_2:v[0], %tmp0_2:v[1], %tmp1_2:v[2], %tmp2_2:v[3] - bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[3], PhysReg(256+0)), - Operand(inputs[0], PhysReg(256+1)), Operand(inputs[1], PhysReg(256+2)), - Operand(inputs[2], PhysReg(256+3))); + bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[3], PhysReg(256 + 0)), + Operand(inputs[0], PhysReg(256 + 1)), Operand(inputs[1], PhysReg(256 + 2)), + Operand(inputs[2], PhysReg(256 + 3))); finish_ra_test(ra_test_policy()); END_TEST @@ -182,8 +184,8 @@ BEGIN_TEST(regalloc.precolor.different_regs) //! v1: %tmp1:v[1], v1: %tmp2:v[2] = p_parallelcopy %tmp0:v[0], %tmp0:v[0] //! p_unit_test %tmp0:v[0], %tmp1:v[1], %tmp2:v[2] - bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[0], PhysReg(256+0)), - Operand(inputs[0], PhysReg(256+1)), Operand(inputs[0], PhysReg(256+2))); + bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[0], PhysReg(256 + 0)), + Operand(inputs[0], PhysReg(256 + 1)), Operand(inputs[0], PhysReg(256 + 2))); finish_ra_test(ra_test_policy()); END_TEST @@ -256,7 +258,8 @@ BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_impl) //! s1: %scc_tmp:scc, s1: %1:s[0] = p_unit_test Temp s0_tmp = bld.tmp(s1); - Temp scc_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(s1, scc), Definition(s0_tmp.id(), PhysReg{0}, s1)); + Temp scc_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(s1, scc), + Definition(s0_tmp.id(), PhysReg{0}, s1)); //! lv1: %tmp1:v[1] = p_unit_test Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v1)); @@ -273,7 +276,8 @@ BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_impl) //>> lv1: %5:v[2] = p_parallelcopy %3:v[1] scc:1 scratch:s1 Pseudo_instruction& parallelcopy = program->blocks[0].instructions[3]->pseudo(); aco_print_instr(program->gfx_level, ¶llelcopy, output); - fprintf(output, " scc:%u scratch:s%u\n", parallelcopy.tmp_in_scc, parallelcopy.scratch_sgpr.reg()); + fprintf(output, " scc:%u scratch:s%u\n", parallelcopy.tmp_in_scc, + parallelcopy.scratch_sgpr.reg()); END_TEST BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_regs_for_copies) @@ -392,13 +396,15 @@ BEGIN_TEST(regalloc.vinterp_fp16) //! v1: %tmp0:v[1] = v_interp_p10_f16_f32_inreg %lo:v[3][0:16], %in1:v[1], hi(%hi:v[3][16:32]) //! p_unit_test %tmp0:v[1] - Temp tmp0 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), lo, inputs[1], hi); + Temp tmp0 = + bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), lo, inputs[1], hi); bld.pseudo(aco_opcode::p_unit_test, tmp0); //! v2b: %tmp1:v[0][16:32] = v_interp_p2_f16_f32_inreg %in0:v[0], %in2:v[2], %tmp0:v[1] opsel_hi //! v1: %tmp2:v[0] = p_create_vector 0, %tmp1:v[0][16:32] //! p_unit_test %tmp2:v[0] - Temp tmp1 = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0], inputs[2], tmp0); + Temp tmp1 = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0], + inputs[2], tmp0); Temp tmp2 = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), Operand::zero(2), tmp1); bld.pseudo(aco_opcode::p_unit_test, tmp2); diff --git a/src/amd/compiler/tests/test_sdwa.cpp b/src/amd/compiler/tests/test_sdwa.cpp index d398373da31..f544cc70e46 100644 --- a/src/amd/compiler/tests/test_sdwa.cpp +++ b/src/amd/compiler/tests/test_sdwa.cpp @@ -34,7 +34,8 @@ BEGIN_TEST(validate.sdwa.allow) //>> Validation results: //! Validation passed - SDWA_instruction *sdwa = &bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1])->sdwa(); + SDWA_instruction* sdwa = + &bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1])->sdwa(); sdwa->neg[0] = sdwa->neg[1] = sdwa->abs[0] = sdwa->abs[1] = true; bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1b), inputs[0], inputs[1]); @@ -105,7 +106,9 @@ BEGIN_TEST(validate.sdwa.vopc) bld.vopc_sdwa(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), inputs[0], inputs[1]); //~gfx(9|10)! SDWA VOPC clamp only supported on GFX8: s2: %_:vcc = v_cmp_eq_f32 %vgpr0, %vgpr1 clamp src0_sel:dword src1_sel:dword - bld.vopc_sdwa(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm, vcc), inputs[0], inputs[1])->sdwa().clamp = true; + bld.vopc_sdwa(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm, vcc), inputs[0], inputs[1]) + ->sdwa() + .clamp = true; //! Validation failed @@ -138,11 +141,13 @@ BEGIN_TEST(validate.sdwa.vcc) //! 3rd operand must be fixed to vcc with SDWA: v1: %_ = v_cndmask_b32 %vgpr0, %vgpr1, %_ dst_sel:dword src0_sel:dword src1_sel:dword bld.vop2_sdwa(aco_opcode::v_cndmask_b32, bld.def(v1), inputs[0], inputs[1], inputs[2]); - bld.vop2_sdwa(aco_opcode::v_cndmask_b32, bld.def(v1), inputs[0], inputs[1], bld.vcc(inputs[2])); + bld.vop2_sdwa(aco_opcode::v_cndmask_b32, bld.def(v1), inputs[0], inputs[1], + bld.vcc(inputs[2])); //! 2nd definition must be fixed to vcc with SDWA: v1: %_, s2: %_ = v_add_co_u32 %vgpr0, %vgpr1 dst_sel:dword src0_sel:dword src1_sel:dword bld.vop2_sdwa(aco_opcode::v_add_co_u32, bld.def(v1), bld.def(bld.lm), inputs[0], inputs[1]); - bld.vop2_sdwa(aco_opcode::v_add_co_u32, bld.def(v1), bld.def(bld.lm, vcc), inputs[0], inputs[1]); + bld.vop2_sdwa(aco_opcode::v_add_co_u32, bld.def(v1), bld.def(bld.lm, vcc), inputs[0], + inputs[1]); //! Validation failed @@ -152,125 +157,127 @@ END_TEST BEGIN_TEST(optimize.sdwa.extract) for (unsigned i = GFX7; i <= GFX10; i++) { - for (unsigned is_signed = 0; is_signed <= 1; is_signed++) { - //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm - if (!setup_cs("v1 v1 s1 s1", (amd_gfx_level)i, CHIP_UNKNOWN, is_signed ? "_signed" : "_unsigned")) - continue; + for (unsigned is_signed = 0; is_signed <= 1; is_signed++) { + //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm + if (!setup_cs("v1 v1 s1 s1", (amd_gfx_level)i, CHIP_UNKNOWN, + is_signed ? "_signed" : "_unsigned")) + continue; - //; def standard_test(index, sel): - //; res = 'v1: %%res%s = v_mul_f32 %%a, %%b dst_sel:dword src0_sel:dword src1_sel:%c%s\n' % (index, 's' if variant.endswith('_signed') else 'u', sel) - //; res += 'p_unit_test %s, %%res%s' % (index, index) - //; return res - //; funcs['standard_test'] = lambda a: standard_test(*(v for v in a.split(','))) + //; def standard_test(index, sel): + //; res = 'v1: %%res%s = v_mul_f32 %%a, %%b dst_sel:dword src0_sel:dword src1_sel:%c%s\n' % (index, 's' if variant.endswith('_signed') else 'u', sel) + //; res += 'p_unit_test %s, %%res%s' % (index, index) + //; return res + //; funcs['standard_test'] = lambda a: standard_test(*(v for v in a.split(','))) - aco_opcode ext = aco_opcode::p_extract; - aco_opcode ins = aco_opcode::p_insert; + aco_opcode ext = aco_opcode::p_extract; + aco_opcode ins = aco_opcode::p_insert; - { - //~gfx[^7].*! @standard_test(0,byte0) - Temp bfe_byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u), - Operand::c32(is_signed)); - writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte0_b)); + { + //~gfx[^7].*! @standard_test(0,byte0) + Temp bfe_byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), + Operand::c32(8u), Operand::c32(is_signed)); + writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte0_b)); - //~gfx[^7].*! @standard_test(1,byte1) - Temp bfe_byte1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), Operand::c32(8u), - Operand::c32(is_signed)); - writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte1_b)); + //~gfx[^7].*! @standard_test(1,byte1) + Temp bfe_byte1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), + Operand::c32(8u), Operand::c32(is_signed)); + writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte1_b)); - //~gfx[^7].*! @standard_test(2,byte2) - Temp bfe_byte2_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(2u), Operand::c32(8u), - Operand::c32(is_signed)); - writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte2_b)); + //~gfx[^7].*! @standard_test(2,byte2) + Temp bfe_byte2_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(2u), + Operand::c32(8u), Operand::c32(is_signed)); + writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte2_b)); - //~gfx[^7].*! @standard_test(3,byte3) - Temp bfe_byte3_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(3u), Operand::c32(8u), - Operand::c32(is_signed)); - writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte3_b)); + //~gfx[^7].*! @standard_test(3,byte3) + Temp bfe_byte3_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(3u), + Operand::c32(8u), Operand::c32(is_signed)); + writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte3_b)); - //~gfx[^7].*! @standard_test(4,word0) - Temp bfe_word0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(16u), - Operand::c32(is_signed)); - writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_word0_b)); + //~gfx[^7].*! @standard_test(4,word0) + Temp bfe_word0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), + Operand::c32(16u), Operand::c32(is_signed)); + writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_word0_b)); - //~gfx[^7].*! @standard_test(5,word1) - Temp bfe_word1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), - Operand::c32(16u), Operand::c32(is_signed)); - writeout(5, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_word1_b)); + //~gfx[^7].*! @standard_test(5,word1) + Temp bfe_word1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), + Operand::c32(16u), Operand::c32(is_signed)); + writeout(5, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_word1_b)); - //~gfx[^7]_unsigned! @standard_test(6,byte0) - Temp bfi_byte0_b = bld.pseudo(ins, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u)); - writeout(6, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_byte0_b)); + //~gfx[^7]_unsigned! @standard_test(6,byte0) + Temp bfi_byte0_b = + bld.pseudo(ins, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u)); + writeout(6, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_byte0_b)); - //~gfx[^7]_unsigned! @standard_test(7,word0) - Temp bfi_word0_b = - bld.pseudo(ins, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(16u)); - writeout(7, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_word0_b)); + //~gfx[^7]_unsigned! @standard_test(7,word0) + Temp bfi_word0_b = + bld.pseudo(ins, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(16u)); + writeout(7, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_word0_b)); + } + + //>> p_unit_test 63 + writeout(63); + + { + //! v1: %tmp8 = p_insert %b, 1, 8 + //! v1: %res8 = v_mul_f32 %a, %tmp8 + //! p_unit_test 8, %res8 + Temp bfi_byte1_b = + bld.pseudo(ins, bld.def(v1), inputs[1], Operand::c32(1u), Operand::c32(8u)); + writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_byte1_b)); + + /* v_cvt_f32_ubyte[0-3] can be used instead of v_cvt_f32_u32+sdwa */ + //~gfx7_signed! v1: %bfe_byte0_b = p_extract %b, 0, 8, 1 + //~gfx7_signed! v1: %res9 = v_cvt_f32_u32 %bfe_byte0_b + //~gfx[^7]+_signed! v1: %res9 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte0 + //~gfx\d+_unsigned! v1: %res9 = v_cvt_f32_ubyte0 %b + //! p_unit_test 9, %res9 + Temp bfe_byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), + Operand::c32(8u), Operand::c32(is_signed)); + writeout(9, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte0_b)); + + //~gfx7_signed! v1: %bfe_byte1_b = p_extract %b, 1, 8, 1 + //~gfx7_signed! v1: %res10 = v_cvt_f32_u32 %bfe_byte1_b + //~gfx[^7]+_signed! v1: %res10 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte1 + //~gfx\d+_unsigned! v1: %res10 = v_cvt_f32_ubyte1 %b + //! p_unit_test 10, %res10 + Temp bfe_byte1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), + Operand::c32(8u), Operand::c32(is_signed)); + writeout(10, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte1_b)); + + //~gfx7_signed! v1: %bfe_byte2_b = p_extract %b, 2, 8, 1 + //~gfx7_signed! v1: %res11 = v_cvt_f32_u32 %bfe_byte2_b + //~gfx[^7]+_signed! v1: %res11 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte2 + //~gfx\d+_unsigned! v1: %res11 = v_cvt_f32_ubyte2 %b + //! p_unit_test 11, %res11 + Temp bfe_byte2_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(2u), + Operand::c32(8u), Operand::c32(is_signed)); + writeout(11, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte2_b)); + + //~gfx7_signed! v1: %bfe_byte3_b = p_extract %b, 3, 8, 1 + //~gfx7_signed! v1: %res12 = v_cvt_f32_u32 %bfe_byte3_b + //~gfx[^7]+_signed! v1: %res12 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte3 + //~gfx\d+_unsigned! v1: %res12 = v_cvt_f32_ubyte3 %b + //! p_unit_test 12, %res12 + Temp bfe_byte3_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(3u), + Operand::c32(8u), Operand::c32(is_signed)); + writeout(12, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte3_b)); + + /* VOP3-only instructions can't use SDWA but they can use opsel on GFX9+ instead */ + //~gfx(9|10).*! v1: %res13 = v_add_i16 %a, %b + //~gfx(9|10).*! p_unit_test 13, %res13 + Temp bfe_word0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), + Operand::c32(16u), Operand::c32(is_signed)); + writeout(13, bld.vop3(aco_opcode::v_add_i16, bld.def(v1), inputs[0], bfe_word0_b)); + + //~gfx(9|10).*! v1: %res14 = v_add_i16 %a, hi(%b) + //~gfx(9|10).*! p_unit_test 14, %res14 + Temp bfe_word1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), + Operand::c32(16u), Operand::c32(is_signed)); + writeout(14, bld.vop3(aco_opcode::v_add_i16, bld.def(v1), inputs[0], bfe_word1_b)); + } + + finish_opt_test(); } - - //>> p_unit_test 63 - writeout(63); - - { - //! v1: %tmp8 = p_insert %b, 1, 8 - //! v1: %res8 = v_mul_f32 %a, %tmp8 - //! p_unit_test 8, %res8 - Temp bfi_byte1_b = - bld.pseudo(ins, bld.def(v1), inputs[1], Operand::c32(1u), Operand::c32(8u)); - writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_byte1_b)); - - /* v_cvt_f32_ubyte[0-3] can be used instead of v_cvt_f32_u32+sdwa */ - //~gfx7_signed! v1: %bfe_byte0_b = p_extract %b, 0, 8, 1 - //~gfx7_signed! v1: %res9 = v_cvt_f32_u32 %bfe_byte0_b - //~gfx[^7]+_signed! v1: %res9 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte0 - //~gfx\d+_unsigned! v1: %res9 = v_cvt_f32_ubyte0 %b - //! p_unit_test 9, %res9 - Temp bfe_byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u), - Operand::c32(is_signed)); - writeout(9, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte0_b)); - - //~gfx7_signed! v1: %bfe_byte1_b = p_extract %b, 1, 8, 1 - //~gfx7_signed! v1: %res10 = v_cvt_f32_u32 %bfe_byte1_b - //~gfx[^7]+_signed! v1: %res10 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte1 - //~gfx\d+_unsigned! v1: %res10 = v_cvt_f32_ubyte1 %b - //! p_unit_test 10, %res10 - Temp bfe_byte1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), Operand::c32(8u), - Operand::c32(is_signed)); - writeout(10, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte1_b)); - - //~gfx7_signed! v1: %bfe_byte2_b = p_extract %b, 2, 8, 1 - //~gfx7_signed! v1: %res11 = v_cvt_f32_u32 %bfe_byte2_b - //~gfx[^7]+_signed! v1: %res11 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte2 - //~gfx\d+_unsigned! v1: %res11 = v_cvt_f32_ubyte2 %b - //! p_unit_test 11, %res11 - Temp bfe_byte2_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(2u), Operand::c32(8u), - Operand::c32(is_signed)); - writeout(11, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte2_b)); - - //~gfx7_signed! v1: %bfe_byte3_b = p_extract %b, 3, 8, 1 - //~gfx7_signed! v1: %res12 = v_cvt_f32_u32 %bfe_byte3_b - //~gfx[^7]+_signed! v1: %res12 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte3 - //~gfx\d+_unsigned! v1: %res12 = v_cvt_f32_ubyte3 %b - //! p_unit_test 12, %res12 - Temp bfe_byte3_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(3u), Operand::c32(8u), - Operand::c32(is_signed)); - writeout(12, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte3_b)); - - /* VOP3-only instructions can't use SDWA but they can use opsel on GFX9+ instead */ - //~gfx(9|10).*! v1: %res13 = v_add_i16 %a, %b - //~gfx(9|10).*! p_unit_test 13, %res13 - Temp bfe_word0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(16u), - Operand::c32(is_signed)); - writeout(13, bld.vop3(aco_opcode::v_add_i16, bld.def(v1), inputs[0], bfe_word0_b)); - - //~gfx(9|10).*! v1: %res14 = v_add_i16 %a, hi(%b) - //~gfx(9|10).*! p_unit_test 14, %res14 - Temp bfe_word1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), - Operand::c32(16u), Operand::c32(is_signed)); - writeout(14, bld.vop3(aco_opcode::v_add_i16, bld.def(v1), inputs[0], bfe_word1_b)); - } - - finish_opt_test(); - } } END_TEST diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index f6d08924a9f..c067c83fe63 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -52,8 +52,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v2b), Definition(v1_lo, v2b), + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v1_lo, v2b), Operand(v1_lo, v2b), Operand(v0_lo, v2b)); //~gfx[67]! p_unit_test 1 @@ -61,9 +60,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2 //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u)); - bld.pseudo(aco_opcode::p_create_vector, - Definition(v0_lo, v1), - Operand(v1_lo, v2b), Operand(v0_lo, v2b)); + bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v1_lo, v2b), + Operand(v0_lo, v2b)); //~gfx[67]! p_unit_test 2 //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16] @@ -71,8 +69,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); - bld.pseudo(aco_opcode::p_create_vector, - Definition(v0_lo, v6b), Operand(v1_lo, v2b), + bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v6b), Operand(v1_lo, v2b), Operand(v0_lo, v2b), Operand(v2_lo, v2b)); //~gfx[67]! p_unit_test 3 @@ -82,10 +79,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[2][0:16] //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[3][0:16], %0:v[1][16:32], 2 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u)); - bld.pseudo(aco_opcode::p_create_vector, - Definition(v0_lo, v2), - Operand(v1_lo, v2b), Operand(v0_lo, v2b), - Operand(v2_lo, v2b), Operand(v3_lo, v2b)); + bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2), Operand(v1_lo, v2b), + Operand(v0_lo, v2b), Operand(v2_lo, v2b), Operand(v3_lo, v2b)); //~gfx[67]! p_unit_test 4 //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16] @@ -96,17 +91,14 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); - bld.pseudo(aco_opcode::p_create_vector, - Definition(v0_lo, v2), - Operand(v1_lo, v2b), Operand(v2_lo, v2b), - Operand(v0_lo, v2b), Operand(v3_lo, v2b)); + bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2), Operand(v1_lo, v2b), + Operand(v2_lo, v2b), Operand(v0_lo, v2b), Operand(v3_lo, v2b)); //~gfx[67]! p_unit_test 5 //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] //~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u)); - bld.pseudo(aco_opcode::p_split_vector, - Definition(v1_lo, v2b), Definition(v0_lo, v2b), + bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b), Operand(v0_lo, v1)); //~gfx[67]! p_unit_test 6 @@ -114,8 +106,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] //~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u)); - bld.pseudo(aco_opcode::p_split_vector, - Definition(v1_lo, v2b), Definition(v0_lo, v2b), + bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b), Definition(v2_lo, v2b), Operand(v0_lo, v6b)); //~gfx[67]! p_unit_test 7 @@ -124,10 +115,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32] //~gfx[67]! v2b: %0:v[3][0:16] = v_lshrrev_b32 16, %0:v[2][16:32] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u)); - bld.pseudo(aco_opcode::p_split_vector, - Definition(v1_lo, v2b), Definition(v0_lo, v2b), - Definition(v2_lo, v2b), Definition(v3_lo, v2b), - Operand(v0_lo, v2)); + bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b), + Definition(v2_lo, v2b), Definition(v3_lo, v2b), Operand(v0_lo, v2)); //~gfx[67]! p_unit_test 8 //~gfx[67]! v2b: %0:v[2][0:16] = v_lshrrev_b32 16, %0:v[0][16:32] @@ -136,18 +125,15 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u)); - bld.pseudo(aco_opcode::p_split_vector, - Definition(v1_lo, v2b), Definition(v2_lo, v2b), - Definition(v0_lo, v2b), Definition(v3_lo, v2b), - Operand(v0_lo, v2)); + bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v2_lo, v2b), + Definition(v0_lo, v2b), Definition(v3_lo, v2b), Operand(v0_lo, v2)); //~gfx[67]! p_unit_test 9 //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v1b), Definition(v1_lo, v1b), + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v1_lo, v1b), Operand(v1_lo, v1b), Operand(v0_lo, v1b)); //~gfx[67]! p_unit_test 10 @@ -155,9 +141,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3 //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u)); - bld.pseudo(aco_opcode::p_create_vector, - Definition(v0_lo, v2b), - Operand(v1_lo, v1b), Operand(v0_lo, v1b)); + bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2b), Operand(v1_lo, v1b), + Operand(v0_lo, v1b)); //~gfx[67]! p_unit_test 11 //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8] @@ -166,8 +151,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16] //~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u)); - bld.pseudo(aco_opcode::p_create_vector, - Definition(v0_lo, v3b), Operand(v1_lo, v1b), + bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v3b), Operand(v1_lo, v1b), Operand(v0_lo, v1b), Operand(v2_lo, v1b)); //~gfx[67]! p_unit_test 12 @@ -179,10 +163,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v3b: %0:v[0][8:32] = v_lshlrev_b32 8, %0:v[0][0:24] //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:8], %0:v[0][8:32], 1 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u)); - bld.pseudo(aco_opcode::p_create_vector, - Definition(v0_lo, v1), - Operand(v1_lo, v1b), Operand(v0_lo, v1b), - Operand(v2_lo, v1b), Operand(v3_lo, v1b)); + bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v1_lo, v1b), + Operand(v0_lo, v1b), Operand(v2_lo, v1b), Operand(v3_lo, v1b)); //~gfx[67]! p_unit_test 13 //~gfx[67]! v1b: %0:v[0][0:8] = v_and_b32 0xff, %0:v[0][0:8] @@ -193,18 +175,16 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! s1: %0:m0 = s_mov_b32 0x1000001 //~gfx[67]! v1: %0:v[0] = v_mul_lo_u32 %0:m0, %0:v[0][0:8] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u)); - Instruction* pseudo = bld.pseudo(aco_opcode::p_create_vector, - Definition(v0_lo, v1), - Operand(v0_lo, v1b), Operand(v0_lo, v1b), - Operand(v0_lo, v1b), Operand(v0_lo, v1b)); + Instruction* pseudo = + bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v0_lo, v1b), + Operand(v0_lo, v1b), Operand(v0_lo, v1b), Operand(v0_lo, v1b)); pseudo->pseudo().scratch_sgpr = m0; //~gfx[67]! p_unit_test 14 //~gfx[67]! v1b: %0:v[1][0:8] = v_mov_b32 %0:v[0][0:8] //~gfx[67]! v1b: %0:v[0][0:8] = v_lshrrev_b32 8, %0:v[1][8:16] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u)); - bld.pseudo(aco_opcode::p_split_vector, - Definition(v1_lo, v1b), Definition(v0_lo, v1b), + bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v1b), Definition(v0_lo, v1b), Operand(v0_lo, v2b)); //~gfx[67]! p_unit_test 15 @@ -213,10 +193,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[67]! v1b: %0:v[2][0:8] = v_lshrrev_b32 16, %0:v[1][16:24] //~gfx[67]! v1b: %0:v[3][0:8] = v_lshrrev_b32 24, %0:v[1][24:32] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u)); - bld.pseudo(aco_opcode::p_split_vector, - Definition(v1_lo, v1b), Definition(v0_lo, v1b), - Definition(v2_lo, v1b), Definition(v3_lo, v1b), - Operand(v0_lo, v1)); + bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v1b), Definition(v0_lo, v1b), + Definition(v2_lo, v1b), Definition(v3_lo, v1b), Operand(v0_lo, v1)); //~gfx[67]! s_endpgm @@ -231,8 +209,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 //~gfx(9|11)! v1: %0:v[0] = v_pack_b32_f16 hi(%0:v[0][16:32]), %0:v[0][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v2b), Definition(v0_hi, v2b), + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), Operand(v0_hi, v2b), Operand(v0_lo, v2b)); //~gfx(8|9|11)! p_unit_test 1 @@ -243,8 +220,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v1), Definition(v1_lo, v2b), + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b)); //~gfx(8|9|11)! p_unit_test 2 @@ -259,9 +235,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16] //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v1), Definition(v1_lo, v2b), Definition(v1_hi, v2b), - Operand(v1_lo, v1), Operand(v0_lo, v2b), Operand(v0_lo, v2b)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), + Definition(v1_hi, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b), + Operand(v0_lo, v2b)); //~gfx(8|9|11)! p_unit_test 3 //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] @@ -273,8 +249,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 %0:v[0][0:16] //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v1), Definition(v1_b3, v1b), + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_b3, v1b), Operand(v1_lo, v1), Operand(v0_b3, v1b)); //~gfx(8|9|11)! p_unit_test 4 @@ -287,8 +262,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104 //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v1), Definition(v1_lo, v1b), + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b), Operand(v1_lo, v1), Operand(v0_lo, v1b)); //~gfx(8|9|11)! p_unit_test 5 @@ -301,9 +275,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060104 //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x3060504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v1b), Definition(v0_hi, v1b), Definition(v1_lo, v1), - Operand(v1_lo, v1b), Operand(v1_hi, v1b), Operand(v0_lo, v1)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v0_hi, v1b), + Definition(v1_lo, v1), Operand(v1_lo, v1b), Operand(v1_hi, v1b), + Operand(v0_lo, v1)); //~gfx(8|9|11)! p_unit_test 6 //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] @@ -311,9 +285,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v2b), Definition(v0_hi, v2b), Definition(v1_lo, v1), - Operand(v1_lo, v2b), Operand(v1_hi, v2b), Operand(v0_lo, v1)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Definition(v1_lo, v1), Operand(v1_lo, v2b), Operand(v1_hi, v2b), + Operand(v0_lo, v1)); //~gfx(8|9|11)! p_unit_test 7 //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1] @@ -322,9 +296,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx(9|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1] //~gfx(8|9|11)! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v2b), Definition(v0_hi, v2b), Definition(v1_lo, v1), - Operand(v1_hi, v2b), Operand(v1_lo, v2b), Operand(v0_lo, v1)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Definition(v1_lo, v1), Operand(v1_hi, v2b), Operand(v1_lo, v2b), + Operand(v0_lo, v1)); //~gfx(8|9|11)! p_unit_test 8 //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] @@ -342,8 +316,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v3b), Definition(v1_lo, v3b), + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b), Operand(v1_lo, v3b), Operand(v0_lo, v3b)); //~gfx(8|9|11)! p_unit_test 9 @@ -354,9 +327,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x3060504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_lo, v3b), Definition(v1_lo, v3b), Definition(v0_b3, v1b), - Operand(v1_lo, v3b), Operand(v0_lo, v3b), Operand(v1_b3, v1b)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b), + Definition(v0_b3, v1b), Operand(v1_lo, v3b), Operand(v0_lo, v3b), + Operand(v1_b3, v1b)); //~gfx(8|9|11)! p_unit_test 10 //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 @@ -380,8 +353,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_b1, v2b), Definition(v1_b1, v2b), + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Definition(v1_b1, v2b), Operand(v1_b1, v2b), Operand(v0_b1, v2b)); //~gfx(8|9|11)! p_unit_test 11 @@ -398,8 +370,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1 //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u)); - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(v0_b1, v1b), Definition(v0_b3, v1b), + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v1b), Definition(v0_b3, v1b), Operand(v0_b3, v1b), Operand(v0_b1, v1b)); //~gfx(8|9|11)! s_endpgm @@ -535,8 +506,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0xff bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u)); - bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), - Operand::c16(0x00ff)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x00ff)); //! p_unit_test 14 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0] @@ -544,29 +514,25 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504 //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0xffffff00 opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u)); - bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), - Operand::c16(0xff00)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0xff00)); //! p_unit_test 15 //~gfx(9|10)! v2b: %_:v[0][0:16] = v_mov_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u)); - bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), - Operand::zero(2)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::zero(2)); //! p_unit_test 16 //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 -1 dst_sel:ubyte0 dst_preserve src0_sel:dword //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x706050d bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16u)); - bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), - Operand::c8(0xff)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::c8(0xff)); //! p_unit_test 17 //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x706050c bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17u)); - bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), - Operand::zero(1)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::zero(1)); //! s_endpgm @@ -589,12 +555,12 @@ BEGIN_TEST(to_hw_instr.self_intersecting_swap) //! v1: %0:v[3], v1: %0:v[7] = v_swap_b32 %0:v[7], %0:v[3] //! s_endpgm bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); - //v[1:2] = v[2:3] - //v3 = v7 - //v7 = v1 - bld.pseudo(aco_opcode::p_parallelcopy, - Definition(reg_v1, v2), Definition(reg_v3, v1), Definition(reg_v7, v1), - Operand(reg_v2, v2), Operand(reg_v7, v1), Operand(reg_v1, v1)); + // v[1:2] = v[2:3] + // v3 = v7 + // v7 = v1 + bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v1, v2), Definition(reg_v3, v1), + Definition(reg_v7, v1), Operand(reg_v2, v2), Operand(reg_v7, v1), + Operand(reg_v1, v1)); finish_to_hw_instr_test(); END_TEST @@ -606,98 +572,98 @@ BEGIN_TEST(to_hw_instr.extract) PhysReg v1_lo{257}; for (amd_gfx_level lvl : {GFX7, GFX8, GFX9, GFX11}) { - for (unsigned is_signed = 0; is_signed <= 1; is_signed++) { - if (!setup_cs(NULL, lvl, CHIP_UNKNOWN, is_signed ? "_signed" : "_unsigned")) - continue; + for (unsigned is_signed = 0; is_signed <= 1; is_signed++) { + if (!setup_cs(NULL, lvl, CHIP_UNKNOWN, is_signed ? "_signed" : "_unsigned")) + continue; #define EXT(idx, size) \ bld.pseudo(aco_opcode::p_extract, Definition(v0_lo, v1), Operand(v1_lo, v1), Operand::c32(idx), \ Operand::c32(size), Operand::c32(is_signed)); - //; funcs['v_bfe'] = lambda _: 'v_bfe_i32' if variant.endswith('_signed') else 'v_bfe_u32' - //; funcs['v_shr'] = lambda _: 'v_ashrrev_i32' if variant.endswith('_signed') else 'v_lshrrev_b32' - //; funcs['s_bfe'] = lambda _: 's_bfe_i32' if variant.endswith('_signed') else 's_bfe_u32' - //; funcs['s_shr'] = lambda _: 's_ashr_i32' if variant.endswith('_signed') else 's_lshr_b32' - //; funcs['byte'] = lambda n: '%cbyte%s' % ('s' if variant.endswith('_signed') else 'u', n) + //; funcs['v_bfe'] = lambda _: 'v_bfe_i32' if variant.endswith('_signed') else 'v_bfe_u32' + //; funcs['v_shr'] = lambda _: 'v_ashrrev_i32' if variant.endswith('_signed') else 'v_lshrrev_b32' + //; funcs['s_bfe'] = lambda _: 's_bfe_i32' if variant.endswith('_signed') else 's_bfe_u32' + //; funcs['s_shr'] = lambda _: 's_ashr_i32' if variant.endswith('_signed') else 's_lshr_b32' + //; funcs['byte'] = lambda n: '%cbyte%s' % ('s' if variant.endswith('_signed') else 'u', n) - //>> p_unit_test 0 - bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); - //! v1: %_:v[0] = @v_bfe %_:v[1], 0, 8 - EXT(0, 8) - //! v1: %_:v[0] = @v_bfe %_:v[1], 8, 8 - EXT(1, 8) - //! v1: %_:v[0] = @v_bfe %_:v[1], 16, 8 - EXT(2, 8) - //! v1: %_:v[0] = @v_shr 24, %_:v[1] - EXT(3, 8) - //~gfx(7|8|9)_.*! v1: %_:v[0] = @v_bfe %_:v[1], 0, 16 - //~gfx11_unsigned! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1] - //~gfx11_signed! v1: %_:v[0] = v_cvt_i32_i16 %_:v[1] - EXT(0, 16) - //! v1: %_:v[0] = @v_shr 16, %_:v[1] - EXT(1, 16) + //>> p_unit_test 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); + //! v1: %_:v[0] = @v_bfe %_:v[1], 0, 8 + EXT(0, 8) + //! v1: %_:v[0] = @v_bfe %_:v[1], 8, 8 + EXT(1, 8) + //! v1: %_:v[0] = @v_bfe %_:v[1], 16, 8 + EXT(2, 8) + //! v1: %_:v[0] = @v_shr 24, %_:v[1] + EXT(3, 8) + //~gfx(7|8|9)_.*! v1: %_:v[0] = @v_bfe %_:v[1], 0, 16 + //~gfx11_unsigned! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1] + //~gfx11_signed! v1: %_:v[0] = v_cvt_i32_i16 %_:v[1] + EXT(0, 16) + //! v1: %_:v[0] = @v_shr 16, %_:v[1] + EXT(1, 16) - #undef EXT +#undef EXT #define EXT(idx, size) \ bld.pseudo(aco_opcode::p_extract, Definition(s0_lo, s1), Definition(scc, s1), \ Operand(s1_lo, s1), Operand::c32(idx), Operand::c32(size), Operand::c32(is_signed)); - //>> p_unit_test 2 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); - //~gfx.*_unsigned! s1: %_:s[0], s1: %_:scc = @s_bfe %_:s[1], 0x80000 - //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i8 %_:s[1] - EXT(0, 8) - //! s1: %_:s[0], s1: %_:scc = @s_bfe %_:s[1], 0x80008 - EXT(1, 8) - //! s1: %_:s[0], s1: %_:scc = @s_bfe %_:s[1], 0x80010 - EXT(2, 8) - //! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 24 - EXT(3, 8) - //~gfx(7|8)_unsigned! s1: %_:s[0], s1: %_:scc = @s_bfe %_:s[1], 0x100000 - //~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0 - //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i16 %_:s[1] - EXT(0, 16) - //! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16 - EXT(1, 16) + //>> p_unit_test 2 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); + //~gfx.*_unsigned! s1: %_:s[0], s1: %_:scc = @s_bfe %_:s[1], 0x80000 + //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i8 %_:s[1] + EXT(0, 8) + //! s1: %_:s[0], s1: %_:scc = @s_bfe %_:s[1], 0x80008 + EXT(1, 8) + //! s1: %_:s[0], s1: %_:scc = @s_bfe %_:s[1], 0x80010 + EXT(2, 8) + //! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 24 + EXT(3, 8) + //~gfx(7|8)_unsigned! s1: %_:s[0], s1: %_:scc = @s_bfe %_:s[1], 0x100000 + //~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0 + //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i16 %_:s[1] + EXT(0, 16) + //! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16 + EXT(1, 16) - #undef EXT +#undef EXT #define EXT(idx, src_b) \ bld.pseudo(aco_opcode::p_extract, Definition(v0_lo, v2b), Operand(v1_lo.advance(src_b), v2b), \ Operand::c32(idx), Operand::c32(8u), Operand::c32(is_signed)); - //>> p_unit_test 4 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); - //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 0, 8 - //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(0) - //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c00 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060000 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04 - EXT(0, 0) - //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(2) - //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c02 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060202 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04 - if (lvl != GFX7) - EXT(0, 2) - //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 8, 8 - //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(1) - //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c01 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060801 - EXT(1, 0) - //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(3) - //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c03 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060903 - if (lvl != GFX7) - EXT(1, 2) + //>> p_unit_test 4 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); + //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 0, 8 + //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(0) + //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c00 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060000 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04 + EXT(0, 0) + //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(2) + //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c02 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060202 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04 + if (lvl != GFX7) + EXT(0, 2) + //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 8, 8 + //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(1) + //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c01 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060801 + EXT(1, 0) + //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(3) + //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c03 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060903 + if (lvl != GFX7) + EXT(1, 2) - #undef EXT +#undef EXT - finish_to_hw_instr_test(); + finish_to_hw_instr_test(); - //! s_endpgm - } + //! s_endpgm + } } END_TEST @@ -736,7 +702,7 @@ BEGIN_TEST(to_hw_instr.insert) //! v1: %0:v[0] = v_lshlrev_b32 16, %0:v[1] INS(1, 16) - #undef INS +#undef INS #define INS(idx, size) \ bld.pseudo(aco_opcode::p_insert, Definition(s0_lo, s1), Definition(scc, s1), \ @@ -759,7 +725,7 @@ BEGIN_TEST(to_hw_instr.insert) //! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 16 INS(1, 16) - #undef INS +#undef INS #define INS(idx, def_b) \ bld.pseudo(aco_opcode::p_insert, Definition(v0_lo.advance(def_b), v2b), Operand(v1_lo, v2b), \ @@ -784,7 +750,7 @@ BEGIN_TEST(to_hw_instr.insert) if (lvl != GFX7) INS(1, 2) - #undef INS +#undef INS finish_to_hw_instr_test(); @@ -816,10 +782,9 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_scc) //! lv1: %0:v[0] = v_mov_b32 %0:v[1] //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec //! s1: %0:scc = s_cmp_lg_i32 %0:m0, 0 - Instruction *instr = bld.pseudo( - aco_opcode::p_parallelcopy, - Definition(scc, s1), Definition(v0_lo, v1.as_linear()), - Operand(reg_s0, s1), Operand(v1_lo, v1.as_linear())); + Instruction* instr = + bld.pseudo(aco_opcode::p_parallelcopy, Definition(scc, s1), Definition(v0_lo, v1.as_linear()), + Operand(reg_s0, s1), Operand(v1_lo, v1.as_linear())); instr->pseudo().scratch_sgpr = m0; finish_to_hw_instr_test(); @@ -836,10 +801,9 @@ BEGIN_TEST(to_hw_instr.swap_linear_vgpr) //>> p_unit_test 0 bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); - Instruction *instr = bld.pseudo( - aco_opcode::p_parallelcopy, - Definition(reg_v0, v1_linear), Definition(reg_v1, v1_linear), - Operand(reg_v1, v1_linear), Operand(reg_v0, v1_linear)); + Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear), + Definition(reg_v1, v1_linear), Operand(reg_v1, v1_linear), + Operand(reg_v0, v1_linear)); instr->pseudo().scratch_sgpr = m0; finish_to_hw_instr_test();