mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 13:10:10 +01:00
intel/elk: Remove DPAS opcode
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27629>
This commit is contained in:
parent
7b90470ca1
commit
24569b8079
22 changed files with 28 additions and 703 deletions
|
|
@ -52,12 +52,6 @@ elk_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
|
|||
/* Default to the sampler since that's what we've done since forever */
|
||||
compiler->indirect_ubos_use_sampler = true;
|
||||
|
||||
compiler->lower_dpas = devinfo->verx10 < 125 ||
|
||||
intel_device_info_is_mtl(devinfo) ||
|
||||
(intel_device_info_is_arl(devinfo) &&
|
||||
devinfo->platform != INTEL_PLATFORM_ARL_H) ||
|
||||
debug_get_bool_option("INTEL_LOWER_DPAS", false);
|
||||
|
||||
/* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
|
||||
for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
|
||||
compiler->scalar_stage[i] = devinfo->ver >= 8 ||
|
||||
|
|
@ -175,8 +169,6 @@ elk_get_compiler_config_value(const struct elk_compiler *compiler)
|
|||
|
||||
insert_u64_bit(&config, compiler->precise_trig);
|
||||
bits++;
|
||||
insert_u64_bit(&config, compiler->lower_dpas);
|
||||
bits++;
|
||||
|
||||
uint64_t mask = DEBUG_DISK_CACHE_MASK;
|
||||
bits += util_bitcount64(mask);
|
||||
|
|
|
|||
|
|
@ -131,14 +131,6 @@ struct elk_compiler {
|
|||
*/
|
||||
bool use_bindless_sampler_offset;
|
||||
|
||||
/**
|
||||
* Should DPAS instructions be lowered?
|
||||
*
|
||||
* This will be set for all platforms before Gfx12.5. It may also be set on
|
||||
* platforms that support DPAS for testing purposes.
|
||||
*/
|
||||
bool lower_dpas;
|
||||
|
||||
/**
|
||||
* Calling the ra_allocate function after each register spill can take
|
||||
* several minutes. This option speeds up shader compilation by spilling
|
||||
|
|
@ -1218,7 +1210,6 @@ struct elk_cs_prog_data {
|
|||
bool uses_num_work_groups;
|
||||
bool uses_inline_data;
|
||||
bool uses_btd_stack_ids;
|
||||
bool uses_systolic;
|
||||
uint8_t generate_local_id;
|
||||
enum intel_compute_walk_order walk_order;
|
||||
|
||||
|
|
|
|||
|
|
@ -810,13 +810,6 @@ static const char* const xe2_lsc_cache_store[] = {
|
|||
[XE2_LSC_CACHE_STORE_L1WB_L3WB] = "L1WB_L3WB",
|
||||
};
|
||||
|
||||
static const char* const dpas_systolic_depth[4] = {
|
||||
[0] = "16",
|
||||
[1] = "2",
|
||||
[2] = "4",
|
||||
[3] = "8"
|
||||
};
|
||||
|
||||
static int column;
|
||||
|
||||
static int
|
||||
|
|
@ -1057,27 +1050,6 @@ dest_3src(FILE *file, const struct intel_device_info *devinfo,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
dest_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
|
||||
const elk_inst *inst)
|
||||
{
|
||||
uint32_t reg_file = elk_inst_dpas_3src_dst_reg_file(devinfo, inst);
|
||||
|
||||
if (reg(file, reg_file, elk_inst_dpas_3src_dst_reg_nr(devinfo, inst)) == -1)
|
||||
return 0;
|
||||
|
||||
enum elk_reg_type type = elk_inst_dpas_3src_dst_type(devinfo, inst);
|
||||
unsigned subreg_nr = elk_inst_dpas_3src_dst_subreg_nr(devinfo, inst);
|
||||
|
||||
if (subreg_nr)
|
||||
format(file, ".%u", subreg_nr);
|
||||
string(file, "<1>");
|
||||
|
||||
string(file, elk_reg_type_to_letters(type));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
src_align1_region(FILE *file,
|
||||
unsigned _vert_stride, unsigned _width,
|
||||
|
|
@ -1552,69 +1524,6 @@ src2_3src(FILE *file, const struct intel_device_info *devinfo,
|
|||
return err;
|
||||
}
|
||||
|
||||
static int
|
||||
src0_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
|
||||
const elk_inst *inst)
|
||||
{
|
||||
uint32_t reg_file = elk_inst_dpas_3src_src0_reg_file(devinfo, inst);
|
||||
|
||||
if (reg(file, reg_file, elk_inst_dpas_3src_src0_reg_nr(devinfo, inst)) == -1)
|
||||
return 0;
|
||||
|
||||
unsigned subreg_nr = elk_inst_dpas_3src_src0_subreg_nr(devinfo, inst);
|
||||
enum elk_reg_type type = elk_inst_dpas_3src_src0_type(devinfo, inst);
|
||||
|
||||
if (subreg_nr)
|
||||
format(file, ".%d", subreg_nr);
|
||||
src_align1_region(file, 1, 1, 0);
|
||||
|
||||
string(file, elk_reg_type_to_letters(type));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
src1_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
|
||||
const elk_inst *inst)
|
||||
{
|
||||
uint32_t reg_file = elk_inst_dpas_3src_src1_reg_file(devinfo, inst);
|
||||
|
||||
if (reg(file, reg_file, elk_inst_dpas_3src_src1_reg_nr(devinfo, inst)) == -1)
|
||||
return 0;
|
||||
|
||||
unsigned subreg_nr = elk_inst_dpas_3src_src1_subreg_nr(devinfo, inst);
|
||||
enum elk_reg_type type = elk_inst_dpas_3src_src1_type(devinfo, inst);
|
||||
|
||||
if (subreg_nr)
|
||||
format(file, ".%d", subreg_nr);
|
||||
src_align1_region(file, 1, 1, 0);
|
||||
|
||||
string(file, elk_reg_type_to_letters(type));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
src2_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
|
||||
const elk_inst *inst)
|
||||
{
|
||||
uint32_t reg_file = elk_inst_dpas_3src_src2_reg_file(devinfo, inst);
|
||||
|
||||
if (reg(file, reg_file, elk_inst_dpas_3src_src2_reg_nr(devinfo, inst)) == -1)
|
||||
return 0;
|
||||
|
||||
unsigned subreg_nr = elk_inst_dpas_3src_src2_subreg_nr(devinfo, inst);
|
||||
enum elk_reg_type type = elk_inst_dpas_3src_src2_type(devinfo, inst);
|
||||
|
||||
if (subreg_nr)
|
||||
format(file, ".%d", subreg_nr);
|
||||
src_align1_region(file, 1, 1, 0);
|
||||
|
||||
string(file, elk_reg_type_to_letters(type));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
imm(FILE *file, const struct elk_isa_info *isa, enum elk_reg_type type,
|
||||
const elk_inst *inst)
|
||||
|
|
@ -1885,7 +1794,7 @@ swsb(FILE *file, const struct elk_isa_info *isa, const elk_inst *inst)
|
|||
const uint32_t x = elk_inst_swsb(devinfo, inst);
|
||||
const bool is_unordered =
|
||||
opcode == ELK_OPCODE_SEND || opcode == ELK_OPCODE_SENDC ||
|
||||
opcode == ELK_OPCODE_MATH || opcode == ELK_OPCODE_DPAS ||
|
||||
opcode == ELK_OPCODE_MATH ||
|
||||
(devinfo->has_64bit_float_via_math_pipe &&
|
||||
inst_has_type(isa, inst, ELK_REGISTER_TYPE_DF));
|
||||
const struct tgl_swsb swsb = tgl_swsb_decode(devinfo, is_unordered, x);
|
||||
|
|
@ -2026,15 +1935,6 @@ elk_disassemble_inst(FILE *file, const struct elk_isa_info *isa,
|
|||
err |= control(file, "function", sync_function,
|
||||
elk_inst_cond_modifier(devinfo, inst), NULL);
|
||||
|
||||
} else if (opcode == ELK_OPCODE_DPAS) {
|
||||
string(file, ".");
|
||||
|
||||
err |= control(file, "systolic depth", dpas_systolic_depth,
|
||||
elk_inst_dpas_3src_sdepth(devinfo, inst), NULL);
|
||||
|
||||
const unsigned rcount = elk_inst_dpas_3src_rcount(devinfo, inst) + 1;
|
||||
|
||||
format(file, "x%d", rcount);
|
||||
} else if (!is_send(opcode) &&
|
||||
(devinfo->ver < 12 ||
|
||||
elk_inst_src0_reg_file(devinfo, inst) != ELK_IMMEDIATE_VALUE ||
|
||||
|
|
@ -2106,19 +2006,6 @@ elk_disassemble_inst(FILE *file, const struct elk_isa_info *isa,
|
|||
} else if (opcode == ELK_OPCODE_JMPI) {
|
||||
pad(file, 16);
|
||||
err |= src1(file, isa, inst);
|
||||
} else if (opcode == ELK_OPCODE_DPAS) {
|
||||
pad(file, 16);
|
||||
err |= dest_dpas_3src(file, devinfo, inst);
|
||||
|
||||
pad(file, 32);
|
||||
err |= src0_dpas_3src(file, devinfo, inst);
|
||||
|
||||
pad(file, 48);
|
||||
err |= src1_dpas_3src(file, devinfo, inst);
|
||||
|
||||
pad(file, 64);
|
||||
err |= src2_dpas_3src(file, devinfo, inst);
|
||||
|
||||
} else if (desc && desc->nsrc == 3) {
|
||||
pad(file, 16);
|
||||
err |= dest_3src(file, devinfo, inst);
|
||||
|
|
|
|||
|
|
@ -744,7 +744,6 @@ static const struct elk_opcode_desc opcode_descs[] = {
|
|||
{ ELK_OPCODE_DP2, 87, "dp2", 2, 1, GFX_LT(GFX11) },
|
||||
{ ELK_OPCODE_DP4A, 88, "dp4a", 3, 1, GFX_GE(GFX12) },
|
||||
{ ELK_OPCODE_LINE, 89, "line", 2, 1, GFX_LE(GFX10) },
|
||||
{ ELK_OPCODE_DPAS, 89, "dpas", 3, 1, GFX_GE(GFX125) },
|
||||
{ ELK_OPCODE_PLN, 90, "pln", 2, 1, GFX_GE(GFX45) & GFX_LE(GFX10) },
|
||||
{ ELK_OPCODE_MAD, 91, "mad", 3, 1, GFX_GE(GFX6) },
|
||||
{ ELK_OPCODE_LRP, 92, "lrp", 3, 1, GFX_GE(GFX6) & GFX_LE(GFX10) },
|
||||
|
|
|
|||
|
|
@ -1910,10 +1910,6 @@ void elk_CMPN(struct elk_codegen *p,
|
|||
struct elk_reg src0,
|
||||
struct elk_reg src1);
|
||||
|
||||
elk_inst *elk_DPAS(struct elk_codegen *p, enum elk_gfx12_systolic_depth sdepth,
|
||||
unsigned rcount, struct elk_reg dest, struct elk_reg src0,
|
||||
struct elk_reg src1, struct elk_reg src2);
|
||||
|
||||
void
|
||||
elk_untyped_atomic(struct elk_codegen *p,
|
||||
struct elk_reg dst,
|
||||
|
|
|
|||
|
|
@ -1095,25 +1095,6 @@ static const uint64_t xe2_3src_control_index_table[16] = {
|
|||
0b0000011011000011101100000000000011, /* (8|M0) arf<1>:df :df :df :df */
|
||||
};
|
||||
|
||||
static const uint64_t xe2_3src_dpas_control_index_table[16] = {
|
||||
0b0000000000111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :ub Atomic */
|
||||
0b0000000100111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :b Atomic */
|
||||
0b0000100000111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :b :ub Atomic */
|
||||
0b0000100100111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :b :b Atomic */
|
||||
0b0000000000111110011000000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :ub */
|
||||
0b0000100100111110011000000000000100, /* dpas.8x* (16|M0) grf:d :d :b :b */
|
||||
0b0000101101111010101001000000000100, /* dpas.8x* (16|M0) grf:f :f :bf :bf Atomic */
|
||||
0b0000101101111101101001000000000100, /* dpas.8x* (16|M0) grf:f :bf :bf :bf Atomic */
|
||||
0b0000101101111010110101000000000100, /* dpas.8x* (16|M0) grf:bf :f :bf :bf Atomic */
|
||||
0b0000101101111101110101000000000100, /* dpas.8x* (16|M0) grf:bf :bf :bf :bf Atomic */
|
||||
0b0000101101111010101000000000000100, /* dpas.8x* (16|M0) grf:f :f :bf :bf */
|
||||
0b0000001001111010101001000000000100, /* dpas.8x* (16|M0) grf:f :f :hf :hf Atomic */
|
||||
0b0000001001111001101001000000000100, /* dpas.8x* (16|M0) grf:f :hf :hf :hf Atomic */
|
||||
0b0000001001111010100101000000000100, /* dpas.8x* (16|M0) grf:hf :f :hf :hf Atomic */
|
||||
0b0000001001111001100101000000000100, /* dpas.8x* (16|M0) grf:hf :hf :hf :hf Atomic */
|
||||
0b0000001001111010101000000000000100, /* dpas.8x* (16|M0) grf:f :f :hf :hf */
|
||||
};
|
||||
|
||||
static const uint32_t gfx12_3src_source_index_table[32] = {
|
||||
0b100101100001100000000, /* grf<0;0> grf<8;1> grf<0> */
|
||||
0b100101100001001000010, /* arf<4;1> grf<8;1> grf<0> */
|
||||
|
|
@ -1206,28 +1187,6 @@ static const uint32_t xe2_3src_source_index_table[16] = {
|
|||
0b100100010001000000001, /* arf<1;0> -grf<1;0> grf<0> */
|
||||
};
|
||||
|
||||
static const uint32_t xe2_3src_dpas_source_index_table[16] = {
|
||||
0b100100000000100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[ub,b]
|
||||
* dpas.*x1 grf:[f,bf] grf:bf grf:bf
|
||||
* dpas.*x1 grf:[f,hf] grf:hf grf:hf
|
||||
*/
|
||||
0b100100000010100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u4,s4] */
|
||||
0b100100000100100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u2,s2] */
|
||||
0b100100001000100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[ub,b] */
|
||||
0b100100001010100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u4,s4] */
|
||||
0b100100001100100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u2,s2] */
|
||||
0b100100010000100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[ub,b] */
|
||||
0b100100010010100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u4,s4] */
|
||||
0b100100010100100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u2,s2] */
|
||||
0b100100000000100000010, /* dpas.*x2 grf:d grf:[ub,b] grf:[ub,b] */
|
||||
0b100100000010100000010, /* dpas.*x2 grf:d grf:[ub,b] grf:[u4,s4] */
|
||||
0b100100001000100000010, /* dpas.*x2 grf:d grf:[u4,s4] grf:[ub,b] */
|
||||
0b100100001010100000010, /* dpas.*x2 grf:d grf:[u4,s4] grf:[u4,s4] */
|
||||
0b100100010100100000010, /* dpas.*x2 grf:d grf:[u2,s2] grf:[u2,s2] */
|
||||
0b100100000000100001110, /* dpas.*x8 grf:d grf:[ub,b] grf:[ub,b] */
|
||||
0b100100001010100001110, /* dpas.*x8 grf:d grf:[u4,s4] grf:[u4,s4] */
|
||||
};
|
||||
|
||||
static const uint32_t gfx12_3src_subreg_table[32] = {
|
||||
0b00000000000000000000, /* .0 .0 .0 .0 */
|
||||
0b00100000000000000000, /* .0 .0 .0 .4 */
|
||||
|
|
@ -1530,13 +1489,12 @@ set_src1_index(const struct compaction_state *c, elk_compact_inst *dst,
|
|||
|
||||
static bool
|
||||
set_3src_control_index(const struct intel_device_info *devinfo,
|
||||
elk_compact_inst *dst, const elk_inst *src,
|
||||
bool is_dpas)
|
||||
elk_compact_inst *dst, const elk_inst *src)
|
||||
{
|
||||
assert(devinfo->ver >= 8);
|
||||
|
||||
if (devinfo->ver >= 20) {
|
||||
assert(is_dpas || !elk_inst_bits(src, 49, 49));
|
||||
assert(!elk_inst_bits(src, 49, 49));
|
||||
|
||||
const uint64_t uncompacted = /* 34b/Xe2+ */
|
||||
(elk_inst_bits(src, 95, 92) << 30) | /* 4b */
|
||||
|
|
@ -1556,13 +1514,8 @@ set_3src_control_index(const struct intel_device_info *devinfo,
|
|||
(elk_inst_bits(src, 23, 21) << 3) | /* 3b */
|
||||
(elk_inst_bits(src, 20, 18)); /* 3b */
|
||||
|
||||
/* The bits used to index the tables for 3src and 3src-dpas
|
||||
* are the same, so just need to pick the right one.
|
||||
*/
|
||||
const uint64_t *table = is_dpas ? xe2_3src_dpas_control_index_table :
|
||||
xe2_3src_control_index_table;
|
||||
const unsigned size = is_dpas ? ARRAY_SIZE(xe2_3src_dpas_control_index_table) :
|
||||
ARRAY_SIZE(xe2_3src_control_index_table);
|
||||
const uint64_t *table = xe2_3src_control_index_table;
|
||||
const unsigned size = ARRAY_SIZE(xe2_3src_control_index_table);
|
||||
for (unsigned i = 0; i < size; i++) {
|
||||
if (table[i] == uncompacted) {
|
||||
elk_compact_inst_set_3src_control_index(devinfo, dst, i);
|
||||
|
|
@ -1646,8 +1599,7 @@ set_3src_control_index(const struct intel_device_info *devinfo,
|
|||
|
||||
static bool
|
||||
set_3src_source_index(const struct intel_device_info *devinfo,
|
||||
elk_compact_inst *dst, const elk_inst *src,
|
||||
bool is_dpas)
|
||||
elk_compact_inst *dst, const elk_inst *src)
|
||||
{
|
||||
assert(devinfo->ver >= 8);
|
||||
|
||||
|
|
@ -1669,17 +1621,12 @@ set_3src_source_index(const struct intel_device_info *devinfo,
|
|||
(elk_inst_bits(src, 43, 43) << 1) | /* 1b */
|
||||
(elk_inst_bits(src, 35, 35)); /* 1b */
|
||||
|
||||
/* In Xe2, the bits used to index the tables for 3src and 3src-dpas
|
||||
* are the same, so just need to pick the right one.
|
||||
*/
|
||||
const uint32_t *three_src_source_index_table =
|
||||
devinfo->ver >= 20 ? (is_dpas ? xe2_3src_dpas_source_index_table :
|
||||
xe2_3src_source_index_table) :
|
||||
devinfo->ver >= 20 ? xe2_3src_source_index_table :
|
||||
devinfo->verx10 >= 125 ? xehp_3src_source_index_table :
|
||||
gfx12_3src_source_index_table;
|
||||
const uint32_t three_src_source_index_table_len =
|
||||
devinfo->ver >= 20 ? (is_dpas ? ARRAY_SIZE(xe2_3src_dpas_source_index_table) :
|
||||
ARRAY_SIZE(xe2_3src_source_index_table)) :
|
||||
devinfo->ver >= 20 ? ARRAY_SIZE(xe2_3src_source_index_table) :
|
||||
devinfo->verx10 >= 125 ? ARRAY_SIZE(xehp_3src_source_index_table) :
|
||||
ARRAY_SIZE(gfx12_3src_source_index_table);
|
||||
|
||||
|
|
@ -1785,18 +1732,18 @@ has_unmapped_bits(const struct elk_isa_info *isa, const elk_inst *src)
|
|||
|
||||
static bool
|
||||
has_3src_unmapped_bits(const struct intel_device_info *devinfo,
|
||||
const elk_inst *src, bool is_dpas)
|
||||
const elk_inst *src)
|
||||
{
|
||||
/* Check for three-source instruction bits that don't map to any of the
|
||||
* fields of the compacted instruction. All of them seem to be reserved
|
||||
* bits currently.
|
||||
*/
|
||||
if (devinfo->ver >= 20) {
|
||||
assert(is_dpas || !elk_inst_bits(src, 49, 49));
|
||||
assert(!elk_inst_bits(src, 49, 49));
|
||||
assert(!elk_inst_bits(src, 33, 33));
|
||||
assert(!elk_inst_bits(src, 7, 7));
|
||||
} else if (devinfo->ver >= 12) {
|
||||
assert(is_dpas || !elk_inst_bits(src, 49, 49));
|
||||
assert(!elk_inst_bits(src, 49, 49));
|
||||
assert(!elk_inst_bits(src, 7, 7));
|
||||
} else if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) {
|
||||
assert(!elk_inst_bits(src, 127, 127) &&
|
||||
|
|
@ -1823,8 +1770,7 @@ elk_try_compact_3src_instruction(const struct elk_isa_info *isa,
|
|||
const struct intel_device_info *devinfo = isa->devinfo;
|
||||
assert(devinfo->ver >= 8);
|
||||
|
||||
bool is_dpas = elk_inst_opcode(isa, src) == ELK_OPCODE_DPAS;
|
||||
if (has_3src_unmapped_bits(devinfo, src, is_dpas))
|
||||
if (has_3src_unmapped_bits(devinfo, src))
|
||||
return false;
|
||||
|
||||
#define compact(field) \
|
||||
|
|
@ -1834,10 +1780,10 @@ elk_try_compact_3src_instruction(const struct elk_isa_info *isa,
|
|||
|
||||
compact(hw_opcode);
|
||||
|
||||
if (!set_3src_control_index(devinfo, dst, src, is_dpas))
|
||||
if (!set_3src_control_index(devinfo, dst, src))
|
||||
return false;
|
||||
|
||||
if (!set_3src_source_index(devinfo, dst, src, is_dpas))
|
||||
if (!set_3src_source_index(devinfo, dst, src))
|
||||
return false;
|
||||
|
||||
if (devinfo->ver >= 12) {
|
||||
|
|
@ -2395,16 +2341,14 @@ set_uncompacted_src1(const struct compaction_state *c, elk_inst *dst,
|
|||
|
||||
static void
|
||||
set_uncompacted_3src_control_index(const struct compaction_state *c,
|
||||
elk_inst *dst, elk_compact_inst *src,
|
||||
bool is_dpas)
|
||||
elk_inst *dst, elk_compact_inst *src)
|
||||
{
|
||||
const struct intel_device_info *devinfo = c->isa->devinfo;
|
||||
assert(devinfo->ver >= 8);
|
||||
|
||||
if (devinfo->ver >= 20) {
|
||||
uint64_t compacted = elk_compact_inst_3src_control_index(devinfo, src);
|
||||
uint64_t uncompacted = is_dpas ? xe2_3src_dpas_control_index_table[compacted] :
|
||||
xe2_3src_control_index_table[compacted];
|
||||
uint64_t uncompacted = xe2_3src_control_index_table[compacted];
|
||||
|
||||
elk_inst_set_bits(dst, 95, 92, (uncompacted >> 30) & 0xf);
|
||||
elk_inst_set_bits(dst, 90, 88, (uncompacted >> 27) & 0x7);
|
||||
|
|
@ -2482,8 +2426,7 @@ set_uncompacted_3src_control_index(const struct compaction_state *c,
|
|||
|
||||
static void
|
||||
set_uncompacted_3src_source_index(const struct intel_device_info *devinfo,
|
||||
elk_inst *dst, elk_compact_inst *src,
|
||||
bool is_dpas)
|
||||
elk_inst *dst, elk_compact_inst *src)
|
||||
{
|
||||
assert(devinfo->ver >= 8);
|
||||
|
||||
|
|
@ -2491,8 +2434,7 @@ set_uncompacted_3src_source_index(const struct intel_device_info *devinfo,
|
|||
|
||||
if (devinfo->ver >= 12) {
|
||||
const uint32_t *three_src_source_index_table =
|
||||
devinfo->ver >= 20 ? (is_dpas ? xe2_3src_dpas_source_index_table :
|
||||
xe2_3src_source_index_table) :
|
||||
devinfo->ver >= 20 ? xe2_3src_source_index_table :
|
||||
devinfo->verx10 >= 125 ? xehp_3src_source_index_table :
|
||||
gfx12_3src_source_index_table;
|
||||
uint32_t uncompacted = three_src_source_index_table[compacted];
|
||||
|
|
@ -2550,7 +2492,7 @@ set_uncompacted_3src_subreg_index(const struct intel_device_info *devinfo,
|
|||
|
||||
static void
|
||||
elk_uncompact_3src_instruction(const struct compaction_state *c,
|
||||
elk_inst *dst, elk_compact_inst *src, bool is_dpas)
|
||||
elk_inst *dst, elk_compact_inst *src)
|
||||
{
|
||||
const struct intel_device_info *devinfo = c->isa->devinfo;
|
||||
assert(devinfo->ver >= 8);
|
||||
|
|
@ -2563,8 +2505,8 @@ elk_uncompact_3src_instruction(const struct compaction_state *c,
|
|||
uncompact(hw_opcode);
|
||||
|
||||
if (devinfo->ver >= 12) {
|
||||
set_uncompacted_3src_control_index(c, dst, src, is_dpas);
|
||||
set_uncompacted_3src_source_index(devinfo, dst, src, is_dpas);
|
||||
set_uncompacted_3src_control_index(c, dst, src);
|
||||
set_uncompacted_3src_source_index(devinfo, dst, src);
|
||||
set_uncompacted_3src_subreg_index(devinfo, dst, src);
|
||||
|
||||
uncompact(debug_control);
|
||||
|
|
@ -2574,8 +2516,8 @@ elk_uncompact_3src_instruction(const struct compaction_state *c,
|
|||
uncompact(src1_reg_nr);
|
||||
uncompact(src2_reg_nr);
|
||||
} else {
|
||||
set_uncompacted_3src_control_index(c, dst, src, is_dpas);
|
||||
set_uncompacted_3src_source_index(devinfo, dst, src, is_dpas);
|
||||
set_uncompacted_3src_control_index(c, dst, src);
|
||||
set_uncompacted_3src_source_index(devinfo, dst, src);
|
||||
|
||||
uncompact(dst_reg_nr);
|
||||
uncompact_a16(src0_rep_ctrl);
|
||||
|
|
@ -2607,8 +2549,7 @@ uncompact_instruction(const struct compaction_state *c, elk_inst *dst,
|
|||
const enum elk_opcode opcode =
|
||||
elk_opcode_decode(c->isa, elk_compact_inst_3src_hw_opcode(devinfo, src));
|
||||
if (elk_is_3src(c->isa, opcode)) {
|
||||
const bool is_dpas = opcode == ELK_OPCODE_DPAS;
|
||||
elk_uncompact_3src_instruction(c, dst, src, is_dpas);
|
||||
elk_uncompact_3src_instruction(c, dst, src);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -975,61 +975,6 @@ elk_alu3(struct elk_codegen *p, unsigned opcode, struct elk_reg dest,
|
|||
return inst;
|
||||
}
|
||||
|
||||
static elk_inst *
|
||||
elk_dpas_three_src(struct elk_codegen *p, enum elk_gfx12_systolic_depth opcode,
|
||||
unsigned sdepth, unsigned rcount, struct elk_reg dest,
|
||||
struct elk_reg src0, struct elk_reg src1, struct elk_reg src2)
|
||||
{
|
||||
const struct intel_device_info *devinfo = p->devinfo;
|
||||
elk_inst *inst = next_insn(p, opcode);
|
||||
|
||||
assert(dest.file == ELK_GENERAL_REGISTER_FILE);
|
||||
elk_inst_set_dpas_3src_dst_reg_file(devinfo, inst,
|
||||
ELK_GENERAL_REGISTER_FILE);
|
||||
elk_inst_set_dpas_3src_dst_reg_nr(devinfo, inst, dest.nr);
|
||||
elk_inst_set_dpas_3src_dst_subreg_nr(devinfo, inst, dest.subnr);
|
||||
|
||||
if (elk_reg_type_is_floating_point(dest.type)) {
|
||||
elk_inst_set_dpas_3src_exec_type(devinfo, inst,
|
||||
ELK_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
|
||||
} else {
|
||||
elk_inst_set_dpas_3src_exec_type(devinfo, inst,
|
||||
ELK_ALIGN1_3SRC_EXEC_TYPE_INT);
|
||||
}
|
||||
|
||||
elk_inst_set_dpas_3src_sdepth(devinfo, inst, sdepth);
|
||||
elk_inst_set_dpas_3src_rcount(devinfo, inst, rcount - 1);
|
||||
|
||||
elk_inst_set_dpas_3src_dst_type(devinfo, inst, dest.type);
|
||||
elk_inst_set_dpas_3src_src0_type(devinfo, inst, src0.type);
|
||||
elk_inst_set_dpas_3src_src1_type(devinfo, inst, src1.type);
|
||||
elk_inst_set_dpas_3src_src2_type(devinfo, inst, src2.type);
|
||||
|
||||
assert(src0.file == ELK_GENERAL_REGISTER_FILE ||
|
||||
(src0.file == ELK_ARCHITECTURE_REGISTER_FILE &&
|
||||
src0.nr == ELK_ARF_NULL));
|
||||
|
||||
elk_inst_set_dpas_3src_src0_reg_file(devinfo, inst, src0.file);
|
||||
elk_inst_set_dpas_3src_src0_reg_nr(devinfo, inst, src0.nr);
|
||||
elk_inst_set_dpas_3src_src0_subreg_nr(devinfo, inst, src0.subnr);
|
||||
|
||||
assert(src1.file == ELK_GENERAL_REGISTER_FILE);
|
||||
|
||||
elk_inst_set_dpas_3src_src1_reg_file(devinfo, inst, src1.file);
|
||||
elk_inst_set_dpas_3src_src1_reg_nr(devinfo, inst, src1.nr);
|
||||
elk_inst_set_dpas_3src_src1_subreg_nr(devinfo, inst, src1.subnr);
|
||||
elk_inst_set_dpas_3src_src1_subbyte(devinfo, inst, ELK_SUB_BYTE_PRECISION_NONE);
|
||||
|
||||
assert(src2.file == ELK_GENERAL_REGISTER_FILE);
|
||||
|
||||
elk_inst_set_dpas_3src_src2_reg_file(devinfo, inst, src2.file);
|
||||
elk_inst_set_dpas_3src_src2_reg_nr(devinfo, inst, src2.nr);
|
||||
elk_inst_set_dpas_3src_src2_subreg_nr(devinfo, inst, src2.subnr);
|
||||
elk_inst_set_dpas_3src_src2_subbyte(devinfo, inst, ELK_SUB_BYTE_PRECISION_NONE);
|
||||
|
||||
return inst;
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
* Convenience routines.
|
||||
*/
|
||||
|
|
@ -1261,15 +1206,6 @@ elk_PLN(struct elk_codegen *p, struct elk_reg dest,
|
|||
return elk_alu2(p, ELK_OPCODE_PLN, dest, src0, src1);
|
||||
}
|
||||
|
||||
elk_inst *
|
||||
elk_DPAS(struct elk_codegen *p, enum elk_gfx12_systolic_depth sdepth,
|
||||
unsigned rcount, struct elk_reg dest, struct elk_reg src0,
|
||||
struct elk_reg src1, struct elk_reg src2)
|
||||
{
|
||||
return elk_dpas_three_src(p, ELK_OPCODE_DPAS, sdepth, rcount, dest, src0,
|
||||
src1, src2);
|
||||
}
|
||||
|
||||
elk_inst *
|
||||
elk_F32TO16(struct elk_codegen *p, struct elk_reg dst, struct elk_reg src)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -88,7 +88,6 @@ enum elk_opcode {
|
|||
ELK_OPCODE_DP2,
|
||||
ELK_OPCODE_DP4A, /**< Gfx12+ */
|
||||
ELK_OPCODE_LINE,
|
||||
ELK_OPCODE_DPAS, /**< Gfx12.5+ */
|
||||
ELK_OPCODE_PLN, /**< G45+ */
|
||||
ELK_OPCODE_MAD, /**< Gfx6+ */
|
||||
ELK_OPCODE_LRP, /**< Gfx6+ */
|
||||
|
|
|
|||
|
|
@ -639,10 +639,7 @@ general_restrictions_based_on_operand_types(const struct elk_isa_info *isa,
|
|||
return error_msg;
|
||||
|
||||
if (devinfo->ver >= 11) {
|
||||
/* A register type of B or UB for DPAS actually means 4 bytes packed into
|
||||
* a D or UD, so it is allowed.
|
||||
*/
|
||||
if (num_sources == 3 && elk_inst_opcode(isa, inst) != ELK_OPCODE_DPAS) {
|
||||
if (num_sources == 3) {
|
||||
ERROR_IF(elk_reg_type_to_size(elk_inst_3src_a1_src1_type(devinfo, inst)) == 1 ||
|
||||
elk_reg_type_to_size(elk_inst_3src_a1_src2_type(devinfo, inst)) == 1,
|
||||
"Byte data type is not supported for src1/2 register regioning. This includes "
|
||||
|
|
@ -2416,153 +2413,6 @@ instruction_restrictions(const struct elk_isa_info *isa,
|
|||
}
|
||||
}
|
||||
|
||||
if (elk_inst_opcode(isa, inst) == ELK_OPCODE_DPAS) {
|
||||
ERROR_IF(elk_inst_dpas_3src_sdepth(devinfo, inst) != ELK_SYSTOLIC_DEPTH_8,
|
||||
"Systolic depth must be 8.");
|
||||
|
||||
const unsigned sdepth = 8;
|
||||
|
||||
const enum elk_reg_type dst_type =
|
||||
elk_inst_dpas_3src_dst_type(devinfo, inst);
|
||||
const enum elk_reg_type src0_type =
|
||||
elk_inst_dpas_3src_src0_type(devinfo, inst);
|
||||
const enum elk_reg_type src1_type =
|
||||
elk_inst_dpas_3src_src1_type(devinfo, inst);
|
||||
const enum elk_reg_type src2_type =
|
||||
elk_inst_dpas_3src_src2_type(devinfo, inst);
|
||||
|
||||
const enum gfx12_sub_byte_precision src1_sub_byte =
|
||||
elk_inst_dpas_3src_src1_subbyte(devinfo, inst);
|
||||
|
||||
if (src1_type != ELK_REGISTER_TYPE_B && src1_type != ELK_REGISTER_TYPE_UB) {
|
||||
ERROR_IF(src1_sub_byte != ELK_SUB_BYTE_PRECISION_NONE,
|
||||
"Sub-byte precision must be None for source type larger than Byte.");
|
||||
} else {
|
||||
ERROR_IF(src1_sub_byte != ELK_SUB_BYTE_PRECISION_NONE &&
|
||||
src1_sub_byte != ELK_SUB_BYTE_PRECISION_4BIT &&
|
||||
src1_sub_byte != ELK_SUB_BYTE_PRECISION_2BIT,
|
||||
"Invalid sub-byte precision.");
|
||||
}
|
||||
|
||||
const enum gfx12_sub_byte_precision src2_sub_byte =
|
||||
elk_inst_dpas_3src_src2_subbyte(devinfo, inst);
|
||||
|
||||
if (src2_type != ELK_REGISTER_TYPE_B && src2_type != ELK_REGISTER_TYPE_UB) {
|
||||
ERROR_IF(src2_sub_byte != ELK_SUB_BYTE_PRECISION_NONE,
|
||||
"Sub-byte precision must be None.");
|
||||
} else {
|
||||
ERROR_IF(src2_sub_byte != ELK_SUB_BYTE_PRECISION_NONE &&
|
||||
src2_sub_byte != ELK_SUB_BYTE_PRECISION_4BIT &&
|
||||
src2_sub_byte != ELK_SUB_BYTE_PRECISION_2BIT,
|
||||
"Invalid sub-byte precision.");
|
||||
}
|
||||
|
||||
const unsigned src1_bits_per_element =
|
||||
(8 * elk_reg_type_to_size(src1_type)) >>
|
||||
elk_inst_dpas_3src_src1_subbyte(devinfo, inst);
|
||||
|
||||
const unsigned src2_bits_per_element =
|
||||
(8 * elk_reg_type_to_size(src2_type)) >>
|
||||
elk_inst_dpas_3src_src2_subbyte(devinfo, inst);
|
||||
|
||||
/* The MAX2(1, ...) is just to prevent possible division by 0 later. */
|
||||
const unsigned ops_per_chan =
|
||||
MAX2(1, 32 / MAX2(src1_bits_per_element, src2_bits_per_element));
|
||||
|
||||
ERROR_IF(elk_inst_exec_size(devinfo, inst) != ELK_EXECUTE_8,
|
||||
"DPAS execution size must be 8.");
|
||||
|
||||
const unsigned exec_size = 8;
|
||||
|
||||
const unsigned dst_subnr = elk_inst_dpas_3src_dst_subreg_nr(devinfo, inst);
|
||||
const unsigned src0_subnr = elk_inst_dpas_3src_src0_subreg_nr(devinfo, inst);
|
||||
const unsigned src1_subnr = elk_inst_dpas_3src_src1_subreg_nr(devinfo, inst);
|
||||
const unsigned src2_subnr = elk_inst_dpas_3src_src2_subreg_nr(devinfo, inst);
|
||||
|
||||
/* Until HF is supported as dst type, this is effectively subnr == 0. */
|
||||
ERROR_IF(dst_subnr % exec_size != 0,
|
||||
"Destination subregister offset must be a multiple of ExecSize.");
|
||||
|
||||
/* Until HF is supported as src0 type, this is effectively subnr == 0. */
|
||||
ERROR_IF(src0_subnr % exec_size != 0,
|
||||
"Src0 subregister offset must be a multiple of ExecSize.");
|
||||
|
||||
ERROR_IF(src1_subnr != 0,
|
||||
"Src1 subregister offsets must be 0.");
|
||||
|
||||
/* In nearly all cases, this effectively requires that src2.subnr be
|
||||
* 0. It is only when src1 is 8 bits and src2 is 2 or 4 bits that the
|
||||
* ops_per_chan value can allow non-zero src2.subnr.
|
||||
*/
|
||||
ERROR_IF(src2_subnr % (sdepth * ops_per_chan) != 0,
|
||||
"Src2 subregister offset must be a multiple of SystolicDepth "
|
||||
"times OPS_PER_CHAN.");
|
||||
|
||||
ERROR_IF(dst_subnr * type_sz(dst_type) >= REG_SIZE,
|
||||
"Destination subregister specifies next register.");
|
||||
|
||||
ERROR_IF(src0_subnr * type_sz(src0_type) >= REG_SIZE,
|
||||
"Src0 subregister specifies next register.");
|
||||
|
||||
ERROR_IF((src1_subnr * type_sz(src1_type) * src1_bits_per_element) / 8 >= REG_SIZE,
|
||||
"Src1 subregister specifies next register.");
|
||||
|
||||
ERROR_IF((src2_subnr * type_sz(src2_type) * src2_bits_per_element) / 8 >= REG_SIZE,
|
||||
"Src2 subregister specifies next register.");
|
||||
|
||||
if (elk_inst_3src_atomic_control(devinfo, inst)) {
|
||||
/* FINISHME: When we start emitting DPAS with Atomic set, figure out
|
||||
* a way to validate it. Also add a test in test_eu_validate.cpp.
|
||||
*/
|
||||
ERROR_IF(true,
|
||||
"When instruction option Atomic is used it must be follwed by a "
|
||||
"DPAS instruction.");
|
||||
}
|
||||
|
||||
if (elk_inst_dpas_3src_exec_type(devinfo, inst) ==
|
||||
ELK_ALIGN1_3SRC_EXEC_TYPE_FLOAT) {
|
||||
ERROR_IF(dst_type != ELK_REGISTER_TYPE_F,
|
||||
"DPAS destination type must be F.");
|
||||
ERROR_IF(src0_type != ELK_REGISTER_TYPE_F,
|
||||
"DPAS src0 type must be F.");
|
||||
ERROR_IF(src1_type != ELK_REGISTER_TYPE_HF,
|
||||
"DPAS src1 type must be HF.");
|
||||
ERROR_IF(src2_type != ELK_REGISTER_TYPE_HF,
|
||||
"DPAS src2 type must be HF.");
|
||||
} else {
|
||||
ERROR_IF(dst_type != ELK_REGISTER_TYPE_D &&
|
||||
dst_type != ELK_REGISTER_TYPE_UD,
|
||||
"DPAS destination type must be D or UD.");
|
||||
ERROR_IF(src0_type != ELK_REGISTER_TYPE_D &&
|
||||
src0_type != ELK_REGISTER_TYPE_UD,
|
||||
"DPAS src0 type must be D or UD.");
|
||||
ERROR_IF(src1_type != ELK_REGISTER_TYPE_B &&
|
||||
src1_type != ELK_REGISTER_TYPE_UB,
|
||||
"DPAS src1 base type must be B or UB.");
|
||||
ERROR_IF(src2_type != ELK_REGISTER_TYPE_B &&
|
||||
src2_type != ELK_REGISTER_TYPE_UB,
|
||||
"DPAS src2 base type must be B or UB.");
|
||||
|
||||
if (elk_reg_type_is_unsigned_integer(dst_type)) {
|
||||
ERROR_IF(!elk_reg_type_is_unsigned_integer(src0_type) ||
|
||||
!elk_reg_type_is_unsigned_integer(src1_type) ||
|
||||
!elk_reg_type_is_unsigned_integer(src2_type),
|
||||
"If any source datatype is signed, destination datatype "
|
||||
"must be signed.");
|
||||
}
|
||||
}
|
||||
|
||||
/* FINISHME: Additional restrictions mentioned in the Bspec that are not
|
||||
* yet enforced here:
|
||||
*
|
||||
* - General Accumulator registers access is not supported. This is
|
||||
* currently enforced in elk_dpas_three_src (elk_eu_emit.c).
|
||||
*
|
||||
* - Given any combination of datatypes in the sources of a DPAS
|
||||
* instruction, the boundaries of a register should not be crossed.
|
||||
*/
|
||||
}
|
||||
|
||||
return error_msg;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -391,21 +391,6 @@ elk_fs_inst::has_source_and_destination_hazard() const
|
|||
default:
|
||||
return !is_uniform(src[0]);
|
||||
}
|
||||
case ELK_OPCODE_DPAS:
|
||||
/* This is overly conservative. The actual hazard is more complicated to
|
||||
* describe. When the repeat count is N, the single instruction behaves
|
||||
* like N instructions with a repeat count of one, but the destination
|
||||
* and source registers are incremented (in somewhat complex ways) for
|
||||
* each instruction.
|
||||
*
|
||||
* This means the source and destination register is actually a range of
|
||||
* registers. The hazard exists if an earlier iteration would write a
|
||||
* register that should be read by a later iteration.
|
||||
*
|
||||
* There may be some advantage to properly modeling this, but for now,
|
||||
* be overly conservative.
|
||||
*/
|
||||
return rcount > 1;
|
||||
default:
|
||||
/* The SIMD16 compressed instruction
|
||||
*
|
||||
|
|
@ -855,9 +840,6 @@ elk_fs_inst::components_read(unsigned i) const
|
|||
else
|
||||
return 1;
|
||||
|
||||
case ELK_OPCODE_DPAS:
|
||||
unreachable("Do not use components_read() for DPAS.");
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -918,26 +900,6 @@ elk_fs_inst::size_read(int arg) const
|
|||
}
|
||||
break;
|
||||
|
||||
case ELK_OPCODE_DPAS:
|
||||
switch (arg) {
|
||||
case 0:
|
||||
if (src[0].type == ELK_REGISTER_TYPE_HF) {
|
||||
return rcount * REG_SIZE / 2;
|
||||
} else {
|
||||
return rcount * REG_SIZE;
|
||||
}
|
||||
case 1:
|
||||
return sdepth * REG_SIZE;
|
||||
case 2:
|
||||
/* This is simpler than the formula described in the Bspec, but it
|
||||
* covers all of the cases that we support on DG2.
|
||||
*/
|
||||
return rcount * REG_SIZE;
|
||||
default:
|
||||
unreachable("Invalid source number.");
|
||||
}
|
||||
break;
|
||||
|
||||
case ELK_SHADER_OPCODE_TEX:
|
||||
case ELK_FS_OPCODE_TXB:
|
||||
case ELK_SHADER_OPCODE_TXD:
|
||||
|
|
|
|||
|
|
@ -592,8 +592,6 @@ void elk_emit_predicate_on_sample_mask(const elk::fs_builder &bld, elk_fs_inst *
|
|||
int elk_get_subgroup_id_param_index(const intel_device_info *devinfo,
|
||||
const elk_stage_prog_data *prog_data);
|
||||
|
||||
bool elk_lower_dpas(elk_fs_visitor &v);
|
||||
|
||||
void nir_to_elk(elk_fs_visitor *s);
|
||||
|
||||
#endif /* ELK_FS_H */
|
||||
|
|
|
|||
|
|
@ -834,27 +834,6 @@ namespace elk {
|
|||
return inst;
|
||||
}
|
||||
|
||||
instruction *
|
||||
DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2,
|
||||
unsigned sdepth, unsigned rcount) const
|
||||
{
|
||||
assert(_dispatch_width == 8);
|
||||
assert(sdepth == 8);
|
||||
assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);
|
||||
|
||||
instruction *inst = emit(ELK_OPCODE_DPAS, dst, src0, src1, src2);
|
||||
inst->sdepth = sdepth;
|
||||
inst->rcount = rcount;
|
||||
|
||||
if (dst.type == ELK_REGISTER_TYPE_HF) {
|
||||
inst->size_written = rcount * REG_SIZE / 2;
|
||||
} else {
|
||||
inst->size_written = rcount * REG_SIZE;
|
||||
}
|
||||
|
||||
return inst;
|
||||
}
|
||||
|
||||
elk_fs_visitor *shader;
|
||||
|
||||
elk_fs_inst *BREAK() { return emit(ELK_OPCODE_BREAK); }
|
||||
|
|
|
|||
|
|
@ -1599,19 +1599,6 @@ elk_fs_generator::enable_debug(const char *shader_name)
|
|||
this->shader_name = shader_name;
|
||||
}
|
||||
|
||||
static elk_gfx12_systolic_depth
|
||||
translate_systolic_depth(unsigned d)
|
||||
{
|
||||
/* Could also return (ffs(d) - 1) & 3. */
|
||||
switch (d) {
|
||||
case 2: return ELK_SYSTOLIC_DEPTH_2;
|
||||
case 4: return ELK_SYSTOLIC_DEPTH_4;
|
||||
case 8: return ELK_SYSTOLIC_DEPTH_8;
|
||||
case 16: return ELK_SYSTOLIC_DEPTH_16;
|
||||
default: unreachable("Invalid systolic depth.");
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
elk_fs_generator::generate_code(const elk_cfg_t *cfg, int dispatch_width,
|
||||
struct shader_stats shader_stats,
|
||||
|
|
@ -1820,12 +1807,6 @@ elk_fs_generator::generate_code(const elk_cfg_t *cfg, int dispatch_width,
|
|||
elk_LINE(p, dst, src[0], src[1]);
|
||||
break;
|
||||
|
||||
case ELK_OPCODE_DPAS:
|
||||
assert(devinfo->verx10 >= 125);
|
||||
elk_DPAS(p, translate_systolic_depth(inst->sdepth), inst->rcount,
|
||||
dst, src[0], src[1], src[2]);
|
||||
break;
|
||||
|
||||
case ELK_OPCODE_MAD:
|
||||
assert(devinfo->ver >= 6);
|
||||
if (devinfo->ver < 10)
|
||||
|
|
|
|||
|
|
@ -253,8 +253,7 @@ namespace {
|
|||
has_invalid_src_region(const intel_device_info *devinfo, const elk_fs_inst *inst,
|
||||
unsigned i)
|
||||
{
|
||||
if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
|
||||
inst->opcode == ELK_OPCODE_DPAS) {
|
||||
if (is_send(inst) || inst->is_math() || inst->is_control_source(i)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4547,66 +4547,6 @@ fs_nir_emit_cs_intrinsic(nir_to_elk_state &ntb,
|
|||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_dpas_intel: {
|
||||
const unsigned sdepth = nir_intrinsic_systolic_depth(instr);
|
||||
const unsigned rcount = nir_intrinsic_repeat_count(instr);
|
||||
|
||||
const elk_reg_type dest_type =
|
||||
elk_type_for_nir_type(devinfo, nir_intrinsic_dest_type(instr));
|
||||
const elk_reg_type src_type =
|
||||
elk_type_for_nir_type(devinfo, nir_intrinsic_src_type(instr));
|
||||
|
||||
dest = retype(dest, dest_type);
|
||||
elk_fs_reg src2 = retype(get_nir_src(ntb, instr->src[2]), dest_type);
|
||||
const elk_fs_reg dest_hf = dest;
|
||||
|
||||
fs_builder bld8 = bld.exec_all().group(8, 0);
|
||||
fs_builder bld16 = bld.exec_all().group(16, 0);
|
||||
|
||||
/* DG2 cannot have the destination or source 0 of DPAS be float16. It is
|
||||
* still advantageous to support these formats for memory and bandwidth
|
||||
* savings.
|
||||
*
|
||||
* The float16 source must be expanded to float32.
|
||||
*/
|
||||
if (devinfo->verx10 == 125 && dest_type == ELK_REGISTER_TYPE_HF &&
|
||||
!s.compiler->lower_dpas) {
|
||||
dest = bld8.vgrf(ELK_REGISTER_TYPE_F, rcount);
|
||||
|
||||
if (src2.file != ARF) {
|
||||
const elk_fs_reg src2_hf = src2;
|
||||
|
||||
src2 = bld8.vgrf(ELK_REGISTER_TYPE_F, rcount);
|
||||
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
bld16.MOV(byte_offset(src2, REG_SIZE * i * 2),
|
||||
byte_offset(src2_hf, REG_SIZE * i));
|
||||
}
|
||||
} else {
|
||||
src2 = retype(src2, ELK_REGISTER_TYPE_F);
|
||||
}
|
||||
}
|
||||
|
||||
bld8.DPAS(dest,
|
||||
src2,
|
||||
retype(get_nir_src(ntb, instr->src[1]), src_type),
|
||||
retype(get_nir_src(ntb, instr->src[0]), src_type),
|
||||
sdepth,
|
||||
rcount)
|
||||
->saturate = nir_intrinsic_saturate(instr);
|
||||
|
||||
/* Compact the destination to float16 (from float32). */
|
||||
if (!dest.equals(dest_hf)) {
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
bld16.MOV(byte_offset(dest_hf, REG_SIZE * i),
|
||||
byte_offset(dest, REG_SIZE * i * 2));
|
||||
}
|
||||
}
|
||||
|
||||
cs_prog_data->uses_systolic = true;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
fs_nir_emit_intrinsic(ntb, bld, instr);
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -400,7 +400,7 @@ add_label(struct elk_codegen *p, const char* label_name, enum instr_label_type t
|
|||
%token <integer> ADD ADD3 ADDC AND ASR AVG
|
||||
%token <integer> BFE BFI1 BFI2 BFB BFREV BRC BRD BREAK
|
||||
%token <integer> CALL CALLA CASE CBIT CMP CMPN CONT CSEL
|
||||
%token <integer> DIM DO DPAS DPASW DP2 DP3 DP4 DP4A DPH
|
||||
%token <integer> DIM DO DP2 DP3 DP4 DP4A DPH
|
||||
%token <integer> ELSE ENDIF F16TO32 F32TO16 FBH FBL FORK FRC
|
||||
%token <integer> GOTO
|
||||
%token <integer> HALT
|
||||
|
|
|
|||
|
|
@ -616,67 +616,6 @@ elk_inst_set_3src_a1_src2_imm(ASSERTED const struct intel_device_info *devinfo,
|
|||
}
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* Three-source systolic instructions:
|
||||
* @{
|
||||
*/
|
||||
F(dpas_3src_src2_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 127, 120)
|
||||
F(dpas_3src_src2_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 119, 115)
|
||||
F(dpas_3src_src2_reg_file, /* 4+ */ -1, -1, /* 12+ */ 114, 114)
|
||||
F(dpas_3src_src1_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 111, 104)
|
||||
F(dpas_3src_src1_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 103, 99)
|
||||
F(dpas_3src_src1_reg_file, /* 4+ */ -1, -1, /* 12+ */ 98, 98)
|
||||
F(dpas_3src_src1_hw_type, /* 4+ */ -1, -1, /* 12+ */ 90, 88)
|
||||
F(dpas_3src_src1_subbyte, /* 4+ */ -1, -1, /* 12+ */ 87, 86)
|
||||
F(dpas_3src_src2_subbyte, /* 4+ */ -1, -1, /* 12+ */ 85, 84)
|
||||
F(dpas_3src_src2_hw_type, /* 4+ */ -1, -1, /* 12+ */ 82, 80)
|
||||
F(dpas_3src_src0_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 79, 72)
|
||||
F(dpas_3src_src0_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 71, 67)
|
||||
F(dpas_3src_src0_reg_file, /* 4+ */ -1, -1, /* 12+ */ 66, 66)
|
||||
F(dpas_3src_dst_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 63, 56)
|
||||
F(dpas_3src_dst_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 55, 51)
|
||||
F(dpas_3src_dst_reg_file, /* 4+ */ -1, -1, /* 12+ */ 50, 50)
|
||||
F(dpas_3src_sdepth, /* 4+ */ -1, -1, /* 12+ */ 49, 48)
|
||||
F(dpas_3src_rcount, /* 4+ */ -1, -1, /* 12+ */ 45, 43)
|
||||
F(dpas_3src_src0_hw_type, /* 4+ */ -1, -1, /* 12+ */ 42, 40)
|
||||
F(dpas_3src_exec_type, /* 4+ */ -1, -1, /* 12+ */ 39, 39)
|
||||
F(dpas_3src_dst_hw_type, /* 4+ */ -1, -1, /* 12+ */ 38, 36)
|
||||
/** @} */
|
||||
|
||||
#define REG_TYPE(reg) \
|
||||
static inline void \
|
||||
elk_inst_set_dpas_3src_##reg##_type(const struct intel_device_info *devinfo, \
|
||||
elk_inst *inst, enum elk_reg_type type) \
|
||||
{ \
|
||||
UNUSED enum gfx10_align1_3src_exec_type exec_type = \
|
||||
(enum gfx10_align1_3src_exec_type) elk_inst_dpas_3src_exec_type(devinfo,\
|
||||
inst); \
|
||||
if (elk_reg_type_is_floating_point(type)) { \
|
||||
assert(exec_type == ELK_ALIGN1_3SRC_EXEC_TYPE_FLOAT); \
|
||||
} else { \
|
||||
assert(exec_type == ELK_ALIGN1_3SRC_EXEC_TYPE_INT); \
|
||||
} \
|
||||
unsigned hw_type = elk_reg_type_to_a1_hw_3src_type(devinfo, type); \
|
||||
elk_inst_set_dpas_3src_##reg##_hw_type(devinfo, inst, hw_type); \
|
||||
} \
|
||||
\
|
||||
static inline enum elk_reg_type \
|
||||
elk_inst_dpas_3src_##reg##_type(const struct intel_device_info *devinfo, \
|
||||
const elk_inst *inst) \
|
||||
{ \
|
||||
enum gfx10_align1_3src_exec_type exec_type = \
|
||||
(enum gfx10_align1_3src_exec_type) elk_inst_dpas_3src_exec_type(devinfo,\
|
||||
inst); \
|
||||
unsigned hw_type = elk_inst_dpas_3src_##reg##_hw_type(devinfo, inst); \
|
||||
return elk_a1_hw_3src_type_to_reg_type(devinfo, hw_type, exec_type); \
|
||||
}
|
||||
|
||||
REG_TYPE(dst)
|
||||
REG_TYPE(src0)
|
||||
REG_TYPE(src1)
|
||||
REG_TYPE(src2)
|
||||
#undef REG_TYPE
|
||||
|
||||
/**
|
||||
* Flow control instruction bits:
|
||||
* @{
|
||||
|
|
|
|||
|
|
@ -199,16 +199,6 @@ struct elk_backend_instruction {
|
|||
*/
|
||||
unsigned flag_subreg:3;
|
||||
|
||||
/**
|
||||
* Systolic depth used by DPAS instruction.
|
||||
*/
|
||||
unsigned sdepth:4;
|
||||
|
||||
/**
|
||||
* Repeat count used by DPAS instruction.
|
||||
*/
|
||||
unsigned rcount:4;
|
||||
|
||||
/** The number of hardware registers used for a message header. */
|
||||
uint8_t header_size;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -576,7 +576,6 @@ static inline bool
|
|||
is_unordered(const intel_device_info *devinfo, const elk_fs_inst *inst)
|
||||
{
|
||||
return is_send(inst) || (devinfo->ver < 20 && inst->is_math()) ||
|
||||
inst->opcode == ELK_OPCODE_DPAS ||
|
||||
(devinfo->has_64bit_float_via_math_pipe &&
|
||||
(get_exec_type(inst) == ELK_REGISTER_TYPE_DF ||
|
||||
inst->dst.type == ELK_REGISTER_TYPE_DF));
|
||||
|
|
|
|||
|
|
@ -148,8 +148,6 @@ namespace {
|
|||
!elk_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
|
||||
type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
|
||||
tx = elk_int_type(8, tx == ELK_REGISTER_TYPE_D);
|
||||
|
||||
rcount = inst->opcode == ELK_OPCODE_DPAS ? inst->rcount : 0;
|
||||
}
|
||||
|
||||
instruction_info(const struct elk_isa_info *isa,
|
||||
|
|
@ -157,7 +155,7 @@ namespace {
|
|||
isa(isa), devinfo(isa->devinfo), op(inst->opcode),
|
||||
td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
|
||||
tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
|
||||
desc(inst->desc), sfid(inst->sfid), rcount(0)
|
||||
desc(inst->desc), sfid(inst->sfid)
|
||||
{
|
||||
/* Compute the maximum source size. */
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
|
||||
|
|
@ -197,8 +195,6 @@ namespace {
|
|||
uint32_t desc;
|
||||
/** Send message shared function ID. */
|
||||
uint8_t sfid;
|
||||
/** Repeat count for DPAS instructions. */
|
||||
uint8_t rcount;
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@ -509,32 +505,6 @@ namespace {
|
|||
else
|
||||
abort();
|
||||
|
||||
case ELK_OPCODE_DPAS: {
|
||||
unsigned ld;
|
||||
|
||||
switch (info.rcount) {
|
||||
case 1:
|
||||
ld = 21;
|
||||
break;
|
||||
case 2:
|
||||
ld = 22;
|
||||
break;
|
||||
case 8:
|
||||
default:
|
||||
ld = 32;
|
||||
break;
|
||||
}
|
||||
|
||||
/* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
|
||||
* for la and lf.
|
||||
*/
|
||||
if (devinfo->verx10 >= 125)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
||||
0, ld, UINT_MAX, UINT_MAX, 0, 0);
|
||||
else
|
||||
abort();
|
||||
}
|
||||
|
||||
case ELK_SHADER_OPCODE_RCP:
|
||||
case ELK_SHADER_OPCODE_RSQ:
|
||||
case ELK_SHADER_OPCODE_SQRT:
|
||||
|
|
|
|||
|
|
@ -617,21 +617,6 @@ elk_schedule_node::set_latency_gfx7(const struct elk_isa_info *isa)
|
|||
}
|
||||
break;
|
||||
|
||||
case ELK_OPCODE_DPAS:
|
||||
switch (inst->rcount) {
|
||||
case 1:
|
||||
latency = 21;
|
||||
break;
|
||||
case 2:
|
||||
latency = 22;
|
||||
break;
|
||||
case 8:
|
||||
default:
|
||||
latency = 32;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
/* 2 cycles:
|
||||
* mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
|
||||
|
|
|
|||
|
|
@ -165,13 +165,6 @@ elk_instruction_name(const struct elk_isa_info *isa, enum elk_opcode op)
|
|||
if (devinfo->ver > 7 && op == ELK_OPCODE_F16TO32)
|
||||
return "f16to32";
|
||||
|
||||
/* DPAS instructions may transiently exist on platforms that do not
|
||||
* support DPAS. They will eventually be lowered, but in the meantime it
|
||||
* must be possible to query the instruction name.
|
||||
*/
|
||||
if (devinfo->verx10 < 125 && op == ELK_OPCODE_DPAS)
|
||||
return "dpas";
|
||||
|
||||
assert(elk_opcode_desc(isa, op)->name);
|
||||
return elk_opcode_desc(isa, op)->name;
|
||||
case ELK_FS_OPCODE_FB_WRITE:
|
||||
|
|
@ -944,7 +937,6 @@ elk_backend_instruction::can_do_source_mods() const
|
|||
case ELK_OPCODE_ROR:
|
||||
case ELK_OPCODE_SUBB:
|
||||
case ELK_OPCODE_DP4A:
|
||||
case ELK_OPCODE_DPAS:
|
||||
case ELK_SHADER_OPCODE_BROADCAST:
|
||||
case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
|
||||
case ELK_SHADER_OPCODE_MOV_INDIRECT:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue