diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 5f0753c75de..ecd4b50cfd7 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4348,35 +4348,10 @@ fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb, dest = retype(dest, dest_type); fs_reg src0 = retype(get_nir_src(ntb, instr->src[0]), dest_type); - const fs_reg dest_hf = dest; fs_builder bld16 = bld.exec_all().group(16, 0); fs_builder bldn = devinfo->ver >= 20 ? bld16 : bld.exec_all().group(8, 0); - /* DG2 cannot have the destination or source 0 of DPAS be float16. It is - * still advantageous to support these formats for memory and bandwidth - * savings. - * - * The float16 source must be expanded to float32. - */ - if (devinfo->verx10 == 125 && dest_type == BRW_TYPE_HF && - !s.compiler->lower_dpas) { - dest = bldn.vgrf(BRW_TYPE_F, rcount); - - if (src0.file != ARF) { - const fs_reg src0_hf = src0; - - src0 = bldn.vgrf(BRW_TYPE_F, rcount); - - for (unsigned i = 0; i < 4; i++) { - bld16.MOV(byte_offset(src0, REG_SIZE * i * 2), - byte_offset(src0_hf, REG_SIZE * i)); - } - } else { - src0 = retype(src0, BRW_TYPE_F); - } - } - bldn.DPAS(dest, src0, retype(get_nir_src(ntb, instr->src[2]), src_type), @@ -4385,14 +4360,6 @@ fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb, rcount) ->saturate = nir_intrinsic_saturate(instr); - /* Compact the destination to float16 (from float32). */ - if (!dest.equals(dest_hf)) { - for (unsigned i = 0; i < 4; i++) { - bld16.MOV(byte_offset(dest_hf, REG_SIZE * i), - byte_offset(dest, REG_SIZE * i * 2)); - } - } - cs_prog_data->uses_systolic = true; break; } diff --git a/src/intel/compiler/brw_nir_lower_cooperative_matrix.c b/src/intel/compiler/brw_nir_lower_cooperative_matrix.c index 895a1ae2216..e71c76f87ad 100644 --- a/src/intel/compiler/brw_nir_lower_cooperative_matrix.c +++ b/src/intel/compiler/brw_nir_lower_cooperative_matrix.c @@ -148,21 +148,14 @@ get_slice_type_from_desc(const struct lower_cmat_state *state, const unsigned element_bits = 32; const unsigned bits = glsl_base_type_get_bit_size(desc.element_type); - unsigned packing_factor = MIN2(elements_per_invocation, - element_bits / bits); - /* Adjust the packing factor so that each row of the matrix fills and - * entire GRF. - * - * The in-register layout of B matrices is different, so those are handled - * more like column major (for row major matrices). See the file comment - * for more details. + /* Each invocation must have at least one dword of data, and that dword + * must be tightly packed with values. No matter the matrix dimensions, a + * matrix of uint8_t data must pack 4 values in each entry. */ - const unsigned actual_cols = desc.use != GLSL_CMAT_USE_B ? desc.cols : desc.rows; - while ((actual_cols / packing_factor) < 8) { - assert(packing_factor > 1); - packing_factor /= 2; - } + const unsigned packing_factor = element_bits / bits; + + assert(elements_per_invocation >= packing_factor); switch (desc.element_type) { case GLSL_TYPE_FLOAT: @@ -172,12 +165,12 @@ get_slice_type_from_desc(const struct lower_cmat_state *state, case GLSL_TYPE_FLOAT16: case GLSL_TYPE_UINT8: case GLSL_TYPE_UINT16: - base_type = glsl_get_base_type(glsl_uintN_t_type(packing_factor * bits)); + base_type = GLSL_TYPE_UINT; break; case GLSL_TYPE_INT: case GLSL_TYPE_INT8: case GLSL_TYPE_INT16: - base_type = glsl_get_base_type(glsl_intN_t_type(packing_factor * bits)); + base_type = GLSL_TYPE_INT; break; default: unreachable("Invalid cooperative matrix element type."); @@ -422,54 +415,30 @@ lower_cmat_unary_op(nir_builder *b, nir_intrinsic_instr *intrin, const nir_op op = nir_intrinsic_alu_op(intrin); - /* There are three possible cases: - * - * 1. dst_packing_factor == src_packing_factor. This is the common case, - * and handling it is straightforward. - * - * 2. dst_packing_factor > src_packing_factor. This occurs when converting a - * float32_t matrix slice to a packed float16_t slice. Loop over the size - * of the destination slice, but read multiple entries from the source - * slice on each iteration. - * - * 3. dst_packing_factor < src_packing_factor. This occurs when converting a - * packed int8_t matrix slice to an int32_t slice. Loop over the size of - * the source slice, but write multiple entries to the destination slice - * on each iteration. - * - * Handle all cases by iterating over the total (non-packed) number of - * elements in the slice. When dst_packing_factor values have been - * calculated, store them. + /* With the combinations of formats exposed on all platforms, matrices with + * the same dimensions will always have the same data size. The only real + * type conversion possible is int32 <-> float32. As a result + * dst_packing_factor == src_packing_factor. */ - assert((dst_packing_factor * glsl_get_vector_elements(dst_slice->type)) == - (src_packing_factor * glsl_get_vector_elements(src_slice->type))); + assert(dst_packing_factor == src_packing_factor); /* Stores at most dst_packing_factor partial results. */ nir_def *v[4]; assert(dst_packing_factor <= 4); - for (unsigned i = 0; i < num_components * dst_packing_factor; i++) { - const unsigned dst_chan_index = i % dst_packing_factor; - const unsigned src_chan_index = i % src_packing_factor; - const unsigned dst_index = i / dst_packing_factor; - const unsigned src_index = i / src_packing_factor; + for (unsigned i = 0; i < num_components; i++) { + nir_def *chan = nir_channel(b, nir_load_deref(b, src_slice), i); - nir_def *src = - nir_channel(b, - nir_unpack_bits(b, - nir_channel(b, - nir_load_deref(b, src_slice), - src_index), - src_bits), - src_chan_index); + for (unsigned j = 0; j < dst_packing_factor; j++) { + nir_def *src = + nir_channel(b, nir_unpack_bits(b, chan, src_bits), j); - v[dst_chan_index] = nir_build_alu1(b, op, src); - - if (dst_chan_index == (dst_packing_factor - 1)) { - results[dst_index] = - nir_pack_bits(b, nir_vec(b, v, dst_packing_factor), - dst_packing_factor * dst_bits); + v[j] = nir_build_alu1(b, op, src); } + + results[i] = + nir_pack_bits(b, nir_vec(b, v, dst_packing_factor), + dst_packing_factor * dst_bits); } nir_store_deref(b, dst_slice, nir_vec(b, results, num_components), diff --git a/src/intel/dev/intel_device_info.c b/src/intel/dev/intel_device_info.c index 5b7cd9a5f08..4695bcc4143 100644 --- a/src/intel/dev/intel_device_info.c +++ b/src/intel/dev/intel_device_info.c @@ -620,7 +620,6 @@ static const struct intel_device_info intel_device_info_chv = { .has_sample_with_hiz = true, \ .has_illegal_ccs_values = true, \ .cooperative_matrix_configurations = { \ - { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \ @@ -852,7 +851,6 @@ static const struct intel_device_info intel_device_info_cfl_gt3 = { .num_subslices = _subslices, \ .max_eus_per_subslice = 8, \ .cooperative_matrix_configurations = { \ - { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \ @@ -985,7 +983,6 @@ static const struct intel_device_info intel_device_info_ehl_2x4 = { .writecombining = PAT_ENTRY(1, WC, NONE), \ }, \ .cooperative_matrix_configurations = { \ - { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \ @@ -1117,7 +1114,6 @@ static const struct intel_device_info intel_device_info_sg1 = { .has_aux_map = false, \ .simulator_id = 29, \ .cooperative_matrix_configurations = { \ - { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \ { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \