diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h index 67a4d1b18ff..86172b4dcc6 100644 --- a/src/intel/compiler/brw/brw_eu_defines.h +++ b/src/intel/compiler/brw/brw_eu_defines.h @@ -552,35 +552,6 @@ enum ENUM_PACKED opcode { SHADER_OPCODE_LOAD_REG, }; -enum sampler_opcode { - /** - * Texture sampling opcodes. - * - * LOGICAL opcodes are eventually translated to SHADER_OPCODE_SEND but - * take parameters as individual sources. See enum tex_logical_srcs. - */ - SAMPLER_OPCODE_TEX_LOGICAL, - SAMPLER_OPCODE_TXD_LOGICAL, - SAMPLER_OPCODE_TXF_LOGICAL, - SAMPLER_OPCODE_TXL_LOGICAL, - SAMPLER_OPCODE_TXS_LOGICAL, - SAMPLER_OPCODE_TXB_LOGICAL, - SAMPLER_OPCODE_TXF_CMS_W_LOGICAL, - SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL, - SAMPLER_OPCODE_TXF_MCS_LOGICAL, - SAMPLER_OPCODE_LOD_LOGICAL, - SAMPLER_OPCODE_TG4_LOGICAL, - SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL, - SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL, - SAMPLER_OPCODE_TG4_BIAS_LOGICAL, - SAMPLER_OPCODE_TG4_OFFSET_LOGICAL, - SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL, - SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL, - SAMPLER_OPCODE_SAMPLEINFO_LOGICAL, - - SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL, -}; - enum send_srcs { /** The 32-bit message descriptor (can be a register) */ SEND_SRC_DESC, @@ -612,26 +583,24 @@ enum fb_write_logical_srcs { }; enum tex_logical_srcs { - /** Texture coordinates */ - TEX_LOGICAL_SRC_COORDINATE, - /** Shadow comparator */ - TEX_LOGICAL_SRC_SHADOW_C, - /** dPdx if the operation takes explicit derivatives, otherwise LOD value */ - TEX_LOGICAL_SRC_LOD, - /** dPdy if the operation takes explicit derivatives */ - TEX_LOGICAL_SRC_LOD2, - /** Min LOD */ - TEX_LOGICAL_SRC_MIN_LOD, - /** Sample index */ - TEX_LOGICAL_SRC_SAMPLE_INDEX, - /** MCS data */ - TEX_LOGICAL_SRC_MCS, /** REQUIRED: Texture surface index */ TEX_LOGICAL_SRC_SURFACE, /** Texture sampler index */ TEX_LOGICAL_SRC_SAMPLER, - /** Texel offset for gathers */ - TEX_LOGICAL_SRC_TG4_OFFSET, + /** Sampler payloads */ + 
TEX_LOGICAL_SRC_PAYLOAD0, + TEX_LOGICAL_SRC_PAYLOAD1, + TEX_LOGICAL_SRC_PAYLOAD2, + TEX_LOGICAL_SRC_PAYLOAD3, + TEX_LOGICAL_SRC_PAYLOAD4, + TEX_LOGICAL_SRC_PAYLOAD5, + TEX_LOGICAL_SRC_PAYLOAD6, + TEX_LOGICAL_SRC_PAYLOAD7, + TEX_LOGICAL_SRC_PAYLOAD8, + TEX_LOGICAL_SRC_PAYLOAD9, + TEX_LOGICAL_SRC_PAYLOAD10, + TEX_LOGICAL_SRC_PAYLOAD11, + TEX_LOGICAL_SRC_PAYLOAD12, TEX_LOGICAL_NUM_SRCS, }; diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index 82605af5861..3b277ac951c 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -88,38 +88,6 @@ static void brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, static void brw_combine_with_vec(const brw_builder &bld, const brw_reg &dst, const brw_reg &src, unsigned n); -static bool -brw_texture_offset(const nir_tex_instr *tex, unsigned src, - uint32_t *offset_bits_out) -{ - if (!nir_src_is_const(tex->src[src].src)) - return false; - - const unsigned num_components = nir_tex_instr_src_size(tex, src); - - /* Combine all three offsets into a single unsigned dword: - * - * bits 11:8 - U Offset (X component) - * bits 7:4 - V Offset (Y component) - * bits 3:0 - R Offset (Z component) - */ - uint32_t offset_bits = 0; - for (unsigned i = 0; i < num_components; i++) { - int offset = nir_src_comp_as_int(tex->src[src].src, i); - - /* offset out of bounds; caller will handle it. 
*/ - if (offset > 7 || offset < -8) - return false; - - const unsigned shift = 4 * (2 - i); - offset_bits |= (offset & 0xF) << shift; - } - - *offset_bits_out = offset_bits; - - return true; -} - static brw_reg setup_imm_b(const brw_builder &bld, int8_t v) { @@ -5945,6 +5913,7 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, brw_reg srcs[TEX_LOGICAL_NUM_SRCS]; srcs[TEX_LOGICAL_SRC_SURFACE] = image; srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */ /* Since the image size is always uniform, we can just emit a SIMD8 * query instruction and splat the result out. @@ -5953,8 +5922,9 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4); brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER, - tmp, srcs, ARRAY_SIZE(srcs))->as_tex(); - inst->sampler_opcode = SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL; + tmp, srcs, 3)->as_tex(); + inst->required_params = 0x1 /* LOD */; + inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO; inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size; inst->size_written = 4 * REG_SIZE * reg_unit(devinfo); @@ -7359,11 +7329,6 @@ static void brw_from_nir_emit_texture(nir_to_brw_state &ntb, nir_tex_instr *instr) { - const intel_device_info *devinfo = ntb.devinfo; - const brw_builder &bld = ntb.bld; - - brw_reg srcs[TEX_LOGICAL_NUM_SRCS]; - /* SKL PRMs: Volume 7: 3D-Media-GPGPU: * * "The Pixel Null Mask field, when enabled via the Pixel Null Mask @@ -7373,270 +7338,170 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb, * * We'll take care of this in NIR. 
*/ - assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE); + assert(!instr->is_sparse || + nir_tex_instr_src_index(instr, nir_tex_src_comparator) == -1); - int lod_components = 0; + const intel_device_info *devinfo = ntb.devinfo; + const brw_builder &bld = ntb.bld; - /* The hardware requires a LOD for buffer textures */ - if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) - srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); + brw_reg srcs[TEX_LOGICAL_NUM_SRCS]; - ASSERTED bool got_lod = false; - ASSERTED bool got_bias = false; - bool pack_lod_bias_and_offset = false; - uint32_t header_bits = 0; + const enum brw_sampler_opcode sampler_opcode = + (enum brw_sampler_opcode)(instr->backend_flags & + ~BRW_TEX_INSTR_FUSED_EU_DISABLE); + const struct brw_sampler_payload_desc *payload_desc = + brw_get_sampler_payload_desc(sampler_opcode); - brw_reg_type default_src_type; - switch (instr->op) { - case nir_texop_txf_ms: - case nir_texop_txf_ms_mcs_intel: - default_src_type = devinfo->verx10 >= 125 ? BRW_TYPE_W : BRW_TYPE_D; - break; - - case nir_texop_txf: - case nir_texop_txs: - default_src_type = BRW_TYPE_D; - break; - - default: - default_src_type = BRW_TYPE_F; - break; - } - - for (unsigned i = 0; i < instr->num_srcs; i++) { - nir_src nir_src = instr->src[i].src; - brw_reg src = get_nir_src(ntb, nir_src, -1); - - /* If the source is not a vector (e.g., a 1D texture coordinate), then - * the eventual LOAD_PAYLOAD lowering will not properly adjust the - * stride, etc., so do it now. 
- */ - if (nir_tex_instr_src_size(instr, i) == 1) - src = offset(src, bld, 0); - - brw_reg_type src_type = BRW_TYPE_F; - switch (instr->src[i].src_type) { - case nir_tex_src_sampler_offset: - case nir_tex_src_texture_offset: - case nir_tex_src_sampler_handle: - case nir_tex_src_texture_handle: - case nir_tex_src_offset: - src_type = BRW_TYPE_D; - break; - - default: - src_type = default_src_type; - break; - } - - switch (instr->src[i].src_type) { - case nir_tex_src_bias: - assert(!got_lod); - got_bias = true; - srcs[TEX_LOGICAL_SRC_LOD] = - retype(get_nir_src_imm(ntb, instr->src[i].src), src_type); - break; - case nir_tex_src_comparator: - srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, src_type); - break; - case nir_tex_src_coord: - srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, src_type); - break; - case nir_tex_src_ddx: - srcs[TEX_LOGICAL_SRC_LOD] = retype(src, src_type); - lod_components = nir_tex_instr_src_size(instr, i); - break; - case nir_tex_src_ddy: - srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, src_type); - break; - case nir_tex_src_lod: - assert(!got_bias); - got_lod = true; - srcs[TEX_LOGICAL_SRC_LOD] = - retype(get_nir_src_imm(ntb, instr->src[i].src), src_type); - break; - case nir_tex_src_min_lod: - srcs[TEX_LOGICAL_SRC_MIN_LOD] = - retype(get_nir_src_imm(ntb, instr->src[i].src), src_type); - break; - case nir_tex_src_ms_index: - srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, src_type); - break; - - case nir_tex_src_offset: { - uint32_t offset_bits = 0; - if (brw_texture_offset(instr, i, &offset_bits)) { - header_bits |= offset_bits; - } else { - /* On gfx12.5+, if the offsets are not both constant and in the - * {-8,7} range, nir_lower_tex() will have already lowered the - * source offset. So we should never reach this point. 
- */ - assert(devinfo->verx10 < 125); - srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = - retype(src, src_type); - } - break; - } - - case nir_tex_src_projector: - UNREACHABLE("should be lowered"); - - case nir_tex_src_texture_offset: - assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE); - /* Emit code to evaluate the actual indexing expression */ - srcs[TEX_LOGICAL_SRC_SURFACE] = - bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD), - brw_imm_ud(instr->texture_index))); - break; - - case nir_tex_src_sampler_offset: - assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle) == -1); - assert(srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE); - /* Emit code to evaluate the actual indexing expression */ - srcs[TEX_LOGICAL_SRC_SAMPLER] = - bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD), - brw_imm_ud(instr->sampler_index))); - break; - - case nir_tex_src_texture_handle: - assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1); - assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE); - srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(src); - break; - - case nir_tex_src_sampler_handle: - assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1); - assert(srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE); - srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(src); - break; - - case nir_tex_src_ms_mcs_intel: - assert(instr->op == nir_texop_txf_ms); - srcs[TEX_LOGICAL_SRC_MCS] = retype(src, src_type); - break; - - /* If this parameter is present, we are packing offset U, V and LOD/Bias - * into a single (32-bit) value. - */ - case nir_tex_src_backend2: - assert(instr->op == nir_texop_tg4); - pack_lod_bias_and_offset = true; - srcs[TEX_LOGICAL_SRC_LOD] = - retype(get_nir_src_imm(ntb, instr->src[i].src), src_type); - break; - - /* If this parameter is present, we are packing either the explicit LOD - * or LOD bias and the array index into a single (32-bit) value when - * 32-bit texture coordinates are used. 
- */ - case nir_tex_src_backend1: - assert(!got_lod && !got_bias); - got_lod = true; - assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb); - srcs[TEX_LOGICAL_SRC_LOD] = - retype(get_nir_src_imm(ntb, instr->src[i].src), src_type); - break; - - default: - UNREACHABLE("unknown texture source"); - } - } - - const bool surface_bindless = nir_tex_instr_src_index( - instr, nir_tex_src_texture_handle) >= 0; - const bool sampler_bindless = nir_tex_instr_src_index( - instr, nir_tex_src_sampler_handle) >= 0; - - /* If the surface or sampler were not specified through sources, use the - * instruction index. - */ - if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) - srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index); - if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) - srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index); - - assert(srcs[TEX_LOGICAL_SRC_MCS].file != BAD_FILE || - instr->op != nir_texop_txf_ms); - - enum sampler_opcode opcode; - switch (instr->op) { - case nir_texop_tex: - opcode = SAMPLER_OPCODE_TEX_LOGICAL; - break; - case nir_texop_txb: - opcode = SAMPLER_OPCODE_TXB_LOGICAL; - break; - case nir_texop_txl: - opcode = SAMPLER_OPCODE_TXL_LOGICAL; - break; - case nir_texop_txd: - opcode = SAMPLER_OPCODE_TXD_LOGICAL; - break; - case nir_texop_txf: - opcode = SAMPLER_OPCODE_TXF_LOGICAL; - break; - case nir_texop_txf_ms: - /* On Gfx12HP there is only CMS_W available. 
From the Bspec: Shared - * Functions - 3D Sampler - Messages - Message Format: - * - * ld2dms REMOVEDBY(GEN:HAS:1406788836) - */ - if (devinfo->verx10 >= 125) - opcode = SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL; - else - opcode = SAMPLER_OPCODE_TXF_CMS_W_LOGICAL; - break; - case nir_texop_txf_ms_mcs_intel: - opcode = SAMPLER_OPCODE_TXF_MCS_LOGICAL; - break; - case nir_texop_query_levels: - case nir_texop_txs: - opcode = SAMPLER_OPCODE_TXS_LOGICAL; - break; - case nir_texop_lod: - opcode = SAMPLER_OPCODE_LOD_LOGICAL; - break; - case nir_texop_tg4: { - if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) { - opcode = SAMPLER_OPCODE_TG4_OFFSET_LOGICAL; + /* First deal with surface & sampler */ + bool surface_bindless = false; + bool sampler_bindless = false; + int src_idx; + { + if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle)) >= 0) { + srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize( + get_nir_src(ntb, instr->src[src_idx].src, -1)); + surface_bindless = true; + } else if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_texture_offset)) >= 0) { + srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize( + bld.ADD(get_nir_src(ntb, instr->src[src_idx].src, -1), + brw_imm_ud(instr->texture_index))); } else { - opcode = SAMPLER_OPCODE_TG4_LOGICAL; - if (devinfo->ver >= 20) { - /* If SPV_AMD_texture_gather_bias_lod extension is enabled, all - * texture gather functions (ie. the ones which do not take the - * extra bias argument and the ones that do) fetch texels from - * implicit LOD in fragment shader stage. In all other shader - * stages, base level is used instead. 
- */ - if (instr->is_gather_implicit_lod) - opcode = SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL; - - if (got_bias) - opcode = SAMPLER_OPCODE_TG4_BIAS_LOGICAL; - - if (got_lod) - opcode = SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL; - - if (pack_lod_bias_and_offset) { - if (got_lod) - opcode = SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL; - if (got_bias) - opcode = SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL; - } - } + srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index); + } + + if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle)) >= 0) { + srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize( + get_nir_src(ntb, instr->src[src_idx].src, -1)); + sampler_bindless = true; + } else if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset)) >= 0) { + srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize( + bld.ADD(get_nir_src(ntb, instr->src[src_idx].src, -1), + brw_imm_ud(instr->sampler_index))); + } else { + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index); } - break; - } - case nir_texop_texture_samples: - opcode = SAMPLER_OPCODE_SAMPLEINFO_LOGICAL; - break; - default: - UNREACHABLE("unknown texture opcode"); } - if (instr->op == nir_texop_tg4) { - header_bits |= instr->component << 16; + /* Now the sampler payload */ + bool has_offset_in_payload = false; + uint32_t n_sources = TEX_LOGICAL_SRC_PAYLOAD0; + uint16_t required_params = 0; + for (uint32_t i = 0; payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID; i++) { + nir_tex_src_type nir_source; + unsigned nir_comp; + +#define P(name) BRW_SAMPLER_PAYLOAD_PARAM_##name +#define S(name, component) do { \ + nir_source = nir_tex_src_##name; \ + nir_comp = component; \ + } while (0) + + struct brw_sampler_payload_src sampler_src = + payload_desc->sources[i]; + + switch (sampler_src.param) { + case P(U): S(coord, 0); break; + case P(V): S(coord, 1); break; + case P(R): S(coord, 2); break; + case P(AI): S(coord, 3); break; + case P(BIAS): S(bias, 0); break; + case 
P(LOD): S(lod, 0); break; + case P(MLOD): S(min_lod, 0); break; + case P(REF): S(comparator, 0); break; + case P(DUDX): S(ddx, 0); break; + case P(DUDY): S(ddy, 0); break; + case P(DVDX): S(ddx, 1); break; + case P(DVDY): S(ddy, 1); break; + case P(DRDX): S(ddx, 2); break; + case P(DRDY): S(ddy, 2); break; + case P(SI): S(ms_index, 0); break; + case P(MCSL): S(ms_mcs_intel, 0); break; + case P(MCSH): S(ms_mcs_intel, 1); break; + case P(MCS0): S(ms_mcs_intel, 0); break; + case P(MCS1): S(ms_mcs_intel, 1); break; + case P(MCS2): S(ms_mcs_intel, 2); break; + case P(MCS3): S(ms_mcs_intel, 3); break; + + case P(OFFU): + S(offset, 0); + has_offset_in_payload = true; + break; + case P(OFFV): + S(offset, 1); + has_offset_in_payload = true; + break; + case P(OFFUV4): + case P(OFFUVR4): + case P(OFFUV6): + case P(OFFUVR6): + case P(BIAS_OFFUV6): + case P(BIAS_OFFUVR4): + case P(LOD_OFFUV6): + case P(LOD_OFFUVR4): + /* There is no payload with 2 packed entries, so backend1 is always + * the one payload parameter packed. */ + S(backend1, 0); + has_offset_in_payload = true; + break; + + case P(BIAS_AI): + case P(LOD_AI): + case P(MLOD_R): + /* There is no payload with 2 packed entries, so backend1 is always + * the one payload parameter packed. */ + S(backend1, 0); + break; + + default: UNREACHABLE("unhandled sampler param"); + } + +#undef P +#undef S + + /* TODO: make sure sources have consistent bit sizes */ + brw_reg param_val = brw_imm_ud(0); + + src_idx = nir_tex_instr_src_index(instr, nir_source); + if (src_idx >= 0 && + nir_comp < instr->src[src_idx].src.ssa->num_components) { + param_val = + get_nir_src(ntb, instr->src[src_idx].src, nir_comp); + } + + /* The hardware requires a LOD for buffer textures */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF && + sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_LOD) { + sampler_src.optional = false; + } + + /* Wa_14012688258: + * + * Don't trim zeros at the end of payload for sample operations + * in cube and cube arrays. 
+ * + * Compiler should send U,V,R parameters even if V,R are 0. + */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + intel_needs_workaround(devinfo, 14012688258) && + (sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_U || + sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_V || + sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_R)) { + sampler_src.optional = false; + } + + srcs[TEX_LOGICAL_SRC_PAYLOAD0 + i] = param_val; + + /* The last source present in the payload dictates the number of + * sources, unless it's required. + * + * We can skip the last source if it's zero. + */ + if (!sampler_src.optional || + !(param_val.file == IMM && param_val.ud == 0)) + n_sources = TEX_LOGICAL_SRC_PAYLOAD0 + i + 1; + + if (!sampler_src.optional) + required_params |= BITFIELD_BIT(i); } brw_reg nir_def_reg = get_nir_def(ntb, instr->def); @@ -7669,31 +7534,32 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb, brw_allocate_vgrf_units(*bld.shader, total_regs * reg_unit(devinfo)), dst_type); - brw_tex_inst *tex = bld.emit(SHADER_OPCODE_SAMPLER, dst, srcs, ARRAY_SIZE(srcs))->as_tex(); - tex->sampler_opcode = opcode; + brw_tex_inst *tex = bld.emit(SHADER_OPCODE_SAMPLER, dst, srcs, n_sources)->as_tex(); + tex->sampler_opcode = sampler_opcode; /* FUSED_EU_DISABLE bit masked off */ tex->surface_bindless = surface_bindless; tex->sampler_bindless = sampler_bindless; - tex->offset = header_bits; tex->size_written = total_regs * grf_size; tex->residency = instr->is_sparse; + tex->required_params = required_params; tex->coord_components = instr->coord_components; - tex->grad_components = lod_components; tex->fused_eu_disable = (instr->backend_flags & BRW_TEX_INSTR_FUSED_EU_DISABLE) != 0; + tex->gather_component = instr->component; - /* Wa_14012688258: + /* If the NIR instruction has an offset param but the sampler payload + * doesn't, we can put the offset into the header of the message. * - * Don't trim zeros at the end of payload for sample operations - * in cube and cube arrays.
+ * The restriction though is that it should be a constant value. */ - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && - intel_needs_workaround(devinfo, 14012688258)) { + if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset)) != -1 && + !has_offset_in_payload) { + assert(nir_src_is_const(instr->src[src_idx].src)); - /* Compiler should send U,V,R parameters even if V,R are 0. */ - if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE) - assert(instr->coord_components >= 3u); - - /* See opt_zero_samples(). */ - tex->keep_payload_trailing_zeros = true; + const unsigned num_components = nir_tex_instr_src_size(instr, src_idx); + for (unsigned i = 0; i < num_components; i++) { + int offset = nir_src_comp_as_int(instr->src[src_idx].src, i); + tex->const_offsets[i] = offset; + } + tex->has_const_offsets = true; } /* With half-floats returns, the stride into a GRF allocation for each diff --git a/src/intel/compiler/brw/brw_inst.cpp b/src/intel/compiler/brw/brw_inst.cpp index 488e5b49afd..c70a9f88445 100644 --- a/src/intel/compiler/brw/brw_inst.cpp +++ b/src/intel/compiler/brw/brw_inst.cpp @@ -502,29 +502,8 @@ brw_inst::components_read(unsigned i) const else return 1; - case SHADER_OPCODE_SAMPLER: { - const brw_tex_inst *tex = as_tex(); - /* Texture coordinates. */ - if (i == TEX_LOGICAL_SRC_COORDINATE) - return tex->coord_components; - /* Texture derivatives. */ - else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) && - tex->sampler_opcode == SAMPLER_OPCODE_TXD_LOGICAL) - return tex->grad_components; - /* Texture offset. 
*/ - else if (i == TEX_LOGICAL_SRC_TG4_OFFSET) - return 2; - /* MCS */ - else if (i == TEX_LOGICAL_SRC_MCS) { - if (tex->sampler_opcode == SAMPLER_OPCODE_TXF_CMS_W_LOGICAL) - return 2; - else if (tex->sampler_opcode == SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) - return 4; - else - return 1; - } else - return 1; - } + case SHADER_OPCODE_SAMPLER: + return 1; case SHADER_OPCODE_MEMORY_LOAD_LOGICAL: if (i == MEMORY_LOGICAL_DATA0) diff --git a/src/intel/compiler/brw/brw_inst.h b/src/intel/compiler/brw/brw_inst.h index ba553d6393f..3cd5a6977ac 100644 --- a/src/intel/compiler/brw/brw_inst.h +++ b/src/intel/compiler/brw/brw_inst.h @@ -27,6 +27,7 @@ #include #include "brw_reg.h" #include "compiler/brw_list.h" +#include "brw_sampler.h" #define MAX_SAMPLER_MESSAGE_SIZE 11 @@ -202,7 +203,6 @@ struct brw_inst : brw_exec_node { */ bool predicate_trivial:1; bool eot:1; - bool keep_payload_trailing_zeros:1; /** * Whether the parameters of the SEND instructions are build with * NoMask (for A32 messages this covers only the surface handle, for @@ -218,7 +218,7 @@ struct brw_inst : brw_exec_node { */ bool fused_eu_disable:1; - uint8_t pad:4; + uint8_t pad:5; }; uint16_t bits; }; @@ -285,10 +285,7 @@ struct brw_send_inst : brw_inst { }; struct brw_tex_inst : brw_inst { - enum sampler_opcode sampler_opcode; - uint32_t offset; - uint8_t coord_components; - uint8_t grad_components; + enum brw_sampler_opcode sampler_opcode; union { struct { /** @@ -308,9 +305,31 @@ struct brw_tex_inst : brw_inst { * Whether the sampler handle is bindless */ bool sampler_bindless:1; + /** + * Whether const_offsets holds meaningful values + */ + bool has_const_offsets:1; + /** + * Coord components + */ + uint8_t coord_components:2; + /** + * Gather component + */ + uint8_t gather_component:2; + /** + * Bitfields payload parameters that cannot be optimized by + * brw_opt_zero_samples() + */ + uint16_t required_params:13; }; - uint8_t bits; + uint32_t bits; }; + + /** + * Constant offsets + */ + int8_t 
const_offsets[3]; }; struct brw_mem_inst : brw_inst { diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp index 567ab45431a..31a2b0f2e59 100644 --- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp @@ -28,6 +28,7 @@ #include "brw_eu.h" #include "brw_shader.h" #include "brw_builder.h" +#include "brw_sampler.h" #include "util/bitpack_helpers.h" @@ -604,92 +605,6 @@ is_high_sampler(const struct intel_device_info *devinfo, const brw_reg &sampler) return sampler.file != IMM || sampler.ud >= 16; } -static unsigned -sampler_msg_type(const intel_device_info *devinfo, - sampler_opcode opcode, bool shadow_compare, - bool lod_is_zero, bool has_min_lod) -{ - switch (opcode) { - case SAMPLER_OPCODE_TEX_LOGICAL: - if (devinfo->ver >= 20 && has_min_lod) { - return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD : - XE2_SAMPLER_MESSAGE_SAMPLE_MLOD; - } else { - return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE : - GFX5_SAMPLER_MESSAGE_SAMPLE; - } - case SAMPLER_OPCODE_TXB_LOGICAL: - return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE : - GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS; - case SAMPLER_OPCODE_TXL_LOGICAL: - assert(!has_min_lod); - if (lod_is_zero) { - return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ : - GFX9_SAMPLER_MESSAGE_SAMPLE_LZ; - } - return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE : - GFX5_SAMPLER_MESSAGE_SAMPLE_LOD; - case SAMPLER_OPCODE_TXS_LOGICAL: - case SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL: - assert(!has_min_lod); - return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO; - case SAMPLER_OPCODE_TXD_LOGICAL: - return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE : - GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS; - case SAMPLER_OPCODE_TXF_LOGICAL: - assert(!has_min_lod); - return lod_is_zero ? 
GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ : - GFX5_SAMPLER_MESSAGE_SAMPLE_LD; - case SAMPLER_OPCODE_TXF_CMS_W_LOGICAL: - case SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: - assert(!has_min_lod); - return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; - case SAMPLER_OPCODE_TXF_MCS_LOGICAL: - assert(!has_min_lod); - return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; - case SAMPLER_OPCODE_LOD_LOGICAL: - assert(!has_min_lod); - return GFX5_SAMPLER_MESSAGE_LOD; - case SAMPLER_OPCODE_TG4_LOGICAL: - assert(!has_min_lod); - return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C : - GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4; - break; - case SAMPLER_OPCODE_TG4_OFFSET_LOGICAL: - assert(!has_min_lod); - return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C : - GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; - case SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL: - assert(!has_min_lod); - assert(devinfo->ver >= 20); - return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C: - XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L; - case SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: - assert(!has_min_lod); - assert(devinfo->ver >= 20); - return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B; - case SAMPLER_OPCODE_TG4_BIAS_LOGICAL: - assert(!has_min_lod); - assert(devinfo->ver >= 20); - return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B; - case SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: - assert(!has_min_lod); - assert(devinfo->ver >= 20); - return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C : - XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L; - case SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: - assert(!has_min_lod); - assert(devinfo->ver >= 20); - return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C : - XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I; - case SAMPLER_OPCODE_SAMPLEINFO_LOGICAL: - assert(!has_min_lod); - return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; - default: - UNREACHABLE("not reached"); - } -} - /** * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to * the given requested_alignment_sz. 
@@ -733,20 +648,28 @@ emit_load_payload_with_padding(const brw_builder &bld, const brw_reg &dst, } static bool -shader_opcode_needs_header(sampler_opcode op, - const struct intel_device_info *devinfo) +sampler_op_needs_header(enum brw_sampler_opcode op, + const struct intel_device_info *devinfo) { switch (op) { - case SAMPLER_OPCODE_TG4_LOGICAL: - case SAMPLER_OPCODE_TG4_OFFSET_LOGICAL: - case SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: - case SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL: - case SAMPLER_OPCODE_TG4_BIAS_LOGICAL: - case SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: - case SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: - case SAMPLER_OPCODE_SAMPLEINFO_LOGICAL: + case BRW_SAMPLER_OPCODE_GATHER4: + case BRW_SAMPLER_OPCODE_GATHER4_B: + case BRW_SAMPLER_OPCODE_GATHER4_C: + case BRW_SAMPLER_OPCODE_GATHER4_I: + case BRW_SAMPLER_OPCODE_GATHER4_I_C: + case BRW_SAMPLER_OPCODE_GATHER4_L: + case BRW_SAMPLER_OPCODE_GATHER4_L_C: + case BRW_SAMPLER_OPCODE_GATHER4_PO: + case BRW_SAMPLER_OPCODE_GATHER4_PO_PACKED: + case BRW_SAMPLER_OPCODE_GATHER4_PO_B: + case BRW_SAMPLER_OPCODE_GATHER4_PO_C: + case BRW_SAMPLER_OPCODE_GATHER4_PO_C_PACKED: + case BRW_SAMPLER_OPCODE_GATHER4_PO_L: + case BRW_SAMPLER_OPCODE_GATHER4_PO_L_C: + case BRW_SAMPLER_OPCODE_SAMPLEINFO: return true; - case SAMPLER_OPCODE_TXF_LOGICAL: + case BRW_SAMPLER_OPCODE_LD: + case BRW_SAMPLER_OPCODE_LD_LZ: /* Xe3 HW does not seem to work unless we force a header. 
*/ return devinfo->ver >= 30; default: @@ -757,13 +680,13 @@ shader_opcode_needs_header(sampler_opcode op, } static bool -sampler_opcode_uses_sampler_state(sampler_opcode op) +sampler_opcode_uses_sampler_state(enum brw_sampler_opcode op) { switch (op) { - case SAMPLER_OPCODE_SAMPLEINFO_LOGICAL: - case SAMPLER_OPCODE_TXF_LOGICAL: - case SAMPLER_OPCODE_TXS_LOGICAL: - case SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL: + case BRW_SAMPLER_OPCODE_RESINFO: + case BRW_SAMPLER_OPCODE_SAMPLEINFO: + case BRW_SAMPLER_OPCODE_LD: + case BRW_SAMPLER_OPCODE_LD_LZ: return false; default: @@ -777,12 +700,12 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, { assert(inst); const brw_reg *src = inst->src; - unsigned src_type_size = 0; + unsigned src_type_size = 4; /* SAMPLEINFO has no payload source */ /* All sources need to have the same size, therefore seek the first valid * and take the size from there. */ - for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) { + for (unsigned i = TEX_LOGICAL_SRC_PAYLOAD0; i < inst->sources; i++) { if (src[i].file != BAD_FILE) { src_type_size = brw_type_size_bytes(src[i].type); break; @@ -797,15 +720,9 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, * which is already in 16-bits unlike the other parameters that need forced * conversion. 
*/ - if (inst->sampler_opcode != SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) { - for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) { - /* surface/sampler don't go in the payload */ - if (i == TEX_LOGICAL_SRC_SURFACE || - i == TEX_LOGICAL_SRC_SAMPLER) - continue; - assert(src[i].file == BAD_FILE || - brw_type_size_bytes(src[i].type) == src_type_size); - } + for (unsigned i = TEX_LOGICAL_SRC_PAYLOAD0; i < inst->sources; i++) { + assert(src[i].file == BAD_FILE || + brw_type_size_bytes(src[i].type) == src_type_size); } #endif @@ -820,8 +737,8 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, * ld_mcs SIMD8H and SIMD16H Only * ld2dms REMOVEDBY(GEN:HAS:1406788836) */ - if (inst->sampler_opcode == SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL || - inst->sampler_opcode == SAMPLER_OPCODE_TXF_MCS_LOGICAL) + if (inst->sampler_opcode == BRW_SAMPLER_OPCODE_LD2DMS_W_GFX125 || + inst->sampler_opcode == BRW_SAMPLER_OPCODE_LD_MCS) src_type_size = 2; return src_type_size * 8; @@ -833,16 +750,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex) const intel_device_info *devinfo = bld.shader->devinfo; const brw_compiler *compiler = bld.shader->compiler; - const brw_reg coordinate = tex->src[TEX_LOGICAL_SRC_COORDINATE]; - const brw_reg shadow_c = tex->src[TEX_LOGICAL_SRC_SHADOW_C]; - const brw_reg lod = tex->src[TEX_LOGICAL_SRC_LOD]; - const brw_reg lod2 = tex->src[TEX_LOGICAL_SRC_LOD2]; - const brw_reg min_lod = tex->src[TEX_LOGICAL_SRC_MIN_LOD]; - const brw_reg sample_index = tex->src[TEX_LOGICAL_SRC_SAMPLE_INDEX]; - const brw_reg mcs = tex->src[TEX_LOGICAL_SRC_MCS]; + const enum brw_sampler_opcode op = tex->sampler_opcode; + const bool surface_bindless = tex->surface_bindless; + const bool sampler_bindless = tex->sampler_bindless; const brw_reg surface = tex->src[TEX_LOGICAL_SRC_SURFACE]; const brw_reg sampler = tex->src[TEX_LOGICAL_SRC_SAMPLER]; - const brw_reg tg4_offset = tex->src[TEX_LOGICAL_SRC_TG4_OFFSET]; const unsigned 
payload_type_bit_size = get_sampler_msg_payload_type_bit_size(devinfo, tex); @@ -853,23 +765,22 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex) /* We never generate EOT sampler messages */ assert(!tex->eot); - const bool surface_bindless = tex->surface_bindless; - const bool sampler_bindless = tex->sampler_bindless; const enum brw_reg_type payload_type = - brw_type_with_size(BRW_TYPE_F, payload_type_bit_size); - const enum brw_reg_type payload_unsigned_type = brw_type_with_size(BRW_TYPE_UD, payload_type_bit_size); - const enum brw_reg_type payload_signed_type = - brw_type_with_size(BRW_TYPE_D, payload_type_bit_size); - unsigned header_size = 0, length = 0; - sampler_opcode op = tex->sampler_opcode; - brw_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE]; - for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) - sources[i] = bld.vgrf(payload_type); - if (shader_opcode_needs_header(op, devinfo) || tex->offset != 0 || - sampler_bindless || is_high_sampler(devinfo, sampler) || - tex->residency) { + const bool needs_header = + sampler_op_needs_header(op, devinfo) || + tex->has_const_offsets || + sampler_bindless || is_high_sampler(devinfo, sampler) || + tex->residency; + + unsigned header_size = needs_header ? reg_unit(devinfo) : 0, length = 0; + brw_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE]; + + for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) + sources[i] = bld.vgrf((i == 0 && needs_header) ? BRW_TYPE_UD : payload_type); + + if (needs_header) { /* For general texture offsets (no txf workaround), we need a header to * put them in. 
* @@ -882,6 +793,12 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex) for (header_size = 0; header_size < reg_unit(devinfo); header_size++) sources[length++] = byte_offset(header, REG_SIZE * header_size); + uint32_t g0_2 = 0; + if (tex->gather_component) + g0_2 |= tex->gather_component << 16; + if (tex->residency) + g0_2 |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */ + /* If we're requesting fewer than four channels worth of response, * and we have an explicit header, we need to set up the sampler * writemask. It's reversed from normal: 1 means "don't write". @@ -895,11 +812,14 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex) if (comps_regs < 4 * comp_regs) { assert(comps_regs % comp_regs == 0); unsigned mask = ~((1 << (comps_regs / comp_regs)) - 1) & 0xf; - tex->offset |= mask << 12; + g0_2 |= mask << 12; } - if (tex->residency) - tex->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */ + if (tex->has_const_offsets) { + g0_2 |= ((tex->const_offsets[2] & 0xf) << 0) | + ((tex->const_offsets[1] & 0xf) << 4) | + ((tex->const_offsets[0] & 0xf) << 8); + } /* Build the actual header */ const brw_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0); @@ -908,8 +828,9 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex) ubld.MOV(header, brw_imm_ud(0)); else ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD)); - if (tex->offset) { - ubld1.MOV(component(header, 2), brw_imm_ud(tex->offset)); + + if (g0_2) { + ubld1.MOV(component(header, 2), brw_imm_ud(g0_2)); } else if (devinfo->ver < 11 && bld.shader->stage != MESA_SHADER_VERTEX && bld.shader->stage != MESA_SHADER_FRAGMENT) { @@ -976,218 +897,14 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex) } } - const bool lod_is_zero = lod.is_zero(); + const unsigned msg_type = brw_get_sampler_hw_opcode(op); - /* On Xe2 and newer platforms, min_lod is the first parameter specifically - * so that a bunch of other, 
possibly unused, parameters don't need to also - * be included. - */ - const unsigned msg_type = - sampler_msg_type(devinfo, op, shadow_c.file != BAD_FILE, lod_is_zero, - min_lod.file != BAD_FILE); - - const bool min_lod_is_first = devinfo->ver >= 20 && - (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD || - msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD); - - if (min_lod_is_first) { - assert(min_lod.file != BAD_FILE); - bld.MOV(sources[length++], min_lod); - } - - if (shadow_c.file != BAD_FILE) { - bld.MOV(sources[length], shadow_c); - length++; - } - - bool coordinate_done = false; - - /* Set up the LOD info */ - switch (op) { - case SAMPLER_OPCODE_TXL_LOGICAL: - if (lod_is_zero) - break; - FALLTHROUGH; - case SAMPLER_OPCODE_TXB_LOGICAL: - case SAMPLER_OPCODE_TG4_BIAS_LOGICAL: - case SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: - case SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL: - case SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: - bld.MOV(sources[length], lod); - length++; - break; - case SAMPLER_OPCODE_TXD_LOGICAL: - /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in - * Xe2+). - */ - assert(bld.dispatch_width() == (8 * reg_unit(devinfo))); - - /* Load dPdx and the coordinate together: - * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z - */ - for (unsigned i = 0; i < tex->coord_components; i++) { - bld.MOV(sources[length++], offset(coordinate, bld, i)); - - /* For cube map array, the coordinate is (u,v,r,ai) but there are - * only derivatives for (u, v, r). 
- */ - if (i < tex->grad_components) { - bld.MOV(sources[length++], offset(lod, bld, i)); - bld.MOV(sources[length++], offset(lod2, bld, i)); - } - } - - coordinate_done = true; - break; - case SAMPLER_OPCODE_TXS_LOGICAL: - sources[length] = retype(sources[length], payload_unsigned_type); - bld.MOV(sources[length++], lod); - break; - case SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL: - /* We need an LOD; just use 0 */ - sources[length] = retype(sources[length], payload_unsigned_type); - bld.MOV(sources[length++], brw_imm_ud(0)); - break; - case SAMPLER_OPCODE_TXF_LOGICAL: - /* On Gfx9 the parameters are intermixed they are u, v, lod, r. */ - sources[length] = retype(sources[length], payload_signed_type); - bld.MOV(sources[length++], offset(coordinate, bld, 0)); - - if (tex->coord_components >= 2) { - sources[length] = retype(sources[length], payload_signed_type); - bld.MOV(sources[length], offset(coordinate, bld, 1)); - } else { - sources[length] = brw_imm_d(0); - } - length++; - - if (!lod_is_zero) { - sources[length] = retype(sources[length], payload_signed_type); - bld.MOV(sources[length++], lod); - } - - for (unsigned i = 2; i < tex->coord_components; i++) { - sources[length] = retype(sources[length], payload_signed_type); - bld.MOV(sources[length++], offset(coordinate, bld, i)); - } - - coordinate_done = true; - break; - - case SAMPLER_OPCODE_TXF_CMS_W_LOGICAL: - case SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: - sources[length] = retype(sources[length], payload_unsigned_type); - bld.MOV(sources[length++], sample_index); - - /* Data from the multisample control surface. */ - for (unsigned i = 0; i < 2; ++i) { - /* Sampler always writes 4/8 register worth of data but for ld_mcs - * only valid data is in first two register. So with 16-bit - * payload, we need to split 2-32bit register into 4-16-bit - * payload. 
- * - * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs - - * Shared Functions - 3D Sampler - Messages - Message Format: - * - * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r - */ - if (op == SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) { - sources[length] = retype(sources[length], payload_unsigned_type); - bld.MOV(sources[length++], - mcs.file == IMM ? mcs : offset(mcs, bld, 2 * i + 0)); - - sources[length] = retype(sources[length], payload_unsigned_type); - bld.MOV(sources[length++], - mcs.file == IMM ? mcs : offset(mcs, bld, 2 * i + 1)); - } else { - sources[length] = retype(sources[length], payload_unsigned_type); - bld.MOV(sources[length++], - mcs.file == IMM ? mcs : offset(mcs, bld, i)); - } - } - FALLTHROUGH; - - case SAMPLER_OPCODE_TXF_MCS_LOGICAL: - /* There is no offsetting for this message; just copy in the integer - * texture coordinates. - */ - for (unsigned i = 0; i < tex->coord_components; i++) { - sources[length] = retype(sources[length], payload_signed_type); - bld.MOV(sources[length++], offset(coordinate, bld, i)); - } - - coordinate_done = true; - break; - case SAMPLER_OPCODE_TG4_OFFSET_LOGICAL: - /* More crazy intermixing */ - for (unsigned i = 0; i < 2; i++) /* u, v */ - bld.MOV(sources[length++], offset(coordinate, bld, i)); - - for (unsigned i = 0; i < 2; i++) { /* offu, offv */ - sources[length] = retype(sources[length], payload_signed_type); - bld.MOV(sources[length++], offset(tg4_offset, bld, i)); - } - - if (tex->coord_components == 3) /* r if present */ - bld.MOV(sources[length++], offset(coordinate, bld, 2)); - - coordinate_done = true; - break; - default: - break; - } - - /* Set up the coordinate (except for cases where it was done above) */ - if (!coordinate_done) { - for (unsigned i = 0; i < tex->coord_components; i++) - bld.MOV(retype(sources[length++], payload_type), - offset(coordinate, bld, i)); - } - - if (min_lod.file != BAD_FILE && !min_lod_is_first) { - /* Account for all of the missing coordinate sources */ - if (op == 
SAMPLER_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) { - /* Bspec 64985: - * - * For sample_b sampler message format: - * - * SIMD16H/SIMD32H - * Param Number 0 1 2 3 4 5 - * Param BIAS U V R Ai MLOD - * - * SIMD16/SIMD32 - * Param Number 0 1 2 3 4 - * Param BIAS_AI U V R MLOD - */ - length += 3 - tex->coord_components; - } else if (op == SAMPLER_OPCODE_TXD_LOGICAL && devinfo->verx10 >= 125) { - /* On DG2 and newer platforms, sample_d can only be used with 1D and - * 2D surfaces, so the maximum number of gradient components is 2. - * In spite of this limitation, the Bspec lists a mysterious R - * component before the min_lod, so the maximum coordinate components - * is 3. - * - * See bspec 45942, "Enable new message layout for cube array" - */ - length += 3 - tex->coord_components; - length += (2 - tex->grad_components) * 2; - } else { - length += 4 - tex->coord_components; - if (op == SAMPLER_OPCODE_TXD_LOGICAL) - length += (3 - tex->grad_components) * 2; - } - - bld.MOV(sources[length++], min_lod); - - /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */ - if (intel_needs_workaround(devinfo, 14014595444) && - op == SAMPLER_OPCODE_TXB_LOGICAL && shadow_c.file == BAD_FILE) - bld.MOV(sources[length++], min_lod); - } + for (uint32_t i = TEX_LOGICAL_SRC_PAYLOAD0; i < tex->sources; i++) + bld.MOV(retype(sources[length++], payload_type), retype(tex->src[i], payload_type)); const brw_reg src_payload = retype(brw_allocate_vgrf_units(*bld.shader, length * bld.dispatch_width() / 8), - BRW_TYPE_F); + BRW_TYPE_UD); /* In case of 16-bit payload each component takes one full register in * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16 * elements. 
In SIMD8H case hardware simply expects the components to be diff --git a/src/intel/compiler/brw/brw_lower_simd_width.cpp b/src/intel/compiler/brw/brw_lower_simd_width.cpp index ed1dbb2f425..2091ecaf4ef 100644 --- a/src/intel/compiler/brw/brw_lower_simd_width.cpp +++ b/src/intel/compiler/brw/brw_lower_simd_width.cpp @@ -162,69 +162,35 @@ static unsigned get_sampler_lowered_simd_width(const struct intel_device_info *devinfo, const brw_tex_inst *tex) { - /* On gfx12 parameters are fixed to 16-bit values and therefore they all - * always fit regardless of the execution size. - */ - if (tex->sampler_opcode == SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) - return MIN2(16, tex->exec_size); - /* TXD is unsupported in SIMD16 mode previous to Xe2. SIMD32 is still * unsuppported on Xe2. */ - if (tex->sampler_opcode == SAMPLER_OPCODE_TXD_LOGICAL) + if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_SAMPLE_D || + tex->sampler_opcode == BRW_SAMPLER_OPCODE_SAMPLE_D_REDUCED || + tex->sampler_opcode == BRW_SAMPLER_OPCODE_SAMPLE_D_C || + tex->sampler_opcode == BRW_SAMPLER_OPCODE_SAMPLE_D_C_PACKED) return devinfo->ver < 20 ? 8 : 16; - /* If we have a min_lod parameter on anything other than a simple sample - * message, it will push it over 5 arguments and we have to fall back to - * SIMD8. - */ - if (tex->sampler_opcode != SAMPLER_OPCODE_TEX_LOGICAL && - tex->components_read(TEX_LOGICAL_SRC_MIN_LOD)) - return devinfo->ver < 20 ? 8 : 16; + const unsigned max_payload_size = + MAX_SAMPLER_MESSAGE_SIZE * + (reg_unit(devinfo) * 8) /* min SIMD */ * + 4 /* dword */; + const unsigned payload_param_size = + brw_type_size_bytes(tex->src[TEX_LOGICAL_SRC_PAYLOAD0].type); + unsigned payload_size = + (tex->sources - TEX_LOGICAL_SRC_PAYLOAD0) * + tex->exec_size * + payload_param_size; - /* On Gfx9+ the LOD argument is for free if we're able to use the LZ - * variant of the TXL or TXF message. 
- */ - const bool implicit_lod = (tex->sampler_opcode == SAMPLER_OPCODE_TXL_LOGICAL || - tex->sampler_opcode == SAMPLER_OPCODE_TXF_LOGICAL) && - tex->src[TEX_LOGICAL_SRC_LOD].is_zero(); - - /* Calculate the total number of argument components that need to be passed - * to the sampler unit. - */ - unsigned num_payload_components = - tex->coord_components + - tex->components_read(TEX_LOGICAL_SRC_SHADOW_C) + - (implicit_lod ? 0 : tex->components_read(TEX_LOGICAL_SRC_LOD)) + - tex->components_read(TEX_LOGICAL_SRC_LOD2) + - tex->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) + - (tex->sampler_opcode == SAMPLER_OPCODE_TG4_OFFSET_LOGICAL ? - tex->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) + - tex->components_read(TEX_LOGICAL_SRC_MCS) + - tex->components_read(TEX_LOGICAL_SRC_MIN_LOD); - - - if (tex->sampler_opcode == SAMPLER_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) { - num_payload_components += 3 - tex->coord_components; - } else if (tex->sampler_opcode == SAMPLER_OPCODE_TXD_LOGICAL && - devinfo->verx10 >= 125 && devinfo->ver < 20) { - num_payload_components += - 3 - tex->coord_components + (2 - tex->grad_components) * 2; - } else { - num_payload_components += 4 - tex->coord_components; - if (tex->sampler_opcode == SAMPLER_OPCODE_TXD_LOGICAL) - num_payload_components += (3 - tex->grad_components) * 2; + unsigned simd_width = tex->exec_size; + while (payload_size > max_payload_size) { + payload_size /= 2; + simd_width /= 2; } + const unsigned max_hw_simd = devinfo->ver < 20 ? 16 : 32; - const unsigned simd_limit = reg_unit(devinfo) * - (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16); - - /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the - * maximum message size supported by the sampler, regardless of whether a - * header is provided or not. 
- */ - return MIN2(tex->exec_size, simd_limit); + return MIN2(simd_width, max_hw_simd); } static bool diff --git a/src/intel/compiler/brw/brw_nir_lower_texture.c b/src/intel/compiler/brw/brw_nir_lower_texture.c index 8aaada7fb4a..dcbd4a21294 100644 --- a/src/intel/compiler/brw/brw_nir_lower_texture.c +++ b/src/intel/compiler/brw/brw_nir_lower_texture.c @@ -141,11 +141,30 @@ pack_lod_and_array_index(nir_builder *b, nir_tex_instr *tex) return true; } +static nir_def * +build_packed_offset(nir_builder *b, + nir_def *offset, + unsigned offset_bits, + unsigned offset_count) +{ + offset = nir_iand_imm(b, offset, BITFIELD_MASK(offset_bits)); + + nir_def *offuvr = nir_channel(b, offset, 0); + for (unsigned i = 1; i < MIN2(offset->num_components, offset_count); i++) { + nir_def *chan = nir_channel(b, offset, i); + offuvr = nir_ior(b, offuvr, nir_ishl_imm(b, chan, i * offset_bits)); + } + + return offuvr; +} + /** * Pack either the explicit LOD/Bias and the offset together. */ static bool -pack_lod_or_bias_and_offset(nir_builder *b, nir_tex_instr *tex) +pack_lod_or_bias_and_offset(nir_builder *b, nir_tex_instr *tex, + unsigned offset_bits, + unsigned offset_count) { int offset_index = nir_tex_instr_src_index(tex, nir_tex_src_offset); if (offset_index < 0) @@ -175,7 +194,6 @@ pack_lod_or_bias_and_offset(nir_builder *b, nir_tex_instr *tex) } nir_def *lod = tex->src[lod_index].src.ssa; - nir_def *offset = tex->src[offset_index].src.ssa; b->cursor = nir_before_instr(&tex->instr); @@ -192,16 +210,20 @@ pack_lod_or_bias_and_offset(nir_builder *b, nir_tex_instr *tex) * ------------------------------------------ * |OffsetUV | LOD/Bias | OffsetV | OffsetU | * ------------------------------------------ + * + * Or + * --------------------------------------------------- + * |Bits | [31:12] | [11:8] | [7:4] | [3:0] | + * ---------------------------------------------------- + * |OffsetUV | LOD/Bias | OffsetR | OffsetV | OffsetU | + * ---------------------------------------------------- */
- nir_def *offu = nir_iand_imm(b, nir_channel(b, offset, 0), 0x3F); - nir_def *offv = nir_iand_imm(b, nir_channel(b, offset, 1), 0x3F); + nir_def *offuvr = build_packed_offset( + b, tex->src[offset_index].src.ssa, offset_bits, offset_count); - nir_def *offsetUV = nir_ior(b, offu, nir_ishl_imm(b, offv, 6)); - - nir_def *lod_offsetUV = nir_ior(b, offsetUV, - nir_iand_imm(b, lod, 0xFFFFF000)); + nir_def *packed = nir_ior(b, offuvr, nir_iand_imm(b, lod, 0xFFFFF000)); nir_tex_instr_remove_src(tex, offset_index); - nir_tex_instr_add_src(tex, nir_tex_src_backend2, lod_offsetUV); + nir_tex_instr_add_src(tex, nir_tex_src_backend1, packed); return true; } @@ -219,9 +241,15 @@ brw_nir_lower_texture_instr(nir_builder *b, nir_tex_instr *tex, void *cb_data) if (brw_sampler_opcode_param_index(sampler_opcode, BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUV6) != -1 || - brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUV6) != -1) - return pack_lod_or_bias_and_offset(b, tex); + brw_sampler_opcode_param_index(sampler_opcode, + BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUV6) != -1) + return pack_lod_or_bias_and_offset(b, tex, 6, 2); + + if (brw_sampler_opcode_param_index(sampler_opcode, + BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUVR4) != -1 || + brw_sampler_opcode_param_index(sampler_opcode, + BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUVR4) != -1) + return pack_lod_or_bias_and_offset(b, tex, 4, 3); return false; } @@ -321,7 +349,7 @@ brw_nir_lower_mcs_fetch_instr(nir_builder *b, nir_tex_instr *tex, void *cb_data) break; default: - continue; + break; } } diff --git a/src/intel/compiler/brw/brw_opt.cpp b/src/intel/compiler/brw/brw_opt.cpp index 97f012487e4..22721be7e85 100644 --- a/src/intel/compiler/brw/brw_opt.cpp +++ b/src/intel/compiler/brw/brw_opt.cpp @@ -110,6 +110,12 @@ brw_optimize(brw_shader &s) OPT(brw_lower_simd_width); OPT(brw_lower_scalar_fp64_MAD); OPT(brw_lower_barycentrics); + + /* Identify trailing zeros LOAD_PAYLOAD of sampler messages. 
Do this before + * lowering the send messages. + */ + OPT(brw_opt_zero_samples); + OPT(brw_lower_logical_sends); brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING); @@ -119,15 +125,6 @@ brw_optimize(brw_shader &s) if (!OPT(brw_opt_copy_propagation_defs)) OPT(brw_opt_copy_propagation); - /* Identify trailing zeros LOAD_PAYLOAD of sampler messages. - * Do this before splitting SENDs. - */ - if (OPT(brw_opt_zero_samples)) { - if (!OPT(brw_opt_copy_propagation_defs)) { - OPT(brw_opt_copy_propagation); - } - } - if (s.devinfo->ver >= 30) OPT(brw_opt_send_to_send_gather); @@ -264,56 +261,21 @@ brw_opt_zero_samples(brw_shader &s) bool progress = false; foreach_block_and_inst(block, brw_inst, inst, s.cfg) { - if (inst->opcode != SHADER_OPCODE_SEND) + brw_tex_inst *tex = inst->as_tex(); + if (tex == NULL) continue; - brw_send_inst *send = inst->as_send(); - if (send->sfid != BRW_SFID_SAMPLER) - continue; + int last_req_param = util_last_bit(tex->required_params) - 1; + assert(last_req_param <= (tex->sources - TEX_LOGICAL_SRC_PAYLOAD0)); - /* Wa_14012688258: - * - * Don't trim zeros at the end of payload for sample operations - * in cube and cube arrays. - */ - if (send->keep_payload_trailing_zeros) - continue; + int last_param = tex->sources - 1 - TEX_LOGICAL_SRC_PAYLOAD0; - /* This pass works on SENDs before splitting. */ - if (send->ex_mlen > 0) - continue; - - brw_inst *prev = (brw_inst *) send->prev; - - if (prev->is_head_sentinel() || prev->opcode != SHADER_OPCODE_LOAD_PAYLOAD) - continue; - - brw_load_payload_inst *lp = prev->as_load_payload(); - - /* How much of the payload are actually read by this SEND. */ - const unsigned params = - load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE); - - /* We don't want to remove the message header or the first parameter. 
- * Removing the first parameter is not allowed, see the Haswell PRM - * volume 7, page 149: - * - * "Parameter 0 is required except for the sampleinfo message, which - * has no parameter 0" - */ - const unsigned first_param_idx = lp->header_size; - unsigned zero_size = 0; - for (unsigned i = params - 1; i > first_param_idx; i--) { - if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero()) + for (int i = last_param; i > last_req_param; i--) { + if (tex->src[TEX_LOGICAL_SRC_PAYLOAD0 + i].file != IMM || + tex->src[TEX_LOGICAL_SRC_PAYLOAD0 + i].ud != 0) break; - zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride; - } - /* Round down to ensure to only consider full registers. */ - const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo)); - if (zero_len > 0) { - /* Note mlen is in REG_SIZE units. */ - send->mlen -= zero_len; + tex->sources = TEX_LOGICAL_SRC_PAYLOAD0 + i; progress = true; } } diff --git a/src/intel/compiler/brw/brw_opt_cse.cpp b/src/intel/compiler/brw/brw_opt_cse.cpp index 8e2267b9479..ee2e6695987 100644 --- a/src/intel/compiler/brw/brw_opt_cse.cpp +++ b/src/intel/compiler/brw/brw_opt_cse.cpp @@ -239,12 +239,16 @@ static bool tex_inst_match(brw_tex_inst *a, brw_tex_inst *b) { return a->sampler_opcode == b->sampler_opcode && - a->offset == b->offset && a->surface_bindless == b->surface_bindless && a->sampler_bindless == b->sampler_bindless && + a->residency == b->residency && + a->required_params == b->required_params && a->coord_components == b->coord_components && - a->grad_components == b->grad_components && - a->residency == b->residency; + a->gather_component == b->gather_component && + a->has_const_offsets == b->has_const_offsets && + a->const_offsets[0] == b->const_offsets[0] && + a->const_offsets[1] == b->const_offsets[1] && + a->const_offsets[2] == b->const_offsets[2]; } static bool @@ -389,12 +393,13 @@ hash_inst(const void *v) case BRW_KIND_TEX: { const brw_tex_inst *tex = 
inst->as_tex(); const uint8_t tex_u8data[] = { - tex->coord_components, - tex->grad_components, - tex->bits, + tex->sampler_opcode, + (uint8_t)tex->const_offsets[0], + (uint8_t)tex->const_offsets[1], + (uint8_t)tex->const_offsets[2], }; const uint32_t tex_u32data[] = { - tex->sampler_opcode, + tex->bits, }; hash = HASH(hash, tex_u8data); hash = HASH(hash, tex_u32data); diff --git a/src/intel/compiler/brw/brw_opt_txf_combiner.cpp b/src/intel/compiler/brw/brw_opt_txf_combiner.cpp index 336150b25b9..ec3fe257021 100644 --- a/src/intel/compiler/brw/brw_opt_txf_combiner.cpp +++ b/src/intel/compiler/brw/brw_opt_txf_combiner.cpp @@ -48,6 +48,87 @@ sources_match(ASSERTED const brw_def_analysis &defs, return brw_regs_equal(&a->src[src], &b->src[src]); } +static void +merge_instructions(brw_shader &s, brw_tex_inst **txfs, unsigned count) +{ + const unsigned min_simd = 8 * reg_unit(s.devinfo); + const unsigned max_simd = 16 * reg_unit(s.devinfo); + const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo); + + for (unsigned curr = 0; curr < count; curr += max_simd) { + const unsigned lanes = CLAMP(count - curr, min_simd, max_simd); + const unsigned width = util_next_power_of_two(lanes); + const brw_builder ubld = + brw_builder(&s).before(txfs[curr]).exec_all().group(width, 0); + const brw_builder ubld1 = ubld.group(1, 0); + + enum brw_reg_type coord_type = + txfs[curr]->src[TEX_LOGICAL_SRC_PAYLOAD0].type; + brw_reg coord = ubld.vgrf(coord_type); + brw_reg coord_comps[32]; + + for (unsigned i = 0; i < width; i++) { + /* Our block size might be larger than the number of convergent + * loads we're combining. If so, repeat the last component. 
+ */ + if (txfs[curr+i]) + coord_comps[i] = txfs[curr+i]->src[TEX_LOGICAL_SRC_PAYLOAD0]; + else + coord_comps[i] = coord_comps[i-1]; + } + ubld1.VEC(coord, coord_comps, width); + + brw_reg srcs[TEX_LOGICAL_NUM_SRCS]; + srcs[TEX_LOGICAL_SRC_SURFACE] = txfs[0]->src[TEX_LOGICAL_SRC_SURFACE]; + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_PAYLOAD0] = coord; + for (unsigned i = TEX_LOGICAL_SRC_PAYLOAD1; i < txfs[0]->sources; i++) + srcs[i] = txfs[0]->src[i]; + + /* Each of our txf may have a reduced response length if some + * components are never read. Use the maximum of the sizes. + */ + unsigned new_dest_comps = 0; + for (unsigned i = 0; i < width; i++) { + const unsigned this_comps = dest_comps_for_txf(s, txfs[curr+i]); + new_dest_comps = MAX2(new_dest_comps, this_comps); + } + + /* Emit the new divergent TXF */ + brw_reg div = ubld.vgrf(BRW_TYPE_UD, new_dest_comps); + brw_tex_inst *div_txf = + ubld.emit(SHADER_OPCODE_SAMPLER, div, srcs, txfs[0]->sources)->as_tex(); + div_txf->surface_bindless = txfs[0]->surface_bindless; + div_txf->sampler_opcode = txfs[0]->sampler_opcode; + div_txf->residency = false; + + /* Update it to also use response length reduction */ + const unsigned per_component_regs = + DIV_ROUND_UP(brw_type_size_bytes(div.type) * div_txf->exec_size, + grf_size); + div_txf->size_written = new_dest_comps * per_component_regs * grf_size; + + for (unsigned i = 0; i < width; i++) { + brw_inst *txf = txfs[curr+i]; + if (!txf) + break; + + const brw_builder ibld = brw_builder(txf); + + /* Replace each of the original TXFs with MOVs from our new one */ + const unsigned dest_comps = dest_comps_for_txf(s, txf); + assert(dest_comps <= 4); + + brw_reg v[4]; + for (unsigned c = 0; c < dest_comps; c++) + v[c] = component(offset(div, ubld, c), i); + ibld.VEC(retype(txf->dst, BRW_TYPE_UD), v, dest_comps); + + txf->remove(); + } + } +} + /** * Look for a series of convergent texture buffer fetches within a basic * block and combine them into a 
single divergent load with one lane for @@ -82,23 +163,22 @@ brw_opt_combine_convergent_txf(brw_shader &s) { const brw_def_analysis &defs = s.def_analysis.require(); - const unsigned min_simd = 8 * reg_unit(s.devinfo); - const unsigned max_simd = 16 * reg_unit(s.devinfo); - const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo); - bool progress = false; foreach_block(block, s.cfg) { /* Gather a list of convergent TXFs to the same surface in this block */ - brw_tex_inst *txfs[32] = {}; - unsigned count = 0; + brw_tex_inst *txfs_ld[32] = {}; + brw_tex_inst *txfs_ld_lz[32] = {}; + unsigned ld_count = 0; + unsigned ld_lz_count = 0; foreach_inst_in_block(brw_inst, inst, block) { brw_tex_inst *tex = inst->as_tex(); if (tex == NULL) continue; - if (tex->sampler_opcode != SAMPLER_OPCODE_TXF_LOGICAL) + if (tex->sampler_opcode != BRW_SAMPLER_OPCODE_LD && + tex->sampler_opcode != BRW_SAMPLER_OPCODE_LD_LZ) continue; /* Only handle buffers or single miplevel 1D images for now */ @@ -111,120 +191,48 @@ brw_opt_combine_convergent_txf(brw_shader &s) if (tex->predicate || tex->force_writemask_all) continue; - if (!is_uniform_def(defs, tex->src[TEX_LOGICAL_SRC_LOD]) || - !is_uniform_def(defs, tex->src[TEX_LOGICAL_SRC_SURFACE])) + if (!is_uniform_def(defs, tex->src[TEX_LOGICAL_SRC_SURFACE])) continue; /* Only handle immediates for now: we could check is_uniform(), * but we'd need to ensure the coordinate's definition reaches * txfs[0] which is where we'll insert the combined coordinate. 
*/ - if (tex->src[TEX_LOGICAL_SRC_COORDINATE].file != IMM) + if (tex->src[TEX_LOGICAL_SRC_PAYLOAD0].file != IMM) continue; - /* texelFetch from 1D buffers shouldn't have any of these */ - assert(tex->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE); - assert(tex->src[TEX_LOGICAL_SRC_LOD2].file == BAD_FILE); - assert(tex->src[TEX_LOGICAL_SRC_MIN_LOD].file == BAD_FILE); - assert(tex->src[TEX_LOGICAL_SRC_SAMPLE_INDEX].file == BAD_FILE); - assert(tex->src[TEX_LOGICAL_SRC_MCS].file == BAD_FILE); - assert(tex->src[TEX_LOGICAL_SRC_TG4_OFFSET].file == BAD_FILE); - assert(tex->grad_components == 0); + brw_tex_inst *tex0 = tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD ? + txfs_ld[0] : txfs_ld_lz[0]; - if (count > 0 && - (!sources_match(defs, tex, txfs[0], TEX_LOGICAL_SRC_LOD) || - !sources_match(defs, tex, txfs[0], TEX_LOGICAL_SRC_SURFACE) || - tex->surface_bindless != txfs[0]->surface_bindless || - !sources_match(defs, tex, txfs[0], TEX_LOGICAL_SRC_SAMPLER) || - tex->sampler_bindless != txfs[0]->sampler_bindless)) - continue; + if (tex0 != NULL) { + if (!sources_match(defs, tex, tex0, TEX_LOGICAL_SRC_SURFACE) || + tex->surface_bindless != tex0->surface_bindless) + continue; - txfs[count++] = tex; + if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD) { + if (ld_count > 0 && + !sources_match(defs, tex, tex0, TEX_LOGICAL_SRC_PAYLOAD2)) + continue; + } + } - if (count == ARRAY_SIZE(txfs)) + if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD) + txfs_ld[ld_count++] = tex; + if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD_LZ) + txfs_ld_lz[ld_lz_count++] = tex; + + if (ld_count == ARRAY_SIZE(txfs_ld) || + ld_lz_count == ARRAY_SIZE(txfs_ld_lz)) break; } - /* Need at least two things to combine. 
*/ - if (count < 2) - continue; - /* Emit divergent TXFs and replace the original ones with MOVs */ - for (unsigned curr = 0; curr < count; curr += max_simd) { - const unsigned lanes = CLAMP(count - curr, min_simd, max_simd); - const unsigned width = util_next_power_of_two(lanes); - const brw_builder ubld = - brw_builder(&s).before(txfs[curr]).exec_all().group(width, 0); - const brw_builder ubld1 = ubld.group(1, 0); - - enum brw_reg_type coord_type = - txfs[curr]->src[TEX_LOGICAL_SRC_COORDINATE].type; - brw_reg coord = ubld.vgrf(coord_type); - brw_reg coord_comps[32]; - - for (unsigned i = 0; i < width; i++) { - /* Our block size might be larger than the number of convergent - * loads we're combining. If so, repeat the last component. - */ - if (txfs[curr+i]) - coord_comps[i] = txfs[curr+i]->src[TEX_LOGICAL_SRC_COORDINATE]; - else - coord_comps[i] = coord_comps[i-1]; - } - ubld1.VEC(coord, coord_comps, width); - - brw_reg srcs[TEX_LOGICAL_NUM_SRCS]; - srcs[TEX_LOGICAL_SRC_COORDINATE] = coord; - srcs[TEX_LOGICAL_SRC_LOD] = txfs[0]->src[TEX_LOGICAL_SRC_LOD]; - srcs[TEX_LOGICAL_SRC_SURFACE] = txfs[0]->src[TEX_LOGICAL_SRC_SURFACE]; - srcs[TEX_LOGICAL_SRC_SAMPLER] = txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER]; - - /* Each of our txf may have a reduced response length if some - * components are never read. Use the maximum of the sizes. 
- */ - unsigned new_dest_comps = 0; - for (unsigned i = 0; i < width; i++) { - const unsigned this_comps = dest_comps_for_txf(s, txfs[curr+i]); - new_dest_comps = MAX2(new_dest_comps, this_comps); - } - - /* Emit the new divergent TXF */ - brw_reg div = ubld.vgrf(BRW_TYPE_UD, new_dest_comps); - brw_tex_inst *div_txf = - ubld.emit(SHADER_OPCODE_SAMPLER, div, srcs, - TEX_LOGICAL_NUM_SRCS)->as_tex(); - div_txf->surface_bindless = txfs[0]->surface_bindless; - div_txf->sampler_bindless = txfs[0]->sampler_bindless; - div_txf->sampler_opcode = SAMPLER_OPCODE_TXF_LOGICAL; - div_txf->coord_components = 1; - div_txf->grad_components = 0; - div_txf->residency = false; - - /* Update it to also use response length reduction */ - const unsigned per_component_regs = - DIV_ROUND_UP(brw_type_size_bytes(div.type) * div_txf->exec_size, - grf_size); - div_txf->size_written = new_dest_comps * per_component_regs * grf_size; - - for (unsigned i = 0; i < width; i++) { - brw_inst *txf = txfs[curr+i]; - if (!txf) - break; - - const brw_builder ibld = brw_builder(txf); - - /* Replace each of the original TXFs with MOVs from our new one */ - const unsigned dest_comps = dest_comps_for_txf(s, txf); - assert(dest_comps <= 4); - - brw_reg v[4]; - for (unsigned c = 0; c < dest_comps; c++) - v[c] = component(offset(div, ubld, c), i); - ibld.VEC(retype(txf->dst, BRW_TYPE_UD), v, dest_comps); - - txf->remove(); - } - + if (ld_count >= 2) { + merge_instructions(s, txfs_ld, ld_count); + progress = true; + } + if (ld_lz_count >= 2) { + merge_instructions(s, txfs_ld_lz, ld_lz_count); progress = true; } } diff --git a/src/intel/compiler/brw/brw_print.cpp b/src/intel/compiler/brw/brw_print.cpp index 71617c772ab..2abf80b3eb5 100644 --- a/src/intel/compiler/brw/brw_print.cpp +++ b/src/intel/compiler/brw/brw_print.cpp @@ -74,32 +74,6 @@ brw_print_instructions(const brw_shader &s, FILE *file) } } -static const char * -brw_sampler_opcode_name(sampler_opcode opcode) { - switch (opcode) { - case 
SAMPLER_OPCODE_TEX_LOGICAL: return "tex_logical"; - case SAMPLER_OPCODE_TXD_LOGICAL: return "txd_logical"; - case SAMPLER_OPCODE_TXF_LOGICAL: return "txf_logical"; - case SAMPLER_OPCODE_TXL_LOGICAL: return "txl_logical"; - case SAMPLER_OPCODE_TXS_LOGICAL: return "txs_logical"; - case SAMPLER_OPCODE_TXB_LOGICAL: return "txb_logical"; - case SAMPLER_OPCODE_TXF_CMS_W_LOGICAL: return "txf_cms_w_logical"; - case SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: return "txf_cms_w_gfx12_logical"; - case SAMPLER_OPCODE_TXF_MCS_LOGICAL: return "txf_mcs_logical"; - case SAMPLER_OPCODE_LOD_LOGICAL: return "lod_logical"; - case SAMPLER_OPCODE_TG4_LOGICAL: return "tg4_logical"; - case SAMPLER_OPCODE_TG4_OFFSET_LOGICAL: return "tg4_offset_logical"; - case SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL: return "tg4_offset_lod_logical"; - case SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: return "tg4_offset_bias_logical"; - case SAMPLER_OPCODE_TG4_BIAS_LOGICAL: return "tg4_b_logical"; - case SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: return "tg4_l_logical"; - case SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: return "tg4_i_logical"; - case SAMPLER_OPCODE_SAMPLEINFO_LOGICAL: return "sampleinfo_logical"; - case SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL: return "image_size_logical"; - default: UNREACHABLE("invalid sampler opcode"); - } -} - static const char * brw_instruction_name(const struct brw_isa_info *isa, const brw_inst *inst) { @@ -474,12 +448,34 @@ brw_print_instruction(const brw_shader &s, const brw_inst *inst, FILE *file, con fprintf(file, " coherent"); } + const brw_tex_inst *tex = inst->as_tex(); + const struct brw_sampler_payload_desc *tex_payload = NULL; + if (tex) + tex_payload = brw_get_sampler_payload_desc(tex->sampler_opcode); + for (int i = 0; i < inst->sources; i++) { if (mem) { if (print_memory_logical_source(file, inst, i)) continue; - } else { - fprintf(file, ", "); + } + + fprintf(file, ", "); + + if (tex_payload) { + switch (i) { + case TEX_LOGICAL_SRC_SURFACE: + fprintf(file, "surf: "); + break; 
+ case TEX_LOGICAL_SRC_SAMPLER: + fprintf(file, "smpl: "); + break; + default: + fprintf(file, "%s: ", + brw_sampler_payload_param_name( + tex_payload->sources[ + i - TEX_LOGICAL_SRC_PAYLOAD0].param)); + break; + } } if (inst->src[i].negate) @@ -634,10 +630,16 @@ brw_print_instruction(const brw_shader &s, const brw_inst *inst, FILE *file, con fprintf(file, ", surface bindless"); if (tex->sampler_bindless) fprintf(file, ", sampler bindless"); - fprintf(file, ", grad_comps: %uu", tex->grad_components); - fprintf(file, ", coord_comps: %uu", tex->coord_components); - fprintf(file, ", grad_comps: %uu", tex->grad_components); - fprintf(file, ", residency: %s", tex->residency ? "true" : "false"); + if (brw_sampler_opcode_is_gather(tex->sampler_opcode)) + fprintf(file, ", gather_comp: %hhu", tex->gather_component); + if (tex->has_const_offsets) { + fprintf(file, ", offsets: %hhi,%hhi,%hhi", + tex->const_offsets[0], + tex->const_offsets[1], + tex->const_offsets[2]); + } + if (tex->residency) + fprintf(file, ", residency"); } fprintf(file, " "); diff --git a/src/intel/compiler/brw/test_opt_cmod_propagation.cpp b/src/intel/compiler/brw/test_opt_cmod_propagation.cpp index 3c9b5edc69f..8acadc273d8 100644 --- a/src/intel/compiler/brw/test_opt_cmod_propagation.cpp +++ b/src/intel/compiler/brw/test_opt_cmod_propagation.cpp @@ -199,32 +199,6 @@ TEST_F(cmod_propagation_test, intervening_mismatch_flag_read) EXPECT_SHADERS_MATCH(bld, exp); } -TEST_F(cmod_propagation_test, intervening_dest_write) -{ - brw_builder bld = make_shader(); - - brw_reg dest = bld.vgrf(BRW_TYPE_F, 4); - brw_reg src0 = bld.vgrf(BRW_TYPE_F); - brw_reg src1 = bld.vgrf(BRW_TYPE_F); - brw_reg src2 = bld.vgrf(BRW_TYPE_F, 2); - brw_reg zero(brw_imm_f(0.0f)); - - brw_reg tex_srcs[TEX_LOGICAL_NUM_SRCS]; - tex_srcs[TEX_LOGICAL_SRC_COORDINATE] = src2; - tex_srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(0); - - bld.ADD(offset(dest, bld, 2), src0, src1); - - brw_tex_inst *tex = - bld.emit(SHADER_OPCODE_SAMPLER, dest, 
tex_srcs, TEX_LOGICAL_NUM_SRCS)->as_tex(); - tex->size_written = 4 * REG_SIZE; - tex->coord_components = 2; - - bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE); - - EXPECT_NO_PROGRESS(brw_opt_cmod_propagation, bld); -} - TEST_F(cmod_propagation_test, intervening_flag_read_same_value) { brw_builder bld = make_shader(); diff --git a/src/intel/compiler/brw/test_opt_saturate_propagation.cpp b/src/intel/compiler/brw/test_opt_saturate_propagation.cpp index 241cfeceb33..81227e77208 100644 --- a/src/intel/compiler/brw/test_opt_saturate_propagation.cpp +++ b/src/intel/compiler/brw/test_opt_saturate_propagation.cpp @@ -264,32 +264,6 @@ TEST_F(saturate_propagation_test, producer_saturates) EXPECT_SHADERS_MATCH(bld, exp); } -TEST_F(saturate_propagation_test, intervening_dest_write) -{ - brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16); - - brw_reg dst0 = bld.vgrf(BRW_TYPE_F, 4); - brw_reg dst1 = bld.vgrf(BRW_TYPE_F); - brw_reg src0 = bld.LOAD_REG(bld.vgrf(BRW_TYPE_F)); - brw_reg src1 = bld.LOAD_REG(bld.vgrf(BRW_TYPE_F)); - brw_reg src2 = bld.LOAD_REG(bld.vgrf(BRW_TYPE_F, 2)); - - brw_reg tex_srcs[TEX_LOGICAL_NUM_SRCS] = {}; - tex_srcs[TEX_LOGICAL_SRC_COORDINATE] = src2; - tex_srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(0); - - bld.ADD(offset(dst0, bld, 2), src0, src1); - - brw_tex_inst *tex = - bld.emit(SHADER_OPCODE_SAMPLER, dst0, tex_srcs, TEX_LOGICAL_NUM_SRCS)->as_tex(); - tex->size_written = 8 * REG_SIZE; - tex->coord_components = 2; - - bld.MOV(dst1, offset(dst0, bld, 2))->saturate = true; - - EXPECT_NO_PROGRESS(brw_opt_saturate_propagation, bld); -} - TEST_F(saturate_propagation_test, mul_neg_mov_sat_mov_sat) { brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);