intel/brw: Remove Gfx8- code from visitor

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
This commit is contained in:
Caio Oliveira 2024-02-17 22:43:47 -08:00 committed by Marge Bot
parent c793644ce9
commit 3ef1ed73d3
5 changed files with 48 additions and 260 deletions

View file

@@ -202,22 +202,6 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
}
/**
* A helper for MOV generation for fixing up broken hardware SEND dependency
* handling.
*/
void
fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
{
/* The caller always wants uncompressed to emit the minimal extra
* dependencies, and to avoid having to deal with aligning its regs to 2.
*/
const fs_builder ubld = bld.annotate("send dependency resolve")
.quarter(0);
ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
}
bool
fs_inst::is_send_from_grf() const
{
@@ -1636,7 +1620,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
prog_data->urb_setup[i] = urb_next++;
}
}
} else if (devinfo->ver >= 6) {
} else {
assert(!nir->info.per_primitive_inputs);
uint64_t vue_header_bits =
@@ -1713,34 +1697,6 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
}
urb_next = prev_stage_vue_map.num_slots - first_slot;
}
} else {
/* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
/* Point size is packed into the header, not as a general attribute */
if (i == VARYING_SLOT_PSIZ)
continue;
if (key->input_slots_valid & BITFIELD64_BIT(i)) {
/* The back color slot is skipped when the front color is
* also written to. In addition, some slots can be
* written in the vertex shader and not read in the
* fragment shader. So the register number must always be
* incremented, mapped or not.
*/
if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
prog_data->urb_setup[i] = urb_next;
urb_next++;
}
}
/*
* It's a FS only attribute, and we did interpolation for this attribute
* in SF thread. So, count it here, too.
*
* See compile_sf_prog() for more info.
*/
if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
}
prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
@@ -2071,14 +2027,11 @@ fs_visitor::assign_constant_locations()
/* Now that we know how many regular uniforms we'll push, reduce the
* UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
*/
/* For gen4/5:
* Only allow 16 registers (128 uniform components) as push constants.
*
* If changing this value, note the limitation about total_regs in
* brw_curbe.c/crocus_state.c
*/
const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
const unsigned max_push_length = 64;
unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
for (int i = 0; i < 4; i++) {
struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
@@ -2129,14 +2082,8 @@ fs_visitor::emit_repclear_shader()
assert(uniforms == 0);
assume(key->nr_color_regions > 0);
fs_reg color_output, header;
if (devinfo->ver >= 7) {
color_output = retype(brw_vec4_grf(127, 0), BRW_REGISTER_TYPE_UD);
header = retype(brw_vec8_grf(125, 0), BRW_REGISTER_TYPE_UD);
} else {
color_output = retype(brw_vec4_reg(MRF, 2, 0), BRW_REGISTER_TYPE_UD);
header = retype(brw_vec8_reg(MRF, 0, 0), BRW_REGISTER_TYPE_UD);
}
fs_reg color_output = retype(brw_vec4_grf(127, 0), BRW_REGISTER_TYPE_UD);
fs_reg header = retype(brw_vec8_grf(125, 0), BRW_REGISTER_TYPE_UD);
/* We pass the clear color as a flat input. Copy it to the output. */
fs_reg color_input =
@@ -2157,23 +2104,17 @@ fs_visitor::emit_repclear_shader()
if (i > 0)
bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i));
if (devinfo->ver >= 7) {
write = bld.emit(SHADER_OPCODE_SEND);
write->resize_sources(3);
write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
write->src[0] = brw_imm_ud(0);
write->src[1] = brw_imm_ud(0);
write->src[2] = i == 0 ? color_output : header;
write->check_tdr = true;
write->send_has_side_effects = true;
write->desc = brw_fb_write_desc(devinfo, i,
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
i == key->nr_color_regions - 1, false);
} else {
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
write->target = i;
write->base_mrf = i == 0 ? color_output.nr : header.nr;
}
write = bld.emit(SHADER_OPCODE_SEND);
write->resize_sources(3);
write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
write->src[0] = brw_imm_ud(0);
write->src[1] = brw_imm_ud(0);
write->src[2] = i == 0 ? color_output : header;
write->check_tdr = true;
write->send_has_side_effects = true;
write->desc = brw_fb_write_desc(devinfo, i,
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
i == key->nr_color_regions - 1, false);
/* We can use a headerless message for the first render target */
write->header_size = i == 0 ? 0 : 2;
@@ -2206,7 +2147,7 @@ brw_sample_mask_reg(const fs_builder &bld)
assert(bld.dispatch_width() <= 16);
return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
} else {
assert(s.devinfo->ver >= 6 && bld.dispatch_width() <= 16);
assert(bld.dispatch_width() <= 16);
assert(s.devinfo->ver < 20);
return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
BRW_REGISTER_TYPE_UW);
@@ -2774,24 +2715,6 @@ fs_visitor::allocate_registers(bool allow_spilling)
prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
prog_data->total_scratch);
if (gl_shader_stage_is_compute(stage)) {
if (devinfo->platform == INTEL_PLATFORM_HSW) {
/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
* field documentation, Haswell supports a minimum of 2kB of
* scratch space for compute shaders, unlike every other stage
* and platform.
*/
prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
} else if (devinfo->ver <= 7) {
/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
* field documentation, platforms prior to Haswell measure scratch
* size linearly with a range of [1kB, 12kB] and 1kB granularity.
*/
prog_data->total_scratch = ALIGN(last_scratch, 1024);
max_scratch_size = 12 * 1024;
}
}
/* We currently only support up to 2MB of scratch space. If we
* need to support more eventually, the documentation suggests
* that we could allocate a larger buffer, and partition it out
@@ -2892,7 +2815,7 @@ fs_visitor::emit_tcs_thread_end()
* separate write just to finish the thread. There isn't guaranteed to
* be one, so this may not succeed.
*/
if (devinfo->ver != 8 && mark_last_urb_write_with_eot())
if (mark_last_urb_write_with_eot())
return;
const fs_builder bld = fs_builder(this).at_end();
@@ -3089,10 +3012,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
if (nir->info.inputs_read > 0 ||
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
(nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
if (devinfo->ver < 6)
emit_interpolation_setup_gfx4();
else
emit_interpolation_setup_gfx6();
emit_interpolation_setup();
}
/* We handle discards by keeping track of the still-live pixels in f0.1.
@@ -3108,8 +3028,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
*/
const fs_reg dispatch_mask =
devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
devinfo->ver >= 6 ? brw_vec1_grf(i + 1, 7) :
brw_vec1_grf(0, 0);
brw_vec1_grf(i + 1, 7);
bld.exec_all().group(1, 0)
.MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
@@ -3154,7 +3073,6 @@ bool
fs_visitor::run_cs(bool allow_spilling)
{
assert(gl_shader_stage_is_compute(stage));
assert(devinfo->ver >= 7);
const fs_builder bld = fs_builder(this).at_end();
payload_ = new cs_thread_payload(*this);
@@ -3517,26 +3435,24 @@ brw_nir_populate_wm_prog_data(nir_shader *shader,
assert(prog_data->alpha_to_coverage != BRW_SOMETIMES ||
prog_data->persample_dispatch == BRW_SOMETIMES);
if (devinfo->ver >= 6) {
prog_data->uses_sample_mask =
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
prog_data->uses_sample_mask =
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
*
* "MSDISPMODE_PERSAMPLE is required in order to select
* POSOFFSET_SAMPLE"
*
* So we can only really get sample positions if we are doing real
* per-sample dispatch. If we need gl_SamplePosition and we don't have
* persample dispatch, we hard-code it to 0.5.
*/
prog_data->uses_pos_offset =
prog_data->persample_dispatch != BRW_NEVER &&
(BITSET_TEST(shader->info.system_values_read,
SYSTEM_VALUE_SAMPLE_POS) ||
BITSET_TEST(shader->info.system_values_read,
SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
}
/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
*
* "MSDISPMODE_PERSAMPLE is required in order to select
* POSOFFSET_SAMPLE"
*
* So we can only really get sample positions if we are doing real
* per-sample dispatch. If we need gl_SamplePosition and we don't have
* persample dispatch, we hard-code it to 0.5.
*/
prog_data->uses_pos_offset =
prog_data->persample_dispatch != BRW_NEVER &&
(BITSET_TEST(shader->info.system_values_read,
SYSTEM_VALUE_SAMPLE_POS) ||
BITSET_TEST(shader->info.system_values_read,
SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
@@ -3951,17 +3867,13 @@ cs_fill_push_const_info(const struct intel_device_info *devinfo,
{
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data);
bool cross_thread_supported = devinfo->verx10 >= 75;
/* The thread ID should be stored in the last param dword */
assert(subgroup_id_index == -1 ||
subgroup_id_index == (int)prog_data->nr_params - 1);
unsigned cross_thread_dwords, per_thread_dwords;
if (!cross_thread_supported) {
cross_thread_dwords = 0u;
per_thread_dwords = prog_data->nr_params;
} else if (subgroup_id_index >= 0) {
if (subgroup_id_index >= 0) {
/* Fill all but the last register with cross-thread payload */
cross_thread_dwords = 8 * (subgroup_id_index / 8);
per_thread_dwords = prog_data->nr_params - cross_thread_dwords;

View file

@@ -223,7 +223,6 @@ public:
uint32_t const_offset,
uint8_t alignment,
unsigned components);
void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
bool run_fs(bool allow_spilling, bool do_rep_send);
bool run_vs();
@@ -268,8 +267,7 @@ public:
void limit_dispatch_width(unsigned n, const char *msg);
void emit_repclear_shader();
void emit_interpolation_setup_gfx4();
void emit_interpolation_setup_gfx6();
void emit_interpolation_setup();
void set_tcs_invocation_id();
@@ -412,14 +410,13 @@ public:
/**
* Return the flag register used in fragment shaders to keep track of live
* samples. On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32
* dispatch mode, while earlier generations are constrained to f0.1, which
* limits the dispatch width to SIMD16 for fragment shaders that use discard.
* dispatch mode.
*/
static inline unsigned
sample_mask_flag_subreg(const fs_visitor &s)
{
assert(s.stage == MESA_SHADER_FRAGMENT);
return s.devinfo->ver >= 7 ? 2 : 1;
return 2;
}
/**

View file

@@ -116,67 +116,7 @@ fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned comp
/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gfx4()
{
struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
fs_builder abld = fs_builder(this).at_end().annotate("compute pixel centers");
this->pixel_x = vgrf(glsl_uint_type());
this->pixel_y = vgrf(glsl_uint_type());
this->pixel_x.type = BRW_REGISTER_TYPE_UW;
this->pixel_y.type = BRW_REGISTER_TYPE_UW;
abld.ADD(this->pixel_x,
fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
fs_reg(brw_imm_v(0x10101010)));
abld.ADD(this->pixel_y,
fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
fs_reg(brw_imm_v(0x11001100)));
const fs_builder bld = fs_builder(this).at_end();
abld = bld.annotate("compute pixel deltas from v0");
this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] =
vgrf(glsl_vec2_type());
const fs_reg &delta_xy = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
if (devinfo->has_pln) {
for (unsigned i = 0; i < dispatch_width / 8; i++) {
abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 0), i),
quarter(this->pixel_x, i), xstart);
abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 1), i),
quarter(this->pixel_y, i), ystart);
}
} else {
abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
}
this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);
/* The SF program automatically handles doing the perspective correction or
* not based on wm_prog_data::interp_mode[] so we can use the same pixel
* offsets for both perspective and non-perspective.
*/
this->delta_xy[BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL] =
this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
abld = bld.annotate("compute pos.w and 1/pos.w");
/* Compute wpos.w. It's always in our setup, since it's needed to
* interpolate the other attributes.
*/
this->wpos_w = vgrf(glsl_float_type());
abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
interp_reg(abld, VARYING_SLOT_POS, 3, 0));
/* Compute the pixel 1/W value from wpos.w. */
this->pixel_w = vgrf(glsl_float_type());
abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
}
/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gfx6()
fs_visitor::emit_interpolation_setup()
{
const fs_builder bld = fs_builder(this).at_end();
fs_builder abld = bld.annotate("compute pixel centers");
@@ -384,7 +324,7 @@ fs_visitor::emit_interpolation_setup_gfx6()
hbld.MOV(offset(pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
hbld.MOV(offset(pixel_y, hbld, i), horiz_stride(int_pixel_y, 2));
} else if (devinfo->ver >= 8 || dispatch_width == 8) {
} else {
/* The "Register Region Restrictions" page says for BDW (and newer,
* presumably):
*
@@ -407,31 +347,6 @@ fs_visitor::emit_interpolation_setup_gfx6()
horiz_stride(half_int_pixel_offset_x, 0));
hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy,
horiz_stride(half_int_pixel_offset_y, 0));
} else {
/* The "Register Region Restrictions" page says for SNB, IVB, HSW:
*
* "When destination spans two registers, the source MUST span
* two registers."
*
* Since the GRF source of the ADD will only read a single register,
* we must do two separate ADDs in SIMD16.
*/
const fs_reg int_pixel_x = hbld.vgrf(BRW_REGISTER_TYPE_UW);
const fs_reg int_pixel_y = hbld.vgrf(BRW_REGISTER_TYPE_UW);
hbld.ADD(int_pixel_x,
fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)),
fs_reg(brw_imm_v(0x10101010)));
hbld.ADD(int_pixel_y,
fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)),
fs_reg(brw_imm_v(0x11001100)));
/* As of gfx6, we can no longer mix float and int sources. We have
* to turn the integer pixel centers into floats for their actual
* use.
*/
hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x);
hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y);
}
}
@@ -676,19 +591,8 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
const fs_reg dst_depth = fetch_payload_reg(bld, fs_payload().dest_depth_reg);
fs_reg src_depth, src_stencil;
if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
src_depth = frag_depth;
} else if (source_depth_to_render_target) {
/* If we got here, we're in one of those strange Gen4-5 cases where
* we're forced to pass the source depth, unmodified, to the FB write.
* In this case, we don't want to use pixel_z because we may not have
* set up interpolation. It's also perfectly safe because it only
* happens on old hardware (no coarse interpolation) and this is
* explicitly the pass-through case.
*/
assert(devinfo->ver <= 5);
src_depth = fetch_payload_reg(bld, fs_payload().source_depth_reg);
}
if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
src_stencil = frag_stencil;
@@ -725,7 +629,7 @@ fs_visitor::do_emit_fb_writes(int nr_color_regions, bool replicate_alpha)
ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
fs_reg src0_alpha;
if (devinfo->ver >= 6 && replicate_alpha && target != 0)
if (replicate_alpha && target != 0)
src0_alpha = offset(outputs[0], bld, 3);
inst = emit_single_fb_write(abld, this->outputs[target],
@@ -761,16 +665,6 @@ fs_visitor::emit_fb_writes()
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
if (source_depth_to_render_target && devinfo->ver == 6) {
/* For outputting oDepth on gfx6, SIMD8 writes have to be used. This
* would require SIMD8 moves of each half to message regs, e.g. by using
* the SIMD lowering pass. Unfortunately this is more difficult than it
* sounds because the SIMD8 single-source message lacks channel selects
* for the second and third subspans.
*/
limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
}
if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
/* From the 'Render Target Write message' section of the docs:
* "Output Stencil is not supported with SIMD16 Render Target Write
@@ -786,7 +680,7 @@ fs_visitor::emit_fb_writes()
*/
const bool replicate_alpha = key->alpha_test_replicate_alpha ||
(key->nr_color_regions > 1 && key->alpha_to_coverage &&
(sample_mask.file == BAD_FILE || devinfo->ver == 6));
sample_mask.file == BAD_FILE);
prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE &&
this->outputs[0].file != BAD_FILE);
@@ -1142,7 +1036,6 @@ fs_visitor::emit_urb_fence()
void
fs_visitor::emit_cs_terminate()
{
assert(devinfo->ver >= 7);
const fs_builder bld = fs_builder(this).at_end();
/* We can't directly send from g0, since sends with EOT have to use
@@ -1247,7 +1140,7 @@ fs_visitor::init()
this->source_depth_to_render_target = false;
this->runtime_check_aads_emit = false;
this->first_non_payload_grf = 0;
this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;
this->max_grf = GFX7_MRF_HACK_START;
this->uniforms = 0;
this->last_scratch = 0;

View file

@@ -149,10 +149,10 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
switch (op) {
case 0 ... NUM_BRW_OPCODES - 1:
/* The DO instruction doesn't exist on Gfx6+, but we use it to mark the
/* The DO instruction doesn't exist on Gfx9+, but we use it to mark the
* start of a loop in the IR.
*/
if (devinfo->ver >= 6 && op == BRW_OPCODE_DO)
if (op == BRW_OPCODE_DO)
return "do";
/* DPAS instructions may transiently exist on platforms that do not

View file

@@ -130,7 +130,6 @@ static inline nir_variable_mode
brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
gl_shader_stage stage)
{
const struct intel_device_info *devinfo = compiler->devinfo;
nir_variable_mode indirect_mask = (nir_variable_mode) 0;
switch (stage) {
@@ -149,19 +148,6 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
stage != MESA_SHADER_MESH)
indirect_mask |= nir_var_shader_out;
/* On HSW+, we allow indirects in scalar shaders. They get implemented
* using nir_lower_vars_to_explicit_types and nir_lower_explicit_io in
* brw_postprocess_nir.
*
* We haven't plumbed through the indirect scratch messages on gfx6 or
* earlier so doing indirects via scratch doesn't work there. On gfx7 and
* earlier the scratch space size is limited to 12kB. If we allowed
* indirects as scratch all the time, we may easily exceed this limit
* without having any fallback.
*/
if (devinfo->verx10 <= 70)
indirect_mask |= nir_var_function_temp;
return indirect_mask;
}