/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_shader.h"
#include "brw_analysis.h"
#include "brw_builder.h"
#include "brw_generator.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
#include "dev/intel_debug.h"
#include "dev/intel_wa.h"

#include <memory>

static brw_inst *
brw_emit_single_fb_write(brw_shader &s, const brw_builder &bld,
                         brw_reg color0, brw_reg color1,
                         brw_reg src0_alpha,
                         unsigned target, unsigned components,
                         bool null_rt)
{
   assert(s.stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);

   brw_reg sources[FB_WRITE_LOGICAL_NUM_SRCS];
   sources[FB_WRITE_LOGICAL_SRC_COLOR0] = color0;
   sources[FB_WRITE_LOGICAL_SRC_COLOR1] = color1;
   sources[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA] = src0_alpha;
   sources[FB_WRITE_LOGICAL_SRC_TARGET] = brw_imm_ud(target);
   sources[FB_WRITE_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(components);
   sources[FB_WRITE_LOGICAL_SRC_NULL_RT] = brw_imm_ud(null_rt);
   sources[FB_WRITE_LOGICAL_SRC_LAST_RT] = brw_imm_ud(false);

   if (prog_data->uses_omask)
      sources[FB_WRITE_LOGICAL_SRC_OMASK] = s.sample_mask;
   if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
      sources[FB_WRITE_LOGICAL_SRC_SRC_DEPTH] = s.frag_depth;
   if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
      sources[FB_WRITE_LOGICAL_SRC_SRC_STENCIL] = s.frag_stencil;

   brw_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, brw_reg(),
                              sources, ARRAY_SIZE(sources));

   if (prog_data->uses_kill) {
      write->predicate = BRW_PREDICATE_NORMAL;
      write->flag_subreg = sample_mask_flag_subreg(s);
   }

   return write;
}

static void
brw_do_emit_fb_writes(brw_shader &s, int nr_color_regions, bool replicate_alpha)
{
   const brw_builder bld = brw_builder(&s);
   brw_inst *inst = NULL;

   for (int target = 0; target < nr_color_regions; target++) {
      /* Skip over outputs that weren't written. */
      if (s.outputs[target].file == BAD_FILE)
         continue;

      const brw_builder abld = bld.annotate(
         ralloc_asprintf(s.mem_ctx, "FB write target %d", target));

      brw_reg src0_alpha;
      if (replicate_alpha && target != 0)
         src0_alpha = offset(s.outputs[0], bld, 3);

      inst = brw_emit_single_fb_write(s, abld, s.outputs[target],
                                      s.dual_src_output, src0_alpha, target, 4,
                                      false);
   }

   if (inst == NULL) {
      struct brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
      struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);

      /* Disable null_rt if any non-color output is written or if
       * alpha-to-coverage may be enabled: the alpha_to_coverage bit comes
       * from the BLEND_STATE structure, and the HW will not read it when
       * null_rt is enabled.
       */
      const bool use_null_rt =
         key->alpha_to_coverage == INTEL_NEVER &&
         !prog_data->uses_omask;

      /* Even if there are no color buffers enabled, we still need to send
       * alpha out the pipeline to our null renderbuffer to support
       * alpha-testing, alpha-to-coverage, and so on.
       */
      /* FINISHME: Factor out this frequently recurring pattern into a
       * helper function.
       */
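      /* Only channel 3 (alpha) of the payload below is defined; the
       * null-RT write needs just alpha for alpha-test / alpha-to-coverage.
       */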
      const brw_reg srcs[] = { reg_undef, reg_undef,
                               reg_undef, offset(s.outputs[0], bld, 3) };
      const brw_reg tmp = bld.vgrf(BRW_TYPE_UD, 4);
      bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);

      inst = brw_emit_single_fb_write(s, bld, tmp, reg_undef, reg_undef,
                                      0, 4, use_null_rt);
   }

   inst->src[FB_WRITE_LOGICAL_SRC_LAST_RT] = brw_imm_ud(true);
   inst->eot = true;
}

static void
brw_emit_fb_writes(brw_shader &s)
{
   const struct intel_device_info *devinfo = s.devinfo;
   assert(s.stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
   brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;

   if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
      /* From the 'Render Target Write message' section of the docs:
       * "Output Stencil is not supported with SIMD16 Render Target Write
       * Messages."
       */
      if (devinfo->ver >= 20)
         s.limit_dispatch_width(16, "gl_FragStencilRefARB unsupported "
                                "in SIMD32+ mode.\n");
      else
         s.limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
                                "in SIMD16+ mode.\n");
   }

   /* ANV doesn't know about the sample mask output at wm key creation
    * time, so we compute here whether we need to replicate alpha and emit
    * the alpha-to-coverage workaround.
    */
   const bool replicate_alpha = key->alpha_test_replicate_alpha ||
      (key->nr_color_regions > 1 && key->alpha_to_coverage &&
       s.sample_mask.file == BAD_FILE);

   prog_data->dual_src_blend = (s.dual_src_output.file != BAD_FILE &&
                                s.outputs[0].file != BAD_FILE);
   assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);

   /* The following condition implements Wa_14017468336:
    *
    * "If dual source blend is enabled do not enable SIMD32 dispatch" and
    * "For a thread dispatched as SIMD32, must not issue SIMD8 message with Last
    *  Render Target Select set."
    */
   if (devinfo->ver >= 11 && devinfo->ver <= 12 &&
       prog_data->dual_src_blend) {
      /* The dual-source RT write messages fail to release the thread
       * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
       *
       * XXX - Emit an extra single-source NULL RT-write marked LastRT in
       *       order to release the thread dependency without disabling
       *       SIMD32.
       *
       * The dual-source RT write messages may lead to hangs with SIMD16
       * dispatch on ICL for unknown reasons, see
       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
       */
      if (devinfo->ver >= 20)
         s.limit_dispatch_width(16, "Dual source blending unsupported "
                                "in SIMD32 mode.\n");
      else
         s.limit_dispatch_width(8, "Dual source blending unsupported "
                                "in SIMD16 and SIMD32 modes.\n");
   }

   brw_do_emit_fb_writes(s, key->nr_color_regions, replicate_alpha);
}

/** Emits the interpolation for the varying inputs. */
static void
brw_emit_interpolation_setup(brw_shader &s)
{
   const struct intel_device_info *devinfo = s.devinfo;
   const brw_builder bld = brw_builder(&s);
   brw_builder abld = bld.annotate("compute pixel centers");

   s.pixel_x = bld.vgrf(BRW_TYPE_F);
   s.pixel_y = bld.vgrf(BRW_TYPE_F);

   const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) s.key;
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
   brw_fs_thread_payload &payload = s.fs_payload();

   brw_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
   brw_reg int_sample_offset_xy; /* Used on Gen8+ */
   brw_reg half_int_sample_offset_x, half_int_sample_offset_y;
   if (wm_prog_data->coarse_pixel_dispatch != INTEL_ALWAYS) {
      /* The thread payload only delivers subspan locations (ss0, ss1,
       * ss2, ...). Since subspans cover 2x2 pixel blocks, we need to
       * generate 4 pixel coordinates out of each subspan location. We do this
       * by replicating a subspan coordinate 4 times and adding an offset of 1
       * in each direction from the initial top left (tl) location to generate
       * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
       * (br = +1 in x, +1 in y).
       *
       * The locations we build look like this in SIMD8 :
       *
       *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
       *
       * The value 0x11001010 is a vector of 8 half bytes. It adds the
       * following to generate the 4 pixel coordinates out of subspan 0:
       *
       *  0x
       *   1 : ss0.y + 1 -> ss0.br.y
       *   1 : ss0.y + 1 -> ss0.bl.y
       *   0 : ss0.y + 0 -> ss0.tr.y
       *   0 : ss0.y + 0 -> ss0.tl.y
       *   1 : ss0.x + 1 -> ss0.br.x
       *   0 : ss0.x + 0 -> ss0.bl.x
       *   1 : ss0.x + 1 -> ss0.tr.x
       *   0 : ss0.x + 0 -> ss0.tl.x
       *
       * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
       * coordinates out of 2 subspan coordinates in a single ADD instruction
       * (twice the operation above).
       */
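      /* For example (illustrative values): if ss0's top-left pixel is at
       * (10, 20), the replicated payload holds x: (10, 10, 10, 10) and
       * y: (20, 20, 20, 20); adding the half-byte vector above yields
       * x: (10, 11, 10, 11) and y: (20, 20, 21, 21), i.e. the four pixels
       * tl (10,20), tr (11,20), bl (10,21), br (11,21).
       */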
      int_sample_offset_xy = brw_reg(brw_imm_v(0x11001010));
      half_int_sample_offset_x = brw_reg(brw_imm_uw(0));
      half_int_sample_offset_y = brw_reg(brw_imm_uw(0));
      /* On Gfx12.5, because of regioning restrictions, the interpolation code
       * is slightly different and works off X & Y only inputs. The ordering
       * of the half bytes here is a bit odd, with each subspan replicated
       * twice and every other element discarded:
       *
       *            ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
       * X offset:    0      0      1      0      0      0      1      0
       * Y offset:    0      0      0      0      1      0      1      0
       */
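      /* Note: nibble k (counting from the LSB) of a brw_imm_v() constant is
       * the offset added to lane k, and the pattern repeats every 8 lanes,
       * so 0x01000100 and 0x01010000 encode the X and Y offset rows of the
       * table above.
       */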
      int_sample_offset_x = brw_reg(brw_imm_v(0x01000100));
      int_sample_offset_y = brw_reg(brw_imm_v(0x01010000));
   }

   brw_reg int_coarse_offset_x, int_coarse_offset_y; /* Used on Gen12HP+ */
   brw_reg int_coarse_offset_xy; /* Used on Gen8+ */
   brw_reg half_int_coarse_offset_x, half_int_coarse_offset_y;
   if (wm_prog_data->coarse_pixel_dispatch != INTEL_NEVER) {
      /* In coarse pixel dispatch we have to do the same ADD instruction that
       * we do in normal per-pixel dispatch, except this time we're not adding
       * 1 in each direction, but instead the coarse pixel size.
       *
       * The coarse pixel size is delivered as 2 u8 in r1.0
       */
      struct brw_reg r1_0 = retype(brw_vec1_reg(FIXED_GRF, 1, 0), BRW_TYPE_UB);

      const brw_builder dbld =
         abld.exec_all().group(MIN2(16, s.dispatch_width) * 2, 0);

      if (devinfo->verx10 >= 125) {
         /* To build the array of half bytes we do an AND operation with the
          * right mask in X.
          */
         int_coarse_offset_x = dbld.vgrf(BRW_TYPE_UW);
         dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00));

         /* And the right mask in Y. */
         int_coarse_offset_y = dbld.vgrf(BRW_TYPE_UW);
         dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000));
      } else {
         /* To build the array of half bytes we do an AND operation with the
          * right mask in X.
          */
         int_coarse_offset_x = dbld.vgrf(BRW_TYPE_UW);
         dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));

         /* And the right mask in Y. */
         int_coarse_offset_y = dbld.vgrf(BRW_TYPE_UW);
         dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));

         /* Finally OR the 2 registers. */
         int_coarse_offset_xy = dbld.vgrf(BRW_TYPE_UW);
         dbld.OR(int_coarse_offset_xy, int_coarse_offset_x, int_coarse_offset_y);
      }

      /* Also compute the half coarse size used to center coarse pixels. */
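      /* E.g. (illustrative) a 4x2 coarse pixel size in r1.0 yields half
       * offsets of (2, 1), the distance from a coarse pixel's top-left
       * corner to its center.
       */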
      half_int_coarse_offset_x = bld.vgrf(BRW_TYPE_UW);
      half_int_coarse_offset_y = bld.vgrf(BRW_TYPE_UW);

      bld.SHR(half_int_coarse_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
      bld.SHR(half_int_coarse_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
   }

   brw_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
   brw_reg int_pixel_offset_xy; /* Used on Gen8+ */
   brw_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
   switch (wm_prog_data->coarse_pixel_dispatch) {
   case INTEL_NEVER:
      int_pixel_offset_x = int_sample_offset_x;
      int_pixel_offset_y = int_sample_offset_y;
      int_pixel_offset_xy = int_sample_offset_xy;
      half_int_pixel_offset_x = half_int_sample_offset_x;
      half_int_pixel_offset_y = half_int_sample_offset_y;
      break;

   case INTEL_SOMETIMES: {
      const brw_builder dbld =
         abld.exec_all().group(MIN2(16, s.dispatch_width) * 2, 0);

      brw_check_dynamic_msaa_flag(dbld, wm_prog_data,
                                  INTEL_MSAA_FLAG_COARSE_RT_WRITES);

      int_pixel_offset_x = dbld.vgrf(BRW_TYPE_UW);
      set_predicate(BRW_PREDICATE_NORMAL,
                    dbld.SEL(int_pixel_offset_x,
                             int_coarse_offset_x,
                             int_sample_offset_x));

      int_pixel_offset_y = dbld.vgrf(BRW_TYPE_UW);
      set_predicate(BRW_PREDICATE_NORMAL,
                    dbld.SEL(int_pixel_offset_y,
                             int_coarse_offset_y,
                             int_sample_offset_y));

      int_pixel_offset_xy = dbld.vgrf(BRW_TYPE_UW);
      set_predicate(BRW_PREDICATE_NORMAL,
                    dbld.SEL(int_pixel_offset_xy,
                             int_coarse_offset_xy,
                             int_sample_offset_xy));

      half_int_pixel_offset_x = bld.vgrf(BRW_TYPE_UW);
      set_predicate(BRW_PREDICATE_NORMAL,
                    bld.SEL(half_int_pixel_offset_x,
                            half_int_coarse_offset_x,
                            half_int_sample_offset_x));

      half_int_pixel_offset_y = bld.vgrf(BRW_TYPE_UW);
      set_predicate(BRW_PREDICATE_NORMAL,
                    bld.SEL(half_int_pixel_offset_y,
                            half_int_coarse_offset_y,
                            half_int_sample_offset_y));
      break;
   }

   case INTEL_ALWAYS:
      int_pixel_offset_x = int_coarse_offset_x;
      int_pixel_offset_y = int_coarse_offset_y;
      int_pixel_offset_xy = int_coarse_offset_xy;
      half_int_pixel_offset_x = half_int_coarse_offset_x;
      half_int_pixel_offset_y = half_int_coarse_offset_y;
      break;
   }

   for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
      const brw_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
      /* According to the "PS Thread Payload for Normal Dispatch"
       * pages on the BSpec, subspan X/Y coordinates are stored in
       * R1.2-R1.5/R2.2-R2.5 on gfx6+, and in R0.10-R0.13/R1.10-R1.13
       * on gfx20+. gi_reg is the 32B section of the GRF that
       * contains the subspan coordinates.
       */
      const struct brw_reg gi_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
                                    brw_vec1_grf(i + 1, 0);
      const struct brw_reg gi_uw = retype(gi_reg, BRW_TYPE_UW);

      if (devinfo->verx10 >= 125) {
         const brw_builder dbld =
            abld.exec_all().group(hbld.dispatch_width() * 2, 0);
         const brw_reg int_pixel_x = dbld.vgrf(BRW_TYPE_UW);
         const brw_reg int_pixel_y = dbld.vgrf(BRW_TYPE_UW);
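         /* The <2;8,0> regions below broadcast each subspan X/Y across 8
          * lanes; only every other lane carries a meaningful pixel value
          * (matching the offset layout above), and the MOVs at the end
          * compact the result with a horizontal stride of 2.
          */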

         dbld.ADD(int_pixel_x,
                  brw_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)),
                  int_pixel_offset_x);
         dbld.ADD(int_pixel_y,
                  brw_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)),
                  int_pixel_offset_y);

         if (wm_prog_data->coarse_pixel_dispatch != INTEL_NEVER) {
            brw_inst *addx = dbld.ADD(int_pixel_x, int_pixel_x,
                                      horiz_stride(half_int_pixel_offset_x, 0));
            brw_inst *addy = dbld.ADD(int_pixel_y, int_pixel_y,
                                      horiz_stride(half_int_pixel_offset_y, 0));
            if (wm_prog_data->coarse_pixel_dispatch != INTEL_ALWAYS) {
               addx->predicate = BRW_PREDICATE_NORMAL;
               addy->predicate = BRW_PREDICATE_NORMAL;
            }
         }

         hbld.MOV(offset(s.pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
         hbld.MOV(offset(s.pixel_y, hbld, i), horiz_stride(int_pixel_y, 2));

      } else {
         /* The "Register Region Restrictions" page says for BDW (and newer,
          * presumably):
          *
          *     "When destination spans two registers, the source may be one or
          *      two registers. The destination elements must be evenly split
          *      between the two registers."
          *
          * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
          * to compute our pixel centers.
          */
         const brw_builder dbld =
            abld.exec_all().group(hbld.dispatch_width() * 2, 0);
         brw_reg int_pixel_xy = dbld.vgrf(BRW_TYPE_UW);

         dbld.ADD(int_pixel_xy,
                  brw_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
                  int_pixel_offset_xy);

         hbld.emit(FS_OPCODE_PIXEL_X, offset(s.pixel_x, hbld, i), int_pixel_xy,
                   horiz_stride(half_int_pixel_offset_x, 0));
         hbld.emit(FS_OPCODE_PIXEL_Y, offset(s.pixel_y, hbld, i), int_pixel_xy,
                   horiz_stride(half_int_pixel_offset_y, 0));
      }
   }

   abld = bld.annotate("compute pos.z");
   brw_reg coarse_z;
   if (wm_prog_data->coarse_pixel_dispatch != INTEL_NEVER &&
       wm_prog_data->uses_depth_w_coefficients) {
      /* In coarse pixel mode, the HW doesn't interpolate the Z coordinate
       * properly. In the same way that we have to add the coarse pixel size
       * to pixel locations, here we recompute the Z value with 2 coefficients
       * on the X & Y axes.
       */
      brw_reg coef_payload = brw_vec8_grf(payload.depth_w_coef_reg, 0);
      const brw_reg x_start = devinfo->ver >= 20 ?
                              brw_vec1_grf(coef_payload.nr, 6) :
                              brw_vec1_grf(coef_payload.nr, 2);
      const brw_reg y_start = devinfo->ver >= 20 ?
                              brw_vec1_grf(coef_payload.nr, 7) :
                              brw_vec1_grf(coef_payload.nr, 6);
      const brw_reg z_cx = devinfo->ver >= 20 ?
                           brw_vec1_grf(coef_payload.nr + 1, 1) :
                           brw_vec1_grf(coef_payload.nr, 1);
      const brw_reg z_cy = devinfo->ver >= 20 ?
                           brw_vec1_grf(coef_payload.nr + 1, 0) :
                           brw_vec1_grf(coef_payload.nr, 0);
      const brw_reg z_c0 = devinfo->ver >= 20 ?
                           brw_vec1_grf(coef_payload.nr + 1, 2) :
                           brw_vec1_grf(coef_payload.nr, 3);

      const brw_reg float_pixel_x = abld.vgrf(BRW_TYPE_F);
      const brw_reg float_pixel_y = abld.vgrf(BRW_TYPE_F);

      abld.ADD(float_pixel_x, s.pixel_x, negate(x_start));
      abld.ADD(float_pixel_y, s.pixel_y, negate(y_start));

      /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
      const brw_reg u8_cps_width = brw_reg(retype(brw_vec1_grf(1, 0), BRW_TYPE_UB));
      /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
      const brw_reg u8_cps_height = byte_offset(u8_cps_width, 1);
      const brw_reg u32_cps_width = abld.vgrf(BRW_TYPE_UD);
      const brw_reg u32_cps_height = abld.vgrf(BRW_TYPE_UD);
      abld.MOV(u32_cps_width, u8_cps_width);
      abld.MOV(u32_cps_height, u8_cps_height);

      const brw_reg f_cps_width = abld.vgrf(BRW_TYPE_F);
      const brw_reg f_cps_height = abld.vgrf(BRW_TYPE_F);
      abld.MOV(f_cps_width, u32_cps_width);
      abld.MOV(f_cps_height, u32_cps_height);

      /* Center in the middle of the coarse pixel. */
      abld.MAD(float_pixel_x, float_pixel_x, f_cps_width, brw_imm_f(0.5f));
      abld.MAD(float_pixel_y, float_pixel_y, f_cps_height, brw_imm_f(0.5f));
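
      /* The two MADs below evaluate the plane equation
       *
       *    coarse_z = z_c0 + z_cx * float_pixel_x + z_cy * float_pixel_y
       *
       * (MAD(dst, a, b, c) computes dst = a + b * c) at the centered
       * coarse-pixel location.
       */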
      coarse_z = abld.vgrf(BRW_TYPE_F);
      abld.MAD(coarse_z, z_c0, z_cx, float_pixel_x);
      abld.MAD(coarse_z, coarse_z, z_cy, float_pixel_y);
   }

   if (wm_prog_data->uses_src_depth)
      s.pixel_z = brw_fetch_payload_reg(bld, payload.source_depth_reg);

   if (wm_prog_data->uses_depth_w_coefficients ||
       wm_prog_data->uses_src_depth) {
      brw_reg sample_z = s.pixel_z;

      switch (wm_prog_data->coarse_pixel_dispatch) {
      case INTEL_NEVER:
         break;

      case INTEL_SOMETIMES:
         assert(wm_prog_data->uses_src_depth);
         assert(wm_prog_data->uses_depth_w_coefficients);
         s.pixel_z = abld.vgrf(BRW_TYPE_F);

         /* We re-use the check_dynamic_msaa_flag() call from above */
         set_predicate(BRW_PREDICATE_NORMAL,
                       abld.SEL(s.pixel_z, coarse_z, sample_z));
         break;

      case INTEL_ALWAYS:
         assert(!wm_prog_data->uses_src_depth);
         assert(wm_prog_data->uses_depth_w_coefficients);
         s.pixel_z = coarse_z;
         break;
      }
   }

   if (wm_prog_data->uses_src_w) {
      abld = bld.annotate("compute pos.w");
      s.pixel_w = brw_fetch_payload_reg(abld, payload.source_w_reg);
      s.wpos_w = bld.vgrf(BRW_TYPE_F);
      abld.emit(SHADER_OPCODE_RCP, s.wpos_w, s.pixel_w);
   }

   if (wm_key->persample_interp == INTEL_SOMETIMES) {
      const brw_builder ubld = bld.exec_all().group(16, 0);
      bool loaded_flag = false;

      for (int i = 0; i < INTEL_BARYCENTRIC_MODE_COUNT; ++i) {
         if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i)))
            continue;

         /* The sample mode will always be the top bit set in the perspective
          * or non-perspective section. In the case where no SAMPLE mode was
          * requested, wm_prog_data_barycentric_modes() will swap out the top
          * mode for SAMPLE so this works regardless of whether SAMPLE was
          * requested or not.
          */
         int sample_mode;
         if (BITFIELD_BIT(i) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) {
            sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
                                        INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
         } else {
            sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
                                        INTEL_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
         }
         assert(wm_prog_data->barycentric_interp_modes &
                BITFIELD_BIT(sample_mode));

         if (i == sample_mode)
            continue;

         uint8_t *barys = payload.barycentric_coord_reg[i];

         uint8_t *sample_barys = payload.barycentric_coord_reg[sample_mode];
         assert(barys[0] && sample_barys[0]);

         if (!loaded_flag) {
            brw_check_dynamic_msaa_flag(ubld, wm_prog_data,
                                        INTEL_MSAA_FLAG_PERSAMPLE_INTERP);
            loaded_flag = true;
         }

         for (unsigned j = 0; j < s.dispatch_width / 8; j++) {
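            /* barycentric_coord_reg[] holds one base GRF per SIMD16 slice,
             * with 2 GRFs per SIMD8 half, so SIMD8 group j starts at
             * barys[j / 2] + (j % 2) * 2.
             */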
            set_predicate(
               BRW_PREDICATE_NORMAL,
               ubld.MOV(brw_vec8_grf(barys[j / 2] + (j % 2) * 2, 0),
                        brw_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0)));
         }
      }
   }

   for (int i = 0; i < INTEL_BARYCENTRIC_MODE_COUNT; ++i) {
      s.delta_xy[i] = brw_fetch_barycentric_reg(
         bld, payload.barycentric_coord_reg[i]);
   }
}

/**
 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
 * instructions to FS_OPCODE_REP_FB_WRITE.
 */
static void
brw_emit_repclear_shader(brw_shader &s)
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
   brw_inst *write = NULL;

   assert(s.devinfo->ver < 20);
   assert(s.uniforms == 0);
   assume(key->nr_color_regions > 0);

   brw_reg color_output = retype(brw_vec4_grf(127, 0), BRW_TYPE_UD);
   brw_reg header = retype(brw_vec8_grf(125, 0), BRW_TYPE_UD);

   /* We pass the clear color as a flat input.  Copy it to the output. */
   brw_reg color_input =
      brw_make_reg(FIXED_GRF, 2, 3, 0, 0, BRW_TYPE_UD,
                   BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
                   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);

   const brw_builder bld = brw_builder(&s);
   bld.exec_all().group(4, 0).MOV(color_output, color_input);

   if (key->nr_color_regions > 1) {
      /* Copy g0..g1 as the message header */
      bld.exec_all().group(16, 0)
         .MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
   }

   for (int i = 0; i < key->nr_color_regions; ++i) {
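      /* DWord 2 of the message header holds the render target index (used
       * to select the BLEND_STATE entry); RT 0 uses the headerless message
       * set up below.
       */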
      if (i > 0)
         bld.uniform().MOV(component(header, 2), brw_imm_ud(i));

      write = bld.emit(SHADER_OPCODE_SEND);
      write->resize_sources(3);

      /* We can use a headerless message for the first render target */
      write->header_size = i == 0 ? 0 : 2;
      write->mlen = 1 + write->header_size;

      write->sfid = BRW_SFID_RENDER_CACHE;
      write->src[0] = brw_imm_ud(
         brw_fb_write_desc(
            s.devinfo, i,
            BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
            i == key->nr_color_regions - 1, false) |
         brw_message_desc(s.devinfo, write->mlen,
                          0 /* rlen */, write->header_size));
      write->src[1] = brw_imm_ud(0);
      write->src[2] = i == 0 ? color_output : header;
      write->check_tdr = true;
      write->send_has_side_effects = true;
   }
   write->eot = true;

   brw_calculate_cfg(s);

   s.first_non_payload_grf = s.payload().num_regs;

   brw_lower_scoreboard(s);
}

static void
calculate_urb_setup(const struct intel_device_info *devinfo,
                    const struct brw_wm_prog_key *key,
                    struct brw_wm_prog_data *prog_data,
                    const nir_shader *nir,
                    const struct brw_mue_map *mue_map)
{
   memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
   memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));

   int urb_next = 0; /* in vec4s */

   const uint64_t inputs_read =
      nir->info.inputs_read & ~nir->info.per_primitive_inputs;
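   /* Per-primitive inputs are laid out separately below, hence they are
    * masked out of inputs_read here.
    */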

   /* Figure out where each of the incoming setup attributes lands. */
   if (key->mesh_input != INTEL_NEVER) {
      /* Per-Primitive Attributes are laid out by Hardware before the regular
       * attributes, so order them like this to make it easy later to map
       * setup into real HW registers.
       */
      if (nir->info.per_primitive_inputs) {
         uint64_t per_prim_inputs_read =
            nir->info.inputs_read & nir->info.per_primitive_inputs;

         /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
          * are always at the beginning, because they come from MUE
          * Primitive Header, not Per-Primitive Attributes.
          */
         const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
                                                VARYING_BIT_LAYER |
                                                VARYING_BIT_PRIMITIVE_SHADING_RATE;

         if (mue_map) {
            unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
            unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;

            bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;

            if (reads_header || mue_map->user_data_in_primitive_header) {
               /* Primitive Shading Rate, Layer and Viewport live in the same
                * 4-dword slot (psr is dword 0, layer is dword 1, and viewport
                * is dword 2).
                */
               if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
                  prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;

               if (per_prim_inputs_read & VARYING_BIT_LAYER)
                  prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;

               if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
                  prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;

               per_prim_inputs_read &= ~primitive_header_bits;
            } else {
               /* If the FS doesn't need the primitive header, it won't be
                * made available through SBE_MESH, so we have to skip it when
                * calculating the offset from the start of per-primitive data.
                */
               per_prim_start_dw += mue_map->per_primitive_header_size_dw;
               per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
            }

            u_foreach_bit64(i, per_prim_inputs_read) {
               int start = mue_map->start_dw[i];

               assert(start >= 0);
               assert(mue_map->len_dw[i] > 0);

               assert(unsigned(start) >= per_prim_start_dw);
               unsigned pos_dw = unsigned(start) - per_prim_start_dw;
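
               /* E.g. (illustrative) an attribute at start_dw == 36 with
                * per_prim_start_dw == 32 gives pos_dw == 4, i.e. vec4 slot
                * urb_next + 1, channel 0.
                */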
               prog_data->urb_setup[i] = urb_next + pos_dw / 4;
               prog_data->urb_setup_channel[i] = pos_dw % 4;
            }

            urb_next = per_prim_size_dw / 4;
         } else {
            /* With no MUE map, we never read the primitive header, and
             * per-primitive attributes won't be packed either, so just lay
             * them out in varying order.
             */
            per_prim_inputs_read &= ~primitive_header_bits;

            for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
               if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
                  prog_data->urb_setup[i] = urb_next++;
               }
            }

            /* The actual setup attributes later must be aligned to a full GRF. */
            urb_next = ALIGN(urb_next, 2);
         }

         prog_data->num_per_primitive_inputs = urb_next;
      }

      const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
                                      VARYING_BIT_CLIP_DIST1;

      uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;

      if (inputs_read & clip_dist_bits) {
         assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
         unique_fs_attrs &= ~clip_dist_bits;
      }

      if (mue_map) {
         unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
         unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;

         /* The Per-Vertex header is available to the fragment shader only
          * if there's user data there.
          */
         if (!mue_map->user_data_in_vertex_header) {
            per_vertex_start_dw += 8;
            per_vertex_size_dw -= 8;
         }

         /* In Mesh, CLIP_DIST slots are always at the beginning, because
          * they come from MUE Vertex Header, not Per-Vertex Attributes.
          */
         if (inputs_read & clip_dist_bits) {
            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
         } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
            /* Clip distances are in the MUE, but we are not reading them in
             * the FS.
             */
            per_vertex_start_dw += 8;
            per_vertex_size_dw -= 8;
         }

         /* Per-Vertex attributes are laid out in order. Because we always
          * link Mesh and Fragment shaders, the slots written and read by
          * each of them will match.
          */
         u_foreach_bit64(i, unique_fs_attrs) {
            int start = mue_map->start_dw[i];

            assert(start >= 0);
            assert(mue_map->len_dw[i] > 0);

            assert(unsigned(start) >= per_vertex_start_dw);
            unsigned pos_dw = unsigned(start) - per_vertex_start_dw;

            prog_data->urb_setup[i] = urb_next + pos_dw / 4;
            prog_data->urb_setup_channel[i] = pos_dw % 4;
         }

         urb_next += per_vertex_size_dw / 4;
      } else {
         /* If we don't have an MUE map, just lay down the inputs the FS reads
          * in varying order, as we do for the legacy pipeline.
          */
         if (inputs_read & clip_dist_bits) {
            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
         }

         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (unique_fs_attrs & BITFIELD64_BIT(i))
               prog_data->urb_setup[i] = urb_next++;
         }
      }
   } else {
      assert(!nir->info.per_primitive_inputs);

      const uint64_t vue_header_bits = BRW_VUE_HEADER_VARYING_MASK;

      uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits;

      if (util_bitcount64(unique_fs_attrs) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */

         /* Re-compute the VUE map here in the case that the one coming from
          * geometry has more than one position slot (used for Primitive
          * Replication).
          */
         struct intel_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid,
                             key->base.vue_layout, 1);

         int first_slot =
            brw_compute_first_fs_urb_slot_required(unique_fs_attrs,
                                                   &prev_stage_vue_map);

         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            if (varying != BRW_VARYING_SLOT_PAD &&
                (inputs_read & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   }

   prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
   prog_data->inputs = inputs_read;

   brw_compute_urb_setup_index(prog_data);
}
|
|
|
|
|
static bool
is_used_in_not_interp_frag_coord(nir_def *def)
{
   nir_foreach_use_including_if(src, def) {
      if (nir_src_is_if(src))
         return true;

      if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
         return true;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
      if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
         return true;
   }

   return false;
}

/**
 * Return a bitfield where bit n is set if barycentric interpolation mode n
 * (see enum intel_barycentric_mode) is needed by the fragment shader.
 *
 * We examine the load_barycentric intrinsics rather than looking at input
 * variables so that we catch interpolateAtCentroid() messages too, which
 * also need the INTEL_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
 */
static unsigned
brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
                                     const struct brw_wm_prog_key *key,
                                     const nir_shader *shader)
{
   unsigned barycentric_interp_modes = 0;

   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            switch (intrin->intrinsic) {
            case nir_intrinsic_load_barycentric_pixel:
            case nir_intrinsic_load_barycentric_centroid:
            case nir_intrinsic_load_barycentric_sample:
            case nir_intrinsic_load_barycentric_at_sample:
            case nir_intrinsic_load_barycentric_at_offset:
               break;
            default:
               continue;
            }

            /* Ignore WPOS; it doesn't require interpolation. */
            if (!is_used_in_not_interp_frag_coord(&intrin->def))
               continue;

            enum intel_barycentric_mode bary =
               brw_barycentric_mode(key, intrin);

            barycentric_interp_modes |= 1 << bary;
         }
      }
   }

   return barycentric_interp_modes;
}

/**
 * Return a bitfield where bit n is set if barycentric interpolation
 * mode n (see enum intel_barycentric_mode) is needed by the fragment
 * shader barycentric intrinsics that take an explicit offset or
 * sample as argument.
 */
static unsigned
brw_compute_offset_barycentric_interp_modes(const struct brw_wm_prog_key *key,
                                            const nir_shader *shader)
{
   unsigned barycentric_interp_modes = 0;

   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic == nir_intrinsic_load_barycentric_at_offset ||
                intrin->intrinsic == nir_intrinsic_load_barycentric_at_sample)
               barycentric_interp_modes |= 1 << brw_barycentric_mode(key, intrin);
         }
      }
   }

   return barycentric_interp_modes;
}

static void
brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
                        const nir_shader *shader)
{
   prog_data->flat_inputs = 0;

   const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;

   nir_foreach_shader_in_variable(var, shader) {
      /* flat shading */
      if (var->data.interpolation != INTERP_MODE_FLAT)
         continue;

      if (var->data.per_primitive)
         continue;

      unsigned slots = glsl_count_attribute_slots(var->type, false);
      for (unsigned s = 0; s < slots; s++) {
         int input_index = prog_data->urb_setup[var->data.location + s] - per_vertex_start;

         if (input_index >= 0)
            prog_data->flat_inputs |= 1 << input_index;
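         /* input_index is relative to the per-vertex section: e.g. a flat
          * input at URB slot 5 in a shader with 2 per-primitive inputs sets
          * bit 3 of flat_inputs.
          */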
      }
   }
}

static uint8_t
computed_depth_mode(const nir_shader *shader)
{
   if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      switch (shader->info.fs.depth_layout) {
      case FRAG_DEPTH_LAYOUT_NONE:
      case FRAG_DEPTH_LAYOUT_ANY:
         return BRW_PSCDEPTH_ON;
      case FRAG_DEPTH_LAYOUT_GREATER:
         return BRW_PSCDEPTH_ON_GE;
      case FRAG_DEPTH_LAYOUT_LESS:
         return BRW_PSCDEPTH_ON_LE;
      case FRAG_DEPTH_LAYOUT_UNCHANGED:
         /* We initially set this to OFF, but having the shader write the
          * depth means we allocate register space in the SEND message. The
          * difference between the SEND register count and the OFF state
          * programming makes the HW hang.
          *
          * Removing the depth writes also leads to test failures. So use
          * LesserThanOrEqual, which fits writing the same value
          * (unchanged/equal).
          */
         return BRW_PSCDEPTH_ON_LE;
      }
   }
   return BRW_PSCDEPTH_OFF;
}

static void
brw_nir_populate_wm_prog_data(nir_shader *shader,
                              const struct intel_device_info *devinfo,
                              const struct brw_wm_prog_key *key,
                              struct brw_wm_prog_data *prog_data,
                              const struct brw_mue_map *mue_map)
{
   prog_data->uses_kill = shader->info.fs.uses_discard;
   prog_data->uses_omask = !key->ignore_sample_mask_out &&
      (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
   prog_data->max_polygons = 1;
   prog_data->computed_depth_mode = computed_depth_mode(shader);
   prog_data->computed_stencil =
      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);

   prog_data->sample_shading =
      shader->info.fs.uses_sample_shading ||
      shader->info.outputs_read;

   assert(key->multisample_fbo != INTEL_NEVER ||
          key->persample_interp == INTEL_NEVER);

   prog_data->persample_dispatch = key->persample_interp;
   if (prog_data->sample_shading)
      prog_data->persample_dispatch = INTEL_ALWAYS;

   /* We can only persample dispatch if we have a multisample FBO */
   prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
                                        key->multisample_fbo);
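   /* MIN2 acts as a tristate AND here: the intel_sometimes values are
    * ordered INTEL_NEVER < INTEL_SOMETIMES < INTEL_ALWAYS, so the result
    * can never be more frequent than either input.
    */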

   /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
    * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
    * to definitively tell whether alpha_to_coverage is on or off.
    */
   prog_data->alpha_to_coverage = key->alpha_to_coverage;

   prog_data->uses_sample_mask =
      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);

   /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
    *
    *    "MSDISPMODE_PERSAMPLE is required in order to select
    *     POSOFFSET_SAMPLE"
    *
    * So we can only really get sample positions if we are doing real
    * per-sample dispatch. If we need gl_SamplePosition and we don't have
    * persample dispatch, we hard-code it to 0.5.
    */
   prog_data->uses_pos_offset =
      prog_data->persample_dispatch != INTEL_NEVER &&
      (BITSET_TEST(shader->info.system_values_read,
                   SYSTEM_VALUE_SAMPLE_POS) ||
       BITSET_TEST(shader->info.system_values_read,
                   SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));

   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
   prog_data->inner_coverage = shader->info.fs.inner_coverage;

   prog_data->barycentric_interp_modes =
      brw_compute_barycentric_interp_modes(devinfo, key, shader);

   /* From the BDW PRM documentation for 3DSTATE_WM:
    *
    *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
    *     Sample or Non-perspective Sample barycentric coordinates."
    *
    * So clean up any potentially set sample barycentric mode when not in
    * per-sample dispatch.
    */
   if (prog_data->persample_dispatch == INTEL_NEVER) {
      prog_data->barycentric_interp_modes &=
         ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
   }

   if (devinfo->ver >= 20) {
      const unsigned offset_bary_modes =
         brw_compute_offset_barycentric_interp_modes(key, shader);

      prog_data->uses_npc_bary_coefficients =
         offset_bary_modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS;
      prog_data->uses_pc_bary_coefficients =
         offset_bary_modes & ~INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS;
      prog_data->uses_sample_offsets =
         offset_bary_modes & ((1 << INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE) |
                              (1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE));
   }

   prog_data->uses_nonperspective_interp_modes =
      (prog_data->barycentric_interp_modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
      prog_data->uses_npc_bary_coefficients;

   /* The current VK_EXT_graphics_pipeline_library specification requires
    * coarse to be specified at compile time. But per-sample interpolation
    * can be dynamic. So we should never be in a situation where coarse &
    * persample_interp are both respectively true & INTEL_ALWAYS.
    *
    * Coarse will be dynamically turned off when persample_interp is active.
    */
   assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS);

   prog_data->coarse_pixel_dispatch =
      intel_sometimes_invert(prog_data->persample_dispatch);
   if (!key->coarse_pixel ||
       prog_data->uses_omask ||
       prog_data->sample_shading ||
       prog_data->uses_sample_mask ||
       (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
       prog_data->computed_stencil) {
      prog_data->coarse_pixel_dispatch = INTEL_NEVER;
   }

   /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
    * Message Descriptor :
    *
    *    "Message Type. Specifies the type of message being sent when
    *     pixel-rate evaluation is requested :
    *
    *     Format = U2
    *       0: Per Message Offset (eval_snapped with immediate offset)
    *       1: Sample Position Offset (eval_sindex)
    *       2: Centroid Position Offset (eval_centroid)
    *       3: Per Slot Offset (eval_snapped with register offset)
    *
    *     Message Type. Specifies the type of message being sent when
    *     coarse-rate evaluation is requested :
    *
    *     Format = U2
    *       0: Coarse to Pixel Mapping Message (internal message)
    *       1: Reserved
    *       2: Coarse Centroid Position (eval_centroid)
    *       3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
    *
    * The Sample Position Offset is marked as reserved for coarse rate
    * evaluation and leads to hangs if we try to use it. So disable coarse
    * pixel shading if we have any intrinsic that will result in a pixel
    * interpolater message at sample.
    */
   if (intel_nir_pulls_at_sample(shader))
      prog_data->coarse_pixel_dispatch = INTEL_NEVER;

   /* We choose to always enable VMask prior to XeHP, as it would cause
    * us to lose out on the eliminate_find_live_channel() optimization.
    */
   prog_data->uses_vmask = devinfo->verx10 < 125 ||
      shader->info.fs.needs_coarse_quad_helper_invocations ||
      shader->info.uses_wide_subgroup_intrinsics ||
      prog_data->coarse_pixel_dispatch != INTEL_NEVER;

   prog_data->uses_src_w =
      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
   prog_data->uses_src_depth =
      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
      prog_data->coarse_pixel_dispatch != INTEL_ALWAYS;
   prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients ||
      (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
       prog_data->coarse_pixel_dispatch != INTEL_NEVER);

   calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
   brw_compute_flat_inputs(prog_data, shader);
}

/* From the SKL PRM, Volume 16, Workarounds:
 *
 *    0877  3D Pixel Shader Hang possible when pixel shader dispatched with
 *          only header phases (R0-R2)
 *
 *    WA: Enable a non-header phase (e.g. push constant) when dispatch would
 *        have been header only.
 *
 * Instead of enabling push constants one can alternatively enable one of
 * the inputs. Here one simply chooses "layer" which shouldn't impose much
 * overhead.
 */
static void
gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
{
   if (wm_prog_data->num_varying_inputs)
      return;

   if (wm_prog_data->base.curb_read_length)
      return;

   wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
   wm_prog_data->num_varying_inputs = 1;

   brw_compute_urb_setup_index(wm_prog_data);
}

static void
brw_assign_urb_setup(brw_shader &s)
{
   assert(s.stage == MESA_SHADER_FRAGMENT);

   const struct intel_device_info *devinfo = s.devinfo;
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);

   int urb_start = s.payload().num_regs + prog_data->base.curb_read_length;
   bool read_attribute_payload = false;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
      if (inst->opcode == FS_OPCODE_READ_ATTRIBUTE_PAYLOAD) {
         brw_reg offset = inst->src[0];
         inst->resize_sources(3);
         inst->opcode = SHADER_OPCODE_MOV_INDIRECT;
         inst->src[0] = retype(brw_vec8_grf(urb_start, 0), BRW_TYPE_UD);
         inst->src[1] = offset;
         inst->src[2] = brw_imm_ud(REG_SIZE * 2 * 32);
         read_attribute_payload = true;
         continue;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {
            /* ATTR brw_reg::nr in the FS is in units of logical scalar
             * inputs each of which consumes 16B on Gfx4-Gfx12.  In
             * single polygon mode this leads to the following layout
             * of the vertex setup plane parameters in the ATTR
             * register file:
             *
             *  brw_reg::nr   Input   Comp0  Comp1  Comp2  Comp3
             *      0       Attr0.x  a1-a0  a2-a0   N/A    a0
             *      1       Attr0.y  a1-a0  a2-a0   N/A    a0
             *      2       Attr0.z  a1-a0  a2-a0   N/A    a0
             *      3       Attr0.w  a1-a0  a2-a0   N/A    a0
             *      4       Attr1.x  a1-a0  a2-a0   N/A    a0
             *     ...
             *
             * In multipolygon mode that no longer works since
             * different channels may be processing polygons with
             * different plane parameters, so each parameter above is
             * represented as a dispatch_width-wide vector:
             *
             *  brw_reg::nr  brw_reg::offset       Input      Comp0     ...    CompN
             *      0        0                    Attr0.x  a1[0]-a0[0] ... a1[N]-a0[N]
             *      0        4 * dispatch_width   Attr0.x  a2[0]-a0[0] ... a2[N]-a0[N]
             *      0        8 * dispatch_width   Attr0.x     N/A      ...    N/A
             *      0        12 * dispatch_width  Attr0.x    a0[0]     ...   a0[N]
             *      1        0                    Attr0.y  a1[0]-a0[0] ... a1[N]-a0[N]
             *     ...
             *
             * Note that many of the components on a single row above
             * are likely to be replicated multiple times (if, say, a
             * single SIMD thread is only processing 2 different
             * polygons), so plane parameters aren't actually stored
             * in GRF memory with that layout to avoid wasting space.
             * Instead we compose ATTR register regions with a 2D
             * region that walks through the parameters of each
             * polygon with the correct stride, reading the parameter
             * corresponding to each channel directly from the PS
             * thread payload.
             *
             * The latter layout corresponds to a param_width equal to
             * dispatch_width, while the former (scalar parameter)
             * layout has a param_width of 1.
             *
             * Gfx20+ represents plane parameters in a format similar
             * to the above, except the parameters are packed in 12B
             * and ordered like "a0, a1-a0, a2-a0" instead of the
             * above vec4 representation with a missing component.
             *
             * First documented in the TGL PRMs, Volume 9: Render Engine,
             * PS Thread Payload for Normal Dispatch.
             *
             * Pre Xe2 : BSpec 47024
             * Xe2+    : BSpec 56480
             */
            const unsigned param_width = (s.max_polygons > 1 ? s.dispatch_width : 1);

            /* Size of a single scalar component of a plane parameter
             * in bytes.
             */
            const unsigned chan_sz = 4;
            struct brw_reg reg;
            assert(s.max_polygons > 0);

            /* Calculate the base register on the thread payload of
             * either the block of vertex setup data or the block of
             * per-primitive constant data depending on whether we're
             * accessing a primitive or vertex input.  Also calculate
             * the index of the input within that block.
             */
            const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
            const unsigned base = urb_start +
               (per_prim ? 0 :
                ALIGN(prog_data->num_per_primitive_inputs / 2,
                      reg_unit(devinfo)) * s.max_polygons);
            const unsigned idx = per_prim ? inst->src[i].nr :
               inst->src[i].nr - prog_data->num_per_primitive_inputs;

            /* Translate the offset within the param_width-wide
             * representation described above into an offset and a
             * grf, which contains the plane parameters for the first
             * polygon processed by the thread.
             */
            if (devinfo->ver >= 20 && !per_prim) {
               /* Gfx20+ is able to pack 5 logical input components
                * per 64B register for vertex setup data.
                */
               const unsigned grf = base + idx / 5 * 2 * s.max_polygons;
               assert(inst->src[i].offset / param_width < 12);
               const unsigned delta = idx % 5 * 12 +
                  inst->src[i].offset / (param_width * chan_sz) * chan_sz +
                  inst->src[i].offset % chan_sz;
               reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                                 delta);
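               /* e.g. idx == 7 in single-polygon dispatch: grf = base + 2,
                * and the parameter block starts at delta = 2 * 12 = 24
                * bytes into that register pair.
                */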
            } else {
               /* Earlier platforms and the per-primitive block pack 2
                * logical input components per 32B register.
                */
               const unsigned grf = base + idx / 2 * s.max_polygons;
               assert(inst->src[i].offset / param_width < REG_SIZE / 2);
               const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
                  inst->src[i].offset / (param_width * chan_sz) * chan_sz +
                  inst->src[i].offset % chan_sz;
               reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                                 delta);
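               /* e.g. idx == 3 in single-polygon dispatch: grf = base + 1,
                * with the odd index adding REG_SIZE / 2 == 16 bytes of
                * delta, i.e. the upper half of the register.
                */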
            }

            if (s.max_polygons > 1) {
               assert(devinfo->ver >= 12);
               /* Misaligned channel strides that would lead to
                * cross-channel access in the representation above are
                * disallowed.
                */
               assert(inst->src[i].stride * brw_type_size_bytes(inst->src[i].type) == chan_sz);

               /* Number of channels processing the same polygon. */
               const unsigned poly_width = s.dispatch_width / s.max_polygons;
               assert(s.dispatch_width % s.max_polygons == 0);

               /* Accessing a subset of channels of a parameter vector
                * starting from "chan" is necessary to handle
                * SIMD-lowered instructions though.
                */
               const unsigned chan = inst->src[i].offset %
                  (param_width * chan_sz) / chan_sz;
               assert(chan < s.dispatch_width);
               assert(chan % poly_width == 0);
               const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
               reg = byte_offset(reg, chan / poly_width * reg_size);

               if (inst->exec_size > poly_width) {
                  /* Accessing the parameters for multiple polygons.
                   * Corresponding parameters for different polygons
                   * are stored a GRF apart on the thread payload, so
                   * use that as vertical stride.
                   */
                  const unsigned vstride = reg_size / brw_type_size_bytes(inst->src[i].type);
                  assert(vstride <= 32);
                  assert(chan % poly_width == 0);
                  reg = stride(reg, vstride, poly_width, 0);
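                  /* e.g. SIMD16 with max_polygons == 2 and float params on
                   * a pre-Xe2 part: poly_width == 8, vstride == 8
                   * (32B / 4B), giving an <8;8,0> region that steps one
                   * GRF per polygon group.
                   */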
               } else {
                  /* Accessing one parameter for a single polygon --
                   * translate to a scalar region.
                   */
                  assert(chan % poly_width + inst->exec_size <= poly_width);
                  reg = stride(reg, 0, 1, 0);
               }
            } else {
               const unsigned width = inst->src[i].stride == 0 ?
                  1 : MIN2(inst->exec_size, 8);
               reg = stride(reg, width * inst->src[i].stride,
                            width, inst->src[i].stride);
            }

            reg.abs = inst->src[i].abs;
            reg.negate = inst->src[i].negate;
            inst->src[i] = reg;
         }
      }
   }

   if (read_attribute_payload) {
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
                            BRW_DEPENDENCY_VARIABLES);
   }

   /* Each attribute is 4 setup channels, each of which is half a reg,
    * but they may be replicated multiple times for multipolygon
    * dispatch.
    */
   s.first_non_payload_grf +=
      (read_attribute_payload ? 32 : prog_data->num_varying_inputs) *
      2 * s.max_polygons;

   /* Unlike regular attributes, per-primitive attributes have all 4 channels
    * in the same slot, so each GRF can store two slots.
    */
   assert(prog_data->num_per_primitive_inputs % 2 == 0);
   s.first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * s.max_polygons;
}

static bool
run_fs(brw_shader &s, bool allow_spilling, bool do_rep_send)
{
   const struct intel_device_info *devinfo = s.devinfo;
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) s.key;
   const brw_builder bld = brw_builder(&s);
   const nir_shader *nir = s.nir;

   assert(s.stage == MESA_SHADER_FRAGMENT);

   s.payload_ = new brw_fs_thread_payload(s, s.source_depth_to_render_target);

   if (nir->info.ray_queries > 0)
      s.limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n");

   if (do_rep_send) {
      assert(s.dispatch_width == 16);
      brw_emit_repclear_shader(s);
   } else {
      if (nir->info.inputs_read > 0 ||
          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
          (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
         brw_emit_interpolation_setup(s);
      }

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (devinfo->ver >= 20 || wm_prog_data->uses_kill) {
         const unsigned lower_width = MIN2(s.dispatch_width, 16);
         for (unsigned i = 0; i < s.dispatch_width / lower_width; i++) {
            /* According to the "PS Thread Payload for Normal
             * Dispatch" pages on the BSpec, the dispatch mask is
             * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
             * gfx6+.
             */
            const brw_reg dispatch_mask =
               devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
               brw_vec1_grf(i + 1, 7);
            bld.uniform().MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
                              retype(dispatch_mask, BRW_TYPE_UW));
         }
      }

      if (nir->info.writes_memory)
         wm_prog_data->has_side_effects = true;

      brw_from_nir(&s);

      if (s.failed)
         return false;

      brw_emit_fb_writes(s);
      if (s.failed)
         return false;

      brw_calculate_cfg(s);

      brw_optimize(s);

      s.assign_curb_setup();

      if (devinfo->ver == 9)
         gfx9_ps_header_only_workaround(wm_prog_data);

      brw_assign_urb_setup(s);

      s.debug_optimizer(nir, "urb_setup", 89, 0);

      brw_lower_3src_null_dest(s);
      brw_workaround_emit_dummy_mov_instruction(s);

      brw_allocate_registers(s, allow_spilling);

      brw_workaround_source_arf_before_eot(s);
   }

   return !s.failed;
}

const unsigned *
brw_compile_fs(const struct brw_compiler *compiler,
               struct brw_compile_fs_params *params)
{
   struct nir_shader *nir = params->base.nir;
   const struct brw_wm_prog_key *key = params->key;
   struct brw_wm_prog_data *prog_data = params->prog_data;
   bool allow_spilling = params->allow_spilling;
   const bool debug_enabled =
      brw_should_print_shader(nir, params->base.debug_flag ?
                              params->base.debug_flag : DEBUG_WM);

   brw_prog_data_init(&prog_data->base, &params->base);

   const struct intel_device_info *devinfo = compiler->devinfo;
   const unsigned max_subgroup_size = 32;

   brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
   brw_nir_lower_fs_inputs(nir, devinfo, key);
   brw_nir_lower_fs_outputs(nir);

   /* From the SKL PRM, Volume 7, "Alpha Coverage":
    *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
    *   hardware, regardless of the state setting for this feature."
    */
   if (key->alpha_to_coverage != INTEL_NEVER) {
      /* Run the constant folding optimization in order to get the correct
       * source offset to determine the render target 0 store instruction in
       * the emit_alpha_to_coverage pass.
       */
      NIR_PASS(_, nir, nir_opt_constant_folding);
      NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage);
   }

   NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
   NIR_PASS(_, nir, brw_nir_lower_fs_msaa, key);
   brw_postprocess_nir(nir, compiler, debug_enabled,
                       key->base.robust_flags);

   brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
                                 params->mue_map);

   /* Either an unrestricted or a fixed SIMD16 subgroup size is
    * allowed -- the latter is needed for fast clear and replicated
    * data clear shaders.
    */
   const unsigned reqd_dispatch_width = brw_required_dispatch_width(&nir->info);
   assert(reqd_dispatch_width == SUBGROUP_SIZE_VARYING ||
          reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16);

   std::unique_ptr<brw_shader> v8, v16, v32, vmulti;
   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
         *multi_cfg = NULL;
   float throughput = 0;
   bool has_spilled = false;

   if (devinfo->ver < 20) {
      v8 = std::make_unique<brw_shader>(compiler, &params->base, key,
                                        prog_data, nir, 8, 1,
                                        params->base.stats != NULL,
                                        debug_enabled);
      if (!run_fs(*v8, allow_spilling, false /* do_rep_send */)) {
         params->base.error_str = ralloc_strdup(params->base.mem_ctx,
                                                v8->fail_msg);
         return NULL;
      } else if (INTEL_SIMD(FS, 8)) {
         simd8_cfg = v8->cfg;

         assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
         prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
         prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
                                         v8->grf_used);

         const brw_performance &perf = v8->performance_analysis.require();
         throughput = MAX2(throughput, perf.throughput);
         has_spilled = v8->spilled_any_registers;
         allow_spilling = false;
      }

      if (key->coarse_pixel) {
         if (prog_data->dual_src_blend) {
            v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
                                     " use SIMD8 messages.\n");
         }
         v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
                                  " pixel shading.\n");
      }
   }

   if (devinfo->ver >= 30) {
      unsigned max_dispatch_width = reqd_dispatch_width ? reqd_dispatch_width : 32;
      brw_shader *vbase = NULL;

      if (params->max_polygons >= 2 && !key->coarse_pixel) {
         if (params->max_polygons >= 4 && max_dispatch_width >= 32 &&
             4 * prog_data->num_varying_inputs <= MAX_VARYING &&
             INTEL_SIMD(FS, 4X8)) {
            /* Try a quad-SIMD8 compile */
            vmulti = std::make_unique<brw_shader>(compiler, &params->base, key,
                                                  prog_data, nir, 32, 4,
                                                  params->base.stats != NULL,
                                                  debug_enabled);
            max_dispatch_width = std::min(max_dispatch_width, vmulti->dispatch_width);

            if (!run_fs(*vmulti, false, false)) {
               brw_shader_perf_log(compiler, params->base.log_data,
                                   "Quad-SIMD8 shader failed to compile: %s\n",
                                   vmulti->fail_msg);
            } else {
               vbase = vmulti.get();
               multi_cfg = vmulti->cfg;
               assert(!vmulti->spilled_any_registers);
            }
         }

         if (!vbase && max_dispatch_width >= 32 &&
             2 * prog_data->num_varying_inputs <= MAX_VARYING &&
             INTEL_SIMD(FS, 2X16)) {
            /* Try a dual-SIMD16 compile */
            vmulti = std::make_unique<brw_shader>(compiler, &params->base, key,
                                                  prog_data, nir, 32, 2,
                                                  params->base.stats != NULL,
                                                  debug_enabled);
            max_dispatch_width = std::min(max_dispatch_width, vmulti->dispatch_width);

            if (!run_fs(*vmulti, false, false)) {
               brw_shader_perf_log(compiler, params->base.log_data,
                                   "Dual-SIMD16 shader failed to compile: %s\n",
                                   vmulti->fail_msg);
            } else {
               vbase = vmulti.get();
               multi_cfg = vmulti->cfg;
               assert(!vmulti->spilled_any_registers);
            }
         }

         if (!vbase && max_dispatch_width >= 16 &&
             2 * prog_data->num_varying_inputs <= MAX_VARYING &&
             INTEL_SIMD(FS, 2X8)) {
            /* Try a dual-SIMD8 compile */
            vmulti = std::make_unique<brw_shader>(compiler, &params->base, key,
                                                  prog_data, nir, 16, 2,
                                                  params->base.stats != NULL,
                                                  debug_enabled);
            max_dispatch_width = std::min(max_dispatch_width, vmulti->dispatch_width);

            if (!run_fs(*vmulti, false, false)) {
               brw_shader_perf_log(compiler, params->base.log_data,
                                   "Dual-SIMD8 shader failed to compile: %s\n",
                                   vmulti->fail_msg);
            } else {
               vbase = vmulti.get();
               multi_cfg = vmulti->cfg;
            }
         }
      }

      if ((!vbase || vbase->dispatch_width < 32) &&
          max_dispatch_width >= 32 &&
          INTEL_SIMD(FS, 32) &&
          !prog_data->base.ray_queries) {
         /* Try a SIMD32 compile */
         v32 = std::make_unique<brw_shader>(compiler, &params->base, key,
                                            prog_data, nir, 32, 1,
                                            params->base.stats != NULL,
                                            debug_enabled);
         if (vbase)
            v32->import_uniforms(vbase);

         if (!run_fs(*v32, false, false)) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "SIMD32 shader failed to compile: %s\n",
                                v32->fail_msg);
         } else {
            if (!vbase)
               vbase = v32.get();

            simd32_cfg = v32->cfg;
            assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
            prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
            prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
                                            v32->grf_used);
         }
      }

      if (!vbase && INTEL_SIMD(FS, 16)) {
         /* Try a SIMD16 compile */
         v16 = std::make_unique<brw_shader>(compiler, &params->base, key,
                                            prog_data, nir, 16, 1,
                                            params->base.stats != NULL,
                                            debug_enabled);

         if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "SIMD16 shader failed to compile: %s\n",
                                v16->fail_msg);
         } else {
            simd16_cfg = v16->cfg;

            assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
            prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
            prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
                                            v16->grf_used);
         }
      }
   } else {
      if ((!has_spilled && (!v8 || v8->max_dispatch_width >= 16) &&
           INTEL_SIMD(FS, 16)) ||
          reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16) {
         /* Try a SIMD16 compile */
         v16 = std::make_unique<brw_shader>(compiler, &params->base, key,
                                            prog_data, nir, 16, 1,
                                            params->base.stats != NULL,
                                            debug_enabled);
         if (v8)
            v16->import_uniforms(v8.get());
         if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "SIMD16 shader failed to compile: %s\n",
                                v16->fail_msg);
         } else {
            simd16_cfg = v16->cfg;

            assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
            prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
            prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
                                            v16->grf_used);

            const brw_performance &perf = v16->performance_analysis.require();
            throughput = MAX2(throughput, perf.throughput);
            has_spilled = v16->spilled_any_registers;
            allow_spilling = false;
         }
      }

      const bool simd16_failed = v16 && !simd16_cfg;

      /* Currently, the compiler only supports SIMD32 on SNB+ */
      if (!has_spilled &&
          (!v8 || v8->max_dispatch_width >= 32) &&
          (!v16 || v16->max_dispatch_width >= 32) &&
          reqd_dispatch_width == SUBGROUP_SIZE_VARYING &&
          !simd16_failed && INTEL_SIMD(FS, 32)) {
         /* Try a SIMD32 compile */
         v32 = std::make_unique<brw_shader>(compiler, &params->base, key,
                                            prog_data, nir, 32, 1,
                                            params->base.stats != NULL,
                                            debug_enabled);
         if (v8)
            v32->import_uniforms(v8.get());
         else if (v16)
            v32->import_uniforms(v16.get());

         if (!run_fs(*v32, allow_spilling, false)) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "SIMD32 shader failed to compile: %s\n",
                                v32->fail_msg);
         } else {
            const brw_performance &perf = v32->performance_analysis.require();

            if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
               brw_shader_perf_log(compiler, params->base.log_data,
                                   "SIMD32 shader inefficient\n");
            } else {
               simd32_cfg = v32->cfg;

               assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
               prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
               prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
                                               v32->grf_used);

               throughput = MAX2(throughput, perf.throughput);
            }
         }
      }

      if (devinfo->ver >= 12 && !has_spilled &&
          params->max_polygons >= 2 && !key->coarse_pixel &&
          reqd_dispatch_width == SUBGROUP_SIZE_VARYING) {
         brw_shader *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
         assert(vbase);

         if (devinfo->ver >= 20 &&
             params->max_polygons >= 4 &&
             vbase->max_dispatch_width >= 32 &&
             4 * prog_data->num_varying_inputs <= MAX_VARYING &&
             INTEL_SIMD(FS, 4X8)) {
            /* Try a quad-SIMD8 compile */
            vmulti = std::make_unique<brw_shader>(compiler, &params->base, key,
                                                  prog_data, nir, 32, 4,
                                                  params->base.stats != NULL,
                                                  debug_enabled);
            vmulti->import_uniforms(vbase);
            if (!run_fs(*vmulti, false, params->use_rep_send)) {
               brw_shader_perf_log(compiler, params->base.log_data,
                                   "Quad-SIMD8 shader failed to compile: %s\n",
                                   vmulti->fail_msg);
            } else {
               multi_cfg = vmulti->cfg;
               assert(!vmulti->spilled_any_registers);
            }
         }

         if (!multi_cfg && devinfo->ver >= 20 &&
             vbase->max_dispatch_width >= 32 &&
             2 * prog_data->num_varying_inputs <= MAX_VARYING &&
             INTEL_SIMD(FS, 2X16)) {
            /* Try a dual-SIMD16 compile */
            vmulti = std::make_unique<brw_shader>(compiler, &params->base, key,
                                                  prog_data, nir, 32, 2,
                                                  params->base.stats != NULL,
                                                  debug_enabled);
            vmulti->import_uniforms(vbase);
            if (!run_fs(*vmulti, false, params->use_rep_send)) {
               brw_shader_perf_log(compiler, params->base.log_data,
                                   "Dual-SIMD16 shader failed to compile: %s\n",
                                   vmulti->fail_msg);
            } else {
               multi_cfg = vmulti->cfg;
               assert(!vmulti->spilled_any_registers);
            }
         }

         if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
             2 * prog_data->num_varying_inputs <= MAX_VARYING &&
             INTEL_SIMD(FS, 2X8)) {
            /* Try a dual-SIMD8 compile */
            vmulti = std::make_unique<brw_shader>(compiler, &params->base, key,
                                                  prog_data, nir, 16, 2,
                                                  params->base.stats != NULL,
                                                  debug_enabled);
            vmulti->import_uniforms(vbase);
            if (!run_fs(*vmulti, allow_spilling, params->use_rep_send)) {
               brw_shader_perf_log(compiler, params->base.log_data,
                                   "Dual-SIMD8 shader failed to compile: %s\n",
                                   vmulti->fail_msg);
            } else {
               multi_cfg = vmulti->cfg;
            }
         }
      }
   }

   if (multi_cfg) {
      assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
      prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
      prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
                                      vmulti->grf_used);
   }

   /* When the caller compiles a repclear or fast clear shader, they
    * want SIMD16-only.
    */
   if (reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16)
      simd8_cfg = NULL;

   brw_generator g(compiler, &params->base, &prog_data->base,
                   MESA_SHADER_FRAGMENT);

   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s fragment shader %s",
                                     nir->info.label ?
                                     nir->info.label : "unnamed",
                                     nir->info.name));
   }

   struct brw_compile_stats *stats = params->base.stats;
   uint32_t max_dispatch_width = 0;

   if (multi_cfg) {
      prog_data->dispatch_multi = vmulti->dispatch_width;
      prog_data->max_polygons = vmulti->max_polygons;
      g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
                      vmulti->performance_analysis.require(),
                      stats, vmulti->max_polygons);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = vmulti->dispatch_width;

   } else if (simd8_cfg) {
      prog_data->dispatch_8 = true;
      g.generate_code(simd8_cfg, 8, v8->shader_stats,
                      v8->performance_analysis.require(), stats, 1);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = 8;
   }

   if (simd16_cfg) {
      prog_data->dispatch_16 = true;
      prog_data->prog_offset_16 = g.generate_code(
         simd16_cfg, 16, v16->shader_stats,
         v16->performance_analysis.require(), stats, 1);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = 16;
   }

   if (simd32_cfg) {
      prog_data->dispatch_32 = true;
      prog_data->prog_offset_32 = g.generate_code(
         simd32_cfg, 32, v32->shader_stats,
         v32->performance_analysis.require(), stats, 1);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = 32;
   }

   for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
      s->max_dispatch_width = max_dispatch_width;

   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
}

extern "C" void
|
|
|
|
|
brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data)
|
|
|
|
|
{
|
|
|
|
|
fprintf(fp, "FS URB (inputs=0x%016lx, flat_inputs=0x%08x):\n",
|
|
|
|
|
prog_data->inputs, prog_data->flat_inputs);
|
|
|
|
|
fprintf(fp, " URB setup:\n");
|
|
|
|
|
for (uint32_t i = 0; i < ARRAY_SIZE(prog_data->urb_setup); i++) {
|
|
|
|
|
if (prog_data->urb_setup[i] >= 0) {
|
|
|
|
|
fprintf(fp, " [%02d]: %i channel=%u (%s)\n",
|
|
|
|
|
i, prog_data->urb_setup[i], prog_data->urb_setup_channel[i],
|
|
|
|
|
gl_varying_slot_name_for_stage((gl_varying_slot)i,
|
|
|
|
|
MESA_SHADER_FRAGMENT));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
fprintf(fp, " URB setup attributes:\n");
|
|
|
|
|
for (uint32_t i = 0; i < prog_data->urb_setup_attribs_count; i++) {
|
|
|
|
|
fprintf(fp, " [%02d]: %i (%s)\n",
|
|
|
|
|
i, prog_data->urb_setup_attribs[i],
|
|
|
|
|
gl_varying_slot_name_for_stage((gl_varying_slot)i,
|
|
|
|
|
MESA_SHADER_FRAGMENT));
|
|
|
|
|
}
|
|
|
|
|
}
|
2025-03-19 12:11:53 +02:00
|
|
|
|
|
|
|
|
extern "C" int
|
|
|
|
|
brw_compute_first_fs_urb_slot_required(uint64_t inputs_read,
|
|
|
|
|
const struct intel_vue_map *prev_stage_vue_map)
|
|
|
|
|
{
|
|
|
|
|
/* The header slots are irrelevant for the URB varying slots. They are
|
|
|
|
|
* delivered somewhere else in the thread payload.
|
|
|
|
|
*
|
|
|
|
|
* For example on DG2:
|
|
|
|
|
* - PRIMITIVE_SHADING_RATE : R1.0, ActualCoarsePixelShadingSize.(X|Y)
|
|
|
|
|
* - LAYER : R1.1, Render Target Array Index
|
|
|
|
|
* - VIEWPORT : R1.1, Viewport Index
|
|
|
|
|
* - PSIZ : not available in fragment shaders
|
|
|
|
|
*/
|
|
|
|
|
inputs_read &= ~BRW_VUE_HEADER_VARYING_MASK;
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < prev_stage_vue_map->num_slots; i++) {
|
|
|
|
|
int varying = prev_stage_vue_map->slot_to_varying[i];
|
|
|
|
|
if (varying != BRW_VARYING_SLOT_PAD && varying > 0 &&
|
|
|
|
|
(inputs_read & BITFIELD64_BIT(varying)) != 0) {
|
|
|
|
|
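         /* Attribute setup consumes URB data in pairs of slots (two slots
          * share a payload register), so round the first required slot down
          * to an even index.
          */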
         return ROUND_DOWN_TO(i, 2);
      }
   }

   return 0;
}