mesa/src/intel/compiler/brw_fs_thread_payload.cpp

commit ef6ef7aa8e (Francisco Jerez, 2023-12-28 11:07:03 -08:00)
intel/fs/xe2+: Implement PS thread payload register offset setup.

The PS thread payload format has changed enough in Xe2 that it
probably doesn't make sense to share code with gfx6.  See BSpec page
"PS Thread Payload for Normal Dispatch - 512 bit GRF" for the new
format.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26606>

/*
 * Copyright © 2006-2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;
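
/* Editor's note: register offsets in the payload structs below are counted
 * in 256-bit register units.  On Xe2+ the hardware GRF is 512 bits wide, so
 * reg_unit() is 2 there and each logical payload register advances the
 * offset by two units; on earlier platforms reg_unit() is 1.
 */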
vs_thread_payload::vs_thread_payload(const fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   r += reg_unit(v.devinfo);

   /* R1: URB handles. */
   urb_handles = brw_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}
tcs_thread_payload::tcs_thread_payload(const fs_visitor &v)
{
   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(v.prog_data);
   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) v.key;

   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
      patch_urb_output = brw_ud1_grf(0, 0);
      primitive_id = brw_vec1_grf(0, 1);

      /* r1-r4 contain the ICP handles. */
      icp_handle_start = brw_ud8_grf(1, 0);

      num_regs = 5;
   } else {
      assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_MULTI_PATCH);
      assert(tcs_key->input_vertices <= BRW_MAX_TCS_INPUT_VERTICES);

      unsigned r = 0;
      r += reg_unit(v.devinfo);

      patch_urb_output = brw_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);

      if (tcs_prog_data->include_primitive_id) {
         primitive_id = brw_vec8_grf(r, 0);
         r += reg_unit(v.devinfo);
      }

      /* ICP handles occupy the next 1-32 registers. */
      icp_handle_start = brw_ud8_grf(r, 0);
      r += brw_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);

      num_regs = r;
   }
}
tes_thread_payload::tes_thread_payload(const fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread Header. */
   patch_urb_input = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
   primitive_id = brw_vec1_grf(0, 1);
   r += reg_unit(v.devinfo);

   /* R1-3: gl_TessCoord.xyz. */
   for (unsigned i = 0; i < 3; i++) {
      coords[i] = brw_vec8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* R4: URB output handles. */
   urb_output = brw_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}
gs_thread_payload::gs_thread_payload(fs_visitor &v)
{
   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data);
   const fs_builder bld = fs_builder(&v).at_end();

   /* R0: thread header. */
   unsigned r = reg_unit(v.devinfo);

   /* R1: output URB handles. */
   urb_handles = bld.vgrf(BRW_REGISTER_TYPE_UD);
   bld.AND(urb_handles, brw_ud8_grf(r, 0),
           v.devinfo->ver >= 20 ? brw_imm_ud(0xFFFFFF) : brw_imm_ud(0xFFFF));
   r += reg_unit(v.devinfo);

   if (gs_prog_data->include_primitive_id) {
      primitive_id = brw_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* Always enable VUE handles so we can safely use pull model if needed.
    *
    * The push model for a GS uses a ton of register space even for trivial
    * scenarios with just a few inputs, so just make things easier and a bit
    * safer by always having pull model available.
    */
   gs_prog_data->base.include_vue_handles = true;

   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   icp_handle_start = brw_ud8_grf(r, 0);
   r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);

   num_regs = r;

   /* Use a maximum of 24 registers for push-model inputs. */
   const unsigned max_push_components = 24;

   /* If pushing our inputs would take too many registers, reduce the URB read
    * length (which is in HWords, or 8 registers), and resort to pulling.
    *
    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
    * have to multiply by VerticesIn to obtain the total storage requirement.
    */
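   /* Worked example (illustrative values): with 3 vertices in and a URB read
    * length of 2, the push cost is 8 * 2 * 3 = 48 registers, which exceeds
    * the limit of 24, so the read length is clamped to
    * ROUND_DOWN_TO(24 / 3, 8) / 8 = 1 per vertex.
    */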
   if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
       max_push_components) {
      vue_prog_data->urb_read_length =
         ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
   }
}
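
/* Xe2+ PS payload layout, per the BSpec page "PS Thread Payload for Normal
 * Dispatch - 512 bit GRF" cited in the commit message.  The format diverged
 * enough from Gfx6 that it is handled separately; offsets below are in
 * 256-bit units, so each 64B hardware GRF counts as two.
 */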
static inline void
setup_fs_payload_gfx20(fs_thread_payload &payload,
                       const fs_visitor &v,
                       bool &source_depth_to_render_target)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
   const unsigned payload_width = 16;
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 20);

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R0-1: PS thread payload header, masks and pixel X/Y coordinates. */
      payload.num_regs++;
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R2-13: Barycentric interpolation coordinates.  These appear
       * in the same order that they appear in the brw_barycentric_mode
       * enum.  Each set of coordinates occupies 2 64B registers per
       * SIMD16 half.  Coordinates only appear if they were enabled
       * using the "Barycentric Interpolation Mode" bits in WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R14: Interpolated depth if "Pixel Shader Uses Source Depth" is set. */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R15: Interpolated W if "Pixel Shader Uses Source W" is set. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R16: MSAA input coverage mask if "Pixel Shader Uses Input
       * Coverage Mask" is set.
       */
      if (prog_data->uses_sample_mask) {
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R19: MSAA position XY offsets if "Position XY Offset Select"
       * is either POSOFFSET_CENTROID or POSOFFSET_SAMPLE.  Note that
       * this is delivered as a single SIMD32 vector, inconsistently
       * with most other PS payload fields.
       */
      if (prog_data->uses_pos_offset && j == 0) {
         for (unsigned k = 0; k < 2; k++) {
            payload.sample_pos_reg[k] = payload.num_regs;
            payload.num_regs++;
         }
      }
   }

   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs += 2;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}
static inline void
setup_fs_payload_gfx6(fs_thread_payload &payload,
                      const fs_visitor &v,
                      bool &source_depth_to_render_target)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);

   const unsigned payload_width = MIN2(16, v.dispatch_width);
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 6 && v.devinfo->ver < 20);

   payload.num_regs = 0;

   /* R0: PS thread payload header. */
   payload.num_regs++;

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R1: masks, pixel X/Y coordinates. */
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R3-26: barycentric interpolation coordinates.  These appear in the
       * same order that they appear in the brw_barycentric_mode enum.  Each
       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
       * registers if dispatch width == 16.  Coordinates only appear if they
       * were enabled using the "Barycentric Interpolation Mode" bits in
       * WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R27-28: interpolated depth if uses source depth */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg[j] = payload.num_regs;
         payload.num_regs++;
      }

      /* R32-33: MSAA input coverage mask */
      if (prog_data->uses_sample_mask) {
         assert(v.devinfo->ver >= 7);
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }
   }

   /* R66: Source Depth and/or W Attribute Vertex Deltas */
   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs++;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}
#undef P /* prompted depth */
#undef C /* computed */
#undef N /* non-promoted? */
#define P 0
#define C 1
#define N 2
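
/* Each entry is indexed by key->iz_lookup, a bitmask of the BRW_WM_IZ_*
 * flags, and selects the depth mode (P/C/N) together with which depth and
 * stencil payload fields the windower delivers to the thread.
 */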
static const struct {
   GLuint mode:2;
   GLuint sd_present:1;
   GLuint sd_to_rt:1;
   GLuint dd_present:1;
   GLuint ds_present:1;
} wm_iz_table[BRW_WM_IZ_BIT_MAX] =
{
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 1 },
{ N, 0, 1, 0, 1 },
{ N, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ C, 0, 0, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 0, 1 },
{ C, 0, 1, 0, 1 },
{ C, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 }
};
/**
 * \param line_aa  BRW_NEVER, BRW_ALWAYS or BRW_SOMETIMES
 * \param lookup  bitmask of BRW_WM_IZ_* flags
 */
static inline void
setup_fs_payload_gfx4(fs_thread_payload &payload,
                      const fs_visitor &v,
                      bool &source_depth_to_render_target,
                      bool &runtime_check_aads_emit)
{
   assert(v.dispatch_width <= 16);

   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
   brw_wm_prog_key *key = (brw_wm_prog_key *) v.key;

   GLuint reg = 1;
   bool kill_stats_promoted_workaround = false;
   int lookup = key->iz_lookup;

   assert(lookup < BRW_WM_IZ_BIT_MAX);

   /* Crazy workaround in the windowizer, which we need to track in
    * our register allocation and render target writes.  See the "If
    * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
    * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
    */
   if (key->stats_wm &&
       (lookup & BRW_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
       wm_iz_table[lookup].mode == P) {
      kill_stats_promoted_workaround = true;
   }

   payload.subspan_coord_reg[0] = reg++;

   if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
       kill_stats_promoted_workaround) {
      payload.source_depth_reg[0] = reg;
      reg += 2;
   }

   if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
      source_depth_to_render_target = true;

   if (wm_iz_table[lookup].ds_present || key->line_aa != BRW_NEVER) {
      payload.aa_dest_stencil_reg[0] = reg;
      runtime_check_aads_emit =
         !wm_iz_table[lookup].ds_present && key->line_aa == BRW_SOMETIMES;
      reg++;
   }

   if (wm_iz_table[lookup].dd_present) {
      payload.dest_depth_reg[0] = reg;
      reg += 2;
   }

   payload.num_regs = reg;
}
#undef P /* prompted depth */
#undef C /* computed */
#undef N /* non-promoted? */

fs_thread_payload::fs_thread_payload(const fs_visitor &v,
                                     bool &source_depth_to_render_target,
                                     bool &runtime_check_aads_emit)
  : subspan_coord_reg(),
    source_depth_reg(),
    source_w_reg(),
    aa_dest_stencil_reg(),
    dest_depth_reg(),
    sample_pos_reg(),
    sample_mask_in_reg(),
    depth_w_coef_reg(),
    barycentric_coord_reg()
{
   if (v.devinfo->ver >= 20)
      setup_fs_payload_gfx20(*this, v, source_depth_to_render_target);
   else if (v.devinfo->ver >= 6)
      setup_fs_payload_gfx6(*this, v, source_depth_to_render_target);
   else
      setup_fs_payload_gfx4(*this, v, source_depth_to_render_target,
                            runtime_check_aads_emit);
}
cs_thread_payload::cs_thread_payload(const fs_visitor &v)
{
   /* See nir_setup_uniforms for subgroup_id in earlier versions. */
   if (v.devinfo->verx10 >= 125)
      subgroup_id_ = brw_ud1_grf(0, 2);

   /* TODO: Fill out uses_btd_stack_ids automatically */
   num_regs = (1 + brw_cs_prog_data(v.prog_data)->uses_btd_stack_ids) *
              reg_unit(v.devinfo);
}
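
/* Copy the logical subgroup ID into "dest".  On Gfx12.5+ it arrives in the
 * thread payload (the low 8 bits of r0.2, captured above); on older
 * platforms it was pushed as a uniform by nir_setup_uniforms, so it is read
 * from the uniform file instead.
 */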
void
cs_thread_payload::load_subgroup_id(const fs_builder &bld,
                                    fs_reg &dest) const
{
   auto devinfo = bld.shader->devinfo;
   dest = retype(dest, BRW_REGISTER_TYPE_UD);

   if (subgroup_id_.file != BAD_FILE) {
      assert(devinfo->verx10 >= 125);
      bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0)));
   } else {
      assert(devinfo->verx10 < 125);
      assert(gl_shader_stage_is_compute(bld.shader->stage));
      int index = brw_get_subgroup_id_param_index(devinfo,
                                                  bld.shader->stage_prog_data);
      bld.MOV(dest, fs_reg(UNIFORM, index, BRW_REGISTER_TYPE_UD));
   }
}
task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
   : cs_thread_payload(v)
{
   /* Task and Mesh Shader Payloads (SIMD8 and SIMD16)
    *
    *  R0: Header
    *  R1: Local_ID.X[0-7 or 0-15]
    *  R2: Inline Parameter
    *
    * Task and Mesh Shader Payloads (SIMD32)
    *
    *  R0: Header
    *  R1: Local_ID.X[0-15]
    *  R2: Local_ID.X[16-31]
    *  R3: Inline Parameter
    *
    * Local_ID.X values are 16 bits.
    *
    * The inline parameter is optional but always present, since we use it to
    * pass the address of the descriptors.
    */
   const fs_builder bld = fs_builder(&v).at_end();

   unsigned r = 0;
   assert(subgroup_id_.file != BAD_FILE);

   extended_parameter_0 = retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);

   if (v.devinfo->ver >= 20) {
      urb_output = brw_ud1_grf(1, 0);
   } else {
      urb_output = bld.vgrf(BRW_REGISTER_TYPE_UD);
      /* In both the mesh and task shader payloads, the lower 16 bits of g0.6
       * are an offset within the Slice's Local URB, indicating where the
       * shader is supposed to output its data.
       */
      bld.AND(urb_output, brw_ud1_grf(0, 6), brw_imm_ud(0xFFFF));
   }

   if (v.stage == MESA_SHADER_MESH) {
      /* g0.7 is the Task Shader URB Entry Offset, which contains both an
       * offset within the Slice's Local URB (bits 0:15) and a slice selector
       * (bits 16:24).  The slice selector can be non-zero when the mesh
       * shader is spawned on a slice other than the one where the task
       * shader ran.  Bit 24 says that a Slice ID is present and bits 16:23
       * are the Slice ID.
       */
      task_urb_input = brw_ud1_grf(0, 7);
   }
   r += reg_unit(v.devinfo);

   local_index = brw_uw8_grf(r, 0);
   r += reg_unit(v.devinfo);
   if (v.devinfo->ver < 20 && v.dispatch_width == 32)
      r += reg_unit(v.devinfo);

   inline_parameter = brw_ud1_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}
bs_thread_payload::bs_thread_payload(const fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   r += reg_unit(v.devinfo);

   /* R1: Stack IDs. */
   r += reg_unit(v.devinfo);

   /* R2: Inline Parameter.  Used for argument addresses. */
   global_arg_ptr = brw_ud1_grf(r, 0);
   local_arg_ptr = brw_ud1_grf(r, 2);
   r += reg_unit(v.devinfo);

   num_regs = r;
}
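
/* Extract the shader type from bits 3:0 of the r0.3 thread header dword into
 * "dest", masking off the rest of the dword.
 */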
void
bs_thread_payload::load_shader_type(const fs_builder &bld, fs_reg &dest) const
{
   fs_reg ud_dest = retype(dest, BRW_REGISTER_TYPE_UD);
   bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type));
   bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf));
}