brw: Convert VS/TES/GS outputs to URB intrinsics.
For VS/TES/GS, we lower all outputs to temporaries and emit copies at the
end of the shader (or for GS, at each EmitVertex() call) from those
temporaries back to real outputs. We use vec8 URB writes without
writemasking, since our output area's contents are undefined anyhow.

This is simpler than what TCS and Mesh do, which allow for output variables
to be read/written at a per-component level at any time, with the output
memory being used for cross-thread communication. Rather than using the
complicated TCS/Mesh handling and relying on vectorization, we port the
emit_urb_writes() approach to NIR. This also takes care of emitting the
VUE header with default values when fields aren't explicitly written by
the shader. We also handle multiview in the process.

It simplifies things, and also drops another case of non-semantic IO in brw.

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39666>
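To illustrate the approach (a hand-written sketch, not output copied from this change): brw_nir_lower_deferred_urb_writes removes each store_output / store_per_view_output intrinsic and records the stored scalars into a per-VUE-slot table, fills unwritten header fields with defaults (0x3C003C00, i.e. a (1, 1) half-float shading rate, on parts with coarse pixel support; 0 for viewport, layer, and point size), and then at the end of the shader (or at each emit_vertex_with_counter for GS) emits whole-slot URB stores roughly like:

   store_urb_vec4_intel(vec8_data, urb_handle, vertex_offset, mask=0xff, base=slot)   /* pre-Xe2 */
   store_urb_lsc_intel(vec8_data, addr, base=16 * slot)                                /* Xe2+ */

The intrinsic names, channel masks (0xff for a two-slot write, 0xf for one), and header defaults come from the diff below; the exact NIR printing and operand names are only illustrative.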
This commit is contained in:
parent
52341b8b9c
commit
6fbe201a12
8 changed files with 172 additions and 391 deletions
@@ -288,6 +288,14 @@ brw_compile_gs(const struct brw_compiler *compiler,
   prog_data->output_vertex_size_hwords =
      align(output_vertex_size_bytes, 32) / 32;

   const unsigned starting_urb_offset =
      2 * prog_data->control_data_header_size_hwords +
      ((prog_data->static_vertex_count == -1) ? 2 : 0);

   BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, compiler->devinfo,
                &prog_data->base.vue_map, starting_urb_offset,
                2 * prog_data->output_vertex_size_hwords);

   /* Compute URB entry size. The maximum allowed URB entry size is 32k.
    * That divides up as follows:
    *

@@ -40,8 +40,6 @@ run_tes(brw_shader &s)
   if (s.failed)
      return false;

   s.emit_urb_writes();

   brw_calculate_cfg(s);

   s.emit_tes_terminate();

@@ -132,6 +130,9 @@ brw_compile_tes(const struct brw_compiler *compiler,

   brw_postprocess_nir(pt, debug_enabled, key->base.robust_flags);

   BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo,
                &prog_data->base.vue_map, 0, 0);

   unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;

   assert(output_size_bytes >= 1);

@@ -214,8 +214,6 @@ run_vs(brw_shader &s)
   if (s.failed)
      return false;

   s.emit_urb_writes();

   brw_calculate_cfg(s);

   ASSERTED bool eot = s.mark_last_urb_write_with_eot();

@@ -300,6 +298,9 @@ brw_compile_vs(const struct brw_compiler *compiler,
   brw_postprocess_nir(pt, debug_enabled,
                       key->base.robust_flags);

   BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, compiler->devinfo,
                &prog_data->base.vue_map, 0, 0);

   unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
    * incoming vertex attribute. So, add an extra slot.

@@ -78,56 +78,6 @@ setup_imm_b(const brw_builder &bld, int8_t v)
   return tmp;
}

static void
brw_from_nir_setup_outputs(nir_to_brw_state &ntb)
{
   brw_shader &s = ntb.s;

   if (s.stage == MESA_SHADER_TESS_CTRL ||
       s.stage == MESA_SHADER_TASK ||
       s.stage == MESA_SHADER_MESH ||
       s.stage == MESA_SHADER_FRAGMENT ||
       s.stage == MESA_SHADER_COMPUTE)
      return;

   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };

   /* Calculate the size of output registers in a separate pass, before
    * allocating them. With ARB_enhanced_layouts, multiple output variables
    * may occupy the same slot, but have different type sizes.
    */
   nir_foreach_shader_out_variable(var, s.nir) {
      const int loc = var->data.driver_location;
      const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
   }

   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
      if (vec4s[loc] == 0) {
         loc++;
         continue;
      }

      unsigned reg_size = vec4s[loc];

      /* Check if there are any ranges that start within this range and extend
       * past it. If so, include them in this allocation.
       */
      for (unsigned i = 1; i < reg_size; i++) {
         assert(i + loc < ARRAY_SIZE(vec4s));
         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
      }

      brw_reg reg = ntb.bld.vgrf(BRW_TYPE_F, 4 * reg_size);
      for (unsigned i = 0; i < reg_size; i++) {
         assert(loc + i < ARRAY_SIZE(s.outputs));
         s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
      }

      loc += reg_size;
   }
}

static brw_reg
emit_work_group_id_setup(nir_to_brw_state &ntb)
{

@@ -2647,8 +2597,6 @@ emit_gs_vertex(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src,
      abld.emit(BRW_OPCODE_ENDIF);
   }

   s.emit_urb_writes(vertex_count);

   /* In stream mode we have to set control data bits for all vertices
    * unless we have disabled control data bits completely (which we do
    * do for MESA_PRIM_POINTS outputs that don't use streams).

@@ -3143,13 +3091,6 @@ brw_from_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,

   case nir_intrinsic_emit_vertex_with_counter:
      emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));

      /* After an EmitVertex() call, the values of all outputs are undefined.
       * If this is not in control flow, recreate a fresh set of output
       * registers to keep their live ranges separate.
       */
      if (instr->instr.block->cf_node.parent->type == nir_cf_node_function)
         brw_from_nir_setup_outputs(ntb);
      break;

   case nir_intrinsic_end_primitive_with_counter:

@@ -5520,22 +5461,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
      break;
   }

   case nir_intrinsic_store_output: {
      assert(nir_src_bit_size(instr->src[0]) == 32);
      brw_reg src = get_nir_src(ntb, instr->src[0], -1);

      unsigned store_offset = nir_src_as_uint(instr->src[1]);
      unsigned num_components = instr->num_components;
      unsigned first_component = nir_intrinsic_component(instr);

      brw_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
                                       4 * store_offset), src.type);

      brw_combine_with_vec(bld, offset(new_dest, bld, first_component),
                           src, num_components);
      break;
   }

   case nir_intrinsic_load_subgroup_size:
      /* This should only happen for fragment shaders because every other case
       * is lowered in NIR so we can optimize on it.

@@ -6949,7 +6874,6 @@ brw_from_nir(brw_shader *s)
   /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */
   brw_from_nir_setup_outputs(ntb);
   brw_from_nir_emit_system_values(ntb);
   ntb.s.last_scratch = align(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;

@@ -463,6 +463,159 @@ brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *nir,
                                      nir_metadata_control_flow, (void *) cd);
}

/* See if comps[0..3] has any non-undef values. */
static bool
slot_defined(nir_scalar *comps)
{
   for (unsigned i = 0; i < 4; i++) {
      if (comps[i].def && !nir_def_is_undef(comps[i].def))
         return true;
   }
   return false;
}

/* Replace any NULL defs in comps[0..(n-1)] with undef */
static void
fill_undefs(nir_scalar *comps, nir_def *undef, unsigned n)
{
   for (unsigned i = 0; i < n; i++) {
      if (!comps[i].def)
         comps[i] = nir_get_scalar(undef, 0);
   }
}

static void
emit_urb_writes(nir_builder *b,
                const struct intel_device_info *devinfo,
                nir_scalar *outputs,
                unsigned num_slots,
                nir_def *offset)
{
   nir_def *undef = nir_undef(b, 1, 32);

   /* Primitive Shading Rate defaults to (1, 1) in half-float */
   if (devinfo->has_coarse_pixel_primitive_and_cb && !outputs[0].def)
      outputs[0] = nir_get_scalar(nir_imm_int(b, 0x3C003C00), 0);

   /* Viewport, Layer, and Point Size default to 0 */
   for (unsigned i = 0; i < 4; i++) {
      if (!outputs[i].def)
         outputs[i] = nir_get_scalar(nir_imm_int(b, 0), 0);
   }

   /* Emit URB writes */
   for (unsigned slot = 0; slot < num_slots; slot++) {
      if (!slot_defined(&outputs[4 * slot]))
         continue;

      const bool vec8 = slot + 1 < num_slots &&
                        slot_defined(&outputs[4 * (slot + 1)]);

      fill_undefs(&outputs[4 * slot], undef, vec8 ? 8 : 4);

      nir_def *val = nir_vec_scalars(b, &outputs[4 * slot], vec8 ? 8 : 4);

      if (devinfo->ver >= 20) {
         nir_def *addr = nir_iadd(b, output_handle(b),
                                  nir_imul_imm(b, offset, 16));
         nir_store_urb_lsc_intel(b, val, addr, .base = 16 * slot);
      } else {
         nir_store_urb_vec4_intel(b, val, output_handle(b), offset,
                                  nir_imm_int(b, vec8 ? 0xff : 0xf),
                                  .base = slot);
      }

      if (vec8)
         slot++;
   }
}

bool
brw_nir_lower_deferred_urb_writes(nir_shader *nir,
                                  const struct intel_device_info *devinfo,
                                  const struct intel_vue_map *vue_map,
                                  unsigned extra_urb_slot_offset,
                                  unsigned gs_vertex_stride)
{
   nir_scalar *outputs = calloc(vue_map->num_slots, 4 * sizeof(nir_scalar));

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         switch (intrin->intrinsic) {
         case nir_intrinsic_store_output:
         case nir_intrinsic_store_per_view_output: {
            nir_src *view_index = nir_get_io_arrayed_index_src(intrin);
            nir_src *offset = nir_get_io_offset_src(intrin);

            const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
            const unsigned slot =
               vue_map->varying_to_slot[sem.location] +
               (view_index ? nir_src_as_uint(*view_index) : 0) +
               nir_src_as_uint(*offset);
            const unsigned c = io_component(intrin, NULL);
            const unsigned mask = nir_intrinsic_write_mask(intrin);
            assert(slot != -1);
            assert(c < 4);

            u_foreach_bit(i, mask) {
               outputs[4 * slot + c + i] =
                  nir_scalar_resolved(intrin->src[0].ssa, i);
            }

            nir_instr_remove(instr);
            break;
         }

         case nir_intrinsic_emit_vertex_with_counter: {
            /* The only purpose of primitives sent to non-zero streams
             * is to be recorded by transform feedback, so if it is disabled,
             * we can discard all geometry bound for those streams.
             */
            if (nir_intrinsic_stream_id(intrin) > 0 &&
                !nir->info.has_transform_feedback_varyings) {
               nir_instr_remove(instr);
               break;
            }

            nir_builder b = nir_builder_at(nir_before_instr(instr));
            b.constant_fold_alu = true;
            nir_def *offset =
               nir_iadd_imm(&b, nir_imul_imm(&b, intrin->src[0].ssa,
                                             gs_vertex_stride),
                            extra_urb_slot_offset);

            emit_urb_writes(&b, devinfo, outputs, vue_map->num_slots, offset);
            /* After EmitVertex() all outputs are undefined */
            memset(outputs, 0, 4 * vue_map->num_slots * sizeof(nir_scalar));

            /* Leave emit_vertex_with_counter for control data writes */
            break;
         }

         default:
            break;
         }
      }
   }

   if (nir->info.stage != MESA_SHADER_GEOMETRY) {
      nir_builder b = nir_builder_at(nir_after_impl(impl));
      emit_urb_writes(&b, devinfo, outputs, vue_map->num_slots,
                      nir_imm_int(&b, 0));
   }

   free(outputs);

   return nir_progress(true, impl, nir_metadata_control_flow);
}


static bool
lower_task_payload_to_urb(nir_builder *b, nir_intrinsic_instr *io, void *data)
{

@@ -735,60 +888,6 @@ remap_tess_levels(nir_shader *nir,
                                     nir_metadata_control_flow, &cb);
}

/* Replace store_per_view_output to plain store_output, mapping the view index
 * to IO offset. Because we only use per-view outputs for position, the offset
 * pitch is always 1. */
static bool
lower_per_view_outputs(nir_builder *b,
                       nir_intrinsic_instr *intrin,
                       UNUSED void *cb_data)
{
   if (intrin->intrinsic != nir_intrinsic_store_per_view_output &&
       intrin->intrinsic != nir_intrinsic_load_per_view_output)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_src *view_index = nir_get_io_arrayed_index_src(intrin);
   nir_src *offset = nir_get_io_offset_src(intrin);

   nir_def *new_offset = nir_iadd(b, view_index->ssa, offset->ssa);

   nir_intrinsic_instr *new;
   if (intrin->intrinsic == nir_intrinsic_store_per_view_output)
      new = nir_store_output(b, intrin->src[0].ssa, new_offset);
   else {
      nir_def *new_def = nir_load_output(b, intrin->def.num_components,
                                         intrin->def.bit_size, new_offset);
      new = nir_def_as_intrinsic(new_def);
   }

   nir_intrinsic_set_base(new, nir_intrinsic_base(intrin));
   nir_intrinsic_set_range(new, nir_intrinsic_range(intrin));
   nir_intrinsic_set_write_mask(new, nir_intrinsic_write_mask(intrin));
   nir_intrinsic_set_component(new, nir_intrinsic_component(intrin));
   nir_intrinsic_set_src_type(new, nir_intrinsic_src_type(intrin));

   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
   /* the meaning of the offset src is different for brw */
   sem.no_validate = 1;
   nir_intrinsic_set_io_semantics(new, sem);

   if (intrin->intrinsic == nir_intrinsic_load_per_view_output)
      nir_def_rewrite_uses(&intrin->def, &new->def);
   nir_instr_remove(&intrin->instr);

   return true;
}

static bool
brw_nir_lower_per_view_outputs(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir, lower_per_view_outputs,
                                     nir_metadata_control_flow,
                                     NULL);
}

static bool
brw_nir_should_vectorize_urb(unsigned align_mul, unsigned align_offset,
                             unsigned bit_size,

@@ -1306,13 +1405,8 @@ brw_nir_lower_fs_inputs(nir_shader *nir,
void
brw_nir_lower_vue_outputs(nir_shader *nir)
{
   nir_foreach_shader_out_variable(var, nir) {
      var->data.driver_location = var->data.location;
   }

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4,
            nir_lower_io_lower_64bit_to_32);
   NIR_PASS(_, nir, brw_nir_lower_per_view_outputs);
}

void

@@ -219,6 +219,11 @@ struct brw_lower_urb_cb_data {
bool brw_nir_lower_inputs_to_urb_intrinsics(nir_shader *, const struct brw_lower_urb_cb_data *);

bool brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *, const struct brw_lower_urb_cb_data *);
bool brw_nir_lower_deferred_urb_writes(nir_shader *nir,
                                       const struct intel_device_info *devinfo,
                                       const struct intel_vue_map *vue_map,
                                       unsigned extra_urb_slot_offset,
                                       unsigned gs_vertex_stride);

void brw_nir_opt_vectorize_urb(struct brw_pass_tracker *pt);

@@ -19,257 +19,6 @@
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"

void
brw_shader::emit_urb_writes(const brw_reg &gs_vertex_count)
{
   int slot, urb_offset, length;
   int starting_urb_offset = 0;
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(this->prog_data);
   const struct intel_vue_map *vue_map = &vue_prog_data->vue_map;
   bool flush;
   brw_reg sources[8];
   brw_reg urb_handle;

   switch (stage) {
   case MESA_SHADER_VERTEX:
      urb_handle = vs_payload().urb_handles;
      break;
   case MESA_SHADER_TESS_EVAL:
      urb_handle = tes_payload().urb_output;
      break;
   case MESA_SHADER_GEOMETRY:
      urb_handle = gs_payload().urb_handles;
      break;
   default:
      UNREACHABLE("invalid stage");
   }

   const brw_builder bld = brw_builder(this);

   brw_reg per_slot_offsets;

   if (stage == MESA_SHADER_GEOMETRY) {
      const struct brw_gs_prog_data *gs_prog_data =
         brw_gs_prog_data(this->prog_data);

      /* We need to increment the Global Offset to skip over the control data
       * header and the extra "Vertex Count" field (1 HWord) at the beginning
       * of the VUE. We're counting in OWords, so the units are doubled.
       */
      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
      if (gs_prog_data->static_vertex_count == -1)
         starting_urb_offset += 2;

      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
      const int output_vertex_size_owords =
         gs_prog_data->output_vertex_size_hwords * 2;

      /* On Xe2+ platform, LSC can operate on the Dword data element with byte
       * offset granularity, so convert per slot offset in bytes since it's in
       * Owords (16-bytes) unit else keep per slot offset in oword unit for
       * previous platforms.
       */
      const int output_vertex_size = devinfo->ver >= 20 ?
                                     output_vertex_size_owords * 16 :
                                     output_vertex_size_owords;
      if (gs_vertex_count.file == IMM) {
         per_slot_offsets = brw_imm_ud(output_vertex_size *
                                       gs_vertex_count.ud);
      } else {
         per_slot_offsets = bld.vgrf(BRW_TYPE_UD);
         bld.MUL(per_slot_offsets, gs_vertex_count,
                 brw_imm_ud(output_vertex_size));
      }
   }

   length = 0;
   urb_offset = starting_urb_offset;
   flush = false;

   /* SSO shaders can have VUE slots allocated which are never actually
    * written to, so ignore them when looking for the last (written) slot.
    */
   int last_slot = vue_map->num_slots - 1;
   while (last_slot > 0 &&
          (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
           outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
      last_slot--;
   }

   bool urb_written = false;
   for (slot = 0; slot < vue_map->num_slots; slot++) {
      int varying = vue_map->slot_to_varying[slot];
      switch (varying) {
      case VARYING_SLOT_PSIZ: {
         /* The point size varying slot is the vue header and is always in the
          * vue map. If anything in the header is going to be read back by HW,
          * we need to initialize it, in particular the viewport & layer
          * values.
          *
          * SKL PRMs, Volume 7: 3D-Media-GPGPU, Vertex URB Entry (VUE)
          * Formats:
          *
          *    "VUEs are written in two ways:
          *
          *       - At the top of the 3D Geometry pipeline, the VF's
          *         InputAssembly function creates VUEs and initializes them
          *         from data extracted from Vertex Buffers as well as
          *         internally generated data.
          *
          *       - VS, GS, HS and DS threads can compute, format, and write
          *         new VUEs as thread output."
          *
          *    "Software must ensure that any VUEs subject to readback by the
          *     3D pipeline start with a valid Vertex Header. This extends to
          *     all VUEs with the following exceptions:
          *
          *       - If the VS function is enabled, the VF-written VUEs are not
          *         required to have Vertex Headers, as the VS-incoming
          *         vertices are guaranteed to be consumed by the VS (i.e.,
          *         the VS thread is responsible for overwriting the input
          *         vertex data).
          *
          *       - If the GS FF is enabled, neither VF-written VUEs nor VS
          *         thread-generated VUEs are required to have Vertex Headers,
          *         as the GS will consume all incoming vertices.
          *
          *       - If Rendering is disabled, VertexHeaders are not required
          *         anywhere."
          */
         brw_reg zero =
            retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
         bld.MOV(zero, brw_imm_ud(0u));

         if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE &&
             this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) {
            sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE];
         } else if (devinfo->has_coarse_pixel_primitive_and_cb) {
            uint32_t one_fp16 = 0x3C00;
            brw_reg one_by_one_fp16 =
               retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
            bld.MOV(one_by_one_fp16, brw_imm_ud((one_fp16 << 16) | one_fp16));
            sources[length++] = one_by_one_fp16;
         } else {
            sources[length++] = zero;
         }

         if (vue_map->slots_valid & VARYING_BIT_LAYER)
            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
         else
            sources[length++] = zero;
         break;
      }
      case VARYING_SLOT_EDGE:
         UNREACHABLE("unexpected scalar vs output");
         break;

      default:
         /* gl_Position is always in the vue map, but isn't always written by
          * the shader. Other varyings (clip distances) get added to the vue
          * map but don't always get written. In those cases, the
          * corresponding this->output[] slot will be invalid we and can skip
          * the urb write for the varying. If we've already queued up a vue
          * slot for writing we flush a mlen 5 urb write, otherwise we just
          * advance the urb_offset.
          */
         if (varying == BRW_VARYING_SLOT_PAD ||
             this->outputs[varying].file == BAD_FILE) {
            if (length > 0)
               flush = true;
            else
               urb_offset++;
            break;
         }

         int slot_offset = 0;

         /* When using Primitive Replication, there may be multiple slots
          * assigned to POS.
          */
         if (varying == VARYING_SLOT_POS)
            slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];

         for (unsigned i = 0; i < 4; i++) {
            sources[length++] = offset(this->outputs[varying], bld,
                                       i + (slot_offset * 4));
         }
         break;
      }

      const brw_builder abld = bld.annotate("URB write");

      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
       * the last slot or if we need to flush (see BAD_FILE varying case
       * above), emit a URB write send now to flush out the data.
       */
      if (length == 8 || (length > 0 && slot == last_slot))
         flush = true;
      if (flush) {
         brw_reg srcs[URB_LOGICAL_NUM_SRCS];

         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
         srcs[URB_LOGICAL_SRC_DATA] =
            retype(brw_allocate_vgrf_units(*this, (dispatch_width / 8) * length), BRW_TYPE_F);
         abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);

         brw_urb_inst *urb = abld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
         urb->components = length;
         urb->offset = urb_offset * (devinfo->ver >= 20 ? 16 : 1);
         urb_offset = starting_urb_offset + slot + 1;
         length = 0;
         flush = false;
         urb_written = true;
      }
   }

   /* If we don't have any valid slots to write, just do a minimal urb write
    * send to terminate the shader. This includes 1 slot of undefined data,
    * because it's invalid to write 0 data:
    *
    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
    * Write Data Payload:
    *
    *    "The write data payload can be between 1 and 8 message phases long."
    */
   if (!urb_written) {
      /* For GS, just turn EmitVertex() into a no-op. We don't want it to
       * end the thread, and emit_gs_thread_end() already emits a SEND with
       * EOT at the end of the program for us.
       */
      if (stage == MESA_SHADER_GEOMETRY)
         return;

      brw_reg uniform_urb_handle =
         retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
      brw_reg payload =
         retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);

      bld.exec_all().MOV(uniform_urb_handle, urb_handle);

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
      srcs[URB_LOGICAL_SRC_DATA] = payload;

      brw_urb_inst *urb = bld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
      urb->offset = devinfo->ver >= 20 ? 16 : 1;
      urb->components = 1;
      return;
   }
}

void
brw_shader::emit_tes_terminate()
{

@@ -88,7 +88,6 @@ public:
   void fail(const char *msg, ...);
   void limit_dispatch_width(unsigned n, const char *msg);

   void emit_urb_writes(const brw_reg &gs_vertex_count = brw_reg());
   void emit_gs_control_data_bits(const brw_reg &vertex_count);
   brw_reg gs_urb_channel_mask(const brw_reg &dword_index);
   brw_reg gs_urb_per_slot_dword_index(const brw_reg &vertex_count);