intel/compiler/mesh: compactify MUE layout

Instead of using 4 dwords for each output slot, use only the amount
of memory actually needed by each variable.

There are some complications with this "obvious" idea:
- flat and non-flat variables can't be merged into the same vec4 slot,
  because the flat-inputs mask works at vec4 granularity
- multi-slot variables can have different layouts: float[N] requires
  N 1-dword slots, but i64vec3 requires one fully occupied 4-dword slot
  followed by a 2-dword slot (see the sketch after this list)
- some output variables occur both in split (single-channel/component)
  and combined variants
- crossing a vec4 boundary requires generating more writes, so such
  crossings are avoided where possible
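
As a rough illustration (this is not the driver code itself; the float[3]
and i64vec3 figures are just examples of the rule above), the per-slot
dword assignment can be sketched as:

  /* Arrays get a fixed per-slot size; other types fill 4 dwords per slot
   * until only the remainder is left.
   */
  #include <stdio.h>

  static void assign_slots(const char *name, unsigned dwords,
                           unsigned slots, int is_array)
  {
     unsigned remaining = dwords;
     printf("%s (%u dwords, %u slots):\n", name, dwords, slots);
     for (unsigned slot = 0; slot < slots; slot++) {
        unsigned sz = is_array ? dwords / slots
                               : (remaining < 4 ? remaining : 4);
        printf("  slot %u -> %u dword(s)\n", slot, sz);
        remaining -= sz;
     }
  }

  int main(void)
  {
     assign_slots("float[3]", 3, 3, 1); /* three 1-dword slots */
     assign_slots("i64vec3",  6, 2, 0); /* 4-dword slot, then 2-dword slot */
     return 0;
  }

float[3] ends up as three 1-dword slots, while i64vec3 becomes one full
4-dword slot followed by a 2-dword slot.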

This patch fixes some issues with arrays in per-vertex and per-primitive
data (func.mesh.ext.outputs.*.indirect_array.q0 in crucible), and by
reducing the size of a single MUE it allows more threads to be spawned
at the same time.
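
To see why a smaller MUE helps occupancy, here is a back-of-the-envelope
sketch; the URB budget and MUE sizes below are made-up numbers, not
hardware values:

  #include <stdio.h>

  int main(void)
  {
     const unsigned urb_budget_dw = 16384; /* hypothetical URB budget */
     const unsigned mue_old_dw = 1024;     /* 4 dwords per output slot */
     const unsigned mue_compact_dw = 640;  /* only the dwords actually used */

     /* each concurrently running mesh workgroup needs its own entry */
     printf("old layout:     %u entries\n", urb_budget_dw / mue_old_dw);
     printf("compact layout: %u entries\n", urb_budget_dw / mue_compact_dw);
     return 0;
  }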

Note: this patch doesn't improve vk_meshlet_cadscene performance, because
its default layout is already close to optimal.

Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20407>
Marcin Ślusarz, 2022-12-21 15:40:07 +01:00 (committed by Marge Bot)
parent fb765a65c8
commit a252123363
8 changed files with 478 additions and 118 deletions

@@ -1022,6 +1022,7 @@ struct brw_wm_prog_data {
     * For varying slots that are not used by the FS, the value is -1.
     */
    int urb_setup[VARYING_SLOT_MAX];
+   int urb_setup_channel[VARYING_SLOT_MAX];
 
    /**
     * Cache structure into the urb_setup array above that contains the
@@ -1625,6 +1626,7 @@ struct brw_tue_map {
 
 struct brw_mue_map {
    int32_t start_dw[VARYING_SLOT_MAX];
+   uint32_t len_dw[VARYING_SLOT_MAX];
 
    uint32_t per_primitive_indices_dw;
 
    uint32_t size_dw;

@@ -1764,10 +1764,10 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
                     const nir_shader *nir,
                     const struct brw_mue_map *mue_map)
 {
-   memset(prog_data->urb_setup, -1,
-          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
+   memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
+   memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
 
-   int urb_next = 0;
+   int urb_next = 0; /* in vec4s */
 
    const uint64_t inputs_read =
       nir->info.inputs_read & ~nir->info.per_primitive_inputs;
@@ -1782,6 +1782,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
       uint64_t per_prim_inputs_read =
          nir->info.inputs_read & nir->info.per_primitive_inputs;
 
+      unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
+      unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
+
       /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
        * are always at the beginning, because they come from MUE
       * Primitive Header, not Per-Primitive Attributes.
@@ -1789,8 +1792,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
       const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
                                              VARYING_BIT_LAYER |
                                              VARYING_BIT_PRIMITIVE_SHADING_RATE;
 
-      if (per_prim_inputs_read & primitive_header_bits) {
+      bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
+
+      if (reads_header) {
          /* Primitive Shading Rate, Layer and Viewport live in the same
           * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
           * is dword 2).
@@ -1804,23 +1808,30 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
          if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
            prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
 
+         /* 3DSTATE_SBE_MESH.Per[Primitive|Vertex]URBEntryOutputRead[Offset|Length]
+          * are in full GRFs (8 dwords) and MUE Primitive Header is 8 dwords,
+          * so next per-primitive attribute must be placed in slot 2 (each slot
+          * is 4 dwords long).
+          */
+         urb_next = 2;
+
         per_prim_inputs_read &= ~primitive_header_bits;
+      } else {
+         /* If fs doesn't need primitive header, then it won't be made
+          * available through SBE_MESH, so we have to skip them when
+          * calculating offset from start of per-prim data.
+          */
+         per_prim_start_dw += mue_map->per_primitive_header_size_dw;
+         per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
       }
 
-      for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
-         if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
-            prog_data->urb_setup[i] = urb_next++;
-         }
+      u_foreach_bit64(i, per_prim_inputs_read) {
+         int start = mue_map->start_dw[i];
+
+         assert(start >= 0);
+         assert(mue_map->len_dw[i] > 0);
+         assert(unsigned(start) >= per_prim_start_dw);
+
+         unsigned pos_dw = unsigned(start) - per_prim_start_dw;
+
+         prog_data->urb_setup[i] = urb_next + pos_dw / 4;
+         prog_data->urb_setup_channel[i] = pos_dw % 4;
       }
 
-      /* The actual setup attributes later must be aligned to a full GRF. */
-      urb_next = ALIGN(urb_next, 2);
+      urb_next = per_prim_size_dw / 4;
 
       prog_data->num_per_primitive_inputs = urb_next;
    }
@@ -1835,21 +1846,43 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
          unique_fs_attrs &= ~clip_dist_bits;
       }
 
+      unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
+      unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
+
+      /* Per-Vertex header is never available to fragment shader. */
+      per_vertex_start_dw += 8;
+      per_vertex_size_dw -= 8;
+
       /* In Mesh, CLIP_DIST slots are always at the beginning, because
        * they come from MUE Vertex Header, not Per-Vertex Attributes.
        */
       if (inputs_read & clip_dist_bits) {
-         prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
-         prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
+         prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
+         prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
+      } else if (mue_map->per_vertex_header_size_dw > 8) {
+         /* Clip distances are in MUE, but we are not reading them in FS. */
+         per_vertex_start_dw += 8;
+         per_vertex_size_dw -= 8;
       }
 
       /* Per-Vertex attributes are laid out ordered.  Because we always link
        * Mesh and Fragment shaders, the which slots are written and read by
        * each of them will match. */
-      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
-         if (unique_fs_attrs & BITFIELD64_BIT(i))
-            prog_data->urb_setup[i] = urb_next++;
+      u_foreach_bit64(i, unique_fs_attrs) {
+         int start = mue_map->start_dw[i];
+
+         assert(start >= 0);
+         assert(mue_map->len_dw[i] > 0);
+         assert(unsigned(start) >= per_vertex_start_dw);
+
+         unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
+
+         prog_data->urb_setup[i] = urb_next + pos_dw / 4;
+         prog_data->urb_setup_channel[i] = pos_dw % 4;
       }
+
+      urb_next += per_vertex_size_dw / 4;
    } else if (devinfo->ver >= 6) {
       uint64_t vue_header_bits =
          VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;

@@ -438,7 +438,7 @@ public:
    fs_reg get_timestamp(const brw::fs_builder &bld);
 
    fs_reg interp_reg(int location, int channel);
-   fs_reg per_primitive_reg(int location);
+   fs_reg per_primitive_reg(int location, unsigned comp);
 
    virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
    virtual void dump_instructions_to_file(FILE *file) const;

@@ -3489,7 +3489,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
          assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
          for (unsigned int i = 0; i < num_components; i++) {
             bld.MOV(offset(dest, bld, i),
-                    retype(component(per_primitive_reg(base), comp + i), dest.type));
+                    retype(per_primitive_reg(base, comp + i), dest.type));
          }
       } else {
          for (unsigned int i = 0; i < num_components; i++) {

@@ -126,6 +126,7 @@ fs_visitor::interp_reg(int location, int channel)
    assert(prog_data->urb_setup[location] >= 0);
 
    unsigned nr = prog_data->urb_setup[location];
+   channel += prog_data->urb_setup_channel[location];
 
    /* Adjust so we start counting from the first per_vertex input. */
    assert(nr >= prog_data->num_per_primitive_inputs);
@@ -142,19 +143,22 @@ fs_visitor::interp_reg(int location, int channel)
  * generate_code() time.
  */
 fs_reg
-fs_visitor::per_primitive_reg(int location)
+fs_visitor::per_primitive_reg(int location, unsigned comp)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);
 
    const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
 
+   comp += prog_data->urb_setup_channel[location];
+
    assert(prog_data->urb_setup[location] >= 0);
-   const unsigned regnr = prog_data->urb_setup[location];
+   const unsigned regnr = prog_data->urb_setup[location] + comp / 4;
 
    assert(regnr < prog_data->num_per_primitive_inputs);
 
-   return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
+   return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp % 4);
 }
 
 /** Emits the interpolation for the varying inputs. */

@@ -21,6 +21,8 @@
  * IN THE SOFTWARE.
  */
 
+#include <list>
+#include <vector>
 #include "brw_compiler.h"
 #include "brw_fs.h"
 #include "brw_nir.h"
@@ -414,6 +416,224 @@ brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
                                 nir_address_format_32bit_offset);
 }
 
+/* Attribute types. Flat attributes have to be a separate class because
+ * flat and interpolated attributes can't share the same vec4 slot
+ * (see 3DSTATE_SBE.ConstantInterpolationEnable).
+ */
+enum {
+   PRIM, /* per primitive */
+   VERT, /* per vertex interpolated */
+   VERT_FLAT, /* per vertex flat */
+};
+
+struct attr_desc {
+   int location;
+   const struct glsl_type *type;
+   unsigned dwords;
+   unsigned slots;
+};
+
+struct attr_type_info {
+   /* order of attributes, negative values are holes */
+   std::list<struct attr_desc> *order;
+
+   /* attributes after which there's hole of size equal to array index */
+   std::list<int> holes[4];
+};
+
+static void
+brw_mue_assign_position(const struct attr_desc *attr,
+                        struct brw_mue_map *map,
+                        unsigned start_dw)
+{
+   bool is_array = glsl_type_is_array(attr->type);
+   int location = attr->location;
+   unsigned remaining = attr->dwords;
+
+   for (unsigned slot = 0; slot < attr->slots; ++slot) {
+      map->start_dw[location + slot] = start_dw;
+
+      unsigned sz;
+
+      if (is_array) {
+         assert(attr->dwords % attr->slots == 0);
+         sz = attr->dwords / attr->slots;
+      } else {
+         sz = MIN2(remaining, 4);
+      }
+
+      map->len_dw[location + slot] = sz;
+
+      start_dw += sz;
+      remaining -= sz;
+   }
+}
+
+static nir_variable *
+brw_nir_find_complete_variable_with_location(nir_shader *shader,
+                                             nir_variable_mode mode,
+                                             int location)
+{
+   nir_variable *best_var = NULL;
+   unsigned last_size = 0;
+
+   nir_foreach_variable_with_modes(var, shader, mode) {
+      if (var->data.location != location)
+         continue;
+
+      unsigned new_size = glsl_count_dword_slots(var->type, false);
+      if (new_size > last_size) {
+         best_var = var;
+         last_size = new_size;
+      }
+   }
+
+   return best_var;
+}
+
+/* Finds order of outputs which require minimum size, without splitting
+ * of URB read/write messages (which operate on vec4-aligned memory).
+ */
+static void
+brw_compute_mue_layout(std::list<struct attr_desc> *orders,
+                       uint64_t outputs_written,
+                       struct nir_shader *nir)
+{
+   const struct shader_info *info = &nir->info;
+
+   struct attr_type_info data[3];
+
+   bool no_compact = !debug_get_bool_option("BRW_MESH_COMPACTION", true);
+
+   for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
+      data[i].order = &orders[i];
+
+   u_foreach_bit64(location, outputs_written) {
+      if ((BITFIELD64_BIT(location) & outputs_written) == 0)
+         continue;
+
+      /* At this point there are both complete and split variables as
+       * outputs. We need the complete variable to compute the required
+       * size.
+       */
+      nir_variable *var =
+            brw_nir_find_complete_variable_with_location(nir,
+                                                         nir_var_shader_out,
+                                                         location);
+
+      struct attr_desc d;
+      d.location = location;
+      d.type = brw_nir_get_var_type(nir, var);
+      d.dwords = glsl_count_dword_slots(d.type, false);
+      d.slots = glsl_count_attribute_slots(d.type, false);
+
+      struct attr_type_info *type_data;
+
+      if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
+         type_data = &data[PRIM];
+      else if (var->data.interpolation == INTERP_MODE_FLAT)
+         type_data = &data[VERT_FLAT];
+      else
+         type_data = &data[VERT];
+
+      std::list<struct attr_desc> *order = type_data->order;
+      std::list<int> *holes = type_data->holes;
+
+      outputs_written &= ~BITFIELD64_RANGE(location, d.slots);
+
+      int mod = d.dwords % 4;
+      if (mod == 0) {
+         order->push_back(d);
+         continue;
+      }
+
+      struct attr_desc h;
+      h.location = -1;
+      h.type = NULL;
+      h.dwords = 4 - mod;
+      h.slots = 0;
+
+      if (no_compact) {
+         order->push_back(d);
+         order->push_back(h);
+         continue;
+      }
+
+      if (d.dwords > 4) {
+         order->push_back(d);
+         order->push_back(h);
+         holes[h.dwords].push_back(location);
+         continue;
+      }
+
+      assert(d.dwords < 4);
+
+      unsigned found = 0;
+      /* try to find the smallest hole big enough to hold this attribute */
+      for (unsigned sz = d.dwords; sz < 4; sz++) {
+         if (!holes[sz].empty()) {
+            found = sz;
+            break;
+         }
+      }
+
+      /* append at the end if not found */
+      if (found == 0) {
+         order->push_back(d);
+         order->push_back(h);
+         holes[h.dwords].push_back(location);
+         continue;
+      }
+
+      assert(found < 4);
+      assert(!holes[found].empty());
+      int after_loc = holes[found].back();
+      holes[found].pop_back();
+
+      bool inserted_back = false;
+
+      for (auto it = order->begin(); it != order->end(); ++it) {
+         if ((*it).location != after_loc)
+            continue;
+
+         ++it;
+         /* must be a hole */
+         assert((*it).location < 0);
+         /* and it must be big enough */
+         assert(d.dwords <= (*it).dwords);
+
+         if (d.dwords == (*it).dwords) {
+            /* exact size, just replace */
+            *it = d;
+         } else {
+            /* inexact size, shrink hole */
+            (*it).dwords -= d.dwords;
+
+            /* and insert new attribute before it */
+            order->insert(it, d);
+
+            /* Insert shrunk hole in a spot so that the order of attributes
+             * is preserved.
+             */
+            std::list<int> &hole_list = holes[(*it).dwords];
+            std::list<int>::iterator insert_before = hole_list.end();
+            for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
+               if ((*it2) >= (int)location) {
+                  insert_before = it2;
+                  break;
+               }
+            }
+            hole_list.insert(insert_before, location);
+         }
+
+         inserted_back = true;
+         break;
+      }
+
+      assert(inserted_back);
+   }
+}
+
 /* Mesh URB Entry consists of an initial section
  *
  *  - Primitive Count
@@ -443,8 +663,8 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
 {
    memset(map, 0, sizeof(*map));
 
-   for (int i = 0; i < VARYING_SLOT_MAX; i++)
-      map->start_dw[i] = -1;
+   memset(&map->start_dw[0], -1, sizeof(map->start_dw));
+   memset(&map->len_dw[0], 0, sizeof(map->len_dw));
 
    unsigned vertices_per_primitive =
       num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type);
@@ -454,16 +674,6 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
 
    uint64_t outputs_written = nir->info.outputs_written;
 
-   /* Assign initial section. */
-   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
-      map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
-      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
-   }
-
-   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
-      map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
-      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
-   }
-
    /* One dword for primitives count then K extra dwords for each primitive. */
    switch (index_format) {
    case BRW_INDEX_FORMAT_U32:
@@ -479,86 +689,157 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
    map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
                                        map->max_primitives + 1, 8);
 
-   /* TODO(mesh): Multiview. */
-   map->per_primitive_header_size_dw =
-         (nir->info.outputs_written & (BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
-                                       BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE) |
-                                       BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
-                                       BITFIELD64_BIT(VARYING_SLOT_LAYER))) ? 8 : 0;
-
-   map->per_primitive_data_size_dw = 0;
-   u_foreach_bit64(location, outputs_written & nir->info.per_primitive_outputs) {
-      assert(map->start_dw[location] == -1);
-
-      unsigned start;
-      switch (location) {
-      case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
-         start = map->per_primitive_start_dw + 0;
-         break;
-      case VARYING_SLOT_LAYER:
-         start = map->per_primitive_start_dw + 1; /* RTAIndex */
-         break;
-      case VARYING_SLOT_VIEWPORT:
-         start = map->per_primitive_start_dw + 2;
-         break;
-      case VARYING_SLOT_CULL_PRIMITIVE:
-         start = map->per_primitive_start_dw + 3;
-         break;
-      default:
-         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
-                location >= VARYING_SLOT_VAR0);
-         start = map->per_primitive_start_dw +
-                 map->per_primitive_header_size_dw +
-                 map->per_primitive_data_size_dw;
-         map->per_primitive_data_size_dw += 4;
-         break;
-      }
-      map->start_dw[location] = start;
+   /* Assign initial section. */
+   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
+      map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
+      map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
+      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
+   }
+
+   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
+      map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
+      map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
+            map->per_primitive_indices_dw * map->max_primitives;
+      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
+   }
+
+   const uint64_t per_primitive_header_bits =
+         BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
+         BITFIELD64_BIT(VARYING_SLOT_LAYER) |
+         BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
+         BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
+
+   const uint64_t per_vertex_header_bits =
+         BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
+         BITFIELD64_BIT(VARYING_SLOT_POS) |
+         BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
+         BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
+
+   std::list<struct attr_desc> orders[3];
+   uint64_t regular_outputs = outputs_written &
+         ~(per_primitive_header_bits | per_vertex_header_bits);
+   brw_compute_mue_layout(orders, regular_outputs, nir);
+
+   if (outputs_written & per_primitive_header_bits) {
+      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
+         map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
+               map->per_primitive_start_dw + 0;
+         map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
+      }
+
+      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) {
+         map->start_dw[VARYING_SLOT_LAYER] =
+               map->per_primitive_start_dw + 1; /* RTAIndex */
+         map->len_dw[VARYING_SLOT_LAYER] = 1;
+      }
+
+      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) {
+         map->start_dw[VARYING_SLOT_VIEWPORT] =
+               map->per_primitive_start_dw + 2;
+         map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
+      }
+
+      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
+         map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
+               map->per_primitive_start_dw + 3;
+         map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
+      }
+
+      map->per_primitive_header_size_dw = 8;
+      outputs_written &= ~per_primitive_header_bits;
+   } else {
+      map->per_primitive_header_size_dw = 0;
+   }
+
+   map->per_primitive_data_size_dw = 0;
+
+   unsigned start_dw = map->per_primitive_start_dw +
+                       map->per_primitive_header_size_dw;
+   for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
+      int location = (*it).location;
+      if (location < 0) {
+         start_dw += (*it).dwords;
+         map->per_primitive_data_size_dw += (*it).dwords;
+         continue;
+      }
+
+      assert(map->start_dw[location] == -1);
+      assert(location == VARYING_SLOT_PRIMITIVE_ID ||
+             location >= VARYING_SLOT_VAR0);
+
+      brw_mue_assign_position(&*it, map, start_dw);
+
+      start_dw += (*it).dwords;
+      map->per_primitive_data_size_dw += (*it).dwords;
+
+      outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
    }
 
    map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
                                        map->per_primitive_data_size_dw, 8);
 
    map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
-                                    map->per_primitive_pitch_dw * map->max_primitives, 8);
+                                    map->per_primitive_pitch_dw *
+                                    map->max_primitives, 8);
 
    /* TODO(mesh): Multiview. */
    unsigned fixed_header_size = 8;
    map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
                                           nir->info.clip_distance_array_size +
                                           nir->info.cull_distance_array_size, 8);
-   map->per_vertex_data_size_dw = 0;
-   u_foreach_bit64(location, outputs_written & ~nir->info.per_primitive_outputs) {
-      assert(map->start_dw[location] == -1);
-
-      unsigned start;
-      switch (location) {
-      case VARYING_SLOT_PSIZ:
-         start = map->per_vertex_start_dw + 3;
-         break;
-      case VARYING_SLOT_POS:
-         start = map->per_vertex_start_dw + 4;
-         break;
-      case VARYING_SLOT_CLIP_DIST0:
-         start = map->per_vertex_start_dw + fixed_header_size + 0;
-         break;
-      case VARYING_SLOT_CLIP_DIST1:
-         start = map->per_vertex_start_dw + fixed_header_size + 4;
-         break;
-      case VARYING_SLOT_CULL_DIST0:
-      case VARYING_SLOT_CULL_DIST1:
-         unreachable("cull distances should be lowered earlier");
-         break;
-      default:
-         assert(location >= VARYING_SLOT_VAR0);
-         start = map->per_vertex_start_dw +
-                 map->per_vertex_header_size_dw +
-                 map->per_vertex_data_size_dw;
-         map->per_vertex_data_size_dw += 4;
-         break;
-      }
-      map->start_dw[location] = start;
+
+   if (outputs_written & per_vertex_header_bits) {
+      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
+         map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
+         map->len_dw[VARYING_SLOT_PSIZ] = 1;
+      }
+
+      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
+         map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
+         map->len_dw[VARYING_SLOT_POS] = 4;
+      }
+
+      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
+         map->start_dw[VARYING_SLOT_CLIP_DIST0] =
+               map->per_vertex_start_dw + fixed_header_size + 0;
+         map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
+      }
+
+      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
+         map->start_dw[VARYING_SLOT_CLIP_DIST1] =
+               map->per_vertex_start_dw + fixed_header_size + 4;
+         map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
+      }
+
+      outputs_written &= ~per_vertex_header_bits;
+   }
+
+   /* cull distances should be lowered earlier */
+   assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
+   assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
+
+   map->per_vertex_data_size_dw = 0;
+
+   start_dw = map->per_vertex_start_dw +
+              map->per_vertex_header_size_dw;
+   for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
+      for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
+         int location = (*it).location;
+         if (location < 0) {
+            start_dw += (*it).dwords;
+            map->per_vertex_data_size_dw += (*it).dwords;
+            continue;
+         }
+
+         assert(map->start_dw[location] == -1);
+         assert(location >= VARYING_SLOT_VAR0);
+
+         brw_mue_assign_position(&*it, map, start_dw);
+
+         start_dw += (*it).dwords;
+         map->per_vertex_data_size_dw += (*it).dwords;
+
+         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+      }
    }
 
    map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
@@ -571,14 +852,18 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
 }
 
 static void
-brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
+brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
 {
    fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
            map->size_dw, map->max_primitives, map->max_vertices);
-   fprintf(fp, "  %4d: VARYING_SLOT_PRIMITIVE_COUNT\n",
-           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT]);
-   fprintf(fp, "  %4d: VARYING_SLOT_PRIMITIVE_INDICES\n",
-           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES]);
+   fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
+           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
+           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
+           map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
+   fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
+           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
+           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
+           map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
 
    fprintf(fp, "  ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
            map->per_primitive_start_dw,
@@ -589,13 +874,20 @@ brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
    for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
       if (map->start_dw[i] < 0)
          continue;
+
       const unsigned offset = map->start_dw[i];
-      if (offset >= map->per_primitive_start_dw &&
-          offset < map->per_primitive_start_dw + map->per_primitive_pitch_dw) {
-         fprintf(fp, "  %4d: %s\n", offset,
-                 gl_varying_slot_name_for_stage((gl_varying_slot)i,
-                                                MESA_SHADER_MESH));
-      }
+      const unsigned len = map->len_dw[i];
+
+      if (offset < map->per_primitive_start_dw ||
+          offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
+         continue;
+
+      const char *name =
+            gl_varying_slot_name_for_stage((gl_varying_slot)i,
+                                           MESA_SHADER_MESH);
+
+      fprintf(fp, "  <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
+              name, i);
    }
 
    fprintf(fp, "  ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
@@ -607,13 +899,24 @@ brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
    for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
       if (map->start_dw[i] < 0)
          continue;
+
       const unsigned offset = map->start_dw[i];
-      if (offset >= map->per_vertex_start_dw &&
-          offset < map->per_vertex_start_dw + map->per_vertex_pitch_dw) {
-         fprintf(fp, "  %4d: %s\n", offset,
-                 gl_varying_slot_name_for_stage((gl_varying_slot)i,
-                                                MESA_SHADER_MESH));
-      }
+      const unsigned len = map->len_dw[i];
+
+      if (offset < map->per_vertex_start_dw ||
+          offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
+         continue;
+
+      nir_variable *var =
+            nir_find_variable_with_location(nir, nir_var_shader_out, i);
+      bool flat = var->data.interpolation == INTERP_MODE_FLAT;
+
+      const char *name =
+            gl_varying_slot_name_for_stage((gl_varying_slot)i,
+                                           MESA_SHADER_MESH);
+
+      fprintf(fp, "  <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
+              name, i, flat ? " (flat)" : "");
    }
 
    fprintf(fp, "\n");
@@ -1070,7 +1373,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
          brw_print_tue_map(stderr, params->tue_map);
       }
       fprintf(stderr, "Mesh Output ");
-      brw_print_mue_map(stderr, &prog_data->map);
+      brw_print_mue_map(stderr, &prog_data->map, nir);
    }
 
    fs_generator g(compiler, &params->base, &prog_data->base.base,

@@ -2085,6 +2085,21 @@ brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
    return sysval;
 }
 
+const struct glsl_type *
+brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
+{
+   const struct glsl_type *type = var->interface_type;
+   if (!type) {
+      type = var->type;
+      if (nir_is_arrayed_io(var, nir->info.stage) || var->data.per_view) {
+         assert(glsl_type_is_array(type));
+         type = glsl_get_array_element(type);
+      }
+   }
+
+   return type;
+}
+
 bool
 brw_nir_pulls_at_sample(nir_shader *shader)
 {

@@ -283,6 +283,9 @@ nir_ssa_def *brw_nir_load_global_const(nir_builder *b,
                                        nir_ssa_def *base_addr,
                                        unsigned off);
 
+const struct glsl_type *brw_nir_get_var_type(const struct nir_shader *nir,
+                                             nir_variable *var);
+
 #ifdef __cplusplus
 }
 #endif