/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <list>
#include <vector>
#include "brw_compiler.h"
#include "brw_fs.h"
#include "brw_nir.h"
#include "brw_private.h"
#include "compiler/nir/nir_builder.h"
#include "dev/intel_debug.h"

#include <memory>

using namespace brw;
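
/* Task/Mesh "uniforms" (push constants) come from inline data: 32-bit scalar
 * loads at small, constant, dword-aligned offsets are read straight from the
 * inline data registers, everything else is fetched from a global constant
 * buffer whose 64-bit address lives in the first inline data slot.
 */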
static bool
brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
                                   UNUSED const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_load_uniform;
}

static nir_def *
brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
                                 UNUSED void *data)
{
   assert(instr->type == nir_instr_type_intrinsic);
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   assert(intrin->intrinsic == nir_intrinsic_load_uniform);

   /* Read the first few 32-bit scalars from InlineData. */
   if (nir_src_is_const(intrin->src[0]) &&
       intrin->def.bit_size == 32 &&
       intrin->def.num_components == 1) {
      unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
      unsigned off_dw = off / 4;
      if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) {
         off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW;
         return nir_load_mesh_inline_data_intel(b, 32, off_dw);
      }
   }

   return brw_nir_load_global_const(b, intrin,
                                    nir_load_mesh_inline_data_intel(b, 64, 0), 0);
}

static bool
brw_nir_lower_load_uniforms(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
                                        brw_nir_lower_load_uniforms_impl, NULL);
}
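
/* I/O size callback for nir_lower_io: task/mesh outputs are laid out and
 * addressed in 32-bit dwords rather than vec4 slots.
 */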
static inline int
type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
{
   return glsl_count_dword_slots(type, bindless);
}

/* TODO(mesh): Make this a common function. */
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type)
      ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length,
   *align = comp_size * (length == 3 ? 4 : length);
}
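
/* Lower launch_mesh_workgroups: the first invocation (local index 0) writes
 * the TUE header into the task payload, then the intrinsic is removed.
 */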
static bool
brw_nir_lower_launch_mesh_workgroups_instr(nir_builder *b,
                                           nir_intrinsic_instr *intrin,
                                           void *data)
{
   if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *local_invocation_index = nir_load_local_invocation_index(b);

   /* Make sure that the mesh workgroup size is taken from the first invocation
    * (nir_intrinsic_launch_mesh_workgroups requirement)
    */
   nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
   nir_if *if_stmt = nir_push_if(b, cmp);
   {
      /* TUE header contains 4 words:
       *
       * - Word 0 for Task Count.
       *
       * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
       *   3D dispatch into the 1D dispatch supported by HW.
       */
      nir_def *x = nir_channel(b, intrin->src[0].ssa, 0);
      nir_def *y = nir_channel(b, intrin->src[0].ssa, 1);
      nir_def *z = nir_channel(b, intrin->src[0].ssa, 2);
      nir_def *task_count = nir_imul(b, x, nir_imul(b, y, z));
      nir_def *tue_header = nir_vec4(b, task_count, x, y, z);
      nir_store_task_payload(b, tue_header, nir_imm_int(b, 0));
   }
   nir_pop_if(b, if_stmt);

   nir_instr_remove(&intrin->instr);

   return true;
}

static bool
brw_nir_lower_launch_mesh_workgroups(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir,
                                     brw_nir_lower_launch_mesh_workgroups_instr,
                                     nir_metadata_none,
                                     NULL);
}
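
/* Lay out the Task URB Entry (TUE): lower task outputs to dword-addressed
 * explicit I/O and reserve the first 8 dwords for the TUE header plus
 * padding.
 */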
static void
brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
{
   memset(map, 0, sizeof(*map));

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
            type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);

   /* From bspec: "It is suggested that SW reserve the 16 bytes following the
    * TUE Header, and therefore start the SW-defined data structure at 32B
    * alignment.  This allows the TUE Header to always be written as 32 bytes
    * with 32B alignment, the most optimal write performance case."
    */
   map->per_task_data_start_dw = 8;

   /* Lowering to explicit types will start offsets from task_payload_size, so
    * set it to start after the header.
    */
   nir->info.task_payload_size = map->per_task_data_start_dw * 4;
   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
            nir_var_mem_task_payload, shared_type_info);
   NIR_PASS(_, nir, nir_lower_explicit_io,
            nir_var_mem_task_payload, nir_address_format_32bit_offset);

   map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
}

static void
brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
{
   fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
}
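
/* Convert task payload offsets and bases from the byte units produced by
 * nir_lower_explicit_io to the dword units the backend expects.
 */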
static bool
brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
                                          nir_intrinsic_instr *intrin,
                                          void *data)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_store_task_payload:
   case nir_intrinsic_load_task_payload: {
      nir_src *offset_src = nir_get_io_offset_src(intrin);

      if (nir_src_is_const(*offset_src))
         assert(nir_src_as_uint(*offset_src) % 4 == 0);

      b->cursor = nir_before_instr(&intrin->instr);

      /* Regular I/O uses dwords while explicit I/O used for task payload uses
       * bytes.  Normalize it to dwords.
       *
       * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
       */

      nir_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
      nir_src_rewrite(offset_src, offset);

      unsigned base = nir_intrinsic_base(intrin);
      assert(base % 4 == 0);
      nir_intrinsic_set_base(intrin, base / 4);

      return true;
   }

   default:
      return false;
   }
}

static bool
brw_nir_adjust_task_payload_offsets(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir,
                                     brw_nir_adjust_task_payload_offsets_instr,
                                     nir_metadata_block_index |
                                     nir_metadata_dominance,
                                     NULL);
}

void
brw_nir_adjust_payload(nir_shader *shader, const struct brw_compiler *compiler)
{
   /* Adjustment of task payload offsets must be performed *after* last pass
    * which interprets them as bytes, because it changes their unit.
    */
   bool adjusted = false;
   NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
   if (adjusted) /* clean up the mess created by offset adjustments */
      NIR_PASS(_, shader, nir_opt_constant_folding);
}
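
/* Round the launch_mesh_workgroups payload range up to a multiple of 16
 * bytes so nir_lower_task_shader never sees a partially filled vec4.
 */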
static bool
brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b,
                                           nir_intrinsic_instr *intrin,
                                           void *data)
{
   if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
      return false;

   /* nir_lower_task_shader uses "range" as task payload size. */
   unsigned range = nir_intrinsic_range(intrin);
   /* This will avoid special case in nir_lower_task_shader dealing with
    * not vec4-aligned payload when payload_in_shared workaround is enabled.
    */
   nir_intrinsic_set_range(intrin, ALIGN(range, 16));

   return true;
}

static bool
brw_nir_align_launch_mesh_workgroups(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir,
                                     brw_nir_align_launch_mesh_workgroups_instr,
                                     nir_metadata_block_index |
                                     nir_metadata_dominance,
                                     NULL);
}
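
/* Compile the task shader at SIMD8, SIMD16 and SIMD32 (where allowed) and
 * let the SIMD selection heuristic pick which of the compiled binaries to
 * use.
 */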
const unsigned *
brw_compile_task(const struct brw_compiler *compiler,
                 struct brw_compile_task_params *params)
{
   struct nir_shader *nir = params->base.nir;
   const struct brw_task_prog_key *key = params->key;
   struct brw_task_prog_data *prog_data = params->prog_data;
   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TASK);

   brw_nir_lower_tue_outputs(nir, &prog_data->map);

   NIR_PASS(_, nir, brw_nir_align_launch_mesh_workgroups);

   nir_lower_task_shader_options lower_ts_opt = {
      .payload_to_shared_for_atomics = true,
      .payload_to_shared_for_small_types = true,
      /* The actual payload data starts after the TUE header and padding,
       * so skip those when copying.
       */
      .payload_offset_in_bytes = prog_data->map.per_task_data_start_dw * 4,
   };
   NIR_PASS(_, nir, nir_lower_task_shader, lower_ts_opt);

   NIR_PASS(_, nir, brw_nir_lower_launch_mesh_workgroups);

   prog_data->base.base.stage = MESA_SHADER_TASK;
   prog_data->base.base.total_shared = nir->info.shared_size;
   prog_data->base.base.total_scratch = 0;

   prog_data->base.local_size[0] = nir->info.workgroup_size[0];
   prog_data->base.local_size[1] = nir->info.workgroup_size[1];
   prog_data->base.local_size[2] = nir->info.workgroup_size[2];

   prog_data->uses_drawid =
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);

   brw_simd_selection_state simd_state{
      .mem_ctx = params->base.mem_ctx,
      .devinfo = compiler->devinfo,
      .prog_data = &prog_data->base,
      .required_width = brw_required_dispatch_width(&nir->info),
   };

   std::unique_ptr<fs_visitor> v[3];

   for (unsigned simd = 0; simd < 3; simd++) {
      if (!brw_simd_should_compile(simd_state, simd))
         continue;

      const unsigned dispatch_width = 8 << simd;

      nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
      brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);

      NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
      NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);

      brw_postprocess_nir(shader, compiler, debug_enabled,
                          key->base.robust_flags);

      v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                             &key->base,
                                             &prog_data->base.base,
                                             shader, dispatch_width,
                                             params->base.stats != NULL,
                                             debug_enabled);

      if (prog_data->base.prog_mask) {
         unsigned first = ffs(prog_data->base.prog_mask) - 1;
         v[simd]->import_uniforms(v[first].get());
      }

      const bool allow_spilling = !brw_simd_any_compiled(simd_state);
      if (v[simd]->run_task(allow_spilling))
         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
      else
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
   }

   int selected_simd = brw_simd_select(simd_state);
   if (selected_simd < 0) {
      params->base.error_str =
         ralloc_asprintf(params->base.mem_ctx,
                         "Can't compile shader: %s, %s and %s.\n",
                         simd_state.error[0], simd_state.error[1],
                         simd_state.error[2]);
      return NULL;
   }

   fs_visitor *selected = v[selected_simd].get();
   prog_data->base.prog_mask = 1 << selected_simd;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "Task Output ");
      brw_print_tue_map(stderr, &prog_data->map);
   }

   fs_generator g(compiler, &params->base, &prog_data->base.base,
                  false, MESA_SHADER_TASK);
   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s task shader %s",
                                     nir->info.label ? nir->info.label
                                                     : "unnamed",
                                     nir->info.name));
   }

   g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
                   selected->performance_analysis.require(), params->base.stats);
   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
}
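
/* Mesh-side counterpart of brw_nir_lower_tue_outputs: lower task payload
 * inputs using the TUE layout produced by the task shader, if any.
 */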
static void
brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
{
   if (!map)
      return;

   nir->info.task_payload_size = map->per_task_data_start_dw * 4;

   bool progress = false;

   NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
            nir_var_mem_task_payload, shared_type_info);

   if (progress) {
      /* The types for Task Output and Mesh Input should match, so their sizes
       * should also match.
       */
      assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
   } else {
      /* Mesh doesn't read any input, to make it clearer set the
       * task_payload_size to zero instead of keeping an incomplete size that
       * just includes the header.
       */
      nir->info.task_payload_size = 0;
   }

   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
            nir_address_format_32bit_offset);
}

/* Attribute types. Flat attributes have to be a separate class because
 * flat and interpolated attributes can't share the same vec4 slot
 * (see 3DSTATE_SBE.ConstantInterpolationEnable).
 */
enum {
   PRIM, /* per primitive */
   VERT, /* per vertex interpolated */
   VERT_FLAT, /* per vertex flat */
};

struct attr_desc {
   int location;
   const struct glsl_type *type;
   unsigned dwords;
   unsigned slots;
};

struct attr_type_info {
   /* order of attributes, negative values are holes */
   std::list<struct attr_desc> *order;

   /* attributes after which there's hole of size equal to array index */
   std::list<int> holes[5];
};
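
/* Record where an attribute lives in the MUE: for every varying slot it
 * covers, store the starting dword and the length in dwords.
 */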
static void
brw_mue_assign_position(const struct attr_desc *attr,
                        struct brw_mue_map *map,
                        unsigned start_dw)
{
   bool is_array = glsl_type_is_array(attr->type);
   int location = attr->location;
   unsigned remaining = attr->dwords;

   for (unsigned slot = 0; slot < attr->slots; ++slot) {
      map->start_dw[location + slot] = start_dw;

      unsigned sz;

      if (is_array) {
         assert(attr->dwords % attr->slots == 0);
         sz = attr->dwords / attr->slots;
      } else {
         sz = MIN2(remaining, 4);
      }

      map->len_dw[location + slot] = sz;
      start_dw += sz;
      remaining -= sz;
   }
}
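
/* Variables may have been split by earlier passes, so several variables can
 * share a location; return the largest (complete) one, which is the one
 * needed to compute the attribute size.
 */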
static nir_variable *
brw_nir_find_complete_variable_with_location(nir_shader *shader,
                                             nir_variable_mode mode,
                                             int location)
{
   nir_variable *best_var = NULL;
   unsigned last_size = 0;

   nir_foreach_variable_with_modes(var, shader, mode) {
      if (var->data.location != location)
         continue;

      unsigned new_size = glsl_count_dword_slots(var->type, false);
      if (new_size > last_size) {
         best_var = var;
         last_size = new_size;
      }
   }

   return best_var;
}
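
/* Total size in dwords of all attributes (and holes) in the list. */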
static unsigned
brw_sum_size(const std::list<struct attr_desc> &orders)
{
   unsigned sz = 0;
   for (auto it = orders.cbegin(); it != orders.cend(); ++it)
      sz += (*it).dwords;
   return sz;
}
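
/* MUE space is allocated in 8-dword chunks and those chunks are also what
 * gets passed to the fragment shader.  The Primitive and Vertex Headers
 * leave a few dwords unused, so packing small user attributes into those
 * holes can shrink the MUE (allowing more mesh threads in flight), but it
 * can also grow the amount of data handed to the fragment shader, so it is
 * only worth doing when it actually reduces the totals.
 */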
/* Finds order of outputs which require minimum size, without splitting
 * of URB read/write messages (which operate on vec4-aligned memory).
 */
static void
brw_compute_mue_layout(const struct brw_compiler *compiler,
                       std::list<struct attr_desc> *orders,
                       uint64_t outputs_written,
                       struct nir_shader *nir,
                       bool *pack_prim_data_into_header,
                       bool *pack_vert_data_into_header)
{
   const struct shader_info *info = &nir->info;

   struct attr_type_info data[3];

   if ((compiler->mesh.mue_header_packing & 1) == 0)
      *pack_prim_data_into_header = false;
   if ((compiler->mesh.mue_header_packing & 2) == 0)
      *pack_vert_data_into_header = false;

   for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
      data[i].order = &orders[i];

   /* If packing into header is enabled, add a hole of size 4 and add
    * a virtual location to keep the algorithm happy (it expects holes
    * to be preceded by some location). We'll remove those virtual
    * locations at the end.
    */
   const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
   assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);

   struct attr_desc d;
   d.location = virtual_header_location;
   d.type = NULL;
   d.dwords = 0;
   d.slots = 0;

   struct attr_desc h;
   h.location = -1;
   h.type = NULL;
   h.dwords = 4;
   h.slots = 0;

   if (*pack_prim_data_into_header) {
      orders[PRIM].push_back(d);
      orders[PRIM].push_back(h);
      data[PRIM].holes[4].push_back(virtual_header_location);
   }

   if (*pack_vert_data_into_header) {
      orders[VERT].push_back(d);
      orders[VERT].push_back(h);
      data[VERT].holes[4].push_back(virtual_header_location);
   }

   u_foreach_bit64(location, outputs_written) {
      if ((BITFIELD64_BIT(location) & outputs_written) == 0)
         continue;

      /* At this point there are both complete and split variables as
       * outputs. We need the complete variable to compute the required
       * size.
       */
      nir_variable *var =
         brw_nir_find_complete_variable_with_location(nir,
                                                      nir_var_shader_out,
                                                      location);

      d.location = location;
      d.type = brw_nir_get_var_type(nir, var);
      d.dwords = glsl_count_dword_slots(d.type, false);
      d.slots = glsl_count_attribute_slots(d.type, false);

      struct attr_type_info *type_data;

      if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
         type_data = &data[PRIM];
      else if (var->data.interpolation == INTERP_MODE_FLAT)
         type_data = &data[VERT_FLAT];
      else
         type_data = &data[VERT];

      std::list<struct attr_desc> *order = type_data->order;
      std::list<int> *holes = type_data->holes;

      outputs_written &= ~BITFIELD64_RANGE(location, d.slots);

      /* special case to use hole of size 4 */
      if (d.dwords == 4 && !holes[4].empty()) {
         holes[4].pop_back();

         assert(order->front().location == virtual_header_location);
         order->pop_front();

         assert(order->front().location == -1);
         assert(order->front().dwords == 4);
         order->front() = d;

         continue;
      }

      int mod = d.dwords % 4;
      if (mod == 0) {
         order->push_back(d);
         continue;
      }

      h.location = -1;
      h.type = NULL;
      h.dwords = 4 - mod;
      h.slots = 0;

      if (!compiler->mesh.mue_compaction) {
         order->push_back(d);
         order->push_back(h);
         continue;
      }

      if (d.dwords > 4) {
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);
         continue;
      }

      assert(d.dwords < 4);

      unsigned found = 0;
      /* try to find the smallest hole big enough to hold this attribute */
      for (unsigned sz = d.dwords; sz <= 4; sz++) {
         if (!holes[sz].empty()) {
            found = sz;
            break;
         }
      }

      /* append at the end if not found */
      if (found == 0) {
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);
         continue;
      }
intel/compiler,anv: put some vertex and primitive data in headers
Both per-primitive and per-vertex space is allocated in MUE in 8 dword
chunks and those 8-dword chunks (granularity of
3DSTATE_SBE_MESH.Per[Primitive|Vertex]URBEntryOutputReadLength)
are passed to fragment shaders as inputs (either non-interpolated
for per-primitive and flat vertex attributes or interpolated
for non-flat vertex attributes).
Some attributes have a special meaning and must be placed in separate
8/16-dword slot called Primitive Header or Vertex Header.
Primitive Header contains 4 such attributes (Cull Primitive,
ViewportIndex, RTAIndex, CPS), leaving 4 dwords (the rest of 8-dword
slot) potentially unused.
Vertex Header is similar - it starts with 3 unused dwords, 1 dword for
Point Size (but if we declare that shader doesn't produce Point Size
then we can reuse it), followed by 4 dwords for Position and optionally
8 dwords for clip distances.
This means we have an interesting optimization problem - we can put
some user attributes into holes in Primitive and Vertex Headers, which
may lead to smaller MUE size and potentially more mesh threads running
in parallel, but we have to be careful to use those holes only when
we need it, otherwise we could force HW to pass too much data to
fragment shader.
Example 1:
Let's assume that Primitive Header is enabled and user defined
12 dwords of per-primitive attributes.
Without packing we would consume 8 + ALIGN(12, 8) = 24 dwords of
MUE space and pass ALIGN(12, 8) = 16 dwords to fragment shader.
With packing, we'll consume 4 + 4 + ALIGN(12 - 4, 8) = 16 dwords of
MUE space and pass ALIGN(4, 8) + ALIGN(12 - 4, 8) = 16 dwords to
fragment shader.
16/16 is better than 24/16, so packing makes sense.
Example 2:
Now let's assume that Primitive Header is enabled and user defined
16 dwords of per-primitive attributes.
Without packing we would consume 8 + ALIGN(16, 8) = 24 dwords of
MUE space and pass ALIGN(16, 16) = 16 dwords to fragment shader.
With packing, we'll consume 4 + 4 + ALIGN(16 - 4, 8) = 24 dwords of
MUE space and pass ALIGN(4, 8) + ALIGN(16 - 4, 8) = 24 dwords to
fragment shader.
24/24 is worse than 24/16, so packing doesn't make sense.
This change doesn't affect vk_meshlet_cadscene in default configuration,
but it speeds it up by up to 25% with "-extraattributes N", where
N is some small value divisible by 2 (by default N == 1) and we
are bound by URB size.
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20407>
2022-12-21 15:42:55 +01:00
|
|
|
assert(found <= 4);
|
2022-12-21 15:40:07 +01:00
|
|
|
assert(!holes[found].empty());
|
|
|
|
|
int after_loc = holes[found].back();
|
|
|
|
|
holes[found].pop_back();
|
|
|
|
|
|
|
|
|
|
bool inserted_back = false;
|
|
|
|
|
|
|
|
|
|
for (auto it = order->begin(); it != order->end(); ++it) {
|
|
|
|
|
if ((*it).location != after_loc)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
++it;
|
|
|
|
|
/* must be a hole */
|
|
|
|
|
assert((*it).location < 0);
|
|
|
|
|
/* and it must be big enough */
|
|
|
|
|
assert(d.dwords <= (*it).dwords);
|
|
|
|
|
|
|
|
|
|
if (d.dwords == (*it).dwords) {
|
|
|
|
|
/* exact size, just replace */
|
|
|
|
|
*it = d;
|
|
|
|
|
} else {
|
|
|
|
|
/* inexact size, shrink hole */
|
|
|
|
|
(*it).dwords -= d.dwords;
|
|
|
|
|
/* and insert new attribute before it */
|
|
|
|
|
order->insert(it, d);
|
|
|
|
|
|
|
|
|
|
/* Insert shrunk hole in a spot so that the order of attributes
|
|
|
|
|
* is preserved.
|
|
|
|
|
*/
|
|
|
|
|
std::list<int> &hole_list = holes[(*it).dwords];
|
|
|
|
|
std::list<int>::iterator insert_before = hole_list.end();
|
|
|
|
|
|
|
|
|
|
for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
|
|
|
|
|
if ((*it2) >= (int)location) {
|
|
|
|
|
insert_before = it2;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hole_list.insert(insert_before, location);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inserted_back = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
assert(inserted_back);
|
|
|
|
|
}

   if (*pack_prim_data_into_header) {
      if (orders[PRIM].front().location == virtual_header_location)
         orders[PRIM].pop_front();

      if (!data[PRIM].holes[4].empty()) {
         *pack_prim_data_into_header = false;

         assert(orders[PRIM].front().location == -1);
         assert(orders[PRIM].front().dwords == 4);
         orders[PRIM].pop_front();
      }

      if (*pack_prim_data_into_header) {
         unsigned sz = brw_sum_size(orders[PRIM]);

         if (sz % 8 == 0 || sz % 8 > 4)
            *pack_prim_data_into_header = false;
      }
   }

   if (*pack_vert_data_into_header) {
      if (orders[VERT].front().location == virtual_header_location)
         orders[VERT].pop_front();

      if (!data[VERT].holes[4].empty()) {
         *pack_vert_data_into_header = false;

         assert(orders[VERT].front().location == -1);
         assert(orders[VERT].front().dwords == 4);
         orders[VERT].pop_front();
      }

      if (*pack_vert_data_into_header) {
         unsigned sz = brw_sum_size(orders[VERT]) +
                       brw_sum_size(orders[VERT_FLAT]);

         if (sz % 8 == 0 || sz % 8 > 4)
            *pack_vert_data_into_header = false;
      }
   }

   if (INTEL_DEBUG(DEBUG_MESH)) {
      fprintf(stderr, "MUE attribute order:\n");
      for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
         if (!orders[i].empty())
            fprintf(stderr, "%d: ", i);
         for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
            fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
         }
         if (!orders[i].empty())
            fprintf(stderr, "\n");
      }
   }
}

/* Mesh URB Entry consists of an initial section
 *
 *  - Primitive Count
 *  - Primitive Indices (from 0 to Max-1)
 *  - Padding to 32B if needed
 *
 * optionally followed by a section for per-primitive data,
 * in which each primitive (from 0 to Max-1) gets
 *
 *  - Primitive Header (e.g. ViewportIndex)
 *  - Primitive Custom Attributes
 *
 * then followed by a section for per-vertex data
 *
 *  - Vertex Header (e.g. Position)
 *  - Vertex Custom Attributes
 *
 * Each per-element section has a pitch and a starting offset. All the
 * individual attribute offsets in start_dw are relative to the first entry
 * of the section (i.e. where the Position of the first vertex, or the
 * ViewportIndex of the first primitive, lives). Attributes for other
 * elements are calculated using the pitch.
 */
static void
brw_compute_mue_map(const struct brw_compiler *compiler,
                    struct nir_shader *nir, struct brw_mue_map *map,
                    enum brw_mesh_index_format index_format)
{
   memset(map, 0, sizeof(*map));

   memset(&map->start_dw[0], -1, sizeof(map->start_dw));
   memset(&map->len_dw[0], 0, sizeof(map->len_dw));

   unsigned vertices_per_primitive =
      num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type);

   map->max_primitives = nir->info.mesh.max_primitives_out;
   map->max_vertices = nir->info.mesh.max_vertices_out;

   uint64_t outputs_written = nir->info.outputs_written;

   /* One dword for primitives count then K extra dwords for each primitive. */
   switch (index_format) {
   case BRW_INDEX_FORMAT_U32:
      map->per_primitive_indices_dw = vertices_per_primitive;
      break;
   case BRW_INDEX_FORMAT_U888X:
      map->per_primitive_indices_dw = 1;
      break;
   default:
      unreachable("invalid index format");
   }

   map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
                                       map->max_primitives + 1, 8);
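
   /* For example (illustrative numbers only): with U32 triangle indices and
    * max_primitives = 32 this is ALIGN(3 * 32 + 1, 8) = 104 dwords, while
    * U888X needs only ALIGN(1 * 32 + 1, 8) = 40 dwords before the
    * per-primitive section starts.
    */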

   /* Assign initial section. */
   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
      map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
      map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
   }
   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
      map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
      map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
         map->per_primitive_indices_dw * map->max_primitives;
      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
   }

   const uint64_t per_primitive_header_bits =
      BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
      BITFIELD64_BIT(VARYING_SLOT_LAYER) |
      BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
      BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);

   const uint64_t per_vertex_header_bits =
      BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
      BITFIELD64_BIT(VARYING_SLOT_POS) |
      BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
      BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);

   std::list<struct attr_desc> orders[3];
   uint64_t regular_outputs = outputs_written &
      ~(per_primitive_header_bits | per_vertex_header_bits);

   /* packing into prim header is possible only if prim header is present */
   map->user_data_in_primitive_header =
      (outputs_written & per_primitive_header_bits) != 0;

   /* Packing into vert header is always possible, but we allow it only
    * if full vec4 is available (so point size is not used) and there's
    * nothing between it and normal vertex data (so no clip distances).
    */
   map->user_data_in_vertex_header =
      (outputs_written & per_vertex_header_bits) ==
            BITFIELD64_BIT(VARYING_SLOT_POS);

   brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
                          &map->user_data_in_primitive_header,
                          &map->user_data_in_vertex_header);
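
   /* Primitive Header layout within its 8-dword slot (only the first four
    * dwords carry fixed attributes; the remaining four are the hole that
    * user data may be packed into):
    *   dw0: Reserved / Shading Rate (CPS)
    *   dw1: RTAIndex (layer)
    *   dw2: ViewportIndex
    *   dw3: Cull Primitive
    */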
   if (outputs_written & per_primitive_header_bits) {
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
         map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
               map->per_primitive_start_dw + 0;
         map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) {
         map->start_dw[VARYING_SLOT_LAYER] =
               map->per_primitive_start_dw + 1; /* RTAIndex */
         map->len_dw[VARYING_SLOT_LAYER] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) {
         map->start_dw[VARYING_SLOT_VIEWPORT] =
               map->per_primitive_start_dw + 2;
         map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
         map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
               map->per_primitive_start_dw + 3;
         map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
      }

      map->per_primitive_header_size_dw = 8;
      outputs_written &= ~per_primitive_header_bits;
   } else {
      map->per_primitive_header_size_dw = 0;
   }

   map->per_primitive_data_size_dw = 0;

   unsigned start_dw = map->per_primitive_start_dw;
   if (map->user_data_in_primitive_header)
      start_dw += 4; /* first 4 dwords are used */
   else
      start_dw += map->per_primitive_header_size_dw;
   unsigned header_used_dw = 0;

   for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
      int location = (*it).location;
      if (location < 0) {
         start_dw += (*it).dwords;
         if (map->user_data_in_primitive_header && header_used_dw < 4)
            header_used_dw += (*it).dwords;
         else
            map->per_primitive_data_size_dw += (*it).dwords;
         assert(header_used_dw <= 4);
         continue;
      }

      assert(map->start_dw[location] == -1);

      assert(location == VARYING_SLOT_PRIMITIVE_ID ||
             location >= VARYING_SLOT_VAR0);

      brw_mue_assign_position(&*it, map, start_dw);

      start_dw += (*it).dwords;
      if (map->user_data_in_primitive_header && header_used_dw < 4)
         header_used_dw += (*it).dwords;
      else
         map->per_primitive_data_size_dw += (*it).dwords;
      assert(header_used_dw <= 4);
      outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
   }

   map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
                                       map->per_primitive_data_size_dw, 8);

   map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
                                    map->per_primitive_pitch_dw *
                                    map->max_primitives, 8);

   /* TODO(mesh): Multiview. */
   unsigned fixed_header_size = 8;
   map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
                                          nir->info.clip_distance_array_size +
                                          nir->info.cull_distance_array_size, 8);
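
   /* Vertex Header layout within its first 8-dword slot (the three leading
    * dwords are always unused and dw3 is free whenever Point Size is not
    * written, which is what allows user data to be packed here):
    *   dw0-dw2: unused
    *   dw3:     Point Size
    *   dw4-dw7: Position
    * followed, when present, by up to 8 dwords of clip distances.
    */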
   if (outputs_written & per_vertex_header_bits) {
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
         map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
         map->len_dw[VARYING_SLOT_PSIZ] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
         map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
         map->len_dw[VARYING_SLOT_POS] = 4;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
         map->start_dw[VARYING_SLOT_CLIP_DIST0] =
            map->per_vertex_start_dw + fixed_header_size + 0;
         map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
         map->start_dw[VARYING_SLOT_CLIP_DIST1] =
            map->per_vertex_start_dw + fixed_header_size + 4;
         map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
      }

      outputs_written &= ~per_vertex_header_bits;
   }

   /* cull distances should be lowered earlier */
   assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
   assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));

   map->per_vertex_data_size_dw = 0;

   start_dw = map->per_vertex_start_dw;
   if (!map->user_data_in_vertex_header)
      start_dw += map->per_vertex_header_size_dw;

   header_used_dw = 0;
   for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
      for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
         int location = (*it).location;
         if (location < 0) {
            start_dw += (*it).dwords;
            if (map->user_data_in_vertex_header && header_used_dw < 4) {
               header_used_dw += (*it).dwords;
               assert(header_used_dw <= 4);
               if (header_used_dw == 4)
                  start_dw += 4; /* jump over gl_position */
            } else {
               map->per_vertex_data_size_dw += (*it).dwords;
            }
            continue;
         }

         assert(map->start_dw[location] == -1);

         assert(location >= VARYING_SLOT_VAR0);

         brw_mue_assign_position(&*it, map, start_dw);

         start_dw += (*it).dwords;
         if (map->user_data_in_vertex_header && header_used_dw < 4) {
            header_used_dw += (*it).dwords;
            assert(header_used_dw <= 4);
            if (header_used_dw == 4)
               start_dw += 4; /* jump over gl_position */
         } else {
            map->per_vertex_data_size_dw += (*it).dwords;
         }
         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
      }
   }

   map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
                                    map->per_vertex_data_size_dw, 8);

   map->size_dw =
      map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;

   assert(map->size_dw % 8 == 0);
}

static void
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
{
   fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
           map->size_dw, map->max_primitives, map->max_vertices);
   fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
           map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
   fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
           map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);

   fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
           map->per_primitive_start_dw,
           map->per_primitive_header_size_dw,
           map->per_primitive_data_size_dw,
           map->per_primitive_pitch_dw);

   for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
      if (map->start_dw[i] < 0)
         continue;

      const unsigned offset = map->start_dw[i];
      const unsigned len = map->len_dw[i];

      if (offset < map->per_primitive_start_dw ||
          offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
         continue;

      const char *name =
            gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                           MESA_SHADER_MESH);

      fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
              name, i);
   }

   fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
           map->per_vertex_start_dw,
           map->per_vertex_header_size_dw,
           map->per_vertex_data_size_dw,
           map->per_vertex_pitch_dw);

   for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
      if (map->start_dw[i] < 0)
         continue;

      const unsigned offset = map->start_dw[i];
      const unsigned len = map->len_dw[i];

      if (offset < map->per_vertex_start_dw ||
          offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
         continue;

      nir_variable *var =
            nir_find_variable_with_location(nir, nir_var_shader_out, i);
      bool flat = var->data.interpolation == INTERP_MODE_FLAT;

      const char *name =
            gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                           MESA_SHADER_MESH);

      fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
              name, i, flat ? " (flat)" : "");
   }

   fprintf(fp, "\n");
}

static void
brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
{
   nir_foreach_shader_out_variable(var, nir) {
      int location = var->data.location;
      assert(location >= 0);
      assert(map->start_dw[location] != -1);
      var->data.driver_location = map->start_dw[location];
   }

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
            type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
}

static void
brw_nir_initialize_mue(nir_shader *nir,
                       const struct brw_mue_map *map,
                       unsigned dispatch_width)
{
   assert(map->per_primitive_header_size_dw > 0);

   nir_builder b;
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
   b = nir_builder_at(nir_before_impl(entrypoint));

   nir_def *dw_off = nir_imm_int(&b, 0);
   nir_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);

   /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */

   assert(!nir->info.workgroup_size_variable);
   const unsigned workgroup_size = nir->info.workgroup_size[0] *
                                   nir->info.workgroup_size[1] *
                                   nir->info.workgroup_size[2];

   /* Invocations from a single workgroup will cooperate in zeroing MUE. */

   /* How many prims each invocation needs to cover without checking its index? */
   unsigned prims_per_inv = map->max_primitives / workgroup_size;
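
   /* For example (illustrative numbers only): with workgroup_size = 128 and
    * max_primitives = 200, prims_per_inv is 1 and 72 primitives remain, so
    * every invocation zeroes one header unconditionally and the first 72
    * invocations zero one more under the predicate built below.
    */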

   /* Zero first 4 dwords of MUE Primitive Header:
    * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
    */

   nir_def *local_invocation_index = nir_load_local_invocation_index(&b);

   /* Zero primitive headers distanced by workgroup_size, starting from
    * invocation index.
    */
   for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
      nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                       prim_in_inv * workgroup_size);

      nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                     .base = (int)map->per_primitive_start_dw,
                                     .write_mask = WRITEMASK_XYZW,
                                     .component = 0,
                                     .src_type = nir_type_uint32);
   }

   /* How many prims are left? */
   unsigned remaining = map->max_primitives % workgroup_size;

   if (remaining) {
      /* Zero "remaining" primitive headers starting from the last one covered
       * by the loop above + workgroup_size.
       */
      nir_def *cmp = nir_ilt_imm(&b, local_invocation_index, remaining);
      nir_if *if_stmt = nir_push_if(&b, cmp);
      {
         nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                          prims_per_inv * workgroup_size);

         nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                        .base = (int)map->per_primitive_start_dw,
                                        .write_mask = WRITEMASK_XYZW,
                                        .component = 0,
                                        .src_type = nir_type_uint32);
      }
      nir_pop_if(&b, if_stmt);
   }

   /* If there's more than one subgroup, then we need to wait for all of them
    * to finish initialization before we can proceed. Otherwise some subgroups
    * may start filling MUE before others finish initializing.
    */
   if (workgroup_size > dispatch_width) {
      nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
                  NIR_MEMORY_ACQ_REL, nir_var_shader_out);
   }

   if (remaining) {
      nir_metadata_preserve(entrypoint, nir_metadata_none);
   } else {
      nir_metadata_preserve(entrypoint, nir_metadata_block_index |
                                        nir_metadata_dominance);
   }
}

static void
brw_nir_adjust_offset(nir_builder *b, nir_intrinsic_instr *intrin, uint32_t pitch)
{
   nir_src *index_src = nir_get_io_arrayed_index_src(intrin);
   nir_src *offset_src = nir_get_io_offset_src(intrin);

   b->cursor = nir_before_instr(&intrin->instr);
   nir_def *offset =
      nir_iadd(b,
               offset_src->ssa,
               nir_imul_imm(b, index_src->ssa, pitch));
   nir_src_rewrite(offset_src, offset);
}
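
/* After this rewrite, an arrayed output stored at driver_location D for
 * element i lands at dword D + i * pitch of the MUE (plus any constant
 * offset within the attribute), where the pitch is the per-vertex or
 * per-primitive pitch from brw_compute_mue_map(), or
 * per_primitive_indices_dw for the primitive index array (see below).
 */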

static bool
brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
                                                nir_intrinsic_instr *intrin,
                                                void *data)
{
   const struct brw_mue_map *map = (const struct brw_mue_map *) data;

   /* Remap per_vertex and per_primitive offsets using the extra source and
    * the pitch.
    */
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_per_vertex_output:
   case nir_intrinsic_store_per_vertex_output:
      brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw);

      return true;

   case nir_intrinsic_load_per_primitive_output:
   case nir_intrinsic_store_per_primitive_output: {
      struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
      uint32_t pitch;
      if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
         pitch = map->per_primitive_indices_dw;
      else
         pitch = map->per_primitive_pitch_dw;

      brw_nir_adjust_offset(b, intrin, pitch);

      return true;
   }

   default:
      return false;
   }
}

static bool
brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
{
   return nir_shader_intrinsics_pass(nir,
                                     brw_nir_adjust_offset_for_arrayed_indices_instr,
                                     nir_metadata_block_index |
                                     nir_metadata_dominance,
                                     (void *)map);
}
|
|
|
|
|
|
2023-01-25 15:06:23 +01:00
|
|
|
struct index_packing_state {
|
|
|
|
|
unsigned vertices_per_primitive;
|
|
|
|
|
nir_variable *original_prim_indices;
|
|
|
|
|
nir_variable *packed_prim_indices;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static bool
|
|
|
|
|
brw_can_pack_primitive_indices(nir_shader *nir, struct index_packing_state *state)
|
|
|
|
|
{
|
|
|
|
|
/* Can a single index fit into one byte of the U888X format? */
|
|
|
|
|
if (nir->info.mesh.max_vertices_out > 255)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
state->vertices_per_primitive =
|
|
|
|
|
num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type);
|
|
|
|
|
/* packing point indices doesn't help */
|
|
|
|
|
if (state->vertices_per_primitive == 1)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
state->original_prim_indices =
|
|
|
|
|
nir_find_variable_with_location(nir,
|
|
|
|
|
nir_var_shader_out,
|
|
|
|
|
VARYING_SLOT_PRIMITIVE_INDICES);
|
|
|
|
|
/* No indices means no changes to the shader, but packing is still worth it,
 * because less URB space will be used.
|
|
|
|
|
*/
|
|
|
|
|
if (!state->original_prim_indices)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
ASSERTED const struct glsl_type *type = state->original_prim_indices->type;
|
|
|
|
|
assert(type->is_array());
|
|
|
|
|
assert(type->without_array()->is_vector());
|
|
|
|
|
assert(type->without_array()->vector_elements == state->vertices_per_primitive);
|
|
|
|
|
|
2023-06-28 19:40:56 +08:00
|
|
|
nir_foreach_function_impl(impl, nir) {
|
|
|
|
|
nir_foreach_block(block, impl) {
|
2023-01-25 15:06:23 +01:00
|
|
|
nir_foreach_instr(instr, block) {
|
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
|
|
|
|
|
|
|
|
if (intrin->intrinsic != nir_intrinsic_store_deref) {
|
|
|
|
|
/* any unknown deref operation on primitive indices -> don't pack */
|
|
|
|
|
unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
|
|
|
|
|
for (unsigned i = 0; i < num_srcs; i++) {
|
|
|
|
|
nir_deref_instr *deref = nir_src_as_deref(intrin->src[i]);
|
|
|
|
|
if (!deref)
|
|
|
|
|
continue;
|
|
|
|
|
nir_variable *var = nir_deref_instr_get_variable(deref);
|
|
|
|
|
|
|
|
|
|
if (var == state->original_prim_indices)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
|
|
|
|
|
if (!deref)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
nir_variable *var = nir_deref_instr_get_variable(deref);
|
|
|
|
|
if (var != state->original_prim_indices)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (deref->deref_type != nir_deref_type_array)
|
|
|
|
|
return false; /* unknown chain of derefs */
|
|
|
|
|
|
|
|
|
|
nir_deref_instr *var_deref = nir_src_as_deref(deref->parent);
|
|
|
|
|
if (!var_deref || var_deref->deref_type != nir_deref_type_var)
|
|
|
|
|
return false; /* unknown chain of derefs */
|
|
|
|
|
|
|
|
|
|
assert(var_deref->var == state->original_prim_indices);
|
|
|
|
|
|
|
|
|
|
unsigned write_mask = nir_intrinsic_write_mask(intrin);
|
|
|
|
|
|
|
|
|
|
/* If only some components are written, then we can't easily pack.
 * In theory we could, by loading the current dword value, masking in
 * one byte and storing back the whole dword, but it would be slow
 * and could actually decrease performance. TODO: reevaluate this
 * once something actually hits this path.
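 * For example, with triangles (vertices_per_primitive == 3) only a full
 * .xyz store with write_mask 0x7 (== BITFIELD_MASK(3)) can be packed.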
|
|
|
|
|
*/
|
|
|
|
|
if (write_mask != BITFIELD_MASK(state->vertices_per_primitive))
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static bool
|
2023-08-23 12:48:10 -04:00
|
|
|
brw_pack_primitive_indices_instr(nir_builder *b, nir_intrinsic_instr *intrin,
|
|
|
|
|
void *data)
|
2023-01-25 15:06:23 +01:00
|
|
|
{
|
|
|
|
|
if (intrin->intrinsic != nir_intrinsic_store_deref)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
nir_deref_instr *array_deref = nir_src_as_deref(intrin->src[0]);
|
|
|
|
|
if (!array_deref || array_deref->deref_type != nir_deref_type_array)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
nir_deref_instr *var_deref = nir_src_as_deref(array_deref->parent);
|
|
|
|
|
if (!var_deref || var_deref->deref_type != nir_deref_type_var)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
struct index_packing_state *state =
|
|
|
|
|
(struct index_packing_state *)data;
|
|
|
|
|
|
|
|
|
|
nir_variable *var = var_deref->var;
|
|
|
|
|
|
|
|
|
|
if (var != state->original_prim_indices)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
unsigned vertices_per_primitive = state->vertices_per_primitive;
|
|
|
|
|
|
|
|
|
|
b->cursor = nir_before_instr(&intrin->instr);
|
|
|
|
|
|
|
|
|
|
nir_deref_instr *new_var_deref =
|
|
|
|
|
nir_build_deref_var(b, state->packed_prim_indices);
|
|
|
|
|
nir_deref_instr *new_array_deref =
|
|
|
|
|
nir_build_deref_array(b, new_var_deref, array_deref->arr.index.ssa);
|
|
|
|
|
|
|
|
|
|
nir_src *data_src = &intrin->src[1];
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *data_def =
|
2023-01-25 15:06:23 +01:00
|
|
|
nir_ssa_for_src(b, *data_src, vertices_per_primitive);
|
|
|
|
|
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *new_data =
|
2023-01-25 15:06:23 +01:00
|
|
|
nir_ior(b, nir_ishl_imm(b, nir_channel(b, data_def, 0), 0),
|
|
|
|
|
nir_ishl_imm(b, nir_channel(b, data_def, 1), 8));
|
|
|
|
|
|
|
|
|
|
if (vertices_per_primitive >= 3) {
|
|
|
|
|
new_data =
|
|
|
|
|
nir_ior(b, new_data,
|
|
|
|
|
nir_ishl_imm(b, nir_channel(b, data_def, 2), 16));
|
|
|
|
|
}
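   /* For example (illustrative), triangle indices (5, 6, 7) end up packed as
    * 0x00070605; the top byte of the U888X dword is left unused. */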
|
|
|
|
|
|
2023-08-14 11:56:00 -05:00
|
|
|
nir_build_store_deref(b, &new_array_deref->def, new_data);
|
2023-01-25 15:06:23 +01:00
|
|
|
|
2023-08-23 12:48:10 -04:00
|
|
|
nir_instr_remove(&intrin->instr);
|
2023-01-25 15:06:23 +01:00
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static bool
|
|
|
|
|
brw_pack_primitive_indices(nir_shader *nir, void *data)
|
|
|
|
|
{
|
|
|
|
|
struct index_packing_state *state = (struct index_packing_state *)data;
|
|
|
|
|
|
|
|
|
|
const struct glsl_type *new_type =
|
|
|
|
|
glsl_array_type(glsl_uint_type(),
|
|
|
|
|
nir->info.mesh.max_primitives_out,
|
|
|
|
|
0);
|
|
|
|
|
|
|
|
|
|
state->packed_prim_indices =
|
|
|
|
|
nir_variable_create(nir, nir_var_shader_out,
|
|
|
|
|
new_type, "gl_PrimitiveIndicesPacked");
|
|
|
|
|
state->packed_prim_indices->data.location = VARYING_SLOT_PRIMITIVE_INDICES;
|
|
|
|
|
state->packed_prim_indices->data.interpolation = INTERP_MODE_NONE;
|
|
|
|
|
state->packed_prim_indices->data.per_primitive = 1;
|
|
|
|
|
|
2023-08-23 12:48:10 -04:00
|
|
|
return nir_shader_intrinsics_pass(nir, brw_pack_primitive_indices_instr,
|
2023-01-25 15:06:23 +01:00
|
|
|
nir_metadata_block_index |
|
|
|
|
|
nir_metadata_dominance,
|
|
|
|
|
data);
|
|
|
|
|
}
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
const unsigned *
|
|
|
|
|
brw_compile_mesh(const struct brw_compiler *compiler,
|
|
|
|
|
struct brw_compile_mesh_params *params)
|
|
|
|
|
{
|
2023-07-14 02:10:20 +03:00
|
|
|
struct nir_shader *nir = params->base.nir;
|
2021-10-29 12:27:45 -07:00
|
|
|
const struct brw_mesh_prog_key *key = params->key;
|
|
|
|
|
struct brw_mesh_prog_data *prog_data = params->prog_data;
|
2023-06-20 14:42:02 -07:00
|
|
|
const bool debug_enabled = brw_should_print_shader(nir, DEBUG_MESH);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
|
|
|
|
prog_data->base.base.stage = MESA_SHADER_MESH;
|
|
|
|
|
prog_data->base.base.total_shared = nir->info.shared_size;
|
2022-02-28 15:13:07 +02:00
|
|
|
prog_data->base.base.total_scratch = 0;
|
2021-10-29 12:27:45 -07:00
|
|
|
|
|
|
|
|
prog_data->base.local_size[0] = nir->info.workgroup_size[0];
|
|
|
|
|
prog_data->base.local_size[1] = nir->info.workgroup_size[1];
|
|
|
|
|
prog_data->base.local_size[2] = nir->info.workgroup_size[2];
|
|
|
|
|
|
2021-12-09 16:47:43 +01:00
|
|
|
prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
|
|
|
|
|
prog_data->cull_distance_mask =
|
|
|
|
|
((1 << nir->info.cull_distance_array_size) - 1) <<
|
|
|
|
|
nir->info.clip_distance_array_size;
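   /* For example, 4 clip distances and 2 cull distances yield
    * clip_distance_mask = 0xf and cull_distance_mask = 0x30. */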
|
2021-10-29 12:27:45 -07:00
|
|
|
prog_data->primitive_type = nir->info.mesh.primitive_type;
|
|
|
|
|
|
2023-01-25 15:06:23 +01:00
|
|
|
struct index_packing_state index_packing_state = {};
|
|
|
|
|
if (brw_can_pack_primitive_indices(nir, &index_packing_state)) {
|
|
|
|
|
if (index_packing_state.original_prim_indices)
|
|
|
|
|
NIR_PASS(_, nir, brw_pack_primitive_indices, &index_packing_state);
|
|
|
|
|
prog_data->index_format = BRW_INDEX_FORMAT_U888X;
|
|
|
|
|
} else {
|
|
|
|
|
prog_data->index_format = BRW_INDEX_FORMAT_U32;
|
|
|
|
|
}
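   /* With U888X, the three indices of a triangle fit in a single MUE dword
    * instead of three (see brw_pack_primitive_indices above). */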
|
2021-10-29 12:27:45 -07:00
|
|
|
|
2021-07-16 15:03:20 +02:00
|
|
|
prog_data->uses_drawid =
|
|
|
|
|
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
|
|
|
|
|
|
2022-07-18 18:35:34 +02:00
|
|
|
brw_nir_lower_tue_inputs(nir, params->tue_map);
|
2022-02-14 16:13:28 -08:00
|
|
|
|
2023-07-21 11:50:51 +02:00
|
|
|
brw_compute_mue_map(compiler, nir, &prog_data->map, prog_data->index_format);
|
2022-07-18 18:35:34 +02:00
|
|
|
brw_nir_lower_mue_outputs(nir, &prog_data->map);
|
2021-10-29 12:56:22 -07:00
|
|
|
|
2022-11-08 01:47:50 -08:00
|
|
|
brw_simd_selection_state simd_state{
|
2023-07-14 02:10:20 +03:00
|
|
|
.mem_ctx = params->base.mem_ctx,
|
2022-11-08 01:47:50 -08:00
|
|
|
.devinfo = compiler->devinfo,
|
|
|
|
|
.prog_data = &prog_data->base,
|
|
|
|
|
.required_width = brw_required_dispatch_width(&nir->info),
|
|
|
|
|
};
|
2021-10-29 12:27:45 -07:00
|
|
|
|
2022-11-08 14:14:37 -08:00
|
|
|
std::unique_ptr<fs_visitor> v[3];
|
2021-10-29 12:27:45 -07:00
|
|
|
|
|
|
|
|
for (int simd = 0; simd < 3; simd++) {
|
2022-11-08 01:47:50 -08:00
|
|
|
if (!brw_simd_should_compile(simd_state, simd))
|
2021-10-29 12:27:45 -07:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
const unsigned dispatch_width = 8 << simd;
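      /* simd 0/1/2 correspond to SIMD8/SIMD16/SIMD32 dispatch. */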
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
|
2022-02-01 18:08:49 +01:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* When Primitive Header is enabled, we may not generate writes to all
|
|
|
|
|
* fields, so let's initialize everything.
|
|
|
|
|
*/
|
|
|
|
|
if (prog_data->map.per_primitive_header_size_dw > 0)
|
|
|
|
|
NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);
|
|
|
|
|
|
2023-05-17 17:09:06 +02:00
|
|
|
brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
2022-07-18 18:35:34 +02:00
|
|
|
NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
|
2021-07-12 13:43:03 +02:00
|
|
|
/* Load uniforms can do a better job for constants, so fold before it. */
|
2022-07-18 18:35:34 +02:00
|
|
|
NIR_PASS(_, shader, nir_opt_constant_folding);
|
|
|
|
|
NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
|
2021-07-12 13:43:03 +02:00
|
|
|
|
2022-07-18 18:35:34 +02:00
|
|
|
NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
2023-05-17 16:44:17 +02:00
|
|
|
brw_postprocess_nir(shader, compiler, debug_enabled,
|
2022-06-21 18:06:04 -07:00
|
|
|
key->base.robust_flags);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
v[simd] = std::make_unique<fs_visitor>(compiler, ¶ms->base,
|
|
|
|
|
&key->base,
|
|
|
|
|
&prog_data->base.base,
|
|
|
|
|
shader, dispatch_width,
|
|
|
|
|
params->base.stats != NULL,
|
2022-11-08 14:14:37 -08:00
|
|
|
debug_enabled);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
|
|
|
|
if (prog_data->base.prog_mask) {
|
|
|
|
|
unsigned first = ffs(prog_data->base.prog_mask) - 1;
|
2022-11-08 14:14:37 -08:00
|
|
|
v[simd]->import_uniforms(v[first].get());
|
2021-10-29 12:27:45 -07:00
|
|
|
}
|
|
|
|
|
|
2022-11-08 03:38:18 -08:00
|
|
|
const bool allow_spilling = !brw_simd_any_compiled(simd_state);
|
2021-10-29 12:27:45 -07:00
|
|
|
if (v[simd]->run_mesh(allow_spilling))
|
2022-11-08 01:47:50 -08:00
|
|
|
brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
|
2021-10-29 12:27:45 -07:00
|
|
|
else
|
2023-07-14 02:10:20 +03:00
|
|
|
simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
|
2021-10-29 12:27:45 -07:00
|
|
|
}
|
|
|
|
|
|
2022-11-08 01:47:50 -08:00
|
|
|
int selected_simd = brw_simd_select(simd_state);
|
2021-10-29 12:27:45 -07:00
|
|
|
if (selected_simd < 0) {
|
2023-07-14 02:10:20 +03:00
|
|
|
params->base.error_str =
|
|
|
|
|
ralloc_asprintf(params->base.mem_ctx,
|
|
|
|
|
"Can't compile shader: %s, %s and %s.\n",
|
|
|
|
|
simd_state.error[0], simd_state.error[1],
|
|
|
|
|
simd_state.error[2]);
|
2021-10-29 12:27:45 -07:00
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-08 14:14:37 -08:00
|
|
|
fs_visitor *selected = v[selected_simd].get();
|
2021-10-29 12:27:45 -07:00
|
|
|
prog_data->base.prog_mask = 1 << selected_simd;
|
|
|
|
|
|
2021-10-29 12:45:17 -07:00
|
|
|
if (unlikely(debug_enabled)) {
|
|
|
|
|
if (params->tue_map) {
|
|
|
|
|
fprintf(stderr, "Mesh Input ");
|
|
|
|
|
brw_print_tue_map(stderr, params->tue_map);
|
|
|
|
|
}
|
2021-10-29 12:56:22 -07:00
|
|
|
fprintf(stderr, "Mesh Output ");
|
2022-12-21 15:40:07 +01:00
|
|
|
brw_print_mue_map(stderr, &prog_data->map, nir);
|
2021-10-29 12:45:17 -07:00
|
|
|
}
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
fs_generator g(compiler, ¶ms->base, &prog_data->base.base,
|
|
|
|
|
false, MESA_SHADER_MESH);
|
2021-10-29 12:27:45 -07:00
|
|
|
if (unlikely(debug_enabled)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
|
2021-10-29 12:27:45 -07:00
|
|
|
"%s mesh shader %s",
|
|
|
|
|
nir->info.label ? nir->info.label
|
|
|
|
|
: "unnamed",
|
|
|
|
|
nir->info.name));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
|
2023-07-14 02:10:20 +03:00
|
|
|
selected->performance_analysis.require(), params->base.stats);
|
2023-01-24 10:52:10 +01:00
|
|
|
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
2021-10-29 12:27:45 -07:00
|
|
|
return g.get_assembly();
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-14 16:36:32 -08:00
|
|
|
static unsigned
|
|
|
|
|
component_from_intrinsic(nir_intrinsic_instr *instr)
|
|
|
|
|
{
|
|
|
|
|
if (nir_intrinsic_has_component(instr))
|
|
|
|
|
return nir_intrinsic_component(instr);
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2021-12-09 16:51:41 +01:00
|
|
|
static void
|
|
|
|
|
adjust_handle_and_offset(const fs_builder &bld,
|
|
|
|
|
fs_reg &urb_handle,
|
|
|
|
|
unsigned &urb_global_offset)
|
|
|
|
|
{
|
|
|
|
|
/* Make sure that the URB global offset is below 2048 (2^11), because
 * that's the maximum value that can be encoded in the Message Descriptor.
|
|
|
|
|
*/
|
|
|
|
|
unsigned adjustment = (urb_global_offset >> 11) << 11;
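   /* For example (illustrative), urb_global_offset = 5000 gives
    * adjustment = 4096: the handle below is bumped by 4096 and the remaining
    * global offset of 904 fits in the message descriptor. */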
|
|
|
|
|
|
|
|
|
|
if (adjustment) {
|
|
|
|
|
fs_builder ubld8 = bld.group(8, 0).exec_all();
|
2023-02-01 17:23:25 +01:00
|
|
|
/* Allocate a new register so we don't overwrite the shared URB handle. */
|
|
|
|
|
fs_reg new_handle = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
ubld8.ADD(new_handle, urb_handle, brw_imm_ud(adjustment));
|
|
|
|
|
urb_handle = new_handle;
|
2021-12-09 16:51:41 +01:00
|
|
|
urb_global_offset -= adjustment;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-14 11:32:53 +01:00
|
|
|
static void
|
|
|
|
|
emit_urb_direct_vec4_write(const fs_builder &bld,
|
|
|
|
|
unsigned urb_global_offset,
|
|
|
|
|
const fs_reg &src,
|
|
|
|
|
fs_reg urb_handle,
|
|
|
|
|
unsigned dst_comp_offset,
|
|
|
|
|
unsigned comps,
|
|
|
|
|
unsigned mask)
|
|
|
|
|
{
|
|
|
|
|
for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
|
|
|
|
|
fs_builder bld8 = bld.group(8, q);
|
|
|
|
|
|
2023-01-13 14:53:54 +01:00
|
|
|
fs_reg payload_srcs[8];
|
2022-11-14 11:32:53 +01:00
|
|
|
unsigned length = 0;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < dst_comp_offset; i++)
|
|
|
|
|
payload_srcs[length++] = reg_undef;
|
|
|
|
|
|
|
|
|
|
for (unsigned c = 0; c < comps; c++)
|
2023-01-31 14:52:24 +01:00
|
|
|
payload_srcs[length++] = quarter(offset(src, bld, c), q);
|
2022-11-14 11:32:53 +01:00
|
|
|
|
|
|
|
|
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
|
|
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
|
|
|
|
|
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
|
|
|
|
|
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
|
|
|
|
|
BRW_REGISTER_TYPE_F);
|
|
|
|
|
bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
|
|
|
|
|
|
|
|
|
|
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
|
|
|
|
|
reg_undef, srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
inst->mlen = 2 + length;
|
|
|
|
|
inst->offset = urb_global_offset;
|
|
|
|
|
assert(inst->offset < 2048);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-10-29 12:45:17 -07:00
|
|
|
static void
|
|
|
|
|
emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
|
2022-08-21 22:04:21 -07:00
|
|
|
const fs_reg &src, fs_reg urb_handle)
|
2021-10-29 12:45:17 -07:00
|
|
|
{
|
|
|
|
|
assert(nir_src_bit_size(instr->src[0]) == 32);
|
|
|
|
|
|
|
|
|
|
nir_src *offset_nir_src = nir_get_io_offset_src(instr);
|
|
|
|
|
assert(nir_src_is_const(*offset_nir_src));
|
|
|
|
|
|
|
|
|
|
const unsigned comps = nir_src_num_components(instr->src[0]);
|
2023-01-31 14:52:24 +01:00
|
|
|
assert(comps <= 4);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
|
|
|
|
const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
|
|
|
|
|
nir_src_as_uint(*offset_nir_src) +
|
2022-02-14 16:36:32 -08:00
|
|
|
component_from_intrinsic(instr);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
|
|
|
|
/* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
|
2023-01-31 14:52:24 +01:00
|
|
|
* We can write up to 8 dwords, so a single vec4 write is enough.
|
2021-10-29 12:45:17 -07:00
|
|
|
*/
|
2023-01-31 14:52:24 +01:00
|
|
|
const unsigned comp_shift = offset_in_dwords % 4;
|
|
|
|
|
const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
|
2021-10-29 12:45:17 -07:00
|
|
|
|
2021-12-09 16:51:41 +01:00
|
|
|
unsigned urb_global_offset = offset_in_dwords / 4;
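   /* For example (illustrative), a vec2 store at dword offset 6 gives
    * urb_global_offset = 1 and comp_shift = 2, so write_mask 0x3 becomes a
    * channel mask of 0xc within that vec4 slot. */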
|
|
|
|
|
adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
|
|
|
|
|
|
2023-01-31 14:52:24 +01:00
|
|
|
emit_urb_direct_vec4_write(bld, urb_global_offset, src, urb_handle,
|
|
|
|
|
comp_shift, comps, mask);
|
2021-10-29 12:45:17 -07:00
|
|
|
}
|
|
|
|
|
|
intel/compiler/mesh: optimize indirect writes
Our hardware requires that we write to the URB using full vec4s at aligned
addresses. It gives us the ability to mask off dwords within a vec4 that we
don't want to write, but we have to know their positions at compile time.
Let's assume that:
- V represents one dword we want to write
- ? is an uninitialized value
- "|" is a vec4 boundary.
When we want to write a 2-dword value at offset 0, we generate 1 write message:
| V1 V2 ? ? |
with mask:
| 1 1 0 0 |
When we want to write a 4-dword value at offset 2, we generate 2 write messages:
| ? ? V1 V2 | V3 V4 ? ? |
with mask:
| 0 0 1 1 | 1 1 0 0 |
However, if we don't know the offset within the vec4 at *compile time*, we
currently generate 4 write messages:
| V1 V1 V1 V1 |
| 0 0 1 0 |
| V2 V2 V2 V2 |
| 0 0 0 1 |
| V3 V3 V3 V3 |
| 1 0 0 0 |
| V4 V4 V4 V4 |
| 0 1 0 0 |
where masks are determined at *run time*.
This is quite wasteful and slow.
However, if we could determine the offset modulo 4 statically at compile time,
we could generate only 1 or 2 write messages (1 if modulo is 0) instead of 4.
This is what this patch does: it analyzes the addressing expression modulo 4
and, if the value can be determined at compile time, generates only 1 or 2
writes; if it can't, it falls back to the old 4-write method.
In a mesh shader, the value of the offset modulo 4 should be known for all
outputs, with the exception of primitive indices.
The modulo value should be known because of MUE layout restrictions, which
require that user per-primitive and per-vertex data start at address aligned
to 8 dwords and we should statically always know the offset from this base.
There can be some cases where the offset from the base is more dynamic
(e.g. indirect array access inside a per-vertex value), so we always do
the analysis.
Primitive indices are an exception, because they form vec3s (for triangles),
which means that the offset will not be easy to analyse.
When U888X index format lands, primitive indices will use only one dword
per triangle, which means that we'll always write them using one message.
Task shaders don't have any predetermined structure of output memory, so
always do the analysis.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20050>
2022-11-10 20:29:54 +01:00
|
|
|
static void
|
|
|
|
|
emit_urb_indirect_vec4_write(const fs_builder &bld,
|
|
|
|
|
const fs_reg &offset_src,
|
|
|
|
|
unsigned base,
|
|
|
|
|
const fs_reg &src,
|
|
|
|
|
fs_reg urb_handle,
|
|
|
|
|
unsigned dst_comp_offset,
|
|
|
|
|
unsigned comps,
|
|
|
|
|
unsigned mask)
|
|
|
|
|
{
|
|
|
|
|
for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
|
|
|
|
|
fs_builder bld8 = bld.group(8, q);
|
|
|
|
|
|
2023-02-15 13:29:24 +01:00
|
|
|
/* offset is always positive, so signedness doesn't matter */
|
|
|
|
|
assert(offset_src.type == BRW_REGISTER_TYPE_D ||
|
|
|
|
|
offset_src.type == BRW_REGISTER_TYPE_UD);
|
|
|
|
|
fs_reg off = bld8.vgrf(offset_src.type, 1);
|
2022-11-10 20:29:54 +01:00
|
|
|
bld8.MOV(off, quarter(offset_src, q));
|
|
|
|
|
bld8.ADD(off, off, brw_imm_ud(base));
|
|
|
|
|
bld8.SHR(off, off, brw_imm_ud(2));
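      /* off now holds each lane's vec4 slot index, (offset + base) / 4; the
       * dword position within the vec4 is already baked into the channel mask
       * supplied by the caller. */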
|
|
|
|
|
|
2023-01-13 14:53:54 +01:00
|
|
|
fs_reg payload_srcs[8];
|
2022-11-10 20:29:54 +01:00
|
|
|
unsigned length = 0;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < dst_comp_offset; i++)
|
|
|
|
|
payload_srcs[length++] = reg_undef;
|
|
|
|
|
|
|
|
|
|
for (unsigned c = 0; c < comps; c++)
|
2023-01-31 14:52:24 +01:00
|
|
|
payload_srcs[length++] = quarter(offset(src, bld, c), q);
|
2022-11-10 20:29:54 +01:00
|
|
|
|
|
|
|
|
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
|
|
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
|
|
|
|
|
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
|
|
|
|
|
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
|
|
|
|
|
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
|
|
|
|
|
BRW_REGISTER_TYPE_F);
|
|
|
|
|
bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
|
|
|
|
|
|
|
|
|
|
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
|
|
|
|
|
reg_undef, srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
inst->mlen = 3 + length;
|
|
|
|
|
inst->offset = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr,
|
|
|
|
|
const fs_reg &src, const fs_reg &offset_src,
|
|
|
|
|
fs_reg urb_handle, unsigned mod)
|
|
|
|
|
{
|
|
|
|
|
assert(nir_src_bit_size(instr->src[0]) == 32);
|
|
|
|
|
|
|
|
|
|
const unsigned comps = nir_src_num_components(instr->src[0]);
|
2023-01-31 14:52:24 +01:00
|
|
|
assert(comps <= 4);
|
2022-11-10 20:29:54 +01:00
|
|
|
|
|
|
|
|
const unsigned base_in_dwords = nir_intrinsic_base(instr) +
|
|
|
|
|
component_from_intrinsic(instr);
|
|
|
|
|
|
2023-01-31 14:52:24 +01:00
|
|
|
const unsigned comp_shift = mod;
|
|
|
|
|
const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
|
2022-11-10 20:29:54 +01:00
|
|
|
|
2023-01-31 14:52:24 +01:00
|
|
|
emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src,
|
|
|
|
|
urb_handle, comp_shift, comps, mask);
|
2022-11-10 20:29:54 +01:00
|
|
|
}
|
|
|
|
|
|
2021-10-29 12:45:17 -07:00
|
|
|
static void
|
|
|
|
|
emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
|
2022-08-21 22:04:21 -07:00
|
|
|
const fs_reg &src, const fs_reg &offset_src,
|
|
|
|
|
fs_reg urb_handle)
|
2021-10-29 12:45:17 -07:00
|
|
|
{
|
|
|
|
|
assert(nir_src_bit_size(instr->src[0]) == 32);
|
|
|
|
|
|
|
|
|
|
const unsigned comps = nir_src_num_components(instr->src[0]);
|
|
|
|
|
assert(comps <= 4);
|
|
|
|
|
|
|
|
|
|
const unsigned base_in_dwords = nir_intrinsic_base(instr) +
|
2022-02-14 16:36:32 -08:00
|
|
|
component_from_intrinsic(instr);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
|
|
|
|
/* Use the URB write message that allows different offsets per slot. The offset
|
|
|
|
|
* is in units of vec4s (128 bits), so we use a write for each component,
|
|
|
|
|
* replicating it in the sources and applying the appropriate mask based on
|
|
|
|
|
* the dword offset.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
for (unsigned c = 0; c < comps; c++) {
|
|
|
|
|
if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
fs_reg src_comp = offset(src, bld, c);
|
|
|
|
|
|
|
|
|
|
for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
|
|
|
|
|
fs_builder bld8 = bld.group(8, q);
|
|
|
|
|
|
2023-02-15 13:29:24 +01:00
|
|
|
/* offset is always positive, so signedness doesn't matter */
|
|
|
|
|
assert(offset_src.type == BRW_REGISTER_TYPE_D ||
|
|
|
|
|
offset_src.type == BRW_REGISTER_TYPE_UD);
|
|
|
|
|
fs_reg off = bld8.vgrf(offset_src.type, 1);
|
2021-10-29 12:45:17 -07:00
|
|
|
bld8.MOV(off, quarter(offset_src, q));
|
|
|
|
|
bld8.ADD(off, off, brw_imm_ud(c + base_in_dwords));
|
|
|
|
|
|
|
|
|
|
fs_reg mask = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
|
|
|
|
bld8.AND(mask, off, brw_imm_ud(0x3));
|
|
|
|
|
|
|
|
|
|
fs_reg one = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
|
|
|
|
bld8.MOV(one, brw_imm_ud(1));
|
|
|
|
|
bld8.SHL(mask, one, mask);
|
|
|
|
|
bld8.SHL(mask, mask, brw_imm_ud(16));
|
|
|
|
|
|
|
|
|
|
bld8.SHR(off, off, brw_imm_ud(2));
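         /* Per lane: mask = 1 << (dword offset % 4), shifted into the channel
          * mask bits [19:16], and off now holds the vec4 slot index. */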
|
|
|
|
|
|
2022-07-12 15:32:01 -07:00
|
|
|
fs_reg payload_srcs[4];
|
|
|
|
|
unsigned length = 0;
|
2021-10-29 12:45:17 -07:00
|
|
|
|
|
|
|
|
for (unsigned j = 0; j < 4; j++)
|
2022-07-12 15:32:01 -07:00
|
|
|
payload_srcs[length++] = quarter(src_comp, q);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
2022-07-12 15:32:01 -07:00
|
|
|
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
|
|
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
|
|
|
|
|
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
|
|
|
|
|
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
|
|
|
|
|
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
|
|
|
|
|
BRW_REGISTER_TYPE_F);
|
2022-08-12 17:16:17 +02:00
|
|
|
bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
2022-07-12 15:52:31 -07:00
|
|
|
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
|
2022-07-12 15:32:01 -07:00
|
|
|
reg_undef, srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
inst->mlen = 3 + length;
|
2021-10-29 12:45:17 -07:00
|
|
|
inst->offset = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
|
2022-08-21 22:04:21 -07:00
|
|
|
const fs_reg &dest, fs_reg urb_handle)
|
2021-10-29 12:45:17 -07:00
|
|
|
{
|
2023-08-14 11:56:00 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
2023-08-14 11:56:00 -05:00
|
|
|
unsigned comps = instr->def.num_components;
|
2021-10-29 12:45:17 -07:00
|
|
|
if (comps == 0)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
nir_src *offset_nir_src = nir_get_io_offset_src(instr);
|
|
|
|
|
assert(nir_src_is_const(*offset_nir_src));
|
|
|
|
|
|
|
|
|
|
const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
|
|
|
|
|
nir_src_as_uint(*offset_nir_src) +
|
2022-02-14 16:36:32 -08:00
|
|
|
component_from_intrinsic(instr);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
2021-12-09 16:51:41 +01:00
|
|
|
unsigned urb_global_offset = offset_in_dwords / 4;
|
|
|
|
|
adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
|
|
|
|
|
|
2021-10-29 12:45:17 -07:00
|
|
|
const unsigned comp_offset = offset_in_dwords % 4;
|
|
|
|
|
const unsigned num_regs = comp_offset + comps;
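   /* For example (illustrative), a vec3 load at dword offset 6 gives
    * urb_global_offset = 1 and comp_offset = 2, so 5 registers are read and
    * components 2..4 of the reply are copied out below. */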
|
|
|
|
|
|
|
|
|
|
fs_builder ubld8 = bld.group(8, 0).exec_all();
|
|
|
|
|
fs_reg data = ubld8.vgrf(BRW_REGISTER_TYPE_UD, num_regs);
|
2022-07-14 11:57:03 -07:00
|
|
|
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
|
|
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
|
2021-10-29 12:45:17 -07:00
|
|
|
|
2022-07-14 11:57:03 -07:00
|
|
|
fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data,
|
|
|
|
|
srcs, ARRAY_SIZE(srcs));
|
2021-10-29 12:45:17 -07:00
|
|
|
inst->mlen = 1;
|
2021-12-09 16:51:41 +01:00
|
|
|
inst->offset = urb_global_offset;
|
|
|
|
|
assert(inst->offset < 2048);
|
2021-10-29 12:45:17 -07:00
|
|
|
inst->size_written = num_regs * REG_SIZE;
|
|
|
|
|
|
|
|
|
|
for (unsigned c = 0; c < comps; c++) {
|
|
|
|
|
fs_reg dest_comp = offset(dest, bld, c);
|
|
|
|
|
fs_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0);
|
|
|
|
|
bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
|
2022-08-21 22:04:21 -07:00
|
|
|
const fs_reg &dest, const fs_reg &offset_src, fs_reg urb_handle)
|
2021-10-29 12:45:17 -07:00
|
|
|
{
|
2023-08-14 11:56:00 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
2023-08-14 11:56:00 -05:00
|
|
|
unsigned comps = instr->def.num_components;
|
2021-10-29 12:45:17 -07:00
|
|
|
if (comps == 0)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
fs_reg seq_ud;
|
|
|
|
|
{
|
|
|
|
|
fs_builder ubld8 = bld.group(8, 0).exec_all();
|
|
|
|
|
seq_ud = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
|
|
|
|
fs_reg seq_uw = ubld8.vgrf(BRW_REGISTER_TYPE_UW, 1);
|
|
|
|
|
ubld8.MOV(seq_uw, fs_reg(brw_imm_v(0x76543210)));
|
|
|
|
|
ubld8.MOV(seq_ud, seq_uw);
|
|
|
|
|
ubld8.SHL(seq_ud, seq_ud, brw_imm_ud(2));
|
|
|
|
|
}
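   /* seq_ud now holds {0, 4, 8, ..., 28} per lane: brw_imm_v(0x76543210)
    * expands to the lane indices 0..7, shifted left by 2 to get each lane's
    * byte offset, which MOV_INDIRECT below combines with the component
    * offset to pick the right dword out of the read reply. */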
|
|
|
|
|
|
|
|
|
|
const unsigned base_in_dwords = nir_intrinsic_base(instr) +
|
2022-02-14 16:36:32 -08:00
|
|
|
component_from_intrinsic(instr);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
|
|
|
|
for (unsigned c = 0; c < comps; c++) {
|
|
|
|
|
for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
|
|
|
|
|
fs_builder bld8 = bld.group(8, q);
|
|
|
|
|
|
2023-02-15 13:29:24 +01:00
|
|
|
/* offset is always positive, so signedness doesn't matter */
|
|
|
|
|
assert(offset_src.type == BRW_REGISTER_TYPE_D ||
|
|
|
|
|
offset_src.type == BRW_REGISTER_TYPE_UD);
|
|
|
|
|
fs_reg off = bld8.vgrf(offset_src.type, 1);
|
2021-10-29 12:45:17 -07:00
|
|
|
bld8.MOV(off, quarter(offset_src, q));
|
|
|
|
|
bld8.ADD(off, off, brw_imm_ud(base_in_dwords + c));
|
|
|
|
|
|
2022-05-31 13:17:30 +02:00
|
|
|
STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1);
|
2021-10-29 12:45:17 -07:00
|
|
|
|
|
|
|
|
fs_reg comp = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
|
|
|
|
bld8.AND(comp, off, brw_imm_ud(0x3));
|
|
|
|
|
bld8.SHL(comp, comp, brw_imm_ud(ffs(REG_SIZE) - 1));
|
|
|
|
|
bld8.ADD(comp, comp, seq_ud);
|
|
|
|
|
|
|
|
|
|
bld8.SHR(off, off, brw_imm_ud(2));
|
|
|
|
|
|
2022-07-14 11:57:03 -07:00
|
|
|
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
|
|
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
|
|
|
|
|
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
|
2021-10-29 12:45:17 -07:00
|
|
|
|
|
|
|
|
fs_reg data = bld8.vgrf(BRW_REGISTER_TYPE_UD, 4);
|
|
|
|
|
|
2022-07-12 15:52:31 -07:00
|
|
|
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL,
|
2022-07-14 11:57:03 -07:00
|
|
|
data, srcs, ARRAY_SIZE(srcs));
|
2021-10-29 12:45:17 -07:00
|
|
|
inst->mlen = 2;
|
|
|
|
|
inst->offset = 0;
|
|
|
|
|
inst->size_written = 4 * REG_SIZE;
|
|
|
|
|
|
|
|
|
|
fs_reg dest_comp = offset(dest, bld, c);
|
|
|
|
|
bld8.emit(SHADER_OPCODE_MOV_INDIRECT,
|
|
|
|
|
retype(quarter(dest_comp, q), BRW_REGISTER_TYPE_UD),
|
|
|
|
|
data,
|
|
|
|
|
comp,
|
2023-02-08 14:11:07 +02:00
|
|
|
brw_imm_ud(4 * REG_SIZE));
|
2021-10-29 12:45:17 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2022-08-21 22:21:37 -07:00
|
|
|
fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *instr,
|
|
|
|
|
const fs_reg &urb_handle)
|
2021-10-29 12:45:17 -07:00
|
|
|
{
|
|
|
|
|
fs_reg src = get_nir_src(instr->src[0]);
|
|
|
|
|
nir_src *offset_nir_src = nir_get_io_offset_src(instr);
|
|
|
|
|
|
2022-11-10 20:29:54 +01:00
|
|
|
if (nir_src_is_const(*offset_nir_src)) {
|
2023-02-01 17:23:25 +01:00
|
|
|
emit_urb_direct_writes(bld, instr, src, urb_handle);
|
2022-11-10 20:29:54 +01:00
|
|
|
} else {
|
|
|
|
|
bool use_mod = false;
|
|
|
|
|
unsigned mod;
|
|
|
|
|
|
2023-08-01 12:24:31 -04:00
|
|
|
/* Try to calculate the value of (offset + base) % 4. If we can do
|
|
|
|
|
* this, then we can do indirect writes using only 1 URB write.
|
|
|
|
|
*/
|
2023-08-15 10:07:24 -05:00
|
|
|
use_mod = nir_mod_analysis(nir_get_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod);
|
2023-08-01 12:24:31 -04:00
|
|
|
if (use_mod) {
|
|
|
|
|
mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr);
|
|
|
|
|
mod %= 4;
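         /* For example (illustrative), if the analysis proves the offset is
          * congruent to 2 (mod 4) and base + component is 3, then
          * mod = (2 + 3) % 4 = 1, i.e. the value starts at dword 1 of a vec4. */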
|
2022-11-10 20:29:54 +01:00
|
|
|
}
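As a hypothetical worked example of the folding just above (the numbers are
illustrative and not taken from the source): if the modulo analysis proves
the dynamic dword offset is congruent to 2 (mod 4) and the intrinsic's base
plus component adds 5 more dwords, the value starts at dword (2 + 5) % 4 = 3
of the aligned vec4.

#include <cstdio>

int main()
{
   unsigned mod = 2;   /* assumed result of the modulo-4 analysis         */
   mod += 5;           /* assumed nir_intrinsic_base() + component        */
   mod %= 4;           /* == 3: the value starts at the vec4's last dword */
   std::printf("start dword within the vec4: %u\n", mod);
   return 0;
}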
|
|
|
|
|
|
|
|
|
|
if (use_mod) {
|
2023-02-01 17:23:25 +01:00
|
|
|
emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(*offset_nir_src), urb_handle, mod);
|
2022-11-10 20:29:54 +01:00
|
|
|
} else {
|
2023-02-01 17:23:25 +01:00
|
|
|
emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), urb_handle);
|
2022-11-10 20:29:54 +01:00
|
|
|
}
|
|
|
|
|
}
|
2021-10-29 12:45:17 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2022-08-21 22:21:37 -07:00
|
|
|
fs_visitor::emit_task_mesh_load(const fs_builder &bld, nir_intrinsic_instr *instr,
|
2023-02-01 17:23:25 +01:00
|
|
|
const fs_reg &urb_handle)
|
2021-10-29 12:45:17 -07:00
|
|
|
{
|
2023-08-14 11:56:00 -05:00
|
|
|
fs_reg dest = get_nir_def(instr->def);
|
2021-10-29 12:45:17 -07:00
|
|
|
nir_src *offset_nir_src = nir_get_io_offset_src(instr);
|
|
|
|
|
|
2021-10-29 12:56:22 -07:00
|
|
|
/* TODO(mesh): for per_vertex and per_primitive, if we could keep around
|
|
|
|
|
* the non-array-index offset, we could use it to decide if we can perform
|
|
|
|
|
* a single large aligned read instead of one per component.
|
|
|
|
|
*/
|
|
|
|
|
|
2021-10-29 12:45:17 -07:00
|
|
|
if (nir_src_is_const(*offset_nir_src))
|
2023-02-01 17:23:25 +01:00
|
|
|
emit_urb_direct_reads(bld, instr, dest, urb_handle);
|
2021-10-29 12:45:17 -07:00
|
|
|
else
|
2023-02-01 17:23:25 +01:00
|
|
|
emit_urb_indirect_reads(bld, instr, dest, get_nir_src(*offset_nir_src), urb_handle);
|
2021-10-29 12:45:17 -07:00
|
|
|
}
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::nir_emit_task_intrinsic(const fs_builder &bld,
|
|
|
|
|
nir_intrinsic_instr *instr)
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_TASK);
|
2022-08-21 23:05:08 -07:00
|
|
|
const task_mesh_thread_payload &payload = task_mesh_payload();
|
2022-08-21 22:21:37 -07:00
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
switch (instr->intrinsic) {
|
|
|
|
|
case nir_intrinsic_store_output:
|
2022-02-14 16:36:32 -08:00
|
|
|
case nir_intrinsic_store_task_payload:
|
2022-08-21 23:05:08 -07:00
|
|
|
emit_task_mesh_store(bld, instr, payload.urb_output);
|
2021-10-29 12:45:17 -07:00
|
|
|
break;
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
case nir_intrinsic_load_output:
|
2022-02-14 16:36:32 -08:00
|
|
|
case nir_intrinsic_load_task_payload:
|
2023-02-01 17:23:25 +01:00
|
|
|
emit_task_mesh_load(bld, instr, payload.urb_output);
|
2021-10-29 12:27:45 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
nir_emit_task_mesh_intrinsic(bld, instr);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
fs_visitor::nir_emit_mesh_intrinsic(const fs_builder &bld,
|
|
|
|
|
nir_intrinsic_instr *instr)
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_MESH);
|
2022-08-21 23:05:08 -07:00
|
|
|
const task_mesh_thread_payload &payload = task_mesh_payload();
|
2022-08-21 22:21:37 -07:00
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
switch (instr->intrinsic) {
|
|
|
|
|
case nir_intrinsic_store_per_primitive_output:
|
|
|
|
|
case nir_intrinsic_store_per_vertex_output:
|
|
|
|
|
case nir_intrinsic_store_output:
|
2022-08-21 23:05:08 -07:00
|
|
|
emit_task_mesh_store(bld, instr, payload.urb_output);
|
2021-10-29 12:27:45 -07:00
|
|
|
break;
|
|
|
|
|
|
2021-10-29 12:56:22 -07:00
|
|
|
case nir_intrinsic_load_per_vertex_output:
|
|
|
|
|
case nir_intrinsic_load_per_primitive_output:
|
|
|
|
|
case nir_intrinsic_load_output:
|
2023-02-01 17:23:25 +01:00
|
|
|
emit_task_mesh_load(bld, instr, payload.urb_output);
|
2022-08-21 22:21:37 -07:00
|
|
|
break;
|
|
|
|
|
|
2022-02-14 16:36:32 -08:00
|
|
|
case nir_intrinsic_load_task_payload:
|
2023-02-01 17:23:25 +01:00
|
|
|
emit_task_mesh_load(bld, instr, payload.task_urb_input);
|
2021-10-29 12:45:17 -07:00
|
|
|
break;
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
default:
|
|
|
|
|
nir_emit_task_mesh_intrinsic(bld, instr);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
fs_visitor::nir_emit_task_mesh_intrinsic(const fs_builder &bld,
|
|
|
|
|
nir_intrinsic_instr *instr)
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK);
|
2022-08-21 23:05:08 -07:00
|
|
|
const task_mesh_thread_payload &payload = task_mesh_payload();
|
2021-10-29 12:27:45 -07:00
|
|
|
|
2021-10-29 12:48:54 -07:00
|
|
|
fs_reg dest;
|
|
|
|
|
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
|
2023-08-14 11:56:00 -05:00
|
|
|
dest = get_nir_def(instr->def);
|
2021-10-29 12:48:54 -07:00
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
switch (instr->intrinsic) {
|
2022-08-21 23:05:08 -07:00
|
|
|
case nir_intrinsic_load_mesh_inline_data_intel: {
|
|
|
|
|
fs_reg data = offset(payload.inline_parameter, 1, nir_intrinsic_align_offset(instr));
|
|
|
|
|
bld.MOV(dest, retype(data, dest.type));
|
2021-07-12 13:43:03 +02:00
|
|
|
break;
|
2022-08-21 23:05:08 -07:00
|
|
|
}
|
2021-07-12 13:43:03 +02:00
|
|
|
|
2021-07-16 15:03:20 +02:00
|
|
|
case nir_intrinsic_load_draw_id:
|
2022-09-29 16:47:32 +02:00
|
|
|
dest = retype(dest, BRW_REGISTER_TYPE_UD);
|
2022-08-21 23:05:08 -07:00
|
|
|
bld.MOV(dest, payload.extended_parameter_0);
|
2021-07-16 15:03:20 +02:00
|
|
|
break;
|
|
|
|
|
|
2021-10-29 12:48:54 -07:00
|
|
|
case nir_intrinsic_load_local_invocation_id:
|
2023-07-10 14:05:37 +02:00
|
|
|
unreachable("local invocation id should have been lowered earlier");
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_load_local_invocation_index:
|
2021-10-29 12:48:54 -07:00
|
|
|
dest = retype(dest, BRW_REGISTER_TYPE_UD);
|
2022-08-21 23:05:08 -07:00
|
|
|
bld.MOV(dest, payload.local_index);
|
2021-10-29 12:48:54 -07:00
|
|
|
break;
|
|
|
|
|
|
2022-04-30 13:06:42 +02:00
|
|
|
case nir_intrinsic_load_num_workgroups:
|
|
|
|
|
dest = retype(dest, BRW_REGISTER_TYPE_UD);
|
2023-04-05 12:16:33 +02:00
|
|
|
bld.MOV(offset(dest, bld, 0), brw_uw1_grf(0, 13)); /* g0.6 >> 16 */
|
|
|
|
|
bld.MOV(offset(dest, bld, 1), brw_uw1_grf(0, 8)); /* g0.4 & 0xffff */
|
|
|
|
|
bld.MOV(offset(dest, bld, 2), brw_uw1_grf(0, 9)); /* g0.4 >> 16 */
|
2022-04-30 13:06:42 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_load_workgroup_index:
|
|
|
|
|
dest = retype(dest, BRW_REGISTER_TYPE_UD);
|
|
|
|
|
bld.MOV(dest, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
|
|
|
|
|
break;
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
default:
|
|
|
|
|
nir_emit_cs_intrinsic(bld, instr);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|