From c1685f08dd4edcf505d5e9a150b70ccbbb989fa5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20=C5=9Alusarz?=
Date: Wed, 21 Dec 2022 15:42:55 +0100
Subject: [PATCH] intel/compiler,anv: put some vertex and primitive data in
 headers

Both per-primitive and per-vertex space in the MUE is allocated in
8-dword chunks, and those 8-dword chunks (the granularity of
3DSTATE_SBE_MESH.Per[Primitive|Vertex]URBEntryOutputReadLength) are
passed to fragment shaders as inputs (non-interpolated for
per-primitive and flat vertex attributes, interpolated for non-flat
vertex attributes).

Some attributes have a special meaning and must be placed in a
separate 8- or 16-dword slot called the Primitive Header or the
Vertex Header.

The Primitive Header contains 4 such attributes (Cull Primitive,
ViewportIndex, RTAIndex, CPS), leaving 4 dwords (the rest of the
8-dword slot) potentially unused. The Vertex Header is similar - it
starts with 3 unused dwords and 1 dword for Point Size (which we can
reuse if we declare that the shader doesn't produce Point Size),
followed by 4 dwords for Position and optionally 8 dwords for clip
distances.

This means we have an interesting optimization problem - we can put
some user attributes into the holes in the Primitive and Vertex
Headers, which may lead to a smaller MUE and potentially more mesh
threads running in parallel, but we have to be careful to use those
holes only when it actually helps, because otherwise we could force
the HW to pass too much data to the fragment shader.

Example 1:
Let's assume that the Primitive Header is enabled and the user defined
12 dwords of per-primitive attributes.
Without packing we would consume 8 + ALIGN(12, 8) = 24 dwords of MUE
space and pass ALIGN(12, 8) = 16 dwords to the fragment shader.
With packing we'll consume 4 + 4 + ALIGN(12 - 4, 8) = 16 dwords of MUE
space and pass ALIGN(4, 8) + ALIGN(12 - 4, 8) = 16 dwords to the
fragment shader.
16/16 is better than 24/16, so packing makes sense.

Example 2:
Now let's assume that the Primitive Header is enabled and the user
defined 16 dwords of per-primitive attributes.
Without packing we would consume 8 + ALIGN(16, 8) = 24 dwords of MUE
space and pass ALIGN(16, 8) = 16 dwords to the fragment shader.
With packing we'll consume 4 + 4 + ALIGN(16 - 4, 8) = 24 dwords of MUE
space and pass ALIGN(4, 8) + ALIGN(16 - 4, 8) = 24 dwords to the
fragment shader.
24/24 is worse than 24/16, so packing doesn't make sense here.

This change doesn't affect vk_meshlet_cadscene in its default
configuration, but it speeds it up by up to 25% with
"-extraattributes N", where N is a small value not divisible by 2
(by default N == 1) and we are bound by URB size.
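The size arithmetic from the two examples above can be sanity-checked
with a small standalone program. Illustrative only, not part of this
patch: ALIGN and the helper names (mue_dwords, fs_read_dwords) are
defined here just for the sketch, assuming the 4-dword user hole in
the 8-dword Primitive Header described earlier.

  #include <stdio.h>

  #define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))

  /* hypothetical helpers, named for this sketch only */
  static unsigned mue_dwords(unsigned user_dw, int packed)
  {
     /* packed: 4 header dwords + 4-dword hole + remaining user data */
     return packed ? 4 + 4 + ALIGN(user_dw - 4, 8)
                   : 8 + ALIGN(user_dw, 8);
  }

  static unsigned fs_read_dwords(unsigned user_dw, int packed)
  {
     /* packed: the header slot is also read by the fragment shader */
     return packed ? ALIGN(4, 8) + ALIGN(user_dw - 4, 8)
                   : ALIGN(user_dw, 8);
  }

  int main(void)
  {
     /* reproduces Example 1 (12 dwords) and Example 2 (16 dwords) */
     for (unsigned user_dw = 12; user_dw <= 16; user_dw += 4)
        printf("%u user dwords: unpacked %u/%u, packed %u/%u\n", user_dw,
               mue_dwords(user_dw, 0), fs_read_dwords(user_dw, 0),
               mue_dwords(user_dw, 1), fs_read_dwords(user_dw, 1));
     return 0;
  }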
Reviewed-by: Ivan Briano
Part-of:
---
 src/intel/compiler/brw_compiler.h |   2 +
 src/intel/compiler/brw_fs.cpp     |  12 +-
 src/intel/compiler/brw_mesh.cpp   | 191 +++++++++++++++++++++++++++---
 src/intel/vulkan/genX_pipeline.c  |   8 +-
 4 files changed, 193 insertions(+), 20 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 87e20596716..6632b4c119b 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -1636,12 +1636,14 @@ struct brw_mue_map {
    uint32_t per_primitive_header_size_dw;
    uint32_t per_primitive_data_size_dw;
    uint32_t per_primitive_pitch_dw;
+   bool user_data_in_primitive_header;
 
    uint32_t max_vertices;
    uint32_t per_vertex_start_dw;
    uint32_t per_vertex_header_size_dw;
    uint32_t per_vertex_data_size_dw;
    uint32_t per_vertex_pitch_dw;
+   bool user_data_in_vertex_header;
 };
 
 struct brw_task_prog_data {
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 2f6d33cef7e..e7f2d4c26ad 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1794,7 +1794,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
          VARYING_BIT_PRIMITIVE_SHADING_RATE;
       bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
 
-      if (reads_header) {
+      if (reads_header || mue_map->user_data_in_primitive_header) {
          /* Primitive Shading Rate, Layer and Viewport live in the same
           * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
           * is dword 2).
@@ -1849,9 +1849,13 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
          unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
          unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
 
-         /* Per-Vertex header is never available to fragment shader. */
-         per_vertex_start_dw += 8;
-         per_vertex_size_dw -= 8;
+         /* Per-Vertex header is available to fragment shader only if there's
+          * user data there.
+          */
+         if (!mue_map->user_data_in_vertex_header) {
+            per_vertex_start_dw += 8;
+            per_vertex_size_dw -= 8;
+         }
 
          /* In Mesh, CLIP_DIST slots are always at the beginning, because
           * they come from MUE Vertex Header, not Per-Vertex Attributes.
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index cf4b589341e..6af1a19e994 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -438,7 +438,7 @@ struct attr_type_info {
    std::list<struct attr_desc> *order;
 
    /* attributes after which there's hole of size equal to array index */
-   std::list<int> holes[4];
+   std::list<int> holes[5];
 };
 
 static void
@@ -490,22 +490,71 @@ brw_nir_find_complete_variable_with_location(nir_shader *shader,
    return best_var;
 }
 
+static unsigned
+brw_sum_size(const std::list<struct attr_desc> &orders)
+{
+   unsigned sz = 0;
+   for (auto it = orders.cbegin(); it != orders.cend(); ++it)
+      sz += (*it).dwords;
+   return sz;
+}
+
 /* Finds order of outputs which require minimum size, without splitting
  * of URB read/write messages (which operate on vec4-aligned memory).
  */
 static void
 brw_compute_mue_layout(std::list<struct attr_desc> *orders,
                        uint64_t outputs_written,
-                       struct nir_shader *nir)
+                       struct nir_shader *nir,
+                       bool *pack_prim_data_into_header,
+                       bool *pack_vert_data_into_header)
 {
    const struct shader_info *info = &nir->info;
 
    struct attr_type_info data[3];
 
    bool no_compact = !debug_get_bool_option("BRW_MESH_COMPACTION", true);
+   unsigned header_packing =
+      (unsigned)debug_get_num_option("BRW_MESH_HEADER_PACKING", 3);
+
+   if ((header_packing & 1) == 0)
+      *pack_prim_data_into_header = false;
+   if ((header_packing & 2) == 0)
+      *pack_vert_data_into_header = false;
 
    for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
       data[i].order = &orders[i];
 
+   /* If packing into header is enabled, add a hole of size 4 and add
+    * a virtual location to keep the algorithm happy (it expects holes
+    * to be preceded by some location). We'll remove those virtual
+    * locations at the end.
+    */
+   const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
+   assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);
+
+   struct attr_desc d;
+   d.location = virtual_header_location;
+   d.type = NULL;
+   d.dwords = 0;
+   d.slots = 0;
+
+   struct attr_desc h;
+   h.location = -1;
+   h.type = NULL;
+   h.dwords = 4;
+   h.slots = 0;
+
+   if (*pack_prim_data_into_header) {
+      orders[PRIM].push_back(d);
+      orders[PRIM].push_back(h);
+      data[PRIM].holes[4].push_back(virtual_header_location);
+   }
+
+   if (*pack_vert_data_into_header) {
+      orders[VERT].push_back(d);
+      orders[VERT].push_back(h);
+      data[VERT].holes[4].push_back(virtual_header_location);
+   }
+
    u_foreach_bit64(location, outputs_written) {
       if ((BITFIELD64_BIT(location) & outputs_written) == 0)
          continue;
@@ -519,7 +568,6 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
                                                         nir_var_shader_out,
                                                         location);
 
-      struct attr_desc d;
       d.location = location;
       d.type = brw_nir_get_var_type(nir, var);
       d.dwords = glsl_count_dword_slots(d.type, false);
@@ -539,13 +587,26 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
 
       outputs_written &= ~BITFIELD64_RANGE(location, d.slots);
 
+      /* special case to use hole of size 4 */
+      if (d.dwords == 4 && !holes[4].empty()) {
+         holes[4].pop_back();
+
+         assert(order->front().location == virtual_header_location);
+         order->pop_front();
+
+         assert(order->front().location == -1);
+         assert(order->front().dwords == 4);
+         order->front() = d;
+
+         continue;
+      }
+
       int mod = d.dwords % 4;
       if (mod == 0) {
          order->push_back(d);
         continue;
       }
 
-      struct attr_desc h;
       h.location = -1;
       h.type = NULL;
       h.dwords = 4 - mod;
@@ -568,7 +629,7 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
 
       unsigned found = 0;
       /* try to find the smallest hole big enough to hold this attribute */
-      for (unsigned sz = d.dwords; sz < 4; sz++){
+      for (unsigned sz = d.dwords; sz <= 4; sz++){
          if (!holes[sz].empty()) {
             found = sz;
             break;
@@ -584,7 +645,7 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
          continue;
       }
 
-      assert(found < 4);
+      assert(found <= 4);
       assert(!holes[found].empty());
       int after_loc = holes[found].back();
       holes[found].pop_back();
@@ -632,6 +693,61 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
 
       assert(inserted_back);
    }
+
+   if (*pack_prim_data_into_header) {
+      if (orders[PRIM].front().location == virtual_header_location)
+         orders[PRIM].pop_front();
+
+      if (!data[PRIM].holes[4].empty()) {
+         *pack_prim_data_into_header = false;
+
+         assert(orders[PRIM].front().location == -1);
+         assert(orders[PRIM].front().dwords == 4);
+         orders[PRIM].pop_front();
+      }
+
+      if (*pack_prim_data_into_header) {
+         unsigned sz = brw_sum_size(orders[PRIM]);
+
+         if (sz % 8 == 0 || sz % 8 > 4)
+            *pack_prim_data_into_header = false;
+      }
+   }
+
+   if (*pack_vert_data_into_header) {
+      if (orders[VERT].front().location == virtual_header_location)
+         orders[VERT].pop_front();
+
+      if (!data[VERT].holes[4].empty()) {
+         *pack_vert_data_into_header = false;
+
+         assert(orders[VERT].front().location == -1);
+         assert(orders[VERT].front().dwords == 4);
+         orders[VERT].pop_front();
+      }
+
+      if (*pack_vert_data_into_header) {
+         unsigned sz = brw_sum_size(orders[VERT]) +
+                       brw_sum_size(orders[VERT_FLAT]);
+
+         if (sz % 8 == 0 || sz % 8 > 4)
+            *pack_vert_data_into_header = false;
+      }
+   }
+
+
+   if (INTEL_DEBUG(DEBUG_MESH)) {
+      fprintf(stderr, "MUE attribute order:\n");
+      for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
+         if (!orders[i].empty())
+            fprintf(stderr, "%d: ", i);
+         for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
+            fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
+         }
+         if (!orders[i].empty())
+            fprintf(stderr, "\n");
+      }
+   }
 }
 
 /* Mesh URB Entry consists of an initial section
@@ -717,7 +833,22 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
    std::list<struct attr_desc> orders[3];
    uint64_t regular_outputs = outputs_written &
       ~(per_primitive_header_bits | per_vertex_header_bits);
-   brw_compute_mue_layout(orders, regular_outputs, nir);
+
+   /* packing into prim header is possible only if prim header is present */
+   map->user_data_in_primitive_header =
+      (outputs_written & per_primitive_header_bits) != 0;
+
+   /* Packing into vert header is always possible, but we allow it only
+    * if full vec4 is available (so point size is not used) and there's
+    * nothing between it and normal vertex data (so no clip distances).
+    */
+   map->user_data_in_vertex_header =
+      (outputs_written & per_vertex_header_bits) ==
+            BITFIELD64_BIT(VARYING_SLOT_POS);
+
+   brw_compute_mue_layout(orders, regular_outputs, nir,
+                          &map->user_data_in_primitive_header,
+                          &map->user_data_in_vertex_header);
 
    if (outputs_written & per_primitive_header_bits) {
       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
@@ -752,13 +883,22 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
 
    map->per_primitive_data_size_dw = 0;
 
-   unsigned start_dw = map->per_primitive_start_dw +
-                       map->per_primitive_header_size_dw;
+   unsigned start_dw = map->per_primitive_start_dw;
+   if (map->user_data_in_primitive_header)
+      start_dw += 4; /* first 4 dwords are used */
+   else
+      start_dw += map->per_primitive_header_size_dw;
+
+   unsigned header_used_dw = 0;
 
    for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
       int location = (*it).location;
       if (location < 0) {
          start_dw += (*it).dwords;
-         map->per_primitive_data_size_dw += (*it).dwords;
+         if (map->user_data_in_primitive_header && header_used_dw < 4)
+            header_used_dw += (*it).dwords;
+         else
+            map->per_primitive_data_size_dw += (*it).dwords;
+         assert(header_used_dw <= 4);
          continue;
       }
@@ -770,7 +910,11 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
       brw_mue_assign_position(&*it, map, start_dw);
 
       start_dw += (*it).dwords;
-      map->per_primitive_data_size_dw += (*it).dwords;
+      if (map->user_data_in_primitive_header && header_used_dw < 4)
+         header_used_dw += (*it).dwords;
+      else
+         map->per_primitive_data_size_dw += (*it).dwords;
+      assert(header_used_dw <= 4);
 
       outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
    }
@@ -819,14 +963,24 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
 
    map->per_vertex_data_size_dw = 0;
 
-   start_dw = map->per_vertex_start_dw +
-              map->per_vertex_header_size_dw;
+   start_dw = map->per_vertex_start_dw;
+   if (!map->user_data_in_vertex_header)
+      start_dw += map->per_vertex_header_size_dw;
+
+   header_used_dw = 0;
 
    for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
       for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
          int location = (*it).location;
          if (location < 0) {
             start_dw += (*it).dwords;
-            map->per_vertex_data_size_dw += (*it).dwords;
+            if (map->user_data_in_vertex_header && header_used_dw < 4) {
+               header_used_dw += (*it).dwords;
+               assert(header_used_dw <= 4);
+               if (header_used_dw == 4)
+                  start_dw += 4; /* jump over gl_position */
+            } else {
+               map->per_vertex_data_size_dw += (*it).dwords;
+            }
             continue;
          }
@@ -837,7 +991,14 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
          brw_mue_assign_position(&*it, map, start_dw);
 
          start_dw += (*it).dwords;
-         map->per_vertex_data_size_dw += (*it).dwords;
+         if (map->user_data_in_vertex_header && header_used_dw < 4) {
+            header_used_dw += (*it).dwords;
+            assert(header_used_dw <= 4);
+            if (header_used_dw == 4)
+               start_dw += 4; /* jump over gl_position */
+         } else {
+            map->per_vertex_data_size_dw += (*it).dwords;
+         }
          outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
       }
    }
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 1292aa8ded1..563a7da9c26 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -558,6 +558,11 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
             sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
          }
 
+         if (mue->user_data_in_vertex_header) {
+            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+         }
+
         assert(mue->per_primitive_header_size_dw % 8 == 0);
         sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8;
         sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
@@ -569,7 +574,8 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
           */
         if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
             wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
-            wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0) {
+            wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
+            mue->user_data_in_primitive_header) {
            assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
            sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
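
Note (illustrative, not part of this patch): the genX_pipeline.c hunks
above adjust the SBE read window in 8-dword units whenever user data
lives in a header. A minimal sketch of that adjustment, with
hypothetical names, assuming the header occupies the 8-dword slot
directly before the per-primitive data:

  #include <assert.h>
  #include <stdbool.h>

  struct read_window { unsigned offset_8dw, length_8dw; };

  static struct read_window
  per_prim_window(unsigned header_dw, unsigned data_dw, bool user_in_header)
  {
     assert(header_dw % 8 == 0);
     struct read_window w = {
        .offset_8dw = header_dw / 8,     /* skip the header...     */
        .length_8dw = (data_dw + 7) / 8, /* ...read only user data */
     };
     if (user_in_header) {
        /* expose the header slot to the fragment shader as well */
        assert(w.offset_8dw > 0);
        w.offset_8dw -= 1;
        w.length_8dw += 1;
     }
     return w;
  }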