diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index b1deca378e8..c39c8f5809a 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -426,6 +426,9 @@ struct brw_task_prog_key
 struct brw_mesh_prog_key
 {
    struct brw_base_prog_key base;
+
+   bool compact_mue:1;
+   unsigned padding:31;
 };
 
 enum brw_sf_primitive {
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index 759a5336487..0c7eacd753a 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -763,7 +763,7 @@ brw_compute_mue_layout(const struct brw_compiler *compiler,
 static void
 brw_compute_mue_map(const struct brw_compiler *compiler,
                     struct nir_shader *nir, struct brw_mue_map *map,
-                    enum brw_mesh_index_format index_format)
+                    enum brw_mesh_index_format index_format, bool compact_mue)
 {
    memset(map, 0, sizeof(*map));
 
@@ -823,21 +823,17 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
       ~(per_primitive_header_bits | per_vertex_header_bits);
 
    /* packing into prim header is possible only if prim header is present */
-   map->user_data_in_primitive_header =
+   map->user_data_in_primitive_header = compact_mue &&
          (outputs_written & per_primitive_header_bits) != 0;
 
    /* Packing into vert header is always possible, but we allow it only
     * if full vec4 is available (so point size is not used) and there's
     * nothing between it and normal vertex data (so no clip distances).
     */
-   map->user_data_in_vertex_header =
+   map->user_data_in_vertex_header = compact_mue &&
          (outputs_written & per_vertex_header_bits) ==
                BITFIELD64_BIT(VARYING_SLOT_POS);
 
-   brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
-                          &map->user_data_in_primitive_header,
-                          &map->user_data_in_vertex_header);
-
    if (outputs_written & per_primitive_header_bits) {
       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
          map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
@@ -871,39 +867,80 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
 
    map->per_primitive_data_size_dw = 0;
 
-   unsigned start_dw = map->per_primitive_start_dw;
-   if (map->user_data_in_primitive_header)
-      start_dw += 4; /* first 4 dwords are used */
-   else
-      start_dw += map->per_primitive_header_size_dw;
-   unsigned header_used_dw = 0;
+   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
+    * will be compiled without access to the MUE map and won't be able to find
+    * out where everything is.
+    * Instead, keep doing things as we did before the packing, just laying out
+    * everything in varying order, which is how the FS will expect them.
+    */
+   if (compact_mue) {
+      brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
+                             &map->user_data_in_primitive_header,
+                             &map->user_data_in_vertex_header);
+
+      unsigned start_dw = map->per_primitive_start_dw;
+      if (map->user_data_in_primitive_header)
+         start_dw += 4; /* first 4 dwords are used */
+      else
+         start_dw += map->per_primitive_header_size_dw;
+      unsigned header_used_dw = 0;
+
+      for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
+         int location = (*it).location;
+         if (location < 0) {
+            start_dw += (*it).dwords;
+            if (map->user_data_in_primitive_header && header_used_dw < 4)
+               header_used_dw += (*it).dwords;
+            else
+               map->per_primitive_data_size_dw += (*it).dwords;
+            assert(header_used_dw <= 4);
+            continue;
+         }
+
+         assert(map->start_dw[location] == -1);
+
+         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
+                location >= VARYING_SLOT_VAR0);
+
+         brw_mue_assign_position(&*it, map, start_dw);
 
-   for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
-      int location = (*it).location;
-      if (location < 0) {
          start_dw += (*it).dwords;
          if (map->user_data_in_primitive_header && header_used_dw < 4)
            header_used_dw += (*it).dwords;
         else
            map->per_primitive_data_size_dw += (*it).dwords;
          assert(header_used_dw <= 4);
-         continue;
+         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
       }
+   } else {
+      unsigned start_dw = map->per_primitive_start_dw +
+                          map->per_primitive_header_size_dw;
 
-      assert(map->start_dw[location] == -1);
+      uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
+      while (per_prim_outputs) {
+         uint64_t location = ffsl(per_prim_outputs) - 1;
 
-      assert(location == VARYING_SLOT_PRIMITIVE_ID ||
-             location >= VARYING_SLOT_VAR0);
+         assert(map->start_dw[location] == -1);
+         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
+                location >= VARYING_SLOT_VAR0);
 
-      brw_mue_assign_position(&*it, map, start_dw);
+         nir_variable *var =
+            brw_nir_find_complete_variable_with_location(nir,
+                                                         nir_var_shader_out,
+                                                         location);
+         struct attr_desc d;
+         d.location = location;
+         d.type = brw_nir_get_var_type(nir, var);
+         d.dwords = glsl_count_dword_slots(d.type, false);
+         d.slots = glsl_count_attribute_slots(d.type, false);
 
-      start_dw += (*it).dwords;
-      if (map->user_data_in_primitive_header && header_used_dw < 4)
-         header_used_dw += (*it).dwords;
-      else
-         map->per_primitive_data_size_dw += (*it).dwords;
-      assert(header_used_dw <= 4);
-      outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+         brw_mue_assign_position(&d, map, start_dw);
+
+         map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
+         start_dw += ALIGN(d.dwords, 4);
+
+         per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
+      }
    }
 
    map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
@@ -951,15 +988,40 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
 
    map->per_vertex_data_size_dw = 0;
 
-   start_dw = map->per_vertex_start_dw;
-   if (!map->user_data_in_vertex_header)
-      start_dw += map->per_vertex_header_size_dw;
+   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
+    * will be compiled without access to the MUE map and won't be able to find
+    * out where everything is.
+    * Instead, keep doing things as we did before the packing, just laying out
+    * everything in varying order, which is how the FS will expect them.
+    */
+   if (compact_mue) {
+      unsigned start_dw = map->per_vertex_start_dw;
+      if (!map->user_data_in_vertex_header)
+         start_dw += map->per_vertex_header_size_dw;
+
+      unsigned header_used_dw = 0;
+      for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
+         for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
+            int location = (*it).location;
+            if (location < 0) {
+               start_dw += (*it).dwords;
+               if (map->user_data_in_vertex_header && header_used_dw < 4) {
+                  header_used_dw += (*it).dwords;
+                  assert(header_used_dw <= 4);
+                  if (header_used_dw == 4)
+                     start_dw += 4; /* jump over gl_position */
+               } else {
+                  map->per_vertex_data_size_dw += (*it).dwords;
+               }
+               continue;
+            }
+
+            assert(map->start_dw[location] == -1);
+
+            assert(location >= VARYING_SLOT_VAR0);
+
+            brw_mue_assign_position(&*it, map, start_dw);
 
-   header_used_dw = 0;
-   for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
-      for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
-         int location = (*it).location;
-         if (location < 0) {
             start_dw += (*it).dwords;
             if (map->user_data_in_vertex_header && header_used_dw < 4) {
                header_used_dw += (*it).dwords;
@@ -969,25 +1031,36 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
             } else {
                map->per_vertex_data_size_dw += (*it).dwords;
             }
-            continue;
+            outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
          }
+      }
+   } else {
+      unsigned start_dw = map->per_vertex_start_dw +
+                          map->per_vertex_header_size_dw;
+
+      uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
+      while (per_vertex_outputs) {
+         uint64_t location = ffsl(per_vertex_outputs) - 1;
 
          assert(map->start_dw[location] == -1);
-
          assert(location >= VARYING_SLOT_VAR0);
 
-         brw_mue_assign_position(&*it, map, start_dw);
+         nir_variable *var =
+            brw_nir_find_complete_variable_with_location(nir,
+                                                         nir_var_shader_out,
+                                                         location);
+         struct attr_desc d;
+         d.location = location;
+         d.type = brw_nir_get_var_type(nir, var);
+         d.dwords = glsl_count_dword_slots(d.type, false);
+         d.slots = glsl_count_attribute_slots(d.type, false);
 
-         start_dw += (*it).dwords;
-         if (map->user_data_in_vertex_header && header_used_dw < 4) {
-            header_used_dw += (*it).dwords;
-            assert(header_used_dw <= 4);
-            if (header_used_dw == 4)
-               start_dw += 4; /* jump over gl_position */
-         } else {
-            map->per_vertex_data_size_dw += (*it).dwords;
-         }
-         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+         brw_mue_assign_position(&d, map, start_dw);
+
+         map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
+         start_dw += ALIGN(d.dwords, 4);
+
+         per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
       }
    }
@@ -1435,7 +1508,8 @@ brw_compile_mesh(const struct brw_compiler *compiler,
 
    brw_nir_lower_tue_inputs(nir, params->tue_map);
 
-   brw_compute_mue_map(compiler, nir, &prog_data->map, prog_data->index_format);
+   brw_compute_mue_map(compiler, nir, &prog_data->map,
+                       prog_data->index_format, key->compact_mue);
    brw_nir_lower_mue_outputs(nir, &prog_data->map);
 
    brw_simd_selection_state simd_state{
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 5bbcd6440fe..5e9ea5ed1c7 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -554,11 +554,14 @@ populate_task_prog_key(struct anv_pipeline_stage *stage,
 
 static void
 populate_mesh_prog_key(struct anv_pipeline_stage *stage,
-                       const struct anv_device *device)
+                       const struct anv_device *device,
+                       bool compact_mue)
 {
    memset(&stage->key, 0, sizeof(stage->key));
 
    populate_base_prog_key(stage, device);
+
+   stage->key.mesh.compact_mue = compact_mue;
 }
 
 static uint32_t
@@ -1737,9 +1740,13 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
          populate_task_prog_key(&stages[s], device);
          break;
 
-      case MESA_SHADER_MESH:
-         populate_mesh_prog_key(&stages[s], device);
+      case MESA_SHADER_MESH: {
+         const bool compact_mue =
+            !(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB &&
+              !anv_pipeline_base_has_stage(pipeline, MESA_SHADER_FRAGMENT));
+         populate_mesh_prog_key(&stages[s], device, compact_mue);
          break;
+      }
 
       default:
          unreachable("Invalid graphics shader stage");
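
The non-compact ("else") paths above boil down to one rule: walk the written-outputs bitmask from the lowest VARYING_SLOT upward and give every output a whole number of vec4 slots (ALIGN(dwords, 4)), with no packing and no reordering. Because the resulting offsets depend only on the output bitmask and the attribute types, a fragment shader compiled later, without the MUE map, can recompute the same offsets. A minimal standalone sketch of that rule follows (not Mesa code; the names are illustrative and each output is assumed to occupy a single varying slot):

#include <stdint.h>
#include <stdio.h>

#define ALIGN4(x) (((x) + 3u) & ~3u)

int main(void)
{
   /* Dwords per varying slot for a made-up mesh shader:
    * slot 0 holds a vec3, slot 2 a float, slot 5 a vec4. */
   unsigned dwords[64] = {0};
   dwords[0] = 3;
   dwords[2] = 1;
   dwords[5] = 4;
   uint64_t written = (1ull << 0) | (1ull << 2) | (1ull << 5);

   unsigned start_dw = 0; /* stands in for start + header size */

   while (written) {
      unsigned location = __builtin_ctzll(written); /* lowest written slot */
      printf("slot %u -> dword %u (%u dwords, padded to %u)\n",
             location, start_dw, dwords[location], ALIGN4(dwords[location]));
      start_dw += ALIGN4(dwords[location]); /* vec4 granularity, no packing */
      written &= written - 1;               /* clear lowest set bit */
   }
   return 0;
}

For the example bitmask this places slot 0 at dword 0, slot 2 at dword 4, and slot 5 at dword 8; a compacted layout would instead let slot 2's single float share the padding of slot 0.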
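On the anv side, the new key bit is just the condition from the last hunk: packing is disabled only for a graphics pipeline library that contains the mesh stage but not the fragment stage, since that is the one case where the FS is compiled elsewhere. A simplified restatement of that boolean (sketch only, not the driver code; the function name is made up):

/* Compact (packed) MUE layouts are safe whenever the fragment shader is
 * compiled together with the mesh shader and can consult the MUE map. */
static bool
want_compact_mue(bool is_graphics_lib, bool has_fragment_stage)
{
   return !(is_graphics_lib && !has_fragment_stage);
}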