diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index e134853ec0f..d1245068bae 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -4678,30 +4678,39 @@ iris_create_so_decl_list(const struct pipe_stream_output_info *info, } static inline int -iris_compute_first_urb_slot_required(uint64_t inputs_read, +iris_compute_first_urb_slot_required(struct iris_compiled_shader *fs_shader, const struct intel_vue_map *prev_stage_vue_map) { #if GFX_VER >= 9 - return brw_compute_first_fs_urb_slot_required(inputs_read, prev_stage_vue_map); + uint32_t read_offset, read_length, num_varyings, primid_offset; + brw_compute_sbe_per_vertex_urb_read(prev_stage_vue_map, + false /* mesh*/, + brw_wm_prog_data(fs_shader->brw_prog_data), + &read_offset, &read_length, &num_varyings, + &primid_offset); + return 2 * read_offset; #else - return elk_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map); + const struct iris_fs_data *fs_data = iris_fs_data(fs_shader); + return elk_compute_first_urb_slot_required(fs_data->inputs, prev_stage_vue_map); #endif } static void -iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots, +iris_compute_sbe_urb_read_interval(struct iris_compiled_shader *fs_shader, const struct intel_vue_map *last_vue_map, bool two_sided_color, unsigned *out_offset, unsigned *out_length) { + const struct iris_fs_data *fs_data = iris_fs_data(fs_shader); + /* The compiler computes the first URB slot without considering COL/BFC * swizzling (because it doesn't know whether it's enabled), so we need * to do that here too. This may result in a smaller offset, which * should be safe. */ const unsigned first_slot = - iris_compute_first_urb_slot_required(fs_input_slots, last_vue_map); + iris_compute_first_urb_slot_required(fs_shader, last_vue_map); /* This becomes the URB read offset (counted in pairs of slots). */ assert(first_slot % 2 == 0); @@ -4710,6 +4719,7 @@ iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots, /* We need to adjust the inputs read to account for front/back color * swizzling, as it can make the URB length longer. 
*/ + uint64_t fs_input_slots = fs_data->inputs; for (int c = 0; c <= 1; c++) { if (fs_input_slots & (VARYING_BIT_COL0 << c)) { /* If two sided color is enabled, the fragment shader's gl_Color @@ -4898,7 +4908,7 @@ iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice) &iris_vue_data(ice->shaders.last_vue_shader)->vue_map; unsigned urb_read_offset, urb_read_length; - iris_compute_sbe_urb_read_interval(fs_data->inputs, + iris_compute_sbe_urb_read_interval(ice->shaders.prog[MESA_SHADER_FRAGMENT], last_vue_map, cso_rast->light_twoside, &urb_read_offset, &urb_read_length); diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp index 0ad3c9cfcab..c7faffeffd7 100644 --- a/src/intel/compiler/brw_compile_fs.cpp +++ b/src/intel/compiler/brw_compile_fs.cpp @@ -630,223 +630,138 @@ static void calculate_urb_setup(const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, - const nir_shader *nir, - const struct brw_mue_map *mue_map) + nir_shader *nir, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) { memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel)); int urb_next = 0; /* in vec4s */ + /* Figure out where the PrimitiveID lives, either in the per-vertex block + * or in the per-primitive block or both. + */ + const uint64_t per_vert_primitive_id = + key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t per_prim_primitive_id = + key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID; const uint64_t inputs_read = - nir->info.inputs_read & ~nir->info.per_primitive_inputs; + nir->info.inputs_read & + (~nir->info.per_primitive_inputs | per_vert_primitive_id); + const uint64_t per_primitive_header_bits = + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; + const uint64_t per_primitive_inputs = + nir->info.inputs_read & + (nir->info.per_primitive_inputs | per_prim_primitive_id) & + ~per_primitive_header_bits; + uint64_t unique_fs_attrs = + inputs_read & BRW_FS_VARYING_INPUT_MASK; + struct intel_vue_map vue_map; + uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX; - /* Figure out where each of the incoming setup attributes lands. */ - if (key->mesh_input != INTEL_NEVER) { - /* Per-Primitive Attributes are laid out by Hardware before the regular - * attributes, so order them like this to make easy later to map setup - * into real HW registers. 
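To make the PrimitiveID handling in calculate_urb_setup() above easier to follow, here is a minimal standalone sketch of how the per-vertex and per-primitive input masks get split based on key->mesh_input. It is illustrative only (it omits the per-primitive header bits that the real code also masks out); INTEL_ALWAYS/INTEL_NEVER and VARYING_BIT_PRIMITIVE_ID are the real identifiers, split_primitive_id() is a made-up helper name:

   /* Illustrative sketch: with mesh_input == INTEL_SOMETIMES the PrimitiveID
    * bit ends up in both masks, which is why brw_wm_prog_data gains a
    * separate per_primitive_inputs field.
    */
   static void
   split_primitive_id(enum intel_sometimes mesh_input,
                      uint64_t inputs_read, uint64_t per_primitive_inputs,
                      uint64_t *out_per_vertex, uint64_t *out_per_primitive)
   {
      const uint64_t per_vert_primitive_id =
         mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID;
      const uint64_t per_prim_primitive_id =
         mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID;

      *out_per_vertex    = inputs_read & (~per_primitive_inputs | per_vert_primitive_id);
      *out_per_primitive = inputs_read & (per_primitive_inputs | per_prim_primitive_id);
   }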
+ if (mue_map != NULL) { + memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map)); + + memcpy(per_primitive_offsets, + mue_map->per_primitive_offsets, + sizeof(mue_map->per_primitive_offsets)); + + u_foreach_bit64(location, per_primitive_inputs) { + assert(per_primitive_offsets[location] != -1); + + first_read_offset = MIN2(first_read_offset, + (uint32_t)per_primitive_offsets[location]); + per_primitive_stride = + MAX2((uint32_t)per_primitive_offsets[location] + 16, + per_primitive_stride); + } + } else { + brw_compute_vue_map(devinfo, &vue_map, inputs_read, + key->base.vue_layout, + 1 /* pos_slots, TODO */); + brw_compute_per_primitive_map(per_primitive_offsets, + &per_primitive_stride, + &first_read_offset, + 0, nir, nir_var_shader_in, + per_primitive_inputs, + true /* separate_shader */); + } + + if (per_primitive_stride > first_read_offset) { + first_read_offset = ROUND_DOWN_TO(first_read_offset, 32); + + /* Remove the first few unused registers */ + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_primitive_offsets[i] == -1) + continue; + per_primitive_offsets[i] -= first_read_offset; + } + + prog_data->num_per_primitive_inputs = + 2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32); + } else { + prog_data->num_per_primitive_inputs = 0; + } + + /* Now do the per-vertex stuff (what used to be legacy pipeline) */ + const uint64_t vue_header_bits = BRW_VUE_HEADER_VARYING_MASK; + + unique_fs_attrs &= ~vue_header_bits; + + /* If Mesh is involved, we cannot do any packing. Documentation doesn't say + * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when + * using Mesh. + */ + if (util_bitcount64(unique_fs_attrs) <= 16 && key->mesh_input == INTEL_NEVER) { + /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do + * arbitrary rearrangement of the first 16 varying inputs, so we can put + * them wherever we want. Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with a + * different vertex (or geometry) shader. */ - if (nir->info.per_primitive_inputs) { - uint64_t per_prim_inputs_read = - nir->info.inputs_read & nir->info.per_primitive_inputs; - - /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots - * are always at the beginning, because they come from MUE - * Primitive Header, not Per-Primitive Attributes. - */ - const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT | - VARYING_BIT_LAYER | - VARYING_BIT_PRIMITIVE_SHADING_RATE; - - if (mue_map) { - unsigned per_prim_start_dw = mue_map->per_primitive_start_dw; - unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw; - - bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0; - - if (reads_header || mue_map->user_data_in_primitive_header) { - /* Primitive Shading Rate, Layer and Viewport live in the same - * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport - * is dword 2). 
- */ - if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE) - prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0; - - if (per_prim_inputs_read & VARYING_BIT_LAYER) - prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; - - if (per_prim_inputs_read & VARYING_BIT_VIEWPORT) - prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0; - - per_prim_inputs_read &= ~primitive_header_bits; - } else { - /* If fs doesn't need primitive header, then it won't be made - * available through SBE_MESH, so we have to skip them when - * calculating offset from start of per-prim data. - */ - per_prim_start_dw += mue_map->per_primitive_header_size_dw; - per_prim_size_dw -= mue_map->per_primitive_header_size_dw; - } - - u_foreach_bit64(i, per_prim_inputs_read) { - int start = mue_map->start_dw[i]; - - assert(start >= 0); - assert(mue_map->len_dw[i] > 0); - - assert(unsigned(start) >= per_prim_start_dw); - unsigned pos_dw = unsigned(start) - per_prim_start_dw; - - prog_data->urb_setup[i] = urb_next + pos_dw / 4; - prog_data->urb_setup_channel[i] = pos_dw % 4; - } - - urb_next = per_prim_size_dw / 4; - } else { - /* With no MUE map, we never read the primitive header, and - * per-primitive attributes won't be packed either, so just lay - * them in varying order. - */ - per_prim_inputs_read &= ~primitive_header_bits; - - for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { - if (per_prim_inputs_read & BITFIELD64_BIT(i)) { - prog_data->urb_setup[i] = urb_next++; - } - } - - /* The actual setup attributes later must be aligned to a full GRF. */ - urb_next = ALIGN(urb_next, 2); - } - - prog_data->num_per_primitive_inputs = urb_next; - } - - const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 | - VARYING_BIT_CLIP_DIST1; - - uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; - - if (inputs_read & clip_dist_bits) { - assert(!mue_map || mue_map->per_vertex_header_size_dw > 8); - unique_fs_attrs &= ~clip_dist_bits; - } - - if (mue_map) { - unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw; - unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw; - - /* Per-Vertex header is available to fragment shader only if there's - * user data there. - */ - if (!mue_map->user_data_in_vertex_header) { - per_vertex_start_dw += 8; - per_vertex_size_dw -= 8; - } - - /* In Mesh, CLIP_DIST slots are always at the beginning, because - * they come from MUE Vertex Header, not Per-Vertex Attributes. - */ - if (inputs_read & clip_dist_bits) { - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next; - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1; - } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) { - /* Clip distances are in MUE, but we are not reading them in FS. */ - per_vertex_start_dw += 8; - per_vertex_size_dw -= 8; - } - - /* Per-Vertex attributes are laid out ordered. Because we always link - * Mesh and Fragment shaders, the which slots are written and read by - * each of them will match. */ - u_foreach_bit64(i, unique_fs_attrs) { - int start = mue_map->start_dw[i]; - - assert(start >= 0); - assert(mue_map->len_dw[i] > 0); - - assert(unsigned(start) >= per_vertex_start_dw); - unsigned pos_dw = unsigned(start) - per_vertex_start_dw; - - prog_data->urb_setup[i] = urb_next + pos_dw / 4; - prog_data->urb_setup_channel[i] = pos_dw % 4; - } - - urb_next += per_vertex_size_dw / 4; - } else { - /* If we don't have an MUE map, just lay down the inputs the FS reads - * in varying order, as we do for the legacy pipeline. 
- */ - if (inputs_read & clip_dist_bits) { - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++; - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++; - } - - for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (unique_fs_attrs & BITFIELD64_BIT(i)) - prog_data->urb_setup[i] = urb_next++; + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits & + BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; } } } else { - assert(!nir->info.per_primitive_inputs); - - const uint64_t vue_header_bits = BRW_VUE_HEADER_VARYING_MASK; - - uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits; - - if (util_bitcount64(unique_fs_attrs) <= 16) { - /* The SF/SBE pipeline stage can do arbitrary rearrangement of the - * first 16 varying inputs, so we can put them wherever we want. - * Just put them in order. - * - * This is useful because it means that (a) inputs not used by the - * fragment shader won't take up valuable register space, and (b) we - * won't have to recompile the fragment shader if it gets paired with - * a different vertex (or geometry) shader. - */ - for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits & - BITFIELD64_BIT(i)) { - prog_data->urb_setup[i] = urb_next++; - } + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them in + * an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). + */ + int first_slot = 0; + for (int i = 0; i < vue_map.num_slots; i++) { + int varying = vue_map.slot_to_varying[i]; + if (varying != BRW_VARYING_SLOT_PAD && varying > 0 && + (inputs_read & BITFIELD64_BIT(varying)) != 0) { + first_slot = ROUND_DOWN_TO(i, 2); + break; } - } else { - /* We have enough input varyings that the SF/SBE pipeline stage can't - * arbitrarily rearrange them to suit our whim; we have to put them - * in an order that matches the output of the previous pipeline stage - * (geometry or vertex shader). - */ - - /* Re-compute the VUE map here in the case that the one coming from - * geometry has more than one position slot (used for Primitive - * Replication). 
- */ - struct intel_vue_map prev_stage_vue_map; - brw_compute_vue_map(devinfo, &prev_stage_vue_map, - key->input_slots_valid, - key->base.vue_layout, 1); - - int first_slot = - brw_compute_first_fs_urb_slot_required(unique_fs_attrs, - &prev_stage_vue_map); - - assert(prev_stage_vue_map.num_slots <= first_slot + 32); - for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; - slot++) { - int varying = prev_stage_vue_map.slot_to_varying[slot]; - if (varying != BRW_VARYING_SLOT_PAD && - (inputs_read & BRW_FS_VARYING_INPUT_MASK & - BITFIELD64_BIT(varying))) { - prog_data->urb_setup[varying] = slot - first_slot; - } - } - urb_next = prev_stage_vue_map.num_slots - first_slot; } + + for (int slot = first_slot; slot < vue_map.num_slots; slot++) { + int varying = vue_map.slot_to_varying[slot]; + if (varying != BRW_VARYING_SLOT_PAD && + (inputs_read & BRW_FS_VARYING_INPUT_MASK & + BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = vue_map.num_slots - first_slot; } - prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs; + prog_data->num_varying_inputs = urb_next; prog_data->inputs = inputs_read; + prog_data->per_primitive_inputs = per_primitive_inputs; brw_compute_urb_setup_index(prog_data); } @@ -951,8 +866,6 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, { prog_data->flat_inputs = 0; - const unsigned per_vertex_start = prog_data->num_per_primitive_inputs; - nir_foreach_shader_in_variable(var, shader) { /* flat shading */ if (var->data.interpolation != INTERP_MODE_FLAT) @@ -963,7 +876,7 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, unsigned slots = glsl_count_attribute_slots(var->type, false); for (unsigned s = 0; s < slots; s++) { - int input_index = prog_data->urb_setup[var->data.location + s] - per_vertex_start; + int input_index = prog_data->urb_setup[var->data.location + s]; if (input_index >= 0) prog_data->flat_inputs |= 1 << input_index; @@ -1005,7 +918,8 @@ brw_nir_populate_wm_prog_data(nir_shader *shader, const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, - const struct brw_mue_map *mue_map) + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) { prog_data->uses_kill = shader->info.fs.uses_discard; prog_data->uses_omask = !key->ignore_sample_mask_out && @@ -1158,7 +1072,7 @@ brw_nir_populate_wm_prog_data(nir_shader *shader, (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && prog_data->coarse_pixel_dispatch != INTEL_NEVER); - calculate_urb_setup(devinfo, key, prog_data, shader, mue_map); + calculate_urb_setup(devinfo, key, prog_data, shader, mue_map, per_primitive_offsets); brw_compute_flat_inputs(prog_data, shader); } @@ -1479,6 +1393,44 @@ run_fs(brw_shader &s, bool allow_spilling, bool do_rep_send) return !s.failed; } +static void +brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data, + int *per_primitive_offsets) +{ + fprintf(fp, "FS URB (inputs=0x%016" PRIx64 ", flat_inputs=0x%08x):\n", + prog_data->inputs, prog_data->flat_inputs); + fprintf(fp, " URB setup:\n"); + for (uint32_t i = 0; i < ARRAY_SIZE(prog_data->urb_setup); i++) { + if (prog_data->urb_setup[i] >= 0) { + fprintf(fp, " [%02d]: %i channel=%u (%s)\n", + i, prog_data->urb_setup[i], prog_data->urb_setup_channel[i], + gl_varying_slot_name_for_stage((gl_varying_slot)i, + MESA_SHADER_FRAGMENT)); + } + } + fprintf(fp, " URB setup attributes:\n"); + for (uint32_t i = 0; i < 
prog_data->urb_setup_attribs_count; i++) { + fprintf(fp, " [%02d]: %i (%s)\n", + i, prog_data->urb_setup_attribs[i], + gl_varying_slot_name_for_stage( + (gl_varying_slot)prog_data->urb_setup_attribs[i], + MESA_SHADER_FRAGMENT)); + } + if (per_primitive_offsets) { + fprintf(fp, " Per Primitive URB setup:\n"); + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_primitive_offsets[i] == -1 || + i == VARYING_SLOT_PRIMITIVE_COUNT || + i == VARYING_SLOT_PRIMITIVE_INDICES) + continue; + fprintf(fp, " [%02d]: %i (%s)\n", + i, per_primitive_offsets[i], + gl_varying_slot_name_for_stage((gl_varying_slot)i, + MESA_SHADER_FRAGMENT)); + } + } +} + const unsigned * brw_compile_fs(const struct brw_compiler *compiler, struct brw_compile_fs_params *params) @@ -1518,8 +1470,15 @@ brw_compile_fs(const struct brw_compiler *compiler, brw_postprocess_nir(nir, compiler, debug_enabled, key->base.robust_flags); + int per_primitive_offsets[VARYING_SLOT_MAX]; + memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); + brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data, - params->mue_map); + params->mue_map, + per_primitive_offsets); + + if (unlikely(debug_enabled)) + brw_print_fs_urb_setup(stderr, prog_data, per_primitive_offsets); /* Either an unrestricted or a fixed SIMD16 subgroup size are * allowed -- The latter is needed for fast clear and replicated @@ -1540,6 +1499,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 8, 1, params->base.stats != NULL, debug_enabled); + v8->import_per_primitive_offsets(per_primitive_offsets); if (!run_fs(*v8, allow_spilling, false /* do_rep_send */)) { params->base.error_str = ralloc_strdup(params->base.mem_ctx, v8->fail_msg); @@ -1645,6 +1605,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 32, 1, params->base.stats != NULL, debug_enabled); + v32->import_per_primitive_offsets(per_primitive_offsets); if (vbase) v32->import_uniforms(vbase); @@ -1670,6 +1631,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 16, 1, params->base.stats != NULL, debug_enabled); + v16->import_per_primitive_offsets(per_primitive_offsets); if (!run_fs(*v16, allow_spilling, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, @@ -1694,6 +1656,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 16, 1, params->base.stats != NULL, debug_enabled); + v16->import_per_primitive_offsets(per_primitive_offsets); if (v8) v16->import_uniforms(v8.get()); if (!run_fs(*v16, allow_spilling, params->use_rep_send)) { @@ -1728,6 +1691,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 32, 1, params->base.stats != NULL, debug_enabled); + v32->import_per_primitive_offsets(per_primitive_offsets); if (v8) v32->import_uniforms(v8.get()); else if (v16) @@ -1772,6 +1736,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 32, 4, params->base.stats != NULL, debug_enabled); + vmulti->import_per_primitive_offsets(per_primitive_offsets); vmulti->import_uniforms(vbase); if (!run_fs(*vmulti, false, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, @@ -1792,6 +1757,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 32, 2, params->base.stats != NULL, debug_enabled); + vmulti->import_per_primitive_offsets(per_primitive_offsets); vmulti->import_uniforms(vbase); if (!run_fs(*vmulti, false, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, @@ -1811,6 +1777,7 @@ 
brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 16, 2, params->base.stats != NULL, debug_enabled); + vmulti->import_per_primitive_offsets(per_primitive_offsets); vmulti->import_uniforms(vbase); if (!run_fs(*vmulti, allow_spilling, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, @@ -1893,31 +1860,110 @@ brw_compile_fs(const struct brw_compiler *compiler, } extern "C" void -brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data) +brw_compute_sbe_per_vertex_urb_read(const struct intel_vue_map *prev_stage_vue_map, + bool mesh, + const struct brw_wm_prog_data *wm_prog_data, + uint32_t *out_read_offset, + uint32_t *out_read_length, + uint32_t *out_num_varyings, + uint32_t *out_primitive_id_offset) { - fprintf(fp, "FS URB (inputs=0x%016lx, flat_inputs=0x%08x):\n", - prog_data->inputs, prog_data->flat_inputs); - fprintf(fp, " URB setup:\n"); - for (uint32_t i = 0; i < ARRAY_SIZE(prog_data->urb_setup); i++) { - if (prog_data->urb_setup[i] >= 0) { - fprintf(fp, " [%02d]: %i channel=%u (%s)\n", - i, prog_data->urb_setup[i], prog_data->urb_setup_channel[i], - gl_varying_slot_name_for_stage((gl_varying_slot)i, - MESA_SHADER_FRAGMENT)); + int first_slot = INT32_MAX, last_slot = -1; + + /* Ignore PrimitiveID in mesh pipelines, this value is coming from the + * per-primitive block. + */ + uint64_t inputs_read = wm_prog_data->inputs; + if (mesh) + inputs_read &= ~VARYING_BIT_PRIMITIVE_ID; + + for (int _i = 0; _i < prev_stage_vue_map->num_slots; _i++) { + uint32_t i = prev_stage_vue_map->num_slots - 1 - _i; + int varying = prev_stage_vue_map->slot_to_varying[i]; + if (varying < 0) + continue; + + if (varying == BRW_VARYING_SLOT_PAD || + (inputs_read & BITFIELD64_BIT(varying)) == 0) + continue; + + last_slot = i; + break; + } + + for (int i = 0; i < prev_stage_vue_map->num_slots; i++) { + int varying = prev_stage_vue_map->slot_to_varying[i]; + if (varying != BRW_VARYING_SLOT_PAD && varying > 0 && + (inputs_read & BITFIELD64_BIT(varying)) != 0) { + first_slot = i; + break; } } - fprintf(fp, " URB setup attributes:\n"); - for (uint32_t i = 0; i < prog_data->urb_setup_attribs_count; i++) { - fprintf(fp, " [%02d]: %i (%s)\n", - i, prog_data->urb_setup_attribs[i], - gl_varying_slot_name_for_stage((gl_varying_slot)i, - MESA_SHADER_FRAGMENT)); + + assert((first_slot == INT32_MAX && last_slot == -1) || + (first_slot >= 0 && last_slot >= 0 && last_slot >= first_slot)); + + uint32_t num_varyings = wm_prog_data->num_varying_inputs; + + /* When using INTEL_VUE_LAYOUT_SEPARATE_MESH, the location of the + * PrimitiveID is unknown at compile time, here we compute the offset + * inside the attribute registers which will be read with MOV_INDIRECT in + * the shader. + */ + *out_primitive_id_offset = 0; + if (prev_stage_vue_map->layout == INTEL_VUE_LAYOUT_SEPARATE_MESH) { + if (mesh) { + /* When using Mesh, the PrimitiveID is in the per-primitive block. */ + if (wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID] >= 0) + num_varyings--; + *out_primitive_id_offset = INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH; + } else if (inputs_read & VARYING_BIT_PRIMITIVE_ID) { + int primitive_id_slot; + if (prev_stage_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] < 0) { + /* If the previous stage doesn't write PrimitiveID, we can have + * the HW generate a value (except if GS is enabled but in that + * case that's undefined). + * + * If the FS shader already has a slot of the PrimitiveID value, + * use that. 
+ */ + if (wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID] >= 0) { + if (first_slot == INT32_MAX) + first_slot = 0; + primitive_id_slot = + first_slot + wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID]; + } else { + primitive_id_slot = ++last_slot; + } + } else { + primitive_id_slot = + prev_stage_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID]; + } + last_slot = MAX2(primitive_id_slot, last_slot); + + *out_primitive_id_offset = 4 * (primitive_id_slot - first_slot); + } + } + + /* Compute the read parameters for SBE (those have to be 32B aligned) */ + if (last_slot == -1) { + *out_read_offset = 0; + *out_read_length = DIV_ROUND_UP(num_varyings, 2); + *out_num_varyings = num_varyings; + } else { + first_slot = ROUND_DOWN_TO(first_slot, 2); + *out_read_offset = first_slot / 2; + *out_read_length = DIV_ROUND_UP(last_slot - first_slot + 1, 2); + *out_num_varyings = num_varyings; } } -extern "C" int -brw_compute_first_fs_urb_slot_required(uint64_t inputs_read, - const struct intel_vue_map *prev_stage_vue_map) +extern "C" void +brw_compute_sbe_per_primitive_urb_read(uint64_t inputs_read, + uint32_t num_varyings, + const struct brw_mue_map *mue_map, + uint32_t *out_read_offset, + uint32_t *out_read_length) { /* The header slots are irrelevant for the URB varying slots. They are * delivered somewhere else in the thread payload. @@ -1927,16 +1973,19 @@ brw_compute_first_fs_urb_slot_required(uint64_t inputs_read, * - LAYER : R1.1, Render Target Array Index * - VIEWPORT : R1.1, Viewport Index * - PSIZ : not available in fragment shaders + * - FACE : R1.1, Front/Back Facing */ - inputs_read &= ~BRW_VUE_HEADER_VARYING_MASK; + inputs_read &= ~(BRW_VUE_HEADER_VARYING_MASK | VARYING_BIT_FACE); - for (int i = 0; i < prev_stage_vue_map->num_slots; i++) { - int varying = prev_stage_vue_map->slot_to_varying[i]; - if (varying != BRW_VARYING_SLOT_PAD && varying > 0 && - (inputs_read & BITFIELD64_BIT(varying)) != 0) { - return ROUND_DOWN_TO(i, 2); - } + uint32_t first_read = UINT32_MAX; + u_foreach_bit64(varying, inputs_read) { + if (mue_map->per_primitive_offsets[varying] < 0) + continue; + + first_read = mue_map->per_primitive_offsets[varying]; + break; } - return 0; + *out_read_offset = DIV_ROUND_UP(first_read, 32); + *out_read_length = DIV_ROUND_UP(num_varyings, 2); } diff --git a/src/intel/compiler/brw_compile_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp index 82562e358a6..5939e3a42e0 100644 --- a/src/intel/compiler/brw_compile_mesh.cpp +++ b/src/intel/compiler/brw_compile_mesh.cpp @@ -516,711 +516,166 @@ struct attr_desc { unsigned slots; }; -struct attr_type_info { - /* order of attributes, negative values are holes */ - std::list *order; - - /* attributes after which there's hole of size equal to array index */ - std::list holes[5]; -}; - -static void -brw_mue_assign_position(const struct attr_desc *attr, - struct brw_mue_map *map, - unsigned start_dw) -{ - bool is_array = glsl_type_is_array(attr->type); - int location = attr->location; - unsigned remaining = attr->dwords; - - for (unsigned slot = 0; slot < attr->slots; ++slot) { - map->start_dw[location + slot] = start_dw; - - unsigned sz; - - if (is_array) { - assert(attr->dwords % attr->slots == 0); - sz = attr->dwords / attr->slots; - } else { - sz = MIN2(remaining, 4); - } - - map->len_dw[location + slot] = sz; - start_dw += sz; - remaining -= sz; - } -} - -static unsigned -brw_sum_size(const std::list &orders) -{ - unsigned sz = 0; - for (auto it = orders.cbegin(); it != orders.cend(); ++it) - sz += (*it).dwords; - return sz; -} - 
-/* Finds order of outputs which require minimum size, without splitting - * of URB read/write messages (which operate on vec4-aligned memory). - */ -static void -brw_compute_mue_layout(const struct brw_compiler *compiler, - std::list *orders, - uint64_t outputs_written, - struct nir_shader *nir, - bool *pack_prim_data_into_header, - bool *pack_vert_data_into_header) -{ - const struct shader_info *info = &nir->info; - - struct attr_type_info data[3]; - - if ((compiler->mesh.mue_header_packing & 1) == 0) - *pack_prim_data_into_header = false; - if ((compiler->mesh.mue_header_packing & 2) == 0) - *pack_vert_data_into_header = false; - - for (unsigned i = PRIM; i <= VERT_FLAT; ++i) - data[i].order = &orders[i]; - - /* If packing into header is enabled, add a hole of size 4 and add - * a virtual location to keep the algorithm happy (it expects holes - * to be preceded by some location). We'll remove those virtual - * locations at the end. - */ - const gl_varying_slot virtual_header_location = VARYING_SLOT_POS; - assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0); - - struct attr_desc d; - d.location = virtual_header_location; - d.type = NULL; - d.dwords = 0; - d.slots = 0; - - struct attr_desc h; - h.location = -1; - h.type = NULL; - h.dwords = 4; - h.slots = 0; - - if (*pack_prim_data_into_header) { - orders[PRIM].push_back(d); - orders[PRIM].push_back(h); - data[PRIM].holes[4].push_back(virtual_header_location); - } - - if (*pack_vert_data_into_header) { - orders[VERT].push_back(d); - orders[VERT].push_back(h); - data[VERT].holes[4].push_back(virtual_header_location); - } - - u_foreach_bit64(location, outputs_written) { - if ((BITFIELD64_BIT(location) & outputs_written) == 0) - continue; - - /* At this point there are both complete and split variables as - * outputs. We need the complete variable to compute the required - * size. 
- */ - nir_variable *var = - brw_nir_find_complete_variable_with_location(nir, - nir_var_shader_out, - location); - - d.location = location; - d.type = brw_nir_get_var_type(nir, var); - d.dwords = glsl_count_dword_slots(d.type, false); - d.slots = glsl_count_attribute_slots(d.type, false); - - struct attr_type_info *type_data; - - if (BITFIELD64_BIT(location) & info->per_primitive_outputs) - type_data = &data[PRIM]; - else if (var->data.interpolation == INTERP_MODE_FLAT) - type_data = &data[VERT_FLAT]; - else - type_data = &data[VERT]; - - std::list *order = type_data->order; - std::list *holes = type_data->holes; - - outputs_written &= ~BITFIELD64_RANGE(location, d.slots); - - /* special case to use hole of size 4 */ - if (d.dwords == 4 && !holes[4].empty()) { - holes[4].pop_back(); - - assert(order->front().location == virtual_header_location); - order->pop_front(); - - assert(order->front().location == -1); - assert(order->front().dwords == 4); - order->front() = d; - - continue; - } - - int mod = d.dwords % 4; - if (mod == 0) { - order->push_back(d); - continue; - } - - h.location = -1; - h.type = NULL; - h.dwords = 4 - mod; - h.slots = 0; - - if (!compiler->mesh.mue_compaction) { - order->push_back(d); - order->push_back(h); - continue; - } - - if (d.dwords > 4) { - order->push_back(d); - order->push_back(h); - holes[h.dwords].push_back(location); - continue; - } - - assert(d.dwords < 4); - - unsigned found = 0; - /* try to find the smallest hole big enough to hold this attribute */ - for (unsigned sz = d.dwords; sz <= 4; sz++){ - if (!holes[sz].empty()) { - found = sz; - break; - } - } - - /* append at the end if not found */ - if (found == 0) { - order->push_back(d); - order->push_back(h); - holes[h.dwords].push_back(location); - - continue; - } - - assert(found <= 4); - assert(!holes[found].empty()); - int after_loc = holes[found].back(); - holes[found].pop_back(); - - bool inserted_back = false; - - for (auto it = order->begin(); it != order->end(); ++it) { - if ((*it).location != after_loc) - continue; - - ++it; - /* must be a hole */ - assert((*it).location < 0); - /* and it must be big enough */ - assert(d.dwords <= (*it).dwords); - - if (d.dwords == (*it).dwords) { - /* exact size, just replace */ - *it = d; - } else { - /* inexact size, shrink hole */ - (*it).dwords -= d.dwords; - /* and insert new attribute before it */ - order->insert(it, d); - - /* Insert shrunk hole in a spot so that the order of attributes - * is preserved. 
- */ - std::list &hole_list = holes[(*it).dwords]; - std::list::iterator insert_before = hole_list.end(); - - for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) { - if ((*it2) >= (int)location) { - insert_before = it2; - break; - } - } - - hole_list.insert(insert_before, location); - } - - inserted_back = true; - break; - } - - assert(inserted_back); - } - - if (*pack_prim_data_into_header) { - if (orders[PRIM].front().location == virtual_header_location) - orders[PRIM].pop_front(); - - if (!data[PRIM].holes[4].empty()) { - *pack_prim_data_into_header = false; - - assert(orders[PRIM].front().location == -1); - assert(orders[PRIM].front().dwords == 4); - orders[PRIM].pop_front(); - } - - if (*pack_prim_data_into_header) { - unsigned sz = brw_sum_size(orders[PRIM]); - - if (sz % 8 == 0 || sz % 8 > 4) - *pack_prim_data_into_header = false; - } - } - - if (*pack_vert_data_into_header) { - if (orders[VERT].front().location == virtual_header_location) - orders[VERT].pop_front(); - - if (!data[VERT].holes[4].empty()) { - *pack_vert_data_into_header = false; - - assert(orders[VERT].front().location == -1); - assert(orders[VERT].front().dwords == 4); - orders[VERT].pop_front(); - } - - if (*pack_vert_data_into_header) { - unsigned sz = brw_sum_size(orders[VERT]) + - brw_sum_size(orders[VERT_FLAT]); - - if (sz % 8 == 0 || sz % 8 > 4) - *pack_vert_data_into_header = false; - } - } - - - if (INTEL_DEBUG(DEBUG_MESH)) { - fprintf(stderr, "MUE attribute order:\n"); - for (unsigned i = PRIM; i <= VERT_FLAT; ++i) { - if (!orders[i].empty()) - fprintf(stderr, "%d: ", i); - for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) { - fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords); - } - if (!orders[i].empty()) - fprintf(stderr, "\n"); - } - } -} - -/* Mesh URB Entry consists of an initial section - * - * - Primitive Count - * - Primitive Indices (from 0 to Max-1) - * - Padding to 32B if needed - * - * optionally followed by a section for per-primitive data, - * in which each primitive (from 0 to Max-1) gets - * - * - Primitive Header (e.g. ViewportIndex) - * - Primitive Custom Attributes - * - * then followed by a section for per-vertex data - * - * - Vertex Header (e.g. Position) - * - Vertex Custom Attributes - * - * Each per-element section has a pitch and a starting offset. All the - * individual attributes offsets in start_dw are considering the first entry - * of the section (i.e. where the Position for first vertex, or ViewportIndex - * for first primitive). Attributes for other elements are calculated using - * the pitch. - */ static void brw_compute_mue_map(const struct brw_compiler *compiler, - struct nir_shader *nir, struct brw_mue_map *map, - enum brw_mesh_index_format index_format, bool compact_mue) + nir_shader *nir, struct brw_mue_map *map, + enum brw_mesh_index_format index_format, + enum intel_vue_layout vue_layout) { memset(map, 0, sizeof(*map)); - memset(&map->start_dw[0], -1, sizeof(map->start_dw)); - memset(&map->len_dw[0], 0, sizeof(map->len_dw)); - - unsigned vertices_per_primitive = - mesa_vertices_per_prim(nir->info.mesh.primitive_type); - map->max_primitives = nir->info.mesh.max_primitives_out; map->max_vertices = nir->info.mesh.max_vertices_out; - uint64_t outputs_written = nir->info.outputs_written; + /* NumPrimitives */ + map->size += 4; + + /* PrimX indices */ + const unsigned vertices_per_primitive = + mesa_vertices_per_prim(nir->info.mesh.primitive_type); - /* One dword for primitives count then K extra dwords for each primitive. 
*/ switch (index_format) { case BRW_INDEX_FORMAT_U32: - map->per_primitive_indices_dw = vertices_per_primitive; + map->per_primitive_indices_stride = 4 * vertices_per_primitive; break; case BRW_INDEX_FORMAT_U888X: - map->per_primitive_indices_dw = 1; + map->per_primitive_indices_stride = 4; break; default: unreachable("invalid index format"); } - map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw * - map->max_primitives + 1, 8); + map->size += map->per_primitive_indices_stride * map->max_primitives; - /* Assign initial section. */ - if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) { - map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0; - map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1; - outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT); - } - if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) { - map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1; - map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] = - map->per_primitive_indices_dw * map->max_primitives; - outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES); - } + /* Per primitive blocks */ + map->size = align(map->size, 32); + map->per_primitive_offset = map->size; + const uint64_t count_indices_bits = + VARYING_BIT_PRIMITIVE_COUNT | + VARYING_BIT_PRIMITIVE_INDICES; const uint64_t per_primitive_header_bits = - BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) | - BITFIELD64_BIT(VARYING_SLOT_LAYER) | - BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) | - BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE); + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; - const uint64_t per_vertex_header_bits = - BITFIELD64_BIT(VARYING_SLOT_PSIZ) | - BITFIELD64_BIT(VARYING_SLOT_POS) | - BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) | - BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); + /* Do we need a header? */ + map->has_per_primitive_header = + (nir->info.outputs_written & + nir->info.per_primitive_outputs & + per_primitive_header_bits) != 0; - std::list orders[3]; - uint64_t regular_outputs = outputs_written & - ~(per_primitive_header_bits | per_vertex_header_bits); + uint32_t first_per_prim_offset; + brw_compute_per_primitive_map(map->per_primitive_offsets, + &map->per_primitive_stride, + &first_per_prim_offset, + map->has_per_primitive_header ? 32 : 0, + nir, nir_var_shader_out, + nir->info.outputs_written & + nir->info.per_primitive_outputs, + vue_layout != INTEL_VUE_LAYOUT_FIXED); - /* packing into prim header is possible only if prim header is present */ - map->user_data_in_primitive_header = compact_mue && - (outputs_written & per_primitive_header_bits) != 0; + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_COUNT] = 0; + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_INDICES] = 4; + if (map->has_per_primitive_header) { + /* Setup all the fields in the header */ + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0; + map->per_primitive_offsets[VARYING_SLOT_LAYER] = 4; + map->per_primitive_offsets[VARYING_SLOT_VIEWPORT] = 8; + map->per_primitive_offsets[VARYING_SLOT_CULL_PRIMITIVE] = 12; + } - /* Packing into vert header is always possible, but we allow it only - * if full vec4 is available (so point size is not used) and there's - * nothing between it and normal vertex data (so no clip distances). 
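A short worked example of the byte-based layout computed above, with illustrative numbers (triangles, BRW_INDEX_FORMAT_U32, max_primitives = 16):

   /* Illustrative only: front section of the MUE. */
   uint32_t size = 4;                               /* NumPrimitives          */
   uint32_t indices_stride = 4 * 3;                 /* U32 x 3 vertices = 12B */
   size += indices_stride * 16;                     /* -> 196 bytes           */
   uint32_t per_primitive_offset = align(size, 32); /* -> 224 bytes           */
   /* If the shader writes shading rate, layer, viewport or cull, each
    * per-primitive record starts with a 32B header (dwords 0..3 as laid out
    * above) and user per-primitive attributes start at byte 32 of the record.
    */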
+ map->per_primitive_stride = align(map->per_primitive_stride, 32); + + map->size += map->per_primitive_stride * map->max_primitives; + assert(map->size % 32 == 0); + + assert((nir->info.outputs_written & VARYING_BIT_PRIMITIVE_ID) == 0 || + (nir->info.outputs_written & nir->info.per_primitive_outputs) != 0); + + /* Per vertex blocks: + * + * For some selected bit that can appear either as per-primitive or + * per-vertex inputs to the fragment shader, we need to add them to the + * per-vertex block as well so that the layouts match. Even though they're + * not written. */ - map->user_data_in_vertex_header = compact_mue && - (outputs_written & per_vertex_header_bits) == - BITFIELD64_BIT(VARYING_SLOT_POS); + const uint64_t per_primitive_outputs = + nir->info.outputs_written & nir->info.per_primitive_outputs; + const uint64_t per_vertex_outputs = + (nir->info.outputs_written & + ~(per_primitive_outputs | count_indices_bits | per_primitive_header_bits)); - if (outputs_written & per_primitive_header_bits) { - bool zero_layer_viewport = false; - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) { - map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = - map->per_primitive_start_dw + 0; - map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1; - /* Wa_16020916187: force 0 writes to layer and viewport slots */ - zero_layer_viewport = - intel_needs_workaround(compiler->devinfo, 16020916187); - } - - if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) || - zero_layer_viewport) { - map->start_dw[VARYING_SLOT_LAYER] = - map->per_primitive_start_dw + 1; /* RTAIndex */ - map->len_dw[VARYING_SLOT_LAYER] = 1; - } - - if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) || - zero_layer_viewport) { - map->start_dw[VARYING_SLOT_VIEWPORT] = - map->per_primitive_start_dw + 2; - map->len_dw[VARYING_SLOT_VIEWPORT] = 1; - } - - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) { - map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] = - map->per_primitive_start_dw + 3; - map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1; - } - - map->per_primitive_header_size_dw = 8; - outputs_written &= ~per_primitive_header_bits; - } else { - map->per_primitive_header_size_dw = 0; - } - - map->per_primitive_data_size_dw = 0; - - /* For fast linked libraries, we can't pack the MUE, as the fragment shader - * will be compiled without access to the MUE map and won't be able to find - * out where everything is. - * Instead, keep doing things as we did before the packing, just laying out - * everything in varying order, which is how the FS will expect them. 
- */ - if (compact_mue) { - brw_compute_mue_layout(compiler, orders, regular_outputs, nir, - &map->user_data_in_primitive_header, - &map->user_data_in_vertex_header); - - unsigned start_dw = map->per_primitive_start_dw; - if (map->user_data_in_primitive_header) - start_dw += 4; /* first 4 dwords are used */ - else - start_dw += map->per_primitive_header_size_dw; - unsigned header_used_dw = 0; - - for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) { - int location = (*it).location; - if (location < 0) { - start_dw += (*it).dwords; - if (map->user_data_in_primitive_header && header_used_dw < 4) - header_used_dw += (*it).dwords; - else - map->per_primitive_data_size_dw += (*it).dwords; - assert(header_used_dw <= 4); - continue; - } - - assert(map->start_dw[location] == -1); - - assert(location == VARYING_SLOT_PRIMITIVE_ID || - location >= VARYING_SLOT_VAR0); - - brw_mue_assign_position(&*it, map, start_dw); - - start_dw += (*it).dwords; - if (map->user_data_in_primitive_header && header_used_dw < 4) - header_used_dw += (*it).dwords; - else - map->per_primitive_data_size_dw += (*it).dwords; - assert(header_used_dw <= 4); - outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots); - } - } else { - unsigned start_dw = map->per_primitive_start_dw + - map->per_primitive_header_size_dw; - - uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs; - while (per_prim_outputs) { - uint64_t location = ffsll(per_prim_outputs) - 1; - - assert(map->start_dw[location] == -1); - assert(location == VARYING_SLOT_PRIMITIVE_ID || - location >= VARYING_SLOT_VAR0); - - nir_variable *var = - brw_nir_find_complete_variable_with_location(nir, - nir_var_shader_out, - location); - struct attr_desc d; - d.location = location; - d.type = brw_nir_get_var_type(nir, var); - d.dwords = glsl_count_dword_slots(d.type, false); - d.slots = glsl_count_attribute_slots(d.type, false); - - brw_mue_assign_position(&d, map, start_dw); - - map->per_primitive_data_size_dw += ALIGN(d.dwords, 4); - start_dw += ALIGN(d.dwords, 4); - - per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots); - } - } - - map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw + - map->per_primitive_data_size_dw, 8); - - map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw + - map->per_primitive_pitch_dw * - map->max_primitives, 8); - - /* TODO(mesh): Multiview. 
*/ - unsigned fixed_header_size = 8; - map->per_vertex_header_size_dw = ALIGN(fixed_header_size + - nir->info.clip_distance_array_size + - nir->info.cull_distance_array_size, 8); - - if (outputs_written & per_vertex_header_bits) { - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) { - map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3; - map->len_dw[VARYING_SLOT_PSIZ] = 1; - } - - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) { - map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4; - map->len_dw[VARYING_SLOT_POS] = 4; - } - - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) { - map->start_dw[VARYING_SLOT_CLIP_DIST0] = - map->per_vertex_start_dw + fixed_header_size + 0; - map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4; - } - - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) { - map->start_dw[VARYING_SLOT_CLIP_DIST1] = - map->per_vertex_start_dw + fixed_header_size + 4; - map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4; - } - - outputs_written &= ~per_vertex_header_bits; - } - - /* cull distances should be lowered earlier */ - assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0))); - assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1))); - - map->per_vertex_data_size_dw = 0; - - /* For fast linked libraries, we can't pack the MUE, as the fragment shader - * will be compiled without access to the MUE map and won't be able to find - * out where everything is. - * Instead, keep doing things as we did before the packing, just laying out - * everything in varying order, which is how the FS will expect them. - */ - if (compact_mue) { - unsigned start_dw = map->per_vertex_start_dw; - if (!map->user_data_in_vertex_header) - start_dw += map->per_vertex_header_size_dw; - - unsigned header_used_dw = 0; - for (unsigned type = VERT; type <= VERT_FLAT; ++type) { - for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) { - int location = (*it).location; - if (location < 0) { - start_dw += (*it).dwords; - if (map->user_data_in_vertex_header && header_used_dw < 4) { - header_used_dw += (*it).dwords; - assert(header_used_dw <= 4); - if (header_used_dw == 4) - start_dw += 4; /* jump over gl_position */ - } else { - map->per_vertex_data_size_dw += (*it).dwords; - } - continue; - } - - assert(map->start_dw[location] == -1); - - assert(location >= VARYING_SLOT_VAR0); - - brw_mue_assign_position(&*it, map, start_dw); - - start_dw += (*it).dwords; - if (map->user_data_in_vertex_header && header_used_dw < 4) { - header_used_dw += (*it).dwords; - assert(header_used_dw <= 4); - if (header_used_dw == 4) - start_dw += 4; /* jump over gl_position */ - } else { - map->per_vertex_data_size_dw += (*it).dwords; - } - outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots); - } - } - } else { - unsigned start_dw = map->per_vertex_start_dw + - map->per_vertex_header_size_dw; - - uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs; - while (per_vertex_outputs) { - uint64_t location = ffsll(per_vertex_outputs) - 1; - - assert(map->start_dw[location] == -1); - assert(location >= VARYING_SLOT_VAR0); - - nir_variable *var = - brw_nir_find_complete_variable_with_location(nir, - nir_var_shader_out, - location); - struct attr_desc d; - d.location = location; - d.type = brw_nir_get_var_type(nir, var); - d.dwords = glsl_count_dword_slots(d.type, false); - d.slots = glsl_count_attribute_slots(d.type, false); - - brw_mue_assign_position(&d, map, start_dw); - - map->per_vertex_data_size_dw += 
ALIGN(d.dwords, 4); - start_dw += ALIGN(d.dwords, 4); - - per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots); - } - } - - map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw + - map->per_vertex_data_size_dw, 8); - - map->size_dw = - map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices; - - assert(map->size_dw % 8 == 0); + map->per_vertex_offset = map->size; + brw_compute_vue_map(compiler->devinfo, + &map->vue_map, per_vertex_outputs, + vue_layout, 1 /* pos_slots, TODO: multiview */); + map->per_vertex_stride = align(map->vue_map.num_slots * 16, 32); + map->size += map->per_vertex_stride * map->max_vertices; + assert(map->size % 32 == 0); } static void brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir) { - fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n", - map->size_dw, map->max_primitives, map->max_vertices); - fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n", - map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT], - map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] + - map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1); - fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n", - map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES], - map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] + - map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1); + fprintf(fp, "MUE map (%d bytes, %d primitives, %d vertices):\n", + map->size, map->max_primitives, map->max_vertices); + fprintf(fp, " indices_stride: %d\n", map->per_primitive_indices_stride); + fprintf(fp, " primitive_header: %d\n", map->has_per_primitive_header); + fprintf(fp, " primitive_offset: %d\n", map->per_primitive_offset); + fprintf(fp, " primitive_stride: %d\n", map->per_primitive_stride); + fprintf(fp, " vertex_offset: %d\n", map->per_vertex_offset); + fprintf(fp, " vertex_stride: %d\n", map->per_vertex_stride); - fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n", - map->per_primitive_start_dw, - map->per_primitive_header_size_dw, - map->per_primitive_data_size_dw, - map->per_primitive_pitch_dw); - - for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { - if (map->start_dw[i] < 0) + fprintf(fp, " primitive offsets:\n"); + fprintf(fp, " %s: %d\n", + gl_varying_slot_name_for_stage(VARYING_SLOT_PRIMITIVE_COUNT, + MESA_SHADER_MESH), + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_COUNT]); + fprintf(fp, " %s: %d\n", + gl_varying_slot_name_for_stage(VARYING_SLOT_PRIMITIVE_INDICES, + MESA_SHADER_MESH), + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_INDICES]); + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (map->per_primitive_offsets[i] < 0 || + i == VARYING_SLOT_PRIMITIVE_COUNT || + i == VARYING_SLOT_PRIMITIVE_INDICES) continue; - - const unsigned offset = map->start_dw[i]; - const unsigned len = map->len_dw[i]; - - if (offset < map->per_primitive_start_dw || - offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw) - continue; - - const char *name = - gl_varying_slot_name_for_stage((gl_varying_slot)i, - MESA_SHADER_MESH); - - fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1, - name, i); + fprintf(fp, " %s: %d (relative %d)\n", + gl_varying_slot_name_for_stage((gl_varying_slot)i, + MESA_SHADER_MESH), + map->per_primitive_offset + map->per_primitive_offsets[i], + map->per_primitive_offsets[i]); } + brw_print_vue_map(fp, &map->vue_map, MESA_SHADER_MESH); +} - fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n", - map->per_vertex_start_dw, - 
map->per_vertex_header_size_dw, - map->per_vertex_data_size_dw, - map->per_vertex_pitch_dw);
+static bool
+remap_io_to_dwords(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
+{
+ if (intrin->intrinsic != nir_intrinsic_load_per_vertex_output &&
+ intrin->intrinsic != nir_intrinsic_load_per_primitive_output &&
+ intrin->intrinsic != nir_intrinsic_store_per_vertex_output &&
+ intrin->intrinsic != nir_intrinsic_store_per_primitive_output)
+ return false;
- for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { - if (map->start_dw[i] < 0) - continue;
+ nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
+ if (io_sem.location == VARYING_SLOT_PRIMITIVE_INDICES || + io_sem.location == VARYING_SLOT_PRIMITIVE_COUNT) + return false;
- const unsigned offset = map->start_dw[i]; - const unsigned len = map->len_dw[i];
+ nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) * 4);
+ nir_intrinsic_set_range(intrin, nir_intrinsic_range(intrin) * 4);
- if (offset < map->per_vertex_start_dw || - offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw) - continue;
+ b->cursor = nir_before_instr(&intrin->instr);
- nir_variable *var = - nir_find_variable_with_location(nir, nir_var_shader_out, i); - bool flat = var->data.interpolation == INTERP_MODE_FLAT;
+ nir_src *offset = nir_get_io_offset_src(intrin);
+ assert(offset != NULL);
- const char *name = - gl_varying_slot_name_for_stage((gl_varying_slot)i, - MESA_SHADER_MESH);
+ nir_src_rewrite(offset, nir_ishl_imm(b, offset->ssa, 2));
- fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1, - name, i, flat ? " (flat)" : ""); - } - - fprintf(fp, "\n");
+ return true; } static void @@ -1229,12 +684,73 @@ brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map) nir_foreach_shader_out_variable(var, nir) { int location = var->data.location; assert(location >= 0);
- assert(map->start_dw[location] != -1);
- var->data.driver_location = map->start_dw[location];
+
+ switch (location) {
+ case VARYING_SLOT_PRIMITIVE_COUNT:
+ case VARYING_SLOT_PRIMITIVE_INDICES:
+ /* Primitive count & indices are not part of the per-primitive block,
+ * they have their own spot just before. We saved their offset in the
+ * per-primitive array; we just don't need to add the block
+ * offset.
+ */
+ var->data.driver_location = + map->per_primitive_offsets[location] / 4;
+ break;
+
+ case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
+ var->data.driver_location = map->per_primitive_offset / 16;
+ var->data.location_frac = 0;
+ break;
+
+ case VARYING_SLOT_LAYER:
+ var->data.driver_location = map->per_primitive_offset / 16;
+ var->data.location_frac = 1;
+ break;
+
+ case VARYING_SLOT_VIEWPORT:
+ var->data.driver_location = map->per_primitive_offset / 16;
+ var->data.location_frac = 2;
+ break;
+
+ case VARYING_SLOT_CULL_PRIMITIVE:
+ var->data.driver_location = map->per_primitive_offset / 16;
+ var->data.location_frac = 3;
+ break;
+
+ case VARYING_SLOT_PSIZ:
+ var->data.driver_location = map->per_vertex_offset / 16;
+ var->data.location_frac = 3;
+ break;
+
+ default:
+ if (nir->info.per_primitive_outputs & BITFIELD64_BIT(location)) {
+ assert(map->per_primitive_offsets[location] != -1);
+ var->data.driver_location = + (map->per_primitive_offset + + map->per_primitive_offsets[location]) / 16;
+ } else {
+ /* Each per vertex location has its own slot/vec4 (16B) of data; use
+ * map->vue_map.varying_to_slot[] to get the 16B offset and add the
+ * per-vertex block offset.
+ */ + assert(map->vue_map.varying_to_slot[location] != -1); + var->data.driver_location = + map->per_vertex_offset / 16 + + map->vue_map.varying_to_slot[location]; + } + break; + } } NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, - type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32); + type_size_vec4, + nir_lower_io_lower_64bit_to_32); + + /* Everythings works with slots in terms if IO, but our backend deals with + * dwords. Apply remapping. + */ + NIR_PASS(_, nir, nir_shader_intrinsics_pass, + remap_io_to_dwords, nir_metadata_control_flow, NULL); } static void @@ -1242,8 +758,6 @@ brw_nir_initialize_mue(nir_shader *nir, const struct brw_mue_map *map, unsigned dispatch_width) { - assert(map->per_primitive_header_size_dw > 0); - nir_builder b; nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); b = nir_builder_at(nir_before_impl(entrypoint)); @@ -1277,7 +791,7 @@ brw_nir_initialize_mue(nir_shader *nir, prim_in_inv * workgroup_size); nir_store_per_primitive_output(&b, zerovec, prim, dw_off, - .base = (int)map->per_primitive_start_dw, + .base = (int)map->per_primitive_offset / 4, .write_mask = WRITEMASK_XYZW, .component = 0, .src_type = nir_type_uint32); @@ -1297,7 +811,7 @@ brw_nir_initialize_mue(nir_shader *nir, prims_per_inv * workgroup_size); nir_store_per_primitive_output(&b, zerovec, prim, dw_off, - .base = (int)map->per_primitive_start_dw, + .base = (int)map->per_primitive_offset / 4, .write_mask = WRITEMASK_XYZW, .component = 0, .src_type = nir_type_uint32); @@ -1348,8 +862,7 @@ brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b, switch (intrin->intrinsic) { case nir_intrinsic_load_per_vertex_output: case nir_intrinsic_store_per_vertex_output: - brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw); - + brw_nir_adjust_offset(b, intrin, map->per_vertex_stride / 4); return true; case nir_intrinsic_load_per_primitive_output: @@ -1357,12 +870,11 @@ brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b, struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); uint32_t pitch; if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES) - pitch = map->per_primitive_indices_dw; + pitch = map->per_primitive_indices_stride; else - pitch = map->per_primitive_pitch_dw; - - brw_nir_adjust_offset(b, intrin, pitch); + pitch = map->per_primitive_stride; + brw_nir_adjust_offset(b, intrin, pitch / 4); return true; } @@ -1571,11 +1083,12 @@ brw_mesh_autostrip_enable(const struct brw_compiler *compiler, struct nir_shader * those to 0. The workaround also requires disabling autostrip. */ if (intel_needs_workaround(compiler->devinfo, 16020916187) && - (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) & outputs_written)) + (VARYING_BIT_PRIMITIVE_SHADING_RATE & outputs_written)) return false; - if (map->start_dw[VARYING_SLOT_VIEWPORT] < 0 && - map->start_dw[VARYING_SLOT_LAYER] < 0) + /* Values not written */ + if ((outputs_written & (VARYING_BIT_VIEWPORT | + VARYING_BIT_LAYER)) == 0) return true; nir_def *vp = NULL; @@ -1694,7 +1207,7 @@ brw_compile_mesh(const struct brw_compiler *compiler, * When Primitive Header is enabled, we may not generates writes to all * fields, so let's initialize everything. 
*/ - if (prog_data->map.per_primitive_header_size_dw > 0) + if (prog_data->map.has_per_primitive_header) NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width); brw_nir_apply_key(shader, compiler, &key->base, dispatch_width); diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 1ce6eea63e0..4e3f185c805 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -194,11 +194,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo) compiler->nir_options[i] = nir_options; } - compiler->mesh.mue_header_packing = - (unsigned)debug_get_num_option("INTEL_MESH_HEADER_PACKING", 3); - compiler->mesh.mue_compaction = - debug_get_bool_option("INTEL_MESH_COMPACTION", true); - return compiler; } @@ -218,8 +213,6 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler) bits++; insert_u64_bit(&config, compiler->lower_dpas); bits++; - insert_u64_bit(&config, compiler->mesh.mue_compaction); - bits++; enum intel_debug_flag debug_bits[] = { DEBUG_NO_DUAL_OBJECT_GS, @@ -244,9 +237,6 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler) mask = 3; bits += util_bitcount64(mask); - u_foreach_bit64(bit, mask) - insert_u64_bit(&config, (compiler->mesh.mue_header_packing & (1ULL << bit)) != 0); - assert(bits <= util_bitcount64(UINT64_MAX)); return config; diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index b45f0eb0393..8727b3ec00f 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -122,11 +122,6 @@ struct brw_compiler { int spilling_rate; struct nir_shader *clc_shader; - - struct { - unsigned mue_header_packing; - bool mue_compaction; - } mesh; }; #define brw_shader_debug_log(compiler, data, fmt, ... ) do { \ @@ -371,7 +366,7 @@ struct brw_wm_prog_key { /* Whether or not we are running on a multisampled framebuffer */ enum intel_sometimes multisample_fbo:2; - /* Whether the preceding shader stage is mesh */ + /* Whether the shader is dispatch with a preceeding mesh shader */ enum intel_sometimes mesh_input:2; bool coherent_fb_fetch:1; @@ -388,7 +383,7 @@ brw_wm_prog_key_is_dynamic(const struct brw_wm_prog_key *key) return key->alpha_to_coverage == INTEL_SOMETIMES || key->persample_interp == INTEL_SOMETIMES || key->multisample_fbo == INTEL_SOMETIMES || - key->mesh_input == INTEL_SOMETIMES; + key->base.vue_layout == INTEL_VUE_LAYOUT_SEPARATE_MESH; } struct brw_cs_prog_key { @@ -784,6 +779,13 @@ struct brw_wm_prog_data { */ uint64_t inputs; + /** + * The FS per-primitive inputs (some bits can be in both inputs & + * per_primitive_inputs if the shader is compiled without being linked to + * the previous stage) + */ + uint64_t per_primitive_inputs; + /** * Map from gl_varying_slot to the position within the FS setup data * payload where the varying's attribute vertex deltas should be delivered. @@ -1049,8 +1051,6 @@ typedef enum void brw_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map, gl_shader_stage stage); -void brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data); - /** * Convert a VUE slot number into a byte offset within the VUE. 
 */
@@ -1069,6 +1069,16 @@ brw_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
    return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
 }

+void
+brw_compute_per_primitive_map(int *out_per_primitive_map,
+                              uint32_t *out_per_primitive_stride,
+                              uint32_t *out_first_offset,
+                              uint32_t base_offset,
+                              nir_shader *nir,
+                              uint32_t variables_mode,
+                              uint64_t slots_valid,
+                              bool separate_shader);
+
 void brw_compute_vue_map(const struct intel_device_info *devinfo,
                          struct intel_vue_map *vue_map,
                          uint64_t slots_valid,
@@ -1190,25 +1200,40 @@ struct brw_tue_map {
 };

 struct brw_mue_map {
-   int32_t start_dw[VARYING_SLOT_MAX];
-   uint32_t len_dw[VARYING_SLOT_MAX];
-   uint32_t per_primitive_indices_dw;
-
-   uint32_t size_dw;
+   /* Total size in bytes of the MUE (32B aligned) */
+   uint32_t size;

    uint32_t max_primitives;
-   uint32_t per_primitive_start_dw;
-   uint32_t per_primitive_header_size_dw;
-   uint32_t per_primitive_data_size_dw;
-   uint32_t per_primitive_pitch_dw;
-   bool user_data_in_primitive_header;
-
    uint32_t max_vertices;
-   uint32_t per_vertex_start_dw;
-   uint32_t per_vertex_header_size_dw;
-   uint32_t per_vertex_data_size_dw;
-   uint32_t per_vertex_pitch_dw;
-   bool user_data_in_vertex_header;
+
+   /* Stride in bytes between sets of primitive indices */
+   uint32_t per_primitive_indices_stride;
+
+   /* Per-primitive offset from the start of the MUE (32B aligned) */
+   uint32_t per_primitive_offset;
+
+   /* Per-primitive stride in bytes (32B aligned) */
+   uint32_t per_primitive_stride;
+
+   /* Whether the per-primitive block includes a header */
+   bool has_per_primitive_header;
+
+   /* Per-vertex offset in bytes from the start of the MUE (32B aligned) */
+   uint32_t per_vertex_offset;
+
+   /* Size of the per-vertex header (32B aligned) */
+   uint32_t per_vertex_header_size;
+
+   /* Per-vertex stride in bytes (32B aligned) */
+   uint32_t per_vertex_stride;
+
+   /* VUE map for the per-vertex attributes */
+   struct intel_vue_map vue_map;
+
+   /* Offset in bytes of each per-primitive varying relative to
+    * per_primitive_offset (-1 if unused)
+    */
+   int per_primitive_offsets[VARYING_SLOT_MAX];
 };

 struct brw_task_prog_data {
@@ -1628,7 +1653,28 @@ brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
  */
 int
 brw_compute_first_fs_urb_slot_required(uint64_t inputs_read,
-                                       const struct intel_vue_map *prev_stage_vue_map);
+                                       const struct intel_vue_map *prev_stage_vue_map,
+                                       bool mesh);
+
+void
+brw_compute_sbe_per_vertex_urb_read(const struct intel_vue_map *prev_stage_vue_map,
+                                    bool mesh,
+                                    const struct brw_wm_prog_data *wm_prog_data,
+                                    uint32_t *out_first_slot,
+                                    uint32_t *num_slots,
+                                    uint32_t *out_num_varyings,
+                                    uint32_t *out_primitive_id_offset);
+
+/**
+ * Computes the URB offset at which SBE should read the per-primitive data
+ * written by the mesh shader.
+ */
+void
+brw_compute_sbe_per_primitive_urb_read(uint64_t inputs_read,
+                                       uint32_t num_varyings,
+                                       const struct brw_mue_map *mue_map,
+                                       uint32_t *out_read_offset,
+                                       uint32_t *out_read_length);

 /* From InlineData in 3DSTATE_TASK_SHADER_DATA and 3DSTATE_MESH_SHADER_DATA.
*/ #define BRW_TASK_MESH_INLINE_DATA_SIZE_DW 8 diff --git a/src/intel/compiler/brw_from_nir.cpp b/src/intel/compiler/brw_from_nir.cpp index ebaf2670d92..a397f01cba0 100644 --- a/src/intel/compiler/brw_from_nir.cpp +++ b/src/intel/compiler/brw_from_nir.cpp @@ -4153,7 +4153,8 @@ brw_interp_reg(const brw_builder &bld, unsigned location, { brw_shader &s = *bld.shader; assert(s.stage == MESA_SHADER_FRAGMENT); - assert(BITFIELD64_BIT(location) & ~s.nir->info.per_primitive_inputs); + assert((BITFIELD64_BIT(location) & ~s.nir->info.per_primitive_inputs) || + location == VARYING_SLOT_PRIMITIVE_ID); const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data); @@ -4161,10 +4162,6 @@ brw_interp_reg(const brw_builder &bld, unsigned location, unsigned nr = prog_data->urb_setup[location]; channel += prog_data->urb_setup_channel[location]; - /* Adjust so we start counting from the first per_vertex input. */ - assert(nr >= prog_data->num_per_primitive_inputs); - nr -= prog_data->num_per_primitive_inputs; - const unsigned per_vertex_start = prog_data->num_per_primitive_inputs; const unsigned regnr = per_vertex_start + (nr * 4) + channel; @@ -4196,14 +4193,15 @@ brw_per_primitive_reg(const brw_builder &bld, int location, unsigned comp) const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data); - comp += prog_data->urb_setup_channel[location]; + comp += (s.fs.per_primitive_offsets[location] % 16) / 4; - assert(prog_data->urb_setup[location] >= 0); - - const unsigned regnr = prog_data->urb_setup[location] + comp / 4; + const unsigned regnr = s.fs.per_primitive_offsets[location] / 16 + comp / 4; + assert(s.fs.per_primitive_offsets[location] >= 0); assert(regnr < prog_data->num_per_primitive_inputs); + brw_reg loc_reg = brw_attr_reg(regnr, BRW_TYPE_UD); + if (s.max_polygons > 1) { /* In multipolygon dispatch each primitive constant is a * dispatch_width-wide SIMD vector (see comment in @@ -4211,11 +4209,10 @@ brw_per_primitive_reg(const brw_builder &bld, int location, unsigned comp) * component() to select the specified parameter. 
*/ const brw_reg tmp = bld.vgrf(BRW_TYPE_UD); - bld.MOV(tmp, offset(brw_attr_reg(regnr, BRW_TYPE_UD), - s.dispatch_width, comp % 4)); + bld.MOV(tmp, offset(loc_reg, s.dispatch_width, comp % 4)); return retype(tmp, BRW_TYPE_F); } else { - return component(brw_attr_reg(regnr, BRW_TYPE_F), comp % 4); + return component(loc_reg, comp % 4); } } @@ -4419,7 +4416,7 @@ brw_from_nir_emit_fs_intrinsic(nir_to_brw_state &ntb, break; } - if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) { + if (instr->intrinsic == nir_intrinsic_load_per_primitive_input) { assert(base != VARYING_SLOT_PRIMITIVE_INDICES); for (unsigned int i = 0; i < num_components; i++) { bld.MOV(offset(dest, bld, i), diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index d71a5b641f4..90d172617f9 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -658,11 +658,75 @@ lower_barycentric_at_offset(nir_builder *b, nir_intrinsic_instr *intrin, return true; } +static bool +lower_indirect_primitive_id(nir_builder *b, + nir_intrinsic_instr *intrin, + void *data) +{ + if (intrin->intrinsic != nir_intrinsic_load_per_primitive_input) + return false; + + if (nir_intrinsic_io_semantics(intrin).location != VARYING_SLOT_PRIMITIVE_ID) + return false; + + nir_def *indirect_primitive_id = data; + nir_def_replace(&intrin->def, indirect_primitive_id); + + return true; +} + void brw_nir_lower_fs_inputs(nir_shader *nir, const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key) { + nir_def *indirect_primitive_id = NULL; + if (key->base.vue_layout == INTEL_VUE_LAYOUT_SEPARATE_MESH && + (nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID)) { + nir_builder _b = nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir))), *b = &_b; + nir_def *index = nir_ushr_imm(b, + nir_load_fs_msaa_intel(b), + INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET); + nir_def *max_poly = nir_load_max_polygon_intel(b); + /* Build the per-vertex offset into the attribute section of the thread + * payload. There is always one GRF of padding in front. + * + * The computation is fairly complicated due to the layout of the + * payload. You can find a description of the layout in + * brw_compile_fs.cpp brw_assign_urb_setup(). + * + * Gfx < 20 packs 2 slots per GRF (hence the %/ 2 in the formula) + * Gfx >= 20 pack 5 slots per GRF (hence the %/ 5 in the formula) + * + * Then an additional offset needs to added to handle how multiple + * polygon data is interleaved. + */ + nir_def *per_vertex_offset = nir_iadd_imm( + b, + devinfo->ver >= 20 ? + nir_iadd(b, + nir_imul(b, nir_udiv_imm(b, index, 5), nir_imul_imm(b, max_poly, 64)), + nir_imul_imm(b, nir_umod_imm(b, index, 5), 12)) : + nir_iadd_imm( + b, + nir_iadd( + b, + nir_imul(b, nir_udiv_imm(b, index, 2), nir_imul_imm(b, max_poly, 32)), + nir_imul_imm(b, nir_umod_imm(b, index, 2), 16)), + 12), + devinfo->grf_size); + /* When the attribute index is INTEL_MSAA_FLAG_PRIMITIVE_ID_MESH_INDEX, + * it means the value is coming from the per-primitive block. We always + * lay out PrimitiveID at offset 0 in the per-primitive block. + */ + nir_def *attribute_offset = nir_bcsel( + b, + nir_ieq_imm(b, index, INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH), + nir_imm_int(b, 0), per_vertex_offset); + indirect_primitive_id = + nir_read_attribute_payload_intel(b, attribute_offset); + } + nir_foreach_shader_in_variable(var, nir) { var->data.driver_location = var->data.location; @@ -679,6 +743,14 @@ brw_nir_lower_fs_inputs(nir_shader *nir, var->data.interpolation = flat ? 
INTERP_MODE_FLAT : INTERP_MODE_SMOOTH; } + + /* Always pull the PrimitiveID from the per-primitive block if mesh can be involved. + */ + if (var->data.location == VARYING_SLOT_PRIMITIVE_ID && + key->mesh_input != INTEL_NEVER) { + var->data.per_primitive = true; + nir->info.per_primitive_inputs |= VARYING_BIT_PRIMITIVE_ID; + } } NIR_PASS(_, nir, nir_lower_io, @@ -704,6 +776,13 @@ brw_nir_lower_fs_inputs(nir_shader *nir, NULL); } + if (indirect_primitive_id != NULL) { + NIR_PASS(_, nir, nir_shader_intrinsics_pass, + lower_indirect_primitive_id, + nir_metadata_control_flow, + indirect_primitive_id); + } + /* This pass needs actual constants */ NIR_PASS(_, nir, nir_opt_constant_folding); diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index d4b9aec396e..3440d2874fa 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -477,6 +477,9 @@ brw_shader::init() this->gs.control_data_bits_per_vertex = 0; this->gs.control_data_header_size_bits = 0; + + memset(&this->fs.per_primitive_offsets, -1, + sizeof(this->fs.per_primitive_offsets)); } brw_shader::~brw_shader() @@ -548,6 +551,16 @@ brw_shader::import_uniforms(brw_shader *v) this->uniforms = v->uniforms; } +/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. + * This brings in those uniform definitions + */ +void +brw_shader::import_per_primitive_offsets(const int *per_primitive_offsets) +{ + memcpy(this->fs.per_primitive_offsets, per_primitive_offsets, + sizeof(this->fs.per_primitive_offsets)); +} + enum intel_barycentric_mode brw_barycentric_mode(const struct brw_wm_prog_key *key, nir_intrinsic_instr *intr) @@ -1309,4 +1322,3 @@ brw_allocate_vgrf_units(brw_shader &s, unsigned units_of_REGSIZE) { return brw_vgrf(brw_allocate_vgrf_number(s, units_of_REGSIZE), BRW_TYPE_UD); } - diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h index f317aaf79ad..6ba50bdc500 100644 --- a/src/intel/compiler/brw_shader.h +++ b/src/intel/compiler/brw_shader.h @@ -83,6 +83,7 @@ public: ~brw_shader(); void import_uniforms(brw_shader *v); + void import_per_primitive_offsets(const int *per_primitive_offsets); void assign_curb_setup(); void convert_attr_sources_to_hw_regs(brw_inst *inst); @@ -202,6 +203,11 @@ public: unsigned control_data_header_size_bits; } gs; + struct { + /* Offset of per-primitive locations in bytes */ + int per_primitive_offsets[VARYING_SLOT_MAX]; + } fs; + unsigned grf_used; bool spilled_any_registers; bool needs_register_pressure; diff --git a/src/intel/compiler/brw_vue_map.c b/src/intel/compiler/brw_vue_map.c index 922c208df8e..d5c2a32d3ef 100644 --- a/src/intel/compiler/brw_vue_map.c +++ b/src/intel/compiler/brw_vue_map.c @@ -42,6 +42,105 @@ #include "brw_compiler.h" #include "dev/intel_debug.h" +#include "brw_nir.h" + +static unsigned +get_var_slots(gl_shader_stage stage, const nir_variable *var) +{ + const struct glsl_type *type = var->type; + + if (nir_is_arrayed_io(var, stage)) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + /* EXT_mesh_shader: PRIMITIVE_INDICES is a flat array, not a proper arrayed + * output, as opposed to D3D-style mesh shaders where it's addressed by the + * primitive index. Prevent assigning several slots to primitive indices, + * to avoid some issues. 
+ */ + if (stage == MESA_SHADER_MESH && + var->data.location == VARYING_SLOT_PRIMITIVE_INDICES && + !nir_is_arrayed_io(var, stage)) + return 1; + + return glsl_count_vec4_slots(type, false, var->data.bindless); +} + +void +brw_compute_per_primitive_map(int *out_per_primitive_map, + uint32_t *out_per_primitive_stride, + uint32_t *out_first_offset, + uint32_t base_offset, + nir_shader *nir, + uint32_t variables_mode, + uint64_t slots_valid, + bool separate_shader) +{ + memset(out_per_primitive_map, -1, sizeof(*out_per_primitive_map) * VARYING_SLOT_MAX); + *out_per_primitive_stride = base_offset; + *out_first_offset = UINT32_MAX; + + const uint64_t count_indices_bits = + VARYING_BIT_PRIMITIVE_COUNT | + VARYING_BIT_PRIMITIVE_INDICES; + const uint64_t per_primitive_header_bits = + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; + const uint64_t per_primitive_outputs_written = + slots_valid & ~(count_indices_bits | per_primitive_header_bits); + + *out_first_offset = base_offset; + + /* We put each variable in its own 16B slot. Technically we could do a lot + * better by allocating the space needed for the variable since the data is + * constant and not interpolated for the fragment shader. Unfortunately the + * backend treats those values similarly to vertex attributes and making + * that change would require a pretty large change in the backend. Let's do + * this later. + */ + + /* Lay out builtins first */ + const uint64_t builtins = + per_primitive_outputs_written & BITFIELD64_MASK(VARYING_SLOT_VAR0); + u_foreach_bit64(location, builtins) { + assert(out_per_primitive_map[location] == -1); + + out_per_primitive_map[location] = *out_per_primitive_stride; + *out_per_primitive_stride += 16; + } + + uint32_t generics_offset = *out_per_primitive_stride; + + /* Lay out generics */ + const uint64_t generics = + per_primitive_outputs_written & ~BITFIELD64_MASK(VARYING_SLOT_VAR0); + const int first_generic_output = ffsl(generics) - 1; + u_foreach_bit64(location, generics) { + assert(out_per_primitive_map[location] == -1); + if (!separate_shader) { + /* Just append the location at the back */ + out_per_primitive_map[location] = *out_per_primitive_stride; + } else { + assert(location >= VARYING_SLOT_VAR0); + /* Each location has its fixed spot */ + out_per_primitive_map[location] = generics_offset + + 16 * (location - first_generic_output); + } + + *out_per_primitive_stride = + MAX2(out_per_primitive_map[location] + 16, + *out_per_primitive_stride); + + *out_first_offset = MIN2(out_per_primitive_map[location], + *out_first_offset); + } + + *out_first_offset = *out_first_offset == UINT32_MAX ? 0 : + ROUND_DOWN_TO(*out_first_offset, 32); +} static inline void assign_vue_slot(struct intel_vue_map *vue_map, int varying, int slot) @@ -63,6 +162,9 @@ brw_compute_vue_map(const struct intel_device_info *devinfo, enum intel_vue_layout layout, uint32_t pos_slots) { + vue_map->slots_valid = slots_valid; + vue_map->layout = layout; + if (layout != INTEL_VUE_LAYOUT_FIXED) { /* In SSO mode, we don't know whether the adjacent stage will * read/write gl_ClipDistance, which has a fixed slot location. @@ -76,15 +178,17 @@ brw_compute_vue_map(const struct intel_device_info *devinfo, slots_valid |= VARYING_BIT_CLIP_DIST1; } - vue_map->slots_valid = slots_valid; - vue_map->layout = layout; - /* gl_Layer, gl_ViewportIndex & gl_PrimitiveShadingRateEXT don't get their * own varying slots -- they are stored in the first VUE slot * (VARYING_SLOT_PSIZ). 
*/ slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PRIMITIVE_SHADING_RATE); + /* gl_FrontFace is provided somewhere else in the FS thread payload, it's + * never in the VUE. + */ + slots_valid &= ~VARYING_BIT_FACE; + /* Make sure that the values we store in vue_map->varying_to_slot and * vue_map->slot_to_varying won't overflow the signed chars that are used * to store them. Note that since vue_map->slot_to_varying sometimes holds @@ -153,35 +257,38 @@ brw_compute_vue_map(const struct intel_device_info *devinfo, * can assign them however we like. For normal programs, we simply assign * them contiguously. * - * For separate shader pipelines, we first assign built-in varyings - * contiguous slots. This works because ARB_separate_shader_objects - * requires that all shaders have matching built-in varying interface - * blocks. Next, we assign generic varyings based on their location - * (either explicit or linker assigned). This guarantees a fixed layout. - * * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX, * since it's encoded as the clip distances by emit_clip_distances(). * However, it may be output by transform feedback, and we'd rather not * recompute state when TF changes, so we just always include it. */ - uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0); - while (builtins != 0) { - const int varying = ffsll(builtins) - 1; - if (vue_map->varying_to_slot[varying] == -1) { + if (layout != INTEL_VUE_LAYOUT_SEPARATE_MESH) { + const uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0); + u_foreach_bit64(varying, builtins) { + /* Already assigned above? */ + if (vue_map->varying_to_slot[varying] != -1) + continue; assign_vue_slot(vue_map, varying, slot++); } - builtins &= ~BITFIELD64_BIT(varying); } const int first_generic_slot = slot; - uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0); - while (generics != 0) { - const int varying = ffsll(generics) - 1; + const uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0); + u_foreach_bit64(varying, generics) { if (layout != INTEL_VUE_LAYOUT_FIXED) { slot = first_generic_slot + varying - VARYING_SLOT_VAR0; } assign_vue_slot(vue_map, varying, slot++); - generics &= ~BITFIELD64_BIT(varying); + } + + if (layout == INTEL_VUE_LAYOUT_SEPARATE_MESH) { + const uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0); + u_foreach_bit64(varying, builtins) { + /* Already assigned above? */ + if (vue_map->varying_to_slot[varying] != -1) + continue; + assign_vue_slot(vue_map, varying, slot++); + } } vue_map->num_slots = slot; diff --git a/src/intel/compiler/intel_shader_enums.h b/src/intel/compiler/intel_shader_enums.h index 91255da816c..3326655433a 100644 --- a/src/intel/compiler/intel_shader_enums.h +++ b/src/intel/compiler/intel_shader_enums.h @@ -30,6 +30,9 @@ intel_sometimes_invert(enum intel_sometimes x) return (enum intel_sometimes)((int)INTEL_ALWAYS - (int)x); } +#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET (20) +#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH (32) + enum intel_msaa_flags { /** Must be set whenever any dynamic MSAA is used * @@ -63,6 +66,15 @@ enum intel_msaa_flags { * in the render target messages. */ INTEL_MSAA_FLAG_COARSE_RT_WRITES = (1 << 18), + + /** Index of the PrimitiveID attribute relative to the first read + * attribute. + * + * This is not a flag but a value that cover bits 20:31. Value 32 means the + * PrimitiveID is coming from the PerPrimitive block, written by the Mesh + * shader. 
+   */
+   INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX = (1 << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET),
 };

 MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(intel_msaa_flags)

@@ -133,6 +145,10 @@ enum intel_vue_layout {
    * Mesh support.
    */
   INTEL_VUE_LAYOUT_SEPARATE,
+  /**
+   * Layout is separate and works with Mesh shaders.
+   */
+  INTEL_VUE_LAYOUT_SEPARATE_MESH,
 };

 /**
@@ -157,11 +173,70 @@ struct intel_vue_map {
   /**
    * The layout of the VUE
    *
-   * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
-   * without the linker having a chance to dead code eliminate unused varyings.
+   * Separable programs (GL_ARB_separate_shader_objects) can be mixed and
+   * matched without the linker having a chance to dead code eliminate unused
+   * varyings.
    *
    * This means that we have to use a fixed slot layout, based on the output's
    * location field, rather than assigning slots in a compact contiguous block.
+   *
+   * When using Mesh, another constraint arises: the HW limit on loading
+   * per-primitive & per-vertex data, 32 varyings in total. This requires us
+   * to be quite inventive with the way we lay things out. Take a fragment
+   * shader loading the following data:
+   *
+   *    float gl_ClipDistance[];
+   *    uint gl_PrimitiveID;
+   *    vec4 someAppValue[29];
+   *
+   * According to the Vulkan spec, someAppValue will occupy 29 slots,
+   * gl_PrimitiveID 1 slot, gl_ClipDistance[] up to 2 slots. If the input is
+   * coming from a VS/DS/GS shader, we can load all of this through a single
+   * block using 3DSTATE_SBE::VertexURBEntryReadLength = 16 (maximum
+   * programmable value) and the layout with
+   * BRW_VUE_MAP_LAYOUT_FIXED/BRW_VUE_MAP_LAYOUT_SEPARATE will be this:
+   *
+   *   -----------------------
+   *   | gl_ClipDistance 0-3 |
+   *   |---------------------|
+   *   | gl_ClipDistance 4-7 |
+   *   |---------------------|
+   *   | gl_PrimitiveID      |
+   *   |---------------------|
+   *   | someAppValue[]      |
+   *   |---------------------|
+   *
+   * This works nicely as everything is coming from the same location in the
+   * URB.
+   *
+   * When mesh shaders are involved, gl_PrimitiveID is located in a different
+   * place in the URB (the per-primitive block) and requires programming
+   * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength to load some
+   * additional data. The HW has a limit such that
+   * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength +
+   * 3DSTATE_SBE_MESH::PerVertexURBEntryOutputReadLength <= 16. With the
+   * layout above, we would not be able to accommodate that HW limit.
+   *
+   * The solution to this is to lay the built-in varyings out
+   * (gl_ClipDistance omitted since it's part of the VUE header and cannot
+   * live anywhere else) at the end of the VUE like this:
+   *
+   *   -----------------------
+   *   | gl_ClipDistance 0-3 |
+   *   |---------------------|
+   *   | gl_ClipDistance 4-7 |
+   *   |---------------------|
+   *   | someAppValue[]      |
+   *   |---------------------|
+   *   | gl_PrimitiveID      |
+   *   |---------------------|
+   *
+   * This layout adds another challenge because with separate shader
+   * compilations, we cannot tell in the consumer shader how many outputs the
+   * producer has, so we don't know where the gl_PrimitiveID lives. The
+   * solution to this other problem is to read the built-in with a
+   * MOV_INDIRECT and have the offset of the MOV_INDIRECT loaded through a
+   * push constant.
*/ enum intel_vue_layout layout; @@ -361,6 +436,7 @@ struct intel_fs_params { uint32_t rasterization_samples; bool coarse_pixel; bool alpha_to_coverage; + uint32_t primitive_id_index; }; static inline enum intel_msaa_flags @@ -392,6 +468,9 @@ intel_fs_msaa_flags(struct intel_fs_params params) if (params.alpha_to_coverage) fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE; + fs_msaa_flags |= (enum intel_msaa_flags)( + params.primitive_id_index << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET); + return fs_msaa_flags; } diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 44629946981..671924c5ee7 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -1757,7 +1757,10 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline, !device->vk.enabled_extensions.EXT_graphics_pipeline_library) { vue_layout = INTEL_VUE_LAYOUT_FIXED; } else { - vue_layout = INTEL_VUE_LAYOUT_SEPARATE; + vue_layout = + (pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB && + device->vk.enabled_extensions.EXT_mesh_shader) ? + INTEL_VUE_LAYOUT_SEPARATE_MESH : INTEL_VUE_LAYOUT_SEPARATE; } for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) { diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 2d707e4f81f..ec891a7dae0 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -4976,6 +4976,9 @@ struct anv_graphics_pipeline { uint32_t view_mask; uint32_t instance_multiplier; + /* Attribute index of the PrimitiveID in the delivered attributes */ + uint32_t primitive_id_index; + bool kill_pixel; bool uses_xfb; bool sample_shading_enable; diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c index 4fd19bd3bee..01da0c25d67 100644 --- a/src/intel/vulkan/genX_gfx_state.c +++ b/src/intel/vulkan/genX_gfx_state.c @@ -780,6 +780,7 @@ update_fs_msaa_flags(struct anv_gfx_dynamic_state *hw_state, .rasterization_samples = dyn->ms.rasterization_samples, .coarse_pixel = !vk_fragment_shading_rate_is_disabled(&dyn->fsr), .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable, + .primitive_id_index = pipeline->primitive_id_index, }); SET(FS_MSAA_FLAGS, fs_msaa_flags, fs_msaa_flags); diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index b773704ae27..bca6441e048 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -531,7 +531,7 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline, const struct intel_mesh_urb_allocation alloc = intel_get_mesh_urb_config(devinfo, pipeline->base.base.l3_config, task_prog_data ? task_prog_data->map.size_dw : 0, - mesh_prog_data->map.size_dw); + mesh_prog_data->map.size / 4); /* Zero out the primitive pipeline URB allocations. 
*/ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { @@ -641,14 +641,14 @@ sbe_primitive_id_override(struct anv_graphics_pipeline *pipeline) get_mesh_prog_data(pipeline); const struct brw_mue_map *mue = &mesh_prog_data->map; return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) && - mue->start_dw[VARYING_SLOT_PRIMITIVE_ID] == -1; + mue->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_ID] == -1; } const struct intel_vue_map *fs_input_map = &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map; return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) && - fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1; + (fs_input_map->slots_valid & VARYING_BIT_PRIMITIVE_ID) == 0; } static void @@ -667,30 +667,34 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) return; } + const struct intel_vue_map *vue_map = + anv_pipeline_is_mesh(pipeline) ? + &get_mesh_prog_data(pipeline)->map.vue_map : + &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map; + anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) { anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) { + int max_source_attr = 0; + uint32_t vertex_read_offset, vertex_read_length, vertex_varyings; + brw_compute_sbe_per_vertex_urb_read( + vue_map, anv_pipeline_is_mesh(pipeline), wm_prog_data, + &vertex_read_offset, &vertex_read_length, &vertex_varyings, + &pipeline->primitive_id_index); - /* TODO(mesh): Figure out cases where we need attribute swizzling. See also - * calculate_urb_setup() and related functions. - */ sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline); sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT; - sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; - sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs; + sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs & + ((1u << vertex_varyings) - 1); + sbe.NumberofSFOutputAttributes = vertex_varyings; for (unsigned i = 0; i < 32; i++) sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; + /* As far as we can test, some of the fields in 3DSTATE_SBE & all of + * 3DSTATE_SBE_SWIZ has no effect when the pipeline is using Mesh so + * don't bother filling those fields. + */ if (anv_pipeline_is_primitive(pipeline)) { - const struct intel_vue_map *fs_input_map = - &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map; - - int first_slot = - brw_compute_first_fs_urb_slot_required(wm_prog_data->inputs, - fs_input_map); - assert(first_slot % 2 == 0); - unsigned urb_entry_read_offset = first_slot / 2; - int max_source_attr = 0; for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) { uint8_t attr = wm_prog_data->urb_setup_attribs[idx]; int input_index = wm_prog_data->urb_setup[attr]; @@ -702,7 +706,7 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) continue; } - const int slot = fs_input_map->varying_to_slot[attr]; + const int slot = vue_map->varying_to_slot[attr]; if (slot == -1) { /* This attribute does not exist in the VUE--that means that @@ -723,7 +727,7 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) /* We have to subtract two slots to account for the URB entry * output read offset in the VS and GS stages. 
*/ - const int source_attr = slot - 2 * urb_entry_read_offset; + const int source_attr = slot - 2 * vertex_read_offset; assert(source_attr >= 0 && source_attr < 32); max_source_attr = MAX2(max_source_attr, source_attr); /* The hardware can only do overrides on 16 overrides at a time, @@ -737,83 +741,51 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) assert(source_attr == input_index); } - sbe.VertexURBEntryReadOffset = urb_entry_read_offset; - sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2); + sbe.VertexURBEntryReadOffset = vertex_read_offset; + sbe.VertexURBEntryReadLength = vertex_read_length; sbe.ForceVertexURBEntryReadOffset = true; sbe.ForceVertexURBEntryReadLength = true; - - /* Ask the hardware to supply PrimitiveID if the fragment shader - * reads it but a previous stage didn't write one. - */ - if (sbe_primitive_id_override(pipeline)) { - sbe.PrimitiveIDOverrideAttributeSelect = - wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID]; - sbe.PrimitiveIDOverrideComponentX = true; - sbe.PrimitiveIDOverrideComponentY = true; - sbe.PrimitiveIDOverrideComponentZ = true; - sbe.PrimitiveIDOverrideComponentW = true; - } } - } - } + + /* Ask the hardware to supply PrimitiveID if the fragment shader reads + * it but a previous stage didn't write one. + */ + if (sbe_primitive_id_override(pipeline)) { + sbe.PrimitiveIDOverrideAttributeSelect = + wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID]; + sbe.PrimitiveIDOverrideComponentX = true; + sbe.PrimitiveIDOverrideComponentY = true; + sbe.PrimitiveIDOverrideComponentZ = true; + sbe.PrimitiveIDOverrideComponentW = true; + } #if GFX_VERx10 >= 125 - if (device->vk.enabled_extensions.EXT_mesh_shader) { - anv_pipeline_emit(pipeline, final.sbe_mesh, - GENX(3DSTATE_SBE_MESH), sbe_mesh) { - if (!anv_pipeline_is_mesh(pipeline)) - continue; + if (device->vk.enabled_extensions.EXT_mesh_shader) { + anv_pipeline_emit(pipeline, final.sbe_mesh, + GENX(3DSTATE_SBE_MESH), sbe_mesh) { + if (!anv_pipeline_is_mesh(pipeline)) + continue; - const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); - const struct brw_mue_map *mue = &mesh_prog_data->map; + const struct brw_mesh_prog_data *mesh_prog_data = + get_mesh_prog_data(pipeline); - assert(mue->per_vertex_header_size_dw % 8 == 0); - sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8; - sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8); + sbe_mesh.PerVertexURBEntryOutputReadOffset = vertex_read_offset; + sbe_mesh.PerVertexURBEntryOutputReadLength = vertex_read_length; - /* Clip distance array is passed in the per-vertex header so that it - * can be consumed by the HW. If user wants to read it in the FS, - * adjust the offset and length to cover it. Conveniently it is at - * the end of the per-vertex header, right before per-vertex - * attributes. - * - * Note that FS attribute reading must be aware that the clip - * distances have fixed position. 
- */ - if (mue->per_vertex_header_size_dw > 8 && - (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 || - wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) { - sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1; - sbe_mesh.PerVertexURBEntryOutputReadLength += 1; - } + uint32_t prim_read_offset, prim_read_length; + brw_compute_sbe_per_primitive_urb_read(wm_prog_data->per_primitive_inputs, + wm_prog_data->num_per_primitive_inputs, + &mesh_prog_data->map, + &prim_read_offset, + &prim_read_length); - if (mue->user_data_in_vertex_header) { - sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1; - sbe_mesh.PerVertexURBEntryOutputReadLength += 1; - } - - assert(mue->per_primitive_header_size_dw % 8 == 0); - sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = - mue->per_primitive_header_size_dw / 8; - sbe_mesh.PerPrimitiveURBEntryOutputReadLength = - DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8); - - /* Just like with clip distances, if Primitive Shading Rate, Viewport - * Index or Layer is read back in the FS, adjust the offset and - * length to cover the Primitive Header, where PSR, Viewport Index & - * Layer are stored. - */ - if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 || - wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 || - wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 || - mue->user_data_in_primitive_header) { - assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0); - sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1; - sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1; + sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = prim_read_offset; + sbe_mesh.PerPrimitiveURBEntryOutputReadLength = prim_read_length; } } - } #endif + } + } } static void @@ -842,7 +814,7 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline, } else { assert(anv_pipeline_is_mesh(pipeline)); const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); - point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0; + point_from_shader = mesh_prog_data->map.vue_map.slots_valid & VARYING_BIT_PSIZ; } if (point_from_shader) { @@ -893,7 +865,7 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline, const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); clip.ForceZeroRTAIndexEnable = - mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0; + mesh_prog_data->map.per_primitive_offsets[VARYING_SLOT_LAYER] < 0; } clip.NonPerspectiveBarycentricEnable = wm_prog_data ? 
@@ -905,12 +877,13 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline, if (device->vk.enabled_extensions.EXT_mesh_shader) { anv_pipeline_emit(pipeline, final.clip_mesh, GENX(3DSTATE_CLIP_MESH), clip_mesh) { - if (anv_pipeline_is_mesh(pipeline)) { - const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); - clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0; - clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask; - clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask; - } + if (!anv_pipeline_is_mesh(pipeline)) + continue; + + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.has_per_primitive_header; + clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask; + clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask; } } #endif @@ -1903,9 +1876,9 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline) mesh.MaximumPrimitiveCount = MAX2(mesh_prog_data->map.max_primitives, 1) - 1; mesh.OutputTopology = output_topology; - mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8; - mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0; - mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8; + mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_stride / 32; + mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_stride > 0; + mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_stride / 32; mesh.IndexFormat = index_format; mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier;
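
A note on the PrimitiveID plumbing introduced above: the driver packs the attribute
index of PrimitiveID into bits 20:31 of the dynamic FS MSAA flags (see
intel_fs_msaa_flags()), and brw_nir_lower_fs_inputs() recovers it with a plain
unsigned right shift, since those are the top bits. The sketch below is illustrative
only and not part of the patch; the two defines mirror the ones added to
intel_shader_enums.h and the helper names are invented.

#include <stdint.h>

#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET (20)
#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH   (32)

/* Driver side: mirror of the packing done in intel_fs_msaa_flags(). */
static inline uint32_t
pack_primitive_id_index(uint32_t fs_msaa_flags, uint32_t primitive_id_index)
{
   /* Index 32 (INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH) means the value is
    * read from the per-primitive block written by the mesh shader.
    */
   return fs_msaa_flags |
          (primitive_id_index << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET);
}

/* Shader side: mirror of the nir_ushr_imm() in brw_nir_lower_fs_inputs();
 * no mask is needed because the index occupies the top bits.
 */
static inline uint32_t
unpack_primitive_id_index(uint32_t fs_msaa_flags)
{
   return fs_msaa_flags >> INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET;
}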
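
The payload offset that brw_nir_lower_fs_inputs() builds with the NIR builder for the
indirect PrimitiveID read is easier to follow in scalar form. Below is a sketch of
the same arithmetic under stated assumptions: the helper name is invented, index is
the value unpacked from the MSAA flags, max_polygons is the polygon count of the
dispatch, and grf_size is the GRF size in bytes taken from devinfo.

#include <stdint.h>

/* Illustrative scalar version of the offset fed to
 * nir_read_attribute_payload_intel() for the indirect PrimitiveID read.
 */
static uint32_t
primitive_id_attribute_offset(uint32_t ver, uint32_t grf_size,
                              uint32_t index, uint32_t max_polygons)
{
   /* Index 32 means PrimitiveID comes from the per-primitive block, which
    * always lays it out at offset 0.
    */
   if (index == 32 /* INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH */)
      return 0;

   uint32_t offset;
   if (ver >= 20) {
      /* Gfx20+ packs 5 slots per GRF. */
      offset = (index / 5) * (max_polygons * 64) + (index % 5) * 12;
   } else {
      /* Older gens pack 2 slots per GRF. */
      offset = (index / 2) * (max_polygons * 32) + (index % 2) * 16 + 12;
   }

   /* One GRF of padding sits in front of the attribute section. */
   return offset + grf_size;
}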