diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 172d372c129..3691c4325d6 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -1412,8 +1412,6 @@ struct brw_clip_prog_data {
 };

 struct brw_tue_map {
-   int32_t start_dw[VARYING_SLOT_MAX];
-
    uint32_t size_dw;

    uint32_t per_task_data_start_dw;
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index 190c60e01e6..cf7d49ca1e0 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -77,18 +77,33 @@ type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
    return glsl_count_dword_slots(type, bindless);
 }

+/* TODO(mesh): Make this a common function. */
+static void
+shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
+{
+   assert(glsl_type_is_vector_or_scalar(type));
+
+   uint32_t comp_size = glsl_type_is_boolean(type)
+      ? 4 : glsl_get_bit_size(type) / 8;
+   unsigned length = glsl_get_vector_elements(type);
+   *size = comp_size * length;
+   *align = comp_size * (length == 3 ? 4 : length);
+}
+
 static void
 brw_nir_lower_tue_outputs(nir_shader *nir, const brw_tue_map *map)
 {
    nir_foreach_shader_out_variable(var, nir) {
-      int location = var->data.location;
-      assert(location >= 0);
-      assert(map->start_dw[location] != -1);
-      var->data.driver_location = map->start_dw[location];
+      assert(var->data.location == VARYING_SLOT_TASK_COUNT);
+      /* First word in TUE header. */
+      var->data.driver_location = 0;
    }

    nir_lower_io(nir, nir_var_shader_out, type_size_scalar_dwords,
                 nir_lower_io_lower_64bit_to_32);
+
+   nir_lower_explicit_io(nir, nir_var_mem_task_payload,
+                         nir_address_format_32bit_offset);
 }

 static void
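For reference, the size/align rule that shared_type_info feeds into nir_lower_vars_to_explicit_types can be exercised on its own. The sketch below is not Mesa code: the helper name is hypothetical, and it only mirrors the rule added above, where a 3-component vector is padded out to 4-component alignment.

```c
/* Standalone sketch (not Mesa code) of the vector sizing rule implemented by
 * shared_type_info above: size is component_size * length, while alignment
 * pads 3-component vectors to a 4-component boundary. */
#include <assert.h>
#include <stdio.h>

static void
sketch_type_info(unsigned comp_size, unsigned length,
                 unsigned *size, unsigned *align)
{
   assert(length >= 1 && length <= 4);
   *size = comp_size * length;
   *align = comp_size * (length == 3 ? 4 : length);
}

int
main(void)
{
   /* For 4-byte (float) components: a vec3 holds 12 bytes of data but is
    * aligned like a vec4 (16 bytes), matching std430-style vector alignment. */
   for (unsigned len = 1; len <= 4; len++) {
      unsigned size, align;
      sketch_type_info(4, len, &size, &align);
      printf("vec%u: size=%u align=%u\n", len, size, align);
   }
   return 0;
}
```

Running it prints size=12 align=16 for vec3, which is what keeps explicit task-payload offsets from packing a vec3 and a following scalar into conflicting slots.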
@@ -96,63 +111,79 @@ brw_compute_tue_map(struct nir_shader *nir, struct brw_tue_map *map)
 {
    memset(map, 0, sizeof(*map));

-   map->start_dw[VARYING_SLOT_TASK_COUNT] = 0;
-
-   /* Words 1-3 are used for "Dispatch Dimensions" feature, to allow mapping a
-    * 3D dispatch into the 1D dispatch supported by HW. So ignore those.
-    */
-
-   /* From bspec: "It is suggested that SW reserve the 16 bytes following the
+   /* TUE header contains 4 words:
+    *
+    * - Word 0 for Task Count.
+    *
+    * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
+    *   3D dispatch into the 1D dispatch supported by HW. Currently not used.
+    *
+    * From bspec: "It is suggested that SW reserve the 16 bytes following the
     * TUE Header, and therefore start the SW-defined data structure at 32B
     * alignment. This allows the TUE Header to always be written as 32 bytes
     * with 32B alignment, the most optimal write performance case."
     */
    map->per_task_data_start_dw = 8;

+   /* Lowering to explicit types will start offsets from task_payload_size, so
+    * set it to start after the header.
+    */
+   nir->info.task_payload_size = map->per_task_data_start_dw * 4;
+   nir_lower_vars_to_explicit_types(nir, nir_var_mem_task_payload,
+                                    shared_type_info);

-   /* Compact the data: find the size associated with each location... */
-   nir_foreach_shader_out_variable(var, nir) {
-      const int location = var->data.location;
-      if (location == VARYING_SLOT_TASK_COUNT)
-         continue;
-      assert(location >= VARYING_SLOT_VAR0);
-      assert(location < VARYING_SLOT_MAX);
-
-      map->start_dw[location] += type_size_scalar_dwords(var->type, false);
-   }
-
-   /* ...then assign positions using those sizes. */
-   unsigned next = map->per_task_data_start_dw;
-   for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
-      if (i == VARYING_SLOT_TASK_COUNT)
-         continue;
-      if (map->start_dw[i] == 0) {
-         map->start_dw[i] = -1;
-      } else {
-         const unsigned size = map->start_dw[i];
-         map->start_dw[i] = next;
-         next += size;
-      }
-   }
-
-   map->size_dw = ALIGN(next, 8);
+   map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
 }

 static void
 brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
 {
-   fprintf(fp, "TUE map (%d dwords)\n", map->size_dw);
-   fprintf(fp, "  %4d: VARYING_SLOT_TASK_COUNT\n",
-           map->start_dw[VARYING_SLOT_TASK_COUNT]);
+   fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
+}

-   for (int i = VARYING_SLOT_VAR0; i < VARYING_SLOT_MAX; i++) {
-      if (map->start_dw[i] != -1) {
-         fprintf(fp, "  %4d: VARYING_SLOT_VAR%d\n", map->start_dw[i],
-                 i - VARYING_SLOT_VAR0);
-      }
+static bool
+brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
+                                          nir_instr *instr, void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_store_task_payload:
+   case nir_intrinsic_load_task_payload: {
+      nir_src *offset_src = nir_get_io_offset_src(intrin);
+
+      if (nir_src_is_const(*offset_src))
+         assert(nir_src_as_uint(*offset_src) % 4 == 0);
+
+      b->cursor = nir_before_instr(&intrin->instr);
+
+      /* Regular I/O uses dwords, while the explicit I/O used for the task
+       * payload uses bytes.  Normalize it to dwords.
+       *
+       * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
+       */
+
+      assert(offset_src->is_ssa);
+      nir_ssa_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
+      nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset));
+
+      return true;
    }
-   fprintf(fp, "\n");
+   default:
+      return false;
+   }
+}
+
+static void
+brw_nir_adjust_task_payload_offsets(nir_shader *nir)
+{
+   nir_shader_instructions_pass(nir, brw_nir_adjust_task_payload_offsets_instr,
+                                nir_metadata_block_index |
+                                nir_metadata_dominance,
+                                NULL);
 }

 const unsigned *
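The sizing in brw_compute_tue_map is easiest to check with concrete numbers: the header occupies 8 dwords (32 bytes), explicit-type lowering appends payload variables after it into nir->info.task_payload_size, and size_dw rounds the byte total up to whole dwords and then to a multiple of 8 dwords. Below is a standalone sketch of that arithmetic (not Mesa code; the 21-byte payload is a made-up example, and ALIGN/DIV_ROUND_UP are reimplemented locally to keep it self-contained).

```c
/* Standalone sketch of the TUE sizing and offset-normalization arithmetic
 * used above.  Assumes explicit-type lowering has already accumulated the
 * total payload size in bytes, header included. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define ALIGN(v, a)        (DIV_ROUND_UP(v, a) * (a))

int
main(void)
{
   const uint32_t per_task_data_start_dw = 8;   /* 32B TUE header */

   /* Suppose the task shader declares 21 bytes of payload variables;
    * explicit lowering places them starting at byte 32, so the total
    * becomes 53 bytes. */
   uint32_t task_payload_size = per_task_data_start_dw * 4 + 21;

   /* size_dw rounds up to whole dwords, then to a multiple of 8 dwords. */
   uint32_t size_dw = ALIGN(DIV_ROUND_UP(task_payload_size, 4), 8);
   printf("task_payload_size=%u bytes -> TUE size=%u dwords\n",
          task_payload_size, size_dw);
   assert(size_dw == 16);

   /* Offsets produced by explicit I/O are in bytes and dword-aligned, so
    * the adjust pass converts them with a shift by 2, mirroring
    * nir_ishr_imm(b, offset, 2) above. */
   uint32_t byte_offset = 36;
   assert(byte_offset % 4 == 0);
   printf("byte offset %u -> dword offset %u\n", byte_offset, byte_offset >> 2);
   return 0;
}
```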
@@ -178,6 +209,7 @@ brw_compile_task(const struct brw_compiler *compiler,

    brw_compute_tue_map(nir, &prog_data->map);
    NIR_PASS_V(nir, brw_nir_lower_tue_outputs, &prog_data->map);
+   NIR_PASS_V(nir, brw_nir_adjust_task_payload_offsets);

    const unsigned required_dispatch_width =
       brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type);
@@ -259,15 +291,24 @@ brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
    if (!map)
       return;

-   nir_foreach_shader_in_variable(var, nir) {
-      int location = var->data.location;
-      assert(location >= 0);
-      assert(map->start_dw[location] != -1);
-      var->data.driver_location = map->start_dw[location];
+   nir->info.task_payload_size = map->per_task_data_start_dw * 4;
+
+   if (nir_lower_vars_to_explicit_types(nir, nir_var_mem_task_payload,
+                                        shared_type_info)) {
+      /* The types for Task Output and Mesh Input should match, so their sizes
+       * should also match.
+       */
+      assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
+   } else {
+      /* Mesh doesn't read any input, so to make that clear set
+       * task_payload_size to zero instead of keeping an incomplete size that
+       * just includes the header.
+       */
+      nir->info.task_payload_size = 0;
    }

-   nir_lower_io(nir, nir_var_shader_in, type_size_scalar_dwords,
-                nir_lower_io_lower_64bit_to_32);
+   nir_lower_explicit_io(nir, nir_var_mem_task_payload,
+                         nir_address_format_32bit_offset);
 }

 /* Mesh URB Entry consists of an initial section
@@ -645,6 +686,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);

    NIR_PASS_V(nir, brw_nir_lower_tue_inputs, params->tue_map);
+   NIR_PASS_V(nir, brw_nir_adjust_task_payload_offsets);

    brw_compute_mue_map(nir, &prog_data->map);
    NIR_PASS_V(nir, brw_nir_lower_mue_outputs, &prog_data->map);
@@ -742,7 +784,13 @@ brw_compile_mesh(const struct brw_compiler *compiler,
 static fs_reg
 get_mesh_urb_handle(const fs_builder &bld, nir_intrinsic_op op)
 {
-   const unsigned subreg = op == nir_intrinsic_load_input ? 7 : 6;
+   unsigned subreg;
+   if (bld.shader->stage == MESA_SHADER_TASK) {
+      subreg = 6;
+   } else {
+      assert(bld.shader->stage == MESA_SHADER_MESH);
+      subreg = op == nir_intrinsic_load_task_payload ? 7 : 6;
+   }

    fs_builder ubld8 = bld.group(8, 0).exec_all();

@@ -753,6 +801,15 @@ get_mesh_urb_handle(const fs_builder &bld, nir_intrinsic_op op)
    return h;
 }

+static unsigned
+component_from_intrinsic(nir_intrinsic_instr *instr)
+{
+   if (nir_intrinsic_has_component(instr))
+      return nir_intrinsic_component(instr);
+   else
+      return 0;
+}
+
 static void
 adjust_handle_and_offset(const fs_builder &bld,
                          fs_reg &urb_handle,
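The subregister choice in get_mesh_urb_handle encodes a small decision table: a Task shader always takes its URB handle from subregister 6, while a Mesh shader reads the task payload through the handle in subregister 7 and everything else through subregister 6. A standalone sketch of that table follows (not Mesa code; the enums are hypothetical stand-ins for gl_shader_stage and nir_intrinsic_op, and the r0.6/r0.7 naming is only an illustration of the mapping in the code above).

```c
/* Standalone sketch of the URB-handle subregister selection added in
 * get_mesh_urb_handle above. */
#include <assert.h>
#include <stdio.h>

enum stage { STAGE_TASK, STAGE_MESH };
enum op { OP_LOAD_TASK_PAYLOAD, OP_STORE_OUTPUT, OP_LOAD_OUTPUT };

static unsigned
urb_handle_subreg(enum stage stage, enum op op)
{
   if (stage == STAGE_TASK)
      return 6;                                 /* own TUE handle */
   assert(stage == STAGE_MESH);
   return op == OP_LOAD_TASK_PAYLOAD ? 7 : 6;   /* TUE handle vs. MUE handle */
}

int
main(void)
{
   printf("task store:        r0.%u\n",
          urb_handle_subreg(STAGE_TASK, OP_STORE_OUTPUT));
   printf("mesh payload load: r0.%u\n",
          urb_handle_subreg(STAGE_MESH, OP_LOAD_TASK_PAYLOAD));
   printf("mesh output load:  r0.%u\n",
          urb_handle_subreg(STAGE_MESH, OP_LOAD_OUTPUT));
   return 0;
}
```

The companion helper component_from_intrinsic exists because the task_payload intrinsics, unlike the shader output intrinsics handled here before, carry no component index, so the code falls back to 0 instead of asserting inside nir_intrinsic_component.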
@@ -787,7 +844,7 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
    const unsigned mask = nir_intrinsic_write_mask(instr);
    const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
                                      nir_src_as_uint(*offset_nir_src) +
-                                     nir_intrinsic_component(instr);
+                                     component_from_intrinsic(instr);

    /* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
     * With a max of 4 components, an intrinsic can require up to two writes.
@@ -873,7 +930,7 @@ emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
    fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);

    const unsigned base_in_dwords = nir_intrinsic_base(instr) +
-                                   nir_intrinsic_component(instr);
+                                   component_from_intrinsic(instr);

    /* Use URB write messages that allow different offsets per-slot.  The offset
     * is in units of vec4s (128 bits), so we use a write for each component,
@@ -940,7 +997,7 @@ emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,

    const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
                                      nir_src_as_uint(*offset_nir_src) +
-                                     nir_intrinsic_component(instr);
+                                     component_from_intrinsic(instr);

    unsigned urb_global_offset = offset_in_dwords / 4;
    adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
@@ -987,7 +1044,7 @@ emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
    fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);

    const unsigned base_in_dwords = nir_intrinsic_base(instr) +
-                                   nir_intrinsic_component(instr);
+                                   component_from_intrinsic(instr);

    for (unsigned c = 0; c < comps; c++) {
       for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
@@ -1072,10 +1129,12 @@ fs_visitor::nir_emit_task_intrinsic(const fs_builder &bld,

    switch (instr->intrinsic) {
    case nir_intrinsic_store_output:
+   case nir_intrinsic_store_task_payload:
       emit_task_mesh_store(bld, instr);
       break;

    case nir_intrinsic_load_output:
+   case nir_intrinsic_load_task_payload:
       emit_task_mesh_load(bld, instr);
       break;

@@ -1098,10 +1157,10 @@ fs_visitor::nir_emit_mesh_intrinsic(const fs_builder &bld,
       emit_task_mesh_store(bld, instr);
       break;

-   case nir_intrinsic_load_input:
    case nir_intrinsic_load_per_vertex_output:
    case nir_intrinsic_load_per_primitive_output:
    case nir_intrinsic_load_output:
+   case nir_intrinsic_load_task_payload:
       emit_task_mesh_load(bld, instr);
       break;

diff --git a/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c b/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c
index 1b83b28b8db..bfdfa732049 100644
--- a/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c
+++ b/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c
@@ -258,12 +258,14 @@ lower_mem_access_bit_sizes_instr(nir_builder *b,
    case nir_intrinsic_load_ssbo:
    case nir_intrinsic_load_shared:
    case nir_intrinsic_load_scratch:
+   case nir_intrinsic_load_task_payload:
       return lower_mem_load_bit_size(b, intrin, devinfo);

    case nir_intrinsic_store_global:
    case nir_intrinsic_store_ssbo:
    case nir_intrinsic_store_shared:
    case nir_intrinsic_store_scratch:
+   case nir_intrinsic_store_task_payload:
       return lower_mem_store_bit_size(b, intrin, devinfo);

    default:
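The emit_urb_* helpers above all share the same addressing math: URB messages address vec4 (4-dword) slots while the lowered intrinsics carry dword offsets, so each access splits into a vec4-granular global offset plus a component within the slot. An access that is not vec4-aligned can straddle two slots, which is why, per the comment above, a 4-component intrinsic can require up to two writes. A minimal sketch of the split (not Mesa code, names hypothetical):

```c
/* Standalone sketch of the URB addressing split used by the emit_urb_*
 * helpers above: a dword offset maps to (vec4 slot, component within slot). */
#include <stdio.h>

struct urb_addr {
   unsigned global_offset;   /* in vec4 (4-dword) slots */
   unsigned component;       /* dword within the vec4 */
};

static struct urb_addr
split_dword_offset(unsigned offset_in_dwords)
{
   struct urb_addr a = {
      .global_offset = offset_in_dwords / 4,
      .component = offset_in_dwords % 4,
   };
   return a;
}

int
main(void)
{
   /* base=6 plus component=1 gives dword 7, i.e. slot 1, component 3; a
    * vec2 starting there would spill its second component into slot 2. */
   struct urb_addr a = split_dword_offset(6 + 1);
   printf("dword 7 -> slot %u, component %u\n", a.global_offset, a.component);
   return 0;
}
```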