diff --git a/src/intel/compiler/brw/brw_compile_gs.cpp b/src/intel/compiler/brw/brw_compile_gs.cpp
index b9c79742fa6..74e5b5f2d6c 100644
--- a/src/intel/compiler/brw/brw_compile_gs.cpp
+++ b/src/intel/compiler/brw/brw_compile_gs.cpp
@@ -288,6 +288,14 @@ brw_compile_gs(const struct brw_compiler *compiler,
    prog_data->output_vertex_size_hwords =
       align(output_vertex_size_bytes, 32) / 32;
 
+   const unsigned starting_urb_offset =
+      2 * prog_data->control_data_header_size_hwords +
+      ((prog_data->static_vertex_count == -1) ? 2 : 0);
+
+   BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, compiler->devinfo,
+                &prog_data->base.vue_map, starting_urb_offset,
+                2 * prog_data->output_vertex_size_hwords);
+
    /* Compute URB entry size. The maximum allowed URB entry size is 32k.
     * That divides up as follows:
     *
diff --git a/src/intel/compiler/brw/brw_compile_tes.cpp b/src/intel/compiler/brw/brw_compile_tes.cpp
index 44c4550772f..6cd8301e22b 100644
--- a/src/intel/compiler/brw/brw_compile_tes.cpp
+++ b/src/intel/compiler/brw/brw_compile_tes.cpp
@@ -40,8 +40,6 @@ run_tes(brw_shader &s)
    if (s.failed)
       return false;
 
-   s.emit_urb_writes();
-
    brw_calculate_cfg(s);
 
    s.emit_tes_terminate();
@@ -132,6 +130,9 @@ brw_compile_tes(const struct brw_compiler *compiler,
 
    brw_postprocess_nir(pt, debug_enabled, key->base.robust_flags);
 
+   BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo,
+                &prog_data->base.vue_map, 0, 0);
+
    unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;
 
    assert(output_size_bytes >= 1);
diff --git a/src/intel/compiler/brw/brw_compile_vs.cpp b/src/intel/compiler/brw/brw_compile_vs.cpp
index 86cd321912a..549d5199ebb 100644
--- a/src/intel/compiler/brw/brw_compile_vs.cpp
+++ b/src/intel/compiler/brw/brw_compile_vs.cpp
@@ -214,8 +214,6 @@ run_vs(brw_shader &s)
    if (s.failed)
       return false;
 
-   s.emit_urb_writes();
-
    brw_calculate_cfg(s);
 
    ASSERTED bool eot = s.mark_last_urb_write_with_eot();
@@ -300,6 +298,9 @@ brw_compile_vs(const struct brw_compiler *compiler,
 
    brw_postprocess_nir(pt, debug_enabled, key->base.robust_flags);
 
+   BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, compiler->devinfo,
+                &prog_data->base.vue_map, 0, 0);
+
    unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
    /* gl_VertexID and gl_InstanceID are system values, but arrive via an
    * incoming vertex attribute. So, add an extra slot.
diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index 5cf8d49dcbf..5bf59798b4d 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -78,56 +78,6 @@ setup_imm_b(const brw_builder &bld, int8_t v)
    return tmp;
 }
 
-static void
-brw_from_nir_setup_outputs(nir_to_brw_state &ntb)
-{
-   brw_shader &s = ntb.s;
-
-   if (s.stage == MESA_SHADER_TESS_CTRL ||
-       s.stage == MESA_SHADER_TASK ||
-       s.stage == MESA_SHADER_MESH ||
-       s.stage == MESA_SHADER_FRAGMENT ||
-       s.stage == MESA_SHADER_COMPUTE)
-      return;
-
-   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
-
-   /* Calculate the size of output registers in a separate pass, before
-    * allocating them. With ARB_enhanced_layouts, multiple output variables
-    * may occupy the same slot, but have different type sizes.
-    */
-   nir_foreach_shader_out_variable(var, s.nir) {
-      const int loc = var->data.driver_location;
-      const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
-      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
-   }
-
-   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
-      if (vec4s[loc] == 0) {
-         loc++;
-         continue;
-      }
-
-      unsigned reg_size = vec4s[loc];
-
-      /* Check if there are any ranges that start within this range and extend
-       * past it. If so, include them in this allocation.
-       */
-      for (unsigned i = 1; i < reg_size; i++) {
-         assert(i + loc < ARRAY_SIZE(vec4s));
-         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
-      }
-
-      brw_reg reg = ntb.bld.vgrf(BRW_TYPE_F, 4 * reg_size);
-      for (unsigned i = 0; i < reg_size; i++) {
-         assert(loc + i < ARRAY_SIZE(s.outputs));
-         s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
-      }
-
-      loc += reg_size;
-   }
-}
-
 static brw_reg
 emit_work_group_id_setup(nir_to_brw_state &ntb)
 {
@@ -2647,8 +2597,6 @@ emit_gs_vertex(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src,
       abld.emit(BRW_OPCODE_ENDIF);
    }
 
-   s.emit_urb_writes(vertex_count);
-
    /* In stream mode we have to set control data bits for all vertices
    * unless we have disabled control data bits completely (which we do
    * do for MESA_PRIM_POINTS outputs that don't use streams).
@@ -3143,13 +3091,6 @@ brw_from_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
 
    case nir_intrinsic_emit_vertex_with_counter:
       emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
-
-      /* After an EmitVertex() call, the values of all outputs are undefined.
-       * If this is not in control flow, recreate a fresh set of output
-       * registers to keep their live ranges separate.
-       */
-      if (instr->instr.block->cf_node.parent->type == nir_cf_node_function)
-         brw_from_nir_setup_outputs(ntb);
       break;
 
    case nir_intrinsic_end_primitive_with_counter:
@@ -5520,22 +5461,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
       break;
    }
 
-   case nir_intrinsic_store_output: {
-      assert(nir_src_bit_size(instr->src[0]) == 32);
-      brw_reg src = get_nir_src(ntb, instr->src[0], -1);
-
-      unsigned store_offset = nir_src_as_uint(instr->src[1]);
-      unsigned num_components = instr->num_components;
-      unsigned first_component = nir_intrinsic_component(instr);
-
-      brw_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
-                                       4 * store_offset), src.type);
-
-      brw_combine_with_vec(bld, offset(new_dest, bld, first_component),
-                           src, num_components);
-      break;
-   }
-
    case nir_intrinsic_load_subgroup_size:
      /* This should only happen for fragment shaders because every other case
      * is lowered in NIR so we can optimize on it.
@@ -6949,7 +6874,6 @@ brw_from_nir(brw_shader *s)
    /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */
-   brw_from_nir_setup_outputs(ntb);
    brw_from_nir_emit_system_values(ntb);
 
    ntb.s.last_scratch = align(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index 7a0bb1975a4..d6fec632c03 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -463,6 +463,159 @@ brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *nir,
                                        nir_metadata_control_flow, (void *) cd);
 }
 
+/* See if comps[0..3] has any non-undef values.
+ */
+static bool
+slot_defined(nir_scalar *comps)
+{
+   for (unsigned i = 0; i < 4; i++) {
+      if (comps[i].def && !nir_def_is_undef(comps[i].def))
+         return true;
+   }
+   return false;
+}
+
+/* Replace any NULL defs in comps[0..(n-1)] with undef */
+static void
+fill_undefs(nir_scalar *comps, nir_def *undef, unsigned n)
+{
+   for (unsigned i = 0; i < n; i++) {
+      if (!comps[i].def)
+         comps[i] = nir_get_scalar(undef, 0);
+   }
+}
+
+static void
+emit_urb_writes(nir_builder *b,
+                const struct intel_device_info *devinfo,
+                nir_scalar *outputs,
+                unsigned num_slots,
+                nir_def *offset)
+{
+   nir_def *undef = nir_undef(b, 1, 32);
+
+   /* Primitive Shading Rate defaults to (1, 1) in half-float */
+   if (devinfo->has_coarse_pixel_primitive_and_cb && !outputs[0].def)
+      outputs[0] = nir_get_scalar(nir_imm_int(b, 0x3C003C00), 0);
+
+   /* Viewport, Layer, and Point Size default to 0 */
+   for (unsigned i = 0; i < 4; i++) {
+      if (!outputs[i].def)
+         outputs[i] = nir_get_scalar(nir_imm_int(b, 0), 0);
+   }
+
+   /* Emit URB writes */
+   for (unsigned slot = 0; slot < num_slots; slot++) {
+      if (!slot_defined(&outputs[4 * slot]))
+         continue;
+
+      const bool vec8 = slot + 1 < num_slots &&
+                        slot_defined(&outputs[4 * (slot + 1)]);
+
+      fill_undefs(&outputs[4 * slot], undef, vec8 ? 8 : 4);
+
+      nir_def *val = nir_vec_scalars(b, &outputs[4 * slot], vec8 ? 8 : 4);
+
+      if (devinfo->ver >= 20) {
+         nir_def *addr = nir_iadd(b, output_handle(b),
+                                  nir_imul_imm(b, offset, 16));
+         nir_store_urb_lsc_intel(b, val, addr, .base = 16 * slot);
+      } else {
+         nir_store_urb_vec4_intel(b, val, output_handle(b), offset,
+                                  nir_imm_int(b, vec8 ? 0xff : 0xf),
+                                  .base = slot);
+      }
+
+      if (vec8)
+         slot++;
+   }
+}
+
+bool
+brw_nir_lower_deferred_urb_writes(nir_shader *nir,
+                                  const struct intel_device_info *devinfo,
+                                  const struct intel_vue_map *vue_map,
+                                  unsigned extra_urb_slot_offset,
+                                  unsigned gs_vertex_stride)
+{
+   nir_scalar *outputs = calloc(vue_map->num_slots, 4 * sizeof(nir_scalar));
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_store_output:
+         case nir_intrinsic_store_per_view_output: {
+            nir_src *view_index = nir_get_io_arrayed_index_src(intrin);
+            nir_src *offset = nir_get_io_offset_src(intrin);
+
+            const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
+            const unsigned slot =
+               vue_map->varying_to_slot[sem.location] +
+               (view_index ? nir_src_as_uint(*view_index) : 0) +
+               nir_src_as_uint(*offset);
+            const unsigned c = io_component(intrin, NULL);
+            const unsigned mask = nir_intrinsic_write_mask(intrin);
+            assert(slot != -1);
+            assert(c < 4);
+
+            u_foreach_bit(i, mask) {
+               outputs[4 * slot + c + i] =
+                  nir_scalar_resolved(intrin->src[0].ssa, i);
+            }
+
+            nir_instr_remove(instr);
+            break;
+         }
+
+         case nir_intrinsic_emit_vertex_with_counter: {
+            /* The only purpose of primitives sent to non-zero streams
+             * is to be recorded by transform feedback, so if it is disabled,
+             * we can discard all geometry bound for those streams.
+             */
+            if (nir_intrinsic_stream_id(intrin) > 0 &&
+                !nir->info.has_transform_feedback_varyings) {
+               nir_instr_remove(instr);
+               break;
+            }
+
+            nir_builder b = nir_builder_at(nir_before_instr(instr));
+            b.constant_fold_alu = true;
+            nir_def *offset =
+               nir_iadd_imm(&b, nir_imul_imm(&b, intrin->src[0].ssa,
+                                             gs_vertex_stride),
+                            extra_urb_slot_offset);
+
+            emit_urb_writes(&b, devinfo, outputs, vue_map->num_slots, offset);
+            /* After EmitVertex() all outputs are undefined */
+            memset(outputs, 0, 4 * vue_map->num_slots * sizeof(nir_scalar));
+
+            /* Leave emit_vertex_with_counter for control data writes */
+            break;
+         }
+
+         default:
+            break;
+         }
+      }
+   }
+
+   if (nir->info.stage != MESA_SHADER_GEOMETRY) {
+      nir_builder b = nir_builder_at(nir_after_impl(impl));
+      emit_urb_writes(&b, devinfo, outputs, vue_map->num_slots,
+                      nir_imm_int(&b, 0));
+   }
+
+   free(outputs);
+
+   return nir_progress(true, impl, nir_metadata_control_flow);
+}
+
+
 static bool
 lower_task_payload_to_urb(nir_builder *b, nir_intrinsic_instr *io, void *data)
 {
@@ -735,60 +888,6 @@ remap_tess_levels(nir_shader *nir,
                                      nir_metadata_control_flow, &cb);
 }
 
-/* Replace store_per_view_output to plain store_output, mapping the view index
- * to IO offset. Because we only use per-view outputs for position, the offset
- * pitch is always 1. */
-static bool
-lower_per_view_outputs(nir_builder *b,
-                       nir_intrinsic_instr *intrin,
-                       UNUSED void *cb_data)
-{
-   if (intrin->intrinsic != nir_intrinsic_store_per_view_output &&
-       intrin->intrinsic != nir_intrinsic_load_per_view_output)
-      return false;
-
-   b->cursor = nir_before_instr(&intrin->instr);
-
-   nir_src *view_index = nir_get_io_arrayed_index_src(intrin);
-   nir_src *offset = nir_get_io_offset_src(intrin);
-
-   nir_def *new_offset = nir_iadd(b, view_index->ssa, offset->ssa);
-
-   nir_intrinsic_instr *new;
-   if (intrin->intrinsic == nir_intrinsic_store_per_view_output)
-      new = nir_store_output(b, intrin->src[0].ssa, new_offset);
-   else {
-      nir_def *new_def = nir_load_output(b, intrin->def.num_components,
-                                         intrin->def.bit_size, new_offset);
-      new = nir_def_as_intrinsic(new_def);
-   }
-
-   nir_intrinsic_set_base(new, nir_intrinsic_base(intrin));
-   nir_intrinsic_set_range(new, nir_intrinsic_range(intrin));
-   nir_intrinsic_set_write_mask(new, nir_intrinsic_write_mask(intrin));
-   nir_intrinsic_set_component(new, nir_intrinsic_component(intrin));
-   nir_intrinsic_set_src_type(new, nir_intrinsic_src_type(intrin));
-
-   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
-   /* the meaning of the offset src is different for brw */
-   sem.no_validate = 1;
-   nir_intrinsic_set_io_semantics(new, sem);
-
-   if (intrin->intrinsic == nir_intrinsic_load_per_view_output)
-      nir_def_rewrite_uses(&intrin->def, &new->def);
-   nir_instr_remove(&intrin->instr);
-
-   return true;
-}
-
-static bool
-brw_nir_lower_per_view_outputs(nir_shader *nir)
-{
-   return nir_shader_intrinsics_pass(nir, lower_per_view_outputs,
-                                     nir_metadata_control_flow,
-                                     NULL);
-}
-
 static bool
 brw_nir_should_vectorize_urb(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size,
@@ -1306,13 +1405,8 @@ brw_nir_lower_fs_inputs(nir_shader *nir,
 void
 brw_nir_lower_vue_outputs(nir_shader *nir)
 {
-   nir_foreach_shader_out_variable(var, nir) {
-      var->data.driver_location = var->data.location;
-   }
-
    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4,
            nir_lower_io_lower_64bit_to_32);
-   NIR_PASS(_, nir, brw_nir_lower_per_view_outputs);
 }
 
 void
diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h
index 978ad981e87..81faeac06a0 100644
--- a/src/intel/compiler/brw/brw_nir.h
+++ b/src/intel/compiler/brw/brw_nir.h
@@ -219,6 +219,11 @@ struct brw_lower_urb_cb_data {
 
 bool brw_nir_lower_inputs_to_urb_intrinsics(nir_shader *, const struct brw_lower_urb_cb_data *);
 bool brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *, const struct brw_lower_urb_cb_data *);
+bool brw_nir_lower_deferred_urb_writes(nir_shader *nir,
+                                       const struct intel_device_info *devinfo,
+                                       const struct intel_vue_map *vue_map,
+                                       unsigned extra_urb_slot_offset,
+                                       unsigned gs_vertex_stride);
 
 void brw_nir_opt_vectorize_urb(struct brw_pass_tracker *pt);
 
diff --git a/src/intel/compiler/brw/brw_shader.cpp b/src/intel/compiler/brw/brw_shader.cpp
index 482df008755..bd2679b5f7d 100644
--- a/src/intel/compiler/brw/brw_shader.cpp
+++ b/src/intel/compiler/brw/brw_shader.cpp
@@ -19,257 +19,6 @@
 #include "compiler/nir/nir_builder.h"
 #include "util/u_math.h"
 
-void
-brw_shader::emit_urb_writes(const brw_reg &gs_vertex_count)
-{
-   int slot, urb_offset, length;
-   int starting_urb_offset = 0;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(this->prog_data);
-   const struct intel_vue_map *vue_map = &vue_prog_data->vue_map;
-   bool flush;
-   brw_reg sources[8];
-   brw_reg urb_handle;
-
-   switch (stage) {
-   case MESA_SHADER_VERTEX:
-      urb_handle = vs_payload().urb_handles;
-      break;
-   case MESA_SHADER_TESS_EVAL:
-      urb_handle = tes_payload().urb_output;
-      break;
-   case MESA_SHADER_GEOMETRY:
-      urb_handle = gs_payload().urb_handles;
-      break;
-   default:
-      UNREACHABLE("invalid stage");
-   }
-
-   const brw_builder bld = brw_builder(this);
-
-   brw_reg per_slot_offsets;
-
-   if (stage == MESA_SHADER_GEOMETRY) {
-      const struct brw_gs_prog_data *gs_prog_data =
-         brw_gs_prog_data(this->prog_data);
-
-      /* We need to increment the Global Offset to skip over the control data
-       * header and the extra "Vertex Count" field (1 HWord) at the beginning
-       * of the VUE. We're counting in OWords, so the units are doubled.
-       */
-      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
-      if (gs_prog_data->static_vertex_count == -1)
-         starting_urb_offset += 2;
-
-      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
-      const int output_vertex_size_owords =
-         gs_prog_data->output_vertex_size_hwords * 2;
-
-      /* On Xe2+ platform, LSC can operate on the Dword data element with byte
-       * offset granularity, so convert per slot offset in bytes since it's in
-       * Owords (16-bytes) unit else keep per slot offset in oword unit for
-       * previous platforms.
-       */
-      const int output_vertex_size = devinfo->ver >= 20 ?
-         output_vertex_size_owords * 16 :
-         output_vertex_size_owords;
-      if (gs_vertex_count.file == IMM) {
-         per_slot_offsets = brw_imm_ud(output_vertex_size *
-                                       gs_vertex_count.ud);
-      } else {
-         per_slot_offsets = bld.vgrf(BRW_TYPE_UD);
-         bld.MUL(per_slot_offsets, gs_vertex_count,
-                 brw_imm_ud(output_vertex_size));
-      }
-   }
-
-   length = 0;
-   urb_offset = starting_urb_offset;
-   flush = false;
-
-   /* SSO shaders can have VUE slots allocated which are never actually
-    * written to, so ignore them when looking for the last (written) slot.
-    */
-   int last_slot = vue_map->num_slots - 1;
-   while (last_slot > 0 &&
-          (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
-           outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
-      last_slot--;
-   }
-
-   bool urb_written = false;
-   for (slot = 0; slot < vue_map->num_slots; slot++) {
-      int varying = vue_map->slot_to_varying[slot];
-      switch (varying) {
-      case VARYING_SLOT_PSIZ: {
-         /* The point size varying slot is the vue header and is always in the
-          * vue map. If anything in the header is going to be read back by HW,
-          * we need to initialize it, in particular the viewport & layer
-          * values.
-          *
-          * SKL PRMs, Volume 7: 3D-Media-GPGPU, Vertex URB Entry (VUE)
-          * Formats:
-          *
-          *    "VUEs are written in two ways:
-          *
-          *     - At the top of the 3D Geometry pipeline, the VF's
-          *       InputAssembly function creates VUEs and initializes them
-          *       from data extracted from Vertex Buffers as well as
-          *       internally generated data.
-          *
-          *     - VS, GS, HS and DS threads can compute, format, and write
-          *       new VUEs as thread output."
-          *
-          *    "Software must ensure that any VUEs subject to readback by the
-          *     3D pipeline start with a valid Vertex Header. This extends to
-          *     all VUEs with the following exceptions:
-          *
-          *     - If the VS function is enabled, the VF-written VUEs are not
-          *       required to have Vertex Headers, as the VS-incoming
-          *       vertices are guaranteed to be consumed by the VS (i.e.,
-          *       the VS thread is responsible for overwriting the input
-          *       vertex data).
-          *
-          *     - If the GS FF is enabled, neither VF-written VUEs nor VS
-          *       thread-generated VUEs are required to have Vertex Headers,
-          *       as the GS will consume all incoming vertices.
-          *
-          *     - If Rendering is disabled, VertexHeaders are not required
-          *       anywhere."
-          */
-         brw_reg zero =
-            retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
-         bld.MOV(zero, brw_imm_ud(0u));
-
-         if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE &&
-             this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) {
-            sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE];
-         } else if (devinfo->has_coarse_pixel_primitive_and_cb) {
-            uint32_t one_fp16 = 0x3C00;
-            brw_reg one_by_one_fp16 =
-               retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
-            bld.MOV(one_by_one_fp16, brw_imm_ud((one_fp16 << 16) | one_fp16));
-            sources[length++] = one_by_one_fp16;
-         } else {
-            sources[length++] = zero;
-         }
-
-         if (vue_map->slots_valid & VARYING_BIT_LAYER)
-            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
-         else
-            sources[length++] = zero;
-
-         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
-            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
-         else
-            sources[length++] = zero;
-
-         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
-            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
-         else
-            sources[length++] = zero;
-         break;
-      }
-      case VARYING_SLOT_EDGE:
-         UNREACHABLE("unexpected scalar vs output");
-         break;
-
-      default:
-         /* gl_Position is always in the vue map, but isn't always written by
-          * the shader. Other varyings (clip distances) get added to the vue
-          * map but don't always get written. In those cases, the
-          * corresponding this->output[] slot will be invalid we and can skip
-          * the urb write for the varying. If we've already queued up a vue
-          * slot for writing we flush a mlen 5 urb write, otherwise we just
-          * advance the urb_offset.
-          */
-         if (varying == BRW_VARYING_SLOT_PAD ||
-             this->outputs[varying].file == BAD_FILE) {
-            if (length > 0)
-               flush = true;
-            else
-               urb_offset++;
-            break;
-         }
-
-         int slot_offset = 0;
-
-         /* When using Primitive Replication, there may be multiple slots
-          * assigned to POS.
-          */
-         if (varying == VARYING_SLOT_POS)
-            slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];
-
-         for (unsigned i = 0; i < 4; i++) {
-            sources[length++] = offset(this->outputs[varying], bld,
-                                       i + (slot_offset * 4));
-         }
-         break;
-      }
-
-      const brw_builder abld = bld.annotate("URB write");
-
-      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
-       * the last slot or if we need to flush (see BAD_FILE varying case
-       * above), emit a URB write send now to flush out the data.
-       */
-      if (length == 8 || (length > 0 && slot == last_slot))
-         flush = true;
-      if (flush) {
-         brw_reg srcs[URB_LOGICAL_NUM_SRCS];
-
-         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
-         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
-         srcs[URB_LOGICAL_SRC_DATA] =
-            retype(brw_allocate_vgrf_units(*this, (dispatch_width / 8) * length), BRW_TYPE_F);
-         abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
-
-         brw_urb_inst *urb = abld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
-         urb->components = length;
-         urb->offset = urb_offset * (devinfo->ver >= 20 ? 16 : 1);
-         urb_offset = starting_urb_offset + slot + 1;
-         length = 0;
-         flush = false;
-         urb_written = true;
-      }
-   }
-
-   /* If we don't have any valid slots to write, just do a minimal urb write
-    * send to terminate the shader. This includes 1 slot of undefined data,
-    * because it's invalid to write 0 data:
-    *
-    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
-    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
-    * Write Data Payload:
-    *
-    *    "The write data payload can be between 1 and 8 message phases long."
-    */
-   if (!urb_written) {
-      /* For GS, just turn EmitVertex() into a no-op. We don't want it to
-       * end the thread, and emit_gs_thread_end() already emits a SEND with
-       * EOT at the end of the program for us.
-       */
-      if (stage == MESA_SHADER_GEOMETRY)
-         return;
-
-      brw_reg uniform_urb_handle =
-         retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
-      brw_reg payload =
-         retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
-
-      bld.exec_all().MOV(uniform_urb_handle, urb_handle);
-
-      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
-      srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
-      srcs[URB_LOGICAL_SRC_DATA] = payload;
-
-      brw_urb_inst *urb = bld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
-      urb->offset = devinfo->ver >= 20 ? 16 : 1;
-      urb->components = 1;
-      return;
-   }
-}
-
 void
 brw_shader::emit_tes_terminate()
 {
diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h
index 200bdf86aab..fa9f62764f2 100644
--- a/src/intel/compiler/brw/brw_shader.h
+++ b/src/intel/compiler/brw/brw_shader.h
@@ -88,7 +88,6 @@ public:
 
    void fail(const char *msg, ...);
    void limit_dispatch_width(unsigned n, const char *msg);
-   void emit_urb_writes(const brw_reg &gs_vertex_count = brw_reg());
   void emit_gs_control_data_bits(const brw_reg &vertex_count);
   brw_reg gs_urb_channel_mask(const brw_reg &dword_index);
   brw_reg gs_urb_per_slot_dword_index(const brw_reg &vertex_count);