diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index d6b366841c8..d2ee6559937 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -2690,160 +2690,6 @@ brw_combine_with_vec(const brw_builder &bld, const brw_reg &dst,
    bld.VEC(dst, comps, n);
 }
 
-static void
-emit_gs_input_load(nir_to_brw_state &ntb, const brw_reg &dst,
-                   const nir_src &vertex_src,
-                   unsigned base_offset,
-                   const nir_src &offset_src,
-                   unsigned num_components,
-                   unsigned first_component)
-{
-   const brw_builder &bld = ntb.bld;
-   const struct intel_device_info *devinfo = ntb.devinfo;
-
-   brw_shader &s = ntb.s;
-
-   assert(brw_type_size_bytes(dst.type) == 4);
-   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
-   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
-
-   /* TODO: figure out push input layout for invocations == 1 */
-   if (gs_prog_data->invocations == 1 &&
-       nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
-       4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
-      int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
-                       nir_src_as_uint(vertex_src) * push_reg_count;
-
-      const brw_reg attr = offset(brw_attr_reg(0, dst.type), bld,
-                                  first_component + imm_offset);
-      brw_combine_with_vec(bld, dst, attr, num_components);
-      return;
-   }
-
-   /* Resort to the pull model. Ensure the VUE handles are provided. */
-   assert(gs_prog_data->base.include_vue_handles);
-
-   brw_reg start = s.gs_payload().icp_handle_start;
-   brw_reg icp_handle = ntb.bld.vgrf(BRW_TYPE_UD);
-   const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
-
-   if (gs_prog_data->invocations == 1) {
-      if (nir_src_is_const(vertex_src)) {
-         /* The vertex index is constant; just select the proper URB handle. */
-         icp_handle =
-            byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
-      } else {
-         /* The vertex index is non-constant. We need to use indirect
-          * addressing to fetch the proper URB handle.
-          *
-          * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
-          * indicating that channel <n> should read the handle from
-          * DWord <n>. We convert that to bytes by multiplying by 4.
-          *
-          * Next, we convert the vertex index to bytes by multiplying
-          * by 32/64 (shifting by 5/6), and add the two together. This is
-          * the final indirect byte offset.
-          */
-         brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
-
-         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
-         brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
-         /* Convert vertex_index to bytes (multiply by 32/64) */
-         assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
-         brw_reg vertex_offset_bytes =
-            bld.SHL(retype(get_nir_src(ntb, vertex_src, 0), BRW_TYPE_UD),
-                    brw_imm_ud(ffs(grf_size_bytes) - 1));
-         brw_reg icp_offset_bytes =
-            bld.ADD(vertex_offset_bytes, channel_offsets);
-
-         /* Use first_icp_handle as the base offset. There is one register
-          * of URB handles per vertex, so inform the register allocator that
-          * we might read up to nir->info.gs.vertices_in registers.
-          */
-         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
-                  brw_reg(icp_offset_bytes),
-                  brw_imm_ud(s.nir->info.gs.vertices_in * grf_size_bytes));
-      }
-   } else {
-      assert(gs_prog_data->invocations > 1);
-
-      if (nir_src_is_const(vertex_src)) {
-         unsigned vertex = nir_src_as_uint(vertex_src);
-         bld.MOV(icp_handle, component(start, vertex));
-      } else {
-         /* The vertex index is non-constant. We need to use indirect
-          * addressing to fetch the proper URB handle.
-          *
-          * Convert vertex_index to bytes (multiply by 4)
-          */
-         brw_reg icp_offset_bytes =
-            bld.SHL(retype(get_nir_src(ntb, vertex_src, 0), BRW_TYPE_UD),
-                    brw_imm_ud(2u));
-
-         /* Use first_icp_handle as the base offset. There is one DWord
-          * of URB handles per vertex, so inform the register allocator that
-          * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
-          */
-         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
-                  brw_reg(icp_offset_bytes),
-                  brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
-                             grf_size_bytes));
-      }
-   }
-
-   brw_urb_inst *urb;
-   brw_reg indirect_offset = get_nir_src(ntb, offset_src, 0);
-
-   if (nir_src_is_const(offset_src)) {
-      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
-      srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
-
-      /* Constant indexing - use global offset. */
-      if (first_component != 0) {
-         unsigned read_components = num_components + first_component;
-         brw_reg tmp = bld.vgrf(dst.type, read_components);
-         urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
-         urb->size_written = read_components *
-                             tmp.component_size(urb->exec_size);
-         brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
-                              num_components);
-      } else {
-         urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
-         urb->size_written = num_components *
-                             dst.component_size(urb->exec_size);
-      }
-      urb->offset = base_offset + nir_src_as_uint(offset_src);
-   } else {
-      /* Indirect indexing - use per-slot offsets as well. */
-      unsigned read_components = num_components + first_component;
-      brw_reg tmp = bld.vgrf(dst.type, read_components);
-
-      /* Convert oword offset to bytes on Xe2+ */
-      if (devinfo->ver >= 20)
-         indirect_offset = bld.SHL(indirect_offset, brw_imm_ud(4u));
-
-      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
-      srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
-      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
-
-      if (first_component != 0) {
-         urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
-         urb->size_written = read_components *
-                             tmp.component_size(urb->exec_size);
-         brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
-                              num_components);
-      } else {
-         urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
-         urb->size_written = num_components *
-                             dst.component_size(urb->exec_size);
-      }
-      urb->offset = base_offset;
-   }
-
-   if (devinfo->ver >= 20)
-      urb->offset *= 16;
-}
-
 static void
 brw_from_nir_emit_vs_intrinsic(nir_to_brw_state &ntb,
                                nir_intrinsic_instr *instr)
@@ -3198,6 +3044,8 @@ brw_from_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
 
    assert(s.stage == MESA_SHADER_GEOMETRY);
 
+   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
+
    brw_reg dest;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
       dest = get_nir_def(ntb, instr->def);
@@ -3212,11 +3060,91 @@ brw_from_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
    case nir_intrinsic_load_input:
       UNREACHABLE("load_input intrinsics are invalid for the GS stage");
 
-   case nir_intrinsic_load_per_vertex_input:
-      emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
-                         instr->src[1], instr->num_components,
-                         nir_intrinsic_component(instr));
+   case nir_intrinsic_load_per_vertex_input: {
+      /* Load a push input (assuming single invocation layout) */
+      assert(s.nir->info.gs.invocations == 1);
+      assert(nir_src_as_uint(instr->src[1]) == 0);
+      const unsigned vertex = nir_src_as_uint(instr->src[0]);
+      const unsigned stride = gs_prog_data->base.urb_read_length * 8;
+      const unsigned imm_offset = vertex * stride +
+                                  4 * nir_intrinsic_base(instr) +
+                                  nir_intrinsic_component(instr);
+
+      const brw_reg attr = offset(brw_attr_reg(0, dest.type), bld, imm_offset);
+      brw_combine_with_vec(bld, dest, attr, instr->num_components);
       break;
+   }
+
+   case nir_intrinsic_load_urb_input_handle_indexed_intel: {
+      const unsigned grf_size_bytes = REG_SIZE * reg_unit(ntb.devinfo);
+      brw_reg start = s.gs_payload().icp_handle_start;
+      dest.type = start.type;
+
+      if (gs_prog_data->invocations == 1) {
+         if (nir_src_is_const(instr->src[0])) {
+            /* Vertex index is constant; just select the proper URB handle. */
+            bld.MOV(dest, byte_offset(start, grf_size_bytes *
+                                      nir_src_as_uint(instr->src[0])));
+         } else {
+            /* The vertex index is non-constant. We need to use indirect
+             * addressing to fetch the proper URB handle.
+             *
+             * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
+             * indicating that channel <n> should read the handle from
+             * DWord <n>. We convert that to bytes by multiplying by 4.
+             *
+             * Next, we convert the vertex index to bytes by multiplying
+             * by 32/64 (shifting by 5/6), and add the two together. This is
+             * the final indirect byte offset.
+             */
+            brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
+
+            /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
+            brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
+            /* Convert vertex_index to bytes (multiply by 32/64) */
+            assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* ffs() */
+            brw_reg vertex_offset_bytes =
+               bld.SHL(retype(get_nir_src(ntb, instr->src[0], 0), BRW_TYPE_UD),
+                       brw_imm_ud(ffs(grf_size_bytes) - 1));
+            brw_reg icp_offset_bytes =
+               bld.ADD(vertex_offset_bytes, channel_offsets);
+
+            /* Use first_icp_handle as the base offset. There is one register
+             * of URB handles per vertex, so inform the register allocator that
+             * we might read up to nir->info.gs.vertices_in registers.
+             */
+            bld.emit(SHADER_OPCODE_MOV_INDIRECT, dest, start,
+                     brw_reg(icp_offset_bytes),
+                     brw_imm_ud(s.nir->info.gs.vertices_in * grf_size_bytes));
+         }
+      } else {
+         assert(gs_prog_data->invocations > 1);
+
+         if (nir_src_is_const(instr->src[0])) {
+            unsigned vertex = nir_src_as_uint(instr->src[0]);
+            bld.MOV(dest, component(start, vertex));
+         } else {
+            /* The vertex index is non-constant. We need to use indirect
+             * addressing to fetch the proper URB handle.
+             *
+             * Convert vertex_index to bytes (multiply by 4)
+             */
+            brw_reg icp_offset_bytes =
+               bld.SHL(retype(get_nir_src(ntb, instr->src[0], 0), BRW_TYPE_UD),
+                       brw_imm_ud(2u));
+
+            /* Use first_icp_handle as the base offset. There is one DWord
+             * of URB handles per vertex, so inform the register allocator that
+             * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
+             */
+            bld.emit(SHADER_OPCODE_MOV_INDIRECT, dest, start,
+                     brw_reg(icp_offset_bytes),
+                     brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
+                                grf_size_bytes));
+         }
+      }
+      break;
+   }
 
    case nir_intrinsic_emit_vertex_with_counter:
       emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index 618f1010d57..db0239a4780 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -345,11 +345,13 @@ try_load_push_input(nir_builder *b,
                     nir_intrinsic_instr *io,
                     nir_def *offset)
 {
+   const enum mesa_shader_stage stage = b->shader->info.stage;
+
    if (!nir_def_is_const(offset))
       return NULL;
 
    const unsigned offset_unit = cb_data->vec4_access ? 16 : 4;
-   const uint32_t byte_offset =
+   uint32_t byte_offset =
       16 * io_base_slot(io, cb_data) +
       4 * io_component(io, cb_data) +
      offset_unit * nir_src_as_uint(nir_src_for_ssa(offset));
    assert((byte_offset % 4) == 0);
@@ -357,6 +359,16 @@
    if (byte_offset >= cb_data->max_push_bytes)
       return NULL;
 
+   if (stage == MESA_SHADER_GEOMETRY) {
+      /* GS push inputs still use load_per_vertex_input */
+      const nir_io_semantics io_sem = nir_intrinsic_io_semantics(io);
+      const int slot = cb_data->varying_to_slot[io_sem.location];
+      assert(slot != -1);
+      nir_intrinsic_set_base(io, slot);
+      nir_intrinsic_set_component(io, io_component(io, cb_data));
+      return &io->def;
+   }
+
    return load_push_input(b, io, byte_offset);
 }
 
@@ -377,7 +389,8 @@ lower_urb_inputs(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
          load = load_urb(b, cb_data, intrin, input_handle(b, intrin), offset,
                          ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE);
       }
-      nir_def_replace(&intrin->def, load);
+      if (load != &intrin->def)
+         nir_def_replace(&intrin->def, load);
       return true;
    }
    return false;
 }
@@ -909,9 +922,6 @@ brw_nir_lower_gs_inputs(nir_shader *nir,
                         const struct intel_vue_map *vue_map,
                         unsigned *out_urb_read_length)
 {
-   nir_foreach_shader_in_variable(var, nir)
-      var->data.driver_location = var->data.location;
-
    /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4,
             nir_lower_io_lower_64bit_to_32);
@@ -919,40 +929,6 @@ brw_nir_lower_gs_inputs(nir_shader *nir,
 
    /* Fold constant offset srcs for IO. */
    NIR_PASS(_, nir, nir_opt_constant_folding);
 
-   nir_foreach_function_impl(impl, nir) {
-      nir_foreach_block(block, impl) {
-         nir_foreach_instr(instr, block) {
-            if (instr->type != nir_instr_type_intrinsic)
-               continue;
-
-            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-            if (intrin->intrinsic == nir_intrinsic_load_input ||
-                intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
-               /* Offset 0 is the VUE header, which contains
-                * VARYING_SLOT_LAYER [.y], VARYING_SLOT_VIEWPORT [.z], and
-                * VARYING_SLOT_PSIZ [.w].
-                */
-               nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
-               gl_varying_slot varying = io_sem.location;
-               int vue_slot;
-               switch (varying) {
-               case VARYING_SLOT_PSIZ:
-                  nir_intrinsic_set_base(intrin, 0);
-                  nir_intrinsic_set_component(intrin, 3);
-                  break;
-
-               default:
-                  vue_slot = vue_map->varying_to_slot[varying];
-                  assert(vue_slot != -1);
-                  nir_intrinsic_set_base(intrin, vue_slot);
-                  break;
-               }
-            }
-         }
-      }
-   }
-
    unsigned urb_read_length = 0;
 
    if (nir->info.gs.invocations == 1) {
@@ -974,6 +950,15 @@ brw_nir_lower_gs_inputs(nir_shader *nir,
    }
 
    *out_urb_read_length = urb_read_length;
+
+   const struct brw_lower_urb_cb_data cb_data = {
+      .devinfo = devinfo,
+      .vec4_access = true,
+      /* pushed bytes per vertex */
+      .max_push_bytes = urb_read_length * 8 * sizeof(uint32_t),
+      .varying_to_slot = vue_map->varying_to_slot,
+   };
+   NIR_PASS(_, nir, brw_nir_lower_inputs_to_urb_intrinsics, &cb_data);
 }
 
 void
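
Not part of the patch, added here only as a review aid: a tiny standalone C sketch of the offset arithmetic the changed paths rely on. The first half mirrors the push-input DWord offset computed in the new load_per_vertex_input case (vertex * urb_read_length * 8 + 4 * base + component); the second half mirrors the indirect ICP-handle byte offset described in the comments (4 * channel + vertex * GRF size). All concrete values below (urb_read_length = 2, vertex = 1, a 32-byte GRF, and so on) are made-up examples, not values taken from the patch.

/* Illustrative only -- not part of the patch.  All input values are
 * made-up examples.
 */
#include <stdio.h>

int main(void)
{
   /* Push path: one vertex's pushed inputs occupy urb_read_length GRFs,
    * i.e. urb_read_length * 8 DWords, so a (vertex, vec4 slot, component)
    * triple maps to a DWord offset into the pushed attribute space.
    */
   const unsigned urb_read_length = 2;           /* example: 2 GRFs per vertex */
   const unsigned stride = urb_read_length * 8;  /* DWords per vertex */
   const unsigned vertex = 1, base = 1, component = 2;
   const unsigned imm_offset = vertex * stride + 4 * base + component;
   printf("push attribute DWord offset: %u\n", imm_offset);   /* 22 */

   /* Pull path (invocations == 1): there is one GRF of ICP handles per
    * vertex, so channel <n> reads its handle from byte 4 * n, and a
    * non-constant vertex index adds vertex * grf_size_bytes.
    */
   const unsigned grf_size_bytes = 32;           /* example: 32-byte GRF */
   const unsigned channel = 3;
   const unsigned icp_offset_bytes = (channel << 2) + vertex * grf_size_bytes;
   printf("indirect ICP handle byte offset: %u\n", icp_offset_bytes);   /* 44 */

   return 0;
}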