diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index f0e7a773849..16bce3222e2 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -848,6 +848,7 @@ enum brw_pixel_shader_computed_depth_mode { struct brw_wm_prog_data { struct brw_stage_prog_data base; + GLuint num_per_primitive_inputs; GLuint num_varying_inputs; uint8_t reg_blocks_8; diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 8caae21d683..d12eeba6e7c 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1871,10 +1871,31 @@ calculate_urb_setup(const struct intel_device_info *devinfo, sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX); int urb_next = 0; + + /* Per-Primitive Attributes are laid out by Hardware before the regular + * attributes, so order them the same way here to make it easy to map the + * setup into real HW registers later. + */ + if (nir->info.per_primitive_inputs) { + for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { + if (nir->info.per_primitive_inputs & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + + /* The setup attributes that follow must be aligned to a full GRF. */ + urb_next = ALIGN(urb_next, 2); + + prog_data->num_per_primitive_inputs = urb_next; + } + + const uint64_t inputs_read = + nir->info.inputs_read & ~nir->info.per_primitive_inputs; + /* Figure out where each of the incoming setup attributes lands. */ if (devinfo->ver >= 6) { - if (util_bitcount64(nir->info.inputs_read & - BRW_FS_VARYING_INPUT_MASK) <= 16) { + if (util_bitcount64(inputs_read & + BRW_FS_VARYING_INPUT_MASK) <= 16) { /* The SF/SBE pipeline stage can do arbitrary rearrangement of the * first 16 varying inputs, so we can put them wherever we want. * Just put them in order. @@ -1885,7 +1906,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo, * a different vertex (or geometry) shader. 
*/ for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK & + if (inputs_read & BRW_FS_VARYING_INPUT_MASK & BITFIELD64_BIT(i)) { prog_data->urb_setup[i] = urb_next++; } @@ -1897,6 +1918,11 @@ calculate_urb_setup(const struct intel_device_info *devinfo, * (geometry or vertex shader). */ + /* TODO(mesh): Implement this case for Mesh. Simply having a large + * number of outputs in Mesh (hence a lot of inputs in Fragment) + * should already trigger this. + */ + /* Re-compute the VUE map here in the case that the one coming from * geometry has more than one position slot (used for Primitive * Replication). @@ -1907,7 +1933,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo, nir->info.separate_shader, 1); int first_slot = - brw_compute_first_urb_slot_required(nir->info.inputs_read, + brw_compute_first_urb_slot_required(inputs_read, &prev_stage_vue_map); assert(prev_stage_vue_map.num_slots <= first_slot + 32); @@ -1915,7 +1941,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo, slot++) { int varying = prev_stage_vue_map.slot_to_varying[slot]; if (varying != BRW_VARYING_SLOT_PAD && - (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK & + (inputs_read & BRW_FS_VARYING_INPUT_MASK & BITFIELD64_BIT(varying))) { prog_data->urb_setup[varying] = slot - first_slot; } @@ -1948,12 +1974,12 @@ calculate_urb_setup(const struct intel_device_info *devinfo, * * See compile_sf_prog() for more info. 
*/ - if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC)) + if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC)) prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++; } - prog_data->num_varying_inputs = urb_next; - prog_data->inputs = nir->info.inputs_read; + prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs; + prog_data->inputs = inputs_read; brw_compute_urb_setup_index(prog_data); } @@ -1995,6 +2021,12 @@ fs_visitor::assign_urb_setup() /* Each attribute is 4 setup channels, each of which is half a reg. */ this->first_non_payload_grf += prog_data->num_varying_inputs * 2; + + /* Unlike regular attributes, per-primitive attributes have all 4 channels + * in the same slot, so each GRF can store two slots. + */ + assert(prog_data->num_per_primitive_inputs % 2 == 0); + this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2; } void diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 8bc2dd29a4f..fdeb821bf69 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -332,6 +332,7 @@ public: fs_reg get_timestamp(const brw::fs_builder &bld); fs_reg interp_reg(int location, int channel); + fs_reg per_primitive_reg(int location); virtual void dump_instructions() const; virtual void dump_instructions(const char *name) const; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 2b2bccdf053..a6b13456ce5 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3620,21 +3620,33 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, } case nir_intrinsic_load_input: { - /* load_input is only used for flat inputs */ + /* In Fragment Shaders load_input is used either for flat inputs or + * per-primitive inputs. 
+ */ assert(nir_dest_bit_size(instr->dest) == 32); unsigned base = nir_intrinsic_base(instr); unsigned comp = nir_intrinsic_component(instr); unsigned num_components = instr->num_components; + /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */ + /* Special case fields in the VUE header */ if (base == VARYING_SLOT_LAYER) comp = 1; else if (base == VARYING_SLOT_VIEWPORT) comp = 2; - for (unsigned int i = 0; i < num_components; i++) { - bld.MOV(offset(dest, bld, i), - retype(component(interp_reg(base, comp + i), 3), dest.type)); + if (BITFIELD64_BIT(base) & nir->info.per_primitive_inputs) { + assert(base != VARYING_SLOT_PRIMITIVE_INDICES); + for (unsigned int i = 0; i < num_components; i++) { + bld.MOV(offset(dest, bld, i), + retype(component(per_primitive_reg(base), comp + i), dest.type)); + } + } else { + for (unsigned int i = 0; i < num_components; i++) { + bld.MOV(offset(dest, bld, i), + retype(component(interp_reg(base, comp + i), 3), dest.type)); + } } break; } diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp index 060cb83dfec..4de37671b90 100644 --- a/src/intel/compiler/brw_fs_visitor.cpp +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -136,6 +136,11 @@ fs_visitor::emit_dummy_fs() calculate_cfg(); } +/* Input data is organized with the per-primitive values first, followed + * by the per-vertex values. The per-vertex values have interpolation + * information associated, so use 4 components for each of them. + */ + /* The register location here is relative to the start of the URB * data. It will get adjusted to be a real location before * generate_code() time. 
@@ -144,9 +149,39 @@ fs_reg fs_visitor::interp_reg(int location, int channel) { assert(stage == MESA_SHADER_FRAGMENT); - struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); - int regnr = prog_data->urb_setup[location] * 4 + channel; - assert(prog_data->urb_setup[location] != -1); + assert(BITFIELD64_BIT(location) & ~nir->info.per_primitive_inputs); + + const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + assert(prog_data->urb_setup[location] >= 0); + unsigned nr = prog_data->urb_setup[location]; + + /* Adjust so we start counting from the first per_vertex input. */ + assert(nr >= prog_data->num_per_primitive_inputs); + nr -= prog_data->num_per_primitive_inputs; + + const unsigned per_vertex_start = prog_data->num_per_primitive_inputs; + const unsigned regnr = per_vertex_start + (nr * 4) + channel; + + return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F); +} + +/* The register location here is relative to the start of the URB + * data. It will get adjusted to be a real location before + * generate_code() time. + */ +fs_reg +fs_visitor::per_primitive_reg(int location) +{ + assert(stage == MESA_SHADER_FRAGMENT); + assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs); + + const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + assert(prog_data->urb_setup[location] >= 0); + + const unsigned regnr = prog_data->urb_setup[location]; + assert(regnr < prog_data->num_per_primitive_inputs); return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F); }