From 2c90f08bf7841e508a4c87e3f44d872e0abb64ba Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 23 Nov 2015 18:32:38 -0800 Subject: [PATCH 01/16] i965/fs: Add support for doing MOV_INDIRECT on uniforms --- src/mesa/drivers/dri/i965/brw_fs.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index c833ef0be3b..61b2a4ba80e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -850,7 +850,10 @@ fs_inst::regs_read(int arg) const assert(src[2].file == IMM); unsigned region_length = src[2].ud; - if (src[0].file == FIXED_GRF) { + if (src[0].file == UNIFORM) { + assert(region_length % 4 == 0); + return region_length / 4; + } else if (src[0].file == FIXED_GRF) { /* If the start of the region is not register aligned, then * there's some portion of the register that's technically * unread at the beginning. From 653d8044abb69ff2eaa8637412e240d1a33f4f12 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 24 Nov 2015 09:03:29 -0800 Subject: [PATCH 02/16] i965/fs: Don't force MASK_DISABLE on INDIRECT_MOV instructions It should work fine without it and the visitor can set it if it wants. --- src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index c25da07c4ba..d86eee1de4d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -366,7 +366,6 @@ fs_generator::generate_mov_indirect(fs_inst *inst, assert(inst->exec_size == 8 || devinfo->gen >= 8); brw_MOV(p, addr, indirect_byte_offset); - brw_inst_set_mask_control(devinfo, brw_last_inst, BRW_MASK_DISABLE); brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type)); } From 4be9a1c7bbad8c95e1f5aab91e8cdf3c0cfb0ec1 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 24 Nov 2015 11:24:57 -0800 Subject: [PATCH 03/16] i965/fs: Fix regs_read() for MOV_INDIRECT with a non-zero subnr The subnr field is in bytes so we don't need to multiply by type_sz. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 61b2a4ba80e..b903fbed196 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -867,7 +867,7 @@ fs_inst::regs_read(int arg) const * unread portion at the beginning. */ if (src[0].subnr) - region_length += src[0].subnr * type_sz(src[0].type); + region_length += src[0].subnr; return DIV_ROUND_UP(region_length, REG_SIZE); } else { From 2f1455dbb0e5a2f1b395e767b0c7cd3a58dc76e4 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 24 Nov 2015 09:01:11 -0800 Subject: [PATCH 04/16] i965/fs: Add support for MOV_INDIRECT on pre-Broadwell hardware While we're at it, we also add support for the possibility that the indirect is, in fact, a constant. This shouldn't happen in the common case (if it does, that means NIR failed to constant-fold something), but it's possible so we should handle it. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 4 ++ .../drivers/dri/i965/brw_fs_generator.cpp | 51 ++++++++++++++----- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index b903fbed196..93bacadc52b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -4423,6 +4423,10 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: return 8; + case SHADER_OPCODE_MOV_INDIRECT: + /* Prior to Broadwell, we only have 8 address subregisters */ + return devinfo->gen < 8 ? 8 : inst->exec_size; + default: return inst->exec_size; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index d86eee1de4d..7fa6d848473 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -351,22 +351,47 @@ fs_generator::generate_mov_indirect(fs_inst *inst, unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr; - /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ - struct brw_reg addr = vec8(brw_address_reg(0)); + if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) { + imm_byte_offset += indirect_byte_offset.ud; - /* The destination stride of an instruction (in bytes) must be greater - * than or equal to the size of the rest of the instruction. Since the - * address register is of type UW, we can't use a D-type instruction. - * In order to get around this, re re-type to UW and use a stride. - */ - indirect_byte_offset = - retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW); + reg.nr = imm_byte_offset / REG_SIZE; + reg.subnr = imm_byte_offset % REG_SIZE; + brw_MOV(p, dst, reg); + } else { + /* Prior to Broadwell, there are only 8 address registers. */ + assert(inst->exec_size == 8 || devinfo->gen >= 8); - /* Prior to Broadwell, there are only 8 address registers. */ - assert(inst->exec_size == 8 || devinfo->gen >= 8); + /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ + struct brw_reg addr = vec8(brw_address_reg(0)); - brw_MOV(p, addr, indirect_byte_offset); - brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type)); + /* The destination stride of an instruction (in bytes) must be greater + * than or equal to the size of the rest of the instruction. Since the + * address register is of type UW, we can't use a D-type instruction. + * In order to get around this, re re-type to UW and use a stride. + */ + indirect_byte_offset = + retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW); + + if (devinfo->gen < 8) { + /* Prior to broadwell, we have a restriction that the bottom 5 bits + * of the base offset and the bottom 5 bits of the indirect must add + * to less than 32. In other words, the hardware needs to be able to + * add the bottom five bits of the two to get the subnumber and add + * the next 7 bits of each to get the actual register number. Since + * the indirect may cause us to cross a register boundary, this makes + * it almost useless. We could try and do something clever where we + * use a actual base offset if base_offset % 32 == 0 but that would + * mean we were generating different code depending on the base + * offset. Instead, for the sake of consistency, we'll just do the + * add ourselves. + */ + brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), dst.type)); + } else { + brw_MOV(p, addr, indirect_byte_offset); + brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type)); + } + } } void From 4115648a6be2e846660a35a0e260ae53b809b7e0 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 23 Nov 2015 21:39:15 -0800 Subject: [PATCH 05/16] i965/vec4: Add support for SHADER_OPCODE_MOV_INDIRECT --- .../drivers/dri/i965/brw_vec4_generator.cpp | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index c3426ddd1c8..71a7f63be48 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1051,6 +1051,48 @@ generate_set_simd4x2_header_gen9(struct brw_codegen *p, brw_pop_insn_state(p); } +static void +generate_mov_indirect(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, struct brw_reg reg, + struct brw_reg indirect, struct brw_reg length) +{ + assert(indirect.type == BRW_REGISTER_TYPE_UD); + + unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2); + + /* This instruction acts in align1 mode */ + assert(inst->force_writemask_all || reg.writemask == 0xf); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + struct brw_reg addr = vec2(brw_address_reg(0)); + + /* We need to move the indirect value into the address register. In order + * to make things make some sense, we want to respect at least the X + * component of the swizzle. In order to do that, we need to convert the + * subnr (probably 0) to an align1 subnr and add in the swizzle. We then + * use a region of <8,4,0>:uw to pick off the first 2 bytes of the indirect + * and splat it out to all four channels of the given half of a0. + */ + assert(brw_is_single_value_swizzle(indirect.swizzle)); + indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0)) * 2; + indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0); + + brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset)); + + /* Use a <4,1> region Vx1 region*/ + struct brw_reg src = brw_VxH_indirect(0, 0); + src.width = BRW_WIDTH_4; + src.hstride = BRW_HORIZONTAL_STRIDE_1; + + brw_MOV(p, dst, retype(src, reg.type)); + + brw_pop_insn_state(p); +} + static void generate_code(struct brw_codegen *p, const struct brw_compiler *compiler, @@ -1538,6 +1580,9 @@ generate_code(struct brw_codegen *p, break; } + case SHADER_OPCODE_MOV_INDIRECT: + generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]); + default: unreachable("Unsupported opcode"); } From 7ba70b1b5121577983c3493e181ee3568563f22e Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 24 Nov 2015 13:52:49 -0800 Subject: [PATCH 06/16] nir: Add another index to load_uniform to specify the range read --- src/glsl/nir/nir_intrinsics.h | 7 +++++-- src/glsl/nir/nir_lower_io.c | 5 +++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h index 9811fb391de..5b10423078d 100644 --- a/src/glsl/nir/nir_intrinsics.h +++ b/src/glsl/nir/nir_intrinsics.h @@ -263,6 +263,9 @@ SYSTEM_VALUE(helper_invocation, 1, 0) * of the start of the variable being loaded and and the offset source is a * offset into that variable. * + * Uniform load operations have a second index that specifies the size of the + * variable being loaded. If const_index[1] == 0, then the size is unknown. + * * Some load operations such as UBO/SSBO load and per_vertex loads take an * additional source to specify which UBO/SSBO/vertex to load from. * @@ -275,8 +278,8 @@ SYSTEM_VALUE(helper_invocation, 1, 0) #define LOAD(name, srcs, indices, flags) \ INTRINSIC(load_##name, srcs, ARR(1, 1, 1, 1), true, 0, 0, indices, flags) -/* src[] = { offset }. const_index[] = { base } */ -LOAD(uniform, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER) +/* src[] = { offset }. const_index[] = { base, size } */ +LOAD(uniform, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER) /* src[] = { buffer_index, offset }. No const_index */ LOAD(ubo, 2, 0, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER) /* src[] = { offset }. const_index[] = { base } */ diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c index 3d646eb14b4..c1f034ba598 100644 --- a/src/glsl/nir/nir_lower_io.c +++ b/src/glsl/nir/nir_lower_io.c @@ -216,6 +216,11 @@ nir_lower_io_block(nir_block *block, void *void_state) load->const_index[0] = intrin->variables[0]->var->data.driver_location; + if (load->intrinsic == nir_intrinsic_load_uniform) { + load->const_index[1] = + state->type_size(intrin->variables[0]->var->type); + } + if (per_vertex) load->src[0] = nir_src_for_ssa(vertex_index); From a3cd95a88417d0ee5622d73f6a2f6aa02bb7e54a Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 24 Nov 2015 15:12:20 -0800 Subject: [PATCH 07/16] i965/fs: Use MOV_INDIRECT for all indirect uniform loads Instead of using reladdr, this commit changes the FS backend to emit a MOV_INDIRECT whenever we need an indirect uniform load. We also have to rework some of the other bits of the backend to handle this new form of uniform load. The obvious change is that demote_pull_constants now acts more like a lowering pass when it hits a MOV_INDIRECT. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 72 ++++++++++++++---------- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 56 ++++++++++++++---- 2 files changed, 88 insertions(+), 40 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 93bacadc52b..7fc3e464b09 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1948,8 +1948,8 @@ fs_visitor::assign_constant_locations() if (inst->src[i].file != UNIFORM) continue; - if (inst->src[i].reladdr) { - int uniform = inst->src[i].nr; + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { + int uniform = inst->src[0].nr; /* If this array isn't already present in the pull constant buffer, * add it. @@ -2031,49 +2031,63 @@ fs_visitor::assign_constant_locations() void fs_visitor::demote_pull_constants() { - foreach_block_and_inst (block, fs_inst, inst, cfg) { + const unsigned index = stage_prog_data->binding_table.pull_constants_start; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + /* Set up the annotation tracking for new generated instructions. */ + const fs_builder ibld(this, block, inst); + for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file != UNIFORM) continue; - int pull_index; + /* We'll handle this case later */ + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) + continue; + unsigned location = inst->src[i].nr + inst->src[i].reg_offset; - if (location >= uniforms) /* Out of bounds access */ - pull_index = -1; - else - pull_index = pull_constant_loc[location]; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; if (pull_index == -1) continue; - /* Set up the annotation tracking for new generated instructions. */ - const fs_builder ibld(this, block, inst); - const unsigned index = stage_prog_data->binding_table.pull_constants_start; - fs_reg dst = vgrf(glsl_type::float_type); - assert(inst->src[i].stride == 0); - /* Generate a pull load into dst. */ - if (inst->src[i].reladdr) { - VARYING_PULL_CONSTANT_LOAD(ibld, dst, - brw_imm_ud(index), - *inst->src[i].reladdr, - pull_index * 4); - inst->src[i].reladdr = NULL; - inst->src[i].stride = 1; - } else { - const fs_builder ubld = ibld.exec_all().group(8, 0); - struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15); - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, brw_imm_ud(index), offset); - inst->src[i].set_smear(pull_index & 3); - } - brw_mark_surface_used(prog_data, index); + fs_reg dst = vgrf(glsl_type::float_type); + const fs_builder ubld = ibld.exec_all().group(8, 0); + struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15); + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + dst, brw_imm_ud(index), offset); /* Rewrite the instruction to use the temporary VGRF. */ inst->src[i].file = VGRF; inst->src[i].nr = dst.nr; inst->src[i].reg_offset = 0; + inst->src[i].set_smear(pull_index & 3); + + brw_mark_surface_used(prog_data, index); + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + + unsigned location = inst->src[0].nr + inst->src[0].reg_offset; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; + assert(pull_index >= 0); /* This had better be pull */ + + VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, + brw_imm_ud(index), + inst->src[1], + pull_index * 4); + inst->remove(block); + + brw_mark_surface_used(prog_data, index); } } invalidate_live_intervals(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index f08f910ba27..2681dab3f46 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1136,6 +1136,8 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref) { fs_reg image(UNIFORM, deref->var->data.driver_location / 4, BRW_REGISTER_TYPE_UD); + fs_reg indirect; + unsigned indirect_max = 0; for (const nir_deref *tail = &deref->deref; tail->child; tail = tail->child) { @@ -1147,7 +1149,7 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref) image = offset(image, bld, base * element_size); if (deref_array->deref_array_type == nir_deref_array_type_indirect) { - fs_reg tmp = vgrf(glsl_type::int_type); + fs_reg tmp = vgrf(glsl_type::uint_type); if (devinfo->gen == 7 && !devinfo->is_haswell) { /* IVB hangs when trying to access an invalid surface index with @@ -1165,15 +1167,31 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref) bld.MOV(tmp, get_nir_src(deref_array->indirect)); } + indirect_max += element_size * (tail->type->length - 1); + bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4)); - if (image.reladdr) - bld.ADD(*image.reladdr, *image.reladdr, tmp); - else - image.reladdr = new(mem_ctx) fs_reg(tmp); + if (indirect.file == BAD_FILE) { + indirect = tmp; + } else { + bld.ADD(indirect, indirect, tmp); + } } } - return image; + if (indirect.file == BAD_FILE) { + return image; + } else { + /* Emit a pile of MOVs to load the uniform into a temporary. The + * dead-code elimination pass will get rid of what we don't use. + */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE); + for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + offset(tmp, bld, j), offset(image, bld, j), + indirect, brw_imm_ud((indirect_max + 1) * 4)); + } + return tmp; + } } void @@ -2333,12 +2351,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr /* Offsets are in bytes but they should always be multiples of 4 */ assert(const_offset->u[0] % 4 == 0); src.reg_offset = const_offset->u[0] / 4; - } else { - src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0])); - } - for (unsigned j = 0; j < instr->num_components; j++) { - bld.MOV(offset(dest, bld, j), offset(src, bld, j)); + for (unsigned j = 0; j < instr->num_components; j++) { + bld.MOV(offset(dest, bld, j), offset(src, bld, j)); + } + } else { + fs_reg indirect = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); + + /* We need to pass a size to the MOV_INDIRECT but we don't want it to + * go past the end of the uniform. In order to keep the n'th + * component from running past, we subtract off the size of all but + * one component of the vector. + */ + assert(instr->const_index[1] >= instr->num_components * 4); + unsigned read_size = instr->const_index[1] - + (instr->num_components - 1) * 4; + + for (unsigned j = 0; j < instr->num_components; j++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + offset(dest, bld, j), offset(src, bld, j), + indirect, brw_imm_ud(read_size)); + } } break; } From 9f46af9e418e70e71092cccbb3a21a22e36d8e24 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 24 Nov 2015 16:54:41 -0800 Subject: [PATCH 08/16] i965/fs: Get rid of reladdr We aren't using it anymore. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 7 +------ src/mesa/drivers/dri/i965/brw_ir_fs.h | 5 +---- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 7fc3e464b09..b78f99fcfdb 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -433,7 +433,6 @@ fs_reg::fs_reg(struct ::brw_reg reg) : { this->reg_offset = 0; this->subreg_offset = 0; - this->reladdr = NULL; this->stride = 1; if (this->file == IMM && (this->type != BRW_REGISTER_TYPE_V && @@ -448,7 +447,6 @@ fs_reg::equals(const fs_reg &r) const { return (this->backend_reg::equals(r) && subreg_offset == r.subreg_offset && - !reladdr && !r.reladdr && stride == r.stride); } @@ -4723,9 +4721,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) break; case UNIFORM: fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset); - if (inst->src[i].reladdr) { - fprintf(file, "+reladdr"); - } else if (inst->src[i].subreg_offset) { + if (inst->src[i].subreg_offset) { fprintf(file, "+%d.%d", inst->src[i].reg_offset, inst->src[i].subreg_offset); } @@ -4836,7 +4832,6 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start, { if (end == start || end->is_partial_write() || - reg.reladdr || !reg.equals(end->dst)) { return NULL; } else { diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h index c3eec2efb42..e4f20f4ffc9 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h @@ -58,8 +58,6 @@ public: */ int subreg_offset; - fs_reg *reladdr; - /** Register region horizontal stride */ uint8_t stride; }; @@ -136,8 +134,7 @@ component(fs_reg reg, unsigned idx) static inline bool is_uniform(const fs_reg ®) { - return (reg.stride == 0 || reg.is_null()) && - (!reg.reladdr || is_uniform(*reg.reladdr)); + return (reg.stride == 0 || reg.is_null()); } /** From 9024353db3c3f4b096855307964b3c1f92b21259 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 24 Nov 2015 15:16:14 -0800 Subject: [PATCH 09/16] i965/fs: Stop relying on param_size in assign_constant_locations Now that we have MOV_INDIRECT opcodes, we have all of the size information we need directly in the opcode. With a little restructuring of the algorithm used in assign_constant_locations we don't need param_size anymore. The big thing to watch out for now, however, is that you can have two ranges overlap where neither contains the other. In order to deal with this, we make the first pass just flag what needs pulling and handle assigning pull constant locations until later. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 46 +++++++++++----------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index b78f99fcfdb..6520424a0b0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1923,14 +1923,12 @@ fs_visitor::assign_constant_locations() if (dispatch_width != 8) return; - unsigned int num_pull_constants = 0; - - pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms); - bool is_live[uniforms]; memset(is_live, 0, sizeof(is_live)); + bool needs_pull[uniforms]; + memset(needs_pull, 0, sizeof(needs_pull)); + /* First, we walk through the instructions and do two things: * * 1) Figure out which uniforms are live. @@ -1946,20 +1944,15 @@ fs_visitor::assign_constant_locations() if (inst->src[i].file != UNIFORM) continue; - if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { - int uniform = inst->src[0].nr; + int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; - /* If this array isn't already present in the pull constant buffer, - * add it. - */ - if (pull_constant_loc[uniform] == -1) { - assert(param_size[uniform]); - for (int j = 0; j < param_size[uniform]; j++) - pull_constant_loc[uniform + j] = num_pull_constants++; + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { + for (unsigned j = 0; j < inst->src[2].ud / 4; j++) { + is_live[constant_nr + j] = true; + needs_pull[constant_nr + j] = true; } } else { /* Mark the the one accessed uniform as live */ - int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; if (constant_nr >= 0 && constant_nr < (int) uniforms) is_live[constant_nr] = true; } @@ -1976,26 +1969,23 @@ fs_visitor::assign_constant_locations() */ unsigned int max_push_components = 16 * 8; unsigned int num_push_constants = 0; + unsigned int num_pull_constants = 0; push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); for (unsigned int i = 0; i < uniforms; i++) { - if (!is_live[i] || pull_constant_loc[i] != -1) { - /* This UNIFORM register is either dead, or has already been demoted - * to a pull const. Mark it as no longer living in the param[] array. - */ - push_constant_loc[i] = -1; - continue; - } + push_constant_loc[i] = -1; + pull_constant_loc[i] = -1; - if (num_push_constants < max_push_components) { - /* Retain as a push constant. Record the location in the params[] - * array. - */ + if (!is_live[i]) + continue; + + if (!needs_pull[i] && num_push_constants < max_push_components) { + /* Retain as a push constant */ push_constant_loc[i] = num_push_constants++; } else { - /* Demote to a pull constant. */ - push_constant_loc[i] = -1; + /* We have to pull it */ pull_constant_loc[i] = num_pull_constants++; } } From 9c36c4084529966cb0238a1eadd36e6b13da4798 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 24 Nov 2015 17:02:01 -0800 Subject: [PATCH 10/16] i965/fs: Get rid of the param_size array --- src/mesa/drivers/dri/i965/brw_fs.cpp | 1 - src/mesa/drivers/dri/i965/brw_fs.h | 2 -- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 9 --------- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 --- 4 files changed, 15 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 6520424a0b0..cbaa8afcdcf 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1021,7 +1021,6 @@ fs_visitor::import_uniforms(fs_visitor *v) this->push_constant_loc = v->push_constant_loc; this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; - this->param_size = v->param_size; } fs_reg * diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index f2e384129cb..7bed2179531 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -313,8 +313,6 @@ public: struct brw_stage_prog_data *prog_data; struct gl_program *prog; - int *param_size; - int *virtual_grf_start; int *virtual_grf_end; brw::fs_live_variables *live_intervals; diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 2681dab3f46..82a6ce2b295 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -174,15 +174,6 @@ fs_visitor::nir_setup_uniforms() return; uniforms = nir->num_uniforms / 4; - - nir_foreach_variable(var, &nir->uniforms) { - /* UBO's and atomics don't take up space in the uniform file */ - if (var->interface_type != NULL || var->type->contains_atomic()) - continue; - - if (type_size_scalar(var->type) > 0) - param_size[var->data.driver_location / 4] = type_size_scalar(var->type); - } } static bool diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 0582e7831de..75ca1da8a79 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -1012,9 +1012,6 @@ fs_visitor::init() this->spilled_any_registers = false; this->do_dual_src = false; - - if (dispatch_width == 8) - this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params); } fs_visitor::~fs_visitor() From 46f5396846e5c4d1874c5af83378b91230f0d9da Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 25 Nov 2015 09:12:37 -0800 Subject: [PATCH 11/16] i965/vec4: Inline get_pull_constant_offset It's not really doing enough anymore to justify a helper function. --- src/mesa/drivers/dri/i965/brw_vec4.h | 2 - .../drivers/dri/i965/brw_vec4_visitor.cpp | 37 +++++++------------ 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index ae5bf6939f5..f2e5ce18ab4 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -284,8 +284,6 @@ public: src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst, src_reg *reladdr, int reg_offset); - src_reg get_pull_constant_offset(bblock_t *block, vec4_instruction *inst, - src_reg *reladdr, int reg_offset); void emit_scratch_read(bblock_t *block, vec4_instruction *inst, dst_reg dst, src_reg orig_src, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 443d0eb5387..7712d3471ea 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1464,27 +1464,6 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst, } } -src_reg -vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst, - src_reg *reladdr, int reg_offset) -{ - if (reladdr) { - src_reg index = src_reg(this, glsl_type::int_type); - - emit_before(block, inst, ADD(dst_reg(index), *reladdr, - brw_imm_d(reg_offset * 16))); - - return index; - } else if (devinfo->gen >= 8) { - /* Store the offset in a GRF so we can send-from-GRF. */ - src_reg offset = src_reg(this, glsl_type::int_type); - emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16))); - return offset; - } else { - return brw_imm_d(reg_offset * 16); - } -} - /** * Emits an instruction before @inst to load the value named by @orig_src * from scratch space at @base_offset to @temp. @@ -1666,8 +1645,20 @@ vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, { int reg_offset = base_offset + orig_src.reg_offset; const unsigned index = prog_data->base.binding_table.pull_constants_start; - src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr, - reg_offset); + + src_reg offset; + if (orig_src.reladdr) { + offset = src_reg(this, glsl_type::int_type); + + emit_before(block, inst, ADD(dst_reg(offset), *orig_src.reladdr, + brw_imm_d(reg_offset * 16))); + } else if (devinfo->gen >= 8) { + /* Store the offset in a GRF so we can send-from-GRF. */ + offset = src_reg(this, glsl_type::int_type); + emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16))); + } else { + offset = brw_imm_d(reg_offset * 16); + } emit_pull_constant_load_reg(temp, brw_imm_ud(index), From a487f0284f618416e74f73ca2534d8d93c26531c Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 25 Nov 2015 09:36:34 -0800 Subject: [PATCH 12/16] i965/vec4: Use MOV_INDIRECT instead of reladdr for indirect push constants This commit moves us to an instruction based model rather than a register-based model for indirects. This is more accurate anyway as we have to emit instructions to resolve the reladdr. It's also a lot simpler because it gets rid of the recursive reladdr problem by design. One side-effect of this is that we need a whole new algorithm in move_uniform_array_access_to_pull_constants. This new algorithm is much more straightforward than the old one and is fairly similar to what we're already doing in the FS backend. --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 2 +- src/mesa/drivers/dri/i965/brw_vec4.h | 3 +- src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 12 ++- .../drivers/dri/i965/brw_vec4_visitor.cpp | 92 +++++++++---------- 4 files changed, 54 insertions(+), 55 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index a697bdf84a0..e4a405b8705 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -775,7 +775,7 @@ vec4_visitor::move_push_constants_to_pull_constants() dst_reg temp = dst_reg(this, glsl_type::vec4_type); emit_pull_constant_load(block, inst, temp, inst->src[i], - pull_constant_loc[uniform]); + pull_constant_loc[uniform], src_reg()); inst->src[i].file = temp.file; inst->src[i].nr = temp.nr; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index f2e5ce18ab4..e6d6c8218be 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -293,7 +293,8 @@ public: void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, dst_reg dst, src_reg orig_src, - int base_offset); + int base_offset, + src_reg indirect); void emit_pull_constant_load_reg(dst_reg dst, src_reg surf_index, src_reg offset, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index f965b39360f..58b6612de9d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -673,12 +673,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) /* Offsets are in bytes but they should always be multiples of 16 */ assert(const_offset->u[0] % 16 == 0); src.reg_offset = const_offset->u[0] / 16; - } else { - src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1); - src.reladdr = new(mem_ctx) src_reg(tmp); - } - emit(MOV(dest, src)); + emit(MOV(dest, src)); + } else { + src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); + + emit(SHADER_OPCODE_MOV_INDIRECT, dest, src, + indirect, brw_imm_ud(instr->const_index[1])); + } break; } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 7712d3471ea..bc7b30df2ad 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1641,16 +1641,16 @@ vec4_visitor::move_grf_array_access_to_scratch() void vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, dst_reg temp, src_reg orig_src, - int base_offset) + int base_offset, src_reg indirect) { int reg_offset = base_offset + orig_src.reg_offset; const unsigned index = prog_data->base.binding_table.pull_constants_start; src_reg offset; - if (orig_src.reladdr) { + if (indirect.file != BAD_FILE) { offset = src_reg(this, glsl_type::int_type); - emit_before(block, inst, ADD(dst_reg(offset), *orig_src.reladdr, + emit_before(block, inst, ADD(dst_reg(offset), indirect, brw_imm_d(reg_offset * 16))); } else if (devinfo->gen >= 8) { /* Store the offset in a GRF so we can send-from-GRF. */ @@ -1685,59 +1685,55 @@ vec4_visitor::move_uniform_array_access_to_pull_constants() { int pull_constant_loc[this->uniforms]; memset(pull_constant_loc, -1, sizeof(pull_constant_loc)); - bool nested_reladdr; - /* Walk through and find array access of uniforms. Put a copy of that - * uniform in the pull constant buffer. - * - * Note that we don't move constant-indexed accesses to arrays. No - * testing has been done of the performance impact of this choice. + /* First, walk through the instructions and determine which things need to + * be pulled. We mark something as needing to be pulled by setting + * pull_constant_loc to 0. */ - do { - nested_reladdr = false; + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + /* We only care about MOV_INDIRECT of a uniform */ + if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT || + inst->src[0].file != UNIFORM) + continue; - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - for (int i = 0 ; i < 3; i++) { - if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr) - continue; + int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset; - int uniform = inst->src[i].nr; + for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++) + pull_constant_loc[uniform_nr + j] = 0; + } - if (inst->src[i].reladdr->reladdr) - nested_reladdr = true; /* will need another pass */ + /* Next, we walk the list of uniforms and assign real pull constant + * locations and set their corresponding entries in pull_param. + */ + for (int j = 0; j < this->uniforms; j++) { + if (pull_constant_loc[j] < 0) + continue; - /* If this array isn't already present in the pull constant buffer, - * add it. - */ - if (pull_constant_loc[uniform] == -1) { - const gl_constant_value **values = - &stage_prog_data->param[uniform * 4]; + pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4; - pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4; - - assert(uniform < uniform_array_size); - for (int j = 0; j < uniform_size[uniform] * 4; j++) { - stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] - = values[j]; - } - } - - /* Set up the annotation tracking for new generated instructions. */ - base_ir = inst->ir; - current_annotation = inst->annotation; - - dst_reg temp = dst_reg(this, glsl_type::vec4_type); - - emit_pull_constant_load(block, inst, temp, inst->src[i], - pull_constant_loc[uniform]); - - inst->src[i].file = temp.file; - inst->src[i].nr = temp.nr; - inst->src[i].reg_offset = temp.reg_offset; - inst->src[i].reladdr = NULL; - } + for (int i = 0; i < 4; i++) { + stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] + = stage_prog_data->param[j * 4 + i]; } - } while (nested_reladdr); + } + + /* Finally, we can walk through the instructions and lower MOV_INDIRECT + * instructions to actual uniform pulls. + */ + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { + /* We only care about MOV_INDIRECT of a uniform */ + if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT || + inst->src[0].file != UNIFORM) + continue; + + int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset; + + assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP); + + emit_pull_constant_load(block, inst, inst->dst, inst->src[0], + pull_constant_loc[uniform_nr], inst->src[1]); + inst->remove(block); + } /* Now there are no accesses of the UNIFORM file with a reladdr, so * no need to track them as larger-than-vec4 objects. This will be From eb76f226cf0c3497caa1738efffa9c2e8b02201c Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 25 Nov 2015 09:59:03 -0800 Subject: [PATCH 13/16] i965/fs: Use UD type for offsets in VARYING_PULL_CONSTANT_LOAD --- src/mesa/drivers/dri/i965/brw_fs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index cbaa8afcdcf..dc0bdba72c1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -174,7 +174,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, * CSE can later notice that those loads are all the same and eliminate * the redundant ones. */ - fs_reg vec4_offset = vgrf(glsl_type::int_type); + fs_reg vec4_offset = vgrf(glsl_type::uint_type); bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf)); int scale = 1; From 75f33a6420af37406edbf64c535b5b29d2d2eefc Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 8 Dec 2015 17:02:16 -0800 Subject: [PATCH 14/16] i965/vec4: Get rid of the uniform_size array --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 8 -------- src/mesa/drivers/dri/i965/brw_vec4.h | 2 -- src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 9 --------- src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 11 ----------- src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp | 1 - 5 files changed, 31 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index e4a405b8705..1304e2312f8 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -466,11 +466,6 @@ vec4_visitor::split_uniform_registers() inst->src[i].reg_offset = 0; } } - - /* Update that everything is now vector-sized. */ - for (int i = 0; i < this->uniforms; i++) { - this->uniform_size[i] = 1; - } } void @@ -528,7 +523,6 @@ vec4_visitor::pack_uniform_registers() * push constants. */ for (int src = 0; src < uniforms; src++) { - assert(src < uniform_array_size); int size = chans_used[src]; if (size == 0) @@ -1588,8 +1582,6 @@ vec4_visitor::setup_uniforms(int reg) * matter what, or the GPU would hang. */ if (devinfo->gen < 6 && this->uniforms == 0) { - assert(this->uniforms < this->uniform_array_size); - stage_prog_data->param = reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4); for (unsigned int i = 0; i < 4; i++) { diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index e6d6c8218be..0dc04eaa252 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -115,8 +115,6 @@ public: */ dst_reg output_reg[BRW_VARYING_SLOT_COUNT]; const char *output_reg_annotation[BRW_VARYING_SLOT_COUNT]; - int *uniform_size; - int uniform_array_size; /*< Size of the uniform_size array */ int uniforms; src_reg shader_start_time; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 58b6612de9d..bafc9a514d2 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -118,15 +118,6 @@ void vec4_visitor::nir_setup_uniforms() { uniforms = nir->num_uniforms / 16; - - nir_foreach_variable(var, &nir->uniforms) { - /* UBO's and atomics don't take up space in the uniform file */ - if (var->interface_type != NULL || var->type->contains_atomic()) - continue; - - if (type_size_vec4(var->type) > 0) - uniform_size[var->data.driver_location / 16] = type_size_vec4(var->type); - } } void diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index bc7b30df2ad..357dd4d5a64 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1786,17 +1786,6 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; this->uniforms = 0; - - /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires - * at least one. See setup_uniforms() in brw_vec4.cpp. - */ - this->uniform_array_size = 1; - if (prog_data) { - this->uniform_array_size = - MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1); - } - - this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size); } vec4_visitor::~vec4_visitor() diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp index fd8be7d972c..205323ce6ed 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp @@ -261,7 +261,6 @@ void vec4_vs_visitor::setup_uniform_clipplane_values() { for (int i = 0; i < key->nr_userclip_plane_consts; ++i) { - assert(this->uniforms < uniform_array_size); this->userplane[i] = dst_reg(UNIFORM, this->uniforms); this->userplane[i].type = BRW_REGISTER_TYPE_F; for (int j = 0; j < 4; ++j) { From 63c313de84afa9ee65f5d98e1c843ace3a5c9f21 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 8 Dec 2015 17:14:49 -0800 Subject: [PATCH 15/16] i965/fs: Rename demote_pull_constants to lower_constant_loads --- src/mesa/drivers/dri/i965/brw_fs.cpp | 4 ++-- src/mesa/drivers/dri/i965/brw_fs.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index dc0bdba72c1..7124ee3905b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2016,7 +2016,7 @@ fs_visitor::assign_constant_locations() * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. */ void -fs_visitor::demote_pull_constants() +fs_visitor::lower_constant_loads() { const unsigned index = stage_prog_data->binding_table.pull_constants_start; @@ -5033,7 +5033,7 @@ fs_visitor::optimize() bld = fs_builder(this, 64); assign_constant_locations(); - demote_pull_constants(); + lower_constant_loads(); validate(); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 7bed2179531..d09f2e42425 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -136,7 +136,7 @@ public: void split_virtual_grfs(); bool compact_virtual_grfs(); void assign_constant_locations(); - void demote_pull_constants(); + void lower_constant_loads(); void invalidate_live_intervals(); void calculate_live_intervals(); void calculate_register_pressure(); From 091b6156dd8553979336c15acdaf140e5419c483 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 8 Dec 2015 17:34:38 -0800 Subject: [PATCH 16/16] i965/fs: Push small uniform arrays Unfortunately, this also means that we need to use a slightly different algorithm for assign_constant_locations. The old algorithm worked based on the assumption that each read of a uniform value read exactly one float. If it encountered a MOV_INDIRECT, it would immediately bail and push the whole thing. Since we can now read ranges using MOV_INDIRECT, we need to be able to push a series of floats without breaking them up. To do this, we use an algorithm similar to the on in split_virtual_grfs. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 76 +++++++++++++++++++--------- 1 file changed, 53 insertions(+), 23 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 7124ee3905b..6c4a70f1dac 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1911,9 +1911,7 @@ fs_visitor::compact_virtual_grfs() * maximum number of fragment shader uniform components (64). If * there are too many of these, they'd fill up all of register space. * So, this will push some of them out to the pull constant buffer and - * update the program to load them. We also use pull constants for all - * indirect constant loads because we don't support indirect accesses in - * registers yet. + * update the program to load them. */ void fs_visitor::assign_constant_locations() @@ -1925,15 +1923,18 @@ fs_visitor::assign_constant_locations() bool is_live[uniforms]; memset(is_live, 0, sizeof(is_live)); - bool needs_pull[uniforms]; - memset(needs_pull, 0, sizeof(needs_pull)); + /* For each uniform slot, a value of true indicates that the given slot and + * the next slot must remain contiguous. This is used to keep us from + * splitting arrays apart. + */ + bool contiguous[uniforms]; + memset(contiguous, 0, sizeof(contiguous)); /* First, we walk through the instructions and do two things: * * 1) Figure out which uniforms are live. * - * 2) Find all indirect access of uniform arrays and flag them as needing - * to go into the pull constant buffer. + * 2) Mark any indirectly used ranges of registers as contiguous. * * Note that we don't move constant-indexed accesses to arrays. No * testing has been done of the performance impact of this choice. @@ -1946,12 +1947,16 @@ fs_visitor::assign_constant_locations() int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { - for (unsigned j = 0; j < inst->src[2].ud / 4; j++) { - is_live[constant_nr + j] = true; - needs_pull[constant_nr + j] = true; + assert(inst->src[2].ud % 4 == 0); + unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; + assert(last < uniforms); + + for (unsigned j = constant_nr; j < last; j++) { + is_live[j] = true; + contiguous[j] = true; } + is_live[last] = true; } else { - /* Mark the the one accessed uniform as live */ if (constant_nr >= 0 && constant_nr < (int) uniforms) is_live[constant_nr] = true; } @@ -1966,26 +1971,49 @@ fs_visitor::assign_constant_locations() * If changing this value, note the limitation about total_regs in * brw_curbe.c. */ - unsigned int max_push_components = 16 * 8; + const unsigned int max_push_components = 16 * 8; + + /* We push small arrays, but no bigger than 16 floats. This is big enough + * for a vec4 but hopefully not large enough to push out other stuff. We + * should probably use a better heuristic at some point. + */ + const unsigned int max_chunk_size = 16; + unsigned int num_push_constants = 0; unsigned int num_pull_constants = 0; push_constant_loc = ralloc_array(mem_ctx, int, uniforms); pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - for (unsigned int i = 0; i < uniforms; i++) { - push_constant_loc[i] = -1; - pull_constant_loc[i] = -1; + int chunk_start = -1; + for (unsigned u = 0; u < uniforms; u++) { + push_constant_loc[u] = -1; + pull_constant_loc[u] = -1; - if (!is_live[i]) + if (!is_live[u]) continue; - if (!needs_pull[i] && num_push_constants < max_push_components) { - /* Retain as a push constant */ - push_constant_loc[i] = num_push_constants++; - } else { - /* We have to pull it */ - pull_constant_loc[i] = num_pull_constants++; + /* This is the first live uniform in the chunk */ + if (chunk_start < 0) + chunk_start = u; + + /* If this element does not need to be contiguous with the next, we + * split at this point and everthing between chunk_start and u forms a + * single chunk. + */ + if (!contiguous[u]) { + unsigned chunk_size = u - chunk_start + 1; + + if (num_push_constants + chunk_size <= max_push_components && + chunk_size <= max_chunk_size) { + for (unsigned j = chunk_start; j <= u; j++) + push_constant_loc[j] = num_push_constants++; + } else { + for (unsigned j = chunk_start; j <= u; j++) + pull_constant_loc[j] = num_pull_constants++; + } + + chunk_start = -1; } } @@ -2066,7 +2094,9 @@ fs_visitor::lower_constant_loads() continue; /* Out of bounds access */ int pull_index = pull_constant_loc[location]; - assert(pull_index >= 0); /* This had better be pull */ + + if (pull_index == -1) + continue; VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, brw_imm_ud(index),