diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 446ac53e95f..484dc050368 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -850,6 +850,9 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
                 break;
 
         case nir_op_unpack_half_2x16_split_x:
+                /* XXX perf: It would be good to be able to merge this unpack
+                 * with whatever uses our result.
+                 */
                 result = vir_FMOV(c, src[0]);
                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
                 break;
@@ -1489,6 +1492,10 @@ ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
 static void
 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
 {
+        /* XXX perf: Experiment with using immediate loads to avoid having
+         * these end up in the uniform stream.  Watch out for breaking the
+         * small immediates optimization in the process!
+         */
         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
         for (int i = 0; i < instr->def.num_components; i++)
                 qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
@@ -1535,6 +1542,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 for (int i = 0; i < instr->num_components; i++) {
                         int ubo = nir_src_as_uint(instr->src[0]);
 
+                        /* XXX perf: On V3D 4.x with uniform offsets, we
+                         * should probably try setting UBOs up in the A
+                         * register file and doing a sequence of loads that
+                         * way.
+                         */
                         /* Adjust for where we stored the TGSI register base. */
                         vir_ADD_dest(c,
                                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
@@ -1669,6 +1681,12 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 
 /* Clears (activates) the execute flags for any channels whose jump target
  * matches this block.
+ *
+ * XXX perf: Could we be using flpush/flpop somehow for our execution channel
+ * enabling?
+ *
+ * XXX perf: For uniform control flow, we should be able to skip c->execute
+ * handling entirely.
  */
 static void
 ntq_activate_execute_for_block(struct v3d_compile *c)
@@ -1704,6 +1722,10 @@ ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
         /* Set A for executing (execute == 0) and jumping (if->condition ==
          * 0) channels, and then update execute flags for those to point to
          * the ELSE block.
+         *
+         * XXX perf: We could reuse ntq_emit_comparison() to generate our if
+         * condition, and the .uf field to ignore non-executing channels, to
+         * reduce the overhead of if statements.
          */
         vir_PF(c, vir_OR(c,
                          c->execute,
@@ -1925,6 +1947,10 @@ nir_to_vir(struct v3d_compile *c)
                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
 
+                /* XXX perf: We could set the "disable implicit point/line
+                 * varyings" field in the shader record and not emit these, if
+                 * they're not going to be used.
+                 */
                 if (c->fs_key->is_points) {
                         c->point_x = emit_fragment_varying(c, NULL, 0);
                         c->point_y = emit_fragment_varying(c, NULL, 0);
@@ -2119,7 +2145,14 @@ v3d_nir_to_vir(struct v3d_compile *c)
 
         vir_check_payload_w(c);
 
-        /* XXX: vir_schedule_instructions(c); */
+        /* XXX perf: On VC4, we do VIR-level instruction scheduling here.  We
+         * used that on that platform to pipeline TMU writes and reduce the
+         * number of thread switches, as well as try (mostly successfully) to
+         * reduce maximum register pressure to allow more threads.  We should
+         * do something of that sort for V3D -- either instruction scheduling
+         * here, or delaying the THRSW and LDTMUs from our texture
+         * instructions until the results are needed.
+         */
 
         if (V3D_DEBUG & (V3D_DEBUG_VIR |
                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
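The ntq_emit_load_const() note above is the most self-contained of these ideas. One possible reading of "immediate loads" is a MOV from the QPU's small-immediate encoding; a minimal sketch of that per-channel path, assuming a hypothetical vir_MOV_small_imm() helper (only vir_uniform_ui() exists in this form today) and covering only the integer part of the small-immediate range:

/* Sketch only: prefer a small-immediate MOV over a uniform-stream slot.
 * vir_MOV_small_imm() is a hypothetical helper; the integer
 * small-immediate range on the QPU is [-16, 15], and the float encodings
 * are ignored here.
 */
static struct qreg
ntq_load_const_channel(struct v3d_compile *c, uint32_t value)
{
        int32_t sval = (int32_t)value;

        if (sval >= -16 && sval <= 15)
                return vir_MOV_small_imm(c, sval); /* assumed helper */

        /* Fall back to the uniform stream, as ntq_emit_load_const does. */
        return vir_uniform_ui(c, value);
}

Gating on the same encodable range is what keeps this from fighting the small-immediates optimization the comment warns about.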
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 365aebdbd6d..7662c8f6f08 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -195,6 +195,9 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
         if (!magic) {
                 add_write_dep(state, &state->last_rf[waddr], n);
         } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
+                /* XXX perf: For V3D 4.x, we could reorder TMU writes other
+                 * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
+                 */
                 add_write_dep(state, &state->last_tmu_write, n);
                 switch (waddr) {
                 case V3D_QPU_WADDR_TMUS:
@@ -590,6 +593,10 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
                 return next_score;
         next_score++;
 
+        /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
+         * instructions after the producer if possible, not just 1.
+         */
+
         /* Default score for things that aren't otherwise special. */
         baseline_score = next_score;
         next_score++;
@@ -784,6 +791,12 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                  * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                  * otherwise get scheduled so ldunif and ldvary try to update
                  * r5 in the same tick.
+                 *
+                 * XXX perf: To get good pipelining of a sequence of varying
+                 * loads, we need to figure out how to pair the ldvary signal
+                 * up to the instruction before the last r5 user in the
+                 * previous ldvary sequence.  Currently, it usually pairs with
+                 * the last r5 user.
                  */
                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
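The SFU note in get_instruction_priority() could also be attacked from choose_instruction_to_schedule(), by rejecting candidates that would consume the SFU result one tick too early. A sketch of the candidate test, assuming a last_sfu_write_tick scoreboard field tracked the same way as the existing last_ldvary_tick (the instruction-type and mux checks use the real v3d_qpu_instr layout):

/* Sketch: would this candidate read the SFU result (r4) on the tick
 * right after the SFU write?  last_sfu_write_tick is an assumed field.
 */
static bool
reads_r4_too_soon(const struct choose_scoreboard *scoreboard,
                  const struct v3d_qpu_instr *inst)
{
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        bool reads_r4 = (inst->alu.add.a == V3D_QPU_MUX_R4 ||
                         inst->alu.add.b == V3D_QPU_MUX_R4 ||
                         inst->alu.mul.a == V3D_QPU_MUX_R4 ||
                         inst->alu.mul.b == V3D_QPU_MUX_R4);

        return reads_r4 &&
               scoreboard->tick <= scoreboard->last_sfu_write_tick + 1;
}

choose_instruction_to_schedule() could then skip or down-score such candidates for one extra tick, giving SFU readers the two-instruction gap the comment asks for.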
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c
index 9f1fd9a0d20..3cb96e1204d 100644
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d40_tex.c
@@ -34,6 +34,9 @@ static void
 vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
               int *tmu_writes)
 {
+        /* XXX perf: We should figure out how to merge the ALU op producing
+         * val with this MOV, when possible.
+         */
         vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
 
         (*tmu_writes)++;
@@ -147,6 +150,10 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 
         /* Limit the number of channels returned to both how many the NIR
          * instruction writes and how many the instruction could produce.
+         *
+         * XXX perf: Can we also limit to the number of channels that are
+         * actually read by the users of this NIR dest, so that we don't need
+         * to emit unused LDTMUs?
          */
         uint32_t instr_return_channels = nir_tex_instr_dest_size(instr);
         if (!p1_unpacked.output_type_32_bit)
@@ -187,6 +194,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
         p1_packed |= unit << 24;
 
         vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);
+        /* XXX perf: Can we skip p1 setup for txf ops? */
         vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
         if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0)
                 vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
@@ -226,6 +234,12 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                 STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
                 chan = return_values[i / 2];
 
+                /* XXX perf: We should move this unpacking into NIR.
+                 * That would give us exposure of these types to NIR
+                 * optimization, so that (for example) a repacking of
+                 * half-float samples to the half-float render target
+                 * could be eliminated.
+                 */
                 if (nir_alu_type_get_base_type(instr->dest_type) ==
                     nir_type_float) {
                         enum v3d_qpu_input_unpack unpack;
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 692f1fe3c04..46e629d0c64 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -124,6 +124,11 @@ v3d_predraw_check_stage_inputs(struct pipe_context *pctx,
 {
         struct v3d_context *v3d = v3d_context(pctx);
 
+        /* XXX perf: If we're reading from the output of TF in this job, we
+         * should instead be using the wait-for-transform-feedback
+         * functionality.
+         */
+
         /* Flush writes to textures we're sampling. */
         for (int i = 0; i < v3d->tex[s].num_textures; i++) {
                 struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
@@ -175,6 +180,10 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
                                   cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
                                   32);
 
+        /* XXX perf: We should move most of the SHADER_STATE_RECORD setup to
+         * compile time, so that we mostly just have to OR the VS and FS
+         * records together at draw time.
+         */
         cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) {
                 shader.enable_clipping = true;
                 /* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */
diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
index 01a907b0a86..17b30465c9d 100644
--- a/src/gallium/drivers/v3d/v3dx_rcl.c
+++ b/src/gallium/drivers/v3d/v3dx_rcl.c
@@ -761,7 +761,10 @@ v3dX(emit_rcl)(struct v3d_job *job)
 
         v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
 
-        /* XXX: Use Morton order */
+        /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
+         * performance, but we should use Morton order otherwise to improve
+         * cache locality.
+         */
         uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
         uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
         uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
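For the Morton-order idea, the standard trick is to interleave the x/y bits of the supertile coordinates so that consecutive entries in the supertile list follow a Z-order curve, which is the cache-locality win the comment is after. A self-contained sketch; neither helper exists in the driver:

#include <stdint.h>

/* Spread the low 16 bits of x out to the even bit positions (0, 2, 4, ...). */
static uint32_t
part1by1(uint32_t x)
{
        x &= 0xffff;
        x = (x | (x << 8)) & 0x00ff00ff;
        x = (x | (x << 4)) & 0x0f0f0f0f;
        x = (x | (x << 2)) & 0x33333333;
        x = (x | (x << 1)) & 0x55555555;
        return x;
}

/* Z-order (Morton) index of a supertile: nearby indices map to nearby
 * (x, y) positions, unlike the raster order below.
 */
static uint32_t
supertile_morton_index(uint32_t x, uint32_t y)
{
        return part1by1(x) | (part1by1(y) << 1);
}

At this level the change would amount to walking supertile indices in increasing supertile_morton_index() order (decoding each index back to (x, y) with the inverse bit compaction) instead of the raster-order double loop that follows the comment.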