diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 48f3f945456..e455a3ee0ba 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -1107,7 +1107,9 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) }; uint32_t best_register_pressure = UINT32_MAX; - enum brw_instruction_scheduler_mode best_sched = BRW_SCHEDULE_NONE; + float best_perf = 0; + unsigned best_press_idx = 0; + unsigned best_perf_idx = 0; brw_opt_compact_virtual_grfs(s); @@ -1123,56 +1125,105 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) * prevent dependencies between the different scheduling modes. */ brw_inst **orig_order = save_instruction_order(s.cfg); - brw_inst **best_pressure_order = NULL; + brw_inst **orders[ARRAY_SIZE(pre_modes)] = {}; void *scheduler_ctx = ralloc_context(NULL); brw_instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx); - /* Try each scheduling heuristic to see if it can successfully register - * allocate without spilling. They should be ordered by decreasing - * performance but increasing likelihood of allocating. + /* Try each scheduling heuristic to choose the one one with the + * best trade-off between latency and register pressure, which on + * xe3+ is dependent on the thread parallelism that can be achieved + * at the GRF register requirement of each ordering of the program + * (note that the register requirement of the program can only be + * estimated at this point prior to register allocation). */ for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { enum brw_instruction_scheduler_mode sched_mode = pre_modes[i]; - if (devinfo->ver < 30 && sched_mode == BRW_SCHEDULE_PRE_LATENCY) + /* Only use the PRE heuristic on pre-xe3 platforms during the + * first pass, since the trade-off between EU thread count and + * GRF use isn't a concern on platforms that don't support VRT. + */ + if (devinfo->ver < 30 && sched_mode != BRW_SCHEDULE_PRE) + continue; + + /* These don't appear to provide much benefit on xe3+. + */ + if (devinfo->ver >= 30 && (sched_mode == BRW_SCHEDULE_PRE_LIFO || + sched_mode == BRW_SCHEDULE_NONE)) continue; brw_schedule_instructions_pre_ra(s, sched, sched_mode); s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode]; - s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i); + orders[i] = save_instruction_order(s.cfg); - if (0) { - brw_assign_regs_trivial(s); - allocated = true; - break; + const unsigned press = brw_compute_max_register_pressure(s); + if (press < best_register_pressure) { + best_register_pressure = press; + best_press_idx = i; } - /* We should only spill registers on the last scheduling. */ - assert(!s.spilled_any_registers); - - allocated = brw_assign_regs(s, false, spill_all); - if (allocated) - break; - - /* Save the maximum register pressure */ - uint32_t this_pressure = brw_compute_max_register_pressure(s); - - if (0) { - fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n", - scheduler_mode_name[sched_mode], this_pressure); + const brw_performance &perf = s.performance_analysis.require(); + if (perf.throughput > best_perf) { + best_perf = perf.throughput; + best_perf_idx = i; } - if (this_pressure < best_register_pressure) { - best_register_pressure = this_pressure; - best_sched = sched_mode; - delete[] best_pressure_order; - best_pressure_order = save_instruction_order(s.cfg); + if (i + 1 < ARRAY_SIZE(pre_modes)) { + /* Reset back to the original order before trying the next mode */ + restore_instruction_order(s, orig_order); } + } - /* Reset back to the original order before trying the next mode */ - restore_instruction_order(s, orig_order); + restore_instruction_order(s, orders[best_perf_idx]); + s.shader_stats.scheduler_mode = scheduler_mode_name[pre_modes[best_perf_idx]]; + allocated = brw_assign_regs(s, false, spill_all); + + if (!allocated) { + /* Try each scheduling heuristic to see if it can successfully register + * allocate without spilling. They should be ordered by decreasing + * performance but increasing likelihood of allocating. + */ + for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { + enum brw_instruction_scheduler_mode sched_mode = pre_modes[i]; + + /* The latency-sensitive heuristic is unlikely to be helpful + * if we failed to register-allocate. + */ + if (sched_mode == BRW_SCHEDULE_PRE_LATENCY) + continue; + + /* Already tried to register-allocate this. */ + if (i == best_perf_idx) + continue; + + if (orders[i]) { + /* We already scheduled the program with this mode. */ + restore_instruction_order(s, orders[i]); + } else { + restore_instruction_order(s, orig_order); + brw_schedule_instructions_pre_ra(s, sched, sched_mode); + s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode]; + s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i); + orders[i] = save_instruction_order(s.cfg); + + const unsigned press = brw_compute_max_register_pressure(s); + if (press < best_register_pressure) { + best_register_pressure = press; + best_press_idx = i; + } + } + + s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode]; + + /* We should only spill registers on the last scheduling. */ + assert(!s.spilled_any_registers); + + allocated = brw_assign_regs(s, false, spill_all); + if (allocated) + break; + } } ralloc_free(scheduler_ctx); @@ -1180,16 +1231,17 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) if (!allocated) { if (0) { fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n", - scheduler_mode_name[best_sched]); + scheduler_mode_name[pre_modes[best_press_idx]]); } - restore_instruction_order(s, best_pressure_order); - s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched]; + restore_instruction_order(s, orders[best_press_idx]); + s.shader_stats.scheduler_mode = scheduler_mode_name[pre_modes[best_press_idx]]; allocated = brw_assign_regs(s, allow_spilling, spill_all); } delete[] orig_order; - delete[] best_pressure_order; + for (unsigned i = 0; i < ARRAY_SIZE(orders); i++) + delete[] orders[i]; if (!allocated) { s.fail("Failure to register allocate. Reduce number of "