diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index b9d72ea5b30..d7428c3c446 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -79,6 +79,8 @@ COMPILER_FILES = \ compiler/brw_interpolation_map.c \ compiler/brw_ir_allocator.h \ compiler/brw_ir_fs.h \ + compiler/brw_ir_performance.h \ + compiler/brw_ir_performance.cpp \ compiler/brw_ir_vec4.h \ compiler/brw_nir.h \ compiler/brw_nir.c \ diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index b421723c53e..4e8f8ccac78 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -32,6 +32,7 @@ #include "brw_ir_fs.h" #include "brw_fs_builder.h" #include "brw_fs_live_variables.h" +#include "brw_ir_performance.h" #include "compiler/nir/nir.h" struct bblock_t; @@ -349,6 +350,8 @@ public: backend_shader *) live_analysis; BRW_ANALYSIS(regpressure_analysis, brw::register_pressure, fs_visitor *) regpressure_analysis; + BRW_ANALYSIS(performance_analysis, brw::performance, + fs_visitor *) performance_analysis; /** Number of uniform variable components visited. */ unsigned uniforms; diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp index 0d8b0f78a32..7b315693e4a 100644 --- a/src/intel/compiler/brw_fs_visitor.cpp +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -900,6 +900,7 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, key(key), gs_compile(NULL), prog_data(prog_data), input_vue_map(input_vue_map), live_analysis(this), regpressure_analysis(this), + performance_analysis(this), dispatch_width(dispatch_width), shader_time_index(shader_time_index), bld(fs_builder(this, dispatch_width).at_end()) @@ -918,6 +919,7 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, key(&c->key.base), gs_compile(c), prog_data(&prog_data->base.base), live_analysis(this), regpressure_analysis(this), + performance_analysis(this), dispatch_width(8), shader_time_index(shader_time_index), bld(fs_builder(this, dispatch_width).at_end()) diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp new file mode 100644 index 00000000000..5785d839e3a --- /dev/null +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -0,0 +1,1561 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_vec4.h" +#include "brw_cfg.h" + +using namespace brw; + +namespace { + /** + * Enumeration representing the various asynchronous units that can run + * computations in parallel on behalf of a shader thread. + */ + enum unit { + /** EU front-end. */ + unit_fe, + /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */ + unit_fpu, + /** Extended Math unit (AKA FPU1 on Gen8-11, part of the EU on Gen6+). */ + unit_em, + /** Sampler shared function. */ + unit_sampler, + /** Pixel Interpolator shared function. */ + unit_pi, + /** Unified Return Buffer shared function. */ + unit_urb, + /** Data Port Data Cache shared function. */ + unit_dp_dc, + /** Data Port Render Cache shared function. */ + unit_dp_rc, + /** Data Port Constant Cache shared function. */ + unit_dp_cc, + /** Message Gateway shared function. */ + unit_gateway, + /** Thread Spawner shared function. */ + unit_spawner, + /* unit_vme, */ + /* unit_cre, */ + /** Number of asynchronous units currently tracked. */ + num_units, + /** Dummy unit for instructions that don't consume runtime from the above. */ + unit_null = num_units + }; + + /** + * Enumeration representing a computation result another computation can + * potentially depend on. + */ + enum dependency_id { + /* Register part of the GRF. */ + dependency_id_grf0 = 0, + /* Register part of the MRF. Only used on Gen4-6. */ + dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF, + /* Address register part of the ARF. */ + dependency_id_addr0 = dependency_id_mrf0 + 24, + /* Accumulator register part of the ARF. */ + dependency_id_accum0 = dependency_id_addr0 + 1, + /* Flag register part of the ARF. */ + dependency_id_flag0 = dependency_id_accum0 + 12, + /* SBID token write completion. Only used on Gen12+. */ + dependency_id_sbid_wr0 = dependency_id_flag0 + 8, + /* SBID token read completion. Only used on Gen12+. */ + dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16, + /* Number of computation dependencies currently tracked. */ + num_dependency_ids = dependency_id_sbid_rd0 + 16 + }; + + /** + * State of our modeling of the program execution. + */ + struct state { + state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {} + /** + * Time at which a given unit will be ready to execute the next + * computation, in clock units. + */ + unsigned unit_ready[num_units]; + /** + * Time at which an instruction dependent on a given dependency ID will + * be ready to execute, in clock units. + */ + unsigned dep_ready[num_dependency_ids]; + /** + * Aggregated utilization of a given unit excluding idle cycles, + * in clock units. + */ + float unit_busy[num_units]; + /** + * Factor of the overhead of a computation accounted for in the + * aggregated utilization calculation. + */ + float weight; + }; + + /** + * Information derived from an IR instruction used to compute performance + * estimates. Allows the timing calculation to work on both FS and VEC4 + * instructions. + */ + struct instruction_info { + instruction_info(const gen_device_info *devinfo, const fs_inst *inst) : + devinfo(devinfo), op(inst->opcode), + td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)), + tx(get_exec_type(inst)), sx(0), ss(0), + sc(has_bank_conflict(devinfo, inst) ? sd : 0), + desc(inst->desc), sfid(inst->sfid) + { + /* We typically want the maximum source size, except for split send + * messages which require the total size. 
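+       * (For SHADER_OPCODE_SEND the payload is split between sources 2
+       * and 3, hence the sum of both size_read() values below.)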
+ */ + if (inst->opcode == SHADER_OPCODE_SEND) { + ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) + + DIV_ROUND_UP(inst->size_read(3), REG_SIZE); + } else { + for (unsigned i = 0; i < inst->sources; i++) + ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE)); + } + + /* Convert the execution size to GRF units. */ + sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE); + + /* 32x32 integer multiplication has half the usual ALU throughput. + * Treat it as double-precision. + */ + if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) && + !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 && + type_sz(inst->src[0].type) == type_sz(inst->src[1].type)) + tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D); + } + + instruction_info(const gen_device_info *devinfo, + const vec4_instruction *inst) : + devinfo(devinfo), op(inst->opcode), + td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)), + tx(get_exec_type(inst)), sx(0), ss(0), sc(0), + desc(inst->desc), sfid(inst->sfid) + { + /* Compute the maximum source size. */ + for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) + ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE)); + + /* Convert the execution size to GRF units. */ + sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE); + + /* 32x32 integer multiplication has half the usual ALU throughput. + * Treat it as double-precision. + */ + if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) && + !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 && + type_sz(inst->src[0].type) == type_sz(inst->src[1].type)) + tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D); + } + + /** Device information. */ + const struct gen_device_info *devinfo; + /** Instruction opcode. */ + opcode op; + /** Destination type. */ + brw_reg_type td; + /** Destination size in GRF units. */ + unsigned sd; + /** Execution type. */ + brw_reg_type tx; + /** Execution size in GRF units. */ + unsigned sx; + /** Source size. */ + unsigned ss; + /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */ + unsigned sc; + /** Send message descriptor. */ + uint32_t desc; + /** Send message shared function ID. */ + uint8_t sfid; + }; + + /** + * Timing information of an instruction used to estimate the performance of + * the program. + */ + struct perf_desc { + perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) : + u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {} + + /** + * Back-end unit its runtime shall be accounted to, in addition to the + * EU front-end which is always assumed to be involved. + */ + unit u; + /** + * Overhead cycles from the time that the EU front-end starts executing + * the instruction until it's ready to execute the next instruction. + */ + int df; + /** + * Overhead cycles from the time that the back-end starts executing the + * instruction until it's ready to execute the next instruction. + */ + int db; + /** + * Latency cycles from the time that the back-end starts executing the + * instruction until its sources have been read from the register file. + */ + int ls; + /** + * Latency cycles from the time that the back-end starts executing the + * instruction until its regular destination has been written to the + * register file. + */ + int ld; + /** + * Latency cycles from the time that the back-end starts executing the + * instruction until its accumulator destination has been written to the + * ARF file. 
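+    * (Relevant e.g. for the MUL/MACH sequences emitted for 32-bit
+    * integer multiplication, which are chained through the accumulator.)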
+ * + * Note that this is an approximation of the real behavior of + * accumulating instructions in the hardware: Instead of modeling a pair + * of back-to-back accumulating instructions as a first computation with + * latency equal to ld followed by another computation with a + * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we + * model the stall as if it occurred at the top of the pipeline, with + * the latency of the accumulator computation offset accordingly. + */ + int la; + /** + * Latency cycles from the time that the back-end starts executing the + * instruction until its flag destination has been written to the ARF + * file. + */ + int lf; + }; + + /** + * Compute the timing information of an instruction based on any relevant + * information from the IR and a number of parameters specifying a linear + * approximation: Parameter X_Y specifies the derivative of timing X + * relative to info field Y, while X_1 specifies the independent term of + * the approximation of timing X. + */ + perf_desc + calculate_desc(const instruction_info &info, unit u, + int df_1, int df_sd, int df_sc, + int db_1, int db_sx, + int ls_1, int ld_1, int la_1, int lf_1, + int l_ss, int l_sd) + { + return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc), + db_1 + db_sx * int(info.sx), + ls_1 + l_ss * int(info.ss), + ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd), + la_1, lf_1); + } + + /** + * Compute the timing information of an instruction based on any relevant + * information from the IR and a number of linear approximation parameters + * hard-coded for each IR instruction. + * + * Most timing parameters are obtained from the multivariate linear + * regression of a sample of empirical timings measured using the tm0 + * register (as can be done today by using the shader_time debugging + * option). The Gen4-5 math timings are obtained from BSpec Volume 5c.3 + * "Shared Functions - Extended Math", Section 3.2 "Performance". + * Parameters marked XXX shall be considered low-quality, they're possibly + * high variance or completely guessed in cases where experimental data was + * unavailable. 
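+    *
+    * As a purely illustrative reading of the tables below (this worked
+    * example is not part of the original data): the Gen8+ single-precision
+    * ALU entry calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 0, 8, 4, 12,
+    * 0, 0) gives a SIMD16 float ADD (sd = sx = 2 GRFs) a front-end cost of
+    * df = 0 + 2 * 2 = 4 cycles, an FPU occupancy of db = 0 + 2 * 2 = 4
+    * cycles and a destination latency of ld = 8 cycles after issue.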
+ */ + const perf_desc + instruction_desc(const instruction_info &info) + { + const struct gen_device_info *devinfo = info.devinfo; + + switch (info.op) { + case BRW_OPCODE_SYNC: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_DIM: + case BRW_OPCODE_ASR: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_F16TO32: + case BRW_OPCODE_BFREV: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_AVG: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_MAC: + case BRW_OPCODE_MACH: + case BRW_OPCODE_LZD: + case BRW_OPCODE_FBH: + case BRW_OPCODE_FBL: + case BRW_OPCODE_CBIT: + case BRW_OPCODE_ADDC: + case BRW_OPCODE_ROR: + case BRW_OPCODE_ROL: + case BRW_OPCODE_SUBB: + case BRW_OPCODE_SAD2: + case BRW_OPCODE_SADA2: + case BRW_OPCODE_LINE: + case BRW_OPCODE_NOP: + case SHADER_OPCODE_CLUSTER_BROADCAST: + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + case FS_OPCODE_DDY_COARSE: + case FS_OPCODE_PIXEL_X: + case FS_OPCODE_PIXEL_Y: + case FS_OPCODE_SET_SAMPLE_ID: + case VEC4_OPCODE_MOV_BYTES: + case VEC4_OPCODE_UNPACK_UNIFORM: + case VEC4_OPCODE_DOUBLE_TO_F32: + case VEC4_OPCODE_DOUBLE_TO_D32: + case VEC4_OPCODE_DOUBLE_TO_U32: + case VEC4_OPCODE_TO_DOUBLE: + case VEC4_OPCODE_PICK_LOW_32BIT: + case VEC4_OPCODE_PICK_HIGH_32BIT: + case VEC4_OPCODE_SET_LOW_32BIT: + case VEC4_OPCODE_SET_HIGH_32BIT: + case GS_OPCODE_SET_DWORD_2: + case GS_OPCODE_SET_WRITE_OFFSET: + case GS_OPCODE_SET_VERTEX_COUNT: + case GS_OPCODE_PREPARE_CHANNEL_MASKS: + case GS_OPCODE_SET_CHANNEL_MASKS: + case GS_OPCODE_GET_INSTANCE_ID: + case GS_OPCODE_SET_PRIMITIVE_ID: + case GS_OPCODE_SVB_SET_DST_INDEX: + case TCS_OPCODE_SRC0_010_IS_ZERO: + case TCS_OPCODE_GET_PRIMITIVE_ID: + case TES_OPCODE_GET_PRIMITIVE_ID: + if (devinfo->gen >= 11) { + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 14, 0, 0); + } else if (devinfo->gen >= 8) { + if (type_sz(info.tx) > 4) + return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 8, 4, 12, 0, 0); + } else if (devinfo->is_haswell) { + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16, 0, 0); + } else { + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + } + + case BRW_OPCODE_MOV: + case BRW_OPCODE_CMP: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + if (devinfo->gen >= 11) { + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 10, 6, 14, 0, 0); + } else if (devinfo->gen >= 8) { + if (type_sz(info.tx) > 4) + return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 8, 4, 12, 0, 0); + } else if (devinfo->is_haswell) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16, 0, 0); + } else if (devinfo->gen >= 7) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 14, 10 /* XXX */, 20, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + } else { + return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0, + 0, 2 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX 
*/, 18 /* XXX */, + 0, 0); + } + + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_CSEL: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->gen >= 7) + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case BRW_OPCODE_MAD: + if (devinfo->gen >= 11) { + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + } else if (devinfo->gen >= 8) { + if (type_sz(info.tx) > 4) + return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + } else if (devinfo->is_haswell) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 16, 0, 0); + } else if (devinfo->gen >= 7) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 14, 10 /* XXX */, 20, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + } else if (devinfo->gen >= 6) { + return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */, + 0, 2 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + } else { + abort(); + } + + case BRW_OPCODE_F32TO16: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->gen >= 7) + return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case BRW_OPCODE_DP4: + case BRW_OPCODE_DPH: + case BRW_OPCODE_DP3: + case BRW_OPCODE_DP2: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + if (devinfo->gen >= 6) { + switch (info.op) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_em, -2, 4, 0, 0, 4, + 0, 16, 0, 0, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_em, 0, 2, 0, 0, 2, + 0, 12, 0, 0, 0, 0); + else + return calculate_desc(info, unit_em, 0, 2, 0, 0, 
2, + 0, 14, 0, 0, 0, 0); + + case SHADER_OPCODE_POW: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_em, -2, 4, 0, 0, 8, + 0, 24, 0, 0, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_em, 0, 2, 0, 0, 4, + 0, 20, 0, 0, 0, 0); + else + return calculate_desc(info, unit_em, 0, 2, 0, 0, 4, + 0, 22, 0, 0, 0, 0); + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return calculate_desc(info, unit_em, 2, 0, 0, 26, 0, + 0, 28 /* XXX */, 0, 0, 0, 0); + + default: + abort(); + } + } else { + switch (info.op) { + case SHADER_OPCODE_RCP: + return calculate_desc(info, unit_em, 2, 0, 0, 0, 8, + 0, 22, 0, 0, 0, 8); + + case SHADER_OPCODE_RSQ: + return calculate_desc(info, unit_em, 2, 0, 0, 0, 16, + 0, 44, 0, 0, 0, 8); + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_LOG2: + return calculate_desc(info, unit_em, 2, 0, 0, 0, 24, + 0, 66, 0, 0, 0, 8); + + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_EXP2: + return calculate_desc(info, unit_em, 2, 0, 0, 0, 32, + 0, 88, 0, 0, 0, 8); + + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return calculate_desc(info, unit_em, 2, 0, 0, 0, 48, + 0, 132, 0, 0, 0, 8); + + case SHADER_OPCODE_POW: + return calculate_desc(info, unit_em, 2, 0, 0, 0, 64, + 0, 176, 0, 0, 0, 8); + + default: + abort(); + } + } + + case BRW_OPCODE_DO: + if (devinfo->gen >= 6) + return calculate_desc(info, unit_null, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + else + return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + + case BRW_OPCODE_IF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + case FS_OPCODE_DISCARD_JUMP: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_null, 8, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_null, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + else + return calculate_desc(info, unit_null, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + + case FS_OPCODE_LINTERP: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + + case BRW_OPCODE_LRP: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->gen >= 6) + return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else if (devinfo->gen >= 7) + return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_MOV_INDIRECT: + if (devinfo->gen >= 11) + return 
calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else + return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + + case SHADER_OPCODE_BROADCAST: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->gen >= 7) + return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->gen >= 7) + return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_RND_MODE: + case SHADER_OPCODE_FLOAT_CONTROL_MODE: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->gen >= 6) + return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_SHUFFLE: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0, + 44 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0, + 42 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0, + 0, 44 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else if (devinfo->gen >= 6) + return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0, + 0, 46 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_SEL_EXEC: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else + return calculate_desc(info, 
unit_fpu, 12 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + + case SHADER_OPCODE_QUAD_SWIZZLE: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else + return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + + case FS_OPCODE_DDY_FINE: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, + 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0); + + case FS_OPCODE_LOAD_LIVE_CHANNELS: + if (devinfo->gen >= 11) + return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0, + 2 /* XXX */, 0, + 0, 0, 0, 10 /* XXX */, 0, 0); + else if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0, + 0, 2 /* XXX */, + 0, 0, 0, 8 /* XXX */, 0, 0); + else + abort(); + + case VEC4_OPCODE_PACK_BYTES: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else + return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else + abort(); + + case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: + case TCS_OPCODE_GET_INSTANCE_ID: + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + case TES_OPCODE_CREATE_INPUT_READ_HEADER: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0, + 6 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0, + 6 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else + return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0, + 6 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + + case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: + case TCS_OPCODE_CREATE_BARRIER_HEADER: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0, + 8 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0, + 8 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else if (devinfo->gen >= 6) + return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0, + 8 /* XXX */, 0, + 0, 
12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + else + abort(); + + case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: + if (devinfo->gen >= 8) + return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->is_haswell) + return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else if (devinfo->gen >= 7) + return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + else + abort(); + + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_GET_BUFFER_SIZE: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: + return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */, + 8 /* XXX */, 750 /* XXX */, 0, 0, + 2 /* XXX */, 0); + + case SHADER_OPCODE_URB_READ_SIMD8: + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: + case SHADER_OPCODE_URB_WRITE_SIMD8: + case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: + case VEC4_OPCODE_URB_READ: + case VS_OPCODE_URB_WRITE: + case GS_OPCODE_URB_WRITE: + case GS_OPCODE_URB_WRITE_ALLOCATE: + case GS_OPCODE_THREAD_END: + case GS_OPCODE_FF_SYNC: + case TCS_OPCODE_URB_WRITE: + case TCS_OPCODE_RELEASE_INPUT: + case TCS_OPCODE_THREAD_END: + return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */, + 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0); + + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_INTERLOCK: + if (devinfo->gen >= 7) + return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_GEN4_SCRATCH_READ: + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + case SHADER_OPCODE_GEN7_SCRATCH_READ: + return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + + case VEC4_OPCODE_UNTYPED_ATOMIC: + if (devinfo->gen >= 7) + return calculate_desc(info, unit_dp_dc, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 400 /* XXX */); + else + abort(); + + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: + if (devinfo->gen >= 7) + return calculate_desc(info, unit_dp_dc, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + else + abort(); + + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + case FS_OPCODE_REP_FB_WRITE: + return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); + + case GS_OPCODE_SVB_WRITE: + if (devinfo->gen >= 6) + return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0, + 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, + 0, 0); + else + abort(); + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: + return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + + case VS_OPCODE_PULL_CONSTANT_LOAD: + case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: + return 
calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16, + 8, 750, 0, 0, 2, 0); + + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + if (devinfo->gen >= 7) + return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0, + 0, 90 /* XXX */, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_BARRIER: + if (devinfo->gen >= 7) + return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0, + 0 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else + abort(); + + case CS_OPCODE_CS_TERMINATE: + if (devinfo->gen >= 7) + return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0, + 10 /* XXX */, 0, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_SEND: + switch (info.sfid) { + case GEN6_SFID_DATAPORT_RENDER_CACHE: + if (devinfo->gen >= 7) { + switch (brw_dp_desc_msg_type(devinfo, info.desc)) { + case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP: + return calculate_desc(info, unit_dp_rc, 2, 0, 0, + 30 /* XXX */, 450 /* XXX */, + 10 /* XXX */, 100 /* XXX */, + 0, 0, 0, 400 /* XXX */); + default: + return calculate_desc(info, unit_dp_rc, 2, 0, 0, + 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, + 0, 0); + } + } else if (devinfo->gen >= 6) { + return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0, + 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); + } else { + abort(); + } + case BRW_SFID_SAMPLER: { + if (devinfo->gen >= 6) + return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16, + 8, 750, 0, 0, 2, 0); + else + abort(); + } + case GEN7_SFID_DATAPORT_DATA_CACHE: + case HSW_SFID_DATAPORT_DATA_CACHE_1: + if (devinfo->gen >= 8 || devinfo->is_haswell) { + switch (brw_dp_desc_msg_type(devinfo, info.desc)) { + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: + return calculate_desc(info, unit_dp_dc, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 400 /* XXX */); + + default: + return calculate_desc(info, unit_dp_dc, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + } + } else if (devinfo->gen >= 7) { + switch (brw_dp_desc_msg_type(devinfo, info.desc)) { + case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP: + return calculate_desc(info, unit_dp_dc, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, + 0, 0, 0, 400 /* XXX */); + default: + return calculate_desc(info, unit_dp_dc, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + } + } else { + abort(); + } + default: + abort(); + } + + case SHADER_OPCODE_UNDEF: + case FS_OPCODE_PLACEHOLDER_HALT: + case FS_OPCODE_SCHEDULING_FENCE: + return calculate_desc(info, unit_null, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + + default: + abort(); + } + } + + /** + * Model the performance behavior of a stall on the specified dependency + * ID. + */ + void + stall_on_dependency(state &st, dependency_id id) + { + if (id < ARRAY_SIZE(st.dep_ready)) + st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe], + st.dep_ready[id]); + } + + /** + * Model the performance behavior of the front-end and back-end while + * executing an instruction with the specified timing information, assuming + * all dependencies are already clear. + */ + void + execute_instruction(state &st, const perf_desc &perf) + { + /* Compute the time at which the front-end will be ready to execute the + * next instruction. 
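+       *
+       * (The front-end is modeled as issuing strictly in order, so a
+       * stall against a busy back-end unit below also delays every
+       * subsequent instruction.)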
+ */ + st.unit_ready[unit_fe] += perf.df; + + if (perf.u < num_units) { + /* Wait for the back-end to be ready to execute this instruction. */ + st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe], + st.unit_ready[perf.u]); + + /* Compute the time at which the back-end will be ready to execute + * the next instruction, and update the back-end utilization. + */ + st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db; + st.unit_busy[perf.u] += perf.db * st.weight; + } + } + + /** + * Model the performance behavior of a read dependency provided by an + * instruction. + */ + void + mark_read_dependency(state &st, const perf_desc &perf, dependency_id id) + { + if (id < ARRAY_SIZE(st.dep_ready)) + st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls; + } + + /** + * Model the performance behavior of a write dependency provided by an + * instruction. + */ + void + mark_write_dependency(state &st, const perf_desc &perf, dependency_id id) + { + if (id >= dependency_id_accum0 && id < dependency_id_flag0) + st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la; + else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0) + st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf; + else if (id < ARRAY_SIZE(st.dep_ready)) + st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld; + } + + /** + * Return the dependency ID of a backend_reg, offset by \p delta GRFs. + */ + dependency_id + reg_dependency_id(const gen_device_info *devinfo, const backend_reg &r, + const int delta) + { + if (r.file == VGRF) { + const unsigned i = r.nr + r.offset / REG_SIZE + delta; + assert(i < dependency_id_mrf0 - dependency_id_grf0); + return dependency_id(dependency_id_grf0 + i); + + } else if (r.file == FIXED_GRF) { + const unsigned i = r.nr + delta; + assert(i < dependency_id_mrf0 - dependency_id_grf0); + return dependency_id(dependency_id_grf0 + i); + + } else if (r.file == MRF && devinfo->gen >= 7) { + const unsigned i = GEN7_MRF_HACK_START + + r.nr + r.offset / REG_SIZE + delta; + assert(i < dependency_id_mrf0 - dependency_id_grf0); + return dependency_id(dependency_id_grf0 + i); + + } else if (r.file == MRF && devinfo->gen < 7) { + const unsigned i = (r.nr & ~BRW_MRF_COMPR4) + + r.offset / REG_SIZE + delta; + assert(i < dependency_id_addr0 - dependency_id_mrf0); + return dependency_id(dependency_id_mrf0 + i); + + } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS && + r.nr < BRW_ARF_ACCUMULATOR) { + assert(delta == 0); + return dependency_id_addr0; + + } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR && + r.nr < BRW_ARF_FLAG) { + const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta; + assert(i < dependency_id_flag0 - dependency_id_accum0); + return dependency_id(dependency_id_accum0 + i); + + } else { + return num_dependency_ids; + } + } + + /** + * Return the dependency ID of flag register starting at offset \p i. + */ + dependency_id + flag_dependency_id(unsigned i) + { + assert(i < dependency_id_sbid_wr0 - dependency_id_flag0); + return dependency_id(dependency_id_flag0 + i); + } + + /** + * Return the dependency ID corresponding to the SBID read completion + * condition of a Gen12+ SWSB. + */ + dependency_id + tgl_swsb_rd_dependency_id(tgl_swsb swsb) + { + if (swsb.mode) { + assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0); + return dependency_id(dependency_id_sbid_rd0 + swsb.sbid); + } else { + return num_dependency_ids; + } + } + + /** + * Return the dependency ID corresponding to the SBID write completion + * condition of a Gen12+ SWSB. 
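+    * (This is the ID a TGL_SBID_DST consumer stalls on, while a
+    * TGL_SBID_SRC consumer stalls on the read-completion ID returned
+    * above instead.)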
+    */
+   dependency_id
+   tgl_swsb_wr_dependency_id(tgl_swsb swsb)
+   {
+      if (swsb.mode) {
+         assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0);
+         return dependency_id(dependency_id_sbid_wr0 + swsb.sbid);
+      } else {
+         return num_dependency_ids;
+      }
+   }
+
+   /**
+    * Return the implicit accumulator register accessed by channel \p i of the
+    * instruction.
+    */
+   unsigned
+   accum_reg_of_channel(const gen_device_info *devinfo,
+                        const backend_instruction *inst,
+                        brw_reg_type tx, unsigned i)
+   {
+      assert(inst->reads_accumulator_implicitly() ||
+             inst->writes_accumulator_implicitly(devinfo));
+      const unsigned offset = (inst->group + i) * type_sz(tx) *
+         (devinfo->gen < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
+      return offset / REG_SIZE % 2;
+   }
+
+   /**
+    * Model the performance behavior of an FS back-end instruction.
+    */
+   void
+   issue_fs_inst(state &st, const gen_device_info *devinfo,
+                 const backend_instruction *be_inst)
+   {
+      const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
+      const instruction_info info(devinfo, inst);
+      const perf_desc perf = instruction_desc(info);
+
+      /* Stall on any source dependencies. */
+      for (unsigned i = 0; i < inst->sources; i++) {
+         for (unsigned j = 0; j < regs_read(inst, i); j++)
+            stall_on_dependency(
+               st, reg_dependency_id(devinfo, inst->src[i], j));
+      }
+
+      if (inst->reads_accumulator_implicitly()) {
+         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
+              j <= accum_reg_of_channel(devinfo, inst, info.tx,
+                                        inst->exec_size - 1); j++)
+            stall_on_dependency(
+               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
+      }
+
+      if (is_send(inst) && inst->base_mrf != -1) {
+         for (unsigned j = 0; j < inst->mlen; j++)
+            stall_on_dependency(
+               st, reg_dependency_id(
+                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
+      }
+
+      if (const unsigned mask = inst->flags_read(devinfo)) {
+         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
+            if (mask & (1 << i))
+               stall_on_dependency(st, flag_dependency_id(i));
+         }
+      }
+
+      /* Stall on any write dependencies. */
+      if (!inst->no_dd_check) {
+         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
+            for (unsigned j = 0; j < regs_written(inst); j++)
+               stall_on_dependency(
+                  st, reg_dependency_id(devinfo, inst->dst, j));
+         }
+
+         if (inst->writes_accumulator_implicitly(devinfo)) {
+            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
+                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
+                                           inst->exec_size - 1); j++)
+               stall_on_dependency(
+                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
+         }
+
+         if (const unsigned mask = inst->flags_written()) {
+            for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
+               if (mask & (1 << i))
+                  stall_on_dependency(st, flag_dependency_id(i));
+            }
+         }
+      }
+
+      /* Stall on any SBID dependencies. */
+      if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
+         stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
+      else if (inst->sched.mode & TGL_SBID_SRC)
+         stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
+
+      /* Execute the instruction. */
+      execute_instruction(st, perf);
+
+      /* Mark any source dependencies.
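+       * (Only send-like instructions keep reading their payload after
+       * issue, so they are the only ones modeled as delaying subsequent
+       * overwrites of their source registers.)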
+       */
+      if (inst->is_send_from_grf()) {
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (inst->is_payload(i)) {
+               for (unsigned j = 0; j < regs_read(inst, i); j++)
+                  mark_read_dependency(
+                     st, perf, reg_dependency_id(devinfo, inst->src[i], j));
+            }
+         }
+      }
+
+      if (is_send(inst) && inst->base_mrf != -1) {
+         for (unsigned j = 0; j < inst->mlen; j++)
+            mark_read_dependency(st, perf,
+               reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
+      }
+
+      /* Mark any destination dependencies. */
+      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
+         for (unsigned j = 0; j < regs_written(inst); j++) {
+            mark_write_dependency(st, perf,
+                                  reg_dependency_id(devinfo, inst->dst, j));
+         }
+      }
+
+      if (inst->writes_accumulator_implicitly(devinfo)) {
+         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
+              j <= accum_reg_of_channel(devinfo, inst, info.tx,
+                                        inst->exec_size - 1); j++)
+            mark_write_dependency(st, perf,
+                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
+      }
+
+      if (const unsigned mask = inst->flags_written()) {
+         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
+            if (mask & (1 << i))
+               mark_write_dependency(st, perf, flag_dependency_id(i));
+         }
+      }
+
+      /* Mark any SBID dependencies. */
+      if (inst->sched.mode & TGL_SBID_SET) {
+         mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
+         mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
+      }
+   }
+
+   /**
+    * Model the performance behavior of a VEC4 back-end instruction.
+    */
+   void
+   issue_vec4_instruction(state &st, const gen_device_info *devinfo,
+                          const backend_instruction *be_inst)
+   {
+      const vec4_instruction *inst =
+         static_cast<const vec4_instruction *>(be_inst);
+      const instruction_info info(devinfo, inst);
+      const perf_desc perf = instruction_desc(info);
+
+      /* Stall on any source dependencies. */
+      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
+         for (unsigned j = 0; j < regs_read(inst, i); j++)
+            stall_on_dependency(
+               st, reg_dependency_id(devinfo, inst->src[i], j));
+      }
+
+      if (inst->reads_accumulator_implicitly()) {
+         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
+              j <= accum_reg_of_channel(devinfo, inst, info.tx,
+                                        inst->exec_size - 1); j++)
+            stall_on_dependency(
+               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
+      }
+
+      if (inst->base_mrf != -1) {
+         for (unsigned j = 0; j < inst->mlen; j++)
+            stall_on_dependency(
+               st, reg_dependency_id(
+                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
+      }
+
+      if (inst->reads_flag())
+         stall_on_dependency(st, dependency_id_flag0);
+
+      /* Stall on any write dependencies. */
+      if (!inst->no_dd_check) {
+         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
+            for (unsigned j = 0; j < regs_written(inst); j++)
+               stall_on_dependency(
+                  st, reg_dependency_id(devinfo, inst->dst, j));
+         }
+
+         if (inst->writes_accumulator_implicitly(devinfo)) {
+            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
+                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
+                                           inst->exec_size - 1); j++)
+               stall_on_dependency(
+                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
+         }
+
+         if (inst->writes_flag())
+            stall_on_dependency(st, dependency_id_flag0);
+      }
+
+      /* Execute the instruction. */
+      execute_instruction(st, perf);
+
+      /* Mark any source dependencies.
*/ + if (inst->is_send_from_grf()) { + for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + mark_read_dependency( + st, perf, reg_dependency_id(devinfo, inst->src[i], j)); + } + } + + if (inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + mark_read_dependency(st, perf, + reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); + } + + /* Mark any destination dependencies. */ + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) { + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, inst->dst, j)); + } + } + + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (inst->writes_flag()) + mark_write_dependency(st, perf, dependency_id_flag0); + } + + /** + * Calculate the maximum possible throughput of the program compatible with + * the cycle-count utilization estimated for each asynchronous unit, in + * threads-per-cycle units. + */ + float + calculate_thread_throughput(const state &st, float busy) + { + for (unsigned i = 0; i < num_units; i++) + busy = MAX2(busy, st.unit_busy[i]); + + return 1.0 / busy; + } + + /** + * Estimate the performance of the specified shader. + */ + void + calculate_performance(performance &p, const backend_shader *s, + void (*issue_instruction)( + state &, const gen_device_info *, + const backend_instruction *), + unsigned dispatch_width) + { + /* XXX - Plumbing the trip counts from NIR loop analysis would allow us + * to do a better job regarding the loop weights. And some branch + * divergence analysis would allow us to do a better job with + * branching weights. + * + * In the meantime use values that roughly match the control flow + * weights used elsewhere in the compiler back-end -- Main + * difference is the worst-case scenario branch_weight used for + * SIMD32 which accounts for the possibility of a dynamically + * uniform branch becoming divergent in SIMD32. + */ + const float branch_weight = (dispatch_width > 16 ? 
1.0 : 0.5); + const float loop_weight = 10; + unsigned elapsed = 0; + state st; + + foreach_block(block, s->cfg) { + const unsigned elapsed0 = elapsed; + + foreach_inst_in_block(backend_instruction, inst, block) { + const unsigned clock0 = st.unit_ready[unit_fe]; + + issue_instruction(st, s->devinfo, inst); + + if (inst->opcode == BRW_OPCODE_ENDIF) + st.weight /= branch_weight; + + elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight; + + if (inst->opcode == BRW_OPCODE_IF) + st.weight *= branch_weight; + else if (inst->opcode == BRW_OPCODE_DO) + st.weight *= loop_weight; + else if (inst->opcode == BRW_OPCODE_WHILE) + st.weight /= loop_weight; + } + + p.block_latency[block->num] = elapsed - elapsed0; + } + + p.latency = elapsed; + p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed); + } +} + +brw::performance::performance(const fs_visitor *v) : + block_latency(new unsigned[v->cfg->num_blocks]) +{ + calculate_performance(*this, v, issue_fs_inst, v->dispatch_width); +} + +brw::performance::performance(const vec4_visitor *v) : + block_latency(new unsigned[v->cfg->num_blocks]) +{ + calculate_performance(*this, v, issue_vec4_instruction, 8); +} + +brw::performance::~performance() +{ + delete[] block_latency; +} diff --git a/src/intel/compiler/brw_ir_performance.h b/src/intel/compiler/brw_ir_performance.h new file mode 100644 index 00000000000..c3cefe838aa --- /dev/null +++ b/src/intel/compiler/brw_ir_performance.h @@ -0,0 +1,86 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_PERFORMANCE_H +#define BRW_IR_PERFORMANCE_H + +class fs_visitor; + +namespace brw { + class vec4_visitor; + + /** + * Various estimates of the performance of a shader based on static + * analysis. + */ + struct performance { + performance(const fs_visitor *v); + performance(const vec4_visitor *v); + ~performance(); + + analysis_dependency_class + dependency_class() const + { + return (DEPENDENCY_INSTRUCTIONS | + DEPENDENCY_BLOCKS); + } + + bool + validate(const backend_shader *) const + { + return true; + } + + /** + * Array containing estimates of the runtime of each basic block of the + * program in cycle units. + */ + unsigned *block_latency; + + /** + * Estimate of the runtime of the whole program in cycle units assuming + * uncontended execution. 
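+       *
+       * (By construction this equals the sum of block_latency over all
+       * basic blocks of the program.)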
+ */ + unsigned latency; + + /** + * Estimate of the throughput of the whole program in + * invocations-per-cycle units. + * + * Note that this might be lower than the ratio between the dispatch + * width of the program and its latency estimate in cases where + * performance doesn't scale without limits as a function of its thread + * parallelism, e.g. due to the existence of a bottleneck in a shared + * function. + */ + float throughput; + + private: + performance(const performance &perf); + performance & + operator=(performance u); + }; +} + +#endif diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h index 1f2d922b186..aa93b05d5af 100644 --- a/src/intel/compiler/brw_vec4.h +++ b/src/intel/compiler/brw_vec4.h @@ -28,6 +28,7 @@ #ifdef __cplusplus #include "brw_ir_vec4.h" +#include "brw_ir_performance.h" #include "brw_vec4_builder.h" #include "brw_vec4_live_variables.h" #endif @@ -107,6 +108,8 @@ public: unsigned int max_grf; BRW_ANALYSIS(live_analysis, brw::vec4_live_variables, backend_shader *) live_analysis; + BRW_ANALYSIS(performance_analysis, brw::performance, + vec4_visitor *) performance_analysis; bool need_all_constants_in_pull_buffer; diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp index da8af709600..f18fd9e38ee 100644 --- a/src/intel/compiler/brw_vec4_visitor.cpp +++ b/src/intel/compiler/brw_vec4_visitor.cpp @@ -1841,7 +1841,7 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, prog_data(prog_data), fail_msg(NULL), first_non_payload_grf(0), - live_analysis(this), + live_analysis(this), performance_analysis(this), need_all_constants_in_pull_buffer(false), no_spills(no_spills), shader_time_index(shader_time_index), diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index af5661e9021..6a609869405 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -72,6 +72,8 @@ libintel_compiler_files = files( 'brw_ir_allocator.h', 'brw_ir_analysis.h', 'brw_ir_fs.h', + 'brw_ir_performance.h', + 'brw_ir_performance.cpp', 'brw_ir_vec4.h', 'brw_nir.h', 'brw_nir.c',