mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-22 13:30:12 +01:00
Borderlands 3 (both DX11 and DX12 renderers) has a common pattern across many shaders:

    con 32x4 %510 = (uint32)txf %2 (handle), %1191 (0x10) (coord), %1 (0x0) (lod), 0 (texture)
    con 32x4 %512 = (uint32)txf %2 (handle), %1511 (0x11) (coord), %1 (0x0) (lod), 0 (texture)
    ...
    con 32x4 %550 = (uint32)txf %2 (handle), %1549 (0x25) (coord), %1 (0x0) (lod), 0 (texture)
    con 32x4 %552 = (uint32)txf %2 (handle), %1551 (0x26) (coord), %1 (0x0) (lod), 0 (texture)

A single basic block contains piles of texelFetches from a 1D buffer texture, with constant coordinates. In most cases, only the .x channel of the result is read. So we have something on the order of 28 sampler messages, each asking for...a single uint32_t scalar value. Because our sampler doesn't have any support for convergent block loads (like the untyped LSC transpose messages for SSBOs)...this means we were emitting SIMD8/16 (or SIMD16/32 on Xe2) sampler messages for every single scalar, replicating what's effectively a SIMD1 value to the entire register. This is hugely wasteful, both in terms of register pressure, and also in back-and-forth sending and receiving memory messages.

The good news is we can take advantage of our explicit SIMD model to handle this more efficiently. This patch adds a new optimization pass that detects a series of SHADER_OPCODE_TXF_LOGICAL, in the same basic block, with constant offsets, from the same texture. It constructs a new divergent coordinate where each channel is one of the constants (i.e., <0x10, 0x11, 0x12, ..., 0x26> in the above example). It issues a new NoMask divergent texel fetch which loads N useful channels in one go, and replaces the rest with expansion MOVs that splat the SIMD1 result back to the full SIMD width. (These get copy propagated away.)

We can pick the SIMD size of the load independently of the native shader width as well. On Xe2, those 28 convergent loads become a single SIMD32 ld message. On earlier hardware, we use 2 SIMD16 messages. Or we can use a smaller size when there aren't many to combine.

In fossil-db, this cuts 27% of send messages in affected shaders, 3-6% of cycles, 2-3% of instructions, and 8-12% of live registers. On A770, this improves performance of Borderlands 3 by roughly 2.5-3.5%.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32573>
319 lines
8.4 KiB
Meson
319 lines
8.4 KiB
Meson
# Copyright © 2017 Intel Corporation
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
# Sources and headers whose names start with intel_nir. This file list is
# one of the inputs of the 'intel_compiler' static_library() target defined
# later in this file.
intel_nir_files = files(
|
|
'intel_nir.h',
|
|
'intel_nir.c',
|
|
|
|
'intel_nir_blockify_uniform_loads.c',
|
|
'intel_nir_clamp_image_1d_2d_array_sizes.c',
|
|
'intel_nir_clamp_per_vertex_loads.c',
|
|
'intel_nir_lower_conversions.c',
|
|
'intel_nir_lower_non_uniform_barycentric_at_sample.c',
|
|
'intel_nir_lower_non_uniform_resource_intel.c',
|
|
'intel_nir_lower_printf.c',
|
|
'intel_nir_lower_shading_rate_output.c',
|
|
'intel_nir_lower_sparse.c',
|
|
'intel_nir_lower_texture.c',
|
|
'intel_nir_opt_peephole_ffma.c',
|
|
'intel_nir_opt_peephole_imul32x16.c',
|
|
'intel_nir_tcs_workarounds.c',
|
|
)
|
|
|
|
# Hand-written C/C++ sources and headers prefixed brw_. Together with
# intel_nir_files and the generated sources declared further down, this list
# is compiled into the 'intel_compiler' static library.
libintel_compiler_brw_files = files(
|
|
'brw_cfg.cpp',
|
|
'brw_cfg.h',
|
|
'brw_compile_bs.cpp',
|
|
'brw_compile_cs.cpp',
|
|
'brw_compile_fs.cpp',
|
|
'brw_compile_gs.cpp',
|
|
'brw_compile_mesh.cpp',
|
|
'brw_compile_tcs.cpp',
|
|
'brw_compile_tes.cpp',
|
|
'brw_compile_vs.cpp',
|
|
'brw_compiler.c',
|
|
'brw_compiler.h',
|
|
'brw_debug_recompile.c',
|
|
'brw_def_analysis.cpp',
|
|
'brw_disasm.c',
|
|
'brw_disasm_info.cpp',
|
|
'brw_disasm_info.h',
|
|
'brw_eu.c',
|
|
'brw_eu_compact.c',
|
|
'brw_eu_defines.h',
|
|
'brw_eu_emit.c',
|
|
'brw_eu.h',
|
|
'brw_eu_validate.c',
|
|
'brw_fs_bank_conflicts.cpp',
|
|
'brw_fs_builder.h',
|
|
'brw_fs_cmod_propagation.cpp',
|
|
'brw_fs_combine_constants.cpp',
|
|
'brw_fs_copy_propagation.cpp',
|
|
'brw_fs.cpp',
|
|
'brw_fs_cse.cpp',
|
|
'brw_fs_dead_code_eliminate.cpp',
|
|
'brw_fs_generator.cpp',
|
|
'brw_fs.h',
|
|
'brw_fs_live_variables.cpp',
|
|
'brw_fs_live_variables.h',
|
|
'brw_fs_lower.cpp',
|
|
'brw_fs_lower_dpas.cpp',
|
|
'brw_fs_lower_integer_multiplication.cpp',
|
|
'brw_fs_lower_pack.cpp',
|
|
'brw_fs_lower_regioning.cpp',
|
|
'brw_fs_lower_simd_width.cpp',
|
|
'brw_fs_nir.cpp',
|
|
'brw_fs_opt.cpp',
|
|
'brw_fs_opt_algebraic.cpp',
|
|
'brw_fs_opt_virtual_grfs.cpp',
|
|
'brw_fs_reg_allocate.cpp',
|
|
'brw_fs_register_coalesce.cpp',
|
|
'brw_fs_saturate_propagation.cpp',
|
|
'brw_fs_scoreboard.cpp',
|
|
'brw_fs_thread_payload.cpp',
|
|
'brw_fs_validate.cpp',
|
|
'brw_fs_visitor.cpp',
|
|
'brw_fs_workaround.cpp',
|
|
'brw_inst.h',
|
|
'brw_ir.h',
|
|
'brw_ir_allocator.h',
|
|
'brw_ir_analysis.h',
|
|
'brw_ir_fs.h',
|
|
'brw_ir_performance.h',
|
|
'brw_ir_performance.cpp',
|
|
'brw_isa_info.h',
|
|
'brw_lower_logical_sends.cpp',
|
|
'brw_lower_subgroup_ops.cpp',
|
|
'brw_nir.h',
|
|
'brw_nir.c',
|
|
'brw_nir_analyze_ubo_ranges.c',
|
|
'brw_nir_lower_cooperative_matrix.c',
|
|
'brw_nir_lower_cs_intrinsics.c',
|
|
'brw_nir_lower_alpha_to_coverage.c',
|
|
'brw_nir_lower_intersection_shader.c',
|
|
'brw_nir_lower_ray_queries.c',
|
|
'brw_nir_lower_rt_intrinsics.c',
|
|
'brw_nir_lower_shader_calls.c',
|
|
'brw_nir_lower_storage_image.c',
|
|
'brw_nir_opt_fsat.c',
|
|
'brw_nir_rt.h',
|
|
'brw_nir_rt.c',
|
|
'brw_nir_rt_builder.h',
|
|
'brw_opt_txf_combiner.cpp',
|
|
'brw_packed_float.c',
|
|
'brw_print.cpp',
|
|
'brw_prim.h',
|
|
'brw_private.h',
|
|
'brw_reg.h',
|
|
'brw_reg_type.c',
|
|
'brw_reg_type.h',
|
|
'brw_rt.h',
|
|
'brw_schedule_instructions.cpp',
|
|
'brw_shader.cpp',
|
|
'brw_simd_selection.cpp',
|
|
'brw_vue_map.c',
|
|
)
|
|
|
|
# Generated source brw_device_sha1_gen.c. The generator script is the first
# input (@INPUT0@) and is run with --outdir pointing at the current build
# directory. ../dev/intel_device_info.py is listed as a second input and is
# not passed on the command line (presumably it is imported by the generator
# script -- confirm).
brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c',
|
|
input : ['brw_device_sha1_gen_c.py', '../dev/intel_device_info.py'],
|
|
output : ['brw_device_sha1_gen.c'],
|
|
command : [prog_python, '@INPUT0@', '--outdir', meson.current_build_dir()])
|
|
|
|
|
|
# Generated source brw_nir_lower_fsign.c, produced by running
# brw_nir_lower_fsign.py with '-p dir_compiler_nir'. Because capture is
# true, the script's stdout is what gets written to the output file.
# depend_files makes the target depend on nir_algebraic_depends as well
# (presumably the nir_algebraic Python module the script imports -- confirm).
brw_nir_lower_fsign = custom_target(
|
|
'brw_nir_lower_fsign.c',
|
|
input : 'brw_nir_lower_fsign.py',
|
|
output : 'brw_nir_lower_fsign.c',
|
|
command : [
|
|
prog_python, '@INPUT@', '-p', dir_compiler_nir,
|
|
],
|
|
depend_files : nir_algebraic_depends,
|
|
capture : true,
|
|
)
|
|
|
|
# Generated source brw_nir_trig_workarounds.c, produced by running
# brw_nir_trig_workarounds.py with '-p dir_compiler_nir'. Because capture is
# true, the script's stdout is what gets written to the output file.
# depend_files makes the target depend on nir_algebraic_depends as well.
brw_nir_trig = custom_target(
|
|
'brw_nir_trig_workarounds.c',
|
|
input : 'brw_nir_trig_workarounds.py',
|
|
output : 'brw_nir_trig_workarounds.c',
|
|
command : [
|
|
prog_python, '@INPUT@', '-p', dir_compiler_nir,
|
|
],
|
|
depend_files : nir_algebraic_depends,
|
|
capture : true,
|
|
)
|
|
|
|
# The 'intel_compiler' static library. Its sources are the two file lists
# above plus the generated targets (brw_nir_lower_fsign, brw_nir_trig,
# brw_device_sha1_gen_src) and ir_expression_operation_h, which is defined
# outside this file. Symbols default to hidden visibility, and
# build_by_default : false means the library is only built when another
# target needs it.
libintel_compiler_brw = static_library(
|
|
'intel_compiler',
|
|
[libintel_compiler_brw_files, intel_nir_files, brw_nir_lower_fsign, brw_nir_trig, ir_expression_operation_h, [brw_device_sha1_gen_src]],
|
|
include_directories : [inc_include, inc_src, inc_intel],
|
|
c_args : [no_override_init_args],
|
|
gnu_symbol_visibility : 'hidden',
|
|
dependencies : [idep_nir_headers, idep_mesautil, idep_intel_dev],
|
|
build_by_default : false,
|
|
)
|
|
|
|
# Dependency object other targets use to consume the compiler: it links
# against libintel_compiler_brw and forwards idep_nir and idep_mesautil to
# the consumer.
idep_intel_compiler_brw = declare_dependency(
|
|
link_with : [libintel_compiler_brw],
|
|
dependencies : [
|
|
idep_nir,
|
|
idep_mesautil,
|
|
],
|
|
)
|
|
|
|
# For now this tool is only going to be used by Anv
|
|
# Two ways of obtaining intel_clc:
#  - option 'intel-clc' == 'system': look up an already installed intel_clc
#    and leave dep_prog_intel_clc empty;
#  - otherwise, if with_intel_clc is set: build intel_clc here and put the
#    built executable into dep_prog_intel_clc.
# When neither condition holds, this block defines neither prog_intel_clc
# nor dep_prog_intel_clc.
if get_option('intel-clc') == 'system'
|
|
prog_intel_clc = find_program('intel_clc', native : true)
|
|
dep_prog_intel_clc = []
|
|
elif with_intel_clc
|
|
prog_intel_clc = executable(
|
|
'intel_clc',
|
|
[
|
|
'intel_clc.c',
|
|
'brw_kernel.c',
|
|
|
|
# Use just the nir_options part of ELK instead of fully linking.
|
|
'elk/elk_nir_options.h',
|
|
'elk/elk_nir_options.c',
|
|
],
|
|
link_with : [libisl],
|
|
include_directories : [inc_include, inc_src, inc_intel],
|
|
c_args : [pre_args, no_override_init_args],
|
|
link_args : [ld_args_build_id],
|
|
dependencies : [idep_nir, idep_vtn, idep_mesaclc, idep_mesautil, idep_intel_dev,
|
|
idep_intel_compiler_brw],
|
|
# If we can run host binaries directly, just build intel_clc for the host.
|
|
# Most commonly this happens when doing a cross compile from an x86_64 build
|
|
# machine to an x86 host
|
|
native : not meson.can_run_host_binaries(),
|
|
install : get_option('install-intel-clc'),
|
|
)
|
|
dep_prog_intel_clc = [prog_intel_clc]
|
|
endif
|
|
|
|
# Unit tests, configured only when with_tests is set. One executable,
# 'intel_compiler_brw_tests', is built from the test_*.cpp files listed
# below and registered as a test using the 'gtest' protocol in the 'intel'
# suite.
if with_tests
|
|
test(
|
|
'intel_compiler_brw_tests',
|
|
executable(
|
|
'intel_compiler_brw_tests',
|
|
files(
|
|
'test_eu_compact.cpp',
|
|
'test_eu_validate.cpp',
|
|
'test_fs_cmod_propagation.cpp',
|
|
'test_fs_combine_constants.cpp',
|
|
'test_fs_copy_propagation.cpp',
|
|
'test_fs_cse.cpp',
|
|
'test_fs_saturate_propagation.cpp',
|
|
'test_fs_scoreboard.cpp',
|
|
'test_simd_selection.cpp',
|
|
'test_vf_float_conversions.cpp',
|
|
),
|
|
ir_expression_operation_h,
|
|
include_directories : [inc_include, inc_src, inc_intel],
|
|
link_with : libisl,
|
|
dependencies : [idep_gtest, idep_nir, idep_mesautil, idep_intel_dev,
|
|
idep_intel_compiler_brw],
|
|
),
|
|
suite : ['intel'],
|
|
protocol : 'gtest',
|
|
)
|
|
endif
|
|
|
|
if with_intel_tools
|
|
|
|
# Command line used to generate the assembler's parser. Both branches run
# prog_bison and write the C source to @OUTPUT0@ and the header to
# @OUTPUT1@; only the option spelling differs, selected by yacc_is_bison
# (the else branch presumably targets a non-bison yacc -- confirm).
bison_command = []
|
|
if yacc_is_bison
|
|
bison_command = [
|
|
prog_bison, '@INPUT@', '--defines=@OUTPUT1@',
|
|
'--output=@OUTPUT0@'
|
|
]
|
|
else
|
|
bison_command = [
|
|
prog_bison, '-H', '@OUTPUT1@',
|
|
'-o', '@OUTPUT0@', '@INPUT@'
|
|
]
|
|
endif
|
|
|
|
# Run bison_command on brw_gram.y. Output index 0 is brw_gram.tab.c and
# index 1 is brw_gram.tab.h.
brw_gram_tab = custom_target(
|
|
'brw_gram.tab.[ch]',
|
|
input : 'brw_gram.y',
|
|
output : ['brw_gram.tab.c', 'brw_gram.tab.h'],
|
|
command : bison_command
|
|
)
|
|
|
|
# Run prog_flex on brw_lex.l to generate the scanner source brw_lex.yy.c.
brw_lex_yy_c = custom_target(
|
|
'brw_lex.yy.c',
|
|
input : 'brw_lex.l',
|
|
output : 'brw_lex.yy.c',
|
|
command : [prog_flex, '-o', '@OUTPUT@', '@INPUT@']
|
|
)
|
|
|
|
# Dependencies shared by the brw_asm static library and by the
# idep_brw_asm dependency object that wraps it.
brw_asm_deps = [
|
|
dep_thread,
|
|
idep_intel_compiler_brw,
|
|
idep_intel_dev,
|
|
idep_mesautil,
|
|
]
|
|
|
|
# Static library 'brw_asm', built from brw_asm.c plus the generated parser
# (both outputs of brw_gram_tab) and the generated scanner. Symbols default
# to hidden visibility; it is not built unless another target needs it.
brw_asm = static_library(
|
|
'brw_asm',
|
|
['brw_asm.c', brw_gram_tab[0], brw_gram_tab[1], brw_lex_yy_c],
|
|
dependencies : brw_asm_deps,
|
|
include_directories : [inc_include, inc_src, inc_intel],
|
|
c_args : [no_override_init_args],
|
|
gnu_symbol_visibility : 'hidden',
|
|
build_by_default : false,
|
|
)
|
|
|
|
# Dependency object for consumers of the brw_asm library: links against it
# and forwards the same dependency list the library itself was built with.
idep_brw_asm = declare_dependency(
|
|
link_with : brw_asm,
|
|
dependencies : brw_asm_deps,
|
|
)
|
|
|
|
# Installed command-line executable 'brw_asm', built from brw_asm_tool.c on
# top of idep_brw_asm. It is also the binary handed to the test runner via
# --brw_asm in the foreach loop further down.
brw_asm_tool = executable(
|
|
'brw_asm',
|
|
['brw_asm_tool.c'],
|
|
dependencies : idep_brw_asm,
|
|
include_directories : [inc_include, inc_src, inc_intel],
|
|
c_args : [no_override_init_args],
|
|
gnu_symbol_visibility : 'hidden',
|
|
install : true
|
|
)
|
|
|
|
# [gen_name, gen_num] pairs consumed by the foreach loop below. Element 0 is
# passed to the test runner as --gen_name; element 1 names the test and,
# with 'gfx' replaced by 'gen', selects the folder under tests/.
asm_testcases = [
|
|
['skl', 'gfx9'],
|
|
['icl', 'gfx11'],
|
|
['tgl', 'gfx12'],
|
|
['dg2', 'gfx12.5'],
|
|
]
|
|
|
|
# Register one test per entry of asm_testcases. Each test runs
# tests/run-test.py with the built brw_asm executable, the generation name,
# and the source-tree folder tests/<gen_num with 'gfx' replaced by 'gen'>
# (e.g. 'gfx9' -> tests/gen9). The test is named 'brw_asm_' + gen_num and
# placed in the 'intel' suite.
test_runner = find_program('tests/run-test.py')
|
|
foreach testcase : asm_testcases
|
|
_gen_name = testcase[0]
|
|
_gen_num = testcase[1]
|
|
_gen_folder = join_paths(meson.current_source_dir(), 'tests',
|
|
_gen_num.replace('gfx', 'gen'))
|
|
test(
|
|
'brw_asm_' + _gen_num, test_runner,
|
|
args : [
|
|
'--brw_asm', brw_asm_tool,
|
|
'--gen_name', _gen_name,
|
|
'--gen_folder', _gen_folder,
|
|
],
|
|
suite : 'intel',
|
|
)
|
|
endforeach
|
|
|
|
# Installed command-line executable 'brw_disasm', built from
# brw_disasm_tool.c and linked against the compiler through
# idep_intel_compiler_brw.
brw_disasm_tool = executable(
|
|
'brw_disasm',
|
|
files('brw_disasm_tool.c'),
|
|
dependencies : [idep_mesautil, dep_thread, idep_intel_dev,
|
|
idep_intel_compiler_brw],
|
|
include_directories : [inc_include, inc_src, inc_intel],
|
|
c_args : [no_override_init_args],
|
|
gnu_symbol_visibility : 'hidden',
|
|
install : true
|
|
)
|
|
|
|
endif
|
|
|
|
# Process the meson.build in the elk/ subdirectory. This is done
# unconditionally, after everything above has been defined.
subdir('elk')
|