From c6d44284aa633569a58200d00015b3e6d80a465a Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 2 Aug 2023 13:36:33 -0700 Subject: [PATCH] intel/dev: Enable VK_KHR_cooperative_matrix on all Gfx9+ GPUs Gfx12.5 (DG2) will use DPAS instructions to accelerate the implementation. Earlier platforms will use equivalent discrete instructions (basically subgroup operations). Gfx12 (Tigerlake) will use DP4A for 8-bit integer matrix multiplication. Older platforms, which lack DP4A, will use a suboptimal instruction sequence. There is plenty of room for improvement here. On DG2 (Gfx12.5) gets the following results from the CTS: Test run totals: Passed: 1642/13982 (11.7%) Failed: 0/13982 (0.0%) Not supported: 12340/13982 (88.3%) Warnings: 0/13982 (0.0%) Waived: 0/13982 (0.0%) On DG2 (Gfx12.5) with forced lowering, Raptor Lake (Gfx12) and Ice Lake (Gfx11): Test run totals: Passed: 1662/13982 (11.9%) Failed: 0/13982 (0.0%) Not supported: 12320/13982 (88.1%) Warnings: 0/13982 (0.0%) Waived: 0/13982 (0.0%) The difference in the number of tests run is due to saturatingAccumulation not being set on DG2 when DPAS is used. There is a comment in "intel/dev: Advertise integer configs with saturatingAccumulation too" that explains how this could be added should the need arise. v2: Prefix type names with INTEL_CMAT_. Suggested by Lionel. Reviewed-by: Caio Oliveira Part-of: --- src/intel/dev/intel_device_info.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/intel/dev/intel_device_info.c b/src/intel/dev/intel_device_info.c index 2da815e46d1..1ed313d71a2 100644 --- a/src/intel/dev/intel_device_info.c +++ b/src/intel/dev/intel_device_info.c @@ -614,7 +614,13 @@ static const struct intel_device_info intel_device_info_chv = { GFX8_FEATURES, \ GFX9_HW_INFO, \ .has_sample_with_hiz = true, \ - .has_illegal_ccs_values = true + .has_illegal_ccs_values = true, \ + .cooperative_matrix_configurations = { \ + { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \ + { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \ + { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \ + { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \ + } static const struct intel_device_info intel_device_info_skl_gt1 = { GFX9_FEATURES, .gt = 1, @@ -840,7 +846,13 @@ static const struct intel_device_info intel_device_info_cfl_gt3 = { .has_illegal_ccs_values = true, \ .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \ .num_subslices = _subslices, \ - .max_eus_per_subslice = 8 + .max_eus_per_subslice = 8, \ + .cooperative_matrix_configurations = { \ + { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \ + { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \ + { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \ + { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \ + } #define GFX11_URB_MIN_MAX_ENTRIES \ .min_entries = { \ @@ -967,6 +979,12 @@ static const struct intel_device_info intel_device_info_ehl_2x4 = { .scanout = PAT_ENTRY(1, WC, NONE), \ .writeback_incoherent = PAT_ENTRY(0, WB, 2WAY), \ .writecombining = PAT_ENTRY(1, WC, NONE), \ + }, \ + .cooperative_matrix_configurations = { \ + { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \ + { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \ + { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \ + { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \ } #define dual_subslices(args...) { args, } @@ -1099,7 +1117,13 @@ static const struct intel_device_info intel_device_info_sg1 = { .has_lsc = true, \ .has_local_mem = true, \ .has_aux_map = false, \ - .simulator_id = 29 + .simulator_id = 29, \ + .cooperative_matrix_configurations = { \ + { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \ + { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \ + { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \ + { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \ + } #define DG2_FEATURES \ /* (Sub)slice info comes from the kernel topology info */ \