2023-12-19 16:13:52 -08:00
|
|
|
/*
|
|
|
|
|
* Copyright 2023 Intel Corporation
|
|
|
|
|
* SPDX-License-Identifier: MIT
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \file
|
|
|
|
|
* Optimize subgroup operations with uniform sources.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "nir/nir.h"
|
|
|
|
|
#include "nir/nir_builder.h"
|
|
|
|
|
|
|
|
|
|
static bool
|
|
|
|
|
opt_uniform_subgroup_filter(const nir_instr *instr, const void *_state)
|
|
|
|
|
{
|
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
|
|
|
|
|
|
|
|
switch (intrin->intrinsic) {
|
|
|
|
|
case nir_intrinsic_shuffle:
|
|
|
|
|
case nir_intrinsic_read_invocation:
|
|
|
|
|
case nir_intrinsic_read_first_invocation:
|
|
|
|
|
case nir_intrinsic_quad_broadcast:
|
|
|
|
|
case nir_intrinsic_quad_swap_horizontal:
|
|
|
|
|
case nir_intrinsic_quad_swap_vertical:
|
|
|
|
|
case nir_intrinsic_quad_swap_diagonal:
|
|
|
|
|
case nir_intrinsic_quad_swizzle_amd:
|
|
|
|
|
case nir_intrinsic_masked_swizzle_amd:
|
2024-01-05 10:27:38 -08:00
|
|
|
case nir_intrinsic_vote_all:
|
|
|
|
|
case nir_intrinsic_vote_any:
|
2025-06-27 13:56:02 -04:00
|
|
|
case nir_intrinsic_vote_feq:
|
2025-06-26 17:57:12 -04:00
|
|
|
case nir_intrinsic_vote_ieq:
|
2024-09-10 12:31:27 +02:00
|
|
|
return !nir_src_is_divergent(&intrin->src[0]);
|
2023-12-19 16:13:52 -08:00
|
|
|
|
|
|
|
|
case nir_intrinsic_reduce:
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
case nir_intrinsic_exclusive_scan:
|
2023-12-19 16:13:52 -08:00
|
|
|
case nir_intrinsic_inclusive_scan: {
|
2024-09-10 12:31:27 +02:00
|
|
|
if (nir_src_is_divergent(&intrin->src[0]))
|
2023-12-19 16:13:52 -08:00
|
|
|
return false;
|
|
|
|
|
|
2025-02-24 15:09:24 -05:00
|
|
|
const nir_op reduction_op = (nir_op)nir_intrinsic_reduction_op(intrin);
|
2023-12-19 16:13:52 -08:00
|
|
|
|
|
|
|
|
switch (reduction_op) {
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
case nir_op_iadd:
|
|
|
|
|
case nir_op_fadd:
|
|
|
|
|
case nir_op_ixor:
|
|
|
|
|
return true;
|
|
|
|
|
|
2023-12-19 16:13:52 -08:00
|
|
|
case nir_op_imin:
|
|
|
|
|
case nir_op_umin:
|
|
|
|
|
case nir_op_fmin:
|
|
|
|
|
case nir_op_imax:
|
|
|
|
|
case nir_op_umax:
|
|
|
|
|
case nir_op_fmax:
|
|
|
|
|
case nir_op_iand:
|
|
|
|
|
case nir_op_ior:
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
return intrin->intrinsic != nir_intrinsic_exclusive_scan;
|
2023-12-19 16:13:52 -08:00
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-03 06:27:09 +02:00
|
|
|
static nir_def *
|
|
|
|
|
ballot_bit_count(nir_builder *b, nir_def *ballot)
|
|
|
|
|
{
|
|
|
|
|
return ballot->num_components == 1
|
|
|
|
|
? nir_bit_count(b, ballot)
|
2025-09-03 14:08:36 +02:00
|
|
|
: nir_ballot_bit_count_reduce(b, ballot);
|
2025-07-03 06:27:09 +02:00
|
|
|
}
|
|
|
|
|
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
static nir_def *
|
|
|
|
|
count_active_invocations(nir_builder *b, nir_def *value, bool inclusive,
|
2025-07-03 06:27:09 +02:00
|
|
|
bool has_mbcnt_amd,
|
|
|
|
|
const nir_lower_subgroups_options *options)
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
{
|
|
|
|
|
/* For the non-inclusive case, the two paths are functionally the same.
|
|
|
|
|
* For the inclusive case, the are similar but very subtly different.
|
|
|
|
|
*
|
|
|
|
|
* The bit_count path will mask "value" with the subgroup LE mask instead
|
|
|
|
|
* of the subgroup LT mask. This is the definition of the inclusive count.
|
|
|
|
|
*
|
|
|
|
|
* AMD's mbcnt instruction always uses the subgroup LT mask. To perform the
|
|
|
|
|
* inclusive count using mbcnt, two assumptions are made. First, trivially,
|
|
|
|
|
* the current invocation is active. Second, the bit for the current
|
|
|
|
|
* invocation in "value" is set. Since "value" is assumed to be the result
|
|
|
|
|
* of ballot(true), the second condition will also be met.
|
|
|
|
|
*
|
|
|
|
|
* When those conditions are met, the inclusive count is the exclusive
|
|
|
|
|
* count plus one.
|
|
|
|
|
*/
|
|
|
|
|
if (has_mbcnt_amd) {
|
2025-02-24 15:09:24 -05:00
|
|
|
return nir_mbcnt_amd(b, value, nir_imm_int(b, (int)inclusive));
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
} else {
|
2025-07-03 06:27:09 +02:00
|
|
|
nir_def *mask =
|
|
|
|
|
inclusive ? nir_load_subgroup_le_mask(b, options->ballot_components,
|
|
|
|
|
options->ballot_bit_size)
|
|
|
|
|
: nir_load_subgroup_lt_mask(b, options->ballot_components,
|
|
|
|
|
options->ballot_bit_size);
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
|
2025-07-03 06:27:09 +02:00
|
|
|
return ballot_bit_count(b, nir_iand(b, value, mask));
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-19 16:13:52 -08:00
|
|
|
static nir_def *
|
|
|
|
|
opt_uniform_subgroup_instr(nir_builder *b, nir_instr *instr, void *_state)
|
|
|
|
|
{
|
2025-02-24 15:09:24 -05:00
|
|
|
const nir_lower_subgroups_options *options = (nir_lower_subgroups_options *)_state;
|
2023-12-19 16:13:52 -08:00
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
|
|
|
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
if (intrin->intrinsic == nir_intrinsic_reduce ||
|
|
|
|
|
intrin->intrinsic == nir_intrinsic_inclusive_scan ||
|
|
|
|
|
intrin->intrinsic == nir_intrinsic_exclusive_scan) {
|
2025-02-24 15:09:24 -05:00
|
|
|
const nir_op reduction_op = (nir_op)nir_intrinsic_reduction_op(intrin);
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
|
|
|
|
|
if (reduction_op == nir_op_iadd ||
|
|
|
|
|
reduction_op == nir_op_fadd ||
|
|
|
|
|
reduction_op == nir_op_ixor) {
|
|
|
|
|
nir_def *count;
|
|
|
|
|
|
|
|
|
|
nir_def *ballot = nir_ballot(b, options->ballot_components,
|
|
|
|
|
options->ballot_bit_size, nir_imm_true(b));
|
|
|
|
|
|
|
|
|
|
if (intrin->intrinsic == nir_intrinsic_reduce) {
|
2025-07-03 06:27:09 +02:00
|
|
|
count = ballot_bit_count(b, ballot);
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
} else {
|
|
|
|
|
count = count_active_invocations(b, ballot,
|
|
|
|
|
intrin->intrinsic == nir_intrinsic_inclusive_scan,
|
2025-07-03 06:27:09 +02:00
|
|
|
false, options);
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const unsigned bit_size = intrin->src[0].ssa->bit_size;
|
|
|
|
|
|
|
|
|
|
if (reduction_op == nir_op_iadd) {
|
|
|
|
|
return nir_imul(b,
|
|
|
|
|
nir_u2uN(b, count, bit_size),
|
|
|
|
|
intrin->src[0].ssa);
|
|
|
|
|
} else if (reduction_op == nir_op_fadd) {
|
|
|
|
|
return nir_fmul(b,
|
|
|
|
|
nir_u2fN(b, count, bit_size),
|
|
|
|
|
intrin->src[0].ssa);
|
|
|
|
|
} else {
|
|
|
|
|
return nir_imul(b,
|
|
|
|
|
nir_u2uN(b,
|
|
|
|
|
nir_iand(b, count, nir_imm_int(b, 1)),
|
|
|
|
|
bit_size),
|
|
|
|
|
intrin->src[0].ssa);
|
|
|
|
|
}
|
|
|
|
|
}
|
2025-06-27 13:56:02 -04:00
|
|
|
} else if (intrin->intrinsic == nir_intrinsic_vote_feq) {
|
|
|
|
|
nir_def *x = intrin->src[0].ssa;
|
|
|
|
|
return nir_feq(b, x, x);
|
2025-06-26 17:57:12 -04:00
|
|
|
} else if (intrin->intrinsic == nir_intrinsic_vote_ieq) {
|
|
|
|
|
return nir_imm_true(b);
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
}
|
|
|
|
|
|
2023-12-19 16:13:52 -08:00
|
|
|
return intrin->src[0].ssa;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce,
inclusive scan, and exclusive scan.
NOTE: The fadd and ixor optimizations had no shader-db or fossil-db
changes on any Intel platform.
NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size
and base-local-size.shader_test on DG2 and MTL. This is just changing
the code path taken to not use whatever path was not working properly
before.
This is a subset of the things optimized by ACO. See also
https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802. The
min, max, iand, and ior exclusive_scan optimizations are not
implemented.
Broadwell on shader-db is not happy. I have not investigated.
v2: Silence some warnings about discarding const.
v3: Rename mbcnt to count_active_invocations. Add a big comment
explaining the differences between the two paths. Suggested by Rhys.
shader-db:
All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0
total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2
total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0
total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1
fossil-db results:
All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)
Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
2023-12-19 18:40:41 -08:00
|
|
|
nir_opt_uniform_subgroup(nir_shader *shader,
|
|
|
|
|
const nir_lower_subgroups_options *options)
|
2023-12-19 16:13:52 -08:00
|
|
|
{
|
2024-08-23 11:48:48 +02:00
|
|
|
nir_divergence_analysis(shader);
|
|
|
|
|
|
2023-12-19 16:13:52 -08:00
|
|
|
bool progress = nir_shader_lower_instructions(shader,
|
|
|
|
|
opt_uniform_subgroup_filter,
|
|
|
|
|
opt_uniform_subgroup_instr,
|
2025-02-24 15:09:24 -05:00
|
|
|
(void *)options);
|
2023-12-19 16:13:52 -08:00
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|