2020-01-22 19:59:56 +00:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2020 Valve Corporation
|
|
|
|
|
*
|
2024-04-08 09:02:30 +02:00
|
|
|
* SPDX-License-Identifier: MIT
|
2020-01-22 19:59:56 +00:00
|
|
|
*/
|
|
|
|
|
#include "helpers.h"
|
|
|
|
|
|
|
|
|
|
using namespace aco;
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.neg)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], s1: %c:s[0], s1: %d:s[1] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v1 s1 s1", (amd_gfx_level)i))
|
2020-01-22 19:59:56 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_mul_f32 %a, -%b
|
|
|
|
|
//! p_unit_test 0, %res0
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_b = fneg(inputs[1]);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
//~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
|
2024-08-22 15:27:49 +02:00
|
|
|
//~gfx9! v1: %res1 = v_max_f32 0x123456, %neg_a
|
|
|
|
|
//~gfx10! v1: %res1 = v_max_f32 0x123456, -%a
|
2020-01-22 19:59:56 +00:00
|
|
|
//! p_unit_test 1, %res1
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_a = fneg(inputs[0]);
|
2024-08-22 15:27:49 +02:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));
|
2020-01-22 19:59:56 +00:00
|
|
|
|
|
|
|
|
//! v1: %res2 = v_mul_f32 %a, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_neg_a = fneg(neg_a);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
//! v1: %res3 = v_mul_f32 |%a|, %b
|
2020-01-22 19:59:56 +00:00
|
|
|
//! p_unit_test 3, %res3
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp abs_neg_a = fabs(neg_a);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_mul_f32 -|%a|, %b
|
|
|
|
|
//! p_unit_test 4, %res4
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp abs_a = fabs(inputs[0]);
|
|
|
|
|
Temp neg_abs_a = fneg(abs_a);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//~gfx9! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
|
|
|
|
|
//~gfx10! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1 fi
|
2020-01-22 19:59:56 +00:00
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5,
|
|
|
|
|
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res6 = v_subrev_f32 %a, %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res7 = v_sub_f32 %b, %a
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
|
|
|
|
|
|
|
|
|
|
//! v1: %res8 = v_mul_f32 %a, -%c
|
|
|
|
|
//! p_unit_test 8, %res8
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
// //! v1: %res9 = v_mul_f32 |%neg_a|, %b
|
|
|
|
|
// //! p_unit_test 9, %res9
|
|
|
|
|
Temp abs_neg_abs_a = fabs(neg_abs_a);
|
|
|
|
|
writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));
|
|
|
|
|
|
2020-01-22 19:59:56 +00:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossil-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
|
2020-11-13 15:12:35 +00:00
|
|
|
BEGIN_TEST(optimize.output_modifiers)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1] = p_startpgm
|
2020-11-13 15:12:35 +00:00
|
|
|
if (!setup_cs("v1 v1", GFX9))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
|
|
|
|
/* 32-bit modifiers */
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_add_f32 %a, %b *0.5
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res1 = v_add_f32 %a, %b *2
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res2 = v_add_f32 %a, %b *4
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res3 = v_add_f32 %a, %b clamp
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
|
|
|
|
|
Operand::c32(0x3f800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res4 = v_add_f32 %a, %b *2 clamp
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
|
|
|
|
|
writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
|
|
|
|
|
Operand::c32(0x3f800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* 16-bit modifiers */
|
|
|
|
|
|
|
|
|
|
//! v2b: %res5 = v_add_f16 %a, %b *0.5
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res6 = v_add_f16 %a, %b *2
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res7 = v_add_f16 %a, %b *4
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res8 = v_add_f16 %a, %b clamp
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
|
|
|
|
|
Operand::c16(0x3c00u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res9 = v_add_f16 %a, %b *2 clamp
|
|
|
|
|
//! p_unit_test 9, %res9
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp);
|
|
|
|
|
writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
|
|
|
|
|
Operand::c16(0x3c00u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* clamping is done after omod */
|
|
|
|
|
|
|
|
|
|
//! v1: %res10_tmp = v_add_f32 %a, %b clamp
|
|
|
|
|
//! v1: %res10 = v_mul_f32 2.0, %res10_tmp
|
|
|
|
|
//! p_unit_test 10, %res10
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
|
|
|
|
|
tmp);
|
|
|
|
|
writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* unsupported instructions */
|
|
|
|
|
|
|
|
|
|
//! v1: %res11_tmp = v_xor_b32 %a, %b
|
|
|
|
|
//! v1: %res11 = v_mul_f32 2.0, %res11_tmp
|
|
|
|
|
//! p_unit_test 11, %res11
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* several users */
|
|
|
|
|
|
|
|
|
|
//! v1: %res12_tmp = v_add_f32 %a, %b
|
|
|
|
|
//! p_unit_test %res12_tmp
|
|
|
|
|
//! v1: %res12 = v_mul_f32 2.0, %res12_tmp
|
|
|
|
|
//! p_unit_test 12, %res12
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
bld.pseudo(aco_opcode::p_unit_test, tmp);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res13 = v_add_f32 %a, %b
|
|
|
|
|
//! p_unit_test 13, %res13
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
|
2020-11-13 15:12:35 +00:00
|
|
|
writeout(13, tmp);
|
|
|
|
|
|
|
|
|
|
/* omod has no effect if denormals are enabled but clamp is fine */
|
|
|
|
|
|
|
|
|
|
//>> BB1
|
2024-07-30 17:08:19 +01:00
|
|
|
//! /* logical preds: BB0, / linear preds: BB0, / kind: */
|
2020-11-13 15:12:35 +00:00
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
2024-07-30 17:08:19 +01:00
|
|
|
program->blocks[0].linear_succs.push_back(1);
|
|
|
|
|
program->blocks[0].logical_succs.push_back(1);
|
|
|
|
|
program->blocks[1].linear_preds.push_back(0);
|
|
|
|
|
program->blocks[1].logical_preds.push_back(0);
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res14_tmp = v_add_f32 %a, %b
|
|
|
|
|
//! v1: %res14 = v_mul_f32 2.0, %res13_tmp
|
|
|
|
|
//! p_unit_test 14, %res14
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res15 = v_add_f32 %a, %b clamp
|
|
|
|
|
//! p_unit_test 15, %res15
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
|
|
|
|
|
Operand::c32(0x3f800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//>> BB2
|
2024-07-30 17:08:19 +01:00
|
|
|
//! /* logical preds: BB1, / linear preds: BB1, / kind: */
|
2020-11-13 15:12:35 +00:00
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_flush;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
2024-07-30 17:08:19 +01:00
|
|
|
program->blocks[1].linear_succs.push_back(2);
|
|
|
|
|
program->blocks[1].logical_succs.push_back(2);
|
|
|
|
|
program->blocks[2].linear_preds.push_back(1);
|
|
|
|
|
program->blocks[2].logical_preds.push_back(1);
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res16_tmp = v_add_f16 %a, %b
|
|
|
|
|
//! v2b: %res16 = v_mul_f16 2.0, %res15_tmp
|
|
|
|
|
//! p_unit_test 16, %res16
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res17 = v_add_f16 %a, %b clamp
|
|
|
|
|
//! p_unit_test 17, %res17
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
|
|
|
|
|
Operand::c16(0x3c00u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* omod flushes -0.0 to +0.0 */
|
|
|
|
|
|
|
|
|
|
//>> BB3
|
2024-09-13 20:14:59 +02:00
|
|
|
//! /* logical preds: BB2, / linear preds: BB2, / kind: uniform, */
|
2024-09-13 20:21:25 +02:00
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_flush;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_flush;
|
2020-11-13 15:12:35 +00:00
|
|
|
bld.reset(program->create_and_insert_block());
|
2024-09-13 20:14:59 +02:00
|
|
|
bld.is_sz_preserve = true;
|
2024-07-30 17:08:19 +01:00
|
|
|
program->blocks[2].linear_succs.push_back(3);
|
|
|
|
|
program->blocks[2].logical_succs.push_back(3);
|
|
|
|
|
program->blocks[3].linear_preds.push_back(2);
|
|
|
|
|
program->blocks[3].logical_preds.push_back(2);
|
2020-11-13 15:12:35 +00:00
|
|
|
|
2024-09-13 20:14:59 +02:00
|
|
|
//! v1: (SzPreserve)%res18_tmp = v_add_f32 %a, %b
|
|
|
|
|
//! v1: (SzPreserve)%res18 = v_mul_f32 2.0, %res18_tmp
|
2020-11-13 15:12:35 +00:00
|
|
|
//! p_unit_test 18, %res18
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2024-09-13 20:14:59 +02:00
|
|
|
//! v1: (SzPreserve)%res19 = v_add_f32 %a, %b clamp
|
2020-11-13 15:12:35 +00:00
|
|
|
//! p_unit_test 19, %res19
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
|
|
|
|
|
Operand::c32(0x3f800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
2024-09-13 20:14:59 +02:00
|
|
|
//! v2b: (SzPreserve)%res20_tmp = v_add_f16 %a, %b
|
|
|
|
|
//! v2b: (SzPreserve)%res20 = v_mul_f16 2.0, %res20_tmp
|
2020-11-13 15:12:35 +00:00
|
|
|
//! p_unit_test 20, %res20
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
|
2024-09-13 20:14:59 +02:00
|
|
|
//! v2b: (SzPreserve)%res21 = v_add_f16 %a, %b clamp
|
2020-11-13 15:12:35 +00:00
|
|
|
//! p_unit_test 21, %res21
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
|
|
|
|
|
Operand::c16(0x3c00u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
2020-11-10 10:24:36 +01:00
|
|
|
BEGIN_TEST(optimize.add_lshl)
|
2020-11-18 13:15:24 +01:00
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> s1: %a:s[0], v1: %b:v[0] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("s1 v1", (amd_gfx_level)i))
|
2020-11-10 10:24:36 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp shift;
|
|
|
|
|
|
2020-11-18 13:15:24 +01:00
|
|
|
//~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
|
|
|
|
|
//~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
|
|
|
|
|
//~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
|
2020-11-10 10:24:36 +01:00
|
|
|
//! p_unit_test 0, %res0
|
2021-07-13 11:22:46 +02:00
|
|
|
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
|
|
|
|
|
Operand::c32(3u));
|
|
|
|
|
writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,
|
|
|
|
|
Operand::c32(4u)));
|
2020-11-10 10:24:36 +01:00
|
|
|
|
2020-11-18 13:15:24 +01:00
|
|
|
//~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
|
|
|
|
|
//~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
|
|
|
|
|
//~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
|
|
|
|
|
//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
|
|
|
|
|
//~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
|
|
|
|
|
//~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
|
2020-11-10 10:24:36 +01:00
|
|
|
//! p_unit_test 1, %res1
|
2021-07-13 11:22:46 +02:00
|
|
|
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
|
|
|
|
|
Operand::c32(3u));
|
|
|
|
|
Temp sadd =
|
|
|
|
|
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));
|
2020-11-10 10:24:36 +01:00
|
|
|
Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
|
|
|
|
|
|
2023-11-10 14:15:50 +01:00
|
|
|
//~gfx8! s1: %lshl2, s1: %_:scc = s_lshl_b32 %a, 3
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
2023-11-10 14:15:50 +01:00
|
|
|
Temp lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
|
|
|
|
|
Operand(inputs[0]), Operand::c32(3u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
2023-11-10 14:15:50 +01:00
|
|
|
//~gfx8! s1: %lshl3, s1: %_:scc = s_lshl_b32 (is24bit)%a, 7
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
Operand a_24bit = Operand(inputs[0]);
|
|
|
|
|
a_24bit.set24bit(true);
|
2023-11-10 14:15:50 +01:00
|
|
|
lshl =
|
|
|
|
|
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, Operand::c32(7u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
2023-11-10 14:15:50 +01:00
|
|
|
//! s1: %lshl4, s1: %_:scc = s_lshl_b32 (is24bit)%a, 3
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
//~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
|
|
|
|
|
//~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
|
|
|
|
|
//! p_unit_test 4, %carry
|
2023-11-10 14:15:50 +01:00
|
|
|
lshl =
|
|
|
|
|
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, Operand::c32(3u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
|
|
|
|
|
writeout(4, carry);
|
|
|
|
|
|
2023-11-10 14:15:50 +01:00
|
|
|
//~gfx8! s1: %lshl5, s1: %_:scc = s_lshl_b32 (is24bit)%a, (is24bit)%a
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
|
|
|
|
|
//! p_unit_test 5, %res5
|
2023-11-10 14:15:50 +01:00
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, a_24bit);
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
2023-11-10 14:15:50 +01:00
|
|
|
lshl =
|
|
|
|
|
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, Operand::c32(3u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
Operand a_16bit = Operand(inputs[0]);
|
|
|
|
|
a_16bit.set16bit(true);
|
2023-11-10 14:15:50 +01:00
|
|
|
lshl =
|
|
|
|
|
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_16bit, Operand::c32(4u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
2020-11-02 16:44:04 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-11 18:42:35 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.bcnt)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], s1: %b:s[0] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 s1", (amd_gfx_level)i))
|
2020-11-11 18:42:35 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp bcnt;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_bcnt_u32_b32 %a, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
|
2020-11-11 18:42:35 +01:00
|
|
|
writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_bcnt_u32_b32 %a, %b
|
|
|
|
|
//! p_unit_test 1, %res1
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
|
2020-11-11 18:42:35 +01:00
|
|
|
writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_bcnt_u32_b32 %a, 42
|
|
|
|
|
//! p_unit_test 2, %res2
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
|
|
|
|
|
writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
|
2020-11-11 18:42:35 +01:00
|
|
|
|
aco/optimizer: use new helpers for remaining add opts
Foz-DB Navi48:
Totals from 373 (0.45% of 82419) affected shaders:
Instrs: 542269 -> 542186 (-0.02%); split: -0.06%, +0.04%
CodeSize: 2872728 -> 2867204 (-0.19%); split: -0.21%, +0.02%
Latency: 3174435 -> 3174634 (+0.01%); split: -0.01%, +0.01%
InvThroughput: 828783 -> 828600 (-0.02%); split: -0.03%, +0.01%
SClause: 11954 -> 11955 (+0.01%)
Copies: 49104 -> 49110 (+0.01%)
PreSGPRs: 15422 -> 15420 (-0.01%)
VALU: 262635 -> 262641 (+0.00%)
Foz-DB Navi21:
Totals from 426 (0.52% of 82387) affected shaders:
Instrs: 624744 -> 624754 (+0.00%); split: -0.00%, +0.00%
CodeSize: 3382728 -> 3385664 (+0.09%); split: -0.00%, +0.09%
Latency: 3841693 -> 3842101 (+0.01%); split: -0.00%, +0.01%
InvThroughput: 1132036 -> 1132065 (+0.00%); split: -0.00%, +0.00%
VClause: 14008 -> 14011 (+0.02%)
Copies: 73104 -> 73114 (+0.01%); split: -0.00%, +0.02%
PreSGPRs: 19504 -> 19502 (-0.01%)
SALU: 131431 -> 131443 (+0.01%)
Foz-DB Polaris10:
Totals from 812 (1.31% of 61894) affected shaders:
Instrs: 610178 -> 609219 (-0.16%); split: -0.21%, +0.05%
CodeSize: 3142404 -> 3147304 (+0.16%); split: -0.02%, +0.17%
VGPRs: 38380 -> 38376 (-0.01%)
Latency: 8312085 -> 8307755 (-0.05%); split: -0.12%, +0.07%
InvThroughput: 3929970 -> 3924631 (-0.14%); split: -0.15%, +0.01%
VClause: 15714 -> 15632 (-0.52%); split: -0.67%, +0.15%
SClause: 14509 -> 14510 (+0.01%); split: -0.02%, +0.03%
Copies: 70197 -> 70388 (+0.27%); split: -0.61%, +0.89%
PreSGPRs: 26409 -> 26404 (-0.02%); split: -0.02%, +0.00%
PreVGPRs: 30448 -> 30436 (-0.04%)
VALU: 408184 -> 407068 (-0.27%); split: -0.29%, +0.01%
SALU: 95726 -> 95959 (+0.24%); split: -0.30%, +0.54%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38530>
2024-12-15 16:55:58 +01:00
|
|
|
//! v1: %res3 = v_bcnt_u32_b32 %b, %a
|
2020-11-11 18:42:35 +01:00
|
|
|
//! p_unit_test 3, %res3
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
|
2020-11-11 18:42:35 +01:00
|
|
|
writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %bnct4 = v_bcnt_u32_b32 %a, 0
|
|
|
|
|
//~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
|
|
|
|
|
//~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
|
|
|
|
|
//! p_unit_test 4, %carry
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
|
2020-11-11 18:42:35 +01:00
|
|
|
Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
|
|
|
|
|
writeout(4, carry);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-10-07 11:09:16 +01:00
|
|
|
|
2020-11-11 15:44:54 +00:00
|
|
|
struct clamp_config {
|
|
|
|
|
const char* name;
|
|
|
|
|
aco_opcode min, max, med3;
|
|
|
|
|
Operand lb, ub;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Parameter sets iterated by optimize.clamp; each entry is {name, min, max, med3, lb, ub}. */
static const clamp_config clamp_configs[] = {
   /* float bounds: 0.0 .. 4.0 */
   {"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::zero(), Operand::c32(0x40800000u)},
   {"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0u), Operand::c16(0x4400)},

   /* float bounds: -1.0 .. 0.0 */
   {"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::c32(0xbf800000u), Operand::zero()},
   {"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0xBC00), Operand::c16(0u)},

   /* integer bounds: 0 .. 3 (unsigned, then signed) */
   {"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
    Operand::c16(0u), Operand::c16(3u)},
   {"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0u), Operand::c16(3u)},

   /* signed integer bounds: -5 .. 0 */
   {"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::c32(0xfffffffbu), Operand::zero()},
   {"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0xfffbu), Operand::c16(0u)},
};
|
|
|
|
|
|
2020-10-07 11:09:16 +01:00
|
|
|
BEGIN_TEST(optimize.clamp)
|
2020-11-11 15:44:54 +00:00
|
|
|
for (clamp_config cfg : clamp_configs) {
|
2020-11-05 12:43:14 +01:00
|
|
|
if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
|
2020-11-11 15:44:54 +00:00
|
|
|
continue;
|
2020-10-07 11:09:16 +01:00
|
|
|
|
2020-11-11 15:44:54 +00:00
|
|
|
//! cfg: @match_func(min max med3 lb ub)
|
|
|
|
|
fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
|
|
|
|
|
fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
|
|
|
|
|
fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
|
|
|
|
|
aco_print_operand(&cfg.lb, output);
|
|
|
|
|
fprintf(output, " ");
|
|
|
|
|
aco_print_operand(&cfg.ub, output);
|
|
|
|
|
fprintf(output, "\n");
|
|
|
|
|
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2] = p_startpgm
|
2020-11-11 15:44:54 +00:00
|
|
|
|
|
|
|
|
//! v1: %res0 = @med3 @ub, @lb, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = @med3 @lb, @ub, %a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
|
|
|
|
|
bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* min constant must be greater than max constant */
|
|
|
|
|
//! v1: %res2_tmp = @min @lb, %a
|
|
|
|
|
//! v1: %res2 = @max @ub, %res2_tmp
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
|
|
|
|
|
bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3_tmp = @max @ub, %a
|
|
|
|
|
//! v1: %res3 = @min @lb, %res3_tmp
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* needs two constants */
|
|
|
|
|
|
|
|
|
|
//! v1: %res4_tmp = @max @lb, %a
|
|
|
|
|
//! v1: %res4 = @min %b, %res4_tmp
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5_tmp = @max %b, %a
|
|
|
|
|
//! v1: %res5 = @min @ub, %res5_tmp
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res6_tmp = @max %c, %a
|
|
|
|
|
//! v1: %res6 = @min %b, %res6_tmp
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* correct NaN behaviour with precise */
|
2022-04-29 17:23:20 +01:00
|
|
|
if (cfg.min == aco_opcode::v_min_f16 || cfg.min == aco_opcode::v_min_f32) {
|
|
|
|
|
//~f(16|32)! v1: %res7 = @med3 @ub, @lb, %a
|
|
|
|
|
//~f(16|32)! p_unit_test 7, %res7
|
|
|
|
|
Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
|
|
|
|
|
max.def(0).setPrecise(true);
|
|
|
|
|
Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
|
|
|
|
|
max.def(0).setPrecise(true);
|
|
|
|
|
writeout(7, min);
|
|
|
|
|
|
|
|
|
|
//~f(16|32)! v1: (precise)%res8_tmp = @min @ub, %a
|
|
|
|
|
//~f(16|32)! v1: %res8 = @max @lb, %res8_tmp
|
|
|
|
|
//~f(16|32)! p_unit_test 8, %res8
|
|
|
|
|
min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
|
|
|
|
|
min.def(0).setPrecise(true);
|
|
|
|
|
writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
|
|
|
|
|
}
|
2020-11-11 15:44:54 +00:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
2020-10-07 11:09:16 +01:00
|
|
|
END_TEST
|
2020-10-07 11:40:45 +01:00
|
|
|
|
2020-10-07 11:45:30 +01:00
|
|
|
BEGIN_TEST(optimize.add3)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2] = p_startpgm
|
2020-10-07 11:45:30 +01:00
|
|
|
if (!setup_cs("v1 v1 v1", GFX9))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_add3_u32 %a, %b, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %tmp1 = v_add_u32 %b, %c clamp
|
|
|
|
|
//! v1: %res1 = v_add_u32 %a, %tmp1
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
|
2023-02-21 20:08:42 +01:00
|
|
|
tmp->valu().clamp = true;
|
2020-10-07 11:45:30 +01:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %tmp2 = v_add_u32 %b, %c
|
|
|
|
|
//! v1: %res2 = v_add_u32 %a, %tmp2 clamp
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
|
2023-02-21 20:08:42 +01:00
|
|
|
tmp->valu().clamp = true;
|
2020-10-07 11:45:30 +01:00
|
|
|
writeout(2, tmp);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2020-11-17 17:14:49 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.minmax)
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
for (unsigned i = GFX10_3; i <= GFX11; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2] = p_startpgm
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
if (!setup_cs("v1 v1 v1", (amd_gfx_level)i))
|
2020-11-17 17:14:49 +01:00
|
|
|
continue;
|
|
|
|
|
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
Temp c = inputs[2];
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_min3_f32 %a, %b, %c
|
2020-11-17 17:14:49 +01:00
|
|
|
//! p_unit_test 0, %res0
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
writeout(0, fmin(c, fmin(a, b)));
|
2020-11-17 17:14:49 +01:00
|
|
|
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
//! v1: %res1 = v_max3_f32 %a, %b, %c
|
2020-11-17 17:14:49 +01:00
|
|
|
//! p_unit_test 1, %res1
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
writeout(1, fmax(c, fmax(a, b)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_min3_f32 -%a, -%b, %c
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, fmin(c, fneg(fmax(a, b))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3 = v_max3_f32 -%a, -%b, %c
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, fmax(c, fneg(fmin(a, b))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_max3_f32 -%a, %b, %c
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, fmax(c, fneg(fmin(a, fneg(b)))));
|
|
|
|
|
|
|
|
|
|
//~gfx10_3! v1: %res5_tmp = v_max_f32 %a, %b
|
|
|
|
|
//~gfx10_3! v1: %res5 = v_min_f32 %c, %res5_tmp
|
|
|
|
|
//~gfx11! v1: %res5 = v_maxmin_f32 %a, %b, %c
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, fmin(c, fmax(a, b)));
|
|
|
|
|
|
|
|
|
|
//~gfx10_3! v1: %res6_tmp = v_min_f32 %a, %b
|
|
|
|
|
//~gfx10_3! v1: %res6 = v_max_f32 %c, %res6_tmp
|
|
|
|
|
//~gfx11! v1: %res6 = v_minmax_f32 %a, %b, %c
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, fmax(c, fmin(a, b)));
|
|
|
|
|
|
|
|
|
|
//~gfx10_3! v1: %res7_tmp = v_min_f32 %a, %b
|
|
|
|
|
//~gfx10_3! v1: %res7 = v_min_f32 %c, -%res7_tmp
|
|
|
|
|
//~gfx11! v1: %res7 = v_maxmin_f32 -%a, -%b, %c
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, fmin(c, fneg(fmin(a, b))));
|
|
|
|
|
|
|
|
|
|
//~gfx10_3! v1: %res8_tmp = v_max_f32 %a, %b
|
|
|
|
|
//~gfx10_3! v1: %res8 = v_max_f32 %c, -%res8_tmp
|
|
|
|
|
//~gfx11! v1: %res8 = v_minmax_f32 -%a, -%b, %c
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
writeout(8, fmax(c, fneg(fmax(a, b))));
|
|
|
|
|
|
|
|
|
|
//~gfx10_3! v1: %res9_tmp = v_max_f32 %a, -%b
|
|
|
|
|
//~gfx10_3! v1: %res9 = v_max_f32 %c, -%res9_tmp
|
|
|
|
|
//~gfx11! v1: %res9 = v_minmax_f32 -%a, %b, %c
|
|
|
|
|
//! p_unit_test 9, %res9
|
|
|
|
|
writeout(9, fmax(c, fneg(fmax(a, fneg(b)))));
|
2020-11-17 17:14:49 +01:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-06-05 17:36:29 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_32_24)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX9; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v1 v1", (amd_gfx_level)i))
|
2020-06-05 17:36:29 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_mad_u32_u24 %b, %c, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1_tmp = v_mul_u32_u24 %b, %c
|
|
|
|
|
//! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-18 13:07:57 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.add_lshlrev)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], s1: %c:s[0] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v1 s1", (amd_gfx_level)i))
|
2020-11-18 13:07:57 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp lshl;
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
|
|
|
|
|
//~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//! p_unit_test 0, %res0
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
|
|
|
|
|
//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
Operand a_24bit = Operand(inputs[0]);
|
|
|
|
|
a_24bit.set24bit(true);
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
|
|
|
|
|
//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
Operand b_24bit = Operand(inputs[1]);
|
|
|
|
|
b_24bit.set24bit(true);
|
|
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
|
|
|
|
|
writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
|
|
|
|
|
//! p_unit_test 3, %res3
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
Operand a_16bit = Operand(inputs[0]);
|
|
|
|
|
a_16bit.set16bit(true);
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
2021-09-09 08:38:41 +02:00
|
|
|
//~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c
|
2020-11-18 13:07:57 +01:00
|
|
|
//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
Operand c_24bit = Operand(inputs[2]);
|
|
|
|
|
c_24bit.set24bit(true);
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2021-02-16 16:48:46 +00:00
|
|
|
|
|
|
|
|
/* Operation placed between the optional source and destination instructions in
 * optimize.denorm_propagation. The values double as indices into
 * denorm_op_names, so the two must stay in the same order.
 */
enum denorm_op {
   denorm_mul1 = 0,
   denorm_fneg = 1,
   denorm_fabs = 2,
   denorm_fnegabs = 3,
};
|
|
|
|
|
|
|
|
|
|
/* Printable name of each denorm_op, indexed by the enumerator's value. */
static const char* const denorm_op_names[] = {
   "mul1",
   "fneg",
   "fabs",
   "fnegabs",
};
|
|
|
|
|
|
|
|
|
|
struct denorm_config {
|
|
|
|
|
bool flush;
|
|
|
|
|
unsigned op;
|
|
|
|
|
aco_opcode src;
|
|
|
|
|
aco_opcode dest;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static const char*
|
|
|
|
|
srcdest_op_name(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
switch (op) {
|
|
|
|
|
case aco_opcode::v_cndmask_b32: return "cndmask";
|
|
|
|
|
case aco_opcode::v_min_f32: return "min";
|
|
|
|
|
case aco_opcode::v_rcp_f32: return "rcp";
|
|
|
|
|
default: return "none";
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Emits the instruction selected by op with val as its input and returns the
 * result:
 *  - v_cndmask_b32: select between zero and val using inputs[1];
 *  - v_min_f32:     min of zero and val;
 *  - v_rcp_f32:     reciprocal of val.
 * Any other opcode emits nothing and returns val unchanged.
 */
static Temp
emit_denorm_srcdest(aco_opcode op, Temp val)
{
   switch (op) {
   case aco_opcode::v_cndmask_b32:
      return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
   case aco_opcode::v_min_f32:
      return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
   case aco_opcode::v_rcp_f32: return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
   default: return val;
   }
}
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.denorm_propagation)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX9; i++) {
|
|
|
|
|
std::vector<denorm_config> configs;
|
|
|
|
|
for (bool flush : {false, true}) {
|
|
|
|
|
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
|
|
|
|
|
configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});
|
|
|
|
|
|
|
|
|
|
for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
|
|
|
|
|
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
|
|
|
|
|
configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (aco_opcode src :
|
|
|
|
|
{aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
|
|
|
|
|
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
|
|
|
|
|
configs.push_back({flush, op, src, aco_opcode::num_opcodes});
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (denorm_config cfg : configs) {
|
|
|
|
|
char subvariant[128];
|
|
|
|
|
sprintf(subvariant, "_%s_%s_%s_%s", cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
|
|
|
|
|
denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 s2", (amd_gfx_level)i, CHIP_UNKNOWN, subvariant))
|
2021-02-16 16:48:46 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 ||
|
|
|
|
|
(i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
|
|
|
|
|
cfg.dest == aco_opcode::v_rcp_f32 ||
|
|
|
|
|
(i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) || !cfg.flush;
|
|
|
|
|
|
|
|
|
|
fprintf(output, "src, dest, op: %s %s %s\n", srcdest_op_name(cfg.src),
|
|
|
|
|
srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
|
|
|
|
|
fprintf(output, "can_propagate: %u\n", can_propagate);
|
|
|
|
|
//! src, dest, op: $src $dest $op
|
|
|
|
|
//! can_propagate: #can_propagate
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], s2: %b:s[0-1] = p_startpgm
|
2021-02-16 16:48:46 +00:00
|
|
|
|
|
|
|
|
//; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
|
|
|
|
|
//; 'min': 'v1: %{} = v_min_f32 0, {}',
|
|
|
|
|
//; 'rcp': 'v1: %{} = v_rcp_f32 {}'}
|
|
|
|
|
//; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
|
|
|
|
|
//; 'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
|
|
|
|
|
//; 'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
|
|
|
|
|
//; 'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
|
|
|
|
|
//; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}
|
|
|
|
|
|
|
|
|
|
//; name = 'a'
|
|
|
|
|
//; if src != 'none':
|
|
|
|
|
//; insert_pattern(patterns[src].format('src_res', '%'+name))
|
|
|
|
|
//; name = 'src_res'
|
|
|
|
|
|
|
|
|
|
//; if can_propagate:
|
|
|
|
|
//; name = inline_ops[op].format(name)
|
|
|
|
|
//; else:
|
|
|
|
|
//; insert_pattern(ops[op].format('op_res', name))
|
|
|
|
|
//; name = '%op_res'
|
|
|
|
|
|
|
|
|
|
//; if dest != 'none':
|
|
|
|
|
//; insert_pattern(patterns[dest].format('dest_res', name))
|
|
|
|
|
//; name = '%dest_res'
|
|
|
|
|
|
|
|
|
|
//; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
|
|
|
|
|
//! p_unit_test 0, %res
|
|
|
|
|
|
|
|
|
|
program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;
|
|
|
|
|
|
|
|
|
|
Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);
|
|
|
|
|
switch (cfg.op) {
|
|
|
|
|
case denorm_mul1:
|
2021-07-13 11:22:46 +02:00
|
|
|
val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
|
2021-02-16 16:48:46 +00:00
|
|
|
break;
|
|
|
|
|
case denorm_fneg: val = fneg(val); break;
|
|
|
|
|
case denorm_fabs: val = fabs(val); break;
|
|
|
|
|
case denorm_fnegabs: val = fneg(fabs(val)); break;
|
|
|
|
|
}
|
|
|
|
|
val = emit_denorm_srcdest(cfg.dest, val);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(
|
|
|
|
|
0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));
|
2021-02-16 16:48:46 +00:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2021-07-19 15:39:34 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.dpp)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], s2: %c:s[0-1], s1: %d:s[2] = p_startpgm
|
2021-11-29 16:34:15 +00:00
|
|
|
if (!setup_cs("v1 v1 s2 s1", GFX10_3))
|
2021-07-19 15:39:34 +01:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Operand a(inputs[0]);
|
|
|
|
|
Operand b(inputs[1]);
|
|
|
|
|
Operand c(inputs[2]);
|
2021-11-29 16:34:15 +00:00
|
|
|
Operand d(inputs[3]);
|
2021-07-19 15:39:34 +01:00
|
|
|
|
|
|
|
|
/* basic optimization */
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1 fi
|
2021-07-19 15:39:34 +01:00
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
|
|
|
|
|
writeout(0, res0);
|
|
|
|
|
|
|
|
|
|
/* operand swapping */
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1 fi
|
2021-07-19 15:39:34 +01:00
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
|
|
|
|
|
writeout(1, res1);
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
|
|
|
|
|
//! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1 fi
|
2021-07-19 15:39:34 +01:00
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
|
|
|
|
|
writeout(2, res2);
|
|
|
|
|
|
|
|
|
|
/* modifiers */
|
2024-08-21 18:26:55 +02:00
|
|
|
//! v1: %res3 = v_max_f32 -%a, %b row_mirror bound_ctrl:1 fi
|
2021-07-19 15:39:34 +01:00
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
2023-01-04 14:52:34 +00:00
|
|
|
tmp3->dpp16().neg[0] = true;
|
2024-08-21 18:26:55 +02:00
|
|
|
Temp res3 = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), tmp3, b);
|
2021-07-19 15:39:34 +01:00
|
|
|
writeout(3, res3);
|
|
|
|
|
|
2024-08-21 18:26:55 +02:00
|
|
|
//! v1: %res4 = v_max_f32 -%a, %b row_mirror bound_ctrl:1 fi
|
2021-07-19 15:39:34 +01:00
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
2024-08-21 18:26:55 +02:00
|
|
|
auto res4 = bld.vop2_e64(aco_opcode::v_max_f32, bld.def(v1), tmp4, b);
|
2023-02-21 20:08:42 +01:00
|
|
|
res4->valu().neg[0] = true;
|
2021-07-19 15:39:34 +01:00
|
|
|
writeout(4, res4);
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
|
2021-07-19 15:39:34 +01:00
|
|
|
//! v1: %res5 = v_add_f32 %tmp5, %b clamp
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
|
2023-02-21 20:08:42 +01:00
|
|
|
res5->valu().clamp = true;
|
2021-07-19 15:39:34 +01:00
|
|
|
writeout(5, res5);
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1 fi
|
2021-07-19 15:39:34 +01:00
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
2023-01-04 14:52:34 +00:00
|
|
|
tmp6->dpp16().neg[0] = true;
|
2021-07-19 15:39:34 +01:00
|
|
|
auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
|
2023-02-21 20:08:42 +01:00
|
|
|
res6->valu().abs[0] = true;
|
2021-07-19 15:39:34 +01:00
|
|
|
writeout(6, res6);
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1 fi
|
2021-07-19 15:39:34 +01:00
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
|
2023-02-21 20:08:42 +01:00
|
|
|
res7->valu().abs[0] = true;
|
2021-07-19 15:39:34 +01:00
|
|
|
writeout(7, res7);
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
|
2023-02-08 16:37:44 +00:00
|
|
|
//! v1: %res11 = v_add_u32 %tmp11, %b
|
|
|
|
|
//! p_unit_test 11, %res11
|
|
|
|
|
auto tmp11 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
tmp11->dpp16().neg[0] = true;
|
|
|
|
|
Temp res11 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), tmp11, b);
|
|
|
|
|
writeout(11, res11);
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
|
2023-02-08 16:37:44 +00:00
|
|
|
//! v1: %res12 = v_add_f16 %tmp12, %b
|
|
|
|
|
//! p_unit_test 12, %res12
|
|
|
|
|
auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
tmp12->dpp16().neg[0] = true;
|
|
|
|
|
Temp res12 = bld.vop2(aco_opcode::v_add_f16, bld.def(v1), tmp12, b);
|
|
|
|
|
writeout(12, res12);
|
|
|
|
|
|
2021-07-19 15:39:34 +01:00
|
|
|
/* vcc */
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1 fi
|
2021-07-19 15:39:34 +01:00
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
|
|
|
|
|
writeout(8, res8);
|
|
|
|
|
|
2021-11-29 16:34:15 +00:00
|
|
|
/* sgprs */
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
|
2021-11-29 16:34:15 +00:00
|
|
|
//! v1: %res9 = v_add_f32 %tmp9, %d
|
|
|
|
|
//! p_unit_test 9, %res9
|
|
|
|
|
Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
|
|
|
|
|
writeout(9, res9);
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
|
2021-11-29 16:34:15 +00:00
|
|
|
//! v1: %res10 = v_add_f32 %d, %tmp10
|
|
|
|
|
//! p_unit_test 10, %res10
|
|
|
|
|
Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), d, tmp10);
|
|
|
|
|
writeout(10, res10);
|
|
|
|
|
|
2021-07-19 15:39:34 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
2021-08-27 17:53:48 +01:00
|
|
|
BEGIN_TEST(optimize.dpp_prop)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], s1: %b:s[0] = p_startpgm
|
2021-08-27 17:53:48 +01:00
|
|
|
if (!setup_cs("v1 s1", GFX10))
|
|
|
|
|
return;
|
|
|
|
|
|
2021-08-30 10:30:45 +01:00
|
|
|
//! v1: %one = p_parallelcopy 1
|
|
|
|
|
//! v1: %res0 = v_mul_f32 1, %a
|
2021-08-27 17:53:48 +01:00
|
|
|
//! p_unit_test 0, %res0
|
2021-08-30 10:30:45 +01:00
|
|
|
Temp one = bld.copy(bld.def(v1), Operand::c32(1));
|
2024-08-21 12:22:50 +02:00
|
|
|
writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_rr(1)));
|
2021-08-27 17:53:48 +01:00
|
|
|
|
2024-08-21 12:22:50 +02:00
|
|
|
//! v1: %res1 = v_mul_f32 %a, %one row_ror:1 bound_ctrl:1 fi
|
2021-08-27 17:53:48 +01:00
|
|
|
//! p_unit_test 1, %res1
|
2024-08-21 12:22:50 +02:00
|
|
|
writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_rr(1)));
|
2021-08-27 17:53:48 +01:00
|
|
|
|
2021-08-30 10:30:45 +01:00
|
|
|
//! v1: %res2 = v_mul_f32 0x12345678, %a
|
2021-08-27 17:53:48 +01:00
|
|
|
//! p_unit_test 2, %res2
|
2021-08-30 10:30:45 +01:00
|
|
|
Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
|
|
|
|
|
writeout(2,
|
2024-08-21 12:22:50 +02:00
|
|
|
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_rr(1)));
|
2021-08-30 10:30:45 +01:00
|
|
|
|
|
|
|
|
//! v1: %literal2 = p_parallelcopy 0x12345679
|
2024-08-21 12:22:50 +02:00
|
|
|
//! v1: %res3 = v_mul_f32 %a, %literal row_ror:1 bound_ctrl:1 fi
|
2021-08-30 10:30:45 +01:00
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
|
|
|
|
|
writeout(3,
|
2024-08-21 12:22:50 +02:00
|
|
|
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_rr(1)));
|
2021-08-30 10:30:45 +01:00
|
|
|
|
|
|
|
|
//! v1: %b_v = p_parallelcopy %b
|
|
|
|
|
//! v1: %res4 = v_mul_f32 %b, %a
|
|
|
|
|
//! p_unit_test 4, %res4
|
2021-08-27 17:53:48 +01:00
|
|
|
Temp b_v = bld.copy(bld.def(v1), inputs[1]);
|
2024-08-21 12:22:50 +02:00
|
|
|
writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_rr(1)));
|
2021-08-30 10:30:45 +01:00
|
|
|
|
2024-08-21 12:22:50 +02:00
|
|
|
//! v1: %res5 = v_mul_f32 %a, %b_v row_ror:1 bound_ctrl:1 fi
|
2021-08-30 10:30:45 +01:00
|
|
|
//! p_unit_test 5, %res5
|
2024-08-21 12:22:50 +02:00
|
|
|
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_rr(1)));
|
2021-08-30 10:30:45 +01:00
|
|
|
|
|
|
|
|
//! v1: %res6 = v_rcp_f32 %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
2024-08-21 12:22:50 +02:00
|
|
|
writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_rr(1)));
|
2021-08-27 17:53:48 +01:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
2022-01-31 18:13:07 +00:00
|
|
|
BEGIN_TEST(optimize.casts)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v2b: %a16:v[1][0:16] = p_startpgm
|
2022-01-31 18:13:07 +00:00
|
|
|
if (!setup_cs("v1 v2b", GFX10_3))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp a16 = inputs[1];
|
|
|
|
|
|
|
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0_tmp = v_mul_f32 -1.0, %a
|
|
|
|
|
//! v2b: %res0 = v_mul_f16 %res0_tmp, %a16
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fmul(u2u16(fneg(a)), a16));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res1_tmp = v_mul_f16 -1.0, %a16
|
|
|
|
|
//! v1: %res1 = v_mul_f32 %res1_tmp, %a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fmul(bld.as_uniform(fneg(a16)), a));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2_tmp = v_mul_f32 -1.0, %a16
|
|
|
|
|
//! v2b: %res2 = v_mul_f16 %res2_tmp, %a16
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1),
|
|
|
|
|
Operand::c32(0xbf800000u), bld.as_uniform(a16))),
|
|
|
|
|
a16));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3_tmp = v_mul_f32 %a, %a
|
2023-02-22 17:31:06 +01:00
|
|
|
//! v2b: %res3 = v_add_f16 %res3_tmp, 0 clamp
|
2022-01-31 18:13:07 +00:00
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, fsat(u2u16(fmul(a, a))));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res4_tmp = v_mul_f16 %a16, %a16
|
2023-02-22 17:31:06 +01:00
|
|
|
//! v1: %res4 = v_add_f32 %res4_tmp, 0 clamp
|
2022-01-31 18:13:07 +00:00
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, fsat(bld.as_uniform(fmul(a16, a16))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5_tmp = v_mul_f32 %a, %a
|
|
|
|
|
//! v2b: %res5 = v_mul_f16 2.0, %res5_tmp
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, fmul(u2u16(fmul(a, a)), bld.copy(bld.def(v2b), Operand::c16(0x4000))));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res6_tmp = v_mul_f16 %a16, %a16
|
|
|
|
|
//! v1: %res6 = v_mul_f32 2.0, %res6_tmp
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6,
|
|
|
|
|
fmul(bld.as_uniform(fmul(a16, a16)), bld.copy(bld.def(v1), Operand::c32(0x40000000))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res7_tmp = v_mul_f32 %a, %a
|
|
|
|
|
//! v2b: %res7 = v_add_f16 %res7_tmp, %a16
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, fadd(u2u16(fmul(a, a)), a16));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res8_tmp = v_mul_f16 %a16, %a16
|
|
|
|
|
//! v1: %res8 = v_add_f32 %res8_tmp, %a
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
writeout(8, fadd(bld.as_uniform(fmul(a16, a16)), a));
|
|
|
|
|
|
|
|
|
|
//! v1: %res9_tmp = v_mul_f32 %a, %a
|
|
|
|
|
//! v2b: %res9 = v_mul_f16 -1.0, %res9_tmp
|
|
|
|
|
//! p_unit_test 9, %res9
|
|
|
|
|
writeout(9, fneg(u2u16(fmul(a, a))));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res10_tmp = v_mul_f16 %a16, %a16
|
|
|
|
|
//! v1: %res10 = v_mul_f32 -1.0, %res10_tmp
|
|
|
|
|
//! p_unit_test 10, %res10
|
|
|
|
|
writeout(10, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u),
|
|
|
|
|
bld.as_uniform(fmul(a16, a16))));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
BEGIN_TEST(optimize.mad_mix.input_conv.basic)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v2b: %a16:v[1][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp a16 = inputs[1];
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res0 = v_fma_mix_f32 %a, lo(%a16), neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fmul(a, f2f32(a16)));
|
|
|
|
|
|
2024-05-03 21:54:38 +02:00
|
|
|
//! v1: %res1 = v_fma_mix_f32 1.0, lo(%a16), %a
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fadd(a, f2f32(a16)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_fma_mix_f32 1.0, lo(%a16), %a
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, fadd(f2f32(a16), a));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3 = v_fma_mix_f32 %a, %a, lo(%a16)
|
|
|
|
|
//! p_unit_test 3, %res3
|
2025-10-31 13:12:34 +01:00
|
|
|
writeout(3, fadd(fmul(a, a), f2f32(a16)));
|
2022-01-27 14:19:21 +00:00
|
|
|
|
2025-10-31 13:12:34 +01:00
|
|
|
//~gfx9! v1: %tmp4 = v_cvt_f32_f16 %a16
|
|
|
|
|
//~gfx9! v1: %res4 = v_fma_f32 %a, %a, %tmp4
|
|
|
|
|
//~gfx10! v1: %res4 = v_fma_mix_f32 %a, %a, lo(%a16)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, fma(a, a, f2f32(a16)));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_mix.input_conv.precision)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v2b: %a16:v[1][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp a16 = inputs[1];
|
|
|
|
|
|
|
|
|
|
/* precise arithmetic */
|
|
|
|
|
//~gfx9! v1: %res0_cvt = v_cvt_f32_f16 %a16
|
|
|
|
|
//~gfx9! v1: (precise)%res0 = v_fma_f32 %a, %a, %res0_cvt
|
|
|
|
|
//~gfx10! v1: (precise)%res0 = v_fma_mix_f32 %a, %a, lo(%a16)
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fma(a, a, f2f32(a16), bld.precise()));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res1_cvt = v_cvt_f16_f32 %a
|
|
|
|
|
//! v2b: (precise)%res1 = v_mul_f16 %a16, %res1_cvt
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fmul(a16, f2f16(a), bld.precise()));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res2_cvt = v_cvt_f16_f32 %a
|
|
|
|
|
//! v2b: (precise)%res2 = v_add_f16 %a16, %res2_cvt
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, fadd(a16, f2f16(a), bld.precise()));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res3_cvt = v_cvt_f16_f32 %a
|
|
|
|
|
//! v2b: (precise)%res3 = v_fma_f16 %a16, %a16, %res3_cvt
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, fma(a16, a16, f2f16(a), bld.precise()));
|
|
|
|
|
|
|
|
|
|
/* precise conversions */
|
|
|
|
|
//! v2b: (precise)%res4_cvt = v_cvt_f16_f32 %a
|
|
|
|
|
//! v2b: %res4 = v_mul_f16 %a16, %res4_cvt
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, fmul(a16, f2f16(a, bld.precise())));
|
|
|
|
|
|
|
|
|
|
//! v2b: (precise)%res5_cvt = v_cvt_f16_f32 %a
|
|
|
|
|
//! v2b: %res5 = v_add_f16 %a16, %res5_cvt
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, fadd(a16, f2f16(a, bld.precise())));
|
|
|
|
|
|
|
|
|
|
//! v2b: (precise)%res6_cvt = v_cvt_f16_f32 %a
|
|
|
|
|
//! v2b: %res6 = v_fma_f16 %a16, %a16, %res6_cvt
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, fma(a16, a16, f2f16(a, bld.precise())));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_mix.input_conv.modifiers)
|
2023-03-26 22:25:02 +02:00
|
|
|
for (unsigned i = GFX9; i <= GFX11; i++) {
|
|
|
|
|
if (i == GFX10_3)
|
|
|
|
|
continue;
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v2b: %a16:v[1][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp a16 = inputs[1];
|
|
|
|
|
|
|
|
|
|
/* check whether modifiers are preserved when converting to VOP3P */
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res0 = v_fma_mix_f32 -%a, lo(%a16), neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fmul(fneg(a), f2f32(a16)));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res1 = v_fma_mix_f32 |%a|, lo(%a16), neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fmul(fabs(a), f2f32(a16)));
|
|
|
|
|
|
|
|
|
|
/* fneg modifiers */
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res2 = v_fma_mix_f32 %a, -lo(%a16), neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, fmul(a, fneg(f2f32(a16))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res3 = v_fma_mix_f32 %a, -lo(%a16), neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, fmul(a, f2f32(fneg(a16))));
|
|
|
|
|
|
|
|
|
|
/* fabs modifiers */
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res4 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, fmul(a, fabs(f2f32(a16))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res5 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, fmul(a, f2f32(fabs(a16))));
|
|
|
|
|
|
|
|
|
|
/* both fabs and fneg modifiers */
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res6 = v_fma_mix_f32 %a, -|lo(%a16)|, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, fmul(a, fneg(f2f32(fabs(a16)))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res7 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, fmul(a, fabs(f2f32(fabs(a16)))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res8 = v_fma_mix_f32 %a, -|lo(%a16)|, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
writeout(8, fmul(a, fneg(fabs(f2f32(fabs(a16))))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res9 = v_fma_mix_f32 %a, -|lo(%a16)|, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 9, %res9
|
|
|
|
|
writeout(9, fmul(a, f2f32(fneg(fabs(a16)))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res10 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 10, %res10
|
|
|
|
|
writeout(10, fmul(a, fneg(f2f32(fneg(fabs(a16))))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res11 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 11, %res11
|
|
|
|
|
writeout(11, fmul(a, fabs(f2f32(fneg(fabs(a16))))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res12 = v_fma_mix_f32 %a, -|lo(%a16)|, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 12, %res12
|
|
|
|
|
writeout(12, fmul(a, fneg(fabs(f2f32(fneg(fabs(a16)))))));
|
|
|
|
|
|
|
|
|
|
/* sdwa */
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res13 = v_fma_mix_f32 lo(%a), %a, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 13, %res13
|
|
|
|
|
writeout(13, fmul(f2f32(ext_ushort(a, 0)), a));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res14 = v_fma_mix_f32 hi(%a), %a, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 14, %res14
|
|
|
|
|
writeout(14, fmul(f2f32(ext_ushort(a, 1)), a));
|
|
|
|
|
|
2023-03-26 22:25:02 +02:00
|
|
|
//~gfx(9|10)! v1: %res15_cvt = v_cvt_f32_f16 %a dst_sel:uword0 src0_sel:dword
|
2024-08-22 10:28:27 +02:00
|
|
|
//~gfx11! v1: %res16_cvt1 = v_fma_mix_f32 lo(%a), 1.0, neg(0)
|
2023-03-26 22:25:02 +02:00
|
|
|
//~gfx11! v1: %res15_cvt = p_extract %res16_cvt1, 0, 16, 0
|
2022-01-27 14:19:21 +00:00
|
|
|
//! v1: %res15 = v_mul_f32 %res15_cvt, %a
|
|
|
|
|
//! p_unit_test 15, %res15
|
|
|
|
|
writeout(15, fmul(ext_ushort(f2f32(a), 0), a));
|
|
|
|
|
|
2023-04-20 15:25:17 +02:00
|
|
|
//~gfx(9|10)! v1: %res16_cvt = v_cvt_f32_f16 %a
|
2023-03-26 22:25:02 +02:00
|
|
|
//~gfx(9|10)! v1: %res16 = v_mul_f32 %res16_cvt, %a dst_sel:dword src0_sel:uword1 src1_sel:dword
|
2024-08-22 10:28:27 +02:00
|
|
|
//~gfx11! v1: %res16_cvt = v_fma_mix_f32 lo(%a), 1.0, neg(0)
|
2023-03-26 22:25:02 +02:00
|
|
|
//~gfx11! v1: %res16_ext = p_extract %res16_cvt, 1, 16, 0
|
|
|
|
|
//~gfx11! v1: %res16 = v_mul_f32 %res16_ext, %a
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 16, %res16
|
|
|
|
|
writeout(16, fmul(ext_ushort(f2f32(a), 1), a));
|
|
|
|
|
|
2023-03-26 22:25:02 +02:00
|
|
|
//~gfx(9|10)! v1: %res17_cvt = v_cvt_f32_f16 %a dst_sel:dword src0_sel:ubyte2
|
|
|
|
|
//~gfx(9|10)! v1: %res17 = v_mul_f32 %res17_cvt, %a
|
|
|
|
|
//~gfx11! v1: %res17_ext = p_extract %a, 2, 8, 0
|
2024-08-22 10:28:27 +02:00
|
|
|
//~gfx11! v1: %res17 = v_fma_mix_f32 lo(%res17_ext), %a, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 17, %res17
|
|
|
|
|
writeout(17, fmul(f2f32(ext_ubyte(a, 2)), a));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_mix.output_conv.basic)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v2b: %a16:v[3][0:16], v2b: %b16:v[4][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
Temp c = inputs[2];
|
|
|
|
|
Temp a16 = inputs[3];
|
|
|
|
|
Temp b16 = inputs[4];
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v2b: %res0 = v_fma_mixlo_f16 %a, %b, neg(lo(0))
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, f2f16(fmul(a, b)));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res1 = v_fma_mixlo_f16 1.0, %a, %b
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, f2f16(fadd(a, b)));
|
|
|
|
|
|
2025-10-31 13:12:34 +01:00
|
|
|
//~gfx9! v1: %tmp2 = v_fma_f32 %a, %b, %c
|
|
|
|
|
//~gfx9! v2b: %res2 = v_cvt_f16_f32 %tmp2
|
|
|
|
|
//~gfx10! v2b: %res2 = v_fma_mixlo_f16 %a, %b, %c
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, f2f16(fma(a, b, c)));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v2b: %res3 = v_fma_mixlo_f16 lo(%a16), %b, neg(lo(0))
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, f2f16(fmul(f2f32(a16), b)));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res4 = v_fma_mixlo_f16 1.0, %a, lo(%b16)
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, f2f16(fadd(a, f2f32(b16))));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res5 = v_fma_mixlo_f16 %a, lo(%b16), %c
|
|
|
|
|
//! p_unit_test 5, %res5
|
2025-10-31 13:12:34 +01:00
|
|
|
writeout(5, f2f16(fadd(fmul(a, f2f32(b16)), c)));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res6 = v_fma_mixlo_f16 %a, %b, %c
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, f2f16(fadd(fmul(a, b), c)));
|
2022-01-27 14:19:21 +00:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_mix.output_conv.precision)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v2b: %a16:v[0][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a16 = inputs[0];
|
|
|
|
|
|
|
|
|
|
//! v2b: %res0_tmp = v_mul_f16 %a16, %a16
|
|
|
|
|
//! v1: (precise)%res0 = v_cvt_f32_f16 %res0_tmp
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, f2f32(fmul(a16, a16), bld.precise()));
|
|
|
|
|
|
|
|
|
|
//! v2b: (precise)%res1_tmp = v_mul_f16 %a16, %a16
|
|
|
|
|
//! v1: %res1 = v_cvt_f32_f16 %res1_tmp
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, f2f32(fmul(a16, a16, bld.precise())));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_mix.output_conv.modifiers)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v2b: %a16:v[2][0:16], v2b: %b16:v[3][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v1 v2b v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
Temp a16 = inputs[2];
|
|
|
|
|
Temp b16 = inputs[3];
|
|
|
|
|
|
|
|
|
|
/* fneg/fabs */
|
|
|
|
|
//! v1: %res0_add = v_add_f32 %1, %2
|
|
|
|
|
//! v2b: %res0 = v_cvt_f16_f32 |%res0_add|
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, f2f16(fabs(fadd(a, b))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1_add = v_add_f32 %1, %2
|
|
|
|
|
//! v2b: %res1 = v_cvt_f16_f32 -%res1_add
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, f2f16(fneg(fadd(a, b))));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res2_add = v_add_f16 %3, %4
|
|
|
|
|
//! v1: %res2 = v_cvt_f32_f16 |%res2_add|
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, f2f32(fabs(fadd(a16, b16))));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res3_add = v_add_f16 %3, %4
|
|
|
|
|
//! v1: %res3 = v_cvt_f32_f16 -%res3_add
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, f2f32(fneg(fadd(a16, b16))));
|
|
|
|
|
|
|
|
|
|
/* sdwa */
|
|
|
|
|
//! v2b: %res4_add = v_fma_mixlo_f16 1.0, %a, %b
|
|
|
|
|
//! v2b: %res4 = p_extract %res4_add, 0, 8, 0
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, ext_ubyte(f2f16(fadd(a, b)), 0));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5_mul = v_add_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword
|
|
|
|
|
//! v2b: %res5 = v_cvt_f16_f32 %res5_mul
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, f2f16(ext_ushort(fadd(a, b), 0)));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_mix.fma.basic)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v2b: %a16:v[3][0:16], v2b: %c16:v[4][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
Temp c = inputs[2];
|
|
|
|
|
Temp a16 = inputs[3];
|
|
|
|
|
Temp c16 = inputs[4];
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fadd(fmul(f2f32(a16), b), c));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_fma_mix_f32 %a, %b, lo(%c16)
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fadd(fmul(a, b), f2f32(c16)));
|
|
|
|
|
|
|
|
|
|
/* omod/clamp check */
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res2_mul = v_fma_mix_f32 lo(%a16), %b, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! v1: %res2 = v_add_f32 %res2_mul, %c *2
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000),
|
|
|
|
|
fadd(fmul(f2f32(a16), b), c)));
|
|
|
|
|
|
|
|
|
|
/* neg/abs modifiers */
|
|
|
|
|
//! v1: %res3 = v_fma_mix_f32 -lo(%a16), %b, |lo(%c16)|
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, fadd(fmul(fneg(f2f32(a16)), b), fabs(f2f32(c16))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_fma_mix_f32 |%a|, |%b|, lo(%c16)
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, fadd(fabs(fmul(fneg(a), fneg(b))), f2f32(c16)));
|
|
|
|
|
|
aco/optimizer: use new helpers to create fma
Foz-DB Navi48:
Totals from 25949 (31.48% of 82419) affected shaders:
Instrs: 30904250 -> 30904153 (-0.00%); split: -0.00%, +0.00%
CodeSize: 164623100 -> 164604652 (-0.01%); split: -0.01%, +0.00%
Latency: 209402611 -> 209402684 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 36622293 -> 36622236 (-0.00%); split: -0.00%, +0.00%
Copies: 2252080 -> 2251998 (-0.00%); split: -0.00%, +0.00%
VALU: 16831507 -> 16831382 (-0.00%); split: -0.00%, +0.00%
VOPD: 28252 -> 28295 (+0.15%)
Foz-DB Navi21:
Totals from 56269 (68.30% of 82387) affected shaders:
Instrs: 43751754 -> 43746463 (-0.01%); split: -0.01%, +0.00%
CodeSize: 233615096 -> 233576912 (-0.02%); split: -0.02%, +0.00%
VGPRs: 2445528 -> 2445520 (-0.00%)
Latency: 276776920 -> 276761183 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 66406450 -> 66402214 (-0.01%); split: -0.01%, +0.00%
VClause: 902951 -> 902947 (-0.00%)
Copies: 3926260 -> 3926289 (+0.00%); split: -0.01%, +0.01%
VALU: 26924056 -> 26918783 (-0.02%); split: -0.02%, +0.00%
SALU: 6938335 -> 6938321 (-0.00%); split: -0.00%, +0.00%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38150>
2025-03-27 15:57:10 +01:00
|
|
|
//! v1: %res5 = v_fma_mix_f32 -%a, %b, lo(%c16)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, fadd(fneg(fmul(a, b)), f2f32(c16)));
|
|
|
|
|
|
aco/optimizer: use new helpers to create fma
Foz-DB Navi48:
Totals from 25949 (31.48% of 82419) affected shaders:
Instrs: 30904250 -> 30904153 (-0.00%); split: -0.00%, +0.00%
CodeSize: 164623100 -> 164604652 (-0.01%); split: -0.01%, +0.00%
Latency: 209402611 -> 209402684 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 36622293 -> 36622236 (-0.00%); split: -0.00%, +0.00%
Copies: 2252080 -> 2251998 (-0.00%); split: -0.00%, +0.00%
VALU: 16831507 -> 16831382 (-0.00%); split: -0.00%, +0.00%
VOPD: 28252 -> 28295 (+0.15%)
Foz-DB Navi21:
Totals from 56269 (68.30% of 82387) affected shaders:
Instrs: 43751754 -> 43746463 (-0.01%); split: -0.01%, +0.00%
CodeSize: 233615096 -> 233576912 (-0.02%); split: -0.02%, +0.00%
VGPRs: 2445528 -> 2445520 (-0.00%)
Latency: 276776920 -> 276761183 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 66406450 -> 66402214 (-0.01%); split: -0.01%, +0.00%
VClause: 902951 -> 902947 (-0.00%)
Copies: 3926260 -> 3926289 (+0.00%); split: -0.01%, +0.01%
VALU: 26924056 -> 26918783 (-0.02%); split: -0.02%, +0.00%
SALU: 6938335 -> 6938321 (-0.00%); split: -0.00%, +0.00%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38150>
2025-03-27 15:57:10 +01:00
|
|
|
//! v1: %res6 = v_fma_mix_f32 -|%a|, |%b|, lo(%c16)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, fadd(fneg(fabs(fmul(fneg(a), fneg(b)))), f2f32(c16)));
|
|
|
|
|
|
|
|
|
|
/* output conversions */
|
|
|
|
|
//! v2b: %res7 = v_fma_mixlo_f16 %a, %b, %c
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, f2f16(fadd(fmul(a, b), c)));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_mix.fma.precision)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v2b: %a16:v[3][0:16], v2b: %c16:v[4][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
Temp c = inputs[2];
|
|
|
|
|
Temp a16 = inputs[3];
|
|
|
|
|
Temp b16 = inputs[4];
|
|
|
|
|
|
|
|
|
|
/* the optimization is precise for 32-bit on GFX9 */
|
2023-03-01 15:31:23 +01:00
|
|
|
//~gfx9! v1: (precise)%res0 = v_fma_mix_f32 lo(%a16), %b, %c
|
2024-08-22 10:28:27 +02:00
|
|
|
//~gfx10! v1: (precise)%res0_tmp = v_fma_mix_f32 lo(%a16), %b, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//~gfx10! v1: %res0 = v_add_f32 %res0_tmp, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fadd(fmul(f2f32(a16), b, bld.precise()), c));
|
|
|
|
|
|
|
|
|
|
//~gfx9! v1: (precise)%res1 = v_fma_mix_f32 lo(%a16), %b, %c
|
2024-08-22 10:28:27 +02:00
|
|
|
//~gfx10! v1: %res1_tmp = v_fma_mix_f32 lo(%a16), %b, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//~gfx10! v1: (precise)%res1 = v_add_f32 %res1_tmp, %c
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fadd(fmul(f2f32(a16), b), c, bld.precise()));
|
|
|
|
|
|
|
|
|
|
/* never promote 16-bit arithmetic to 32-bit */
|
|
|
|
|
//! v2b: %res2_tmp = v_cvt_f16_f32 %a
|
|
|
|
|
//! v2b: %res2 = v_add_f16 %res2_tmp, %b16
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, fadd(f2f16(a), b16));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res3_tmp = v_cvt_f16_f32 %a
|
|
|
|
|
//! v2b: %res3 = v_mul_f16 %res3_tmp, %b16
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, fmul(f2f16(a), b16));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res4_tmp = v_mul_f16 %a16, %b16
|
|
|
|
|
//! v1: %res4 = v_cvt_f32_f16 %res4_tmp
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, f2f32(fmul(a16, b16)));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res5_tmp = v_add_f16 %a16, %b16
|
|
|
|
|
//! v1: %res5 = v_cvt_f32_f16 %res5_tmp
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, f2f32(fadd(a16, b16)));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v2b: %res6_tmp = v_fma_mixlo_f16 %a, %b, neg(lo(0))
|
2022-01-27 14:19:21 +00:00
|
|
|
//! v2b: %res6 = v_add_f16 %res6_tmp, %a16
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, fadd(f2f16(fmul(a, b)), a16));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res7_tmp = v_mul_f16 %a16, %b16
|
|
|
|
|
//! v1: %res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, fadd(f2f32(fmul(a16, b16)), c));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_mix.clamp)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v2b: %b:v[1][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp a16 = inputs[1];
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res0 = v_fma_mix_f32 lo(%a16), %a, neg(0) clamp
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fsat(fmul(f2f32(a16), a)));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v2b: %res1 = v_fma_mixlo_f16 %a, %a, neg(lo(0)) clamp
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, f2f16(fsat(fmul(a, a))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v2b: %res2 = v_fma_mixlo_f16 %a, %a, neg(lo(0)) clamp
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, fsat(f2f16(fmul(a, a))));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
/* optimize.mad_mix.cast: on GFX9 and GFX10, with a 32-bit input (a), a 16-bit
 * input (a16) and denorm16_64 set to flush in block 0, checks how float
 * conversions combine with multiplies/adds when a u2u16 or p_as_uniform cast
 * sits in between. Cases 0-3 expect a separate convert and multiply, cases 4-5
 * expect the clamp to stay on a separate v_add, cases 6-7 expect the multiply to
 * stay separate while the convert and add become v_fma_mix_f32, and case 8
 * expects a single v_fma_mix_f32.
 * The p_startpgm pattern names the second input %a16 (not %b) so that the %a16
 * used by every expectation below refers to inputs[1].
 */
BEGIN_TEST(optimize.mad_mix.cast)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v2b: %a16:v[1][0:16] = p_startpgm
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!setup_cs("v1 v2b", (amd_gfx_level)i))
|
2022-01-27 14:19:21 +00:00
|
|
|
continue;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
2022-01-27 14:19:21 +00:00
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp a16 = inputs[1];
|
|
|
|
|
|
|
|
|
|
/* The optimizer copy-propagates v2b=p_extract_vector(v1, 0) and p_as_uniform, so the
|
|
|
|
|
* optimizer has to check compatibility.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
//! v1: %res0_cvt = v_cvt_f32_f16 %a16
|
|
|
|
|
//! v2b: %res0 = v_mul_f16 %res0_cvt, %a16
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fmul(u2u16(f2f32(a16)), a16));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res1_cvt = v_cvt_f16_f32 %a
|
|
|
|
|
//! v1: %res1 = v_mul_f32 %res1_cvt, %a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fmul(bld.as_uniform(f2f16(a)), a));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res2_mul = v_mul_f16 %a16, %a16
|
|
|
|
|
//! v2b: %res2 = v_cvt_f16_f32 %res2_mul
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, f2f16(bld.as_uniform(fmul(a16, a16))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3_mul = v_mul_f32 %a, %a
|
|
|
|
|
//! v1: %res3 = v_cvt_f32_f16 %res3_mul
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, f2f32(u2u16(fmul(a, a))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, neg(0)
|
2023-02-22 17:31:06 +01:00
|
|
|
//! v2b: %res4 = v_add_f16 %res4_mul, 0 clamp
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, fsat(u2u16(fmul(f2f32(a16), a))));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, neg(lo(0))
|
2023-02-22 17:31:06 +01:00
|
|
|
//! v1: %res5 = v_add_f32 %res5_mul, 0 clamp
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a)))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res6_mul = v_mul_f32 %a, %a
|
|
|
|
|
//! v1: %res6 = v_fma_mix_f32 1.0, lo(%res6_mul), %a
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, fadd(f2f32(u2u16(fmul(a, a))), a));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res7_mul = v_mul_f16 %a16, %a16
|
2024-05-03 21:54:38 +02:00
|
|
|
//! v1: %res7 = v_fma_mix_f32 1.0, lo(%a16), %res7_mul
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, fadd(bld.as_uniform(fmul(a16, a16)), f2f32(a16)));
|
|
|
|
|
|
|
|
|
|
/* opsel_hi should be obtained from the original opcode, not the operand regclass */
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res8 = v_fma_mix_f32 lo(%a16), %a16, neg(0)
|
2022-01-27 14:19:21 +00:00
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
writeout(8, fmul(f2f32(a16), a16));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
|
|
|
|
|
2022-05-02 18:21:56 +01:00
|
|
|
/* Emits one v1 VOP3P instruction `op` whose first operand is a copy of the
 * 32-bit constant `val` and whose second operand is inputs[0], and writes the
 * result out as unit test number *idx; *idx is incremented afterwards.
 *
 * `swizzle` is a two-character string of 'x'/'y'. Before emitting, the value the
 * swizzled constant should have is printed to `output` as
 * "Expected for <idx>: 0x<hex> / <decimal>".
 */
static void
|
|
|
|
|
vop3p_constant(unsigned* idx, aco_opcode op, const char* swizzle, uint32_t val)
|
|
|
|
|
{
|
|
|
|
|
/* halves[0] is the low 16 bits of val, halves[1] the high 16 bits. */
uint32_t halves[2] = {val & 0xffff, val >> 16};
|
|
|
|
|
/* 'x' selects halves[0] and 'y' selects halves[1]; swizzle[0] fills the low
 * 16 bits of the expected value and swizzle[1] the high 16 bits.
 */
uint32_t expected = halves[swizzle[0] - 'x'] | (halves[swizzle[1] - 'x'] << 16);
|
|
|
|
|
fprintf(output, "Expected for %u: 0x%.8x / %u\n", *idx, expected, expected);
|
|
|
|
|
|
|
|
|
|
/* Bit 0 of each mask follows the swizzle character for the constant operand. */
unsigned opsel_lo = swizzle[0] == 'x' ? 0x0 : 0x1;
|
|
|
|
|
/* Bit 1 is always set in opsel_hi; presumably this keeps the default
 * high-half selection for the second operand (inputs[0]) -- confirm against
 * bld.vop3p.
 */
unsigned opsel_hi = swizzle[1] == 'x' ? 0x2 : 0x3;
|
|
|
|
|
writeout((*idx)++, bld.vop3p(op, bld.def(v1), bld.copy(bld.def(v1), Operand::c32(val)),
|
|
|
|
|
inputs[0], opsel_lo, opsel_hi));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* optimize.vop3p_constants: for v_pk_add_f16 and v_pk_add_u16 and each swizzle
 * ("xx", "yy", "xy", "yx"), feeds 36 constants per variant through
 * vop3p_constant() on GFX10_3: four 16-bit patterns in seven packings each, plus
 * four 32-bit constants in both half orders. The embedded Python check script
 * decodes the constant operand found in the optimized instruction (applying its
 * neg / sign-flip and swizzle suffixes), compares it with the value printed by
 * vop3p_constant(), and checks that every hex literal is in the allowed list.
 */
BEGIN_TEST(optimize.vop3p_constants)
|
|
|
|
|
for (aco_opcode op : {aco_opcode::v_pk_add_f16, aco_opcode::v_pk_add_u16}) {
|
|
|
|
|
for (const char* swizzle : {"xx", "yy", "xy", "yx"}) {
|
|
|
|
|
char variant[16];
|
|
|
|
|
strcpy(variant, op == aco_opcode::v_pk_add_f16 ? "_f16" : "_u16");
|
|
|
|
|
strcat(variant, "_");
|
|
|
|
|
strcat(variant, swizzle);
|
|
|
|
|
|
|
|
|
|
//; for i in range(36):
|
|
|
|
|
//; insert_pattern('Expected for %u: $_ / #expected%u' % (i, i))
|
|
|
|
|
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0] = p_startpgm
|
2022-05-02 18:21:56 +01:00
|
|
|
if (!setup_cs("v1", GFX10_3, CHIP_UNKNOWN, variant))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//; opcode = 'v_pk_add_u16' if 'u16' in variant else 'v_pk_add_f16'
|
|
|
|
|
//; for i in range(36):
|
|
|
|
|
//; insert_pattern('v1: %%res%u = %s $got%u %%a' % (i, opcode, i))
|
|
|
|
|
//; insert_pattern('p_unit_test %u, %%res%u' % (i, i))
|
|
|
|
|
//! s_endpgm
|
|
|
|
|
|
|
|
|
|
//; def parse_op(op):
|
|
|
|
|
//; is_int = opcode == 'v_pk_add_u16'
|
|
|
|
|
//; op = op.rstrip(',')
|
|
|
|
|
//;
|
|
|
|
|
//; mods = lambda v: v
|
|
|
|
|
//; if op.endswith('*[1,-1]'):
|
|
|
|
|
//; mods = lambda v: v ^ 0x80000000
|
|
|
|
|
//; assert(not is_int)
|
|
|
|
|
//; elif op.endswith('*[-1,1]'):
|
|
|
|
|
//; mods = lambda v: v ^ 0x00008000
|
|
|
|
|
//; assert(not is_int)
|
2024-08-22 10:34:48 +02:00
|
|
|
//; elif op.startswith('neg('):
|
|
|
|
|
//; mods = lambda v: v ^ 0x80008000
|
|
|
|
|
//; assert(not is_int)
|
|
|
|
|
//; op = op[4:-1]
|
2022-05-02 18:21:56 +01:00
|
|
|
//; op = op.split('*')[0]
|
|
|
|
|
//;
|
|
|
|
|
//; swizzle = lambda v: v
|
|
|
|
|
//; if op.endswith('.xx'):
|
|
|
|
|
//; swizzle = lambda v: ((v & 0xffff) | (v << 16)) & 0xffffffff;
|
|
|
|
|
//; elif op.endswith('.yy'):
|
|
|
|
|
//; swizzle = lambda v: (v >> 16) | (v & 0xffff0000);
|
|
|
|
|
//; elif op.endswith('.yx'):
|
|
|
|
|
//; swizzle = lambda v: ((v >> 16) | (v << 16)) & 0xffffffff;
|
|
|
|
|
//; op = op.rstrip('xy.')
|
|
|
|
|
//;
|
|
|
|
|
//; val = None
|
|
|
|
|
//; if op.startswith('0x'):
|
|
|
|
|
//; val = int(op[2:], 16)
|
|
|
|
|
//; elif op == '-1.0':
|
|
|
|
|
//; val = 0xbf800000 if is_int else 0xbC00
|
|
|
|
|
//; elif op == '1.0':
|
|
|
|
|
//; val = 0x3f800000 if is_int else 0x3c00
|
|
|
|
|
//; else:
|
|
|
|
|
//; val = int(op) & 0xffffffff
|
|
|
|
|
//;
|
|
|
|
|
//; return mods(swizzle(val))
|
|
|
|
|
|
|
|
|
|
//; # Check correctness
|
|
|
|
|
//; for i in range(36):
|
|
|
|
|
//; expected = globals()['expected%u' % i]
|
|
|
|
|
//; got = globals()['got%u' % i]
|
|
|
|
|
//; got_parsed = parse_op(got)
|
|
|
|
|
//; if got_parsed != expected:
|
|
|
|
|
//; raise Exception('Check %u failed: expected 0x%.8x, got 0x%.8x ("%s")' % (i, expected, got_parsed, got))
|
|
|
|
|
|
|
|
|
|
//; # Check that all literals are ones that cannot be encoded as inline constants
|
|
|
|
|
//; allowed_literals = [0x00004242, 0x0000fffe, 0x00308030, 0x0030ffff, 0x3c00ffff,
|
|
|
|
|
//; 0x42420000, 0x42424242, 0x4242c242, 0x4242ffff, 0x7ffefffe,
|
|
|
|
|
//; 0x80300030, 0xbeefdead, 0xc2424242, 0xdeadbeef, 0xfffe0000,
|
2024-08-22 11:08:49 +02:00
|
|
|
//; 0xfffe7ffe, 0xffff0030, 0xffff3c00, 0xffff4242, 0xc242c242,
|
2024-12-17 17:23:10 +01:00
|
|
|
//; 0x80308030, 0xdeaddead, 0xbeefbeef, 0x7ffe7ffe, 0x0000c242,
|
|
|
|
|
//; 0x0000beef, 0x0000dead, ]
|
2022-05-02 18:21:56 +01:00
|
|
|
//; if opcode == 'v_pk_add_u16':
|
2024-12-17 17:23:10 +01:00
|
|
|
//; allowed_literals.extend([0x00003c00, 0x3c000000, 0x3c003c00, 0x3c00bc00,
|
|
|
|
|
//; 0xbc003c00, 0xbc00bc00, 0x0000bc00, 0x00008030,
|
|
|
|
|
//; 0x00007ffe])
|
2022-05-02 18:21:56 +01:00
|
|
|
//; else:
|
2024-08-22 11:08:49 +02:00
|
|
|
//; allowed_literals.extend([0x00003f80, 0x3f800000, 0x3f803f80])
|
2022-05-02 18:21:56 +01:00
|
|
|
//;
|
|
|
|
|
//; for i in range(36):
|
2024-08-22 10:34:48 +02:00
|
|
|
//; got = globals()['got%u' % i].removeprefix('neg(')
|
2022-05-02 18:21:56 +01:00
|
|
|
//; if not got.startswith('0x'):
|
|
|
|
|
//; continue;
|
2024-08-22 10:34:48 +02:00
|
|
|
//; got = int(got[2:].rstrip(',)').split('*')[0].split('.')[0], 16)
|
2022-05-02 18:21:56 +01:00
|
|
|
//; if got not in allowed_literals:
|
|
|
|
|
//; raise Exception('Literal check %u failed: 0x%.8x not in allowed literals' % (i, got))
|
|
|
|
|
|
|
|
|
|
unsigned idx = 0;
|
|
|
|
|
for (uint32_t constant : {0x3C00, 0x0030, 0xfffe, 0x4242}) {
|
|
|
|
|
vop3p_constant(&idx, op, swizzle, constant);
|
|
|
|
|
vop3p_constant(&idx, op, swizzle, constant | 0xffff0000);
|
|
|
|
|
vop3p_constant(&idx, op, swizzle, constant | (constant << 16));
|
|
|
|
|
vop3p_constant(&idx, op, swizzle, constant << 16);
|
|
|
|
|
vop3p_constant(&idx, op, swizzle, (constant << 16) | 0x0000ffff);
|
|
|
|
|
vop3p_constant(&idx, op, swizzle, constant | ((constant ^ 0x8000) << 16));
|
|
|
|
|
vop3p_constant(&idx, op, swizzle, (constant ^ 0x8000) | (constant << 16));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (uint32_t constant : {0x3f800000u, 0xfffffffeu, 0x00000030u, 0xdeadbeefu}) {
|
|
|
|
|
uint32_t lo = constant & 0xffff;
|
|
|
|
|
uint32_t hi = constant >> 16;
|
|
|
|
|
vop3p_constant(&idx, op, swizzle, constant);
|
|
|
|
|
vop3p_constant(&idx, op, swizzle, hi | (lo << 16));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2023-03-01 17:57:07 +00:00
|
|
|
|
|
|
|
|
/* optimize.fmamix_two_literals: on GFX10 and GFX10_3, checks when an FMA (or a
 * separate mul+add) with two 32-bit float literals becomes v_fma_mix_f32 with
 * both values packed as the fp16 halves of one literal (e.g. 1.5 and 3.0 ->
 * 0x42003e00), and the cases where that must not happen. A second block (BB1)
 * is created after setting next_fp_mode.denorm16_64 to flush, for the case where
 * a literal would be flushed as an fp16 denormal.
 */
BEGIN_TEST(optimize.fmamix_two_literals)
|
|
|
|
|
/* This test has to recreate literals sometimes because we don't combine them at all if there's
|
|
|
|
|
* at least one uncombined use.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = GFX10; i <= GFX10_3; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1] = p_startpgm
|
2023-03-01 17:57:07 +00:00
|
|
|
if (!setup_cs("v1 v1", (amd_gfx_level)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
|
|
|
|
|
Temp c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
|
|
|
|
|
Temp c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
|
|
|
|
|
Temp c_denorm = bld.copy(bld.def(v1), Operand::c32(0x387fc000));
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_fma_mix_f32 %a, lo(0x42003e00), hi(0x42003e00)
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fma(a, c15, c30));
|
|
|
|
|
|
|
|
|
|
/* No need to use v_fma_mix_f32. */
|
|
|
|
|
//! v1: %res1 = v_fmaak_f32 %a, %b, 0x40400000
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fma(a, b, c30));
|
|
|
|
|
|
|
|
|
|
/* Separate mul/add can become v_fma_mix_f32 if it's not precise. */
|
|
|
|
|
//! v1: %res2 = v_fma_mix_f32 %a, lo(0x42003e00), hi(0x42003e00)
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, fadd(fmul(a, c15), c30));
|
|
|
|
|
|
|
|
|
|
//~gfx10! v1: %c15 = p_parallelcopy 0x3fc00000
|
|
|
|
|
c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
|
|
|
|
|
c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
|
|
|
|
|
|
|
|
|
|
/* v_fma_mix_f32 is a fused mul/add, so it can't be used for precise separate mul/add. */
|
|
|
|
|
//~gfx10! v1: (precise)%res3 = v_madak_f32 %a, %c15, 0x40400000
|
aco/optimizer: use new helpers to apply literals
Foz-DB Navi21:
Totals from 21009 (26.33% of 79789) affected shaders:
MaxWaves: 495342 -> 495414 (+0.01%)
Instrs: 22345587 -> 22335371 (-0.05%); split: -0.05%, +0.00%
CodeSize: 122095820 -> 121795112 (-0.25%); split: -0.25%, +0.00%
VGPRs: 1025800 -> 1025480 (-0.03%)
Latency: 202876235 -> 203076272 (+0.10%); split: -0.04%, +0.14%
InvThroughput: 47599930 -> 47596113 (-0.01%); split: -0.03%, +0.02%
VClause: 475271 -> 475439 (+0.04%); split: -0.02%, +0.05%
SClause: 700679 -> 700629 (-0.01%); split: -0.01%, +0.01%
Copies: 1628498 -> 1618165 (-0.63%); split: -0.64%, +0.01%
Branches: 567199 -> 567216 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 952134 -> 952043 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 846614 -> 846272 (-0.04%)
VALU: 15572374 -> 15564050 (-0.05%); split: -0.05%, +0.00%
SALU: 2423329 -> 2421319 (-0.08%); split: -0.08%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Foz-DB Navi31:
Totals from 13049 (16.44% of 79395) affected shaders:
MaxWaves: 357242 -> 357268 (+0.01%)
Instrs: 19955572 -> 19944106 (-0.06%); split: -0.06%, +0.00%
CodeSize: 105689464 -> 105454348 (-0.22%); split: -0.23%, +0.00%
VGPRs: 765744 -> 764952 (-0.10%); split: -0.11%, +0.00%
Latency: 179063640 -> 179141591 (+0.04%); split: -0.02%, +0.07%
InvThroughput: 27978134 -> 27971318 (-0.02%); split: -0.03%, +0.01%
VClause: 386791 -> 386826 (+0.01%); split: -0.02%, +0.03%
SClause: 598113 -> 598106 (-0.00%); split: -0.01%, +0.01%
Copies: 1393111 -> 1383102 (-0.72%); split: -0.73%, +0.01%
Branches: 498533 -> 498535 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 573310 -> 573236 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 591459 -> 591043 (-0.07%)
VALU: 11623734 -> 11615755 (-0.07%); split: -0.07%, +0.00%
SALU: 1962055 -> 1960005 (-0.10%); split: -0.11%, +0.00%
VOPD: 3544 -> 3566 (+0.62%); split: +0.73%, -0.11%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35272>
2024-08-22 22:13:54 +02:00
|
|
|
//~gfx10_3! v1: (precise)%res3_tmp = v_mul_f32 0x3fc00000, %a
|
|
|
|
|
//~gfx10_3! v1: %res3 = v_add_f32 0x40400000, %res3_tmp
|
2023-03-01 17:57:07 +00:00
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, fadd(bld.precise().vop2(aco_opcode::v_mul_f32, bld.def(v1), a, c15), c30));
|
|
|
|
|
|
|
|
|
|
//~gfx10! v1: (precise)%res4 = v_madak_f32 %1, %c16, 0x40400000
|
aco/optimizer: use new helpers to apply literals
Foz-DB Navi21:
Totals from 21009 (26.33% of 79789) affected shaders:
MaxWaves: 495342 -> 495414 (+0.01%)
Instrs: 22345587 -> 22335371 (-0.05%); split: -0.05%, +0.00%
CodeSize: 122095820 -> 121795112 (-0.25%); split: -0.25%, +0.00%
VGPRs: 1025800 -> 1025480 (-0.03%)
Latency: 202876235 -> 203076272 (+0.10%); split: -0.04%, +0.14%
InvThroughput: 47599930 -> 47596113 (-0.01%); split: -0.03%, +0.02%
VClause: 475271 -> 475439 (+0.04%); split: -0.02%, +0.05%
SClause: 700679 -> 700629 (-0.01%); split: -0.01%, +0.01%
Copies: 1628498 -> 1618165 (-0.63%); split: -0.64%, +0.01%
Branches: 567199 -> 567216 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 952134 -> 952043 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 846614 -> 846272 (-0.04%)
VALU: 15572374 -> 15564050 (-0.05%); split: -0.05%, +0.00%
SALU: 2423329 -> 2421319 (-0.08%); split: -0.08%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Foz-DB Navi31:
Totals from 13049 (16.44% of 79395) affected shaders:
MaxWaves: 357242 -> 357268 (+0.01%)
Instrs: 19955572 -> 19944106 (-0.06%); split: -0.06%, +0.00%
CodeSize: 105689464 -> 105454348 (-0.22%); split: -0.23%, +0.00%
VGPRs: 765744 -> 764952 (-0.10%); split: -0.11%, +0.00%
Latency: 179063640 -> 179141591 (+0.04%); split: -0.02%, +0.07%
InvThroughput: 27978134 -> 27971318 (-0.02%); split: -0.03%, +0.01%
VClause: 386791 -> 386826 (+0.01%); split: -0.02%, +0.03%
SClause: 598113 -> 598106 (-0.00%); split: -0.01%, +0.01%
Copies: 1393111 -> 1383102 (-0.72%); split: -0.73%, +0.01%
Branches: 498533 -> 498535 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 573310 -> 573236 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 591459 -> 591043 (-0.07%)
VALU: 11623734 -> 11615755 (-0.07%); split: -0.07%, +0.00%
SALU: 1962055 -> 1960005 (-0.10%); split: -0.11%, +0.00%
VOPD: 3544 -> 3566 (+0.62%); split: +0.73%, -0.11%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35272>
2024-08-22 22:13:54 +02:00
|
|
|
//~gfx10_3! v1: %res4_tmp = v_mul_f32 0x3fc00000, %a
|
|
|
|
|
//~gfx10_3! v1: (precise)%res4 = v_add_f32 0x40400000, %res4_tmp
|
2023-03-01 17:57:07 +00:00
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, bld.precise().vop2(aco_opcode::v_add_f32, bld.def(v1), fmul(a, c15), c30));
|
|
|
|
|
|
|
|
|
|
/* Can't convert to fp16 if it will be flushed as a denormal. */
|
|
|
|
|
//! v1: %res5 = v_fma_mix_f32 %1, lo(0x3ff3e00), hi(0x3ff3e00)
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
|
|
|
|
|
writeout(5, fma(a, c15, c_denorm));
|
|
|
|
|
|
|
|
|
|
//>> BB1
|
2024-07-30 17:08:19 +01:00
|
|
|
//! /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
|
2023-03-01 17:57:07 +00:00
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
2024-07-30 17:08:19 +01:00
|
|
|
program->blocks[0].linear_succs.push_back(1);
|
|
|
|
|
program->blocks[0].logical_succs.push_back(1);
|
|
|
|
|
program->blocks[1].linear_preds.push_back(0);
|
|
|
|
|
program->blocks[1].logical_preds.push_back(0);
|
2023-03-01 17:57:07 +00:00
|
|
|
|
|
|
|
|
//~gfx10; del c15
|
|
|
|
|
//! v1: %c15 = p_parallelcopy 0x3fc00000
|
|
|
|
|
//! v1: %res6 = v_fmaak_f32 %a, %c15, 0x387fc000
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
|
|
|
|
|
c_denorm = bld.copy(bld.def(v1), Operand::c32(0x387fc000));
|
|
|
|
|
writeout(6, fma(a, c15, c_denorm));
|
|
|
|
|
|
|
|
|
|
/* Can't accept more than two unique fp16 literals. */
|
|
|
|
|
//! v1: %c45 = p_parallelcopy 0x40900000
|
|
|
|
|
//! v1: %res7 = v_fma_mix_f32 lo(0x42003e00), hi(0x42003e00), %c45
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
Temp c45 = bld.copy(bld.def(v1), Operand::c32(fui(4.5f)));
|
|
|
|
|
writeout(7, fma(c15, c30, c45));
|
|
|
|
|
|
|
|
|
|
/* Modifiers must be preserved. */
|
aco/optimizer: use new helpers to apply literals
Foz-DB Navi21:
Totals from 21009 (26.33% of 79789) affected shaders:
MaxWaves: 495342 -> 495414 (+0.01%)
Instrs: 22345587 -> 22335371 (-0.05%); split: -0.05%, +0.00%
CodeSize: 122095820 -> 121795112 (-0.25%); split: -0.25%, +0.00%
VGPRs: 1025800 -> 1025480 (-0.03%)
Latency: 202876235 -> 203076272 (+0.10%); split: -0.04%, +0.14%
InvThroughput: 47599930 -> 47596113 (-0.01%); split: -0.03%, +0.02%
VClause: 475271 -> 475439 (+0.04%); split: -0.02%, +0.05%
SClause: 700679 -> 700629 (-0.01%); split: -0.01%, +0.01%
Copies: 1628498 -> 1618165 (-0.63%); split: -0.64%, +0.01%
Branches: 567199 -> 567216 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 952134 -> 952043 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 846614 -> 846272 (-0.04%)
VALU: 15572374 -> 15564050 (-0.05%); split: -0.05%, +0.00%
SALU: 2423329 -> 2421319 (-0.08%); split: -0.08%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Foz-DB Navi31:
Totals from 13049 (16.44% of 79395) affected shaders:
MaxWaves: 357242 -> 357268 (+0.01%)
Instrs: 19955572 -> 19944106 (-0.06%); split: -0.06%, +0.00%
CodeSize: 105689464 -> 105454348 (-0.22%); split: -0.23%, +0.00%
VGPRs: 765744 -> 764952 (-0.10%); split: -0.11%, +0.00%
Latency: 179063640 -> 179141591 (+0.04%); split: -0.02%, +0.07%
InvThroughput: 27978134 -> 27971318 (-0.02%); split: -0.03%, +0.01%
VClause: 386791 -> 386826 (+0.01%); split: -0.02%, +0.03%
SClause: 598113 -> 598106 (-0.00%); split: -0.01%, +0.01%
Copies: 1393111 -> 1383102 (-0.72%); split: -0.73%, +0.01%
Branches: 498533 -> 498535 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 573310 -> 573236 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 591459 -> 591043 (-0.07%)
VALU: 11623734 -> 11615755 (-0.07%); split: -0.07%, +0.00%
SALU: 1962055 -> 1960005 (-0.10%); split: -0.11%, +0.00%
VOPD: 3544 -> 3566 (+0.62%); split: +0.73%, -0.11%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35272>
2024-08-22 22:13:54 +02:00
|
|
|
//! v1: %res8 = v_fma_mix_f32 |%a|, lo(0x44804200), hi(0x44804200)
|
2023-03-01 17:57:07 +00:00
|
|
|
//! p_unit_test 8, %res8
|
aco/optimizer: use new helpers to apply literals
Foz-DB Navi21:
Totals from 21009 (26.33% of 79789) affected shaders:
MaxWaves: 495342 -> 495414 (+0.01%)
Instrs: 22345587 -> 22335371 (-0.05%); split: -0.05%, +0.00%
CodeSize: 122095820 -> 121795112 (-0.25%); split: -0.25%, +0.00%
VGPRs: 1025800 -> 1025480 (-0.03%)
Latency: 202876235 -> 203076272 (+0.10%); split: -0.04%, +0.14%
InvThroughput: 47599930 -> 47596113 (-0.01%); split: -0.03%, +0.02%
VClause: 475271 -> 475439 (+0.04%); split: -0.02%, +0.05%
SClause: 700679 -> 700629 (-0.01%); split: -0.01%, +0.01%
Copies: 1628498 -> 1618165 (-0.63%); split: -0.64%, +0.01%
Branches: 567199 -> 567216 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 952134 -> 952043 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 846614 -> 846272 (-0.04%)
VALU: 15572374 -> 15564050 (-0.05%); split: -0.05%, +0.00%
SALU: 2423329 -> 2421319 (-0.08%); split: -0.08%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Foz-DB Navi31:
Totals from 13049 (16.44% of 79395) affected shaders:
MaxWaves: 357242 -> 357268 (+0.01%)
Instrs: 19955572 -> 19944106 (-0.06%); split: -0.06%, +0.00%
CodeSize: 105689464 -> 105454348 (-0.22%); split: -0.23%, +0.00%
VGPRs: 765744 -> 764952 (-0.10%); split: -0.11%, +0.00%
Latency: 179063640 -> 179141591 (+0.04%); split: -0.02%, +0.07%
InvThroughput: 27978134 -> 27971318 (-0.02%); split: -0.03%, +0.01%
VClause: 386791 -> 386826 (+0.01%); split: -0.02%, +0.03%
SClause: 598113 -> 598106 (-0.00%); split: -0.01%, +0.01%
Copies: 1393111 -> 1383102 (-0.72%); split: -0.73%, +0.01%
Branches: 498533 -> 498535 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 573310 -> 573236 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 591459 -> 591043 (-0.07%)
VALU: 11623734 -> 11615755 (-0.07%); split: -0.07%, +0.00%
SALU: 1962055 -> 1960005 (-0.10%); split: -0.11%, +0.00%
VOPD: 3544 -> 3566 (+0.62%); split: +0.73%, -0.11%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35272>
2024-08-22 22:13:54 +02:00
|
|
|
writeout(8, fma(fabs(a), c30, c45));
|
2023-03-01 17:57:07 +00:00
|
|
|
|
|
|
|
|
//! v1: %res9 = v_fma_mix_f32 lo(0x44804200), |%a|, hi(0x44804200)
|
|
|
|
|
//! p_unit_test 9, %res9
|
|
|
|
|
writeout(9, fma(c30, fabs(a), c45));
|
|
|
|
|
|
|
|
|
|
//! v1: %res10 = v_fma_mix_f32 %a, lo(0x44804200), hi(0x44804200) clamp
|
|
|
|
|
//! p_unit_test 10, %res10
|
|
|
|
|
writeout(10, fsat(fma(a, c30, c45)));
|
|
|
|
|
|
|
|
|
|
/* Output modifiers are not supported by v_fma_mix_f32. */
|
|
|
|
|
c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
|
|
|
|
|
//; del c45
|
|
|
|
|
//! v1: %c45 = p_parallelcopy 0x40900000
|
|
|
|
|
//! v1: %res11 = v_fma_f32 %a, 0x40400000, %c45 *0.5
|
|
|
|
|
//! p_unit_test 11, %res11
|
|
|
|
|
c45 = bld.copy(bld.def(v1), Operand::c32(fui(4.5f)));
|
|
|
|
|
writeout(11, fmul(fma(a, c30, c45), bld.copy(bld.def(v1), Operand::c32(0x3f000000))));
|
|
|
|
|
|
|
|
|
|
/* Has a literal which can't be represented as fp16. */
|
|
|
|
|
//! v1: %c03 = p_parallelcopy 0x3e99999a
|
|
|
|
|
//! v1: %res12 = v_fmaak_f32 %a, %c03, 0x40400000
|
|
|
|
|
//! p_unit_test 12, %res12
|
|
|
|
|
Temp c03 = bld.copy(bld.def(v1), Operand::c32(fui(0.3f)));
|
|
|
|
|
writeout(12, fma(a, c03, c30));
|
|
|
|
|
|
|
|
|
|
/* We should still use fmaak/fmamk if the two literals are identical. */
|
|
|
|
|
//! v1: %res13 = v_fmaak_f32 0x40400000, %a, 0x40400000
|
|
|
|
|
//! p_unit_test 13, %res13
|
|
|
|
|
writeout(13, fma(a, c30, c30));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2023-03-21 14:24:28 +01:00
|
|
|
|
|
|
|
|
/* optimize.fma_opsel: on GFX9 through GFX11, checks that a p_extract_vector of
 * the high half (index 1) of a v1 input is folded into v_fma_f16 as a hi()
 * operand, both when the FMA is formed from a separate fmul+fadd (cases 0 and 1)
 * and when it is emitted directly with fma() (case 2).
 */
BEGIN_TEST(optimize.fma_opsel)
|
aco/optimizer: use new helpers to create fma
Foz-DB Navi48:
Totals from 25949 (31.48% of 82419) affected shaders:
Instrs: 30904250 -> 30904153 (-0.00%); split: -0.00%, +0.00%
CodeSize: 164623100 -> 164604652 (-0.01%); split: -0.01%, +0.00%
Latency: 209402611 -> 209402684 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 36622293 -> 36622236 (-0.00%); split: -0.00%, +0.00%
Copies: 2252080 -> 2251998 (-0.00%); split: -0.00%, +0.00%
VALU: 16831507 -> 16831382 (-0.00%); split: -0.00%, +0.00%
VOPD: 28252 -> 28295 (+0.15%)
Foz-DB Navi21:
Totals from 56269 (68.30% of 82387) affected shaders:
Instrs: 43751754 -> 43746463 (-0.01%); split: -0.01%, +0.00%
CodeSize: 233615096 -> 233576912 (-0.02%); split: -0.02%, +0.00%
VGPRs: 2445528 -> 2445520 (-0.00%)
Latency: 276776920 -> 276761183 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 66406450 -> 66402214 (-0.01%); split: -0.01%, +0.00%
VClause: 902951 -> 902947 (-0.00%)
Copies: 3926260 -> 3926289 (+0.00%); split: -0.01%, +0.01%
VALU: 26924056 -> 26918783 (-0.02%); split: -0.02%, +0.00%
SALU: 6938335 -> 6938321 (-0.00%); split: -0.00%, +0.00%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38150>
2025-03-27 15:57:10 +01:00
|
|
|
for (unsigned i = GFX9; i <= GFX11; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v2b: %a:v[0][0:16], v2b: %b:v[1][0:16], v1: %c:v[2], v1: %d:v[3], v1: %e:v[4] = p_startpgm
|
2023-03-21 14:24:28 +01:00
|
|
|
if (!setup_cs("v2b v2b v1 v1 v1", (amd_gfx_level)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
Temp c = inputs[2];
|
|
|
|
|
Temp d = inputs[3];
|
|
|
|
|
Temp e = inputs[4];
|
|
|
|
|
Temp c_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), c, Operand::c32(1));
|
|
|
|
|
Temp d_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), d, Operand::c32(1));
|
|
|
|
|
Temp e_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), e, Operand::c32(1));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res0 = v_fma_f16 %b, hi(%c), %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fadd(fmul(b, c_hi), a));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res1 = v_fma_f16 %a, %b, hi(%d)
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fadd(fmul(a, b), d_hi));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res2 = v_fma_f16 %a, %b, hi(%e)
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, fma(a, b, e_hi));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2023-03-26 20:26:08 +02:00
|
|
|
|
|
|
|
|
/* optimize.dpp_opsel: on GFX11, moves input a through v_mov_b32 with DPP16
 * (row_mirror) and with DPP8, extracts the high 16-bit half of each result and
 * adds it to a half of input b. Each case expects a single v_add_f16 that reads
 * hi(%a) directly and carries the DPP modifier; in case 1 the DPP operand is the
 * second fadd argument in the C++ but the first operand in the expected output.
 */
BEGIN_TEST(optimize.dpp_opsel)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1] = p_startpgm
|
2023-03-26 20:26:08 +02:00
|
|
|
if (!setup_cs("v1 v1", GFX11))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
|
|
|
|
|
Temp dpp16 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp dpp16_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), dpp16, Operand::c32(1));
|
|
|
|
|
Temp dpp8 = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), a);
|
|
|
|
|
Temp dpp8_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), dpp8, Operand::c32(1));
|
|
|
|
|
|
|
|
|
|
Temp b_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(1));
|
|
|
|
|
Temp b_lo = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(0));
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v2b: %res0 = v_add_f16 hi(%a), hi(%b) row_mirror bound_ctrl:1 fi
|
2023-03-26 20:26:08 +02:00
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fadd(dpp16_hi, b_hi));
|
|
|
|
|
|
2023-10-02 15:47:11 +01:00
|
|
|
//! v2b: %res1 = v_add_f16 hi(%a), %b dpp8:[0,0,0,0,0,0,0,0] fi
|
2023-03-26 20:26:08 +02:00
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fadd(b_lo, dpp8_hi));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2023-03-26 21:00:55 +02:00
|
|
|
|
|
|
|
|
/* optimize.apply_sgpr_swap_opsel: on GFX11, builds v_sub_f16 with opsel[0] set
 * on the v1 input a and, as second operand, the low half of a VGPR copy of the
 * s1 input b. The expected output is v_subrev_f16 with %b as the first operand
 * and hi(%a) as the second, i.e. the operands are swapped and the opsel bit
 * stays with a.
 */
BEGIN_TEST(optimize.apply_sgpr_swap_opsel)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], s1: %b:s[0] = p_startpgm
|
2023-03-26 21:00:55 +02:00
|
|
|
if (!setup_cs("v1 s1", GFX11))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
|
|
|
|
|
Temp b_vgpr = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), bld.copy(bld.def(v1), b),
|
|
|
|
|
Operand::c32(0));
|
|
|
|
|
|
|
|
|
|
Temp res0 = bld.tmp(v2b);
|
|
|
|
|
VALU_instruction& valu = bld.vop2(aco_opcode::v_sub_f16, Definition(res0), a, b_vgpr)->valu();
|
|
|
|
|
valu.opsel[0] = true;
|
|
|
|
|
|
|
|
|
|
//! v2b: %res0 = v_subrev_f16 %b, hi(%a)
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, res0);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2023-03-26 21:31:48 +02:00
|
|
|
|
2023-03-26 22:01:57 +02:00
|
|
|
/* optimize.max3_opsel: on GFX9 through GFX11, nests two v_max_f16 instructions
 * whose operands include the high halves of the v1 inputs a and b, and expects
 * them to be combined into one v_max3_f16 that keeps the hi() selection on %a
 * and %b.
 */
BEGIN_TEST(optimize.max3_opsel)
|
aco/optimizer: use new helpers for min3/max3/minmax/maxmin
Foz-DB Navi48:
Totals from 10453 (12.68% of 82419) affected shaders:
Instrs: 18676282 -> 18675798 (-0.00%); split: -0.00%, +0.00%
CodeSize: 100603268 -> 100603508 (+0.00%); split: -0.00%, +0.00%
Latency: 157036823 -> 157031708 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 28049331 -> 28048776 (-0.00%); split: -0.00%, +0.00%
Copies: 1452464 -> 1452503 (+0.00%); split: -0.00%, +0.00%
PreVGPRs: 458422 -> 458413 (-0.00%); split: -0.00%, +0.00%
VALU: 10429583 -> 10429353 (-0.00%); split: -0.00%, +0.00%
SALU: 2628403 -> 2628416 (+0.00%); split: -0.00%, +0.00%
VOPD: 21738 -> 21744 (+0.03%); split: +0.04%, -0.01%
Foz-DB Navi21:
Totals from 889 (1.08% of 82387) affected shaders:
MaxWaves: 15641 -> 15639 (-0.01%); split: +0.01%, -0.03%
Instrs: 2505527 -> 2505489 (-0.00%); split: -0.01%, +0.01%
CodeSize: 13975300 -> 13976516 (+0.01%); split: -0.00%, +0.01%
VGPRs: 65584 -> 65576 (-0.01%); split: -0.02%, +0.01%
Latency: 37135606 -> 37132577 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 10937032 -> 10935704 (-0.01%); split: -0.01%, +0.00%
VClause: 63136 -> 63140 (+0.01%); split: -0.01%, +0.01%
Copies: 256011 -> 256073 (+0.02%); split: -0.01%, +0.03%
PreSGPRs: 51804 -> 51809 (+0.01%)
PreVGPRs: 57905 -> 57890 (-0.03%); split: -0.03%, +0.00%
VALU: 1593523 -> 1593339 (-0.01%); split: -0.02%, +0.00%
SALU: 425116 -> 425134 (+0.00%); split: -0.00%, +0.01%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38150>
2024-12-12 22:11:21 +01:00
|
|
|
for (unsigned i = GFX9; i <= GFX11; i++) {
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v2b: %c:v[2][0:16] = p_startpgm
|
aco/optimizer: use new helpers for min3/max3/minmax/maxmin
Foz-DB Navi48:
Totals from 10453 (12.68% of 82419) affected shaders:
Instrs: 18676282 -> 18675798 (-0.00%); split: -0.00%, +0.00%
CodeSize: 100603268 -> 100603508 (+0.00%); split: -0.00%, +0.00%
Latency: 157036823 -> 157031708 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 28049331 -> 28048776 (-0.00%); split: -0.00%, +0.00%
Copies: 1452464 -> 1452503 (+0.00%); split: -0.00%, +0.00%
PreVGPRs: 458422 -> 458413 (-0.00%); split: -0.00%, +0.00%
VALU: 10429583 -> 10429353 (-0.00%); split: -0.00%, +0.00%
SALU: 2628403 -> 2628416 (+0.00%); split: -0.00%, +0.00%
VOPD: 21738 -> 21744 (+0.03%); split: +0.04%, -0.01%
Foz-DB Navi21:
Totals from 889 (1.08% of 82387) affected shaders:
MaxWaves: 15641 -> 15639 (-0.01%); split: +0.01%, -0.03%
Instrs: 2505527 -> 2505489 (-0.00%); split: -0.01%, +0.01%
CodeSize: 13975300 -> 13976516 (+0.01%); split: -0.00%, +0.01%
VGPRs: 65584 -> 65576 (-0.01%); split: -0.02%, +0.01%
Latency: 37135606 -> 37132577 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 10937032 -> 10935704 (-0.01%); split: -0.01%, +0.00%
VClause: 63136 -> 63140 (+0.01%); split: -0.01%, +0.01%
Copies: 256011 -> 256073 (+0.02%); split: -0.01%, +0.03%
PreSGPRs: 51804 -> 51809 (+0.01%)
PreVGPRs: 57905 -> 57890 (-0.03%); split: -0.03%, +0.00%
VALU: 1593523 -> 1593339 (-0.01%); split: -0.02%, +0.00%
SALU: 425116 -> 425134 (+0.00%); split: -0.00%, +0.01%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38150>
2024-12-12 22:11:21 +01:00
|
|
|
if (!setup_cs("v1 v1 v2b", (amd_gfx_level)i))
|
2023-03-26 22:01:57 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
Temp c = inputs[2];
|
|
|
|
|
|
|
|
|
|
Temp a_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), a, Operand::c32(1));
|
|
|
|
|
Temp b_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(1));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res0 = v_max3_f16 hi(%a), %c, hi(%b)
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_max_f16, bld.def(v2b),
|
|
|
|
|
bld.vop2(aco_opcode::v_max_f16, bld.def(v2b), a_hi, c), b_hi));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2023-03-26 22:38:24 +02:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.neg_mul_opsel)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v2b: %b:v[1][0:16] = p_startpgm
|
2023-03-26 22:38:24 +02:00
|
|
|
if (!setup_cs("v1 v2b", GFX11))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
|
|
|
|
|
Temp a_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), a, Operand::c32(1));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res0 = v_mul_f16 -hi(%a), %b
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, fneg(fmul(a_hi, b)));
|
|
|
|
|
|
2024-08-22 10:28:27 +02:00
|
|
|
//! v1: %res1 = v_fma_mix_f32 -hi(%a), lo(%b), neg(0)
|
2023-03-26 22:38:24 +02:00
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, fneg(fmul(f2f32(a_hi), f2f32(b))));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2023-09-14 13:01:09 +02:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.vinterp_inreg_output_modifiers)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2] = p_startpgm
|
2023-09-14 13:01:09 +02:00
|
|
|
if (!setup_cs("v1 v1 v1", GFX11))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_interp_p2_f32_inreg %a, %b, %c clamp
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[0],
|
|
|
|
|
inputs[1], inputs[2]);
|
|
|
|
|
writeout(0, fsat(tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_fma_f32 %b, %a, %c *2 quad_perm:[2,2,2,2] fi
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[1], inputs[0],
|
|
|
|
|
inputs[2]);
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
|
|
|
|
|
writeout(1, tmp);
|
|
|
|
|
|
|
|
|
|
//! v2b: %res2 = v_interp_p2_f16_f32_inreg %a, %b, %c clamp
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0],
|
|
|
|
|
inputs[1], inputs[2]);
|
|
|
|
|
writeout(2, fsat(tmp));
|
|
|
|
|
|
|
|
|
|
//! v2b: %tmp3 = v_interp_p2_f16_f32_inreg %b, %a, %c
|
|
|
|
|
//! v2b: %res3 = v_mul_f16 2.0, %tmp3
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[1],
|
|
|
|
|
inputs[0], inputs[2]);
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp);
|
|
|
|
|
writeout(3, tmp);
|
|
|
|
|
|
2023-09-14 13:25:07 +02:00
|
|
|
//! v2b: %res4 = v_fma_mixlo_f16 %c, %b, %a quad_perm:[2,2,2,2] fi
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[2], inputs[1],
|
|
|
|
|
inputs[0]);
|
|
|
|
|
writeout(4, f2f16(tmp));
|
|
|
|
|
|
2023-09-14 13:01:09 +02:00
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2024-06-06 15:25:13 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.s_pack)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> s1: %a:s[0], s1: %b:s[1], s1: %c:s[2] = p_startpgm
|
2024-06-06 15:25:13 +01:00
|
|
|
if (!setup_cs("s1 s1 s1", GFX11))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Temp lo = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[1],
|
|
|
|
|
Operand::c32(0), Operand::c32(16u), Operand::c32(false));
|
|
|
|
|
Temp hi = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[2],
|
|
|
|
|
Operand::c32(1), Operand::c32(16u), Operand::c32(false));
|
|
|
|
|
|
|
|
|
|
//! s1: %res0 = s_pack_lh_b32_b16 %b, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), lo, hi));
|
|
|
|
|
|
|
|
|
|
//! s1: %res1 = s_pack_ll_b32_b16 %b, %b
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), lo, lo));
|
|
|
|
|
|
|
|
|
|
//! s1: %res2 = s_pack_hl_b32_b16 %c, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), hi, lo));
|
|
|
|
|
|
|
|
|
|
//! s1: %res3 = s_pack_hh_b32_b16 %c, %c
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), hi, hi));
|
|
|
|
|
|
|
|
|
|
lo = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[1], Operand::c32(0),
|
|
|
|
|
Operand::c32(16u), Operand::c32(false));
|
|
|
|
|
hi = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[2], Operand::c32(1),
|
|
|
|
|
Operand::c32(16u), Operand::c32(false));
|
|
|
|
|
|
|
|
|
|
//! s1: %res4 = s_pack_ll_b32_b16 %a, %b
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), inputs[0], lo));
|
|
|
|
|
|
|
|
|
|
//! s1: %res5 = s_pack_lh_b32_b16 %a, %c
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), inputs[0], hi));
|
|
|
|
|
|
|
|
|
|
//! s1: %res6 = s_pack_ll_b32_b16 %b, %a
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), lo, inputs[0]));
|
|
|
|
|
|
|
|
|
|
//! s1: %res7 = s_pack_hl_b32_b16 %c, %a
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), hi, inputs[0]));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2024-08-19 19:55:37 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.trans_inline_constant)
|
|
|
|
|
if (!setup_cs("", GFX12))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//>> s1: %res0 = v_s_rcp_f32 1.0
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, bld.vop3(aco_opcode::v_s_rcp_f32, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(0x3f800000))));
|
|
|
|
|
|
|
|
|
|
//! s1: %res1 = v_s_rcp_f32 0x3c00
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, bld.vop3(aco_opcode::v_s_rcp_f32, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(0x3c00))));
|
|
|
|
|
|
|
|
|
|
//! s1: %tmp2 = p_parallelcopy 1.0
|
|
|
|
|
//! s1: %res2 = v_s_rcp_f16 %tmp2
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, bld.vop3(aco_opcode::v_s_rcp_f16, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(0x3f800000))));
|
|
|
|
|
|
|
|
|
|
//! s1: %tmp3 = p_parallelcopy 0x3c00
|
|
|
|
|
//! s1: %res3 = v_s_rcp_f16 %tmp3
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, bld.vop3(aco_opcode::v_s_rcp_f16, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(0x3c00))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_rcp_f32 1.0
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(0x3f800000))));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5 = v_rcp_f32 0x3c00
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(0x3c00))));
|
|
|
|
|
|
aco/optimizer: use new helpers to apply literals
Foz-DB Navi21:
Totals from 21009 (26.33% of 79789) affected shaders:
MaxWaves: 495342 -> 495414 (+0.01%)
Instrs: 22345587 -> 22335371 (-0.05%); split: -0.05%, +0.00%
CodeSize: 122095820 -> 121795112 (-0.25%); split: -0.25%, +0.00%
VGPRs: 1025800 -> 1025480 (-0.03%)
Latency: 202876235 -> 203076272 (+0.10%); split: -0.04%, +0.14%
InvThroughput: 47599930 -> 47596113 (-0.01%); split: -0.03%, +0.02%
VClause: 475271 -> 475439 (+0.04%); split: -0.02%, +0.05%
SClause: 700679 -> 700629 (-0.01%); split: -0.01%, +0.01%
Copies: 1628498 -> 1618165 (-0.63%); split: -0.64%, +0.01%
Branches: 567199 -> 567216 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 952134 -> 952043 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 846614 -> 846272 (-0.04%)
VALU: 15572374 -> 15564050 (-0.05%); split: -0.05%, +0.00%
SALU: 2423329 -> 2421319 (-0.08%); split: -0.08%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Foz-DB Navi31:
Totals from 13049 (16.44% of 79395) affected shaders:
MaxWaves: 357242 -> 357268 (+0.01%)
Instrs: 19955572 -> 19944106 (-0.06%); split: -0.06%, +0.00%
CodeSize: 105689464 -> 105454348 (-0.22%); split: -0.23%, +0.00%
VGPRs: 765744 -> 764952 (-0.10%); split: -0.11%, +0.00%
Latency: 179063640 -> 179141591 (+0.04%); split: -0.02%, +0.07%
InvThroughput: 27978134 -> 27971318 (-0.02%); split: -0.03%, +0.01%
VClause: 386791 -> 386826 (+0.01%); split: -0.02%, +0.03%
SClause: 598113 -> 598106 (-0.00%); split: -0.01%, +0.01%
Copies: 1393111 -> 1383102 (-0.72%); split: -0.73%, +0.01%
Branches: 498533 -> 498535 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 573310 -> 573236 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 591459 -> 591043 (-0.07%)
VALU: 11623734 -> 11615755 (-0.07%); split: -0.07%, +0.00%
SALU: 1962055 -> 1960005 (-0.10%); split: -0.11%, +0.00%
VOPD: 3544 -> 3566 (+0.62%); split: +0.73%, -0.11%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35272>
2024-08-22 22:13:54 +02:00
|
|
|
//! v2b: %res6 = v_rcp_f16 0
|
2024-08-19 19:55:37 +01:00
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.vop1(aco_opcode::v_rcp_f16, bld.def(v2b), bld.copy(bld.def(s1), Operand::c32(0x3f800000))));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res7 = v_rcp_f16 1.0
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, bld.vop1(aco_opcode::v_rcp_f16, bld.def(v2b), bld.copy(bld.def(s1), Operand::c32(0x3c00))));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2025-03-12 14:15:17 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.trans_no_omod)
|
2025-07-28 12:25:11 +02:00
|
|
|
//>> s1: %a:s[0] = p_startpgm
|
2025-03-12 14:15:17 +01:00
|
|
|
if (!setup_cs("s1", GFX12))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! s1: %tmp0 = v_s_log_f32 %a
|
|
|
|
|
//! v1: %res = v_mul_legacy_f32 %tmp0, 0.5
|
|
|
|
|
//! p_unit_test 0, %res
|
|
|
|
|
Temp dst = bld.vop3(aco_opcode::v_s_log_f32, bld.def(s1), inputs[0]);
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), dst,
|
|
|
|
|
bld.copy(bld.def(v1), Operand::c32(0x3f000000))));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2025-08-24 08:45:28 +02:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.pk_clamp_fma_mix)
|
|
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v2b: %c:v[2][0:16] = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 v1 v2b", GFX12))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! v2b: %cvt0 = v_fma_mixlo_f16 1.0, %a, %b
|
|
|
|
|
//! v1: %clamp0 = v_pk_mul_f16 %cvt0.xx, 1.0.xx clamp
|
|
|
|
|
//! p_unit_test 0, %clamp0
|
|
|
|
|
Temp add0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
Temp cvt0 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), add0);
|
|
|
|
|
Builder::Result clamp0 =
|
|
|
|
|
bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), cvt0, Operand::c16(0x3c00), 0, 0);
|
|
|
|
|
clamp0->valu().clamp = true;
|
|
|
|
|
|
|
|
|
|
writeout(0, clamp0);
|
|
|
|
|
|
|
|
|
|
//! v1: %add1 = v_fma_mix_f32 1.0, lo(%c), %a
|
|
|
|
|
//! v1: %clamp1 = v_pk_mul_f16 %add1, 1.0.xx clamp
|
|
|
|
|
//! p_unit_test 1, %clamp1
|
|
|
|
|
Temp cvt1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), inputs[2]);
|
|
|
|
|
Temp add1 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], cvt1);
|
|
|
|
|
Builder::Result clamp1 =
|
|
|
|
|
bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), add1, Operand::c16(0x3c00), 0, 0x1);
|
|
|
|
|
clamp1->valu().clamp = true;
|
|
|
|
|
|
|
|
|
|
writeout(1, clamp1);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2025-10-22 21:02:42 +02:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.fp64_input_modifiers)
|
|
|
|
|
//>> v2: %a:v[0-1], v2: %b:v[2-3], s2: %c:s[0-1] = p_startpgm
|
|
|
|
|
if (!setup_cs("v2 v2 s2", GFX12))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! v2: %add = v_fma_f64 -%a, |%b|, -|%c|
|
|
|
|
|
//! p_unit_test 0, %add
|
|
|
|
|
Temp neg_a =
|
|
|
|
|
bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(-1.0)), inputs[0]);
|
|
|
|
|
Builder::Result abs_b =
|
|
|
|
|
bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(1.0)), inputs[1]);
|
|
|
|
|
abs_b->valu().abs[1] = true;
|
|
|
|
|
Builder::Result abs_c = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(1.0)),
|
|
|
|
|
bld.copy(bld.def(v2), inputs[2]));
|
|
|
|
|
abs_c->valu().abs[1] = true;
|
|
|
|
|
Temp neg_abs_c =
|
|
|
|
|
bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(-1.0)), abs_c);
|
|
|
|
|
Temp mul = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), neg_a, abs_b);
|
|
|
|
|
Temp add = bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), mul, neg_abs_c);
|
|
|
|
|
writeout(0, add);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.fp64_input_modifiers_not_applied)
|
|
|
|
|
//>> v2: %a:v[0-1], v2: %b:v[2-3], s2: %c:s[0-1] = p_startpgm
|
|
|
|
|
if (!setup_cs("v2 v2 s2", GFX12))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! v1: %lo_a, v1: %hi_a = p_split_vector %a
|
|
|
|
|
//! v1: %neg_hi_a = v_xor_b32 0x80000000, %hi_a
|
|
|
|
|
//! v2: %neg_a = p_create_vector %lo_a, %neg_hi_a
|
|
|
|
|
//! p_unit_test 0, %neg_a
|
|
|
|
|
Temp neg_a =
|
|
|
|
|
bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(-1.0)), inputs[0]);
|
|
|
|
|
writeout(0, neg_a);
|
|
|
|
|
|
|
|
|
|
//! v1: %lo_b, v1: %hi_b = p_split_vector %b
|
|
|
|
|
//! v1: %abs_hi_b = v_and_b32 0x7fffffff, %hi_b
|
|
|
|
|
//! v2: %abs_b = p_create_vector %lo_b, %abs_hi_b
|
|
|
|
|
//! p_unit_test 1, %abs_b
|
|
|
|
|
Builder::Result abs_b =
|
|
|
|
|
bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(1.0)), inputs[1]);
|
|
|
|
|
abs_b->valu().abs[1] = true;
|
|
|
|
|
writeout(1, abs_b);
|
|
|
|
|
|
|
|
|
|
//! s1: %lo_c, s1: %hi_c = p_split_vector %c
|
|
|
|
|
//! s1: %neg_abs_hi_c, s1: %_:scc = s_or_b32 0x80000000, %hi_c
|
|
|
|
|
//! v2: %neg_abs_c = p_create_vector %lo_c, %neg_abs_hi_c
|
|
|
|
|
//! p_unit_test 2, %neg_abs_c
|
|
|
|
|
Builder::Result abs_c = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(1.0)),
|
|
|
|
|
bld.copy(bld.def(v2), inputs[2]));
|
|
|
|
|
abs_c->valu().abs[1] = true;
|
|
|
|
|
Temp neg_abs_c =
|
|
|
|
|
bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(-1.0)), abs_c);
|
|
|
|
|
writeout(2, neg_abs_c);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.fp64_omod)
|
|
|
|
|
//>> v2: %a:v[0-1] = p_startpgm
|
|
|
|
|
if (!setup_cs("v2", GFX12))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
|
|
|
|
//! v2: %res0 = v_rcp_f64 %a *2
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp res0 = bld.vop1(aco_opcode::v_rcp_f64, bld.def(v2), inputs[0]);
|
|
|
|
|
res0 = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(2.0)), res0);
|
|
|
|
|
writeout(0, res0);
|
|
|
|
|
|
|
|
|
|
//! v2: %res1 = v_sqrt_f64 %a *0.5
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
Temp res1 = bld.vop1(aco_opcode::v_sqrt_f64, bld.def(v2), inputs[0]);
|
|
|
|
|
res1 = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(0.5)), res1);
|
|
|
|
|
writeout(1, res1);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.fp64_clamp)
|
|
|
|
|
//>> v2: %a:v[0-1] = p_startpgm
|
|
|
|
|
if (!setup_cs("v2", GFX12))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! v2: %clamp = v_rcp_f64 %a clamp
|
|
|
|
|
//! p_unit_test 0, %clamp
|
|
|
|
|
Temp tmp = bld.vop1(aco_opcode::v_rcp_f64, bld.def(v2), inputs[0]);
|
|
|
|
|
Instruction* clamp =
|
|
|
|
|
bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(dui(1.0)), tmp);
|
|
|
|
|
clamp->valu().clamp = true;
|
|
|
|
|
writeout(0, clamp->definitions[0].getTemp());
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
aco/optimizer: use new helpers to optimize mul(b2f(a), b)
Foz-DB Navi48:
Totals from 979 (1.19% of 82419) affected shaders:
Instrs: 3630560 -> 3629463 (-0.03%); split: -0.03%, +0.00%
CodeSize: 19154176 -> 19147124 (-0.04%); split: -0.04%, +0.00%
Latency: 17700546 -> 17699505 (-0.01%); split: -0.01%, +0.01%
InvThroughput: 3143808 -> 3143254 (-0.02%); split: -0.02%, +0.01%
SClause: 76410 -> 76405 (-0.01%); split: -0.01%, +0.00%
Copies: 256544 -> 256554 (+0.00%); split: -0.02%, +0.02%
PreVGPRs: 40868 -> 40835 (-0.08%)
VALU: 2003291 -> 2002466 (-0.04%); split: -0.04%, +0.00%
SALU: 514000 -> 514006 (+0.00%)
VOPD: 3254 -> 3256 (+0.06%); split: +0.12%, -0.06%
Foz-DB Navi21:
Totals from 926 (1.12% of 82387) affected shaders:
MaxWaves: 21538 -> 21542 (+0.02%)
Instrs: 2984216 -> 2983187 (-0.03%); split: -0.04%, +0.00%
CodeSize: 16104112 -> 16097272 (-0.04%); split: -0.05%, +0.00%
VGPRs: 46864 -> 46848 (-0.03%)
Latency: 15678064 -> 15677099 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 3779550 -> 3778230 (-0.03%); split: -0.04%, +0.01%
VClause: 81590 -> 81598 (+0.01%)
SClause: 70753 -> 70751 (-0.00%); split: -0.01%, +0.00%
Copies: 240446 -> 240466 (+0.01%); split: -0.01%, +0.02%
PreSGPRs: 51121 -> 51062 (-0.12%)
PreVGPRs: 38538 -> 38505 (-0.09%)
VALU: 1978847 -> 1977777 (-0.05%); split: -0.06%, +0.00%
SALU: 439184 -> 439212 (+0.01%)
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38150>
2024-12-14 16:51:55 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.mul_b2f)
|
|
|
|
|
//>> v1: %a:v[0], s2: %b:s[0-1] = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 s2", GFX11))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//>> v1: %res0 = v_cndmask_b32 0, %a, %b
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp cond = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0),
|
|
|
|
|
Operand::c32(0x3F800000), inputs[1]);
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], cond));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2024-12-17 17:14:08 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.pk_fma)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX11; i++) {
|
|
|
|
|
if (i == GFX10_3)
|
|
|
|
|
continue;
|
|
|
|
|
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], s1: %d:s[0], s1: %e:s[1] = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 v1 v1 s1 s1", (amd_gfx_level)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp a = inputs[0];
|
|
|
|
|
Temp b = inputs[1];
|
|
|
|
|
Temp c = inputs[2];
|
|
|
|
|
Temp d = inputs[3];
|
|
|
|
|
Temp e = inputs[4];
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_pk_fma_f16 %a, %b, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Builder::Result mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, b, 0x0, 0x3);
|
|
|
|
|
Builder::Result add = bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x0, 0x3);
|
|
|
|
|
writeout(0, add);
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_pk_fma_f16 %a.yx, %b, %c.xx
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, b, 0x1, 0x2);
|
|
|
|
|
add = bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x0, 0x1);
|
|
|
|
|
writeout(1, add);
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_pk_fma_f16 %a.xx*[-1,1], %b.yy, %3.xx
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, b, 0x1, 0x2);
|
|
|
|
|
add = bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x1, 0x1);
|
|
|
|
|
add->valu().neg_lo[0] = true;
|
|
|
|
|
writeout(2, add);
|
|
|
|
|
|
|
|
|
|
//! v1: (precise)%mul3 = v_pk_mul_f16 %a, %b
|
|
|
|
|
//! v1: %res3 = v_pk_add_f16 %mul3, %c
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
mul = bld.precise().vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, b, 0x0, 0x3);
|
|
|
|
|
add = bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x0, 0x3);
|
|
|
|
|
writeout(3, add);
|
|
|
|
|
|
|
|
|
|
//! v1: %mul4 = v_pk_mul_f16 %a, %b
|
|
|
|
|
//! v1: (precise)%res4 = v_pk_add_f16 %mul4, %c
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, b, 0x0, 0x3);
|
|
|
|
|
add = bld.precise().vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x0, 0x3);
|
|
|
|
|
writeout(4, add);
|
|
|
|
|
|
|
|
|
|
//~gfx9! s1: %const5 = p_parallelcopy 0x4800
|
|
|
|
|
//~gfx9! v1: (precise)%res5 = v_pk_fma_f16 %a, %const5.xx, %c
|
|
|
|
|
//~gfx(10|11)! v1: (precise)%res5 = v_pk_fma_f16 %a, 0x4800.xx, %c
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
Temp constant = bld.copy(bld.def(s1), Operand::c32(0x4800));
|
|
|
|
|
mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, constant, 0x0, 0x1);
|
|
|
|
|
add = bld.precise().vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x0, 0x3);
|
|
|
|
|
writeout(5, add);
|
|
|
|
|
|
|
|
|
|
//~gfx9! s1: %const6 = p_parallelcopy 0x48404800
|
|
|
|
|
//~gfx9! v1: %mul6 = v_pk_mul_f16 %a, %const6
|
|
|
|
|
//~gfx(10|11)! v1: %mul6 = v_pk_mul_f16 %a, 0x48404800
|
|
|
|
|
//! v1: (precise)%res6 = v_pk_add_f16 %mul6, %c
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
constant = bld.copy(bld.def(s1), Operand::c32(0x48404800));
|
|
|
|
|
mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, constant, 0x0, 0x3);
|
|
|
|
|
add = bld.precise().vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x0, 0x3);
|
|
|
|
|
writeout(6, add);
|
|
|
|
|
|
|
|
|
|
//~gfx9! s1: %const7 = p_parallelcopy 0x4840
|
|
|
|
|
//~gfx9! s1: %const72 = p_parallelcopy 0x5000
|
|
|
|
|
//~gfx9! v1: %mul7 = v_pk_mul_f16 %a, %const7.xx
|
|
|
|
|
//~gfx9! v1: %res7 = v_pk_add_f16 %mul7, %const72.xx
|
|
|
|
|
//~gfx(10|11)! v1: %res7 = v_pk_fma_f16 %a, 0x50004840.xx, 0x50004840.yy
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
constant = bld.copy(bld.def(s1), Operand::c32(0x4840));
|
|
|
|
|
Temp constant2 = bld.copy(bld.def(s1), Operand::c32(0x5000));
|
|
|
|
|
mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, constant, 0x0, 0x1);
|
|
|
|
|
add = bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, constant2, 0x0, 0x1);
|
|
|
|
|
writeout(7, add);
|
|
|
|
|
|
|
|
|
|
//! v1: %res8 = v_pk_fma_f16 %a.yy, %b.xx, %c
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
Temp extract = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), a, Operand::c32(1));
|
|
|
|
|
mul = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), extract, b);
|
|
|
|
|
add = bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x0, 0x2);
|
|
|
|
|
writeout(8, add);
|
|
|
|
|
|
|
|
|
|
//! v1: %res9 = v_pk_fma_f16 %d.yy, -%b.xx, %c
|
|
|
|
|
//! p_unit_test 9, %res9
|
|
|
|
|
extract = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), d, Operand::c32(1),
|
|
|
|
|
Operand::c32(16u), Operand::c32(false));
|
|
|
|
|
mul = bld.vop2_e64(aco_opcode::v_mul_f16, bld.def(v2b), extract, b);
|
|
|
|
|
add = bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x0, 0x2);
|
|
|
|
|
mul->valu().neg[1] = true;
|
|
|
|
|
writeout(9, add);
|
|
|
|
|
|
|
|
|
|
//! v2b: %mul10 = v_mul_f16 %e, |%b|
|
|
|
|
|
//! v1: %res10 = v_pk_add_f16 %mul10.xx, %c
|
|
|
|
|
//! p_unit_test 10, %res10
|
|
|
|
|
mul = bld.vop2_e64(aco_opcode::v_mul_f16, bld.def(v2b), e, b);
|
|
|
|
|
add = bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), mul, c, 0x0, 0x2);
|
|
|
|
|
mul->valu().abs[1] = true;
|
|
|
|
|
writeout(10, add);
|
|
|
|
|
|
|
|
|
|
//! v2b: %res11 = v_fma_f16 %a, %b, %e
|
|
|
|
|
//! p_unit_test 11, %res11
|
|
|
|
|
mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, b, 0x0, 0x3);
|
|
|
|
|
add = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), e, mul);
|
|
|
|
|
writeout(11, add);
|
|
|
|
|
|
|
|
|
|
//! v2b: %res12 = v_fma_f16 hi(%a), -%b, %e
|
|
|
|
|
//! p_unit_test 12, %res12
|
|
|
|
|
mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), a, b, 0x0, 0x1);
|
|
|
|
|
extract = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), mul, Operand::c32(1));
|
|
|
|
|
add = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), e, extract);
|
|
|
|
|
mul->valu().neg_hi[1] = true;
|
|
|
|
|
writeout(12, add);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|