2020-01-22 19:59:56 +00:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2020 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
#include "helpers.h"
|
|
|
|
|
|
|
|
|
|
using namespace aco;
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.neg)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
|
2020-01-22 19:59:56 +00:00
|
|
|
if (!setup_cs("v1 v1 s1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_mul_f32 %a, -%b
|
|
|
|
|
//! p_unit_test 0, %res0
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_b = fneg(inputs[1]);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
//~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
|
|
|
|
|
//~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
|
2020-01-22 19:59:56 +00:00
|
|
|
//~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
|
|
|
|
|
//! p_unit_test 1, %res1
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_a = fneg(inputs[0]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));
|
2020-01-22 19:59:56 +00:00
|
|
|
|
|
|
|
|
//! v1: %res2 = v_mul_f32 %a, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_neg_a = fneg(neg_a);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
//! v1: %res3 = v_mul_f32 |%a|, %b
|
2020-01-22 19:59:56 +00:00
|
|
|
//! p_unit_test 3, %res3
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp abs_neg_a = fabs(neg_a);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_mul_f32 -|%a|, %b
|
|
|
|
|
//! p_unit_test 4, %res4
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp abs_a = fabs(inputs[0]);
|
|
|
|
|
Temp neg_abs_a = fneg(abs_a);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res6 = v_subrev_f32 %a, %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res7 = v_sub_f32 %b, %a
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
|
|
|
|
|
|
|
|
|
|
//! v1: %res8 = v_mul_f32 %a, -%c
|
|
|
|
|
//! p_unit_test 8, %res8
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
// //! v1: %res9 = v_mul_f32 |%neg_a|, %b
|
|
|
|
|
// //! p_unit_test 9, %res9
|
|
|
|
|
Temp abs_neg_abs_a = fabs(neg_abs_a);
|
|
|
|
|
writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));
|
|
|
|
|
|
2020-01-22 19:59:56 +00:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
|
2020-11-13 15:12:35 +00:00
|
|
|
BEGIN_TEST(optimize.output_modifiers)
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b = p_startpgm
|
2020-11-13 15:12:35 +00:00
|
|
|
if (!setup_cs("v1 v1", GFX9))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
|
|
|
|
/* 32-bit modifiers */
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_add_f32 %a, %b *0.5
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res1 = v_add_f32 %a, %b *2
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res2 = v_add_f32 %a, %b *4
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res3 = v_add_f32 %a, %b clamp
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
|
|
|
|
|
Operand::c32(0x3f800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res4 = v_add_f32 %a, %b *2 clamp
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
|
|
|
|
|
writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
|
|
|
|
|
Operand::c32(0x3f800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* 16-bit modifiers */
|
|
|
|
|
|
|
|
|
|
//! v2b: %res5 = v_add_f16 %a, %b *0.5
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res6 = v_add_f16 %a, %b *2
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res7 = v_add_f16 %a, %b *4
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res8 = v_add_f16 %a, %b clamp
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
|
|
|
|
|
Operand::c16(0x3c00u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res9 = v_add_f16 %a, %b *2 clamp
|
|
|
|
|
//! p_unit_test 9, %res9
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp);
|
|
|
|
|
writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
|
|
|
|
|
Operand::c16(0x3c00u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* clamping is done after omod */
|
|
|
|
|
|
|
|
|
|
//! v1: %res10_tmp = v_add_f32 %a, %b clamp
|
|
|
|
|
//! v1: %res10 = v_mul_f32 2.0, %res10_tmp
|
|
|
|
|
//! p_unit_test 10, %res10
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
|
|
|
|
|
tmp);
|
|
|
|
|
writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* unsupported instructions */
|
|
|
|
|
|
|
|
|
|
//! v1: %res11_tmp = v_xor_b32 %a, %b
|
|
|
|
|
//! v1: %res11 = v_mul_f32 2.0, %res11_tmp
|
|
|
|
|
//! p_unit_test 11, %res11
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* several users */
|
|
|
|
|
|
|
|
|
|
//! v1: %res12_tmp = v_add_f32 %a, %b
|
|
|
|
|
//! p_unit_test %res12_tmp
|
|
|
|
|
//! v1: %res12 = v_mul_f32 2.0, %res12_tmp
|
|
|
|
|
//! p_unit_test 12, %res12
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
bld.pseudo(aco_opcode::p_unit_test, tmp);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res13 = v_add_f32 %a, %b
|
|
|
|
|
//! p_unit_test 13, %res13
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
|
2020-11-13 15:12:35 +00:00
|
|
|
writeout(13, tmp);
|
|
|
|
|
|
|
|
|
|
/* omod has no effect if denormals are enabled but clamp is fine */
|
|
|
|
|
|
|
|
|
|
//>> BB1
|
|
|
|
|
//! /* logical preds: / linear preds: / kind: uniform, */
|
|
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
|
|
|
|
|
|
|
|
|
//! v1: %res14_tmp = v_add_f32 %a, %b
|
|
|
|
|
//! v1: %res14 = v_mul_f32 2.0, %res13_tmp
|
|
|
|
|
//! p_unit_test 14, %res14
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v1: %res15 = v_add_f32 %a, %b clamp
|
|
|
|
|
//! p_unit_test 15, %res15
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
|
|
|
|
|
Operand::c32(0x3f800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//>> BB2
|
|
|
|
|
//! /* logical preds: / linear preds: / kind: uniform, */
|
|
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_flush;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
|
|
|
|
|
|
|
|
|
//! v2b: %res16_tmp = v_add_f16 %a, %b
|
|
|
|
|
//! v2b: %res16 = v_mul_f16 2.0, %res15_tmp
|
|
|
|
|
//! p_unit_test 16, %res16
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//! v2b: %res17 = v_add_f16 %a, %b clamp
|
|
|
|
|
//! p_unit_test 17, %res17
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
|
|
|
|
|
Operand::c16(0x3c00u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
/* omod flushes -0.0 to +0.0 */
|
|
|
|
|
|
|
|
|
|
//>> BB3
|
|
|
|
|
//! /* logical preds: / linear preds: / kind: uniform, */
|
|
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan32 = true;
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
|
|
|
|
|
|
|
|
|
//! v1: %res18_tmp = v_add_f32 %a, %b
|
|
|
|
|
//! v1: %res18 = v_mul_f32 2.0, %res18_tmp
|
|
|
|
|
//! p_unit_test 18, %res18
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
//! v1: %res19 = v_add_f32 %a, %b clamp
|
|
|
|
|
//! p_unit_test 19, %res19
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
|
|
|
|
|
Operand::c32(0x3f800000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
//>> BB4
|
|
|
|
|
//! /* logical preds: / linear preds: / kind: uniform, */
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
|
|
|
|
//! v2b: %res20_tmp = v_add_f16 %a, %b
|
|
|
|
|
//! v2b: %res20 = v_mul_f16 2.0, %res20_tmp
|
|
|
|
|
//! p_unit_test 20, %res20
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
//! v2b: %res21 = v_add_f16 %a, %b clamp
|
|
|
|
|
//! p_unit_test 21, %res21
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
|
|
|
|
|
Operand::c16(0x3c00u), tmp));
|
2020-11-13 15:12:35 +00:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
/* Test helper: emits a v_subbrev_co_u32 in VOP2 e64 form through the shared
 * builder `bld`, using op0, op1 and op2 as its three operands.
 *
 * The instruction gets two definitions: a v1 result and a lane-mask sized
 * (bld.lm) definition passed through bld.hint_vcc(). The builder result is
 * converted to a Temp and returned (presumably the first, v1, definition).
 */
Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)
{
   Temp result = bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1),
                              bld.hint_vcc(bld.def(bld.lm)), op0, op1, op2);
   return result;
}
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.cndmask)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, s1: %b, s2: %c = p_startpgm
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
if (!setup_cs("v1 s1 s2", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp subbrev;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_cndmask_b32 0, %a, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
2021-07-13 11:22:46 +02:00
|
|
|
subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_cndmask_b32 0, 42, %c
|
|
|
|
|
//! p_unit_test 1, %res1
|
2021-07-13 11:22:46 +02:00
|
|
|
subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
|
|
|
|
|
writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
|
|
|
|
|
//~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
|
|
|
|
|
//~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
|
|
|
|
|
//~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
|
|
|
|
|
//! p_unit_test 2, %res2
|
2021-07-13 11:22:46 +02:00
|
|
|
subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));
|
|
|
|
|
|
|
|
|
|
//! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
|
|
|
|
|
//! v1: %xor = v_xor_b32 %a, %subbrev1
|
|
|
|
|
//! v1: %res3 = v_cndmask_b32 0, %xor, %c
|
|
|
|
|
//! p_unit_test 3, %res3
|
2021-07-13 11:22:46 +02:00
|
|
|
subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
|
|
|
|
|
writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));
|
|
|
|
|
|
aco: fix combining add/sub to b2i if a new dest needs to be allocated
The uses vector needs to be expanded to avoid out of bounds access
and to make sure the number of uses is initialized to 0.
This fixes combining more v_and(a, v_subbrev_co_u32).
fossilds-db (Vega10):
Totals from 4574 (3.28% of 139517) affected shaders:
SGPRs: 291625 -> 292217 (+0.20%); split: -0.01%, +0.21%
VGPRs: 276368 -> 276188 (-0.07%); split: -0.07%, +0.01%
SpillSGPRs: 455 -> 533 (+17.14%)
SpillVGPRs: 76 -> 78 (+2.63%)
CodeSize: 23327500 -> 23304152 (-0.10%); split: -0.17%, +0.07%
MaxWaves: 22044 -> 22066 (+0.10%)
Instrs: 4583064 -> 4576301 (-0.15%); split: -0.15%, +0.01%
Cycles: 47925276 -> 47871968 (-0.11%); split: -0.13%, +0.01%
VMEM: 1599363 -> 1597473 (-0.12%); split: +0.08%, -0.19%
SMEM: 331461 -> 331126 (-0.10%); split: +0.08%, -0.18%
VClause: 80639 -> 80696 (+0.07%); split: -0.02%, +0.09%
SClause: 155992 -> 155993 (+0.00%); split: -0.02%, +0.02%
Copies: 333482 -> 333318 (-0.05%); split: -0.12%, +0.07%
Branches: 70967 -> 70968 (+0.00%)
PreSGPRs: 187078 -> 187711 (+0.34%); split: -0.01%, +0.35%
PreVGPRs: 244918 -> 244785 (-0.05%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7513>
2020-11-09 19:42:22 +01:00
|
|
|
//! v1: %res4 = v_cndmask_b32 0, %a, %c
|
|
|
|
|
//! p_unit_test 4, %res4
|
2021-07-13 11:22:46 +02:00
|
|
|
Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
|
|
|
|
|
Operand::c32(1u), Operand(inputs[2]));
|
|
|
|
|
Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);
|
aco: fix combining add/sub to b2i if a new dest needs to be allocated
The uses vector needs to be expanded to avoid out of bounds access
and to make sure the number of uses is initialized to 0.
This fixes combining more v_and(a, v_subbrev_co_u32).
fossilds-db (Vega10):
Totals from 4574 (3.28% of 139517) affected shaders:
SGPRs: 291625 -> 292217 (+0.20%); split: -0.01%, +0.21%
VGPRs: 276368 -> 276188 (-0.07%); split: -0.07%, +0.01%
SpillSGPRs: 455 -> 533 (+17.14%)
SpillVGPRs: 76 -> 78 (+2.63%)
CodeSize: 23327500 -> 23304152 (-0.10%); split: -0.17%, +0.07%
MaxWaves: 22044 -> 22066 (+0.10%)
Instrs: 4583064 -> 4576301 (-0.15%); split: -0.15%, +0.01%
Cycles: 47925276 -> 47871968 (-0.11%); split: -0.13%, +0.01%
VMEM: 1599363 -> 1597473 (-0.12%); split: +0.08%, -0.19%
SMEM: 331461 -> 331126 (-0.10%); split: +0.08%, -0.18%
VClause: 80639 -> 80696 (+0.07%); split: -0.02%, +0.09%
SClause: 155992 -> 155993 (+0.00%); split: -0.02%, +0.02%
Copies: 333482 -> 333318 (-0.05%); split: -0.12%, +0.07%
Branches: 70967 -> 70968 (+0.00%)
PreSGPRs: 187078 -> 187711 (+0.34%); split: -0.01%, +0.35%
PreVGPRs: 244918 -> 244785 (-0.05%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7513>
2020-11-09 19:42:22 +01:00
|
|
|
writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));
|
|
|
|
|
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-10 10:24:36 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.add_lshl)
|
2020-11-18 13:15:24 +01:00
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> s1: %a, v1: %b = p_startpgm
|
2020-11-10 10:24:36 +01:00
|
|
|
if (!setup_cs("s1 v1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp shift;
|
|
|
|
|
|
2020-11-18 13:15:24 +01:00
|
|
|
//~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
|
|
|
|
|
//~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
|
|
|
|
|
//~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
|
2020-11-10 10:24:36 +01:00
|
|
|
//! p_unit_test 0, %res0
|
2021-07-13 11:22:46 +02:00
|
|
|
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
|
|
|
|
|
Operand::c32(3u));
|
|
|
|
|
writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,
|
|
|
|
|
Operand::c32(4u)));
|
2020-11-10 10:24:36 +01:00
|
|
|
|
2020-11-18 13:15:24 +01:00
|
|
|
//~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
|
|
|
|
|
//~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
|
|
|
|
|
//~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
|
|
|
|
|
//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
|
|
|
|
|
//~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
|
|
|
|
|
//~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
|
2020-11-10 10:24:36 +01:00
|
|
|
//! p_unit_test 1, %res1
|
2021-07-13 11:22:46 +02:00
|
|
|
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
|
|
|
|
|
Operand::c32(3u));
|
|
|
|
|
Temp sadd =
|
|
|
|
|
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));
|
2020-11-10 10:24:36 +01:00
|
|
|
Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
|
|
|
|
|
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
//~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3
|
|
|
|
|
//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
2021-07-13 11:22:46 +02:00
|
|
|
Temp lshl =
|
|
|
|
|
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand::c32(3u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7
|
|
|
|
|
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
Operand a_24bit = Operand(inputs[0]);
|
|
|
|
|
a_24bit.set24bit(true);
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(7u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3
|
|
|
|
|
//~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
|
|
|
|
|
//~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
|
|
|
|
|
//! p_unit_test 4, %carry
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
|
|
|
|
|
writeout(4, carry);
|
|
|
|
|
|
|
|
|
|
//~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a
|
|
|
|
|
//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit);
|
|
|
|
|
writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
Operand a_16bit = Operand(inputs[0]);
|
|
|
|
|
a_16bit.set16bit(true);
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand::c32(4u));
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
2020-11-02 16:44:04 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-11 18:42:35 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.bcnt)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, s1: %b = p_startpgm
|
2020-11-11 18:42:35 +01:00
|
|
|
if (!setup_cs("v1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp bcnt;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_bcnt_u32_b32 %a, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
|
2020-11-11 18:42:35 +01:00
|
|
|
writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_bcnt_u32_b32 %a, %b
|
|
|
|
|
//! p_unit_test 1, %res1
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
|
2020-11-11 18:42:35 +01:00
|
|
|
writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_bcnt_u32_b32 %a, 42
|
|
|
|
|
//! p_unit_test 2, %res2
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
|
|
|
|
|
writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
|
2020-11-11 18:42:35 +01:00
|
|
|
|
|
|
|
|
//! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
|
|
|
|
|
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
|
|
|
|
|
//~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
|
|
|
|
|
//! p_unit_test 3, %res3
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
|
2020-11-11 18:42:35 +01:00
|
|
|
writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %bnct4 = v_bcnt_u32_b32 %a, 0
|
|
|
|
|
//~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
|
|
|
|
|
//~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
|
|
|
|
|
//! p_unit_test 4, %carry
|
2021-07-13 11:22:46 +02:00
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
|
2020-11-11 18:42:35 +01:00
|
|
|
Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
|
|
|
|
|
writeout(4, carry);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-10-07 11:09:16 +01:00
|
|
|
|
2020-11-11 15:44:54 +00:00
|
|
|
struct clamp_config {
|
|
|
|
|
const char *name;
|
|
|
|
|
aco_opcode min, max, med3;
|
|
|
|
|
Operand lb, ub;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Parameter sets iterated by the optimize.clamp test. Each entry is
 * {name, min, max, med3, lb, ub}; float bounds are given as raw IEEE-754
 * bit patterns and negative integers as two's complement.
 */
static const clamp_config clamp_configs[] = {
   /* 0.0, 4.0 */
   {"_0,4f32",
    aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::zero(),           /* 0.0 */
    Operand::c32(0x40800000u)} /* 4.0 as binary32 */,
   {"_0,4f16",
    aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0u),      /* 0.0 */
    Operand::c16(0x4400)}  /* 4.0 as binary16 */,

   /* -1.0, 0.0 */
   {"_-1,0f32",
    aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::c32(0xbf800000u), /* -1.0 as binary32 */
    Operand::zero()},
   {"_-1,0f16",
    aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0xbc00), /* -1.0 as binary16 */
    Operand::c16(0u)},

   /* 0, 3 */
   {"_0,3u32",
    aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
    Operand::zero(),
    Operand::c32(3u)},
   {"_0,3u16",
    aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
    Operand::c16(0u),
    Operand::c16(3u)},
   {"_0,3i32",
    aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::zero(),
    Operand::c32(3u)},
   {"_0,3i16",
    aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0u),
    Operand::c16(3u)},

   /* -5, 0 */
   {"_-5,0i32",
    aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::c32(0xfffffffbu), /* -5 in 32 bits */
    Operand::zero()},
   {"_-5,0i16",
    aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0xfffbu), /* -5 in 16 bits */
    Operand::c16(0u)},
};
|
|
|
|
|
|
2020-10-07 11:09:16 +01:00
|
|
|
BEGIN_TEST(optimize.clamp)
|
2020-11-11 15:44:54 +00:00
|
|
|
for (clamp_config cfg : clamp_configs) {
|
2020-11-05 12:43:14 +01:00
|
|
|
if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
|
2020-11-11 15:44:54 +00:00
|
|
|
continue;
|
2020-10-07 11:09:16 +01:00
|
|
|
|
2020-11-11 15:44:54 +00:00
|
|
|
//! cfg: @match_func(min max med3 lb ub)
|
|
|
|
|
fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
|
|
|
|
|
fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
|
|
|
|
|
fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
|
|
|
|
|
aco_print_operand(&cfg.lb, output);
|
|
|
|
|
fprintf(output, " ");
|
|
|
|
|
aco_print_operand(&cfg.ub, output);
|
|
|
|
|
fprintf(output, "\n");
|
|
|
|
|
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, v1: %c = p_startpgm
|
2020-11-11 15:44:54 +00:00
|
|
|
|
|
|
|
|
//! v1: %res0 = @med3 @ub, @lb, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = @med3 @lb, @ub, %a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
|
|
|
|
|
bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* min constant must be greater than max constant */
|
|
|
|
|
//! v1: %res2_tmp = @min @lb, %a
|
|
|
|
|
//! v1: %res2 = @max @ub, %res2_tmp
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
|
|
|
|
|
bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3_tmp = @max @ub, %a
|
|
|
|
|
//! v1: %res3 = @min @lb, %res3_tmp
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* needs two constants */
|
|
|
|
|
|
|
|
|
|
//! v1: %res4_tmp = @max @lb, %a
|
|
|
|
|
//! v1: %res4 = @min %b, %res4_tmp
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5_tmp = @max %b, %a
|
|
|
|
|
//! v1: %res5 = @min @ub, %res5_tmp
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res6_tmp = @max %c, %a
|
|
|
|
|
//! v1: %res6 = @min %b, %res6_tmp
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* correct NaN behaviour with precise */
|
|
|
|
|
|
|
|
|
|
//! v1: %res7 = @med3 @ub, @lb, %a
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
|
|
|
|
|
max.def(0).setPrecise(true);
|
|
|
|
|
Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
|
|
|
|
|
max.def(0).setPrecise(true);
|
|
|
|
|
writeout(7, min);
|
|
|
|
|
|
|
|
|
|
//! v1: (precise)%res8_tmp = @min @ub, %a
|
|
|
|
|
//! v1: %res8 = @max @lb, %res8_tmp
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
|
|
|
|
|
min.def(0).setPrecise(true);
|
|
|
|
|
writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
2020-10-07 11:09:16 +01:00
|
|
|
END_TEST
|
2020-10-07 11:40:45 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.const_comparison_ordering)
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm
|
2020-10-07 11:40:45 +01:00
|
|
|
if (!setup_cs("v1 v1 v2 v1", GFX9))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* optimize to unordered comparison */
|
|
|
|
|
//! s2: %res0 = v_cmp_nge_f32 4.0, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
|
|
|
|
|
Operand::c32(0x40800000u), inputs[0])));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
|
|
|
|
//! s2: %res1 = v_cmp_nge_f32 4.0, %a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
|
|
|
|
|
Operand::c32(0x40800000u), inputs[0])));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
|
|
|
|
//! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
|
|
|
|
|
bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
|
|
|
|
/* optimize to ordered comparison */
|
|
|
|
|
//! s2: %res3 = v_cmp_lt_f32 4.0, %a
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
|
|
|
|
|
Operand::c32(0x40800000u), inputs[0])));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
|
|
|
|
//! s2: %res4 = v_cmp_lt_f32 4.0, %a
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
|
|
|
|
|
Operand::c32(0x40800000u), inputs[0])));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
|
|
|
|
//! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
|
|
|
|
|
bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
2020-10-07 14:46:34 +01:00
|
|
|
/* similar but unoptimizable expressions */
|
|
|
|
|
//! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a
|
|
|
|
|
//! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a
|
|
|
|
|
//! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0
|
|
|
|
|
//! p_unit_test 6, %res6
|
2021-07-13 11:22:46 +02:00
|
|
|
Temp src1 =
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
|
2020-12-06 10:38:40 +00:00
|
|
|
Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
|
|
|
|
//! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a
|
|
|
|
|
//! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a
|
|
|
|
|
//! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
|
|
|
|
|
//! p_unit_test 7, %res7
|
2021-07-13 11:22:46 +02:00
|
|
|
src1 =
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
|
2020-12-06 10:38:40 +00:00
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
|
|
|
|
//! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d
|
|
|
|
|
//! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a
|
|
|
|
|
//! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0
|
|
|
|
|
//! p_unit_test 8, %res8
|
2021-07-13 11:22:46 +02:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[3]);
|
2020-12-06 10:38:40 +00:00
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
|
|
|
|
//! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a
|
|
|
|
|
//! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d
|
|
|
|
|
//! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0
|
|
|
|
|
//! p_unit_test 9, %res9
|
2021-07-13 11:22:46 +02:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
|
2020-12-06 10:38:40 +00:00
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]);
|
|
|
|
|
writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
|
|
|
|
/* bit sizes */
|
2021-01-15 09:23:04 +01:00
|
|
|
//! s2: %res10 = v_cmp_nge_f16 4.0, %b
|
2020-10-07 14:46:34 +01:00
|
|
|
//! p_unit_test 10, %res10
|
2021-07-13 11:22:46 +02:00
|
|
|
Temp input1_16 =
|
|
|
|
|
bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand::zero());
|
2020-10-07 14:46:34 +01:00
|
|
|
writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16),
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(0x4400u),
|
|
|
|
|
input1_16)));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
|
|
|
|
//! s2: %res11 = v_cmp_nge_f64 4.0, %c
|
|
|
|
|
//! p_unit_test 11, %res11
|
|
|
|
|
writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]),
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm),
|
|
|
|
|
Operand::c64(0x4010000000000000u), inputs[2])));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
2020-10-07 11:40:45 +01:00
|
|
|
/* NaN */
|
|
|
|
|
uint16_t nan16 = 0x7e00;
|
|
|
|
|
uint32_t nan32 = 0x7fc00000;
|
2020-10-07 14:46:34 +01:00
|
|
|
uint64_t nan64 = 0xffffffffffffffffllu;
|
2020-10-07 11:40:45 +01:00
|
|
|
|
2020-10-07 14:46:34 +01:00
|
|
|
//! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a
|
|
|
|
|
//! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a
|
|
|
|
|
//! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0
|
|
|
|
|
//! p_unit_test 12, %res12
|
2021-07-13 11:22:46 +02:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(nan16), inputs[0]);
|
2020-12-06 10:38:40 +00:00
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
2020-10-07 14:46:34 +01:00
|
|
|
//! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a
|
|
|
|
|
//! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a
|
|
|
|
|
//! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0
|
|
|
|
|
//! p_unit_test 13, %res13
|
2021-07-13 11:22:46 +02:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(nan32), inputs[0]);
|
2020-12-06 10:38:40 +00:00
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
2020-10-07 14:46:34 +01:00
|
|
|
//! s2: %tmp14_0 = v_cmp_lt_f64 -1, %a
|
|
|
|
|
//! s2: %tmp14_1 = v_cmp_neq_f64 %a, %a
|
|
|
|
|
//! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0
|
|
|
|
|
//! p_unit_test 14, %res14
|
2021-07-13 11:22:46 +02:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand::c64(nan64), inputs[0]);
|
2020-12-06 10:38:40 +00:00
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
2020-10-07 11:40:45 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2020-10-07 11:45:30 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.add3)
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, v1: %c = p_startpgm
|
2020-10-07 11:45:30 +01:00
|
|
|
if (!setup_cs("v1 v1 v1", GFX9))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_add3_u32 %a, %b, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %tmp1 = v_add_u32 %b, %c clamp
|
|
|
|
|
//! v1: %res1 = v_add_u32 %a, %tmp1
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
|
2021-01-21 16:13:34 +00:00
|
|
|
tmp.instr->vop3().clamp = true;
|
2020-10-07 11:45:30 +01:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %tmp2 = v_add_u32 %b, %c
|
|
|
|
|
//! v1: %res2 = v_add_u32 %a, %tmp2 clamp
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
|
2021-01-21 16:13:34 +00:00
|
|
|
tmp.instr->vop3().clamp = true;
|
2020-10-07 11:45:30 +01:00
|
|
|
writeout(2, tmp);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2020-11-17 17:14:49 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.minmax)
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a = p_startpgm
|
2020-11-17 17:14:49 +01:00
|
|
|
if (!setup_cs("v1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_max3_f32 0, -0, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp xor0 = fneg(inputs[0]);
|
2021-07-13 11:22:46 +02:00
|
|
|
Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0);
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp xor1 = fneg(min);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
|
2020-11-17 17:14:49 +01:00
|
|
|
|
|
|
|
|
//! v1: %res1 = v_max3_f32 0, -0, -%a
|
|
|
|
|
//! p_unit_test 1, %res1
|
2021-07-13 11:22:46 +02:00
|
|
|
min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0]));
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
xor1 = fneg(min);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
|
2020-11-17 17:14:49 +01:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-06-05 17:36:29 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_32_24)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX9; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, v1: %c = p_startpgm
|
2020-06-05 17:36:29 +01:00
|
|
|
if (!setup_cs("v1 v1 v1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_mad_u32_u24 %b, %c, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1_tmp = v_mul_u32_u24 %b, %c
|
|
|
|
|
//! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-18 13:07:57 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.add_lshlrev)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, s1: %c = p_startpgm
|
2020-11-18 13:07:57 +01:00
|
|
|
if (!setup_cs("v1 v1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp lshl;
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
|
|
|
|
|
//~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//! p_unit_test 0, %res0
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
|
|
|
|
|
//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
Operand a_24bit = Operand(inputs[0]);
|
|
|
|
|
a_24bit.set24bit(true);
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
|
|
|
|
|
//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
Operand b_24bit = Operand(inputs[1]);
|
|
|
|
|
b_24bit.set24bit(true);
|
|
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
|
|
|
|
|
writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
|
|
|
|
|
//! p_unit_test 3, %res3
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
Operand a_16bit = Operand(inputs[0]);
|
|
|
|
|
a_16bit.set16bit(true);
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl5 = v_lshlrev_b32 4, (is24bit)%c
|
|
|
|
|
//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %c, %lshl5
|
|
|
|
|
//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
Operand c_24bit = Operand(inputs[2]);
|
|
|
|
|
c_24bit.set24bit(true);
|
2021-07-13 11:22:46 +02:00
|
|
|
lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
|
2020-11-18 13:07:57 +01:00
|
|
|
writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2021-02-16 16:48:46 +00:00
|
|
|
|
|
|
|
|
/* Operation applied to the value under test in optimize.denorm_propagation.
 * The numeric values double as indices into denorm_op_names, so they must
 * stay dense and start at 0. */
enum denorm_op {
   denorm_mul1 = 0,    /* multiply by 1.0 */
   denorm_fneg = 1,    /* fneg(x) */
   denorm_fabs = 2,    /* fabs(x) */
   denorm_fnegabs = 3, /* fneg(fabs(x)) */
};
|
|
|
|
|
|
|
|
|
|
/* Printable name of each denorm_op, indexed by the enumerator's value. The
 * strings end up in the test's subvariant name and in its printed output, so
 * they must not change. The table itself is const: nothing writes to it. */
static const char *const denorm_op_names[] = {
   "mul1",
   "fneg",
   "fabs",
   "fnegabs",
};
|
|
|
|
|
|
|
|
|
|
struct denorm_config {
|
|
|
|
|
bool flush;
|
|
|
|
|
unsigned op;
|
|
|
|
|
aco_opcode src;
|
|
|
|
|
aco_opcode dest;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Short printable name for the source/destination instruction of a
 * denorm_config.
 *
 * Returns "cndmask", "min" or "rcp" for v_cndmask_b32, v_min_f32 and
 * v_rcp_f32 respectively, and "none" for every other opcode (which includes
 * aco_opcode::num_opcodes, the value the test uses for "no instruction").
 * The returned strings are string literals and must not be freed. */
static const char *srcdest_op_name(aco_opcode op)
{
   switch (op) {
   case aco_opcode::v_cndmask_b32:
      return "cndmask";
   case aco_opcode::v_min_f32:
      return "min";
   case aco_opcode::v_rcp_f32:
      return "rcp";
   default:
      return "none";
   }
}
|
|
|
|
|
|
|
|
|
|
/* Emits the source/destination instruction of a denorm_config around `val`
 * using the file-level builder `bld`, and returns its result.
 *
 *   v_cndmask_b32: operands 0, val and inputs[1]
 *   v_min_f32:     operands 0 and val
 *   v_rcp_f32:     operand val
 *
 * For any other opcode nothing is emitted and `val` is returned unchanged. */
static Temp emit_denorm_srcdest(aco_opcode op, Temp val)
{
   switch (op) {
   case aco_opcode::v_cndmask_b32:
      return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
   case aco_opcode::v_min_f32:
      return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
   case aco_opcode::v_rcp_f32:
      return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
   default:
      return val;
   }
}
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.denorm_propagation)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX9; i++) {
|
|
|
|
|
std::vector<denorm_config> configs;
|
|
|
|
|
for (bool flush : {false, true}) {
|
|
|
|
|
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
|
|
|
|
|
configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});
|
|
|
|
|
|
|
|
|
|
for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
|
|
|
|
|
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
|
|
|
|
|
configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
|
|
|
|
|
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
|
|
|
|
|
configs.push_back({flush, op, src, aco_opcode::num_opcodes});
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (denorm_config cfg : configs) {
|
|
|
|
|
char subvariant[128];
|
|
|
|
|
sprintf(subvariant, "_%s_%s_%s_%s",
|
|
|
|
|
cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
|
|
|
|
|
denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
|
|
|
|
|
if (!setup_cs("v1 s2", (chip_class)i, CHIP_UNKNOWN, subvariant))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
|
|
|
|
|
cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) ||
|
|
|
|
|
!cfg.flush;
|
|
|
|
|
|
|
|
|
|
fprintf(output, "src, dest, op: %s %s %s\n",
|
|
|
|
|
srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
|
|
|
|
|
fprintf(output, "can_propagate: %u\n", can_propagate);
|
|
|
|
|
//! src, dest, op: $src $dest $op
|
|
|
|
|
//! can_propagate: #can_propagate
|
|
|
|
|
//>> v1: %a, s2: %b = p_startpgm
|
|
|
|
|
|
|
|
|
|
//; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
|
|
|
|
|
//; 'min': 'v1: %{} = v_min_f32 0, {}',
|
|
|
|
|
//; 'rcp': 'v1: %{} = v_rcp_f32 {}'}
|
|
|
|
|
//; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
|
|
|
|
|
//; 'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
|
|
|
|
|
//; 'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
|
|
|
|
|
//; 'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
|
|
|
|
|
//; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}
|
|
|
|
|
|
|
|
|
|
//; name = 'a'
|
|
|
|
|
//; if src != 'none':
|
|
|
|
|
//; insert_pattern(patterns[src].format('src_res', '%'+name))
|
|
|
|
|
//; name = 'src_res'
|
|
|
|
|
|
|
|
|
|
//; if can_propagate:
|
|
|
|
|
//; name = inline_ops[op].format(name)
|
|
|
|
|
//; else:
|
|
|
|
|
//; insert_pattern(ops[op].format('op_res', name))
|
|
|
|
|
//; name = '%op_res'
|
|
|
|
|
|
|
|
|
|
//; if dest != 'none':
|
|
|
|
|
//; insert_pattern(patterns[dest].format('dest_res', name))
|
|
|
|
|
//; name = '%dest_res'
|
|
|
|
|
|
|
|
|
|
//; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
|
|
|
|
|
//! p_unit_test 0, %res
|
|
|
|
|
|
|
|
|
|
program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;
|
|
|
|
|
|
|
|
|
|
Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);
|
|
|
|
|
switch (cfg.op) {
|
|
|
|
|
case denorm_mul1:
|
2021-07-13 11:22:46 +02:00
|
|
|
val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
|
2021-02-16 16:48:46 +00:00
|
|
|
break;
|
|
|
|
|
case denorm_fneg:
|
|
|
|
|
val = fneg(val);
|
|
|
|
|
break;
|
|
|
|
|
case denorm_fabs:
|
|
|
|
|
val = fabs(val);
|
|
|
|
|
break;
|
|
|
|
|
case denorm_fnegabs:
|
|
|
|
|
val = fneg(fabs(val));
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
val = emit_denorm_srcdest(cfg.dest, val);
|
2021-07-13 11:22:46 +02:00
|
|
|
writeout(
|
|
|
|
|
0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));
|
2021-02-16 16:48:46 +00:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2021-07-19 15:39:34 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimizer.dpp)
|
|
|
|
|
//>> v1: %a, v1: %b, s2: %c = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 v1 s2", GFX10_3))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Operand a(inputs[0]);
|
|
|
|
|
Operand b(inputs[1]);
|
|
|
|
|
Operand c(inputs[2]);
|
|
|
|
|
|
|
|
|
|
/* basic optimization */
|
|
|
|
|
//! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
|
|
|
|
|
writeout(0, res0);
|
|
|
|
|
|
|
|
|
|
/* operand swapping */
|
|
|
|
|
//! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
|
|
|
|
|
writeout(1, res1);
|
|
|
|
|
|
|
|
|
|
//! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
|
|
|
|
|
//! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
|
|
|
|
|
writeout(2, res2);
|
|
|
|
|
|
|
|
|
|
/* modifiers */
|
|
|
|
|
//! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
tmp3.instr->dpp().neg[0] = true;
|
|
|
|
|
Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
|
|
|
|
|
writeout(3, res3);
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
|
|
|
|
|
res4.instr->vop3().neg[0] = true;
|
|
|
|
|
writeout(4, res4);
|
|
|
|
|
|
|
|
|
|
//! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
|
|
|
|
|
//! v1: %res5 = v_add_f32 %tmp5, %b clamp
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
|
|
|
|
|
res5.instr->vop3().clamp = true;
|
|
|
|
|
writeout(5, res5);
|
|
|
|
|
|
|
|
|
|
//! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
tmp6.instr->dpp().neg[0] = true;
|
|
|
|
|
auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
|
|
|
|
|
res6.instr->vop3().abs[0] = true;
|
|
|
|
|
writeout(6, res6);
|
|
|
|
|
|
|
|
|
|
//! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
|
|
|
|
|
res7.instr->vop3().abs[0] = true;
|
|
|
|
|
writeout(7, res7);
|
|
|
|
|
|
|
|
|
|
/* vcc */
|
|
|
|
|
//! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
|
|
|
|
Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
|
|
|
|
|
writeout(8, res8);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
2021-08-27 17:53:48 +01:00
|
|
|
BEGIN_TEST(optimize.dpp_prop)
|
|
|
|
|
//>> v1: %a, s1: %b = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 s1", GFX10))
|
|
|
|
|
return;
|
|
|
|
|
|
2021-08-30 10:30:45 +01:00
|
|
|
//! v1: %one = p_parallelcopy 1
|
|
|
|
|
//! v1: %res0 = v_mul_f32 1, %a
|
2021-08-27 17:53:48 +01:00
|
|
|
//! p_unit_test 0, %res0
|
2021-08-30 10:30:45 +01:00
|
|
|
Temp one = bld.copy(bld.def(v1), Operand::c32(1));
|
|
|
|
|
writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
|
2021-08-27 17:53:48 +01:00
|
|
|
|
2021-08-30 10:30:45 +01:00
|
|
|
//! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
|
2021-08-27 17:53:48 +01:00
|
|
|
//! p_unit_test 1, %res1
|
2021-08-30 10:30:45 +01:00
|
|
|
writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
|
2021-08-27 17:53:48 +01:00
|
|
|
|
2021-08-30 10:30:45 +01:00
|
|
|
//! v1: %res2 = v_mul_f32 0x12345678, %a
|
2021-08-27 17:53:48 +01:00
|
|
|
//! p_unit_test 2, %res2
|
2021-08-30 10:30:45 +01:00
|
|
|
Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
|
|
|
|
|
writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
|
|
|
|
|
|
|
|
|
|
//! v1: %literal2 = p_parallelcopy 0x12345679
|
|
|
|
|
//! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
|
|
|
|
|
writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
|
|
|
|
|
|
|
|
|
|
//! v1: %b_v = p_parallelcopy %b
|
|
|
|
|
//! v1: %res4 = v_mul_f32 %b, %a
|
|
|
|
|
//! p_unit_test 4, %res4
|
2021-08-27 17:53:48 +01:00
|
|
|
Temp b_v = bld.copy(bld.def(v1), inputs[1]);
|
2021-08-30 10:30:45 +01:00
|
|
|
writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res6 = v_rcp_f32 %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));
|
2021-08-27 17:53:48 +01:00
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|