2020-01-22 19:59:56 +00:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2020 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
#include "helpers.h"
|
|
|
|
|
|
|
|
|
|
using namespace aco;
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
/* Emits a VOP2 v_mul_f32 computing -1.0 * src into a fresh v1 definition and
 * returns that temporary. */
Temp fneg(Temp src)
{
   const Operand minus_one(0xbf800000u); /* bit pattern of -1.0f */
   return bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), minus_one, src);
}
|
|
|
|
|
|
|
|
|
|
/* Emits a VOP3-encoded v_mul_f32 computing 1.0 * |src| into a fresh v1
 * definition and returns that temporary. */
Temp fabs(Temp src)
{
   const Operand one(0x3f800000u); /* bit pattern of 1.0f */
   Builder::Result mul = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), one, src);
   /* src is operand 1 of the multiply; set its abs input modifier */
   mul.instr->vop3().abs[1] = true;
   return mul;
}
|
|
|
|
|
|
2020-01-22 19:59:56 +00:00
|
|
|
BEGIN_TEST(optimize.neg)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
|
2020-01-22 19:59:56 +00:00
|
|
|
if (!setup_cs("v1 v1 s1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_mul_f32 %a, -%b
|
|
|
|
|
//! p_unit_test 0, %res0
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_b = fneg(inputs[1]);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
//~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
|
|
|
|
|
//~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
|
2020-01-22 19:59:56 +00:00
|
|
|
//~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
|
|
|
|
|
//! p_unit_test 1, %res1
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_a = fneg(inputs[0]);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x123456u), neg_a));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_mul_f32 %a, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_neg_a = fneg(neg_a);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
//! v1: %res3 = v_mul_f32 |%a|, %b
|
2020-01-22 19:59:56 +00:00
|
|
|
//! p_unit_test 3, %res3
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp abs_neg_a = fabs(neg_a);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_mul_f32 -|%a|, %b
|
|
|
|
|
//! p_unit_test 4, %res4
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp abs_a = fabs(inputs[0]);
|
|
|
|
|
Temp neg_abs_a = fneg(abs_a);
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res6 = v_subrev_f32 %a, %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res7 = v_sub_f32 %b, %a
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
|
|
|
|
|
|
|
|
|
|
//! v1: %res8 = v_mul_f32 %a, -%c
|
|
|
|
|
//! p_unit_test 8, %res8
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
|
2020-01-22 19:59:56 +00:00
|
|
|
writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
|
|
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
// //! v1: %res9 = v_mul_f32 |%neg_a|, %b
|
|
|
|
|
// //! p_unit_test 9, %res9
|
|
|
|
|
Temp abs_neg_abs_a = fabs(neg_abs_a);
|
|
|
|
|
writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));
|
|
|
|
|
|
2020-01-22 19:59:56 +00:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
|
2020-11-13 15:12:35 +00:00
|
|
|
BEGIN_TEST(optimize.output_modifiers)
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b = p_startpgm
|
2020-11-13 15:12:35 +00:00
|
|
|
if (!setup_cs("v1 v1", GFX9))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
|
|
|
|
|
/* 32-bit modifiers */
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_add_f32 %a, %b *0.5
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f000000u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_add_f32 %a, %b *2
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x40000000u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_add_f32 %a, %b *4
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x40800000u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3 = v_add_f32 %a, %b clamp
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand(0u), Operand(0x3f800000u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_add_f32 %a, %b *2 clamp
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x40000000u), tmp);
|
|
|
|
|
writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand(0u), Operand(0x3f800000u), tmp));
|
|
|
|
|
|
|
|
|
|
/* 16-bit modifiers */
|
|
|
|
|
|
|
|
|
|
//! v2b: %res5 = v_add_f16 %a, %b *0.5
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
|
|
|
|
writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3800u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res6 = v_add_f16 %a, %b *2
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
|
|
|
|
writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x4000u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res7 = v_add_f16 %a, %b *4
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
|
|
|
|
writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x4400u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res8 = v_add_f16 %a, %b clamp
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
|
|
|
|
writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand((uint16_t)0u), Operand((uint16_t)0x3c00u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res9 = v_add_f16 %a, %b *2 clamp
|
|
|
|
|
//! p_unit_test 9, %res9
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x4000), tmp);
|
|
|
|
|
writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand((uint16_t)0u), Operand((uint16_t)0x3c00u), tmp));
|
|
|
|
|
|
|
|
|
|
/* clamping is done after omod */
|
|
|
|
|
|
|
|
|
|
//! v1: %res10_tmp = v_add_f32 %a, %b clamp
|
|
|
|
|
//! v1: %res10 = v_mul_f32 2.0, %res10_tmp
|
|
|
|
|
//! p_unit_test 10, %res10
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand(0u), Operand(0x3f800000u), tmp);
|
|
|
|
|
writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x40000000u), tmp));
|
|
|
|
|
|
|
|
|
|
/* unsupported instructions */
|
|
|
|
|
|
|
|
|
|
//! v1: %res11_tmp = v_xor_b32 %a, %b
|
|
|
|
|
//! v1: %res11 = v_mul_f32 2.0, %res11_tmp
|
|
|
|
|
//! p_unit_test 11, %res11
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x40000000u), tmp));
|
|
|
|
|
|
|
|
|
|
/* several users */
|
|
|
|
|
|
|
|
|
|
//! v1: %res12_tmp = v_add_f32 %a, %b
|
|
|
|
|
//! p_unit_test %res12_tmp
|
|
|
|
|
//! v1: %res12 = v_mul_f32 2.0, %res12_tmp
|
|
|
|
|
//! p_unit_test 12, %res12
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
bld.pseudo(aco_opcode::p_unit_test, tmp);
|
|
|
|
|
writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x40000000u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %res13 = v_add_f32 %a, %b
|
|
|
|
|
//! p_unit_test 13, %res13
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x40000000u), tmp);
|
|
|
|
|
writeout(13, tmp);
|
|
|
|
|
|
|
|
|
|
/* omod has no effect if denormals are enabled but clamp is fine */
|
|
|
|
|
|
|
|
|
|
//>> BB1
|
|
|
|
|
//! /* logical preds: / linear preds: / kind: uniform, */
|
|
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_flush;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
|
|
|
|
|
|
|
|
|
//! v1: %res14_tmp = v_add_f32 %a, %b
|
|
|
|
|
//! v1: %res14 = v_mul_f32 2.0, %res13_tmp
|
|
|
|
|
//! p_unit_test 14, %res14
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x40000000u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %res15 = v_add_f32 %a, %b clamp
|
|
|
|
|
//! p_unit_test 15, %res15
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand(0u), Operand(0x3f800000u), tmp));
|
|
|
|
|
|
|
|
|
|
//>> BB2
|
|
|
|
|
//! /* logical preds: / linear preds: / kind: uniform, */
|
|
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_flush;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
|
|
|
|
|
|
|
|
|
//! v2b: %res16_tmp = v_add_f16 %a, %b
|
|
|
|
|
//! v2b: %res16 = v_mul_f16 2.0, %res15_tmp
|
|
|
|
|
//! p_unit_test 16, %res16
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
|
|
|
|
writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x4000u), tmp));
|
|
|
|
|
|
|
|
|
|
//! v2b: %res17 = v_add_f16 %a, %b clamp
|
|
|
|
|
//! p_unit_test 17, %res17
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
|
|
|
|
writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand((uint16_t)0u), Operand((uint16_t)0x3c00u), tmp));
|
|
|
|
|
|
|
|
|
|
/* omod flushes -0.0 to +0.0 */
|
|
|
|
|
|
|
|
|
|
//>> BB3
|
|
|
|
|
//! /* logical preds: / linear preds: / kind: uniform, */
|
|
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan32 = true;
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
|
|
|
|
|
|
|
|
|
//! v1: %res18_tmp = v_add_f32 %a, %b
|
|
|
|
|
//! v1: %res18 = v_mul_f32 2.0, %res18_tmp
|
|
|
|
|
//! p_unit_test 18, %res18
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x40000000u), tmp));
|
|
|
|
|
//! v1: %res19 = v_add_f32 %a, %b clamp
|
|
|
|
|
//! p_unit_test 19, %res19
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand(0u), Operand(0x3f800000u), tmp));
|
|
|
|
|
|
|
|
|
|
//>> BB4
|
|
|
|
|
//! /* logical preds: / linear preds: / kind: uniform, */
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true;
|
|
|
|
|
bld.reset(program->create_and_insert_block());
|
|
|
|
|
//! v2b: %res20_tmp = v_add_f16 %a, %b
|
|
|
|
|
//! v2b: %res20 = v_mul_f16 2.0, %res20_tmp
|
|
|
|
|
//! p_unit_test 20, %res20
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
|
|
|
|
writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x4000u), tmp));
|
|
|
|
|
//! v2b: %res21 = v_add_f16 %a, %b clamp
|
|
|
|
|
//! p_unit_test 21, %res21
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
|
|
|
|
|
writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand((uint16_t)0u), Operand((uint16_t)0x3c00u), tmp));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
|
|
|
|
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
/* Emits a v_subbrev_co_u32 through the builder's vop2_e64 helper and returns
 * the builder result converted to a Temp.
 *
 * The instruction gets two definitions: a v1 definition and a lane-mask
 * (bld.lm) definition that is passed through hint_vcc. op0, op1 and op2 are
 * forwarded unchanged as the three operands.
 */
Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)
{
   const aco_opcode opcode = aco_opcode::v_subbrev_co_u32;
   Builder::Result subbrev =
      bld.vop2_e64(opcode, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), op0, op1, op2);
   return subbrev;
}
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.cndmask)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, s1: %b, s2: %c = p_startpgm
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
if (!setup_cs("v1 s1 s2", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp subbrev;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_cndmask_b32 0, %a, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_cndmask_b32 0, 42, %c
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
|
|
|
|
|
writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(42u), subbrev));
|
|
|
|
|
|
|
|
|
|
//~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
|
|
|
|
|
//~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
|
|
|
|
|
//~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
|
|
|
|
|
writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));
|
|
|
|
|
|
|
|
|
|
//! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
|
|
|
|
|
//! v1: %xor = v_xor_b32 %a, %subbrev1
|
|
|
|
|
//! v1: %res3 = v_cndmask_b32 0, %xor, %c
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
|
|
|
|
|
Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
|
|
|
|
|
writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));
|
|
|
|
|
|
aco: fix combining add/sub to b2i if a new dest needs to be allocated
The uses vector needs to be expanded to avoid out of bounds access
and to make sure the number of uses is initialized to 0.
This fixes combining more v_and(a, v_subbrev_co_u32).
fossilds-db (Vega10):
Totals from 4574 (3.28% of 139517) affected shaders:
SGPRs: 291625 -> 292217 (+0.20%); split: -0.01%, +0.21%
VGPRs: 276368 -> 276188 (-0.07%); split: -0.07%, +0.01%
SpillSGPRs: 455 -> 533 (+17.14%)
SpillVGPRs: 76 -> 78 (+2.63%)
CodeSize: 23327500 -> 23304152 (-0.10%); split: -0.17%, +0.07%
MaxWaves: 22044 -> 22066 (+0.10%)
Instrs: 4583064 -> 4576301 (-0.15%); split: -0.15%, +0.01%
Cycles: 47925276 -> 47871968 (-0.11%); split: -0.13%, +0.01%
VMEM: 1599363 -> 1597473 (-0.12%); split: +0.08%, -0.19%
SMEM: 331461 -> 331126 (-0.10%); split: +0.08%, -0.18%
VClause: 80639 -> 80696 (+0.07%); split: -0.02%, +0.09%
SClause: 155992 -> 155993 (+0.00%); split: -0.02%, +0.02%
Copies: 333482 -> 333318 (-0.05%); split: -0.12%, +0.07%
Branches: 70967 -> 70968 (+0.00%)
PreSGPRs: 187078 -> 187711 (+0.34%); split: -0.01%, +0.35%
PreVGPRs: 244918 -> 244785 (-0.05%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7513>
2020-11-09 19:42:22 +01:00
|
|
|
//! v1: %res4 = v_cndmask_b32 0, %a, %c
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), Operand(inputs[2]));
|
|
|
|
|
Temp sub = bld.vsub32(bld.def(v1), Operand(0u), cndmask);
|
|
|
|
|
writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));
|
|
|
|
|
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-10 10:24:36 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.add_lshl)
|
2020-11-18 13:15:24 +01:00
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> s1: %a, v1: %b = p_startpgm
|
2020-11-10 10:24:36 +01:00
|
|
|
if (!setup_cs("s1 v1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp shift;
|
|
|
|
|
|
2020-11-18 13:15:24 +01:00
|
|
|
//~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
|
|
|
|
|
//~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
|
|
|
|
|
//~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
|
2020-11-10 10:24:36 +01:00
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
|
|
|
|
|
Operand(inputs[0]), Operand(3u));
|
|
|
|
|
writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand(4u)));
|
|
|
|
|
|
2020-11-18 13:15:24 +01:00
|
|
|
//~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
|
|
|
|
|
//~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
|
|
|
|
|
//~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
|
|
|
|
|
//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
|
|
|
|
|
//~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
|
|
|
|
|
//~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
|
2020-11-10 10:24:36 +01:00
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
|
|
|
|
|
Operand(inputs[0]), Operand(3u));
|
|
|
|
|
Temp sadd = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand(4u));
|
|
|
|
|
Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
|
|
|
|
|
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
//~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3
|
|
|
|
|
//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
Temp lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand(3u));
|
|
|
|
|
writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7
|
|
|
|
|
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
Operand a_24bit = Operand(inputs[0]);
|
|
|
|
|
a_24bit.set24bit(true);
|
|
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand(7u));
|
|
|
|
|
writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3
|
|
|
|
|
//~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
|
|
|
|
|
//~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
|
|
|
|
|
//! p_unit_test 4, %carry
|
|
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand(3u));
|
|
|
|
|
Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
|
|
|
|
|
writeout(4, carry);
|
|
|
|
|
|
|
|
|
|
//~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a
|
|
|
|
|
//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit);
|
|
|
|
|
writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand(3u));
|
|
|
|
|
writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
Operand a_16bit = Operand(inputs[0]);
|
|
|
|
|
a_16bit.set16bit(true);
|
|
|
|
|
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand(4u));
|
|
|
|
|
writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
2020-11-10 10:24:36 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-02 16:44:04 +01:00
|
|
|
|
|
|
|
|
/* Emits v_mad_u32_u16 with operands a, b and c and a v1 definition, and
 * returns the builder result converted to a Temp.
 *
 * is16bit is applied to the 16-bit flag of a and b before the instruction is
 * built; c is passed through untouched.
 */
Temp create_mad_u32_u16(Operand a, Operand b, Operand c, bool is16bit = true)
{
   /* a and b are independent by-value copies, so the order of these two
    * flag updates does not matter. */
   b.set16bit(is16bit);
   a.set16bit(is16bit);

   Builder::Result mad = bld.vop3(aco_opcode::v_mad_u32_u16, bld.def(v1), a, b, c);
   return mad;
}
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_u32_u16)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, s1: %c = p_startpgm
|
2020-11-02 16:44:04 +01:00
|
|
|
if (!setup_cs("v1 v1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_mul_u32_u24 (is16bit)%a, (is16bit)%b
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, create_mad_u32_u16(Operand(inputs[0]), Operand(inputs[1]), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_mul_u32_u24 42, (is16bit)%a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_mul_u32_u24 42, (is16bit)%a
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, create_mad_u32_u16(Operand(inputs[0]), Operand(42u), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3 = v_mul_u32_u24 (is16bit)%c, (is16bit)%a
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, create_mad_u32_u16(Operand(inputs[2]), Operand(inputs[0]), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_mad_u32_u16 42, (is16bit)%c, 0
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, create_mad_u32_u16(Operand(42u), Operand(inputs[2]), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5 = v_mad_u32_u16 42, %a, 0
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u), false));
|
|
|
|
|
|
2020-11-02 15:34:25 +01:00
|
|
|
//~gfx9! v1: %mul6 = v_mul_lo_u16 %a, %b
|
|
|
|
|
//~gfx9! v1: %res6 = v_add_u32 %mul6, %b
|
|
|
|
|
//~gfx10! v1: %mul6 = v_mul_lo_u16_e64 %a, %b
|
|
|
|
|
//~gfx10! v1: %res6 = v_add_u32 %mul6, %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
Temp mul;
|
|
|
|
|
if (i >= GFX10) {
|
|
|
|
|
mul = bld.vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
} else {
|
|
|
|
|
mul = bld.vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
}
|
|
|
|
|
writeout(6, bld.vadd32(bld.def(v1), mul, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//~gfx9! v1: %res7 = v_mad_u32_u16 %a, %b, %b
|
|
|
|
|
//~gfx10! v1: (nuw)%mul7 = v_mul_lo_u16_e64 %a, %b
|
|
|
|
|
//~gfx10! v1: %res7 = v_add_u32 %mul7, %b
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
if (i >= GFX10) {
|
|
|
|
|
mul = bld.nuw().vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
} else {
|
|
|
|
|
mul = bld.nuw().vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
}
|
|
|
|
|
writeout(7, bld.vadd32(bld.def(v1), mul, inputs[1]));
|
|
|
|
|
|
2020-11-02 16:44:04 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-11 18:42:35 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.bcnt)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, s1: %b = p_startpgm
|
2020-11-11 18:42:35 +01:00
|
|
|
if (!setup_cs("v1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp bcnt;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_bcnt_u32_b32 %a, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
|
|
|
|
|
writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_bcnt_u32_b32 %a, %b
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_bcnt_u32_b32 %a, 42
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
|
|
|
|
|
writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand(42u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
|
|
|
|
|
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
|
|
|
|
|
//~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand(0u));
|
|
|
|
|
writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %bnct4 = v_bcnt_u32_b32 %a, 0
|
|
|
|
|
//~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
|
|
|
|
|
//~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
|
|
|
|
|
//! p_unit_test 4, %carry
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
|
|
|
|
|
Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
|
|
|
|
|
writeout(4, carry);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-10-07 11:09:16 +01:00
|
|
|
|
2020-11-11 15:44:54 +00:00
|
|
|
struct clamp_config {
|
|
|
|
|
const char *name;
|
|
|
|
|
aco_opcode min, max, med3;
|
|
|
|
|
Operand lb, ub;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Clamp ranges iterated by optimize.clamp. Each entry lists, in order:
 * name, min opcode, max opcode, med3 opcode, lower bound, upper bound.
 */
static const clamp_config clamp_configs[] = {
   /* 0.0, 4.0 */
   {"_0,4f32",
    aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand(0u), Operand(0x40800000u)},
   {"_0,4f16",
    aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand(static_cast<uint16_t>(0u)), Operand(static_cast<uint16_t>(0x4400))},

   /* -1.0, 0.0 */
   {"_-1,0f32",
    aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand(0xbf800000u), Operand(0u)},
   {"_-1,0f16",
    aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand(static_cast<uint16_t>(0xBC00)), Operand(static_cast<uint16_t>(0u))},

   /* 0, 3 */
   {"_0,3u32",
    aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
    Operand(0u), Operand(3u)},
   {"_0,3u16",
    aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
    Operand(static_cast<uint16_t>(0u)), Operand(static_cast<uint16_t>(3u))},
   {"_0,3i32",
    aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand(0u), Operand(3u)},
   {"_0,3i16",
    aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand(static_cast<uint16_t>(0u)), Operand(static_cast<uint16_t>(3u))},

   /* -5, 0 */
   {"_-5,0i32",
    aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand(0xfffffffbu), Operand(0u)},
   {"_-5,0i16",
    aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand(static_cast<uint16_t>(0xfffbu)), Operand(static_cast<uint16_t>(0u))},
};
|
|
|
|
|
|
2020-10-07 11:09:16 +01:00
|
|
|
BEGIN_TEST(optimize.clamp)
|
2020-11-11 15:44:54 +00:00
|
|
|
for (clamp_config cfg : clamp_configs) {
|
2020-11-05 12:43:14 +01:00
|
|
|
if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
|
2020-11-11 15:44:54 +00:00
|
|
|
continue;
|
2020-10-07 11:09:16 +01:00
|
|
|
|
2020-11-11 15:44:54 +00:00
|
|
|
//! cfg: @match_func(min max med3 lb ub)
|
|
|
|
|
fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
|
|
|
|
|
fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
|
|
|
|
|
fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
|
|
|
|
|
aco_print_operand(&cfg.lb, output);
|
|
|
|
|
fprintf(output, " ");
|
|
|
|
|
aco_print_operand(&cfg.ub, output);
|
|
|
|
|
fprintf(output, "\n");
|
|
|
|
|
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, v1: %c = p_startpgm
|
2020-11-11 15:44:54 +00:00
|
|
|
|
|
|
|
|
//! v1: %res0 = @med3 @ub, @lb, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = @med3 @lb, @ub, %a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
|
|
|
|
|
bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* min constant must be greater than max constant */
|
|
|
|
|
//! v1: %res2_tmp = @min @lb, %a
|
|
|
|
|
//! v1: %res2 = @max @ub, %res2_tmp
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
|
|
|
|
|
bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3_tmp = @max @ub, %a
|
|
|
|
|
//! v1: %res3 = @min @lb, %res3_tmp
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* needs two constants */
|
|
|
|
|
|
|
|
|
|
//! v1: %res4_tmp = @max @lb, %a
|
|
|
|
|
//! v1: %res4 = @min %b, %res4_tmp
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5_tmp = @max %b, %a
|
|
|
|
|
//! v1: %res5 = @min @ub, %res5_tmp
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res6_tmp = @max %c, %a
|
|
|
|
|
//! v1: %res6 = @min %b, %res6_tmp
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
|
|
|
|
|
bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* correct NaN behaviour with precise */
|
|
|
|
|
|
|
|
|
|
//! v1: %res7 = @med3 @ub, @lb, %a
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
|
|
|
|
|
max.def(0).setPrecise(true);
|
|
|
|
|
Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
|
|
|
|
|
max.def(0).setPrecise(true);
|
|
|
|
|
writeout(7, min);
|
|
|
|
|
|
|
|
|
|
//! v1: (precise)%res8_tmp = @min @ub, %a
|
|
|
|
|
//! v1: %res8 = @max @lb, %res8_tmp
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
|
|
|
|
|
min.def(0).setPrecise(true);
|
|
|
|
|
writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
2020-10-07 11:09:16 +01:00
|
|
|
END_TEST
|
2020-10-07 11:40:45 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.const_comparison_ordering)
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm
|
2020-10-07 11:40:45 +01:00
|
|
|
if (!setup_cs("v1 v1 v2 v1", GFX9))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* optimize to unordered comparison */
|
|
|
|
|
//! s2: %res0 = v_cmp_nge_f32 4.0, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! s2: %res1 = v_cmp_nge_f32 4.0, %a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), bld.copy(bld.def(v1), Operand(0x40a00000u)), inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* optimize to ordered comparison */
|
|
|
|
|
//! s2: %res3 = v_cmp_lt_f32 4.0, %a
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! s2: %res4 = v_cmp_lt_f32 4.0, %a
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), bld.copy(bld.def(v1), Operand(0x40a00000u)), inputs[0])));
|
|
|
|
|
|
2020-10-07 14:46:34 +01:00
|
|
|
/* similar but unoptimizable expressions */
|
|
|
|
|
//! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a
|
|
|
|
|
//! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a
|
|
|
|
|
//! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0
|
|
|
|
|
//! p_unit_test 6, %res6
|
2020-12-06 10:38:40 +00:00
|
|
|
Temp src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0]);
|
|
|
|
|
Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
|
|
|
|
//! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a
|
|
|
|
|
//! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a
|
|
|
|
|
//! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
|
|
|
|
|
//! p_unit_test 7, %res7
|
2020-12-06 10:38:40 +00:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0]);
|
|
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
|
|
|
|
//! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d
|
|
|
|
|
//! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a
|
|
|
|
|
//! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0
|
|
|
|
|
//! p_unit_test 8, %res8
|
2020-12-06 10:38:40 +00:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[3]);
|
|
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
|
|
|
|
//! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a
|
|
|
|
|
//! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d
|
|
|
|
|
//! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0
|
|
|
|
|
//! p_unit_test 9, %res9
|
2020-12-06 10:38:40 +00:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0]);
|
|
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]);
|
|
|
|
|
writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
|
|
|
|
/* bit sizes */
|
2021-01-15 09:23:04 +01:00
|
|
|
//! s2: %res10 = v_cmp_nge_f16 4.0, %b
|
2020-10-07 14:46:34 +01:00
|
|
|
//! p_unit_test 10, %res10
|
|
|
|
|
Temp input1_16 = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand(0u));
|
|
|
|
|
writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand((uint16_t)0x4400u), input1_16)));
|
|
|
|
|
|
|
|
|
|
//! s2: %res11 = v_cmp_nge_f64 4.0, %c
|
|
|
|
|
//! p_unit_test 11, %res11
|
|
|
|
|
writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]),
|
|
|
|
|
bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand(0x4010000000000000u), inputs[2])));
|
|
|
|
|
|
2020-10-07 11:40:45 +01:00
|
|
|
/* NaN */
|
|
|
|
|
uint16_t nan16 = 0x7e00;
|
|
|
|
|
uint32_t nan32 = 0x7fc00000;
|
2020-10-07 14:46:34 +01:00
|
|
|
uint64_t nan64 = 0xffffffffffffffffllu;
|
2020-10-07 11:40:45 +01:00
|
|
|
|
2020-10-07 14:46:34 +01:00
|
|
|
//! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a
|
|
|
|
|
//! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a
|
|
|
|
|
//! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0
|
|
|
|
|
//! p_unit_test 12, %res12
|
2020-12-06 10:38:40 +00:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand(nan16), inputs[0]);
|
|
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
2020-10-07 14:46:34 +01:00
|
|
|
//! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a
|
|
|
|
|
//! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a
|
|
|
|
|
//! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0
|
|
|
|
|
//! p_unit_test 13, %res13
|
2020-12-06 10:38:40 +00:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(nan32), inputs[0]);
|
|
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 11:40:45 +01:00
|
|
|
|
2020-10-07 14:46:34 +01:00
|
|
|
//! s2: %tmp14_0 = v_cmp_lt_f64 -1, %a
|
|
|
|
|
//! s2: %tmp14_1 = v_cmp_neq_f64 %a, %a
|
|
|
|
|
//! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0
|
|
|
|
|
//! p_unit_test 14, %res14
|
2020-12-06 10:38:40 +00:00
|
|
|
src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand(nan64), inputs[0]);
|
|
|
|
|
src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[0], inputs[0]);
|
|
|
|
|
writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
|
2020-10-07 14:46:34 +01:00
|
|
|
|
2020-10-07 11:40:45 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2020-10-07 11:45:30 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.add3)
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, v1: %c = p_startpgm
|
2020-10-07 11:45:30 +01:00
|
|
|
if (!setup_cs("v1 v1 v1", GFX9))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_add3_u32 %a, %b, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %tmp1 = v_add_u32 %b, %c clamp
|
|
|
|
|
//! v1: %res1 = v_add_u32 %a, %tmp1
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
|
2021-01-21 16:13:34 +00:00
|
|
|
tmp.instr->vop3().clamp = true;
|
2020-10-07 11:45:30 +01:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
|
|
|
|
|
|
|
|
|
|
//! v1: %tmp2 = v_add_u32 %b, %c
|
|
|
|
|
//! v1: %res2 = v_add_u32 %a, %tmp2 clamp
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
|
2021-01-21 16:13:34 +00:00
|
|
|
tmp.instr->vop3().clamp = true;
|
2020-10-07 11:45:30 +01:00
|
|
|
writeout(2, tmp);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|
2020-11-17 17:14:49 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.minmax)
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a = p_startpgm
|
2020-11-17 17:14:49 +01:00
|
|
|
if (!setup_cs("v1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_max3_f32 0, -0, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp xor0 = fneg(inputs[0]);
|
2020-11-17 17:14:49 +01:00
|
|
|
Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0u), xor0);
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
Temp xor1 = fneg(min);
|
2020-11-17 17:14:49 +01:00
|
|
|
writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), xor1));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_max3_f32 0, -0, -%a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0u), Operand(inputs[0]));
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
xor1 = fneg(min);
|
2020-11-17 17:14:49 +01:00
|
|
|
writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), xor1));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-06-05 17:36:29 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_32_24)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX9; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, v1: %c = p_startpgm
|
2020-06-05 17:36:29 +01:00
|
|
|
if (!setup_cs("v1 v1 v1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_mad_u32_u24 %b, %c, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1_tmp = v_mul_u32_u24 %b, %c
|
|
|
|
|
//! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-18 13:07:57 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.add_lshlrev)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
2021-02-04 16:01:44 +01:00
|
|
|
//>> v1: %a, v1: %b, s1: %c = p_startpgm
|
2020-11-18 13:07:57 +01:00
|
|
|
if (!setup_cs("v1 v1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp lshl;
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
|
|
|
|
|
//~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), Operand(inputs[0]));
|
|
|
|
|
writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
|
|
|
|
|
//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
Operand a_24bit = Operand(inputs[0]);
|
|
|
|
|
a_24bit.set24bit(true);
|
|
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), a_24bit);
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
|
|
|
|
|
//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
Operand b_24bit = Operand(inputs[1]);
|
|
|
|
|
b_24bit.set24bit(true);
|
|
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
|
|
|
|
|
writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), a_24bit);
|
|
|
|
|
writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
|
|
|
|
|
//~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
Operand a_16bit = Operand(inputs[0]);
|
|
|
|
|
a_16bit.set16bit(true);
|
|
|
|
|
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(4u), a_16bit);
|
|
|
|
|
writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//~gfx8! v1: %lshl5 = v_lshlrev_b32 4, (is24bit)%c
|
|
|
|
|
//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %c, %lshl5
|
|
|
|
|
//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
Operand c_24bit = Operand(inputs[2]);
|
|
|
|
|
c_24bit.set24bit(true);
|
|
|
|
|
lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(4u), c_24bit);
|
|
|
|
|
writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|