2020-01-22 19:59:56 +00:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2020 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
#include "helpers.h"
|
|
|
|
|
|
|
|
|
|
using namespace aco;
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.neg)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
|
|
|
|
//>> v1: %a, v1: %b, s1: %c, s1: %d, s2: %_:exec = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 v1 s1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_mul_f32 %a, -%b
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
Temp neg_b = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), inputs[1]);
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
|
|
|
|
|
|
|
|
|
|
//! v1: %neg_a = v_xor_b32 0x80000000, %a
|
|
|
|
|
//~gfx[6-9]! v1: %res1 = v_mul_f32 0x123456, %neg_a
|
|
|
|
|
//~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
Temp neg_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), inputs[0]);
|
|
|
|
|
writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x123456u), neg_a));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_mul_f32 %a, %b
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
Temp neg_neg_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), neg_a);
|
|
|
|
|
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
/* we could optimize this case into just an abs(), but NIR already does this */
|
|
|
|
|
//! v1: %res3 = v_mul_f32 |%neg_a|, %b
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
Temp abs_neg_a = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), neg_a);
|
|
|
|
|
writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_mul_f32 -|%a|, %b
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
Temp abs_a = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), inputs[0]);
|
|
|
|
|
Temp neg_abs_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), abs_a);
|
|
|
|
|
writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res6 = v_subrev_f32 %a, %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//! v1: %res7 = v_sub_f32 %b, %a
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
|
|
|
|
|
|
|
|
|
|
//! v1: %res8 = v_mul_f32 %a, -%c
|
|
|
|
|
//! p_unit_test 8, %res8
|
|
|
|
|
Temp neg_c = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), bld.copy(bld.def(v1), inputs[2]));
|
|
|
|
|
writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
|
|
|
|
|
/* Emits a v_subbrev_co_u32 through the global builder (vop2_e64 form) with
 * the three given source operands. The instruction gets a fresh v1
 * definition plus a lane-mask definition hinted to vcc; the builder result
 * is returned converted to a Temp. */
Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)
{
   Temp result = bld.vop2_e64(aco_opcode::v_subbrev_co_u32,
                              bld.def(v1),
                              bld.hint_vcc(bld.def(bld.lm)),
                              op0, op1, op2);
   return result;
}
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.cndmask)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
|
|
|
|
//>> v1: %a, s1: %b, s2: %c, s2: %_:exec = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 s1 s2", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp subbrev;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_cndmask_b32 0, %a, %c
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_cndmask_b32 0, 42, %c
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
|
|
|
|
|
writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(42u), subbrev));
|
|
|
|
|
|
|
|
|
|
//~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
|
|
|
|
|
//~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
|
|
|
|
|
//~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
|
|
|
|
|
writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));
|
|
|
|
|
|
|
|
|
|
//! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
|
|
|
|
|
//! v1: %xor = v_xor_b32 %a, %subbrev1
|
|
|
|
|
//! v1: %res3 = v_cndmask_b32 0, %xor, %c
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
|
|
|
|
|
Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
|
|
|
|
|
writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));
|
|
|
|
|
|
aco: fix combining add/sub to b2i if a new dest needs to be allocated
The uses vector needs to be expanded to avoid out of bounds access
and to make sure the number of uses is initialized to 0.
This fixes combining more v_and(a, v_subbrev_co_u32).
fossilds-db (Vega10):
Totals from 4574 (3.28% of 139517) affected shaders:
SGPRs: 291625 -> 292217 (+0.20%); split: -0.01%, +0.21%
VGPRs: 276368 -> 276188 (-0.07%); split: -0.07%, +0.01%
SpillSGPRs: 455 -> 533 (+17.14%)
SpillVGPRs: 76 -> 78 (+2.63%)
CodeSize: 23327500 -> 23304152 (-0.10%); split: -0.17%, +0.07%
MaxWaves: 22044 -> 22066 (+0.10%)
Instrs: 4583064 -> 4576301 (-0.15%); split: -0.15%, +0.01%
Cycles: 47925276 -> 47871968 (-0.11%); split: -0.13%, +0.01%
VMEM: 1599363 -> 1597473 (-0.12%); split: +0.08%, -0.19%
SMEM: 331461 -> 331126 (-0.10%); split: +0.08%, -0.18%
VClause: 80639 -> 80696 (+0.07%); split: -0.02%, +0.09%
SClause: 155992 -> 155993 (+0.00%); split: -0.02%, +0.02%
Copies: 333482 -> 333318 (-0.05%); split: -0.12%, +0.07%
Branches: 70967 -> 70968 (+0.00%)
PreSGPRs: 187078 -> 187711 (+0.34%); split: -0.01%, +0.35%
PreVGPRs: 244918 -> 244785 (-0.05%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7513>
2020-11-09 19:42:22 +01:00
|
|
|
//! v1: %res4 = v_cndmask_b32 0, %a, %c
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), Operand(inputs[2]));
|
|
|
|
|
Temp sub = bld.vsub32(bld.def(v1), Operand(0u), cndmask);
|
|
|
|
|
writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));
|
|
|
|
|
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-10 10:24:36 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.add_lshl)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
|
|
|
|
//>> s1: %a, v1: %b, s2: %_:exec = p_startpgm
|
|
|
|
|
if (!setup_cs("s1 v1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp shift;
|
|
|
|
|
|
|
|
|
|
//! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
|
|
|
|
|
Operand(inputs[0]), Operand(3u));
|
|
|
|
|
writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand(4u)));
|
|
|
|
|
|
aco: combine more s_add+s_lshl to s_lshl<n>_add by ignoring uses
Even if the s_lshl is used more that once, it can still be combined.
fossils-db (Vega10):
Totals from 771 (0.55% of 139517) affected shaders:
SGPRs: 46216 -> 46304 (+0.19%); split: -0.02%, +0.21%
VGPRs: 38488 -> 38464 (-0.06%)
SpillSGPRs: 1894 -> 1875 (-1.00%); split: -3.12%, +2.11%
CodeSize: 5681856 -> 5679844 (-0.04%); split: -0.07%, +0.03%
MaxWaves: 5320 -> 5323 (+0.06%)
Instrs: 1093960 -> 1093474 (-0.04%); split: -0.09%, +0.05%
Cycles: 47198380 -> 47258872 (+0.13%); split: -0.06%, +0.19%
VMEM: 176036 -> 176283 (+0.14%); split: +0.16%, -0.02%
SMEM: 53397 -> 53255 (-0.27%); split: +0.03%, -0.30%
VClause: 23156 -> 23152 (-0.02%); split: -0.03%, +0.01%
SClause: 35716 -> 35726 (+0.03%); split: -0.00%, +0.03%
Copies: 139395 -> 139871 (+0.34%); split: -0.04%, +0.39%
Branches: 33808 -> 33798 (-0.03%); split: -0.04%, +0.01%
PreSGPRs: 35381 -> 35331 (-0.14%); split: -0.20%, +0.06%
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7539>
2020-11-10 11:20:18 +01:00
|
|
|
//! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
|
|
|
|
|
//! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
|
|
|
|
|
//! v1: %res1 = v_add_u32 %lshl1, %lshl_add
|
2020-11-10 10:24:36 +01:00
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
|
|
|
|
|
Operand(inputs[0]), Operand(3u));
|
|
|
|
|
Temp sadd = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand(4u));
|
|
|
|
|
Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-02 16:44:04 +01:00
|
|
|
|
|
|
|
|
/* Emits a v_mad_u32_u16 (VOP3) with sources a, b and c into a fresh v1
 * definition and returns the builder result converted to a Temp.
 *
 * is16bit is forwarded to set16bit() on the local copies of a and b before
 * the instruction is built; c is passed through untouched. */
Temp create_mad_u32_u16(Operand a, Operand b, Operand c, bool is16bit = true)
{
   /* the operands are taken by value, so only the copies are flagged */
   b.set16bit(is16bit);
   a.set16bit(is16bit);

   Temp mad = bld.vop3(aco_opcode::v_mad_u32_u16, bld.def(v1), a, b, c);
   return mad;
}
|
|
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.mad_u32_u16)
|
|
|
|
|
for (unsigned i = GFX9; i <= GFX10; i++) {
|
|
|
|
|
//>> v1: %a, v1: %b, s1: %c, s2: %_:exec = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 v1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_mul_u32_u24 (is16bit)%a, (is16bit)%b
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, create_mad_u32_u16(Operand(inputs[0]), Operand(inputs[1]), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_mul_u32_u24 42, (is16bit)%a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_mul_u32_u24 42, (is16bit)%a
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
writeout(2, create_mad_u32_u16(Operand(inputs[0]), Operand(42u), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res3 = v_mul_u32_u24 (is16bit)%c, (is16bit)%a
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
writeout(3, create_mad_u32_u16(Operand(inputs[2]), Operand(inputs[0]), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res4 = v_mad_u32_u16 42, (is16bit)%c, 0
|
|
|
|
|
//! p_unit_test 4, %res4
|
|
|
|
|
writeout(4, create_mad_u32_u16(Operand(42u), Operand(inputs[2]), Operand(0u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %res5 = v_mad_u32_u16 42, %a, 0
|
|
|
|
|
//! p_unit_test 5, %res5
|
|
|
|
|
writeout(5, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u), false));
|
|
|
|
|
|
2020-11-02 15:34:25 +01:00
|
|
|
//~gfx9! v1: %mul6 = v_mul_lo_u16 %a, %b
|
|
|
|
|
//~gfx9! v1: %res6 = v_add_u32 %mul6, %b
|
|
|
|
|
//~gfx10! v1: %mul6 = v_mul_lo_u16_e64 %a, %b
|
|
|
|
|
//~gfx10! v1: %res6 = v_add_u32 %mul6, %b
|
|
|
|
|
//! p_unit_test 6, %res6
|
|
|
|
|
Temp mul;
|
|
|
|
|
if (i >= GFX10) {
|
|
|
|
|
mul = bld.vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
} else {
|
|
|
|
|
mul = bld.vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
}
|
|
|
|
|
writeout(6, bld.vadd32(bld.def(v1), mul, inputs[1]));
|
|
|
|
|
|
|
|
|
|
//~gfx9! v1: %res7 = v_mad_u32_u16 %a, %b, %b
|
|
|
|
|
//~gfx10! v1: (nuw)%mul7 = v_mul_lo_u16_e64 %a, %b
|
|
|
|
|
//~gfx10! v1: %res7 = v_add_u32 %mul7, %b
|
|
|
|
|
//! p_unit_test 7, %res7
|
|
|
|
|
if (i >= GFX10) {
|
|
|
|
|
mul = bld.nuw().vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
} else {
|
|
|
|
|
mul = bld.nuw().vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
|
|
|
|
|
}
|
|
|
|
|
writeout(7, bld.vadd32(bld.def(v1), mul, inputs[1]));
|
|
|
|
|
|
2020-11-02 16:44:04 +01:00
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-11-11 18:42:35 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.bcnt)
|
|
|
|
|
for (unsigned i = GFX8; i <= GFX10; i++) {
|
|
|
|
|
//>> v1: %a, s1: %b, s2: %_:exec = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 s1", (chip_class)i))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Temp bcnt;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_bcnt_u32_b32 %a, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
|
|
|
|
|
writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_bcnt_u32_b32 %a, %b
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
|
|
|
|
|
writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_bcnt_u32_b32 %a, 42
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
|
|
|
|
|
writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand(42u)));
|
|
|
|
|
|
|
|
|
|
//! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
|
|
|
|
|
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
|
|
|
|
|
//~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand(0u));
|
|
|
|
|
writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %bnct4 = v_bcnt_u32_b32 %a, 0
|
|
|
|
|
//~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
|
|
|
|
|
//~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
|
|
|
|
|
//! p_unit_test 4, %carry
|
|
|
|
|
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
|
|
|
|
|
Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
|
|
|
|
|
writeout(4, carry);
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
}
|
|
|
|
|
END_TEST
|
2020-10-07 11:09:16 +01:00
|
|
|
|
|
|
|
|
BEGIN_TEST(optimize.clamp)
|
|
|
|
|
//>> v1: %a, v1: %b, v1: %c, s2: %_:exec = p_startpgm
|
|
|
|
|
if (!setup_cs("v1 v1 v1", GFX9))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
//! v1: %res0 = v_med3_f32 4.0, 0, %a
|
|
|
|
|
//! p_unit_test 0, %res0
|
|
|
|
|
writeout(0, bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u),
|
|
|
|
|
bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), inputs[0])));
|
|
|
|
|
|
|
|
|
|
//! v1: %res1 = v_med3_f32 0, 4.0, %a
|
|
|
|
|
//! p_unit_test 1, %res1
|
|
|
|
|
writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u),
|
|
|
|
|
bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u), inputs[0])));
|
|
|
|
|
|
|
|
|
|
/* correct NaN behaviour with precise */
|
|
|
|
|
|
|
|
|
|
//! v1: %res2 = v_med3_f32 4.0, 0, %a
|
|
|
|
|
//! p_unit_test 2, %res2
|
|
|
|
|
Builder::Result max = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), inputs[0]);
|
|
|
|
|
max.def(0).setPrecise(true);
|
|
|
|
|
Builder::Result min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u), max);
|
|
|
|
|
max.def(0).setPrecise(true);
|
|
|
|
|
writeout(2, min);
|
|
|
|
|
|
|
|
|
|
//! v1: (precise)%res3_tmp = v_min_f32 4.0, %a
|
|
|
|
|
//! v1: %res3 = v_max_f32 0, %res3_tmp
|
|
|
|
|
//! p_unit_test 3, %res3
|
|
|
|
|
min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u), inputs[0]);
|
|
|
|
|
min.def(0).setPrecise(true);
|
|
|
|
|
writeout(3, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), min));
|
|
|
|
|
|
|
|
|
|
finish_opt_test();
|
|
|
|
|
END_TEST
|