diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c index e1ee4211185..ae998521829 100644 --- a/src/intel/compiler/jay/jay_register_allocate.c +++ b/src/intel/compiler/jay/jay_register_allocate.c @@ -595,18 +595,28 @@ jay_emit_parallel_copies(jay_builder *b, jay_def dst = def_from_reg(copy->dst), src = def_from_reg(copy->src); assert(dst.file == src.file); enum jay_file file = dst.file; - struct jay_temp_regs t = { .gpr = temps.gpr2, .ugpr = temps.ugpr2 }; - jay_def temp_backing = jay_null(); - jay_def temp = - push_temp(b, temps, file == GPR || file == MEM ? GPR : UGPR, - file == MEM /* stride4 */, true /* outer */, - &temp_backing, dst, src); - { + + if (file == GPR && + jay_def_stride(b->shader, dst) == JAY_STRIDE_4 && + jay_def_stride(b->shader, src) == JAY_STRIDE_4) { + + /* If everything is stride=4, swapping is easy */ + jay_def acc = jay_bare_reg(ACCUM, 2); + jay_MOV(b, acc, dst)->type = JAY_TYPE_F32; + jay_MOV(b, dst, src)->type = JAY_TYPE_F32; + jay_MOV(b, src, acc)->type = JAY_TYPE_F32; + } else { + struct jay_temp_regs t = { .gpr = temps.gpr2, .ugpr = temps.ugpr2 }; + jay_def temp_backing = jay_null(); + jay_def temp = + push_temp(b, temps, file == GPR || file == MEM ? GPR : UGPR, + file == MEM /* stride4 */, true /* outer */, + &temp_backing, dst, src); mov(b, temp, dst, t); mov(b, dst, src, t); mov(b, src, temp, t); + pop_temp(b, temp, temp_backing); } - pop_temp(b, temp, temp_backing); for (unsigned j = 0; j < num_copies; j++) { if (pcopies[j].src == copy->dst)