diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp
index a064713fecf..cc177f81e79 100644
--- a/src/amd/compiler/aco_scheduler_ilp.cpp
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp
@@ -283,32 +283,32 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx)
    return true;
 }
 
-unsigned
-get_latency(const Instruction* const instr)
+Instruction_cycle_info
+get_cycle_info_with_mem_latency(const SchedILPContext& ctx, const Instruction* const instr)
 {
-   /* Note, that these are not accurate latency estimations. */
-   if (instr->isVALU() || instr->isVINTRP())
-      return 5;
-   if (instr->isSALU())
-      return 2;
-   /* Based on get_wait_counter_info in aco_statistics.cpp. */
-   if (instr->isVMEM() || instr->isFlatLike())
-      return 320;
-   if (instr->isSMEM()) {
-      if (instr->operands.empty())
-         return 1;
-      if (instr->operands[0].size() == 2 ||
-          (instr->operands[1].isConstant() &&
-           (instr->operands.size() < 3 || instr->operands[2].isConstant())))
-         return 30;
-      return 200;
-   }
-   if (instr->isLDSDIR())
-      return 13;
-   if (instr->isDS())
-      return 20;
+   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
 
-   return 0;
+   /* Override memory latency; estimates based on get_wait_counter_info in aco_statistics.cpp. */
+   if (instr->isVMEM() || instr->isFlatLike()) {
+      cycle_info.latency = 320;
+   } else if (instr->isSMEM()) {
+      if (instr->operands.empty()) {
+         cycle_info.latency = 1;
+      } else if (instr->operands[0].size() == 2 ||
+                 (instr->operands[1].isConstant() &&
+                  (instr->operands.size() < 3 || instr->operands[2].isConstant()))) {
+         /* Likely cached, hence the much lower latency than the 200 used otherwise. */
+         cycle_info.latency = 30;
+      } else {
+         cycle_info.latency = 200;
+      }
+   } else if (instr->isLDSDIR()) {
+      cycle_info.latency = 13;
+   } else if (instr->isDS()) {
+      cycle_info.latency = 20;
+   }
+
+   return cycle_info;
 }
 
 bool
@@ -452,13 +452,18 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
    const mask_t mask = ~BITFIELD_BIT(idx);
    ctx.active_mask &= mask;
 
-   int stall = 1; /* Assume all instructions take one cycle to issue. */
-   if (ctx.nodes[idx].wait_cycles > 0) {
-      /* Add remaining latency stall. */
-      stall += ctx.nodes[idx].wait_cycles;
-   }
-
+   int latency = 0;
+   int stall = 1;
    if (!ctx.is_vopd) {
+      Instruction_cycle_info cycle_info = get_cycle_info_with_mem_latency(ctx, instr);
+      latency = cycle_info.latency;
+      stall = cycle_info.issue_cycles;
+
+      if (ctx.nodes[idx].wait_cycles > 0) {
+         /* Add remaining latency stall. */
+         stall += ctx.nodes[idx].wait_cycles;
+      }
+
       unsigned i;
       BITSET_FOREACH_SET (i, ctx.reg_has_latency, 512) {
          if (ctx.regs[i].latency <= stall) {
@@ -489,8 +494,6 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
       ctx.regs[flat_scr_hi].read_mask &= mask;
    }
 
-   const int latency = get_latency(instr);
-
    for (const Definition& def : instr->definitions) {
       for (unsigned i = 0; i < def.size(); i++) {
          unsigned reg = def.physReg().reg() + i;
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index befc12d8731..9a72e720d34 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -473,9 +473,9 @@ BEGIN_TEST(d3d11_derivs.cube_array)
    //>> v_cubeid_f32 v#rf, v#_, v#_, v#_                                                      ; $_ $_
 
    //>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf                                         ; $_ $_
+   //>> v_mov_b32_e32 v#rlf, v#rlf_tmp                                                        ; $_
    //>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000                                            ; $_ $_
    //>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000                                            ; $_ $_
-   //>> v_mov_b32_e32 v#rlf, v#rlf_tmp                                                        ; $_
    //>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp]                                       ; $_ $_
 
    //>> BB1: