diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 38a68a9d2a2..71f74443c7e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -126,7 +126,7 @@ static const struct debug_named_value test_options[] = {
    {"computeblit", DBG(TEST_COMPUTE_BLIT), "Invoke blits tests and exit."},
    {"testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit."},
    {"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."},
-   {"testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"},
+   {"dmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"},
    {"testmemperf", DBG(TEST_MEM_PERF), "Test map + memcpy perf using the winsys."},
    {"blitperf", DBG(TEST_BLIT_PERF), "Test gfx and compute clear/copy/blit/resolve performance"},
 
diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
index 046e1b1e6b0..5f84a4e647e 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -11,426 +11,240 @@
 #define MIN_SIZE   512
 #define MAX_SIZE   (128 * 1024 * 1024)
 #define SIZE_SHIFT 1
-#define NUM_RUNS   128
+#define WARMUP_RUNS 16 /* iterations executed before begin_query, i.e. not timed */
+#define NUM_RUNS   32 /* timed iterations; the elapsed time is divided by this */
 
-static double get_MBps_rate(unsigned num_bytes, unsigned ns)
-{
-   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
-}
+enum { /* Benchmark flavors walked by the outer loop of si_test_dma_perf; index into test_strings. */
+   TEST_FILL_VRAM, /* clear; dst is created with PIPE_USAGE_DEFAULT */
+   TEST_FILL_VRAM_12B, /* clear with clear_value_size = 12; dst is PIPE_USAGE_DEFAULT */
+   TEST_FILL_GTT, /* clear; dst is created with PIPE_USAGE_STREAM */
+   TEST_FILL_GTT_12B, /* clear with clear_value_size = 12; dst is PIPE_USAGE_STREAM */
+   TEST_COPY_VRAM_VRAM, /* first copy flavor: is_copy is "test_flavor >= TEST_COPY_VRAM_VRAM" */
+   TEST_COPY_VRAM_GTT, /* copy; dst is created with PIPE_USAGE_STREAM */
+   TEST_COPY_GTT_VRAM, /* copy; src is created with PIPE_USAGE_STREAM */
+   NUM_TESTS, /* number of flavors; used as the loop bound */
+};
+
+static const char *test_strings[] = { /* Labels printed in the "Test" column, indexed by test flavor. */
+   [TEST_FILL_VRAM] = "fill->VRAM",
+   [TEST_FILL_VRAM_12B] = "fill->VRAM 12B",
+   [TEST_FILL_GTT] = "fill->GTT",
+   [TEST_FILL_GTT_12B] = "fill->GTT 12B",
+   [TEST_COPY_VRAM_VRAM] = "VRAM->VRAM",
+   [TEST_COPY_VRAM_GTT] = "VRAM->GTT",
+   [TEST_COPY_GTT_VRAM] = "GTT->VRAM",
+};
+
+enum { /* Clear/copy implementations benchmarked for each test flavor; index into method_strings. */
+   METHOD_DEFAULT, /* si_copy_buffer for copies, the context's clear_buffer hook for fills */
+   METHOD_CP_DMA, /* si_cp_dma_copy_buffer / si_cp_dma_clear_buffer; not run for the 12B fills */
+   METHOD_COMPUTE_2DW, /* si_compute_clear_copy_buffer with dwords_per_thread = 2; not run for the 12B fills */
+   METHOD_COMPUTE_3DW, /* si_compute_clear_copy_buffer with dwords_per_thread = 3; only run for the 12B fills */
+   METHOD_COMPUTE_4DW, /* si_compute_clear_copy_buffer with dwords_per_thread = 4 */
+   NUM_METHODS, /* number of methods; used as the loop bound */
+};
+
+static const char *method_strings[] = { /* Labels printed in the "Method" column, indexed by method. */
+   [METHOD_DEFAULT] = "Default",
+   [METHOD_CP_DMA] = "CP DMA",
+   [METHOD_COMPUTE_2DW] = "CS 2dw",
+   [METHOD_COMPUTE_3DW] = "CS 3dw",
+   [METHOD_COMPUTE_4DW] = "CS 4dw",
+};
+
+enum { /* Source/destination alignment cases; each value indexes the align_info table. */
+   ALIGN_MAX, /* zero offset for both buffers */
+   ALIGN_256,
+   ALIGN_128,
+   ALIGN_64,
+   ALIGN_4,
+   ALIGN_2,
+   ALIGN_1,
+   ALIGN_SRC4_DST2, /* from here on src and dst offsets differ; fill tests skip these */
+   ALIGN_SRC4_DST1,
+   ALIGN_SRC2_DST4,
+   ALIGN_SRC2_DST1,
+   ALIGN_SRC1_DST4,
+   ALIGN_SRC1_DST2,
+   NUM_ALIGNMENTS, /* number of alignment cases; used as the loop bound */
+};
+
+struct align_info_t { /* Description of one alignment case. */
+   const char *string; /* label printed in the "Alignment" column */
+   unsigned src_offset; /* byte offset applied to the source; the benchmark adds 4 to nonzero values below 4 */
+   unsigned dst_offset; /* byte offset applied to the destination; same adjustment as src_offset */
+};
+
+static const struct align_info_t align_info[] = { /* {label, src_offset, dst_offset} per alignment case */
+   [ALIGN_MAX] = {"both=max", 0, 0}, /* no offset: buffers are created with an alignment of 256 */
+   [ALIGN_256] = {"both=256", 256, 256},
+   [ALIGN_128] = {"both=128", 128, 128},
+   [ALIGN_64] = {"both=64", 64, 64},
+   [ALIGN_4] = {"both=4", 4, 4},
+   [ALIGN_2] = {"both=2", 2, 2}, /* used as offset 6 after the "+4 if below 4" adjustment */
+   [ALIGN_1] = {"both=1", 1, 1}, /* used as offset 5 after the same adjustment */
+   [ALIGN_SRC4_DST2] = {"src=4 dst=2", 4, 2}, /* mixed cases below only run for copy tests */
+   [ALIGN_SRC4_DST1] = {"src=4 dst=1", 4, 1},
+   [ALIGN_SRC2_DST4] = {"src=2 dst=4", 2, 4},
+   [ALIGN_SRC2_DST1] = {"src=2 dst=1", 2, 1},
+   [ALIGN_SRC1_DST4] = {"src=1 dst=4", 1, 4},
+   [ALIGN_SRC1_DST2] = {"src=1 dst=2", 1, 2},
+};
 
 void si_test_dma_perf(struct si_screen *sscreen)
 {
    struct pipe_screen *screen = &sscreen->b;
    struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
    struct si_context *sctx = (struct si_context *)ctx;
-   const uint32_t clear_value = 0x12345678;
-   static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
-   /* The list of per-SA wave limits to test. */
-   static const unsigned cs_waves_per_sh_list[] = {0, 8};
 
-#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
-#define NUM_METHODS (3 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
+   sscreen->ws->cs_set_pstate(&sctx->gfx_cs, RADEON_CTX_PSTATE_PEAK);
 
-   static const char *method_str[] = {
-      "CP MC   ",
-      "CP L2   ",
-      "CP L2   ",
-   };
-   static const char *placement_str[] = {
-      /* Clear */
-      "fill->VRAM",
-      "fill->GTT ",
-      /* Copy */
-      "VRAM->VRAM",
-      "VRAM->GTT ",
-      "GTT ->VRAM",
-   };
-
-   printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
-   printf("Heap       ,Method  ,L2p,Wa,");
+   printf("Test          , Method , Alignment  ,");
    for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-      if (size >= 1024)
+      if (size >= 1024 * 1024)
+         printf("%6uMB,", size / (1024 * 1024));
+      else if (size >= 1024)
          printf("%6uKB,", size / 1024);
       else
          printf(" %6uB,", size);
    }
    printf("\n");
 
-   /* results[log2(size)][placement][method][] */
-   struct si_result {
-      bool is_valid;
-      bool is_cp;
-      bool is_cs;
-      unsigned cache_policy;
-      unsigned dwords_per_thread;
-      unsigned waves_per_sh;
-      unsigned score;
-      unsigned index; /* index in results[x][y][index] */
-   } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
-
    /* Run benchmarks. */
-   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
-      bool is_copy = placement >= 2;
+   for (unsigned test_flavor = 0; test_flavor < NUM_TESTS; test_flavor++) {
+      bool is_copy = test_flavor >= TEST_COPY_VRAM_VRAM;
 
-      printf("-----------,--------,---,--,");
-      for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
-         printf("--------,");
-      printf("\n");
+      if (test_flavor)
+         puts("");
 
       for (unsigned method = 0; method < NUM_METHODS; method++) {
-         bool test_cp = method <= 2;
-         bool test_cs = method >= 3;
-         unsigned cs_method = method - 3;
-         unsigned cs_waves_per_sh =
-            test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
-         cs_method %= 3 * NUM_SHADERS;
-         unsigned cache_policy =
-            test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
-         unsigned cs_dwords_per_thread =
-            test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
+         for (unsigned align = 0; align < NUM_ALIGNMENTS; align++) {
+            unsigned dwords_per_thread, clear_value_size;
+            unsigned src_offset = align_info[align].src_offset;
+            unsigned dst_offset = align_info[align].dst_offset;
 
-         if (sctx->gfx_level == GFX6) {
-            /* GFX6 doesn't support CP DMA operations through L2. */
-            if (test_cp && cache_policy != L2_BYPASS)
+            /* offset > 0 && offset < 4 is the only case when the compute shader performs the same
+             * as offset=0 without any alignment optimizations, so shift the offset by 4 to get
+             * unaligned performance.
+             */
+            if (src_offset && src_offset < 4)
+               src_offset += 4;
+            if (dst_offset && dst_offset < 4)
+               dst_offset += 4;
+
+            if (!is_copy && dst_offset != src_offset)
                continue;
-            /* WAVES_PER_SH is in multiples of 16 on GFX6. */
-            if (test_cs && cs_waves_per_sh % 16 != 0)
-               continue;
-         }
 
-         /* SI_RESOURCE_FLAG_GL2_BYPASS setting RADEON_FLAG_GL2_BYPASS doesn't affect
-          * chips before gfx9.
-          */
-         if (test_cs && cache_policy && sctx->gfx_level < GFX9)
-            continue;
+            if (test_flavor == TEST_FILL_VRAM_12B || test_flavor == TEST_FILL_GTT_12B) {
+               if ((method != METHOD_DEFAULT && method != METHOD_COMPUTE_3DW &&
+                    method != METHOD_COMPUTE_4DW) || dst_offset % 4)
+                  continue;
 
-         printf("%s ,", placement_str[placement]);
-         if (test_cs) {
-            printf("CS x%-4u,%3s,", cs_dwords_per_thread,
-                   cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
-         } else {
-            printf("%s,%3s,", method_str[method],
-                   method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
-         }
-         if (test_cs && cs_waves_per_sh)
-            printf("%2u,", cs_waves_per_sh);
-         else
-            printf("  ,");
+               dwords_per_thread = method == METHOD_COMPUTE_3DW ? 3 : 4;
+               clear_value_size = 12;
+            } else {
+               if (method == METHOD_COMPUTE_3DW)
+                  continue;
 
-         void *compute_shader = NULL;
-         if (test_cs) {
-            union si_cs_clear_copy_buffer_key key;
-            key.key = 0;
-
-            key.is_clear = !is_copy;
-            key.dwords_per_thread = cs_dwords_per_thread;
-
-            compute_shader = si_create_dma_compute_shader(sctx, &key);
-         }
-
-         double score = 0;
-         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-            /* Don't test bigger sizes if it's too slow. Print 0. */
-            if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
-               printf("%7.0f ,", 0.0);
-               continue;
+               dwords_per_thread = method == METHOD_COMPUTE_2DW ? 2 : 4;
+               clear_value_size = dst_offset % 4 ? 1 : 4;
             }
 
-            enum pipe_resource_usage dst_usage, src_usage;
-            struct pipe_resource *dst, *src;
-            unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
-            unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_GL2_BYPASS : 0;
+            printf("%-14s, %-7s, %-11s,", test_strings[test_flavor], method_strings[method],
+                   align_info[align].string);
 
-            if (placement == 0 || placement == 2 || placement == 4)
-               dst_usage = PIPE_USAGE_DEFAULT;
-            else
-               dst_usage = PIPE_USAGE_STREAM;
+            for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+               struct pipe_resource *dst, *src;
+               enum pipe_resource_usage dst_usage = PIPE_USAGE_DEFAULT;
+               enum pipe_resource_usage src_usage = PIPE_USAGE_DEFAULT;
 
-            if (placement == 2 || placement == 3)
-               src_usage = PIPE_USAGE_DEFAULT;
-            else
-               src_usage = PIPE_USAGE_STREAM;
+               if (test_flavor == TEST_FILL_GTT || test_flavor == TEST_FILL_GTT_12B ||
+                   test_flavor == TEST_COPY_VRAM_GTT)
+                  dst_usage = PIPE_USAGE_STREAM;
 
-            dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
-            src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;
+               if (test_flavor == TEST_COPY_GTT_VRAM)
+                  src_usage = PIPE_USAGE_STREAM;
 
-            /* Wait for idle before testing, so that other processes don't mess up the results. */
-            sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                           SI_CONTEXT_FLUSH_AND_INV_CB |
-                           SI_CONTEXT_FLUSH_AND_INV_DB;
-            si_emit_cache_flush_direct(sctx);
+               /* Don't test large sizes with GTT because it's slow. */
+               if ((dst_usage == PIPE_USAGE_STREAM || src_usage == PIPE_USAGE_STREAM) &&
+                   size > 32 * 1024 * 1024) {
+                  printf("%8s,", "n/a");
+                  continue;
+               }
 
-            struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
-            ctx->begin_query(ctx, q);
+               dst = pipe_aligned_buffer_create(screen, 0, dst_usage, dst_offset + size, 256);
+               src = is_copy ? pipe_aligned_buffer_create(screen, 0, src_usage, src_offset + size, 256) : NULL;
 
-            /* Run tests. */
-            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
-               if (test_cp) {
-                  /* CP DMA */
-                  if (is_copy) {
-                     si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, SI_OP_SYNC_BEFORE_AFTER,
-                                           SI_COHERENCY_NONE, cache_policy);
+               struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0);
+               bool success = true;
+
+               /* Run tests. */
+               for (unsigned iter = 0; iter < WARMUP_RUNS + NUM_RUNS; iter++) {
+                  const uint32_t clear_value[4] = {0x12345678, 0x23456789, 0x34567890, 0x45678901};
+
+                  if (iter == WARMUP_RUNS)
+                     ctx->begin_query(ctx, q);
+
+                  if (method == METHOD_DEFAULT) {
+                     if (is_copy) {
+                        si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
+                                       SI_OP_SYNC_BEFORE_AFTER);
+                     } else {
+                        sctx->b.clear_buffer(&sctx->b, dst, dst_offset, size, &clear_value,
+                                             clear_value_size);
+                     }
+                  } else if (method == METHOD_CP_DMA) {
+                     /* CP DMA */
+                     if (is_copy) {
+                        si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
+                                              SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_SHADER, L2_LRU);
+                     } else {
+                        /* CP DMA clears must be aligned to 4 bytes. */
+                        if (dst_offset % 4 || size % 4) {
+                           success = false;
+                           continue;
+                        }
+                        assert(clear_value_size == 4);
+                        si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, dst_offset, size,
+                                               clear_value[0], SI_OP_SYNC_BEFORE_AFTER,
+                                               SI_COHERENCY_SHADER, L2_LRU);
+                     }
                   } else {
-                     si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, size, clear_value,
-                                            SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE,
-                                            cache_policy);
-                  }
-               } else {
-                  /* Compute */
-                  /* The memory accesses are coalesced, meaning that the 1st instruction writes
-                   * the 1st contiguous block of data for the whole wave, the 2nd instruction
-                   * writes the 2nd contiguous block of data, etc.
-                   */
-                  unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
-                  unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
-                  unsigned dwords_per_wave = cs_dwords_per_thread * 64;
-
-                  unsigned num_dwords = size / 4;
-                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
-
-                  struct pipe_grid_info info = {};
-                  info.block[0] = MIN2(64, num_instructions);
-                  info.block[1] = 1;
-                  info.block[2] = 1;
-                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
-                  info.grid[1] = 1;
-                  info.grid[2] = 1;
-
-                  struct pipe_shader_buffer sb[2] = {};
-                  sb[is_copy].buffer = dst;
-                  sb[is_copy].buffer_size = size;
-
-                  if (is_copy) {
-                     sb[0].buffer = src;
-                     sb[0].buffer_size = size;
-                  } else {
-                     for (unsigned i = 0; i < 4; i++)
-                        sctx->cs_user_data[i] = clear_value;
+                     /* Compute */
+                     success &=
+                        si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset,
+                                                     size, clear_value, clear_value_size,
+                                                     SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_SHADER,
+                                                     dwords_per_thread, false);
                   }
 
-                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
-                  ctx->bind_compute_state(ctx, compute_shader);
-                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;
-
-                  ctx->launch_grid(ctx, &info);
-
-                  ctx->bind_compute_state(ctx, NULL);
-                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */
+                  sctx->flags |= SI_CONTEXT_INV_L2;
                }
 
-               /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
-               sctx->flags |= SI_CONTEXT_INV_VCACHE |
-                              (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
-                              SI_CONTEXT_CS_PARTIAL_FLUSH;
-               si_emit_cache_flush_direct(sctx);
-            }
+               ctx->end_query(ctx, q);
 
-            ctx->end_query(ctx, q);
-            ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+               pipe_resource_reference(&dst, NULL);
+               pipe_resource_reference(&src, NULL);
 
-            pipe_resource_reference(&dst, NULL);
-            pipe_resource_reference(&src, NULL);
+               /* Get results. */
+               union pipe_query_result result;
 
-            /* Get results. */
+               ctx->get_query_result(ctx, q, true, &result);
+               ctx->destroy_query(ctx, q);
 
-            union pipe_query_result result;
-
-            ctx->get_query_result(ctx, q, true, &result);
-            ctx->destroy_query(ctx, q);
-
-            score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS);
-            printf("%7.0f ,", score);
-            fflush(stdout);
-
-            struct si_result *r = &results[util_logbase2(size)][placement][method];
-            r->is_valid = true;
-            r->is_cp = test_cp;
-            r->is_cs = test_cs;
-            r->cache_policy = cache_policy;
-            r->dwords_per_thread = cs_dwords_per_thread;
-            r->waves_per_sh = cs_waves_per_sh;
-            r->score = score;
-            r->index = method;
-         }
-         puts("");
-
-         if (compute_shader)
-            ctx->delete_compute_state(ctx, compute_shader);
-      }
-   }
-
-   puts("");
-   puts("static struct si_method");
-   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
-          "cached)\n",
-          sctx->screen->info.name);
-   puts("{");
-   puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
-
-   /* Analyze results and find the best methods. */
-   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
-      if (placement == 0)
-         puts("   if (dst == RADEON_DOMAIN_VRAM) {");
-      else if (placement == 1)
-         puts("   } else { /* GTT */");
-      else if (placement == 2) {
-         puts("}");
-         puts("");
-         puts("static struct si_method");
-         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
-                sctx->screen->info.name);
-         printf("                     uint64_t size64, bool async, bool cached)\n");
-         puts("{");
-         puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
-         puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
-      } else if (placement == 3)
-         puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
-      else
-         puts("   } else { /* GTT -> VRAM */");
-
-      for (unsigned mode = 0; mode < 3; mode++) {
-         bool async = mode == 0;
-         bool cached = mode == 1;
-
-         if (async)
-            puts("      if (async) { /* async compute */");
-         else if (cached)
-            puts("      if (cached) { /* gfx ring */");
-         else
-            puts("      } else { /* gfx ring - uncached */");
-
-         /* The list of best chosen methods. */
-         struct si_result *methods[32];
-         unsigned method_max_size[32];
-         unsigned num_methods = 0;
-
-         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-            /* Find the best method. */
-            struct si_result *best = NULL;
-
-            for (unsigned i = 0; i < NUM_METHODS; i++) {
-               struct si_result *r = &results[util_logbase2(size)][placement][i];
-
-               if (!r->is_valid)
-                  continue;
-
-               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
-                * on GTT, which we can get due to BO evictions.
-                */
-               if (sctx->gfx_level <= GFX8 && placement == 1 && r->is_cp &&
-                   r->cache_policy == L2_BYPASS)
-                  continue;
-
-               if (async) {
-                  /* The following constraints for compute IBs try to limit
-                   * resource usage so as not to decrease the performance
-                   * of gfx IBs too much.
-                   */
-
-                  /* Don't use CP DMA on asynchronous rings, because
-                   * the engine is shared with gfx IBs.
-                   */
-                  if (r->is_cp)
-                     continue;
-
-                  /* Don't use L2 caching on asynchronous rings to minimize
-                   * L2 usage.
-                   */
-                  if (r->cache_policy == L2_LRU)
-                     continue;
-
-                  /* Asynchronous compute recommends waves_per_sh != 0
-                   * to limit CU usage. */
-                  if (r->is_cs && r->waves_per_sh == 0)
-                     continue;
+               if (success) {
+                  double GB = 1024.0 * 1024.0 * 1024.0;
+                  double seconds = result.u64 / (double)NUM_RUNS / (1000.0 * 1000.0 * 1000.0);
+                  double GBps = (size / GB) / seconds * (test_flavor == TEST_COPY_VRAM_VRAM ? 2 : 1);
+                  printf("%8.2f,", GBps);
                } else {
-                  if (cached && r->cache_policy == L2_BYPASS)
-                     continue;
-                  if (!cached && r->cache_policy == L2_LRU)
-                     continue;
-               }
-
-               if (!best) {
-                  best = r;
-                  continue;
-               }
-
-               /* Assume some measurement error. Earlier methods occupy fewer
-                * resources, so the next method is always more greedy, and we
-                * don't want to select it due to a measurement error.
-                */
-               double min_improvement = 1.03;
-
-               if (best->score * min_improvement < r->score)
-                  best = r;
-            }
-
-            if (num_methods > 0) {
-               unsigned prev_index = num_methods - 1;
-               struct si_result *prev = methods[prev_index];
-               struct si_result *prev_this_size =
-                  &results[util_logbase2(size)][placement][prev->index];
-
-               /* If the best one is also the best for the previous size,
-                * just bump the size for the previous one.
-                *
-                * If there is no best, it means all methods were too slow
-                * for this size and were not tested. Use the best one for
-                * the previous size.
-                */
-               if (!best ||
-                   /* If it's the same method as for the previous size: */
-                   (prev->is_cp == best->is_cp &&
-                    prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
-                    prev->dwords_per_thread == best->dwords_per_thread &&
-                    prev->waves_per_sh == best->waves_per_sh) ||
-                   /* If the method for the previous size is also the best
-                    * for this size: */
-                   (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
-                  method_max_size[prev_index] = size;
-                  continue;
+                  printf("%8s,", "n/a");
                }
             }
-
-            /* Add it to the list. */
-            assert(num_methods < ARRAY_SIZE(methods));
-            methods[num_methods] = best;
-            method_max_size[num_methods] = size;
-            num_methods++;
-         }
-
-         for (unsigned i = 0; i < num_methods; i++) {
-            struct si_result *best = methods[i];
-            unsigned size = method_max_size[i];
-
-            /* The size threshold is between the current benchmarked
-             * size and the next benchmarked size. */
-            if (i < num_methods - 1)
-               printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
-            else if (i > 0)
-               printf("         else                   ");
-            else
-               printf("         ");
-            printf("return ");
-
-            assert(best);
-            const char *cache_policy_str =
-               best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
-               best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM";
-
-            if (best->is_cp) {
-               printf("CP_DMA(%s);\n", cache_policy_str);
-            }
-            if (best->is_cs) {
-               printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
-                      best->dwords_per_thread, best->waves_per_sh);
-            }
+            puts("");
          }
       }
-      puts("      }");
    }
-   puts("   }");
-   puts("}");
 
    ctx->destroy(ctx);
    exit(0);