diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 38a68a9d2a2..71f74443c7e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -126,7 +126,7 @@ static const struct debug_named_value test_options[] = { {"computeblit", DBG(TEST_COMPUTE_BLIT), "Invoke blits tests and exit."}, {"testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit."}, {"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."}, - {"testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"}, + {"dmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"}, {"testmemperf", DBG(TEST_MEM_PERF), "Test map + memcpy perf using the winsys."}, {"blitperf", DBG(TEST_BLIT_PERF), "Test gfx and compute clear/copy/blit/resolve performance"}, diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 046e1b1e6b0..5f84a4e647e 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -11,426 +11,240 @@ #define MIN_SIZE 512 #define MAX_SIZE (128 * 1024 * 1024) #define SIZE_SHIFT 1 -#define NUM_RUNS 128 +#define WARMUP_RUNS 16 +#define NUM_RUNS 32 -static double get_MBps_rate(unsigned num_bytes, unsigned ns) -{ - return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0); -} +enum { + TEST_FILL_VRAM, + TEST_FILL_VRAM_12B, + TEST_FILL_GTT, + TEST_FILL_GTT_12B, + TEST_COPY_VRAM_VRAM, + TEST_COPY_VRAM_GTT, + TEST_COPY_GTT_VRAM, + NUM_TESTS, +}; + +static const char *test_strings[] = { + [TEST_FILL_VRAM] = "fill->VRAM", + [TEST_FILL_VRAM_12B] = "fill->VRAM 12B", + [TEST_FILL_GTT] = "fill->GTT", + [TEST_FILL_GTT_12B] = "fill->GTT 12B", + [TEST_COPY_VRAM_VRAM] = "VRAM->VRAM", + [TEST_COPY_VRAM_GTT] = "VRAM->GTT", + [TEST_COPY_GTT_VRAM] = "GTT->VRAM", +}; + +enum { + METHOD_DEFAULT, + METHOD_CP_DMA, + METHOD_COMPUTE_2DW, + METHOD_COMPUTE_3DW, + METHOD_COMPUTE_4DW, + NUM_METHODS, +}; + +static const char *method_strings[] = { + [METHOD_DEFAULT] = "Default", + [METHOD_CP_DMA] = "CP DMA", + [METHOD_COMPUTE_2DW] = "CS 2dw", + [METHOD_COMPUTE_3DW] = "CS 3dw", + [METHOD_COMPUTE_4DW] = "CS 4dw", +}; + +enum { + ALIGN_MAX, + ALIGN_256, + ALIGN_128, + ALIGN_64, + ALIGN_4, + ALIGN_2, + ALIGN_1, + ALIGN_SRC4_DST2, + ALIGN_SRC4_DST1, + ALIGN_SRC2_DST4, + ALIGN_SRC2_DST1, + ALIGN_SRC1_DST4, + ALIGN_SRC1_DST2, + NUM_ALIGNMENTS, +}; + +struct align_info_t { + const char *string; + unsigned src_offset; + unsigned dst_offset; +}; + +static const struct align_info_t align_info[] = { + [ALIGN_MAX] = {"both=max", 0, 0}, + [ALIGN_256] = {"both=256", 256, 256}, + [ALIGN_128] = {"both=128", 128, 128}, + [ALIGN_64] = {"both=64", 64, 64}, + [ALIGN_4] = {"both=4", 4, 4}, + [ALIGN_2] = {"both=2", 2, 2}, + [ALIGN_1] = {"both=1", 1, 1}, + [ALIGN_SRC4_DST2] = {"src=4 dst=2", 4, 2}, + [ALIGN_SRC4_DST1] = {"src=4 dst=1", 4, 1}, + [ALIGN_SRC2_DST4] = {"src=2 dst=4", 2, 4}, + [ALIGN_SRC2_DST1] = {"src=2 dst=1", 2, 1}, + [ALIGN_SRC1_DST4] = {"src=1 dst=4", 1, 4}, + [ALIGN_SRC1_DST2] = {"src=1 dst=2", 1, 2}, +}; void si_test_dma_perf(struct si_screen *sscreen) { struct pipe_screen *screen = &sscreen->b; struct pipe_context *ctx = screen->context_create(screen, NULL, 0); struct si_context *sctx = (struct si_context *)ctx; - const uint32_t clear_value = 0x12345678; - static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1}; - /* The list of per-SA wave limits to test. */ - static const unsigned cs_waves_per_sh_list[] = {0, 8}; -#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list) -#define NUM_METHODS (3 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) + sscreen->ws->cs_set_pstate(&sctx->gfx_cs, RADEON_CTX_PSTATE_PEAK); - static const char *method_str[] = { - "CP MC ", - "CP L2 ", - "CP L2 ", - }; - static const char *placement_str[] = { - /* Clear */ - "fill->VRAM", - "fill->GTT ", - /* Copy */ - "VRAM->VRAM", - "VRAM->GTT ", - "GTT ->VRAM", - }; - - printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n"); - printf("Heap ,Method ,L2p,Wa,"); + printf("Test , Method , Alignment ,"); for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { - if (size >= 1024) + if (size >= 1024 * 1024) + printf("%6uMB,", size / (1024 * 1024)); + else if (size >= 1024) printf("%6uKB,", size / 1024); else printf(" %6uB,", size); } printf("\n"); - /* results[log2(size)][placement][method][] */ - struct si_result { - bool is_valid; - bool is_cp; - bool is_cs; - unsigned cache_policy; - unsigned dwords_per_thread; - unsigned waves_per_sh; - unsigned score; - unsigned index; /* index in results[x][y][index] */ - } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {}; - /* Run benchmarks. */ - for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { - bool is_copy = placement >= 2; + for (unsigned test_flavor = 0; test_flavor < NUM_TESTS; test_flavor++) { + bool is_copy = test_flavor >= TEST_COPY_VRAM_VRAM; - printf("-----------,--------,---,--,"); - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) - printf("--------,"); - printf("\n"); + if (test_flavor) + puts(""); for (unsigned method = 0; method < NUM_METHODS; method++) { - bool test_cp = method <= 2; - bool test_cs = method >= 3; - unsigned cs_method = method - 3; - unsigned cs_waves_per_sh = - test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0; - cs_method %= 3 * NUM_SHADERS; - unsigned cache_policy = - test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0; - unsigned cs_dwords_per_thread = - test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0; + for (unsigned align = 0; align < NUM_ALIGNMENTS; align++) { + unsigned dwords_per_thread, clear_value_size; + unsigned src_offset = align_info[align].src_offset; + unsigned dst_offset = align_info[align].dst_offset; - if (sctx->gfx_level == GFX6) { - /* GFX6 doesn't support CP DMA operations through L2. */ - if (test_cp && cache_policy != L2_BYPASS) + /* offset > 0 && offset < 4 is the only case when the compute shader performs the same + * as offset=0 without any alignment optimizations, so shift the offset by 4 to get + * unaligned performance. + */ + if (src_offset && src_offset < 4) + src_offset += 4; + if (dst_offset && dst_offset < 4) + dst_offset += 4; + + if (!is_copy && dst_offset != src_offset) continue; - /* WAVES_PER_SH is in multiples of 16 on GFX6. */ - if (test_cs && cs_waves_per_sh % 16 != 0) - continue; - } - /* SI_RESOURCE_FLAG_GL2_BYPASS setting RADEON_FLAG_GL2_BYPASS doesn't affect - * chips before gfx9. - */ - if (test_cs && cache_policy && sctx->gfx_level < GFX9) - continue; + if (test_flavor == TEST_FILL_VRAM_12B || test_flavor == TEST_FILL_GTT_12B) { + if ((method != METHOD_DEFAULT && method != METHOD_COMPUTE_3DW && + method != METHOD_COMPUTE_4DW) || dst_offset % 4) + continue; - printf("%s ,", placement_str[placement]); - if (test_cs) { - printf("CS x%-4u,%3s,", cs_dwords_per_thread, - cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : ""); - } else { - printf("%s,%3s,", method_str[method], - method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : ""); - } - if (test_cs && cs_waves_per_sh) - printf("%2u,", cs_waves_per_sh); - else - printf(" ,"); + dwords_per_thread = method == METHOD_COMPUTE_3DW ? 3 : 4; + clear_value_size = 12; + } else { + if (method == METHOD_COMPUTE_3DW) + continue; - void *compute_shader = NULL; - if (test_cs) { - union si_cs_clear_copy_buffer_key key; - key.key = 0; - - key.is_clear = !is_copy; - key.dwords_per_thread = cs_dwords_per_thread; - - compute_shader = si_create_dma_compute_shader(sctx, &key); - } - - double score = 0; - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { - /* Don't test bigger sizes if it's too slow. Print 0. */ - if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) { - printf("%7.0f ,", 0.0); - continue; + dwords_per_thread = method == METHOD_COMPUTE_2DW ? 2 : 4; + clear_value_size = dst_offset % 4 ? 1 : 4; } - enum pipe_resource_usage dst_usage, src_usage; - struct pipe_resource *dst, *src; - unsigned query_type = PIPE_QUERY_TIME_ELAPSED; - unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_GL2_BYPASS : 0; + printf("%-14s, %-7s, %-11s,", test_strings[test_flavor], method_strings[method], + align_info[align].string); - if (placement == 0 || placement == 2 || placement == 4) - dst_usage = PIPE_USAGE_DEFAULT; - else - dst_usage = PIPE_USAGE_STREAM; + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + struct pipe_resource *dst, *src; + enum pipe_resource_usage dst_usage = PIPE_USAGE_DEFAULT; + enum pipe_resource_usage src_usage = PIPE_USAGE_DEFAULT; - if (placement == 2 || placement == 3) - src_usage = PIPE_USAGE_DEFAULT; - else - src_usage = PIPE_USAGE_STREAM; + if (test_flavor == TEST_FILL_GTT || test_flavor == TEST_FILL_GTT_12B || + test_flavor == TEST_COPY_VRAM_GTT) + dst_usage = PIPE_USAGE_STREAM; - dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256); - src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL; + if (test_flavor == TEST_COPY_GTT_VRAM) + src_usage = PIPE_USAGE_STREAM; - /* Wait for idle before testing, so that other processes don't mess up the results. */ - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_FLUSH_AND_INV_DB; - si_emit_cache_flush_direct(sctx); + /* Don't test large sizes with GTT because it's slow. */ + if ((dst_usage == PIPE_USAGE_STREAM || src_usage == PIPE_USAGE_STREAM) && + size > 32 * 1024 * 1024) { + printf("%8s,", "n/a"); + continue; + } - struct pipe_query *q = ctx->create_query(ctx, query_type, 0); - ctx->begin_query(ctx, q); + dst = pipe_aligned_buffer_create(screen, 0, dst_usage, dst_offset + size, 256); + src = is_copy ? pipe_aligned_buffer_create(screen, 0, src_usage, src_offset + size, 256) : NULL; - /* Run tests. */ - for (unsigned iter = 0; iter < NUM_RUNS; iter++) { - if (test_cp) { - /* CP DMA */ - if (is_copy) { - si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, SI_OP_SYNC_BEFORE_AFTER, - SI_COHERENCY_NONE, cache_policy); + struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0); + bool success = true; + + /* Run tests. */ + for (unsigned iter = 0; iter < WARMUP_RUNS + NUM_RUNS; iter++) { + const uint32_t clear_value[4] = {0x12345678, 0x23456789, 0x34567890, 0x45678901}; + + if (iter == WARMUP_RUNS) + ctx->begin_query(ctx, q); + + if (method == METHOD_DEFAULT) { + if (is_copy) { + si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, + SI_OP_SYNC_BEFORE_AFTER); + } else { + sctx->b.clear_buffer(&sctx->b, dst, dst_offset, size, &clear_value, + clear_value_size); + } + } else if (method == METHOD_CP_DMA) { + /* CP DMA */ + if (is_copy) { + si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, + SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_SHADER, L2_LRU); + } else { + /* CP DMA clears must be aligned to 4 bytes. */ + if (dst_offset % 4 || size % 4) { + success = false; + continue; + } + assert(clear_value_size == 4); + si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, dst_offset, size, + clear_value[0], SI_OP_SYNC_BEFORE_AFTER, + SI_COHERENCY_SHADER, L2_LRU); + } } else { - si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, size, clear_value, - SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE, - cache_policy); - } - } else { - /* Compute */ - /* The memory accesses are coalesced, meaning that the 1st instruction writes - * the 1st contiguous block of data for the whole wave, the 2nd instruction - * writes the 2nd contiguous block of data, etc. - */ - unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4); - unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread; - unsigned dwords_per_wave = cs_dwords_per_thread * 64; - - unsigned num_dwords = size / 4; - unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); - - struct pipe_grid_info info = {}; - info.block[0] = MIN2(64, num_instructions); - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); - info.grid[1] = 1; - info.grid[2] = 1; - - struct pipe_shader_buffer sb[2] = {}; - sb[is_copy].buffer = dst; - sb[is_copy].buffer_size = size; - - if (is_copy) { - sb[0].buffer = src; - sb[0].buffer_size = size; - } else { - for (unsigned i = 0; i < 4; i++) - sctx->cs_user_data[i] = clear_value; + /* Compute */ + success &= + si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, + size, clear_value, clear_value_size, + SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_SHADER, + dwords_per_thread, false); } - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1); - ctx->bind_compute_state(ctx, compute_shader); - sctx->cs_max_waves_per_sh = cs_waves_per_sh; - - ctx->launch_grid(ctx, &info); - - ctx->bind_compute_state(ctx, NULL); - sctx->cs_max_waves_per_sh = 0; /* disable the limit */ + sctx->flags |= SI_CONTEXT_INV_L2; } - /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */ - sctx->flags |= SI_CONTEXT_INV_VCACHE | - (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) | - SI_CONTEXT_CS_PARTIAL_FLUSH; - si_emit_cache_flush_direct(sctx); - } + ctx->end_query(ctx, q); - ctx->end_query(ctx, q); - ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); + pipe_resource_reference(&dst, NULL); + pipe_resource_reference(&src, NULL); - pipe_resource_reference(&dst, NULL); - pipe_resource_reference(&src, NULL); + /* Get results. */ + union pipe_query_result result; - /* Get results. */ + ctx->get_query_result(ctx, q, true, &result); + ctx->destroy_query(ctx, q); - union pipe_query_result result; - - ctx->get_query_result(ctx, q, true, &result); - ctx->destroy_query(ctx, q); - - score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS); - printf("%7.0f ,", score); - fflush(stdout); - - struct si_result *r = &results[util_logbase2(size)][placement][method]; - r->is_valid = true; - r->is_cp = test_cp; - r->is_cs = test_cs; - r->cache_policy = cache_policy; - r->dwords_per_thread = cs_dwords_per_thread; - r->waves_per_sh = cs_waves_per_sh; - r->score = score; - r->index = method; - } - puts(""); - - if (compute_shader) - ctx->delete_compute_state(ctx, compute_shader); - } - } - - puts(""); - puts("static struct si_method"); - printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool " - "cached)\n", - sctx->screen->info.name); - puts("{"); - puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); - - /* Analyze results and find the best methods. */ - for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { - if (placement == 0) - puts(" if (dst == RADEON_DOMAIN_VRAM) {"); - else if (placement == 1) - puts(" } else { /* GTT */"); - else if (placement == 2) { - puts("}"); - puts(""); - puts("static struct si_method"); - printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n", - sctx->screen->info.name); - printf(" uint64_t size64, bool async, bool cached)\n"); - puts("{"); - puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); - puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {"); - } else if (placement == 3) - puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {"); - else - puts(" } else { /* GTT -> VRAM */"); - - for (unsigned mode = 0; mode < 3; mode++) { - bool async = mode == 0; - bool cached = mode == 1; - - if (async) - puts(" if (async) { /* async compute */"); - else if (cached) - puts(" if (cached) { /* gfx ring */"); - else - puts(" } else { /* gfx ring - uncached */"); - - /* The list of best chosen methods. */ - struct si_result *methods[32]; - unsigned method_max_size[32]; - unsigned num_methods = 0; - - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { - /* Find the best method. */ - struct si_result *best = NULL; - - for (unsigned i = 0; i < NUM_METHODS; i++) { - struct si_result *r = &results[util_logbase2(size)][placement][i]; - - if (!r->is_valid) - continue; - - /* Ban CP DMA clears via MC on <= GFX8. They are super slow - * on GTT, which we can get due to BO evictions. - */ - if (sctx->gfx_level <= GFX8 && placement == 1 && r->is_cp && - r->cache_policy == L2_BYPASS) - continue; - - if (async) { - /* The following constraints for compute IBs try to limit - * resource usage so as not to decrease the performance - * of gfx IBs too much. - */ - - /* Don't use CP DMA on asynchronous rings, because - * the engine is shared with gfx IBs. - */ - if (r->is_cp) - continue; - - /* Don't use L2 caching on asynchronous rings to minimize - * L2 usage. - */ - if (r->cache_policy == L2_LRU) - continue; - - /* Asynchronous compute recommends waves_per_sh != 0 - * to limit CU usage. */ - if (r->is_cs && r->waves_per_sh == 0) - continue; + if (success) { + double GB = 1024.0 * 1024.0 * 1024.0; + double seconds = result.u64 / (double)NUM_RUNS / (1000.0 * 1000.0 * 1000.0); + double GBps = (size / GB) / seconds * (test_flavor == TEST_COPY_VRAM_VRAM ? 2 : 1); + printf("%8.2f,", GBps); } else { - if (cached && r->cache_policy == L2_BYPASS) - continue; - if (!cached && r->cache_policy == L2_LRU) - continue; - } - - if (!best) { - best = r; - continue; - } - - /* Assume some measurement error. Earlier methods occupy fewer - * resources, so the next method is always more greedy, and we - * don't want to select it due to a measurement error. - */ - double min_improvement = 1.03; - - if (best->score * min_improvement < r->score) - best = r; - } - - if (num_methods > 0) { - unsigned prev_index = num_methods - 1; - struct si_result *prev = methods[prev_index]; - struct si_result *prev_this_size = - &results[util_logbase2(size)][placement][prev->index]; - - /* If the best one is also the best for the previous size, - * just bump the size for the previous one. - * - * If there is no best, it means all methods were too slow - * for this size and were not tested. Use the best one for - * the previous size. - */ - if (!best || - /* If it's the same method as for the previous size: */ - (prev->is_cp == best->is_cp && - prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy && - prev->dwords_per_thread == best->dwords_per_thread && - prev->waves_per_sh == best->waves_per_sh) || - /* If the method for the previous size is also the best - * for this size: */ - (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) { - method_max_size[prev_index] = size; - continue; + printf("%8s,", "n/a"); } } - - /* Add it to the list. */ - assert(num_methods < ARRAY_SIZE(methods)); - methods[num_methods] = best; - method_max_size[num_methods] = size; - num_methods++; - } - - for (unsigned i = 0; i < num_methods; i++) { - struct si_result *best = methods[i]; - unsigned size = method_max_size[i]; - - /* The size threshold is between the current benchmarked - * size and the next benchmarked size. */ - if (i < num_methods - 1) - printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2); - else if (i > 0) - printf(" else "); - else - printf(" "); - printf("return "); - - assert(best); - const char *cache_policy_str = - best->cache_policy == L2_BYPASS ? "L2_BYPASS" : - best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM"; - - if (best->is_cp) { - printf("CP_DMA(%s);\n", cache_policy_str); - } - if (best->is_cs) { - printf("COMPUTE(%s, %u, %u);\n", cache_policy_str, - best->dwords_per_thread, best->waves_per_sh); - } + puts(""); } } - puts(" }"); } - puts(" }"); - puts("}"); ctx->destroy(ctx); exit(0);