i965: Add support for GL_AMD_performance_monitor on Ironlake.

Ironlake's counters are always enabled; userspace can simply send an
MI_REPORT_PERF_COUNT packet to take a snapshot of them.  This makes the
extension easy to implement.

The counters are documented in the source code for the intel-gpu-tools
intel_perf_counters utility.
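
For reference, a minimal application-side sketch of exercising the new
counters through the extension (not part of this patch; it assumes an
extension loader such as libepoxy resolves the AMD entry points, and the
helper name and the choice of four counters are illustrative):

   #include <stdio.h>
   #include <epoxy/gl.h>   /* assumption: libepoxy (or similar) provides the entry points */

   /* Illustrative helper, not part of this patch. */
   static void
   measure_draw(void (*draw)(void))
   {
      GLuint group, counters[4], monitor;
      GLint num_groups, num_counters, max_active;

      /* Ironlake exposes a single "Aggregating Counters" group; grab its
       * first four counters.
       */
      glGetPerfMonitorGroupsAMD(&num_groups, 1, &group);
      glGetPerfMonitorCountersAMD(group, &num_counters, &max_active, 4, counters);

      glGenPerfMonitorsAMD(1, &monitor);
      glSelectPerfMonitorCountersAMD(monitor, GL_TRUE, group, 4, counters);

      /* The driver snapshots the counters at Begin and End and reports deltas. */
      glBeginPerfMonitorAMD(monitor);
      draw();
      glEndPerfMonitorAMD(monitor);

      /* Submit the batch, then poll until the second snapshot has landed. */
      glFlush();
      GLuint available = 0;
      while (!available) {
         glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_AVAILABLE_AMD,
                                        sizeof(available), &available, NULL);
      }

      GLuint data[64];
      GLint bytes_written = 0;
      glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_AMD,
                                     sizeof(data), data, &bytes_written);
      printf("got %d bytes of counter data\n", bytes_written);

      glDeletePerfMonitorsAMD(1, &monitor);
   }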

v2: Adjust for core data structure changes.  Add a table mapping buffer
    object offsets to exposed counters (the layout changes with each generation).
    Finally, add report ID assertions to sanity check the BO layout
    (thanks to Carl Worth).

v3: Update for core BeginPerfMonitor hook changes (requested by Brian).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Author: Kenneth Graunke <kenneth@whitecape.org>
Date:   2013-04-11 13:22:29 -07:00
Commit: 0f2da77307 (parent: b2e327e08f)
6 changed files with 420 additions and 0 deletions


@@ -69,6 +69,7 @@ i965_FILES = \
brw_lower_texture_gradients.cpp \
brw_misc_state.c \
brw_object_purgeable.c \
brw_performance_monitor.c \
brw_program.c \
brw_primitive_restart.c \
brw_queryobj.c \


@@ -503,6 +503,10 @@ brwCreateContext(int api,
_mesa_initialize_dispatch_tables(ctx);
_mesa_initialize_vbo_vtxfmt(ctx);
if (ctx->Extensions.AMD_performance_monitor) {
brw_init_performance_monitors(brw);
}
return true;
}


@@ -128,6 +128,7 @@ struct brw_vs_prog_key;
struct brw_vec4_prog_key;
struct brw_wm_prog_key;
struct brw_wm_prog_data;
struct brw_perf_bo_layout;
enum brw_state_id {
BRW_STATE_URB_FENCE,
@@ -1313,6 +1314,16 @@ struct brw_context
bool begin_emitted;
} query;
struct {
/* A map describing which counters are stored at a particular 32-bit
* offset in the buffer object.
*/
const struct brw_perf_bo_layout *bo_layout;
/* Number of 32-bit entries in the buffer object. */
int entries_in_bo;
} perfmon;
int num_atoms;
const struct brw_tracked_state **atoms;
@@ -1485,6 +1496,9 @@ bool brw_is_hiz_depth_format(struct brw_context *ctx, gl_format format);
bool brw_render_target_supported(struct brw_context *brw,
struct gl_renderbuffer *rb);
/* brw_performance_monitor.c */
void brw_init_performance_monitors(struct brw_context *brw);
/* gen6_sol.c */
void
brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,


@@ -1817,6 +1817,13 @@ enum brw_wm_barycentric_interp_mode {
#define CMD_MI_FLUSH 0x0200
#define GEN5_MI_REPORT_PERF_COUNT ((0x26 << 23) | (3 - 2))
/* DW0 */
# define GEN5_MI_COUNTER_SET_0 (0 << 6)
# define GEN5_MI_COUNTER_SET_1 (1 << 6)
/* DW1 */
# define MI_COUNTER_ADDRESS_GTT (1 << 0)
/* DW2: a user-defined report ID (written to the buffer but can be anything) */
/* Bitfields for the URB_WRITE message, DW2 of message header: */
#define URB_WRITE_PRIM_END 0x1


@@ -0,0 +1,391 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* \file brw_performance_monitor.c
*
* Implementation of the GL_AMD_performance_monitor extension.
*
* Currently only for Ironlake.
*/
#include <limits.h>
#include "main/bitset.h"
#include "main/macros.h"
#include "main/mtypes.h"
#include "main/performance_monitor.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"
/**
* i965 representation of a performance monitor object.
*/
struct brw_perf_monitor_object
{
/** The base class. */
struct gl_perf_monitor_object base;
/**
* BO containing raw counter data in a hardware specific form.
*/
drm_intel_bo *bo;
};
/** Downcasting convenience macro. */
static inline struct brw_perf_monitor_object *
brw_perf_monitor(struct gl_perf_monitor_object *m)
{
return (struct brw_perf_monitor_object *) m;
}
#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048
/* Two random values used to ensure we're getting valid snapshots. */
#define FIRST_SNAPSHOT_REPORT_ID 0xd2e9c607
#define SECOND_SNAPSHOT_REPORT_ID 0xad584b1d
/******************************************************************************/
#define COUNTER(name) \
{ \
.Name = name, \
.Type = GL_UNSIGNED_INT, \
.Minimum = { .u32 = 0 }, \
.Maximum = { .u32 = ~0 }, \
}
#define GROUP(name, max_active, counter_list) \
{ \
.Name = name, \
.MaxActiveCounters = max_active, \
.Counters = counter_list, \
.NumCounters = ARRAY_SIZE(counter_list), \
}
struct brw_perf_bo_layout {
int group;
int counter;
};
/**
* Ironlake:
* @{
*/
const static struct gl_perf_monitor_counter gen5_raw_aggregating_counters[] = {
COUNTER("cycles the CS unit is starved"),
COUNTER("cycles the CS unit is stalled"),
COUNTER("cycles the VF unit is starved"),
COUNTER("cycles the VF unit is stalled"),
COUNTER("cycles the VS unit is starved"),
COUNTER("cycles the VS unit is stalled"),
COUNTER("cycles the GS unit is starved"),
COUNTER("cycles the GS unit is stalled"),
COUNTER("cycles the CL unit is starved"),
COUNTER("cycles the CL unit is stalled"),
COUNTER("cycles the SF unit is starved"),
COUNTER("cycles the SF unit is stalled"),
COUNTER("cycles the WZ unit is starved"),
COUNTER("cycles the WZ unit is stalled"),
COUNTER("Z buffer read/write"),
COUNTER("cycles each EU was active"),
COUNTER("cycles each EU was suspended"),
COUNTER("cycles threads loaded all EUs"),
COUNTER("cycles filtering active"),
COUNTER("cycles PS threads executed"),
COUNTER("subspans written to RC"),
COUNTER("bytes read for texture reads"),
COUNTER("texels returned from sampler"),
COUNTER("polygons not culled"),
COUNTER("clocks MASF has valid message"),
COUNTER("64b writes/reads from RC"),
COUNTER("reads on dataport"),
COUNTER("clocks MASF has valid msg not consumed by sampler"),
COUNTER("cycles any EU is stalled for math"),
};
const static struct gl_perf_monitor_group gen5_groups[] = {
GROUP("Aggregating Counters", INT_MAX, gen5_raw_aggregating_counters),
};
const static struct brw_perf_bo_layout gen5_perf_bo_layout[] =
{
{ -1, -1, }, /* Report ID */
{ -1, -1, }, /* TIMESTAMP (64-bit) */
{ -1, -1, }, /* ...second half... */
{ 0, 0, }, /* cycles the CS unit is starved */
{ 0, 1, }, /* cycles the CS unit is stalled */
{ 0, 2, }, /* cycles the VF unit is starved */
{ 0, 3, }, /* cycles the VF unit is stalled */
{ 0, 4, }, /* cycles the VS unit is starved */
{ 0, 5, }, /* cycles the VS unit is stalled */
{ 0, 6, }, /* cycles the GS unit is starved */
{ 0, 7, }, /* cycles the GS unit is stalled */
{ 0, 8, }, /* cycles the CL unit is starved */
{ 0, 9, }, /* cycles the CL unit is stalled */
{ 0, 10, }, /* cycles the SF unit is starved */
{ 0, 11, }, /* cycles the SF unit is stalled */
{ 0, 12, }, /* cycles the WZ unit is starved */
{ 0, 13, }, /* cycles the WZ unit is stalled */
{ 0, 14, }, /* Z buffer read/write */
{ 0, 15, }, /* cycles each EU was active */
{ 0, 16, }, /* cycles each EU was suspended */
{ 0, 17, }, /* cycles threads loaded all EUs */
{ 0, 18, }, /* cycles filtering active */
{ 0, 19, }, /* cycles PS threads executed */
{ 0, 20, }, /* subspans written to RC */
{ 0, 21, }, /* bytes read for texture reads */
{ 0, 22, }, /* texels returned from sampler */
{ 0, 23, }, /* polygons not culled */
{ 0, 24, }, /* clocks MASF has valid message */
{ 0, 25, }, /* 64b writes/reads from RC */
{ 0, 26, }, /* reads on dataport */
{ 0, 27, }, /* clocks MASF has valid msg not consumed by sampler */
{ 0, 28, }, /* cycles any EU is stalled for math */
};
/** @} */
/******************************************************************************/
static void
snapshot_aggregating_counters(struct brw_context *brw,
drm_intel_bo *bo, uint32_t offset_in_bytes)
{
uint32_t report_id = offset_in_bytes == 0 ? FIRST_SNAPSHOT_REPORT_ID
: SECOND_SNAPSHOT_REPORT_ID;
if (brw->gen == 5) {
/* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
* the counters. The report ID is ignored in the second set.
*/
BEGIN_BATCH(6);
OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0);
OUT_RELOC(bo,
I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
offset_in_bytes);
OUT_BATCH(report_id);
OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1);
OUT_RELOC(bo,
I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
offset_in_bytes + 64);
OUT_BATCH(report_id);
ADVANCE_BATCH();
} else {
assert(!"Unsupported generation for performance counters.");
}
}
static bool
aggregating_counters_needed(struct brw_context *brw,
struct gl_perf_monitor_object *m)
{
return m->ActiveGroups[0];
}
/******************************************************************************/
/**
* Create a new performance monitor object.
*/
static struct gl_perf_monitor_object *
brw_new_perf_monitor(struct gl_context *ctx)
{
return calloc(1, sizeof(struct brw_perf_monitor_object));
}
/**
* Delete a performance monitor object.
*/
static void
brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
if (monitor->bo)
drm_intel_bo_unreference(monitor->bo);
free(monitor);
}
/**
* Driver hook for glBeginPerformanceMonitorAMD().
*/
static GLboolean
brw_begin_perf_monitor(struct gl_context *ctx,
struct gl_perf_monitor_object *m)
{
struct brw_context *brw = brw_context(ctx);
struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
/* If the BO already exists, throw it away. It contains old results
* that we're not interested in any more.
*/
if (monitor->bo)
drm_intel_bo_unreference(monitor->bo);
/* Create a new BO. */
monitor->bo =
drm_intel_bo_alloc(brw->bufmgr, "performance monitor", 4096, 64);
drm_intel_bo_map(monitor->bo, true);
memset((char *) monitor->bo->virtual, 0xff, 4096);
drm_intel_bo_unmap(monitor->bo);
/* Take a snapshot of all active counters. */
if (aggregating_counters_needed(brw, m)) {
snapshot_aggregating_counters(brw, monitor->bo, 0);
}
return true;
}
/**
* Driver hook for glEndPerformanceMonitorAMD().
*/
static void
brw_end_perf_monitor(struct gl_context *ctx,
struct gl_perf_monitor_object *m)
{
struct brw_context *brw = brw_context(ctx);
struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
if (aggregating_counters_needed(brw, m)) {
snapshot_aggregating_counters(brw, monitor->bo,
SECOND_SNAPSHOT_OFFSET_IN_BYTES);
}
}
/**
* Reset a performance monitor, throwing away any results.
*/
static void
brw_reset_perf_monitor(struct gl_context *ctx,
struct gl_perf_monitor_object *m)
{
struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
if (monitor->bo) {
drm_intel_bo_unreference(monitor->bo);
monitor->bo = NULL;
}
if (m->Active) {
brw_begin_perf_monitor(ctx, m);
}
}
/**
* Is a performance monitor result available?
*/
static GLboolean
brw_is_perf_monitor_result_available(struct gl_context *ctx,
struct gl_perf_monitor_object *m)
{
struct brw_context *brw = brw_context(ctx);
struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
return !m->Active && monitor->bo &&
!drm_intel_bo_references(brw->batch.bo, monitor->bo) &&
!drm_intel_bo_busy(monitor->bo);
}
/**
* Get the performance monitor result.
*/
static void
brw_get_perf_monitor_result(struct gl_context *ctx,
struct gl_perf_monitor_object *m,
GLsizei data_size,
GLuint *data,
GLint *bytes_written)
{
struct brw_context *brw = brw_context(ctx);
struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
/* This hook should only be called when results are available. */
assert(monitor->bo != NULL);
drm_intel_bo_map(monitor->bo, false);
unsigned *gpu_bo = monitor->bo->virtual;
/* Copy data from the BO to the supplied array.
*
* The output data format is: <group ID, counter ID, value> for each
* active counter. The API allows counters to appear in any order.
*/
GLsizei offset = 0;
/* Look for expected report ID values to ensure data is present. */
assert(gpu_bo[0] == FIRST_SNAPSHOT_REPORT_ID);
assert(gpu_bo[SECOND_SNAPSHOT_OFFSET_IN_BYTES/4] == SECOND_SNAPSHOT_REPORT_ID);
for (int i = 0; i < brw->perfmon.entries_in_bo; i++) {
int group = brw->perfmon.bo_layout[i].group;
int counter = brw->perfmon.bo_layout[i].counter;
if (group < 0 || !BITSET_TEST(m->ActiveCounters[group], counter))
continue;
const struct gl_perf_monitor_group *group_obj =
&ctx->PerfMonitor.Groups[group];
const struct gl_perf_monitor_counter *c = &group_obj->Counters[counter];
data[offset++] = group;
data[offset++] = counter;
uint32_t second_snapshot_index =
SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint32_t) + i;
/* Won't work for uint64_t values, but we don't expose any yet. */
data[offset] = gpu_bo[second_snapshot_index] - gpu_bo[i];
offset += _mesa_perf_monitor_counter_size(c) / sizeof(uint32_t);
}
drm_intel_bo_unmap(monitor->bo);
if (bytes_written)
*bytes_written = offset * sizeof(uint32_t);
}
void
brw_init_performance_monitors(struct brw_context *brw)
{
struct gl_context *ctx = &brw->ctx;
ctx->Driver.NewPerfMonitor = brw_new_perf_monitor;
ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor;
ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor;
ctx->Driver.EndPerfMonitor = brw_end_perf_monitor;
ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor;
ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available;
ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result;
if (brw->gen == 5) {
ctx->PerfMonitor.Groups = gen5_groups;
ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups);
brw->perfmon.bo_layout = gen5_perf_bo_layout;
brw->perfmon.entries_in_bo = ARRAY_SIZE(gen5_perf_bo_layout);
}
}
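
The <group ID, counter ID, value> packing described in the comment in
brw_get_perf_monitor_result() above can be consumed on the application side
roughly as follows; a hedged sketch, assuming all exposed counters are
GL_UNSIGNED_INT as on gen5 (the helper name and the 64-byte name buffer are
illustrative, not part of this patch):

   #include <stdio.h>

   /* Illustrative consumer of glGetPerfMonitorCounterDataAMD(GL_PERFMON_RESULT_AMD)
    * output; not part of this patch.
    */
   static void
   print_perfmon_results(const GLuint *data, GLint bytes_written)
   {
      GLint remaining = bytes_written / sizeof(GLuint);

      while (remaining >= 3) {
         GLuint group = data[0];
         GLuint counter = data[1];
         GLchar name[64];

         glGetPerfMonitorCounterStringAMD(group, counter, sizeof(name), NULL, name);

         /* Every gen5 counter is GL_UNSIGNED_INT, so each record is exactly
          * three 32-bit words: <group, counter, value>.  Counters of other
          * types (e.g. GL_UNSIGNED_INT64_AMD) would occupy more words and
          * need a type query via glGetPerfMonitorCounterInfoAMD().
          */
         printf("%u/%s: %u\n", group, name, data[2]);

         data += 3;
         remaining -= 3;
      }
   }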


@@ -160,6 +160,9 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.EXT_shader_integer_mix = ctx->Const.GLSLVersion >= 130;
}
if (brw->gen == 5)
ctx->Extensions.AMD_performance_monitor = true;
if (ctx->API == API_OPENGL_CORE)
ctx->Extensions.ARB_base_instance = true;
if (ctx->API != API_OPENGL_CORE)