mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 16:08:04 +02:00
i965: Allow a per gen timebase scale factor
Prior to Skylake the Gen HW timestamps were driven by a 12.5MHz clock with the convenient property of being able to scale by an integer (80) to nanosecond units. For Skylake the frequency is 12MHz or a scale factor of 83.333333 This updates gen_device_info to track a floating point timebase_scale factor and makes corresponding _queryobj.c changes to no longer assume a scale factor of 80 works across all gens. Although the gen6_ code could have been been left alone, the changes keep the code more comparable, and it now shares a few utility functions for scaling raw timestamps and calculating deltas. The utility for calculating deltas takes into account 32 or 36bit overflow depending on the current kernel version. Note: this leaves the timestamp handling of ARB_query_buffer_object untouched, which continues to use an incorrect scale of 80 on Skylake for now. This is more awkward to solve since the scaling is currently done using a very limited uint64 ALU available to the command parser that doesn't support multiply or divide where it's already taking a large number of instructions just to effectively multiple by 80. This fixes piglit arb_timer_query-timestamp-get on Skylake v2: (Ken) Update timebase_scale for platforms past Skylake/Broxton too. Signed-off-by: Robert Bragg <robert@sixbynine.org> Reviewed-by: Matt Turner <mattst88@gmail.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
parent
28b134c75c
commit
344d1a4015
6 changed files with 114 additions and 27 deletions
|
|
@ -36,6 +36,7 @@ static const struct gen_device_info gen_device_info_i965 = {
|
|||
.urb = {
|
||||
.size = 256,
|
||||
},
|
||||
.timebase_scale = 80,
|
||||
};
|
||||
|
||||
static const struct gen_device_info gen_device_info_g4x = {
|
||||
|
|
@ -51,6 +52,7 @@ static const struct gen_device_info gen_device_info_g4x = {
|
|||
.urb = {
|
||||
.size = 384,
|
||||
},
|
||||
.timebase_scale = 80,
|
||||
};
|
||||
|
||||
static const struct gen_device_info gen_device_info_ilk = {
|
||||
|
|
@ -65,6 +67,7 @@ static const struct gen_device_info gen_device_info_ilk = {
|
|||
.urb = {
|
||||
.size = 1024,
|
||||
},
|
||||
.timebase_scale = 80,
|
||||
};
|
||||
|
||||
static const struct gen_device_info gen_device_info_snb_gt1 = {
|
||||
|
|
@ -89,6 +92,7 @@ static const struct gen_device_info gen_device_info_snb_gt1 = {
|
|||
[MESA_SHADER_GEOMETRY] = 256,
|
||||
},
|
||||
},
|
||||
.timebase_scale = 80,
|
||||
};
|
||||
|
||||
static const struct gen_device_info gen_device_info_snb_gt2 = {
|
||||
|
|
@ -113,6 +117,7 @@ static const struct gen_device_info gen_device_info_snb_gt2 = {
|
|||
[MESA_SHADER_GEOMETRY] = 256,
|
||||
},
|
||||
},
|
||||
.timebase_scale = 80,
|
||||
};
|
||||
|
||||
#define GEN7_FEATURES \
|
||||
|
|
@ -121,7 +126,8 @@ static const struct gen_device_info gen_device_info_snb_gt2 = {
|
|||
.must_use_separate_stencil = true, \
|
||||
.has_llc = true, \
|
||||
.has_pln = true, \
|
||||
.has_surface_tile_offset = true
|
||||
.has_surface_tile_offset = true, \
|
||||
.timebase_scale = 80
|
||||
|
||||
static const struct gen_device_info gen_device_info_ivb_gt1 = {
|
||||
GEN7_FEATURES, .is_ivybridge = true, .gt = 1,
|
||||
|
|
@ -287,7 +293,8 @@ static const struct gen_device_info gen_device_info_hsw_gt3 = {
|
|||
.max_tcs_threads = 504, \
|
||||
.max_tes_threads = 504, \
|
||||
.max_gs_threads = 504, \
|
||||
.max_wm_threads = 384
|
||||
.max_wm_threads = 384, \
|
||||
.timebase_scale = 80
|
||||
|
||||
static const struct gen_device_info gen_device_info_bdw_gt1 = {
|
||||
GEN8_FEATURES, .gt = 1,
|
||||
|
|
@ -385,6 +392,7 @@ static const struct gen_device_info gen_device_info_chv = {
|
|||
.max_tcs_threads = 336, \
|
||||
.max_tes_threads = 336, \
|
||||
.max_cs_threads = 56, \
|
||||
.timebase_scale = 1000000000.0 / 12000000.0, \
|
||||
.urb = { \
|
||||
.size = 384, \
|
||||
.min_entries = { \
|
||||
|
|
@ -410,6 +418,7 @@ static const struct gen_device_info gen_device_info_chv = {
|
|||
.max_tes_threads = 112, \
|
||||
.max_gs_threads = 112, \
|
||||
.max_cs_threads = 6 * 6, \
|
||||
.timebase_scale = 1000000000.0 / 19200123.0, \
|
||||
.urb = { \
|
||||
.size = 192, \
|
||||
.min_entries = { \
|
||||
|
|
|
|||
|
|
@ -147,6 +147,30 @@ struct gen_device_info
|
|||
*/
|
||||
unsigned max_entries[4];
|
||||
} urb;
|
||||
|
||||
/**
|
||||
* For the longest time the timestamp frequency for Gen's timestamp counter
|
||||
* could be assumed to be 12.5MHz, where the least significant bit neatly
|
||||
* corresponded to 80 nanoseconds.
|
||||
*
|
||||
* Since Gen9 the numbers aren't so round, with a a frequency of 12MHz for
|
||||
* SKL (or scale factor of 83.33333333) and a frequency of 19200123Hz for
|
||||
* BXT.
|
||||
*
|
||||
* For simplicty to fit with the current code scaling by a single constant
|
||||
* to map from raw timestamps to nanoseconds we now do the conversion in
|
||||
* floating point instead of integer arithmetic.
|
||||
*
|
||||
* In general it's probably worth noting that the documented constants we
|
||||
* have for the per-platform timestamp frequencies aren't perfect and
|
||||
* shouldn't be trusted for scaling and comparing timestamps with a large
|
||||
* delta.
|
||||
*
|
||||
* E.g. with crude testing on my system using the 'correct' scale factor I'm
|
||||
* seeing a drift of ~2 milliseconds per second.
|
||||
*/
|
||||
double timebase_scale;
|
||||
|
||||
/** @} */
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -524,6 +524,21 @@ brw_initialize_context_constants(struct brw_context *brw)
|
|||
ctx->Const.MaxCombinedShaderOutputResources =
|
||||
MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS;
|
||||
|
||||
/* The timestamp register we can read for glGetTimestamp() is
|
||||
* sometimes only 32 bits, before scaling to nanoseconds (depending
|
||||
* on kernel).
|
||||
*
|
||||
* Once scaled to nanoseconds the timestamp would roll over at a
|
||||
* non-power-of-two, so an application couldn't use
|
||||
* GL_QUERY_COUNTER_BITS to handle rollover correctly. Instead, we
|
||||
* report 36 bits and truncate at that (rolling over 5 times as
|
||||
* often as the HW counter), and when the 32-bit counter rolls
|
||||
* over, it happens to also be at a rollover in the reported value
|
||||
* from near (1<<36) to 0.
|
||||
*
|
||||
* The low 32 bits rolls over in ~343 seconds. Our 36-bit result
|
||||
* rolls over every ~69 seconds.
|
||||
*/
|
||||
ctx->Const.QueryCounterBits.Timestamp = 36;
|
||||
|
||||
ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */
|
||||
|
|
|
|||
|
|
@ -1298,6 +1298,9 @@ void brw_emit_query_begin(struct brw_context *brw);
|
|||
void brw_emit_query_end(struct brw_context *brw);
|
||||
void brw_query_counter(struct gl_context *ctx, struct gl_query_object *q);
|
||||
bool brw_is_query_pipelined(struct brw_query_object *query);
|
||||
uint64_t brw_timebase_scale(struct brw_context *brw, uint64_t gpu_timestamp);
|
||||
uint64_t brw_raw_timestamp_delta(struct brw_context *brw,
|
||||
uint64_t time0, uint64_t time1);
|
||||
|
||||
/** gen6_queryobj.c */
|
||||
void gen6_init_queryobj_functions(struct dd_function_table *functions);
|
||||
|
|
|
|||
|
|
@ -42,6 +42,42 @@
|
|||
#include "brw_state.h"
|
||||
#include "intel_batchbuffer.h"
|
||||
|
||||
uint64_t
|
||||
brw_timebase_scale(struct brw_context *brw, uint64_t gpu_timestamp)
|
||||
{
|
||||
const struct gen_device_info *devinfo = &brw->screen->devinfo;
|
||||
|
||||
return (double)gpu_timestamp * devinfo->timebase_scale;
|
||||
}
|
||||
|
||||
/* As best we know currently, the Gen HW timestamps are 36bits across
|
||||
* all platforms, which we need to account for when calculating a
|
||||
* delta to measure elapsed time.
|
||||
*
|
||||
* The timestamps read via glGetTimestamp() / brw_get_timestamp() sometimes
|
||||
* only have 32bits due to a kernel bug and so in that case we make sure to
|
||||
* treat all raw timestamps as 32bits so they overflow consistently and remain
|
||||
* comparable. (Note: the timestamps being passed here are not from the kernel
|
||||
* so we don't need to be taking the upper 32bits in this buggy kernel case we
|
||||
* are just clipping to 32bits here for consistency.)
|
||||
*/
|
||||
uint64_t
|
||||
brw_raw_timestamp_delta(struct brw_context *brw, uint64_t time0, uint64_t time1)
|
||||
{
|
||||
if (brw->screen->hw_has_timestamp == 2) {
|
||||
/* Kernel clips timestamps to 32bits in this case, so we also clip
|
||||
* PIPE_CONTROL timestamps for consistency.
|
||||
*/
|
||||
return (uint32_t)time1 - (uint32_t)time0;
|
||||
} else {
|
||||
if (time0 > time1) {
|
||||
return (1ULL << 36) + time1 - time0;
|
||||
} else {
|
||||
return time1 - time0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer.
|
||||
*/
|
||||
|
|
@ -117,12 +153,18 @@ brw_queryobj_get_results(struct gl_context *ctx,
|
|||
/* The query BO contains the starting and ending timestamps.
|
||||
* Subtract the two and convert to nanoseconds.
|
||||
*/
|
||||
query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32));
|
||||
query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]);
|
||||
query->Base.Result = brw_timebase_scale(brw, query->Base.Result);
|
||||
break;
|
||||
|
||||
case GL_TIMESTAMP:
|
||||
/* The query BO contains a single timestamp value in results[0]. */
|
||||
query->Base.Result = 1000 * (results[0] >> 32);
|
||||
query->Base.Result = brw_timebase_scale(brw, results[0]);
|
||||
|
||||
/* Ensure the scaled timestamp overflows according to
|
||||
* GL_QUERY_COUNTER_BITS
|
||||
*/
|
||||
query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1;
|
||||
break;
|
||||
|
||||
case GL_SAMPLES_PASSED_ARB:
|
||||
|
|
@ -508,9 +550,15 @@ brw_get_timestamp(struct gl_context *ctx)
|
|||
break;
|
||||
}
|
||||
|
||||
/* See logic in brw_queryobj_get_results() */
|
||||
result *= 80;
|
||||
result &= (1ull << 36) - 1;
|
||||
/* Scale to nanosecond units */
|
||||
result = brw_timebase_scale(brw, result);
|
||||
|
||||
/* Ensure the scaled timestamp overflows according to
|
||||
* GL_QUERY_COUNTER_BITS. Technically this isn't required if
|
||||
* querying GL_TIMESTAMP via glGetInteger but it seems best to keep
|
||||
* QueryObject and GetInteger timestamps consistent.
|
||||
*/
|
||||
result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -219,30 +219,18 @@ gen6_queryobj_get_results(struct gl_context *ctx,
|
|||
/* The query BO contains the starting and ending timestamps.
|
||||
* Subtract the two and convert to nanoseconds.
|
||||
*/
|
||||
query->Base.Result += 80 * (results[1] - results[0]);
|
||||
query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]);
|
||||
query->Base.Result = brw_timebase_scale(brw, query->Base.Result);
|
||||
break;
|
||||
|
||||
case GL_TIMESTAMP:
|
||||
/* Our timer is a clock that increments every 80ns (regardless of
|
||||
* other clock scaling in the system). The timestamp register we can
|
||||
* read for glGetTimestamp() masks out the top 32 bits, so we do that
|
||||
* here too to let the two counters be compared against each other.
|
||||
*
|
||||
* If we just multiplied that 32 bits of data by 80, it would roll
|
||||
* over at a non-power-of-two, so an application couldn't use
|
||||
* GL_QUERY_COUNTER_BITS to handle rollover correctly. Instead, we
|
||||
* report 36 bits and truncate at that (rolling over 5 times as often
|
||||
* as the HW counter), and when the 32-bit counter rolls over, it
|
||||
* happens to also be at a rollover in the reported value from near
|
||||
* (1<<36) to 0.
|
||||
*
|
||||
* The low 32 bits rolls over in ~343 seconds. Our 36-bit result
|
||||
* rolls over every ~69 seconds.
|
||||
*
|
||||
* The query BO contains a single timestamp value in results[0].
|
||||
/* The query BO contains a single timestamp value in results[0]. */
|
||||
query->Base.Result = brw_timebase_scale(brw, results[0]);
|
||||
|
||||
/* Ensure the scaled timestamp overflows according to
|
||||
* GL_QUERY_COUNTER_BITS
|
||||
*/
|
||||
query->Base.Result = 80 * (results[0] & 0xffffffff);
|
||||
query->Base.Result &= (1ull << 36) - 1;
|
||||
query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1;
|
||||
break;
|
||||
|
||||
case GL_SAMPLES_PASSED_ARB:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue