From a634451c07208fe99f6893f9ef9a6ee620ccf41f Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Sun, 27 Jul 2025 08:48:48 -0700
Subject: [PATCH 2/2] pg_test_timing: Also test fast timing and report time
 source

In passing also reduce the per-loop overhead caused by repeated divisions
in INSTR_TIME_GET_NANOSEC when the ticks variable has become very large,
instead diff first and then turn it into nanosecs.
---
 src/bin/pg_test_timing/pg_test_timing.c | 76 +++++++++++++++++++------
 src/include/portability/instr_time.h    | 30 ++++++----
 2 files changed, 77 insertions(+), 29 deletions(-)

diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index a5621251afc..62e308dd965 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -30,7 +30,7 @@ static long long int largest_diff_count;
 
 
 static void handle_args(int argc, char *argv[]);
-static uint64 test_timing(unsigned int duration);
+static uint64 test_timing(unsigned int duration, bool fast_timing);
 static void output(uint64 loop_count);
 
 int
@@ -43,10 +43,22 @@ main(int argc, char *argv[])
 
 	handle_args(argc, argv);
 
-	loop_count = test_timing(test_duration);
-
+	/*
+	 * First, test default (non-fast) timing code. A clock source for that
+	 * is always available. Hence, we can unconditionally output the result.
+	 */
+	loop_count = test_timing(test_duration, false);
 	output(loop_count);
 
+	/*
+	 * Second, test the fast timing code. This clock source is not always
+	 * available. In that case the loop count will be 0 and we don't print.
+	 */
+	 printf("\n");
+	loop_count = test_timing(test_duration, true);
+	if (loop_count > 0)
+		output(loop_count);
+
 	return 0;
 }
 
@@ -78,7 +90,7 @@ handle_args(int argc, char *argv[])
 		}
 	}
 
-	while ((option = getopt_long(argc, argv, "d:c:",
+	while ((option = getopt_long(argc, argv, "d:c:f:",
 								 long_options, &optindex)) != -1)
 	{
 		switch (option)
@@ -143,23 +155,44 @@ handle_args(int argc, char *argv[])
 		exit(1);
 	}
 
-	printf(ngettext("Testing timing overhead for %u second.\n",
-					"Testing timing overhead for %u seconds.\n",
+	printf(ngettext("Testing timing overhead for %u second.\n\n",
+					"Testing timing overhead for %u seconds.\n\n",
 					test_duration),
 		   test_duration);
 }
 
 static uint64
-test_timing(unsigned int duration)
+test_timing(unsigned int duration, bool fast_timing)
 {
 	uint64		total_time;
 	int64		time_elapsed = 0;
 	uint64		loop_count = 0;
-	uint64		prev,
-				cur;
 	instr_time	start_time,
 				end_time,
-				temp;
+				prev,
+				cur;
+	char	   *time_source = NULL;
+	bool		fast_timing_available = false;
+
+	INSTR_TIME_INITIALIZE();
+
+#if !defined(WIN32) && defined(__x86_64__) && defined(__linux__)
+	if (fast_timing && has_rdtsc)
+	{
+		time_source = "RDTSC";
+		fast_timing_available = true;
+	}
+	else if (has_rdtscp)
+		time_source = "RDTSCP";
+	else
+		time_source = PG_INSTR_CLOCK_NAME;
+#else
+	time_source = PG_INSTR_CLOCK_NAME;
+#endif
+	if (fast_timing && !fast_timing_available)
+		return 0;
+
+	printf(_("Time source: %s\n"), time_source);
 
 	/*
 	 * Pre-zero the statistics data structures.  They're already zero by
@@ -173,8 +206,11 @@ test_timing(unsigned int duration)
 
 	total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0;
 
-	INSTR_TIME_SET_CURRENT(start_time);
-	cur = INSTR_TIME_GET_NANOSEC(start_time);
+	if (fast_timing)
+		INSTR_TIME_SET_CURRENT_FAST(start_time);
+	else
+		INSTR_TIME_SET_CURRENT(start_time);
+	cur = start_time;
 
 	while (time_elapsed < total_time)
 	{
@@ -182,9 +218,11 @@ test_timing(unsigned int duration)
 					bits;
 
 		prev = cur;
-		INSTR_TIME_SET_CURRENT(temp);
-		cur = INSTR_TIME_GET_NANOSEC(temp);
-		diff = cur - prev;
+		if (fast_timing)
+			INSTR_TIME_SET_CURRENT_FAST(cur);
+		else
+			INSTR_TIME_SET_CURRENT(cur);
+		diff = INSTR_TIME_DIFF_NANOSEC(cur, prev);
 
 		/* Did time go backwards? */
 		if (unlikely(diff < 0))
@@ -217,11 +255,13 @@ test_timing(unsigned int duration)
 			largest_diff_count++;
 
 		loop_count++;
-		INSTR_TIME_SUBTRACT(temp, start_time);
-		time_elapsed = INSTR_TIME_GET_NANOSEC(temp);
+		time_elapsed = INSTR_TIME_DIFF_NANOSEC(cur, start_time);
 	}
 
-	INSTR_TIME_SET_CURRENT(end_time);
+	if (fast_timing)
+		INSTR_TIME_SET_CURRENT_FAST(end_time);
+	else
+		INSTR_TIME_SET_CURRENT(end_time);
 
 	INSTR_TIME_SUBTRACT(end_time, start_time);
 
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index e2e339a0c4f..f02296f1026 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -112,10 +112,13 @@ extern int64 max_ticks_no_overflow;
  */
 #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW)
 #define PG_INSTR_CLOCK	CLOCK_MONOTONIC_RAW
+#define PG_INSTR_CLOCK_NAME	"clock_gettime (CLOCK_MONOTONIC_RAW)"
 #elif defined(CLOCK_MONOTONIC)
 #define PG_INSTR_CLOCK	CLOCK_MONOTONIC
+#define PG_INSTR_CLOCK_NAME	"clock_gettime (CLOCK_MONOTONIC)"
 #else
 #define PG_INSTR_CLOCK	CLOCK_REALTIME
+#define PG_INSTR_CLOCK_NAME	"clock_gettime (CLOCK_REALTIME)"
 #endif
 
 #if defined(__x86_64__) && defined(__linux__)
@@ -174,7 +177,7 @@ pg_get_ticks(void)
 }
 
 static inline int64_t
-pg_ticks_to_ns(instr_time t)
+pg_ticks_to_ns(int64 ticks)
 {
 	/*
 	 * Would multiplication overflow? If so perform computation in two parts.
@@ -183,7 +186,7 @@ pg_ticks_to_ns(instr_time t)
 	 */
 	int64		ns = 0;
 
-	if (unlikely(t.ticks > max_ticks_no_overflow))
+	if (unlikely(ticks > max_ticks_no_overflow))
 	{
 		/*
 		 * Compute how often the maximum number of ticks fits completely into
@@ -192,7 +195,7 @@ pg_ticks_to_ns(instr_time t)
 		 * value. In a 2nd step we adjust the number of elapsed ticks and
 		 * convert the remaining ticks.
 		 */
-		int64		count = t.ticks / max_ticks_no_overflow;
+		int64		count = ticks / max_ticks_no_overflow;
 		int64		max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
 
 		ns = max_ns * count;
@@ -201,11 +204,11 @@ pg_ticks_to_ns(instr_time t)
 		 * Subtract the ticks that we now already accounted for, so that they
 		 * don't get counted twice.
 		 */
-		t.ticks -= count * max_ticks_no_overflow;
-		Assert(t.ticks >= 0);
+		ticks -= count * max_ticks_no_overflow;
+		Assert(ticks >= 0);
 	}
 
-	ns += t.ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+	ns += ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
 	return ns;
 }
 
@@ -226,14 +229,14 @@ pg_initialize_get_ticks()
 #define INSTR_TIME_SET_CURRENT(t) \
 	((t) = pg_get_ticks())
 
-#define INSTR_TIME_GET_NANOSEC(t) \
-	pg_ticks_to_ns(t)
-
+#define INSTR_TIME_TICKS_TO_NANOSEC(ticks) \
+	(pg_ticks_to_ns(ticks))
 
 #else							/* WIN32 */
 
 
 /* Use QueryPerformanceCounter() */
+#define PG_INSTR_CLOCK_NAME	"QueryPerformanceCounter"
 
 /* helper for INSTR_TIME_SET_CURRENT / INSTR_TIME_SET_CURRENT_FAST */
 static inline instr_time
@@ -265,8 +268,8 @@ GetTimerFrequency(void)
 #define INSTR_TIME_SET_CURRENT(t) \
 	((t) = pg_query_performance_counter())
 
-#define INSTR_TIME_GET_NANOSEC(t) \
-	((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
+#define INSTR_TIME_TICKS_TO_NANOSEC(ticks) \
+	((int64) ((ticks) * ((double) NS_PER_S / GetTimerFrequency())))
 
 #endif							/* WIN32 */
 
@@ -285,9 +288,14 @@ GetTimerFrequency(void)
 #define INSTR_TIME_SUBTRACT(x,y) \
 	((x).ticks -= (y).ticks)
 
+#define INSTR_TIME_DIFF_NANOSEC(x,y) \
+	(INSTR_TIME_TICKS_TO_NANOSEC((x).ticks - (y).ticks))
+
 #define INSTR_TIME_ACCUM_DIFF(x,y,z) \
 	((x).ticks += (y).ticks - (z).ticks)
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(INSTR_TIME_TICKS_TO_NANOSEC((t).ticks))
 
 #define INSTR_TIME_GET_DOUBLE(t) \
 	((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)
-- 
2.51.0

