diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index ad8e272..0bdf8dd 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -167,11 +167,12 @@ char	   *index_tablespace = NULL;
 #define SCALE_32BIT_THRESHOLD 20000
 
 bool		use_log;			/* log transaction latencies to a file */
-bool		use_quiet;			/* quiet logging onto stderr */
 int			agg_interval;		/* log aggregates instead of individual
 								 * transactions */
-int			progress = 0;       /* thread progress report every this seconds */
+int			progress = -1;      /* report every this seconds.
+								   0 is quiet, -1 is not set yet */
 int         progress_nclients = 0; /* number of clients for progress report */
+int         progress_nthreads = 0; /* number of threads for progress report */
 bool		is_connect;			/* establish connection for each transaction */
 bool		is_latencies;		/* report per-command latencies */
 int			main_pid;			/* main process id used in log filename */
@@ -214,6 +215,8 @@ typedef struct
 	int			nvariables;
 	instr_time	txn_begin;		/* used for measuring transaction latencies */
 	instr_time	stmt_begin;		/* used for measuring statement latencies */
+	int64       txn_latencies;	/* cumulated latencies */
+	int64		txn_sqlats;		/* cumulated square latencies */
 	bool		is_throttled;	/* whether transaction throttling is done */
 	int			use_file;		/* index in sql_files for this client */
 	bool		prepared[MAX_FILES];
@@ -243,6 +246,8 @@ typedef struct
 {
 	instr_time	conn_time;
 	int			xacts;
+	int64		latencies;
+	int64		sqlats;
 	int64       throttle_lag;
 	int64       throttle_lag_max;
 } TResult;
@@ -347,7 +352,6 @@ usage(void)
 		   "  -i, --initialize         invokes initialization mode\n"
 		   "  -F, --fillfactor=NUM     set fill factor\n"
 		   "  -n, --no-vacuum          do not run VACUUM after initialization\n"
-		   "  -q, --quiet              quiet logging (one message each 5 seconds)\n"
 		   "  -s, --scale=NUM          scaling factor\n"
 		   "  --foreign-keys           create foreign key constraints between tables\n"
 		   "  --index-tablespace=TABLESPACE\n"
@@ -367,7 +371,6 @@ usage(void)
 		                                           "(default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
 		   "  -N, --skip-some-updates  skip updates of pgbench_tellers and pgbench_branches\n"
-		   "  -P, --progress=NUM       show thread progress report every NUM seconds\n"
 		   "  -r, --report-latencies   report average latency per command\n"
 		   "  -R, --rate=SPEC          target rate in transactions per second\n"
 		   "  -s, --scale=NUM          report this scale factor in output\n"
@@ -382,6 +385,8 @@ usage(void)
 		   "  -d, --debug              print debugging output\n"
 		   "  -h, --host=HOSTNAME      database server host or socket directory\n"
 		   "  -p, --port=PORT          database server port number\n"
+		   "  -P, --progress=NUM       show progress report every NUM seconds\n"
+		   "  -q, --quiet              no progress report\n"
 		   "  -U, --username=USERNAME  connect as specified database user\n"
 		   "  -V, --version            output version information, then exit\n"
 		   "  -?, --help               show this help, then exit\n"
@@ -1003,6 +1008,21 @@ top:
 		}
 
 		/*
+		 * always record latency under progress or throttling
+		 */
+		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
+		{
+			instr_time	now, diff;
+			int64 latency;
+			INSTR_TIME_SET_CURRENT(now);
+			diff = now;
+			INSTR_TIME_SUBTRACT(diff, st->txn_begin);
+			latency = INSTR_TIME_GET_MICROSEC(diff);
+			st->txn_latencies += latency;
+			st->txn_sqlats += latency*latency;
+		}
+
+		/*
 		 * if transaction finished, record the time it took in the log
 		 */
 		if (logfile && commands[st->state + 1] == NULL)
@@ -1191,8 +1211,11 @@ top:
 		goto top;
 	}
 
-	/* Record transaction start time if logging is enabled */
-	if (logfile && st->state == 0)
+	/*
+	 * Record transaction start time
+	 * if logging, progress or throttling is enabled
+	 */
+	if ((logfile || progress || throttle_delay) && st->state == 0)
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
 	/* Record statement start time if per-command latencies are requested */
@@ -1678,25 +1701,8 @@ init(bool is_no_vacuum)
 			exit(1);
 		}
 
-		/*
-		 * If we want to stick with the original logging, print a message each
-		 * 100k inserted rows.
-		 */
-		if ((!use_quiet) && (j % 100000 == 0))
-		{
-			INSTR_TIME_SET_CURRENT(diff);
-			INSTR_TIME_SUBTRACT(diff, start);
-
-			elapsed_sec = INSTR_TIME_GET_DOUBLE(diff);
-			remaining_sec = (scale * naccounts - j) * elapsed_sec / j;
-
-			fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) done (elapsed %.2f s, remaining %.2f s).\n",
-					j, (int64) naccounts * scale,
-					(int) (((int64) j * 100) / (naccounts * scale)),
-					elapsed_sec, remaining_sec);
-		}
-		/* let's not call the timing for each row, but only each 100 rows */
-		else if (use_quiet && (j % 100 == 0))
+		/* gettimeofday is called every 1000 rows, should be under 0.01 s */
+		if (progress && (j % 1000 == 0))
 		{
 			INSTR_TIME_SET_CURRENT(diff);
 			INSTR_TIME_SUBTRACT(diff, start);
@@ -1705,14 +1711,17 @@ init(bool is_no_vacuum)
 			remaining_sec = (scale * naccounts - j) * elapsed_sec / j;
 
 			/* have we reached the next interval (or end)? */
-			if ((j == scale * naccounts) || (elapsed_sec >= log_interval * LOG_STEP_SECONDS))
+			if ((elapsed_sec >= log_interval * progress) ||
+				(j == scale * naccounts))
 			{
-				fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) done (elapsed %.2f s, remaining %.2f s).\n",
+				fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT
+				   " tuples (%d%%) done (elapsed %.2f s, remaining %.2f s).\n",
 						j, (int64) naccounts * scale,
-						(int) (((int64) j * 100) / (naccounts * scale)), elapsed_sec, remaining_sec);
+						(int) (((int64) j * 100) / (naccounts * scale)),
+						elapsed_sec, remaining_sec);
 
 				/* skip to the next interval */
-				log_interval = (int) ceil(elapsed_sec / LOG_STEP_SECONDS);
+				log_interval = (int) ceil(elapsed_sec / progress);
 			}
 		}
 
@@ -2096,6 +2105,7 @@ static void
 printResults(int ttype, int normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
+			 int64 total_latencies, int64 total_sqlats,
 			 int64 throttle_lag, int64 throttle_lag_max)
 {
 	double		time_include,
@@ -2135,6 +2145,22 @@ printResults(int ttype, int normal_xacts, int nclients,
 			   normal_xacts);
 	}
 
+	if (throttle_delay || progress)
+	{
+		/* compute and show latency average and standard deviation */
+		double latency = 0.001 * total_latencies / normal_xacts;
+		double sqlat = (double) total_sqlats / normal_xacts;
+		printf("latency average: %.3f ms\n"
+			   "latency stddev: %.3f ms\n",
+			   latency, 0.001 * sqrt(sqlat - 1000000*latency*latency));
+	}
+	else
+	{
+		/* only the average latency is available */
+		printf("latency average: %.3f ms\n",
+			   1000.0 * duration / normal_xacts);
+	}
+
 	if (throttle_delay)
 	{
 		/*
@@ -2143,8 +2169,9 @@ printResults(int ttype, int normal_xacts, int nclients,
 		 * transaction.  The measured lag may be caused by thread/client load,
 		 * the database load, or the Poisson throttling process.
 		 */
-		printf("average rate limit schedule lag: %.3f ms (max %.3f ms)\n",
-			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
+		printf("rate limit lag: avg %.3f (max %.3f) ms\n",
+			   0.001 * throttle_lag / normal_xacts,
+			   0.001 * throttle_lag_max);
 	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
@@ -2254,7 +2281,9 @@ main(int argc, char **argv)
 	instr_time	start_time;		/* start up time */
 	instr_time	total_time;
 	instr_time	conn_total_time;
-	int			total_xacts;
+	int			total_xacts = 0;
+	int64		total_latencies = 0;
+	int64		total_sqlats = 0;
 	int64       throttle_lag = 0;
 	int64       throttle_lag_max = 0;
 
@@ -2410,7 +2439,7 @@ main(int argc, char **argv)
 				use_log = true;
 				break;
 			case 'q':
-				use_quiet = true;
+				progress = 0;
 				break;
 			case 'f':
 				ttype = 3;
@@ -2519,6 +2548,10 @@ main(int argc, char **argv)
     /* compute a per thread delay */
 	throttle_delay *= nthreads;
 
+	/* set default progress report time */
+	if (progress == -1)
+		progress = is_init_mode? LOG_STEP_SECONDS: 0;
+
 	if (argc > optind)
 		dbName = argv[optind];
 	else
@@ -2554,13 +2587,6 @@ main(int argc, char **argv)
 		exit(1);
 	}
 
-	/* -q may be used only with -i */
-	if (use_quiet && !is_init_mode)
-	{
-		fprintf(stderr, "quiet-logging is allowed only in initialization mode (-i)\n");
-		exit(1);
-	}
-
 	/* --sampling-rate may must not be used with --aggregate-interval */
 	if (sample_rate > 0.0 && agg_interval > 0)
 	{
@@ -2608,6 +2634,7 @@ main(int argc, char **argv)
 	 */
 	main_pid = (int) getpid();
 	progress_nclients = nclients;
+	progress_nthreads = nthreads;
 
 	if (nclients > 1)
 	{
@@ -2796,8 +2823,6 @@ main(int argc, char **argv)
 	{
 		TState	   *thread = &threads[i];
 
-		INSTR_TIME_SET_CURRENT(thread->start_time);
-
 		/* the first thread (i = 0) is executed by main thread */
 		if (i > 0)
 		{
@@ -2816,15 +2841,16 @@ main(int argc, char **argv)
 	}
 
 	/* wait for threads and accumulate results */
-	total_xacts = 0;
 	INSTR_TIME_SET_ZERO(conn_total_time);
 	for (i = 0; i < nthreads; i++)
 	{
 		void	   *ret = NULL;
 
 		if (threads[i].thread == INVALID_THREAD)
+			/* run thread 0 */
 			ret = threadRun(&threads[i]);
 		else
+			/* wait for other threads */
 			pthread_join(threads[i].thread, &ret);
 
 		if (ret != NULL)
@@ -2832,6 +2858,8 @@ main(int argc, char **argv)
 			TResult	   *r = (TResult *) ret;
 
 			total_xacts += r->xacts;
+			total_latencies += r->latencies;
+			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
@@ -2845,7 +2873,8 @@ main(int argc, char **argv)
 	INSTR_TIME_SET_CURRENT(total_time);
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
-				 total_time, conn_total_time, throttle_lag, throttle_lag_max);
+				 total_time, conn_total_time,
+				 total_latencies, total_sqlats, throttle_lag, throttle_lag_max);
 
 	return 0;
 }
@@ -2863,21 +2892,24 @@ threadRun(void *arg)
 	int			remains = nstate;		/* number of remaining clients */
 	int			i;
 	/* for reporting progress: */
-	int64       thread_start = INSTR_TIME_GET_MICROSEC(thread->start_time);
-	int64		last_report = thread_start;
-	int64		next_report = last_report + progress * 1000000;
-	int64		last_count = 0;
+	int64		thread_start, last_report, next_report;
+	int64		last_count = 0, last_lats = 0, last_sqlats = 0, last_lags = 0;
 
 	AggVals		aggs;
 
 	/*
-	 * Initialize throttling rate target for all of the thread's clients.  It
-	 * might be a little more accurate to reset thread->start_time here too.
-	 * The possible drift seems too small relative to typical throttle delay
-	 * times to worry about it.
+	 * This is the real start time of the thread. Take it here to avoid
+	 * including thread creation time in measures.
 	 */
-	INSTR_TIME_SET_CURRENT(start);
-	thread->throttle_trigger = INSTR_TIME_GET_MICROSEC(start);
+	INSTR_TIME_SET_CURRENT(thread->start_time);
+
+	/*
+	 * Initialize throttling rate target for all of the thread's clients.
+	 */
+	thread_start = INSTR_TIME_GET_MICROSEC(thread->start_time);
+	last_report = thread_start;
+	next_report = last_report + progress * 1000000;
+	thread->throttle_trigger = thread_start;
 	thread->throttle_lag = 0;
 	thread->throttle_lag_max = 0;
 
@@ -3066,21 +3098,40 @@ threadRun(void *arg)
 			if (now >= next_report)
 			{
 				/* generate and show report */
-				int64 count = 0;
+				int64 count = 0, lats = 0, sqlats = 0;
+				int64 lags = thread->throttle_lag;
 				int64 run = now - last_report;
-				float tps, total_run, latency;
+				double tps, total_run, latency, sqlat, stdev, lag;
 
 				for (i = 0 ; i < nstate ; i++)
+				{
 					count  += state[i].cnt;
+					lats   += state[i].txn_latencies;
+					sqlats += state[i].txn_sqlats;
+				}
 
 				total_run = (now - thread_start) / 1000000.0;
 				tps = 1000000.0 * (count - last_count) / run;
-				latency = 1000.0 * nstate / tps;
+				latency = 0.001 * (lats - last_lats) / (count - last_count);
+				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
+				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
+				lag = 0.001 * (lags - last_lags) / (count - last_count);
 
-				fprintf(stderr, "progress %d: %.1f s, %.1f tps, %.3f ms lat\n",
-						thread->tid, total_run, tps, latency);
+				if (throttle_delay)
+					fprintf(stderr,
+							"progress %d: %.1f s, %.1f tps, "
+							"%.3f stddev %.3f ms lat, %.3f ms lag\n",
+							thread->tid, total_run, tps, latency, stdev, lag);
+				else
+					fprintf(stderr,
+							"progress %d: %.1f s, %.1f tps, "
+							"%.3f stddev %.3f ms lat\n",
+							thread->tid, total_run, tps, latency, stdev);
 
 				last_count = count;
+				last_lats = lats;
+				last_sqlats = sqlats;
+				last_lags = lags;
 				last_report = now;
 				next_report += progress * 1000000;
 			}
@@ -3096,21 +3147,42 @@ threadRun(void *arg)
 			if (now >= next_report)
 			{
 				/* generate and show report */
-				int64 count = 0;
+				int64 count = 0, lats = 0, sqlats = 0, lags = 0;
 				int64 run = now - last_report;
-				float tps, total_run, latency;
+				float tps, total_run, latency, sqlat, lag, stdev;
 
 				for (i = 0 ; i < progress_nclients ; i++)
+				{
 					count  += state[i].cnt;
+					lats   += state[i].txn_latencies;
+					sqlats += state[i].txn_sqlats;
+				}
+
+				for (i = 0 ; i < progress_nthreads ; i++)
+					lags += thread[i].throttle_lag;
 
 				total_run = (now - thread_start) / 1000000.0;
 				tps = 1000000.0 * (count - last_count) / run;
-				latency = 1000.0 * progress_nclients / tps;
+				latency = 0.001 * (lats - last_lats) / (count - last_count);
+				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
+				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
+				lag = 0.001 * (lags - last_lags) / (count - last_count);
 
-				fprintf(stderr, "progress: %.1f s, %.1f tps, %.3f ms lat\n",
-						total_run, tps, latency);
+				if (throttle_delay)
+					fprintf(stderr,
+							"progress: %.1f s, %.1f tps, "
+							"%.3f stddev %.3f ms lat, %.3f ms lag\n",
+							total_run, tps, latency, stdev, lag);
+				else
+					fprintf(stderr,
+							"progress: %.1f s, %.1f tps, "
+							"%.3f stddev %.3f ms lat\n",
+							total_run, tps, latency, stdev);
 
 				last_count = count;
+				last_lats = lats;
+				last_sqlats = sqlats;
+				last_lags = lags;
 				last_report = now;
 				next_report += progress * 1000000;
 			}
@@ -3122,8 +3194,14 @@ done:
 	INSTR_TIME_SET_CURRENT(start);
 	disconnect_all(state, nstate);
 	result->xacts = 0;
+	result->latencies = 0;
+	result->sqlats = 0;
 	for (i = 0; i < nstate; i++)
+	{
 		result->xacts += state[i].cnt;
+		result->latencies += state[i].txn_latencies;
+		result->sqlats += state[i].txn_sqlats;
+	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
 	INSTR_TIME_SET_CURRENT(end);
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 49a79b1..bcc376e 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -182,18 +182,6 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
-      <term><option>-q</option></term>
-      <term><option>--quiet</option></term>
-      <listitem>
-       <para>
-        Switch logging to quiet mode, producing only one progress message per 5
-        seconds. The default logging prints one message each 100000 rows, which
-        often outputs many lines per second (especially on good hardware).
-       </para>
-      </listitem>
-     </varlistentry>
-
-     <varlistentry>
       <term><option>-s</option> <replaceable>scale_factor</></term>
       <term><option>--scale=</option><replaceable>scale_factor</></term>
       <listitem>
@@ -408,16 +396,6 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
-      <term><option>-P</option> <replaceable>sec</></term>
-      <term><option>--progress=</option><replaceable>sec</></term>
-      <listitem>
-       <para>
-        Show progress report every <literal>sec</> seconds.
-       </para>
-      </listitem>
-     </varlistentry>
-
-     <varlistentry>
       <term><option>-R</option> <replaceable>rate</></term>
       <term><option>--rate=</option><replaceable>rate</></term>
       <listitem>
@@ -591,6 +569,35 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-P</option> <replaceable>sec</></term>
+      <term><option>--progress=</option><replaceable>sec</></term>
+      <listitem>
+       <para>
+        Show progress report every <literal>sec</> seconds.
+
+        When initializing, the report displays the number of inserted rows
+        and its total percentage, the passed time and estimated remaining time
+        for completion.
+
+        When benchmarking, the report displays the time since the beginning
+        of the run, the tps since the last report, the transaction latency
+        average and standard deviation. Under throttling (<option>-R</option>),
+        also includes the lag time behind schedule.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><option>-q</option></term>
+      <term><option>--quiet</option></term>
+      <listitem>
+       <para>
+        Switch off regular (5 second default) progress report.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-U</option> <replaceable>login</></term>
       <term><option>--username=</option><replaceable>login</></term>
       <listitem>
