From 9c5c50476918b55748b336c98aa8fc54fcfb7dd1 Mon Sep 17 00:00:00 2001
From: Chengpeng Yan <41809508+Reminiscent@users.noreply.github.com>
Date: Wed, 17 Dec 2025 13:48:13 +0800
Subject: [PATCH] analyze: Hash MCV candidate lookups in compute_distinct_stats

compute_distinct_stats() tracks possible MCVs for datatypes that have
an equality operator but no ordering.  Finding a match currently requires
a linear scan of the tracking array for every sampled row, which can
become very expensive when statistics targets are set high.

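When the tracking array has at least ANALYZE_HASH_THRESHOLD (200) slots
and the type's default hash support matches the equality operator,
maintain a simplehash table that maps a tracked value to its current
track[] slot.  This reduces match lookups from O(n) to O(1) on average
while keeping the existing linear path as a fallback.  Inserting a new
value still shifts the count-1 entries of track[] and refreshes their
hash table indexes, so that step remains linear.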

Add a regression test exercising the hashed path.
---
 src/backend/commands/analyze.c                | 178 ++++++++++++++++--
 .../expected/analyze_distinct_hash.out        |  55 ++++++
 src/test/regress/parallel_schedule            |   2 +-
 .../regress/sql/analyze_distinct_hash.sql     |  52 +++++
 4 files changed, 274 insertions(+), 13 deletions(-)
 create mode 100644 src/test/regress/expected/analyze_distinct_hash.out
 create mode 100644 src/test/regress/sql/analyze_distinct_hash.sql

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 5e2a7a8234ec8..44090a5923cb4 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -54,6 +54,7 @@
 #include "utils/sortsupport.h"
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
+#include "utils/typcache.h"
 
 
 /* Per-index data for ANALYZE */
@@ -1888,6 +1889,70 @@ static int	analyze_mcv_list(int *mcv_counts,
 							 int samplerows,
 							 double totalrows);
 
+#define ANALYZE_HASH_THRESHOLD 200	/* min track_max to use hashed lookups */
+
+typedef struct DistinctHashEntry
+{
+	Datum		value;			/* hash key: the tracked datum */
+	int			index;			/* slot of value in the track[] array */
+	uint32		hash;			/* stored hash of value, see SH_GET_HASH */
+	char		status;			/* entry state, for simplehash.h's use */
+} DistinctHashEntry;
+
+typedef struct DistinctHashContext
+{
+	FmgrInfo   *cmpfunc;		/* equality function called by SH_EQUAL */
+	FmgrInfo   *hashfunc;		/* hash function called by SH_HASH_KEY */
+	Oid			collation;		/* collation passed to both functions */
+} DistinctHashContext;
+
+typedef struct DistinctHash_hash DistinctHash_hash;
+
+static uint32 distinct_hash_hash(DistinctHash_hash *tab, Datum key);
+static bool distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1);
+
+#define SH_PREFIX				DistinctHash
+#define SH_ELEMENT_TYPE			DistinctHashEntry
+#define SH_KEY_TYPE				Datum
+#define SH_KEY					value	/* key field of DistinctHashEntry */
+#define SH_HASH_KEY(tab, key)	distinct_hash_hash(tab, key)
+#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1)
+#define SH_SCOPE				static inline
+#define SH_STORE_HASH			/* entries carry their own hash value */
+#define SH_GET_HASH(tab, ent)	((ent)->hash)
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+/* Hash "key" with the hash function and collation from the table's context. */
+static uint32
+distinct_hash_hash(DistinctHash_hash *tab, Datum key)
+{
+	DistinctHashContext *ctx = (DistinctHashContext *) tab->private_data;
+
+	return DatumGetUInt32(FunctionCall1Coll(ctx->hashfunc, ctx->collation,
+											key));
+}
+
+/* Compare two keys with the equality function from the table's context. */
+static bool
+distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1)
+{
+	DistinctHashContext *ctx = (DistinctHashContext *) tab->private_data;
+
+	return DatumGetBool(FunctionCall2Coll(ctx->cmpfunc, ctx->collation,
+										  key0, key1));
+}
+
+static inline void
+distinct_hash_set_index(DistinctHash_hash *hash, Datum value, uint32 value_hash,
+						int index)
+{
+	DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash);
+
+	if (entry != NULL)			/* a value missing from the table is ignored */
+		entry->index = index;	/* record the value's new track[] slot */
+}
 
 /*
  * std_typanalyze -- the default type-specific typanalyze function
@@ -2076,15 +2141,21 @@ compute_distinct_stats(VacAttrStatsP stats,
 	bool		is_varwidth = (!stats->attrtype->typbyval &&
 							   stats->attrtype->typlen < 0);
 	FmgrInfo	f_cmpeq;
+	TypeCacheEntry *typentry;
 	typedef struct
 	{
 		Datum		value;
 		int			count;
+		uint32		hash;
 	} TrackItem;
 	TrackItem  *track;
 	int			track_cnt,
 				track_max;
 	int			num_mcv = stats->attstattarget;
+	int			firstcount1 = 0;
+	bool		use_hash;
+	DistinctHashContext hash_context;
+	DistinctHash_hash *track_hash = NULL;
 	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
 
 	/*
@@ -2097,14 +2168,34 @@ compute_distinct_stats(VacAttrStatsP stats,
 	track_cnt = 0;
 
 	fmgr_info(mystats->eqfunc, &f_cmpeq);
+	typentry = lookup_type_cache(stats->attrtypid,
+								 TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR);
+
+	/*
+	 * For sufficiently large statistics targets, use a hash table to avoid
+	 * repeated linear searches of the track[] array, but only when we can use
+	 * the type's default hash support that matches the equality operator.
+	 */
+	use_hash = (track_max >= ANALYZE_HASH_THRESHOLD &&
+				OidIsValid(mystats->eqfunc) &&
+				mystats->eqopr == typentry->eq_opr &&
+				OidIsValid(typentry->hash_proc));
+	if (use_hash)
+	{
+		hash_context.cmpfunc = &f_cmpeq;
+		hash_context.hashfunc = &typentry->hash_proc_finfo;
+		hash_context.collation = stats->attrcollid;
+		track_hash = DistinctHash_create(CurrentMemoryContext,
+										 track_max, &hash_context);
+	}
 
 	for (i = 0; i < samplerows; i++)
 	{
 		Datum		value;
 		bool		isnull;
 		bool		match;
-		int			firstcount1,
-					j;
+		int			j = 0;
+		uint32		value_hash = 0;
 
 		vacuum_delay_point(true);
 
@@ -2151,19 +2242,35 @@ compute_distinct_stats(VacAttrStatsP stats,
 		/*
 		 * See if the value matches anything we're already tracking.
 		 */
-		match = false;
-		firstcount1 = track_cnt;
-		for (j = 0; j < track_cnt; j++)
+		if (use_hash)
+		{
+			DistinctHashEntry *entry;
+
+			value_hash = distinct_hash_hash(track_hash, value);
+			entry = DistinctHash_lookup_hash(track_hash, value, value_hash);
+			match = (entry != NULL);
+			if (match)
+				j = entry->index;
+		}
+		else
 		{
-			if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
-											   stats->attrcollid,
-											   value, track[j].value)))
+			int			firstcount1_local = track_cnt;
+
+			match = false;
+			for (j = 0; j < track_cnt; j++)
 			{
-				match = true;
-				break;
+				if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
+												   stats->attrcollid,
+												   value, track[j].value)))
+				{
+					match = true;
+					break;
+				}
+				if (j < firstcount1_local && track[j].count == 1)
+					firstcount1_local = j;
 			}
-			if (j < firstcount1 && track[j].count == 1)
-				firstcount1 = j;
+
+			firstcount1 = firstcount1_local;
 		}
 
 		if (match)
@@ -2175,23 +2282,70 @@ compute_distinct_stats(VacAttrStatsP stats,
 			{
 				swapDatum(track[j].value, track[j - 1].value);
 				swapInt(track[j].count, track[j - 1].count);
+				if (use_hash)
+				{
+					uint32		tmp;
+
+					tmp = track[j].hash;
+					track[j].hash = track[j - 1].hash;
+					track[j - 1].hash = tmp;
+					distinct_hash_set_index(track_hash, track[j].value,
+											track[j].hash, j);
+					distinct_hash_set_index(track_hash, track[j - 1].value,
+											track[j - 1].hash, j - 1);
+				}
 				j--;
 			}
+			while (use_hash && firstcount1 < track_cnt &&
+				   track[firstcount1].count > 1)
+				firstcount1++;
 		}
 		else
 		{
 			/* No match.  Insert at head of count-1 list */
 			if (track_cnt < track_max)
 				track_cnt++;
+			else if (use_hash && firstcount1 >= track_cnt)
+				continue;
+			else if (use_hash)
+			{
+				DistinctHashEntry *delentry;
+
+				delentry = DistinctHash_lookup_hash(track_hash,
+													track[track_cnt - 1].value,
+													track[track_cnt - 1].hash);
+				Assert(delentry != NULL);
+				if (delentry != NULL)
+					DistinctHash_delete_item(track_hash, delentry);
+				else
+					DistinctHash_delete(track_hash, track[track_cnt - 1].value);
+			}
 			for (j = track_cnt - 1; j > firstcount1; j--)
 			{
 				track[j].value = track[j - 1].value;
 				track[j].count = track[j - 1].count;
+				if (use_hash)
+				{
+					track[j].hash = track[j - 1].hash;
+					distinct_hash_set_index(track_hash, track[j].value,
+											track[j].hash, j);
+				}
 			}
 			if (firstcount1 < track_cnt)
 			{
 				track[firstcount1].value = value;
 				track[firstcount1].count = 1;
+				if (use_hash)
+				{
+					bool		found_hash;
+					DistinctHashEntry *entry;
+
+					track[firstcount1].hash = value_hash;
+					entry = DistinctHash_insert_hash(track_hash, value, value_hash,
+													 &found_hash);
+					Assert(!found_hash);
+					entry->index = firstcount1;
+				}
 			}
 		}
 	}
diff --git a/src/test/regress/expected/analyze_distinct_hash.out b/src/test/regress/expected/analyze_distinct_hash.out
new file mode 100644
index 0000000000000..9f92083f6d6e7
--- /dev/null
+++ b/src/test/regress/expected/analyze_distinct_hash.out
@@ -0,0 +1,55 @@
+--
+-- Exercise compute_distinct_stats() when hashable types allow hashed lookups.
+--
+SET client_min_messages TO WARNING;
+--
+-- Case 1: all values are distinct.  This forces the track[] array to fill
+-- and then exercise the "drop tail item" path repeatedly.
+--
+DROP TABLE IF EXISTS analyze_distinct_hash_unique;
+CREATE TABLE analyze_distinct_hash_unique (x xid);
+ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100;
+INSERT INTO analyze_distinct_hash_unique
+SELECT i::text::xid FROM generate_series(1, 300) i;
+ANALYZE analyze_distinct_hash_unique;
+WITH m AS MATERIALIZED (
+	SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv
+	FROM pg_stats
+	WHERE schemaname = 'public'
+	  AND tablename = 'analyze_distinct_hash_unique'
+	  AND attname = 'x'
+)
+SELECT array_length(mcv, 1) AS mcv_len,
+	   mcv[1] AS mcv_first,
+	   mcv[100] AS mcv_100th
+FROM m;
+ mcv_len | mcv_first | mcv_100th 
+---------+-----------+-----------
+     100 | 300       | 201
+(1 row)
+
+--
+-- Case 2: bubble-up during repeated matches, exercising swaps while keeping
+-- hashed indexes in sync.
+--
+DROP TABLE IF EXISTS analyze_distinct_hash_bubble;
+CREATE TABLE analyze_distinct_hash_bubble (x xid);
+ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100;
+INSERT INTO analyze_distinct_hash_bubble
+SELECT i::text::xid FROM generate_series(1, 10) i;
+INSERT INTO analyze_distinct_hash_bubble
+SELECT '1'::xid FROM generate_series(1, 20);
+ANALYZE analyze_distinct_hash_bubble;
+SELECT most_common_vals::text
+FROM pg_stats
+WHERE schemaname = 'public'
+  AND tablename = 'analyze_distinct_hash_bubble'
+  AND attname = 'x';
+    most_common_vals    
+------------------------
+ {1,10,9,8,7,6,5,4,3,2}
+(1 row)
+
+DROP TABLE analyze_distinct_hash_unique;
+DROP TABLE analyze_distinct_hash_bubble;
+RESET client_min_messages;
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 905f9bca95987..d7655b1ea8507 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
 # geometry depends on point, lseg, line, box, path, polygon, circle
 # horology depends on date, time, timetz, timestamp, timestamptz, interval
 # ----------
-test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies
+test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import analyze_distinct_hash pg_ndistinct pg_dependencies
 
 # ----------
 # Load huge amounts of data
diff --git a/src/test/regress/sql/analyze_distinct_hash.sql b/src/test/regress/sql/analyze_distinct_hash.sql
new file mode 100644
index 0000000000000..06e2e273a271d
--- /dev/null
+++ b/src/test/regress/sql/analyze_distinct_hash.sql
@@ -0,0 +1,52 @@
+--
+-- Exercise compute_distinct_stats() when hashable types allow hashed lookups.
+--
+
+SET client_min_messages TO WARNING;
+
+--
+-- Case 1: all values are distinct.  This forces the track[] array to fill
+-- and then exercise the "drop tail item" path repeatedly.
+--
+DROP TABLE IF EXISTS analyze_distinct_hash_unique;
+CREATE TABLE analyze_distinct_hash_unique (x xid);
+ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100;
+INSERT INTO analyze_distinct_hash_unique
+SELECT i::text::xid FROM generate_series(1, 300) i;
+ANALYZE analyze_distinct_hash_unique;
+
+WITH m AS MATERIALIZED (
+	SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv
+	FROM pg_stats
+	WHERE schemaname = 'public'
+	  AND tablename = 'analyze_distinct_hash_unique'
+	  AND attname = 'x'
+)
+SELECT array_length(mcv, 1) AS mcv_len,
+	   mcv[1] AS mcv_first,
+	   mcv[100] AS mcv_100th
+FROM m;
+
+--
+-- Case 2: bubble-up during repeated matches, exercising swaps while keeping
+-- hashed indexes in sync.
+--
+DROP TABLE IF EXISTS analyze_distinct_hash_bubble;
+CREATE TABLE analyze_distinct_hash_bubble (x xid);
+ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100;
+INSERT INTO analyze_distinct_hash_bubble
+SELECT i::text::xid FROM generate_series(1, 10) i;
+INSERT INTO analyze_distinct_hash_bubble
+SELECT '1'::xid FROM generate_series(1, 20);
+ANALYZE analyze_distinct_hash_bubble;
+
+SELECT most_common_vals::text
+FROM pg_stats
+WHERE schemaname = 'public'
+  AND tablename = 'analyze_distinct_hash_bubble'
+  AND attname = 'x';
+
+DROP TABLE analyze_distinct_hash_unique;
+DROP TABLE analyze_distinct_hash_bubble;
+
+RESET client_min_messages;