From 9c5c50476918b55748b336c98aa8fc54fcfb7dd1 Mon Sep 17 00:00:00 2001 From: Chengpeng Yan <41809508+Reminiscent@users.noreply.github.com> Date: Wed, 17 Dec 2025 13:48:13 +0800 Subject: [PATCH] analyze: Hash MCV candidate lookups in compute_distinct_stats compute_distinct_stats() tracks possible MCVs for datatypes that have an equality operator but no ordering. Finding a match currently requires a linear scan of the tracking array for every sampled row, which can become very expensive when statistics targets are set high. When the tracking array has at least ANALYZE_HASH_THRESHOLD (200) slots and the type's default hash support matches the equality operator, maintain a simplehash table that maps a tracked value to its current track[] slot. This reduces match lookups from O(n) to O(1) on average while keeping the existing linear path as a fallback. Moving entries within track[] (bubbling a matched value up, or shifting entries to insert a new one) still touches each moved entry, whose slot number is then refreshed in the hash table. Add a regression test exercising the hashed path. --- src/backend/commands/analyze.c | 178 ++++++++++++++++-- .../expected/analyze_distinct_hash.out | 55 ++++++ src/test/regress/parallel_schedule | 2 +- .../regress/sql/analyze_distinct_hash.sql | 52 +++++ 4 files changed, 274 insertions(+), 13 deletions(-) create mode 100644 src/test/regress/expected/analyze_distinct_hash.out create mode 100644 src/test/regress/sql/analyze_distinct_hash.sql diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 5e2a7a8234ec8..44090a5923cb4 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -54,6 +54,7 @@ #include "utils/sortsupport.h" #include "utils/syscache.h" #include "utils/timestamp.h" +#include "utils/typcache.h" /* Per-index data for ANALYZE */ @@ -1888,6 +1889,70 @@ static int analyze_mcv_list(int *mcv_counts, int samplerows, double totalrows); +#define ANALYZE_HASH_THRESHOLD 200 + +typedef struct DistinctHashEntry +{ + Datum value; + int index; + uint32 hash; + char status; +} DistinctHashEntry; + +typedef struct DistinctHashContext +{ + FmgrInfo *cmpfunc; + FmgrInfo *hashfunc; + Oid collation; +} 
DistinctHashContext; + +typedef struct DistinctHash_hash DistinctHash_hash; + +static uint32 distinct_hash_hash(DistinctHash_hash *tab, Datum key); +static bool distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1); + +#define SH_PREFIX DistinctHash +#define SH_ELEMENT_TYPE DistinctHashEntry +#define SH_KEY_TYPE Datum +#define SH_KEY value +#define SH_HASH_KEY(tab, key) distinct_hash_hash(tab, key) +#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tab, ent) ((ent)->hash) +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +static uint32 +distinct_hash_hash(DistinctHash_hash *tab, Datum key) +{ + DistinctHashContext *context = (DistinctHashContext *) tab->private_data; + Datum result; + + result = FunctionCall1Coll(context->hashfunc, context->collation, key); + return DatumGetUInt32(result); +} + +static bool +distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1) +{ + DistinctHashContext *context = (DistinctHashContext *) tab->private_data; + Datum result; + + result = FunctionCall2Coll(context->cmpfunc, context->collation, key0, key1); + return DatumGetBool(result); +} + +static inline void +distinct_hash_set_index(DistinctHash_hash *hash, Datum value, uint32 value_hash, + int index) +{ + DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash); + + if (entry != NULL) + entry->index = index; +} /* * std_typanalyze -- the default type-specific typanalyze function @@ -2076,15 +2141,21 @@ compute_distinct_stats(VacAttrStatsP stats, bool is_varwidth = (!stats->attrtype->typbyval && stats->attrtype->typlen < 0); FmgrInfo f_cmpeq; + TypeCacheEntry *typentry; typedef struct { Datum value; int count; + uint32 hash; } TrackItem; TrackItem *track; int track_cnt, track_max; int num_mcv = stats->attstattarget; + int firstcount1 = 0; + bool use_hash; + DistinctHashContext hash_context; + DistinctHash_hash *track_hash 
= NULL; StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data; /* @@ -2097,14 +2168,34 @@ compute_distinct_stats(VacAttrStatsP stats, track_cnt = 0; fmgr_info(mystats->eqfunc, &f_cmpeq); + typentry = lookup_type_cache(stats->attrtypid, + TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR); + + /* + * For sufficiently large statistics targets, use a hash table to avoid + * repeated linear searches of the track[] array, but only when we can use + * the type's default hash support that matches the equality operator. + */ + use_hash = (track_max >= ANALYZE_HASH_THRESHOLD && + OidIsValid(mystats->eqfunc) && + mystats->eqopr == typentry->eq_opr && + OidIsValid(typentry->hash_proc)); + if (use_hash) + { + hash_context.cmpfunc = &f_cmpeq; + hash_context.hashfunc = &typentry->hash_proc_finfo; + hash_context.collation = stats->attrcollid; + track_hash = DistinctHash_create(CurrentMemoryContext, + track_max, &hash_context); + } for (i = 0; i < samplerows; i++) { Datum value; bool isnull; bool match; - int firstcount1, - j; + int j = 0; + uint32 value_hash = 0; vacuum_delay_point(true); @@ -2151,19 +2242,35 @@ compute_distinct_stats(VacAttrStatsP stats, /* * See if the value matches anything we're already tracking. 
*/ - match = false; - firstcount1 = track_cnt; - for (j = 0; j < track_cnt; j++) + if (use_hash) + { + DistinctHashEntry *entry; + + value_hash = distinct_hash_hash(track_hash, value); + entry = DistinctHash_lookup_hash(track_hash, value, value_hash); + match = (entry != NULL); + if (match) + j = entry->index; + } + else { - if (DatumGetBool(FunctionCall2Coll(&f_cmpeq, - stats->attrcollid, - value, track[j].value))) + int firstcount1_local = track_cnt; + + match = false; + for (j = 0; j < track_cnt; j++) { - match = true; - break; + if (DatumGetBool(FunctionCall2Coll(&f_cmpeq, + stats->attrcollid, + value, track[j].value))) + { + match = true; + break; + } + if (j < firstcount1_local && track[j].count == 1) + firstcount1_local = j; } - if (j < firstcount1 && track[j].count == 1) - firstcount1 = j; + + firstcount1 = firstcount1_local; } if (match) @@ -2175,23 +2282,70 @@ compute_distinct_stats(VacAttrStatsP stats, { swapDatum(track[j].value, track[j - 1].value); swapInt(track[j].count, track[j - 1].count); + if (use_hash) + { + uint32 tmp; + + tmp = track[j].hash; + track[j].hash = track[j - 1].hash; + track[j - 1].hash = tmp; + distinct_hash_set_index(track_hash, track[j].value, + track[j].hash, j); + distinct_hash_set_index(track_hash, track[j - 1].value, + track[j - 1].hash, j - 1); + } j--; } + while (use_hash && firstcount1 < track_cnt && + track[firstcount1].count > 1) + firstcount1++; } else { /* No match. 
Insert at head of count-1 list */ if (track_cnt < track_max) track_cnt++; + else if (use_hash && firstcount1 >= track_cnt) + continue; + else if (use_hash) + { + DistinctHashEntry *delentry; + + delentry = DistinctHash_lookup_hash(track_hash, + track[track_cnt - 1].value, + track[track_cnt - 1].hash); + Assert(delentry != NULL); + if (delentry != NULL) + DistinctHash_delete_item(track_hash, delentry); + else + DistinctHash_delete(track_hash, track[track_cnt - 1].value); + } for (j = track_cnt - 1; j > firstcount1; j--) { track[j].value = track[j - 1].value; track[j].count = track[j - 1].count; + if (use_hash) + { + track[j].hash = track[j - 1].hash; + distinct_hash_set_index(track_hash, track[j].value, + track[j].hash, j); + } } if (firstcount1 < track_cnt) { track[firstcount1].value = value; track[firstcount1].count = 1; + if (use_hash) + { + bool found_hash; + DistinctHashEntry *entry; + + track[firstcount1].hash = value_hash; + entry = DistinctHash_insert_hash(track_hash, value, value_hash, + &found_hash); + Assert(!found_hash); + entry->index = firstcount1; + } } } } diff --git a/src/test/regress/expected/analyze_distinct_hash.out b/src/test/regress/expected/analyze_distinct_hash.out new file mode 100644 index 0000000000000..9f92083f6d6e7 --- /dev/null +++ b/src/test/regress/expected/analyze_distinct_hash.out @@ -0,0 +1,55 @@ +-- +-- Exercise compute_distinct_stats() when hashable types allow hashed lookups. +-- +SET client_min_messages TO WARNING; +-- +-- Case 1: all values are distinct. This forces the track[] array to fill +-- and then exercise the "drop tail item" path repeatedly. 
+-- +DROP TABLE IF EXISTS analyze_distinct_hash_unique; +CREATE TABLE analyze_distinct_hash_unique (x xid); +ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100; +INSERT INTO analyze_distinct_hash_unique +SELECT i::text::xid FROM generate_series(1, 300) i; +ANALYZE analyze_distinct_hash_unique; +WITH m AS MATERIALIZED ( + SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv + FROM pg_stats + WHERE schemaname = 'public' + AND tablename = 'analyze_distinct_hash_unique' + AND attname = 'x' +) +SELECT array_length(mcv, 1) AS mcv_len, + mcv[1] AS mcv_first, + mcv[100] AS mcv_100th +FROM m; + mcv_len | mcv_first | mcv_100th +---------+-----------+----------- + 100 | 300 | 201 +(1 row) + +-- +-- Case 2: bubble-up during repeated matches, exercising swaps while keeping +-- hashed indexes in sync. +-- +DROP TABLE IF EXISTS analyze_distinct_hash_bubble; +CREATE TABLE analyze_distinct_hash_bubble (x xid); +ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100; +INSERT INTO analyze_distinct_hash_bubble +SELECT i::text::xid FROM generate_series(1, 10) i; +INSERT INTO analyze_distinct_hash_bubble +SELECT '1'::xid FROM generate_series(1, 20); +ANALYZE analyze_distinct_hash_bubble; +SELECT most_common_vals::text +FROM pg_stats +WHERE schemaname = 'public' + AND tablename = 'analyze_distinct_hash_bubble' + AND attname = 'x'; + most_common_vals +------------------------ + {1,10,9,8,7,6,5,4,3,2} +(1 row) + +DROP TABLE analyze_distinct_hash_unique; +DROP TABLE analyze_distinct_hash_bubble; +RESET client_min_messages; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 905f9bca95987..d7655b1ea8507 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t # geometry depends on point, lseg, line, box, path, polygon, circle # horology depends on date, 
time, timetz, timestamp, timestamptz, interval # ---------- -test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies +test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import analyze_distinct_hash pg_ndistinct pg_dependencies # ---------- # Load huge amounts of data diff --git a/src/test/regress/sql/analyze_distinct_hash.sql b/src/test/regress/sql/analyze_distinct_hash.sql new file mode 100644 index 0000000000000..06e2e273a271d --- /dev/null +++ b/src/test/regress/sql/analyze_distinct_hash.sql @@ -0,0 +1,52 @@ +-- +-- Exercise compute_distinct_stats() when hashable types allow hashed lookups. +-- + +SET client_min_messages TO WARNING; + +-- +-- Case 1: all values are distinct. This forces the track[] array to fill +-- and then exercise the "drop tail item" path repeatedly. +-- +DROP TABLE IF EXISTS analyze_distinct_hash_unique; +CREATE TABLE analyze_distinct_hash_unique (x xid); +ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100; +INSERT INTO analyze_distinct_hash_unique +SELECT i::text::xid FROM generate_series(1, 300) i; +ANALYZE analyze_distinct_hash_unique; + +WITH m AS MATERIALIZED ( + SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv + FROM pg_stats + WHERE schemaname = 'public' + AND tablename = 'analyze_distinct_hash_unique' + AND attname = 'x' +) +SELECT array_length(mcv, 1) AS mcv_len, + mcv[1] AS mcv_first, + mcv[100] AS mcv_100th +FROM m; + +-- +-- Case 2: bubble-up during repeated matches, exercising swaps while keeping +-- hashed indexes in sync. 
+-- +DROP TABLE IF EXISTS analyze_distinct_hash_bubble; +CREATE TABLE analyze_distinct_hash_bubble (x xid); +ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100; +INSERT INTO analyze_distinct_hash_bubble +SELECT i::text::xid FROM generate_series(1, 10) i; +INSERT INTO analyze_distinct_hash_bubble +SELECT '1'::xid FROM generate_series(1, 20); +ANALYZE analyze_distinct_hash_bubble; + +SELECT most_common_vals::text +FROM pg_stats +WHERE schemaname = 'public' + AND tablename = 'analyze_distinct_hash_bubble' + AND attname = 'x'; + +DROP TABLE analyze_distinct_hash_unique; +DROP TABLE analyze_distinct_hash_bubble; + +RESET client_min_messages;