From 4335990e7f04a2cc80c4a98c90b28dd95cafb0d1 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 8 Feb 2023 13:09:27 +0900 Subject: [PATCH 01/61] Make EXEC_BACKEND more convenient on Linux and FreeBSD. Try to disable ASLR when building in EXEC_BACKEND mode, to avoid random memory mapping failures while testing. For developer use only, no effect on regular builds. This has been originally applied as of f3e7806 for v15~, but recently-added buildfarm member gokiburi tests this configuration on older branches as well, causing it to fail randomly as ASLR would be enabled. Suggested-by: Andres Freund Tested-by: Bossart, Nathan Discussion: https://postgr.es/m/20210806032944.m4tz7j2w47mant26%40alap3.anarazel.de Backpatch-through: 12 --- configure | 2 +- configure.ac | 1 + src/bin/pg_ctl/pg_ctl.c | 4 ++++ src/common/exec.c | 33 +++++++++++++++++++++++++++++++++ src/include/pg_config.h.in | 3 +++ src/include/port.h | 5 +++++ src/test/regress/pg_regress.c | 4 ++++ src/tools/msvc/Solution.pm | 1 + 8 files changed, 52 insertions(+), 1 deletion(-) diff --git a/configure b/configure index 9c946381f6b..3228096e38e 100755 --- a/configure +++ b/configure @@ -16895,7 +16895,7 @@ $as_echo "#define HAVE_STDBOOL_H 1" >>confdefs.h fi -for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h langinfo.h mbarrier.h poll.h sys/epoll.h sys/event.h sys/ipc.h sys/prctl.h sys/procctl.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/signalfd.h sys/sockio.h sys/tas.h sys/uio.h sys/un.h termios.h ucred.h wctype.h +for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h langinfo.h mbarrier.h poll.h sys/epoll.h sys/event.h sys/ipc.h sys/personality.h sys/prctl.h sys/procctl.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/signalfd.h sys/sockio.h sys/tas.h sys/uio.h sys/un.h termios.h ucred.h wctype.h do : as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" diff --git a/configure.ac b/configure.ac index bcc5cf6c6ab..71c27d57691 100644 --- a/configure.ac +++ b/configure.ac @@ -1904,6 +1904,7 @@ AC_CHECK_HEADERS(m4_normalize([ sys/epoll.h sys/event.h sys/ipc.h + sys/personality.h sys/prctl.h sys/procctl.h sys/pstat.h diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index cc733cb7be1..26d62e823cf 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -466,6 +466,10 @@ start_postmaster(void) fflush(stdout); fflush(stderr); +#ifdef EXEC_BACKEND + pg_disable_aslr(); +#endif + pm_pid = fork(); if (pm_pid < 0) { diff --git a/src/common/exec.c b/src/common/exec.c index 5159b616a39..dbac0598be0 100644 --- a/src/common/exec.c +++ b/src/common/exec.c @@ -27,6 +27,14 @@ #include "common/mdb_locale.h" +#ifdef EXEC_BACKEND +#if defined(HAVE_SYS_PERSONALITY_H) +#include +#elif defined(HAVE_SYS_PROCCTL_H) +#include +#endif +#endif + /* Inhibit mingw CRT's auto-globbing of command line arguments */ #if defined(WIN32) && !defined(_MSC_VER) extern int _CRT_glob = 0; /* 0 turns off globbing; 1 turns it on */ @@ -477,6 +485,31 @@ set_pglocale_pgservice(const char *argv0, const char *app) } } +#ifdef EXEC_BACKEND +/* + * For the benefit of PostgreSQL developers testing EXEC_BACKEND on Unix + * systems (code paths normally exercised only on Windows), provide a way to + * disable address space layout randomization, if we know how on this platform. + * Otherwise, backends may fail to attach to shared memory at the fixed address + * chosen by the postmaster. 
(See also the macOS-specific hack in + * sysv_shmem.c.) + */ +int +pg_disable_aslr(void) +{ +#if defined(HAVE_SYS_PERSONALITY_H) + return personality(ADDR_NO_RANDOMIZE); +#elif defined(HAVE_SYS_PROCCTL_H) && defined(PROC_ASLR_FORCE_DISABLE) + int data = PROC_ASLR_FORCE_DISABLE; + + return procctl(P_PID, 0, PROC_ASLR_CTL, &data); +#else + errno = ENOSYS; + return -1; +#endif +} +#endif + #ifdef WIN32 /* diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 54de6844f58..d02229909e9 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -699,6 +699,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_SYS_IPC_H +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_PERSONALITY_H + /* Define to 1 if you have the header file. */ #undef HAVE_SYS_PRCTL_H diff --git a/src/include/port.h b/src/include/port.h index 4179df36ebc..5aecc883f63 100644 --- a/src/include/port.h +++ b/src/include/port.h @@ -141,6 +141,11 @@ extern char *pipe_read_line(char *cmd, char *line, int maxsize); #define PG_VERSIONSTR "postgres (Apache Cloudberry) " PG_VERSION "\n" #define PG_BACKEND_VERSIONSTR "postgres (Apache Cloudberry) " PG_VERSION "\n" +#ifdef EXEC_BACKEND +/* Disable ASLR before exec, for developer builds only (in exec.c) */ +extern int pg_disable_aslr(void); +#endif + #if defined(WIN32) || defined(__CYGWIN__) #define EXE ".exe" diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index c479142222e..c9921aaf5ce 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -1625,6 +1625,10 @@ spawn_process(const char *cmdline) if (logfile) fflush(logfile); +#ifdef EXEC_BACKEND + pg_disable_aslr(); +#endif + pid = fork(); if (pid == -1) { diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index e7af4162759..b6393b41773 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -398,6 +398,7 @@ sub GenerateFiles HAVE_SYS_EPOLL_H => undef, HAVE_SYS_EVENT_H => undef, HAVE_SYS_IPC_H => undef, + HAVE_SYS_PERSONALITY_H => undef, HAVE_SYS_PRCTL_H => undef, HAVE_SYS_PROCCTL_H => undef, HAVE_SYS_PSTAT_H => undef, From 976fdea0fecf00c751e86deadc98446af02c2889 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 8 Feb 2023 17:15:23 -0500 Subject: [PATCH 02/61] Stop recommending auto-download of DTD files, and indeed disable it. It appears no longer possible to build the SGML docs without a local installation of the DocBook DTD, because sourceforge.net now only permits HTTPS access, and no common version of xsltproc supports that. Hence, remove the bits of our documentation suggesting that that's possible or useful. In fact, we might as well add the --nonet option to the build recipes automatically, for a bit of extra security. Also fix our documentation-tool-installation recipes for macOS to ensure that xmllint and xsltproc are pulled in from MacPorts or Homebrew. The previous recipes assumed you could use the Apple-supplied versions of these tools; which still works, except that you'd need to set an environment variable to ensure that they would find DTD files provided by those package managers. Simpler and easier to just recommend pulling in the additional packages. In HEAD, also document how to build docs using Meson, and adjust "ninja docs" to just build the HTML docs, for consistency with the default behavior of doc/src/sgml/Makefile. In a fit of neatnik-ism, I also made the ordering of the package lists match the order in which the tools are described at the head of the appendix. 
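For illustration only (the exact command lines are assembled from
Makefile variables), both tools are now always run with the option,
roughly:

    xmllint --nonet --path . ...
    xsltproc --nonet --path . ...

so the DTD and stylesheets must resolve through the local XML catalog
rather than being fetched over the network.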
Aleksander Alekseev, Peter Eisentraut, Tom Lane Discussion: https://postgr.es/m/CAJ7c6TO8Aro2nxg=EQsVGiSDe-TstP4EsSvDHd7DSRsP40PgGA@mail.gmail.com --- doc/src/sgml/Makefile | 8 ++++-- doc/src/sgml/docguide.sgml | 55 ++++++++++++++++++------------------ doc/src/sgml/images/Makefile | 2 +- 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/doc/src/sgml/Makefile b/doc/src/sgml/Makefile index d10f9e5b398..58ae606bccf 100644 --- a/doc/src/sgml/Makefile +++ b/doc/src/sgml/Makefile @@ -44,11 +44,15 @@ endif XMLINCLUDE = --path . -ifndef XMLLINT +ifdef XMLLINT +XMLLINT := $(XMLLINT) --nonet +else XMLLINT = $(missing) xmllint endif -ifndef XSLTPROC +ifdef XSLTPROC +XSLTPROC := $(XSLTPROC) --nonet +else XSLTPROC = $(missing) xsltproc endif diff --git a/doc/src/sgml/docguide.sgml b/doc/src/sgml/docguide.sgml index e1bac68604f..55ef6417749 100644 --- a/doc/src/sgml/docguide.sgml +++ b/doc/src/sgml/docguide.sgml @@ -136,6 +136,7 @@ This is a program for converting, among other things, XML to PDF. + It is needed only if you want to build the documentation in PDF format. @@ -151,25 +152,13 @@ here. - - You can get away with not installing DocBook XML and the DocBook XSLT - stylesheets locally, because the required files will be downloaded from the - Internet and cached locally. This may in fact be the preferred solution if - your operating system packages provide only an old version of these files, - or if no packages are available at all. - If you want to prevent any attempt to access the Internet while building - the documentation, you need to pass the option - to xmllint and xsltproc; see below - for an example. - - Installation on Fedora, RHEL, and Derivatives To install the required packages, use: -yum install docbook-dtds docbook-style-xsl fop libxslt +yum install docbook-dtds docbook-style-xsl libxslt fop @@ -180,7 +169,7 @@ yum install docbook-dtds docbook-style-xsl fop libxslt To install the required packages with pkg, use: -pkg install docbook-xml docbook-xsl fop libxslt +pkg install docbook-xml docbook-xsl libxslt fop @@ -199,7 +188,7 @@ pkg install docbook-xml docbook-xsl fop libxslt available for Debian GNU/Linux. To install, simply use: -apt-get install docbook-xml docbook-xsl fop libxml2-utils xsltproc +apt-get install docbook-xml docbook-xsl libxml2-utils xsltproc fop @@ -208,21 +197,37 @@ apt-get install docbook-xml docbook-xsl fop libxml2-utils xsltproc macOS - On macOS, you can build the HTML and man documentation without installing - anything extra. If you want to build PDFs or want to install a local copy - of DocBook, you can get those from your preferred package manager. + If you use MacPorts, the following will get you set up: + +sudo port install docbook-xml docbook-xsl-nons libxslt fop + + If you use Homebrew, use this: + +brew install docbook docbook-xsl libxslt fop + - If you use MacPorts, the following will get you set up: + The Homebrew-supplied programs require the following environment variable + to be set: -sudo port install docbook-xml-4.5 docbook-xsl fop +export XML_CATALOG_FILES=/usr/local/etc/xml/catalog - If you use Homebrew, use this: + Without it, xsltproc will throw errors like this: -brew install docbook docbook-xsl fop +I/O error : Attempt to load network entity http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd +postgres.sgml:21: warning: failed to load external entity "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" +... 
+ + + While it is possible to use the Apple-provided versions + of xmllint and xsltproc + instead of those from MacPorts or Homebrew, you'll still need + to install the DocBook DTD and stylesheets, and set up a catalog + file that points to them. + @@ -253,12 +258,6 @@ checking for dbtoepub... dbtoepub these programs, for example ./configure ... XMLLINT=/opt/local/bin/xmllint ... - - Also, if you want to ensure that xmllint - and xsltproc will not perform any network access, - you can do something like - -./configure ... XMLLINT="xmllint --nonet" XSLTPROC="xsltproc --nonet" ... diff --git a/doc/src/sgml/images/Makefile b/doc/src/sgml/images/Makefile index f9e356348b2..645519095d0 100644 --- a/doc/src/sgml/images/Makefile +++ b/doc/src/sgml/images/Makefile @@ -9,7 +9,7 @@ ALL_IMAGES = \ DITAA = ditaa DOT = dot -XSLTPROC = xsltproc +XSLTPROC = xsltproc --nonet all: $(ALL_IMAGES) From 32c8bec45d8dc8a5d22fc8b10d64e81afadb2f45 Mon Sep 17 00:00:00 2001 From: David Rowley Date: Mon, 13 Feb 2023 17:09:55 +1300 Subject: [PATCH 03/61] Disable WindowAgg inverse transitions when subplans are present When an aggregate function is used as a WindowFunc and a tuple transitions out of the window frame, we ordinarily try to make use of the aggregate function's inverse transition function to "unaggregate" the exiting tuple. This optimization is disabled for various cases, including when the aggregate contains a volatile function. In such a case we'd be unable to ensure that the transition value was calculated to the same value during transitions and inverse transitions. Unfortunately, we did this check by calling contain_volatile_functions() which does not recursively search SubPlans for volatile functions. If the aggregate function's arguments or its FILTER clause contained a subplan with volatile functions then we'd fail to notice this. Here we fix this by just disabling the optimization when the WindowFunc contains any subplans. Volatile functions are not the only reason that a subplan may have nonrepeatable results. Bug: #17777 Reported-by: Anban Company Discussion: https://postgr.es/m/17777-860b739b6efde977%40postgresql.org Reviewed-by: Tom Lane Backpatch-through: 11 --- src/backend/executor/nodeWindowAgg.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/nodeWindowAgg.c b/src/backend/executor/nodeWindowAgg.c index 611fcf94a89..f36a46cc79f 100644 --- a/src/backend/executor/nodeWindowAgg.c +++ b/src/backend/executor/nodeWindowAgg.c @@ -43,6 +43,8 @@ #include "nodes/execnodes.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" +#include "optimizer/optimizer.h" #include "parser/parse_agg.h" #include "parser/parse_coerce.h" #include "parser/parse_oper.h" @@ -3032,16 +3034,24 @@ initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc, * aggregate's arguments (and FILTER clause if any) contain any calls to * volatile functions. Otherwise, the difference between restarting and * not restarting the aggregation would be user-visible. + * + * We also don't risk using moving aggregates when there are subplans in + * the arguments or FILTER clause. This is partly because + * contain_volatile_functions() doesn't look inside subplans; but there + * are other reasons why a subplan's output might be volatile. For + * example, syncscan mode can render the results nonrepeatable. 
*/ if (!OidIsValid(aggform->aggminvtransfn)) use_ma_code = false; /* sine qua non */ else if (aggform->aggmfinalmodify == AGGMODIFY_READ_ONLY && - aggform->aggfinalmodify != AGGMODIFY_READ_ONLY) + aggform->aggfinalmodify != AGGMODIFY_READ_ONLY) use_ma_code = true; /* decision forced by safety */ else if (winstate->frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) use_ma_code = false; /* non-moving frame head */ else if (contain_volatile_functions((Node *) wfunc)) use_ma_code = false; /* avoid possible behavioral change */ + else if (contain_subplans((Node *) wfunc)) + use_ma_code = false; /* subplans might contain volatile functions */ else use_ma_code = true; /* yes, let's use it */ if (use_ma_code) From 6b126f2862e4e61911013f37e3e9f79713c2f142 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 15 Feb 2023 10:12:33 +0900 Subject: [PATCH 04/61] Fix handling of SCRAM-SHA-256's channel binding with RSA-PSS certificates OpenSSL 1.1.1 and newer versions have added support for RSA-PSS certificates, which requires the use of a specific routine in OpenSSL to determine which hash function to use when compiling it when using channel binding in SCRAM-SHA-256. X509_get_signature_nid(), that is the original routine the channel binding code has relied on, is not able to determine which hash algorithm to use for such certificates. However, X509_get_signature_info(), new to OpenSSL 1.1.1, is able to do it. This commit switches the channel binding logic to rely on X509_get_signature_info() over X509_get_signature_nid(), which would be the choice when building with 1.1.1 or newer. The error could have been triggered on the client or the server, hence libpq and the backend need to have their related code paths patched. Note that attempting to load an RSA-PSS certificate with OpenSSL 1.1.0 or older leads to a failure due to an unsupported algorithm. The discovery of relying on X509_get_signature_info() comes from Jacob, the tests have been written by Heikki (with few tweaks from me), while I have bundled the whole together while adding the bits needed for MSVC and meson. This issue exists since channel binding exists, so backpatch all the way down. Some tests are added in 15~, triggered if compiling with OpenSSL 1.1.1 or newer, where the certificate and key files can easily be generated for RSA-PSS. Reported-by: Gunnar "Nick" Bluth Author: Jacob Champion, Heikki Linnakangas Discussion: https://postgr.es/m/17760-b6c61e752ec07060@postgresql.org Backpatch-through: 11 --- configure | 12 ++++++++++++ configure.ac | 2 ++ src/backend/libpq/be-secure-openssl.c | 9 +++++++-- src/include/libpq/libpq-be.h | 2 +- src/include/pg_config.h.in | 3 +++ src/interfaces/libpq/fe-secure-openssl.c | 9 +++++++-- src/interfaces/libpq/libpq-int.h | 2 +- src/tools/msvc/Solution.pm | 10 +++++++++- 8 files changed, 42 insertions(+), 7 deletions(-) diff --git a/configure b/configure index 3228096e38e..99a9961f025 100755 --- a/configure +++ b/configure @@ -15802,6 +15802,18 @@ if test "x$ac_cv_func_CRYPTO_lock" = xyes; then : #define HAVE_CRYPTO_LOCK 1 _ACEOF +fi +done + + # Function introduced in OpenSSL 1.1.1. 
+ for ac_func in X509_get_signature_info +do : + ac_fn_c_check_func "$LINENO" "X509_get_signature_info" "ac_cv_func_X509_get_signature_info" +if test "x$ac_cv_func_X509_get_signature_info" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_X509_GET_SIGNATURE_INFO 1 +_ACEOF + fi done diff --git a/configure.ac b/configure.ac index 71c27d57691..a50ca5710c4 100644 --- a/configure.ac +++ b/configure.ac @@ -1741,6 +1741,8 @@ if test "$with_ssl" = openssl ; then # thread-safety. In 1.1.0, it's no longer required, and CRYPTO_lock() # function was removed. AC_CHECK_FUNCS([CRYPTO_lock]) + # Function introduced in OpenSSL 1.1.1. + AC_CHECK_FUNCS([X509_get_signature_info]) AC_DEFINE([USE_OPENSSL], 1, [Define to 1 to build with OpenSSL support. (--with-ssl=openssl)]) elif test "$with_ssl" != no ; then AC_MSG_ERROR([--with-ssl must specify openssl]) diff --git a/src/backend/libpq/be-secure-openssl.c b/src/backend/libpq/be-secure-openssl.c index c48812f955a..e39952494e6 100644 --- a/src/backend/libpq/be-secure-openssl.c +++ b/src/backend/libpq/be-secure-openssl.c @@ -1308,7 +1308,7 @@ be_tls_get_peer_serial(Port *port, char *ptr, size_t len) ptr[0] = '\0'; } -#ifdef HAVE_X509_GET_SIGNATURE_NID +#if defined(HAVE_X509_GET_SIGNATURE_NID) || defined(HAVE_X509_GET_SIGNATURE_INFO) char * be_tls_get_certificate_hash(Port *port, size_t *len) { @@ -1326,10 +1326,15 @@ be_tls_get_certificate_hash(Port *port, size_t *len) /* * Get the signature algorithm of the certificate to determine the hash - * algorithm to use for the result. + * algorithm to use for the result. Prefer X509_get_signature_info(), + * introduced in OpenSSL 1.1.1, which can handle RSA-PSS signatures. */ +#if HAVE_X509_GET_SIGNATURE_INFO + if (!X509_get_signature_info(server_cert, &algo_nid, NULL, NULL, NULL)) +#else if (!OBJ_find_sigid_algs(X509_get_signature_nid(server_cert), &algo_nid, NULL)) +#endif elog(ERROR, "could not determine server certificate signature algorithm"); /* diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index de4883f4f69..8d2258aea44 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -307,7 +307,7 @@ extern void be_tls_get_peer_serial(Port *port, char *ptr, size_t len); * This is not supported with old versions of OpenSSL that don't have * the X509_get_signature_nid() function. */ -#if defined(USE_OPENSSL) && defined(HAVE_X509_GET_SIGNATURE_NID) +#if defined(USE_OPENSSL) && (defined(HAVE_X509_GET_SIGNATURE_NID) || defined(HAVE_X509_GET_SIGNATURE_INFO)) #define HAVE_BE_TLS_GET_CERTIFICATE_HASH extern char *be_tls_get_certificate_hash(Port *port, size_t *len); #endif diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d02229909e9..2a06438113b 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -807,6 +807,9 @@ /* Define to 1 if you have the `writev' function. */ #undef HAVE_WRITEV +/* Define to 1 if you have the `X509_get_signature_info' function. */ +#undef HAVE_X509_GET_SIGNATURE_INFO + /* Define to 1 if you have the `X509_get_signature_nid' function. 
*/ #undef HAVE_X509_GET_SIGNATURE_NID diff --git a/src/interfaces/libpq/fe-secure-openssl.c b/src/interfaces/libpq/fe-secure-openssl.c index c0988e10a30..d75a823b880 100644 --- a/src/interfaces/libpq/fe-secure-openssl.c +++ b/src/interfaces/libpq/fe-secure-openssl.c @@ -378,7 +378,7 @@ pgtls_write(PGconn *conn, const void *ptr, size_t len) return n; } -#ifdef HAVE_X509_GET_SIGNATURE_NID +#if defined(HAVE_X509_GET_SIGNATURE_NID) || defined(HAVE_X509_GET_SIGNATURE_INFO) char * pgtls_get_peer_certificate_hash(PGconn *conn, size_t *len) { @@ -398,10 +398,15 @@ pgtls_get_peer_certificate_hash(PGconn *conn, size_t *len) /* * Get the signature algorithm of the certificate to determine the hash - * algorithm to use for the result. + * algorithm to use for the result. Prefer X509_get_signature_info(), + * introduced in OpenSSL 1.1.1, which can handle RSA-PSS signatures. */ +#if HAVE_X509_GET_SIGNATURE_INFO + if (!X509_get_signature_info(peer_cert, &algo_nid, NULL, NULL, NULL)) +#else if (!OBJ_find_sigid_algs(X509_get_signature_nid(peer_cert), &algo_nid, NULL)) +#endif { appendPQExpBufferStr(&conn->errorMessage, libpq_gettext("could not determine server certificate signature algorithm\n")); diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h index 0cbd611bd98..27321d262cf 100644 --- a/src/interfaces/libpq/libpq-int.h +++ b/src/interfaces/libpq/libpq-int.h @@ -869,7 +869,7 @@ extern ssize_t pgtls_write(PGconn *conn, const void *ptr, size_t len); * This is not supported with old versions of OpenSSL that don't have * the X509_get_signature_nid() function. */ -#if defined(USE_OPENSSL) && defined(HAVE_X509_GET_SIGNATURE_NID) +#if defined(USE_OPENSSL) && (defined(HAVE_X509_GET_SIGNATURE_NID) || defined(HAVE_X509_GET_SIGNATURE_INFO)) #define HAVE_PGTLS_GET_PEER_CERTIFICATE_HASH extern char *pgtls_get_peer_certificate_hash(PGconn *conn, size_t *len); #endif diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index b6393b41773..2b9a77878a4 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -433,6 +433,7 @@ sub GenerateFiles HAVE_WCTYPE_H => 1, HAVE_WRITEV => undef, HAVE_X509_GET_SIGNATURE_NID => 1, + HAVE_X509_GET_SIGNATURE_INFO => undef, HAVE_X86_64_POPCNTQ => undef, HAVE__BOOL => undef, HAVE__BUILTIN_BSWAP16 => undef, @@ -548,7 +549,14 @@ sub GenerateFiles my ($digit1, $digit2, $digit3) = $self->GetOpenSSLVersion(); - # More symbols are needed with OpenSSL 1.1.0 and above. + # Symbols needed with OpenSSL 1.1.1 and above. + if ( ($digit1 >= '3' && $digit2 >= '0' && $digit3 >= '0') + || ($digit1 >= '1' && $digit2 >= '1' && $digit3 >= '1')) + { + $define{HAVE_X509_GET_SIGNATURE_INFO} = 1; + } + + # Symbols needed with OpenSSL 1.1.0 and above. if ( ($digit1 >= '3' && $digit2 >= '0' && $digit3 >= '0') || ($digit1 >= '1' && $digit2 >= '1' && $digit3 >= '0')) { From 1cdbdfce2d0005e89dd75921726130ee4a00e33d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 17 Feb 2023 16:40:34 -0500 Subject: [PATCH 05/61] Print the correct aliases for DML target tables in ruleutils. ruleutils.c blindly printed the user-given alias (or nothing if there hadn't been one) for the target table of INSERT/UPDATE/DELETE queries. That works a large percentage of the time, but not always: for queries appearing in WITH, it's possible that we chose a different alias to avoid conflict with outer-scope names. 
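For example (distilled from the new regression test below), given a rule
action like

    with wins as (insert into int4_tbl as trgt values (0) returning *)
    insert into rules_log as trgt select old.* from wins
    returning trgt.f1, trgt.f2;

set_rtable_names has to pick a different alias, such as "trgt_1", for
the inner INSERT's target when deparsing.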
Since the chosen alias would be used in any Var references to the target table, this'd lead to an inconsistent printout with consequences such as dump/restore failures. The correct logic for printing (or not) a relation alias was embedded in get_from_clause_item. Factor it out to a separate function so that we don't need a jointree node to use it. (Only a limited part of that function can be reached from these new call sites, but this seems like the cleanest non-duplicative factorization.) In passing, I got rid of a redundant "\d+ rules_src" step in rules.sql. Initial report from Jonathan Katz; thanks to Vignesh C for analysis. This has been broken for a long time, so back-patch to all supported branches. Discussion: https://postgr.es/m/e947fa21-24b2-f922-375a-d4f763ef3e4b@postgresql.org Discussion: https://postgr.es/m/CALDaNm1MMntjmT_NJGp-Z=xbF02qHGAyuSHfYHias3TqQbPF2w@mail.gmail.com --- src/backend/utils/adt/ruleutils.c | 145 ++++++++++++++++------------ src/test/regress/expected/rules.out | 47 +++++---- src/test/regress/sql/rules.sql | 13 ++- 3 files changed, 126 insertions(+), 79 deletions(-) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index bf663afb3c5..dbbb2a70a07 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -481,6 +481,8 @@ static void get_from_clause(Query *query, const char *prefix, deparse_context *context); static void get_from_clause_item(Node *jtnode, Query *query, deparse_context *context); +static void get_rte_alias(RangeTblEntry *rte, int varno, bool use_as, + deparse_context *context); static void get_column_alias_list(deparse_columns *colinfo, deparse_context *context); static void get_from_clause_coldeflist(RangeTblFunction *rtfunc, @@ -6661,12 +6663,14 @@ get_insert_query_def(Query *query, deparse_context *context, context->indentLevel += PRETTYINDENT_STD; appendStringInfoChar(buf, ' '); } - appendStringInfo(buf, "INSERT INTO %s ", + appendStringInfo(buf, "INSERT INTO %s", generate_relation_name(rte->relid, NIL)); - /* INSERT requires AS keyword for target alias */ - if (rte->alias != NULL) - appendStringInfo(buf, "AS %s ", - quote_identifier(rte->alias->aliasname)); + + /* Print the relation alias, if needed; INSERT requires explicit AS */ + get_rte_alias(rte, query->resultRelation, true, context); + + /* always want a space here */ + appendStringInfoChar(buf, ' '); /* * Add the insert-column-names list. 
Any indirection decoration needed on @@ -6848,9 +6852,10 @@ get_update_query_def(Query *query, deparse_context *context, appendStringInfo(buf, "UPDATE %s%s", only_marker(rte), generate_relation_name(rte->relid, NIL)); - if (rte->alias != NULL) - appendStringInfo(buf, " %s", - quote_identifier(rte->alias->aliasname)); + + /* Print the relation alias, if needed */ + get_rte_alias(rte, query->resultRelation, false, context); + appendStringInfoString(buf, " SET "); /* Deparse targetlist */ @@ -7056,9 +7061,9 @@ get_delete_query_def(Query *query, deparse_context *context, appendStringInfo(buf, "DELETE FROM %s%s", only_marker(rte), generate_relation_name(rte->relid, NIL)); - if (rte->alias != NULL) - appendStringInfo(buf, " %s", - quote_identifier(rte->alias->aliasname)); + + /* Print the relation alias, if needed */ + get_rte_alias(rte, query->resultRelation, false, context); /* Add the USING clause if given */ get_from_clause(query, " USING ", context); @@ -11231,10 +11236,8 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) { int varno = ((RangeTblRef *) jtnode)->rtindex; RangeTblEntry *rte = rt_fetch(varno, query->rtable); - char *refname = get_rtable_name(varno, context); deparse_columns *colinfo = deparse_columns_fetch(varno, dpns); RangeTblFunction *rtfunc1 = NULL; - bool printalias; if (rte->lateral) appendStringInfoString(buf, "LATERAL "); @@ -11382,54 +11385,7 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) } /* Print the relation alias, if needed */ - printalias = false; - if (rte->alias != NULL) - { - /* Always print alias if user provided one */ - printalias = true; - } - else if (colinfo->printaliases) - { - /* Always print alias if we need to print column aliases */ - printalias = true; - } - else if (rte->rtekind == RTE_RELATION) - { - /* - * No need to print alias if it's same as relation name (this - * would normally be the case, but not if set_rtable_names had to - * resolve a conflict). - */ - if (strcmp(refname, get_relation_name(rte->relid)) != 0) - printalias = true; - } - else if (rte->rtekind == RTE_FUNCTION || rte->rtekind == RTE_TABLEFUNCTION) - { - /* - * For a function RTE, always print alias. This covers possible - * renaming of the function and/or instability of the - * FigureColname rules for things that aren't simple functions. - * Note we'd need to force it anyway for the columndef list case. - */ - printalias = true; - } - else if (rte->rtekind == RTE_VALUES) - { - /* Alias is syntactically required for VALUES */ - printalias = true; - } - else if (rte->rtekind == RTE_CTE) - { - /* - * No need to print alias if it's same as CTE name (this would - * normally be the case, but not if set_rtable_names had to - * resolve a conflict). - */ - if (strcmp(refname, rte->ctename) != 0) - printalias = true; - } - if (printalias) - appendStringInfo(buf, " %s", quote_identifier(refname)); + get_rte_alias(rte, varno, false, context); /* Print the column definitions or aliases, if needed */ if (rtfunc1 && rtfunc1->funccolnames != NIL) @@ -11567,6 +11523,73 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) (int) nodeTag(jtnode)); } +/* + * get_rte_alias - print the relation's alias, if needed + * + * If printed, the alias is preceded by a space, or by " AS " if use_as is true. 
+ */ +static void +get_rte_alias(RangeTblEntry *rte, int varno, bool use_as, + deparse_context *context) +{ + deparse_namespace *dpns = (deparse_namespace *) linitial(context->namespaces); + char *refname = get_rtable_name(varno, context); + deparse_columns *colinfo = deparse_columns_fetch(varno, dpns); + bool printalias = false; + + if (rte->alias != NULL) + { + /* Always print alias if user provided one */ + printalias = true; + } + else if (colinfo->printaliases) + { + /* Always print alias if we need to print column aliases */ + printalias = true; + } + else if (rte->rtekind == RTE_RELATION) + { + /* + * No need to print alias if it's same as relation name (this would + * normally be the case, but not if set_rtable_names had to resolve a + * conflict). + */ + if (strcmp(refname, get_relation_name(rte->relid)) != 0) + printalias = true; + } + else if (rte->rtekind == RTE_FUNCTION) + { + /* + * For a function RTE, always print alias. This covers possible + * renaming of the function and/or instability of the FigureColname + * rules for things that aren't simple functions. Note we'd need to + * force it anyway for the columndef list case. + */ + printalias = true; + } + else if (rte->rtekind == RTE_SUBQUERY || + rte->rtekind == RTE_VALUES) + { + /* Alias is syntactically required for SUBQUERY and VALUES */ + printalias = true; + } + else if (rte->rtekind == RTE_CTE) + { + /* + * No need to print alias if it's same as CTE name (this would + * normally be the case, but not if set_rtable_names had to resolve a + * conflict). + */ + if (strcmp(refname, rte->ctename) != 0) + printalias = true; + } + + if (printalias) + appendStringInfo(context->buf, "%s%s", + use_as ? " AS " : " ", + quote_identifier(refname)); +} + /* * get_column_alias_list - print column alias list for an RTE * diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index a2922a0a9ec..a51729af420 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3171,28 +3171,21 @@ select * from rules_log; (16 rows) create rule r4 as on delete to rules_src do notify rules_src_deletion; -\d+ rules_src - Table "public.rules_src" - Column | Type | Collation | Nullable | Default | Storage | Stats target | Description ---------+---------+-----------+----------+---------+---------+--------------+------------- - f1 | integer | | | | plain | | - f2 | integer | | | 0 | plain | | -Rules: - r1 AS - ON UPDATE TO rules_src DO INSERT INTO rules_log (f1, f2, tag, id) VALUES (old.f1,old.f2,'old'::text,DEFAULT), (new.f1,new.f2,'new'::text,DEFAULT) - r2 AS - ON UPDATE TO rules_src DO VALUES (old.f1,old.f2,'old'::text), (new.f1,new.f2,'new'::text) - r3 AS - ON INSERT TO rules_src DO INSERT INTO rules_log (f1, f2, tag, id) VALUES (NULL::integer,NULL::integer,'-'::text,DEFAULT), (new.f1,new.f2,'new'::text,DEFAULT) - r4 AS - ON DELETE TO rules_src DO - NOTIFY rules_src_deletion - -- -- Ensure an aliased target relation for insert is correctly deparsed. -- create rule r5 as on insert to rules_src do instead insert into rules_log AS trgt SELECT NEW.* RETURNING trgt.f1, trgt.f2; create rule r6 as on update to rules_src do instead UPDATE rules_log AS trgt SET tag = 'updated' WHERE trgt.f1 = new.f1; +-- +-- Check deparse disambiguation of INSERT/UPDATE/DELETE targets. 
+-- +create rule r7 as on delete to rules_src do instead + with wins as (insert into int4_tbl as trgt values (0) returning *), + wupd as (update int4_tbl trgt set f1 = f1+1 returning *), + wdel as (delete from int4_tbl trgt where f1 = 0 returning *) + insert into rules_log AS trgt select old.* from wins, wupd, wdel + returning trgt.f1, trgt.f2; +-- check display of all rules added above \d+ rules_src Table "public.rules_src" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -3217,6 +3210,26 @@ Rules: r6 AS ON UPDATE TO rules_src DO INSTEAD UPDATE rules_log trgt SET tag = 'updated'::text WHERE trgt.f1 = new.f1 + r7 AS + ON DELETE TO rules_src DO INSTEAD WITH wins AS ( + INSERT INTO int4_tbl AS trgt_1 (f1) + VALUES (0) + RETURNING trgt_1.f1 + ), wupd AS ( + UPDATE int4_tbl trgt_1 SET f1 = trgt_1.f1 + 1 + RETURNING trgt_1.f1 + ), wdel AS ( + DELETE FROM int4_tbl trgt_1 + WHERE trgt_1.f1 = 0 + RETURNING trgt_1.f1 + ) + INSERT INTO rules_log AS trgt (f1, f2) SELECT old.f1, + old.f2 + FROM wins, + wupd, + wdel + RETURNING trgt.f1, + trgt.f2 -- -- Also check multiassignment deparsing. diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 7b0cd28720c..1318482d747 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -1034,13 +1034,24 @@ insert into rules_src values(22,23), (33,default); select * from rules_src; select * from rules_log; create rule r4 as on delete to rules_src do notify rules_src_deletion; -\d+ rules_src -- -- Ensure an aliased target relation for insert is correctly deparsed. -- create rule r5 as on insert to rules_src do instead insert into rules_log AS trgt SELECT NEW.* RETURNING trgt.f1, trgt.f2; create rule r6 as on update to rules_src do instead UPDATE rules_log AS trgt SET tag = 'updated' WHERE trgt.f1 = new.f1; + +-- +-- Check deparse disambiguation of INSERT/UPDATE/DELETE targets. +-- +create rule r7 as on delete to rules_src do instead + with wins as (insert into int4_tbl as trgt values (0) returning *), + wupd as (update int4_tbl trgt set f1 = f1+1 returning *), + wdel as (delete from int4_tbl trgt where f1 = 0 returning *) + insert into rules_log AS trgt select old.* from wins, wupd, wdel + returning trgt.f1, trgt.f2; + +-- check display of all rules added above \d+ rules_src -- From 1bf595604aff657eb621c316e486b456fe3ce484 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sun, 19 Feb 2023 00:41:18 +0100 Subject: [PATCH 06/61] Fix handling of multi-column BRIN indexes When evaluating clauses on multiple scan keys of a multi-column BRIN index, we can stop processing as soon as we find a scan key eliminating the range, and the range should not be added to tbe bitmap. That's how it worked before 14, but since a681e3c107a the code treated the range as matching if it matched at least the last scan key. Backpatch to 14, where this code was introduced. Backpatch-through: 14 Discussion: https://postgr.es/m/ebc18613-125e-60df-7520-fcbe0f9274fc%40enterprisedb.com --- src/backend/access/brin/brin.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index a64d70bafc7..a6549d9dcbf 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -777,6 +777,13 @@ bringetbitmap(IndexScanDesc scan, Node **bmNodeP) break; } } + + /* + * If we found a scan key eliminating the range, no need to + * check additional ones. 
+ */ + if (!addrange) + break; } } } From 19c4d0acbf5ecca7320f7e1fda3d612eb6e1dd30 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Tue, 21 Feb 2023 10:56:37 +0100 Subject: [PATCH 07/61] pgbench: Prepare commands in pipelines in advance Failing to do so results in an error when a pgbench script tries to start a serializable transaction inside a pipeline, because by the time BEGIN ISOLATION LEVEL SERIALIZABLE is executed, we're already in a transaction that has acquired a snapshot, so the server rightfully complains. We can work around that by preparing all commands in the pipeline before actually starting the pipeline. This changes the existing code in two aspects: first, we now prepare each command individually at the point where that command is about to be executed; previously, we would prepare all commands in a script as soon as the first command of that script would be executed. It's hard to see that this would make much of a difference (particularly since it only affects the first time to execute each script in a client), but I didn't actually try to measure it. Secondly, we no longer use PQsendPrepare() in pipeline mode, but only PQprepare. There's no specific reason for this change other than no longer needing to do differently in pipeline mode. (Previously we had no choice, because in pipeline mode PQprepare could not be used.) Backpatch to 14, where pgbench got support for pipeline mode. Reported-by: Yugo NAGATA Discussion: https://postgr.es/m/20210716153013.fc53b1c780b06fccc07a7f0d@sraoss.co.jp --- src/bin/pgbench/pgbench.c | 162 +++++++++++++------ src/bin/pgbench/t/001_pgbench_with_server.pl | 20 +++ 2 files changed, 130 insertions(+), 52 deletions(-) diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index e0dfadcf414..c87bc3dd007 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -481,7 +481,8 @@ typedef struct pg_time_usec_t txn_begin; /* used for measuring schedule lag times */ pg_time_usec_t stmt_begin; /* used for measuring statement latencies */ - bool prepared[MAX_SCRIPTS]; /* whether client prepared the script */ + /* whether client prepared each command of each script */ + bool **prepared; /* per client collected stats */ int64 cnt; /* client transaction count, for -t */ @@ -573,7 +574,8 @@ static const char *QUERYMODE[] = {"simple", "extended", "prepared"}; * argv Command arguments, the first of which is the command or SQL * string itself. For SQL commands, after post-processing * argv[0] is the same as 'lines' with variables substituted. - * varprefix SQL commands terminated with \gset or \aset have this set + * prepname The name that this command is prepared under, in prepare mode + * varprefix SQL commands terminated with \gset or \aset have this set * to a non NULL value. If nonempty, it's used to prefix the * variable name that receives the value. * aset do gset on all possible queries of a combined query (\;). @@ -588,6 +590,7 @@ typedef struct Command MetaCommand meta; int argc; char *argv[MAX_ARGS]; + char *prepname; char *varprefix; PgBenchExpr *expr; SimpleStats stats; @@ -2836,13 +2839,9 @@ runShellCommand(CState *st, char *variable, char **argv, int argc) return true; } -#define MAX_PREPARE_NAME 32 -static void -preparedStatementName(char *buffer, int file, int state) -{ - sprintf(buffer, "P%d_%d", file, state); -} - +/* + * Report the abortion of the client when processing SQL commands. 
+ */ static void commandFailed(CState *st, const char *cmd, const char *message) { @@ -2869,6 +2868,87 @@ chooseScript(TState *thread) return i - 1; } +/* + * Prepare the SQL command from st->use_file at command_num. + */ +static void +prepareCommand(CState *st, int command_num) +{ + Command *command = sql_script[st->use_file].commands[command_num]; + + /* No prepare for non-SQL commands */ + if (command->type != SQL_COMMAND) + return; + + /* + * If not already done, allocate space for 'prepared' flags: one boolean + * for each command of each script. + */ + if (!st->prepared) + { + st->prepared = pg_malloc(sizeof(bool *) * num_scripts); + for (int i = 0; i < num_scripts; i++) + { + ParsedScript *script = &sql_script[i]; + int numcmds; + + for (numcmds = 0; script->commands[numcmds] != NULL; numcmds++) + ; + st->prepared[i] = pg_malloc0(sizeof(bool) * numcmds); + } + } + + if (!st->prepared[st->use_file][command_num]) + { + PGresult *res; + + pg_log_debug("client %d preparing %s", st->id, command->prepname); + res = PQprepare(st->con, command->prepname, + command->argv[0], command->argc - 1, NULL); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + pg_log_error("%s", PQerrorMessage(st->con)); + PQclear(res); + st->prepared[st->use_file][command_num] = true; + } +} + +/* + * Prepare all the commands in the script that come after the \startpipeline + * that's at position st->command, and the first \endpipeline we find. + * + * This sets the ->prepared flag for each relevant command as well as the + * \startpipeline itself, but doesn't move the st->command counter. + */ +static void +prepareCommandsInPipeline(CState *st) +{ + int j; + Command **commands = sql_script[st->use_file].commands; + + Assert(commands[st->command]->type == META_COMMAND && + commands[st->command]->meta == META_STARTPIPELINE); + + /* + * We set the 'prepared' flag on the \startpipeline itself to flag that we + * don't need to do this next time without calling prepareCommand(), even + * though we don't actually prepare this command. + */ + if (st->prepared && + st->prepared[st->use_file][st->command]) + return; + + for (j = st->command + 1; commands[j] != NULL; j++) + { + if (commands[j]->type == META_COMMAND && + commands[j]->meta == META_ENDPIPELINE) + break; + + prepareCommand(st, j); + } + + st->prepared[st->use_file][st->command] = true; +} + /* Send a SQL command, using the chosen querymode */ static bool sendCommand(CState *st, Command *command) @@ -2899,50 +2979,13 @@ sendCommand(CState *st, Command *command) } else if (querymode == QUERY_PREPARED) { - char name[MAX_PREPARE_NAME]; const char *params[MAX_ARGS]; - if (!st->prepared[st->use_file]) - { - int j; - Command **commands = sql_script[st->use_file].commands; - - for (j = 0; commands[j] != NULL; j++) - { - PGresult *res; - char name[MAX_PREPARE_NAME]; - - if (commands[j]->type != SQL_COMMAND) - continue; - preparedStatementName(name, st->use_file, j); - if (PQpipelineStatus(st->con) == PQ_PIPELINE_OFF) - { - res = PQprepare(st->con, name, - commands[j]->argv[0], commands[j]->argc - 1, NULL); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pg_log_error("%s", PQerrorMessage(st->con)); - PQclear(res); - } - else - { - /* - * In pipeline mode, we use asynchronous functions. If a - * server-side error occurs, it will be processed later - * among the other results. 
- */ - if (!PQsendPrepare(st->con, name, - commands[j]->argv[0], commands[j]->argc - 1, NULL)) - pg_log_error("%s", PQerrorMessage(st->con)); - } - } - st->prepared[st->use_file] = true; - } - + prepareCommand(st, st->command); getQueryParams(st, command, params); - preparedStatementName(name, st->use_file, st->command); - pg_log_debug("client %d sending %s", st->id, name); - r = PQsendQueryPrepared(st->con, name, command->argc - 1, + pg_log_debug("client %d sending %s", st->id, command->prepname); + r = PQsendQueryPrepared(st->con, command->prepname, command->argc - 1, params, NULL, NULL, 0); } else /* unknown sql mode */ @@ -3202,7 +3245,8 @@ advanceConnectionState(TState *thread, CState *st, StatsData *agg) thread->conn_duration += now - start; /* Reset session-local state */ - memset(st->prepared, 0, sizeof(st->prepared)); + pg_free(st->prepared); + st->prepared = NULL; } /* record transaction start time */ @@ -3777,6 +3821,16 @@ executeMetaCommand(CState *st, pg_time_usec_t *now) return CSTATE_ABORTED; } + /* + * If we're in prepared-query mode, we need to prepare all the + * commands that are inside the pipeline before we actually start the + * pipeline itself. This solves the problem that running BEGIN + * ISOLATION LEVEL SERIALIZABLE in a pipeline would fail due to a + * snapshot having been acquired by the prepare within the pipeline. + */ + if (querymode == QUERY_PREPARED) + prepareCommandsInPipeline(st); + if (PQpipelineStatus(st->con) != PQ_PIPELINE_OFF) { commandFailed(st, "startpipeline", "already in pipeline mode"); @@ -4818,6 +4872,7 @@ create_sql_command(PQExpBuffer buf, const char *source) my_command->varprefix = NULL; /* allocated later, if needed */ my_command->expr = NULL; initSimpleStats(&my_command->stats); + my_command->prepname = NULL; /* set later, if needed */ return my_command; } @@ -4849,6 +4904,7 @@ static void postprocess_sql_command(Command *my_command) { char buffer[128]; + static int prepnum = 0; Assert(my_command->type == SQL_COMMAND); @@ -4857,15 +4913,17 @@ postprocess_sql_command(Command *my_command) buffer[strcspn(buffer, "\n\r")] = '\0'; my_command->first_line = pg_strdup(buffer); - /* parse query if necessary */ + /* Parse query and generate prepared statement name, if necessary */ switch (querymode) { case QUERY_SIMPLE: my_command->argv[0] = my_command->lines.data; my_command->argc++; break; - case QUERY_EXTENDED: case QUERY_PREPARED: + my_command->prepname = psprintf("P_%d", prepnum++); + /* fall through */ + case QUERY_EXTENDED: if (!parseQuery(my_command)) exit(1); break; diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl index 282ccc24aeb..76ecd9efeba 100644 --- a/src/bin/pgbench/t/001_pgbench_with_server.pl +++ b/src/bin/pgbench/t/001_pgbench_with_server.pl @@ -841,6 +841,26 @@ } }); +# Working \startpipeline in prepared query mode with serializable +$node->pgbench( + '-c4 -j2 -t 10 -n -M prepared', + 0, + [ + qr{type: .*/001_pgbench_pipeline_serializable}, + qr{actually processed: (\d+)/\1} + ], + [], + 'working \startpipeline with serializable', + { + '001_pgbench_pipeline_serializable' => q{ +-- test startpipeline with serializable +\startpipeline +BEGIN ISOLATION LEVEL SERIALIZABLE; +} . "select 1;\n" x 10 . q{ +END; +\endpipeline +} + }); # trigger many expression errors my @errors = ( From 8f706943e5c20528ab783cc22989b3e4f8499326 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Wed, 22 Feb 2023 13:26:20 +0000 Subject: [PATCH 08/61] Add missing support for the latest SPI status codes. 
SPI_result_code_string() was missing support for SPI_OK_TD_REGISTER, and in v15 and later, it was missing support for SPI_OK_MERGE, as was pltcl_process_SPI_result(). The last of those would trigger an error if a MERGE was executed from PL/Tcl. The others seem fairly innocuous, but worth fixing. Back-patch to all supported branches. Before v15, this is just adding SPI_OK_TD_REGISTER to SPI_result_code_string(), which is unlikely to be seen by anyone, but seems worth doing for completeness. Reviewed by Tom Lane. Discussion: https://postgr.es/m/CAEZATCUg8V%2BK%2BGcafOPqymxk84Y_prXgfe64PDoopjLFH6Z0Aw%40mail.gmail.com https://postgr.es/m/CAEZATCUMe%2B_KedPMM9AxKqm%3DSZogSxjUcrMe%2BsakusZh3BFcQw%40mail.gmail.com --- src/backend/executor/spi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index 4a2ddd5dff3..8f04c342d5a 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -2084,6 +2084,8 @@ SPI_result_code_string(int code) return "SPI_OK_REL_REGISTER"; case SPI_OK_REL_UNREGISTER: return "SPI_OK_REL_UNREGISTER"; + case SPI_OK_TD_REGISTER: + return "SPI_OK_TD_REGISTER"; } /* Unrecognized code ... return something useful ... */ sprintf(buf, "Unrecognized SPI code %d", code); From 52fd7adf33e231858b95231169fda6b956e661ff Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Wed, 22 Feb 2023 15:24:09 +0100 Subject: [PATCH 09/61] Fix snapshot handling in logicalmsg_decode Whe decoding a transactional logical message, logicalmsg_decode called SnapBuildGetOrBuildSnapshot. But we may not have a consistent snapshot yet at that point. We don't actually need the snapshot in this case (during replay we'll have the snapshot from the transaction), so in practice this is harmless. But in assert-enabled build this crashes. Fixed by requesting the snapshot only in non-transactional case, where we are guaranteed to have SNAPBUILD_CONSISTENT. Backpatch to 11. The issue exists since 9.6. Backpatch-through: 11 Reviewed-by: Andres Freund Discussion: https://postgr.es/m/84d60912-6eab-9b84-5de3-41765a5449e8@enterprisedb.com --- src/backend/replication/logical/decode.c | 14 ++++++++++++-- src/backend/replication/logical/reorderbuffer.c | 10 ++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 7a6323c3989..755d7ae6d2d 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -572,7 +572,7 @@ logicalmsg_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) TransactionId xid = XLogRecGetXid(r); uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK; RepOriginId origin_id = XLogRecGetOrigin(r); - Snapshot snapshot; + Snapshot snapshot = NULL; xl_logical_message *message; if (info != XLOG_LOGICAL_MESSAGE) @@ -602,7 +602,17 @@ logicalmsg_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) SnapBuildXactNeedsSkip(builder, buf->origptr))) return; - snapshot = SnapBuildGetOrBuildSnapshot(builder, xid); + /* + * If this is a non-transactional change, get the snapshot we're expected + * to use. We only get here when the snapshot is consistent, and the + * change is not meant to be skipped. + * + * For transactional changes we don't need a snapshot, we'll use the + * regular snapshot maintained by ReorderBuffer. We just leave it NULL. 
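+	 * (ReorderBufferQueueMessage now asserts that it stays NULL.)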
+ */ + if (!message->transactional) + snapshot = SnapBuildGetOrBuildSnapshot(builder, xid); + ReorderBufferQueueMessage(ctx->reorder, xid, snapshot, buf->endptr, message->transactional, message->message, /* first part of message is diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 709365fc8c6..721fa652d25 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -821,6 +821,13 @@ ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, Assert(xid != InvalidTransactionId); + /* + * We don't expect snapshots for transactional changes - we'll use the + * snapshot derived later during apply (unless the change gets + * skipped). + */ + Assert(!snapshot); + oldcontext = MemoryContextSwitchTo(rb->context); change = ReorderBufferGetChange(rb); @@ -839,6 +846,9 @@ ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, ReorderBufferTXN *txn = NULL; volatile Snapshot snapshot_now = snapshot; + /* Non-transactional changes require a valid snapshot. */ + Assert(snapshot_now); + if (xid != InvalidTransactionId) txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); From 9d08e8d5474497b4f3b23fd5b9e8f105acfe41d1 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 23 Feb 2023 15:40:28 -0500 Subject: [PATCH 10/61] Don't repeatedly register cache callbacks in pgoutput plugin. Multiple cycles of starting up and shutting down the plugin within a single session would eventually lead to "out of relcache_callback_list slots", because pgoutput_startup blindly re-registered its cache callbacks each time. Fix it to register them only once, as all other users of cache callbacks already take care to do. This has been broken all along, so back-patch to all supported branches. Shi Yu Discussion: https://postgr.es/m/OSZPR01MB631004A78D743D68921FFAD3FDA79@OSZPR01MB6310.jpnprd01.prod.outlook.com --- src/backend/replication/pgoutput/pgoutput.c | 24 ++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index ff9cf5d406d..df2ea94d468 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -260,6 +260,7 @@ pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init) { PGOutputData *data = palloc0(sizeof(PGOutputData)); + static bool publication_callback_registered = false; /* Create our memory context for private allocations. */ data->context = AllocSetContextCreate(ctx->context, @@ -323,9 +324,18 @@ pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, /* Init publication state. */ data->publications = NIL; publications_valid = false; - CacheRegisterSyscacheCallback(PUBLICATIONOID, - publication_invalidation_cb, - (Datum) 0); + + /* + * Register callback for pg_publication if we didn't already do that + * during some previous call in this process. + */ + if (!publication_callback_registered) + { + CacheRegisterSyscacheCallback(PUBLICATIONOID, + publication_invalidation_cb, + (Datum) 0); + publication_callback_registered = true; + } /* Initialize relation schema cache. 
*/ init_rel_sync_cache(CacheMemoryContext); @@ -948,7 +958,9 @@ static void init_rel_sync_cache(MemoryContext cachectx) { HASHCTL ctl; + static bool relation_callbacks_registered = false; + /* Nothing to do if hash table already exists */ if (RelationSyncCache != NULL) return; @@ -963,10 +975,16 @@ init_rel_sync_cache(MemoryContext cachectx) Assert(RelationSyncCache != NULL); + /* No more to do if we already registered callbacks */ + if (relation_callbacks_registered) + return; + CacheRegisterRelcacheCallback(rel_sync_cache_relation_cb, (Datum) 0); CacheRegisterSyscacheCallback(PUBLICATIONRELMAP, rel_sync_cache_publication_cb, (Datum) 0); + + relation_callbacks_registered = true; } /* From dfec68fb87e1d1ace84005bbe51930afa3071989 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Sat, 25 Feb 2023 14:44:49 +0000 Subject: [PATCH 11/61] Fix mishandling of OLD/NEW references in subqueries in rule actions. If a rule action contains a subquery that refers to columns from OLD or NEW, then those are really lateral references, and the planner will complain if it sees such things in a subquery that isn't marked as lateral. However, at rule-definition time, the user isn't required to mark the subquery with LATERAL, and so it can fail when the rule is used. Fix this by marking such subqueries as lateral in the rewriter, at the point where they're used. Dean Rasheed and Tom Lane, per report from Alexander Lakhin. Back-patch to all supported branches. Discussion: https://postgr.es/m/5e09da43-aaba-7ea7-0a51-a2eb981b058b%40gmail.com --- src/backend/rewrite/rewriteHandler.c | 22 ++++++++++++++++++---- src/test/regress/expected/rules.out | 25 +++++++++++++++++++++++++ src/test/regress/sql/rules.sql | 17 +++++++++++++++++ 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 2938edf1c6a..5fe8136ab9a 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -403,6 +403,7 @@ rewriteRuleAction(Query *parsetree, Query *sub_action; Query **sub_action_ptr; acquireLocksOnSubLinks_context context; + ListCell *lc; context.for_execute = true; @@ -441,6 +442,23 @@ rewriteRuleAction(Query *parsetree, ChangeVarNodes(rule_qual, PRS2_OLD_VARNO + rt_length, rt_index, 0); + /* + * Mark any subquery RTEs in the rule action as LATERAL if they contain + * Vars referring to the current query level (references to NEW/OLD). + * Those really are lateral references, but we've historically not + * required users to mark such subqueries with LATERAL explicitly. But + * the planner will complain if such Vars exist in a non-LATERAL subquery, + * so we have to fix things up here. 
+ */ + foreach(lc, sub_action->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + + if (rte->rtekind == RTE_SUBQUERY && !rte->lateral && + contain_vars_of_level((Node *) rte->subquery, 1)) + rte->lateral = true; + } + /* * Generate expanded rtable consisting of main parsetree's rtable plus * rule action's rtable; this becomes the complete rtable for the rule @@ -482,8 +500,6 @@ rewriteRuleAction(Query *parsetree, */ if (parsetree->hasSubLinks && !sub_action->hasSubLinks) { - ListCell *lc; - foreach(lc, parsetree->rtable) { RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); @@ -585,8 +601,6 @@ rewriteRuleAction(Query *parsetree, */ if (parsetree->cteList != NIL && sub_action->commandType != CMD_UTILITY) { - ListCell *lc; - /* * Annoying implementation restriction: because CTEs are identified by * name within a cteList, we can't merge a CTE from the original query diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index a51729af420..77fbb582cab 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3256,6 +3256,31 @@ Rules: drop table rule_t1, rule_dest; -- +-- Test implicit LATERAL references to old/new in rules +-- +CREATE TABLE rule_t1(a int, b text DEFAULT 'xxx', c int); +CREATE VIEW rule_v1 AS SELECT * FROM rule_t1; +CREATE RULE v1_ins AS ON INSERT TO rule_v1 + DO ALSO INSERT INTO rule_t1 + SELECT * FROM (SELECT a + 10 FROM rule_t1 WHERE a = NEW.a) tt; +CREATE RULE v1_upd AS ON UPDATE TO rule_v1 + DO ALSO UPDATE rule_t1 t + SET c = tt.a * 10 + FROM (SELECT a FROM rule_t1 WHERE a = OLD.a) tt WHERE t.a = tt.a; +INSERT INTO rule_v1 VALUES (1, 'a'), (2, 'b'); +UPDATE rule_v1 SET b = upper(b); +SELECT * FROM rule_t1; + a | b | c +----+-----+----- + 1 | A | 10 + 2 | B | 20 + 11 | XXX | 110 + 12 | XXX | 120 +(4 rows) + +DROP TABLE rule_t1 CASCADE; +NOTICE: drop cascades to view rule_v1 +-- -- check alter rename rule -- CREATE TABLE rule_t1 (a INT); diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 1318482d747..90b37af2097 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -1065,6 +1065,23 @@ create rule rr as on update to rule_t1 do instead UPDATE rule_dest trgt \d+ rule_t1 drop table rule_t1, rule_dest; +-- +-- Test implicit LATERAL references to old/new in rules +-- +CREATE TABLE rule_t1(a int, b text DEFAULT 'xxx', c int); +CREATE VIEW rule_v1 AS SELECT * FROM rule_t1; +CREATE RULE v1_ins AS ON INSERT TO rule_v1 + DO ALSO INSERT INTO rule_t1 + SELECT * FROM (SELECT a + 10 FROM rule_t1 WHERE a = NEW.a) tt; +CREATE RULE v1_upd AS ON UPDATE TO rule_v1 + DO ALSO UPDATE rule_t1 t + SET c = tt.a * 10 + FROM (SELECT a FROM rule_t1 WHERE a = OLD.a) tt WHERE t.a = tt.a; +INSERT INTO rule_v1 VALUES (1, 'a'), (2, 'b'); +UPDATE rule_v1 SET b = upper(b); +SELECT * FROM rule_t1; +DROP TABLE rule_t1 CASCADE; + -- -- check alter rename rule -- From fcb2564bf1864c10abeab7c134424ea4e46b2900 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Sun, 26 Feb 2023 06:48:41 -0500 Subject: [PATCH 12/61] Don't force SQL_ASCII/no-locale for installcheck in vcregress.pl It's been this way for a very long time, but it appears to have been masking an issue that only manifests with different settings. Therefore, run the tests in the installation's default encoding/locale. Backpatch to all live branches. 
--- src/tools/msvc/vcregress.pl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/tools/msvc/vcregress.pl b/src/tools/msvc/vcregress.pl index d4e7d69df2e..885249d08a3 100644 --- a/src/tools/msvc/vcregress.pl +++ b/src/tools/msvc/vcregress.pl @@ -163,9 +163,7 @@ sub installcheck_internal "--bindir=../../../$Config/psql", "--schedule=${schedule}_schedule", "--max-concurrent-tests=20", - "--make-testtablespace-dir", - "--encoding=SQL_ASCII", - "--no-locale"); + "--make-testtablespace-dir"); push(@args, $maxconn) if $maxconn; push(@args, @EXTRA_REGRESS_OPTS); system(@args); From fc9ab81626dd408f33984f49e6364cb788ed5dea Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 27 Feb 2023 16:29:51 -0500 Subject: [PATCH 13/61] Harden postgres_fdw tests against unexpected cache flushes. postgres_fdw will close its remote session if an sinval cache reset occurs, since it's possible that that means some FDW parameters changed. We had two tests that were trying to ensure that the session remains alive by setting debug_discard_caches = 0; but that's not sufficient. Even though the tests seem stable enough in the buildfarm, they flap a lot under CI. In the first test, which is checking the ability to recover from a lost connection, we can stabilize the results by just not caring whether pg_terminate_backend() finds a victim backend. If a reset did happen, there won't be a session to terminate anymore, but the test can proceed anyway. (Arguably, we are then not testing the unintentional-disconnect case, but as long as that scenario is exercised in most runs I think it's fine; testing the reset-driven case is of value too.) In the second test, which is trying to verify the application_name displayed in pg_stat_activity by a remote session, we had a race condition in that the remote session might go away before we can fetch its pg_stat_activity entry. We can close that race and make the test more certainly test what it intends to by arranging things so that the remote session itself fetches its pg_stat_activity entry (based on PID rather than a somewhat-circular assumption about the application name). Both tests now demonstrably pass under debug_discard_caches = 1, so we can remove that hack. Back-patch into relevant back branches. Discussion: https://postgr.es/m/20230226194340.u44bkfgyz64c67i6@awork3.anarazel.de --- .../postgres_fdw/expected/postgres_fdw.out | 26 ++++++------------- contrib/postgres_fdw/sql/postgres_fdw.sql | 18 ++++++------- 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 67fde96a858..10700d6fd4a 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -9730,11 +9730,6 @@ WARNING: there is no transaction in progress -- Change application_name of remote connection to special one -- so that we can easily terminate the connection later. ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check'); --- If debug_discard_caches is active, it results in --- dropping remote connections after every transaction, making it --- impossible to test termination meaningfully. So turn that off --- for this test. -SET debug_discard_caches = 0; -- Make sure we have a remote connection. SELECT 1 FROM ft1 LIMIT 1; ?column? @@ -9743,13 +9738,12 @@ SELECT 1 FROM ft1 LIMIT 1; (1 row) -- Terminate the remote connection and wait for the termination to complete. 
-SELECT pg_terminate_backend(pid, 180000) FROM pg_stat_activity +-- (If a cache flush happens, the remote connection might have already been +-- dropped; so code this step in a way that doesn't fail if no connection.) +DO $$ BEGIN +PERFORM pg_terminate_backend(pid, 180000) FROM pg_stat_activity WHERE application_name = 'fdw_retry_check'; - pg_terminate_backend ----------------------- - t -(1 row) - +END $$; -- This query should detect the broken connection when starting new remote -- transaction, reestablish new connection, and then succeed. BEGIN; @@ -9762,13 +9756,10 @@ SELECT 1 FROM ft1 LIMIT 1; -- If we detect the broken connection when starting a new remote -- subtransaction, we should fail instead of establishing a new connection. -- Terminate the remote connection and wait for the termination to complete. -SELECT pg_terminate_backend(pid, 180000) FROM pg_stat_activity +DO $$ BEGIN +PERFORM pg_terminate_backend(pid, 180000) FROM pg_stat_activity WHERE application_name = 'fdw_retry_check'; - pg_terminate_backend ----------------------- - t -(1 row) - +END $$; SAVEPOINT s; -- The text of the error might vary across platforms, so only show SQLSTATE. \set VERBOSITY sqlstate @@ -9776,7 +9767,6 @@ SELECT 1 FROM ft1 LIMIT 1; -- should fail ERROR: 08006 \set VERBOSITY default COMMIT; -RESET debug_discard_caches; -- ============================================================================= -- test connection invalidation cases and postgres_fdw_get_connections function -- ============================================================================= diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index f8c813d2175..793dd64811d 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -2912,18 +2912,16 @@ ROLLBACK; -- so that we can easily terminate the connection later. ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check'); --- If debug_discard_caches is active, it results in --- dropping remote connections after every transaction, making it --- impossible to test termination meaningfully. So turn that off --- for this test. -SET debug_discard_caches = 0; - -- Make sure we have a remote connection. SELECT 1 FROM ft1 LIMIT 1; -- Terminate the remote connection and wait for the termination to complete. -SELECT pg_terminate_backend(pid, 180000) FROM pg_stat_activity +-- (If a cache flush happens, the remote connection might have already been +-- dropped; so code this step in a way that doesn't fail if no connection.) +DO $$ BEGIN +PERFORM pg_terminate_backend(pid, 180000) FROM pg_stat_activity WHERE application_name = 'fdw_retry_check'; +END $$; -- This query should detect the broken connection when starting new remote -- transaction, reestablish new connection, and then succeed. @@ -2933,8 +2931,10 @@ SELECT 1 FROM ft1 LIMIT 1; -- If we detect the broken connection when starting a new remote -- subtransaction, we should fail instead of establishing a new connection. -- Terminate the remote connection and wait for the termination to complete. -SELECT pg_terminate_backend(pid, 180000) FROM pg_stat_activity +DO $$ BEGIN +PERFORM pg_terminate_backend(pid, 180000) FROM pg_stat_activity WHERE application_name = 'fdw_retry_check'; +END $$; SAVEPOINT s; -- The text of the error might vary across platforms, so only show SQLSTATE. 
\set VERBOSITY sqlstate @@ -2942,8 +2942,6 @@ SELECT 1 FROM ft1 LIMIT 1; -- should fail \set VERBOSITY default COMMIT; -RESET debug_discard_caches; - -- ============================================================================= -- test connection invalidation cases and postgres_fdw_get_connections function -- ============================================================================= From aa797f5a2a77b938f56f57007e8b455293288004 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 1 Mar 2023 11:30:17 -0500 Subject: [PATCH 14/61] Avoid fetching one past the end of translate()'s "to" parameter. This is usually harmless, but if you were very unlucky it could provoke a segfault due to the "to" string being right up against the end of memory. Found via valgrind testing (so we might've found it earlier, except that our regression tests lacked any exercise of translate()'s deletion feature). Fix by switching the order of the test-for-end-of-string and advance-pointer steps. While here, compute "to_ptr + tolen" just once. (Smarter compilers might figure that out for themselves, but let's just make sure.) Report and fix by Daniil Anisimov, in bug #17816. Discussion: https://postgr.es/m/17816-70f3d2764e88a108@postgresql.org --- src/backend/utils/adt/oracle_compat.c | 12 +++++++----- src/test/regress/expected/strings.out | 6 ++++++ src/test/regress/sql/strings.sql | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index f737aa6fbde..bd9e5f9e243 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -797,7 +797,8 @@ translate(PG_FUNCTION_ARGS) text *to = PG_GETARG_TEXT_PP(2); text *result; char *from_ptr, - *to_ptr; + *to_ptr, + *to_end; char *source, *target; int m, @@ -819,6 +820,7 @@ translate(PG_FUNCTION_ARGS) from_ptr = VARDATA_ANY(from); tolen = VARSIZE_ANY_EXHDR(to); to_ptr = VARDATA_ANY(to); + to_end = to_ptr + tolen; /* * The worst-case expansion is to substitute a max-length character for a @@ -852,16 +854,16 @@ translate(PG_FUNCTION_ARGS) } if (i < fromlen) { - /* substitute */ + /* substitute, or delete if no corresponding "to" character */ char *p = to_ptr; for (i = 0; i < from_index; i++) { - p += pg_mblen(p); - if (p >= (to_ptr + tolen)) + if (p >= to_end) break; + p += pg_mblen(p); } - if (p < (to_ptr + tolen)) + if (p < to_end) { len = pg_mblen(p); memcpy(target, p, len); diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 1745ca9ca68..d1485df7063 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -2256,6 +2256,12 @@ SELECT translate('12345', '14', 'ax'); a23x5 (1 row) +SELECT translate('12345', '134', 'a'); + translate +----------- + a25 +(1 row) + SELECT ascii('x'); ascii ------- diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index c53727f68d3..3c438a304ac 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -770,6 +770,7 @@ SELECT ltrim('zzzytrim', 'xyz'); SELECT translate('', '14', 'ax'); SELECT translate('12345', '14', 'ax'); +SELECT translate('12345', '134', 'a'); SELECT ascii('x'); SELECT ascii(''); From 4a37abbf282daf26ff70857a92714a69a500ca07 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 2 Mar 2023 14:03:21 +0900 Subject: [PATCH 15/61] pageinspect: Fix crash with gist_page_items() Attempting to use this function with a raw page not coming from a GiST index would 
cause a crash, as it was missing the same sanity checks as
gist_page_items_bytea().  This slightly refactors the code so that all
the basic validation checks for GiST pages are done in a single
routine, in the same fashion as the pageinspect functions for hash and
BRIN.  This fixes an issue similar to 076f4d9.  A test is added to
stress this case.  While on it, I have added a similar test for
brin_page_items() with a combination made of a valid BRIN index and a
raw btree page.  This one was already protected, but it was not tested.

Reported-by: Egor Chindyaskin
Author: Dmitry Koval
Discussion: https://postgr.es/m/17815-fc4a2d3b74705703@postgresql.org
Backpatch-through: 14
---
 contrib/pageinspect/expected/brin.out |  8 ++-
 contrib/pageinspect/expected/gist.out | 10 ++--
 contrib/pageinspect/gistfuncs.c       | 82 +++++++++++++--------------
 contrib/pageinspect/sql/brin.sql      |  8 ++-
 contrib/pageinspect/sql/gist.sql      | 10 ++--
 5 files changed, 62 insertions(+), 56 deletions(-)

diff --git a/contrib/pageinspect/expected/brin.out b/contrib/pageinspect/expected/brin.out
index d19cdc3b957..e12fbeb4774 100644
--- a/contrib/pageinspect/expected/brin.out
+++ b/contrib/pageinspect/expected/brin.out
@@ -48,12 +48,14 @@ SELECT * FROM brin_page_items(get_raw_page('test1_a_idx', 2), 'test1_a_idx')
  1 |      0 |      1 | f        | f        | f           | {1 .. 1}
 (1 row)
 
--- Failure for non-BRIN index.
+-- Mask DETAIL messages as these are not portable across architectures.
+\set VERBOSITY terse
+-- Failures for non-BRIN index.
 CREATE INDEX test1_a_btree ON test1 (a);
 SELECT brin_page_items(get_raw_page('test1_a_btree', 0), 'test1_a_btree');
 ERROR:  "test1_a_btree" is not a BRIN index
--- Mask DETAIL messages as these are not portable across architectures.
-\set VERBOSITY terse
+SELECT brin_page_items(get_raw_page('test1_a_btree', 0), 'test1_a_idx');
+ERROR:  input page is not a valid BRIN page
 -- Invalid special area size
 SELECT brin_page_type(get_raw_page('test1', 0));
 ERROR:  input page is not a valid BRIN page
diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out
index eec1fd91cb9..cae739219bd 100644
--- a/contrib/pageinspect/expected/gist.out
+++ b/contrib/pageinspect/expected/gist.out
@@ -56,14 +56,16 @@ SELECT itemoffset, ctid, itemlen FROM gist_page_items_bytea(get_raw_page('test_g
          2 | (2,65535) |      40
 (2 rows)
 
--- Failure with non-GiST index.
+-- Suppress the DETAIL message, to allow the tests to work across various
+-- page sizes and architectures.
+\set VERBOSITY terse
+-- Failures with non-GiST index.
 CREATE INDEX test_gist_btree on test_gist(t);
 SELECT gist_page_items(get_raw_page('test_gist_btree', 0), 'test_gist_btree');
 ERROR:  "test_gist_btree" is not a GiST index
+SELECT gist_page_items(get_raw_page('test_gist_btree', 0), 'test_gist_idx');
+ERROR:  input page is not a valid GiST page
 -- Failure with various modes.
--- Suppress the DETAIL message, to allow the tests to work across various
--- page sizes and architectures. 
-\set VERBOSITY terse -- invalid page size SELECT gist_page_items_bytea('aaa'::bytea); ERROR: invalid page size diff --git a/contrib/pageinspect/gistfuncs.c b/contrib/pageinspect/gistfuncs.c index d1c3c321f83..0ae8f7459c1 100644 --- a/contrib/pageinspect/gistfuncs.c +++ b/contrib/pageinspect/gistfuncs.c @@ -34,29 +34,20 @@ PG_FUNCTION_INFO_V1(gist_page_items_bytea); #define ItemPointerGetDatum(X) PointerGetDatum(X) -Datum -gist_page_opaque_info(PG_FUNCTION_ARGS) +static Page verify_gist_page(bytea *raw_page); + +/* + * Verify that the given bytea contains a GIST page or die in the attempt. + * A pointer to the page is returned. + */ +static Page +verify_gist_page(bytea *raw_page) { - bytea *raw_page = PG_GETARG_BYTEA_P(0); - TupleDesc tupdesc; - Page page; + Page page = get_page_from_raw(raw_page); GISTPageOpaque opaq; - HeapTuple resultTuple; - Datum values[4]; - bool nulls[4]; - Datum flags[16]; - int nflags = 0; - uint16 flagbits; - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to use raw page functions"))); - - page = get_page_from_raw(raw_page); if (PageIsNew(page)) - PG_RETURN_NULL(); + return page; /* verify the special space has the expected size */ if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData))) @@ -76,12 +67,38 @@ gist_page_opaque_info(PG_FUNCTION_ARGS) GIST_PAGE_ID, opaq->gist_page_id))); + return page; +} + +Datum +gist_page_opaque_info(PG_FUNCTION_ARGS) +{ + bytea *raw_page = PG_GETARG_BYTEA_P(0); + TupleDesc tupdesc; + Page page; + HeapTuple resultTuple; + Datum values[4]; + bool nulls[4]; + Datum flags[16]; + int nflags = 0; + uint16 flagbits; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + page = verify_gist_page(raw_page); + + if (PageIsNew(page)) + PG_RETURN_NULL(); + /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); /* Convert the flags bitmask to an array of human-readable names */ - flagbits = opaq->flags; + flagbits = GistPageGetOpaque(page)->flags; if (flagbits & F_LEAF) flags[nflags++] = CStringGetTextDatum("leaf"); if (flagbits & F_DELETED) @@ -103,7 +120,7 @@ gist_page_opaque_info(PG_FUNCTION_ARGS) values[0] = LSNGetDatum(PageGetLSN(page)); values[1] = LSNGetDatum(GistPageGetNSN(page)); - values[2] = Int64GetDatum(opaq->rightlink); + values[2] = Int64GetDatum(GistPageGetOpaque(page)->rightlink); values[3] = PointerGetDatum(construct_array(flags, nflags, TEXTOID, -1, false, TYPALIGN_INT)); @@ -124,7 +141,6 @@ gist_page_items_bytea(PG_FUNCTION_ARGS) Tuplestorestate *tupstore; MemoryContext oldcontext; Page page; - GISTPageOpaque opaq; OffsetNumber offset; OffsetNumber maxoff = InvalidOffsetNumber; @@ -157,29 +173,11 @@ gist_page_items_bytea(PG_FUNCTION_ARGS) MemoryContextSwitchTo(oldcontext); - page = get_page_from_raw(raw_page); + page = verify_gist_page(raw_page); if (PageIsNew(page)) PG_RETURN_NULL(); - /* verify the special space has the expected size */ - if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData))) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("input page is not a valid %s page", "GiST"), - errdetail("Expected special size %d, got %d.", - (int) MAXALIGN(sizeof(GISTPageOpaqueData)), - (int) PageGetSpecialSize(page)))); - - opaq = (GISTPageOpaque) PageGetSpecialPointer(page); - if (opaq->gist_page_id != 
GIST_PAGE_ID) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("input page is not a valid %s page", "GiST"), - errdetail("Expected %08x, got %08x.", - GIST_PAGE_ID, - opaq->gist_page_id))); - /* Avoid bogus PageGetMaxOffsetNumber() call with deleted pages */ if (GistPageIsDeleted(page)) elog(NOTICE, "page is deleted"); @@ -276,7 +274,7 @@ gist_page_items(PG_FUNCTION_ARGS) errmsg("\"%s\" is not a %s index", RelationGetRelationName(indexRel), "GiST"))); - page = get_page_from_raw(raw_page); + page = verify_gist_page(raw_page); if (PageIsNew(page)) { diff --git a/contrib/pageinspect/sql/brin.sql b/contrib/pageinspect/sql/brin.sql index 45098c1ef5e..96b4645187e 100644 --- a/contrib/pageinspect/sql/brin.sql +++ b/contrib/pageinspect/sql/brin.sql @@ -15,12 +15,14 @@ SELECT * FROM brin_revmap_data(get_raw_page('test1_a_idx', 1)) LIMIT 5; SELECT * FROM brin_page_items(get_raw_page('test1_a_idx', 2), 'test1_a_idx') ORDER BY blknum, attnum LIMIT 5; --- Failure for non-BRIN index. +-- Mask DETAIL messages as these are not portable across architectures. +\set VERBOSITY terse + +-- Failures for non-BRIN index. CREATE INDEX test1_a_btree ON test1 (a); SELECT brin_page_items(get_raw_page('test1_a_btree', 0), 'test1_a_btree'); +SELECT brin_page_items(get_raw_page('test1_a_btree', 0), 'test1_a_idx'); --- Mask DETAIL messages as these are not portable across architectures. -\set VERBOSITY terse -- Invalid special area size SELECT brin_page_type(get_raw_page('test1', 0)); SELECT * FROM brin_metapage_info(get_raw_page('test1', 0)); diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql index ee46e09053e..963d5d40a3c 100644 --- a/contrib/pageinspect/sql/gist.sql +++ b/contrib/pageinspect/sql/gist.sql @@ -26,14 +26,16 @@ SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 1), 'test_gist_idx') -- platform-dependent (endianess), so omit the actual key data from the output. SELECT itemoffset, ctid, itemlen FROM gist_page_items_bytea(get_raw_page('test_gist_idx', 0)); --- Failure with non-GiST index. +-- Suppress the DETAIL message, to allow the tests to work across various +-- page sizes and architectures. +\set VERBOSITY terse + +-- Failures with non-GiST index. CREATE INDEX test_gist_btree on test_gist(t); SELECT gist_page_items(get_raw_page('test_gist_btree', 0), 'test_gist_btree'); +SELECT gist_page_items(get_raw_page('test_gist_btree', 0), 'test_gist_idx'); -- Failure with various modes. --- Suppress the DETAIL message, to allow the tests to work across various --- page sizes and architectures. -\set VERBOSITY terse -- invalid page size SELECT gist_page_items_bytea('aaa'::bytea); SELECT gist_page_items('aaa'::bytea, 'test_gist_idx'::regclass); From c31e1165a799788d43939d2ef3f1b707a578cd76 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 6 Mar 2023 15:07:15 +1300 Subject: [PATCH 16/61] Fix assert failures in parallel SERIALIZABLE READ ONLY. 1. Make sure that we don't decrement SxactGlobalXminCount twice when the SXACT_FLAG_RO_SAFE optimization is reached in a parallel query. This could trigger a sanity check failure in assert builds. Non-assert builds recompute the count in SetNewSxactGlobalXmin(), so the problem was hidden, explaining the lack of field reports. Add a new isolation test to exercise that case. 2. Remove an assertion that the DOOMED flag can't be set on a partially released SERIALIZABLEXACT. 
Instead, ignore the flag (our transaction was already determined to be read-only safe, and DOOMED is in fact set during partial release, and there was already an assertion that it wasn't set sooner). Improve an existing isolation test so that it reaches that case (previously it wasn't quite testing what it was supposed to be testing; see discussion). Back-patch to 12. Bug #17116. Defects in commit 47a338cf. Reported-by: Alexander Lakhin Discussion: https://postgr.es/m/17116-d6ca217acc180e30%40postgresql.org --- src/backend/storage/lmgr/predicate.c | 20 +++- .../expected/serializable-parallel-2.out | 57 +++-------- .../expected/serializable-parallel-3.out | 97 +++++++++++++++++++ src/test/isolation/isolation_schedule | 1 + .../specs/serializable-parallel-2.spec | 12 ++- .../specs/serializable-parallel-3.spec | 47 +++++++++ 6 files changed, 184 insertions(+), 50 deletions(-) create mode 100644 src/test/isolation/expected/serializable-parallel-3.out create mode 100644 src/test/isolation/specs/serializable-parallel-3.spec diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index f5668bdb4ff..3b0daf723b6 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -3331,6 +3331,7 @@ SetNewSxactGlobalXmin(void) void ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe) { + bool partiallyReleasing = false; bool needToClear; RWConflict conflict, nextConflict, @@ -3431,6 +3432,7 @@ ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe) else { MySerializableXact->flags |= SXACT_FLAG_PARTIALLY_RELEASED; + partiallyReleasing = true; /* ... and proceed to perform the partial release below. */ } } @@ -3681,9 +3683,15 @@ ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe) * serializable transactions completes. We then find the "new oldest" * xmin and purge any transactions which finished before this transaction * was launched. + * + * For parallel queries in read-only transactions, it might run twice. + * We only release the reference on the first call. */ needToClear = false; - if (TransactionIdEquals(MySerializableXact->xmin, PredXact->SxactGlobalXmin)) + if ((partiallyReleasing || + !SxactIsPartiallyReleased(MySerializableXact)) && + TransactionIdEquals(MySerializableXact->xmin, + PredXact->SxactGlobalXmin)) { Assert(PredXact->SxactGlobalXminCount > 0); if (--(PredXact->SxactGlobalXminCount) == 0) @@ -4839,10 +4847,14 @@ PreCommit_CheckForSerializationFailure(void) LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); - /* Check if someone else has already decided that we need to die */ - if (SxactIsDoomed(MySerializableXact)) + /* + * Check if someone else has already decided that we need to die. Since + * we set our own DOOMED flag when partially releasing, ignore in that + * case. 
+ */ + if (SxactIsDoomed(MySerializableXact) && + !SxactIsPartiallyReleased(MySerializableXact)) { - Assert(!SxactIsPartiallyReleased(MySerializableXact)); LWLockRelease(SerializableXactHashLock); ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), diff --git a/src/test/isolation/expected/serializable-parallel-2.out b/src/test/isolation/expected/serializable-parallel-2.out index 92753ccf39f..904fdd90806 100644 --- a/src/test/isolation/expected/serializable-parallel-2.out +++ b/src/test/isolation/expected/serializable-parallel-2.out @@ -1,50 +1,23 @@ Parsed test spec with 2 sessions starting permutation: s1r s2r1 s1c s2r2 s2c -step s1r: SELECT * FROM foo; - a --- - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -(10 rows) +step s1r: SELECT COUNT(*) FROM foo; +count +----- + 100 +(1 row) -step s2r1: SELECT * FROM foo; - a --- - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -(10 rows) +step s2r1: SELECT COUNT(*) FROM foo; +count +----- + 100 +(1 row) step s1c: COMMIT; -step s2r2: SELECT * FROM foo; - a --- - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -(10 rows) +step s2r2: SELECT COUNT(*) FROM foo; +count +----- + 100 +(1 row) step s2c: COMMIT; diff --git a/src/test/isolation/expected/serializable-parallel-3.out b/src/test/isolation/expected/serializable-parallel-3.out new file mode 100644 index 00000000000..654276a3856 --- /dev/null +++ b/src/test/isolation/expected/serializable-parallel-3.out @@ -0,0 +1,97 @@ +Parsed test spec with 4 sessions + +starting permutation: s1r s3r s2r1 s4r1 s1c s2r2 s3c s4r2 s4c s2c +step s1r: SELECT * FROM foo; + a +-- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +10 +(10 rows) + +step s3r: SELECT * FROM foo; + a +-- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +10 +(10 rows) + +step s2r1: SELECT * FROM foo; + a +-- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +10 +(10 rows) + +step s4r1: SELECT * FROM foo; + a +-- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +10 +(10 rows) + +step s1c: COMMIT; +step s2r2: SELECT * FROM foo; + a +-- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +10 +(10 rows) + +step s3c: COMMIT; +step s4r2: SELECT * FROM foo; + a +-- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +10 +(10 rows) + +step s4c: COMMIT; +step s2c: COMMIT; diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 9122028e15d..3f12f923c02 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -146,6 +146,7 @@ test: plpgsql-toast test: truncate-conflict #test: serializable-parallel #test: serializable-parallel-2 +#test: serializable-parallel-3 #test: prepared-transactions diff --git a/src/test/isolation/specs/serializable-parallel-2.spec b/src/test/isolation/specs/serializable-parallel-2.spec index f3941f78631..c975d96d772 100644 --- a/src/test/isolation/specs/serializable-parallel-2.spec +++ b/src/test/isolation/specs/serializable-parallel-2.spec @@ -3,7 +3,8 @@ setup { - CREATE TABLE foo AS SELECT generate_series(1, 10)::int a; + CREATE TABLE foo AS SELECT generate_series(1, 100)::int a; + CREATE INDEX ON foo(a); ALTER TABLE foo SET (parallel_workers = 2); } @@ -14,7 +15,7 @@ teardown session s1 setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; } -step s1r { SELECT * FROM foo; } +step s1r { SELECT COUNT(*) FROM foo; } step s1c { COMMIT; } session s2 @@ -22,9 +23,12 @@ setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY; SET parallel_setup_cost = 0; SET parallel_tuple_cost = 0; + SET min_parallel_index_scan_size = 0; + SET parallel_leader_participation = off; + SET enable_seqscan = off; } -step s2r1 { SELECT * FROM foo; } -step s2r2 { 
SELECT * FROM foo; }
+step s2r1 { SELECT COUNT(*) FROM foo; }
+step s2r2 { SELECT COUNT(*) FROM foo; }
 step s2c { COMMIT; }
 
 permutation s1r s2r1 s1c s2r2 s2c
diff --git a/src/test/isolation/specs/serializable-parallel-3.spec b/src/test/isolation/specs/serializable-parallel-3.spec
new file mode 100644
index 00000000000..c27298c24ff
--- /dev/null
+++ b/src/test/isolation/specs/serializable-parallel-3.spec
@@ -0,0 +1,47 @@
+# Exercise the case where a read-only serializable transaction has
+# SXACT_FLAG_RO_SAFE set in a parallel query.  This variant is like
+# two copies of #2 running at the same time, and exercises the case
+# where another transaction has the same xmin, and it is the oldest.
+
+setup
+{
+	CREATE TABLE foo AS SELECT generate_series(1, 10)::int a;
+	ALTER TABLE foo SET (parallel_workers = 2);
+}
+
+teardown
+{
+	DROP TABLE foo;
+}
+
+session s1
+setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; }
+step s1r { SELECT * FROM foo; }
+step s1c { COMMIT; }
+
+session s2
+setup {
+	BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY;
+	SET parallel_setup_cost = 0;
+	SET parallel_tuple_cost = 0;
+	}
+step s2r1 { SELECT * FROM foo; }
+step s2r2 { SELECT * FROM foo; }
+step s2c { COMMIT; }
+
+session s3
+setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; }
+step s3r { SELECT * FROM foo; }
+step s3c { COMMIT; }
+
+session s4
+setup {
+	BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY;
+	SET parallel_setup_cost = 0;
+	SET parallel_tuple_cost = 0;
+	}
+step s4r1 { SELECT * FROM foo; }
+step s4r2 { SELECT * FROM foo; }
+step s4c { COMMIT; }
+
+permutation s1r s3r s2r1 s4r1 s1c s2r2 s3c s4r2 s4c s2c

From 617de9776fef27342afb4626fab3d49457cfd5c2 Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Tue, 7 Mar 2023 18:21:37 -0500
Subject: [PATCH 17/61] Fix more bugs caused by adding columns to the end of a
 view.

If a view is defined atop another view, and then CREATE OR REPLACE
VIEW is used to add columns to the lower view, then when the upper
view's referencing RTE is expanded by ApplyRetrieveRule we will have
a subquery RTE with fewer eref->colnames than output columns.  This
confuses various code that assumes those lists are always in sync, as
they are in plain parser output.

We have seen such problems before (cf commit d5b760ecb), and now I
think the time has come to do what was speculated about in that
commit: let's make ApplyRetrieveRule synthesize some column names to
preserve the invariant that holds in parser output.  Otherwise we'll
be chasing this class of bugs indefinitely.  Moreover, it appears from
testing that this actually gives us better results in the test case
d5b760ecb added, and likely in other corner cases that we lack
coverage for.

In HEAD, I replaced d5b760ecb's hack to make expandRTE exit early with
an elog(ERROR) call, since the case is now presumably unreachable.
But it seems like changing that in back branches would bring more risk
than benefit, so there I just updated the comment.

Per bug #17811 from Alexander Lakhin.  Back-patch to all supported
branches.
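
For illustration, the problematic pattern looks like this (a sketch
only; the regression tests added below are the authoritative cases):

    CREATE TABLE base (a int);
    CREATE VIEW lower_v AS SELECT a FROM base;
    CREATE VIEW upper_v AS SELECT * FROM lower_v;
    -- widen the lower view; upper_v's stored query still lists one column
    CREATE OR REPLACE VIEW lower_v AS SELECT a, a + 1 AS b FROM base;
    -- expanding upper_v now produces a subquery RTE with more output
    -- columns than eref->colnames entries
    SELECT * FROM upper_v;
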
Discussion: https://postgr.es/m/17811-d31686b78f0dffc9@postgresql.org
---
 .../src/test/regress/expected/alter_table.out | 14 +++---
 src/backend/parser/parse_relation.c           | 17 ++++---
 src/backend/rewrite/rewriteHandler.c          | 16 +++++++
 src/test/regress/expected/alter_table.out     | 45 +++++++++++++++----
 src/test/regress/sql/alter_table.sql          | 20 ++++++++-
 .../expected/alter_table.out                  | 12 ++---
 6 files changed, 96 insertions(+), 28 deletions(-)

diff --git a/contrib/pax_storage/src/test/regress/expected/alter_table.out b/contrib/pax_storage/src/test/regress/expected/alter_table.out
index fdc79c03c82..8d90e20a98d 100644
--- a/contrib/pax_storage/src/test/regress/expected/alter_table.out
+++ b/contrib/pax_storage/src/test/regress/expected/alter_table.out
@@ -2628,20 +2628,20 @@ View definition:
   FROM at_view_1 v1;
 
 explain (verbose, costs off) select * from at_view_2;
-                              QUERY PLAN                               
-----------------------------------------------------------------------
+                            QUERY PLAN                             
+-------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)
-   Output: bt.id, bt.stuff, (to_json(ROW(bt.id, bt.stuff, NULL)))
+   Output: bt.id, bt.stuff, (to_json(ROW(bt.id, bt.stuff, 4)))
    ->  Seq Scan on public.at_base_table bt
-         Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, NULL))
+         Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, 4))
  Optimizer: Postgres query optimizer
  Settings: constraint_exclusion=partition
 (6 rows)
 
 select * from at_view_2;
- id | stuff  |                   j                    
-----+--------+----------------------------------------
- 23 | skidoo | {"id":23,"stuff":"skidoo","more":null}
+ id | stuff  |                  j                  
+----+--------+-------------------------------------
+ 23 | skidoo | {"id":23,"stuff":"skidoo","more":4}
 (1 row)
 
 drop view at_view_2;
diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c
index dfe348c1f40..8e158d63dc1 100644
--- a/src/backend/parser/parse_relation.c
+++ b/src/backend/parser/parse_relation.c
@@ -2944,12 +2944,17 @@ expandRTE(RangeTblEntry *rte, int rtindex, int sublevels_up,
 				Assert(varattno == te->resno);
 
 				/*
-				 * In scenarios where columns have been added to a view
-				 * since the outer query was originally parsed, there can
-				 * be more items in the subquery tlist than the outer
-				 * query expects.  We should ignore such extra column(s)
-				 * --- compare the behavior for composite-returning
-				 * functions, in the RTE_FUNCTION case below.
+				 * In a just-parsed subquery RTE, rte->eref->colnames
+				 * should always have exactly as many entries as the
+				 * subquery has non-junk output columns.  However, if the
+				 * subquery RTE was created by expansion of a view,
+				 * perhaps the subquery tlist could now have more entries
+				 * than existed when the outer query was parsed.  Such
+				 * cases should now be prevented because ApplyRetrieveRule
+				 * will extend the colnames list to match.  But out of
+				 * caution, we'll keep the code like this in the back
+				 * branches: just ignore any columns that lack colnames
+				 * entries. 
*/ if (!aliasp_item) break; diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 5fe8136ab9a..3ca8bfff2fb 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -28,6 +28,7 @@ #include "catalog/dependency.h" #include "catalog/pg_type.h" #include "commands/trigger.h" +#include "executor/executor.h" #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -1801,6 +1802,7 @@ ApplyRetrieveRule(Query *parsetree, RangeTblEntry *rte, *subrte; RowMarkClause *rc; + int numCols; if (list_length(rule->actions) != 1) elog(ERROR, "expected just one rule action"); @@ -1960,6 +1962,20 @@ ApplyRetrieveRule(Query *parsetree, rte->updatedCols = NULL; rte->extraUpdatedCols = NULL; + /* + * Since we allow CREATE OR REPLACE VIEW to add columns to a view, the + * rule_action might emit more columns than we expected when the current + * query was parsed. Various places expect rte->eref->colnames to be + * consistent with the non-junk output columns of the subquery, so patch + * things up if necessary by adding some dummy column names. + */ + numCols = ExecCleanTargetListLength(rule_action->targetList); + while (list_length(rte->eref->colnames) < numCols) + { + rte->eref->colnames = lappend(rte->eref->colnames, + makeString(pstrdup("?column?"))); + } + return parsetree; } diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index d29bcc0da6d..cbc578e3586 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -2644,26 +2644,55 @@ View definition: FROM at_view_1 v1; explain (verbose, costs off) select * from at_view_2; - QUERY PLAN ----------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) - Output: bt.id, bt.stuff, (to_json(ROW(bt.id, bt.stuff, NULL))) + Output: bt.id, bt.stuff, (to_json(ROW(bt.id, bt.stuff, 4))) -> Seq Scan on public.at_base_table bt - Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, NULL)) + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, 4)) Optimizer: Postgres query optimizer Settings: constraint_exclusion=partition (6 rows) select * from at_view_2; - id | stuff | j -----+--------+---------------------------------------- - 23 | skidoo | {"id":23,"stuff":"skidoo","more":null} + id | stuff | j +----+--------+------------------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo","more":4} (1 row) drop view at_view_2; drop view at_view_1; drop table at_base_table; --- check adding a column not iself requiring a rewrite, together with +-- related case (bug #17811) +begin; +create temp table t1 as select * from int8_tbl; +create temp view v1 as select 1::int8 as q1; +create temp view v2 as select * from v1; +create or replace temp view v1 with (security_barrier = true) + as select * from t1; +create temp table log (q1 int8, q2 int8); +create rule v1_upd_rule as on update to v1 + do also insert into log values (new.*); +update v2 set q1 = q1 + 1 where q1 = 123; +select * from t1; + q1 | q2 +------------------+------------------- + 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 + 124 | 456 + 124 | 4567890123456789 +(5 rows) + +select * from log; + q1 | q2 +-----+------------------ + 124 | 456 + 124 | 4567890123456789 +(2 rows) + +rollback; +-- check adding a column not itself requiring a 
rewrite, together with -- a column requiring a default (bug #16038) -- ensure that rewrites aren't silently optimized away, removing the -- value of the test diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 7ddf9f898a8..9da0e5603ea 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1671,7 +1671,25 @@ drop view at_view_2; drop view at_view_1; drop table at_base_table; --- check adding a column not iself requiring a rewrite, together with +-- related case (bug #17811) +begin; +create temp table t1 as select * from int8_tbl; +create temp view v1 as select 1::int8 as q1; +create temp view v2 as select * from v1; +create or replace temp view v1 with (security_barrier = true) + as select * from t1; + +create temp table log (q1 int8, q2 int8); +create rule v1_upd_rule as on update to v1 + do also insert into log values (new.*); + +update v2 set q1 = q1 + 1 where q1 = 123; + +select * from t1; +select * from log; +rollback; + +-- check adding a column not itself requiring a rewrite, together with -- a column requiring a default (bug #16038) -- ensure that rewrites aren't silently optimized away, removing the diff --git a/src/test/singlenode_regress/expected/alter_table.out b/src/test/singlenode_regress/expected/alter_table.out index 16d6768bbed..f28310d8bce 100644 --- a/src/test/singlenode_regress/expected/alter_table.out +++ b/src/test/singlenode_regress/expected/alter_table.out @@ -2575,18 +2575,18 @@ View definition: FROM at_view_1 v1; explain (verbose, costs off) select * from at_view_2; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------- Seq Scan on public.at_base_table bt - Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, NULL)) + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, 4)) Settings: constraint_exclusion = 'partition' Optimizer: Postgres query optimizer (4 rows) select * from at_view_2; - id | stuff | j -----+--------+---------------------------------------- - 23 | skidoo | {"id":23,"stuff":"skidoo","more":null} + id | stuff | j +----+--------+------------------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo","more":4} (1 row) drop view at_view_2; From a523db9b4bbc2e397cba65860f22c0ee0dd5161e Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 7 Mar 2023 21:36:49 -0800 Subject: [PATCH 18/61] Fix corruption due to vacuum_defer_cleanup_age underflowing 64bit xids When vacuum_defer_cleanup_age is bigger than the current xid, including the epoch, the subtraction of vacuum_defer_cleanup_age would lead to a wrapped around xid. While that normally is not a problem, the subsequent conversion to a 64bit xid results in a 64bit-xid very far into the future. As that xid is used as a horizon to detect whether rows versions are old enough to be removed, that allows removal of rows that are still visible (i.e. corruption). If vacuum_defer_cleanup_age was never changed from the default, there is no chance of this bug occurring. This bug was introduced in dc7420c2c92. A lesser version of it exists in 12-13, introduced by fb5344c969a, affecting only GiST. The 12-13 version of the issue can, in rare cases, lead to pages in a gist index getting recycled too early, potentially causing index entries to be found multiple times. The fix is fairly simple - don't allow vacuum_defer_cleanup_age to retreat further than FirstNormalTransactionId. 
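
To sketch the failure mode (hypothetical numbers, assuming a freshly
initdb'd cluster whose xid counter is still small):

    SELECT pg_current_xact_id();  -- say, 730 in epoch 0
    ALTER SYSTEM SET vacuum_defer_cleanup_age = 1000000;
    SELECT pg_reload_conf();
    -- horizon computations now subtract 1000000 from xid 730; the
    -- result wraps around in 32bit xid space, and the conversion to a
    -- 64bit xid lands very far in the future, so VACUUM may remove row
    -- versions that are still visible
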
Patches to make similar bugs easier to find, by adding asserts to the 64bit xid infrastructure, have been proposed, but are not suitable for backpatching. Currently there are no tests for vacuum_defer_cleanup_age. A patch introducing infrastructure to make writing a test easier has been posted to the list. Reported-by: Michail Nikolaev Reviewed-by: Matthias van de Meent Author: Andres Freund Discussion: https://postgr.es/m/20230108002923.cyoser3ttmt63bfn@awork3.anarazel.de Backpatch: 12-, but impact/fix is smaller for 12-13 --- src/backend/storage/ipc/procarray.c | 87 ++++++++++++++++++++++++----- 1 file changed, 74 insertions(+), 13 deletions(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 3c7701e15b6..208874f793e 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -387,6 +387,9 @@ static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId l static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); static void MaintainLatestCompletedXid(TransactionId latestXid); static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); +static void TransactionIdRetreatSafely(TransactionId *xid, + int retreat_by, + FullTransactionId rel); static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, TransactionId xid); @@ -1995,17 +1998,35 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h, bool updateGlobalVis) * so guc.c should limit it to no more than the xidStopLimit threshold * in varsup.c. Also note that we intentionally don't apply * vacuum_defer_cleanup_age on standby servers. + * + * Need to use TransactionIdRetreatSafely() instead of open-coding the + * subtraction, to prevent creating an xid before + * FirstNormalTransactionId. */ - h->oldest_considered_running = - TransactionIdRetreatedBy(h->oldest_considered_running, - vacuum_defer_cleanup_age); - h->shared_oldest_nonremovable = - TransactionIdRetreatedBy(h->shared_oldest_nonremovable, - vacuum_defer_cleanup_age); - h->data_oldest_nonremovable = - TransactionIdRetreatedBy(h->data_oldest_nonremovable, - vacuum_defer_cleanup_age); - /* defer doesn't apply to temp relations */ + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->shared_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->data_oldest_nonremovable)); + + if (vacuum_defer_cleanup_age > 0) + { + TransactionIdRetreatSafely(&h->oldest_considered_running, + vacuum_defer_cleanup_age, + h->latest_completed); + TransactionIdRetreatSafely(&h->shared_oldest_nonremovable, + vacuum_defer_cleanup_age, + h->latest_completed); + TransactionIdRetreatSafely(&h->data_oldest_nonremovable, + vacuum_defer_cleanup_age, + h->latest_completed); + /* defer doesn't apply to temp relations */ + + + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->shared_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->data_oldest_nonremovable)); + } } /* @@ -3329,8 +3350,10 @@ GetSnapshotData(Snapshot snapshot, DtxContext distributedTransactionContext) oldestfxid = FullXidRelativeTo(latest_completed, oldestxid); /* apply vacuum_defer_cleanup_age */ - def_vis_xid_data = - TransactionIdRetreatedBy(globalxmin, vacuum_defer_cleanup_age); + def_vis_xid_data = globalxmin; + TransactionIdRetreatSafely(&def_vis_xid_data, + vacuum_defer_cleanup_age, + oldestfxid); /* Check whether there's a replication slot requiring an older xmin. 
*/ def_vis_xid_data = @@ -5360,6 +5383,44 @@ GlobalVisCheckRemovableXid(Relation rel, TransactionId xid) return GlobalVisTestIsRemovableXid(state, xid); } +/* + * Safely retract *xid by retreat_by, store the result in *xid. + * + * Need to be careful to prevent *xid from retreating below + * FirstNormalTransactionId during epoch 0. This is important to prevent + * generating xids that cannot be converted to a FullTransactionId without + * wrapping around. + * + * If retreat_by would lead to a too old xid, FirstNormalTransactionId is + * returned instead. + */ +static void +TransactionIdRetreatSafely(TransactionId *xid, int retreat_by, FullTransactionId rel) +{ + TransactionId original_xid = *xid; + FullTransactionId fxid; + uint64 fxid_i; + + Assert(TransactionIdIsNormal(original_xid)); + Assert(retreat_by >= 0); /* relevant GUCs are stored as ints */ + AssertTransactionIdInAllowableRange(original_xid); + + if (retreat_by == 0) + return; + + fxid = FullXidRelativeTo(rel, original_xid); + fxid_i = U64FromFullTransactionId(fxid); + + if ((fxid_i - FirstNormalTransactionId) <= retreat_by) + *xid = FirstNormalTransactionId; + else + { + *xid = TransactionIdRetreatedBy(original_xid, retreat_by); + Assert(TransactionIdIsNormal(*xid)); + Assert(NormalTransactionIdPrecedes(*xid, original_xid)); + } +} + /* * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel). @@ -6681,4 +6742,4 @@ LoopBackendProc(BackendProcCallbackFunction func, void *args) (*func)(proc, args); } LWLockRelease(ProcArrayLock); -} \ No newline at end of file +} From 65416007d93ba8395eb1e85b98dab2bfa22ee394 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Thu, 9 Mar 2023 16:33:24 +1300 Subject: [PATCH 19/61] Fix race in SERIALIZABLE READ ONLY. Commit bdaabb9b started skipping doomed transactions when building the list of possible conflicts for SERIALIZABLE READ ONLY. That makes sense, because doomed transactions won't commit, but a couple of subtle things broke: 1. If all uncommitted r/w transactions are doomed, a READ ONLY transaction would arbitrarily not benefit from the safe snapshot optimization. It would not be taken immediately, and yet no other transaction would set SXACT_FLAG_RO_SAFE later. 2. In the same circumstances but with DEFERRABLE, GetSafeSnapshot() would correctly exit its wait loop without sleeping and then take the optimization in non-assert builds, but assert builds would fail a sanity check that SXACT_FLAG_RO_SAFE had been set by another transaction. This is similar to the case for PredXact->WritableSxactCount == 0. We should opt out immediately if our possibleUnsafeConflicts list is empty after filtering. The code to maintain the serializable global xmin is moved down below the new opt out site, because otherwise we'd have to reverse its effects before returning. Back-patch to all supported releases. Bug #17368. 
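
For reference, the wait loop affected by (2) is the one entered by
DEFERRABLE read-only transactions (illustrative session only):

    BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
    -- GetSafeSnapshot() blocks here until a safe snapshot is available;
    -- with this fix it opts out immediately when every concurrent
    -- read-write serializable transaction is already doomed
    SELECT count(*) FROM pg_class;
    COMMIT;
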
Reported-by: Alexander Lakhin Discussion: https://postgr.es/m/17116-d6ca217acc180e30%40postgresql.org Discussion: https://postgr.es/m/20110707212159.GF76634%40csail.mit.edu --- src/backend/storage/lmgr/predicate.c | 49 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 3b0daf723b6..ec6f26a72ba 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -1846,24 +1846,6 @@ GetSerializableTransactionSnapshotInt(Snapshot snapshot, return snapshot; } - /* Maintain serializable global xmin info. */ - if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) - { - Assert(PredXact->SxactGlobalXminCount == 0); - PredXact->SxactGlobalXmin = snapshot->xmin; - PredXact->SxactGlobalXminCount = 1; - SerialSetActiveSerXmin(snapshot->xmin); - } - else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin)) - { - Assert(PredXact->SxactGlobalXminCount > 0); - PredXact->SxactGlobalXminCount++; - } - else - { - Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin)); - } - /* Initialize the structure. */ sxact->vxid = vxid; sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo; @@ -1900,6 +1882,19 @@ GetSerializableTransactionSnapshotInt(Snapshot snapshot, SetPossibleUnsafeConflict(sxact, othersxact); } } + + /* + * If we didn't find any possibly unsafe conflicts because every + * uncommitted writable transaction turned out to be doomed, then we + * can "opt out" immediately. See comments above the earlier check for + * PredXact->WritableSxactCount == 0. + */ + if (SHMQueueEmpty(&sxact->possibleUnsafeConflicts)) + { + ReleasePredXact(sxact); + LWLockRelease(SerializableXactHashLock); + return snapshot; + } } else { @@ -1908,6 +1903,24 @@ GetSerializableTransactionSnapshotInt(Snapshot snapshot, (MaxBackends + max_prepared_xacts)); } + /* Maintain serializable global xmin info. */ + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount == 0); + PredXact->SxactGlobalXmin = snapshot->xmin; + PredXact->SxactGlobalXminCount = 1; + SerialSetActiveSerXmin(snapshot->xmin); + } + else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount > 0); + PredXact->SxactGlobalXminCount++; + } + else + { + Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin)); + } + MySerializableXact = sxact; MyXactDidWrite = false; /* haven't written anything yet */ From de3bedc5aa9db42861a8569f4374b3f763e0cfd4 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 11 Mar 2023 12:15:41 -0500 Subject: [PATCH 20/61] Fix misbehavior in contrib/pg_trgm with an unsatisfiable regex. If the regex compiler can see that a regex is unsatisfiable (for example, '$foo') then it may emit an NFA having no arcs. pg_trgm's packGraph function did the wrong thing in this case; it would access off the end of a work array, and with bad luck could produce a corrupted output data structure causing more problems later. This could end with wrong answers or crashes in queries using a pg_trgm GIN or GiST index with such a regex. Fix by not trying to de-duplicate if there aren't at least 2 arcs. Per bug #17830 from Alexander Lakhin. Back-patch to all supported branches. 
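
An unsatisfiable pattern is easy to construct, since '$' only matches
at the end of the string (a sketch mirroring the new regression test;
table and index names are made up):

    CREATE EXTENSION pg_trgm;
    CREATE TABLE docs (t text);
    CREATE INDEX docs_t_trgm ON docs USING gin (t gin_trgm_ops);
    -- '$foo' can never match, so the compiled NFA has no arcs;
    -- packGraph() must cope instead of reading past its work array
    SELECT * FROM docs WHERE t ~ '$foo';
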
Discussion: https://postgr.es/m/17830-57ff5f89bdb02b09@postgresql.org --- contrib/pg_trgm/expected/pg_word_trgm.out | 6 ++++++ contrib/pg_trgm/sql/pg_word_trgm.sql | 3 +++ contrib/pg_trgm/trgm_regexp.c | 26 ++++++++++++++--------- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/contrib/pg_trgm/expected/pg_word_trgm.out b/contrib/pg_trgm/expected/pg_word_trgm.out index 9f0ca502a6d..4c6b49934b6 100644 --- a/contrib/pg_trgm/expected/pg_word_trgm.out +++ b/contrib/pg_trgm/expected/pg_word_trgm.out @@ -1048,3 +1048,9 @@ select t,word_similarity('Kabankala',t) as sml from test_trgm2 where t %> 'Kaban Waikala | 0.3 (89 rows) +-- test unsatisfiable pattern +select * from test_trgm2 where t ~ '.*$x'; + t +--- +(0 rows) + diff --git a/contrib/pg_trgm/sql/pg_word_trgm.sql b/contrib/pg_trgm/sql/pg_word_trgm.sql index d9fa1c55e5e..d2ada49133a 100644 --- a/contrib/pg_trgm/sql/pg_word_trgm.sql +++ b/contrib/pg_trgm/sql/pg_word_trgm.sql @@ -43,3 +43,6 @@ select t,word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <% t select t,word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <% t order by sml desc, t; select t,word_similarity('Baykal',t) as sml from test_trgm2 where t %> 'Baykal' order by sml desc, t; select t,word_similarity('Kabankala',t) as sml from test_trgm2 where t %> 'Kabankala' order by sml desc, t; + +-- test unsatisfiable pattern +select * from test_trgm2 where t ~ '.*$x'; diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 71e4ebee4e9..3485a725cde 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -1944,9 +1944,7 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) arcsCount; HASH_SEQ_STATUS scan_status; TrgmState *state; - TrgmPackArcInfo *arcs, - *p1, - *p2; + TrgmPackArcInfo *arcs; TrgmPackedArc *packedArcs; TrgmPackedGraph *result; int i, @@ -2018,17 +2016,25 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp); /* We could have duplicates because states were merged. Remove them. */ - /* p1 is probe point, p2 is last known non-duplicate. */ - p2 = arcs; - for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++) + if (arcIndex > 1) { - if (packArcInfoCmp(p1, p2) > 0) + /* p1 is probe point, p2 is last known non-duplicate. */ + TrgmPackArcInfo *p1, + *p2; + + p2 = arcs; + for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++) { - p2++; - *p2 = *p1; + if (packArcInfoCmp(p1, p2) > 0) + { + p2++; + *p2 = *p1; + } } + arcsCount = (p2 - arcs) + 1; } - arcsCount = (p2 - arcs) + 1; + else + arcsCount = arcIndex; /* Create packed representation */ result = (TrgmPackedGraph *) From 1b0fd4024745246123b76c273240fbf5aabc09f5 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sat, 11 Mar 2023 14:12:51 -0800 Subject: [PATCH 21/61] amcheck: Fix ordering bug in update_cached_xid_range() The initialization order in update_cached_xid_range() was wrong, calling FullTransactionIdFromXidAndCtx() before setting ->next_xid. FullTransactionIdFromXidAndCtx() uses ->next_xid. In most situations this will not cause visible issues, because the next call to update_cached_xid_range() will use a less wrong ->next_xid. It's rare that xids advance fast enough for this to be a problem. Found while adding more asserts to the 64bit xid infrastructure. 
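
For reference, the affected code is reached through amcheck's heap
checking entry point (illustrative; the relation name is made up):

    CREATE EXTENSION amcheck;
    -- verify_heapam() refreshes the cached xid range via
    -- update_cached_xid_range() while scanning the relation
    SELECT * FROM verify_heapam('some_table');
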
Reviewed-by: Mark Dilger
Discussion: https://postgr.es/m/20230108002923.cyoser3ttmt63bfn@awork3.anarazel.de
Backpatch: 14-, where heapam verification was introduced
---
 contrib/amcheck/verify_heapam.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c
index 9abeca607d7..6614b609e83 100644
--- a/contrib/amcheck/verify_heapam.c
+++ b/contrib/amcheck/verify_heapam.c
@@ -1579,6 +1579,9 @@ FullTransactionIdFromXidAndCtx(TransactionId xid, const HeapCheckContext *ctx)
 {
 	uint32		epoch;
 
+	Assert(TransactionIdIsNormal(ctx->next_xid));
+	Assert(FullTransactionIdIsNormal(ctx->next_fxid));
+
 	if (!TransactionIdIsNormal(xid))
 		return FullTransactionIdFromEpochAndXid(0, xid);
 	epoch = EpochFromFullTransactionId(ctx->next_fxid);
@@ -1600,8 +1603,8 @@ update_cached_xid_range(HeapCheckContext *ctx)
 	LWLockRelease(XidGenLock);
 
 	/* And compute alternate versions of the same */
-	ctx->oldest_fxid = FullTransactionIdFromXidAndCtx(ctx->oldest_xid, ctx);
 	ctx->next_xid = XidFromFullTransactionId(ctx->next_fxid);
+	ctx->oldest_fxid = FullTransactionIdFromXidAndCtx(ctx->oldest_xid, ctx);
 }
 
 /*

From f6784db9564cd67758e527a598c2bf55fec1ae4c Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Sat, 11 Mar 2023 14:12:51 -0800
Subject: [PATCH 22/61] amcheck: Fix FullTransactionIdFromXidAndCtx() for xids
 before epoch 0

64bit xids can't represent xids before epoch 0 (see also be504a3e974).
When FullTransactionIdFromXidAndCtx() was passed such an xid, it'd
create a 64bit xid far into the future.  Noticed while adding
assertions in the course of investigating be504a3e974, as amcheck's
tests create such xids.

To fix the issue, just return FirstNormalFullTransactionId in this
case.  A freshly initdb'd cluster already has a newer horizon.  The
most minimal version of this would make the messages for some detected
corruptions differently inaccurate.  To make those cases accurate,
switch FullTransactionIdFromXidAndCtx() to use the 32bit modulo
difference between xid and nextxid to compute the 64bit xid, yielding
sensible "in the future" / "in the past" answers.

Reviewed-by: Mark Dilger
Discussion: https://postgr.es/m/20230108002923.cyoser3ttmt63bfn@awork3.anarazel.de
Backpatch: 14-, where heapam verification was introduced
---
 contrib/amcheck/verify_heapam.c           | 33 +++++++++++++++++++----
 src/bin/pg_amcheck/t/004_verify_heapam.pl | 30 ++++++++++++++-------
 2 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c
index 6614b609e83..f7964b78173 100644
--- a/contrib/amcheck/verify_heapam.c
+++ b/contrib/amcheck/verify_heapam.c
@@ -1577,17 +1577,40 @@ check_tuple(HeapCheckContext *ctx)
 static FullTransactionId
 FullTransactionIdFromXidAndCtx(TransactionId xid, const HeapCheckContext *ctx)
 {
-	uint32		epoch;
+	uint64		nextfxid_i;
+	int32		diff;
+	FullTransactionId fxid;
 
 	Assert(TransactionIdIsNormal(ctx->next_xid));
 	Assert(FullTransactionIdIsNormal(ctx->next_fxid));
+	Assert(XidFromFullTransactionId(ctx->next_fxid) == ctx->next_xid);
 
 	if (!TransactionIdIsNormal(xid))
 		return FullTransactionIdFromEpochAndXid(0, xid);
-	epoch = EpochFromFullTransactionId(ctx->next_fxid);
-	if (xid > ctx->next_xid)
-		epoch--;
-	return FullTransactionIdFromEpochAndXid(epoch, xid);
+
+	nextfxid_i = U64FromFullTransactionId(ctx->next_fxid);
+
+	/* compute the 32bit modulo difference */
+	diff = (int32) (ctx->next_xid - xid);
+
+	/*
+	 * In cases of corruption we might see a 32bit xid that is before epoch
+	 * 0. 
We can't represent that as a 64bit xid, due to 64bit xids being + * unsigned integers, without the modulo arithmetic of 32bit xid. There's + * no really nice way to deal with that, but it works ok enough to use + * FirstNormalFullTransactionId in that case, as a freshly initdb'd + * cluster already has a newer horizon. + */ + if (diff > 0 && (nextfxid_i - FirstNormalTransactionId) < (int64) diff) + { + Assert(EpochFromFullTransactionId(ctx->next_fxid) == 0); + fxid = FirstNormalFullTransactionId; + } + else + fxid = FullTransactionIdFromU64(nextfxid_i - diff); + + Assert(FullTransactionIdIsNormal(fxid)); + return fxid; } /* diff --git a/src/bin/pg_amcheck/t/004_verify_heapam.pl b/src/bin/pg_amcheck/t/004_verify_heapam.pl index b603efad929..4cadb837730 100644 --- a/src/bin/pg_amcheck/t/004_verify_heapam.pl +++ b/src/bin/pg_amcheck/t/004_verify_heapam.pl @@ -217,7 +217,7 @@ sub write_tuple my $relpath = "$pgdata/$rel"; # Insert data and freeze public.test -use constant ROWCOUNT => 16; +use constant ROWCOUNT => 17; $node->safe_psql( 'postgres', qq( INSERT INTO public.test (a, b, c) @@ -296,7 +296,7 @@ sub write_tuple $node->start; # Ok, Xids and page layout look ok. We can run corruption tests. -plan tests => 19; +plan tests => 20; # Check that pg_amcheck runs against the uncorrupted table without error. $node->command_ok( @@ -379,23 +379,24 @@ sub header elsif ($offnum == 3) { # Corruptly set xmin < datfrozenxid, further back, noting circularity - # of xid comparison. For a new cluster with epoch = 0, the corrupt - # xmin will be interpreted as in the future - $tup->{t_xmin} = 4026531839; + # of xid comparison. + my $xmin = 4026531839; + $tup->{t_xmin} = $xmin; $tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED; $tup->{t_infomask} &= ~HEAP_XMIN_INVALID; push @expected, - qr/${$header}xmin 4026531839 equals or exceeds next valid transaction ID 0:\d+/; + qr/${$header}xmin ${xmin} precedes oldest valid transaction ID 0:\d+/; } elsif ($offnum == 4) { # Corruptly set xmax < relminmxid; - $tup->{t_xmax} = 4026531839; + my $xmax = 4026531839; + $tup->{t_xmax} = $xmax; $tup->{t_infomask} &= ~HEAP_XMAX_INVALID; push @expected, - qr/${$header}xmax 4026531839 equals or exceeds next valid transaction ID 0:\d+/; + qr/${$header}xmax ${xmax} precedes oldest valid transaction ID 0:\d+/; } elsif ($offnum == 5) { @@ -503,7 +504,7 @@ sub header push @expected, qr/${header}multitransaction ID 4 equals or exceeds next valid multitransaction ID 1/; } - elsif ($offnum == 15) # Last offnum must equal ROWCOUNT + elsif ($offnum == 15) { # Set both HEAP_XMAX_COMMITTED and HEAP_XMAX_IS_MULTI $tup->{t_infomask} |= HEAP_XMAX_COMMITTED; @@ -513,6 +514,17 @@ sub header push @expected, qr/${header}multitransaction ID 4000000000 precedes relation minimum multitransaction ID threshold 1/; } + elsif ($offnum == 16) # Last offnum must equal ROWCOUNT + { + # Corruptly set xmin > next_xid to be in the future. + my $xmin = 123456; + $tup->{t_xmin} = $xmin; + $tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED; + $tup->{t_infomask} &= ~HEAP_XMIN_INVALID; + + push @expected, + qr/${$header}xmin ${xmin} equals or exceeds next valid transaction ID 0:\d+/; + } write_tuple($file, $offset, $tup); } close($file) From 567ccb2fe27cf283072ac44c35c56bf834363744 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Sun, 12 Mar 2023 09:00:32 -0400 Subject: [PATCH 23/61] Mark unsafe_tests module as not runnable with installcheck This was an omission in the original creation of the module. Also slightly adjust some wording to avoid a double "is". 
Backpatch the non-meson piece of this to release 12, where the module was introduced. Discussion: https://postgr.es/m/be869e1c-8e3f-4cde-8609-212c899cccf9@dunslane.net --- src/test/modules/unsafe_tests/Makefile | 3 +++ src/test/modules/unsafe_tests/README | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/test/modules/unsafe_tests/Makefile b/src/test/modules/unsafe_tests/Makefile index 3ecf5fcfc5b..1d989007bd5 100644 --- a/src/test/modules/unsafe_tests/Makefile +++ b/src/test/modules/unsafe_tests/Makefile @@ -2,6 +2,9 @@ REGRESS = rolenames alter_system_table +# the whole point of these tests is to not run installcheck +NO_INSTALLCHECK = 1 + ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) diff --git a/src/test/modules/unsafe_tests/README b/src/test/modules/unsafe_tests/README index a7e5b2a04f5..d9dbd038b95 100644 --- a/src/test/modules/unsafe_tests/README +++ b/src/test/modules/unsafe_tests/README @@ -1,6 +1,6 @@ This directory doesn't actually contain any extension module. -What it is is a home for regression tests that we don't want to run +Instead it is a home for regression tests that we don't want to run during "make installcheck" because they could have side-effects that seem undesirable for a production installation. From f13a802b6ea842cb24caed05a3a66b0963f09af2 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 13 Mar 2023 16:36:31 +0900 Subject: [PATCH 24/61] Fix inconsistent error handling for GSS encryption in PQconnectPoll() The error cases for TLS and GSS encryption were inconsistent. After TLS fails, the connection is marked as dead and follow-up calls of PQconnectPoll() would return immediately, but GSS encryption was not doing that, so the connection would still have been allowed to enter the GSS handling code. This was handled incorrectly when gssencmode was set to "require". "prefer" was working correctly, and this could not happen under "disable" as GSS encryption would not be attempted. This commit makes the error handling of GSS encryption on par with TLS portion, fixing the case of gssencmode=require. Reported-by: Jacob Champion Author: Michael Paquier Reviewed-by: Jacob Champion, Stephen Frost Discussion: https://postgr.es/m/23787477-5fe1-a161-6d2a-e459f74c4713@timescale.com Backpatch-through: 12 --- src/interfaces/libpq/fe-connect.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 7f3dfd462a6..46e8540004e 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -3323,17 +3323,22 @@ PQconnectPoll(PGconn *conn) conn->status = CONNECTION_MADE; return PGRES_POLLING_WRITING; } - else if (pollres == PGRES_POLLING_FAILED && - conn->gssencmode[0] == 'p') + else if (pollres == PGRES_POLLING_FAILED) { - /* - * We failed, but we can retry on "prefer". Have to drop - * the current connection to do so, though. - */ - conn->try_gss = false; - need_new_connection = true; - goto keep_going; + if (conn->gssencmode[0] == 'p') + { + /* + * We failed, but we can retry on "prefer". Have to + * drop the current connection to do so, though. 
+ */ + conn->try_gss = false; + need_new_connection = true; + goto keep_going; + } + /* Else it's a hard failure */ + goto error_return; } + /* Else, return POLLING_READING or POLLING_WRITING status */ return pollres; #else /* !ENABLE_GSS */ /* unreachable */ From a1490b540a3dbe8097b8e2c4551037b2f8da439c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 13 Mar 2023 15:19:00 -0400 Subject: [PATCH 25/61] Fix JSON error reporting for many cases of erroneous string values. The majority of error exit cases in json_lex_string() failed to set lex->token_terminator, causing problems for the error context reporting code: it would see token_terminator less than token_start and do something more or less nuts. In v14 and up the end result could be as bad as a crash in report_json_context(). Older versions accidentally avoided that fate; but all versions produce error context lines that are far less useful than intended, because they'd stop at the end of the prior token instead of continuing to where the actually-bad input is. To fix, invent some macros that make it less notationally painful to do the right thing. Also add documentation about what the function is actually required to do; and in >= v14, add an assertion in report_json_context about token_terminator being sufficiently far advanced. Per report from Nikolay Shaplov. Back-patch to all supported versions. Discussion: https://postgr.es/m/7332649.x5DLKWyVIX@thinkpad-pgpro --- src/backend/utils/adt/jsonfuncs.c | 1 + src/common/jsonapi.c | 77 +++++++++++-------- src/test/regress/expected/json_encoding.out | 24 +++--- src/test/regress/expected/json_encoding_1.out | 24 +++--- 4 files changed, 72 insertions(+), 54 deletions(-) diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index b342c81f27b..f6a074aa7d0 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -656,6 +656,7 @@ report_json_context(JsonLexContext *lex) line_start = lex->line_start; context_start = line_start; context_end = lex->token_terminator; + Assert(context_end >= context_start); /* Advance until we are close enough to context_end */ while (context_end - context_start >= 50) diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c index ade13aed3a4..3d0fbfa7be1 100644 --- a/src/common/jsonapi.c +++ b/src/common/jsonapi.c @@ -675,6 +675,14 @@ json_lex(JsonLexContext *lex) /* * The next token in the input stream is known to be a string; lex it. + * + * If lex->strval isn't NULL, fill it with the decoded string. + * Set lex->token_terminator to the end of the decoded input, and in + * success cases, transfer its previous value to lex->prev_token_terminator. + * Return JSON_SUCCESS or an error code. + * + * Note: be careful that all error exits advance lex->token_terminator + * to the point after the character we detected the error on. */ static inline JsonParseErrorType json_lex_string(JsonLexContext *lex) @@ -683,6 +691,19 @@ json_lex_string(JsonLexContext *lex) int len; int hi_surrogate = -1; + /* Convenience macros for error exits */ +#define FAIL_AT_CHAR_START(code) \ + do { \ + lex->token_terminator = s; \ + return code; \ + } while (0) +#define FAIL_AT_CHAR_END(code) \ + do { \ + lex->token_terminator = \ + s + pg_encoding_mblen_bounded(lex->input_encoding, s); \ + return code; \ + } while (0) + if (lex->strval != NULL) resetStringInfo(lex->strval); @@ -695,18 +716,14 @@ json_lex_string(JsonLexContext *lex) len++; /* Premature end of the string. 
*/ if (len >= lex->input_length) - { - lex->token_terminator = s; - return JSON_INVALID_TOKEN; - } + FAIL_AT_CHAR_START(JSON_INVALID_TOKEN); else if (*s == '"') break; else if ((unsigned char) *s < 32) { /* Per RFC4627, these characters MUST be escaped. */ /* Since *s isn't printable, exclude it from the context string */ - lex->token_terminator = s; - return JSON_ESCAPING_REQUIRED; + FAIL_AT_CHAR_START(JSON_ESCAPING_REQUIRED); } else if (*s == '\\') { @@ -714,10 +731,7 @@ json_lex_string(JsonLexContext *lex) s++; len++; if (len >= lex->input_length) - { - lex->token_terminator = s; - return JSON_INVALID_TOKEN; - } + FAIL_AT_CHAR_START(JSON_INVALID_TOKEN); else if (*s == 'u') { int i; @@ -728,10 +742,7 @@ json_lex_string(JsonLexContext *lex) s++; len++; if (len >= lex->input_length) - { - lex->token_terminator = s; - return JSON_INVALID_TOKEN; - } + FAIL_AT_CHAR_START(JSON_INVALID_TOKEN); else if (*s >= '0' && *s <= '9') ch = (ch * 16) + (*s - '0'); else if (*s >= 'a' && *s <= 'f') @@ -739,10 +750,7 @@ json_lex_string(JsonLexContext *lex) else if (*s >= 'A' && *s <= 'F') ch = (ch * 16) + (*s - 'A') + 10; else - { - lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s); - return JSON_UNICODE_ESCAPE_FORMAT; - } + FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT); } if (lex->strval != NULL) { @@ -752,20 +760,20 @@ json_lex_string(JsonLexContext *lex) if (is_utf16_surrogate_first(ch)) { if (hi_surrogate != -1) - return JSON_UNICODE_HIGH_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE); hi_surrogate = ch; continue; } else if (is_utf16_surrogate_second(ch)) { if (hi_surrogate == -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); ch = surrogate_pair_to_codepoint(hi_surrogate, ch); hi_surrogate = -1; } if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); /* * Reject invalid cases. We can't have a value above @@ -775,7 +783,7 @@ json_lex_string(JsonLexContext *lex) if (ch == 0) { /* We can't allow this, since our TEXT type doesn't */ - return JSON_UNICODE_CODE_POINT_ZERO; + FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO); } /* @@ -812,14 +820,14 @@ json_lex_string(JsonLexContext *lex) appendStringInfoChar(lex->strval, (char) ch); } else - return JSON_UNICODE_HIGH_ESCAPE; + FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE); #endif /* FRONTEND */ } } else if (lex->strval != NULL) { if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); switch (*s) { @@ -844,10 +852,14 @@ json_lex_string(JsonLexContext *lex) appendStringInfoChar(lex->strval, '\t'); break; default: - /* Not a valid string escape, so signal error. */ + + /* + * Not a valid string escape, so signal error. We + * adjust token_start so that just the escape sequence + * is reported, not the whole string. + */ lex->token_start = s; - lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s); - return JSON_ESCAPING_INVALID; + FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID); } } else if (strchr("\"\\/bfnrt", *s) == NULL) @@ -860,15 +872,14 @@ json_lex_string(JsonLexContext *lex) * shown it's not a performance win. 
*/ lex->token_start = s; - lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s); - return JSON_ESCAPING_INVALID; + FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID); } } else if (lex->strval != NULL) { if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); appendStringInfoChar(lex->strval, *s); } @@ -876,12 +887,18 @@ json_lex_string(JsonLexContext *lex) } if (hi_surrogate != -1) + { + lex->token_terminator = s + 1; return JSON_UNICODE_LOW_SURROGATE; + } /* Hooray, we found the end of the string! */ lex->prev_token_terminator = lex->token_terminator; lex->token_terminator = s + 1; return JSON_SUCCESS; + +#undef FAIL_AT_CHAR_START +#undef FAIL_AT_CHAR_END } /* diff --git a/src/test/regress/expected/json_encoding.out b/src/test/regress/expected/json_encoding.out index f343f74fe18..fa41b401030 100644 --- a/src/test/regress/expected/json_encoding.out +++ b/src/test/regress/expected/json_encoding.out @@ -56,19 +56,19 @@ select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8; select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... --handling of simple unicode escapes select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -121,7 +121,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape; select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails; ERROR: unsupported Unicode escape sequence DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- @@ -159,7 +159,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT '"\u0000"'::jsonb; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\u0000... -- use octet_length here so we don't get an odd unicode char in the -- output SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK @@ -180,25 +180,25 @@ ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; ^ DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... 
SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... -- handling of simple unicode escapes SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -223,7 +223,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape; not_an_escape ------------------------------ @@ -253,7 +253,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai... ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- diff --git a/src/test/regress/expected/json_encoding_1.out b/src/test/regress/expected/json_encoding_1.out index e2fc131b0fa..938f8e24aaf 100644 --- a/src/test/regress/expected/json_encoding_1.out +++ b/src/test/regress/expected/json_encoding_1.out @@ -52,19 +52,19 @@ ERROR: conversion between UTF8 and SQL_ASCII is not supported select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... 
--handling of simple unicode escapes select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -113,7 +113,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape; select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails; ERROR: unsupported Unicode escape sequence DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- @@ -151,7 +151,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT '"\u0000"'::jsonb; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\u0000... -- use octet_length here so we don't get an odd unicode char in the -- output SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK @@ -168,25 +168,25 @@ ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; ^ DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... -- handling of simple unicode escapes SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; ERROR: conversion between UTF8 and SQL_ASCII is not supported @@ -209,7 +209,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape; not_an_escape ------------------------------ @@ -237,7 +237,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai... ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- From b8ac6460e7d3a5372c01ad4faf1c83214b0cb283 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 14 Mar 2023 11:10:45 -0400 Subject: [PATCH 26/61] Remove unnecessary code in dependency_is_compatible_expression(). Scanning the expression for compatible Vars isn't really necessary, because the subsequent match against StatisticExtInfo entries will eliminate expressions containing other Vars just fine. 
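(For context, the clauses in question come from extended statistics defined
on expressions, e.g. a hypothetical

    CREATE STATISTICS s (dependencies) ON (lower(a)), b FROM t;

where a clause such as lower(a) = 'x' must match the statistics object's
expression list anyway; that later matching step is what makes the separate
Var scan redundant.)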
Moreover, this code hadn't stopped to think about what to do with PlaceHolderVars or Aggrefs in the clause; and at least for the PHV case, that demonstrably leads to failures. Rather than work out whether it's reasonable to ignore those, let's just remove the whole stanza. Per report from Richard Guo. Back-patch to v14 where this code was added. Discussion: https://postgr.es/m/CAMbWs48Mmvm-acGevXuwpB=g5JMqVSL6i9z5UaJyLGJqa-XPAA@mail.gmail.com --- src/backend/statistics/dependencies.c | 28 +++------------------------ 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c index 8b3d560cb20..06d930f9164 100644 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -1170,13 +1170,12 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses, * Determines if the expression is compatible with functional dependencies * * Similar to dependency_is_compatible_clause, but doesn't enforce that the - * expression is a simple Var. OTOH we check that there's at least one - * statistics object matching the expression. + * expression is a simple Var. On success, return the matching statistics + * expression into *expr. */ static bool dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr) { - List *vars; ListCell *lc, *lc2; Node *clause_expr; @@ -1324,29 +1323,8 @@ dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, N if (IsA(clause_expr, RelabelType)) clause_expr = (Node *) ((RelabelType *) clause_expr)->arg; - vars = pull_var_clause(clause_expr, 0); - - foreach(lc, vars) - { - Var *var = (Var *) lfirst(lc); - - /* Ensure Var is from the correct relation */ - if (var->varno != relid) - return false; - - /* We also better ensure the Var is from the current level */ - if (var->varlevelsup != 0) - return false; - - /* Also ignore system attributes (we don't allow stats on those) */ - if (!AttrNumberIsForUserDefinedAttr(var->varattno)) - return false; - } - /* - * Check if we actually have a matching statistics for the expression. - * - * XXX Maybe this is an overkill. We'll eliminate the expressions later. + * Search for a matching statistics expression. */ foreach(lc, statlist) { From 99266f720a97ca76b0b085b921f68294da98c760 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 14 Mar 2023 19:17:31 -0400 Subject: [PATCH 27/61] Fix corner case bug in numeric to_char() some more. The band-aid applied in commit f0bedf3e4 turns out to still need some work: it made sure we didn't set Np->last_relevant too small (to the left of the decimal point), but it didn't prevent setting it too large (off the end of the partially-converted string). This could result in fetching data beyond the end of the allocated space, which with very bad luck could cause a SIGSEGV, though I don't see any hazard of interesting memory disclosure. Per bug #17839 from Thiago Nunes. The bug's pretty ancient, so back-patch to all supported versions. 
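To illustrate with the case added to the regression tests: the format below
asks for far more digits than the converted float8 value can supply, so the
digit string is shorter than the format implies, and last_relevant must be
clamped to the string's actual end:

    SELECT to_char('12345678901'::float8,
                   'FM9999999999D9999900000000000000000');
         to_char
    -----------------
     ##########.####

(The eleven-digit integer part overflows the ten '9' positions, hence the
'#' overflow markers; before the fix this query could read past the end of
the converted string.)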
Discussion: https://postgr.es/m/17839-aada50db24d7b0da@postgresql.org --- src/backend/utils/adt/formatting.c | 11 +++++++++-- src/test/regress/expected/numeric.out | 6 ++++++ src/test/regress/sql/numeric.sql | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 7d58e28cd1e..a7a426c3034 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -5719,13 +5719,20 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, /* * If any '0' specifiers are present, make sure we don't strip - * those digits. + * those digits. But don't advance last_relevant beyond the last + * character of the Np->number string, which is a hazard if the + * number got shortened due to precision limitations. */ if (Np->last_relevant && Np->Num->zero_end > Np->out_pre_spaces) { + int last_zero_pos; char *last_zero; - last_zero = Np->number + (Np->Num->zero_end - Np->out_pre_spaces); + /* note that Np->number cannot be zero-length here */ + last_zero_pos = strlen(Np->number) - 1; + last_zero_pos = Min(last_zero_pos, + Np->Num->zero_end - Np->out_pre_spaces); + last_zero = Np->number + last_zero_pos; if (Np->last_relevant < last_zero) Np->last_relevant = last_zero; } diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out index bea33181bee..2f2818e001c 100644 --- a/src/test/regress/expected/numeric.out +++ b/src/test/regress/expected/numeric.out @@ -1929,6 +1929,12 @@ SELECT to_char('100'::numeric, 'FM999'); 100 (1 row) +SELECT to_char('12345678901'::float8, 'FM9999999999D9999900000000000000000'); + to_char +----------------- + ##########.#### +(1 row) + -- Check parsing of literal text in a format string SELECT to_char('100'::numeric, 'foo999'); to_char diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql index 9233c666d4b..56294da5ae9 100644 --- a/src/test/regress/sql/numeric.sql +++ b/src/test/regress/sql/numeric.sql @@ -979,6 +979,7 @@ FROM v; SELECT to_char('100'::numeric, 'FM999.9'); SELECT to_char('100'::numeric, 'FM999.'); SELECT to_char('100'::numeric, 'FM999'); +SELECT to_char('12345678901'::float8, 'FM9999999999D9999900000000000000000'); -- Check parsing of literal text in a format string SELECT to_char('100'::numeric, 'foo999'); From d657b4137b41f39ea62390ea91136e2049a4808d Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Wed, 15 Mar 2023 13:17:18 +1300 Subject: [PATCH 28/61] Fix waitpid() emulation on Windows. Our waitpid() emulation didn't prevent a PID from being recycled by the OS before the call to waitpid(). The postmaster could finish up tracking more than one child process with the same PID, and confuse them. Fix, by moving the guts of pgwin32_deadchild_callback() into waitpid(), so that resources are released synchronously. The process and PID continue to exist until we close the process handle, which only happens once we're ready to adjust our book-keeping of running children. This seems to explain a couple of failures on CI. It had never been reported before, despite the code being as old as the Windows port. Perhaps Windows started recycling PIDs more rapidly, or perhaps timing changes due to commit 7389aad6 made it more likely to break. Thanks to Alexander Lakhin for analysis and Andres Freund for tracking down the root cause. Back-patch to all supported branches. 
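In outline, the window was as follows (timing illustrative): (1) a child
exits, and the thread-pool callback posts the raw PID to the completion
queue and closes the process handle; (2) with no handle left open, Windows
may recycle that PID for a new child; (3) the postmaster's waitpid() later
dequeues the stale PID and matches it against the new child's book-keeping
entry.  Because Windows does not recycle a PID while an open handle to the
process remains, deferring CloseHandle() until waitpid() has consumed the
event closes the window.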
Reported-by: Andres Freund Discussion: https://postgr.es/m/20230208012852.bvkn2am4h4iqjogq%40awork3.anarazel.de --- src/backend/postmaster/postmaster.c | 70 ++++++++++++++++------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 8f21a90f391..aab9d97da83 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -5455,7 +5455,7 @@ internal_forkexec(int argc, char *argv[], Port *port) (errmsg_internal("could not register process for wait: error code %lu", GetLastError()))); - /* Don't close pi.hProcess here - the wait thread needs access to it */ + /* Don't close pi.hProcess here - waitpid() needs access to it */ CloseHandle(pi.hThread); @@ -7288,36 +7288,21 @@ ShmemBackendArrayRemove(Backend *bn) static pid_t waitpid(pid_t pid, int *exitstatus, int options) { + win32_deadchild_waitinfo *childinfo; + DWORD exitcode; DWORD dwd; ULONG_PTR key; OVERLAPPED *ovl; - /* - * Check if there are any dead children. If there are, return the pid of - * the first one that died. - */ - if (GetQueuedCompletionStatus(win32ChildQueue, &dwd, &key, &ovl, 0)) + /* Try to consume one win32_deadchild_waitinfo from the queue. */ + if (!GetQueuedCompletionStatus(win32ChildQueue, &dwd, &key, &ovl, 0)) { - *exitstatus = (int) key; - return dwd; + errno = EAGAIN; + return -1; } - return -1; -} - -/* - * Note! Code below executes on a thread pool! All operations must - * be thread safe! Note that elog() and friends must *not* be used. - */ -static void WINAPI -pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) -{ - win32_deadchild_waitinfo *childinfo = (win32_deadchild_waitinfo *) lpParameter; - DWORD exitcode; - - if (TimerOrWaitFired) - return; /* timeout. Should never happen, since we use - * INFINITE as timeout value. */ + childinfo = (win32_deadchild_waitinfo *) key; + pid = childinfo->procId; /* * Remove handle from wait - required even though it's set to wait only @@ -7333,13 +7318,11 @@ pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) write_stderr("could not read exit code for process\n"); exitcode = 255; } - - if (!PostQueuedCompletionStatus(win32ChildQueue, childinfo->procId, (ULONG_PTR) exitcode, NULL)) - write_stderr("could not post child completion status\n"); + *exitstatus = exitcode; /* - * Handle is per-process, so we close it here instead of in the - * originating thread + * Close the process handle. Only after this point can the PID can be + * recycled by the kernel. */ CloseHandle(childinfo->procHandle); @@ -7349,7 +7332,34 @@ pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) */ free(childinfo); - /* Queue SIGCHLD signal */ + return pid; +} + +/* + * Note! Code below executes on a thread pool! All operations must + * be thread safe! Note that elog() and friends must *not* be used. + */ +static void WINAPI +pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) +{ + /* Should never happen, since we use INFINITE as timeout value. */ + if (TimerOrWaitFired) + return; + + /* + * Post the win32_deadchild_waitinfo object for waitpid() to deal with. If + * that fails, we leak the object, but we also leak a whole process and + * get into an unrecoverable state, so there's not much point in worrying + * about that. We'd like to panic, but we can't use that infrastructure + * from this thread. 
+ */ + if (!PostQueuedCompletionStatus(win32ChildQueue, + 0, + (ULONG_PTR) lpParameter, + NULL)) + write_stderr("could not post child completion status\n"); + + /* Queue SIGCHLD signal. */ pg_queue_signal(SIGCHLD); } #endif /* WIN32 */ From 9b52c8987250009a4cc9e6f203b6e2d49154214c Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Wed, 15 Mar 2023 13:57:00 +1300 Subject: [PATCH 29/61] Fix fractional vacuum_cost_delay. Commit 4753ef37 changed vacuum_delay_point() to use the WaitLatch() API, to fix the problem that vacuum could keep running for a very long time after the postmaster died. Unfortunately, that broke commit caf626b2's support for fractional vacuum_cost_delay, which shipped in PostgreSQL 12. WaitLatch() works in whole milliseconds. For now, revert the change from commit 4753ef37, but add an explicit check for postmaster death. That's an extra system call on systems other than Linux and FreeBSD, but that overhead doesn't matter much considering that we willingly went to sleep and woke up again. (In later work, we might add higher resolution timeouts to the latch API so that we could do this with our standard programming pattern, but that wouldn't be back-patched.) Back-patch to 14, where commit 4753ef37 arrived. Reported-by: Melanie Plageman Discussion: https://postgr.es/m/CAAKRu_b-q0hXCBUCAATh0Z4Zi6UkiC0k2DFgoD3nC-r3SkR3tg%40mail.gmail.com --- src/backend/commands/vacuum.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index c113f274b81..35c9a9fdafe 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -52,6 +52,7 @@ #include "postmaster/bgworker_internals.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" +#include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procarray.h" #include "utils/acl.h" @@ -3005,11 +3006,18 @@ vacuum_delay_point(void) if (msec > VacuumCostDelay * 4) msec = VacuumCostDelay * 4; - (void) WaitLatch(MyLatch, - WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, - msec, - WAIT_EVENT_VACUUM_DELAY); - ResetLatch(MyLatch); + pgstat_report_wait_start(WAIT_EVENT_VACUUM_DELAY); + pg_usleep(msec * 1000); + pgstat_report_wait_end(); + + /* + * We don't want to ignore postmaster death during very long vacuums + * with vacuum_cost_delay configured. We can't use the usual + * WaitLatch() approach here because we want microsecond-based sleep + * durations above. + */ + if (IsUnderPostmaster && !PostmasterIsAlive()) + exit(1); VacuumCostBalance = 0; From f32a619a6dbab63930e9f350c16ee10d4d256d26 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 15 Mar 2023 12:56:10 +0900 Subject: [PATCH 30/61] Improve WIN32 port of fstat() to detect more file types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current implementation of _pgfstat64() is ineffective in detecting a terminal handle or an anonymous named pipe. This commit improves our port of fstat() to detect more efficiently such cases by relying on GetFileType(), and returning more correct data when the type found is either a FILE_TYPE_PIPE (_S_IFIFO) or a FILE_TYPE_CHAR (_S_IFCHR). This is part of a more global fix to address failures when feeding the output generated by pg_dump to pg_restore through a pipe, for example, but not all of it. We are also going to need to do something about fseek() and ftello() which are not reliable on WIN32 for the same cases where fstat() was incorrect. 
Fixing fstat() is independent of the rest, though, which is why both fixes are handled separately, and this is the first part of it. Reported-by: Daniel Watzinger Author: Daniel Watzinger, Juan José Santamaría Flecha Discussion: https://postgr.es/m/b1448cd7-871e-20e3-8398-895e2d1d3bf9@gmail.com Backpatch-through: 14 --- src/port/win32stat.c | 69 ++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/src/port/win32stat.c b/src/port/win32stat.c index 426e01f0efa..36c3b171f40 100644 --- a/src/port/win32stat.c +++ b/src/port/win32stat.c @@ -289,39 +289,66 @@ int _pgfstat64(int fileno, struct stat *buf) { HANDLE hFile = (HANDLE) _get_osfhandle(fileno); - BY_HANDLE_FILE_INFORMATION fiData; + DWORD fileType = FILE_TYPE_UNKNOWN; + DWORD lastError; + unsigned short st_mode; - if (hFile == INVALID_HANDLE_VALUE || buf == NULL) + /* + * When stdin, stdout, and stderr aren't associated with a stream the + * special value -2 is returned: + * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/get-osfhandle + */ + if (hFile == INVALID_HANDLE_VALUE || hFile == (HANDLE) -2 || buf == NULL) { errno = EINVAL; return -1; } + fileType = GetFileType(hFile); + lastError = GetLastError(); + /* - * Check if the fileno is a data stream. If so, unless it has been - * redirected to a file, getting information through its HANDLE will fail, - * so emulate its stat information in the most appropriate way and return - * it instead. + * Invoke GetLastError in order to distinguish between a "valid" return of + * FILE_TYPE_UNKNOWN and its return due to a calling error. In case of + * success, GetLastError returns NO_ERROR. */ - if ((fileno == _fileno(stdin) || - fileno == _fileno(stdout) || - fileno == _fileno(stderr)) && - !GetFileInformationByHandle(hFile, &fiData)) + if (fileType == FILE_TYPE_UNKNOWN && lastError != NO_ERROR) { - memset(buf, 0, sizeof(*buf)); - buf->st_mode = _S_IFCHR; - buf->st_dev = fileno; - buf->st_rdev = fileno; - buf->st_nlink = 1; - return 0; + _dosmaperr(lastError); + return -1; } - /* - * Since we already have a file handle there is no need to check for - * ERROR_DELETE_PENDING. - */ + switch (fileType) + { + /* The specified file is a disk file */ + case FILE_TYPE_DISK: + return fileinfo_to_stat(hFile, buf); + + /* + * The specified file is a socket, a named pipe, or an anonymous + * pipe. + */ + case FILE_TYPE_PIPE: + st_mode = _S_IFIFO; + break; + /* The specified file is a character file */ + case FILE_TYPE_CHAR: + st_mode = _S_IFCHR; + break; + /* Unused flag and unknown file type */ + case FILE_TYPE_REMOTE: + case FILE_TYPE_UNKNOWN: + default: + errno = EINVAL; + return -1; + } - return fileinfo_to_stat(hFile, buf); + memset(buf, 0, sizeof(*buf)); + buf->st_mode = st_mode; + buf->st_dev = fileno; + buf->st_rdev = fileno; + buf->st_nlink = 1; + return 0; } #endif /* WIN32 */ From fd03282bbc301767be4a7fb93d8ba1d12bb8da1d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 16 Mar 2023 16:50:56 -0400 Subject: [PATCH 31/61] Doc: mention CREATE+ATTACH PARTITION with CREATE TABLE...PARTITION OF. Clarify that ATTACH/DETACH PARTITION can be used to perform partition maintenance with less locking than straight CREATE TABLE/DROP TABLE. This was already stated in some places, but not emphasized. Back-patch to v14 where DETACH PARTITION CONCURRENTLY was added. (We had lower lock levels for ATTACH PARTITION before that, but this wording wouldn't apply.) 
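As a concrete sketch of the recommended lower-lock workflow (reusing the
manual's running "measurement" example; the partition name and bounds below
are illustrative):

    CREATE TABLE measurement_y2008m03
        (LIKE measurement INCLUDING DEFAULTS INCLUDING CONSTRAINTS);
    -- load, check, and transform the new data here, then:
    ALTER TABLE measurement ATTACH PARTITION measurement_y2008m03
        FOR VALUES FROM ('2008-03-01') TO ('2008-04-01');

The ATTACH takes only SHARE UPDATE EXCLUSIVE on "measurement", versus the
ACCESS EXCLUSIVE lock required by CREATE TABLE ... PARTITION OF.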
Justin Pryzby, reviewed by Robert Treat and Jakub Wartak; a little further wordsmithing by me Discussion: https://postgr.es/m/20220718143304.GC18011@telsasoft.com --- doc/src/sgml/ddl.sgml | 13 +++++++------ doc/src/sgml/ref/create_table.sgml | 20 ++++++++++++++++---- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index e49d19716c1..78f4448a5bc 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -3951,9 +3951,15 @@ CREATE TABLE measurement_y2008m02 PARTITION OF measurement As an alternative, it is sometimes more convenient to create the - new table outside the partition structure, and make it a proper + new table outside the partition structure, and attach it as a partition later. This allows new data to be loaded, checked, and transformed prior to it appearing in the partitioned table. + Moreover, the ATTACH PARTITION operation requires + only SHARE UPDATE EXCLUSIVE lock on the + partitioned table, as opposed to the ACCESS + EXCLUSIVE lock that is required by CREATE TABLE + ... PARTITION OF, so it is more friendly to concurrent + operations on the partitioned table. The CREATE TABLE ... LIKE option is helpful to avoid tediously repeating the parent table's definition: @@ -3973,11 +3979,6 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 - - The ATTACH PARTITION command requires taking a - SHARE UPDATE EXCLUSIVE lock on the partitioned table. - - Before running the ATTACH PARTITION command, it is recommended to create a CHECK constraint on the table to diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 4a0b6dcfcd4..3a4920c4c35 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -661,12 +661,24 @@ Where column_reference_storage_directive is: - Operations such as TRUNCATE which normally affect a table and all of its + Operations such as TRUNCATE + which normally affect a table and all of its inheritance children will cascade to all partitions, but may also be - performed on an individual partition. Note that dropping a partition - with DROP TABLE requires taking an ACCESS - EXCLUSIVE lock on the parent table. + performed on an individual partition. + + + Note that creating a partition using PARTITION OF + requires taking an ACCESS EXCLUSIVE lock on the + parent partitioned table. Likewise, dropping a partition + with DROP TABLE requires taking + an ACCESS EXCLUSIVE lock on the parent table. + It is possible to use ALTER + TABLE ATTACH/DETACH PARTITION to perform these + operations with a weaker lock, thus reducing interference with + concurrent operations on the partitioned table. + + From 8a182bb29bccc4ed98cfad07f6950c05825bdd7e Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Fri, 17 Mar 2023 09:44:42 +1300 Subject: [PATCH 32/61] Small tidyup for commit d41a178b. A comment was left behind claiming that we needed to use malloc() rather than palloc() because the corresponding free would run in another thread, but that's not true anymore. Remove that comment. And, with the reason being gone, we might as well actually use palloc(). Back-patch to supported releases, like d41a178b. 
Discussion: https://postgr.es/m/CA%2BhUKG%2BpdM9v3Jv4tc2BFx2jh_daY3uzUyAGBhtDkotEQDNPYw%40mail.gmail.com --- src/backend/postmaster/postmaster.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index aab9d97da83..6138fc32cb1 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -5430,13 +5430,10 @@ internal_forkexec(int argc, char *argv[], Port *port) /* * Queue a waiter to signal when this child dies. The wait will be handled - * automatically by an operating system thread pool. - * - * Note: use malloc instead of palloc, since it needs to be thread-safe. - * Struct will be free():d from the callback function that runs on a - * different thread. + * automatically by an operating system thread pool. The memory will be + * freed by a later call to waitpid(). */ - childinfo = malloc(sizeof(win32_deadchild_waitinfo)); + childinfo = palloc(sizeof(win32_deadchild_waitinfo)); if (!childinfo) ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), @@ -7330,7 +7327,7 @@ waitpid(pid_t pid, int *exitstatus, int options) * Free struct that was allocated before the call to * RegisterWaitForSingleObject() */ - free(childinfo); + pfree(childinfo); return pid; } From 604a6f2825a70ce35e3ba98c1ecb7bbc1191c355 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Fri, 17 Mar 2023 14:44:12 +1300 Subject: [PATCH 33/61] Small tidyup for commit d41a178b, part II. Further to commit 6a9229da, checking for NULL is now redundant. An "out of memory" error would have been thrown already by palloc() and treated as FATAL, so we can delete a few more lines. Back-patch to all releases, like those other commits. Reported-by: Tom Lane Discussion: https://postgr.es/m/4040668.1679013388%40sss.pgh.pa.us --- src/backend/postmaster/postmaster.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 6138fc32cb1..377055f6f84 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -5434,11 +5434,6 @@ internal_forkexec(int argc, char *argv[], Port *port) * freed by a later call to waitpid(). */ childinfo = palloc(sizeof(win32_deadchild_waitinfo)); - if (!childinfo) - ereport(FATAL, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - childinfo->procHandle = pi.hProcess; childinfo->procId = pi.dwProcessId; From fb5d10013f18bf494766ac3b3b273f06168b1b2e Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 16 Mar 2023 17:48:47 -0700 Subject: [PATCH 34/61] tests: Minimize syslog activity by slapd Until now the tests using slapd spammed syslog for every connection / query. Use logfile-only to prevent syslog activity. Unfortunately that only takes effect after logging the first message, but that's still much better than the prior situation. 
Discussion: https://postgr.es/m/20230311233708.3yjdbjkly2q4gq2j@awork3.anarazel.de Backpatch: 11- --- src/test/ldap/t/001_auth.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/ldap/t/001_auth.pl b/src/test/ldap/t/001_auth.pl index 0a310ccb15a..5f2ba8e5967 100644 --- a/src/test/ldap/t/001_auth.pl +++ b/src/test/ldap/t/001_auth.pl @@ -78,6 +78,7 @@ pidfile $slapd_pidfile logfile $slapd_logfile +logfile-only on access to * by * read From b95732b57bd6ce536f256a10e9fcd91d3a445ba7 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 16 Mar 2023 23:03:31 -0700 Subject: [PATCH 35/61] tests: Prevent syslog activity by slapd, take 2 Unfortunately it turns out that the logfile-only option added in b9f8d1cbad7 is only available in openldap starting in 2.6. Luckily the option to control the log level (loglevel/-s) have been around for much longer. As it turns out loglevel/-s only control what goes into syslog, not what ends up in the file specified with 'logfile' and stderr. While we currently are specifying 'logfile', nothing ends up in it, as the option only controls debug messages, and we didn't set a debug level. The debug level can only be configured on the commandline and also prevents forking. That'd require larger changes, so this commit doesn't tackle that issue. Specify the syslog level when starting slapd using -s, as that allows to prevent all syslog messages if one uses '0' instead of 'none', while loglevel doesn't prevent the first message. Discussion: https://postgr.es/m/20230311233708.3yjdbjkly2q4gq2j@awork3.anarazel.de Backpatch: 11- --- src/test/ldap/t/001_auth.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/ldap/t/001_auth.pl b/src/test/ldap/t/001_auth.pl index 5f2ba8e5967..c2f588deeb8 100644 --- a/src/test/ldap/t/001_auth.pl +++ b/src/test/ldap/t/001_auth.pl @@ -78,7 +78,6 @@ pidfile $slapd_pidfile logfile $slapd_logfile -logfile-only on access to * by * read @@ -114,7 +113,8 @@ "-CA", "$slapd_certs/ca.crt", "-CAkey", "$slapd_certs/ca.key", "-CAcreateserial", "-out", "$slapd_certs/server.crt"; -system_or_bail $slapd, '-f', $slapd_conf, '-h', "$ldap_url $ldaps_url"; +# -s0 prevents log messages ending up in syslog +system_or_bail $slapd, '-f', $slapd_conf,'-s0', '-h', "$ldap_url $ldaps_url"; END { From b00a9926ee1aeef2a5368595744bde1b4be11b42 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 17 Mar 2023 13:31:40 -0400 Subject: [PATCH 36/61] Fix pg_dump for hash partitioning on enum columns. Hash partitioning on an enum is problematic because the hash codes are derived from the OIDs assigned to the enum values, which will almost certainly be different after a dump-and-reload than they were before. This means that some rows probably end up in different partitions than before, causing restore to fail because of partition constraint violations. (pg_upgrade dodges this problem by using hacks to force the enum values to keep the same OIDs, but that's not possible nor desirable for pg_dump.) Users can work around that by specifying --load-via-partition-root, but since that's a dump-time not restore-time decision, one might find out the need for it far too late. Instead, teach pg_dump to apply that option automatically when dealing with a partitioned table that has hash-on-enum partitioning. Also deal with a pre-existing issue for --load-via-partition-root mode: in a parallel restore, we try to TRUNCATE target tables just before loading them, in order to enable some backend optimizations. 
This is bad when using --load-via-partition-root because (a) we're likely to suffer deadlocks from restore jobs trying to restore rows into other partitions than they came from, and (b) if we miss getting a deadlock we might still lose data due to a TRUNCATE removing rows from some already-completed restore job. The fix for this is conceptually simple: just don't TRUNCATE if we're dealing with a --load-via-partition-root case. The tricky bit is for pg_restore to identify those cases. In dumps using COPY commands we can inspect each COPY command to see if it targets the nominal target table or some ancestor. However, in dumps using INSERT commands it's pretty impractical to examine the INSERTs in advance. To provide a solution for that going forward, modify pg_dump to mark TABLE DATA items that are using --load-via-partition-root with a comment. (This change also responds to a complaint from Robert Haas that the dump output for --load-via-partition-root is pretty confusing.) pg_restore checks for the special comment as well as checking the COPY command if present. This will fail to identify the combination of --load-via-partition-root and --inserts in pre-existing dump files, but that should be a pretty rare case in the field. If it does happen you will probably get a deadlock failure that you can work around by not using parallel restore, which is the same as before this bug fix. Having done this, there seems no remaining reason for the alarmism in the pg_dump man page about combining --load-via-partition-root with parallel restore, so remove that warning. Patch by me; thanks to Julien Rouhaud for review. Back-patch to v11 where hash partitioning was introduced. Discussion: https://postgr.es/m/1376149.1675268279@sss.pgh.pa.us --- doc/src/sgml/ref/pg_dump.sgml | 10 ---------- src/bin/pg_dump/pg_dump.c | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index 956f97e2537..4bf68d3fbd6 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -863,16 +863,6 @@ PostgreSQL documentation and the two systems have different definitions of the collation used to sort the partitioning column. - - - It is best not to use parallelism when restoring from an archive made - with this option, because pg_restore will - not know exactly which partition(s) a given archive data item will - load data into. This could result in inefficiency due to lock - conflicts between parallel jobs, or perhaps even restore failures due - to foreign key constraints being set up before all the relevant data - is loaded. - diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 4a8f4937605..8a2095b21a3 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -8298,7 +8298,7 @@ getPartitioningInfo(Archive *fout) tbinfo = findTableByOid(tabrelid); if (tbinfo == NULL) fatal("failed sanity check, table OID %u appearing in pg_partitioned_table not found", - tabrelid); + tabrelid); tbinfo->unsafe_partitions = true; } From 9d1db9bbe0b0a0f4a3397214e1341dd56e55a287 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 18 Mar 2023 16:11:22 -0400 Subject: [PATCH 37/61] Doc: fix documentation example for bytea hex output format. 
Per report from rsindlin Discussion: https://postgr.es/m/167907221210.1803488.5939223864945604536@wrigleys.postgresql.org --- doc/src/sgml/datatype.sgml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 0e89b768c5d..692b6fe2c43 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -1438,7 +1438,12 @@ SELECT b, char_length(b) FROM test2; Example: -SELECT '\xDEADBEEF'; +SET bytea_output = 'hex'; + +SELECT '\xDEADBEEF'::bytea; + bytea +------------ + \xdeadbeef From ac91309afd52217e7349ef93006001aca37fc60a Mon Sep 17 00:00:00 2001 From: David Rowley Date: Mon, 20 Mar 2023 13:30:55 +1300 Subject: [PATCH 38/61] Fix memory leak in Memoize cache key evaluation When probing the Memoize cache to check if the current cache key values exist in the cache, we perform an evaluation of the expressions making up the cache key before probing the hash table for those values. This operation could leak memory as it is possible that the cache key is an expression which requires allocation of memory, as was the case in bug 17844. Here we fix this by correctly switching to the per tuple context before evaluating the cache expressions so that the memory is freed next time the per tuple context is reset. Bug: 17844 Reported-by: Alexey Ermakov Discussion: https://postgr.es/m/17844-d2f6f9e75a622bed@postgresql.org Backpatch-through: 14, where Memoize was introduced --- src/backend/executor/nodeMemoize.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/nodeMemoize.c b/src/backend/executor/nodeMemoize.c index b32f2469135..c078b68740b 100644 --- a/src/backend/executor/nodeMemoize.c +++ b/src/backend/executor/nodeMemoize.c @@ -289,11 +289,18 @@ prepare_probe_slot(MemoizeState *mstate, MemoizeKey *key) if (key == NULL) { + ExprContext *econtext = mstate->ss.ps.ps_ExprContext; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + /* Set the probeslot's values based on the current parameter values */ for (int i = 0; i < numKeys; i++) pslot->tts_values[i] = ExecEvalExpr(mstate->param_exprs[i], - mstate->ss.ps.ps_ExprContext, + econtext, &pslot->tts_isnull[i]); + + MemoryContextSwitchTo(oldcontext); } else { From bb73725d7d7b5b23f5eea7e74d6ac49e16c4d711 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Mon, 20 Mar 2023 09:51:50 +0100 Subject: [PATCH 39/61] Fix netmask handling in inet_minmax_multi_ops When calculating distance in brin_minmax_multi_distance_inet(), the netmask was applied incorrectly. This results in (seemingly) incorrect ordering of values, triggering an assert. For builds without asserts this is mostly harmless - we may merge other ranges, possibly resulting in slightly less efficient index. But it's still correct and the greedy algorithm doesn't guarantee optimality anyway. Backpatch to 14, where minmax-multi indexes were introduced. Reported by Dmitry Dolgov, investigation and fix by me. 
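A sketch of how the bad mask arose (reconstructed from the patch, not taken
from the report): for an address such as 127.0.0.1/0 the significant-bit
count lena is 0, so for each later byte the old code computed
nbits = lena - i*8 as a negative number and then shifted 0xFF left by
8 - nbits bits.  For the last byte of an IPv4 address that shift count
reaches 32, which is undefined behavior for a 32-bit int; on common
hardware the count wraps, the mask becomes 0xFF, and the byte is left
unmasked, so 127.0.0.1/0 does not compare as a zeroed address.  Clamping
nbits at 0 clears such bytes as intended.  The new regression test
exercises exactly this:

    CREATE TABLE brin_test_inet (a inet);
    CREATE INDEX ON brin_test_inet USING brin (a inet_minmax_multi_ops);
    INSERT INTO brin_test_inet VALUES ('127.0.0.1/0');
    INSERT INTO brin_test_inet VALUES ('0.0.0.0/12');
    DROP TABLE brin_test_inet;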
Reported-by: Dmitry Dolgov Backpatch-through: 14 Discussion: https://postgr.es/m/17774-c6f3e36dd4471e67@postgresql.org --- src/backend/access/brin/brin_minmax_multi.c | 4 ++-- src/test/regress/expected/brin_multi.out | 6 ++++++ src/test/regress/sql/brin_multi.sql | 7 +++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/backend/access/brin/brin_minmax_multi.c b/src/backend/access/brin/brin_minmax_multi.c index b343226a191..b4e50937609 100644 --- a/src/backend/access/brin/brin_minmax_multi.c +++ b/src/backend/access/brin/brin_minmax_multi.c @@ -2364,14 +2364,14 @@ brin_minmax_multi_distance_inet(PG_FUNCTION_ARGS) unsigned char mask; int nbits; - nbits = lena - (i * 8); + nbits = Max(0, lena - (i * 8)); if (nbits < 8) { mask = (0xFF << (8 - nbits)); addra[i] = (addra[i] & mask); } - nbits = lenb - (i * 8); + nbits = Max(0, lenb - (i * 8)); if (nbits < 8) { mask = (0xFF << (8 - nbits)); diff --git a/src/test/regress/expected/brin_multi.out b/src/test/regress/expected/brin_multi.out index 51277fdb887..0720a66b2ae 100644 --- a/src/test/regress/expected/brin_multi.out +++ b/src/test/regress/expected/brin_multi.out @@ -352,6 +352,12 @@ VACUUM brintest_multi; -- force a summarization cycle in brinidx insert into public.brintest_multi (float4col) values (real 'nan'); insert into public.brintest_multi (float8col) values (real 'nan'); UPDATE brintest_multi SET int8col = int8col * int4col; +-- Test handling of inet netmasks with inet_minmax_multi_ops +CREATE TABLE brin_test_inet (a inet); +CREATE INDEX ON brin_test_inet USING brin (a inet_minmax_multi_ops); +INSERT INTO brin_test_inet VALUES ('127.0.0.1/0'); +INSERT INTO brin_test_inet VALUES ('0.0.0.0/12'); +DROP TABLE brin_test_inet; -- Tests for brin_summarize_new_values SELECT brin_summarize_new_values('brintest_multi'); -- error, not an index ERROR: "brintest_multi" is not an index diff --git a/src/test/regress/sql/brin_multi.sql b/src/test/regress/sql/brin_multi.sql index 9deb8d2573d..a46c09951b5 100644 --- a/src/test/regress/sql/brin_multi.sql +++ b/src/test/regress/sql/brin_multi.sql @@ -359,6 +359,13 @@ insert into public.brintest_multi (float8col) values (real 'nan'); UPDATE brintest_multi SET int8col = int8col * int4col; +-- Test handling of inet netmasks with inet_minmax_multi_ops +CREATE TABLE brin_test_inet (a inet); +CREATE INDEX ON brin_test_inet USING brin (a inet_minmax_multi_ops); +INSERT INTO brin_test_inet VALUES ('127.0.0.1/0'); +INSERT INTO brin_test_inet VALUES ('0.0.0.0/12'); +DROP TABLE brin_test_inet; + -- Tests for brin_summarize_new_values SELECT brin_summarize_new_values('brintest_multi'); -- error, not an index SELECT brin_summarize_new_values('tenk1_unique1'); -- error, not a BRIN index From f2b217a9f0886447daf09d98d61f5f42bbf79e31 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Tue, 21 Mar 2023 14:29:34 +1300 Subject: [PATCH 40/61] Fix race in parallel hash join batch cleanup, take II. With unlucky timing and parallel_leader_participation=off (not the default), PHJ could attempt to access per-batch shared state just as it was being freed. There was code intended to prevent that by checking for a cleared pointer, but it was racy. Fix, by introducing an extra barrier phase. The new phase PHJ_BUILD_RUNNING means that it's safe to access the per-batch state to find a batch to help with, and PHJ_BUILD_DONE means that it is too late. The last to detach will free the array of per-batch state as before, but now it will also atomically advance the phase, so that late attachers can avoid the hazard. 
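Spelled out, the build barrier now advances through these phases (names per
hashjoin.h; PHJ_BUILD_ELECTING is the pre-existing phase 0):

    PHJ_BUILD_ELECTING -> PHJ_BUILD_ALLOCATING
        -> PHJ_BUILD_HASHING_INNER -> PHJ_BUILD_HASHING_OUTER
        -> PHJ_BUILD_RUNNING   (probing; per-batch state may be accessed)
        -> PHJ_BUILD_DONE      (last detacher has freed it; too late to join)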
This mirrors the way per-batch hash tables are freed (see phases PHJ_BATCH_PROBING and PHJ_BATCH_DONE). An earlier attempt to fix this (commit 3b8981b6, later reverted) missed one special case. When the inner side is empty (the "empty inner optimization), the build barrier would only make it to PHJ_BUILD_HASHING_INNER phase before workers attempted to detach from the hashtable. In that case, fast-forward the build barrier to PHJ_BUILD_RUNNING before proceeding, so that our later assertions hold and we can still negotiate who is cleaning up. Revealed by build farm failures, where BarrierAttach() failed a sanity check assertion, because the memory had been clobbered by dsa_free(). In non-assert builds, the result could be a segmentation fault. Back-patch to all supported releases. Author: Thomas Munro Author: Melanie Plageman Reported-by: Michael Paquier Reported-by: David Geier Tested-by: David Geier Discussion: https://postgr.es/m/20200929061142.GA29096%40paquier.xyz --- src/backend/executor/nodeHash.c | 50 +++++++++++++++++--------- src/backend/executor/nodeHashjoin.c | 54 ++++++++++++++++++++--------- src/include/executor/hashjoin.h | 3 +- 3 files changed, 74 insertions(+), 33 deletions(-) diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 8779d93b06f..a236f5b4819 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -436,14 +436,21 @@ MultiExecParallelHash(HashState *node) hashtable->nbuckets = pstate->nbuckets; hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); hashtable->totalTuples = pstate->total_tuples; - ExecParallelHashEnsureBatchAccessors(hashtable); + + /* + * Unless we're completely done and the batch state has been freed, make + * sure we have accessors. + */ + if (BarrierPhase(build_barrier) < PHJ_BUILD_DONE) + ExecParallelHashEnsureBatchAccessors(hashtable); /* * The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE - * case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't - * there already). + * case, which will bring the build phase to PHJ_BUILD_RUNNING (if it + * isn't there already). */ Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER || + BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING || BarrierPhase(build_barrier) == PHJ_BUILD_DONE); } @@ -752,7 +759,7 @@ ExecHashTableCreate(HashState *state, HashJoinState *hjstate, /* * The next Parallel Hash synchronization point is in * MultiExecParallelHash(), which will progress it all the way to - * PHJ_BUILD_DONE. The caller must not return control from this + * PHJ_BUILD_RUNNING. The caller must not return control from this * executor node between now and then. */ } @@ -3763,14 +3770,11 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable) } /* - * It's possible for a backend to start up very late so that the whole - * join is finished and the shm state for tracking batches has already - * been freed by ExecHashTableDetach(). In that case we'll just leave - * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives - * up early. + * We should never see a state where the batch-tracking array is freed, + * because we should have given up sooner if we join when the build + * barrier has reached the PHJ_BUILD_DONE phase. */ - if (!DsaPointerIsValid(pstate->batches)) - return; + Assert(DsaPointerIsValid(pstate->batches)); /* Use hash join memory context. 
*/ oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); @@ -3895,9 +3899,18 @@ ExecHashTableDetachBatch(HashJoinTable hashtable) void ExecHashTableDetach(HashJoinTable hashtable) { - if (hashtable->parallel_state) + ParallelHashJoinState *pstate = hashtable->parallel_state; + + /* + * If we're involved in a parallel query, we must either have gotten all + * the way to PHJ_BUILD_RUNNING, or joined too late and be in + * PHJ_BUILD_DONE. + */ + Assert(!pstate || + BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUNNING); + + if (pstate && BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUNNING) { - ParallelHashJoinState *pstate = hashtable->parallel_state; int i; /* Make sure any temporary files are closed. */ @@ -3913,17 +3926,22 @@ ExecHashTableDetach(HashJoinTable hashtable) } /* If we're last to detach, clean up shared memory. */ - if (BarrierDetach(&pstate->build_barrier)) + if (BarrierArriveAndDetach(&pstate->build_barrier)) { + /* + * Late joining processes will see this state and give up + * immediately. + */ + Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_DONE); + if (DsaPointerIsValid(pstate->batches)) { dsa_free(hashtable->area, pstate->batches); pstate->batches = InvalidDsaPointer; } } - - hashtable->parallel_state = NULL; } + hashtable->parallel_state = NULL; } /* diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 9ec70f16e31..88eaaa10cef 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -47,7 +47,8 @@ * PHJ_BUILD_ALLOCATING -- one sets up the batches and table 0 * PHJ_BUILD_HASHING_INNER -- all hash the inner rel * PHJ_BUILD_HASHING_OUTER -- (multi-batch only) all hash the outer - * PHJ_BUILD_DONE -- building done, probing can begin + * PHJ_BUILD_RUNNING -- building done, probing can begin + * PHJ_BUILD_DONE -- all work complete, one frees batches * * While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may * be used repeatedly as required to coordinate expansions in the number of @@ -75,7 +76,7 @@ * batches whenever it encounters them while scanning and probing, which it * can do because it processes batches in serial order. * - * Once PHJ_BUILD_DONE is reached, backends then split up and process + * Once PHJ_BUILD_RUNNING is reached, backends then split up and process * different batches, or gang up and work together on probing batches if there * aren't enough to go around. For each batch there is a separate barrier * with the following phases: @@ -97,11 +98,16 @@ * * To avoid deadlocks, we never wait for any barrier unless it is known that * all other backends attached to it are actively executing the node or have - * already arrived. Practically, that means that we never return a tuple - * while attached to a barrier, unless the barrier has reached its final - * state. In the slightly special case of the per-batch barrier, we return - * tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use - * BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting. + * finished. Practically, that means that we never emit a tuple while attached + * to a barrier, unless the barrier has reached a phase that means that no + * process will wait on it again. We emit tuples while attached to the build + * barrier in phase PHJ_BUILD_RUNNING, and to a per-batch barrier in phase + * PHJ_BATCH_PROBING. These are advanced to PHJ_BUILD_DONE and PHJ_BATCH_DONE + * respectively without waiting, using BarrierArriveAndDetach(). 
The last to + * detach receives a different return value so that it knows that it's safe to + * clean up. Any straggler process that attaches after that phase is reached + * will see that it's too late to participate or access the relevant shared + * memory objects. * *------------------------------------------------------------------------- */ @@ -387,7 +393,21 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) * outer relation. */ if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node)) + { + if (parallel) + { + /* + * Advance the build barrier to PHJ_BUILD_RUNNING + * before proceeding so we can negotiate resource + * cleanup. + */ + Barrier *build_barrier = ¶llel_state->build_barrier; + + while (BarrierPhase(build_barrier) < PHJ_BUILD_RUNNING) + BarrierArriveAndWait(build_barrier, 0); + } return NULL; + } /* * Prefetch JoinQual or NonJoinQual to prevent motion hazard. @@ -433,6 +453,7 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) build_barrier = ¶llel_state->build_barrier; Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER || + BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING || BarrierPhase(build_barrier) == PHJ_BUILD_DONE); if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER) { @@ -453,9 +474,18 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_HASH_OUTER); } - Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE); + else if (BarrierPhase(build_barrier) == PHJ_BUILD_DONE) + { + /* + * If we attached so late that the job is finished and + * the batch state has been freed, we can return + * immediately. + */ + return NULL; + } /* Each backend should now select a batch to work on. */ + Assert(BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING); hashtable->curbatch = -1; node->hj_JoinState = HJ_NEED_NEW_BATCH; @@ -1428,14 +1458,6 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) Barrier *batch0_barrier = NULL; ParallelHashJoinState *pstate = hashtable->parallel_state; - /* - * If we started up so late that the batch tracking array has been freed - * already by ExecHashTableDetach(), then we are finished. See also - * ExecParallelHashEnsureBatchAccessors(). - */ - if (hashtable->batches == NULL) - return false; - /* * If we were already attached to a batch, remember not to bother checking * it again, and detach from it (possibly freeing the hash table if we are diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index b1fbaacf5e9..e324e67d914 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -298,7 +298,8 @@ typedef struct ParallelHashJoinState #define PHJ_BUILD_ALLOCATING 1 #define PHJ_BUILD_HASHING_INNER 2 #define PHJ_BUILD_HASHING_OUTER 3 -#define PHJ_BUILD_DONE 4 +#define PHJ_BUILD_RUNNING 4 +#define PHJ_BUILD_DONE 5 /* The phases for probing each batch, used by for batch_barrier. */ #define PHJ_BATCH_ELECTING 0 From a58f565defe34c3eb92d0b79af257cd72ec896a6 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Tue, 21 Mar 2023 09:18:51 +0530 Subject: [PATCH 41/61] Ignore dropped columns during apply of update/delete. We fail to apply updates and deletes when the REPLICA IDENTITY FULL is used for the table having dropped columns. We didn't use to ignore dropped columns while doing tuple comparison among the tuples from the publisher and subscriber during apply of updates and deletes. 
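Concretely, the fix is to fetch the attribute metadata before doing any
per-column work, so that dropped columns can be skipped up front;
condensed from the tuples_equal() hunk below (a sketch, not the full
loop):

    for (attrnum = 0; attrnum < slot1->tts_tupleDescriptor->natts; attrnum++)
    {
        Form_pg_attribute att = TupleDescAttr(slot1->tts_tupleDescriptor,
                                              attrnum);

        /* Ignore dropped columns as the publisher doesn't send those. */
        if (att->attisdropped)
            continue;

        /* ... NULL handling and the datum comparison proceed as before ... */
    }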
Author: Onder Kalaci, Shi yu Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/CACawEhVQC9WoofunvXg12aXtbqKnEgWxoRx3+v8q32AWYsdpGg@mail.gmail.com --- src/backend/executor/execReplication.c | 10 ++++- src/test/subscription/t/100_bugs.pl | 56 +++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 8202e050ec8..093c0228c48 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -243,6 +243,14 @@ tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, Form_pg_attribute att; TypeCacheEntry *typentry; + att = TupleDescAttr(slot1->tts_tupleDescriptor, attrnum); + + /* + * Ignore dropped columns as the publisher doesn't send those + */ + if (att->attisdropped) + continue; + /* * If one value is NULL and other is not, then they are certainly not * equal @@ -256,8 +264,6 @@ tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, if (slot1->tts_isnull[attrnum] || slot2->tts_isnull[attrnum]) continue; - att = TupleDescAttr(slot1->tts_tupleDescriptor, attrnum); - typentry = eq[attrnum]; if (typentry == NULL) { diff --git a/src/test/subscription/t/100_bugs.pl b/src/test/subscription/t/100_bugs.pl index 91602c43399..b9da72eaf2a 100644 --- a/src/test/subscription/t/100_bugs.pl +++ b/src/test/subscription/t/100_bugs.pl @@ -6,7 +6,7 @@ use warnings; use PostgresNode; use TestLib; -use Test::More tests => 7; +use Test::More tests => 8; # Bug #15114 @@ -298,3 +298,57 @@ $node_publisher->stop('fast'); $node_subscriber->stop('fast'); + +# The bug was that when the REPLICA IDENTITY FULL is used with dropped columns, +# we fail to apply updates and deletes +my $node_publisher_d_cols = get_new_node('node_publisher_d_cols'); +$node_publisher_d_cols->init(allows_streaming => 'logical'); +$node_publisher_d_cols->start; + +my $node_subscriber_d_cols = get_new_node('node_subscriber_d_cols'); +$node_subscriber_d_cols->init(allows_streaming => 'logical'); +$node_subscriber_d_cols->start; + +$node_publisher_d_cols->safe_psql( + 'postgres', qq( + CREATE TABLE dropped_cols (a int, b_drop int, c int); + ALTER TABLE dropped_cols REPLICA IDENTITY FULL; + CREATE PUBLICATION pub_dropped_cols FOR TABLE dropped_cols; + -- some initial data + INSERT INTO dropped_cols VALUES (1, 1, 1); +)); + +$node_subscriber_d_cols->safe_psql( + 'postgres', qq( + CREATE TABLE dropped_cols (a int, b_drop int, c int); +)); + +my $publisher_connstr_d_cols = + $node_publisher_d_cols->connstr . 
' dbname=postgres'; +$node_subscriber_d_cols->safe_psql('postgres', + "CREATE SUBSCRIPTION sub_dropped_cols CONNECTION '$publisher_connstr_d_cols' PUBLICATION pub_dropped_cols" +); +$node_subscriber_d_cols->wait_for_subscription_sync; + +$node_publisher_d_cols->safe_psql( + 'postgres', qq( + ALTER TABLE dropped_cols DROP COLUMN b_drop; +)); +$node_subscriber_d_cols->safe_psql( + 'postgres', qq( + ALTER TABLE dropped_cols DROP COLUMN b_drop; +)); + +$node_publisher_d_cols->safe_psql( + 'postgres', qq( + UPDATE dropped_cols SET a = 100; +)); +$node_publisher_d_cols->wait_for_catchup('sub_dropped_cols'); + +is( $node_subscriber_d_cols->safe_psql( + 'postgres', "SELECT count(*) FROM dropped_cols WHERE a = 100"), + qq(1), + 'replication with RI FULL and dropped columns'); + +$node_publisher_d_cols->stop('fast'); +$node_subscriber_d_cols->stop('fast'); From 01fbce035911f49015b793978aba63d811e4a943 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 22 Mar 2023 18:32:04 +0900 Subject: [PATCH 42/61] doc: Add description of some missing monitoring functions This commit adds some documentation about two monitoring functions: - pg_stat_get_xact_blocks_fetched() - pg_stat_get_xact_blocks_hit() The description of these functions has been removed in ddfc2d9, later simplified by 5f2b089, assuming that all the functions whose descriptions were removed are used in system views. Unfortunately, some of them were are not used in any system views, so they lacked documentation. This gap exists in the docs for a long time, so backpatch all the way down. Reported-by: Michael Paquier Author: Bertrand Drouvot Reviewed-by: Kyotaro Horiguchi Discussion: https://postgr.es/m/ZBeeH5UoNkTPrwHO@paquier.xyz Backpatch-through: 11 --- doc/src/sgml/monitoring.sgml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index ce6c34d6ea1..3c56f9fa4f1 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -5051,6 +5051,34 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i + + + + pg_stat_get_xact_blocks_fetched + + pg_stat_get_xact_blocks_fetched ( oid ) + bigint + + + Returns the number of buffers fetched for table or index, in the current + transaction. + + + + + + + pg_stat_get_xact_blocks_hit + + pg_stat_get_xact_blocks_hit ( oid ) + bigint + + + Returns the number of buffer hits for table or index, in the current + transaction. + + + From 8bdea85cdb272d89d5eb8493fceecff3ce5220a0 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Thu, 23 Mar 2023 11:32:22 +0530 Subject: [PATCH 43/61] Ignore generated columns during apply of update/delete. We fail to apply updates and deletes when the REPLICA IDENTITY FULL is used for the table having generated columns. We didn't use to ignore generated columns while doing tuple comparison among the tuples from the publisher and subscriber during apply of updates and deletes. 
Author: Onder Kalaci Reviewed-by: Shi yu, Amit Kapila Backpatch-through: 12 Discussion: https://postgr.es/m/CACawEhVQC9WoofunvXg12aXtbqKnEgWxoRx3+v8q32AWYsdpGg@mail.gmail.com --- src/backend/executor/execReplication.c | 5 +++-- src/test/subscription/t/100_bugs.pl | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 093c0228c48..831699ff9f5 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -246,9 +246,10 @@ tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, att = TupleDescAttr(slot1->tts_tupleDescriptor, attrnum); /* - * Ignore dropped columns as the publisher doesn't send those + * Ignore dropped and generated columns as the publisher doesn't send + * those */ - if (att->attisdropped) + if (att->attisdropped || att->attgenerated) continue; /* diff --git a/src/test/subscription/t/100_bugs.pl b/src/test/subscription/t/100_bugs.pl index b9da72eaf2a..cce91891ab9 100644 --- a/src/test/subscription/t/100_bugs.pl +++ b/src/test/subscription/t/100_bugs.pl @@ -6,7 +6,7 @@ use warnings; use PostgresNode; use TestLib; -use Test::More tests => 8; +use Test::More tests => 9; # Bug #15114 @@ -299,8 +299,8 @@ $node_publisher->stop('fast'); $node_subscriber->stop('fast'); -# The bug was that when the REPLICA IDENTITY FULL is used with dropped columns, -# we fail to apply updates and deletes +# The bug was that when the REPLICA IDENTITY FULL is used with dropped or +# generated columns, we fail to apply updates and deletes my $node_publisher_d_cols = get_new_node('node_publisher_d_cols'); $node_publisher_d_cols->init(allows_streaming => 'logical'); $node_publisher_d_cols->start; @@ -313,14 +313,18 @@ 'postgres', qq( CREATE TABLE dropped_cols (a int, b_drop int, c int); ALTER TABLE dropped_cols REPLICA IDENTITY FULL; - CREATE PUBLICATION pub_dropped_cols FOR TABLE dropped_cols; + CREATE TABLE generated_cols (a int, b_gen int GENERATED ALWAYS AS (5 * a) STORED, c int); + ALTER TABLE generated_cols REPLICA IDENTITY FULL; + CREATE PUBLICATION pub_dropped_cols FOR TABLE dropped_cols, generated_cols; -- some initial data INSERT INTO dropped_cols VALUES (1, 1, 1); + INSERT INTO generated_cols (a, c) VALUES (1, 1); )); $node_subscriber_d_cols->safe_psql( 'postgres', qq( CREATE TABLE dropped_cols (a int, b_drop int, c int); + CREATE TABLE generated_cols (a int, b_gen int GENERATED ALWAYS AS (5 * a) STORED, c int); )); my $publisher_connstr_d_cols = @@ -342,6 +346,7 @@ $node_publisher_d_cols->safe_psql( 'postgres', qq( UPDATE dropped_cols SET a = 100; + UPDATE generated_cols SET a = 100; )); $node_publisher_d_cols->wait_for_catchup('sub_dropped_cols'); @@ -350,5 +355,10 @@ qq(1), 'replication with RI FULL and dropped columns'); +is( $node_subscriber_d_cols->safe_psql( + 'postgres', "SELECT count(*) FROM generated_cols WHERE a = 100"), + qq(1), + 'replication with RI FULL and generated columns'); + $node_publisher_d_cols->stop('fast'); $node_subscriber_d_cols->stop('fast'); From 9292a84347adbd64da727e456bfa1e27e45da3a4 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 23 Mar 2023 15:29:28 -0400 Subject: [PATCH 44/61] amcheck: Fix verify_heapam for tuples where xmin or xmax is 0. In such cases, get_xid_status() doesn't set its output parameter (the third argument), so we shouldn't fall through to code which will test the value of that parameter. 
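The hazard, in miniature (a sketch of the calling pattern only; the out
parameter is of type XidCommitStatus in verify_heapam.c, and the real
switch has more arms):

    XidCommitStatus xmax_status;        /* left unset on XID_INVALID */

    switch (get_xid_status(xmax, ctx, &xmax_status))
    {
        case XID_INVALID:
            /* must return here: falling through reads garbage below */
        case XID_BOUNDS_OK:
            break;
        /* ... the remaining arms report corruption and return ... */
    }

    /* code below here inspects xmax_status, so XID_INVALID must not reach it */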
There are five existing calls to get_xid_status(), three of which seem to already handle this case properly. This commit tries to fix the other two. If we're checking xmin and find that it is invalid (i.e. 0) just report that as corruption, similar to what's already done in the three cases that seem correct. If we're checking xmax and find that's invalid, that's fine: it just means that the tuple hasn't been updated or deleted. Thanks to Andres Freund and valgrind for finding this problem, and also to Andres for having a look at the patch. This bug seems to go all the way back to where verify_heapam was first introduced, but wasn't detected until recently, possibly because of the new test cases added for update chain verification. Back-patch to v14, where this code showed up. Discussion: http://postgr.es/m/CA+TgmoZAYzQZqyUparXy_ks3OEOfLD9-bEXt8N-2tS1qghX9gQ@mail.gmail.com --- contrib/amcheck/verify_heapam.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c index f7964b78173..cc49ccb26f6 100644 --- a/contrib/amcheck/verify_heapam.c +++ b/contrib/amcheck/verify_heapam.c @@ -775,6 +775,9 @@ check_tuple_visibility(HeapCheckContext *ctx) switch (get_xid_status(xmin, ctx, &xmin_status)) { case XID_INVALID: + report_corruption(ctx, + pstrdup("xmin is invalid")); + return false; case XID_BOUNDS_OK: break; case XID_IN_FUTURE: @@ -1110,6 +1113,9 @@ check_tuple_visibility(HeapCheckContext *ctx) xmax = HeapTupleHeaderGetRawXmax(tuphdr); switch (get_xid_status(xmax, ctx, &xmax_status)) { + case XID_INVALID: + ctx->tuple_could_be_pruned = false; + return true; case XID_IN_FUTURE: report_corruption(ctx, psprintf("xmax %u equals or exceeds next valid transaction ID %u:%u", @@ -1132,7 +1138,6 @@ check_tuple_visibility(HeapCheckContext *ctx) XidFromFullTransactionId(ctx->oldest_fxid))); return false; /* corrupt */ case XID_BOUNDS_OK: - case XID_INVALID: break; } From 9d63ac48f3e5a872d778215f687b8b67c30385aa Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 26 Mar 2023 13:41:06 -0400 Subject: [PATCH 45/61] Fix oversights in array manipulation. The nested-arrays code path in ExecEvalArrayExpr() used palloc to allocate the result array, whereas every other array-creating function has used palloc0 since 18c0b4ecc. This mostly works, but unused bits past the end of the nulls bitmap may end up undefined. That causes valgrind complaints with -DWRITE_READ_PARSE_PLAN_TREES, and could cause planner misbehavior as cited in 18c0b4ecc. There seems no very good reason why we should strive to avoid palloc0 in just this one case, so fix it the easy way with s/palloc/palloc0/. While looking at that I noted that we also failed to check for overflow of "nbytes" and "nitems" while summing the sizes of the sub-arrays, potentially allowing a crash due to undersized output allocation. For "nbytes", follow the policy used by other array-munging code of checking for overflow after each addition. (As elsewhere, the last addition of the array's overhead space doesn't need an extra check, since palloc itself will catch a value between 1Gb and 2Gb.) For "nitems", there's no very good reason to sum the inputs at all, since we can perfectly well use ArrayGetNItems' result instead of ignoring it. Per discussion of this bug, also remove redundant zeroing of the nulls bitmap in array_set_element and array_set_slice. Patch by Alexander Lakhin and myself, per bug #17858 from Alexander Lakhin; thanks also to Richard Guo. 
These bugs are a dozen years old, so back-patch to all supported branches. Discussion: https://postgr.es/m/17858-8fd287fd3663d051@postgresql.org --- src/backend/executor/execExprInterp.c | 13 +++++++++---- src/backend/utils/adt/arrayfuncs.c | 6 ++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 7729117381f..56874a3f78a 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -2817,7 +2817,7 @@ ExecEvalArrayExpr(ExprState *state, ExprEvalStep *op) { /* Must be nested array expressions */ int nbytes = 0; - int nitems = 0; + int nitems; int outer_nelems = 0; int elem_ndims = 0; int *elem_dims = NULL; @@ -2912,9 +2912,14 @@ ExecEvalArrayExpr(ExprState *state, ExprEvalStep *op) subbitmaps[outer_nelems] = ARR_NULLBITMAP(array); subbytes[outer_nelems] = ARR_SIZE(array) - ARR_DATA_OFFSET(array); nbytes += subbytes[outer_nelems]; + /* check for overflow of total request */ + if (!AllocSizeIsValid(nbytes)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("array size exceeds the maximum allowed (%d)", + (int) MaxAllocSize))); subnitems[outer_nelems] = ArrayGetNItems(this_ndims, ARR_DIMS(array)); - nitems += subnitems[outer_nelems]; havenulls |= ARR_HASNULL(array); outer_nelems++; } @@ -2948,7 +2953,7 @@ ExecEvalArrayExpr(ExprState *state, ExprEvalStep *op) } /* check for subscript overflow */ - (void) ArrayGetNItems(ndims, dims); + nitems = ArrayGetNItems(ndims, dims); ArrayCheckBounds(ndims, dims, lbs); if (havenulls) @@ -2962,7 +2967,7 @@ ExecEvalArrayExpr(ExprState *state, ExprEvalStep *op) nbytes += ARR_OVERHEAD_NONULLS(ndims); } - result = (ArrayType *) palloc(nbytes); + result = (ArrayType *) palloc0(nbytes); SET_VARSIZE(result, nbytes); result->ndim = ndims; result->dataoffset = dataoffset; diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index 6d00be72b16..cf6ab1d8408 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -2478,8 +2478,7 @@ array_set_element(Datum arraydatum, { bits8 *newnullbitmap = ARR_NULLBITMAP(newarray); - /* Zero the bitmap to take care of marking inserted positions null */ - MemSet(newnullbitmap, 0, (newnitems + 7) / 8); + /* palloc0 above already marked any inserted positions as nulls */ /* Fix the inserted value */ if (addedafter) array_set_isnull(newnullbitmap, newnitems - 1, isNull); @@ -3127,8 +3126,7 @@ array_set_slice(Datum arraydatum, bits8 *newnullbitmap = ARR_NULLBITMAP(newarray); bits8 *oldnullbitmap = ARR_NULLBITMAP(array); - /* Zero the bitmap to handle marking inserted positions null */ - MemSet(newnullbitmap, 0, (nitems + 7) / 8); + /* palloc0 above already marked any inserted positions as nulls */ array_bitmap_copy(newnullbitmap, addedbefore, oldnullbitmap, 0, itemsbefore); From 19f54ea2a3a2885235d2220f1c98ad15d876c5cd Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 27 Mar 2023 15:04:02 -0400 Subject: [PATCH 46/61] Reject attempts to alter composite types used in indexes. find_composite_type_dependencies() ignored indexes, which is a poor decision because an expression index could have a stored column of a composite (or other container) type even when the underlying table does not. Teach it to detect such cases and error out. We have to work a bit harder than for other relations because the pg_depend entry won't identify the specific index column of concern, but it's not much new code. 
This does not address bug #17872's original complaint that dropping a column in such a type might lead to violations of the uniqueness property that a unique index is supposed to ensure. That seems of much less concern to me because it won't lead to crashes. Per bug #17872 from Alexander Lakhin. Back-patch to all supported branches. Discussion: https://postgr.es/m/17872-d0fbb799dc3fd85d@postgresql.org --- src/backend/commands/tablecmds.c | 57 +++++++++++++++++++---- src/test/regress/expected/alter_table.out | 10 +++- src/test/regress/sql/alter_table.sql | 11 ++++- 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 29a7a57e681..9bdebc6e29b 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8100,6 +8100,7 @@ find_composite_type_dependencies(Oid typeOid, Relation origRelation, { Form_pg_depend pg_depend = (Form_pg_depend) GETSTRUCT(depTup); Relation rel; + TupleDesc tupleDesc; Form_pg_attribute att; /* Check for directly dependent types */ @@ -8116,18 +8117,58 @@ find_composite_type_dependencies(Oid typeOid, Relation origRelation, continue; } - /* Else, ignore dependees that aren't user columns of relations */ - /* (we assume system columns are never of interesting types) */ - if (pg_depend->classid != RelationRelationId || - pg_depend->objsubid <= 0) + /* Else, ignore dependees that aren't relations */ + if (pg_depend->classid != RelationRelationId) continue; rel = relation_open(pg_depend->objid, AccessShareLock); - att = TupleDescAttr(rel->rd_att, pg_depend->objsubid - 1); + tupleDesc = RelationGetDescr(rel); - if (rel->rd_rel->relkind == RELKIND_RELATION || - rel->rd_rel->relkind == RELKIND_MATVIEW || - rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + /* + * If objsubid identifies a specific column, refer to that in error + * messages. Otherwise, search to see if there's a user column of the + * type. (We assume system columns are never of interesting types.) + * The search is needed because an index containing an expression + * column of the target type will just be recorded as a whole-relation + * dependency. If we do not find a column of the type, the dependency + * must indicate that the type is transiently referenced in an index + * expression but not stored on disk, which we assume is OK, just as + * we do for references in views. (It could also be that the target + * type is embedded in some container type that is stored in an index + * column, but the previous recursion should catch such cases.) + */ + if (pg_depend->objsubid > 0 && pg_depend->objsubid <= tupleDesc->natts) + att = TupleDescAttr(tupleDesc, pg_depend->objsubid - 1); + else + { + att = NULL; + for (int attno = 1; attno <= tupleDesc->natts; attno++) + { + att = TupleDescAttr(tupleDesc, attno - 1); + if (att->atttypid == typeOid && !att->attisdropped) + break; + att = NULL; + } + if (att == NULL) + { + /* No such column, so assume OK */ + relation_close(rel, AccessShareLock); + continue; + } + } + + /* + * We definitely should reject if the relation has storage. If it's + * partitioned, then perhaps we don't have to reject: if there are + * partitions then we'll fail when we find one, else there is no + * stored data to worry about. However, it's possible that the type + * change would affect conclusions about whether the type is sortable + * or hashable and thus (if it's a partitioning column) break the + * partitioning rule. For now, reject for partitioned rels too. 
+ */ + if (RELKIND_HAS_STORAGE(rel->rd_rel->relkind) || + rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || + rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) { if (origTypeName) ereport(ERROR, diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index cbc578e3586..9b203cbc4ec 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3198,6 +3198,13 @@ CREATE TYPE test_type1 AS (a int, b text); CREATE TABLE test_tbl1 (x int, y test_type1); ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails ERROR: cannot alter type "test_type1" because column "test_tbl1.y" uses it +DROP TABLE test_tbl1; +CREATE TABLE test_tbl1 (x int, y text); +CREATE INDEX test_tbl1_idx ON test_tbl1((row(x,y)::test_type1)); +ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type1" because column "test_tbl1_idx.row" uses it +DROP TABLE test_tbl1; +DROP TYPE test_type1; CREATE TYPE test_type2 AS (a int, b text); CREATE TABLE test_tbl2 OF test_type2; CREATE TABLE test_tbl2_subclass () INHERITS (test_tbl2); @@ -3315,7 +3322,8 @@ Distributed by: (aa) Inherits: test_tbl2 Distributed by: (aa) -DROP TABLE test_tbl2_subclass; +DROP TABLE test_tbl2_subclass, test_tbl2; +DROP TYPE test_type2; CREATE TYPE test_typex AS (a int, b text); CREATE TABLE test_tblx (x int, y test_typex check ((y).a > 0)); ALTER TYPE test_typex DROP ATTRIBUTE a; -- fails diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 9da0e5603ea..1c6fd62a117 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2019,6 +2019,14 @@ CREATE TYPE test_type1 AS (a int, b text); CREATE TABLE test_tbl1 (x int, y test_type1); ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails +DROP TABLE test_tbl1; +CREATE TABLE test_tbl1 (x int, y text); +CREATE INDEX test_tbl1_idx ON test_tbl1((row(x,y)::test_type1)); +ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails + +DROP TABLE test_tbl1; +DROP TYPE test_type1; + CREATE TYPE test_type2 AS (a int, b text); CREATE TABLE test_tbl2 OF test_type2; CREATE TABLE test_tbl2_subclass () INHERITS (test_tbl2); @@ -2046,7 +2054,8 @@ ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa CASCADE; \d test_tbl2 \d test_tbl2_subclass -DROP TABLE test_tbl2_subclass; +DROP TABLE test_tbl2_subclass, test_tbl2; +DROP TYPE test_type2; CREATE TYPE test_typex AS (a int, b text); CREATE TABLE test_tblx (x int, y test_typex check ((y).a > 0)); From 2b155a04cbd81915c0e991f72d99c4baa430a8d5 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Mon, 27 Mar 2023 21:35:27 +0200 Subject: [PATCH 47/61] doc: Fix XML_CATALOG_FILES env var for Apple Silicon machines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Homebrew changed the prefix for Apple Silicon based machines, so our advice for XML_CATALOG_FILES needs to mention both. More info on the Homebrew change can be found at: https://github.com/Homebrew/brew/issues/9177 This is backpatch of commits 4c8d65408 and 5a91c7975, the latter which contained a small fix based on a report from Dagfinn Ilmari Mannsåker. 
Author: Julien Rouhaud Discussion: https://postgr.es/m/20230327082441.h7pa2vqiobbyo7rd@jrouhaud --- doc/src/sgml/docguide.sgml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/docguide.sgml b/doc/src/sgml/docguide.sgml index 55ef6417749..2d8d5cd64f2 100644 --- a/doc/src/sgml/docguide.sgml +++ b/doc/src/sgml/docguide.sgml @@ -209,9 +209,13 @@ brew install docbook docbook-xsl libxslt fop The Homebrew-supplied programs require the following environment variable - to be set: + to be set. For Intel based machines, use this: export XML_CATALOG_FILES=/usr/local/etc/xml/catalog + + On Apple Silicon based machines, use this: + +export XML_CATALOG_FILES=/opt/homebrew/etc/xml/catalog Without it, xsltproc will throw errors like this: From f98f014860fd0f4ffdf1ae5f3db3536225fcd68c Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 28 Mar 2023 16:16:53 -0400 Subject: [PATCH 48/61] amcheck: In verify_heapam, allows tuples with xmin 0. Commit e88754a1965c0f40a723e6e46d670cacda9e19bd caused that case to be reported as corruption, but Peter Geoghegan pointed out that it can legitimately happen in the case of a speculative insertion that aborts, so we'd better not flag it as corruption after all. Back-patch to v14, like the commit that introduced the issue. Discussion: http://postgr.es/m/CAH2-WzmEabzcPTxSY-NXKH6Qt3FkAPYHGQSe2PtvGgj17ZQkCw@mail.gmail.com --- contrib/amcheck/verify_heapam.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c index cc49ccb26f6..1cda7b8f45c 100644 --- a/contrib/amcheck/verify_heapam.c +++ b/contrib/amcheck/verify_heapam.c @@ -775,8 +775,7 @@ check_tuple_visibility(HeapCheckContext *ctx) switch (get_xid_status(xmin, ctx, &xmin_status)) { case XID_INVALID: - report_corruption(ctx, - pstrdup("xmin is invalid")); + /* Could be the result of a speculative insertion that aborted. */ return false; case XID_BOUNDS_OK: break; From cd83e8232f418d1328c6b31e9c825bb6135cd382 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 29 Mar 2023 11:31:30 -0400 Subject: [PATCH 49/61] Fix dereference of dangling pointer in GiST index buffering build. gistBuildCallback tried to fetch the size of an index tuple that might have already been freed by gistProcessEmptyingQueue. While this seems to usually be harmless in production builds, in principle it could result in a SIGSEGV, or more likely a bogus value for indtuplesSize leading to poor page-split decisions later in the build. The memory management here is confusing and could stand to be refactored, but for the moment it seems to be enough to fetch the tuple size sooner. AFAICT the indtuples[Size] totals aren't used in between these places; even if they were, the updated values shouldn't be any worse to use. So just move the incrementing of the totals up. It's not very clear why our valgrind-using buildfarm animals haven't noticed this problem, because the relevant code path does seem to be exercised according to the code coverage report. I think the reason that we didn't fix this bug after the first report is that I'd wanted to try to understand that better. However, now that it's been re-discovered let's just be pragmatic and fix it already. Original report by Alexander Lakhin (bug #16329), later rediscovered by Egor Chindyaskin (bug #17874). Patch by Alexander Lakhin (commentary by Pavel Borisov and me). Back-patch to all supported branches. 
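The shape of the fix is the usual one for this class of bug: read
everything needed from an allocation before calling anything that may
reset the memory context it lives in.  Schematically (a sketch of the
reordering, not the gistbuild.c hunk verbatim):

    IndexTuple  itup;

    itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
    itup->t_tid = *tid;                 /* itup lives in tempCxt */

    /* Capture everything we need from the tuple now ... */
    buildstate->indtuples += 1;
    buildstate->indtuplesSize += IndexTupleSize(itup);

    /*
     * ... because the buffering path below can reach
     * gistProcessEmptyingQueue(), which resets tempCxt and frees itup.
     */
    gistBufferingBuildInsert(buildstate, itup);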
Discussion: https://postgr.es/m/16329-7a6aa9b6fa1118a1@postgresql.org Discussion: https://postgr.es/m/17874-63ca6c7ce42d2103@postgresql.org --- src/backend/access/gist/gistbuild.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index d281d89000f..907cc83857f 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -844,6 +844,19 @@ gistBuildCallback(Relation index, true); itup->t_tid = *tid; + /* Update tuple count and total size. */ + buildstate->indtuples += 1; + buildstate->indtuplesSize += IndexTupleSize(itup); + + /* + * XXX In buffering builds, the tempCxt is also reset down inside + * gistProcessEmptyingQueue(). This is not great because it risks + * confusion and possible use of dangling pointers (for example, itup + * might be already freed when control returns here). It's generally + * better that a memory context be "owned" by only one function. However, + * currently this isn't causing issues so it doesn't seem worth the amount + * of refactoring that would be needed to avoid it. + */ if (buildstate->buildMode == GIST_BUFFERING_ACTIVE) { /* We have buffers, so use them. */ @@ -859,10 +872,6 @@ gistBuildCallback(Relation index, buildstate->giststate, buildstate->heaprel, true); } - /* Update tuple count and total size. */ - buildstate->indtuples += 1; - buildstate->indtuplesSize += IndexTupleSize(itup); - MemoryContextSwitchTo(oldCtx); MemoryContextReset(buildstate->giststate->tempCxt); From 4544859f732c4c98b711433f0e03e85c60a4b129 Mon Sep 17 00:00:00 2001 From: David Rowley Date: Fri, 31 Mar 2023 12:14:04 +1300 Subject: [PATCH 50/61] Fix List memory issue in transformColumnDefinition When calling generateSerialExtraStmts(), we would pass in the constraint->options. In some cases, generateSerialExtraStmts() would modify the referenced List to remove elements from it, but doing so is invalid without assigning the list back to all variables that point to it. In the particular reported problem case, the List became empty, in which cases it became NIL, but the passed in constraint->options didn't get to find out about that and was left pointing to free'd memory. To fix this, just perform a list_copy() inside generateSerialExtraStmts(). We could just do a list_copy() just before we perform the delete from the list, however, that seems less robust. Let's make sure the generated CreateSeqStmt gets a completely different copy of the list to be safe. Bug: #17879 Reported-by: Fei Changhong Diagnosed-by: Fei Changhong Discussion: https://postgr.es/m/17879-b7dfb5debee58ff5@postgresql.org Backpatch-through: 11, all supported versions --- src/backend/parser/parse_utilcmd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 223c6fcb48b..358ac56da30 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -476,6 +476,9 @@ generateSerialExtraStmts(CreateStmtContext *cxt, ColumnDef *column, int nameEl_idx = -1; + /* Make a copy of this as we may end up modifying it in the code below */ + seqoptions = list_copy(seqoptions); + /* * Determine namespace and name to use for the sequence. * From 316dc347264bb20b3f7d829f6fdedf5abd4b1769 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 31 Mar 2023 10:08:40 -0400 Subject: [PATCH 51/61] Ensure acquire_inherited_sample_rows sets its output parameters. 
The totalrows/totaldeadrows outputs were left uninitialized in cases where we find no analyzable child tables of a partitioned table. This could lead to setting the partitioned table's pg_class.reltuples value to garbage. It's not clear that that would have any very bad effects in practice, but fix it anyway because it's making valgrind unhappy. Reported and diagnosed by Alexander Lakhin (bug #17880). Discussion: https://postgr.es/m/17880-9282037c923d856e@postgresql.org --- src/backend/commands/analyze.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 00fc25b2439..8d329d44503 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -2007,6 +2007,10 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, ListCell *lc; bool has_child; + /* Initialize output parameters to zero now, in case we exit early */ + *totalrows = 0; + *totaldeadrows = 0; + /* * Find all members of inheritance set. We only need AccessShareLock on * the children. @@ -2158,8 +2162,6 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, pgstat_progress_update_param(PROGRESS_ANALYZE_CHILD_TABLES_TOTAL, nrels); numrows = 0; - *totalrows = 0; - *totaldeadrows = 0; for (i = 0; i < nrels; i++) { Relation childrel = rels[i]; From c8f8915363bb97b7183fe6f64e5552938999cbc3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 31 Mar 2023 11:18:49 -0400 Subject: [PATCH 52/61] Reject system columns as elements of foreign keys. Up through v11 it was sensible to use the "oid" system column as a foreign key column, but since that was removed there's no visible usefulness in making any of the remaining system columns a foreign key. Moreover, since the TupleTableSlot rewrites in v12, such cases actively fail because of implicit assumptions that only user columns appear in foreign keys. The lack of complaints about that seems like good evidence that no one is trying to do it. Hence, rather than trying to repair those assumptions (of which there are at least two, maybe more), let's just forbid the case up front. Per this patch, a system column in either the referenced or referencing side of a foreign key will draw this error; however, putting one in the referenced side would have failed later anyway, since we don't allow unique indexes to be made on system columns. Per bug #17877 from Alexander Lakhin. Back-patch to v12; the case still appears to work in v11, so we shouldn't break it there. Discussion: https://postgr.es/m/17877-4bcc658e33df6de1@postgresql.org --- src/backend/commands/tablecmds.c | 15 +++++++++++++-- src/test/regress/expected/foreign_key.out | 11 ++++++----- src/test/regress/sql/foreign_key.sql | 7 ++++--- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 9bdebc6e29b..34c910565a9 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -12893,6 +12893,11 @@ ATExecValidateConstraint(List **wqueue, Relation rel, char *constrName, * transformColumnNameList - transform list of column names * * Lookup each name and return its attnum and type OID + * + * Note: the name of this function suggests that it's general-purpose, + * but actually it's only used to look up names appearing in foreign-key + * clauses. The error messages would need work to use it in other cases, + * and perhaps the validity checks as well. 
*/ static int transformColumnNameList(Oid relId, List *colList, @@ -12906,6 +12911,7 @@ transformColumnNameList(Oid relId, List *colList, { char *attname = strVal(lfirst(l)); HeapTuple atttuple; + Form_pg_attribute attform; atttuple = SearchSysCacheAttName(relId, attname); if (!HeapTupleIsValid(atttuple)) @@ -12913,13 +12919,18 @@ transformColumnNameList(Oid relId, List *colList, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("column \"%s\" referenced in foreign key constraint does not exist", attname))); + attform = (Form_pg_attribute) GETSTRUCT(atttuple); + if (attform->attnum < 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("system columns cannot be used in foreign keys"))); if (attnum >= INDEX_MAX_KEYS) ereport(ERROR, (errcode(ERRCODE_TOO_MANY_COLUMNS), errmsg("cannot have more than %d keys in a foreign key", INDEX_MAX_KEYS))); - attnums[attnum] = ((Form_pg_attribute) GETSTRUCT(atttuple))->attnum; - atttypids[attnum] = ((Form_pg_attribute) GETSTRUCT(atttuple))->atttypid; + attnums[attnum] = attform->attnum; + atttypids[attnum] = attform->atttypid; ReleaseSysCache(atttuple); attnum++; } diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index 8ea6434e222..ce86e400bcb 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -757,15 +757,16 @@ SELECT * from FKTABLE; DROP TABLE FKTABLE; DROP TABLE PKTABLE; -CREATE TABLE PKTABLE (ptest1 int PRIMARY KEY); +-- Test some invalid FK definitions +CREATE TABLE PKTABLE (ptest1 int PRIMARY KEY, someoid oid); CREATE TABLE FKTABLE_FAIL1 ( ftest1 int, CONSTRAINT fkfail1 FOREIGN KEY (ftest2) REFERENCES PKTABLE); ERROR: column "ftest2" referenced in foreign key constraint does not exist CREATE TABLE FKTABLE_FAIL2 ( ftest1 int, CONSTRAINT fkfail1 FOREIGN KEY (ftest1) REFERENCES PKTABLE(ptest2)); ERROR: column "ptest2" referenced in foreign key constraint does not exist -DROP TABLE FKTABLE_FAIL1; -ERROR: table "fktable_fail1" does not exist -DROP TABLE FKTABLE_FAIL2; -ERROR: table "fktable_fail2" does not exist +CREATE TABLE FKTABLE_FAIL3 ( ftest1 int, CONSTRAINT fkfail1 FOREIGN KEY (tableoid) REFERENCES PKTABLE(someoid)); +ERROR: system columns cannot be used in foreign keys +CREATE TABLE FKTABLE_FAIL4 ( ftest1 oid, CONSTRAINT fkfail1 FOREIGN KEY (ftest1) REFERENCES PKTABLE(tableoid)); +ERROR: system columns cannot be used in foreign keys DROP TABLE PKTABLE; -- Test for referencing column number smaller than referenced constraint CREATE TABLE PKTABLE (ptest1 int, ptest2 int, UNIQUE(ptest1, ptest2)); diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index a8c9e77b800..426d10bf861 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -463,12 +463,13 @@ SELECT * from FKTABLE; DROP TABLE FKTABLE; DROP TABLE PKTABLE; -CREATE TABLE PKTABLE (ptest1 int PRIMARY KEY); +-- Test some invalid FK definitions +CREATE TABLE PKTABLE (ptest1 int PRIMARY KEY, someoid oid); CREATE TABLE FKTABLE_FAIL1 ( ftest1 int, CONSTRAINT fkfail1 FOREIGN KEY (ftest2) REFERENCES PKTABLE); CREATE TABLE FKTABLE_FAIL2 ( ftest1 int, CONSTRAINT fkfail1 FOREIGN KEY (ftest1) REFERENCES PKTABLE(ptest2)); +CREATE TABLE FKTABLE_FAIL3 ( ftest1 int, CONSTRAINT fkfail1 FOREIGN KEY (tableoid) REFERENCES PKTABLE(someoid)); +CREATE TABLE FKTABLE_FAIL4 ( ftest1 oid, CONSTRAINT fkfail1 FOREIGN KEY (ftest1) REFERENCES PKTABLE(tableoid)); -DROP TABLE FKTABLE_FAIL1; -DROP TABLE FKTABLE_FAIL2; DROP TABLE PKTABLE; 
-- Test for referencing column number smaller than referenced constraint From 5e43f84cb184c0e08a01d7bdf3d1602e9d44d52d Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 5 Apr 2023 07:59:52 +0900 Subject: [PATCH 53/61] doc: Add more details about pg_stat_get_xact_blocks_{fetched,hit} The explanation describing the dependency to system read() calls for these two functions has been removed in ddfc2d9. And after more discussion about d69c404, we have concluded that adding more details makes them easier to understand. While on it, use the term "block read requests" (maybe found in cache) rather than "buffers fetched" and "buffer hits". Per discussion with Melanie Plageman, Kyotaro Horiguchi, Bertrand Drouvot and myself. Discussion: https://postgr.es/m/CAAKRu_ZmdiScT4q83OAbfmR5AH-L5zWya3SXjaxiJvhCob-e2A@mail.gmail.com Backpatch-through: 11 --- doc/src/sgml/monitoring.sgml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 3c56f9fa4f1..44239b26775 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -5060,8 +5060,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i bigint - Returns the number of buffers fetched for table or index, in the current - transaction. + Returns the number of block read requests for table or index, in the + current transaction. This number minus + pg_stat_get_xact_blocks_hit gives the number of + kernel read() calls; the number of actual + physical reads is usually lower due to kernel-level buffering. @@ -5074,8 +5077,9 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i bigint - Returns the number of buffer hits for table or index, in the current - transaction. + Returns the number of block read requests for table or index, in the + current transaction, found in cache (not triggering kernel + read() calls). From 4663ccf7dff35e322ebcaf37972e788e8938ff68 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Wed, 5 Apr 2023 14:16:19 +0700 Subject: [PATCH 54/61] doc: Update error messages in RLS examples Since 8b9e9644d, the messages for failed permissions checks report "table" where appropriate, rather than "relation". 
Backpatch to all supported branches --- doc/src/sgml/ddl.sgml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index 78f4448a5bc..53ca87cc020 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -2457,7 +2457,7 @@ postgres=> table passwd; postgres=> set role alice; SET postgres=> table passwd; -ERROR: permission denied for relation passwd +ERROR: permission denied for table passwd postgres=> select user_name,real_name,home_phone,extra_info,home_dir,shell from passwd; user_name | real_name | home_phone | extra_info | home_dir | shell -----------+-----------+--------------+------------+-------------+----------- @@ -2467,7 +2467,7 @@ postgres=> select user_name,real_name,home_phone,extra_info,home_dir,shell fr (3 rows) postgres=> update passwd set user_name = 'joe'; -ERROR: permission denied for relation passwd +ERROR: permission denied for table passwd -- Alice is allowed to change her own real_name, but no others postgres=> update passwd set real_name = 'Alice Doe'; UPDATE 1 @@ -2476,9 +2476,9 @@ UPDATE 0 postgres=> update passwd set shell = '/bin/xx'; ERROR: new row violates WITH CHECK OPTION for "passwd" postgres=> delete from passwd; -ERROR: permission denied for relation passwd +ERROR: permission denied for table passwd postgres=> insert into passwd (user_name) values ('xxx'); -ERROR: permission denied for relation passwd +ERROR: permission denied for table passwd -- Alice can change her own password; RLS silently prevents updating other rows postgres=> update passwd set pwhash = 'abc'; UPDATE 1 From 5d1c6c21c055151264aa668a46f94dd1dcb7177b Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 6 Apr 2023 15:52:37 -0400 Subject: [PATCH 55/61] Fix ts_headline() edge cases for empty query and empty search text. tsquery's GETQUERY() macro is only safe to apply to a tsquery that is known non-empty; otherwise it gives a pointer to garbage. Before commit 5a617d75d, ts_headline() avoided this pitfall, but only in a very indirect, nonobvious way. (hlCover could not reach its TS_execute call, because if the query contains no lexemes then hlFirstIndex would surely return -1.) After that commit, it fell into the trap, resulting in weird errors such as "unrecognized operator" and/or valgrind complaints. In HEAD, fix this by not calling TS_execute_locations() at all for an empty query. In the back branches, add a defensive check to hlCover() --- that's not fixing any live bug, but I judge the code a bit too fragile as-is. Also, both mark_hl_fragments() and mark_hl_words() were careless about the possibility of empty search text: in the cases where no match has been found, they'd end up telling mark_fragment() to mark from word indexes 0 to 0 inclusive, even when there is no word 0. This is harmless since we over-allocated the prs->words array, but it does annoy valgrind. Fix so that the end index is -1 and thus mark_fragment() will do nothing in such cases. Bottom line is that this fixes a live bug in HEAD, but in the back branches it's only getting rid of a valgrind nitpick. Back-patch anyway. Per report from Alexander Lakhin. 
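The underlying contract is simple to state in code; this sketch shows the
defensive check added to hlCover() together with the reason it is needed
(the GETQUERY() line is illustrative context, not part of the patch):

    QueryItem  *items;

    if (query->size <= 0)
        return false;           /* empty query matches nothing */

    /*
     * Only now is GETQUERY() meaningful: for an empty tsquery it points
     * at garbage, since there are no items to point at.
     */
    items = GETQUERY(query);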
Discussion: https://postgr.es/m/c27f642d-020b-01ff-ae61-086af287c4fd@gmail.com --- src/backend/tsearch/wparser_def.c | 8 ++++++-- src/test/regress/expected/tsearch.out | 21 +++++++++++++++++++++ src/test/regress/sql/tsearch.sql | 6 ++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index a3e5baf9782..98029e78fb4 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -2039,6 +2039,9 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int max_cover, nextpmax; hlCheck ch; + if (query->size <= 0) + return false; /* empty query matches nothing */ + /* * We look for the earliest, shortest substring of prs->words that * satisfies the query. Both the pmin and pmax indices must be words @@ -2343,7 +2346,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall, /* show the first min_words words if we have not marked anything */ if (num_f <= 0) { - startpos = endpos = curlen = 0; + startpos = curlen = 0; + endpos = -1; for (i = 0; i < prs->curwords && curlen < min_words; i++) { if (!NONWORDTOKEN(prs->words[i].type)) @@ -2498,7 +2502,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall, if (bestlen < 0) { curlen = 0; - pose = 0; + pose = -1; for (i = 0; i < prs->curwords && curlen < min_words; i++) { if (!NONWORDTOKEN(prs->words[i].type)) diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 27e71bb15d6..5f712a5484e 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -2012,6 +2012,27 @@ to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'), Lorem ipsum urna. Nullam nullam ullamcorper urna (1 row) +-- Edge cases with empty query +SELECT ts_headline('english', +'', ''::tsquery); +NOTICE: text-search query doesn't contain lexemes: "" +LINE 2: '', ''::tsquery); + ^ + ts_headline +------------- + +(1 row) + +SELECT ts_headline('english', +'foo bar', ''::tsquery); +NOTICE: text-search query doesn't contain lexemes: "" +LINE 2: 'foo bar', ''::tsquery); + ^ + ts_headline +------------- + foo bar +(1 row) + --Rewrite sub system CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT); \set ECHO none diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index af5004602a8..80a3e0558b9 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -550,6 +550,12 @@ SELECT ts_headline('english', to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'), 'MaxFragments=100, MaxWords=100, MinWords=1'); +-- Edge cases with empty query +SELECT ts_headline('english', +'', ''::tsquery); +SELECT ts_headline('english', +'foo bar', ''::tsquery); + --Rewrite sub system CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT); From 3196a4f9ea24df654fb63b7037313007f73db311 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 6 Apr 2023 18:13:49 -0400 Subject: [PATCH 56/61] Stabilize just-added regression test cases. The tests added by commits 029dea882 et al turn out to produce different output under -DRANDOMIZE_ALLOCATED_MEMORY. This is not a bug exactly: that flag causes coerce_type() to invoke the input function twice when coercing an unknown-type literal to a specific type. So you get tsqueryin's bleat about an empty tsquery twice. Revise the test query to avoid that. 
Discussion: https://postgr.es/m/20230406213813.uep7plg6lvcywujo@awork3.anarazel.de --- src/test/regress/expected/tsearch.out | 8 ++------ src/test/regress/sql/tsearch.sql | 4 ++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 5f712a5484e..36ae94a5255 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -2014,20 +2014,16 @@ to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'), -- Edge cases with empty query SELECT ts_headline('english', -'', ''::tsquery); +'', to_tsquery('english', '')); NOTICE: text-search query doesn't contain lexemes: "" -LINE 2: '', ''::tsquery); - ^ ts_headline ------------- (1 row) SELECT ts_headline('english', -'foo bar', ''::tsquery); +'foo bar', to_tsquery('english', '')); NOTICE: text-search query doesn't contain lexemes: "" -LINE 2: 'foo bar', ''::tsquery); - ^ ts_headline ------------- foo bar diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index 80a3e0558b9..e7e2aa907f7 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -552,9 +552,9 @@ to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'), -- Edge cases with empty query SELECT ts_headline('english', -'', ''::tsquery); +'', to_tsquery('english', '')); SELECT ts_headline('english', -'foo bar', ''::tsquery); +'foo bar', to_tsquery('english', '')); --Rewrite sub system From b103dcc04389538acca42279bf03205266c08d7b Mon Sep 17 00:00:00 2001 From: Stephen Frost Date: Fri, 7 Apr 2023 19:36:12 -0400 Subject: [PATCH 57/61] For Kerberos testing, disable reverse DNS lookup In our Kerberos test suite, there isn't much need to worry about the normal canonicalization that Kerberos provides by looking up the reverse DNS for the IP address connected to, and in some cases it can actively cause problems (eg: a captive portal wifi where the normally not resolvable localhost address used ends up being resolved anyway, and not to the domain we are using for testing, causing the entire regression test to fail with errors about not being able to get a TGT for the remote realm for cross-realm trust). Therefore, disable it by adding rdns = false into the krb5.conf that's generated for the test. Reviewed-By: Heikki Linnakangas Discussion: https://postgr.es/m/Y/QD2zDkDYQA1GQt@tamriel.snowman.net --- src/test/kerberos/t/001_auth.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/test/kerberos/t/001_auth.pl b/src/test/kerberos/t/001_auth.pl index bd55339c761..b2a733f90fa 100644 --- a/src/test/kerberos/t/001_auth.pl +++ b/src/test/kerberos/t/001_auth.pl @@ -93,6 +93,17 @@ or BAIL_OUT("could not get Kerberos version"); $krb5_version = $1; +# Build the krb5.conf to use. +# +# Explicitly specify the default (test) realm and the KDC for +# that realm to avoid the Kerberos library trying to look up +# that information in DNS, and also because we're using a +# non-standard KDC port. +# +# Reverse DNS is explicitly disabled to avoid any issue with a +# captive portal or other cases where the reverse DNS succeeds +# and the Kerberos library uses that as the canonical name of +# the host and then tries to acquire a cross-realm ticket. 
append_to_file( $krb5_conf, qq![logging] @@ -101,6 +112,7 @@ [libdefaults] default_realm = $realm +rdns = false [realms] $realm = { From fd27f65690f8cd3b58636b1aae69d6419f95f0ec Mon Sep 17 00:00:00 2001 From: Stephen Frost Date: Fri, 7 Apr 2023 19:36:12 -0400 Subject: [PATCH 58/61] For Kerberos testing, disable DNS lookups Similar to 8dff2f224, this disables DNS lookups by the Kerberos library to look up the KDC and the realm while the Kerberos tests are running. In some environments, these lookups can take a long time and end up timing out and causing tests to fail. Further, since this isn't really our domain, we shouldn't be sending out these DNS requests during our tests. --- src/test/kerberos/t/001_auth.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/test/kerberos/t/001_auth.pl b/src/test/kerberos/t/001_auth.pl index b2a733f90fa..6ced0b1faf4 100644 --- a/src/test/kerberos/t/001_auth.pl +++ b/src/test/kerberos/t/001_auth.pl @@ -100,6 +100,11 @@ # that information in DNS, and also because we're using a # non-standard KDC port. # +# Also explicitly disable DNS lookups since this isn't really +# our domain and we shouldn't be causing random DNS requests +# to be sent out (not to mention that broken DNS environments +# can cause the tests to take an extra long time and timeout). +# # Reverse DNS is explicitly disabled to avoid any issue with a # captive portal or other cases where the reverse DNS succeeds # and the Kerberos library uses that as the canonical name of @@ -111,6 +116,8 @@ kdc = FILE:$kdc_log [libdefaults] +dns_lookup_realm = false +dns_lookup_kdc = false default_realm = $realm rdns = false From 80337718ae226884b253a9c8f40c2ed69085162e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 10 Apr 2023 13:09:18 -0400 Subject: [PATCH 59/61] Doc: adjust examples of EXTRACT() output to match current reality. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit EXTRACT(EPOCH), EXTRACT(SECOND), and some related cases print more trailing zeroes than they used to. This behavior change happened with commit a2da77cdb (Change return type of EXTRACT to numeric), and it was intentional according to the commit log: - Return values when extracting fields with possibly fractional values, such as second and epoch, now have the full scale that the value has internally (so, for example, '1.000000' instead of just '1'). It's been like that for two releases now, so while I suggested changing this back, it's probably better to adjust the documentation examples. Per bug #17866 from Евгений Жужнев. Back-patch to v14 where the change came in. 
Discussion: https://postgr.es/m/17866-18eb70095b1594e2@postgresql.org --- doc/src/sgml/func.sgml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index ba8bccc135d..a7b91c033a4 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -9389,11 +9389,11 @@ SELECT timestamp with time zone '2005-04-02 12:00:00-07' + interval '24 hours'; SELECT EXTRACT(EPOCH FROM timestamptz '2013-07-01 12:00:00') - EXTRACT(EPOCH FROM timestamptz '2013-03-01 12:00:00'); -Result: 10537200 +Result: 10537200.000000 SELECT (EXTRACT(EPOCH FROM timestamptz '2013-07-01 12:00:00') - EXTRACT(EPOCH FROM timestamptz '2013-03-01 12:00:00')) / 60 / 60 / 24; -Result: 121.958333333333 +Result: 121.9583333333333333 SELECT timestamptz '2013-07-01 12:00:00' - timestamptz '2013-03-01 12:00:00'; Result: 121 days 23:00:00 SELECT age(timestamptz '2013-07-01 12:00:00', timestamptz '2013-03-01 12:00:00'); @@ -9539,13 +9539,13 @@ SELECT EXTRACT(DOY FROM TIMESTAMP '2001-02-16 20:38:40'); SELECT EXTRACT(EPOCH FROM TIMESTAMP WITH TIME ZONE '2001-02-16 20:38:40.12-08'); -Result: 982384720.12 +Result: 982384720.120000 SELECT EXTRACT(EPOCH FROM TIMESTAMP '2001-02-16 20:38:40.12'); -Result: 982355920.12 +Result: 982355920.120000 SELECT EXTRACT(EPOCH FROM INTERVAL '5 days 3 hours'); -Result: 442800 +Result: 442800.000000 @@ -9692,7 +9692,7 @@ SELECT EXTRACT(MILLENNIUM FROM TIMESTAMP '2001-02-16 20:38:40'); SELECT EXTRACT(MILLISECONDS FROM TIME '17:12:28.5'); -Result: 28500 +Result: 28500.000 @@ -9756,10 +9756,10 @@ SELECT EXTRACT(QUARTER FROM TIMESTAMP '2001-02-16 20:38:40'); SELECT EXTRACT(SECOND FROM TIMESTAMP '2001-02-16 20:38:40'); -Result: 40 +Result: 40.000000 SELECT EXTRACT(SECOND FROM TIME '17:12:28.5'); -Result: 28.5 +Result: 28.500000 From d4a2acfc28f10ffc5009204669f7cb3147eb61db Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 10 Apr 2023 15:49:48 -0400 Subject: [PATCH 60/61] Doc: add missed entries in BRIN extensibility tables. The tables in "71.3. Extensibility" listing the support functions for bloom and minmax-multi opclasses should include the associated options function. While this isn't quite as required as the rest, you need it for full functionality of the opclass. Back-patch to v14 where these functions were added. --- doc/src/sgml/brin.sgml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/src/sgml/brin.sgml b/doc/src/sgml/brin.sgml index 71697155d7c..9c5ffcddf84 100644 --- a/doc/src/sgml/brin.sgml +++ b/doc/src/sgml/brin.sgml @@ -1228,6 +1228,10 @@ typedef struct BrinOpcInfo Support Procedure 4 internal function brin_bloom_union() + + Support Procedure 5 + internal function brin_bloom_options() + Support Procedure 11 function to compute hash of an element @@ -1286,6 +1290,10 @@ typedef struct BrinOpcInfo Support Procedure 4 internal function brin_minmax_multi_union() + + Support Procedure 5 + internal function brin_minmax_multi_options() + Support Procedure 11 function to compute distance between two values (length of a range) From 4ebcaf011f555b046a4e39d0844e7a786bbe0435 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 12 Apr 2023 09:09:58 +0900 Subject: [PATCH 61/61] Fix detection of unseekable files for fseek() and ftello() with MSVC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling fseek() or ftello() on a handle to a non-seeking device such as a pipe or a communications device is not supported. 
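(As a minimal, hypothetical sketch, not code from this patch and
assuming a Windows/MSVC build: the underlying OS handle of a stdio
stream can be classified with GetFileType(), which is the mechanism
this commit relies on.)

    #include <stdio.h>
    #include <io.h>
    #include <windows.h>

    /* Hypothetical helper: report whether a stdio stream can seek. */
    static int
    stream_is_seekable(FILE *stream)
    {
        HANDLE h = (HANDLE) _get_osfhandle(_fileno(stream));

        /* Only disk files can seek; pipes and character devices cannot. */
        return GetFileType(h) == FILE_TYPE_DISK;
    }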
Unfortunately, MSVC's flavor of these routines, _fseeki64() and
_ftelli64(), does not return an error when given a pipe as handle.
Some of the logic in pg_dump and pg_restore relies on these routines
to check whether a handle is seekable, causing failures when the
output of pg_dump is passed to pg_restore through a pipe, for example.

This commit introduces wrappers for fseeko() and ftello() on MSVC so
that callers can properly detect non-seekable handles.  This relies
mainly on GetFileType(), sharing a bit of code with the MSVC port for
fstat().  The code in charge of getting a file type is refactored into
a new file called win32common.c, shared by win32stat.c and the new
win32fseek.c, which contains the MSVC ports for fseeko() and ftello().

Like 765f5df, this is backpatched down to 14, where the fstat()
implementation for MSVC is able to handle files larger than 4GB in
size.  Writing a TAP test for this has proven tricky, as IPC::Run
handles the pipes by itself, but I have been able to verify the fix
manually.

Reported-by: Daniel Watzinger
Author: Juan José Santamaría Flecha, Michael Paquier
Discussion: https://postgr.es/m/CAC+AXB26a4EmxM2suXxPpJaGrqAdxracd7hskLg-zxtPB50h7A@mail.gmail.com
Backpatch-through: 14
---
 configure | 6 +++
 configure.ac | 1 +
 src/include/port/win32_port.h | 12 ++++--
 src/port/win32common.c | 68 +++++++++++++++++++++++++++++++
 src/port/win32fseek.c | 75 +++++++++++++++++++++++++++++++++++
 src/port/win32stat.c | 22 ++--------
 src/tools/msvc/Mkvcbuild.pm | 3 +-
 7 files changed, 164 insertions(+), 23 deletions(-)
 create mode 100644 src/port/win32common.c
 create mode 100644 src/port/win32fseek.c

diff --git a/configure b/configure
index 99a9961f025..03fac094065 100755
--- a/configure
+++ b/configure
@@ -20405,6 +20405,12 @@ esac
  ;;
 esac
 
+  case " $LIBOBJS " in
+  *" win32common.$ac_objext "* ) ;;
+  *) LIBOBJS="$LIBOBJS win32common.$ac_objext"
+ ;;
+esac
+
   case " $LIBOBJS " in
   *" win32env.$ac_objext "* ) ;;
   *) LIBOBJS="$LIBOBJS win32env.$ac_objext"

diff --git a/configure.ac b/configure.ac
index a50ca5710c4..5bef7d6dbaa 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2520,6 +2520,7 @@ if test "$PORTNAME" = "win32"; then
   AC_LIBOBJ(kill)
   AC_LIBOBJ(open)
   AC_LIBOBJ(system)
+  AC_LIBOBJ(win32common)
   AC_LIBOBJ(win32env)
   AC_LIBOBJ(win32error)
   AC_LIBOBJ(win32security)

diff --git a/src/include/port/win32_port.h b/src/include/port/win32_port.h
index 05c5a534420..41427a9c1af 100644
--- a/src/include/port/win32_port.h
+++ b/src/include/port/win32_port.h
@@ -193,15 +193,21 @@ struct itimerval
 int setitimer(int which, const struct itimerval *value, struct itimerval *ovalue);
 
+/* Convenience wrapper for GetFileType() */
+extern DWORD pgwin32_get_file_type(HANDLE hFile);
+
 /*
  * WIN32 does not provide 64-bit off_t, but does provide the functions operating
- * with 64-bit offsets.
+ * with 64-bit offsets.  Also, fseek() might not give an error for unseekable
+ * streams, so harden that function with our version.
*/ #define pgoff_t __int64 #ifdef _MSC_VER -#define fseeko(stream, offset, origin) _fseeki64(stream, offset, origin) -#define ftello(stream) _ftelli64(stream) +extern int _pgfseeko64(FILE *stream, pgoff_t offset, int origin); +extern pgoff_t _pgftello64(FILE *stream); +#define fseeko(stream, offset, origin) _pgfseeko64(stream, offset, origin) +#define ftello(stream) _pgftello64(stream) #else #ifndef fseeko #define fseeko(stream, offset, origin) fseeko64(stream, offset, origin) diff --git a/src/port/win32common.c b/src/port/win32common.c new file mode 100644 index 00000000000..2fd78f7f936 --- /dev/null +++ b/src/port/win32common.c @@ -0,0 +1,68 @@ +/*------------------------------------------------------------------------- + * + * win32common.c + * Common routines shared among the win32*.c ports. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/win32common.c + * + *------------------------------------------------------------------------- + */ + +#ifdef FRONTEND +#include "postgres_fe.h" +#else +#include "postgres.h" +#endif + +#ifdef WIN32 + +/* + * pgwin32_get_file_type + * + * Convenience wrapper for GetFileType() with specific error handling for all the + * port implementations. Returns the file type associated with a HANDLE. + * + * On error, sets errno with FILE_TYPE_UNKNOWN as file type. + */ +DWORD +pgwin32_get_file_type(HANDLE hFile) +{ + DWORD fileType = FILE_TYPE_UNKNOWN; + DWORD lastError; + + errno = 0; + + /* + * When stdin, stdout, and stderr aren't associated with a stream the + * special value -2 is returned: + * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/get-osfhandle + */ + if (hFile == INVALID_HANDLE_VALUE || hFile == (HANDLE) -2) + { + errno = EINVAL; + return FILE_TYPE_UNKNOWN; + } + + fileType = GetFileType(hFile); + lastError = GetLastError(); + + /* + * Invoke GetLastError in order to distinguish between a "valid" return of + * FILE_TYPE_UNKNOWN and its return due to a calling error. In case of + * success, GetLastError() returns NO_ERROR. + */ + if (fileType == FILE_TYPE_UNKNOWN && lastError != NO_ERROR) + { + _dosmaperr(lastError); + return FILE_TYPE_UNKNOWN; + } + + return fileType; +} + +#endif /* WIN32 */ diff --git a/src/port/win32fseek.c b/src/port/win32fseek.c new file mode 100644 index 00000000000..985313c825f --- /dev/null +++ b/src/port/win32fseek.c @@ -0,0 +1,75 @@ +/*------------------------------------------------------------------------- + * + * win32fseek.c + * Replacements for fseeko() and ftello(). + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/win32fseek.c + * + *------------------------------------------------------------------------- + */ + +#ifdef FRONTEND +#include "postgres_fe.h" +#else +#include "postgres.h" +#endif + +#if defined(WIN32) && defined(_MSC_VER) + +/* + * _pgfseeko64 + * + * Calling fseek() on a handle to a non-seeking device such as a pipe or + * a communications device is not supported, and fseek() may not return + * an error. This wrapper relies on the file type to check which cases + * are supported. 
+ */ +int +_pgfseeko64(FILE *stream, pgoff_t offset, int origin) +{ + DWORD fileType; + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(stream)); + + fileType = pgwin32_get_file_type(hFile); + if (errno != 0) + return -1; + + if (fileType == FILE_TYPE_DISK) + return _fseeki64(stream, offset, origin); + else if (fileType == FILE_TYPE_CHAR || fileType == FILE_TYPE_PIPE) + errno = ESPIPE; + else + errno = EINVAL; + + return -1; +} + +/* + * _pgftello64 + * + * Same as _pgfseeko64(). + */ +pgoff_t +_pgftello64(FILE *stream) +{ + DWORD fileType; + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(stream)); + + fileType = pgwin32_get_file_type(hFile); + if (errno != 0) + return -1; + + if (fileType == FILE_TYPE_DISK) + return _ftelli64(stream); + else if (fileType == FILE_TYPE_CHAR || fileType == FILE_TYPE_PIPE) + errno = ESPIPE; + else + errno = EINVAL; + + return -1; +} + +#endif /* defined(WIN32) && defined(_MSC_VER) */ diff --git a/src/port/win32stat.c b/src/port/win32stat.c index 36c3b171f40..acbf4c7e279 100644 --- a/src/port/win32stat.c +++ b/src/port/win32stat.c @@ -290,33 +290,17 @@ _pgfstat64(int fileno, struct stat *buf) { HANDLE hFile = (HANDLE) _get_osfhandle(fileno); DWORD fileType = FILE_TYPE_UNKNOWN; - DWORD lastError; unsigned short st_mode; - /* - * When stdin, stdout, and stderr aren't associated with a stream the - * special value -2 is returned: - * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/get-osfhandle - */ - if (hFile == INVALID_HANDLE_VALUE || hFile == (HANDLE) -2 || buf == NULL) + if (buf == NULL) { errno = EINVAL; return -1; } - fileType = GetFileType(hFile); - lastError = GetLastError(); - - /* - * Invoke GetLastError in order to distinguish between a "valid" return of - * FILE_TYPE_UNKNOWN and its return due to a calling error. In case of - * success, GetLastError returns NO_ERROR. - */ - if (fileType == FILE_TYPE_UNKNOWN && lastError != NO_ERROR) - { - _dosmaperr(lastError); + fileType = pgwin32_get_file_type(hFile); + if (errno != 0) return -1; - } switch (fileType) { diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index 3a9246e8578..735caa2b6f0 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -115,7 +115,8 @@ sub mkvcbuild pg_strong_random.c pgcheckdir.c pgmkdirp.c pgsleep.c pgstrcasecmp.c pqsignal.c mkdtemp.c qsort.c qsort_arg.c bsearch_arg.c quotes.c system.c strerror.c tar.c thread.c - win32env.c win32error.c win32security.c win32setlocale.c win32stat.c); + win32common.c win32env.c win32error.c win32fseek.c win32security.c + win32setlocale.c win32stat.c); push(@pgportfiles, 'strtof.c') if ($vsVersion < '14.00');
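For illustration, here is a hypothetical caller-side sketch (not part
of the patch): with the wrappers above in place, MSVC frontend code can
detect an unseekable stream the same way Unix code does, by checking
fseeko()'s result and errno, which is what the pg_dump/pg_restore logic
mentioned in the commit message depends on.

    #include <stdio.h>
    #include <errno.h>
    #include <stdbool.h>

    /* Hypothetical check: true if fp supports seeking. */
    static bool
    file_is_seekable(FILE *fp)
    {
        errno = 0;
        if (fseeko(fp, 0, SEEK_CUR) != 0)
        {
            /* On MSVC, pipes and terminals now fail with ESPIPE. */
            return false;
        }
        return true;
    }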