From 5ecdf25ff6d2bab925a0ce9fe4a9c63873516e35 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 18 Jun 2024 16:09:00 -0400
Subject: [PATCH v1 1/2] Repro for vacuum infinite loop

This repro is not stable enough to be added as a test to the regression
suite. It is for demonstration purposes on the thread.
---
 src/test/recovery/t/099_vacuum_hang.pl | 265 +++++++++++++++++++++++++
 1 file changed, 265 insertions(+)
 create mode 100644 src/test/recovery/t/099_vacuum_hang.pl

diff --git a/src/test/recovery/t/099_vacuum_hang.pl b/src/test/recovery/t/099_vacuum_hang.pl
new file mode 100644
index 00000000000..ab58182d04d
--- /dev/null
+++ b/src/test/recovery/t/099_vacuum_hang.pl
@@ -0,0 +1,265 @@
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use IPC::Run qw(pump);
+use Time::HiRes qw(usleep);
+
+# Set up nodes
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 'physical');
+
+my $tablespace1 = "test_vacuum_hang_tblspc";
+
+$node_primary->append_conf(
+	'postgresql.conf', qq[
+hot_standby_feedback = on
+log_recovery_conflict_waits = true
+log_statement='all'
+log_connections=true
+log_lock_waits = true
+autovacuum = off
+maintenance_work_mem = 1024
+]);
+$node_primary->start;
+
+my $backup_name = 'my_backup';
+
+$node_primary->backup($backup_name);
+my $node_replica = PostgreSQL::Test::Cluster->new('standby');
+$node_replica->init_from_backup($node_primary, $backup_name,
+	has_streaming => 1);
+
+$node_replica->start;
+
+my $test_db = "test_db";
+$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db");
+
+my $orig_conninfo = $node_primary->connstr();
+
+# test schema / data
+my $table1 = "test_vacuum_hang_table";
+my $index1 = "test_vacuum_hang_index";
+my $col1 = "col1";
+
+my $psql_timeout = IPC::Run::timer(10);
+
+# Long-running Primary Session A
+my %psql_primaryA = ('stdin' => '', 'stdout' => '');
+$psql_primaryA{run} =
+  $node_primary->background_psql($test_db, \$psql_primaryA{stdin},
+	\$psql_primaryA{stdout},
+	$psql_timeout);
+$psql_primaryA{stdout} = '';
+$psql_primaryA{stdin} = "set application_name=A;\n";
+
+# Long-running Primary Session B
+my %psql_primaryB = ('stdin' => '', 'stdout' => '');
+$psql_primaryB{run} =
+  $node_primary->background_psql($test_db, \$psql_primaryB{stdin},
+	\$psql_primaryB{stdout},
+	$psql_timeout);
+$psql_primaryB{stdout} = '';
+$psql_primaryB{stdin} = "set application_name=B;\n";
+
+# Long-running Replica Session A
+my %psql_replicaA = ('stdin' => '', 'stdout' => '');
+$psql_replicaA{run} =
+  $node_replica->background_psql($test_db, \$psql_replicaA{stdin},
+	\$psql_replicaA{stdout},
+	$psql_timeout);
+$psql_replicaA{stdout} = '';
+
+# Insert one tuple with value 1 which we can use to make sure the cursor has
+# successfully pinned and locked the buffer.
+#
+# Then insert and update enough rows that we force at least one round of index
+# vacuuming before getting to a dead tuple which was killed after the standby
+# is disconnected.
+#
+# Multiple index vacuuming passes is required to repro because after the
+# standby reconnects to the primary, our backend's GlobalVisStates will not
+# have been updated with the new horizon until an update is forced.
+#
+# _bt_pendingfsm_finalize() calls GetOldestNonRemovableTransactionId() at the
+# end of a round of index vacuuming, updating the backend's GlobalVisState
+# and, in our case, moving maybe_needed backwards.
+#
+# Then vacuum's first pass will continue and pruning will find our later
+# inserted and updated tuple HEAPTUPLE_RECENTLY_DEAD when compared to
+# maybe_needed but HEAPTUPLE_DEAD when compared to OldestXmin.
+$node_primary->safe_psql($test_db, qq[
+	CREATE TABLE ${table1}(${col1} int) with (autovacuum_enabled=false);
+	INSERT INTO $table1 VALUES (1);
+	INSERT INTO $table1 SELECT generate_series(2, 30000);
+	CREATE INDEX ${index1} on ${table1}(${col1});
+	UPDATE $table1 SET $col1 = 0 WHERE $col1 > 1;
+	UPDATE $table1 SET $col1 = 3 WHERE $col1 = 0;
+	UPDATE $table1 SET $col1 = 0 WHERE $col1 = 3;
+	UPDATE $table1 SET $col1 = 3 WHERE $col1 = 0;
+	UPDATE $table1 SET $col1 = 0 WHERE $col1 = 3;
+	UPDATE $table1 SET $col1 = 3 WHERE $col1 = 0;
+]);
+
+my $primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);
+
+my $walreceiver_pid = $node_replica->safe_psql($test_db, qq[
+	select pid from pg_stat_activity where backend_type = 'walreceiver';]);
+
+# Set primary_conninfo to something invalid on the replica and reload the
+# config. This will prevent the standby from reconnecting once the connection
+# is terminated. Then terminate the wal receiver. When a new WAL receiver
+# process starts up, it has to use the primary_conninfo to connect to the
+# primary and it will be unable to do so.
+$node_replica->safe_psql($test_db, qq[
+		ALTER SYSTEM SET primary_conninfo = '';
+		SELECT pg_reload_conf();
+		SELECT pg_terminate_backend($walreceiver_pid)]);
+
+# Ensure the WAL receiver is no longer active on replica.
+$node_replica->poll_query_until($test_db, qq[
+	select exists (select * from pg_stat_activity where pid = $walreceiver_pid);] , 'f');
+
+# DECLARE and use a cursor on standby, causing the block of the relation to be
+# pinned and locked in a buffer on the replica. It is important that this is
+# after termination of the WAL receiver so that the primary does not know about
+# the cursor and it can't hold back the horizon on the primary.
+my $replica_cursor1 = "test_vacuum_hang_cursor_replica1";
+$psql_replicaA{stdin} .= qq[
+        BEGIN;
+			DECLARE $replica_cursor1 CURSOR FOR SELECT * FROM $table1;
+			FETCH FORWARD FROM $replica_cursor1;
+        ];
+
+# FETCH FORWARD should have returned a 1. That's how we know the cursor has a
+# pin and lock.
+ok(pump_until($psql_replicaA{run}, $psql_timeout,
+	\$psql_replicaA{stdout}, qr/^1$/m), "got first value from replica cursor");
+
+# Now insert and update a tuple which will be visible to the vacuum on the
+# primary but which will have xmax newer than the standby that was recently
+# disconnected.
+$psql_primaryA{stdin} .= qq[
+		INSERT INTO $table1 VALUES (99);
+		UPDATE $table1 SET $col1 = 100 WHERE $col1 = 99;
+		SELECT 'after_update';
+        ];
+
+# Make sure the UPDATE finished
+ok(pump_until($psql_primaryA{run}, $psql_timeout,
+	\$psql_primaryA{stdout}, qr/^after_update$/m), "SELECT output from primary session A");
+
+# Open a cursor on the primary whose lock will keep VACUUM from getting a
+# cleanup lock on the first page of the relation. We want VACUUM to be able to
+# start, calculate initial values for OldestXmin and GlobalVisState and then be
+# unable to proceed with pruning our dead tuples. This will allow us to
+# reconnect the standby and push the horizon back before we start actual
+# pruning and vacuuming.
+my $primary_cursor1 = "test_vacuum_hang_cursor_primary1";
+$psql_primaryB{stdin} .= qq[
+        BEGIN;
+			DECLARE $primary_cursor1 CURSOR FOR SELECT * FROM $table1;
+			FETCH FORWARD FROM $primary_cursor1;
+        ];
+
+# FETCH FORWARD should return a 1. That's how we know the cursor has a pin and
+# lock.
+ok(pump_until($psql_primaryB{run}, $psql_timeout,
+	\$psql_primaryB{stdout}, qr/^1$/m), "got first value from primary cursor");
+
+# Now start a VACUUM FREEZE on the primary. It will be unable to get a cleanup
+# lock and start pruning, so it will hang. It will call
+# vacuum_get_cutoffs() and establish values of OldestXmin and
+# GlobalVisState which are newer than all of our dead tuples.
+$psql_primaryA{stdin} .= qq[
+		VACUUM FREEZE $table1;
+		\\echo VACUUM
+        ];
+
+# Try and make sure our vacuum command has reached the server before we commit
+# the cursor.
+$psql_primaryA{run}->pump_nb();
+
+# Make sure that the VACUUM has already called vacuum_get_cutoffs() and is just
+# waiting on the cleanup lock to start vacuuming. It needs to reach the state
+# of scanning heap. On 16+ we can easily ensure this by checking for
+# pg_stat_progress_vacuum phase 'scanning heap' with poll_query_until(). Here
+# just use a sleep. Maybe I could do something better with pump_until() but I
+# gave up.
+# We don't want the standby to re-establish a connection to the primary and
+# push the horizon back until we've saved initial values in GlobalVisState and
+# calculated OldestXmin.
+usleep(300_000);
+
+# Ensure the WAL receiver is still not active on the replica.
+$node_replica->poll_query_until($test_db, qq[
+	select exists (select * from pg_stat_wal_receiver);] , 'f');
+
+# Allow the WAL receiver connection to re-establish. VACUUM is still
+# waiting for the ALTER INDEX to commit.
+$node_replica->safe_psql(
+	$test_db, qq[
+		ALTER SYSTEM SET primary_conninfo = '$orig_conninfo';
+		SELECT pg_reload_conf();
+	]);
+
+$node_replica->poll_query_until($test_db, qq[
+	select exists (select * from pg_stat_wal_receiver);] , 't');
+
+# Once the WAL sender is shown on the primary, the replica should have
+# connected with the primary and pushed the horizon backward. Primary Session A
+# won't see that until the VACUUM FREEZE proceeds and does its first round of
+# index vacuuming.
+$node_primary->poll_query_until($test_db, qq[
+	select exists (select * from pg_stat_replication);] , 't');
+
+# Commit the cursor so that the VACUUM can proceed.
+$psql_primaryB{stdin} .= qq[
+			COMMIT;
+			\\echo commit
+        ];
+
+ok(pump_until($psql_primaryB{run}, $psql_timeout,
+	\$psql_primaryB{stdout}, qr/^commit$/m), "CURSOR committing");
+
+$psql_primaryA{run}->pump_nb();
+
+# VACUUM proceeds with pruning and does a visibility check on each tuple. It
+# will find our final dead tuple non-removable (HEAPTUPLE_RECENTLY_DEAD) since
+# its xmax is after the new value of maybe_needed. Without the fix, after
+# pruning, in lazy_scan_prune(), vacuum does another visibility check, this
+# time with HeapTupleSatisfiesVacuum() which compares dead_after to OldestXmin.
+# It will find the tuple HEAPTUPLE_DEAD since its xmax precedes OldestXmin.
+# This will cause the infinite loop.
+pump $psql_primaryA{run} until ($psql_primaryA{stdout} =~ /VACUUM/ || $psql_timeout->is_expired);
+
+ok(!$psql_timeout->is_expired);
+
+# Commit the original cursor transaction on the replica so it can catch up. it
+# will end up replaying the VACUUM and not removing the tuple too.
+$psql_replicaA{stdin} .= qq[ COMMIT; ];
+
+$psql_replicaA{run}->pump_nb();
+
+$primary_lsn = $node_primary->lsn('flush');
+# Make sure something causes us to flush
+$node_primary->safe_psql($test_db, "insert into $table1 values (1);");
+$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);
+
+# Shut down psqls
+$psql_primaryA{stdin} .= "\\q\n";
+$psql_primaryA{run}->finish;
+
+$psql_primaryB{stdin} .= "\\q\n";
+$psql_primaryB{run}->finish;
+
+
+$psql_replicaA{stdin} .= "\\q\n";
+$psql_replicaA{run}->finish;
+
+$node_replica->stop();
+$node_primary->stop();
+
+done_testing();
-- 
2.34.1