From b9a152c56d844f0543c2e5c1c09e7b7de4e20bc2 Mon Sep 17 00:00:00 2001 From: Ashwin Agrawal Date: Wed, 22 May 2019 16:32:57 -0700 Subject: [PATCH v2] Zedstore compressed in-core columnar storage. --- configure | 118 + configure.in | 19 + src/backend/access/Makefile | 2 +- src/backend/access/gin/ginbtree.c | 2 +- src/backend/access/gin/ginfast.c | 2 +- src/backend/access/gin/gininsert.c | 4 +- src/backend/access/gist/gist.c | 2 +- src/backend/access/hash/hashinsert.c | 2 +- src/backend/access/heap/heapam.c | 20 +- src/backend/access/heap/heapam_handler.c | 5 +- src/backend/access/index/indexam.c | 4 +- src/backend/access/nbtree/nbtinsert.c | 4 +- src/backend/access/zedstore/Makefile | 21 + src/backend/access/zedstore/README | 295 ++ .../access/zedstore/zedstore_attpage.c | 1589 +++++++ src/backend/access/zedstore/zedstore_btree.c | 632 +++ .../access/zedstore/zedstore_compression.c | 364 ++ .../access/zedstore/zedstore_freepagemap.c | 1076 +++++ .../access/zedstore/zedstore_inspect.c | 448 ++ src/backend/access/zedstore/zedstore_meta.c | 216 + .../access/zedstore/zedstore_tidpage.c | 1774 ++++++++ src/backend/access/zedstore/zedstore_toast.c | 192 + .../access/zedstore/zedstore_tupslot.c | 348 ++ src/backend/access/zedstore/zedstore_undo.c | 918 ++++ src/backend/access/zedstore/zedstore_utils.c | 76 + .../access/zedstore/zedstore_visibility.c | 728 +++ .../access/zedstore/zedstoream_handler.c | 3163 +++++++++++++ src/backend/commands/analyze.c | 7 +- src/backend/commands/copy.c | 22 +- src/backend/commands/tablecmds.c | 14 +- src/backend/commands/trigger.c | 8 + src/backend/executor/execScan.c | 90 + src/backend/executor/nodeIndexonlyscan.c | 16 +- src/backend/executor/nodeIndexscan.c | 20 +- src/backend/executor/nodeSeqscan.c | 18 +- src/backend/optimizer/plan/createplan.c | 3 + src/backend/optimizer/util/plancat.c | 2 + src/backend/partitioning/partbounds.c | 15 +- src/backend/storage/lmgr/predicate.c | 45 +- src/include/access/tableam.h | 41 + src/include/access/zedstore_compression.h | 51 + src/include/access/zedstore_internal.h | 618 +++ src/include/access/zedstore_undo.h | 171 + src/include/catalog/pg_am.dat | 3 + src/include/catalog/pg_proc.dat | 24 + src/include/executor/executor.h | 3 +- src/include/nodes/execnodes.h | 1 + src/include/nodes/pathnodes.h | 1 + src/include/pg_config.h.in | 9 + src/include/storage/predicate.h | 9 +- .../isolation/specs/read-only-anomaly-2.spec | 6 +- src/test/regress/expected/.gitignore | 1 + src/test/regress/expected/alter_table_1.out | 3997 +++++++++++++++++ src/test/regress/expected/cluster_1.out | 475 ++ src/test/regress/expected/create_am.out | 11 +- src/test/regress/expected/fsm_1.out | 73 + src/test/regress/expected/rangefuncs_1.out | 2100 +++++++++ src/test/regress/expected/reloptions_1.out | 219 + src/test/regress/expected/strings_1.out | 1823 ++++++++ src/test/regress/expected/tsrf_1.out | 712 +++ src/test/regress/expected/zedstore.out | 599 +++ src/test/regress/output/misc_1.source | 692 +++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/zedstore.sql | 176 + src/test/storageperf/driver.sql | 36 + src/test/storageperf/sql/onecol.sql | 38 + src/test/storageperf/tests.sql | 3 + 68 files changed, 24118 insertions(+), 61 deletions(-) create mode 100644 src/backend/access/zedstore/Makefile create mode 100644 src/backend/access/zedstore/README create mode 100644 src/backend/access/zedstore/zedstore_attpage.c create mode 100644 src/backend/access/zedstore/zedstore_btree.c create mode 100644 
src/backend/access/zedstore/zedstore_compression.c create mode 100644 src/backend/access/zedstore/zedstore_freepagemap.c create mode 100644 src/backend/access/zedstore/zedstore_inspect.c create mode 100644 src/backend/access/zedstore/zedstore_meta.c create mode 100644 src/backend/access/zedstore/zedstore_tidpage.c create mode 100644 src/backend/access/zedstore/zedstore_toast.c create mode 100644 src/backend/access/zedstore/zedstore_tupslot.c create mode 100644 src/backend/access/zedstore/zedstore_undo.c create mode 100644 src/backend/access/zedstore/zedstore_utils.c create mode 100644 src/backend/access/zedstore/zedstore_visibility.c create mode 100644 src/backend/access/zedstore/zedstoream_handler.c create mode 100644 src/include/access/zedstore_compression.h create mode 100644 src/include/access/zedstore_internal.h create mode 100644 src/include/access/zedstore_undo.h create mode 100644 src/test/regress/expected/alter_table_1.out create mode 100644 src/test/regress/expected/cluster_1.out create mode 100644 src/test/regress/expected/fsm_1.out create mode 100644 src/test/regress/expected/rangefuncs_1.out create mode 100644 src/test/regress/expected/reloptions_1.out create mode 100644 src/test/regress/expected/strings_1.out create mode 100644 src/test/regress/expected/tsrf_1.out create mode 100644 src/test/regress/expected/zedstore.out create mode 100644 src/test/regress/output/misc_1.source create mode 100644 src/test/regress/sql/zedstore.sql create mode 100644 src/test/storageperf/driver.sql create mode 100644 src/test/storageperf/sql/onecol.sql create mode 100644 src/test/storageperf/tests.sql diff --git a/configure b/configure index fd61bf6472..59a8a8080d 100755 --- a/configure +++ b/configure @@ -700,6 +700,7 @@ LDFLAGS_EX ELF_SYS EGREP GREP +with_lz4 with_zlib with_system_tzdata with_libxslt @@ -864,6 +865,7 @@ with_libxml with_libxslt with_system_tzdata with_zlib +with_lz4 with_gnu_ld enable_largefile enable_float4_byval @@ -1570,6 +1572,7 @@ Optional Packages: --with-system-tzdata=DIR use system time zone data in DIR --without-zlib do not use Zlib + --with-lz4 build with LZ4 support --with-gnu-ld assume the C compiler uses GNU ld [default=no] Some influential environment variables: @@ -8306,6 +8309,41 @@ fi +# +# LZ4 +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with LZ4 support" >&5 +$as_echo_n "checking whether to build with LZ4 support... " >&6; } + + + +# Check whether --with-lz4 was given. +if test "${with_lz4+set}" = set; then : + withval=$with_lz4; + case $withval in + yes) + +$as_echo "#define USE_LZ4 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-lz4 option" "$LINENO" 5 + ;; + esac + +else + with_lz4=no + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_lz4" >&5 +$as_echo "$with_lz4" >&6; } + + # # Elf # @@ -11828,6 +11866,56 @@ fi fi +if test "$with_lz4" = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5 +$as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; } +if ${ac_cv_lib_lz4_LZ4_compress_default+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-llz4 $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char LZ4_compress_default (); +int +main () +{ +return LZ4_compress_default (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_lz4_LZ4_compress_default=yes +else + ac_cv_lib_lz4_LZ4_compress_default=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lz4_LZ4_compress_default" >&5 +$as_echo "$ac_cv_lib_lz4_LZ4_compress_default" >&6; } +if test "x$ac_cv_lib_lz4_LZ4_compress_default" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBLZ4 1 +_ACEOF + + LIBS="-llz4 $LIBS" + +else + as_fn_error $? "library 'lz4' is required for LZ4 support" "$LINENO" 5 +fi + +fi + if test "$enable_spinlocks" = yes; then $as_echo "#define HAVE_SPINLOCKS 1" >>confdefs.h @@ -13027,6 +13115,36 @@ Use --without-zlib to disable zlib support." "$LINENO" 5 fi +fi + +if test "$with_lz4" = yes; then + for ac_header in lz4.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "lz4.h" "ac_cv_header_lz4_h" "$ac_includes_default" +if test "x$ac_cv_header_lz4_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LZ4_H 1 +_ACEOF + +else + for ac_header in lz4.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "lz4.h" "ac_cv_header_lz4_h" "$ac_includes_default" +if test "x$ac_cv_header_lz4_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LZ4_H 1 +_ACEOF + +else + as_fn_error $? "lz4.h header file is required for LZ4" "$LINENO" 5 +fi + +done + +fi + +done + fi if test "$with_gssapi" = yes ; then diff --git a/configure.in b/configure.in index 4586a1716c..183fad3462 100644 --- a/configure.in +++ b/configure.in @@ -964,6 +964,16 @@ PGAC_ARG_BOOL(with, zlib, yes, [do not use Zlib]) AC_SUBST(with_zlib) +# +# LZ4 +# +AC_MSG_CHECKING([whether to build with LZ4 support]) +PGAC_ARG_BOOL(with, lz4, no, + [build with LZ4 support], + [AC_DEFINE([USE_LZ4], 1, [Define to 1 to build with LZ4 support. (--with-lz4)])]) +AC_MSG_RESULT([$with_lz4]) +AC_SUBST(with_lz4) + # # Elf # @@ -1174,6 +1184,10 @@ failure. It is possible the compiler isn't looking in the proper directory. Use --without-zlib to disable zlib support.])]) fi +if test "$with_lz4" = yes; then + AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])]) +fi + if test "$enable_spinlocks" = yes; then AC_DEFINE(HAVE_SPINLOCKS, 1, [Define to 1 if you have spinlocks.]) else @@ -1387,6 +1401,11 @@ failure. It is possible the compiler isn't looking in the proper directory. Use --without-zlib to disable zlib support.])]) fi +if test "$with_lz4" = yes; then + AC_CHECK_HEADERS(lz4.h, [], + [AC_CHECK_HEADERS(lz4.h, [], [AC_MSG_ERROR([lz4.h header file is required for LZ4])])]) +fi + if test "$with_gssapi" = yes ; then AC_CHECK_HEADERS(gssapi/gssapi.h, [], [AC_CHECK_HEADERS(gssapi.h, [], [AC_MSG_ERROR([gssapi.h header file is required for GSSAPI])])]) diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 0880e0a8bb..6d36f3bd26 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -9,6 +9,6 @@ top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ - table tablesample transam + table tablesample transam zedstore include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 11a8ed7bbc..e795375495 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -89,7 +89,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode, stack->predictNumber = 1; if (rootConflictCheck) - CheckForSerializableConflictIn(btree->index, NULL, stack->buffer); + CheckForSerializableConflictIn(btree->index, NULL, btree->rootBlkno); for (;;) { diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index 2b3dd1c677..f8ffeb06f8 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -246,7 +246,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) * tree, so it conflicts with all serializable scans. All scans acquire a * predicate lock on the metabuffer to represent that. */ - CheckForSerializableConflictIn(index, NULL, metabuffer); + CheckForSerializableConflictIn(index, NULL, GIN_METAPAGE_BLKNO); if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) { diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 55eab14617..046a20a3d4 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -221,7 +221,7 @@ ginEntryInsert(GinState *ginstate, return; } - CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer); + CheckForSerializableConflictIn(ginstate->index, NULL, BufferGetBlockNumber(stack->buffer)); /* modify an existing leaf entry */ itup = addItemPointersToLeafTuple(ginstate, itup, items, nitem, buildStats, stack->buffer); @@ -230,7 +230,7 @@ ginEntryInsert(GinState *ginstate, } else { - CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer); + CheckForSerializableConflictIn(ginstate->index, NULL, BufferGetBlockNumber(stack->buffer)); /* no match, so construct a new leaf entry */ itup = buildFreshLeafTuple(ginstate, attnum, key, category, items, nitem, buildStats, stack->buffer); diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 45c00aaa87..4f150b02cb 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -1273,7 +1273,7 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, * Check for any rw conflicts (in serializable isolation level) just * before we intend to modify the page */ - CheckForSerializableConflictIn(state->r, NULL, stack->buffer); + CheckForSerializableConflictIn(state->r, NULL, BufferGetBlockNumber(stack->buffer)); /* Insert the tuple(s) to the page, splitting the page if necessary */ is_split = gistplacetopage(state->r, state->freespace, giststate, diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 5321762d5e..e3fb47f9e3 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -88,7 +88,7 @@ restart_insert: &usedmetap); Assert(usedmetap != NULL); - CheckForSerializableConflictIn(rel, NULL, buf); + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(buf)); /* remember the primary bucket buffer to release the pin on it at end. 
*/ bucket_buf = buf; diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 6c342635e8..b09263364e 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -446,7 +446,7 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) else valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, + heap_CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, &loctup, buffer, snapshot); if (valid) @@ -668,7 +668,7 @@ heapgettup(HeapScanDesc scan, snapshot, scan->rs_cbuf); - CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, + heap_CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, tuple, scan->rs_cbuf, snapshot); @@ -1488,7 +1488,7 @@ heap_fetch(Relation relation, if (valid) PredicateLockTuple(relation, tuple, snapshot); - CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); + heap_CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); @@ -1622,7 +1622,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, /* If it's visible per the snapshot, we must return it */ valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer); - CheckForSerializableConflictOut(valid, relation, heapTuple, + heap_CheckForSerializableConflictOut(valid, relation, heapTuple, buffer, snapshot); /* reset to original, non-redirected, tid */ heapTuple->t_self = *tid; @@ -1764,7 +1764,7 @@ heap_get_latest_tid(TableScanDesc sscan, * candidate. */ valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer); - CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); + heap_CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); if (valid) *tid = ctid; @@ -1919,7 +1919,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * lock "gaps" as index page locks do. So we don't need to specify a * buffer when making the call, which makes for a faster check. */ - CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -2173,7 +2173,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, * lock "gaps" as index page locks do. So we don't need to specify a * buffer when making the call, which makes for a faster check. */ - CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); ndone = 0; while (ndone < ntuples) @@ -2364,7 +2364,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, * lock "gaps" as index page locks do. So we don't need to specify a * buffer when making the call. */ - CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); /* * If tuples are cachable, mark them for invalidation from the caches in @@ -2673,7 +2673,7 @@ l1: * being visible to the scan (i.e., an exclusive buffer content lock is * continuously held from this point until the tuple delete is visible). 
*/ - CheckForSerializableConflictIn(relation, &tp, buffer); + CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer)); /* replace cid with a combo cid if necessary */ HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); @@ -3583,7 +3583,7 @@ l2: * will include checking the relation level, there is no benefit to a * separate check for the new tuple. */ - CheckForSerializableConflictIn(relation, &oldtup, buffer); + CheckForSerializableConflictIn(relation, otid, BufferGetBlockNumber(buffer)); /* * At this point newbuf and buffer are both pinned and locked, and newbuf diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index a4a28e88ec..b1643790b9 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2275,7 +2275,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, hscan->rs_vistuples[ntup++] = offnum; PredicateLockTuple(scan->rs_rd, &loctup, snapshot); } - CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, + heap_CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer, snapshot); } } @@ -2463,7 +2463,7 @@ heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, /* in pagemode, heapgetpage did this for us */ if (!pagemode) - CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, + heap_CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, hscan->rs_cbuf, scan->rs_snapshot); /* Try next tuple from same page. */ @@ -2602,6 +2602,7 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, static const TableAmRoutine heapam_methods = { .type = T_TableAmRoutine, + .scans_leverage_column_projection = false, .slot_callbacks = heapam_slot_callbacks, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index aefdd2916d..61ed3167fe 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -180,8 +180,8 @@ index_insert(Relation indexRelation, if (!(indexRelation->rd_indam->ampredlocks)) CheckForSerializableConflictIn(indexRelation, - (HeapTuple) NULL, - InvalidBuffer); + (ItemPointer) NULL, + InvalidBlockNumber); return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, heap_t_ctid, heapRelation, diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 2eccc99023..b905cb1986 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -290,7 +290,7 @@ top: * checkingunique and !heapkeyspace cases, but it's okay to use the * first page the value could be on (with scantid omitted) instead. */ - CheckForSerializableConflictIn(rel, NULL, insertstate.buf); + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf)); /* * Do the insertion. Note that insertstate contains cached binary @@ -533,7 +533,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * otherwise be masked by this unique constraint * violation. */ - CheckForSerializableConflictIn(rel, NULL, insertstate->buf); + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf)); /* * This is a definite conflict. 
Break the tuple down into
diff --git a/src/backend/access/zedstore/Makefile b/src/backend/access/zedstore/Makefile
new file mode 100644
index 0000000000..ae5b939026
--- /dev/null
+++ b/src/backend/access/zedstore/Makefile
@@ -0,0 +1,21 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for access/zedstore
+#
+# IDENTIFICATION
+#    src/backend/access/zedstore/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/zedstore
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = zedstore_btree.o zedstore_tidpage.o zedstore_attpage.o \
+       zedstore_compression.o zedstoream_handler.o \
+       zedstore_meta.o zedstore_undo.o zedstore_toast.o zedstore_visibility.o \
+       zedstore_inspect.o zedstore_freepagemap.o zedstore_utils.o \
+       zedstore_tupslot.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/zedstore/README b/src/backend/access/zedstore/README
new file mode 100644
index 0000000000..85493caf03
--- /dev/null
+++ b/src/backend/access/zedstore/README
@@ -0,0 +1,295 @@
+
+src/backend/access/zedstore/README
+
+ZedStore - compressed column (and row) store for PostgreSQL
+===========================================================
+
+The purpose of this README is to provide an overview of zedstore's
+design, the major requirements/objectives it intends to fulfill, and
+high-level implementation details.
+
+Objectives
+----------
+
+* Performance improvement for queries selecting a subset of columns
+(reduced IO).
+
+* Reduced on-disk footprint compared to a heap table: shorter tuple
+headers, plus compression of similar-typed data.
+
+* Be a first-class citizen in the Postgres architecture (table data
+can simply live in columnar storage) rather than being kept at arm's
+length through an opaque interface.
+
+* Fully MVCC compliant - all operations supported as with the heap:
+update, delete, serializable transactions, etc.
+
+* All indexes supported.
+
+* Hybrid row-column store, where some columns are stored together and
+others separately, with flexible granularity for dividing the
+columns. Columns accessed together can be stored together.
+
+* Provide better control over bloat (using zheap).
+
+* Eliminate the need for separate toast tables.
+
+* Faster ADD/DROP COLUMN and column type changes, by avoiding a full
+rewrite of the table.
+
+High-level design of ZedStore - B-trees for the win!
+----------------------------------------------------
+
+To start simple, let's ignore the column-store aspect and consider
+zedstore as a compressed row store. The column store is a natural
+extension of this concept, explained in the next section.
+
+The basic on-disk data structure is a B-tree, indexed by TID - a fast
+and versatile structure. Note that this does not refer to the
+existing btree index AM; it is a brand new B-tree used for table data
+storage.
+
+TID - used as a logical row identifier:
+A TID is just a 48-bit row identifier. The traditional division into
+block and offset numbers is meaningless. In order to find a tuple
+with a given TID, one must always descend the B-tree. Because the TID
+is logical, tuples can be freely moved to different pages when page
+splits or merges are performed (see the sketch below).
+
+The internal pages of the B-tree are super simple and boring. Each
+internal page just stores an array of TID/downlink pairs.
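+
+As an illustration only (this is not the patch's code; see
+zedstore_internal.h for the real zstid type and macros), a 48-bit
+logical TID can be carried in a 64-bit integer and split into
+"block-like" and "offset-like" parts purely for display, roughly like
+this standalone sketch with hypothetical names:
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    typedef uint64_t my_zstid;                   /* hypothetical */
+
+    #define MY_ZSTID_BITS   48
+    #define MY_ZSTID_MAX    ((UINT64_C(1) << MY_ZSTID_BITS) - 1)
+    #define MY_OFFSET_BITS  16
+
+    /* "block-like" part: the high 32 bits of the 48-bit identifier */
+    static inline uint32_t
+    my_tid_block(my_zstid tid)
+    {
+        return (uint32_t) (tid >> MY_OFFSET_BITS);
+    }
+
+    /* "offset-like" part: the low 16 bits */
+    static inline uint16_t
+    my_tid_offset(my_zstid tid)
+    {
+        return (uint16_t) (tid & ((1U << MY_OFFSET_BITS) - 1));
+    }
+
+    int
+    main(void)
+    {
+        my_zstid    tid = UINT64_C(123456789);
+
+        printf("tid %llu -> (%u, %u), in 48-bit range: %d\n",
+               (unsigned long long) tid,
+               my_tid_block(tid), my_tid_offset(tid),
+               tid <= MY_ZSTID_MAX);
+        return 0;
+    }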
+Let's focus on the leaf level. Leaf blocks have a short uncompressed
+header, followed by btree items. A leaf page contains three kinds of
+items:
+
+- plain item, holds one tuple or one datum, uncompressed payload
+
+- array item, holds multiple datums, with consecutive TIDs and the
+same visibility information. An array item saves space compared to
+multiple single items, by leaving out repetitive UNDO and TID
+fields. An array item cannot mix NULLs and non-NULLs, so the ZSBT_NULL
+flag applies to all elements.
+
+- a "container item", holds multiple plain items, compressed payload
+
++-----------------------------
+| Fixed-size page header:
+|
+|   LSN
+|   TID low and hi key (for Lehman & Yao B-tree operations)
+|   left and right page pointers
+|
+| Items:
+|
+|   TID | size | flags | uncompressed size | lastTID | payload (container item)
+|   TID | size | flags | uncompressed size | lastTID | payload (container item)
+|   TID | size | flags | undo pointer | payload (plain item)
+|   TID | size | flags | undo pointer | payload (plain item)
+|   ...
+|
++----------------------------
+
+Row store
+---------
+
+The tuples are stored one after another, sorted by TID. For each
+tuple, we store its 48-bit TID, an undo record pointer, and the
+actual tuple data, uncompressed.
+
+In uncompressed form, the page can be arbitrarily large. But after
+compression, it must fit into a physical 8k block. If, on insert or
+update of a tuple, the page can no longer be compressed below 8k, the
+page is split. Note that because TIDs are logical rather than
+physical identifiers, we can freely move tuples from one physical
+page to another during a page split. A tuple's TID never changes.
+
+The buffer cache caches compressed blocks. Likewise, WAL-logging,
+full-page images etc. work on compressed blocks. Decompression is
+done on the fly, in backend-private memory, as and when needed for
+reading. For some encodings, like run-length or delta encoding,
+tuples can be constructed directly from the compressed data.
+
+Column store
+------------
+
+A column store uses the same structure, but with *multiple* B-trees:
+one for each column, plus one for storing meta-data
+(a.k.a. the meta-column), all indexed by TID. Imagine zedstore as a
+forest of B-trees. The B-trees for all columns are stored in the same
+physical file.
+
+A metapage at block 0 has links to the roots of the B-trees. Leaf
+pages look the same, but instead of storing whole tuples, they store
+just a single attribute. To reconstruct a row with a given TID, the
+scan descends the B-tree of every column using that TID and fetches
+all the attributes. Likewise, a sequential scan walks all the B-trees
+in lockstep.
+
+The special, first btree for the meta-column is used to allocate TIDs
+for tuples and to track the UNDO location that provides visibility
+information. This special btree always exists, which also makes
+zero-column tables possible (a possible result of ADD COLUMN / DROP
+COLUMN sequences). Storing the meta-data separately from the data
+also gives better compression ratios, and it simplifies the overall
+design/implementation: a delete only needs to edit the meta-column
+and never touches the actual data btrees.
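+
+To make the "forest of B-trees" picture concrete, here is a
+standalone toy model (not zedstore code; the real per-column scans go
+through the zsbt_* routines in zedstore_attpage.c): reconstructing a
+row is just looking up the same logical TID in every column's tree.
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    #define NCOLS 3
+
+    /* stand-in for "descend column col's B-tree and fetch the datum
+     * for this TID"; the real code walks btree pages instead */
+    static int64_t
+    column_lookup(int col, uint64_t tid)
+    {
+        static const int64_t trees[NCOLS][4] = {
+            {10, 11, 12, 13},           /* column 1 */
+            {100, 101, 102, 103},       /* column 2 */
+            {1000, 1001, 1002, 1003},   /* column 3 */
+        };
+
+        return trees[col][tid - 1];     /* toy TIDs start at 1 */
+    }
+
+    int
+    main(void)
+    {
+        uint64_t    tid = 3;
+
+        /* one row = the same TID looked up in every column's tree */
+        printf("row %llu:", (unsigned long long) tid);
+        for (int col = 0; col < NCOLS; col++)
+            printf(" %lld", (long long) column_lookup(col, tid));
+        printf("\n");
+        return 0;
+    }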
+
+
+MVCC
+----
+
+Undo record pointers are used to implement MVCC, like in zheap.
+Hence, transaction information is not stored directly with the data.
+In zheap, there's a small, fixed number of "transaction slots" on
+each page, but zedstore stores an undo pointer with each item
+directly; in normal cases, the compression squeezes this down to
+almost nothing. For bulk loads, the undo record pointer is maintained
+for a whole array of items rather than per item. The undo pointer is
+stored only in the meta-column, and all MVCC operations are performed
+using the meta-column alone.
+
+
+Insert:
+Inserting a new row splits the row into datums. Adding the
+meta-column entry chooses the block to insert into, picks a TID for
+the row, and writes an undo record for it. All the data columns are
+then inserted using that TID.
+
+Toast:
+When an overly large datum is stored, it is divided into chunks, and
+each chunk is stored on a dedicated toast page within the same
+physical file. The toast pages of a datum form a list; each page has
+a next/prev pointer.
+
+Select:
+A property is added to the table AM to convey whether the AM
+leverages column projection for scans. When scanning a table whose AM
+has this property, the executor uses the target list and quals of the
+plan to find the columns required by the query. This list is passed
+down to the AM at beginscan time. Zedstore uses this column
+projection list to pull data only from the selected columns. A
+virtual tuple table slot is used to pass back the datums for the
+subset of columns.
+
+The current table AM API requires enhancement to pass the column
+projection down to the AM. The patch showcases two different ways of
+doing this:
+
+* For sequential scans, a new beginscan_with_column_projection() API
+is added. The executor checks the AM property and uses this new API
+if the AM leverages column projection, and the normal beginscan() API
+otherwise.
+
+* For index scans, instead of modifying the beginscan API, a new API
+is added to pass the column projection list after beginscan has
+populated the scan descriptor, but before any tuples are fetched.
+
+Delete:
+When deleting a tuple, a new undo record is created for the delete,
+and only the meta-column item is updated with it. The new undo record
+points to the tuple's previous undo record (the insert undo record).
+Hence, a delete operates only on the meta-column; no data column is
+edited.
+
+Update:
+An update in zedstore is essentially a delete plus an insert: the
+delete is performed as described above, and a new entry is added with
+the updated values. No in-place update happens.
+
+Index Support:
+Index builds also leverage the columnar storage and scan only the
+columns required to build the index. Indexes work much as they do for
+heap tables: data is inserted into the table, and the tuple's TID is
+stored in the index. On an index scan, the required column B-trees
+are scanned for the given TID, and the datums are passed back in a
+virtual tuple slot. Since only the meta-column is used for the
+visibility check, data is fetched from the remaining B-trees only for
+visible tuples.
+
+Page Format
+-----------
+A ZedStore table contains different kinds of pages, all in the same
+file. Kinds of pages are meta-page, per-attribute btree internal and
+leaf pages, UNDO log page, and toast pages. Each page type has its
+own distinct data storage format.
+
+META Page:
+Block 0 is always a metapage. It contains the block numbers of the
+other data structures stored within the file, like the per-attribute
+B-trees, and the UNDO log.
+
+BTREE Page:
+
+UNDO Page:
+
+TOAST Page:
+
+
+Free Space Map
+--------------
+
+
+Enhancements
+------------
+
+Instead of compressing all the tuples on a page in one batch, store a
+small "dictionary", e.g. in the page header, the meta page, or a
+separate dedicated page, and use it to compress tuple by tuple. That
+could make random reads and updates of individual tuples faster. We
+first need to figure out how to create the dictionary, though.
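+
+For contrast, the baseline scheme described above - compress a whole
+batch of items and require the result to fit in a fixed 8k block -
+can be sketched with the LZ4 functions that the new --with-lz4
+configure option links against. This is an illustration only, not the
+code in zedstore_compression.c:
+
+    /* standalone sketch; build with: cc sketch.c -llz4 */
+    #include <stdio.h>
+    #include <string.h>
+    #include <lz4.h>
+
+    #define MY_BLCKSZ 8192      /* fixed physical block size */
+
+    /*
+     * Does 'len' bytes of page image compress into one physical block?
+     * LZ4_compress_default() returns 0 when the output does not fit in
+     * the destination capacity.
+     */
+    static int
+    fits_one_block(const char *src, int len, char *dst)
+    {
+        return LZ4_compress_default(src, dst, len, MY_BLCKSZ) > 0;
+    }
+
+    int
+    main(void)
+    {
+        static char logical_page[4 * MY_BLCKSZ];    /* > 8k uncompressed */
+        static char physical_block[MY_BLCKSZ];
+
+        memset(logical_page, 'x', sizeof(logical_page));    /* compressible */
+
+        if (fits_one_block(logical_page, sizeof(logical_page), physical_block))
+            printf("page still fits in one %d-byte block\n", MY_BLCKSZ);
+        else
+            printf("page must be split\n");
+        return 0;
+    }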
Need to find how +to create the dictionary first. + +Only cached compressed pages in the page cache. If we want to cache +uncompressed pages instead, or in addition to that, we need to invent +a whole new kind of a buffer cache that can deal with the +variable-size blocks. For a first version, I think we can live without +it. + +Instead of storing all columns in the same file, we could store them +in separate files (separate forks?). That would allow immediate reuse +of space, after dropping a column. It's not clear how to use an FSM in +that case, though. Might have to implement an integrated FSM, +too. (Which might not be a bad idea, anyway). + +Design allows for hybrid row-column store, where some columns are +stored together, and others have a dedicated B-tree. Need to have user +facing syntax to allow specifying how to group the columns. + +Salient points for the design +------------------------------ + +* Layout the data/tuples in mapped fashion instead of keeping the +logical to physical mapping separate from actual data. So, keep all +the meta-data and data logically in single stream of file, avoiding +the need for separate forks/files to store meta-data and data. + +* Handle/treat operations at tuple level and not block level. + +* Stick to fixed size physical blocks. Variable size blocks (for +possibly higher compression ratios) pose need for increased logical to +physical mapping maintenance, plus restrictions on concurrency of +writes and reads to files. Hence adopt compression to fit fixed size +blocks instead of other way round. + + +Predicate locking +----------------- + +Predicate locks, to support SERIALIZABLE transactinons, are taken like +with the heap. From README-SSI: + +* For a table scan, the entire relation will be locked. + +* Each tuple read which is visible to the reading transaction will be +locked, whether or not it meets selection criteria; except that there +is no need to acquire an SIREAD lock on a tuple when the transaction +already holds a write lock on any tuple representing the row, since a +rw-conflict would also create a ww-dependency which has more +aggressive enforcement and thus will prevent any anomaly. + +* Modifying a heap tuple creates a rw-conflict with any transaction +that holds a SIREAD lock on that tuple, or on the page or relation +that contains it. + +* Inserting a new tuple creates a rw-conflict with any transaction +holding a SIREAD lock on the entire relation. It doesn't conflict with +page-level locks, because page-level locks are only used to aggregate +tuple locks. Unlike index page locks, they don't lock "gaps" on the +page. + + +ZedStore isn't block-based, so page-level locks really just mean a +range of TIDs. They're only used to aggregate tuple locks. diff --git a/src/backend/access/zedstore/zedstore_attpage.c b/src/backend/access/zedstore/zedstore_attpage.c new file mode 100644 index 0000000000..10e6517c26 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_attpage.c @@ -0,0 +1,1589 @@ +/* + * zedstore_attpage.c + * Routines for handling attribute leaf pages. + * + * A Zedstore table consists of multiple B-trees, one for each attribute. The + * functions in this file deal with one B-tree at a time, it is the caller's + * responsibility to tie together the scans of each btree. 
+ * + * Operations: + * + * - Sequential scan in TID order + * - must be efficient with scanning multiple trees in sync + * + * - random lookups, by TID (for index scan) + * + * - range scans by TID (for bitmap index scan) + * + * NOTES: + * - Locking order: child before parent, left before right + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_attpage.c + */ +#include "postgres.h" + +#include "access/zedstore_compression.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "utils/datum.h" +#include "utils/rel.h" + +/* prototypes for local functions */ +static void zsbt_attr_recompress_replace(Relation rel, AttrNumber attno, + Buffer oldbuf, List *items); +static ZSSingleBtreeItem *zsbt_attr_fetch(Relation rel, AttrNumber attno, + zstid tid, Buffer *buf_p); +static void zsbt_attr_replace_item(Relation rel, AttrNumber attno, Buffer buf, + zstid oldtid, ZSBtreeItem *replacementitem, + List *newitems); +static Size zsbt_compute_data_size(Form_pg_attribute atti, Datum val, bool isnull); +static ZSBtreeItem *zsbt_attr_create_item(Form_pg_attribute att, zstid tid, + int nelements, Datum *datums, + char *dataptr, Size datasz, bool isnull); + +/* ---------------------------------------------------------------- + * Public interface + * ---------------------------------------------------------------- + */ + +/* + * Begin a scan of the btree. + */ +void +zsbt_attr_begin_scan(Relation rel, TupleDesc tdesc, AttrNumber attno, zstid starttid, + zstid endtid, ZSBtreeScan *scan) +{ + Buffer buf; + + scan->rel = rel; + scan->attno = attno; + scan->tupledesc = tdesc; + + scan->snapshot = NULL; + scan->context = CurrentMemoryContext; + scan->lastoff = InvalidOffsetNumber; + scan->has_decompressed = false; + scan->nexttid = starttid; + scan->endtid = endtid; + memset(&scan->recent_oldest_undo, 0, sizeof(scan->recent_oldest_undo)); + memset(&scan->array_undoptr, 0, sizeof(scan->array_undoptr)); + scan->array_datums = palloc(sizeof(Datum)); + scan->array_datums_allocated_size = 1; + scan->array_elements_left = 0; + + buf = zsbt_descend(rel, attno, starttid, 0, true); + if (!BufferIsValid(buf)) + { + /* completely empty tree */ + scan->active = false; + scan->lastbuf = InvalidBuffer; + return; + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + scan->active = true; + scan->lastbuf = buf; + + zs_decompress_init(&scan->decompressor); + scan->recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); +} + +/* + * Reset the 'next' TID in a scan to the given TID. + */ +void +zsbt_attr_reset_scan(ZSBtreeScan *scan, zstid starttid) +{ + if (starttid < scan->nexttid) + { + /* have to restart from scratch. */ + scan->array_elements_left = 0; + scan->nexttid = starttid; + scan->has_decompressed = false; + if (scan->lastbuf != InvalidBuffer) + ReleaseBuffer(scan->lastbuf); + scan->lastbuf = InvalidBuffer; + } + else + zsbt_scan_skip(scan, starttid); +} + +void +zsbt_attr_end_scan(ZSBtreeScan *scan) +{ + if (!scan->active) + return; + + if (scan->lastbuf != InvalidBuffer) + ReleaseBuffer(scan->lastbuf); + zs_decompress_free(&scan->decompressor); + + scan->active = false; + scan->array_elements_left = 0; +} + +/* + * Helper function of zsbt_attr_scan_next(), to extract Datums from the given + * array item into the scan->array_* fields. 
+ */ +static void +zsbt_attr_scan_extract_array(ZSBtreeScan *scan, ZSArrayBtreeItem *aitem) +{ + int nelements = aitem->t_nelements; + zstid tid = aitem->t_tid; + bool isnull = (aitem->t_flags & ZSBT_NULL) != 0; + char *p = aitem->t_payload; + + /* skip over elements that we are not interested in */ + while (tid < scan->nexttid && nelements > 0) + { + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + if (!isnull) + { + if (attr->attlen > 0) + { + p += att_align_nominal(attr->attlen, attr->attalign); + } + else + { + p = (Pointer) att_align_pointer(p, attr->attalign, attr->attlen, p); + p = att_addlength_pointer(p, attr->attlen, p); + } + } + tid++; + nelements--; + } + + /* leave out elements that are past end of range */ + if (tid + nelements > scan->endtid) + nelements = scan->endtid - tid; + + scan->array_isnull = isnull; + + if (nelements > scan->array_datums_allocated_size) + { + if (scan->array_datums) + pfree(scan->array_datums); + scan->array_datums = palloc(nelements * sizeof(Datum)); + scan->array_datums_allocated_size = nelements; + } + + if (isnull) + { + /* + * For NULLs, clear the Datum array. Not strictly necessary, I think, + * but less confusing when debugging. + */ + memset(scan->array_datums, 0, nelements * sizeof(Datum)); + } + else + { + /* + * Expand the packed array data into an array of Datums. + * + * It would perhaps be more natural to loop through the elements with + * datumGetSize() and fetch_att(), but this is a pretty hot loop, so it's + * better to avoid checking attlen/attbyval in the loop. + * + * TODO: a different on-disk representation might make this better still, + * for varlenas (this is pretty optimal for fixed-lengths already). + * For example, storing an array of sizes or an array of offsets, followed + * by the data itself, might incur fewer pipeline stalls in the CPU. + */ + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + int16 attlen = attr->attlen; + + if (attr->attbyval) + { + if (attlen == sizeof(Datum)) + { + memcpy(scan->array_datums, p, nelements * sizeof(Datum)); + } + else if (attlen == sizeof(int32)) + { + for (int i = 0; i < nelements; i++) + { + scan->array_datums[i] = fetch_att(p, true, sizeof(int32)); + p += sizeof(int32); + } + } + else if (attlen == sizeof(int16)) + { + for (int i = 0; i < nelements; i++) + { + scan->array_datums[i] = fetch_att(p, true, sizeof(int16)); + p += sizeof(int16); + } + } + else if (attlen == 1) + { + for (int i = 0; i < nelements; i++) + { + scan->array_datums[i] = fetch_att(p, true, 1); + p += 1; + } + } + else + Assert(false); + } + else if (attlen > 0) + { + for (int i = 0; i < nelements; i++) + { + scan->array_datums[i] = PointerGetDatum(p); + p += att_align_nominal(attr->attlen, attr->attalign); + } + } + else if (attlen == -1) + { + for (int i = 0; i < nelements; i++) + { + p = (Pointer) att_align_pointer(p, attr->attalign, attr->attlen, p); + scan->array_datums[i] = PointerGetDatum(p); + p = att_addlength_pointer(p, attr->attlen, p); + } + } + else + { + /* TODO: convert cstrings to varlenas before we get here? */ + elog(ERROR, "cstrings not supported"); + } + } + scan->array_undoptr = aitem->t_undo_ptr; + scan->array_next_datum = &scan->array_datums[0]; + scan->array_elements_left = nelements; +} + +/* + * Advance scan to next item. + * + * Return true if there was another item. The Datum/isnull of the item is + * placed in scan->array_* fields. For a pass-by-ref datum, it's a palloc'd + * copy that's valid until the next call. + * + * This is normally not used directly. 
See zsbt_scan_next_tid() and + * zsbt_scan_next_fetch() wrappers, instead. + */ +bool +zsbt_attr_scan_next(ZSBtreeScan *scan) +{ + Buffer buf; + bool buf_is_locked = false; + Page page; + ZSBtreePageOpaque *opaque; + OffsetNumber off; + OffsetNumber maxoff; + BlockNumber next; + + Assert(scan->active); + + /* + * Advance to the next TID >= nexttid. + * + * This advances scan->nexttid as it goes. + */ + while (scan->nexttid < scan->endtid) + { + /* + * If we are still processing an array item, return next element from it. + */ + if (scan->array_elements_left > 0) + { + return true; + } + + /* + * If we are still processing a compressed item, process the next item + * from the it. If it's an array item, we start iterating the array by + * setting the scan->array_* fields, and loop back to top to return the + * first element from the array. + */ + if (scan->has_decompressed) + { + zstid lasttid; + ZSBtreeItem *uitem; + + uitem = zs_decompress_read_item(&scan->decompressor); + + if (uitem == NULL) + { + scan->has_decompressed = false; + continue; + } + + /* a compressed item cannot contain nested compressed items */ + Assert((uitem->t_flags & ZSBT_COMPRESSED) == 0); + + lasttid = zsbt_item_lasttid(uitem); + if (lasttid < scan->nexttid) + continue; + + if (uitem->t_tid >= scan->endtid) + break; + + if ((uitem->t_flags & ZSBT_ARRAY) != 0) + { + /* no need to make a copy, because the uncompressed buffer + * is already a copy */ + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) uitem; + + zsbt_attr_scan_extract_array(scan, aitem); + continue; + } + else + { + /* single item */ + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) uitem; + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + + scan->nexttid = sitem->t_tid; + scan->array_undoptr = sitem->t_undo_ptr; + scan->array_elements_left = 1; + scan->array_next_datum = &scan->array_datums[0]; + if (sitem->t_flags & ZSBT_NULL) + scan->array_isnull = true; + else + { + scan->array_isnull = false; + scan->array_datums[0] = fetch_att(sitem->t_payload, attr->attbyval, attr->attlen); + /* no need to copy, because the uncompression buffer is a copy already */ + /* FIXME: do we need to copy anyway, to make sure it's aligned correctly? */ + } + + if (buf_is_locked) + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + return true; + } + } + + /* + * Scan the page for the next item. + */ + buf = scan->lastbuf; + if (!buf_is_locked) + { + if (BufferIsValid(buf)) + { + LockBuffer(buf, BUFFER_LOCK_SHARE); + buf_is_locked = true; + + /* + * It's possible that the page was concurrently split or recycled by + * another backend (or ourselves). Have to re-check that the page is + * still valid. + */ + if (!zsbt_page_is_expected(scan->rel, scan->attno, scan->nexttid, 0, buf)) + { + /* + * It's not valid for the TID we're looking for, but maybe it was the + * right page for the previous TID. In that case, we don't need to + * restart from the root, we can follow the right-link instead. 
+ */ + if (zsbt_page_is_expected(scan->rel, scan->attno, scan->nexttid - 1, 0, buf)) + { + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + next = opaque->zs_next; + if (next != InvalidBlockNumber) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + buf = ReleaseAndReadBuffer(buf, scan->rel, next); + scan->lastbuf = buf; + continue; + } + } + + UnlockReleaseBuffer(buf); + buf_is_locked = false; + buf = scan->lastbuf = InvalidBuffer; + } + } + + if (!BufferIsValid(buf)) + { + buf = scan->lastbuf = zsbt_descend(scan->rel, scan->attno, scan->nexttid, 0, true); + buf_is_locked = true; + } + } + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + Assert(opaque->zs_page_id == ZS_BTREE_PAGE_ID); + + /* TODO: check the last offset first, as an optimization */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + ZSBtreeItem *item = (ZSBtreeItem *) PageGetItem(page, iid); + zstid lasttid; + + lasttid = zsbt_item_lasttid(item); + + if (scan->nexttid > lasttid) + continue; + + if (item->t_tid >= scan->endtid) + { + scan->nexttid = scan->endtid; + break; + } + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + MemoryContext oldcxt = MemoryContextSwitchTo(scan->context); + + zs_decompress_chunk(&scan->decompressor, citem); + MemoryContextSwitchTo(oldcxt); + scan->has_decompressed = true; + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + break; + } + else + { + if ((item->t_flags & ZSBT_ARRAY) != 0) + { + /* copy the item, because we can't hold a lock on the page */ + ZSArrayBtreeItem *aitem; + + aitem = MemoryContextAlloc(scan->context, item->t_size); + memcpy(aitem, item, item->t_size); + + zsbt_attr_scan_extract_array(scan, aitem); + + if (scan->array_elements_left > 0) + { + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + break; + } + } + else + { + /* single item */ + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) item; + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + + scan->nexttid = sitem->t_tid; + scan->array_undoptr = sitem->t_undo_ptr; + scan->array_elements_left = 1; + scan->array_next_datum = &scan->array_datums[0]; + if (item->t_flags & ZSBT_NULL) + scan->array_isnull = true; + else + { + scan->array_isnull = false; + scan->array_datums[0] = fetch_att(sitem->t_payload, attr->attbyval, attr->attlen); + scan->array_datums[0] = zs_datumCopy(scan->array_datums[0], attr->attbyval, attr->attlen); + } + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + return true; + } + } + } + + if (scan->array_elements_left > 0 || scan->has_decompressed) + continue; + + /* No more items on this page. Walk right, if possible */ + next = opaque->zs_next; + if (next == BufferGetBlockNumber(buf)) + elog(ERROR, "btree page %u next-pointer points to itself", next); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + + if (next == InvalidBlockNumber || scan->nexttid >= scan->endtid) + { + scan->active = false; + scan->array_elements_left = 0; + ReleaseBuffer(scan->lastbuf); + scan->lastbuf = InvalidBuffer; + break; + } + + scan->lastbuf = ReleaseAndReadBuffer(scan->lastbuf, scan->rel, next); + } + + return false; +} + +/* + * Insert a multiple items to the given attribute's btree. + * + * Populates the TIDs of the new tuples. + * + * If 'tid' in list is valid, then that TID is used. It better not be in use already. 
If + * it's invalid, then a new TID is allocated, as we see best. (When inserting the + * first column of the row, pass invalid, and for other columns, pass the TID + * you got for the first column.) + */ +void +zsbt_attr_multi_insert(Relation rel, AttrNumber attno, + Datum *datums, bool *isnulls, zstid *tids, int nitems) +{ + Form_pg_attribute attr; + zstid tid = tids[0]; + Buffer buf; + zstid insert_target_key; + int i; + List *newitems; + + Assert (attno >= 1); + attr = &rel->rd_att->attrs[attno - 1]; + + /* + * Find the right place for the given TID. + */ + insert_target_key = tid; + + buf = zsbt_descend(rel, attno, insert_target_key, 0, false); + + /* Create items to insert. */ + newitems = NIL; + i = 0; + while (i < nitems) + { + Size datasz; + int j; + ZSBtreeItem *newitem; + + /* + * Try to collapse as many items as possible into an Array item. + * The first item in the array is now at tids[i]/datums[i]/isnulls[i]. + * Items can be stored in the same array as long as the TIDs are + * consecutive, they all have the same isnull flag, and the array + * isn't too large to be stored on a single leaf page. Scan the + * arrays, checking those conditions. + */ + datasz = zsbt_compute_data_size(attr, datums[i], isnulls[i]); + for (j = i + 1; j < nitems; j++) + { + if (isnulls[j] != isnulls[i]) + break; + + if (tids[j] != tids[j - 1] + 1) + break; + + /* + * Will the array still fit on a leaf page, if this datum is + * included in it? We actually use 1/4 of the page, to avoid + * making very large arrays, which might be slower to update in + * the future. Also, using an array that completely fills a page + * might cause more fragmentation. (XXX: The 1/4 threshold + * is arbitrary, though, and this probably needs more smarts + * or testing to determine the optimum.) + */ + if (!isnulls[i]) + { + Datum val = datums[j]; + Size datum_sz; + + datum_sz = zsbt_compute_data_size(attr, val, false); + if (datasz + datum_sz < MaxZedStoreDatumSize / 4) + break; + datasz += datum_sz; + } + } + + /* + * 'i' is now the first entry to store in the array, and 'j' is the + * last + 1 elemnt to store. If j == i + 1, then there is only one + * element and zsbt_create_item() will create a 'single' item rather + * than an array. + */ + newitem = zsbt_attr_create_item(attr, tids[i], + j - i, &datums[i], NULL, datasz, isnulls[i]); + + newitems = lappend(newitems, newitem); + i = j; + } + + /* recompress and possibly split the page */ + zsbt_attr_replace_item(rel, attno, buf, + InvalidZSTid, NULL, + newitems); + /* zsbt_replace_item unlocked 'buf' */ + ReleaseBuffer(buf); +} + +void +zsbt_attr_remove(Relation rel, AttrNumber attno, zstid tid) +{ + Buffer buf; + ZSSingleBtreeItem *item; + + /* Find the item to delete. (It could be compressed) */ + item = zsbt_attr_fetch(rel, attno, tid, &buf); + if (item == NULL) + { + elog(WARNING, "could not find tuple to remove with TID (%u, %u) for attribute %d", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid), attno); + return; + } + + /* remove it */ + zsbt_attr_replace_item(rel, attno, buf, + tid, NULL, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item released */ +} + +/* ---------------------------------------------------------------- + * Internal routines + * ---------------------------------------------------------------- + */ + +/* + * Fetch the item with given TID. The page containing the item is kept locked, and + * returned to the caller in *buf_p. This is used to locate a tuple for updating + * or deleting it. 
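+ *
+ * The page comes back locked; zsbt_attr_remove() above shows the typical
+ * calling pattern (illustrative outline):
+ *
+ *     item = zsbt_attr_fetch(rel, attno, tid, &buf);
+ *     if (item != NULL)
+ *     {
+ *         ... update or delete via zsbt_attr_replace_item(), which
+ *             releases the lock ...
+ *         ReleaseBuffer(buf);
+ *     }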
+ */ +static ZSSingleBtreeItem * +zsbt_attr_fetch(Relation rel, AttrNumber attno, zstid tid, Buffer *buf_p) +{ + Buffer buf; + Page page; + ZSBtreeItem *item = NULL; + bool found = false; + OffsetNumber maxoff; + OffsetNumber off; + + buf = zsbt_descend(rel, attno, tid, 0, false); + if (buf == InvalidBuffer) + { + *buf_p = InvalidBuffer; + return NULL; + } + page = BufferGetPage(buf); + + /* Find the item on the page that covers the target TID */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + item = (ZSBtreeItem *) PageGetItem(page, iid); + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + ZSDecompressContext decompressor; + + zs_decompress_init(&decompressor); + zs_decompress_chunk(&decompressor, citem); + + while ((item = zs_decompress_read_item(&decompressor)) != NULL) + { + zstid lasttid = zsbt_item_lasttid(item); + + if (item->t_tid <= tid && lasttid >= tid) + { + found = true; + break; + } + } + if (found) + { + /* FIXME: decompressor is leaked. Can't free it yet, because we still + * need to access the item below + */ + break; + } + zs_decompress_free(&decompressor); + } + else + { + zstid lasttid = zsbt_item_lasttid(item); + + if (item->t_tid <= tid && lasttid >= tid) + { + found = true; + break; + } + } + } + + if (found) + { + ZSSingleBtreeItem *result; + + if ((item->t_flags & ZSBT_ARRAY) != 0) + { + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + int elemno = tid - aitem->t_tid; + char *dataptr = NULL; + int datasz; + int resultsize; + + Assert(elemno < aitem->t_nelements); + + if ((item->t_flags & ZSBT_NULL) == 0) + { + /* + * TODO: Currently, zsbt_fetch() is called from functions + * which don't have Slot, and Relation object can be trusted + * for attlen and attbyval. Ideally, we wish to not rely on + * Relation object and see how to decouple it. Previously, we + * stored these two values in meta-page and get these values + * from it but just storing them for this purpose, seems + * heavy. Ideally, catalog stores those values so shouldn't + * need to duplicate storing the same. + */ + TupleDesc tdesc = RelationGetDescr(rel); + int attlen = tdesc->attrs[attno - 1].attlen; + bool attbyval = tdesc->attrs[attno - 1].attbyval; + + if (attlen > 0) + { + dataptr = aitem->t_payload + elemno * attlen; + datasz = attlen; + } + else + { + dataptr = aitem->t_payload; + for (int i = 0; i < elemno; i++) + { + dataptr += zs_datumGetSize(PointerGetDatum(dataptr), attbyval, attlen); + } + datasz = zs_datumGetSize(PointerGetDatum(dataptr), attbyval, attlen); + } + } + else + datasz = 0; + + resultsize = offsetof(ZSSingleBtreeItem, t_payload) + datasz; + result = palloc(resultsize); + memset(result, 0, offsetof(ZSSingleBtreeItem, t_payload)); /* zero padding */ + result->t_tid = tid; + result->t_flags = item->t_flags & ~ZSBT_ARRAY; + result->t_size = resultsize; + result->t_undo_ptr = aitem->t_undo_ptr; + if (datasz > 0) + memcpy(result->t_payload, dataptr, datasz); + } + else + { + /* single item */ + result = (ZSSingleBtreeItem *) item; + } + + *buf_p = buf; + return result; + } + else + { + UnlockReleaseBuffer(buf); + *buf_p = InvalidBuffer; + return NULL; + } +} + +/* + * Compute the size of a slice of an array, from an array item. 'dataptr' + * points to the packed on-disk representation of the array item's data. + * The elements are stored one after each other. 
+ */ +static Size +zsbt_get_array_slice_len(int16 attlen, bool attbyval, bool isnull, + char *dataptr, int nelements) +{ + Size datasz; + + if (isnull) + datasz = 0; + else + { + /* + * For a fixed-width type, we can just multiply. For variable-length, + * we have to walk through the elements, looking at the length of each + * element. + */ + if (attlen > 0) + { + datasz = attlen * nelements; + } + else + { + char *p = dataptr; + + datasz = 0; + for (int i = 0; i < nelements; i++) + { + Size datumsz; + + datumsz = zs_datumGetSize(PointerGetDatum(p), attbyval, attlen); + + /* + * The array should already use short varlen representation whenever + * possible. + */ + Assert(!VARATT_CAN_MAKE_SHORT(DatumGetPointer(p))); + + datasz += datumsz; + p += datumsz; + } + } + } + return datasz; +} + + +/* Does att's datatype allow packing into the 1-byte-header varlena format? */ +#define ATT_IS_PACKABLE(att) \ + ((att)->attlen == -1 && (att)->attstorage != 'p') +/* Use this if it's already known varlena */ +#define VARLENA_ATT_IS_PACKABLE(att) \ + ((att)->attstorage != 'p') + +/* + * This is very similar to heap_compute_data_size() + */ +static Size +zsbt_compute_data_size(Form_pg_attribute atti, Datum val, bool isnull) +{ + Size data_length = 0; + + if (isnull) + return 0; + + if (ATT_IS_PACKABLE(atti) && + VARATT_CAN_MAKE_SHORT(DatumGetPointer(val))) + { + /* + * we're anticipating converting to a short varlena header, so + * adjust length and don't count any alignment + */ + data_length += VARATT_CONVERTED_SHORT_SIZE(DatumGetPointer(val)); + } + else if (atti->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + /* + * we want to flatten the expanded value so that the constructed + * tuple doesn't depend on it + */ + data_length = att_align_nominal(data_length, atti->attalign); + data_length += EOH_get_flat_size(DatumGetEOHP(val)); + } + else if (atti->attlen == -1 && + VARATT_IS_EXTERNAL(val) && VARTAG_EXTERNAL(val) == VARTAG_ZEDSTORE) + { + data_length += sizeof(varatt_zs_toastptr); + } + else + { + data_length = att_align_datum(data_length, atti->attalign, + atti->attlen, val); + data_length = att_addlength_datum(data_length, atti->attlen, + val); + } + + return data_length; +} + +/* + * Form a ZSBtreeItem out of the given datums, or data that's already in on-disk + * array format, for insertion. + * + * If there's more than one element, an array item is created. Otherwise, a single + * item. 
+ */ +static ZSBtreeItem * +zsbt_attr_create_item(Form_pg_attribute att, zstid tid, + int nelements, Datum *datums, + char *datasrc, Size datasz, bool isnull) +{ + ZSBtreeItem *result; + Size itemsz; + char *databegin; + + Assert(nelements > 0); + + if (nelements > 1) + { + ZSArrayBtreeItem *newitem; + + itemsz = offsetof(ZSArrayBtreeItem, t_payload) + datasz; + + newitem = palloc(itemsz); + memset(newitem, 0, offsetof(ZSArrayBtreeItem, t_payload)); /* zero padding */ + newitem->t_tid = tid; + newitem->t_size = itemsz; + newitem->t_flags = ZSBT_ARRAY; + if (isnull) + newitem->t_flags |= ZSBT_NULL; + newitem->t_nelements = nelements; + ZSUndoRecPtrInitialize(&newitem->t_undo_ptr); + + databegin = newitem->t_payload; + + result = (ZSBtreeItem *) newitem; + } + else + { + ZSSingleBtreeItem *newitem; + + itemsz = offsetof(ZSSingleBtreeItem, t_payload) + datasz; + + newitem = palloc(itemsz); + memset(newitem, 0, offsetof(ZSSingleBtreeItem, t_payload)); /* zero padding */ + newitem->t_tid = tid; + newitem->t_flags = 0; + if (isnull) + newitem->t_flags |= ZSBT_NULL; + newitem->t_size = itemsz; + ZSUndoRecPtrInitialize(&newitem->t_undo_ptr); + + databegin = newitem->t_payload; + + result = (ZSBtreeItem *) newitem; + } + + /* + * Copy the data. + * + * This is largely copied from heaptuple.c's fill_val(). + */ + if (!isnull) + { + char *data = databegin; + + if (datums) + { + for (int i = 0; i < nelements; i++) + { + Datum datum = datums[i]; + Size data_length; + + /* + * XXX we use the att_align macros on the pointer value itself, not on an + * offset. This is a bit of a hack. + */ + if (att->attbyval) + { + /* pass-by-value */ + data = (char *) att_align_nominal(data, att->attalign); + store_att_byval(data, datum, att->attlen); + data_length = att->attlen; + } + else if (att->attlen == -1) + { + /* varlena */ + Pointer val = DatumGetPointer(datum); + + if (VARATT_IS_EXTERNAL(val)) + { + if (VARATT_IS_EXTERNAL_EXPANDED(val)) + { + /* + * we want to flatten the expanded value so that the + * constructed tuple doesn't depend on it + */ + /* FIXME: This should happen earlier, because if the + * datum is very large, it should be toasted, and + * that should happen earlier. + */ + ExpandedObjectHeader *eoh = DatumGetEOHP(datum); + + data = (char *) att_align_nominal(data, + att->attalign); + data_length = EOH_get_flat_size(eoh); + EOH_flatten_into(eoh, data, data_length); + } + else if (VARATT_IS_EXTERNAL(val) && VARTAG_EXTERNAL(val) == VARTAG_ZEDSTORE) + { + data_length = sizeof(varatt_zs_toastptr); + memcpy(data, val, data_length); + } + else + { + /* no alignment, since it's short by definition */ + data_length = VARSIZE_EXTERNAL(val); + memcpy(data, val, data_length); + } + } + else if (VARATT_IS_SHORT(val)) + { + /* no alignment for short varlenas */ + data_length = VARSIZE_SHORT(val); + memcpy(data, val, data_length); + } + else if (VARLENA_ATT_IS_PACKABLE(att) && + VARATT_CAN_MAKE_SHORT(val)) + { + /* convert to short varlena -- no alignment */ + data_length = VARATT_CONVERTED_SHORT_SIZE(val); + SET_VARSIZE_SHORT(data, data_length); + memcpy(data + 1, VARDATA(val), data_length - 1); + } + else + { + /* full 4-byte header varlena */ + data = (char *) att_align_nominal(data, + att->attalign); + data_length = VARSIZE(val); + memcpy(data, val, data_length); + } + } + else if (att->attlen == -2) + { + /* cstring ... 
never needs alignment */ + Assert(att->attalign == 'c'); + data_length = strlen(DatumGetCString(datum)) + 1; + memcpy(data, DatumGetPointer(datum), data_length); + } + else + { + /* fixed-length pass-by-reference */ + data = (char *) att_align_nominal(data, att->attalign); + Assert(att->attlen > 0); + data_length = att->attlen; + memcpy(data, DatumGetPointer(datum), data_length); + } + data += data_length; + } + Assert(data - databegin == datasz); + } + else + memcpy(data, datasrc, datasz); + } + + return result; +} + +/* + * This helper function is used to implement INSERT, UPDATE and DELETE. + * + * If 'olditem' is not NULL, then 'olditem' on the page is replaced with + * 'replacementitem'. 'replacementitem' can be NULL, to remove an old item. + * + * If 'newitems' is not empty, the items in the list are added to the page, + * to the correct position. FIXME: Actually, they're always just added to + * the end of the page, and that better be the correct position. + * + * This function handles decompressing and recompressing items, and splitting + * the page if needed. + */ +static void +zsbt_attr_replace_item(Relation rel, AttrNumber attno, Buffer buf, + zstid oldtid, + ZSBtreeItem *replacementitem, + List *newitems) +{ + Form_pg_attribute attr; + int16 attlen; + bool attbyval; + Page page = BufferGetPage(buf); + OffsetNumber off; + OffsetNumber maxoff; + List *items; + bool found_old_item = false; + /* We might need to decompress up to two previously compressed items */ + ZSDecompressContext decompressor; + bool decompressor_used = false; + bool decompressing; + + if (attno == ZS_META_ATTRIBUTE_NUM) + { + attr = NULL; + attlen = 0; + attbyval = true; + } + else + { + attr = &rel->rd_att->attrs[attno - 1]; + attlen = attr->attlen; + attbyval = attr->attbyval; + } + + if (replacementitem) + Assert(replacementitem->t_tid == oldtid); + + /* + * TODO: It would be good to have a fast path, for the common case that we're + * just adding items to the end. + */ + + /* Loop through all old items on the page */ + items = NIL; + maxoff = PageGetMaxOffsetNumber(page); + decompressing = false; + off = 1; + for (;;) + { + ZSBtreeItem *item; + + /* + * Get the next item to process. If we're decompressing, get the next + * tuple from the decompressor, otherwise get the next item from the page. + */ + if (decompressing) + { + item = zs_decompress_read_item(&decompressor); + if (!item) + { + decompressing = false; + continue; + } + } + else if (off <= maxoff) + { + ItemId iid = PageGetItemId(page, off); + + item = (ZSBtreeItem *) PageGetItem(page, iid); + off++; + + } + else + { + /* out of items */ + break; + } + + /* we now have an item to process, either straight from the page or from + * the decompressor */ + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + zstid item_lasttid = zsbt_item_lasttid(item); + + /* there shouldn't nested compressed items */ + if (decompressing) + elog(ERROR, "nested compressed items on zedstore page not supported"); + + if (oldtid != InvalidZSTid && item->t_tid <= oldtid && oldtid <= item_lasttid) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + + /* Found it, this compressed item covers the target or the new TID. 
*/ + /* We have to decompress it, and recompress */ + Assert(!decompressor_used); + + zs_decompress_init(&decompressor); + zs_decompress_chunk(&decompressor, citem); + decompressor_used = true; + decompressing = true; + continue; + } + else + { + /* keep this compressed item as it is */ + items = lappend(items, item); + } + } + else if ((item->t_flags & ZSBT_ARRAY) != 0) + { + /* array item */ + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + zstid item_lasttid = zsbt_item_lasttid(item); + + if (oldtid != InvalidZSTid && item->t_tid <= oldtid && oldtid <= item_lasttid) + { + /* + * The target TID is currently part of an array item. We have to split + * the array item into two, and put the replacement item in the middle. + */ + int cutoff; + Size olddatalen; + int nelements = aitem->t_nelements; + bool isnull = (aitem->t_flags & ZSBT_NULL) != 0; + char *dataptr; + + cutoff = oldtid - item->t_tid; + + /* Array slice before the target TID */ + dataptr = aitem->t_payload; + if (cutoff > 0) + { + ZSBtreeItem *item1; + Size datalen1; + + datalen1 = zsbt_get_array_slice_len(attlen, attbyval, isnull, + dataptr, cutoff); + item1 = zsbt_attr_create_item(attr, aitem->t_tid, + cutoff, NULL, dataptr, datalen1, isnull); + dataptr += datalen1; + items = lappend(items, item1); + } + + /* + * Skip over the target element, and store the replacement + * item, if any, in its place + */ + olddatalen = zsbt_get_array_slice_len(attlen, attbyval, isnull, + dataptr, 1); + dataptr += olddatalen; + if (replacementitem) + items = lappend(items, replacementitem); + + /* Array slice after the target */ + if (cutoff + 1 < nelements) + { + ZSBtreeItem *item2; + Size datalen2; + + datalen2 = zsbt_get_array_slice_len(attlen, attbyval, isnull, + dataptr, nelements - (cutoff + 1)); + item2 = zsbt_attr_create_item(attr, oldtid + 1, + nelements - (cutoff + 1), NULL, dataptr, datalen2, isnull); + items = lappend(items, item2); + } + + found_old_item = true; + } + else + items = lappend(items, item); + } + else + { + /* single item */ + if (oldtid != InvalidZSTid && item->t_tid == oldtid) + { + Assert(!found_old_item); + found_old_item = true; + if (replacementitem) + items = lappend(items, replacementitem); + } + else + items = lappend(items, item); + } + } + + if (oldtid != InvalidZSTid && !found_old_item) + elog(ERROR, "could not find old item to replace"); + + /* Add any new items to the end */ + if (newitems) + items = list_concat(items, newitems); + + /* Now pass the list to the recompressor. */ + IncrBufferRefCount(buf); + if (items) + { + zsbt_attr_recompress_replace(rel, attno, buf, items); + } + else + { + zs_split_stack *stack; + + stack = zsbt_unlink_page(rel, attno, buf, 0); + + if (!stack) + { + /* failed. */ + Page newpage = PageGetTempPageCopySpecial(BufferGetPage(buf)); + + stack = zs_new_split_stack_entry(buf, newpage); + } + + /* apply the changes */ + zs_apply_split_changes(rel, stack); + } + + /* + * We can now free the decompression contexts. The pointers in the 'items' list + * point to decompression buffers, so we cannot free them until after writing out + * the pages. 
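+ * (The recompressor / zs_apply_split_changes() calls above have already
+ * written the new page contents, so freeing is safe at this point.)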
+ */ + if (decompressor_used) + zs_decompress_free(&decompressor); + list_free(items); +} + +/* + * Recompressor routines + */ +typedef struct +{ + Page currpage; + ZSCompressContext compressor; + int compressed_items; + + /* first page writes over the old buffer, subsequent pages get newly-allocated buffers */ + zs_split_stack *stack_head; + zs_split_stack *stack_tail; + + int total_items; + int total_compressed_items; + int total_already_compressed_items; + + AttrNumber attno; + zstid hikey; +} zsbt_attr_recompress_context; + +static void +zsbt_attr_recompress_newpage(zsbt_attr_recompress_context *cxt, zstid nexttid, int flags) +{ + Page newpage; + ZSBtreePageOpaque *newopaque; + zs_split_stack *stack; + + if (cxt->currpage) + { + /* set the last tid on previous page */ + ZSBtreePageOpaque *oldopaque = ZSBtreePageGetOpaque(cxt->currpage); + + oldopaque->zs_hikey = nexttid; + } + + newpage = (Page) palloc(BLCKSZ); + PageInit(newpage, BLCKSZ, sizeof(ZSBtreePageOpaque)); + + stack = zs_new_split_stack_entry(InvalidBuffer, /* will be assigned later */ + newpage); + if (cxt->stack_tail) + cxt->stack_tail->next = stack; + else + cxt->stack_head = stack; + cxt->stack_tail = stack; + + cxt->currpage = newpage; + + newopaque = ZSBtreePageGetOpaque(newpage); + newopaque->zs_attno = cxt->attno; + newopaque->zs_next = InvalidBlockNumber; /* filled in later */ + newopaque->zs_lokey = nexttid; + newopaque->zs_hikey = cxt->hikey; /* overwritten later, if this is not last page */ + newopaque->zs_level = 0; + newopaque->zs_flags = flags; + newopaque->zs_page_id = ZS_BTREE_PAGE_ID; +} + +static void +zsbt_attr_recompress_add_to_page(zsbt_attr_recompress_context *cxt, ZSBtreeItem *item) +{ + if (PageGetFreeSpace(cxt->currpage) < MAXALIGN(item->t_size)) + zsbt_attr_recompress_newpage(cxt, item->t_tid, 0); + + if (PageAddItemExtended(cxt->currpage, + (Item) item, item->t_size, + PageGetMaxOffsetNumber(cxt->currpage) + 1, + PAI_OVERWRITE) == InvalidOffsetNumber) + elog(ERROR, "could not add item to page while recompressing"); + + cxt->total_items++; +} + +static bool +zsbt_attr_recompress_add_to_compressor(zsbt_attr_recompress_context *cxt, ZSBtreeItem *item) +{ + bool result; + + if (cxt->compressed_items == 0) + zs_compress_begin(&cxt->compressor, PageGetFreeSpace(cxt->currpage)); + + result = zs_compress_add(&cxt->compressor, item); + if (result) + { + cxt->compressed_items++; + + cxt->total_compressed_items++; + } + + return result; +} + +static void +zsbt_attr_recompress_flush(zsbt_attr_recompress_context *cxt) +{ + ZSCompressedBtreeItem *citem; + + if (cxt->compressed_items == 0) + return; + + citem = zs_compress_finish(&cxt->compressor); + + if (citem) + zsbt_attr_recompress_add_to_page(cxt, (ZSBtreeItem *) citem); + else + { + uint16 size = 0; + /* + * compression failed hence add items uncompressed. We should maybe + * note that these items/pattern are not compressible and skip future + * attempts to compress but its possible this clubbed with some other + * future items may compress. So, better avoid recording such info and + * try compression again later if required. + */ + for (int i = 0; i < cxt->compressor.nitems; i++) + { + citem = (ZSCompressedBtreeItem *) (cxt->compressor.uncompressedbuffer + size); + zsbt_attr_recompress_add_to_page(cxt, (ZSBtreeItem *) citem); + + size += MAXALIGN(citem->t_size); + } + } + + cxt->compressed_items = 0; +} + +/* + * Rewrite a leaf page, with given 'items' as the new content. + * + * If there are any uncompressed items in the list, we try to compress them. 
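+ * Consecutive uncompressed items are fed to the compressor until the next one
+ * no longer fits; the compressed chunk is then flushed to the page and a new
+ * chunk is started.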
+ * Any already-compressed items are added as is. + * + * If the items no longer fit on the page, then the page is split. It is + * entirely possible that they don't fit even on two pages; we split the page + * into as many pages as needed. Hopefully not more than a few pages, though, + * because otherwise you might hit limits on the number of buffer pins (with + * tiny shared_buffers). + * + * On entry, 'oldbuf' must be pinned and exclusive-locked. On exit, the lock + * is released, but it's still pinned. + * + * TODO: Try to combine single items, and existing array-items, into new array + * items. + */ +static void +zsbt_attr_recompress_replace(Relation rel, AttrNumber attno, Buffer oldbuf, List *items) +{ + ListCell *lc; + zsbt_attr_recompress_context cxt; + ZSBtreePageOpaque *oldopaque = ZSBtreePageGetOpaque(BufferGetPage(oldbuf)); + ZSUndoRecPtr recent_oldest_undo = { 0 }; + BlockNumber orignextblk; + zs_split_stack *stack; + List *downlinks = NIL; + + orignextblk = oldopaque->zs_next; + + cxt.currpage = NULL; + zs_compress_init(&cxt.compressor); + cxt.compressed_items = 0; + cxt.stack_head = cxt.stack_tail = NULL; + cxt.attno = attno; + cxt.hikey = oldopaque->zs_hikey; + + cxt.total_items = 0; + cxt.total_compressed_items = 0; + cxt.total_already_compressed_items = 0; + + zsbt_attr_recompress_newpage(&cxt, oldopaque->zs_lokey, (oldopaque->zs_flags & ZSBT_ROOT)); + + foreach(lc, items) + { + ZSBtreeItem *item = (ZSBtreeItem *) lfirst(lc); + + /* We can leave out any old-enough DEAD items */ + if ((item->t_flags & ZSBT_DEAD) != 0) + { + ZSBtreeItem *uitem = (ZSBtreeItem *) item; + + if (recent_oldest_undo.counter == 0) + recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + + if (zsbt_item_undoptr(uitem).counter <= recent_oldest_undo.counter) + continue; + } + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + /* already compressed, add as it is. */ + zsbt_attr_recompress_flush(&cxt); + cxt.total_already_compressed_items++; + zsbt_attr_recompress_add_to_page(&cxt, item); + } + else + { + /* try to add this item to the compressor */ + if (!zsbt_attr_recompress_add_to_compressor(&cxt, item)) + { + if (cxt.compressed_items > 0) + { + /* flush, and retry */ + zsbt_attr_recompress_flush(&cxt); + + if (!zsbt_attr_recompress_add_to_compressor(&cxt, item)) + { + /* could not compress, even on its own. Store it uncompressed, then */ + zsbt_attr_recompress_add_to_page(&cxt, item); + } + } + else + { + /* could not compress, even on its own. Store it uncompressed, then */ + zsbt_attr_recompress_add_to_page(&cxt, item); + } + } + } + } + + /* flush the last one, if any */ + zsbt_attr_recompress_flush(&cxt); + + zs_compress_free(&cxt.compressor); + + /* + * Ok, we now have a list of pages, to replace the original page, as private + * in-memory copies. Allocate buffers for them, and write them out. 
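+ * (A downlink is collected for each page after the first, so that the parent
+ * can be updated afterwards if we had to split.)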
+ * + * allocate all the pages before entering critical section, so that + * out-of-disk-space doesn't lead to PANIC + */ + stack = cxt.stack_head; + Assert(stack->buf == InvalidBuffer); + stack->buf = oldbuf; + while (stack->next) + { + Page thispage = stack->page; + ZSBtreePageOpaque *thisopaque = ZSBtreePageGetOpaque(thispage); + ZSBtreeInternalPageItem *downlink; + Buffer nextbuf; + + Assert(stack->next->buf == InvalidBuffer); + + nextbuf = zspage_getnewbuf(rel, InvalidBuffer); + stack->next->buf = nextbuf; + + thisopaque->zs_next = BufferGetBlockNumber(nextbuf); + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = thisopaque->zs_hikey; + downlink->childblk = BufferGetBlockNumber(nextbuf); + downlinks = lappend(downlinks, downlink); + + stack = stack->next; + } + /* last one in the chain */ + ZSBtreePageGetOpaque(stack->page)->zs_next = orignextblk; + + /* If we had to split, insert downlinks for the new pages. */ + if (cxt.stack_head->next) + { + oldopaque = ZSBtreePageGetOpaque(cxt.stack_head->page); + + if ((oldopaque->zs_flags & ZSBT_ROOT) != 0) + { + ZSBtreeInternalPageItem *downlink; + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = MinZSTid; + downlink->childblk = BufferGetBlockNumber(cxt.stack_head->buf); + downlinks = lcons(downlink, downlinks); + + cxt.stack_tail->next = zsbt_newroot(rel, attno, oldopaque->zs_level + 1, downlinks); + + /* clear the ZSBT_ROOT flag on the old root page */ + oldopaque->zs_flags &= ~ZSBT_ROOT; + } + else + { + cxt.stack_tail->next = zsbt_insert_downlinks(rel, attno, + oldopaque->zs_lokey, BufferGetBlockNumber(oldbuf), oldopaque->zs_level + 1, + downlinks); + } + /* note: stack_tail is not the real tail anymore */ + } + + /* Finally, overwrite all the pages we had to modify */ + zs_apply_split_changes(rel, cxt.stack_head); +} diff --git a/src/backend/access/zedstore/zedstore_btree.c b/src/backend/access/zedstore/zedstore_btree.c new file mode 100644 index 0000000000..108170ffee --- /dev/null +++ b/src/backend/access/zedstore/zedstore_btree.c @@ -0,0 +1,632 @@ +/* + * zedstore_btree.c + * Common routines for handling TID and attibute B-tree structures + * + * A Zedstore table consists of multiple B-trees, one to store TIDs and + * visibility information of the rows, and one tree for each attribute, + * to hold the data. The TID and attribute trees differ at the leaf + * level, but the internal pages have the same layout. This file contains + * routines to deal with internal pages, and some other common + * functionality. + * + * When dealing with the TID tree, pass ZS_META_ATTRIBUTE_NUM as the + * attribute number. + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_btree.c + */ +#include "postgres.h" + +#include "access/zedstore_internal.h" +#include "storage/bufmgr.h" +#include "storage/procarray.h" +#include "utils/rel.h" + +/* prototypes for local functions */ +static zs_split_stack *zsbt_split_internal_page(Relation rel, AttrNumber attno, + Buffer leftbuf, OffsetNumber newoff, List *downlinks); +static zs_split_stack *zsbt_merge_pages(Relation rel, AttrNumber attno, Buffer leftbuf, Buffer rightbuf, bool target_is_left); + +static int zsbt_binsrch_internal(zstid key, ZSBtreeInternalPageItem *arr, int arr_elems); + +/* + * Find the page containing the given key TID at the given level. + * + * Level 0 means leaf. 
The returned buffer is exclusive-locked. + */ +Buffer +zsbt_descend(Relation rel, AttrNumber attno, zstid key, int level, bool readonly) +{ + BlockNumber next; + Buffer buf; + Page page; + ZSBtreePageOpaque *opaque; + ZSBtreeInternalPageItem *items; + int nitems; + int itemno; + BlockNumber rootblk; + int nextlevel = -1; + BlockNumber failblk = InvalidBlockNumber; + + /* start from root */ +restart: + rootblk = zsmeta_get_root_for_attribute(rel, attno, readonly); + + if (rootblk == InvalidBlockNumber) + { + /* completely empty tree */ + return InvalidBuffer; + } + + next = rootblk; + for (;;) + { + /* + * If we arrive again to a block that was a dead-end earlier, it seems + * that the tree is corrupt. + * + * XXX: It's theoretically possible that the block was removed, but then + * added back at the same location, and removed again. So perhaps retry + * a few times? + */ + if (next == failblk || next == ZS_META_BLK) + elog(ERROR, "arrived at incorrect block %u while descending zedstore btree", next); + + buf = ReadBuffer(rel, next); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* TODO: shared */ + page = BufferGetPage(buf); + if (!zsbt_page_is_expected(rel, attno, key, nextlevel, buf)) + { + /* + * We arrived at an unexpected page. This can happen with concurrent + * splits, or page deletions. We could try following the right-link, but + * there's no guarantee that's the correct page either, so let's restart + * from the root. If we landed here because of concurrent modifications, + * the next attempt should land on the correct page. Remember that we + * incorrectly ended up on this page, so that if this happens because + * the tree is corrupt, rather than concurrent splits, and we land here + * again, we won't loop forever. + */ + failblk = next; + goto restart; + } + opaque = ZSBtreePageGetOpaque(page); + + if (nextlevel == -1) + nextlevel = opaque->zs_level; + + else if (opaque->zs_level != nextlevel) + elog(ERROR, "unexpected level encountered when descending tree"); + + if (opaque->zs_level == level) + return buf; + + /* Find the downlink and follow it */ + items = ZSBtreeInternalPageGetItems(page); + nitems = ZSBtreeInternalPageGetNumItems(page); + + itemno = zsbt_binsrch_internal(key, items, nitems); + if (itemno < 0) + elog(ERROR, "could not descend tree for tid (%u, %u)", + ZSTidGetBlockNumber(key), ZSTidGetOffsetNumber(key)); + + next = items[itemno].childblk; + nextlevel--; + + UnlockReleaseBuffer(buf); + } +} + +/* + * Check that a page is a valid B-tree page, and covers the given key. + * + * This is used when traversing the tree, to check that e.g. a concurrent page + * split didn't move pages around, so that the page we were walking to isn't + * the correct one anymore. + */ +bool +zsbt_page_is_expected(Relation rel, AttrNumber attno, zstid key, int level, Buffer buf) +{ + Page page = BufferGetPage(buf); + ZSBtreePageOpaque *opaque; + + /* + * The page might have been deleted and even reused as a completely different + * kind of a page, so we must be prepared for anything. + */ + if (PageIsNew(page)) + return false; + + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(ZSBtreePageOpaque))) + return false; + + opaque = ZSBtreePageGetOpaque(page); + if (opaque->zs_page_id != ZS_BTREE_PAGE_ID) + return false; + + if (opaque->zs_attno != attno) + return false; + + if (level != -1 && opaque->zs_level != level) + return false; + + if (opaque->zs_lokey > key || opaque->zs_hikey <= key) + return false; + + return true; +} + +/* + * Create a new btree root page, containing two downlinks. 
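+ * (The 'downlinks' list usually holds two entries, but callers such as the
+ * recompressor can pass more, one per newly created page.)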
+ * + * NOTE: the very first root page of a btree, which is also the leaf, is created + * in zsmeta_get_root_for_attribute(), not here. + * + * XXX: What if there are too many downlinks to fit on a page? Shouldn't happen + * in practice.. + */ +zs_split_stack * +zsbt_newroot(Relation rel, AttrNumber attno, int level, List *downlinks) +{ + Page metapage; + ZSMetaPage *metapg; + Buffer newrootbuf; + Page newrootpage; + ZSBtreePageOpaque *newrootopaque; + ZSBtreeInternalPageItem *items; + Buffer metabuf; + zs_split_stack *stack1; + zs_split_stack *stack2; + ListCell *lc; + int i; + + metabuf = ReadBuffer(rel, ZS_META_BLK); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* allocate a new root page */ + newrootbuf = zspage_getnewbuf(rel, metabuf); + newrootpage = palloc(BLCKSZ); + PageInit(newrootpage, BLCKSZ, sizeof(ZSBtreePageOpaque)); + newrootopaque = ZSBtreePageGetOpaque(newrootpage); + newrootopaque->zs_attno = attno; + newrootopaque->zs_next = InvalidBlockNumber; + newrootopaque->zs_lokey = MinZSTid; + newrootopaque->zs_hikey = MaxPlusOneZSTid; + newrootopaque->zs_level = level; + newrootopaque->zs_flags = ZSBT_ROOT; + newrootopaque->zs_page_id = ZS_BTREE_PAGE_ID; + + items = ZSBtreeInternalPageGetItems(newrootpage); + + /* add all the downlinks */ + i = 0; + foreach (lc, downlinks) + { + ZSBtreeInternalPageItem *downlink = (ZSBtreeInternalPageItem *) lfirst(lc); + + items[i++] = *downlink; + } + ((PageHeader) newrootpage)->pd_lower += i * sizeof(ZSBtreeInternalPageItem); + + /* FIXME: Check that all the downlinks fit on the page. */ + + /* update the metapage */ + metapage = PageGetTempPageCopy(BufferGetPage(metabuf)); + + metapg = (ZSMetaPage *) PageGetContents(metapage); + if ((attno != ZS_META_ATTRIBUTE_NUM) && (attno <= 0 || attno > metapg->nattributes)) + elog(ERROR, "invalid attribute number %d (table \"%s\" has only %d attributes)", + attno, RelationGetRelationName(rel), metapg->nattributes); + + metapg->tree_root_dir[attno].root = BufferGetBlockNumber(newrootbuf); + + stack1 = zs_new_split_stack_entry(metabuf, metapage); + stack2 = zs_new_split_stack_entry(newrootbuf, newrootpage); + stack2->next = stack1; + + return stack2; +} + +/* + * After page split, insert the downlink of 'rightblkno' to the parent. + * + * On entry, 'leftbuf' must be pinned exclusive-locked. + */ +zs_split_stack * +zsbt_insert_downlinks(Relation rel, AttrNumber attno, + zstid leftlokey, BlockNumber leftblkno, int level, + List *downlinks) +{ + int numdownlinks = list_length(downlinks); + ZSBtreeInternalPageItem *items; + int nitems; + int itemno; + Buffer parentbuf; + Page parentpage; + zs_split_stack *split_stack; + ZSBtreeInternalPageItem *firstdownlink; + + /* + * re-find parent + * + * TODO: this is a bit inefficient. Usually, we have just descended the + * tree, and if we just remembered the path we descended, we could just + * walk back up. 
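+ * (nbtree remembers the descent path in a BTStack for exactly this purpose.)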
+ */ + parentbuf = zsbt_descend(rel, attno, leftlokey, level, false); + parentpage = BufferGetPage(parentbuf); + + firstdownlink = (ZSBtreeInternalPageItem *) linitial(downlinks); + + /* Find the position in the parent for the downlink */ + items = ZSBtreeInternalPageGetItems(parentpage); + nitems = ZSBtreeInternalPageGetNumItems(parentpage); + itemno = zsbt_binsrch_internal(firstdownlink->tid, items, nitems); + + /* sanity checks */ + if (itemno < 0 || items[itemno].tid != leftlokey || + items[itemno].childblk != leftblkno) + { + elog(ERROR, "could not find downlink for block %u TID (%u, %u)", + leftblkno, ZSTidGetBlockNumber(leftlokey), + ZSTidGetOffsetNumber(leftlokey)); + } + itemno++; + + if (PageGetExactFreeSpace(parentpage) < numdownlinks * sizeof(ZSBtreeInternalPageItem)) + { + /* split internal page */ + split_stack = zsbt_split_internal_page(rel, attno, parentbuf, itemno, downlinks); + } + else + { + ZSBtreeInternalPageItem *newitems; + Page newpage; + int i; + ListCell *lc; + + newpage = PageGetTempPageCopySpecial(parentpage); + + split_stack = zs_new_split_stack_entry(parentbuf, newpage); + + /* insert the new downlink for the right page. */ + newitems = ZSBtreeInternalPageGetItems(newpage); + memcpy(newitems, items, itemno * sizeof(ZSBtreeInternalPageItem)); + + i = itemno; + foreach(lc, downlinks) + { + ZSBtreeInternalPageItem *downlink = (ZSBtreeInternalPageItem *) lfirst(lc); + + Assert(downlink->childblk != 0); + newitems[i++] = *downlink; + } + + memcpy(&newitems[i], &items[itemno], (nitems - itemno) * sizeof(ZSBtreeInternalPageItem)); + ((PageHeader) newpage)->pd_lower += (nitems + numdownlinks) * sizeof(ZSBtreeInternalPageItem); + } + return split_stack; +} + +/* + * Split an internal page. + * + * The new downlink specified by 'newkey' is inserted to position 'newoff', on 'leftbuf'. + * The page is split. 
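+ * (In the code below the page to split is 'origbuf' and the new downlinks are
+ * passed in the 'newitems' list; all of them are inserted at position
+ * 'newoff'.)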
+ */ +static zs_split_stack * +zsbt_split_internal_page(Relation rel, AttrNumber attno, Buffer origbuf, + OffsetNumber newoff, List *newitems) +{ + Page origpage = BufferGetPage(origbuf); + ZSBtreePageOpaque *origopaque = ZSBtreePageGetOpaque(origpage); + Buffer buf; + Page page; + ZSBtreeInternalPageItem *origitems; + int orignitems; + zs_split_stack *stack_first; + zs_split_stack *stack; + Size splitthreshold; + ListCell *lc; + int origitemno; + List *downlinks = NIL; + + origitems = ZSBtreeInternalPageGetItems(origpage); + orignitems = ZSBtreeInternalPageGetNumItems(origpage); + + page = PageGetTempPageCopySpecial(origpage); + buf = origbuf; + + stack = zs_new_split_stack_entry(buf, page); + stack_first = stack; + + /* XXX: currently, we always do 90/10 splits */ + splitthreshold = PageGetExactFreeSpace(page) * 0.10; + + lc = list_head(newitems); + origitemno = 0; + for (;;) + { + ZSBtreeInternalPageItem *item; + ZSBtreeInternalPageItem *p; + + if (origitemno == newoff && lc) + { + item = lfirst(lc); + lc = lnext(lc); + } + else + { + if (origitemno == orignitems) + break; + item = &origitems[origitemno]; + origitemno++; + } + + if (PageGetExactFreeSpace(page) < splitthreshold) + { + /* have to split to another page */ + ZSBtreePageOpaque *prevopaque = ZSBtreePageGetOpaque(page); + ZSBtreePageOpaque *opaque = ZSBtreePageGetOpaque(page); + BlockNumber blkno; + ZSBtreeInternalPageItem *downlink; + + buf = zspage_getnewbuf(rel, InvalidBuffer); + blkno = BufferGetBlockNumber(buf); + page = palloc(BLCKSZ); + PageInit(page, BLCKSZ, sizeof(ZSBtreePageOpaque)); + + opaque = ZSBtreePageGetOpaque(page); + opaque->zs_attno = attno; + opaque->zs_next = prevopaque->zs_next; + opaque->zs_lokey = item->tid; + opaque->zs_hikey = prevopaque->zs_hikey; + opaque->zs_level = prevopaque->zs_level; + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_BTREE_PAGE_ID; + + prevopaque->zs_next = blkno; + prevopaque->zs_hikey = item->tid; + + stack->next = zs_new_split_stack_entry(buf, page); + stack = stack->next; + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = item->tid; + downlink->childblk = blkno; + downlinks = lappend(downlinks, downlink); + } + + p = (ZSBtreeInternalPageItem *) ((char *) page + ((PageHeader) page)->pd_lower); + *p = *item; + ((PageHeader) page)->pd_lower += sizeof(ZSBtreeInternalPageItem); + } + + /* recurse to insert downlinks, if we had to split. */ + if (downlinks) + { + if ((origopaque->zs_flags & ZSBT_ROOT) != 0) + { + ZSBtreeInternalPageItem *downlink; + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = MinZSTid; + downlink->childblk = BufferGetBlockNumber(origbuf); + downlinks = lcons(downlink, downlinks); + + stack->next = zsbt_newroot(rel, attno, origopaque->zs_level + 1, downlinks); + + /* clear the ZSBT_ROOT flag on the old root page */ + ZSBtreePageGetOpaque(stack_first->page)->zs_flags &= ~ZSBT_ROOT; + } + else + { + stack->next = zsbt_insert_downlinks(rel, attno, + origopaque->zs_lokey, + BufferGetBlockNumber(origbuf), + origopaque->zs_level + 1, + downlinks); + } + } + + return stack_first; +} + + +/* + * Removes the last item from page, and unlinks the page from the tree. + * + * NOTE: you cannot remove the only leaf. Returns NULL if the page could not + * be deleted. 
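+ * (For example, zsbt_merge_pages() refuses to remove the leftmost child of a
+ * parent page, in which case NULL is returned and the caller keeps the page.)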
+ */ +zs_split_stack * +zsbt_unlink_page(Relation rel, AttrNumber attno, Buffer buf, int level) +{ + Page page = BufferGetPage(buf); + ZSBtreePageOpaque *opaque = ZSBtreePageGetOpaque(page); + Buffer leftbuf; + Buffer rightbuf; + zs_split_stack *stack; + + /* cannot currently remove the only page at its level. */ + if (opaque->zs_lokey == MinZSTid && opaque->zs_hikey == MaxPlusOneZSTid) + { + return NULL; + } + + /* + * Find left sibling. + * or if this is leftmost page, find right sibling. + */ + if (opaque->zs_lokey != MinZSTid) + { + rightbuf = buf; + leftbuf = zsbt_descend(rel, attno, opaque->zs_lokey - 1, level, false); + + stack = zsbt_merge_pages(rel, attno, leftbuf, rightbuf, false); + if (!stack) + { + UnlockReleaseBuffer(leftbuf); + return NULL; + } + } + else + { + rightbuf = zsbt_descend(rel, attno, opaque->zs_hikey, level, false); + leftbuf = buf; + stack = zsbt_merge_pages(rel, attno, leftbuf, rightbuf, true); + if (!stack) + { + UnlockReleaseBuffer(rightbuf); + return NULL; + } + } + + return stack; +} + +/* + * Page deletion: + * + * Mark page empty, remove downlink. If parent becomes empty, recursively delete it. + * + * Unlike in the nbtree index, we don't need to worry about concurrent scans. They + * will simply retry if they land on an unexpected page. + */ +static zs_split_stack * +zsbt_merge_pages(Relation rel, AttrNumber attno, Buffer leftbuf, Buffer rightbuf, bool target_is_left) +{ + Buffer parentbuf; + Page origleftpage; + Page leftpage; + Page rightpage; + ZSBtreePageOpaque *leftopaque; + ZSBtreePageOpaque *origleftopaque; + ZSBtreePageOpaque *rightopaque; + ZSBtreeInternalPageItem *parentitems; + int parentnitems; + Page parentpage; + int itemno; + zs_split_stack *stack; + zs_split_stack *stack_head; + zs_split_stack *stack_tail; + + origleftpage = BufferGetPage(leftbuf); + origleftopaque = ZSBtreePageGetOpaque(origleftpage); + rightpage = BufferGetPage(rightbuf); + rightopaque = ZSBtreePageGetOpaque(rightpage); + + /* find downlink for 'rightbuf' in the parent */ + parentbuf = zsbt_descend(rel, attno, rightopaque->zs_lokey, origleftopaque->zs_level + 1, false); + parentpage = BufferGetPage(parentbuf); + + parentitems = ZSBtreeInternalPageGetItems(parentpage); + parentnitems = ZSBtreeInternalPageGetNumItems(parentpage); + itemno = zsbt_binsrch_internal(rightopaque->zs_lokey, parentitems, parentnitems); + if (itemno < 0 || parentitems[itemno].childblk != BufferGetBlockNumber(rightbuf)) + elog(ERROR, "could not find downlink to FPM page %u", BufferGetBlockNumber(rightbuf)); + + if (parentnitems > 1 && itemno == 0) + { + /* + * Don't delete the leftmost child of a parent. That would move the + * keyspace of the parent, so we'd need to adjust the lo/hikey of + * the parent page, and the parent's downlink in the grandparent. + * Maybe later... + */ + UnlockReleaseBuffer(parentbuf); + elog(DEBUG1, "deleting leftmost child of a parent not implemented"); + return NULL; + } + + if (target_is_left) + { + /* move all items from right to left before unlinking the right page */ + leftpage = PageGetTempPageCopy(rightpage); + leftopaque = ZSBtreePageGetOpaque(leftpage); + + memcpy(leftopaque, origleftopaque, sizeof(ZSBtreePageOpaque)); + } + else + { + /* right page is empty. 
*/ + leftpage = PageGetTempPageCopy(origleftpage); + leftopaque = ZSBtreePageGetOpaque(leftpage); + } + + /* update left hikey */ + leftopaque->zs_hikey = ZSBtreePageGetOpaque(rightpage)->zs_hikey; + + Assert(ZSBtreePageGetOpaque(leftpage)->zs_level == ZSBtreePageGetOpaque(rightpage)->zs_level); + + stack = zs_new_split_stack_entry(leftbuf, leftpage); + stack_head = stack_tail = stack; + + /* Mark right page as empty/unused */ + rightpage = palloc0(BLCKSZ); + + stack = zs_new_split_stack_entry(rightbuf, rightpage); + stack->recycle = true; + stack_tail->next = stack; + stack_tail = stack; + + /* remove downlink from parent */ + if (parentnitems > 1) + { + Page newpage = PageGetTempPageCopySpecial(parentpage); + ZSBtreeInternalPageItem *newitems = ZSBtreeInternalPageGetItems(newpage); + + memcpy(newitems, parentitems, itemno * sizeof(ZSBtreeInternalPageItem)); + memcpy(&newitems[itemno], &parentitems[itemno + 1], (parentnitems - itemno -1) * sizeof(ZSBtreeInternalPageItem)); + + ((PageHeader) newpage)->pd_lower += (parentnitems - 1) * sizeof(ZSBtreeInternalPageItem); + + stack = zs_new_split_stack_entry(parentbuf, newpage); + stack_tail->next = stack; + stack_tail = stack; + } + else + { + /* the parent becomes empty as well. Recursively remove it. */ + stack_tail->next = zsbt_unlink_page(rel, attno, parentbuf, leftopaque->zs_level + 1); + if (stack_tail->next == NULL) + { + /* oops, couldn't remove the parent. Back out */ + stack = stack_head; + while (stack) + { + zs_split_stack *next = stack->next; + + pfree(stack->page); + pfree(stack); + stack = next; + } + } + } + + return stack_head; +} + +static int +zsbt_binsrch_internal(zstid key, ZSBtreeInternalPageItem *arr, int arr_elems) +{ + int low, + high, + mid; + + low = 0; + high = arr_elems; + while (high > low) + { + mid = low + (high - low) / 2; + + if (key >= arr[mid].tid) + low = mid + 1; + else + high = mid; + } + return low - 1; +} diff --git a/src/backend/access/zedstore/zedstore_compression.c b/src/backend/access/zedstore/zedstore_compression.c new file mode 100644 index 0000000000..1a1d9a018c --- /dev/null +++ b/src/backend/access/zedstore/zedstore_compression.c @@ -0,0 +1,364 @@ +/* + * zedstore_compression.c + * Routines for compression + * + * There are two implementations at the moment: LZ4, and the Postgres + * pg_lzcompress(). LZ4 support requires that the server was compiled + * with --with-lz4. + * + * The compressor works on ZSUncompressedBtreeItems. + * + * Compression interface + * --------------------- + * + * Call zs_compress_init() to initialize. + * + * Call zs_compress_begin(), to begin compressing a group of items. Pass the + * maximum amount of space it's allowed to use after compression, as argument. + * + * Feed them to the compressor one by one with zs_compress_add(), until it + * returns false. + * + * Finally, call zs_compress_finish(). It returns a ZSCompressedBtreeItem, + * which contains all the plain items that were added (except for the last one + * for which zs_compress_add() returned false) + * + * Decompression interface + * ----------------------- + * + * zs_decompress_chunk() takes a ZSCompressedBtreeItem as argument. It + * initializes a "context" with the given chunk. + * + * Call zs_decompress_read_item() to return the uncompressed items one by one. + * + * + * NOTES: + * + * Currently, the compressor accepts input, until the *uncompressed* size exceeds + * the *compressed* size available. I.e it assumes that the compressed size is never + * larger than uncompressed size. 
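+ *
+ * A typical compression cycle therefore looks roughly like this (a sketch
+ * only: error handling is omitted, and have_more_items()/next_item() stand in
+ * for whatever iteration the caller uses):
+ *
+ *     ZSCompressContext cxt;
+ *     ZSCompressedBtreeItem *citem;
+ *     ZSBtreeItem *item = next_item();
+ *
+ *     zs_compress_init(&cxt);
+ *     zs_compress_begin(&cxt, PageGetFreeSpace(page));
+ *     while (item != NULL && zs_compress_add(&cxt, item))
+ *         item = have_more_items() ? next_item() : NULL;
+ *     citem = zs_compress_finish(&cxt);   (NULL means compression failed)
+ *     ... copy citem onto the target page; it points into cxt's buffer ...
+ *     zs_compress_free(&cxt);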
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_compression.c + */ +#include "postgres.h" + +#ifdef USE_LZ4 +#include +#endif + +#include "access/zedstore_compression.h" +#include "access/zedstore_internal.h" +#include "common/pg_lzcompress.h" +#include "utils/datum.h" + + +/* + * There are two implementations at the moment: LZ4, and the Postgres + * pg_lzcompress(). LZ4 support requires that the server was compiled + * with --with-lz4. + */ +#ifdef USE_LZ4 + +/* + * Begin compression, with given max compressed size. + */ +void +zs_compress_init(ZSCompressContext *context) +{ + context->uncompressedbuffer = palloc(BLCKSZ * 10); // FIXME: arbitrary size + context->buffer = palloc(BLCKSZ); + context->maxCompressedSize = 0; + context->maxUncompressedSize = 0; + context->nitems = 0; + context->rawsize = 0; +} + +void +zs_compress_begin(ZSCompressContext *context, int maxCompressedSize) +{ + context->buffer = repalloc(context->buffer, maxCompressedSize); + + maxCompressedSize -= offsetof(ZSCompressedBtreeItem, t_payload); + if (maxCompressedSize < 0) + maxCompressedSize = 0; + + context->maxCompressedSize = maxCompressedSize; + context->nitems = 0; + context->rawsize = 0; +} + +/* + * Try to add some data to the compressed block. + * + * If it wouldn't fit, return false. + */ +bool +zs_compress_add(ZSCompressContext *context, ZSBtreeItem *item) +{ + ZSCompressedBtreeItem *chunk = (ZSCompressedBtreeItem *) context->buffer; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + Assert(item->t_tid != InvalidZSTid); + + if (LZ4_COMPRESSBOUND(context->rawsize + MAXALIGN(item->t_size)) > context->maxCompressedSize) + return false; + + memcpy(context->uncompressedbuffer + context->rawsize, item, item->t_size); + /* TODO: clear alignment padding */ + if (context->nitems == 0) + chunk->t_tid = item->t_tid; + chunk->t_lasttid = zsbt_item_lasttid(item); + context->nitems++; + context->rawsize += MAXALIGN(item->t_size); + + return true; +} + +ZSCompressedBtreeItem * +zs_compress_finish(ZSCompressContext *context) +{ + ZSCompressedBtreeItem *chunk = (ZSCompressedBtreeItem *) context->buffer; + int32 compressed_size; + + compressed_size = LZ4_compress_default(context->uncompressedbuffer, + chunk->t_payload, + context->rawsize, + context->maxCompressedSize); + if (compressed_size < 0) + return NULL; + + chunk->t_size = offsetof(ZSCompressedBtreeItem, t_payload) + compressed_size; + chunk->t_flags = ZSBT_COMPRESSED; + chunk->t_uncompressedsize = context->rawsize; + + return chunk; +} + +void +zs_compress_free(ZSCompressContext *context) +{ + pfree(context->uncompressedbuffer); + pfree(context->buffer); +} + +void +zs_decompress_init(ZSDecompressContext *context) +{ + context->buffer = NULL; + context->bufsize = 0; + context->uncompressedsize = 0; +} + +void +zs_decompress_chunk(ZSDecompressContext *context, ZSCompressedBtreeItem *chunk) +{ + Assert((chunk->t_flags & ZSBT_COMPRESSED) != 0); + Assert(chunk->t_uncompressedsize > 0); + if (context->bufsize < chunk->t_uncompressedsize) + { + if (context->buffer) + pfree(context->buffer); + context->buffer = palloc(chunk->t_uncompressedsize); + context->bufsize = chunk->t_uncompressedsize; + } + context->uncompressedsize = chunk->t_uncompressedsize; + + if (LZ4_decompress_safe(chunk->t_payload, + context->buffer, + chunk->t_size - offsetof(ZSCompressedBtreeItem, t_payload), + context->uncompressedsize) 
!= context->uncompressedsize) + elog(ERROR, "could not decompress chunk"); + + context->bytesread = 0; +} + +ZSBtreeItem * +zs_decompress_read_item(ZSDecompressContext *context) +{ + ZSBtreeItem *next; + + if (context->bytesread == context->uncompressedsize) + return NULL; + next = (ZSBtreeItem *) (context->buffer + context->bytesread); + if (context->bytesread + MAXALIGN(next->t_size) > context->uncompressedsize) + elog(ERROR, "invalid compressed item"); + context->bytesread += MAXALIGN(next->t_size); + + Assert(next->t_size >= sizeof(ZSBtreeItem)); + Assert(next->t_tid != InvalidZSTid); + + return next; +} + +void +zs_decompress_free(ZSDecompressContext *context) +{ + if (context->buffer) + pfree(context->buffer); + context->buffer = NULL; + context->bufsize = 0; + context->uncompressedsize = 0; +} + + +#else +/* PGLZ imlementation */ + +/* + * In the worst case, pg_lz outputs everything as "literals", and emits one + * "control byte" ever 8 bytes. Also, it requires 4 bytes extra at the end + * of the buffer. And add 10 bytes of slop, for good measure. + */ +#define MAX_COMPRESS_EXPANSION_OVERHEAD (8) +#define MAX_COMPRESS_EXPANSION_BYTES (4 + 10) + +/* + * Begin compression, with given max compressed size. + */ +void +zs_compress_init(ZSCompressContext *context) +{ + context->uncompressedbuffer = palloc(BLCKSZ * 10); // FIXME: arbitrary size + context->buffer = palloc(BLCKSZ); + context->maxCompressedSize = 0; + context->maxUncompressedSize = 0; + context->nitems = 0; + context->rawsize = 0; +} + +void +zs_compress_begin(ZSCompressContext *context, int maxCompressedSize) +{ + int maxUncompressedSize; + + context->buffer = repalloc(context->buffer, maxCompressedSize + 4 /* LZ slop */); + + context->maxCompressedSize = maxCompressedSize; + + /* determine the max uncompressed size */ + maxUncompressedSize = maxCompressedSize; + maxUncompressedSize -= offsetof(ZSCompressedBtreeItem, t_payload); + maxUncompressedSize -= maxUncompressedSize / MAX_COMPRESS_EXPANSION_OVERHEAD; + maxUncompressedSize -= MAX_COMPRESS_EXPANSION_BYTES; + if (maxUncompressedSize < 0) + maxUncompressedSize = 0; + context->maxUncompressedSize = maxUncompressedSize; + context->nitems = 0; + context->rawsize = 0; +} + +/* + * Try to add some data to the compressed block. + * + * If it wouldn't fit, return false. 
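+ * ('Fit' is judged against maxUncompressedSize, which zs_compress_begin()
+ * derived from the worst-case pglz expansion, so anything accepted here
+ * should still fit in the compressed buffer even if pglz cannot shrink it.)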
+ */ +bool +zs_compress_add(ZSCompressContext *context, ZSBtreeItem *item) +{ + ZSCompressedBtreeItem *chunk = (ZSCompressedBtreeItem *) context->buffer; + + Assert ((item->t_flags & ZSBT_COMPRESSED) == 0); + + if (context->rawsize + item->t_size > context->maxUncompressedSize) + return false; + + memcpy(context->uncompressedbuffer + context->rawsize, item, item->t_size); + if (context->nitems == 0) + chunk->t_tid = item->t_tid; + chunk->t_lasttid = zsbt_item_lasttid(item); + context->nitems++; + context->rawsize += MAXALIGN(item->t_size); + + return true; +} + +ZSCompressedBtreeItem * +zs_compress_finish(ZSCompressContext *context) +{ + ZSCompressedBtreeItem *chunk = (ZSCompressedBtreeItem *) context->buffer; + int32 compressed_size; + + compressed_size = pglz_compress(context->uncompressedbuffer, context->rawsize, + chunk->t_payload, + PGLZ_strategy_always); + if (compressed_size < 0) + return NULL; + + chunk->t_size = offsetof(ZSCompressedBtreeItem, t_payload) + compressed_size; + chunk->t_flags = ZSBT_COMPRESSED; + chunk->t_uncompressedsize = context->rawsize; + + return chunk; +} + +void +zs_compress_free(ZSCompressContext *context) +{ + pfree(context->uncompressedbuffer); + pfree(context->buffer); +} + +void +zs_decompress_init(ZSDecompressContext *context) +{ + context->buffer = NULL; + context->bufsize = 0; + context->uncompressedsize = 0; +} + +void +zs_decompress_chunk(ZSDecompressContext *context, ZSCompressedBtreeItem *chunk) +{ + Assert((chunk->t_flags & ZSBT_COMPRESSED) != 0); + Assert(chunk->t_uncompressedsize > 0); + if (context->bufsize < chunk->t_uncompressedsize) + { + if (context->buffer) + pfree(context->buffer); + context->buffer = palloc(chunk->t_uncompressedsize); + context->bufsize = chunk->t_uncompressedsize; + } + context->uncompressedsize = chunk->t_uncompressedsize; + + if (pglz_decompress(chunk->t_payload, + chunk->t_size - offsetof(ZSCompressedBtreeItem, t_payload), + context->buffer, + context->uncompressedsize, true) != context->uncompressedsize) + elog(ERROR, "could not decompress chunk"); + + context->bytesread = 0; +} + +ZSBtreeItem * +zs_decompress_read_item(ZSDecompressContext *context) +{ + ZSBtreeItem *next; + + if (context->bytesread == context->uncompressedsize) + return NULL; + next = (ZSBtreeItem *) (context->buffer + context->bytesread); + if (context->bytesread + MAXALIGN(next->t_size) > context->uncompressedsize) + elog(ERROR, "invalid compressed item"); + context->bytesread += MAXALIGN(next->t_size); + + Assert(next->t_size >= sizeof(ZSBtreeItem)); + Assert(next->t_tid != InvalidZSTid); + + return next; +} + +void +zs_decompress_free(ZSDecompressContext *context) +{ + if (context->buffer) + pfree(context->buffer); + context->buffer = NULL; + context->bufsize = 0; + context->uncompressedsize = 0; +} + +#endif /* !USE_LZ4 */ diff --git a/src/backend/access/zedstore/zedstore_freepagemap.c b/src/backend/access/zedstore/zedstore_freepagemap.c new file mode 100644 index 0000000000..efd01fd6c6 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_freepagemap.c @@ -0,0 +1,1076 @@ +/*------------------------------------------------------------------------- + * + * zedstore_freepagemap.c + * ZedStore free space management + * + * The Free Page Map keeps track of unused pages in the relation. + * + * The FPM is a b-tree, indexed by physical block number. To be more compact, + * it stores "extents", i.e. block ranges, rather than just blocks, when + * possible. + + * Design principles: + * + * - it's ok to have a block incorrectly stored in the FPM. 
Before actually + * reusing a page, we must check that it's safe. + * + * - a deletable page must be simple to detect just by looking at the page, + * and perhaps a few other pages. It should *not* require scanning the + * whole table, or even a whole b-tree. For example, if a column is dropped, + * we can detect if a b-tree page belongs to the dropped column just by + * looking at the information (the attribute number) stored in the page + * header. + * + * - if a page is deletable, it should become immediately reusable. No + * "wait out all possible readers that might be about to follow a link + * to it" business. All code that reads pages need to keep pages locked + * while following a link, or be prepared to retry if they land on an + * unexpected page. + * + * + * TODO: + * + * - Avoid fragmentation. If B-tree page is split, try to hand out a page + * that's close to the old page. When the relation is extended, allocate + * a larger chunk at once. + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_freepagemap.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/zedstore_internal.h" +#include "miscadmin.h" +#include "storage/bufpage.h" +#include "storage/lmgr.h" +#include "utils/rel.h" + +/* + * On-disk format of the Free Page Map. + * + * The FPM is a b-tree, indexed by block number. Each page contains a + * ZSFreePageMapOpaque in the "special area", and an array of + * ZSFreePageMapItems as the content (ie. after the normal page header, + * up to pd_lower). On an internal page, each item contains the starting + * block number, and a pointer to the child FPM page. On a leaf page, + * each entry contains the start and end of the block range that the item + * represents. + * + * The block ranges stored on leaf pages must not overlap! + */ +typedef struct +{ + BlockNumber zs_lokey; /* inclusive */ + BlockNumber zs_hikey; /* exclusive */ + uint16 zs_level; /* 0 = leaf */ + uint16 zs_flags; + char padding[2]; /* padding, to put zs_page_id last */ + uint16 zs_page_id; /* always ZS_FPM_PAGE_ID */ +} ZSFreePageMapOpaque; + +typedef struct +{ + BlockNumber zs_startblk; /* inclusive */ + union { + BlockNumber zs_endblk; /* on a leaf page, end of extent, exclusive */ + BlockNumber zs_downlink; /* on an internal page, pointer to child */ + } u; +} ZSFreePageMapItem; + +#define ZSFreePageMapGetOpaque(page) ((ZSFreePageMapOpaque *) PageGetSpecialPointer(page)) + +/* overlap, or touch? 
*/ +static inline bool +zsextent_overlap(BlockNumber start1, BlockNumber end1, BlockNumber start2, BlockNumber end2) +{ + if (start2 < end1) + return false; + if (start1 < end2) + return false; + return true; +} + +static inline ZSFreePageMapItem * +ZSFreePageMapPageGetItems(Page page) +{ + ZSFreePageMapItem *items; + + items = (ZSFreePageMapItem *) PageGetContents(page); + + return items; +} +static inline int +ZSFreePageMapPageGetNumItems(Page page) +{ + ZSFreePageMapItem *begin; + ZSFreePageMapItem *end; + + begin = (ZSFreePageMapItem *) PageGetContents(page); + end = (ZSFreePageMapItem *) ((char *) page + ((PageHeader) page)->pd_lower); + + return end - begin; +} + +static zs_split_stack *zsfpm_unlink_page(Relation rel, Buffer buf, int level, Buffer metabuf); +static zs_split_stack *zsfpm_merge_pages(Relation rel, Buffer leftbuf, Buffer rightbuf, bool target_is_left, Buffer metabuf); +static BlockNumber zsfpm_consume_page(Relation rel, Buffer metabuf); +static void zsfpm_insert(Relation rel, BlockNumber startblk, BlockNumber endblk); +static zs_split_stack *zsfpm_split(Relation rel, Buffer leftbuf, + int newpos, ZSFreePageMapItem *newitem); +static zs_split_stack *zsfpm_insert_downlink(Relation rel, Buffer leftbuf, + BlockNumber rightlokey, BlockNumber rightblkno); +static zs_split_stack *zsfpm_newroot(Relation rel, Buffer metabuf, int level, + ZSFreePageMapItem *item1, ZSFreePageMapItem *item2); +static Buffer zsfpm_descend(Relation rel, Buffer metabuf, BlockNumber key, int level); +static int zsfpm_binsrch_blkno(BlockNumber key, ZSFreePageMapItem *arr, int arr_elems); + +/* + * zspage_is_recyclable() + * + * Is the current page recyclable? + * + * It can be: + * + * - an empty, all-zeros page, + * - explicitly marked as deleted, + * - an UNDO page older than oldest_undo_ptr + * - a b-tree page belonging to a deleted attribute + * - a TOAST page belonging to a dead item + * + */ +static bool +zspage_is_recyclable(Buffer buf) +{ + if (PageIsNew(BufferGetPage(buf))) + return true; + return false; +} + + +static void +zsfpm_delete_leaf(Relation rel, Buffer buf, Buffer metabuf) +{ + Page page = BufferGetPage(buf); + ZSFreePageMapOpaque *opaque = ZSFreePageMapGetOpaque(page); + + if (opaque->zs_lokey == 0 && opaque->zs_hikey == MaxBlockNumber + 1) + { + /* Don't delete the last leaf page. Just mark it empty */ + START_CRIT_SECTION(); + + ((PageHeader) page)->pd_lower = SizeOfPageHeaderData; + + MarkBufferDirty(buf); + + /* TODO: WAL-log */ + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + + return; + } + else + { + zs_split_stack *stack; + + stack = zsfpm_unlink_page(rel, buf, 0, metabuf); + + /* apply the changes */ + zs_apply_split_changes(rel, stack); + } +} + +/* + * Removes the last item from page, and unlinks the page from the tree. + * + * + * NOTE: you cannot remove the only leaf. + */ +static zs_split_stack * +zsfpm_unlink_page(Relation rel, Buffer buf, int level, Buffer metabuf) +{ + Page page = BufferGetPage(buf); + ZSFreePageMapOpaque *opaque = ZSFreePageMapGetOpaque(page); + Buffer leftbuf; + Buffer rightbuf; + bool target_is_left; + + Assert(opaque->zs_lokey != 0 || opaque->zs_hikey != MaxBlockNumber + 1); + Assert(ZSFreePageMapPageGetNumItems(page) == 1); + + /* + * Find left sibling. + * or if this is leftmost page, find right sibling. 
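+ * (Same approach as zsbt_unlink_page() in zedstore_btree.c, but operating on
+ * the free page map tree.)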
+ */ + if (opaque->zs_lokey != 0) + { + rightbuf = buf; + leftbuf = zsfpm_descend(rel, metabuf, opaque->zs_lokey - 1, level); + target_is_left = false; + } + else + { + rightbuf = zsfpm_descend(rel, metabuf, opaque->zs_hikey, level); + leftbuf = buf; + target_is_left = true; + } + + return zsfpm_merge_pages(rel, leftbuf, rightbuf, target_is_left, metabuf); +} + +/* + * Page deletion: + * + * Mark page empty, remove downlink. If parent becomes empty, recursively delete it. + * + * Unlike in the nbtree index, we don't need to worry about concurrent scans. They + * will simply retry if they land on an unexpected page. + */ +static zs_split_stack * +zsfpm_merge_pages(Relation rel, Buffer leftbuf, Buffer rightbuf, bool target_is_left, Buffer metabuf) +{ + Buffer parentbuf; + Page origleftpage; + Page leftpage; + Page rightpage; + ZSFreePageMapOpaque *leftopaque; + ZSFreePageMapOpaque *rightopaque; + ZSFreePageMapItem *leftitems; + ZSFreePageMapItem *origleftitems; + ZSFreePageMapItem *rightitems; + ZSFreePageMapItem *parentitems; + int origleftnitems; + int rightnitems; + int parentnitems; + Page parentpage; + int itemno; + zs_split_stack *stack; + zs_split_stack *stack_head; + zs_split_stack *stack_tail; + + origleftpage = BufferGetPage(leftbuf); + leftpage = PageGetTempPageCopySpecial(origleftpage); + leftopaque = ZSFreePageMapGetOpaque(leftpage); + + origleftitems = ZSFreePageMapPageGetItems(origleftpage); + origleftnitems = ZSFreePageMapPageGetNumItems(origleftpage); + + leftitems = ZSFreePageMapPageGetItems(leftpage); + + rightpage = BufferGetPage(rightbuf); + rightopaque = ZSFreePageMapGetOpaque(rightpage); + rightitems = ZSFreePageMapPageGetItems(rightpage); + rightnitems = ZSFreePageMapPageGetNumItems(rightpage); + + /* move all items from right to left */ + + if (target_is_left) + { + Assert(origleftnitems == 1); + + memcpy(leftitems, + rightitems, + rightnitems * sizeof(ZSFreePageMapItem)); + ((PageHeader) leftpage)->pd_lower += rightnitems * sizeof(ZSFreePageMapItem); + } + else + { + origleftitems = ZSFreePageMapPageGetItems(origleftpage); + leftitems = ZSFreePageMapPageGetItems(leftpage); + + Assert(rightnitems == 1); + + memcpy(leftitems, + origleftitems, + origleftnitems * sizeof(ZSFreePageMapItem)); + } + + /* update left hikey */ + leftopaque->zs_hikey = ZSFreePageMapGetOpaque(rightpage)->zs_hikey; + + Assert(ZSFreePageMapGetOpaque(leftpage)->zs_level == ZSFreePageMapGetOpaque(rightpage)->zs_level); + + stack = zs_new_split_stack_entry(leftbuf, leftpage); + stack_head = stack_tail = stack; + + /* Mark right page as empty/unused */ + rightpage = palloc0(BLCKSZ); + + stack = zs_new_split_stack_entry(rightbuf, rightpage); + stack->recycle = true; + stack_tail->next = stack; + stack_tail = stack; + + /* find downlink for 'rightbuf' in the parent */ + parentbuf = zsfpm_descend(rel, metabuf, rightopaque->zs_lokey, leftopaque->zs_level + 1); + parentpage = BufferGetPage(parentbuf); + + parentitems = ZSFreePageMapPageGetItems(parentpage); + parentnitems = ZSFreePageMapPageGetNumItems(parentpage); + itemno = zsfpm_binsrch_blkno(rightopaque->zs_lokey, parentitems, parentnitems); + if (itemno < 0 || parentitems[itemno].u.zs_downlink != BufferGetBlockNumber(rightbuf)) + elog(ERROR, "could not find downlink to FPM page %u", BufferGetBlockNumber(rightbuf)); + + /* remove downlink from parent */ + if (parentnitems > 1) + { + Page newpage = PageGetTempPageCopySpecial(parentpage); + ZSFreePageMapItem *newitems = ZSFreePageMapPageGetItems(newpage); + + memcpy(newitems, parentitems, itemno * 
sizeof(ZSFreePageMapItem)); + memcpy(&newitems[itemno], &parentitems[itemno + 1], (parentnitems - itemno -1) * sizeof(ZSFreePageMapItem)); + + ((PageHeader) newpage)->pd_lower += (parentnitems - 1) * sizeof(ZSFreePageMapItem); + + stack = zs_new_split_stack_entry(parentbuf, newpage); + stack_tail->next = stack; + stack_tail = stack; + } + else + { + /* the parent becomes empty as well. Recursively remove it. */ + stack_tail->next = zsfpm_unlink_page(rel, parentbuf, leftopaque->zs_level + 1, metabuf); + } + return stack_head; +} + +/* + * Allocate a new page. + * + * The page is exclusive-locked, but not initialized. + */ +Buffer +zspage_getnewbuf(Relation rel, Buffer metabuf) +{ + bool release_metabuf; + Buffer buf; + BlockNumber blk; + + if (metabuf == InvalidBuffer) + { + metabuf = ReadBuffer(rel, ZS_META_BLK); + LockBuffer(metabuf, BUFFER_LOCK_SHARE); + release_metabuf = true; + } + else + release_metabuf = false; + +retry: + /* Get a block from the FPM. */ + blk = zsfpm_consume_page(rel, metabuf); + if (blk == 0) + { + /* metapage, not expected */ + elog(ERROR, "could not find valid page in FPM"); + } + if (blk == InvalidBlockNumber) + { + /* No free pages. Have to extend the relation. */ + buf = zspage_extendrel_newbuf(rel); + blk = BufferGetBlockNumber(buf); + } + else + { + buf = ReadBuffer(rel, blk); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* Check that the page really is unused. */ + if (!zspage_is_recyclable(buf)) + { + UnlockReleaseBuffer(buf); + goto retry; + } + } + + if (release_metabuf) + UnlockReleaseBuffer(metabuf); + return buf; +} + +/* + * Extend the relation. + * + * Returns the new page, exclusive-locked. + */ +Buffer +zspage_extendrel_newbuf(Relation rel) +{ + Buffer buf; + bool needLock; + + /* + * Extend the relation by one page. + * + * We have to use a lock to ensure no one else is extending the rel at + * the same time, else we will both try to initialize the same new + * page. We can skip locking for new or temp relations, however, + * since no one else could be accessing them. + */ + needLock = !RELATION_IS_LOCAL(rel); + + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + + buf = ReadBuffer(rel, P_NEW); + + /* Acquire buffer lock on new page */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Release the file-extension lock; it's now OK for someone else to + * extend the relation some more. Note that we cannot release this + * lock before we have buffer lock on the new page, or we risk a race + * condition against btvacuumscan --- see comments therein. + */ + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + return buf; +} + + +/* + * Explictly mark a page as deleted and recyclable, and add it to the FPM. + * + * The caller must hold an exclusive-lock on the page. + */ +void +zspage_delete_page(Relation rel, Buffer buf) +{ + BlockNumber blk = BufferGetBlockNumber(buf); + Page page; + + page = BufferGetPage(buf); + memset(page, 0, BLCKSZ); + + zsfpm_insert(rel, blk, blk + 1); +} + +/* + * Remove and return a page from the FPM. + */ +static BlockNumber +zsfpm_consume_page(Relation rel, Buffer metabuf) +{ + /* TODO: add some smarts, to allocate the page nearby old page, etc. */ + /* currently, we just pick the first available page. 
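+	 *
+	 * For example (illustrative contents): if the leftmost FPM leaf holds
+	 * the ranges [10, 14) and [20, 21), we hand out block 10 and shrink the
+	 * first range to [11, 14). Once a range becomes empty it is removed
+	 * from the page, and when the last range is gone the leaf itself is
+	 * deleted via zsfpm_delete_leaf().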
*/ + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber rootblk; + Buffer buf; + Page page; + ZSFreePageMapItem *items; + int nitems; + BlockNumber result; + + metapage = BufferGetPage(metabuf); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + rootblk = metaopaque->zs_fpm_root; + + if (rootblk == InvalidBlockNumber) + return InvalidBlockNumber; + + buf = zsfpm_descend(rel, metabuf, 0, 0); + page = BufferGetPage(buf); + + items = ZSFreePageMapPageGetItems(page); + nitems = ZSFreePageMapPageGetNumItems(page); + + if (nitems == 0) + { + UnlockReleaseBuffer(buf); + return InvalidBlockNumber; + } + + result = items[0].zs_startblk; + items[0].zs_startblk++; + if (items[0].u.zs_endblk == items[0].zs_startblk) + { + if (nitems > 1) + { + memmove(&items[0], + &items[1], + (nitems - 1) * sizeof(ZSFreePageMapItem)); + ((PageHeader) page)->pd_lower -= sizeof(ZSFreePageMapItem); + + UnlockReleaseBuffer(buf); + } + else + { + zsfpm_delete_leaf(rel, buf, metabuf); + /* zsfpm_delete_leaf() released 'buf' */ + } + } + else + { + UnlockReleaseBuffer(buf); + } + return result; +} + +/* + * Add a block range to the FPM. + */ +static void +zsfpm_insert(Relation rel, BlockNumber startblk, BlockNumber endblk) +{ + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber rootblk; + Buffer buf; + Page page; + ZSFreePageMapItem *items; + int nitems; + int pos; + int replacepos_first; + int replacepos_last; + + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* TODO: get shared lock first */ + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + rootblk = metaopaque->zs_fpm_root; + + if (rootblk == InvalidBlockNumber) + { + /* Create a new FPM root page */ + ZSFreePageMapOpaque *opaque; + + buf = zspage_extendrel_newbuf(rel); + page = BufferGetPage(buf); + rootblk = BufferGetBlockNumber(buf); + + PageInit(page, BLCKSZ, sizeof(ZSFreePageMapOpaque)); + opaque = ZSFreePageMapGetOpaque(page); + opaque->zs_lokey = 0; + opaque->zs_hikey = MaxBlockNumber + 1; + opaque->zs_level = 0; + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_FPM_PAGE_ID; + + metaopaque->zs_fpm_root = rootblk; + + items = ZSFreePageMapPageGetItems(page); + Assert(ZSFreePageMapPageGetNumItems(page) == 0); + items[0].zs_startblk = startblk; + items[0].u.zs_endblk = endblk; + + /* TODO: WAL-logging */ + + MarkBufferDirty(metabuf); + MarkBufferDirty(buf); + + UnlockReleaseBuffer(metabuf); + UnlockReleaseBuffer(buf); + return; + } + + /* Descend to the correct leaf page for this block */ + + buf = zsfpm_descend(rel, metabuf, startblk, 0); + + UnlockReleaseBuffer(metabuf); + + page = BufferGetPage(buf); + items = ZSFreePageMapPageGetItems(page); + nitems = ZSFreePageMapPageGetNumItems(page); + + pos = zsfpm_binsrch_blkno(startblk, items, nitems); + + /* FIXME: this merging business won't work correctly if the range crosses + * a b-tree page boundary. Not a problem currently, when we only insert + * individual pages. + */ + + /* Check if this item can be merged with the previous item */ + replacepos_first = -1; + if (pos >= 0 && items[pos].u.zs_endblk >= startblk) + { + replacepos_first = pos; + } + /* If not, can this be merged with the next item? 
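+	 *
+	 * (Illustrative example: inserting [15, 16) when the next item is
+	 * [16, 20) extends that item to cover [15, 20).)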
*/ + else if (pos + 1 < nitems && endblk >= items[pos + 1].zs_startblk) + { + /* yes, merge */ + replacepos_first = pos + 1; + } + + if (replacepos_first >= 0) + { + /* adjust the start block of this item */ + if (startblk < items[replacepos_first].zs_startblk) + { + items[replacepos_first].zs_startblk = startblk; + } + + /* + * The new end block might overlap with any number of existing + * ranges. Replace all overlapping ranges with one range that + * covers them all. + */ + replacepos_last = replacepos_first; + if (endblk > items[replacepos_first].u.zs_endblk) + { + int j; + BlockNumber replace_end; + + replace_end = endblk; + + for (j = replacepos_first + 1; j < nitems; j++) + { + if (items[j].zs_startblk > replace_end) + break; + + /* + * This item will be replaced. Check the end, to see + * if this is the last one that can be replaced. + */ + replacepos_last = j; + + if (items[j].u.zs_endblk > replace_end) + { + replace_end = items[j].u.zs_endblk; + break; + } + } + + items[replacepos_first].u.zs_endblk = replace_end; + } + + /* we already adjusted the item at 'replacepos_first'. Remove the rest. */ + if (replacepos_last > replacepos_first) + { + int move_items = nitems - replacepos_last; + int remain_items = nitems - (replacepos_last - replacepos_first); + + if (move_items > 0) + memmove(&items[replacepos_first + 1], + &items[replacepos_last + 1], + move_items * sizeof(ZSFreePageMapItem)); + + ((PageHeader) page)->pd_lower = SizeOfPageHeaderData + remain_items * sizeof(ZSFreePageMapItem); + + } + + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); + + return; + } + + /* + * No overlap with any existing ranges. Add a new one. This might require + * splitting the page. + */ + pos = pos + 1; + + if (PageGetExactFreeSpace(page) >= sizeof(ZSFreePageMapItem)) + { + START_CRIT_SECTION(); + + memmove(&items[pos + 1], + &items[pos], + (nitems - pos) * sizeof(ZSFreePageMapItem)); + + items[pos].zs_startblk = startblk; + items[pos].u.zs_endblk = endblk; + + ((PageHeader) page)->pd_lower += sizeof(ZSFreePageMapItem); + + /* TODO: WAL-log */ + + MarkBufferDirty(buf); + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + return; + } + else + { + /* last resort: split the page */ + zs_split_stack *split_stack; + ZSFreePageMapItem newitem; + + newitem.zs_startblk = startblk; + newitem.u.zs_endblk = endblk; + split_stack = zsfpm_split(rel, buf, pos, &newitem); + + /* write out the temporary page copies */ + zs_apply_split_changes(rel, split_stack); + } +} + +/* + * Insert a downlink for right page, after splitting 'leftbuf' FPM page. + */ +static zs_split_stack * +zsfpm_insert_downlink(Relation rel, Buffer leftbuf, + BlockNumber rightlokey, BlockNumber rightblkno) +{ + Buffer parentbuf; + Page leftpage = BufferGetPage(leftbuf); + BlockNumber leftblkno = BufferGetBlockNumber(leftbuf); + ZSFreePageMapOpaque *leftopaque = ZSFreePageMapGetOpaque(leftpage); + zstid leftlokey = leftopaque->zs_lokey; + ZSFreePageMapItem downlink; + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber rootblk; + Page parentpage; + ZSFreePageMapItem *items; + int nitems; + int pos; + zs_split_stack *split_stack; + + /* + * First, find the parent of 'leftbuf'. + * + * TODO: this is a bit inefficient. Usually, we have just descended the + * tree, and if we just remembered the path we descended, we could just + * walk back up. 
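+	 *
+	 * Instead, we re-read the root from the metapage and use zsfpm_descend()
+	 * with the left page's lokey, one level above the page that was split;
+	 * if the left page is the root itself, we grow the tree with
+	 * zsfpm_newroot() instead.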
+ */ + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_SHARE); + metapage = BufferGetPage(metabuf); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + rootblk = metaopaque->zs_fpm_root; + + if (rootblk == BufferGetBlockNumber(leftbuf)) + { + /* Root split. Create new root with downlinks for the left and right page. */ + ZSFreePageMapItem downlink1; + ZSFreePageMapItem downlink2; + + /* re-acquire the lock on metapage in exclusive mode */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + /* + * No one should have been able to change the root pointer, because we were + * holding a lock on the root page + */ + Assert(metaopaque->zs_fpm_root == BufferGetBlockNumber(leftbuf)); + + downlink1.zs_startblk = leftlokey; + downlink1.u.zs_downlink = leftblkno; + downlink2.zs_startblk = rightlokey; + downlink2.u.zs_downlink = rightblkno; + + return zsfpm_newroot(rel, metabuf, leftopaque->zs_level + 1, + &downlink1, &downlink2); + } + + UnlockReleaseBuffer(metabuf); + + parentbuf = zsfpm_descend(rel, metabuf, leftlokey, leftopaque->zs_level + 1); + parentpage = BufferGetPage(parentbuf); + + downlink.zs_startblk = rightlokey; + downlink.u.zs_downlink = rightblkno; + + /* insert the item */ + items = ZSFreePageMapPageGetItems(parentpage); + nitems = ZSFreePageMapPageGetNumItems(parentpage); + + pos = zsfpm_binsrch_blkno(rightlokey, items, nitems); + pos = pos + 1; + + if (PageGetExactFreeSpace(parentpage) >= sizeof(ZSFreePageMapItem)) + { + ZSFreePageMapItem *newitems; + Page newpage; + + newpage = PageGetTempPageCopySpecial(parentpage); + + split_stack = zs_new_split_stack_entry(parentbuf, newpage); + + newitems = ZSFreePageMapPageGetItems(newpage); + memcpy(newitems, items, pos * sizeof(ZSFreePageMapItem)); + + newitems[pos] = downlink; + + memcpy(&newitems[pos + 1], &items[pos], (nitems - pos) * sizeof(ZSFreePageMapItem)); + + ((PageHeader) newpage)->pd_lower += (nitems + 1) * sizeof(ZSFreePageMapItem); + + } + else + { + /* have to split the page. */ + split_stack = zsfpm_split(rel, parentbuf, pos, &downlink); + } + return split_stack; +} + +/* + * Split a page for insertion of 'newitem', at 'newpos'. + * + * A page split needs to modify the page being split, the block allocated for + * the new page, and also the downlink in the parent. If the parent needs to + * be split as well, its parent also needs to be recursively updated, all the + * way up to the root page, in the worst case. zsfpm_split() doesn't modify + * any pages directly, but locks them exclusively, and returns a list of + * zs_split_stack structs to represent the modifications. The caller must + * WAL-log and apply all the changes represented by the list. 
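+ *
+ * In this file the caller does that by handing the returned list to
+ * zs_apply_split_changes(); see zsfpm_insert() and zsfpm_delete_leaf() for
+ * examples.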
+ */ +static zs_split_stack * +zsfpm_split(Relation rel, Buffer leftbuf, int newpos, ZSFreePageMapItem *newitem) +{ + Buffer rightbuf; + Page origpage = BufferGetPage(leftbuf); + Page leftpage; + Page rightpage; + BlockNumber rightblkno; + ZSFreePageMapOpaque *leftopaque; + ZSFreePageMapOpaque *rightopaque; + ZSFreePageMapItem *origitems; + ZSFreePageMapItem *leftitems; + ZSFreePageMapItem *rightitems; + int orignitems; + int leftnitems; + int rightnitems; + int splitpoint; + BlockNumber splitkey; + bool newitemonleft; + int i; + zs_split_stack *stack1; + zs_split_stack *stack2; + + leftpage = PageGetTempPageCopySpecial(origpage); + leftopaque = ZSFreePageMapGetOpaque(leftpage); + + /* + * FIXME: can't use the FPM to get a page, because we might deadlock with + * ourself. We could steal a block from the page we're splitting... + */ + rightbuf = zspage_extendrel_newbuf(rel); + rightblkno = BufferGetBlockNumber(rightbuf); + + rightpage = palloc(BLCKSZ); + PageInit(rightpage, BLCKSZ, sizeof(ZSFreePageMapOpaque)); + rightopaque = ZSFreePageMapGetOpaque(rightpage); + + /* + * Figure out the split point. + * + * TODO: currently, always do 90/10 split. + */ + origitems = ZSFreePageMapPageGetItems(origpage); + orignitems = ZSFreePageMapPageGetNumItems(origpage); + splitpoint = orignitems * 0.9; + splitkey = origitems[splitpoint].zs_startblk; + newitemonleft = (newitem->zs_startblk < splitkey); + + /* Set up the page headers */ + rightopaque->zs_lokey = splitkey; + rightopaque->zs_hikey = leftopaque->zs_hikey; + rightopaque->zs_level = leftopaque->zs_level; + rightopaque->zs_flags = 0; + rightopaque->zs_page_id = ZS_FPM_PAGE_ID; + + leftopaque->zs_hikey = splitkey; + + /* copy the items */ + leftitems = ZSFreePageMapPageGetItems(leftpage); + leftnitems = 0; + rightitems = ZSFreePageMapPageGetItems(rightpage); + rightnitems = 0; + + for (i = 0; i < orignitems; i++) + { + if (i == newpos) + { + if (newitemonleft) + leftitems[leftnitems++] = *newitem; + else + rightitems[rightnitems++] = *newitem; + } + + if (i < splitpoint) + leftitems[leftnitems++] = origitems[i]; + else + rightitems[rightnitems++] = origitems[i]; + } + /* cope with possibility that newitem goes at the end */ + if (i <= newpos) + { + Assert(!newitemonleft); + rightitems[rightnitems++] = *newitem; + } + ((PageHeader) leftpage)->pd_lower += leftnitems * sizeof(ZSFreePageMapItem); + ((PageHeader) rightpage)->pd_lower += rightnitems * sizeof(ZSFreePageMapItem); + + Assert(leftnitems + rightnitems == orignitems + 1); + + stack1 = zs_new_split_stack_entry(leftbuf, leftpage); + stack2 = zs_new_split_stack_entry(rightbuf, rightpage); + stack1->next = stack2; + + /* recurse to insert downlink. 
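+	 *
+	 * If the parent overflows as well, zsfpm_insert_downlink() splits it in
+	 * turn and chains those entries onto the same list, so 'stack1' always
+	 * heads the complete set of changes to apply.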
*/ + stack2->next = zsfpm_insert_downlink(rel, leftbuf, splitkey, rightblkno); + + return stack1; +} + +static zs_split_stack * +zsfpm_newroot(Relation rel, Buffer metabuf, int level, + ZSFreePageMapItem *item1, ZSFreePageMapItem *item2) +{ + /* Create a new FPM root page */ + Page metapage; + ZSMetaPageOpaque *metaopaque; + ZSFreePageMapOpaque *opaque; + Buffer buf; + Page page; + BlockNumber rootblk; + ZSFreePageMapItem *items; + zs_split_stack *stack1; + zs_split_stack *stack2; + + metapage = PageGetTempPageCopy(BufferGetPage(metabuf)); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + /* TODO: get the page from the FPM */ + buf = zspage_extendrel_newbuf(rel); + rootblk = BufferGetBlockNumber(buf); + + page = palloc(BLCKSZ); + PageInit(page, BLCKSZ, sizeof(ZSFreePageMapOpaque)); + opaque = ZSFreePageMapGetOpaque(page); + opaque->zs_lokey = 0; + opaque->zs_hikey = MaxBlockNumber + 1; + opaque->zs_level = level; + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_FPM_PAGE_ID; + + items = ZSFreePageMapPageGetItems(page); + items[0] = *item1; + items[1] = *item2; + ((PageHeader) page)->pd_lower += 2 * sizeof(ZSFreePageMapItem); + Assert(ZSFreePageMapPageGetNumItems(page) == 2); + + metaopaque->zs_fpm_root = rootblk; + + stack1 = zs_new_split_stack_entry(metabuf, metapage); + + stack2 = zs_new_split_stack_entry(buf, page); + stack2->next = stack1; + + return stack2; +} + +static Buffer +zsfpm_descend(Relation rel, Buffer metabuf, BlockNumber key, int level) +{ + BlockNumber next; + Buffer buf; + Page page; + ZSFreePageMapOpaque *opaque; + ZSFreePageMapItem *items; + int nitems; + int itemno; + int nextlevel = -1; + BlockNumber failblk = InvalidBlockNumber; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber rootblk; + + metapage = BufferGetPage(metabuf); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + rootblk = metaopaque->zs_fpm_root; + + next = rootblk; + for (;;) + { + /* + * If we arrive again to a block that was a dead-end earlier, it seems + * that the tree is corrupt. + * + * XXX: It's theoretically possible that the block was removed, but then + * added back at the same location, and removed again. So perhaps retry + * a few times? + */ + if (next == failblk) + elog(ERROR, "could not descend to block %u in FPM", key); + + buf = ReadBuffer(rel, next); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* TODO: shared */ + page = BufferGetPage(buf); + opaque = ZSFreePageMapGetOpaque(page); + + if (nextlevel == -1) + nextlevel = opaque->zs_level; + else if (opaque->zs_level != nextlevel) + elog(ERROR, "unexpected level encountered when descending FPM tree"); + + if (opaque->zs_level < level) + elog(ERROR, "unexpected page level encountered"); + + /* + * Do we need to walk right? This could happen if the page was concurrently split. + * + * XXX: actually, we restart from root. We're holding a lock on the metapage, + * so the root cannot change. 
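+		 *
+		 * We remember the dead end in 'failblk' before restarting; arriving
+		 * at the same block a second time is treated as tree corruption and
+		 * raises an error.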
+ */ + if (key >= opaque->zs_hikey) + { + /* Restart from the root */ + failblk = next; + next = rootblk; + nextlevel = -1; + } + else + { + if (opaque->zs_level == level) + return buf; + + /* Find the downlink and follow it */ + items = ZSFreePageMapPageGetItems(page); + nitems = ZSFreePageMapPageGetNumItems(page); + + itemno = zsfpm_binsrch_blkno(key, items, nitems); + + if (itemno < 0) + elog(ERROR, "could not descend FPM tree for key blk %u", key); + + next = items[itemno].u.zs_downlink; + nextlevel--; + } + UnlockReleaseBuffer(buf); + } +} + + +static int +zsfpm_binsrch_blkno(BlockNumber key, ZSFreePageMapItem *arr, int arr_elems) +{ + int low, + high, + mid; + + low = 0; + high = arr_elems; + while (high > low) + { + mid = low + (high - low) / 2; + + if (key >= arr[mid].zs_startblk) + low = mid + 1; + else + high = mid; + } + return low - 1; +} diff --git a/src/backend/access/zedstore/zedstore_inspect.c b/src/backend/access/zedstore/zedstore_inspect.c new file mode 100644 index 0000000000..4992c52102 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_inspect.c @@ -0,0 +1,448 @@ +/*------------------------------------------------------------------------- + * + * zedstoream_inspect.c + * Debugging functions, for viewing ZedStore page contents + * + * These should probably be moved to contrib/, but it's handy to have them + * here during development. + * + * Example queries + * --------------- + * + * How many pages of each type a table has? + * + * select count(*), pg_zs_page_type('t_zedstore', g) + * from generate_series(0, pg_table_size('t_zedstore') / 8192 - 1) g group by 2; + * + * count | pg_zs_page_type + * -------+----------------- + * 1 | META + * 3701 | BTREE + * 6 | UNDO + * (3 rows) + * + * Compression ratio of B-tree leaf pages (other pages are not compressed): + * + * select sum(uncompressedsz::numeric) / sum(totalsz) as compratio + * from pg_zs_btree_pages('t_zedstore') ; + * compratio + * -------------------- + * 3.6623829559208134 + * (1 row) + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstoream_inspect.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "miscadmin.h" + +#include "access/relscan.h" +#include "access/table.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "commands/vacuum.h" +#include "funcapi.h" +#include "utils/builtins.h" +#include "utils/rel.h" + +Datum pg_zs_page_type(PG_FUNCTION_ARGS); +Datum pg_zs_undo_pages(PG_FUNCTION_ARGS); +Datum pg_zs_btree_pages(PG_FUNCTION_ARGS); + +Datum +pg_zs_page_type(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + uint64 pageno = PG_GETARG_INT64(1); + Relation rel; + uint16 zs_page_id; + Buffer buf; + Page page; + char *result; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use zedstore inspection functions")))); + + rel = table_open(relid, AccessShareLock); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + buf = ReadBuffer(rel, pageno); + LockBuffer(buf, BUFFER_LOCK_SHARE); + page = BufferGetPage(buf); + + zs_page_id = *((uint16 *) ((char *) page + BLCKSZ - sizeof(uint16))); + + UnlockReleaseBuffer(buf); + + table_close(rel, AccessShareLock); + + switch (zs_page_id) + { + case ZS_META_PAGE_ID: + result = "META"; + break; + case ZS_BTREE_PAGE_ID: + result = "BTREE"; + break; + case ZS_UNDO_PAGE_ID: + result = "UNDO"; + break; + case ZS_TOAST_PAGE_ID: + result = "TOAST"; + break; + case ZS_FPM_PAGE_ID: + result = "FPM"; + break; + default: + result = psprintf("UNKNOWN 0x%04x", zs_page_id); + } + + PG_RETURN_TEXT_P(cstring_to_text(result)); +} + +/* + * blkno int8 + * nrecords int4 + * freespace int4 + * firstrecptr int8 + * lastrecptr int8 + */ +Datum +pg_zs_undo_pages(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Relation rel; + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber firstblk; + BlockNumber blkno; + char *ptr; + char *endptr; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use zedstore inspection functions")))); + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Switch into long-lived context to construct returned data structures */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + rel = table_open(relid, AccessShareLock); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + /* + * Get the current oldest undo page from the metapage. + */ + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_SHARE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + firstblk = metaopaque->zs_undo_head; + + UnlockReleaseBuffer(metabuf); + + /* + * Loop through UNDO records, starting from the oldest page. 
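+	 *
+	 * Each UNDO page becomes one result row: (blkno, nrecords, freespace,
+	 * firstrecptr, lastrecptr). We follow the chain through each page's
+	 * 'next' pointer until it is InvalidBlockNumber.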
+ */ + blkno = firstblk; + while (blkno != InvalidBlockNumber) + { + Datum values[5]; + bool nulls[5]; + Buffer buf; + Page page; + ZSUndoPageOpaque *opaque; + int nrecords; + ZSUndoRecPtr firstptr = { 0, 0, 0 }; + ZSUndoRecPtr lastptr = { 0, 0, 0 }; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + CHECK_FOR_INTERRUPTS(); + + /* Read the UNDO page */ + buf = ReadBuffer(rel, blkno); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + { + elog(WARNING, "unexpected page id on UNDO page %u", blkno); + break; + } + + /* loop through all records on the page */ + endptr = (char *) page + ((PageHeader) page)->pd_lower; + ptr = (char *) page + SizeOfPageHeaderData; + nrecords = 0; + while (ptr < endptr) + { + ZSUndoRec *undorec = (ZSUndoRec *) ptr; + + Assert(undorec->undorecptr.blkno == blkno); + + lastptr = undorec->undorecptr; + if (nrecords == 0) + firstptr = lastptr; + nrecords++; + + ptr += undorec->size; + } + + values[0] = Int64GetDatum(blkno); + values[1] = Int32GetDatum(nrecords); + values[2] = Int32GetDatum(PageGetExactFreeSpace(page)); + values[3] = Int64GetDatum(firstptr.counter); + values[4] = Int64GetDatum(lastptr.counter); + + blkno = opaque->next; + UnlockReleaseBuffer(buf); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + tuplestore_donestoring(tupstore); + + table_close(rel, AccessShareLock); + + return (Datum) 0; +} + + +/* + * blkno int8 + * nextblk int8 + * attno int4 + * level int4 + * + * lokey int8 + * hikey int8 + + * nitems int4 + * ncompressed int4 + * totalsz int4 + * uncompressedsz int4 + * freespace int4 + */ +Datum +pg_zs_btree_pages(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Relation rel; + BlockNumber blkno; + BlockNumber nblocks; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use zedstore inspection functions")))); + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Switch into long-lived context to construct returned data structures */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + rel = table_open(relid, AccessShareLock); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + nblocks = RelationGetNumberOfBlocks(rel); + + /* scan all blocks in physical order */ + for (blkno = 1; blkno < nblocks; blkno++) + { + Datum values[11]; + bool nulls[11]; + OffsetNumber off; + OffsetNumber maxoff; + Buffer buf; + Page page; + ZSBtreePageOpaque *opaque; + int nitems; + int ncompressed; + int totalsz; + int uncompressedsz; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + CHECK_FOR_INTERRUPTS(); + + /* Read the page */ + buf = ReadBuffer(rel, blkno); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * we're only interested in B-tree pages. (Presumably, most of the + * pages in the relation are b-tree pages, so it makes sense to + * scan the whole relation in physical order) + */ + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(ZSBtreePageOpaque))) + { + UnlockReleaseBuffer(buf); + continue; + } + opaque = (ZSBtreePageOpaque *) PageGetSpecialPointer(page); + if (opaque->zs_page_id != ZS_BTREE_PAGE_ID) + { + UnlockReleaseBuffer(buf); + continue; + } + + nitems = 0; + ncompressed = 0; + totalsz = 0; + uncompressedsz = 0; + if (opaque->zs_level == 0) + { + /* leaf page */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + ZSBtreeItem *item = (ZSBtreeItem *) PageGetItem(page, iid); + + nitems++; + totalsz += item->t_size; + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) PageGetItem(page, iid); + + ncompressed++; + uncompressedsz += citem->t_uncompressedsize; + } + else + uncompressedsz += item->t_size; + } + } + else + { + /* internal page */ + nitems = ZSBtreeInternalPageGetNumItems(page); + } + values[0] = Int64GetDatum(blkno); + values[1] = Int64GetDatum(opaque->zs_next); + values[2] = Int32GetDatum(opaque->zs_attno); + values[3] = Int32GetDatum(opaque->zs_level); + values[4] = Int64GetDatum(opaque->zs_lokey); + values[5] = Int64GetDatum(opaque->zs_hikey); + values[6] = Int32GetDatum(nitems); + if (opaque->zs_level == 0) + { + values[7] = Int32GetDatum(ncompressed); + values[8] = Int32GetDatum(totalsz); + values[9] = Int32GetDatum(uncompressedsz); + } + else + { + nulls[7] = true; + nulls[8] = true; + nulls[9] = true; + } + values[10] = Int32GetDatum(PageGetExactFreeSpace(page)); + + UnlockReleaseBuffer(buf); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + tuplestore_donestoring(tupstore); + + table_close(rel, AccessShareLock); + + return (Datum) 0; +} diff --git a/src/backend/access/zedstore/zedstore_meta.c b/src/backend/access/zedstore/zedstore_meta.c new file mode 100644 index 0000000000..a415645914 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_meta.c @@ -0,0 +1,216 @@ +/* + * zedstore_meta.c + * Routines for handling ZedStore metapage + * + * The metapage holds a directory of B-tree root block numbers, one for each + * column. + * + * TODO: + * - support ALTER TABLE ADD COLUMN. 
+ * - extend the root block dir to an overflow page if there are too many + * attributes to fit on one page + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_meta.c + */ +#include "postgres.h" + +#include "access/itup.h" +#include "access/zedstore_internal.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/rel.h" + +static void +zsmeta_add_root_for_attributes(Relation rel, Page page, bool init) +{ + int natts = RelationGetNumberOfAttributes(rel) + 1; + int cur_natts; + int maxatts; + Size freespace; + ZSMetaPage *metapg; + + /* Initialize the attribute root dir for new attribute */ + freespace = PageGetExactFreeSpace(page); + maxatts = freespace / sizeof(ZSRootDirItem); + if (natts > maxatts) + { + /* + * The root block directory must fit on the metapage. + * + * TODO: We could extend this by overflowing to another page. + */ + elog(ERROR, "too many attributes for zedstore"); + } + + metapg = (ZSMetaPage *) PageGetContents(page); + + if (init) + metapg->nattributes = 0; + + for (cur_natts = metapg->nattributes; cur_natts < natts; cur_natts++) + { + metapg->tree_root_dir[cur_natts].root = InvalidBlockNumber; + } + + metapg->nattributes = natts; + ((PageHeader) page)->pd_lower += sizeof(ZSRootDirItem); +} + +/* + * Initialize the metapage for an empty relation. + */ +void +zsmeta_initmetapage(Relation rel) +{ + Buffer buf; + Page page; + ZSMetaPageOpaque *opaque; + + /* + * It's possible that we error out when building the metapage, if there + * are too many attribute, so work on a temporary copy first, before actually + * allocating the buffer. + */ + page = palloc(BLCKSZ); + PageInit(page, BLCKSZ, sizeof(ZSMetaPageOpaque)); + zsmeta_add_root_for_attributes(rel, page, true); + + opaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(page); + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_META_PAGE_ID; + + /* UNDO-related fields */ + opaque->zs_undo_counter = 1; /* start at 1, so that 0 is always "old" */ + opaque->zs_undo_head = InvalidBlockNumber; + opaque->zs_undo_tail = InvalidBlockNumber; + opaque->zs_undo_oldestptr.counter = 1; + + opaque->zs_fpm_root = InvalidBlockNumber; + + /* Ok, write it out to disk */ + buf = ReadBuffer(rel, P_NEW); + if (BufferGetBlockNumber(buf) != ZS_META_BLK) + elog(ERROR, "index is not empty"); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + PageRestoreTempPage(page, BufferGetPage(buf)); + + MarkBufferDirty(buf); + /* TODO: WAL-log */ + + UnlockReleaseBuffer(buf); +} + +/* + * Get the block number of the b-tree root for given attribute. + * + * If 'readonly' is true, and the root doesn't exist yet (ie. it's an empty + * table), returns InvalidBlockNumber. Otherwise new root is allocated if + * the root doesn't exist. 
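+ *
+ * For example, a read-only caller looking at an empty table simply gets
+ * InvalidBlockNumber back, while an inserting caller (readonly = false)
+ * receives a freshly initialized, empty root leaf.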
+ */ +BlockNumber +zsmeta_get_root_for_attribute(Relation rel, AttrNumber attno, bool readonly) +{ + Buffer metabuf; + ZSMetaPage *metapg; + BlockNumber rootblk; + Page page; + + if (RelationGetNumberOfBlocks(rel) == 0) + { + if (readonly) + return InvalidBlockNumber; + + zsmeta_initmetapage(rel); + } + + metabuf = ReadBuffer(rel, ZS_META_BLK); + + /* TODO: get share lock to begin with */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(metabuf); + metapg = (ZSMetaPage *) PageGetContents(page); + + if ((attno != ZS_META_ATTRIBUTE_NUM) && attno <= 0) + elog(ERROR, "invalid attribute number %d (table has only %d attributes)", attno, metapg->nattributes); + + /* + * file has less number of attributes stored compared to catalog. This + * happens due to add column default value storing value in catalog and + * absent in table. This attribute must be marked with atthasmissing. + */ + if (attno >= metapg->nattributes) + { + if (readonly) + { + UnlockReleaseBuffer(metabuf); + return InvalidBlockNumber; + } + else + { + zsmeta_add_root_for_attributes(rel, page, false); + } + } + + rootblk = metapg->tree_root_dir[attno].root; + + if (!readonly && rootblk == InvalidBlockNumber) + { + /* try to allocate one */ + Buffer rootbuf; + Page rootpage; + ZSBtreePageOpaque *opaque; + + /* TODO: release lock on metapage while we do I/O */ + rootbuf = zspage_getnewbuf(rel, metabuf); + rootblk = BufferGetBlockNumber(rootbuf); + + metapg->tree_root_dir[attno].root = rootblk; + + /* initialize the page to look like a root leaf */ + rootpage = BufferGetPage(rootbuf); + PageInit(rootpage, BLCKSZ, sizeof(ZSBtreePageOpaque)); + opaque = ZSBtreePageGetOpaque(rootpage); + opaque->zs_attno = attno; + opaque->zs_next = InvalidBlockNumber; + opaque->zs_lokey = MinZSTid; + opaque->zs_hikey = MaxPlusOneZSTid; + opaque->zs_level = 0; + opaque->zs_flags = ZSBT_ROOT; + opaque->zs_page_id = ZS_BTREE_PAGE_ID; + + MarkBufferDirty(rootbuf); + MarkBufferDirty(metabuf); + /* TODO: WAL-log both pages */ + + UnlockReleaseBuffer(rootbuf); + } + + UnlockReleaseBuffer(metabuf); + + return rootblk; +} + +/* + * + * Caller is responsible for WAL-logging this. + */ +void +zsmeta_update_root_for_attribute(Relation rel, AttrNumber attno, + Buffer metabuf, BlockNumber rootblk) +{ + ZSMetaPage *metapg; + + metapg = (ZSMetaPage *) PageGetContents(BufferGetPage(metabuf)); + + if ((attno != ZS_META_ATTRIBUTE_NUM) && (attno <= 0 || attno > metapg->nattributes)) + elog(ERROR, "invalid attribute number %d (table \"%s\" has only %d attributes)", + attno, RelationGetRelationName(rel), metapg->nattributes); + + metapg->tree_root_dir[attno].root = rootblk; + + MarkBufferDirty(metabuf); +} diff --git a/src/backend/access/zedstore/zedstore_tidpage.c b/src/backend/access/zedstore/zedstore_tidpage.c new file mode 100644 index 0000000000..cffc5f2a75 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_tidpage.c @@ -0,0 +1,1774 @@ +/* + * zedstore_tidpage.c + * Routines for handling the TID tree. + * + * A Zedstore table consists of multiple B-trees, one for each attribute. The + * functions in this file deal with one B-tree at a time, it is the caller's + * responsibility to tie together the scans of each btree. 
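+ *
+ * (Roughly: a sequential scan first advances the TID tree with
+ * zsbt_tid_scan_next() to find the next visible row, and the caller then
+ * reads the values for that TID from the per-attribute trees.)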
+ * + * Operations: + * + * - Sequential scan in TID order + * - must be efficient with scanning multiple trees in sync + * + * - random lookups, by TID (for index scan) + * + * - range scans by TID (for bitmap index scan) + * + * NOTES: + * - Locking order: child before parent, left before right + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_tidpage.c + */ +#include "postgres.h" + +#include "access/zedstore_compression.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/rel.h" + +/* prototypes for local functions */ +static void zsbt_tid_recompress_replace(Relation rel, Buffer oldbuf, List *items); +static ZSSingleBtreeItem *zsbt_tid_fetch(Relation rel, + ZSUndoRecPtr *recent_oldest_undo, zstid tid, Buffer *buf_p); +static void zsbt_tid_replace_item(Relation rel, Buffer buf, + zstid oldtid, ZSBtreeItem *replacementitem, + List *newitems); +static ZSBtreeItem *zsbt_tid_create_item(zstid tid, ZSUndoRecPtr undo_ptr, int nelements); + +static TM_Result zsbt_tid_update_lock_old(Relation rel, zstid otid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot, + Snapshot crosscheck, bool wait, TM_FailureData *hufd, ZSUndoRecPtr *prevundoptr_p); +static void zsbt_tid_update_insert_new(Relation rel, zstid *newtid, + TransactionId xid, CommandId cid, ZSUndoRecPtr prevundoptr); +static void zsbt_tid_mark_old_updated(Relation rel, zstid otid, zstid newtid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot); + +/* ---------------------------------------------------------------- + * Public interface + * ---------------------------------------------------------------- + */ + +/* + * Begin a scan of the btree. + */ +void +zsbt_tid_begin_scan(Relation rel, zstid starttid, + zstid endtid, Snapshot snapshot, ZSBtreeScan *scan) +{ + Buffer buf; + + scan->rel = rel; + scan->attno = ZS_META_ATTRIBUTE_NUM; + scan->tupledesc = NULL; + + scan->snapshot = snapshot; + scan->context = CurrentMemoryContext; + scan->lastoff = InvalidOffsetNumber; + scan->has_decompressed = false; + scan->nexttid = starttid; + scan->endtid = endtid; + memset(&scan->recent_oldest_undo, 0, sizeof(scan->recent_oldest_undo)); + memset(&scan->array_undoptr, 0, sizeof(scan->array_undoptr)); + scan->array_datums = palloc(sizeof(Datum)); + scan->array_datums_allocated_size = 1; + scan->array_elements_left = 0; + + buf = zsbt_descend(rel, ZS_META_ATTRIBUTE_NUM, starttid, 0, true); + if (!BufferIsValid(buf)) + { + /* completely empty tree */ + scan->active = false; + scan->lastbuf = InvalidBuffer; + return; + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + scan->active = true; + scan->lastbuf = buf; + + zs_decompress_init(&scan->decompressor); + scan->recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); +} + +/* + * Reset the 'next' TID in a scan to the given TID. + */ +void +zsbt_tid_reset_scan(ZSBtreeScan *scan, zstid starttid) +{ + if (starttid < scan->nexttid) + { + /* have to restart from scratch. 
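+		 *
+		 * Throw away any array/decompression state and release the cached
+		 * buffer; the next zsbt_tid_scan_next() call will descend from the
+		 * root again.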
*/ + scan->array_elements_left = 0; + scan->nexttid = starttid; + scan->has_decompressed = false; + if (scan->lastbuf != InvalidBuffer) + ReleaseBuffer(scan->lastbuf); + scan->lastbuf = InvalidBuffer; + } + else + zsbt_scan_skip(scan, starttid); +} + +void +zsbt_tid_end_scan(ZSBtreeScan *scan) +{ + if (!scan->active) + return; + + if (scan->lastbuf != InvalidBuffer) + ReleaseBuffer(scan->lastbuf); + zs_decompress_free(&scan->decompressor); + + scan->active = false; + scan->array_elements_left = 0; +} + +/* + * Helper function of zsbt_scan_next(), to extract Datums from the given + * array item into the scan->array_* fields. + */ +static void +zsbt_tid_scan_extract_array(ZSBtreeScan *scan, ZSArrayBtreeItem *aitem) +{ + int nelements = aitem->t_nelements; + zstid tid = aitem->t_tid; + + /* skip over elements that we are not interested in */ + while (tid < scan->nexttid && nelements > 0) + { + tid++; + nelements--; + } + + /* leave out elements that are past end of range */ + if (tid + nelements > scan->endtid) + nelements = scan->endtid - tid; + + scan->array_undoptr = aitem->t_undo_ptr; + scan->array_elements_left = nelements; +} + +/* + * Advance scan to next item. + * + * Return true if there was another item. The Datum/isnull of the item is + * placed in scan->array_* fields. For a pass-by-ref datum, it's a palloc'd + * copy that's valid until the next call. + * + * This is normally not used directly. See zsbt_scan_next_tid() and + * zsbt_scan_next_fetch() wrappers, instead. + */ +zstid +zsbt_tid_scan_next(ZSBtreeScan *scan) +{ + Buffer buf; + bool buf_is_locked = false; + Page page; + ZSBtreePageOpaque *opaque; + OffsetNumber off; + OffsetNumber maxoff; + BlockNumber next; + bool visible; + + if (!scan->active) + return InvalidZSTid; + + /* + * Process items, until we find something that is visible to the snapshot. + * + * This advances scan->nexttid as it goes. + */ + while (scan->nexttid < scan->endtid) + { + /* + * If we are still processing an array item, return next element from it. + */ + if (scan->array_elements_left > 0) + goto have_array; + + /* + * If we are still processing a compressed item, process the next item + * from the it. If it's an array item, we start iterating the array by + * setting the scan->array_* fields, and loop back to top to return the + * first element from the array. 
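+		 *
+		 * The decompression buffer holds a private copy of the items, so we
+		 * can iterate over it without holding the buffer lock.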
+ */ + if (scan->has_decompressed) + { + zstid lasttid; + ZSBtreeItem *uitem; + TransactionId obsoleting_xid; + + uitem = zs_decompress_read_item(&scan->decompressor); + + if (uitem == NULL) + { + scan->has_decompressed = false; + continue; + } + + /* a compressed item cannot contain nested compressed items */ + Assert((uitem->t_flags & ZSBT_COMPRESSED) == 0); + + lasttid = zsbt_item_lasttid(uitem); + if (lasttid < scan->nexttid) + continue; + + if (uitem->t_tid >= scan->endtid) + break; + + visible = zs_SatisfiesVisibility(scan, uitem, &obsoleting_xid, NULL); + + if (scan->serializable && TransactionIdIsValid(obsoleting_xid)) + CheckForSerializableConflictOut(scan->rel, obsoleting_xid, scan->snapshot); + + if (!visible) + { + scan->nexttid = lasttid + 1; + continue; + } + if ((uitem->t_flags & ZSBT_ARRAY) != 0) + { + /* no need to make a copy, because the uncompressed buffer + * is already a copy */ + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) uitem; + + zsbt_tid_scan_extract_array(scan, aitem); + continue; + } + else + { + /* single item */ + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) uitem; + + scan->nexttid = sitem->t_tid; + scan->array_undoptr = sitem->t_undo_ptr; + scan->array_elements_left = 1; + + if (buf_is_locked) + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + goto have_array; + } + } + + /* + * Scan the page for the next item. + */ + buf = scan->lastbuf; + if (!buf_is_locked) + { + if (BufferIsValid(buf)) + { + LockBuffer(buf, BUFFER_LOCK_SHARE); + buf_is_locked = true; + + /* + * It's possible that the page was concurrently split or recycled by + * another backend (or ourselves). Have to re-check that the page is + * still valid. + */ + if (!zsbt_page_is_expected(scan->rel, scan->attno, scan->nexttid, 0, buf)) + { + /* + * It's not valid for the TID we're looking for, but maybe it was the + * right page for the previous TID. In that case, we don't need to + * restart from the root, we can follow the right-link instead. 
+ */ + if (zsbt_page_is_expected(scan->rel, scan->attno, scan->nexttid - 1, 0, buf)) + { + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + next = opaque->zs_next; + if (next != InvalidBlockNumber) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + buf = ReleaseAndReadBuffer(buf, scan->rel, next); + scan->lastbuf = buf; + continue; + } + } + + UnlockReleaseBuffer(buf); + buf_is_locked = false; + buf = scan->lastbuf = InvalidBuffer; + } + } + + if (!BufferIsValid(buf)) + { + buf = scan->lastbuf = zsbt_descend(scan->rel, scan->attno, scan->nexttid, 0, true); + buf_is_locked = true; + } + } + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + Assert(opaque->zs_page_id == ZS_BTREE_PAGE_ID); + + /* TODO: check the last offset first, as an optimization */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + ZSBtreeItem *item = (ZSBtreeItem *) PageGetItem(page, iid); + zstid lasttid; + + lasttid = zsbt_item_lasttid(item); + + if (scan->nexttid > lasttid) + continue; + + if (item->t_tid >= scan->endtid) + { + scan->nexttid = scan->endtid; + break; + } + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + MemoryContext oldcxt = MemoryContextSwitchTo(scan->context); + + zs_decompress_chunk(&scan->decompressor, citem); + MemoryContextSwitchTo(oldcxt); + scan->has_decompressed = true; + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + break; + } + else + { + TransactionId obsoleting_xid; + + visible = zs_SatisfiesVisibility(scan, item, &obsoleting_xid, NULL); + + if (!visible) + { + if (scan->serializable && TransactionIdIsValid(obsoleting_xid)) + CheckForSerializableConflictOut(scan->rel, obsoleting_xid, scan->snapshot); + scan->nexttid = lasttid + 1; + continue; + } + + if ((item->t_flags & ZSBT_ARRAY) != 0) + { + /* copy the item, because we can't hold a lock on the page */ + ZSArrayBtreeItem *aitem; + + aitem = MemoryContextAlloc(scan->context, item->t_size); + memcpy(aitem, item, item->t_size); + + zsbt_tid_scan_extract_array(scan, aitem); + + if (scan->array_elements_left > 0) + { + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + break; + } + } + else + { + /* single item */ + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) item; + + scan->nexttid = sitem->t_tid; + scan->array_undoptr = sitem->t_undo_ptr; + scan->array_elements_left = 1; + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + goto have_array; + } + } + } + + if (scan->array_elements_left > 0 || scan->has_decompressed) + continue; + + /* No more items on this page. Walk right, if possible */ + next = opaque->zs_next; + if (next == BufferGetBlockNumber(buf)) + elog(ERROR, "btree page %u next-pointer points to itself", next); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + + if (next == InvalidBlockNumber || scan->nexttid >= scan->endtid) + { + scan->active = false; + scan->array_elements_left = 0; + ReleaseBuffer(scan->lastbuf); + scan->lastbuf = InvalidBuffer; + break; + } + + scan->lastbuf = ReleaseAndReadBuffer(scan->lastbuf, scan->rel, next); + } + + return InvalidZSTid; + +have_array: + /* + * If we are still processing an array item, return next element from it. + */ + Assert(scan->array_elements_left > 0); + + scan->array_elements_left--; + return scan->nexttid++; +} + +/* + * Get the last tid (plus one) in the tree. 
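+ *
+ * We descend to the rightmost leaf and look at its last item; if that leaf
+ * is empty, its lokey is returned as the next unused TID.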
+ */ +zstid +zsbt_get_last_tid(Relation rel) +{ + zstid rightmostkey; + zstid tid; + Buffer buf; + Page page; + ZSBtreePageOpaque *opaque; + OffsetNumber maxoff; + + /* Find the rightmost leaf */ + rightmostkey = MaxZSTid; + buf = zsbt_descend(rel, ZS_META_ATTRIBUTE_NUM, rightmostkey, 0, true); + if (!BufferIsValid(buf)) + { + return MinZSTid; + } + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + + /* + * Look at the last item, for its tid. + */ + maxoff = PageGetMaxOffsetNumber(page); + if (maxoff >= FirstOffsetNumber) + { + ItemId iid = PageGetItemId(page, maxoff); + ZSBtreeItem *hitup = (ZSBtreeItem *) PageGetItem(page, iid); + + tid = zsbt_item_lasttid(hitup) + 1; + } + else + { + tid = opaque->zs_lokey; + } + UnlockReleaseBuffer(buf); + + return tid; +} + +/* + * Insert a multiple TIDs. + * + * Populates the TIDs of the new tuples. + * + * If 'tid' in list is valid, then that TID is used. It better not be in use already. If + * it's invalid, then a new TID is allocated, as we see best. (When inserting the + * first column of the row, pass invalid, and for other columns, pass the TID + * you got for the first column.) + */ +void +zsbt_tid_multi_insert(Relation rel, zstid *tids, int nitems, + TransactionId xid, CommandId cid, uint32 speculative_token, ZSUndoRecPtr prevundoptr) +{ + bool assign_tids; + zstid tid = tids[0]; + Buffer buf; + Page page; + ZSBtreePageOpaque *opaque; + OffsetNumber maxoff; + zstid insert_target_key; + ZSUndoRec_Insert undorec; + int i; + List *newitems; + ZSUndoRecPtr undorecptr; + + /* + * If TID was given, find the right place for it. Otherwise, insert to + * the rightmost leaf. + * + * TODO: use a Free Space Map to find suitable target. + */ + assign_tids = (tid == InvalidZSTid); + + if (!assign_tids) + insert_target_key = tid; + else + insert_target_key = MaxZSTid; + + buf = zsbt_descend(rel, ZS_META_ATTRIBUTE_NUM, insert_target_key, 0, false); + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Look at the last item, for its tid. + * + * assign TIDS for each item, if needed. + */ + if (assign_tids) + { + zstid lasttid; + + if (maxoff >= FirstOffsetNumber) + { + ItemId iid = PageGetItemId(page, maxoff); + ZSBtreeItem *hitup = (ZSBtreeItem *) PageGetItem(page, iid); + + lasttid = zsbt_item_lasttid(hitup); + tid = lasttid + 1; + } + else + { + lasttid = opaque->zs_lokey; + tid = lasttid; + } + + for (i = 0; i < nitems; i++) + { + tids[i] = tid; + tid++; + } + } + + /* Form an undo record */ + if (xid != FrozenTransactionId) + { + undorec.rec.size = sizeof(ZSUndoRec_Insert); + undorec.rec.type = ZSUNDO_TYPE_INSERT; + undorec.rec.xid = xid; + undorec.rec.cid = cid; + undorec.rec.tid = tids[0]; + undorec.rec.speculative_token = speculative_token; + undorec.rec.prevundorec = prevundoptr; + undorec.endtid = tids[nitems - 1]; + + undorecptr = zsundo_insert(rel, &undorec.rec); + } + else + { + ZSUndoRecPtrInitialize(&undorecptr); + } + + /* Create items to insert. */ + newitems = NIL; + i = 0; + while (i < nitems) + { + int j; + ZSBtreeItem *newitem; + + /* + * Try to collapse as many items as possible into an Array item. + * The first item in the array is now at tids[i]/datums[i]/isnulls[i]. + * Items can be stored in the same array as long as the TIDs are + * consecutive, they all have the same isnull flag, and the array + * isn't too large to be stored on a single leaf page. Scan the + * arrays, checking those conditions. 
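+		 *
+		 * For example (TID values shown as plain numbers for illustration):
+		 * TIDs 1001, 1002 and 1003 collapse into a single array item with
+		 * t_tid = 1001 and t_nelements = 3, while a gap in the sequence
+		 * starts a new item.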
+ * + * FIXME: this math is bogus for TIDs + */ + for (j = i + 1; j < nitems; j++) + { + if (tids[j] != tids[j - 1] + 1) + break; + } + + /* + * 'i' is now the first entry to store in the array, and 'j' is the + * last + 1 elemnt to store. If j == i + 1, then there is only one + * element and zsbt_create_item() will create a 'single' item rather + * than an array. + */ + newitem = zsbt_tid_create_item(tids[i], undorecptr, j - i); + + newitems = lappend(newitems, newitem); + i = j; + } + + /* recompress and possibly split the page */ + zsbt_tid_replace_item(rel, buf, + InvalidZSTid, NULL, + newitems); + /* zsbt_replace_item unlocked 'buf' */ + ReleaseBuffer(buf); +} + +TM_Result +zsbt_tid_delete(Relation rel, zstid tid, + TransactionId xid, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, bool wait, + TM_FailureData *hufd, bool changingPart) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + ZSSingleBtreeItem *item; + TM_Result result; + bool keep_old_undo_ptr = true; + ZSUndoRecPtr undorecptr; + ZSSingleBtreeItem *deleteditem; + Buffer buf; + zstid next_tid; + + /* Find the item to delete. (It could be compressed) */ + item = zsbt_tid_fetch(rel, &recent_oldest_undo, tid, &buf); + if (item == NULL) + { + /* + * or should this be TM_Invisible? The heapam at least just throws + * an error, I think.. + */ + elog(ERROR, "could not find tuple to delete with TID (%u, %u) in TID tree", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + } + + if (snapshot) + { + result = zs_SatisfiesUpdate(rel, snapshot, recent_oldest_undo, + (ZSBtreeItem *) item, LockTupleExclusive, + &keep_old_undo_ptr, hufd, &next_tid); + if (result != TM_Ok) + { + UnlockReleaseBuffer(buf); + /* FIXME: We should fill TM_FailureData *hufd correctly */ + return result; + } + + if (crosscheck != InvalidSnapshot && result == TM_Ok) + { + /* Perform additional check for transaction-snapshot mode RI updates */ + /* FIXME: dummmy scan */ + ZSBtreeScan scan; + TransactionId obsoleting_xid; + + memset(&scan, 0, sizeof(scan)); + scan.rel = rel; + scan.snapshot = crosscheck; + scan.recent_oldest_undo = recent_oldest_undo; + + if (!zs_SatisfiesVisibility(&scan, (ZSBtreeItem *) item, &obsoleting_xid, NULL)) + { + UnlockReleaseBuffer(buf); + /* FIXME: We should fill TM_FailureData *hufd correctly */ + result = TM_Updated; + } + } + } + + /* Create UNDO record. */ + { + ZSUndoRec_Delete undorec; + + undorec.rec.size = sizeof(ZSUndoRec_Delete); + undorec.rec.type = ZSUNDO_TYPE_DELETE; + undorec.rec.xid = xid; + undorec.rec.cid = cid; + undorec.rec.tid = tid; + undorec.changedPart = changingPart; + + if (keep_old_undo_ptr) + undorec.rec.prevundorec = item->t_undo_ptr; + else + ZSUndoRecPtrInitialize(&undorec.rec.prevundorec); + + undorecptr = zsundo_insert(rel, &undorec.rec); + } + + /* Replace the ZSBreeItem with one with the new UNDO pointer. 
*/ + deleteditem = palloc(item->t_size); + memcpy(deleteditem, item, item->t_size); + deleteditem->t_undo_ptr = undorecptr; + + zsbt_tid_replace_item(rel, buf, + item->t_tid, (ZSBtreeItem *) deleteditem, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item unlocked */ + + pfree(deleteditem); + + return TM_Ok; +} + +void +zsbt_find_latest_tid(Relation rel, zstid *tid, Snapshot snapshot) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + ZSSingleBtreeItem *item; + Buffer buf; + /* Just using meta attribute, we can follow the update chain */ + zstid curr_tid = *tid; + + for(;;) + { + zstid next_tid = InvalidZSTid; + if (curr_tid == InvalidZSTid) + break; + + /* Find the item */ + item = zsbt_tid_fetch(rel, &recent_oldest_undo, curr_tid, &buf); + if (item == NULL) + break; + + if (snapshot) + { + /* FIXME: dummmy scan */ + ZSBtreeScan scan; + TransactionId obsoleting_xid; + + memset(&scan, 0, sizeof(scan)); + scan.rel = rel; + scan.snapshot = snapshot; + scan.recent_oldest_undo = recent_oldest_undo; + + if (zs_SatisfiesVisibility(&scan, (ZSBtreeItem *) item, + &obsoleting_xid, &next_tid)) + { + *tid = curr_tid; + } + + curr_tid = next_tid; + UnlockReleaseBuffer(buf); + } + } +} + +/* + * A new TID is allocated, as we see best and returned to the caller. This + * function is only called for META attribute btree. Data columns will use the + * returned tid to insert new items. + */ +TM_Result +zsbt_tid_update(Relation rel, zstid otid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot, + Snapshot crosscheck, bool wait, TM_FailureData *hufd, + zstid *newtid_p) +{ + TM_Result result; + ZSUndoRecPtr prevundoptr; + + /* + * This is currently only used on the meta-attribute. The other attributes + * don't need to carry visibility information, so the caller just inserts + * the new values with (multi_)insert() instead. This will change once we + * start doing the equivalent of HOT updates, where the TID doesn't change. + */ + Assert(*newtid_p == InvalidZSTid); + + /* + * Find and lock the old item. + * + * TODO: If there's free TID space left on the same page, we should keep the + * buffer locked, and use the same page for the new tuple. + */ + result = zsbt_tid_update_lock_old(rel, otid, + xid, cid, key_update, snapshot, + crosscheck, wait, hufd, &prevundoptr); + + if (result != TM_Ok) + return result; + + /* insert new version */ + zsbt_tid_update_insert_new(rel, newtid_p, xid, cid, prevundoptr); + + /* update the old item with the "t_ctid pointer" for the new item */ + zsbt_tid_mark_old_updated(rel, otid, *newtid_p, xid, cid, key_update, snapshot); + + return TM_Ok; +} + +/* + * Subroutine of zsbt_update(): locks the old item for update. + */ +static TM_Result +zsbt_tid_update_lock_old(Relation rel, zstid otid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot, + Snapshot crosscheck, bool wait, TM_FailureData *hufd, ZSUndoRecPtr *prevundoptr_p) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + Buffer buf; + ZSSingleBtreeItem *olditem; + TM_Result result; + bool keep_old_undo_ptr = true; + zstid next_tid; + + /* + * Find the item to delete. + */ + olditem = zsbt_tid_fetch(rel, &recent_oldest_undo, otid, &buf); + if (olditem == NULL) + { + /* + * or should this be TM_Invisible? The heapam at least just throws + * an error, I think.. 
+ */ + elog(ERROR, "could not find old tuple to update with TID (%u, %u) in TID tree", + ZSTidGetBlockNumber(otid), ZSTidGetOffsetNumber(otid)); + } + *prevundoptr_p = olditem->t_undo_ptr; + + /* + * Is it visible to us? + */ + result = zs_SatisfiesUpdate(rel, snapshot, recent_oldest_undo, + (ZSBtreeItem *) olditem, + key_update ? LockTupleExclusive : LockTupleNoKeyExclusive, + &keep_old_undo_ptr, hufd, &next_tid); + if (result != TM_Ok) + { + UnlockReleaseBuffer(buf); + /* FIXME: We should fill TM_FailureData *hufd correctly */ + return result; + } + + if (crosscheck != InvalidSnapshot && result == TM_Ok) + { + /* Perform additional check for transaction-snapshot mode RI updates */ + /* FIXME: dummmy scan */ + ZSBtreeScan scan; + TransactionId obsoleting_xid; + + memset(&scan, 0, sizeof(scan)); + scan.rel = rel; + scan.snapshot = crosscheck; + scan.recent_oldest_undo = recent_oldest_undo; + + if (!zs_SatisfiesVisibility(&scan, (ZSBtreeItem *) olditem, &obsoleting_xid, NULL)) + { + UnlockReleaseBuffer(buf); + /* FIXME: We should fill TM_FailureData *hufd correctly */ + result = TM_Updated; + } + } + + /* + * TODO: tuple-locking not implemented. Pray that there is no competing + * concurrent update! + */ + + UnlockReleaseBuffer(buf); + + return TM_Ok; +} + +/* + * Subroutine of zsbt_update(): inserts the new, updated, item. + */ +static void +zsbt_tid_update_insert_new(Relation rel, + zstid *newtid, + TransactionId xid, CommandId cid, ZSUndoRecPtr prevundoptr) +{ + zsbt_tid_multi_insert(rel, newtid, 1, xid, cid, INVALID_SPECULATIVE_TOKEN, prevundoptr); +} + +/* + * Subroutine of zsbt_update(): mark old item as updated. + */ +static void +zsbt_tid_mark_old_updated(Relation rel, zstid otid, zstid newtid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + Buffer buf; + ZSSingleBtreeItem *olditem; + TM_Result result; + bool keep_old_undo_ptr = true; + TM_FailureData tmfd; + ZSUndoRecPtr undorecptr; + ZSSingleBtreeItem *deleteditem; + zstid next_tid; + + /* + * Find the item to delete. It could be part of a compressed item, + * we let zsbt_fetch() handle that. + */ + olditem = zsbt_tid_fetch(rel, &recent_oldest_undo, otid, &buf); + if (olditem == NULL) + { + /* + * or should this be TM_Invisible? The heapam at least just throws + * an error, I think.. + */ + elog(ERROR, "could not find old tuple to update with TID (%u, %u) in TID tree", + ZSTidGetBlockNumber(otid), ZSTidGetOffsetNumber(otid)); + } + + /* + * Is it visible to us? + */ + result = zs_SatisfiesUpdate(rel, snapshot, recent_oldest_undo, + (ZSBtreeItem *) olditem, + key_update ? LockTupleExclusive : LockTupleNoKeyExclusive, + &keep_old_undo_ptr, &tmfd, &next_tid); + if (result != TM_Ok) + { + UnlockReleaseBuffer(buf); + elog(ERROR, "tuple concurrently updated - not implemented"); + } + + /* Create UNDO record. */ + { + ZSUndoRec_Update undorec; + + undorec.rec.size = sizeof(ZSUndoRec_Update); + undorec.rec.type = ZSUNDO_TYPE_UPDATE; + undorec.rec.xid = xid; + undorec.rec.cid = cid; + undorec.rec.tid = otid; + if (keep_old_undo_ptr) + undorec.rec.prevundorec = olditem->t_undo_ptr; + else + ZSUndoRecPtrInitialize(&undorec.rec.prevundorec); + undorec.newtid = newtid; + undorec.key_update = key_update; + + undorecptr = zsundo_insert(rel, &undorec.rec); + } + + /* Replace the ZSBreeItem with one with the updated undo pointer. 
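+	 * As in zsbt_tid_delete(), the old item is copied verbatim and only its
+	 * t_undo_ptr is updated, so that readers following the undo chain will
+	 * find the UPDATE record and, through it, the TID of the new tuple
+	 * version.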
*/ + deleteditem = palloc(olditem->t_size); + memcpy(deleteditem, olditem, olditem->t_size); + deleteditem->t_undo_ptr = undorecptr; + + zsbt_tid_replace_item(rel, buf, + otid, (ZSBtreeItem *) deleteditem, + NIL); + ReleaseBuffer(buf); /* zsbt_recompress_replace released */ + + pfree(deleteditem); +} + +TM_Result +zsbt_tid_lock(Relation rel, zstid tid, + TransactionId xid, CommandId cid, + LockTupleMode mode, Snapshot snapshot, + TM_FailureData *hufd, zstid *next_tid) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + Buffer buf; + ZSSingleBtreeItem *item; + TM_Result result; + bool keep_old_undo_ptr = true; + ZSUndoRecPtr undorecptr; + ZSSingleBtreeItem *newitem; + + *next_tid = tid; + + /* Find the item to delete. (It could be compressed) */ + item = zsbt_tid_fetch(rel, &recent_oldest_undo, tid, &buf); + if (item == NULL) + { + /* + * or should this be TM_Invisible? The heapam at least just throws + * an error, I think.. + */ + elog(ERROR, "could not find tuple to lock with TID (%u, %u)", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + } + result = zs_SatisfiesUpdate(rel, snapshot, recent_oldest_undo, + (ZSBtreeItem *) item, mode, + &keep_old_undo_ptr, hufd, next_tid); + if (result != TM_Ok) + { + UnlockReleaseBuffer(buf); + return result; + } + + /* Create UNDO record. */ + { + ZSUndoRec_TupleLock undorec; + + undorec.rec.size = sizeof(ZSUndoRec_TupleLock); + undorec.rec.type = ZSUNDO_TYPE_TUPLE_LOCK; + undorec.rec.xid = xid; + undorec.rec.cid = cid; + undorec.rec.tid = tid; + undorec.lockmode = mode; + if (keep_old_undo_ptr) + undorec.rec.prevundorec = item->t_undo_ptr; + else + ZSUndoRecPtrInitialize(&undorec.rec.prevundorec); + + undorecptr = zsundo_insert(rel, &undorec.rec); + } + + /* Replace the item with an identical one, but with updated undo pointer. */ + newitem = palloc(item->t_size); + memcpy(newitem, item, item->t_size); + newitem->t_undo_ptr = undorecptr; + + zsbt_tid_replace_item(rel, buf, + item->t_tid, (ZSBtreeItem *) newitem, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item unlocked */ + + pfree(newitem); + + return TM_Ok; +} + +/* + * Mark item with given TID as dead. + * + * This is used during VACUUM. + */ +void +zsbt_tid_mark_dead(Relation rel, zstid tid, ZSUndoRecPtr undoptr) +{ + Buffer buf; + ZSSingleBtreeItem *item; + ZSSingleBtreeItem deaditem; + + /* Find the item to delete. (It could be compressed) */ + item = zsbt_tid_fetch(rel, NULL, tid, &buf); + if (item == NULL) + { + elog(WARNING, "could not find tuple to mark dead with TID (%u, %u)", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + return; + } + + /* Replace the ZSBreeItem with a DEAD item. (Unless it's already dead) */ + if ((item->t_flags & ZSBT_DEAD) != 0) + { + UnlockReleaseBuffer(buf); + return; + } + + memset(&deaditem, 0, offsetof(ZSSingleBtreeItem, t_payload)); + deaditem.t_tid = tid; + deaditem.t_size = sizeof(ZSSingleBtreeItem); + deaditem.t_flags = ZSBT_DEAD; + deaditem.t_undo_ptr = undoptr; + + zsbt_tid_replace_item(rel, buf, + tid, (ZSBtreeItem *) &deaditem, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item released */ +} + +/* + * Clear an item's UNDO pointer. + * + * This is used during VACUUM, to clear out aborted deletions. + */ +void +zsbt_tid_undo_deletion(Relation rel, zstid tid, ZSUndoRecPtr undoptr) +{ + Buffer buf; + ZSSingleBtreeItem *item; + ZSSingleBtreeItem *copy; + + /* Find the item to delete. 
(It could be compressed) */ + item = zsbt_tid_fetch(rel, NULL, tid, &buf); + if (item == NULL) + { + elog(WARNING, "could not find aborted tuple to remove with TID (%u, %u)", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + return; + } + + if (ZSUndoRecPtrEquals(item->t_undo_ptr, undoptr)) + { + copy = palloc(item->t_size); + memcpy(copy, item, item->t_size); + ZSUndoRecPtrInitialize(©->t_undo_ptr); + zsbt_tid_replace_item(rel, buf, + tid, (ZSBtreeItem *) copy, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item unlocked */ + } + else + { + Assert(item->t_undo_ptr.counter > undoptr.counter || + !IsZSUndoRecPtrValid(&item->t_undo_ptr)); + UnlockReleaseBuffer(buf); + } +} + +/* ---------------------------------------------------------------- + * Internal routines + * ---------------------------------------------------------------- + */ + +void +zsbt_tid_clear_speculative_token(Relation rel, zstid tid, uint32 spectoken, bool forcomplete) +{ + Buffer buf; + ZSSingleBtreeItem *item = NULL; + ZSUndoRecPtr recent_oldest_undo; + + item = zsbt_tid_fetch(rel, &recent_oldest_undo, tid, &buf); + + if (item == NULL) + elog(ERROR, "couldn't find item for meta column for inserted tuple with TID (%u, %u) in rel %s", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid), rel->rd_rel->relname.data); + Assert(item->t_tid == tid); + + zsundo_clear_speculative_token(rel, item->t_undo_ptr); + + UnlockReleaseBuffer(buf); +} + +/* + * Fetch the item with given TID. The page containing the item is kept locked, and + * returned to the caller in *buf_p. This is used to locate a tuple for updating + * or deleting it. + */ +static ZSSingleBtreeItem * +zsbt_tid_fetch(Relation rel, ZSUndoRecPtr *recent_oldest_undo, + zstid tid, Buffer *buf_p) +{ + Buffer buf; + Page page; + ZSBtreeItem *item = NULL; + bool found = false; + OffsetNumber maxoff; + OffsetNumber off; + + buf = zsbt_descend(rel, ZS_META_ATTRIBUTE_NUM, tid, 0, false); + if (buf == InvalidBuffer) + { + *buf_p = InvalidBuffer; + return NULL; + } + page = BufferGetPage(buf); + + /* Find the item on the page that covers the target TID */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + item = (ZSBtreeItem *) PageGetItem(page, iid); + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + ZSDecompressContext decompressor; + + zs_decompress_init(&decompressor); + zs_decompress_chunk(&decompressor, citem); + + while ((item = zs_decompress_read_item(&decompressor)) != NULL) + { + zstid lasttid = zsbt_item_lasttid(item); + + if (item->t_tid <= tid && lasttid >= tid) + { + found = true; + break; + } + } + if (found) + { + /* FIXME: decompressor is leaked. 
Can't free it yet, because we still + * need to access the item below + */ + break; + } + zs_decompress_free(&decompressor); + } + else + { + zstid lasttid = zsbt_item_lasttid(item); + + if (item->t_tid <= tid && lasttid >= tid) + { + found = true; + break; + } + } + } + + if (found) + { + ZSSingleBtreeItem *result; + + if ((item->t_flags & ZSBT_ARRAY) != 0) + { + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + int resultsize; + + Assert((tid - aitem->t_tid) < aitem->t_nelements); + + resultsize = offsetof(ZSSingleBtreeItem, t_payload); + result = palloc(resultsize); + memset(result, 0, offsetof(ZSSingleBtreeItem, t_payload)); /* zero padding */ + result->t_tid = tid; + result->t_flags = item->t_flags & ~ZSBT_ARRAY; + result->t_size = resultsize; + result->t_undo_ptr = aitem->t_undo_ptr; + } + else + { + /* single item */ + result = (ZSSingleBtreeItem *) item; + } + + *buf_p = buf; + return result; + } + else + { + UnlockReleaseBuffer(buf); + *buf_p = InvalidBuffer; + return NULL; + } +} + +/* + * Form a ZSBtreeItem out of the given datums, or data that's already in on-disk + * array format, for insertion. + * + * If there's more than one element, an array item is created. Otherwise, a single + * item. + */ +static ZSBtreeItem * +zsbt_tid_create_item(zstid tid, ZSUndoRecPtr undo_ptr, + int nelements) +{ + ZSBtreeItem *result; + Size itemsz; + + Assert(nelements > 0); + + if (nelements > 1) + { + ZSArrayBtreeItem *newitem; + + itemsz = offsetof(ZSArrayBtreeItem, t_payload); + + newitem = palloc(itemsz); + memset(newitem, 0, offsetof(ZSArrayBtreeItem, t_payload)); /* zero padding */ + newitem->t_tid = tid; + newitem->t_size = itemsz; + newitem->t_flags = ZSBT_ARRAY; + newitem->t_nelements = nelements; + newitem->t_undo_ptr = undo_ptr; + + result = (ZSBtreeItem *) newitem; + } + else + { + ZSSingleBtreeItem *newitem; + + itemsz = offsetof(ZSSingleBtreeItem, t_payload); + + newitem = palloc(itemsz); + memset(newitem, 0, offsetof(ZSSingleBtreeItem, t_payload)); /* zero padding */ + newitem->t_tid = tid; + newitem->t_flags = 0; + newitem->t_size = itemsz; + newitem->t_undo_ptr = undo_ptr; + + result = (ZSBtreeItem *) newitem; + } + + return result; +} + +/* + * This helper function is used to implement INSERT, UPDATE and DELETE. + * + * If 'olditem' is not NULL, then 'olditem' on the page is replaced with + * 'replacementitem'. 'replacementitem' can be NULL, to remove an old item. + * + * If 'newitems' is not empty, the items in the list are added to the page, + * to the correct position. FIXME: Actually, they're always just added to + * the end of the page, and that better be the correct position. + * + * This function handles decompressing and recompressing items, and splitting + * the page if needed. + */ +static void +zsbt_tid_replace_item(Relation rel, Buffer buf, + zstid oldtid, + ZSBtreeItem *replacementitem, + List *newitems) +{ + Page page = BufferGetPage(buf); + OffsetNumber off; + OffsetNumber maxoff; + List *items; + bool found_old_item = false; + /* We might need to decompress up to two previously compressed items */ + ZSDecompressContext decompressor; + bool decompressor_used = false; + bool decompressing; + + if (replacementitem) + Assert(replacementitem->t_tid == oldtid); + + /* + * TODO: It would be good to have a fast path, for the common case that we're + * just adding items to the end. 
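+ * As it is, we always rebuild the whole item list: every existing item is
+ * walked (decompressing any compressed item that covers 'oldtid'), the
+ * replacement is spliced in, 'newitems' are appended at the end, and the
+ * result is handed to zsbt_tid_recompress_replace() below.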
+ */ + + /* Loop through all old items on the page */ + items = NIL; + maxoff = PageGetMaxOffsetNumber(page); + decompressing = false; + off = 1; + for (;;) + { + ZSBtreeItem *item; + + /* + * Get the next item to process. If we're decompressing, get the next + * tuple from the decompressor, otherwise get the next item from the page. + */ + if (decompressing) + { + item = zs_decompress_read_item(&decompressor); + if (!item) + { + decompressing = false; + continue; + } + } + else if (off <= maxoff) + { + ItemId iid = PageGetItemId(page, off); + + item = (ZSBtreeItem *) PageGetItem(page, iid); + off++; + + } + else + { + /* out of items */ + break; + } + + /* we now have an item to process, either straight from the page or from + * the decompressor */ + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + zstid item_lasttid = zsbt_item_lasttid(item); + + /* there shouldn't nested compressed items */ + if (decompressing) + elog(ERROR, "nested compressed items on zedstore page not supported"); + + if (oldtid != InvalidZSTid && item->t_tid <= oldtid && oldtid <= item_lasttid) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + + /* Found it, this compressed item covers the target or the new TID. */ + /* We have to decompress it, and recompress */ + Assert(!decompressor_used); + + zs_decompress_init(&decompressor); + zs_decompress_chunk(&decompressor, citem); + decompressor_used = true; + decompressing = true; + continue; + } + else + { + /* keep this compressed item as it is */ + items = lappend(items, item); + } + } + else if ((item->t_flags & ZSBT_ARRAY) != 0) + { + /* array item */ + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + zstid item_lasttid = zsbt_item_lasttid(item); + + if (oldtid != InvalidZSTid && item->t_tid <= oldtid && oldtid <= item_lasttid) + { + /* + * The target TID is currently part of an array item. We have to split + * the array item into two, and put the replacement item in the middle. + */ + int cutoff; + int nelements = aitem->t_nelements; + + cutoff = oldtid - item->t_tid; + + /* Array slice before the target TID */ + if (cutoff > 0) + { + ZSBtreeItem *item1; + + item1 = zsbt_tid_create_item(aitem->t_tid, aitem->t_undo_ptr, + cutoff); + items = lappend(items, item1); + } + + /* + * Skip over the target element, and store the replacement + * item, if any, in its place + */ + if (replacementitem) + items = lappend(items, replacementitem); + + /* Array slice after the target */ + if (cutoff + 1 < nelements) + { + ZSBtreeItem *item2; + + item2 = zsbt_tid_create_item(oldtid + 1, aitem->t_undo_ptr, + nelements - (cutoff + 1)); + items = lappend(items, item2); + } + + found_old_item = true; + } + else + items = lappend(items, item); + } + else + { + /* single item */ + if (oldtid != InvalidZSTid && item->t_tid == oldtid) + { + Assert(!found_old_item); + found_old_item = true; + if (replacementitem) + items = lappend(items, replacementitem); + } + else + items = lappend(items, item); + } + } + + if (oldtid != InvalidZSTid && !found_old_item) + elog(ERROR, "could not find old item to replace"); + + /* Add any new items to the end */ + if (newitems) + items = list_concat(items, newitems); + + /* Now pass the list to the recompressor. */ + IncrBufferRefCount(buf); + if (items) + { + zsbt_tid_recompress_replace(rel, buf, items); + } + else + { + zs_split_stack *stack; + + stack = zsbt_unlink_page(rel, ZS_META_ATTRIBUTE_NUM, buf, 0); + + if (!stack) + { + /* failed. 
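+			 * Fall back to leaving an empty leaf page in place: take a blank
+			 * temp copy (preserving the special area) and overwrite the old
+			 * page with it via the split-stack machinery below.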
*/ + Page newpage = PageGetTempPageCopySpecial(BufferGetPage(buf)); + + stack = zs_new_split_stack_entry(buf, newpage); + } + + /* apply the changes */ + zs_apply_split_changes(rel, stack); + } + + /* + * We can now free the decompression contexts. The pointers in the 'items' list + * point to decompression buffers, so we cannot free them until after writing out + * the pages. + */ + if (decompressor_used) + zs_decompress_free(&decompressor); + list_free(items); +} + +/* + * Recompressor routines + */ +typedef struct +{ + Page currpage; + ZSCompressContext compressor; + int compressed_items; + + /* first page writes over the old buffer, subsequent pages get newly-allocated buffers */ + zs_split_stack *stack_head; + zs_split_stack *stack_tail; + + int total_items; + int total_compressed_items; + int total_already_compressed_items; + + zstid hikey; +} zsbt_tid_recompress_context; + +static void +zsbt_recompress_newpage(zsbt_tid_recompress_context *cxt, zstid nexttid, int flags) +{ + Page newpage; + ZSBtreePageOpaque *newopaque; + zs_split_stack *stack; + + if (cxt->currpage) + { + /* set the last tid on previous page */ + ZSBtreePageOpaque *oldopaque = ZSBtreePageGetOpaque(cxt->currpage); + + oldopaque->zs_hikey = nexttid; + } + + newpage = (Page) palloc(BLCKSZ); + PageInit(newpage, BLCKSZ, sizeof(ZSBtreePageOpaque)); + + stack = zs_new_split_stack_entry(InvalidBuffer, /* will be assigned later */ + newpage); + if (cxt->stack_tail) + cxt->stack_tail->next = stack; + else + cxt->stack_head = stack; + cxt->stack_tail = stack; + + cxt->currpage = newpage; + + newopaque = ZSBtreePageGetOpaque(newpage); + newopaque->zs_attno = ZS_META_ATTRIBUTE_NUM; + newopaque->zs_next = InvalidBlockNumber; /* filled in later */ + newopaque->zs_lokey = nexttid; + newopaque->zs_hikey = cxt->hikey; /* overwritten later, if this is not last page */ + newopaque->zs_level = 0; + newopaque->zs_flags = flags; + newopaque->zs_page_id = ZS_BTREE_PAGE_ID; +} + +static void +zsbt_recompress_add_to_page(zsbt_tid_recompress_context *cxt, ZSBtreeItem *item) +{ + if (PageGetFreeSpace(cxt->currpage) < MAXALIGN(item->t_size)) + zsbt_recompress_newpage(cxt, item->t_tid, 0); + + if (PageAddItemExtended(cxt->currpage, + (Item) item, item->t_size, + PageGetMaxOffsetNumber(cxt->currpage) + 1, + PAI_OVERWRITE) == InvalidOffsetNumber) + elog(ERROR, "could not add item to page while recompressing"); + + cxt->total_items++; +} + +static bool +zsbt_recompress_add_to_compressor(zsbt_tid_recompress_context *cxt, ZSBtreeItem *item) +{ + bool result; + + if (cxt->compressed_items == 0) + zs_compress_begin(&cxt->compressor, PageGetFreeSpace(cxt->currpage)); + + result = zs_compress_add(&cxt->compressor, item); + if (result) + { + cxt->compressed_items++; + + cxt->total_compressed_items++; + } + + return result; +} + +static void +zsbt_recompress_flush(zsbt_tid_recompress_context *cxt) +{ + ZSCompressedBtreeItem *citem; + + if (cxt->compressed_items == 0) + return; + + citem = zs_compress_finish(&cxt->compressor); + + if (citem) + zsbt_recompress_add_to_page(cxt, (ZSBtreeItem *) citem); + else + { + uint16 size = 0; + /* + * compression failed hence add items uncompressed. We should maybe + * note that these items/pattern are not compressible and skip future + * attempts to compress but its possible this clubbed with some other + * future items may compress. So, better avoid recording such info and + * try compression again later if required. 
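+		 * The items are still available, back to back, in the compressor's
+		 * uncompressed buffer, so the loop below walks that buffer and adds
+		 * each item to the page as it is.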
+ */ + for (int i = 0; i < cxt->compressor.nitems; i++) + { + citem = (ZSCompressedBtreeItem *) (cxt->compressor.uncompressedbuffer + size); + zsbt_recompress_add_to_page(cxt, (ZSBtreeItem *) citem); + + size += MAXALIGN(citem->t_size); + } + } + + cxt->compressed_items = 0; +} + +/* + * Rewrite a leaf page, with given 'items' as the new content. + * + * If there are any uncompressed items in the list, we try to compress them. + * Any already-compressed items are added as is. + * + * If the items no longer fit on the page, then the page is split. It is + * entirely possible that they don't fit even on two pages; we split the page + * into as many pages as needed. Hopefully not more than a few pages, though, + * because otherwise you might hit limits on the number of buffer pins (with + * tiny shared_buffers). + * + * On entry, 'oldbuf' must be pinned and exclusive-locked. On exit, the lock + * is released, but it's still pinned. + * + * TODO: Try to combine single items, and existing array-items, into new array + * items. + */ +static void +zsbt_tid_recompress_replace(Relation rel, Buffer oldbuf, List *items) +{ + ListCell *lc; + zsbt_tid_recompress_context cxt; + ZSBtreePageOpaque *oldopaque = ZSBtreePageGetOpaque(BufferGetPage(oldbuf)); + ZSUndoRecPtr recent_oldest_undo = { 0 }; + BlockNumber orignextblk; + zs_split_stack *stack; + List *downlinks = NIL; + + orignextblk = oldopaque->zs_next; + + cxt.currpage = NULL; + zs_compress_init(&cxt.compressor); + cxt.compressed_items = 0; + cxt.stack_head = cxt.stack_tail = NULL; + cxt.hikey = oldopaque->zs_hikey; + + cxt.total_items = 0; + cxt.total_compressed_items = 0; + cxt.total_already_compressed_items = 0; + + zsbt_recompress_newpage(&cxt, oldopaque->zs_lokey, (oldopaque->zs_flags & ZSBT_ROOT)); + + foreach(lc, items) + { + ZSBtreeItem *item = (ZSBtreeItem *) lfirst(lc); + + /* We can leave out any old-enough DEAD items */ + if ((item->t_flags & ZSBT_DEAD) != 0) + { + ZSBtreeItem *uitem = (ZSBtreeItem *) item; + + if (recent_oldest_undo.counter == 0) + recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + + if (zsbt_item_undoptr(uitem).counter <= recent_oldest_undo.counter) + continue; + } + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + /* already compressed, add as it is. */ + zsbt_recompress_flush(&cxt); + cxt.total_already_compressed_items++; + zsbt_recompress_add_to_page(&cxt, item); + } + else + { + /* try to add this item to the compressor */ + if (!zsbt_recompress_add_to_compressor(&cxt, item)) + { + if (cxt.compressed_items > 0) + { + /* flush, and retry */ + zsbt_recompress_flush(&cxt); + + if (!zsbt_recompress_add_to_compressor(&cxt, item)) + { + /* could not compress, even on its own. Store it uncompressed, then */ + zsbt_recompress_add_to_page(&cxt, item); + } + } + else + { + /* could not compress, even on its own. Store it uncompressed, then */ + zsbt_recompress_add_to_page(&cxt, item); + } + } + } + } + + /* flush the last one, if any */ + zsbt_recompress_flush(&cxt); + + zs_compress_free(&cxt.compressor); + + /* + * Ok, we now have a list of pages, to replace the original page, as private + * in-memory copies. Allocate buffers for them, and write them out. 
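+	 * The first page in the chain is written over 'oldbuf'; each subsequent
+	 * page gets a newly allocated buffer, the pages are linked together
+	 * through their zs_next pointers, and downlinks are collected for
+	 * insertion into the parent level.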
+ * + * allocate all the pages before entering critical section, so that + * out-of-disk-space doesn't lead to PANIC + */ + stack = cxt.stack_head; + Assert(stack->buf == InvalidBuffer); + stack->buf = oldbuf; + while (stack->next) + { + Page thispage = stack->page; + ZSBtreePageOpaque *thisopaque = ZSBtreePageGetOpaque(thispage); + ZSBtreeInternalPageItem *downlink; + Buffer nextbuf; + + Assert(stack->next->buf == InvalidBuffer); + + nextbuf = zspage_getnewbuf(rel, InvalidBuffer); + stack->next->buf = nextbuf; + + thisopaque->zs_next = BufferGetBlockNumber(nextbuf); + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = thisopaque->zs_hikey; + downlink->childblk = BufferGetBlockNumber(nextbuf); + downlinks = lappend(downlinks, downlink); + + stack = stack->next; + } + /* last one in the chain */ + ZSBtreePageGetOpaque(stack->page)->zs_next = orignextblk; + + /* If we had to split, insert downlinks for the new pages. */ + if (cxt.stack_head->next) + { + oldopaque = ZSBtreePageGetOpaque(cxt.stack_head->page); + + if ((oldopaque->zs_flags & ZSBT_ROOT) != 0) + { + ZSBtreeInternalPageItem *downlink; + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = MinZSTid; + downlink->childblk = BufferGetBlockNumber(cxt.stack_head->buf); + downlinks = lcons(downlink, downlinks); + + cxt.stack_tail->next = zsbt_newroot(rel, ZS_META_ATTRIBUTE_NUM, + oldopaque->zs_level + 1, downlinks); + + /* clear the ZSBT_ROOT flag on the old root page */ + oldopaque->zs_flags &= ~ZSBT_ROOT; + } + else + { + cxt.stack_tail->next = zsbt_insert_downlinks(rel, ZS_META_ATTRIBUTE_NUM, + oldopaque->zs_lokey, BufferGetBlockNumber(oldbuf), oldopaque->zs_level + 1, + downlinks); + } + /* note: stack_tail is not the real tail anymore */ + } + + /* Finally, overwrite all the pages we had to modify */ + zs_apply_split_changes(rel, cxt.stack_head); +} diff --git a/src/backend/access/zedstore/zedstore_toast.c b/src/backend/access/zedstore/zedstore_toast.c new file mode 100644 index 0000000000..8e25591b16 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_toast.c @@ -0,0 +1,192 @@ +/* + * zedstore_toast.c + * Routines for Toasting oversized tuples in Zedstore + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_toast.c + */ +#include "postgres.h" + +#include "access/zedstore_compression.h" +#include "access/zedstore_internal.h" +#include "storage/bufmgr.h" +#include "utils/datum.h" +#include "utils/rel.h" + +/* + * Toast a datum, inside the ZedStore file. + * + * This is similar to regular toasting, but instead of using a separate index and + * heap, the datum is stored within the same ZedStore file as all the btrees and + * stuff. A chain of "toast-pages" is allocated for the datum, and each page is filled + * with as much of the datum as possible. + * + * + * Note: You must call zedstore_toast_finish() after this, + * to set the TID in the toast-chain's first block. Otherwise, it's considered recyclable. + */ +Datum +zedstore_toast_datum(Relation rel, AttrNumber attno, Datum value) +{ + varatt_zs_toastptr *toastptr; + BlockNumber firstblk = InvalidBlockNumber; + Buffer buf = InvalidBuffer; + Page page; + ZSToastPageOpaque *opaque; + Buffer prevbuf = InvalidBuffer; + ZSToastPageOpaque *prevopaque = NULL; + char *ptr; + int32 total_size; + int32 offset; + + /* it's possible that this is the very first insertion to the relation. 
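+	 * If so, initialize the metapage before allocating any toast pages.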
*/ + if (RelationGetNumberOfBlocks(rel) == 0) + zsmeta_initmetapage(rel); + + /* TODO: try to compress it in place first. Maybe just call toast_compress_datum? */ + + /* + * If that doesn't reduce it enough, allocate a toast page + * for it. + */ + ptr = VARDATA_ANY(value); + total_size = VARSIZE_ANY_EXHDR(value); + offset = 0; + + while (total_size - offset > 0) + { + Size thisbytes; + + buf = zspage_getnewbuf(rel, InvalidBuffer); + if (prevbuf == InvalidBuffer) + firstblk = BufferGetBlockNumber(buf); + + page = BufferGetPage(buf); + PageInit(page, BLCKSZ, sizeof(ZSToastPageOpaque)); + + thisbytes = Min(total_size - offset, PageGetExactFreeSpace(page)); + + opaque = (ZSToastPageOpaque *) PageGetSpecialPointer(page); + opaque->zs_attno = attno; + opaque->zs_tid = InvalidZSTid; + opaque->zs_total_size = total_size; + opaque->zs_slice_offset = offset; + opaque->zs_prev = BufferIsValid(prevbuf) ? BufferGetBlockNumber(prevbuf) : InvalidBlockNumber; + opaque->zs_next = InvalidBlockNumber; + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_TOAST_PAGE_ID; + + memcpy((char *) page + SizeOfPageHeaderData, ptr, thisbytes); + ((PageHeader) page)->pd_lower += thisbytes; + ptr += thisbytes; + offset += thisbytes; + + if (prevbuf != InvalidBuffer) + { + prevopaque->zs_next = BufferGetBlockNumber(buf); + MarkBufferDirty(prevbuf); + } + + /* TODO: WAL-log */ + MarkBufferDirty(buf); + + if (prevbuf != InvalidBuffer) + UnlockReleaseBuffer(prevbuf); + prevbuf = buf; + prevopaque = opaque; + } + + UnlockReleaseBuffer(buf); + + toastptr = palloc0(sizeof(varatt_zs_toastptr)); + SET_VARTAG_1B_E(toastptr, VARTAG_ZEDSTORE); + toastptr->zst_block = firstblk; + + return PointerGetDatum(toastptr); +} + +void +zedstore_toast_finish(Relation rel, AttrNumber attno, Datum toasted, zstid tid) +{ + varatt_zs_toastptr *toastptr = (varatt_zs_toastptr *) DatumGetPointer(toasted); + Buffer buf; + Page page; + ZSToastPageOpaque *opaque; + + Assert(toastptr->va_tag == VARTAG_ZEDSTORE); + + buf = ReadBuffer(rel, toastptr->zst_block); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + opaque = (ZSToastPageOpaque *) PageGetSpecialPointer(page); + + Assert(opaque->zs_tid == InvalidZSTid); + Assert(opaque->zs_attno == attno); + Assert(opaque->zs_prev == InvalidBlockNumber); + + opaque->zs_tid = tid; + + /* TODO: WAL-log */ + MarkBufferDirty(buf); + + UnlockReleaseBuffer(buf); +} + +Datum +zedstore_toast_flatten(Relation rel, AttrNumber attno, zstid tid, Datum toasted) +{ + varatt_zs_toastptr *toastptr = (varatt_zs_toastptr *) DatumGetPointer(toasted); + BlockNumber nextblk; + BlockNumber prevblk; + char *result = NULL; + char *ptr = NULL; + int32 total_size = 0; + + Assert(toastptr->va_tag == VARTAG_ZEDSTORE); + + prevblk = InvalidBlockNumber; + nextblk = toastptr->zst_block; + + while (nextblk != InvalidBlockNumber) + { + Buffer buf; + Page page; + ZSToastPageOpaque *opaque; + uint32 size; + + buf = ReadBuffer(rel, nextblk); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_SHARE); + + opaque = (ZSToastPageOpaque *) PageGetSpecialPointer(page); + + Assert(opaque->zs_attno == attno); + Assert(opaque->zs_prev == prevblk); + + if (prevblk == InvalidBlockNumber) + { + Assert(opaque->zs_tid == tid); + + total_size = opaque->zs_total_size; + + result = palloc(total_size + VARHDRSZ); + SET_VARSIZE(result, total_size + VARHDRSZ); + ptr = result + VARHDRSZ; + } + + size = ((PageHeader) page)->pd_lower - SizeOfPageHeaderData; + memcpy(ptr, (char *) page + SizeOfPageHeaderData, size); + ptr += size; + + prevblk = 
nextblk; + nextblk = opaque->zs_next; + UnlockReleaseBuffer(buf); + } + Assert(total_size > 0); + Assert(ptr == result + total_size + VARHDRSZ); + + return PointerGetDatum(result); +} diff --git a/src/backend/access/zedstore/zedstore_tupslot.c b/src/backend/access/zedstore/zedstore_tupslot.c new file mode 100644 index 0000000000..8528287d51 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_tupslot.c @@ -0,0 +1,348 @@ +/* + * zedstore_tupslot.c + * Implementation of a TupleTableSlot for zedstore. + * + * This implementation is identical to a Virtual tuple slot + * (TTSOpsVirtual), but it has a slot_getsysattr() implementation + * that can fetch and compute the 'xmin' for the tuple. + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_tupslot.c + */ +#include "postgres.h" + +#include "access/table.h" +#include "access/zedstore_internal.h" +#include "executor/tuptable.h" +#include "utils/expandeddatum.h" + +const TupleTableSlotOps TTSOpsZedstore; + + +typedef struct ZedstoreTupleTableSlot +{ + TupleTableSlot base; + + char *data; /* data for materialized slots */ +} ZedstoreTupleTableSlot; + + +static void +tts_zedstore_init(TupleTableSlot *slot) +{ +} + +static void +tts_zedstore_release(TupleTableSlot *slot) +{ +} + +static void +tts_zedstore_clear(TupleTableSlot *slot) +{ + if (unlikely(TTS_SHOULDFREE(slot))) + { + ZedstoreTupleTableSlot *vslot = (ZedstoreTupleTableSlot *) slot; + + pfree(vslot->data); + vslot->data = NULL; + + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); +} + +/* + * Attribute values are readily available in tts_values and tts_isnull array + * in a ZedstoreTupleTableSlot. So there should be no need to call either of the + * following two functions. + */ +static void +tts_zedstore_getsomeattrs(TupleTableSlot *slot, int natts) +{ + elog(ERROR, "getsomeattrs is not required to be called on a zedstore tuple table slot"); +} + +static void +zs_get_xmin_cmin(Relation rel, ZSUndoRecPtr recent_oldest_undo, zstid tid, ZSUndoRecPtr undo_ptr, + TransactionId *xmin, CommandId *cmin) +{ + TransactionId this_xmin; + CommandId this_cmin; + ZSUndoRec *undorec; + + /* + * Follow the chain of UNDO records for this tuple, to find the + * transaction that originally inserted the row (xmin/cmin). + * + * XXX: this is similar logic to zs_cluster_process_tuple(). Can + * we merge it? + */ + this_xmin = FrozenTransactionId; + this_cmin = InvalidCommandId; + + for (;;) + { + if (undo_ptr.counter < recent_oldest_undo.counter) + { + /* This tuple version is visible to everyone. */ + break; + } + + /* Fetch the next UNDO record. */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + this_xmin = undorec->xid; + this_cmin = undorec->cid; + break; + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK || + undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + undo_ptr = undorec->prevundorec; + continue; + } + } + + *xmin = this_xmin; + *cmin = this_cmin; +} + +/* + * We only support fetching 'xmin', currently. It's needed for referential + * integrity triggers (i.e. foreign keys). 
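+ * ('cmin' is handled the same way below: both are found by walking the
+ * tuple's UNDO chain back to its ZSUNDO_TYPE_INSERT record, see
+ * zs_get_xmin_cmin().)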
+ */ +static Datum +tts_zedstore_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + if (attnum == MinTransactionIdAttributeNumber || + attnum == MinCommandIdAttributeNumber) + { + zstid tid = ZSTidFromItemPointer(slot->tts_tid); + ZSBtreeScan btree_scan; + bool found; + Relation rel; + ZSUndoRecPtr recent_oldest_undo; + TransactionId xmin; + CommandId cmin; + + /* + * We assume that the table OID and TID in the slot are set. We + * fetch the tuple from the table, and follow its UNDO chain to + * find the transaction that inserted it. + * + * XXX: This is very slow compared to e.g. the heap, where we + * always store the xmin in tuple itself. We should probably do + * the same in zedstore, and add extra fields in the slot to hold + * xmin/cmin and fill them in when we fetch the tuple and check its + * visibility for the first time. + */ + if (!OidIsValid(slot->tts_tableOid)) + elog(ERROR, "zedstore tuple table slot does not have a table oid"); + + /* assume the caller is already holding a suitable lock on the table */ + rel = table_open(slot->tts_tableOid, NoLock); + recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + + /* Use the meta-data tree for the visibility information. */ + zsbt_tid_begin_scan(rel, tid, tid + 1, SnapshotAny, &btree_scan); + + found = zsbt_tid_scan_next(&btree_scan) != InvalidZSTid; + if (!found) + elog(ERROR, "could not find zedstore tuple (%u, %u)", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + + zs_get_xmin_cmin(rel, recent_oldest_undo, tid, btree_scan.array_undoptr, &xmin, &cmin); + + zsbt_tid_end_scan(&btree_scan); + + table_close(rel, NoLock); + + *isnull = false; + if (attnum == MinTransactionIdAttributeNumber) + return TransactionIdGetDatum(xmin); + else + { + Assert(attnum == MinCommandIdAttributeNumber); + return CommandIdGetDatum(cmin); + } + } + elog(ERROR, "zedstore tuple table slot does not have system attributes (except xmin and cmin)"); + + return 0; /* silence compiler warnings */ +} + +/* + * To materialize a zedstore slot all the datums that aren't passed by value + * have to be copied into the slot's memory context. To do so, compute the + * required size, and allocate enough memory to store all attributes. That's + * good for cache hit ratio, but more importantly requires only memory + * allocation/deallocation. + */ +static void +tts_zedstore_materialize(TupleTableSlot *slot) +{ + ZedstoreTupleTableSlot *vslot = (ZedstoreTupleTableSlot *) slot; + TupleDesc desc = slot->tts_tupleDescriptor; + Size sz = 0; + char *data; + + /* already materialized */ + if (TTS_SHOULDFREE(slot)) + return; + + /* compute size of memory required */ + for (int natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute att = TupleDescAttr(desc, natt); + Datum val; + + if (att->attbyval || slot->tts_isnull[natt]) + continue; + + val = slot->tts_values[natt]; + + if (att->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + /* + * We want to flatten the expanded value so that the materialized + * slot doesn't depend on it. 
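+			 * EOH_get_flat_size() tells us how much room the flattened copy
+			 * will need in the slot's buffer.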
+ */ + sz = att_align_nominal(sz, att->attalign); + sz += EOH_get_flat_size(DatumGetEOHP(val)); + } + else + { + sz = att_align_nominal(sz, att->attalign); + sz = att_addlength_datum(sz, att->attlen, val); + } + } + + /* all data is byval */ + if (sz == 0) + return; + + /* allocate memory */ + vslot->data = data = MemoryContextAlloc(slot->tts_mcxt, sz); + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + /* and copy all attributes into the pre-allocated space */ + for (int natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute att = TupleDescAttr(desc, natt); + Datum val; + + if (att->attbyval || slot->tts_isnull[natt]) + continue; + + val = slot->tts_values[natt]; + + if (att->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + Size data_length; + + /* + * We want to flatten the expanded value so that the materialized + * slot doesn't depend on it. + */ + ExpandedObjectHeader *eoh = DatumGetEOHP(val); + + data = (char *) att_align_nominal(data, + att->attalign); + data_length = EOH_get_flat_size(eoh); + EOH_flatten_into(eoh, data, data_length); + + slot->tts_values[natt] = PointerGetDatum(data); + data += data_length; + } + else + { + Size data_length = 0; + + data = (char *) att_align_nominal(data, att->attalign); + data_length = att_addlength_datum(data_length, att->attlen, val); + + memcpy(data, DatumGetPointer(val), data_length); + + slot->tts_values[natt] = PointerGetDatum(data); + data += data_length; + } + } +} + +static void +tts_zedstore_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + TupleDesc srcdesc = dstslot->tts_tupleDescriptor; + + Assert(srcdesc->natts <= dstslot->tts_tupleDescriptor->natts); + + tts_zedstore_clear(dstslot); + + slot_getallattrs(srcslot); + + for (int natt = 0; natt < srcdesc->natts; natt++) + { + dstslot->tts_values[natt] = srcslot->tts_values[natt]; + dstslot->tts_isnull[natt] = srcslot->tts_isnull[natt]; + } + + dstslot->tts_nvalid = srcdesc->natts; + dstslot->tts_flags &= ~TTS_FLAG_EMPTY; + + /* make sure storage doesn't depend on external memory */ + tts_zedstore_materialize(dstslot); +} + +static HeapTuple +tts_zedstore_copy_heap_tuple(TupleTableSlot *slot) +{ + Assert(!TTS_EMPTY(slot)); + + return heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + +} + +static MinimalTuple +tts_zedstore_copy_minimal_tuple(TupleTableSlot *slot) +{ + Assert(!TTS_EMPTY(slot)); + + return heap_form_minimal_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); +} + + +const TupleTableSlotOps TTSOpsZedstore = { + .base_slot_size = sizeof(ZedstoreTupleTableSlot), + .init = tts_zedstore_init, + .release = tts_zedstore_release, + .clear = tts_zedstore_clear, + .getsomeattrs = tts_zedstore_getsomeattrs, + .getsysattr = tts_zedstore_getsysattr, + .materialize = tts_zedstore_materialize, + .copyslot = tts_zedstore_copyslot, + + /* + * A zedstore tuple table slot can not "own" a heap tuple or a minimal + * tuple. + */ + .get_heap_tuple = NULL, + .get_minimal_tuple = NULL, + .copy_heap_tuple = tts_zedstore_copy_heap_tuple, + .copy_minimal_tuple = tts_zedstore_copy_minimal_tuple +}; diff --git a/src/backend/access/zedstore/zedstore_undo.c b/src/backend/access/zedstore/zedstore_undo.c new file mode 100644 index 0000000000..0767307253 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_undo.c @@ -0,0 +1,918 @@ +/* + * zedstore_undo.c + * Temporary UNDO-logging for zedstore. + * + * XXX: This is hopefully replaced with an upstream UNDO facility later. 
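+ * For now, UNDO records live in a chain of UNDO pages inside the relation
+ * itself: zsundo_insert() appends records to the tail page (tracked in the
+ * metapage), and VACUUM scans from the head to trim away records that are
+ * no longer needed, undoing the effects of aborted transactions as it goes.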
+ * + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_undo.c + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/multixact.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/lsyscache.h" + +/* + * Working area for zsundo_scan(). + */ +typedef struct ZSUndoTrimStats +{ + /* List of TIDs of tuples we intend to delete */ + /* NB: this list is ordered by TID address */ + int num_dead_tuples; /* current # of entries */ + int max_dead_tuples; /* # slots allocated in array */ + ItemPointer dead_tuples; /* array of ItemPointerData */ + bool dead_tuples_overflowed; + + BlockNumber deleted_undo_pages; + + bool can_advance_oldestundorecptr; +} ZSUndoTrimStats; + +/* + * Working area for VACUUM. + */ +typedef struct ZSVacRelStats +{ + int elevel; + BufferAccessStrategy vac_strategy; + + /* hasindex = true means two-pass strategy; false means one-pass */ + bool hasindex; + /* Overall statistics about rel */ + BlockNumber old_rel_pages; /* previous value of pg_class.relpages */ + BlockNumber rel_pages; /* total number of pages */ + BlockNumber scanned_pages; /* number of pages we examined */ + BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */ + BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */ + BlockNumber tupcount_pages; /* pages whose tuples we counted */ + double old_live_tuples; /* previous value of pg_class.reltuples */ + double new_rel_tuples; /* new estimated total # of tuples */ + double new_live_tuples; /* new estimated total # of live tuples */ + double new_dead_tuples; /* new estimated total # of dead tuples */ + BlockNumber pages_removed; + double tuples_deleted; + BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + + ZSUndoTrimStats trimstats; +} ZSVacRelStats; + +/* + * Guesstimation of number of dead tuples per page. This is used to + * provide an upper limit to memory allocated when vacuuming small + * tables. + */ +#define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage + +static int zs_vac_cmp_itemptr(const void *left, const void *right); +static bool zs_lazy_tid_reaped(ItemPointer itemptr, void *state); +static void lazy_space_alloc(ZSVacRelStats *vacrelstats, BlockNumber relblocks); +static void lazy_vacuum_index(Relation indrel, + IndexBulkDeleteResult **stats, + ZSVacRelStats *vacrelstats); +static void lazy_cleanup_index(Relation indrel, + IndexBulkDeleteResult *stats, + ZSVacRelStats *vacrelstats); +static ZSUndoRecPtr zsundo_scan(Relation rel, TransactionId OldestXmin, ZSUndoTrimStats *trimstats, BlockNumber *oldest_undopage, List **unused_pages); +static void zsundo_update_oldest_ptr(Relation rel, ZSUndoRecPtr oldest_undorecptr, BlockNumber oldest_undopage, List *unused_pages); +static void zsundo_record_dead_tuple(ZSUndoTrimStats *trimstats, zstid tid); + +/* + * Insert the given UNDO record to the UNDO log. 
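+ * Returns a ZSUndoRecPtr for the inserted record: a monotonically increasing
+ * counter taken from the metapage, plus the block and offset the record was
+ * written at. Callers store it in the item they are modifying; see
+ * zsbt_tid_delete() for a typical caller.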
+ */ +ZSUndoRecPtr +zsundo_insert(Relation rel, ZSUndoRec *rec) +{ + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber tail_blk; + Buffer tail_buf = InvalidBuffer; + Page tail_pg = NULL; + ZSUndoPageOpaque *tail_opaque = NULL; + char *dst; + ZSUndoRecPtr undorecptr; + int offset; + uint64 undo_counter; + + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + + /* TODO: get share lock to begin with, for more concurrency */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + +retry_lock_tail: + tail_blk = metaopaque->zs_undo_tail; + + /* + * Is there space on the tail page? If not, allocate a new UNDO page. + */ + if (tail_blk != InvalidBlockNumber) + { + tail_buf = ReadBuffer(rel, tail_blk); + LockBuffer(tail_buf, BUFFER_LOCK_EXCLUSIVE); + tail_pg = BufferGetPage(tail_buf); + tail_opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(tail_pg); + } + if (tail_blk == InvalidBlockNumber || PageGetExactFreeSpace(tail_pg) < rec->size) + { + Buffer newbuf; + BlockNumber newblk; + Page newpage; + ZSUndoPageOpaque *newopaque; + + /* + * Release the lock on the metapage while we find a new block, because + * that could take a while. (And accessing the Free Page Map might lock + * the metapage, too, causing self-deadlock.) + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* new page */ + newbuf = zspage_getnewbuf(rel, metabuf); + + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + if (metaopaque->zs_undo_tail != tail_blk) + { + /* + * It should not be possible for another backend to extend the UNDO log + * while we're holding the tail block locked. + */ + if (tail_blk != InvalidBlockNumber) + elog(ERROR, "UNDO tail block pointer was changed unexpectedly"); + + /* + * we don't need the new page, after all. (Or maybe we do, if the new + * tail block is already full, but we're not smart about it.) + */ + zspage_delete_page(rel, newbuf); + goto retry_lock_tail; + } + + newblk = BufferGetBlockNumber(newbuf); + newpage = BufferGetPage(newbuf); + PageInit(newpage, BLCKSZ, sizeof(ZSUndoPageOpaque)); + newopaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(newpage); + newopaque->next = InvalidBlockNumber; + newopaque->zs_page_id = ZS_UNDO_PAGE_ID; + + metaopaque->zs_undo_tail = newblk; + if (tail_blk == InvalidBlockNumber) + metaopaque->zs_undo_head = newblk; + + MarkBufferDirty(metabuf); + + if (tail_blk != InvalidBlockNumber) + { + tail_opaque->next = newblk; + MarkBufferDirty(tail_buf); + UnlockReleaseBuffer(tail_buf); + } + + tail_blk = newblk; + tail_buf = newbuf; + tail_pg = newpage; + tail_opaque = newopaque; + } + + undo_counter = metaopaque->zs_undo_counter++; + MarkBufferDirty(metabuf); + + UnlockReleaseBuffer(metabuf); + + /* insert the record to this page */ + offset = ((PageHeader) tail_pg)->pd_lower; + + undorecptr.counter = undo_counter; + undorecptr.blkno = tail_blk; + undorecptr.offset = offset; + rec->undorecptr = undorecptr; + dst = ((char *) tail_pg) + offset; + memcpy(dst, rec, rec->size); + ((PageHeader) tail_pg)->pd_lower += rec->size; + MarkBufferDirty(tail_buf); + UnlockReleaseBuffer(tail_buf); + + return undorecptr; +} + +/* + * Fetch the UNDO record with the given undo-pointer. + * + * The returned record is a palloc'd copy. 
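+ * Errors out if 'undoptr' does not point into a valid UNDO page, or if the
+ * record found there does not carry the expected undo record pointer.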
+ */ +ZSUndoRec * +zsundo_fetch(Relation rel, ZSUndoRecPtr undoptr) +{ + Buffer buf; + Page page; + PageHeader pagehdr; + ZSUndoPageOpaque *opaque; + ZSUndoRec *undorec; + ZSUndoRec *undorec_copy; + + buf = ReadBuffer(rel, undoptr.blkno); + page = BufferGetPage(buf); + pagehdr = (PageHeader) page; + + LockBuffer(buf, BUFFER_LOCK_SHARE); + if (PageIsNew(page)) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u; not an UNDO page", + undoptr.counter, undoptr.blkno, undoptr.offset); + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u; not an UNDO page", + undoptr.counter, undoptr.blkno, undoptr.offset); + + /* Sanity check that the pointer pointed to a valid place */ + if (undoptr.offset < SizeOfPageHeaderData || + undoptr.offset + sizeof(ZSUndoRec) > pagehdr->pd_lower) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u", + undoptr.counter, undoptr.blkno, undoptr.offset); + + undorec = (ZSUndoRec *) (((char *) page) + undoptr.offset); + + if (memcmp(&undorec->undorecptr, &undoptr, sizeof(ZSUndoRecPtr)) != 0) + elog(ERROR, "could not find UNDO record"); + + undorec_copy = palloc(undorec->size); + memcpy(undorec_copy, undorec, undorec->size); + + UnlockReleaseBuffer(buf); + + return undorec_copy; +} + +void +zsundo_clear_speculative_token(Relation rel, ZSUndoRecPtr undoptr) +{ + Buffer buf; + Page page; + PageHeader pagehdr; + ZSUndoPageOpaque *opaque; + ZSUndoRec *undorec; + + buf = ReadBuffer(rel, undoptr.blkno); + page = BufferGetPage(buf); + pagehdr = (PageHeader) page; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u; not an UNDO page", + undoptr.counter, undoptr.blkno, undoptr.offset); + + /* Sanity check that the pointer pointed to a valid place */ + if (undoptr.offset < SizeOfPageHeaderData || + undoptr.offset + sizeof(ZSUndoRec) > pagehdr->pd_lower) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u", + undoptr.counter, undoptr.blkno, undoptr.offset); + + undorec = (ZSUndoRec *) (((char *) page) + undoptr.offset); + + if (undorec->type != ZSUNDO_TYPE_INSERT) + elog(ERROR, "unexpected undo record type %d on speculatively inserted row", undorec->type); + + undorec->speculative_token = INVALID_SPECULATIVE_TOKEN; + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); +} + +static bool +zs_lazy_tid_reaped(ItemPointer itemptr, void *state) +{ + ZSVacRelStats *vacrelstats = (ZSVacRelStats *) state; + ItemPointer res; + + res = (ItemPointer) bsearch((void *) itemptr, + (void *) vacrelstats->trimstats.dead_tuples, + vacrelstats->trimstats.num_dead_tuples, + sizeof(ItemPointerData), + zs_vac_cmp_itemptr); + + return (res != NULL); +} + +/* + * Comparator routines for use with qsort() and bsearch(). 
+ */ +static int +zs_vac_cmp_itemptr(const void *left, const void *right) +{ + BlockNumber lblk, + rblk; + OffsetNumber loff, + roff; + + lblk = ItemPointerGetBlockNumber((ItemPointer) left); + rblk = ItemPointerGetBlockNumber((ItemPointer) right); + + if (lblk < rblk) + return -1; + if (lblk > rblk) + return 1; + + loff = ItemPointerGetOffsetNumber((ItemPointer) left); + roff = ItemPointerGetOffsetNumber((ItemPointer) right); + + if (loff < roff) + return -1; + if (loff > roff) + return 1; + + return 0; +} + +void +zsundo_vacuum(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy, + TransactionId OldestXmin) +{ + ZSVacRelStats *vacrelstats; + ZSUndoTrimStats *trimstats; + Relation *Irel; + int nindexes; + IndexBulkDeleteResult **indstats; + BlockNumber nblocks; + + nblocks = RelationGetNumberOfBlocks(rel); + if (nblocks == 0) + return; /* empty table */ + + vacrelstats = (ZSVacRelStats *) palloc0(sizeof(ZSVacRelStats)); + trimstats = &vacrelstats->trimstats; + + if (params->options & VACOPT_VERBOSE) + vacrelstats->elevel = INFO; + else + vacrelstats->elevel = DEBUG2; + vacrelstats->vac_strategy = bstrategy; + + /* Open all indexes of the relation */ + vac_open_indexes(rel, RowExclusiveLock, &nindexes, &Irel); + vacrelstats->hasindex = (nindexes > 0); + indstats = (IndexBulkDeleteResult **) + palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); + + lazy_space_alloc(vacrelstats, nblocks); + + ereport(vacrelstats->elevel, + (errmsg("vacuuming \"%s.%s\"", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)))); + + do + { + ZSUndoRecPtr reaped_upto; + BlockNumber oldest_undopage; + int j; + List *unused_pages = NIL; + + trimstats->dead_tuples_overflowed = false; + trimstats->num_dead_tuples = 0; + trimstats->deleted_undo_pages = 0; + + reaped_upto = zsundo_scan(rel, OldestXmin, trimstats, &oldest_undopage, &unused_pages); + + if (trimstats->num_dead_tuples > 0) + { + pg_qsort(trimstats->dead_tuples, trimstats->num_dead_tuples, + sizeof(ItemPointerData), zs_vac_cmp_itemptr); + /* TODO: currently, we write a separate UNDO record for each attribute, so there will + * be duplicates. Eliminate them. */ + j = 1; + for (int i = 1; i < trimstats->num_dead_tuples; i++) + { + if (!ItemPointerEquals(&trimstats->dead_tuples[j - 1], + &trimstats->dead_tuples[i])) + trimstats->dead_tuples[j++] = trimstats->dead_tuples[i]; + } + trimstats->num_dead_tuples = j; + + /* Remove index entries */ + for (int i = 0; i < nindexes; i++) + lazy_vacuum_index(Irel[i], + &indstats[i], + vacrelstats); + + /* + * Mark the items as dead in the attribute b-trees. + * + * We cannot remove them immediately, because we must prevent the TIDs from + * being reused, until we have trimmed the UNDO records. Otherwise, this might + * happen: + * + * 1. We remove items from all the B-trees. + * 2. An inserter reuses the now-unused TID for a new tuple + * 3. We abort the VACUUM, for some reason + * 4. We start VACUUM again. We will now try to remove the item again, but + * we will remove the new item with the same TID instead. + * + * There would be other ways to deal with it. For example in step #4, we could + * refrain from removing items, whose UNDO pointers are newer than expected. + * But that's tricky, because we scan the indexes first, and we must refrain + * from removing index entries for new items, too. 
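+			 * So for now, only mark the TIDs as dead in the TID tree here;
+			 * they do not become reusable until the corresponding UNDO
+			 * records have been trimmed away.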
+ */ + for (int i = 0; i < trimstats->num_dead_tuples; i++) + zsbt_tid_mark_dead(rel, + ZSTidFromItemPointer(trimstats->dead_tuples[i]), + reaped_upto); + + for (int attno = 1; attno <= RelationGetNumberOfAttributes(rel); attno++) + { + for (int i = 0; i < trimstats->num_dead_tuples; i++) + zsbt_attr_remove(rel, attno, ZSTidFromItemPointer(trimstats->dead_tuples[i])); + } + } + + /* + * The UNDO records for the tuple versions we just removed are no longer + * interesting to anyone. Advance the UNDO tail, so that the UNDO pages + * can be recycled. + */ + zsundo_update_oldest_ptr(rel, reaped_upto, oldest_undopage, unused_pages); + + ereport(vacrelstats->elevel, + (errmsg("\"%s\": removed %d row versions and %d undo pages", + RelationGetRelationName(rel), + trimstats->num_dead_tuples, + trimstats->deleted_undo_pages))); + } while(trimstats->dead_tuples_overflowed); + + /* Do post-vacuum cleanup and statistics update for each index */ + for (int i = 0; i < nindexes; i++) + lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); + + /* Done with indexes */ + vac_close_indexes(nindexes, Irel, NoLock); +} + + +/* + * lazy_space_alloc - space allocation decisions for lazy vacuum + * + * See the comments at the head of this file for rationale. + */ +static void +lazy_space_alloc(ZSVacRelStats *vacrelstats, BlockNumber relblocks) +{ + long maxtuples; + int vac_work_mem = IsAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem : maintenance_work_mem; + + if (vacrelstats->hasindex) + { + maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData); + maxtuples = Min(maxtuples, INT_MAX); + maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData)); + + /* curious coding here to ensure the multiplication can't overflow */ + if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks) + maxtuples = relblocks * LAZY_ALLOC_TUPLES; + + /* stay sane if small maintenance_work_mem */ + maxtuples = Max(maxtuples, MaxHeapTuplesPerPage); + } + else + { + /* + * TODO: In heap vacuum code, this is MaxHeapTuplesPerPage. We have no + * particular reason to size this by that, but the same principle applies: + * without indexes, it's pretty cheap to do multiple iterations, so let's + * avoid making a huge allocation + */ + maxtuples = 1000; + } + + vacrelstats->trimstats.num_dead_tuples = 0; + vacrelstats->trimstats.max_dead_tuples = (int) maxtuples; + vacrelstats->trimstats.dead_tuples = (ItemPointer) + palloc(maxtuples * sizeof(ItemPointerData)); +} + +/* + * lazy_vacuum_index() -- vacuum one index relation. + * + * Delete all the index entries pointing to tuples listed in + * vacrelstats->dead_tuples, and update running statistics. 
+ */ +static void +lazy_vacuum_index(Relation indrel, + IndexBulkDeleteResult **stats, + ZSVacRelStats *vacrelstats) +{ + IndexVacuumInfo ivinfo; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + ivinfo.index = indrel; + ivinfo.analyze_only = false; + ivinfo.estimated_count = true; + ivinfo.message_level = vacrelstats->elevel; + /* We can only provide an approximate value of num_heap_tuples here */ + ivinfo.num_heap_tuples = vacrelstats->old_live_tuples; + ivinfo.strategy = vacrelstats->vac_strategy; + + /* Do bulk deletion */ + *stats = index_bulk_delete(&ivinfo, *stats, + zs_lazy_tid_reaped, (void *) vacrelstats); + + ereport(vacrelstats->elevel, + (errmsg("scanned index \"%s\" to remove %d row versions", + RelationGetRelationName(indrel), + vacrelstats->trimstats.num_dead_tuples), + errdetail_internal("%s", pg_rusage_show(&ru0)))); +} + +/* + * lazy_cleanup_index() -- do post-vacuum cleanup for one index relation. + */ +static void +lazy_cleanup_index(Relation indrel, + IndexBulkDeleteResult *stats, + ZSVacRelStats *vacrelstats) +{ + IndexVacuumInfo ivinfo; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + ivinfo.index = indrel; + ivinfo.analyze_only = false; + ivinfo.estimated_count = (vacrelstats->tupcount_pages < vacrelstats->rel_pages); + ivinfo.message_level = vacrelstats->elevel; + + /* + * Now we can provide a better estimate of total number of surviving + * tuples (we assume indexes are more interested in that than in the + * number of nominally live tuples). + */ + ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples; + ivinfo.strategy = vacrelstats->vac_strategy; + + stats = index_vacuum_cleanup(&ivinfo, stats); + + if (!stats) + return; + + /* + * Now update statistics in pg_class, but only if the index says the count + * is accurate. + */ + if (!stats->estimated_count) + vac_update_relstats(indrel, + stats->num_pages, + stats->num_index_tuples, + 0, + false, + InvalidTransactionId, + InvalidMultiXactId, + false); + + ereport(vacrelstats->elevel, + (errmsg("index \"%s\" now contains %.0f row versions in %u pages", + RelationGetRelationName(indrel), + stats->num_index_tuples, + stats->num_pages), + errdetail("%.0f index row versions were removed.\n" + "%u index pages have been deleted, %u are currently reusable.\n" + "%s.", + stats->tuples_removed, + stats->pages_deleted, stats->pages_free, + pg_rusage_show(&ru0)))); + + pfree(stats); +} + +/* + * Scan the UNDO log, starting from oldest entry. For every tuple that is + * now considered dead, add it to 'dead_tuples'. Records for committed + * transactions can be trimmed away immediately. + * + * Returns the value that the oldest UNDO ptr can be trimmed upto, after + * removing all the dead TIDs. + * + * The caller must initialize ZSUndoTrimStats. This function updates the + * counters, and adds dead TIDs that can be removed to trimstats->dead_tuples. + * If there are more dead TIDs than fit in the dead_tuples array, this + * function sets trimstats->dead_tuples_overflow flag, and stops just before + * the UNDO record for the TID that did not fit. An important special case is + * calling this with trimstats->max_dead_tuples == 0. In that case, we scan + * as much as is possible without scanning the indexes (i.e. only UNDO + * records belonging to committed transactions at the tail of the UNDO log). + * IOW, it returns the oldest UNDO rec pointer that is still needed by + * active snapshots. 
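+ * zsundo_vacuum() calls this in a loop, repeating the scan for as long as
+ * dead_tuples_overflowed is still set after a pass.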
+ */ +static ZSUndoRecPtr +zsundo_scan(Relation rel, TransactionId OldestXmin, ZSUndoTrimStats *trimstats, + BlockNumber *oldest_undopage, List **unused_pages) +{ + /* Scan the undo log from oldest to newest */ + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber firstblk; + BlockNumber lastblk; + ZSUndoRecPtr oldest_undorecptr; + bool can_advance_oldestundorecptr; + char *ptr; + char *endptr; + + /* + * Get the current oldest undo page from the metapage. + */ + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_SHARE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + firstblk = metaopaque->zs_undo_head; + + oldest_undorecptr = metaopaque->zs_undo_oldestptr; + + /* + * If we assume that only one process can call TRIM at a time, then we + * don't need to hold the metapage locked. Alternatively, if multiple + * concurrent trims is possible, we could check after reading the head + * page, that it is the page we expect, and re-read the metapage if it's + * not. + * + * FIXME: Currently this works even if two backends call zsundo_trim() + * concurrently, because we never recycle UNDO pages. + */ + UnlockReleaseBuffer(metabuf); + + /* + * Loop through UNDO records, starting from the oldest page, until we + * hit a record that we cannot remove. + */ + lastblk = firstblk; + can_advance_oldestundorecptr = false; + while (lastblk != InvalidBlockNumber && !trimstats->dead_tuples_overflowed) + { + Buffer buf; + Page page; + ZSUndoPageOpaque *opaque; + + CHECK_FOR_INTERRUPTS(); + + /* Read the UNDO page */ + buf = ReadBuffer(rel, lastblk); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_SHARE); + + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + elog(ERROR, "unexpected page id on UNDO page"); + + /* loop through all records on the page */ + endptr = (char *) page + ((PageHeader) page)->pd_lower; + ptr = (char *) page + SizeOfPageHeaderData; + while (ptr < endptr && !trimstats->dead_tuples_overflowed) + { + ZSUndoRec *undorec = (ZSUndoRec *) ptr; + bool did_commit; + + Assert(undorec->undorecptr.blkno == lastblk); + + if (undorec->undorecptr.counter < oldest_undorecptr.counter) + { + ptr += undorec->size; + continue; + } + oldest_undorecptr = undorec->undorecptr; + + if (!TransactionIdPrecedes(undorec->xid, OldestXmin)) + { + /* This is still needed. Bail out */ + break; + } + + /* + * No one thinks this transaction is in-progress anymore. If it + * committed, we can just trim away its UNDO record. If it aborted, + * we need to apply the UNDO record first. + */ + did_commit = TransactionIdDidCommit(undorec->xid); + + switch (undorec->type) + { + case ZSUNDO_TYPE_INSERT: + if (!did_commit) + zsundo_record_dead_tuple(trimstats, undorec->tid); + break; + case ZSUNDO_TYPE_DELETE: + if (did_commit) + { + zsundo_record_dead_tuple(trimstats, undorec->tid); + } + else + { + /* + * must clear the item's UNDO pointer, otherwise the deletion + * becomes visible to everyone when the UNDO record is trimmed + * away + */ + /* + * Don't do this if we're called from zsundo_get_oldest_undo_ptr(), + * because we might be holding a lock on the page, and deadlock. 
+ */ + if (trimstats->max_dead_tuples == 0) + trimstats->dead_tuples_overflowed = true; + else + zsbt_tid_undo_deletion(rel, undorec->tid, undorec->undorecptr); + } + break; + case ZSUNDO_TYPE_UPDATE: + if (did_commit) + zsundo_record_dead_tuple(trimstats, undorec->tid); + break; + } + + if (!trimstats->dead_tuples_overflowed) + { + ptr += undorec->size; + + can_advance_oldestundorecptr = true; + } + } + + if (ptr < endptr) + { + UnlockReleaseBuffer(buf); + break; + } + else + { + /* We processed all records on the page. Step to the next one, if any. */ + Assert(ptr == endptr); + *unused_pages = lappend_int(*unused_pages, lastblk); + lastblk = opaque->next; + UnlockReleaseBuffer(buf); + if (lastblk != InvalidBlockNumber) + trimstats->deleted_undo_pages++; + } + } + + if (can_advance_oldestundorecptr && lastblk == InvalidBlockNumber) + { + /* + * We stopped after the last valid record. Advance by one, to the next + * record which hasn't been created yet, and which is still needed + */ + oldest_undorecptr.counter++; + oldest_undorecptr.blkno = InvalidBlockNumber; + oldest_undorecptr.offset = 0; + } + + trimstats->can_advance_oldestundorecptr = can_advance_oldestundorecptr; + *oldest_undopage = lastblk; + return oldest_undorecptr; +} + +/* Update metapage with the oldest value */ +static void +zsundo_update_oldest_ptr(Relation rel, ZSUndoRecPtr oldest_undorecptr, BlockNumber oldest_undopage, List *unused_pages) +{ + /* Scan the undo log from oldest to newest */ + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + ListCell *lc; + + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + metaopaque->zs_undo_oldestptr = oldest_undorecptr; + if (oldest_undopage == InvalidBlockNumber) + { + metaopaque->zs_undo_head = InvalidBlockNumber; + metaopaque->zs_undo_tail = InvalidBlockNumber; + } + else + metaopaque->zs_undo_head = oldest_undopage; + + /* TODO: WAL-log */ + + MarkBufferDirty(metabuf); + UnlockReleaseBuffer(metabuf); + + foreach(lc, unused_pages) + { + BlockNumber blk = (BlockNumber) lfirst_int(lc); + Buffer buf; + Page page; + ZSUndoPageOpaque *opaque; + + /* check that the page still looks like what we'd expect. */ + buf = ReadBuffer(rel, blk); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + if (PageIsEmpty(page) || + PageGetSpecialSize(page) != MAXALIGN(sizeof(ZSUndoPageOpaque))) + { + UnlockReleaseBuffer(buf); + continue; + } + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + { + UnlockReleaseBuffer(buf); + continue; + } + + /* FIXME: Also check here that the max UndoRecPtr on the page is less + * than the new 'oldest_undorecptr' + */ + + zspage_delete_page(rel, buf); + UnlockReleaseBuffer(buf); + } +} + +/* + * zsundo_record_dead_tuple - remember one deletable tuple + */ +static void +zsundo_record_dead_tuple(ZSUndoTrimStats *trimstats, zstid tid) +{ + /* + * The array shouldn't overflow under normal behavior, but perhaps it + * could if we are given a really small maintenance_work_mem. In that + * case, just forget the last few tuples (we'll get 'em next time). 
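+	 *
+	 * Setting dead_tuples_overflowed makes zsundo_scan() stop before the
+	 * record that did not fit, and the caller's do-while loop then performs
+	 * another index-vacuum pass to pick up the remaining tuples.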
+ */
+	if (trimstats->num_dead_tuples < trimstats->max_dead_tuples)
+	{
+		trimstats->dead_tuples[trimstats->num_dead_tuples] = ItemPointerFromZSTid(tid);
+		trimstats->num_dead_tuples++;
+		pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
+									 trimstats->num_dead_tuples);
+	}
+	else
+		trimstats->dead_tuples_overflowed = true;
+}
+
+/*
+ * Return the current "oldest undo pointer". The effects of any action with
+ * an undo pointer older than this are known to be visible to everyone (i.e.
+ * an inserted tuple is known to be visible, and a deleted tuple is known to
+ * be invisible).
+ */
+ZSUndoRecPtr
+zsundo_get_oldest_undo_ptr(Relation rel)
+{
+	ZSUndoRecPtr result;
+	ZSUndoTrimStats trimstats;
+	BlockNumber oldest_undopage;
+	List	   *unused_pages = NIL;
+
+	if (RelationGetNumberOfBlocks(rel) == 0)
+	{
+		memset(&result, 0, sizeof(ZSUndoRecPtr));
+		return result;
+	}
+
+	/*
+	 * Call zsundo_scan with max_dead_tuples = 0. It scans the UNDO log,
+	 * starting from the oldest record, and advances the oldest UNDO pointer
+	 * past as many committed, visible-to-all transactions as possible.
+	 *
+	 * TODO:
+	 * We could get the latest cached value directly from the metapage, but
+	 * this allows trimming the UNDO log more aggressively, whenever we're
+	 * scanning. Fetching records from the UNDO log is pretty expensive,
+	 * so until that is somehow sped up, it is a good tradeoff to be
+	 * aggressive about that.
+	 */
+	trimstats.num_dead_tuples = 0;
+	trimstats.max_dead_tuples = 0;
+	trimstats.dead_tuples = NULL;
+	trimstats.dead_tuples_overflowed = false;
+	trimstats.deleted_undo_pages = 0;
+	result = zsundo_scan(rel, RecentGlobalXmin, &trimstats, &oldest_undopage, &unused_pages);
+
+	if (trimstats.can_advance_oldestundorecptr)
+		zsundo_update_oldest_ptr(rel, result, oldest_undopage, unused_pages);
+
+	return result;
+}
diff --git a/src/backend/access/zedstore/zedstore_utils.c b/src/backend/access/zedstore/zedstore_utils.c
new file mode 100644
index 0000000000..7673537292
--- /dev/null
+++ b/src/backend/access/zedstore/zedstore_utils.c
@@ -0,0 +1,76 @@
+/*-------------------------------------------------------------------------
+ *
+ * zedstore_utils.c
+ *	  ZedStore utility functions
+ *
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/zedstore/zedstore_utils.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include 
+
+#include "access/zedstore_internal.h"
+#include "miscadmin.h"
+
+/*
+ * Allocate a new zs_split_stack struct.
+ */
+zs_split_stack *
+zs_new_split_stack_entry(Buffer buf, Page page)
+{
+	zs_split_stack *stack;
+
+	stack = palloc(sizeof(zs_split_stack));
+	stack->next = NULL;
+	stack->buf = buf;
+	stack->page = page;
+	stack->recycle = false;		/* caller can change this */
+
+	return stack;
+}
+
+/*
+ * Apply all the changes represented by a list of zs_split_stack
+ * entries.
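+ *
+ * A minimal usage sketch (illustrative only; the variable names are made up):
+ *
+ *		zs_split_stack *stack = zs_new_split_stack_entry(buf, newpage);
+ *		stack->recycle = true;		(if the old page should be recycled)
+ *		zs_apply_split_changes(rel, stack);
+ *
+ * The buffers in the stack must be pinned and exclusively locked by the
+ * caller; they are marked dirty, unlocked and released here.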
+ */ +void +zs_apply_split_changes(Relation rel, zs_split_stack *stack) +{ + zs_split_stack *head = stack; + + START_CRIT_SECTION(); + + while (stack) + { + PageRestoreTempPage(stack->page, BufferGetPage(stack->buf)); + MarkBufferDirty(stack->buf); + stack = stack->next; + } + + /* TODO: WAL-log all the changes */ + + END_CRIT_SECTION(); + + stack = head; + while (stack) + { + zs_split_stack *next; + + /* add this page to the Free Page Map for recycling */ + if (stack->recycle) + zspage_delete_page(rel, stack->buf); + + UnlockReleaseBuffer(stack->buf); + + next = stack->next; + pfree(stack); + stack = next; + } +} diff --git a/src/backend/access/zedstore/zedstore_visibility.c b/src/backend/access/zedstore/zedstore_visibility.c new file mode 100644 index 0000000000..0087991f78 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_visibility.c @@ -0,0 +1,728 @@ +/* + * zedstore_visibility.c + * Routines for MVCC in Zedstore + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_visibility.c + */ +#include "postgres.h" + +#include "access/tableam.h" +#include "access/xact.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "storage/procarray.h" + +static bool +zs_tuplelock_compatible(LockTupleMode mode, LockTupleMode newmode) +{ + switch (newmode) + { + case LockTupleKeyShare: + return mode == LockTupleKeyShare || + mode == LockTupleShare || + mode == LockTupleNoKeyExclusive; + + case LockTupleShare: + return mode == LockTupleKeyShare || + mode == LockTupleShare; + + case LockTupleNoKeyExclusive: + return mode == LockTupleKeyShare; + case LockTupleExclusive: + return false; + + default: + elog(ERROR, "unknown tuple lock mode %d", newmode); + } +} + +/* + * Like HeapTupleSatisfiesUpdate. + * + * When returns TM_Ok, this also returns a flag in *undo_record_needed, to indicate + * whether the old UNDO record is still of interest to anyone. If the old record + * belonged to an aborted deleting transaction, for example, it can be ignored. + * + * This does more than HeapTupleSatisfiesUpdate. If HeapTupleSatisfiesUpdate sees + * an updated or locked tuple, it returns TM_BeingUpdated, and the caller has to + * check if the tuple lock is compatible with the update. zs_SatisfiesUpdate + * checks if the new lock mode is compatible with the old one, and returns TM_Ok + * if so. Waiting for conflicting locks is left to the caller. + * + * This is also used for tuple locking (e.g. SELECT FOR UPDATE). 'mode' indicates + * the lock mode. For a genuine UPDATE, pass LockTupleExclusive or + * LockTupleNoKeyExclusive depending on whether key columns are being modified. + * + * If the tuple was UPDATEd, *next_tid is set to the TID of the new row version. + */ +TM_Result +zs_SatisfiesUpdate(Relation rel, Snapshot snapshot, + ZSUndoRecPtr recent_oldest_undo, ZSBtreeItem *item, + LockTupleMode mode, + bool *undo_record_needed, TM_FailureData *tmfd, zstid *next_tid) +{ + ZSUndoRecPtr undo_ptr; + ZSUndoRec *undorec; + int chain_depth = 0; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + + *undo_record_needed = true; + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + chain_depth++; + + /* Is it visible? */ + if (undo_ptr.counter < recent_oldest_undo.counter) + { + /* + * The old UNDO record is no longer visible to anyone, so we don't + * need to keep it. 
If this record was not the one directly referenced + * from the item, then we must keep it, though. For example, if there + * is a chain (item -> LOCK_TUPLE -> INSERT), and the INSERT record is + * no longer needed by anyone, we must still keep the pointer to the LOCK + * record. + */ + if (chain_depth == 1) + *undo_record_needed = false; + return TM_Ok; + } + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + if (undorec->cid >= snapshot->curcid) + return TM_Invisible; /* inserted after scan started */ + } + else if (TransactionIdIsInProgress(undorec->xid)) + return TM_Invisible; /* inserter has not committed yet */ + else if (!TransactionIdDidCommit(undorec->xid)) + { + /* it must have aborted or crashed */ + return TM_Invisible; + } + + /* The tuple is visible to use. But can we lock it? */ + + /* + * No conflict with this lock. Look at the previous UNDO record, there + * might be more locks. + * + * FIXME: Shouldn't we drill down to the INSERT record and check if + * that's visible to us first, before looking at the lockers? + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + ZSUndoRec_TupleLock *lock_undorec = (ZSUndoRec_TupleLock *) undorec; + + /* + * If any subtransaction of the current top transaction already holds + * a lock as strong as or stronger than what we're requesting, we + * effectively hold the desired lock already. We *must* succeed + * without trying to take the tuple lock, else we will deadlock + * against anyone wanting to acquire a stronger lock. + */ + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + if (lock_undorec->lockmode >= mode) + { + *undo_record_needed = true; + return TM_Ok; + } + } + else if (!zs_tuplelock_compatible(lock_undorec->lockmode, mode) && + TransactionIdIsInProgress(undorec->xid)) + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + return TM_BeingModified; + } + + /* + * No conflict with this lock. Look at the previous UNDO record, there + * might be more locks. + * + * FIXME: Shouldn't we drill down to the INSERT record and check if + * that's visible to us first, before looking at the lockers? + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE) + { + ZSUndoRec_Delete *deleterec = (ZSUndoRec_Delete *) undorec; + + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + if (undorec->cid >= snapshot->curcid) + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = undorec->cid; + return TM_SelfModified; /* deleted/updated after scan started */ + } + else + return TM_Invisible; /* deleted before scan started */ + } + + if (TransactionIdIsInProgress(undorec->xid)) + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + + return TM_BeingModified; + } + + if (!TransactionIdDidCommit(undorec->xid)) + { + /* deleter must have aborted or crashed. 
We have to keep following the + * undo chain, in case there are LOCK records that are still visible + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + if (deleterec->changedPart) + { + ItemPointerSet(&tmfd->ctid, MovedPartitionsBlockNumber, MovedPartitionsOffsetNumber); + *next_tid = InvalidZSTid; + return TM_Updated; + } + else + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + return TM_Deleted; + } + } + else if (undorec->type == ZSUNDO_TYPE_UPDATE) + { + /* updated-away tuple */ + ZSUndoRec_Update *updaterec = (ZSUndoRec_Update *) undorec; + LockTupleMode old_lockmode; + + *next_tid = updaterec->newtid; + old_lockmode = updaterec->key_update ? LockTupleExclusive : LockTupleNoKeyExclusive; + + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + if (zs_tuplelock_compatible(old_lockmode, mode)) + return TM_Ok; + + if (undorec->cid >= snapshot->curcid) + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = undorec->cid; + return TM_SelfModified; /* deleted/updated after scan started */ + } + else + return TM_Invisible; /* deleted before scan started */ + } + + if (TransactionIdIsInProgress(undorec->xid)) + { + if (zs_tuplelock_compatible(old_lockmode, mode)) + return TM_Ok; + + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + + return TM_BeingModified; + } + + if (!TransactionIdDidCommit(undorec->xid)) + { + /* deleter must have aborted or crashed. We have to keep following the + * undo chain, in case there are LOCK records that are still visible + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + + if (zs_tuplelock_compatible(old_lockmode, mode)) + return TM_Ok; + + tmfd->ctid = ItemPointerFromZSTid(((ZSUndoRec_Update *) undorec)->newtid); + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + return TM_Updated; + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + + +/* + * Like HeapTupleSatisfiesAny + */ +static bool +zs_SatisfiesAny(ZSBtreeScan *scan, ZSBtreeItem *item) +{ + return true; +} + +/* + * helper function to zs_SatisfiesMVCC(), to check if the given XID + * is visible to the snapshot. + */ +static bool +xid_is_visible(Snapshot snapshot, TransactionId xid, CommandId cid, bool *aborted) +{ + *aborted = false; + if (TransactionIdIsCurrentTransactionId(xid)) + { + if (cid >= snapshot->curcid) + return false; + else + return true; + } + else if (XidInMVCCSnapshot(xid, snapshot)) + return false; + else if (TransactionIdDidCommit(xid)) + { + return true; + } + else + { + /* it must have aborted or crashed */ + *aborted = true; + return false; + } +} + +/* + * Like HeapTupleSatisfiesMVCC + */ +static bool +zs_SatisfiesMVCC(ZSBtreeScan *scan, ZSBtreeItem *item, + TransactionId *obsoleting_xid, zstid *next_tid) +{ + Relation rel = scan->rel; + Snapshot snapshot = scan->snapshot; + ZSUndoRecPtr recent_oldest_undo = scan->recent_oldest_undo; + ZSUndoRecPtr undo_ptr; + ZSUndoRec *undorec; + bool aborted; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + Assert (snapshot->snapshot_type == SNAPSHOT_MVCC); + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + /* If this record is "old", then the record is visible. 
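+	 * ("Old" here means older than the oldest UNDO pointer, i.e. the effects
+	 * of the record are known to be visible to everyone; see
+	 * zsundo_get_oldest_undo_ptr().)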
*/ + if (undo_ptr.counter < recent_oldest_undo.counter) + return true; + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + /* Inserted tuple */ + bool result; + + result = xid_is_visible(snapshot, undorec->xid, undorec->cid, &aborted); + if (!result && !aborted) + *obsoleting_xid = undorec->xid; + return result; + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* we don't care about tuple locks here. Follow the link to the + * previous UNDO record for this tuple. */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + if (undorec->type == ZSUNDO_TYPE_UPDATE) + { + ZSUndoRec_Update *updaterec = (ZSUndoRec_Update *) undorec; + if (next_tid) + *next_tid = updaterec->newtid; + } + + /* + * Deleted or updated-away. They are treated the same in an MVCC snapshot. + * They only need different treatment when updating or locking the row, + * in SatisfiesUpdate(). + */ + if (xid_is_visible(snapshot, undorec->xid, undorec->cid, &aborted)) + { + /* we can see the deletion */ + return false; + } + else + { + if (!aborted) + *obsoleting_xid = undorec->xid; + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + +/* + * Like HeapTupleSatisfiesSelf + */ +static bool +zs_SatisfiesSelf(ZSBtreeScan *scan, ZSBtreeItem *item, zstid *next_tid) +{ + Relation rel = scan->rel; + ZSUndoRecPtr recent_oldest_undo = scan->recent_oldest_undo; + ZSUndoRec *undorec; + ZSUndoRecPtr undo_ptr; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + Assert (scan->snapshot->snapshot_type == SNAPSHOT_SELF); + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + if (undo_ptr.counter < recent_oldest_undo.counter) + return true; + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + /* Inserted tuple */ + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + return true; /* inserted by me */ + else if (TransactionIdIsInProgress(undorec->xid)) + return false; + else if (TransactionIdDidCommit(undorec->xid)) + return true; + else + { + /* it must have aborted or crashed */ + return false; + } + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* we don't care about tuple locks here. Follow the link to the + * previous UNDO record for this tuple. */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + if (undorec->type == ZSUNDO_TYPE_UPDATE) + { + ZSUndoRec_Update *updaterec = (ZSUndoRec_Update *) undorec; + if (next_tid) + *next_tid = updaterec->newtid; + } + + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + /* deleted by me */ + return false; + } + + if (TransactionIdIsInProgress(undorec->xid)) + return true; + + if (!TransactionIdDidCommit(undorec->xid)) + { + /* + * Deleter must have aborted or crashed. But we have to keep following the + * undo chain, to check if the insertion was visible in the first + * place. 
+ */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + + return false; + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + +/* + * Like HeapTupleSatisfiesDirty + */ +static bool +zs_SatisfiesDirty(ZSBtreeScan *scan, ZSBtreeItem *item, zstid *next_tid) +{ + Relation rel = scan->rel; + Snapshot snapshot = scan->snapshot; + ZSUndoRecPtr recent_oldest_undo = scan->recent_oldest_undo; + ZSUndoRecPtr undo_ptr; + ZSUndoRec *undorec; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + Assert (snapshot->snapshot_type == SNAPSHOT_DIRTY); + + snapshot->xmin = snapshot->xmax = InvalidTransactionId; + snapshot->speculativeToken = INVALID_SPECULATIVE_TOKEN; + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + if (undo_ptr.counter < recent_oldest_undo.counter) + return true; + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + snapshot->speculativeToken = undorec->speculative_token; + /* Inserted tuple */ + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + return true; /* inserted by me */ + else if (TransactionIdIsInProgress(undorec->xid)) + { + snapshot->xmin = undorec->xid; + return true; + } + else if (TransactionIdDidCommit(undorec->xid)) + { + return true; + } + else + { + /* it must have aborted or crashed */ + return false; + } + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* locked tuple. */ + /* look at the previous UNDO record to find the insert record */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + if (undorec->type == ZSUNDO_TYPE_UPDATE) + { + ZSUndoRec_Update *updaterec = (ZSUndoRec_Update *) undorec; + if (next_tid) + *next_tid = updaterec->newtid; + } + + /* deleted or updated-away tuple */ + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + /* deleted by me */ + return false; + } + + if (TransactionIdIsInProgress(undorec->xid)) + { + snapshot->xmax = undorec->xid; + return true; + } + + if (!TransactionIdDidCommit(undorec->xid)) + { + /* + * Deleter must have aborted or crashed. But we have to keep following the + * undo chain, to check if the insertion was visible in the first + * place. + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + + return false; + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + +/* + * True if tuple might be visible to some transaction; false if it's + * surely dead to everyone, ie, vacuumable. + */ +static bool +zs_SatisfiesNonVacuumable(ZSBtreeScan *scan, ZSBtreeItem *item) +{ + Relation rel = scan->rel; + TransactionId OldestXmin = scan->snapshot->xmin; + ZSUndoRecPtr recent_oldest_undo = scan->recent_oldest_undo; + ZSUndoRecPtr undo_ptr; + ZSUndoRec *undorec; + + Assert (scan->snapshot->snapshot_type == SNAPSHOT_NON_VACUUMABLE); + Assert(TransactionIdIsValid(OldestXmin)); + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + + /* Is it visible? 
*/ + if (undo_ptr.counter < recent_oldest_undo.counter) + return true; + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + /* Inserted tuple */ + if (TransactionIdIsInProgress(undorec->xid)) + return true; /* inserter has not committed yet */ + + if (TransactionIdDidCommit(undorec->xid)) + return true; + + /* it must have aborted or crashed */ + return false; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + /* deleted or updated-away tuple */ + ZSUndoRecPtr prevptr; + + if (TransactionIdIsInProgress(undorec->xid)) + return true; /* delete-in-progress */ + else if (TransactionIdDidCommit(undorec->xid)) + { + /* + * Deleter committed. But perhaps it was recent enough that some open + * transactions could still see the tuple. + */ + if (!TransactionIdPrecedes(undorec->xid, OldestXmin)) + return true; + + return false; + } + + /* + * The deleting transaction did not commit. But before concluding + * that the tuple is live, we have to check if the inserting + * XID is live. + */ + do { + prevptr = undorec->prevundorec; + + if (prevptr.counter < recent_oldest_undo.counter) + return true; + undorec = zsundo_fetch(rel, prevptr); + } while(undorec->type == ZSUNDO_TYPE_TUPLE_LOCK); + + Assert(undorec->type == ZSUNDO_TYPE_INSERT); + + if (TransactionIdIsInProgress(undorec->xid)) + return true; /* insert-in-progress */ + else if (TransactionIdDidCommit(undorec->xid)) + return true; /* inserted committed */ + + /* inserter must have aborted or crashed */ + return false; + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* look at the previous UNDO record, to find the Insert record */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + +/* + * Like HeapTupleSatisfiesVisibility + * + * If next_tid is not NULL then gets populated for the tuple if tuple was + * UPDATEd. *next_tid_p is set to the TID of the new row version. + */ +bool +zs_SatisfiesVisibility(ZSBtreeScan *scan, ZSBtreeItem *item, + TransactionId *obsoleting_xid, zstid *next_tid) +{ + ZSUndoRecPtr undo_ptr; + + /* initialize as invalid, if we find valid one populate the same */ + if (next_tid) + *next_tid = InvalidZSTid; + + /* + * This works on a single or array item. Compressed items don't have + * visibility information (the items inside the compressed container + * do) + */ + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + + /* The caller should've filled in the recent_oldest_undo pointer */ + Assert(scan->recent_oldest_undo.counter != 0); + + *obsoleting_xid = InvalidTransactionId; + + /* dead items are never considered visible. */ + if ((item->t_flags & ZSBT_DEAD) != 0) + return false; + + /* + * Items with invalid undo record are considered visible. Mostly META + * column stores the valid undo record, all other columns stores invalid + * undo pointer. Visibility check is performed based on META column and + * only if visible rest of columns are fetched. For in-place updates, + * columns other than META column may have valid undo record, in which + * case the visibility check needs to be performed for the same. META + * column can sometime also have items with invalid undo, see + * zsbt_undo_item_deletion(). 
+ */ + undo_ptr = zsbt_item_undoptr(item); + if (!IsZSUndoRecPtrValid(&undo_ptr)) + return true; + + switch (scan->snapshot->snapshot_type) + { + case SNAPSHOT_MVCC: + return zs_SatisfiesMVCC(scan, item, obsoleting_xid, next_tid); + + case SNAPSHOT_SELF: + return zs_SatisfiesSelf(scan, item, next_tid); + + case SNAPSHOT_ANY: + return zs_SatisfiesAny(scan, item); + + case SNAPSHOT_TOAST: + elog(ERROR, "SnapshotToast not implemented in zedstore"); + break; + + case SNAPSHOT_DIRTY: + return zs_SatisfiesDirty(scan, item, next_tid); + + case SNAPSHOT_HISTORIC_MVCC: + elog(ERROR, "SnapshotHistoricMVCC not implemented in zedstore yet"); + break; + + case SNAPSHOT_NON_VACUUMABLE: + return zs_SatisfiesNonVacuumable(scan, item); + } + + return false; /* keep compiler quiet */ +} diff --git a/src/backend/access/zedstore/zedstoream_handler.c b/src/backend/access/zedstore/zedstoream_handler.c new file mode 100644 index 0000000000..5a79b7a1fc --- /dev/null +++ b/src/backend/access/zedstore/zedstoream_handler.c @@ -0,0 +1,3163 @@ +/*------------------------------------------------------------------------- + * + * zedstoream_handler.c + * ZedStore table access method code + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstoream_handler.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "miscadmin.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/multixact.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/tsmapi.h" +#include "access/tupdesc_details.h" +#include "access/tuptoaster.h" +#include "access/xact.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "catalog/catalog.h" +#include "catalog/index.h" +#include "catalog/storage.h" +#include "catalog/storage_xlog.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "executor/executor.h" +#include "optimizer/plancat.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/rel.h" + + +typedef enum +{ + ZSSCAN_STATE_UNSTARTED, + ZSSCAN_STATE_SCANNING, + ZSSCAN_STATE_FINISHED_RANGE, + ZSSCAN_STATE_FINISHED +} zs_scan_state; + +typedef struct ZedStoreProjectData +{ + int num_proj_atts; + bool *project_columns; + int *proj_atts; + ZSBtreeScan *btree_scans; + MemoryContext context; +} ZedStoreProjectData; + +typedef struct ZedStoreDescData +{ + /* scan parameters */ + TableScanDescData rs_scan; /* */ + ZedStoreProjectData proj_data; + + zs_scan_state state; + zstid cur_range_start; + zstid cur_range_end; + bool finished; + + /* These fields are used for bitmap scans, to hold a "block's" worth of data */ +#define MAX_ITEMS_PER_LOGICAL_BLOCK MaxHeapTuplesPerPage + int bmscan_ntuples; + zstid *bmscan_tids; + Datum **bmscan_datums; + bool **bmscan_isnulls; + int bmscan_nexttuple; + + /* These fields are use for TABLESAMPLE scans */ + zstid max_tid_to_scan; + zstid next_tid_to_scan; + +} ZedStoreDescData; + +typedef struct ZedStoreDescData *ZedStoreDesc; + +typedef struct ZedStoreIndexFetchData +{ + IndexFetchTableData idx_fetch_data; + ZedStoreProjectData proj_data; +} ZedStoreIndexFetchData; + +typedef struct ZedStoreIndexFetchData *ZedStoreIndexFetch; + +typedef struct ParallelZSScanDescData 
*ParallelZSScanDesc; + +static IndexFetchTableData *zedstoream_begin_index_fetch(Relation rel); +static void zedstoream_end_index_fetch(IndexFetchTableData *scan); +static bool zedstoream_fetch_row(ZedStoreIndexFetchData *fetch, + ItemPointer tid_p, + Snapshot snapshot, + TupleTableSlot *slot); + +static Size zs_parallelscan_estimate(Relation rel); +static Size zs_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan); +static void zs_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan); +static bool zs_parallelscan_nextrange(Relation rel, ParallelZSScanDesc pzscan, + zstid *start, zstid *end); +static void zsbt_fill_missing_attribute_value(ZSBtreeScan *scan, Datum *datum, bool *isnull); + +/* ---------------------------------------------------------------- + * storage AM support routines for zedstoream + * ---------------------------------------------------------------- + */ + +static bool +zedstoream_fetch_row_version(Relation rel, + ItemPointer tid_p, + Snapshot snapshot, + TupleTableSlot *slot) +{ + IndexFetchTableData *fetcher; + bool result; + + fetcher = zedstoream_begin_index_fetch(rel); + + result = zedstoream_fetch_row((ZedStoreIndexFetchData *) fetcher, + tid_p, snapshot, slot); + if (result) + { + /* FIXME: heapam acquires the predicate lock first, and then + * calls CheckForSerializableConflictOut(). We do it in the + * opposite order, because CheckForSerializableConflictOut() + * call as done in zsbt_get_last_tid() already. Does it matter? + * I'm not sure. + */ + PredicateLockTID(rel, tid_p, snapshot); + } + ExecMaterializeSlot(slot); + slot->tts_tableOid = RelationGetRelid(rel); + slot->tts_tid = *tid_p; + + zedstoream_end_index_fetch(fetcher); + + return result; +} + +static void +zedstoream_get_latest_tid(TableScanDesc sscan, + ItemPointer tid) +{ + zstid ztid = ZSTidFromItemPointer(*tid); + zsbt_find_latest_tid(sscan->rs_rd, &ztid, sscan->rs_snapshot); + *tid = ItemPointerFromZSTid(ztid); +} + +static inline void +zedstoream_insert_internal(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, struct BulkInsertStateData *bistate, uint32 speculative_token) +{ + AttrNumber attno; + Datum *d; + bool *isnulls; + zstid tid; + TransactionId xid = GetCurrentTransactionId(); + bool isnull; + Datum datum; + ZSUndoRecPtr prevundoptr; + + ZSUndoRecPtrInitialize(&prevundoptr); + + if (slot->tts_tupleDescriptor->natts != relation->rd_att->natts) + elog(ERROR, "slot's attribute count doesn't match relcache entry"); + + slot_getallattrs(slot); + d = slot->tts_values; + isnulls = slot->tts_isnull; + + tid = InvalidZSTid; + + isnull = true; + ZSUndoRecPtrInitialize(&prevundoptr); + zsbt_tid_multi_insert(relation, + &tid, 1, + xid, cid, speculative_token, prevundoptr); + + /* + * We only need to check for table-level SSI locks. Our + * new tuple can't possibly conflict with existing tuple locks, and + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. 
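+	 *
+	 * (That is why CheckForSerializableConflictIn() is called below with a
+	 * NULL TID and InvalidBlockNumber: only relation-level predicate locks
+	 * need to be considered.)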
+ */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + for (attno = 1; attno <= relation->rd_att->natts; attno++) + { + Form_pg_attribute attr = TupleDescAttr(slot->tts_tupleDescriptor, attno - 1); + Datum toastptr = (Datum) 0; + datum = d[attno - 1]; + isnull = isnulls[attno - 1]; + + if (!isnull && attr->attlen < 0 && VARATT_IS_EXTERNAL(datum)) + datum = PointerGetDatum(heap_tuple_fetch_attr((struct varlena *) DatumGetPointer(datum))); + + /* If this datum is too large, toast it */ + if (!isnull && attr->attlen < 0 && + VARSIZE_ANY_EXHDR(datum) > MaxZedStoreDatumSize) + { + toastptr = datum = zedstore_toast_datum(relation, attno, datum); + } + + zsbt_attr_multi_insert(relation, attno, + &datum, &isnull, &tid, 1); + + if (toastptr != (Datum) 0) + zedstore_toast_finish(relation, attno, toastptr, tid); + } + + slot->tts_tableOid = RelationGetRelid(relation); + slot->tts_tid = ItemPointerFromZSTid(tid); + + /* Note: speculative insertions are counted too, even if aborted later */ + pgstat_count_heap_insert(relation, 1); +} + +static void +zedstoream_insert(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, struct BulkInsertStateData *bistate) +{ + zedstoream_insert_internal(relation, slot, cid, options, bistate, INVALID_SPECULATIVE_TOKEN); +} + +static void +zedstoream_insert_speculative(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, BulkInsertState bistate, uint32 specToken) +{ + zedstoream_insert_internal(relation, slot, cid, options, bistate, specToken); +} + +static void +zedstoream_complete_speculative(Relation relation, TupleTableSlot *slot, uint32 spekToken, + bool succeeded) +{ + zstid tid; + + tid = ZSTidFromItemPointer(slot->tts_tid); + zsbt_tid_clear_speculative_token(relation, tid, spekToken, true /* for complete */); + /* + * there is a conflict + */ + if (!succeeded) + elog(ERROR, "zedstoream_complete_speculative abort is not handled"); +} + +static void +zedstoream_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, + CommandId cid, int options, BulkInsertState bistate) +{ + AttrNumber attno; + int i; + bool slotgetandset = true; + TransactionId xid = GetCurrentTransactionId(); + int *tupletoasted; + Datum *datums; + bool *isnulls; + zstid *tids; + ZSUndoRecPtr prevundoptr; + + tupletoasted = palloc(ntuples * sizeof(int)); + datums = palloc0(ntuples * sizeof(Datum)); + isnulls = palloc(ntuples * sizeof(bool)); + tids = palloc0(ntuples * sizeof(zstid)); + + for (i = 0; i < ntuples; i++) + isnulls[i] = true; + + ZSUndoRecPtrInitialize(&prevundoptr); + zsbt_tid_multi_insert(relation, tids, ntuples, + xid, cid, INVALID_SPECULATIVE_TOKEN, prevundoptr); + + /* + * We only need to check for table-level SSI locks. Our + * new tuple can't possibly conflict with existing tuple locks, and + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. 
+ */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + for (attno = 1; attno <= relation->rd_att->natts; attno++) + { + Form_pg_attribute attr = TupleDescAttr((slots[0])->tts_tupleDescriptor, attno - 1); + int ntupletoasted = 0; + + for (i = 0; i < ntuples; i++) + { + Datum datum = slots[i]->tts_values[attno - 1]; + bool isnull = slots[i]->tts_isnull[attno - 1]; + + if (slotgetandset) + { + slot_getallattrs(slots[i]); + } + + /* If this datum is too large, toast it */ + if (!isnull && attr->attlen < 0 && + VARSIZE_ANY_EXHDR(datum) > MaxZedStoreDatumSize) + { + datum = zedstore_toast_datum(relation, attno, datum); + tupletoasted[ntupletoasted++] = i; + } + datums[i] = datum; + isnulls[i] = isnull; + } + + zsbt_attr_multi_insert(relation, attno, + datums, isnulls, tids, ntuples); + + for (i = 0; i < ntupletoasted; i++) + { + int idx = tupletoasted[i]; + + zedstore_toast_finish(relation, attno, datums[idx], tids[idx]); + } + + slotgetandset = false; + } + + for (i = 0; i < ntuples; i++) + { + slots[i]->tts_tableOid = RelationGetRelid(relation); + slots[i]->tts_tid = ItemPointerFromZSTid(tids[i]); + } + + pgstat_count_heap_insert(relation, ntuples); + + pfree(tids); + pfree(tupletoasted); + pfree(datums); + pfree(isnulls); +} + +static TM_Result +zedstoream_delete(Relation relation, ItemPointer tid_p, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, bool wait, + TM_FailureData *hufd, bool changingPart) +{ + zstid tid = ZSTidFromItemPointer(*tid_p); + TransactionId xid = GetCurrentTransactionId(); + TM_Result result = TM_Ok; + +retry: + result = zsbt_tid_delete(relation, tid, xid, cid, + snapshot, crosscheck, wait, hufd, changingPart); + + if (result != TM_Ok) + { + if (result == TM_Invisible) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("attempted to delete invisible tuple"))); + else if (result == TM_BeingModified && wait) + { + TransactionId xwait = hufd->xmax; + + /* TODO: use something like heap_acquire_tuplock() for priority */ + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + XactLockTableWait(xwait, relation, tid_p, XLTW_Delete); + goto retry; + } + } + } + + /* + * Check for SSI conflicts. + */ + CheckForSerializableConflictIn(relation, tid_p, ItemPointerGetBlockNumber(tid_p)); + + if (result == TM_Ok) + pgstat_count_heap_delete(relation); + + return result; +} + + +/* + * Each tuple lock mode has a corresponding heavyweight lock, and one or two + * corresponding MultiXactStatuses (one to merely lock tuples, another one to + * update them). This table (and the macros below) helps us determine the + * heavyweight lock mode and MultiXactStatus values to use for any particular + * tuple lock strength. + * + * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock + * instead. + */ +static const struct +{ + LOCKMODE hwlock; + int lockstatus; + int updstatus; +} + + tupleLockExtraInfo[MaxLockTupleMode + 1] = +{ + { /* LockTupleKeyShare */ + AccessShareLock, + MultiXactStatusForKeyShare, + -1 /* KeyShare does not allow updating tuples */ + }, + { /* LockTupleShare */ + RowShareLock, + MultiXactStatusForShare, + -1 /* Share does not allow updating tuples */ + }, + { /* LockTupleNoKeyExclusive */ + ExclusiveLock, + MultiXactStatusForNoKeyUpdate, + MultiXactStatusNoKeyUpdate + }, + { /* LockTupleExclusive */ + AccessExclusiveLock, + MultiXactStatusForUpdate, + MultiXactStatusUpdate + } +}; + + +/* + * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. 
+ * This is more readable than having every caller translate it to lock.h's + * LOCKMODE. + */ +#define LockTupleTuplock(rel, tup, mode) \ + LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define UnlockTupleTuplock(rel, tup, mode) \ + UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define ConditionalLockTupleTuplock(rel, tup, mode) \ + ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) + +/* + * Acquire heavyweight lock on the given tuple, in preparation for acquiring + * its normal, Xmax-based tuple lock. + * + * have_tuple_lock is an input and output parameter: on input, it indicates + * whether the lock has previously been acquired (and this function does + * nothing in that case). If this function returns success, have_tuple_lock + * has been flipped to true. + * + * Returns false if it was unable to obtain the lock; this can only happen if + * wait_policy is Skip. + * + * XXX: This is identical to heap_acquire_tuplock + */ + +static bool +zs_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, + LockWaitPolicy wait_policy, bool *have_tuple_lock) +{ + if (*have_tuple_lock) + return true; + + switch (wait_policy) + { + case LockWaitBlock: + LockTupleTuplock(relation, tid, mode); + break; + + case LockWaitSkip: + if (!ConditionalLockTupleTuplock(relation, tid, mode)) + return false; + break; + + case LockWaitError: + if (!ConditionalLockTupleTuplock(relation, tid, mode)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + *have_tuple_lock = true; + + return true; +} + + +static TM_Result +zedstoream_lock_tuple(Relation relation, ItemPointer tid_p, Snapshot snapshot, + TupleTableSlot *slot, CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, uint8 flags, + TM_FailureData *tmfd) +{ + zstid tid = ZSTidFromItemPointer(*tid_p); + TransactionId xid = GetCurrentTransactionId(); + TM_Result result; + bool have_tuple_lock = false; + zstid next_tid = tid; + SnapshotData SnapshotDirty; + bool locked_something = false; + + slot->tts_tableOid = RelationGetRelid(relation); + slot->tts_tid = *tid_p; + + tmfd->traversed = false; + /* + * For now, we lock just the first attribute. As long as everyone + * does that, that's enough. + */ +retry: + result = zsbt_tid_lock(relation, tid, xid, cid, + mode, snapshot, tmfd, &next_tid); + + if (result == TM_Invisible) + { + /* + * This is possible, but only when locking a tuple for ON CONFLICT + * UPDATE. We return this value here rather than throwing an error in + * order to give that case the opportunity to throw a more specific + * error. + */ + /* + * This can also happen, if we're locking an UPDATE chain for KEY SHARE mode: + * A tuple has been inserted, and then updated, by a different transaction. + * The updating transaction is still in progress. We can lock the row + * in KEY SHARE mode, assuming the key columns were not updated, and we will + * try to lock all the row version, even the still in-progress UPDATEs. + * It's possible that the UPDATE aborts while we're chasing the update chain, + * so that the updated tuple becomes invisible to us. That's OK. + */ + if (mode == LockTupleKeyShare && locked_something) + return TM_Ok; + else + return TM_Invisible; + } + else if (result == TM_Updated || + (result == TM_SelfModified && tmfd->cmax == cid)) + { + /* + * The other transaction is an update and it already committed. 
+ * + * If the caller asked for the latest version, find it. + */ + if ((flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION) != 0 && next_tid != tid) + { + if (have_tuple_lock) + { + UnlockTupleTuplock(relation, tid_p, mode); + have_tuple_lock = false; + } + + if (ItemPointerIndicatesMovedPartitions(&tmfd->ctid)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update"))); + + /* it was updated, so look at the updated version */ + *tid_p = ItemPointerFromZSTid(next_tid); + + /* signal that a tuple later in the chain is getting locked */ + tmfd->traversed = true; + + /* loop back to fetch next in chain */ + + /* FIXME: In the corresponding code in heapam, we cross-check the xmin/xmax + * of the old and new tuple. Should we do the same here? + */ + + InitDirtySnapshot(SnapshotDirty); + snapshot = &SnapshotDirty; + tid = next_tid; + goto retry; + } + + return result; + } + else if (result == TM_Deleted) + { + /* + * The other transaction is a delete and it already committed. + */ + return result; + } + else if (result == TM_BeingModified) + { + TransactionId xwait = tmfd->xmax; + + /* + * Acquire tuple lock to establish our priority for the tuple, or + * die trying. LockTuple will release us when we are next-in-line + * for the tuple. We must do this even if we are share-locking. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while + * rechecking tuple state. + */ + if (!zs_acquire_tuplock(relation, tid_p, mode, wait_policy, + &have_tuple_lock)) + { + /* + * This can only happen if wait_policy is Skip and the lock + * couldn't be obtained. + */ + return TM_WouldBlock; + } + + /* wait for regular transaction to end, or die trying */ + switch (wait_policy) + { + case LockWaitBlock: + XactLockTableWait(xwait, relation, tid_p, XLTW_Lock); + break; + case LockWaitSkip: + if (!ConditionalXactLockTableWait(xwait)) + { + return TM_WouldBlock; + } + break; + case LockWaitError: + if (!ConditionalXactLockTableWait(xwait)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + + /* + * xwait is done. Retry. + */ + goto retry; + } + if (result == TM_Ok) + locked_something = true; + + /* + * Now that we have successfully marked the tuple as locked, we can + * release the lmgr tuple lock, if we had it. + */ + if (have_tuple_lock) + { + UnlockTupleTuplock(relation, tid_p, mode); + have_tuple_lock = false; + } + + if (mode == LockTupleKeyShare) + { + /* lock all row versions, if it's a KEY SHARE lock */ + if (result == TM_Ok && tid != next_tid && next_tid != InvalidZSTid) + { + tid = next_tid; + goto retry; + } + } + + /* Fetch the tuple, too. */ + if (!zedstoream_fetch_row_version(relation, tid_p, SnapshotAny, slot)) + elog(ERROR, "could not fetch locked tuple"); + + return TM_Ok; +} + +/* like heap_tuple_attr_equals */ +static bool +zs_tuple_attr_equals(int attrnum, TupleTableSlot *slot1, TupleTableSlot *slot2) +{ + TupleDesc tupdesc = slot1->tts_tupleDescriptor; + Datum value1, + value2; + bool isnull1, + isnull2; + Form_pg_attribute att; + + /* + * If it's a whole-tuple reference, say "not equal". It's not really + * worth supporting this case, since it could only succeed after a no-op + * update, which is hardly a case worth optimizing for. 
+ */ + if (attrnum == 0) + return false; + + /* + * Likewise, automatically say "not equal" for any system attribute other + * than tableOID; we cannot expect these to be consistent in a HOT chain, + * or even to be set correctly yet in the new tuple. + */ + if (attrnum < 0) + { + if (attrnum != TableOidAttributeNumber) + return false; + } + + /* + * Extract the corresponding values. XXX this is pretty inefficient if + * there are many indexed columns. Should HeapDetermineModifiedColumns do + * a single heap_deform_tuple call on each tuple, instead? But that + * doesn't work for system columns ... + */ + value1 = slot_getattr(slot1, attrnum, &isnull1); + value2 = slot_getattr(slot2, attrnum, &isnull2); + + /* + * If one value is NULL and other is not, then they are certainly not + * equal + */ + if (isnull1 != isnull2) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (isnull1) + return true; + + /* + * We do simple binary comparison of the two datums. This may be overly + * strict because there can be multiple binary representations for the + * same logical value. But we should be OK as long as there are no false + * positives. Using a type-specific equality operator is messy because + * there could be multiple notions of equality in different operator + * classes; furthermore, we cannot safely invoke user-defined functions + * while holding exclusive buffer lock. + */ + if (attrnum <= 0) + { + /* The only allowed system columns are OIDs, so do this */ + return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); + } + else + { + Assert(attrnum <= tupdesc->natts); + att = TupleDescAttr(tupdesc, attrnum - 1); + return datumIsEqual(value1, value2, att->attbyval, att->attlen); + } +} + +static bool +is_key_update(Relation relation, TupleTableSlot *oldslot, TupleTableSlot *newslot) +{ + Bitmapset *key_attrs; + Bitmapset *interesting_attrs; + Bitmapset *modified_attrs; + int attnum; + + /* + * Fetch the list of attributes to be checked for various operations. + * + * For HOT considerations, this is wasted effort if we fail to update or + * have to put the new tuple on a different page. But we must compute the + * list before obtaining buffer lock --- in the worst case, if we are + * doing an update on one of the relevant system catalogs, we could + * deadlock if we try to fetch the list later. In any case, the relcache + * caches the data so this is usually pretty cheap. + * + * We also need columns used by the replica identity and columns that are + * considered the "key" of rows in the table. + * + * Note that we get copies of each bitmap, so we need not worry about + * relcache flush happening midway through. + */ + key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); + + interesting_attrs = NULL; + interesting_attrs = bms_add_members(interesting_attrs, key_attrs); + + /* Determine columns modified by the update. 
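+	 * (Attribute numbers in the bitmapsets are offset by
+	 * FirstLowInvalidHeapAttributeNumber, so that system attributes, which
+	 * have negative attribute numbers, can be represented; the loop below
+	 * converts back to normal attribute numbers before comparing.)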
*/ + modified_attrs = NULL; + while ((attnum = bms_first_member(interesting_attrs)) >= 0) + { + attnum += FirstLowInvalidHeapAttributeNumber; + + if (!zs_tuple_attr_equals(attnum, oldslot, newslot)) + modified_attrs = bms_add_member(modified_attrs, + attnum - FirstLowInvalidHeapAttributeNumber); + } + + return bms_overlap(modified_attrs, key_attrs); +} + +static TM_Result +zedstoream_update(Relation relation, ItemPointer otid_p, TupleTableSlot *slot, + CommandId cid, Snapshot snapshot, Snapshot crosscheck, + bool wait, TM_FailureData *hufd, + LockTupleMode *lockmode, bool *update_indexes) +{ + zstid otid = ZSTidFromItemPointer(*otid_p); + TransactionId xid = GetCurrentTransactionId(); + AttrNumber attno; + bool key_update; + Datum *d; + bool *isnulls; + TM_Result result; + zstid newtid; + TupleTableSlot *oldslot; + IndexFetchTableData *fetcher; + ZSUndoRecPtr prevundoptr; + + ZSUndoRecPtrInitialize(&prevundoptr); + + *update_indexes = true; + + slot_getallattrs(slot); + d = slot->tts_values; + isnulls = slot->tts_isnull; + + oldslot = table_slot_create(relation, NULL); + fetcher = zedstoream_begin_index_fetch(relation); + + /* + * The meta-attribute holds the visibility information, including the "t_ctid" + * pointer to the updated version. All the real attributes are just inserted, + * as if for a new row. + */ +retry: + newtid = InvalidZSTid; + + /* + * Fetch the old row, so that we can figure out which columns were modified. + * + * FIXME: if we have to follow the update chain, we should look at the + * currently latest tuple version, rather than the one visible to our snapshot. + */ + if (!zedstoream_fetch_row((ZedStoreIndexFetchData *) fetcher, + otid_p, SnapshotAny, oldslot)) + { + return TM_Invisible; + } + key_update = is_key_update(relation, oldslot, slot); + + *lockmode = key_update ? LockTupleExclusive : LockTupleNoKeyExclusive; + + result = zsbt_tid_update(relation, otid, + xid, cid, key_update, snapshot, crosscheck, + wait, hufd, &newtid); + + if (result == TM_Ok) + { + /* + * Check for SSI conflicts. 
+ */ + CheckForSerializableConflictIn(relation, otid_p, ItemPointerGetBlockNumber(otid_p)); + + for (attno = 1; attno <= relation->rd_att->natts; attno++) + { + Form_pg_attribute attr = TupleDescAttr(relation->rd_att, attno - 1); + Datum newdatum = d[attno - 1]; + bool newisnull = isnulls[attno - 1]; + Datum toastptr = (Datum) 0; + + if (!newisnull && attr->attlen < 0 && VARATT_IS_EXTERNAL(newdatum)) + newdatum = PointerGetDatum(heap_tuple_fetch_attr((struct varlena *) DatumGetPointer(newdatum))); + + /* If this datum is too large, toast it */ + if (!newisnull && attr->attlen < 0 && + VARSIZE_ANY_EXHDR(newdatum) > MaxZedStoreDatumSize) + { + toastptr = newdatum = zedstore_toast_datum(relation, attno, newdatum); + } + + zsbt_attr_multi_insert(relation, attno, + &newdatum, &newisnull, &newtid, 1); + + if (toastptr != (Datum) 0) + zedstore_toast_finish(relation, attno, toastptr, newtid); + } + + slot->tts_tid = ItemPointerFromZSTid(newtid); + + pgstat_count_heap_update(relation, false); + } + else + { + if (result == TM_Invisible) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("attempted to update invisible tuple"))); + else if (result == TM_BeingModified && wait) + { + TransactionId xwait = hufd->xmax; + + /* TODO: use something like heap_acquire_tuplock() for priority */ + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + XactLockTableWait(xwait, relation, otid_p, XLTW_Delete); + goto retry; + } + } + } + + zedstoream_end_index_fetch(fetcher); + ExecDropSingleTupleTableSlot(oldslot); + + return result; +} + +static const TupleTableSlotOps * +zedstoream_slot_callbacks(Relation relation) +{ + return &TTSOpsZedstore; +} + +static inline void +zs_initialize_proj_attributes(TupleDesc tupledesc, ZedStoreProjectData *proj_data) +{ + MemoryContext oldcontext; + + if (proj_data->num_proj_atts != 0) + return; + + oldcontext = MemoryContextSwitchTo(proj_data->context); + /* add one for meta-attribute */ + proj_data->proj_atts = palloc((tupledesc->natts + 1) * sizeof(int)); + proj_data->btree_scans = palloc0((tupledesc->natts + 1) * sizeof(ZSBtreeScan)); + + proj_data->proj_atts[proj_data->num_proj_atts++] = ZS_META_ATTRIBUTE_NUM; + + /* + * convert booleans array into an array of the attribute numbers of the + * required columns. + */ + for (int idx = 0; idx < tupledesc->natts; idx++) + { + int att_no = idx + 1; + + /* + * never project dropped columns, null will be returned for them + * in slot by default. 
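+		 *
+		 * For example (illustrative): for a three-column table with
+		 * project_columns = {true, false, true} and no dropped columns,
+		 * proj_atts ends up as {ZS_META_ATTRIBUTE_NUM, 1, 3} and
+		 * num_proj_atts is 3.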
+ */ + if (TupleDescAttr(tupledesc, idx)->attisdropped) + continue; + + /* project_columns empty also conveys need all the columns */ + if (proj_data->project_columns == NULL || proj_data->project_columns[idx]) + proj_data->proj_atts[proj_data->num_proj_atts++] = att_no; + } + + MemoryContextSwitchTo(oldcontext); +} + +static inline void +zs_initialize_proj_attributes_extended(ZedStoreDesc scan, TupleDesc tupledesc) +{ + MemoryContext oldcontext; + ZedStoreProjectData *proj_data = &scan->proj_data; + + /* if already initialized return */ + if (proj_data->num_proj_atts != 0) + return; + + zs_initialize_proj_attributes(tupledesc, proj_data); + + oldcontext = MemoryContextSwitchTo(proj_data->context); + /* Extra setup for bitmap and sample scans */ + if ((scan->rs_scan.rs_flags & SO_TYPE_BITMAPSCAN) || + (scan->rs_scan.rs_flags & SO_TYPE_SAMPLESCAN) || + (scan->rs_scan.rs_flags & SO_TYPE_ANALYZE)) + { + scan->bmscan_ntuples = 0; + scan->bmscan_tids = palloc(MAX_ITEMS_PER_LOGICAL_BLOCK * sizeof(zstid)); + + scan->bmscan_datums = palloc(proj_data->num_proj_atts * sizeof(Datum *)); + scan->bmscan_isnulls = palloc(proj_data->num_proj_atts * sizeof(bool *)); + for (int i = 0; i < proj_data->num_proj_atts; i++) + { + scan->bmscan_datums[i] = palloc(MAX_ITEMS_PER_LOGICAL_BLOCK * sizeof(Datum)); + scan->bmscan_isnulls[i] = palloc(MAX_ITEMS_PER_LOGICAL_BLOCK * sizeof(bool)); + } + } + MemoryContextSwitchTo(oldcontext); +} + +static TableScanDesc +zedstoream_beginscan_with_column_projection(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + ParallelTableScanDesc parallel_scan, + uint32 flags, + bool *project_columns) +{ + ZedStoreDesc scan; + + /* Sample scans have no snapshot, but we need one */ + if (!snapshot) + { + Assert(!(flags & SO_TYPE_SAMPLESCAN)); + snapshot = SnapshotAny; + } + + /* + * allocate and initialize scan descriptor + */ + scan = (ZedStoreDesc) palloc0(sizeof(ZedStoreDescData)); + + scan->rs_scan.rs_rd = relation; + scan->rs_scan.rs_snapshot = snapshot; + scan->rs_scan.rs_nkeys = nkeys; + scan->rs_scan.rs_flags = flags; + scan->rs_scan.rs_parallel = parallel_scan; + + /* + * we can use page-at-a-time mode if it's an MVCC-safe snapshot + */ + scan->state = ZSSCAN_STATE_UNSTARTED; + + /* + * we do this here instead of in initscan() because heap_rescan also calls + * initscan() and we don't want to allocate memory again + */ + if (nkeys > 0) + scan->rs_scan.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); + else + scan->rs_scan.rs_key = NULL; + + scan->proj_data.context = CurrentMemoryContext; + scan->proj_data.project_columns = project_columns; + + /* + * For a seqscan in a serializable transaction, acquire a predicate lock + * on the entire relation. This is required not only to lock all the + * matching tuples, but also to conflict with new insertions into the + * table. In an indexscan, we take page locks on the index pages covering + * the range specified in the scan qual, but in a heap scan there is + * nothing more fine-grained to lock. A bitmap scan is a different story, + * there we have already scanned the index and locked the index pages + * covering the predicate. But in that case we still have to lock any + * matching heap tuples. 
+ */ + if (!(flags & SO_TYPE_BITMAPSCAN) && + !(flags & SO_TYPE_ANALYZE)) + PredicateLockRelation(relation, snapshot); + + /* + * Currently, we don't have a stats counter for bitmap heap scans (but the + * underlying bitmap index scans will be counted) or sample scans (we only + * update stats for tuple fetches there) + */ + if (!(flags & SO_TYPE_BITMAPSCAN) && !(flags & SO_TYPE_SAMPLESCAN)) + pgstat_count_heap_scan(relation); + + return (TableScanDesc) scan; +} + +static TableScanDesc +zedstoream_beginscan(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + ParallelTableScanDesc parallel_scan, + uint32 flags) +{ + return zedstoream_beginscan_with_column_projection(relation, snapshot, + nkeys, key, parallel_scan, flags, NULL); +} + +static void +zedstoream_endscan(TableScanDesc sscan) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + ZedStoreProjectData *proj_data = &scan->proj_data; + + if (proj_data->proj_atts) + pfree(proj_data->proj_atts); + + if (proj_data->num_proj_atts > 0) + { + zsbt_tid_end_scan(&proj_data->btree_scans[0]); + for (int i = 1; i < proj_data->num_proj_atts; i++) + zsbt_attr_end_scan(&proj_data->btree_scans[i]); + } + + if (scan->rs_scan.rs_flags & SO_TEMP_SNAPSHOT) + UnregisterSnapshot(scan->rs_scan.rs_snapshot); + + if (proj_data->btree_scans) + pfree(proj_data->btree_scans); + pfree(scan); +} + +static void +zedstoream_rescan(TableScanDesc sscan, struct ScanKeyData *key, + bool set_params, bool allow_strat, + bool allow_sync, bool allow_pagemode) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + + /* these params don't do much in zedstore yet, but whatever */ + if (set_params) + { + if (allow_strat) + scan->rs_scan.rs_flags |= SO_ALLOW_STRAT; + else + scan->rs_scan.rs_flags &= ~SO_ALLOW_STRAT; + + if (allow_sync) + scan->rs_scan.rs_flags |= SO_ALLOW_SYNC; + else + scan->rs_scan.rs_flags &= ~SO_ALLOW_SYNC; + + if (allow_pagemode && scan->rs_scan.rs_snapshot && + IsMVCCSnapshot(scan->rs_scan.rs_snapshot)) + scan->rs_scan.rs_flags |= SO_ALLOW_PAGEMODE; + else + scan->rs_scan.rs_flags &= ~SO_ALLOW_PAGEMODE; + } + + if (scan->proj_data.num_proj_atts > 0) + { + zsbt_tid_end_scan(&scan->proj_data.btree_scans[0]); + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + zsbt_attr_end_scan(&scan->proj_data.btree_scans[i]); + } + + scan->state = ZSSCAN_STATE_UNSTARTED; +} + +static bool +zedstoream_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + ZedStoreProjectData *scan_proj = &scan->proj_data; + int i; + int slot_natts = slot->tts_tupleDescriptor->natts; + Datum *slot_values = slot->tts_values; + bool *slot_isnull = slot->tts_isnull; + + if (direction != ForwardScanDirection) + elog(ERROR, "backward scan not implemented in zedstore"); + + zs_initialize_proj_attributes(slot->tts_tupleDescriptor, scan_proj); + Assert((scan_proj->num_proj_atts - 1) <= slot_natts); + + /* + * Initialize the slot. + * + * We initialize all columns to NULL. The values for columns that are projected + * will be set to the actual values below, but it's important that non-projected + * columns are NULL. 
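+ *
+ * The loop below is a small state machine: UNSTARTED and FINISHED_RANGE
+ * pick the next TID range to scan (the whole table, or the next chunk of
+ * a parallel scan), SCANNING returns tuples from that range, and
+ * FINISHED ends the scan.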
+ */ + ExecClearTuple(slot); + for (i = 0; i < slot_natts; i++) + slot_isnull[i] = true; + + while (scan->state != ZSSCAN_STATE_FINISHED) + { + zstid this_tid; + Datum datum; + bool isnull; + + if (scan->state == ZSSCAN_STATE_UNSTARTED || + scan->state == ZSSCAN_STATE_FINISHED_RANGE) + { + MemoryContext oldcontext; + + if (scan->rs_scan.rs_parallel) + { + /* Allocate next range of TIDs to scan */ + if (!zs_parallelscan_nextrange(scan->rs_scan.rs_rd, + (ParallelZSScanDesc) scan->rs_scan.rs_parallel, + &scan->cur_range_start, &scan->cur_range_end)) + { + scan->state = ZSSCAN_STATE_FINISHED; + break; + } + } + else + { + if (scan->state == ZSSCAN_STATE_FINISHED_RANGE) + { + scan->state = ZSSCAN_STATE_FINISHED; + break; + } + scan->cur_range_start = MinZSTid; + scan->cur_range_end = MaxPlusOneZSTid; + } + + oldcontext = MemoryContextSwitchTo(scan_proj->context); + zsbt_tid_begin_scan(scan->rs_scan.rs_rd, + scan->cur_range_start, + scan->cur_range_end, + scan->rs_scan.rs_snapshot, + &scan_proj->btree_scans[0]); + scan_proj->btree_scans[0].serializable = true; + for (int i = 1; i < scan_proj->num_proj_atts; i++) + { + int attno = scan_proj->proj_atts[i]; + + zsbt_attr_begin_scan(scan->rs_scan.rs_rd, + slot->tts_tupleDescriptor, + attno, + scan->cur_range_start, + scan->cur_range_end, + &scan_proj->btree_scans[i]); + } + MemoryContextSwitchTo(oldcontext); + scan->state = ZSSCAN_STATE_SCANNING; + } + + /* We now have a range to scan. Find the next visible TID. */ + Assert(scan->state == ZSSCAN_STATE_SCANNING); + + this_tid = zsbt_tid_scan_next(&scan_proj->btree_scans[0]); + if (this_tid == InvalidZSTid) + { + scan->state = ZSSCAN_STATE_FINISHED_RANGE; + } + else + { + Assert (this_tid < scan->cur_range_end); + + /* Note: We don't need to predicate-lock tuples in Serializable mode, + * because in a sequential scan, we predicate-locked the whole table. + */ + + /* Fetch the datums of each attribute for this row */ + for (int i = 1; i < scan_proj->num_proj_atts; i++) + { + ZSBtreeScan *btscan = &scan_proj->btree_scans[i]; + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(btscan); + int natt; + + if (!zsbt_scan_next_fetch(btscan, &datum, &isnull, this_tid)) + zsbt_fill_missing_attribute_value(btscan, &datum, &isnull); + + /* + * flatten any ZS-TOASTed values, because the rest of the system + * doesn't know how to deal with them. 
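+ * (A ZS-TOAST pointer is an external varlena tagged VARTAG_ZEDSTORE;
+ * zedstore_toast_flatten() returns a plain, in-memory copy of the value.)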
+ */ + natt = scan_proj->proj_atts[i]; + + if (!isnull && attr->attlen == -1 && + VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE) + { + datum = zedstore_toast_flatten(scan->rs_scan.rs_rd, natt, this_tid, datum); + } + + /* Check that the values coming out of the b-tree are aligned properly */ + if (!isnull && attr->attlen == -1) + { + Assert (VARATT_IS_1B(datum) || INTALIGN(datum) == datum); + } + + if (natt != ZS_META_ATTRIBUTE_NUM) + { + Assert(natt > 0); + slot_values[natt - 1] = datum; + slot_isnull[natt - 1] = isnull; + } + } + } + + if (scan->state == ZSSCAN_STATE_FINISHED_RANGE) + { + zsbt_tid_end_scan(&scan_proj->btree_scans[0]); + for (int i = 1; i < scan_proj->num_proj_atts; i++) + zsbt_attr_end_scan(&scan_proj->btree_scans[i]); + } + else + { + Assert(scan->state == ZSSCAN_STATE_SCANNING); + slot->tts_tid = ItemPointerFromZSTid(this_tid); + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + slot->tts_flags &= ~TTS_FLAG_EMPTY; + + pgstat_count_heap_getnext(scan->rs_scan.rs_rd); + return true; + } + } + + ExecClearTuple(slot); + return false; +} + +static bool +zedstoream_tuple_tid_valid(TableScanDesc sscan, ItemPointer tid) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + zstid ztid = ZSTidFromItemPointer(*tid); + + if (scan->max_tid_to_scan == InvalidZSTid) + { + /* + * get the max tid once and store it + */ + scan->max_tid_to_scan = zsbt_get_last_tid(sscan->rs_rd); + } + + /* + * FIXME: should we get lowest TID as well to further optimize the check. + */ + if (ztid <= scan->max_tid_to_scan) + return true; + else + return false; +} + +static bool +zedstoream_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, + Snapshot snapshot) +{ + /* + * TODO: we didn't keep any visibility information about the tuple in the + * slot, so we have to fetch it again. A custom slot type might be a + * good idea.. + */ + zstid tid = ZSTidFromItemPointer(slot->tts_tid); + ZSBtreeScan meta_scan; + bool found; + + /* Use the meta-data tree for the visibility information. 
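+ * A row is visible iff its TID item in the TID tree is visible to the
+ * snapshot, so no attribute data needs to be fetched here.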
*/
+ zsbt_tid_begin_scan(rel, tid, tid + 1, snapshot, &meta_scan);
+
+ found = zsbt_tid_scan_next(&meta_scan) != InvalidZSTid;
+
+ zsbt_tid_end_scan(&meta_scan);
+
+ return found;
+}
+
+static TransactionId
+zedstoream_compute_xid_horizon_for_tuples(Relation rel,
+ ItemPointerData *items,
+ int nitems)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("function %s not implemented yet", __func__)));
+
+}
+
+static IndexFetchTableData *
+zedstoream_begin_index_fetch(Relation rel)
+{
+ ZedStoreIndexFetch zscan = palloc0(sizeof(ZedStoreIndexFetchData));
+
+ zscan->idx_fetch_data.rel = rel;
+ zscan->proj_data.context = CurrentMemoryContext;
+
+ return (IndexFetchTableData *) zscan;
+}
+
+static void
+zedstoream_fetch_set_column_projection(struct IndexFetchTableData *scan,
+ bool *project_columns)
+{
+ ZedStoreIndexFetch zscan = (ZedStoreIndexFetch) scan;
+ zscan->proj_data.project_columns = project_columns;
+}
+
+static void
+zedstoream_reset_index_fetch(IndexFetchTableData *scan)
+{
+ /* TODO: we could close the scans here, but currently we don't bother */
+}
+
+static void
+zedstoream_end_index_fetch(IndexFetchTableData *scan)
+{
+ ZedStoreIndexFetch zscan = (ZedStoreIndexFetch) scan;
+ ZedStoreProjectData *zscan_proj = &zscan->proj_data;
+
+ if (zscan_proj->num_proj_atts > 0)
+ {
+ zsbt_tid_end_scan(&zscan_proj->btree_scans[0]);
+ for (int i = 1; i < zscan_proj->num_proj_atts; i++)
+ zsbt_attr_end_scan(&zscan_proj->btree_scans[i]);
+ }
+
+ if (zscan_proj->proj_atts)
+ pfree(zscan_proj->proj_atts);
+
+ if (zscan_proj->btree_scans)
+ pfree(zscan_proj->btree_scans);
+ pfree(zscan);
+}
+
+static bool
+zedstoream_index_fetch_tuple(struct IndexFetchTableData *scan,
+ ItemPointer tid_p,
+ Snapshot snapshot,
+ TupleTableSlot *slot,
+ bool *call_again, bool *all_dead)
+{
+ bool result;
+
+ /*
+ * we don't do in-place updates, so this is essentially the same as
+ * fetch_row_version.
+ */
+ if (call_again)
+ *call_again = false;
+ if (all_dead)
+ *all_dead = false;
+
+ result = zedstoream_fetch_row((ZedStoreIndexFetchData *) scan, tid_p, snapshot, slot);
+ if (result)
+ {
+ /* FIXME: heapam acquires the predicate lock first, and then
+ * calls CheckForSerializableConflictOut(). We do it in the
+ * opposite order, because the CheckForSerializableConflictOut()
+ * call is already done in zsbt_get_last_tid(). Does it matter?
+ * I'm not sure.
+ */
+ PredicateLockTID(scan->rel, tid_p, snapshot);
+ }
+ return result;
+}
+
+/*
+ * Shared implementation of fetch_row_version and index_fetch_tuple callbacks.
+ */
+static bool
+zedstoream_fetch_row(ZedStoreIndexFetchData *fetch,
+ ItemPointer tid_p,
+ Snapshot snapshot,
+ TupleTableSlot *slot)
+{
+ Relation rel = fetch->idx_fetch_data.rel;
+ zstid tid = ZSTidFromItemPointer(*tid_p);
+ bool found = true;
+ ZedStoreProjectData *fetch_proj = &fetch->proj_data;
+
+ /* first time here, initialize */
+ if (fetch_proj->num_proj_atts == 0)
+ zs_initialize_proj_attributes(slot->tts_tupleDescriptor, fetch_proj);
+ else
+ {
+ /* If we have previous fetches still open, close them first */
+ zsbt_tid_end_scan(&fetch_proj->btree_scans[0]);
+ for (int i = 1; i < fetch_proj->num_proj_atts; i++)
+ zsbt_attr_end_scan(&fetch_proj->btree_scans[i]);
+ }
+
+ /*
+ * Initialize the slot.
+ *
+ * If we're not fetching all columns, initialize the unfetched values
+ * in the slot to NULL.
(Actually, this initializes all to NULL, and the + * code below will overwrite them for the columns that are projected) + */ + ExecClearTuple(slot); + for (int i = 0; i < slot->tts_tupleDescriptor->natts; i++) + slot->tts_isnull[i] = true; + + zsbt_tid_begin_scan(rel, tid, tid + 1, snapshot, &fetch_proj->btree_scans[0]); + fetch_proj->btree_scans[0].serializable = true; + found = zsbt_tid_scan_next(&fetch_proj->btree_scans[0]) != InvalidZSTid; + if (found) + { + for (int i = 1; i < fetch_proj->num_proj_atts; i++) + { + int natt = fetch_proj->proj_atts[i]; + ZSBtreeScan *btscan = &fetch_proj->btree_scans[i]; + Form_pg_attribute attr; + Datum datum; + bool isnull; + + zsbt_attr_begin_scan(rel, slot->tts_tupleDescriptor, natt, tid, tid + 1, + btscan); + + attr = ZSBtreeScanGetAttInfo(btscan); + if (zsbt_scan_next_fetch(btscan, &datum, &isnull, tid)) + { + /* + * flatten any ZS-TOASTed values, because the rest of the system + * doesn't know how to deal with them. + */ + if (!isnull && attr->attlen == -1 && + VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE) + { + datum = zedstore_toast_flatten(rel, natt, tid, datum); + } + } + else + zsbt_fill_missing_attribute_value(btscan, &datum, &isnull); + + slot->tts_values[natt - 1] = datum; + slot->tts_isnull[natt - 1] = isnull; + } + } + + if (found) + { + slot->tts_tid = ItemPointerFromZSTid(tid); + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + slot->tts_flags &= ~TTS_FLAG_EMPTY; + return true; + } + + return false; +} + +static void +zedstoream_index_validate_scan(Relation baseRelation, + Relation indexRelation, + IndexInfo *indexInfo, + Snapshot snapshot, + ValidateIndexState *state) +{ + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + bool *proj; + int attno; + TableScanDesc scan; + ItemPointerData idx_ptr; + bool tuplesort_empty = false; + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = table_slot_create(baseRelation, NULL); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. We need just those tuples + * satisfying the passed-in reference snapshot. We must disable syncscan + * here, because it's critical that we read from block zero forward to + * match the sorted TIDs. + */ + + /* + * TODO: It would be very good to fetch only the columns we need. 
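+ *
+ * (The projection built below covers the index key columns plus any
+ * columns referenced by the index predicate and index expressions.)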
+ */ + proj = palloc0(baseRelation->rd_att->natts * sizeof(bool)); + for (attno = 0; attno < indexInfo->ii_NumIndexKeyAttrs; attno++) + { + Assert(indexInfo->ii_IndexAttrNumbers[attno] <= baseRelation->rd_att->natts); + /* skip expressions */ + if (indexInfo->ii_IndexAttrNumbers[attno] > 0) + proj[indexInfo->ii_IndexAttrNumbers[attno] - 1] = true; + } + GetNeededColumnsForNode((Node *)indexInfo->ii_Predicate, proj, + baseRelation->rd_att->natts); + GetNeededColumnsForNode((Node *)indexInfo->ii_Expressions, proj, + baseRelation->rd_att->natts); + + scan = table_beginscan_with_column_projection(baseRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + proj); + + /* + * Scan all tuples matching the snapshot. + */ + ItemPointerSet(&idx_ptr, 0, 0); /* this is less than any real TID */ + while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) + { + ItemPointerData tup_ptr = slot->tts_tid; + HeapTuple heapTuple; + int cmp; + + CHECK_FOR_INTERRUPTS(); + + /* + * TODO: Once we have in-place updates, like HOT, this will need + * to work harder, like heapam's function. + */ + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + if (tuplesort_empty) + cmp = -1; + else + { + while ((cmp = ItemPointerCompare(&tup_ptr, &idx_ptr)) > 0) + { + Datum ts_val; + bool ts_isnull; + + tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, + &ts_val, &ts_isnull, NULL); + if (!tuplesort_empty) + { + Assert(!ts_isnull); + itemptr_decode(&idx_ptr, DatumGetInt64(ts_val)); + + /* If int8 is pass-by-ref, free (encoded) TID Datum memory */ +#ifndef USE_FLOAT8_BYVAL + pfree(DatumGetPointer(ts_val)); +#endif + break; + } + else + { + /* Be tidy */ + ItemPointerSetInvalid(&idx_ptr); + cmp = -1; + } + } + } + if (cmp < 0) + { + /* This item is not in the index */ + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use in + * this index, and note which are null. This also performs evaluation + * of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Call the AM's callback routine to process the tuple */ + heapTuple = ExecCopySlotHeapTuple(slot); + heapTuple->t_self = slot->tts_tid; + index_insert(indexRelation, values, isnull, &tup_ptr, baseRelation, + indexInfo->ii_Unique ? 
+ UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + indexInfo); + pfree(heapTuple); + + state->tups_inserted += 1; + } + } + + table_endscan(scan); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; +} + +static double +zedstoream_index_build_range_scan(Relation baseRelation, + Relation indexRelation, + IndexInfo *indexInfo, + bool allow_sync, + bool anyvisible, + bool progress, + BlockNumber start_blockno, + BlockNumber numblocks, + IndexBuildCallback callback, + void *callback_state, + TableScanDesc scan) +{ + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + double reltuples; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + Snapshot snapshot; + SnapshotData NonVacuumableSnapshot; + bool need_unregister_snapshot = false; + TransactionId OldestXmin; + +#ifdef USE_ASSERT_CHECKING + bool checking_uniqueness; + /* See whether we're verifying uniqueness/exclusion properties */ + checking_uniqueness = (indexInfo->ii_Unique || + indexInfo->ii_ExclusionOps != NULL); + + /* + * "Any visible" mode is not compatible with uniqueness checks; make sure + * only one of those is requested. + */ + Assert(!(anyvisible && checking_uniqueness)); +#endif + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = table_slot_create(baseRelation, NULL); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. In a normal index build, we use + * SnapshotAny because we must retrieve all tuples and do our own time + * qual checks (because we have to index RECENTLY_DEAD tuples). In a + * concurrent build, or during bootstrap, we take a regular MVCC snapshot + * and index whatever's live according to that. + */ + OldestXmin = InvalidTransactionId; + + /* okay to ignore lazy VACUUMs here */ + if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) + OldestXmin = GetOldestXmin(baseRelation, PROCARRAY_FLAGS_VACUUM); + + /* + * TODO: It would be very good to fetch only the columns we need. + */ + if (!scan) + { + bool *proj; + int attno; + + /* + * Serial index build. + * + * Must begin our own zedstore scan in this case. We may also need to + * register a snapshot whose lifetime is under our direct control. 
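+ *
+ * (During bootstrap or a concurrent build we register and use an MVCC
+ * snapshot; otherwise we scan with a non-vacuumable snapshot so that
+ * completely dead rows are skipped.)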
+ */ + if (!TransactionIdIsValid(OldestXmin)) + { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + need_unregister_snapshot = true; + } + else + { + /* leave out completely dead items even with SnapshotAny */ + InitNonVacuumableSnapshot(NonVacuumableSnapshot, OldestXmin); + snapshot = &NonVacuumableSnapshot; + } + + proj = palloc0(baseRelation->rd_att->natts * sizeof(bool)); + for (attno = 0; attno < indexInfo->ii_NumIndexKeyAttrs; attno++) + { + Assert(indexInfo->ii_IndexAttrNumbers[attno] <= baseRelation->rd_att->natts); + /* skip expressions */ + if (indexInfo->ii_IndexAttrNumbers[attno] > 0) + proj[indexInfo->ii_IndexAttrNumbers[attno] - 1] = true; + } + + GetNeededColumnsForNode((Node *)indexInfo->ii_Predicate, proj, + baseRelation->rd_att->natts); + GetNeededColumnsForNode((Node *)indexInfo->ii_Expressions, proj, + baseRelation->rd_att->natts); + + scan = table_beginscan_with_column_projection(baseRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + proj); + + if (start_blockno != 0 || numblocks != InvalidBlockNumber) + { + ZedStoreDesc zscan = (ZedStoreDesc) scan; + ZedStoreProjectData *zscan_proj = &zscan->proj_data; + + zscan->cur_range_start = ZSTidFromBlkOff(start_blockno, 1); + zscan->cur_range_end = ZSTidFromBlkOff(numblocks, 1); + + /* FIXME: when can 'num_proj_atts' be 0? */ + if (zscan_proj->num_proj_atts > 0) + { + zsbt_tid_begin_scan(zscan->rs_scan.rs_rd, + zscan->cur_range_start, + zscan->cur_range_end, + zscan->rs_scan.rs_snapshot, + &zscan_proj->btree_scans[0]); + for (int i = 1; i < zscan_proj->num_proj_atts; i++) + { + int natt = zscan_proj->proj_atts[i]; + + zsbt_attr_begin_scan(zscan->rs_scan.rs_rd, + RelationGetDescr(zscan->rs_scan.rs_rd), + natt, + zscan->cur_range_start, + zscan->cur_range_end, + &zscan_proj->btree_scans[i]); + } + } + zscan->state = ZSSCAN_STATE_SCANNING; + } + } + else + { + /* + * Parallel index build. + * + * Parallel case never registers/unregisters own snapshot. Snapshot + * is taken from parallel zedstore scan, and is SnapshotAny or an MVCC + * snapshot, based on same criteria as serial case. + */ + Assert(!IsBootstrapProcessingMode()); + Assert(allow_sync); + Assert(start_blockno == 0); + Assert(numblocks == InvalidBlockNumber); + snapshot = scan->rs_snapshot; + + if (snapshot == SnapshotAny) + { + /* leave out completely dead items even with SnapshotAny */ + InitNonVacuumableSnapshot(NonVacuumableSnapshot, OldestXmin); + snapshot = &NonVacuumableSnapshot; + } + } + + /* + * Must call GetOldestXmin() with SnapshotAny. Should never call + * GetOldestXmin() with MVCC snapshot. (It's especially worth checking + * this for parallel builds, since ambuild routines that support parallel + * builds must work these details out for themselves.) + */ + Assert(snapshot == &NonVacuumableSnapshot || IsMVCCSnapshot(snapshot)); + Assert(snapshot == &NonVacuumableSnapshot ? TransactionIdIsValid(OldestXmin) : + !TransactionIdIsValid(OldestXmin)); + Assert(snapshot == &NonVacuumableSnapshot || !anyvisible); + + reltuples = 0; + + /* + * Scan all tuples in the base relation. 
+ */ + while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) + { + bool tupleIsAlive; + HeapTuple heapTuple; + + if (numblocks != InvalidBlockNumber && + ItemPointerGetBlockNumber(&slot->tts_tid) >= numblocks) + break; + + CHECK_FOR_INTERRUPTS(); + + /* table_scan_getnextslot did the visibility check */ + tupleIsAlive = true; + reltuples += 1; + + /* + * TODO: Once we have in-place updates, like HOT, this will need + * to work harder, to figure out which tuple version to index. + */ + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use in + * this index, and note which are null. This also performs evaluation + * of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Call the AM's callback routine to process the tuple */ + heapTuple = ExecCopySlotHeapTuple(slot); + heapTuple->t_self = slot->tts_tid; + callback(indexRelation, heapTuple, values, isnull, tupleIsAlive, + callback_state); + pfree(heapTuple); + } + + table_endscan(scan); + + /* we can now forget our snapshot, if set and registered by us */ + if (need_unregister_snapshot) + UnregisterSnapshot(snapshot); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; + + return reltuples; +} + +static void +zedstoream_finish_bulk_insert(Relation relation, int options) +{ + /* + * If we skipped writing WAL, then we need to sync the zedstore (but not + * indexes since those use WAL anyway / don't go through tableam) + */ + if (options & HEAP_INSERT_SKIP_WAL) + heap_sync(relation); +} + +/* ------------------------------------------------------------------------ + * DDL related callbacks for zedstore AM. + * ------------------------------------------------------------------------ + */ + +static void +zedstoream_relation_set_new_filenode(Relation rel, + const RelFileNode *newrnode, + char persistence, + TransactionId *freezeXid, + MultiXactId *minmulti) +{ + SMgrRelation srel; + + /* + * Initialize to the minimum XID that could put tuples in the table. We + * know that no xacts older than RecentXmin are still running, so that + * will do. + */ + *freezeXid = RecentXmin; + + /* + * Similarly, initialize the minimum Multixact to the first value that + * could possibly be stored in tuples in the table. Running transactions + * could reuse values from their local cache, so we are careful to + * consider all currently running multis. + * + * XXX this could be refined further, but is it worth the hassle? + */ + *minmulti = GetOldestMultiXactId(); + + srel = RelationCreateStorage(*newrnode, persistence); + + /* + * If required, set up an init fork for an unlogged table so that it can + * be correctly reinitialized on restart. An immediate sync is required + * even if the page has been logged, because the write did not go through + * shared_buffers and therefore a concurrent checkpoint may have moved the + * redo pointer past our xlog record. Recovery may as well remove it + * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE + * record. Therefore, logging is necessary even if wal_level=minimal. 
+ */
+ if (persistence == RELPERSISTENCE_UNLOGGED)
+ {
+ Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
+ rel->rd_rel->relkind == RELKIND_MATVIEW ||
+ rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+ smgrcreate(srel, INIT_FORKNUM, false);
+ log_smgrcreate(newrnode, INIT_FORKNUM);
+ smgrimmedsync(srel, INIT_FORKNUM);
+ }
+}
+
+static void
+zedstoream_relation_nontransactional_truncate(Relation rel)
+{
+ RelationTruncate(rel, 0);
+}
+
+static void
+zedstoream_relation_copy_data(Relation rel, const RelFileNode *newrnode)
+{
+ SMgrRelation dstrel;
+
+ dstrel = smgropen(*newrnode, rel->rd_backend);
+ RelationOpenSmgr(rel);
+
+ /*
+ * Since we copy the file directly without looking at the shared buffers,
+ * we'd better first flush out any pages of the source relation that are
+ * in shared buffers. We assume no new changes will be made while we are
+ * holding exclusive lock on the rel.
+ */
+ FlushRelationBuffers(rel);
+
+ /*
+ * Create and copy all the relation, and schedule unlinking of the
+ * old physical file.
+ *
+ * NOTE: any conflict in relfilenode value will be caught in
+ * RelationCreateStorage().
+ *
+ * NOTE: There is only the main fork in zedstore. Otherwise
+ * this would need to copy other forks, too.
+ */
+ RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence);
+
+ /* copy main fork */
+ RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM,
+ rel->rd_rel->relpersistence);
+
+ /* drop old relation, and close new one */
+ RelationDropStorage(rel);
+ smgrclose(dstrel);
+}
+
+/*
+ * Subroutine of the zedstoream_relation_copy_for_cluster() callback.
+ *
+ * Creates the TID item with correct visibility information for the
+ * given tuple in the old table. Returns the tid of the tuple in the
+ * new table, or InvalidZSTid if this tuple can be left out completely.
+ *
+ * FIXME: This breaks UPDATE chains. I.e. after this is done, an UPDATE
+ * looks like DELETE + INSERT, instead of an UPDATE, to any transaction that
+ * might try to follow the update chain.
+ */
+static zstid
+zs_cluster_process_tuple(Relation OldHeap, Relation NewHeap,
+ zstid oldtid, ZSUndoRecPtr old_undoptr,
+ ZSUndoRecPtr recent_oldest_undo,
+ TransactionId OldestXmin)
+{
+ TransactionId this_xmin;
+ CommandId this_cmin;
+ TransactionId this_xmax;
+ CommandId this_cmax;
+ bool this_changedPart;
+ ZSUndoRecPtr undo_ptr;
+ ZSUndoRec *undorec;
+
+ /*
+ * Follow the chain of UNDO records for this tuple, to find the
+ * transaction that originally inserted the row (xmin/cmin), and
+ * the transaction that deleted or updated it away, if any (xmax/cmax).
+ */
+ this_xmin = FrozenTransactionId;
+ this_cmin = InvalidCommandId;
+ this_xmax = InvalidTransactionId;
+ this_cmax = InvalidCommandId;
+
+ undo_ptr = old_undoptr;
+ for (;;)
+ {
+ if (undo_ptr.counter < recent_oldest_undo.counter)
+ {
+ /* This tuple version is visible to everyone. */
+ break;
+ }
+
+ /* Fetch the next UNDO record. */
+ undorec = zsundo_fetch(OldHeap, undo_ptr);
+
+ if (undorec->type == ZSUNDO_TYPE_INSERT)
+ {
+ if (!TransactionIdIsCurrentTransactionId(undorec->xid) &&
+ !TransactionIdIsInProgress(undorec->xid) &&
+ !TransactionIdDidCommit(undorec->xid))
+ {
+ /*
+ * inserter aborted or crashed. This row is not visible to
+ * anyone, including any later tuple versions we might have
+ * seen.
+ */
+ this_xmin = InvalidTransactionId;
+ break;
+ }
+ else
+ {
+ /* Inserter committed. */
+ this_xmin = undorec->xid;
+ this_cmin = undorec->cid;
+
+ /* we know everything there is to know about this tuple version.
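+ * (Any xmax/cmax was already collected from a newer UNDO record
+ * earlier in this loop.)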
*/ + break; + } + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* Ignore tuple locks for now. + * + * FIXME: we should propagate them to the new copy of the table + */ + undo_ptr = undorec->prevundorec; + continue; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + /* Row was deleted (or updated away). */ + if (!TransactionIdIsCurrentTransactionId(undorec->xid) && + !TransactionIdIsInProgress(undorec->xid) && + !TransactionIdDidCommit(undorec->xid)) + { + /* deleter aborted or crashed. The previous record should + * be an insertion (possibly with some tuple-locking in + * between). We'll remember the tuple when we see the + * insertion. + */ + undo_ptr = undorec->prevundorec; + continue; + } + else + { + /* deleter committed or is still in progress. */ + if (TransactionIdPrecedes(undorec->xid, OldestXmin)) + { + /* the deletion is visible to everyone. We can skip the row completely. */ + this_xmin = InvalidTransactionId; + break; + } + else + { + /* deleter committed or is in progress. Remember that it was + * deleted by this XID. + */ + this_xmax = undorec->xid; + this_cmax = undorec->cid; + if (undorec->type == ZSUNDO_TYPE_DELETE) + this_changedPart = ((ZSUndoRec_Delete *) undorec)->changedPart; + else + this_changedPart = false; + + /* follow the UNDO chain to find information about the inserting + * transaction (xmin/cmin) + */ + undo_ptr = undorec->prevundorec; + continue; + } + } + } + } + + /* + * We now know the visibility of this tuple. Re-create it in the new table. + */ + if (this_xmin != InvalidTransactionId) + { + /* Insert the first version of the row. */ + ZSUndoRecPtr prevundoptr; + zstid newtid = InvalidZSTid; + + /* First, insert the tuple. */ + ZSUndoRecPtrInitialize(&prevundoptr); + zsbt_tid_multi_insert(NewHeap, + &newtid, 1, + this_xmin, + this_cmin, + INVALID_SPECULATIVE_TOKEN, + prevundoptr); + + /* And if the tuple was deleted/updated away, do the same in the new table. */ + if (this_xmax != InvalidTransactionId) + { + TM_Result delete_result; + + /* tuple was deleted. */ + delete_result = zsbt_tid_delete(NewHeap, newtid, + this_xmax, this_cmax, + NULL, NULL, false, NULL, this_changedPart); + if (delete_result != TM_Ok) + elog(ERROR, "tuple deletion failed during table rewrite"); + } + return newtid; + } + else + return InvalidZSTid; +} + + +static void +zedstoream_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, + Relation OldIndex, bool use_sort, + TransactionId OldestXmin, + TransactionId *xid_cutoff, + MultiXactId *multi_cutoff, + double *num_tuples, + double *tups_vacuumed, + double *tups_recently_dead) +{ + TupleDesc olddesc; + ZSBtreeScan meta_scan; + ZSBtreeScan *attr_scans; + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(OldHeap); + int attno; + IndexScanDesc indexScan; + + olddesc = RelationGetDescr(OldHeap), + + attr_scans = palloc((olddesc->natts + 1) * sizeof(ZSBtreeScan)); + + /* + * Scan the old table. We ignore any old updated-away tuple versions, + * and only stop at the latest tuple version of each row. At the latest + * version, follow the update chain to get all the old versions of that + * row, too. That way, the whole update chain is processed in one go, + * and can be reproduced in the new table. 
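+ *
+ * zs_cluster_process_tuple() does the per-row UNDO chain walk and
+ * recreates the row's visibility information in the new table.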
+ */ + zsbt_tid_begin_scan(OldHeap, MinZSTid, MaxPlusOneZSTid, + SnapshotAny, &meta_scan); + + for (attno = 1; attno <= olddesc->natts; attno++) + { + if (TupleDescAttr(olddesc, attno - 1)->attisdropped) + continue; + + zsbt_attr_begin_scan(OldHeap, + olddesc, + attno, + MinZSTid, + MaxPlusOneZSTid, + &attr_scans[attno]); + } + + /* TODO: sorting not implemented yet. (it would require materializing each + * row into a HeapTuple or something like that, which could carry the xmin/xmax + * information through the sorter). + */ + use_sort = false; + + /* + * Prepare to scan the OldHeap. To ensure we see recently-dead tuples + * that still need to be copied, we scan with SnapshotAny and use + * HeapTupleSatisfiesVacuum for the visibility test. + */ + if (OldIndex != NULL && !use_sort) + { + const int ci_index[] = { + PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_INDEX_RELID + }; + int64 ci_val[2]; + + /* Set phase and OIDOldIndex to columns */ + ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; + ci_val[1] = RelationGetRelid(OldIndex); + pgstat_progress_update_multi_param(2, ci_index, ci_val); + + indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0); + index_rescan(indexScan, NULL, 0, NULL, 0); + } + else + { + /* In scan-and-sort mode and also VACUUM FULL, set phase */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); + + indexScan = NULL; + + /* Set total heap blocks */ + /* TODO */ +#if 0 + pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, + heapScan->rs_nblocks); +#endif + } + + for (;;) + { + zstid old_tid; + ZSUndoRecPtr old_undoptr; + zstid new_tid; + Datum datum; + bool isnull; + zstid fetchtid = InvalidZSTid; + + CHECK_FOR_INTERRUPTS(); + + if (indexScan != NULL) + { + ItemPointer itemptr; + + itemptr = index_getnext_tid(indexScan, ForwardScanDirection); + if (!itemptr) + break; + + /* Since we used no scan keys, should never need to recheck */ + if (indexScan->xs_recheck) + elog(ERROR, "CLUSTER does not support lossy index conditions"); + + fetchtid = ZSTidFromItemPointer(*itemptr); + zsbt_tid_reset_scan(&meta_scan, fetchtid); + old_tid = zsbt_tid_scan_next(&meta_scan); + } + else + { + old_tid = zsbt_tid_scan_next(&meta_scan); + fetchtid = old_tid; + } + if (old_tid == InvalidZSTid) + break; + if (old_tid != fetchtid) + break; + old_undoptr = meta_scan.array_undoptr; + + new_tid = zs_cluster_process_tuple(OldHeap, NewHeap, + old_tid, old_undoptr, + recent_oldest_undo, + OldestXmin); + if (new_tid != InvalidZSTid) + { + /* Fetch the attributes and write them out */ + for (attno = 1; attno <= olddesc->natts; attno++) + { + Form_pg_attribute att = TupleDescAttr(olddesc, attno - 1); + Datum toastptr = (Datum) 0; + + if (att->attisdropped) + { + datum = (Datum) 0; + isnull = true; + } + else + { + if (indexScan) + zsbt_attr_reset_scan(&attr_scans[attno], old_tid); + + if (!zsbt_scan_next_fetch(&attr_scans[attno], &datum, &isnull, old_tid)) + zsbt_fill_missing_attribute_value(&attr_scans[attno], &datum, &isnull); + } + + /* flatten and re-toast any ZS-TOASTed values */ + if (!isnull && att->attlen == -1) + { + if (VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE) + { + datum = zedstore_toast_flatten(OldHeap, attno, old_tid, datum); + } + + if (VARSIZE_ANY_EXHDR(datum) > MaxZedStoreDatumSize) + { + toastptr = datum = zedstore_toast_datum(NewHeap, attno, datum); + } + } + + zsbt_attr_multi_insert(NewHeap, attno, &datum, &isnull, &new_tid, 1); + + if (toastptr != (Datum) 0) + 
zedstore_toast_finish(NewHeap, attno, toastptr, new_tid); + } + } + } + + if (indexScan != NULL) + index_endscan(indexScan); + + zsbt_tid_end_scan(&meta_scan); + for (attno = 1; attno <= olddesc->natts; attno++) + { + if (TupleDescAttr(olddesc, attno - 1)->attisdropped) + continue; + + zsbt_attr_end_scan(&attr_scans[attno]); + } +} + +/* + * FIXME: The ANALYZE API is problematic for us. acquire_sample_rows() calls + * RelationGetNumberOfBlocks() directly on the relation, and chooses the + * block numbers to sample based on that. But the logical block numbers + * have little to do with physical ones in zedstore. + */ +static bool +zedstoream_scan_analyze_next_block(TableScanDesc sscan, BlockNumber blockno, + BufferAccessStrategy bstrategy) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + Relation rel = scan->rs_scan.rs_rd; + int ntuples; + ZSBtreeScan btree_scan; + zstid tid; + + /* TODO: for now, assume that we need all columns */ + zs_initialize_proj_attributes_extended(scan, RelationGetDescr(rel)); + + ntuples = 0; + zsbt_tid_begin_scan(scan->rs_scan.rs_rd, + ZSTidFromBlkOff(blockno, 1), + ZSTidFromBlkOff(blockno + 1, 1), + scan->rs_scan.rs_snapshot, + &btree_scan); + /* + * TODO: it would be good to pass the next expected TID down to zsbt_scan_next, + * so that it could skip over to it more efficiently. + */ + ntuples = 0; + while ((tid = zsbt_tid_scan_next(&btree_scan)) != InvalidZSTid) + { + Assert(ZSTidGetBlockNumber(tid) == blockno); + scan->bmscan_tids[ntuples] = tid; + ntuples++; + } + zsbt_tid_end_scan(&btree_scan); + + if (ntuples) + { + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + { + int natt = scan->proj_data.proj_atts[i]; + ZSBtreeScan btree_scan; + Datum datum; + bool isnull; + Datum *datums = scan->bmscan_datums[i]; + bool *isnulls = scan->bmscan_isnulls[i]; + + zsbt_attr_begin_scan(scan->rs_scan.rs_rd, + RelationGetDescr(scan->rs_scan.rs_rd), + natt, + ZSTidFromBlkOff(blockno, 1), + ZSTidFromBlkOff(blockno + 1, 1), + &btree_scan); + for (int n = 0; n < ntuples; n++) + { + zstid tid = scan->bmscan_tids[n]; + if (zsbt_scan_next_fetch(&btree_scan, &datum, &isnull, tid)) + { + Assert(ZSTidGetBlockNumber(tid) == blockno); + } + else + zsbt_fill_missing_attribute_value(&btree_scan, &datum, &isnull); + + /* + * have to make a copy because we close the scan immediately. + * FIXME: I think this leaks into a too-long-lived context + */ + if (!isnull) + datum = zs_datumCopy(datum, + ZSBtreeScanGetAttInfo(&btree_scan)->attbyval, + ZSBtreeScanGetAttInfo(&btree_scan)->attlen); + datums[n] = datum; + isnulls[n] = isnull; + } + zsbt_attr_end_scan(&btree_scan); + } + } + + scan->bmscan_nexttuple = 0; + scan->bmscan_ntuples = ntuples; + + return true; +} + +static bool +zedstoream_scan_analyze_next_tuple(TableScanDesc sscan, TransactionId OldestXmin, + double *liverows, double *deadrows, + TupleTableSlot *slot) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + zstid tid; + + if (scan->bmscan_nexttuple >= scan->bmscan_ntuples) + return false; + /* + * projection attributes were created based on Relation tuple descriptor + * it better match TupleTableSlot. 
+ */
+ Assert((scan->proj_data.num_proj_atts - 1) <= slot->tts_tupleDescriptor->natts);
+ tid = scan->bmscan_tids[scan->bmscan_nexttuple];
+ for (int i = 1; i < scan->proj_data.num_proj_atts; i++)
+ {
+ int natt = scan->proj_data.proj_atts[i];
+ Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, natt - 1);
+
+ Datum datum;
+ bool isnull;
+
+ datum = (scan->bmscan_datums[i])[scan->bmscan_nexttuple];
+ isnull = (scan->bmscan_isnulls[i])[scan->bmscan_nexttuple];
+
+ /*
+ * flatten any ZS-TOASTed values, because the rest of the system
+ * doesn't know how to deal with them.
+ */
+ if (!isnull && att->attlen == -1 &&
+ VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE)
+ {
+ datum = zedstore_toast_flatten(scan->rs_scan.rs_rd, natt, tid, datum);
+ }
+
+ slot->tts_values[natt - 1] = datum;
+ slot->tts_isnull[natt - 1] = isnull;
+ }
+ slot->tts_tid = ItemPointerFromZSTid(tid);
+ slot->tts_nvalid = slot->tts_tupleDescriptor->natts;
+ slot->tts_flags &= ~TTS_FLAG_EMPTY;
+
+ scan->bmscan_nexttuple++;
+ (*liverows)++;
+
+ return true;
+}
+
+/* ------------------------------------------------------------------------
+ * Miscellaneous callbacks for the zedstore AM
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * FIXME: Implement this function in the way that suits zedstore best. The
+ * return value is, for example, used by ANALYZE to find which blocks to
+ * sample.
+ */
+static uint64
+zedstoream_relation_size(Relation rel, ForkNumber forkNumber)
+{
+ uint64 nblocks = 0;
+
+ /* Open it at the smgr level if not already done */
+ RelationOpenSmgr(rel);
+ nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
+ return nblocks * BLCKSZ;
+}
+
+/*
+ * Zedstore stores TOAST chunks within the table file itself, so it doesn't
+ * need a separate toast table to be created. Returning false from this
+ * callback avoids the creation of a toast table.
+ */
+static bool
+zedstoream_relation_needs_toast_table(Relation rel)
+{
+ return false;
+}
+
+/* ------------------------------------------------------------------------
+ * Planner related callbacks for the zedstore AM
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * Currently this is an exact duplicate of heapam_estimate_rel_size().
+ * TODO: tune it based on zedstore storage.
+ */
+static void
+zedstoream_relation_estimate_size(Relation rel, int32 *attr_widths,
+ BlockNumber *pages, double *tuples,
+ double *allvisfrac)
+{
+ BlockNumber curpages;
+ BlockNumber relpages;
+ double reltuples;
+ BlockNumber relallvisible;
+ double density;
+
+ /* it has storage, ok to call the smgr */
+ curpages = RelationGetNumberOfBlocks(rel);
+
+ /* coerce values in pg_class to more desirable types */
+ relpages = (BlockNumber) rel->rd_rel->relpages;
+ reltuples = (double) rel->rd_rel->reltuples;
+ relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
+
+ /*
+ * HACK: if the relation has never yet been vacuumed, use a minimum size
+ * estimate of 10 pages. The idea here is to avoid assuming a
+ * newly-created table is really small, even if it currently is, because
+ * that may not be true once some data gets loaded into it. Once a vacuum
+ * or analyze cycle has been done on it, it's more reasonable to believe
+ * the size is somewhat stable.
+ *
+ * (Note that this is only an issue if the plan gets cached and used again
+ * after the table has been filled. What we're trying to avoid is using a
+ * nestloop-type plan on a table that has grown substantially since the
+ * plan was made.
Normally, autovacuum/autoanalyze will occur once enough + * inserts have happened and cause cached-plan invalidation; but that + * doesn't happen instantaneously, and it won't happen at all for cases + * such as temporary tables.) + * + * We approximate "never vacuumed" by "has relpages = 0", which means this + * will also fire on genuinely empty relations. Not great, but + * fortunately that's a seldom-seen case in the real world, and it + * shouldn't degrade the quality of the plan too much anyway to err in + * this direction. + * + * If the table has inheritance children, we don't apply this heuristic. + * Totally empty parent tables are quite common, so we should be willing + * to believe that they are empty. + */ + if (curpages < 10 && + relpages == 0 && + !rel->rd_rel->relhassubclass) + curpages = 10; + + /* report estimated # pages */ + *pages = curpages; + /* quick exit if rel is clearly empty */ + if (curpages == 0) + { + *tuples = 0; + *allvisfrac = 0; + return; + } + + /* estimate number of tuples from previous tuple density */ + if (relpages > 0) + density = reltuples / (double) relpages; + else + { + /* + * When we have no data because the relation was truncated, estimate + * tuple width from attribute datatypes. We assume here that the + * pages are completely full, which is OK for tables (since they've + * presumably not been VACUUMed yet) but is probably an overestimate + * for indexes. Fortunately get_relation_info() can clamp the + * overestimate to the parent table's size. + * + * Note: this code intentionally disregards alignment considerations, + * because (a) that would be gilding the lily considering how crude + * the estimate is, and (b) it creates platform dependencies in the + * default plans which are kind of a headache for regression testing. + */ + int32 tuple_width; + + tuple_width = get_rel_data_width(rel, attr_widths); + tuple_width += MAXALIGN(SizeofHeapTupleHeader); + tuple_width += sizeof(ItemIdData); + /* note: integer division is intentional here */ + density = (BLCKSZ - SizeOfPageHeaderData) / tuple_width; + } + *tuples = rint(density * (double) curpages); + + /* + * We use relallvisible as-is, rather than scaling it up like we do for + * the pages and tuples counts, on the theory that any pages added since + * the last VACUUM are most likely not marked all-visible. But costsize.c + * wants it converted to a fraction. + */ + if (relallvisible == 0 || curpages <= 0) + *allvisfrac = 0; + else if ((double) relallvisible >= curpages) + *allvisfrac = 1; + else + *allvisfrac = (double) relallvisible / curpages; +} + +/* ------------------------------------------------------------------------ + * Executor related callbacks for the zedstore AM + * ------------------------------------------------------------------------ + */ + +static bool +zedstoream_scan_bitmap_next_block(TableScanDesc sscan, + TBMIterateResult *tbmres) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + BlockNumber tid_blkno = tbmres->blockno; + int ntuples; + ZSBtreeScan btree_scan; + zstid tid; + int noff = 0; + + zs_initialize_proj_attributes_extended(scan, RelationGetDescr(scan->rs_scan.rs_rd)); + + /* + * Our strategy for a bitmap scan is to scan the tree of each attribute, + * starting at the given logical block number, and store all the datums + * in the scan struct. zedstoream_scan_analyze_next_tuple() then just + * needs to store the datums of the next TID in the slot. + * + * An alternative would be to keep the scans of each attribute open, + * like in a sequential scan. 
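+ * That would avoid re-descending the attribute B-trees for every
+ * bitmap block, at the cost of keeping more scans open.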
I'm not sure which is better. + */ + ntuples = 0; + zsbt_tid_begin_scan(scan->rs_scan.rs_rd, + ZSTidFromBlkOff(tid_blkno, 1), + ZSTidFromBlkOff(tid_blkno + 1, 1), + scan->rs_scan.rs_snapshot, + &btree_scan); + btree_scan.serializable = true; + while ((tid = zsbt_tid_scan_next(&btree_scan)) != InvalidZSTid) + { + ItemPointerData itemptr; + + Assert(ZSTidGetBlockNumber(tid) == tid_blkno); + + ItemPointerSet(&itemptr, tid_blkno, ZSTidGetOffsetNumber(tid)); + + if (tbmres->ntuples != -1) + { + while (ZSTidGetOffsetNumber(tid) > tbmres->offsets[noff] && noff < tbmres->ntuples) + { + /* + * Acquire predicate lock on all tuples that we scan, even those that are + * not visible to the snapshot. + */ + PredicateLockTID(scan->rs_scan.rs_rd, &itemptr, scan->rs_scan.rs_snapshot); + + noff++; + } + + if (noff == tbmres->ntuples) + break; + + if (ZSTidGetOffsetNumber(tid) < tbmres->offsets[noff]) + continue; + } + + Assert(ZSTidGetBlockNumber(tid) == tid_blkno); + + scan->bmscan_tids[ntuples] = tid; + ntuples++; + + /* FIXME: heapam acquires the predicate lock first, and then + * calls CheckForSerializableConflictOut(). We do it in the + * opposite order, because CheckForSerializableConflictOut() + * call as done in zsbt_get_last_tid() already. Does it matter? + * I'm not sure. + */ + PredicateLockTID(scan->rs_scan.rs_rd, &itemptr, scan->rs_scan.rs_snapshot); + } + zsbt_tid_end_scan(&btree_scan); + + if (ntuples) + { + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + { + int natt = scan->proj_data.proj_atts[i]; + ZSBtreeScan btree_scan; + Datum datum; + bool isnull; + Datum *datums = scan->bmscan_datums[i]; + bool *isnulls = scan->bmscan_isnulls[i]; + + zsbt_attr_begin_scan(scan->rs_scan.rs_rd, + RelationGetDescr(scan->rs_scan.rs_rd), + natt, + ZSTidFromBlkOff(tid_blkno, 1), + ZSTidFromBlkOff(tid_blkno + 1, 1), + &btree_scan); + for (int n = 0; n < ntuples; n++) + { + if (!zsbt_scan_next_fetch(&btree_scan, &datum, &isnull, scan->bmscan_tids[n])) + zsbt_fill_missing_attribute_value(&btree_scan, &datum, &isnull); + + /* have to make a copy because we close the scan immediately. */ + if (!isnull) + datum = zs_datumCopy(datum, + ZSBtreeScanGetAttInfo(&btree_scan)->attbyval, + ZSBtreeScanGetAttInfo(&btree_scan)->attlen); + datums[n] = datum; + isnulls[n] = isnull; + } + zsbt_attr_end_scan(&btree_scan); + } + } + scan->bmscan_nexttuple = 0; + scan->bmscan_ntuples = ntuples; + + return ntuples > 0; +} + +static bool +zedstoream_scan_bitmap_next_tuple(TableScanDesc sscan, + TBMIterateResult *tbmres, + TupleTableSlot *slot) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + zstid tid; + + if (scan->bmscan_nexttuple >= scan->bmscan_ntuples) + return false; + /* + * projection attributes were created based on Relation tuple descriptor + * it better match TupleTableSlot. + */ + Assert((scan->proj_data.num_proj_atts - 1) <= slot->tts_tupleDescriptor->natts); + tid = scan->bmscan_tids[scan->bmscan_nexttuple]; + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + { + int natt = scan->proj_data.proj_atts[i]; + Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, natt - 1); + Datum datum; + bool isnull; + + datum = (scan->bmscan_datums[i])[scan->bmscan_nexttuple]; + isnull = (scan->bmscan_isnulls[i])[scan->bmscan_nexttuple]; + + /* + * flatten any ZS-TOASTed values, because the rest of the system + * doesn't know how to deal with them. 
+ */ + if (!isnull && att->attlen == -1 && + VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE) + { + datum = zedstore_toast_flatten(scan->rs_scan.rs_rd, natt, tid, datum); + } + + slot->tts_values[natt - 1] = datum; + slot->tts_isnull[natt - 1] = isnull; + } + slot->tts_tid = ItemPointerFromZSTid(tid); + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + slot->tts_flags &= ~TTS_FLAG_EMPTY; + + scan->bmscan_nexttuple++; + + pgstat_count_heap_fetch(scan->rs_scan.rs_rd); + + return true; +} + +static bool +zedstoream_scan_sample_next_block(TableScanDesc sscan, SampleScanState *scanstate) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + Relation rel = scan->rs_scan.rs_rd; + TsmRoutine *tsm = scanstate->tsmroutine; + int ntuples; + ZSBtreeScan btree_scan; + zstid tid; + BlockNumber blockno; + + /* TODO: for now, assume that we need all columns */ + zs_initialize_proj_attributes_extended(scan, RelationGetDescr(rel)); + + if (scan->max_tid_to_scan == InvalidZSTid) + { + /* + * get the max tid once and store it, used to calculate max blocks to + * scan either for SYSTEM or BERNOULLI sampling. + */ + scan->max_tid_to_scan = zsbt_get_last_tid(rel); + /* + * TODO: should get lowest tid instead of starting from 0 + */ + scan->next_tid_to_scan = ZSTidFromBlkOff(0, 1); + } + + if (tsm->NextSampleBlock) + { + /* Adding one below to convert block number to number of blocks. */ + blockno = tsm->NextSampleBlock(scanstate, + ZSTidGetBlockNumber(scan->max_tid_to_scan) + 1); + + if (!BlockNumberIsValid(blockno)) + return false; + } + else + { + /* scanning table sequentially */ + if (scan->next_tid_to_scan > scan->max_tid_to_scan) + return false; + + blockno = ZSTidGetBlockNumber(scan->next_tid_to_scan); + /* move on to next block of tids for next iteration of scan */ + scan->next_tid_to_scan = ZSTidFromBlkOff(blockno + 1, 1); + } + + Assert(BlockNumberIsValid(blockno)); + + ntuples = 0; + zsbt_tid_begin_scan(scan->rs_scan.rs_rd, + ZSTidFromBlkOff(blockno, 1), + ZSTidFromBlkOff(blockno + 1, 1), + scan->rs_scan.rs_snapshot, + &btree_scan); + while ((tid = zsbt_tid_scan_next(&btree_scan)) != InvalidZSTid) + { + Assert(ZSTidGetBlockNumber(tid) == blockno); + scan->bmscan_tids[ntuples] = tid; + ntuples++; + } + zsbt_tid_end_scan(&btree_scan); + + scan->bmscan_nexttuple = 0; + scan->bmscan_ntuples = ntuples; + + return true; +} + +static bool +zedstoream_scan_sample_next_tuple(TableScanDesc sscan, SampleScanState *scanstate, + TupleTableSlot *slot) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + TsmRoutine *tsm = scanstate->tsmroutine; + zstid tid; + BlockNumber blockno; + OffsetNumber tupoffset; + bool found; + + /* all tuples on this block are invisible */ + if (scan->bmscan_ntuples == 0) + return false; + + blockno = ZSTidGetBlockNumber(scan->bmscan_tids[0]); + + /* find which visible tuple in this block to sample */ + for (;;) + { + zstid lasttid_for_block = scan->bmscan_tids[scan->bmscan_ntuples - 1]; + OffsetNumber maxoffset = ZSTidGetOffsetNumber(lasttid_for_block); + /* Ask the tablesample method which tuples to check on this page. 
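+ * Offsets that don't match any visible TID collected by
+ * zedstoream_scan_sample_next_block() are skipped, and we ask the TSM
+ * for another one.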
*/ + tupoffset = tsm->NextSampleTuple(scanstate, blockno, maxoffset); + + if (!OffsetNumberIsValid(tupoffset)) + return false; + + tid = ZSTidFromBlkOff(blockno, tupoffset); + + found = false; + for (int n = 0; n < scan->bmscan_ntuples; n++) + { + if (scan->bmscan_tids[n] == tid) + { + /* visible tuple */ + found = true; + break; + } + } + + if (found) + break; + else + continue; + } + + /* + * projection attributes were created based on Relation tuple descriptor + * it better match TupleTableSlot. + */ + Assert((scan->proj_data.num_proj_atts - 1) <= slot->tts_tupleDescriptor->natts); + /* fetch values for tuple pointed by tid to sample */ + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + { + int natt = scan->proj_data.proj_atts[i]; + ZSBtreeScan btree_scan; + Form_pg_attribute attr; + Datum datum; + bool isnull; + + zsbt_attr_begin_scan(scan->rs_scan.rs_rd, + slot->tts_tupleDescriptor, + natt, + tid, tid + 1, + &btree_scan); + + attr = ZSBtreeScanGetAttInfo(&btree_scan); + if (zsbt_scan_next_fetch(&btree_scan, &datum, &isnull, tid)) + { + Assert(ZSTidGetBlockNumber(tid) == blockno); + } + else + { + zsbt_fill_missing_attribute_value(&btree_scan, &datum, &isnull); + } + + /* + * have to make a copy because we close the scan immediately. + * FIXME: I think this leaks into a too-long-lived context + */ + if (!isnull) + datum = zs_datumCopy(datum, attr->attbyval, attr->attlen); + + slot->tts_values[natt - 1] = datum; + slot->tts_isnull[natt - 1] = isnull; + + zsbt_attr_end_scan(&btree_scan); + } + slot->tts_tid = ItemPointerFromZSTid(tid); + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + slot->tts_flags &= ~TTS_FLAG_EMPTY; + + return true; +} + +static void +zedstoream_vacuum_rel(Relation onerel, VacuumParams *params, + BufferAccessStrategy bstrategy) +{ + zsundo_vacuum(onerel, params, bstrategy, + GetOldestXmin(onerel, PROCARRAY_FLAGS_VACUUM)); +} + +static const TableAmRoutine zedstoream_methods = { + .type = T_TableAmRoutine, + .scans_leverage_column_projection = true, + + .slot_callbacks = zedstoream_slot_callbacks, + + .scan_begin = zedstoream_beginscan, + .scan_begin_with_column_projection = zedstoream_beginscan_with_column_projection, + .scan_end = zedstoream_endscan, + .scan_rescan = zedstoream_rescan, + .scan_getnextslot = zedstoream_getnextslot, + + .parallelscan_estimate = zs_parallelscan_estimate, + .parallelscan_initialize = zs_parallelscan_initialize, + .parallelscan_reinitialize = zs_parallelscan_reinitialize, + + .index_fetch_begin = zedstoream_begin_index_fetch, + .index_fetch_reset = zedstoream_reset_index_fetch, + .index_fetch_end = zedstoream_end_index_fetch, + .index_fetch_set_column_projection = zedstoream_fetch_set_column_projection, + .index_fetch_tuple = zedstoream_index_fetch_tuple, + + .tuple_insert = zedstoream_insert, + .tuple_insert_speculative = zedstoream_insert_speculative, + .tuple_complete_speculative = zedstoream_complete_speculative, + .multi_insert = zedstoream_multi_insert, + .tuple_delete = zedstoream_delete, + .tuple_update = zedstoream_update, + .tuple_lock = zedstoream_lock_tuple, + .finish_bulk_insert = zedstoream_finish_bulk_insert, + + .tuple_fetch_row_version = zedstoream_fetch_row_version, + .tuple_get_latest_tid = zedstoream_get_latest_tid, + .tuple_tid_valid = zedstoream_tuple_tid_valid, + .tuple_satisfies_snapshot = zedstoream_tuple_satisfies_snapshot, + .compute_xid_horizon_for_tuples = zedstoream_compute_xid_horizon_for_tuples, + + .relation_set_new_filenode = zedstoream_relation_set_new_filenode, + 
.relation_nontransactional_truncate = zedstoream_relation_nontransactional_truncate, + .relation_copy_data = zedstoream_relation_copy_data, + .relation_copy_for_cluster = zedstoream_relation_copy_for_cluster, + .relation_vacuum = zedstoream_vacuum_rel, + .scan_analyze_next_block = zedstoream_scan_analyze_next_block, + .scan_analyze_next_tuple = zedstoream_scan_analyze_next_tuple, + + .index_build_range_scan = zedstoream_index_build_range_scan, + .index_validate_scan = zedstoream_index_validate_scan, + + .relation_size = zedstoream_relation_size, + .relation_needs_toast_table = zedstoream_relation_needs_toast_table, + .relation_estimate_size = zedstoream_relation_estimate_size, + + .scan_bitmap_next_block = zedstoream_scan_bitmap_next_block, + .scan_bitmap_next_tuple = zedstoream_scan_bitmap_next_tuple, + .scan_sample_next_block = zedstoream_scan_sample_next_block, + .scan_sample_next_tuple = zedstoream_scan_sample_next_tuple +}; + +Datum +zedstore_tableam_handler(PG_FUNCTION_ARGS) +{ + PG_RETURN_POINTER(&zedstoream_methods); +} + + +/* + * Routines for dividing up the TID range for parallel seq scans + */ + +typedef struct ParallelZSScanDescData +{ + ParallelTableScanDescData base; + + zstid pzs_endtid; /* last tid + 1 in relation at start of scan */ + pg_atomic_uint64 pzs_allocatedtid_blk; /* TID space allocated to workers so far. (in 65536 increments) */ +} ParallelZSScanDescData; +typedef struct ParallelZSScanDescData *ParallelZSScanDesc; + +static Size +zs_parallelscan_estimate(Relation rel) +{ + return sizeof(ParallelZSScanDescData); +} + +static Size +zs_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan) +{ + ParallelZSScanDesc zpscan = (ParallelZSScanDesc) pscan; + + zpscan->base.phs_relid = RelationGetRelid(rel); + zpscan->pzs_endtid = zsbt_get_last_tid(rel); + pg_atomic_init_u64(&zpscan->pzs_allocatedtid_blk, 0); + + return sizeof(ParallelZSScanDescData); +} + +static void +zs_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) +{ + ParallelZSScanDesc bpscan = (ParallelZSScanDesc) pscan; + + pg_atomic_write_u64(&bpscan->pzs_allocatedtid_blk, 0); +} + +/* + * get the next TID range to scan + * + * Returns true if there is more to scan, false otherwise. + * + * Get the next TID range to scan. Even if there are no TIDs left to scan, + * another backend could have grabbed a range to scan and not yet finished + * looking at it, so it doesn't follow that the scan is done when the first + * backend gets 'false' return. + */ +static bool +zs_parallelscan_nextrange(Relation rel, ParallelZSScanDesc pzscan, + zstid *start, zstid *end) +{ + uint64 allocatedtid_blk; + + /* + * zhs_allocatedtid tracks how much has been allocated to workers + * already. When phs_allocatedtid >= rs_lasttid, all TIDs have been + * allocated. + * + * Because we use an atomic fetch-and-add to fetch the current value, the + * phs_allocatedtid counter will exceed rs_lasttid, because workers will + * still increment the value, when they try to allocate the next block but + * all blocks have been allocated already. The counter must be 64 bits + * wide because of that, to avoid wrapping around when rs_lasttid is close + * to 2^32. That's also one reason we do this at granularity of 2^16 TIDs, + * even though zedstore isn't block-oriented. + * + * TODO: we divide the TID space into chunks of 2^16 TIDs each. That's + * pretty inefficient, there's a fair amount of overhead in re-starting + * the B-tree scans between each range. We probably should use much larger + * ranges. 
But this is good for testing. + */ + allocatedtid_blk = pg_atomic_fetch_add_u64(&pzscan->pzs_allocatedtid_blk, 1); + *start = ZSTidFromBlkOff(allocatedtid_blk, 1); + *end = ZSTidFromBlkOff(allocatedtid_blk + 1, 1); + + return *start < pzscan->pzs_endtid; +} + +static void +zsbt_fill_missing_attribute_value(ZSBtreeScan *scan, Datum *datum, bool *isnull) +{ + int attno = scan->attno - 1; + TupleDesc tupleDesc = scan->tupledesc; + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + + *isnull = true; + *datum = (Datum) 0; + + /* This means catalog doesn't have the default value for this attribute */ + if (!attr->atthasmissing) + return; + + if (tupleDesc->constr && + tupleDesc->constr->missing) + { + AttrMissing *attrmiss = NULL; + /* + * If there are missing values we want to put them into the + * tuple. + */ + attrmiss = tupleDesc->constr->missing; + + if (attrmiss[attno].am_present) + { + *isnull = false; + if (attr->attbyval) + *datum = fetch_att(&attrmiss[attno].am_value, attr->attbyval, attr->attlen); + else + *datum = zs_datumCopy(attrmiss[attno].am_value, attr->attbyval, attr->attlen); + } + } +} diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 6cb545c126..e795a510ae 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1057,7 +1057,11 @@ acquire_sample_rows(Relation onerel, int elevel, * done. */ if (numrows < targrows) - rows[numrows++] = ExecCopySlotHeapTuple(slot); + { + rows[numrows] = ExecCopySlotHeapTuple(slot); + rows[numrows]->t_self = slot->tts_tid; + numrows++; + } else { /* @@ -1079,6 +1083,7 @@ acquire_sample_rows(Relation onerel, int elevel, Assert(k >= 0 && k < targrows); heap_freetuple(rows[k]); rows[k] = ExecCopySlotHeapTuple(slot); + rows[k]->t_self = slot->tts_tid; } rowstoskip -= 1; diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index b00891ffd2..ab9fea881a 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2116,9 +2116,27 @@ CopyTo(CopyState cstate) { TupleTableSlot *slot; TableScanDesc scandesc; + bool *proj = NULL; - scandesc = table_beginscan(cstate->rel, GetActiveSnapshot(), 0, NULL); slot = table_slot_create(cstate->rel, NULL); + if (table_scans_leverage_column_projection(cstate->rel)) + { + proj = palloc0(slot->tts_tupleDescriptor->natts * sizeof(bool)); + foreach(cur, cstate->attnumlist) + { + int attnum = lfirst_int(cur); + Assert(attnum <= slot->tts_tupleDescriptor->natts); + proj[attnum-1] = true; + } + + scandesc = table_beginscan_with_column_projection(cstate->rel, + GetActiveSnapshot(), + 0, NULL, proj); + } + else + { + scandesc = table_beginscan(cstate->rel, GetActiveSnapshot(), 0, NULL); + } processed = 0; while (table_scan_getnextslot(scandesc, ForwardScanDirection, slot)) @@ -2135,6 +2153,8 @@ CopyTo(CopyState cstate) ExecDropSingleTupleTableSlot(slot); table_endscan(scandesc); + if (proj) + pfree(proj); } else { diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 602a8dbd1c..2af39c8fdc 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -9586,6 +9586,7 @@ validateCheckConstraint(Relation rel, HeapTuple constrtup) Form_pg_constraint constrForm; bool isnull; Snapshot snapshot; + bool *proj = NULL; /* * VALIDATE CONSTRAINT is a no-op for foreign tables and partitioned @@ -9618,7 +9619,16 @@ validateCheckConstraint(Relation rel, HeapTuple constrtup) econtext->ecxt_scantuple = slot; snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(rel, 
snapshot, 0, NULL); + if (table_scans_leverage_column_projection(rel)) + { + proj = palloc0(slot->tts_tupleDescriptor->natts * sizeof(bool)); + GetNeededColumnsForNode((Node*)exprstate->expr, proj, slot->tts_tupleDescriptor->natts); + scan = table_beginscan_with_column_projection(rel, snapshot, 0, NULL, proj); + } + else + { + scan = table_beginscan(rel, snapshot, 0, NULL); + } /* * Switch to per-tuple memory context and reset it for each tuple @@ -9643,6 +9653,8 @@ validateCheckConstraint(Relation rel, HeapTuple constrtup) UnregisterSnapshot(snapshot); ExecDropSingleTupleTableSlot(slot); FreeExecutorState(estate); + if (proj) + pfree(proj); } /* diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 3132a13785..db09b3ac9c 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2841,6 +2841,10 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, if (newtuple != trigtuple) heap_freetuple(newtuple); } + + /* Make sure the new slot is not dependent on the original tuple */ + ExecMaterializeSlot(slot); + if (should_free) heap_freetuple(trigtuple); @@ -3125,6 +3129,10 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, newtuple = NULL; } } + + /* Make sure the new slot is not dependent on the original tuple */ + ExecMaterializeSlot(newslot); + if (should_free_trig) heap_freetuple(trigtuple); diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c index 67c4be5108..73483aa835 100644 --- a/src/backend/executor/execScan.c +++ b/src/backend/executor/execScan.c @@ -20,6 +20,7 @@ #include "executor/executor.h" #include "miscadmin.h" +#include "nodes/nodeFuncs.h" #include "utils/memutils.h" @@ -301,3 +302,92 @@ ExecScanReScan(ScanState *node) } } } + +typedef struct neededColumnContext +{ + bool *mask; + int n; +} neededColumnContext; + +static bool +neededColumnContextWalker(Node *node, neededColumnContext *c) +{ + if (node == NULL) + return false; + + if (IsA(node, Var)) + { + Var *var = (Var *)node; + + if (var->varattno > 0) + { + Assert(var->varattno <= c->n); + c->mask[var->varattno - 1] = true; + } + /* + * If all attributes are included, + * set all entries in mask to true. + */ + else if (var->varattno == 0) + memset(c->mask, true, c->n); + + return false; + } + return expression_tree_walker(node, neededColumnContextWalker, (void *) c); +} + +/* + * n specifies the number of allowed entries in mask: we use + * it for bounds-checking in the walker above. + */ +void +GetNeededColumnsForNode(Node *expr, bool *mask, int n) +{ + neededColumnContext c; + + c.mask = mask; + c.n = n; + + neededColumnContextWalker(expr, &c); +} + +bool * +GetNeededColumnsForScan(ScanState *scanstate, int ncol) +{ + Plan *plan = scanstate->ps.plan; + bool *proj; + int i; + + proj = palloc0(ncol * sizeof(bool)); + GetNeededColumnsForNode((Node *) plan->targetlist, proj, ncol); + GetNeededColumnsForNode((Node *) plan->qual, proj, ncol); + + /* + * Some node types have more fields with expressions. FIXME: This list is + * surely very incomplete. Should teach the planner to do this for us.
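 *
 * For example, for an IndexScan the columns referenced only by the index
 * quals appear in indexqualorig rather than in plan->qual (the planner
 * normally strips them from the qual list), yet the recheck expression
 * still evaluates them against the fetched tuple, so they must be marked
 * here as well.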
+ */ + if (IsA(plan, IndexScan)) + { + GetNeededColumnsForNode((Node *) ((IndexScan *) plan)->indexqualorig, proj, ncol); + GetNeededColumnsForNode((Node *) ((IndexScan *) plan)->indexorderbyorig, proj, ncol); + } + else if (IsA(plan, BitmapHeapScan)) + { + GetNeededColumnsForNode((Node *) ((BitmapHeapScan *) plan)->bitmapqualorig, proj, ncol); + } + + for (i = 0; i < ncol; i++) + { + if (proj[i]) + break; + } + + /* + * In some cases (for example, count(*)), no columns are specified. + * We always scan the first column. + */ + if (i == ncol && ncol > 0) + proj[0] = true; + + return proj; +} diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index ee5b1c493b..8a4d795d1a 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -166,10 +166,10 @@ IndexOnlyNext(IndexOnlyScanState *node) * Rats, we have to visit the heap to check visibility. */ InstrCountTuples2(node, 1); - if (!index_fetch_heap(scandesc, slot)) + if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) continue; /* no visible tuple, try next index entry */ - ExecClearTuple(slot); + ExecClearTuple(node->ioss_TableSlot); /* * Only MVCC snapshots are supported here, so there should be no @@ -528,7 +528,17 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) */ tupDesc = ExecTypeFromTL(node->indextlist); ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc, - table_slot_callbacks(currentRelation)); + &TTSOpsVirtual); + + /* + * We need another slot, in a format that's suitable for the table AM, + * for when we need to fetch a tuple from the table for rechecking + * visibility. + */ + indexstate->ioss_TableSlot = + ExecAllocTableSlot(&estate->es_tupleTable, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); /* * Initialize result type and projection info. The node's targetlist will diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index ac7aa81f67..5492816b6d 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -115,6 +115,13 @@ IndexNext(IndexScanState *node) node->iss_NumScanKeys, node->iss_NumOrderByKeys); + if (table_scans_leverage_column_projection(node->ss.ss_currentRelation)) + { + bool *proj; + proj = GetNeededColumnsForScan(&node->ss, node->ss.ss_currentRelation->rd_att->natts); + table_index_fetch_set_column_projection(scandesc->xs_heapfetch, proj); + } + node->iss_ScanDesc = scandesc; /* @@ -897,6 +904,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) { IndexScanState *indexstate; Relation currentRelation; + const TupleTableSlotOps *table_slot_ops; LOCKMODE lockmode; /* @@ -923,11 +931,19 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ /* - * get the scan type from the relation descriptor. + * Initialize the scan slot. + * + * With the reorder queue, we will sometimes use the reorderqueue's slot, + * which uses heap ops, and sometimes the table AM's slot directly. We + * have to set scanopsfixed to false, unless the table AM also uses heap + * ops. */ + table_slot_ops = table_slot_callbacks(currentRelation); ExecInitScanTupleSlot(estate, &indexstate->ss, RelationGetDescr(currentRelation), - table_slot_callbacks(currentRelation)); + table_slot_ops); + if (node->indexorderby && table_slot_ops != &TTSOpsHeapTuple) + indexstate->ss.ps.scanopsfixed = false; /* * Initialize result type and projection. 
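
The hunks in copy.c and tablecmds.c above, and in nodeSeqscan.c and partbounds.c below, all repeat the same begin-scan pattern: build a per-column boolean mask when the table AM advertises scans_leverage_column_projection, and fall back to a plain table_beginscan() otherwise. The sketch below is not part of the patch; it only illustrates how those call sites use the new API. The helper name and the 'expr' parameter are invented for the illustration, and access/tableam.h, executor/executor.h and utils/rel.h are assumed to be included.

static TableScanDesc
begin_scan_with_optional_projection(Relation rel, Snapshot snapshot,
                                    Node *expr, bool **proj_out)
{
    *proj_out = NULL;

    if (table_scans_leverage_column_projection(rel))
    {
        int         natts = RelationGetDescr(rel)->natts;
        bool       *proj = palloc0(natts * sizeof(bool));

        /* mark every column referenced by 'expr'; palloc0 left the rest false */
        GetNeededColumnsForNode(expr, proj, natts);

        *proj_out = proj;       /* caller pfrees this after table_endscan() */
        return table_beginscan_with_column_projection(rel, snapshot,
                                                       0, NULL, proj);
    }

    /* row-oriented AMs such as heap take the ordinary, whole-row path */
    return table_beginscan(rel, snapshot, 0, NULL);
}
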
diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 436b43f8ca..c0922ff823 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -31,6 +31,7 @@ #include "access/tableam.h" #include "executor/execdebug.h" #include "executor/nodeSeqscan.h" +#include "nodes/nodeFuncs.h" #include "utils/rel.h" static TupleTableSlot *SeqNext(SeqScanState *node); @@ -68,9 +69,20 @@ SeqNext(SeqScanState *node) * We reach here if the scan is not parallel, or if we're serially * executing a scan that was planned to be parallel. */ - scandesc = table_beginscan(node->ss.ss_currentRelation, - estate->es_snapshot, - 0, NULL); + if (table_scans_leverage_column_projection(node->ss.ss_currentRelation)) + { + bool *proj; + proj = GetNeededColumnsForScan(&node->ss, node->ss.ss_currentRelation->rd_att->natts); + scandesc = table_beginscan_with_column_projection(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL, proj); + } + else + { + scandesc = table_beginscan(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL); + } node->ss.ss_currentScanDesc = scandesc; } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 608d5adfed..6527e0d5d2 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -822,6 +822,9 @@ use_physical_tlist(PlannerInfo *root, Path *path, int flags) rel->rtekind != RTE_CTE) return false; + if (rel->rtekind == RTE_RELATION && rel->leverage_column_projection) + return false; + /* * Can't do it with inheritance cases either (mainly because Append * doesn't project; this test may be unnecessary now that diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 2405acbf6f..00d125378b 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -123,6 +123,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, */ relation = table_open(relationObjectId, NoLock); + if (relation->rd_tableam) + rel->leverage_column_projection = relation->rd_tableam->scans_leverage_column_projection; /* Temporary and unlogged relations are inaccessible during recovery. */ if (!RelationNeedsWAL(relation) && RecoveryInProgress()) ereport(ERROR, diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 99d26de7e6..b4110e4152 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -1274,6 +1274,7 @@ check_default_partition_contents(Relation parent, Relation default_rel, TableScanDesc scan; MemoryContext oldCxt; TupleTableSlot *tupslot; + bool *proj = NULL; /* Lock already taken above. 
*/ if (part_relid != RelationGetRelid(default_rel)) @@ -1330,7 +1331,16 @@ check_default_partition_contents(Relation parent, Relation default_rel, econtext = GetPerTupleExprContext(estate); snapshot = RegisterSnapshot(GetLatestSnapshot()); tupslot = table_slot_create(part_rel, &estate->es_tupleTable); - scan = table_beginscan(part_rel, snapshot, 0, NULL); + if (table_scans_leverage_column_projection(part_rel)) + { + proj = palloc0(tupslot->tts_tupleDescriptor->natts * sizeof(bool)); + GetNeededColumnsForNode((Node*)partqualstate->expr, proj, tupslot->tts_tupleDescriptor->natts); + scan = table_beginscan_with_column_projection(part_rel, snapshot, 0, NULL, proj); + } + else + { + scan = table_beginscan(part_rel, snapshot, 0, NULL); + } /* * Switch to per-tuple memory context and reset it for each tuple @@ -1360,6 +1370,9 @@ check_default_partition_contents(Relation parent, Relation default_rel, if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel)) table_close(part_rel, NoLock); /* keep the lock until commit */ + + if (proj) + pfree(proj); } } diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 2fedbc4c15..b31c0bfe00 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -2547,8 +2547,6 @@ PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot) void PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot) { - PREDICATELOCKTARGETTAG tag; - ItemPointer tid; TransactionId targetxmin; if (!SerializationNeededForRead(relation, snapshot)) @@ -2579,6 +2577,17 @@ PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot) } } + PredicateLockTID(relation, &(tuple->t_self), snapshot); +} + +void +PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot) +{ + PREDICATELOCKTARGETTAG tag; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + /* * Do quick-but-not-definitive test for a relation lock first. This will * never cause a return when the relation is *not* locked, but will @@ -2591,7 +2600,6 @@ PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot) if (PredicateLockExists(&tag)) return; - tid = &(tuple->t_self); SET_PREDICATELOCKTARGETTAG_TUPLE(tag, relation->rd_node.dbNode, relation->rd_id, @@ -4054,14 +4062,11 @@ XidIsConcurrent(TransactionId xid) * currently no known reason to call this function from an index AM. */ void -CheckForSerializableConflictOut(bool visible, Relation relation, +heap_CheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot) { TransactionId xid; - SERIALIZABLEXIDTAG sxidtag; - SERIALIZABLEXID *sxid; - SERIALIZABLEXACT *sxact; HTSV_Result htsvResult; if (!SerializationNeededForRead(relation, snapshot)) @@ -4125,6 +4130,19 @@ CheckForSerializableConflictOut(bool visible, Relation relation, Assert(TransactionIdIsValid(xid)); Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + return CheckForSerializableConflictOut(relation, xid, snapshot); +} + +void +CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot) +{ + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + SERIALIZABLEXACT *sxact; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + /* * Find top level xid. Bail out if xid is too early to be a conflict, or * if it's our own xid. @@ -4439,8 +4457,7 @@ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag) * tuple itself. 
*/ void -CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, - Buffer buffer) +CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno) { PREDICATELOCKTARGETTAG targettag; @@ -4470,22 +4487,22 @@ CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, * It is not possible to take and hold a lock across the checks for all * granularities because each target could be in a separate partition. */ - if (tuple != NULL) + if (tid != NULL) { SET_PREDICATELOCKTARGETTAG_TUPLE(targettag, relation->rd_node.dbNode, relation->rd_id, - ItemPointerGetBlockNumber(&(tuple->t_self)), - ItemPointerGetOffsetNumber(&(tuple->t_self))); + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); CheckTargetForConflictsIn(&targettag); } - if (BufferIsValid(buffer)) + if (blkno != InvalidBlockNumber) { SET_PREDICATELOCKTARGETTAG_PAGE(targettag, relation->rd_node.dbNode, relation->rd_id, - BufferGetBlockNumber(buffer)); + blkno); CheckTargetForConflictsIn(&targettag); } diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 6f1cd382d8..d914d395c9 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -163,6 +163,7 @@ typedef struct TableAmRoutine { /* this must be set to T_TableAmRoutine */ NodeTag type; + bool scans_leverage_column_projection; /* ------------------------------------------------------------------------ @@ -203,6 +204,13 @@ typedef struct TableAmRoutine ParallelTableScanDesc pscan, uint32 flags); + TableScanDesc (*scan_begin_with_column_projection)(Relation relation, + Snapshot snapshot, + int nkeys, struct ScanKeyData *key, + ParallelTableScanDesc parallel_scan, + uint32 flags, + bool *project_column); + /* * Release resources and deallocate scan. If TableScanDesc.temp_snap, * TableScanDesc.rs_snapshot needs to be unregistered. @@ -278,6 +286,13 @@ typedef struct TableAmRoutine */ void (*index_fetch_end) (struct IndexFetchTableData *data); + /* + * Set column projections for AM which leverage column projections for + * scanning. + */ + void (*index_fetch_set_column_projection) (struct IndexFetchTableData *data, + bool *project_column); + /* * Fetch tuple at `tid` into `slot`, after doing a visibility test * according to `snapshot`. If a tuple was found and passed the visibility @@ -743,6 +758,12 @@ table_beginscan(Relation rel, Snapshot snapshot, return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } +static inline bool +table_scans_leverage_column_projection(Relation relation) +{ + return relation->rd_tableam->scans_leverage_column_projection; +} + /* * Like table_beginscan(), but for scanning catalog. It'll automatically use a * snapshot appropriate for scanning catalog relations. @@ -772,6 +793,19 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } +static inline TableScanDesc +table_beginscan_with_column_projection(Relation relation, Snapshot snapshot, + int nkeys, struct ScanKeyData *key, + bool *project_column) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + Assert(relation->rd_tableam->scans_leverage_column_projection); + return relation->rd_tableam->scan_begin_with_column_projection( + relation, snapshot, nkeys, key, NULL, flags, project_column); +} + /* * table_beginscan_bm is an alternative entry point for setting up a * TableScanDesc for a bitmap heap scan. 
Although that scan technology is @@ -956,6 +990,13 @@ table_index_fetch_end(struct IndexFetchTableData *scan) scan->rel->rd_tableam->index_fetch_end(scan); } +static inline void +table_index_fetch_set_column_projection(struct IndexFetchTableData *scan, + bool *project_column) +{ + scan->rel->rd_tableam->index_fetch_set_column_projection(scan, project_column); +} + /* * Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing * a visibility test according to `snapshot`. If a tuple was found and passed diff --git a/src/include/access/zedstore_compression.h b/src/include/access/zedstore_compression.h new file mode 100644 index 0000000000..f70713a1a7 --- /dev/null +++ b/src/include/access/zedstore_compression.h @@ -0,0 +1,51 @@ +/* + * zedstore_compression.h + * internal declarations for ZedStore compression + * + * Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/access/zedstore_compression.h + */ +#ifndef ZEDSTORE_COMPRESSION_H +#define ZEDSTORE_COMPRESSION_H + +#include "storage/itemptr.h" + +typedef struct ZSDecompressContext +{ + char *buffer; + int bufsize; /* allocated size of 'buffer' */ + int uncompressedsize; + int bytesread; +} ZSDecompressContext; + +typedef struct ZSCompressContext +{ + char *uncompressedbuffer; + + int maxCompressedSize; + int maxUncompressedSize; + char *buffer; + int nitems; + int rawsize; +} ZSCompressContext; + +typedef struct ZSBtreeItem ZSBtreeItem; +typedef struct ZSCompressedBtreeItem ZSCompressedBtreeItem; +typedef struct ZSSingleBtreeItem ZSSingleBtreeItem; + +/* compression functions */ +extern void zs_compress_init(ZSCompressContext *context); +extern void zs_compress_begin(ZSCompressContext *context, int maxCompressedSize); +extern bool zs_compress_add(ZSCompressContext *context, ZSBtreeItem *item); +extern ZSCompressedBtreeItem *zs_compress_finish(ZSCompressContext *context); +extern void zs_compress_free(ZSCompressContext *context); + +/* decompression functions */ +extern void zs_decompress_init(ZSDecompressContext *context); +extern void zs_decompress_chunk(ZSDecompressContext *context, ZSCompressedBtreeItem *chunk); +extern ZSBtreeItem *zs_decompress_read_item(ZSDecompressContext *context); +extern void zs_decompress_free(ZSDecompressContext *context); + +#endif /* ZEDSTORE_COMPRESSION_H */ diff --git a/src/include/access/zedstore_internal.h b/src/include/access/zedstore_internal.h new file mode 100644 index 0000000000..8eb9f74b96 --- /dev/null +++ b/src/include/access/zedstore_internal.h @@ -0,0 +1,618 @@ +/* + * zedstore_internal.h + * internal declarations for ZedStore tables + * + * Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/access/zedstore_internal.h + */ +#ifndef ZEDSTORE_INTERNAL_H +#define ZEDSTORE_INTERNAL_H + +#include "access/tableam.h" +#include "access/zedstore_compression.h" +#include "access/zedstore_undo.h" +#include "storage/bufmgr.h" +#include "utils/datum.h" + +#define ZS_META_ATTRIBUTE_NUM 0 + +/* + * Throughout ZedStore, we pass around TIDs as uint64's, rather than ItemPointers, + * for speed. 
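 *
 * For example, with MaxZSTidOffsetNumber = 129 the conversion routines
 * below give ZSTidFromBlkOff(2, 5) = 2 * 128 + 5 = 261, and the reverse
 * mapping yields ZSTidGetBlockNumber(261) = (261 - 1) / 128 = 2 and
 * ZSTidGetOffsetNumber(261) = (261 - 1) % 128 + 1 = 5. Likewise
 * MinZSTid = ZSTidFromBlkOff(0, 1) = 1, leaving 0 free to serve as
 * InvalidZSTid.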
+ */ +typedef uint64 zstid; + +#define InvalidZSTid 0 +#define MinZSTid 1 /* blk 0, off 1 */ +#define MaxZSTid ((uint64) MaxBlockNumber << 16 | 0xffff) +/* note: if this is converted to ItemPointer, it is invalid */ +#define MaxPlusOneZSTid (MaxZSTid + 1) + +#define MaxZSTidOffsetNumber 129 + +static inline zstid +ZSTidFromBlkOff(BlockNumber blk, OffsetNumber off) +{ + Assert(off != 0); + + return (uint64) blk * (MaxZSTidOffsetNumber - 1) + off; +} + +static inline zstid +ZSTidFromItemPointer(ItemPointerData iptr) +{ + Assert(ItemPointerIsValid(&iptr)); + return ZSTidFromBlkOff(ItemPointerGetBlockNumber(&iptr), + ItemPointerGetOffsetNumber(&iptr)); +} + +static inline ItemPointerData +ItemPointerFromZSTid(zstid tid) +{ + ItemPointerData iptr; + BlockNumber blk; + OffsetNumber off; + + blk = (tid - 1) / (MaxZSTidOffsetNumber - 1); + off = (tid - 1) % (MaxZSTidOffsetNumber - 1) + 1; + + ItemPointerSet(&iptr, blk, off); + Assert(ItemPointerIsValid(&iptr)); + return iptr; +} + +static inline BlockNumber +ZSTidGetBlockNumber(zstid tid) +{ + return (BlockNumber) ((tid - 1) / (MaxZSTidOffsetNumber - 1)); +} + +static inline OffsetNumber +ZSTidGetOffsetNumber(zstid tid) +{ + return (OffsetNumber) ((tid - 1) % (MaxZSTidOffsetNumber - 1) + 1); +} + +/* + * A ZedStore table contains different kinds of pages, all in the same file. + * + * Block 0 is always a metapage. It contains the block numbers of the other + * data structures stored within the file, like the per-attribute B-trees, + * and the UNDO log. In addition, if there are overly large datums in the + * the table, they are chopped into separate "toast" pages. + */ +#define ZS_META_PAGE_ID 0xF083 +#define ZS_BTREE_PAGE_ID 0xF084 +#define ZS_UNDO_PAGE_ID 0xF085 +#define ZS_TOAST_PAGE_ID 0xF086 +#define ZS_FPM_PAGE_ID 0xF087 + +/* flags for zedstore b-tree pages */ +#define ZSBT_ROOT 0x0001 + +typedef struct ZSBtreePageOpaque +{ + AttrNumber zs_attno; + BlockNumber zs_next; + zstid zs_lokey; /* inclusive */ + zstid zs_hikey; /* exclusive */ + uint16 zs_level; /* 0 = leaf */ + uint16 zs_flags; + uint16 padding; /* padding, to put zs_page_id last */ + uint16 zs_page_id; /* always ZS_BTREE_PAGE_ID */ +} ZSBtreePageOpaque; + +#define ZSBtreePageGetOpaque(page) ((ZSBtreePageOpaque *) PageGetSpecialPointer(page)) + +/* + * Internal B-tree page layout. + * + * The "contents" of the page is an array of ZSBtreeInternalPageItem. The number + * of items can be deduced from pd_lower. + */ +typedef struct ZSBtreeInternalPageItem +{ + zstid tid; + BlockNumber childblk; +} ZSBtreeInternalPageItem; + +static inline ZSBtreeInternalPageItem * +ZSBtreeInternalPageGetItems(Page page) +{ + ZSBtreeInternalPageItem *items; + + items = (ZSBtreeInternalPageItem *) PageGetContents(page); + + return items; +} +static inline int +ZSBtreeInternalPageGetNumItems(Page page) +{ + ZSBtreeInternalPageItem *begin; + ZSBtreeInternalPageItem *end; + + begin = (ZSBtreeInternalPageItem *) PageGetContents(page); + end = (ZSBtreeInternalPageItem *) ((char *) page + ((PageHeader) page)->pd_lower); + + return end - begin; +} + +static inline bool +ZSBtreeInternalPageIsFull(Page page) +{ + PageHeader phdr = (PageHeader) page; + + return phdr->pd_upper - phdr->pd_lower < sizeof(ZSBtreeInternalPageItem); +} + +/* + * Leaf B-tree page layout + * + * Leaf pages are packed with ZSBtreeItems. There are three kinds of items: + * + * 1. Single item, holds one tuple (or rather, one datum). + * + * 2. 
"Array item", holds multiple datums, with consecutive TIDs and the same + * visibility information. An array item saves space compared to multiple + * single items, by leaving out repetitive UNDO and TID fields. An array + * item cannot mix NULLs and non-NULLs, so the ZSBT_NULL flag applies to + * all elements. + * + * 3. "Compressed item", which can hold multiple single or array items. + * + * A single or array item can furthermore be marked as DEAD. A dead item + * prevents the TID (or TID range, for an array item) from being reused. It's + * used during VACUUM, to mark items for which there are no index pointers + * anymore. But it cannot be removed until the undo record has been trimmed + * away, because if the TID was reused for a new record, vacuum might remove + * the new tuple version instead. After t_undo_ptr becomes older than "oldest + * undo ptr", the item can be removed and the TID recycled. + * + * TODO: squeeze harder: eliminate padding, use high bits of t_tid for flags or size + */ +typedef struct ZSBtreeItem +{ + zstid t_tid; + uint16 t_size; + uint16 t_flags; +} ZSBtreeItem; + +typedef struct ZSSingleBtreeItem +{ + /* these fields must match ZSBtreeItem */ + zstid t_tid; + uint16 t_size; + uint16 t_flags; + + ZSUndoRecPtr t_undo_ptr; + + char t_payload[FLEXIBLE_ARRAY_MEMBER]; +} ZSSingleBtreeItem; + +typedef struct ZSArrayBtreeItem +{ + /* these fields must match ZSBtreeItem */ + zstid t_tid; + uint16 t_size; + uint16 t_flags; + + uint16 t_nelements; + ZSUndoRecPtr t_undo_ptr; + + char t_payload[FLEXIBLE_ARRAY_MEMBER]; +} ZSArrayBtreeItem; + +typedef struct ZSCompressedBtreeItem +{ + /* these fields must match ZSBtreeItem */ + zstid t_tid; + uint16 t_size; + uint16 t_flags; + + uint16 t_uncompressedsize; + zstid t_lasttid; /* inclusive */ + + char t_payload[FLEXIBLE_ARRAY_MEMBER]; +} ZSCompressedBtreeItem; + +#define ZSBT_COMPRESSED 0x0001 +#define ZSBT_ARRAY 0x0002 +#define ZSBT_NULL 0x0010 +#define ZSBT_DEAD 0x0020 + +/* + * Get the last TID that the given item spans. + * + * For a single item, it's the TID of the item. For an array item, it's the + * TID of the last element. For a compressed item, it's the last TID of the + * last item it contains (which is stored explicitly in the item header). + */ +static inline zstid +zsbt_item_lasttid(ZSBtreeItem *item) +{ + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + return ((ZSCompressedBtreeItem *) item)->t_lasttid; + else if ((item->t_flags & ZSBT_ARRAY) != 0) + { + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + return aitem->t_tid + aitem->t_nelements - 1; + } + else + return item->t_tid; +} + +static inline ZSUndoRecPtr +zsbt_item_undoptr(ZSBtreeItem *item) +{ + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + elog(ERROR, "cannot get undo pointer from compressed item"); + else if ((item->t_flags & ZSBT_ARRAY) != 0) + { + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + return aitem->t_undo_ptr; + } + else + { + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) item; + return sitem->t_undo_ptr; + } +} + +/* + * Toast page layout. + * + * When an overly large datum is stored, it is divided into chunks, and each + * chunk is stored on a dedicated toast page. The toast pages of a datum form + * list, each page has a next/prev pointer. + */ +/* + * Maximum size of an individual untoasted Datum stored in ZedStore. Datums + * larger than this need to be toasted. + * + * A datum needs to fit on a B-tree page, with page and item headers. + * + * XXX: 500 accounts for all the headers. Need to compute this correctly... 
+ */ +#define MaxZedStoreDatumSize (BLCKSZ - 500) + +typedef struct ZSToastPageOpaque +{ + AttrNumber zs_attno; + + /* these are only set on the first page. */ + zstid zs_tid; + uint32 zs_total_size; + + uint32 zs_slice_offset; + BlockNumber zs_prev; + BlockNumber zs_next; + uint16 zs_flags; + uint16 padding1; /* padding, to put zs_page_id last */ + uint16 padding2; /* padding, to put zs_page_id last */ + uint16 zs_page_id; +} ZSToastPageOpaque; + +/* + * "Toast pointer" of a datum that's stored in zedstore toast pages. + * + * This looks somewhat like a normal TOAST pointer, but we mustn't let these + * escape out of zedstore code, because the rest of the system doesn't know + * how to deal with them. + * + * This must look like varattrib_1b_e! + */ +typedef struct varatt_zs_toastptr +{ + /* varattrib_1b_e */ + uint8 va_header; + uint8 va_tag; /* VARTAG_ZEDSTORE in zedstore toast datums */ + + /* first block */ + BlockNumber zst_block; +} varatt_zs_toastptr; + +/* + * va_tag value. this should be distinguishable from the values in + * vartag_external + */ +#define VARTAG_ZEDSTORE 10 + +/* + * Versions of datumGetSize and datumCopy that know about ZedStore-toasted + * datums. + */ +static inline Size +zs_datumGetSize(Datum value, bool typByVal, int typLen) +{ + if (typLen > 0) + return typLen; + else if (typLen == -1) + { + if (VARATT_IS_EXTERNAL(value) && VARTAG_EXTERNAL(value) == VARTAG_ZEDSTORE) + return sizeof(varatt_zs_toastptr); + else + return VARSIZE_ANY(value); + } + else + return datumGetSize(value, typByVal, typLen); +} + +static inline Datum +zs_datumCopy(Datum value, bool typByVal, int typLen) +{ + if (typLen < 0 && VARATT_IS_EXTERNAL(value) && VARTAG_EXTERNAL(value) == VARTAG_ZEDSTORE) + { + char *result = palloc(sizeof(varatt_zs_toastptr)); + + memcpy(result, DatumGetPointer(value), sizeof(varatt_zs_toastptr)); + + return PointerGetDatum(result); + } + else + return datumCopy(value, typByVal, typLen); +} + +/* + * Block 0 on every ZedStore table is a metapage. + * + * It contains a directory of b-tree roots for each attribute, and lots more. + */ +#define ZS_META_BLK 0 + +/* + * The metapage stores one of these for each attribute. + */ +typedef struct ZSRootDirItem +{ + BlockNumber root; +} ZSRootDirItem; + +typedef struct ZSMetaPage +{ + int nattributes; + ZSRootDirItem tree_root_dir[FLEXIBLE_ARRAY_MEMBER]; /* one for each attribute */ +} ZSMetaPage; + +/* + * it's not clear what we should store in the "opaque" special area, and what + * as page contents, on a metapage. But have at least the page_id field here, + * so that tools like pg_filedump can recognize it as a zedstore metapage. + */ +typedef struct ZSMetaPageOpaque +{ + uint64 zs_undo_counter; + BlockNumber zs_undo_head; + BlockNumber zs_undo_tail; + ZSUndoRecPtr zs_undo_oldestptr; + + BlockNumber zs_fpm_root; /* root of the Free Page Map */ + + uint16 zs_flags; + uint16 zs_page_id; +} ZSMetaPageOpaque; + + +/* + * Holds the state of an in-progress scan on a zedstore btree. + */ +typedef struct ZSBtreeScan +{ + Relation rel; + AttrNumber attno; + TupleDesc tupledesc; + + /* + * memory context that should be used for any allocations that go with the scan, + * like the decompression buffers. This isn't a dedicated context, you must still + * free everything to avoid leaking! We need this because the getnext function + * might be called in a short-lived memory context that is reset between calls. 
+ */ + MemoryContext context; + + bool active; + Buffer lastbuf; + OffsetNumber lastoff; + zstid nexttid; + zstid endtid; + Snapshot snapshot; + + /* in the "real" UNDO-log, this would probably be a global variable */ + ZSUndoRecPtr recent_oldest_undo; + + /* should this scan do predicate locking? Or check for conflicts? */ + bool serializable; + bool acquire_predicate_tuple_locks; + + /* + * if we have remaining items from a compressed container tuple, they + * are kept in the decompressor context, and 'has_decompressed' is true. + */ + ZSDecompressContext decompressor; + bool has_decompressed; + + /* + * These fields are used, if the scan is processing an array tuple. + * And also for a single-item tuple - it works just like a single-element + * array tuple. + */ + ZSUndoRecPtr array_undoptr; + int array_datums_allocated_size; + Datum *array_datums; + Datum *array_next_datum; + int array_elements_left; + bool array_isnull; + +} ZSBtreeScan; + +static inline Form_pg_attribute +ZSBtreeScanGetAttInfo(ZSBtreeScan *scan) +{ + return TupleDescAttr(scan->tupledesc, scan->attno - 1); +} + +/* + * zs_split_stack is used during page split, or page merge, to keep track + * of all the modified pages. The page split (or merge) routines don't + * modify pages directly, but they construct a list of 'zs_split_stack' + * entries. Each entry holds a buffer, and a temporary in-memory copy of + * a page that should be written to the buffer, once everything is completed. + * All the buffers are exclusively-locked. + */ +typedef struct zs_split_stack zs_split_stack; + +struct zs_split_stack +{ + zs_split_stack *next; + + Buffer buf; + Page page; /* temp in-memory copy of page */ + bool recycle; /* should the page be added to the FPM? */ +}; + +/* prototypes for functions in zedstore_tidpage.c */ +extern void zsbt_tid_begin_scan(Relation rel, + zstid starttid, zstid endtid, Snapshot snapshot, ZSBtreeScan *scan); +extern void zsbt_tid_reset_scan(ZSBtreeScan *scan, zstid starttid); +extern void zsbt_tid_end_scan(ZSBtreeScan *scan); +extern zstid zsbt_tid_scan_next(ZSBtreeScan *scan); + +extern void zsbt_tid_multi_insert(Relation rel, + zstid *tids, int ndatums, + TransactionId xid, CommandId cid, uint32 speculative_token, ZSUndoRecPtr prevundoptr); +extern TM_Result zsbt_tid_delete(Relation rel, zstid tid, + TransactionId xid, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, bool wait, + TM_FailureData *hufd, bool changingPart); +extern TM_Result zsbt_tid_update(Relation rel, zstid otid, + TransactionId xid, + CommandId cid, bool key_update, Snapshot snapshot, Snapshot crosscheck, + bool wait, TM_FailureData *hufd, zstid *newtid_p); +extern void zsbt_tid_clear_speculative_token(Relation rel, zstid tid, uint32 spectoken, bool forcomplete); +extern void zsbt_tid_mark_dead(Relation rel, zstid tid, ZSUndoRecPtr undoptr); +extern TM_Result zsbt_tid_lock(Relation rel, zstid tid, + TransactionId xid, CommandId cid, + LockTupleMode lockmode, Snapshot snapshot, TM_FailureData *hufd, zstid *next_tid); +extern void zsbt_tid_undo_deletion(Relation rel, zstid tid, ZSUndoRecPtr undoptr); +extern zstid zsbt_get_last_tid(Relation rel); +extern void zsbt_find_latest_tid(Relation rel, zstid *tid, Snapshot snapshot); + +/* prototypes for functions in zedstore_attrpage.c */ +extern void zsbt_attr_begin_scan(Relation rel, TupleDesc tdesc, AttrNumber attno, + zstid starttid, zstid endtid, ZSBtreeScan *scan); +extern void zsbt_attr_reset_scan(ZSBtreeScan *scan, zstid starttid); +extern void zsbt_attr_end_scan(ZSBtreeScan 
*scan); +extern bool zsbt_attr_scan_next(ZSBtreeScan *scan); + +extern void zsbt_attr_multi_insert(Relation rel, AttrNumber attno, + Datum *datums, bool *isnulls, zstid *tids, int ndatums); + +/* prototypes for functions in zedstore_btree.c */ +extern zs_split_stack *zsbt_newroot(Relation rel, AttrNumber attno, int level, List *downlinks); +extern zs_split_stack *zsbt_insert_downlinks(Relation rel, AttrNumber attno, + zstid leftlokey, BlockNumber leftblkno, int level, + List *downlinks); +extern void zsbt_attr_remove(Relation rel, AttrNumber attno, zstid tid); +extern zs_split_stack *zsbt_unlink_page(Relation rel, AttrNumber attno, Buffer buf, int level); +extern Buffer zsbt_descend(Relation rel, AttrNumber attno, zstid key, int level, bool readonly); +extern bool zsbt_page_is_expected(Relation rel, AttrNumber attno, zstid key, int level, Buffer buf); + +static inline void +zsbt_scan_skip(ZSBtreeScan *scan, zstid tid) +{ + if (tid > scan->nexttid) + { + if (scan->array_elements_left > 0) + { + int64 skip = tid - scan->nexttid - 1; + + if (skip < scan->array_elements_left) + { + scan->array_next_datum += skip; + scan->array_elements_left -= skip; + } + else + { + scan->array_elements_left = 0; + } + } + scan->nexttid = tid; + } +} + +/* + * Return the value of row identified with 'tid' in a scan. + * + * 'tid' must be greater than any previously returned item. + * + * Returns true if a matching item is found, false otherwise. After + * a false return, it's OK to call this again with another greater TID. + */ +static inline bool +zsbt_scan_next_fetch(ZSBtreeScan *scan, Datum *datum, bool *isnull, zstid tid) +{ + if (!scan->active) + return false; + + /* skip to the given tid. */ + zsbt_scan_skip(scan, tid); + + /* + * Fetch the next item from the scan. The item we're looking for might + * already be in scan->array_*. + */ + do + { + if (tid < scan->nexttid) + { + /* The next item from this scan is beyond the TID we're looking for. */ + return false; + } + + if (scan->array_elements_left > 0) + { + *isnull = scan->array_isnull; + *datum = *(scan->array_next_datum++); + scan->nexttid++; + scan->array_elements_left--; + return true; + } + /* Advance the scan, and check again. 
*/ + } while (zsbt_attr_scan_next(scan)); + + return false; +} + +extern PGDLLIMPORT const TupleTableSlotOps TTSOpsZedstore; + +/* prototypes for functions in zedstore_meta.c */ +extern void zsmeta_initmetapage(Relation rel); +extern BlockNumber zsmeta_get_root_for_attribute(Relation rel, AttrNumber attno, bool for_update); +extern void zsmeta_update_root_for_attribute(Relation rel, AttrNumber attno, Buffer metabuf, BlockNumber rootblk); +extern void zsmeta_add_root_for_new_attributes(Relation rel, Page page); + +/* prototypes for functions in zedstore_visibility.c */ +extern TM_Result zs_SatisfiesUpdate(Relation rel, Snapshot snapshot, + ZSUndoRecPtr recent_oldest_undo, ZSBtreeItem *item, + LockTupleMode mode, + bool *undo_record_needed, + TM_FailureData *tmfd, zstid *next_tid); +extern bool zs_SatisfiesVisibility(ZSBtreeScan *scan, ZSBtreeItem *item, + TransactionId *obsoleting_xid, zstid *next_tid); + +/* prototypes for functions in zedstore_toast.c */ +extern Datum zedstore_toast_datum(Relation rel, AttrNumber attno, Datum value); +extern void zedstore_toast_finish(Relation rel, AttrNumber attno, Datum toasted, zstid tid); +extern Datum zedstore_toast_flatten(Relation rel, AttrNumber attno, zstid tid, Datum toasted); + +/* prototypes for functions in zedstore_freepagemap.c */ +extern Buffer zspage_getnewbuf(Relation rel, Buffer metabuf); +extern Buffer zspage_extendrel_newbuf(Relation rel); +extern void zspage_delete_page(Relation rel, Buffer buf); + +/* prototypes for functions in zedstore_utils.c */ +extern zs_split_stack *zs_new_split_stack_entry(Buffer buf, Page page); +extern void zs_apply_split_changes(Relation rel, zs_split_stack *stack); + +#endif /* ZEDSTORE_INTERNAL_H */ diff --git a/src/include/access/zedstore_undo.h b/src/include/access/zedstore_undo.h new file mode 100644 index 0000000000..2b0c5406a6 --- /dev/null +++ b/src/include/access/zedstore_undo.h @@ -0,0 +1,171 @@ +/* + * zedstore_undo.h + * internal declarations for ZedStore undo logging + * + * Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/access/zedstore_undo.h + */ +#ifndef ZEDSTORE_UNDO_H +#define ZEDSTORE_UNDO_H + +#include "commands/vacuum.h" +#include "utils/relcache.h" + +/* this must match the definition in zedstore_internal.h */ +typedef uint64 zstid; + +/* + * An UNDO-pointer. + * + * In the "real" UNDO-logging work from EDB, an UndoRecPtr is only 64 bits. + * But we make life easier for us, by encoding more information in it. + * + * 'counter' is a number that's incremented every time a new undo record is + * created. It can be used to determine if an undo pointer is too old to be + * of interest to anyone. + * + * 'blkno' and 'offset' are the physical location of the UNDO record. They + * can be used to easily fetch a given record. + */ +typedef struct +{ + uint64 counter; + BlockNumber blkno; + int32 offset; +} ZSUndoRecPtr; + +/* TODO: assert that blkno and offset match, too, if counter matches */ +#define ZSUndoRecPtrEquals(a, b) ((a).counter == (b).counter) + +#define INVALID_SPECULATIVE_TOKEN 0 + +typedef struct +{ + int16 size; /* size of this record, including header */ + uint8 type; /* ZSUNDO_TYPE_* */ + ZSUndoRecPtr undorecptr; + TransactionId xid; + CommandId cid; + zstid tid; + uint32 speculative_token; /* Only used for INSERT records */ + + /* + * UNDO-record of the inserter. This is needed if a row is inserted, and + * deleted, and there are some snapshots active don't don't consider even + * the insertion as visible. 
+ * + * This is also used in Insert records, if the record represents the + * new tuple version of an UPDATE, rather than an INSERT. It's needed to + * dig into possible KEY SHARE locks held on the row, which didn't prevent + * the tuple from being updated. + */ + ZSUndoRecPtr prevundorec; +} ZSUndoRec; + +#define ZSUNDO_TYPE_INSERT 1 +#define ZSUNDO_TYPE_DELETE 2 +#define ZSUNDO_TYPE_UPDATE 3 +#define ZSUNDO_TYPE_TUPLE_LOCK 4 + +/* + * Type-specific record formats. + * + * We store similar info as zheap for INSERT/UPDATE/DELETE. See zheap README. + */ +typedef struct +{ + ZSUndoRec rec; + zstid endtid; /* inclusive */ + +} ZSUndoRec_Insert; + +typedef struct +{ + ZSUndoRec rec; + + bool changedPart; /* tuple was moved to a different partition by UPDATE */ + + /* + * TODO: It might be good to move the deleted tuple to the undo-log, so + * that the space can immediately be reused. But currently, we don't do + * that. (or even better, move the old tuple to the undo-log lazily, if + * the space is needed for a new insertion, before the old tuple becomes + * recyclable. + */ +} ZSUndoRec_Delete; + +/* + * This is used for an UPDATE, to mark the old tuple version as updated. + * It's the same as a deletion, except this stores the TID of the new tuple + * version, so it can be followed in READ COMMITTED mode. + * + * The ZSUndoRec_Insert record is used for the insertion of the new tuple + * version. + */ +typedef struct +{ + ZSUndoRec rec; + + bool key_update; /* were key columns updated? + * (for conflicting with FOR KEY SHARE) */ + + zstid newtid; + +} ZSUndoRec_Update; + +/* + * This is used when a tuple is locked e.g. with SELECT FOR UPDATE. + * The tuple isn't really changed in any way, but the undo record gives + * a place to store the XID of the locking transaction. + * + * In case of a FOR SHARE lock, there can be multiple lockers. Each locker + * will create a new undo record with its own XID that points to the previous + * record. So the records will form a chain, leading finally to the insertion + * record (or beyond the UNDO horizon, meaning the tuple's insertion is visible + * to everyone) + */ +typedef struct +{ + ZSUndoRec rec; + + /* + * XXX: Is it OK to store this on disk? The enum values could change. Then + * again, no one should care about old locks that were acquired before + * last restart. Except with two-phase commit prepared transactions. 
+ */ + LockTupleMode lockmode; +} ZSUndoRec_TupleLock; + +typedef struct +{ + BlockNumber next; + uint16 padding; /* padding, to put zs_page_id last */ + uint16 zs_page_id; /* ZS_UNDO_PAGE_ID */ +} ZSUndoPageOpaque; + +static inline void +ZSUndoRecPtrInitialize(ZSUndoRecPtr *uptr) +{ + uptr->blkno = InvalidBlockNumber; + uptr->offset = InvalidOffsetNumber; + uptr->counter = 0; +} + +static inline bool +IsZSUndoRecPtrValid(ZSUndoRecPtr *uptr) +{ + return (uptr->blkno != InvalidBlockNumber && + uptr->offset != InvalidOffsetNumber); +} + +/* prototypes for functions in zstore_undo.c */ +extern ZSUndoRecPtr zsundo_insert(Relation rel, ZSUndoRec *rec); +extern ZSUndoRec *zsundo_fetch(Relation rel, ZSUndoRecPtr undorecptr); +extern void zsundo_clear_speculative_token(Relation rel, ZSUndoRecPtr undoptr); +extern void zsundo_vacuum(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy, + TransactionId OldestXmin); +extern ZSUndoRecPtr zsundo_get_oldest_undo_ptr(Relation rel); + +#endif /* ZEDSTORE_UNDO_H */ diff --git a/src/include/catalog/pg_am.dat b/src/include/catalog/pg_am.dat index 393b41dd68..f370f63460 100644 --- a/src/include/catalog/pg_am.dat +++ b/src/include/catalog/pg_am.dat @@ -33,5 +33,8 @@ { oid => '3580', oid_symbol => 'BRIN_AM_OID', descr => 'block range index (BRIN) access method', amname => 'brin', amhandler => 'brinhandler', amtype => 'i' }, +{ oid => '6668', oid_symbol => 'ZEDSTORE_TABLE_AM_OID', + descr => 'zedstore table access method', + amname => 'zedstore', amhandler => 'zedstore_tableam_handler', amtype => 't' }, ] diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 87335248a0..1df6febeca 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -873,6 +873,11 @@ proname => 'heap_tableam_handler', provolatile => 'v', prorettype => 'table_am_handler', proargtypes => 'internal', prosrc => 'heap_tableam_handler' }, +{ oid => '6669', oid_symbol => 'ZEDSTORE_TABLE_AM_HANDLER_OID', + descr => 'column-oriented table access method handler', + proname => 'zedstore_tableam_handler', provolatile => 'v', + prorettype => 'table_am_handler', proargtypes => 'internal', + prosrc => 'zedstore_tableam_handler' }, # Index access method handlers { oid => '330', descr => 'btree index access method handler', @@ -10677,4 +10682,23 @@ proname => 'pg_partition_root', prorettype => 'regclass', proargtypes => 'regclass', prosrc => 'pg_partition_root' }, +# zedstore inspection functions +{ oid => '7000', descr => 'get zedstore page type', + proname => 'pg_zs_page_type', prorettype => 'text', + proargtypes => 'regclass int8', prosrc => 'pg_zs_page_type' }, +{ oid => '7001', descr => 'show stats about active zedstore undo pages', + proname => 'pg_zs_undo_pages', prorows => '1000', proretset => 't', + prorettype => 'record', proargtypes => 'regclass', + proallargtypes => '{regclass,int8,int4,int4,int8,int8}', + proargmodes => '{i,o,o,o,o,o}', + proargnames => '{relid,blkno,nrecords,freespace,firstrecptr,lastrecptr}', + prosrc => 'pg_zs_undo_pages' }, +{ oid => '7002', descr => 'show stats about zedstore btree pages', + proname => 'pg_zs_btree_pages', prorows => '1000', proretset => 't', + prorettype => 'record', proargtypes => 'regclass', + proallargtypes => '{regclass,int8,int8,int4,int4,int8,int8,int4,int4,int4,int4,int4}', + proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{relid,blkno,nextblk,attno,level,lokey,hikey,nitems,ncompressed,totalsz,uncompressedsz,freespace}', + prosrc => 'pg_zs_btree_pages' }, + ] 
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 88134bcc71..2317c688e8 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -597,5 +597,6 @@ extern void CheckCmdReplicaIdentity(Relation rel, CmdType cmd); extern void CheckSubscriptionRelkind(char relkind, const char *nspname, const char *relname); - +extern void GetNeededColumnsForNode(Node *expr, bool *mask, int n); +extern bool *GetNeededColumnsForScan(ScanState *scanstate, int ncol); #endif /* EXECUTOR_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 64122bc1e3..cd5b26118b 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1423,6 +1423,7 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; Buffer ioss_VMBuffer; Size ioss_PscanLen; + TupleTableSlot *ioss_TableSlot; } IndexOnlyScanState; /* ---------------- diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 4b7703d478..b413bb9f78 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -681,6 +681,7 @@ typedef struct RelOptInfo PlannerInfo *subroot; /* if subquery */ List *subplan_params; /* if subquery */ int rel_parallel_workers; /* wanted number of parallel workers */ + bool leverage_column_projection; /* Information about foreign tables and foreign joins */ Oid serverid; /* identifies server for the table or join */ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 6cd4cfed0a..ad7870a0bb 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -350,6 +350,9 @@ /* Define to 1 if you have the `ldap_r' library (-lldap_r). */ #undef HAVE_LIBLDAP_R +/* Define to 1 if you have the `lz4' library (-llz4). */ +#undef HAVE_LIBLZ4 + /* Define to 1 if you have the `m' library (-lm). */ #undef HAVE_LIBM @@ -389,6 +392,9 @@ /* Define to 1 if `long long int' works and is 64 bits. */ #undef HAVE_LONG_LONG_INT_64 +/* Define to 1 if you have the header file. */ +#undef HAVE_LZ4_H + /* Define to 1 if you have the header file. */ #undef HAVE_MBARRIER_H @@ -932,6 +938,9 @@ /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */ #undef USE_LLVM +/* Define to 1 to build with LZ4 support. (--with-lz4) */ +#undef USE_LZ4 + /* Define to select named POSIX semaphores. 
*/ #undef USE_NAMED_POSIX_SEMAPHORES diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h index 376245ecd7..866c3a76f9 100644 --- a/src/include/storage/predicate.h +++ b/src/include/storage/predicate.h @@ -58,15 +58,18 @@ extern void RegisterPredicateLockingXid(TransactionId xid); extern void PredicateLockRelation(Relation relation, Snapshot snapshot); extern void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot); extern void PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot); +extern void PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot); extern void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno); extern void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno); extern void TransferPredicateLocksToHeapRelation(Relation relation); extern void ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe); /* conflict detection (may also trigger rollback) */ -extern void CheckForSerializableConflictOut(bool valid, Relation relation, HeapTuple tuple, - Buffer buffer, Snapshot snapshot); -extern void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer); +extern void heap_CheckForSerializableConflictOut(bool valid, Relation relation, HeapTuple tuple, + Buffer buffer, Snapshot snapshot); +extern void CheckForSerializableConflictOut(Relation relation, TransactionId xid, + Snapshot snapshot); +extern void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno); extern void CheckTableForSerializableConflictIn(Relation relation); /* final rollback checking */ diff --git a/src/test/isolation/specs/read-only-anomaly-2.spec b/src/test/isolation/specs/read-only-anomaly-2.spec index 9812f49ee4..2b17fcb521 100644 --- a/src/test/isolation/specs/read-only-anomaly-2.spec +++ b/src/test/isolation/specs/read-only-anomaly-2.spec @@ -18,13 +18,15 @@ teardown } session "s1" -setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; } +setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; + SET enable_seqscan=off; } step "s1ry" { SELECT balance FROM bank_account WHERE id = 'Y'; } step "s1wy" { UPDATE bank_account SET balance = 20 WHERE id = 'Y'; } step "s1c" { COMMIT; } session "s2" -setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; } +setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; + SET enable_seqscan=off; } step "s2rx" { SELECT balance FROM bank_account WHERE id = 'X'; } step "s2ry" { SELECT balance FROM bank_account WHERE id = 'Y'; } step "s2wx" { UPDATE bank_account SET balance = -11 WHERE id = 'X'; } diff --git a/src/test/regress/expected/.gitignore b/src/test/regress/expected/.gitignore index 93c56c85a0..0eb6984372 100644 --- a/src/test/regress/expected/.gitignore +++ b/src/test/regress/expected/.gitignore @@ -5,5 +5,6 @@ /largeobject.out /largeobject_1.out /misc.out +/misc_1.out /security_label.out /tablespace.out diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out new file mode 100644 index 0000000000..09d60af3b7 --- /dev/null +++ b/src/test/regress/expected/alter_table_1.out @@ -0,0 +1,3997 @@ +-- +-- ALTER_TABLE +-- +-- Clean up in case a prior regression run failed +SET client_min_messages TO 'warning'; +DROP ROLE IF EXISTS regress_alter_table_user1; +RESET client_min_messages; +CREATE USER regress_alter_table_user1; +-- +-- add attribute +-- +CREATE TABLE attmp (initial int4); +COMMENT ON TABLE attmp_wrong IS 'table 
comment'; +ERROR: relation "attmp_wrong" does not exist +COMMENT ON TABLE attmp IS 'table comment'; +COMMENT ON TABLE attmp IS NULL; +ALTER TABLE attmp ADD COLUMN xmin integer; -- fails +ERROR: column name "xmin" conflicts with a system column name +ALTER TABLE attmp ADD COLUMN a int4 default 3; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +DROP TABLE attmp; +-- the wolf bug - schema mods caused inconsistent row descriptors +CREATE TABLE attmp ( + initial int4 +); +ALTER TABLE attmp ADD COLUMN a int4; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', 
'(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +CREATE INDEX attmp_idx ON attmp (a, (d + e), b); +ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; +ERROR: column number must be in range from 1 to 32767 +LINE 1: ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; + ^ +ALTER INDEX attmp_idx ALTER COLUMN 1 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "a" of index "attmp_idx" +HINT: Alter statistics on table column instead. +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS 1000; +\d+ attmp_idx + Index "public.attmp_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+------------------+------+------------+---------+-------------- + a | integer | yes | a | plain | + expr | double precision | yes | (d + e) | plain | 1000 + b | cstring | yes | b | plain | +btree, for table "public.attmp" + +ALTER INDEX attmp_idx ALTER COLUMN 3 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "b" of index "attmp_idx" +HINT: Alter statistics on table column instead. 
+ALTER INDEX attmp_idx ALTER COLUMN 4 SET STATISTICS 1000; +ERROR: column number 4 of relation "attmp_idx" does not exist +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS -1; +DROP TABLE attmp; +-- +-- rename - check on both non-temp and temp tables +-- +CREATE TABLE attmp (regtable int); +CREATE TEMP TABLE attmp (attmptable int); +ALTER TABLE attmp RENAME TO attmp_new; +SELECT * FROM attmp; + regtable +---------- +(0 rows) + +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +ALTER TABLE attmp RENAME TO attmp_new2; +SELECT * FROM attmp; -- should fail +ERROR: relation "attmp" does not exist +LINE 1: SELECT * FROM attmp; + ^ +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +SELECT * FROM attmp_new2; + regtable +---------- +(0 rows) + +DROP TABLE attmp_new; +DROP TABLE attmp_new2; +-- check rename of partitioned tables and indexes also +CREATE TABLE part_attmp (a int primary key) partition by range (a); +CREATE TABLE part_attmp1 PARTITION OF part_attmp FOR VALUES FROM (0) TO (100); +ALTER INDEX part_attmp_pkey RENAME TO part_attmp_index; +ALTER INDEX part_attmp1_pkey RENAME TO part_attmp1_index; +ALTER TABLE part_attmp RENAME TO part_at2tmp; +ALTER TABLE part_attmp1 RENAME TO part_at2tmp1; +SET ROLE regress_alter_table_user1; +ALTER INDEX part_attmp_index RENAME TO fail; +ERROR: must be owner of index part_attmp_index +ALTER INDEX part_attmp1_index RENAME TO fail; +ERROR: must be owner of index part_attmp1_index +ALTER TABLE part_at2tmp RENAME TO fail; +ERROR: must be owner of table part_at2tmp +ALTER TABLE part_at2tmp1 RENAME TO fail; +ERROR: must be owner of table part_at2tmp1 +RESET ROLE; +DROP TABLE part_at2tmp; +-- +-- check renaming to a table's array type's autogenerated name +-- (the array type's name should get out of the way) +-- +CREATE TABLE attmp_array (id int); +CREATE TABLE attmp_array2 (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = 'attmp_array2[]'::regtype; + typname +--------------- + _attmp_array2 +(1 row) + +ALTER TABLE attmp_array2 RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +---------------- + ___attmp_array +(1 row) + +DROP TABLE _attmp_array; +DROP TABLE attmp_array; +-- renaming to table's own array type's name is an interesting corner case +CREATE TABLE attmp_array (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +ALTER TABLE attmp_array RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +DROP TABLE _attmp_array; +-- ALTER TABLE ... 
RENAME on non-table relations +-- renaming indexes (FIXME: this should probably test the index's functionality) +ALTER INDEX IF EXISTS __onek_unique1 RENAME TO attmp_onek_unique1; +NOTICE: relation "__onek_unique1" does not exist, skipping +ALTER INDEX IF EXISTS __attmp_onek_unique1 RENAME TO onek_unique1; +NOTICE: relation "__attmp_onek_unique1" does not exist, skipping +ALTER INDEX onek_unique1 RENAME TO attmp_onek_unique1; +ALTER INDEX attmp_onek_unique1 RENAME TO onek_unique1; +SET ROLE regress_alter_table_user1; +ALTER INDEX onek_unique1 RENAME TO fail; -- permission denied +ERROR: must be owner of index onek_unique1 +RESET ROLE; +-- renaming views +CREATE VIEW attmp_view (unique1) AS SELECT unique1 FROM tenk1; +ALTER TABLE attmp_view RENAME TO attmp_view_new; +SET ROLE regress_alter_table_user1; +ALTER VIEW attmp_view_new RENAME TO fail; -- permission denied +ERROR: must be owner of view attmp_view_new +RESET ROLE; +-- hack to ensure we get an indexscan here +set enable_seqscan to off; +set enable_bitmapscan to off; +-- 5 values, sorted +SELECT unique1 FROM tenk1 WHERE unique1 < 5; + unique1 +--------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +reset enable_seqscan; +reset enable_bitmapscan; +DROP VIEW attmp_view_new; +-- toast-like relation name +alter table stud_emp rename to pg_toast_stud_emp; +alter table pg_toast_stud_emp rename to stud_emp; +-- renaming index should rename constraint as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +ALTER INDEX onek_unique1_constraint RENAME TO onek_unique1_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraint +ALTER TABLE onek ADD CONSTRAINT onek_check_constraint CHECK (unique1 >= 0); +ALTER TABLE onek RENAME CONSTRAINT onek_check_constraint TO onek_check_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_check_constraint_foo; +-- renaming constraint should rename index as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +DROP INDEX onek_unique1_constraint; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint because constraint onek_unique1_constraint on table onek requires it +HINT: You can drop constraint onek_unique1_constraint on table onek instead. +ALTER TABLE onek RENAME CONSTRAINT onek_unique1_constraint TO onek_unique1_constraint_foo; +DROP INDEX onek_unique1_constraint_foo; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint_foo because constraint onek_unique1_constraint_foo on table onek requires it +HINT: You can drop constraint onek_unique1_constraint_foo on table onek instead. +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraints vs. 
inheritance +CREATE TABLE constraint_rename_test (a int CONSTRAINT con1 CHECK (a > 0), b int, c int); +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1" CHECK (a > 0) + +CREATE TABLE constraint_rename_test2 (a int CONSTRAINT con1 CHECK (a > 0), d int) INHERITS (constraint_rename_test); +NOTICE: merging column "a" with inherited definition +NOTICE: merging constraint "con1" with inherited definition +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test2 RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: cannot rename inherited constraint "con1" +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: inherited constraint "con1" must be renamed in child tables too +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con2 CHECK (b > 0) NO INHERIT; +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con2 TO con2foo; -- ok +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con2foo TO con2bar; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con3 PRIMARY KEY (a); +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con3 TO con3foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | +Indexes: + "con3foo" PRIMARY KEY, btree (a) +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +DROP TABLE constraint_rename_test2; +DROP TABLE constraint_rename_test; +ALTER TABLE IF EXISTS constraint_not_exist RENAME CONSTRAINT con3 TO con3foo; -- ok +NOTICE: relation "constraint_not_exist" does not exist, skipping +ALTER TABLE IF EXISTS constraint_rename_test ADD CONSTRAINT con4 UNIQUE (a); +NOTICE: relation "constraint_rename_test" does not exist, skipping +-- renaming constraints with cache reset of target relation +CREATE TABLE constraint_rename_cache (a int, + CONSTRAINT chk_a CHECK (a > 0), + PRIMARY KEY (a)); +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT chk_a TO chk_a_new; +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT constraint_rename_cache_pkey TO constraint_rename_pkey_new; +CREATE TABLE like_constraint_rename_cache + (LIKE constraint_rename_cache INCLUDING ALL); +\d like_constraint_rename_cache + Table "public.like_constraint_rename_cache" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | +Indexes: + "like_constraint_rename_cache_pkey" PRIMARY KEY, btree (a) +Check constraints: + "chk_a_new" CHECK (a > 0) + +DROP TABLE constraint_rename_cache; +DROP TABLE like_constraint_rename_cache; +-- FOREIGN KEY CONSTRAINT adding TEST +CREATE TABLE attmp2 (a int primary key); +CREATE TABLE attmp3 (a int, b int); +CREATE TABLE attmp4 (a int, b int, unique(a,b)); +CREATE TABLE attmp5 (a int, b int); +-- Insert rows into attmp2 (pktable) +INSERT INTO attmp2 values (1); +INSERT INTO attmp2 values (2); +INSERT INTO attmp2 values (3); +INSERT INTO attmp2 values (4); +-- Insert rows into attmp3 +INSERT INTO attmp3 values (1,10); +INSERT INTO attmp3 values (1,20); +INSERT INTO attmp3 values (5,50); +-- Try (and fail) to add constraint due to invalid source columns +ALTER TABLE attmp3 add constraint attmpconstr foreign key(c) references attmp2 match full; +ERROR: column "c" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid destination columns explicitly given +ALTER TABLE attmp3 add constraint attmpconstr foreign key(a) references attmp2(b) match full; +ERROR: column "b" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid data +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". +-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ALTER TABLE attmp3 drop constraint attmpconstr; +INSERT INTO attmp3 values (5,50); +-- Try NOT VALID and then VALIDATE CONSTRAINT, but fails. Delete failure then re-validate +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full NOT VALID; +ALTER TABLE attmp3 validate constraint attmpconstr; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". 
+-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) and repeat to show it works on already valid constraint +ALTER TABLE attmp3 validate constraint attmpconstr; +ALTER TABLE attmp3 validate constraint attmpconstr; +-- Try a non-verified CHECK constraint +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10); -- fail +ERROR: check constraint "b_greater_than_ten" is violated by some row +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10) NOT VALID; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- fails +ERROR: check constraint "b_greater_than_ten" is violated by some row +DELETE FROM attmp3 WHERE NOT b > 10; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +-- Test inherited NOT VALID CHECK constraints +select * from attmp3; + a | b +---+---- + 1 | 20 +(1 row) + +CREATE TABLE attmp6 () INHERITS (attmp3); +CREATE TABLE attmp7 () INHERITS (attmp3); +INSERT INTO attmp6 VALUES (6, 30), (7, 16); +ALTER TABLE attmp3 ADD CONSTRAINT b_le_20 CHECK (b <= 20) NOT VALID; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- fails +ERROR: check constraint "b_le_20" is violated by some row +DELETE FROM attmp6 WHERE b > 20; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- succeeds +-- An already validated constraint must not be revalidated +CREATE FUNCTION boo(int) RETURNS int IMMUTABLE STRICT LANGUAGE plpgsql AS $$ BEGIN RAISE NOTICE 'boo: %', $1; RETURN $1; END; $$; +INSERT INTO attmp7 VALUES (8, 18); +ALTER TABLE attmp7 ADD CONSTRAINT identity CHECK (b = boo(b)); +NOTICE: boo: 18 +ALTER TABLE attmp3 ADD CONSTRAINT IDENTITY check (b = boo(b)) NOT VALID; +NOTICE: merging constraint "identity" with inherited definition +ALTER TABLE attmp3 VALIDATE CONSTRAINT identity; +NOTICE: boo: 16 +NOTICE: boo: 20 +-- A NO INHERIT constraint should not be looked for in children during VALIDATE CONSTRAINT +create table parent_noinh_convalid (a int); +create table child_noinh_convalid () inherits (parent_noinh_convalid); +insert into parent_noinh_convalid values (1); +insert into child_noinh_convalid values (1); +alter table parent_noinh_convalid add constraint check_a_is_2 check (a = 2) no inherit not valid; +-- fail, because of the row in parent +alter table parent_noinh_convalid validate constraint check_a_is_2; +ERROR: check constraint "check_a_is_2" is violated by some row +delete from only parent_noinh_convalid; +-- ok (parent itself contains no violating rows) +alter table parent_noinh_convalid validate constraint check_a_is_2; +select convalidated from pg_constraint where conrelid = 'parent_noinh_convalid'::regclass and conname = 'check_a_is_2'; + convalidated +-------------- + t +(1 row) + +-- cleanup +drop table parent_noinh_convalid, child_noinh_convalid; +-- Try (and fail) to create constraint from attmp5(a) to attmp4(a) - unique constraint on +-- attmp4 is a,b +ALTER TABLE attmp5 add constraint attmpconstr foreign key(a) references attmp4(a) match full; +ERROR: there is no unique constraint matching given keys for referenced table "attmp4" +DROP TABLE attmp7; +DROP TABLE attmp6; +DROP TABLE attmp5; +DROP TABLE attmp4; +DROP TABLE attmp3; +DROP TABLE attmp2; +-- NOT VALID with plan invalidation -- ensure we don't use a constraint for +-- exclusion until validated +set constraint_exclusion TO 'partition'; +create table nv_parent (d date, check (false) no inherit not valid); +-- not valid constraint added at creation time should 
automatically become valid +\d nv_parent + Table "public.nv_parent" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_parent_check" CHECK (false) NO INHERIT + +create table nv_child_2010 () inherits (nv_parent); +create table nv_child_2011 () inherits (nv_parent); +alter table nv_child_2010 add check (d between '2010-01-01'::date and '2010-12-31'::date) not valid; +alter table nv_child_2011 add check (d between '2011-01-01'::date and '2011-12-31'::date) not valid; +explain (costs off) select * from nv_parent where d between '2011-08-01' and '2011-08-31'; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +create table nv_child_2009 (check (d between '2009-01-01'::date and '2009-12-31'::date)) inherits (nv_parent); +explain (costs off) select * from nv_parent where d between '2011-08-01'::date and '2011-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2011 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(9 rows) + +-- after validation, the constraint should be used +alter table nv_child_2011 VALIDATE CONSTRAINT nv_child_2011_d_check; +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(7 rows) + +-- add an inherited NOT VALID constraint +alter table nv_parent add check (d between '2001-01-01'::date and '2099-12-31'::date) not valid; +\d nv_child_2009 + Table "public.nv_child_2009" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_child_2009_d_check" CHECK (d >= '01-01-2009'::date AND d <= '12-31-2009'::date) + "nv_parent_d_check" CHECK (d >= '01-01-2001'::date AND d <= '12-31-2099'::date) NOT VALID +Inherits: nv_parent + +-- we leave nv_parent and children around to help test pg_dump logic +-- Foreign key adding test with mixed types 
+-- Note: these tables are TEMP to avoid name conflicts when this test +-- is run in parallel with foreign_key.sql. +CREATE TEMP TABLE PKTABLE (ptest1 int PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 inet); +-- This next should fail, because int=inet does not exist +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +-- This should also fail for the same reason, but here we +-- give the column name +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable(ptest1); +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +-- This should succeed, even though they are different types, +-- because int=int8 exists and is a member of the integer opfamily +CREATE TEMP TABLE FKTABLE (ftest1 int8); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +-- This should fail, because we'd have to cast numeric to int which is +-- not an implicit coercion (or use numeric=numeric, but that's not part +-- of the integer opfamily) +CREATE TEMP TABLE FKTABLE (ftest1 numeric); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: numeric and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- On the other hand, this should work because int implicitly promotes to +-- numeric, and we allow promotion on the FK side +CREATE TEMP TABLE PKTABLE (ptest1 numeric PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +CREATE TEMP TABLE PKTABLE (ptest1 int, ptest2 inet, + PRIMARY KEY(ptest1, ptest2)); +-- This should fail, because we just chose really odd types +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) references pktable; +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- Again, so should this... +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. 
+DROP TABLE FKTABLE; +-- This fails because we mixed up the column ordering +CREATE TEMP TABLE FKTABLE (ftest1 int, ftest2 inet); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest2, ptest1); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest2" are of incompatible types: integer and inet. +-- As does this... +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest2, ftest1) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest2_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest2" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- Test that ALTER CONSTRAINT updates trigger deferrability properly +CREATE TEMP TABLE PKTABLE (ptest1 int primary key); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD CONSTRAINT fknd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY IMMEDIATE; +ALTER TABLE FKTABLE ADD CONSTRAINT fknd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ALTER CONSTRAINT fknd2 NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdd2 DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdi2 DEFERRABLE INITIALLY IMMEDIATE; +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'pktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+------------------------+--------+--------------+---------------- + fkdd | "RI_FKey_cascade_del" | 9 | f | f + fkdd | "RI_FKey_noaction_upd" | 17 | t | t + fkdd2 | "RI_FKey_cascade_del" | 9 | f | f + fkdd2 | "RI_FKey_noaction_upd" | 17 | t | t + fkdi | "RI_FKey_cascade_del" | 9 | f | f + fkdi | "RI_FKey_noaction_upd" | 17 | t | f + fkdi2 | "RI_FKey_cascade_del" | 9 | f | f + fkdi2 | "RI_FKey_noaction_upd" | 17 | t | f + fknd | "RI_FKey_cascade_del" | 9 | f | f + fknd | "RI_FKey_noaction_upd" | 17 | f | f + fknd2 | "RI_FKey_cascade_del" | 9 | f | f + fknd2 | "RI_FKey_noaction_upd" | 17 | f | f +(12 rows) + +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'fktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+---------------------+--------+--------------+---------------- + fkdd | "RI_FKey_check_ins" | 5 | t | t + fkdd | "RI_FKey_check_upd" | 17 | t | t + fkdd2 | "RI_FKey_check_ins" | 5 | t | t + fkdd2 | "RI_FKey_check_upd" | 17 | t | t + fkdi | "RI_FKey_check_ins" | 5 | t | f + fkdi | "RI_FKey_check_upd" | 17 | t | f + fkdi2 | "RI_FKey_check_ins" | 5 | t | f + fkdi2 | "RI_FKey_check_upd" | 17 | t | f + fknd | "RI_FKey_check_ins" | 5 | f | f + fknd | 
"RI_FKey_check_upd" | 17 | f | f + fknd2 | "RI_FKey_check_ins" | 5 | f | f + fknd2 | "RI_FKey_check_upd" | 17 | f | f +(12 rows) + +-- temp tables should go away by themselves, need not drop them. +-- test check constraint adding +create table atacc1 ( test int ); +-- add a check constraint +alter table atacc1 add constraint atacc_test1 check (test>3); +-- should fail +insert into atacc1 (test) values (2); +ERROR: new row for relation "atacc1" violates check constraint "atacc_test1" +DETAIL: Failing row contains (2). +-- should succeed +insert into atacc1 (test) values (4); +drop table atacc1; +-- let's do one where the check fails when added +create table atacc1 ( test int ); +-- insert a soon to be failing row +insert into atacc1 (test) values (2); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test>3); +ERROR: check constraint "atacc_test1" is violated by some row +insert into atacc1 (test) values (4); +drop table atacc1; +-- let's do one where the check fails because the column doesn't exist +create table atacc1 ( test int ); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test1>3); +ERROR: column "test1" does not exist +HINT: Perhaps you meant to reference the column "atacc1.test". +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int, test3 int); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test+test23), test2 int); +alter table atacc1 add check (test2>test); +-- should fail for $2 +insert into atacc1 (test2, test) values (3, 4); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_check" +DETAIL: Failing row contains (4, 3). +drop table atacc1; +-- inheritance related tests +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc2 add constraint foo check (test2>0); +-- fail and then succeed on atacc2 +insert into atacc2 (test2) values (-3); +ERROR: new row for relation "atacc2" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc2 (test2) values (3); +-- fail and then succeed on atacc3 +insert into atacc3 (test2) values (-3); +ERROR: new row for relation "atacc3" violates check constraint "foo" +DETAIL: Failing row contains (null, -3, null). 
+insert into atacc3 (test2) values (3); +drop table atacc3; +drop table atacc2; +drop table atacc1; +-- same things with one created with INHERIT +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc3 no inherit atacc2; +-- fail +alter table atacc3 no inherit atacc2; +ERROR: relation "atacc2" is not a parent of relation "atacc3" +-- make sure it really isn't a child +insert into atacc3 (test2) values (3); +select test2 from atacc2; + test2 +------- +(0 rows) + +-- fail due to missing constraint +alter table atacc2 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +ERROR: child table is missing constraint "foo" +-- fail due to missing column +alter table atacc3 rename test2 to testx; +alter table atacc3 inherit atacc2; +ERROR: child table is missing column "test2" +-- fail due to mismatched data type +alter table atacc3 add test2 bool; +alter table atacc3 inherit atacc2; +ERROR: child table "atacc3" has different type for column "test2" +alter table atacc3 drop test2; +-- succeed +alter table atacc3 add test2 int; +update atacc3 set test2 = 4 where test2 is null; +alter table atacc3 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +-- fail due to duplicates and circular inheritance +alter table atacc3 inherit atacc2; +ERROR: relation "atacc2" would be inherited from more than once +alter table atacc2 inherit atacc3; +ERROR: circular inheritance not allowed +DETAIL: "atacc3" is already a child of "atacc2". +alter table atacc2 inherit atacc2; +ERROR: circular inheritance not allowed +DETAIL: "atacc2" is already a child of "atacc2". +-- test that we really are a child now (should see 4 not 3 and cascade should go through) +select test2 from atacc2; + test2 +------- + 4 +(1 row) + +drop table atacc2 cascade; +NOTICE: drop cascades to table atacc3 +drop table atacc1; +-- adding only to a parent is allowed as of 9.2 +create table atacc1 (test int); +create table atacc2 (test2 int) inherits (atacc1); +-- ok: +alter table atacc1 add constraint foo check (test>0) no inherit; +-- check constraint is not there on child +insert into atacc2 (test) values (-3); +-- check constraint is there on parent +insert into atacc1 (test) values (-3); +ERROR: new row for relation "atacc1" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc1 (test) values (3); +-- fail, violating row: +alter table atacc2 add constraint foo check (test>0) no inherit; +ERROR: check constraint "foo" is violated by some row +drop table atacc2; +drop table atacc1; +-- test unique constraint adding +create table atacc1 ( test int ) ; +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- try to create duplicates via alter table using - should fail +alter table atacc1 alter column test type integer using 0; +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(0) is duplicated. 
+drop table atacc1; +-- let's do one where the unique constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the unique constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test1); +ERROR: column "test1" named in key does not exist +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test, test2); +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, unique(test)); +alter table atacc1 add unique (test2); +-- should fail for @@ second one @@ +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_test_key" +DETAIL: Key (test)=(3) already exists. +drop table atacc1; +-- test primary key constraint adding +create table atacc1 ( id serial, test int) ; +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- inserting NULL should fail +insert into atacc1 (test) values(NULL); +ERROR: null value in column "test" violates not-null constraint +DETAIL: Failing row contains (4, null). +-- try adding a second primary key (should fail) +alter table atacc1 add constraint atacc_oid1 primary key(id); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- drop first primary key constraint +alter table atacc1 drop constraint atacc_test1 restrict; +-- try adding a primary key on oid (should succeed) +alter table atacc1 add constraint atacc_oid1 primary key(id); +drop table atacc1; +-- let's do one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. 
+insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do another one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing row +insert into atacc1 (test) values (NULL); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: column "test" contains null values +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the primary key constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a primary key constraint (fails) +alter table atacc1 add constraint atacc_test1 primary key (test1); +ERROR: column "test1" of relation "atacc1" does not exist +drop table atacc1; +-- adding a new column as primary key to a non-empty table. +-- should fail unless the column has a non-null default value. +create table atacc1 ( test int ); +insert into atacc1 (test) values (0); +-- add a primary key column without a default (fails). +alter table atacc1 add column test2 int primary key; +ERROR: column "test2" contains null values +-- now add a primary key column with a default (succeeds). +alter table atacc1 add column test2 int default 0 primary key; +drop table atacc1; +-- this combination used to have order-of-execution problems (bug #15580) +create table atacc1 (a int); +insert into atacc1 values(1); +alter table atacc1 + add column b float8 not null default random(), + add primary key(a); +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test, test2); +-- try adding a second primary key - should fail +alter table atacc1 add constraint atacc_test2 primary key (test); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +insert into atacc1 (test,test2) values (NULL,3); +ERROR: null value in column "test" violates not-null constraint +DETAIL: Failing row contains (null, 3). +insert into atacc1 (test,test2) values (3, NULL); +ERROR: null value in column "test2" violates not-null constraint +DETAIL: Failing row contains (3, null). +insert into atacc1 (test,test2) values (NULL,NULL); +ERROR: null value in column "test" violates not-null constraint +DETAIL: Failing row contains (null, null). +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, primary key(test)); +-- only first should succeed +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_pkey" +DETAIL: Key (test)=(3) already exists. +insert into atacc1 (test2, test) values (1, NULL); +ERROR: null value in column "test" violates not-null constraint +DETAIL: Failing row contains (null, 1). 
+drop table atacc1; +-- alter table / alter column [set/drop] not null tests +-- try altering system catalogs, should fail +alter table pg_class alter column relname drop not null; +ERROR: permission denied: "pg_class" is a system catalog +alter table pg_class alter relname set not null; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table non_existent alter column bar set not null; +ERROR: relation "non_existent" does not exist +alter table non_existent alter column bar drop not null; +ERROR: relation "non_existent" does not exist +-- test setting columns to null and not null and vice versa +-- test checking for null values and primary key +create table atacc1 (test int not null); +alter table atacc1 add constraint "atacc1_pkey" primary key (test); +alter table atacc1 alter column test drop not null; +ERROR: column "test" is in a primary key +alter table atacc1 drop constraint "atacc1_pkey"; +alter table atacc1 alter column test drop not null; +insert into atacc1 values (null); +alter table atacc1 alter test set not null; +ERROR: column "test" contains null values +delete from atacc1; +alter table atacc1 alter test set not null; +-- try altering a non-existent column, should fail +alter table atacc1 alter bar set not null; +ERROR: column "bar" of relation "atacc1" does not exist +alter table atacc1 alter bar drop not null; +ERROR: column "bar" of relation "atacc1" does not exist +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +alter table myview alter column test drop not null; +ERROR: "myview" is not a table or foreign table +alter table myview alter column test set not null; +ERROR: "myview" is not a table or foreign table +drop view myview; +drop table atacc1; +-- set not null verified by constraints +create table atacc1 (test_a int, test_b int); +insert into atacc1 values (null, 1); +-- constraint not cover all values, should fail +alter table atacc1 add constraint atacc1_constr_or check(test_a is not null or test_b < 10); +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" contains null values +alter table atacc1 drop constraint atacc1_constr_or; +-- not valid constraint, should fail +alter table atacc1 add constraint atacc1_constr_invalid check(test_a is not null) not valid; +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" contains null values +alter table atacc1 drop constraint atacc1_constr_invalid; +-- with valid constraint +update atacc1 set test_a = 1; +alter table atacc1 add constraint atacc1_constr_a_valid check(test_a is not null); +alter table atacc1 alter test_a set not null; +delete from atacc1; +insert into atacc1 values (2, null); +alter table atacc1 alter test_a drop not null; +-- test multiple set not null at same time +-- test_a checked by atacc1_constr_a_valid, test_b should fail by table scan +alter table atacc1 alter test_a set not null, alter test_b set not null; +ERROR: column "test_b" contains null values +-- commands order has no importance +alter table atacc1 alter test_b set not null, alter test_a set not null; +ERROR: column "test_b" contains null values +-- valid one by table scan, one by check constraints +update atacc1 set test_b = 1; +alter table atacc1 alter test_b set not null, alter test_a set not null; +alter table atacc1 alter test_a drop not null, alter test_b drop not null; +-- both column has check constraints +alter table atacc1 add constraint atacc1_constr_b_valid check(test_b is not 
null); +alter table atacc1 alter test_b set not null, alter test_a set not null; +drop table atacc1; +-- test inheritance +create table parent (a int); +create table child (b varchar(255)) inherits (parent); +alter table parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null). +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null, foo). +alter table parent alter a drop not null; +insert into parent values (NULL); +insert into child (a, b) values (NULL, 'foo'); +alter table only parent alter a set not null; +ERROR: column "a" contains null values +alter table child alter a set not null; +ERROR: column "a" contains null values +delete from parent; +alter table only parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null). +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null, foo). +delete from child; +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null, foo). +drop table child; +drop table parent; +-- test setting and removing default values +create table def_test ( + c1 int4 default 5, + c2 text default 'initial_default' +); +insert into def_test default values; +alter table def_test alter column c1 drop default; +insert into def_test default values; +alter table def_test alter column c2 drop default; +insert into def_test default values; +alter table def_test alter column c1 set default 10; +alter table def_test alter column c2 set default 'new_default'; +insert into def_test default values; +select * from def_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default +(4 rows) + +-- set defaults to an incorrect type: this should fail +alter table def_test alter column c1 set default 'wrong_datatype'; +ERROR: invalid input syntax for type integer: "wrong_datatype" +alter table def_test alter column c2 set default 20; +-- set defaults on a non-existent column: this should fail +alter table def_test alter column c3 set default 30; +ERROR: column "c3" of relation "def_test" does not exist +-- set defaults on views: we need to create a view, add a rule +-- to allow insertions into it, and then alter the view to add +-- a default +create view def_view_test as select * from def_test; +create rule def_view_test_ins as + on insert to def_view_test + do instead insert into def_test select new.*; +insert into def_view_test default values; +alter table def_view_test alter column c1 set default 45; +insert into def_view_test default values; +alter table def_view_test alter column c2 set default 'view_default'; +insert into def_view_test default values; +select * from def_view_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default + | + 45 | + 45 | view_default +(7 rows) + +drop rule def_view_test_ins on def_view_test; +drop view def_view_test; +drop table def_test; +-- alter table / drop column tests +-- try altering system catalogs, should fail +alter table pg_class drop column relname; +ERROR: permission denied: "pg_class" is a system catalog 
+-- try altering non-existent table, should fail +alter table nosuchtable drop column bar; +ERROR: relation "nosuchtable" does not exist +-- test dropping columns +create table atacc1 (a int4 not null, b int4, c int4 not null, d int4); +insert into atacc1 values (1, 2, 3, 4); +alter table atacc1 drop a; +alter table atacc1 drop a; +ERROR: column "a" of relation "atacc1" does not exist +-- SELECTs +select * from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select * from atacc1 order by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 order by a; + ^ +select * from atacc1 order by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 order by "........pg.dropped.1........"... + ^ +select * from atacc1 group by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 group by a; + ^ +select * from atacc1 group by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 group by "........pg.dropped.1........"... + ^ +select atacc1.* from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a from atacc1; +ERROR: column "a" does not exist +LINE 1: select a from atacc1; + ^ +select atacc1.a from atacc1; +ERROR: column atacc1.a does not exist +LINE 1: select atacc1.a from atacc1; + ^ +select b,c,d from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a,b,c,d from atacc1; +ERROR: column "a" does not exist +LINE 1: select a,b,c,d from atacc1; + ^ +select * from atacc1 where a = 1; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 where a = 1; + ^ +select "........pg.dropped.1........" from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........" from atacc1; + ^ +select atacc1."........pg.dropped.1........" from atacc1; +ERROR: column atacc1.........pg.dropped.1........ does not exist +LINE 1: select atacc1."........pg.dropped.1........" from atacc1; + ^ +select "........pg.dropped.1........",b,c,d from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........",b,c,d from atacc1; + ^ +select * from atacc1 where "........pg.dropped.1........" = 1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 where "........pg.dropped.1........" = ... + ^ +-- UPDATEs +update atacc1 set a = 3; +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: update atacc1 set a = 3; + ^ +update atacc1 set b = 2 where a = 3; +ERROR: column "a" does not exist +LINE 1: update atacc1 set b = 2 where a = 3; + ^ +update atacc1 set "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: update atacc1 set "........pg.dropped.1........" = 3; + ^ +update atacc1 set b = 2 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: update atacc1 set b = 2 where "........pg.dropped.1........"... 
+ ^ +-- INSERTs +insert into atacc1 values (10, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (10, 11, 12, 13); + ^ +insert into atacc1 values (default, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (default, 11, 12, 13); + ^ +insert into atacc1 values (11, 12, 13); +insert into atacc1 (a) values (10); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (10); + ^ +insert into atacc1 (a) values (default); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (default); + ^ +insert into atacc1 (a,b,c,d) values (10,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (10,11,12,13); + ^ +insert into atacc1 (a,b,c,d) values (default,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (default,11,12,13); + ^ +insert into atacc1 (b,c,d) values (11,12,13); +insert into atacc1 ("........pg.dropped.1........") values (10); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........") values (default); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (10,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (default,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +-- DELETEs +delete from atacc1 where a = 3; +ERROR: column "a" does not exist +LINE 1: delete from atacc1 where a = 3; + ^ +delete from atacc1 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: delete from atacc1 where "........pg.dropped.1........" = 3; + ^ +delete from atacc1; +-- try dropping a non-existent column, should fail +alter table atacc1 drop bar; +ERROR: column "bar" of relation "atacc1" does not exist +-- try removing an oid column, should succeed (as it's nonexistant) +alter table atacc1 SET WITHOUT OIDS; +-- try adding an oid column, should fail (not supported) +alter table atacc1 SET WITH OIDS; +ERROR: syntax error at or near "WITH" +LINE 1: alter table atacc1 SET WITH OIDS; + ^ +-- try dropping the xmin column, should fail +alter table atacc1 drop xmin; +ERROR: cannot drop system column "xmin" +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +select * from myview; + b | c | d +---+---+--- +(0 rows) + +alter table myview drop d; +ERROR: "myview" is not a table, composite type, or foreign table +drop view myview; +-- test some commands to make sure they fail on the dropped column +analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +vacuum analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +vacuum analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +comment on column atacc1.a is 'testing'; +ERROR: column "a" of relation "atacc1" does not exist +comment on column atacc1."........pg.dropped.1........" is 'testing'; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set storage plain; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set storage plain; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set statistics 0; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set statistics 0; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set default 3; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set default 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop default; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop default; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 rename a to x; +ERROR: column "a" does not exist +alter table atacc1 rename "........pg.dropped.1........" to x; +ERROR: column "........pg.dropped.1........" does not exist +alter table atacc1 add primary key(a); +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 add primary key("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 add unique(a); +ERROR: column "a" named in key does not exist +alter table atacc1 add unique("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" named in key does not exist +alter table atacc1 add check (a > 3); +ERROR: column "a" does not exist +alter table atacc1 add check ("........pg.dropped.1........" > 3); +ERROR: column "........pg.dropped.1........" does not exist +create table atacc2 (id int4 unique); +alter table atacc1 add foreign key (a) references atacc2(id); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc1 add foreign key ("........pg.dropped.1........") references atacc2(id); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1(a); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" 
referenced in foreign key constraint does not exist +drop table atacc2; +create index "testing_idx" on atacc1(a); +ERROR: column "a" does not exist +create index "testing_idx" on atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" does not exist +-- test create as and select into +insert into atacc1 values (21, 22, 23); +create table attest1 as select * from atacc1; +select * from attest1; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest1; +select * into attest2 from atacc1; +select * from attest2; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest2; +-- try dropping all columns +alter table atacc1 drop c; +alter table atacc1 drop d; +alter table atacc1 drop b; +select * from atacc1; +-- +(1 row) + +drop table atacc1; +-- test constraint error reporting in presence of dropped columns +create table atacc1 (id serial primary key, value int check (value < 10)); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (1, 100). +alter table atacc1 drop column value; +alter table atacc1 add column value int check (value < 10); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (2, 100). +insert into atacc1(id, value) values (null, 0); +ERROR: null value in column "id" violates not-null constraint +DETAIL: Failing row contains (null, 0). +drop table atacc1; +-- test inheritance +create table parent (a int, b int, c int); +insert into parent values (1, 2, 3); +alter table parent drop a; +create table child (d varchar(255)) inherits (parent); +insert into child values (12, 13, 'testing'); +select * from parent; + b | c +----+---- + 2 | 3 + 12 | 13 +(2 rows) + +select * from child; + b | c | d +----+----+--------- + 12 | 13 | testing +(1 row) + +alter table parent drop c; +select * from parent; + b +---- + 2 + 12 +(2 rows) + +select * from child; + b | d +----+--------- + 12 | testing +(1 row) + +drop table child; +drop table parent; +-- check error cases for inheritance column merging +create table parent (a float8, b numeric(10,4), c text collate "C"); +create table child (a float4) inherits (parent); -- fail +NOTICE: merging column "a" with inherited definition +ERROR: column "a" has a type conflict +DETAIL: double precision versus real +create table child (b decimal(10,7)) inherits (parent); -- fail +NOTICE: moving and merging column "b" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. +ERROR: column "b" has a type conflict +DETAIL: numeric(10,4) versus numeric(10,7) +create table child (c text collate "POSIX") inherits (parent); -- fail +NOTICE: moving and merging column "c" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. 
+ERROR: column "c" has a collation conflict +DETAIL: "C" versus "POSIX" +create table child (a double precision, b decimal(10,4)) inherits (parent); +NOTICE: merging column "a" with inherited definition +NOTICE: merging column "b" with inherited definition +drop table child; +drop table parent; +-- test copy in/out +create table attest (a int4, b int4, c int4); +insert into attest values (1,2,3); +alter table attest drop a; +copy attest to stdout; +2 3 +copy attest(a) to stdout; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") to stdout; +ERROR: column "........pg.dropped.1........" of relation "attest" does not exist +copy attest from stdin; +ERROR: extra data after last expected column +CONTEXT: COPY attest, line 1: "10 11 12" +select * from attest; + b | c +---+--- + 2 | 3 +(1 row) + +copy attest from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 +(2 rows) + +copy attest(a) from stdin; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") from stdin; +ERROR: column "........pg.dropped.1........" of relation "attest" does not exist +copy attest(b,c) from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 + 31 | 32 +(3 rows) + +drop table attest; +-- test inheritance +create table dropColumn (a int, b int, e int); +create table dropColumnChild (c int) inherits (dropColumn); +create table dropColumnAnother (d int) inherits (dropColumnChild); +-- these two should fail +alter table dropColumnchild drop column a; +ERROR: cannot drop inherited column "a" +alter table only dropColumnChild drop column b; +ERROR: cannot drop inherited column "b" +-- these three should work +alter table only dropColumn drop column e; +alter table dropColumnChild drop column c; +alter table dropColumn drop column a; +create table renameColumn (a int); +create table renameColumnChild (b int) inherits (renameColumn); +create table renameColumnAnother (c int) inherits (renameColumnChild); +-- these three should fail +alter table renameColumnChild rename column a to d; +ERROR: cannot rename inherited column "a" +alter table only renameColumnChild rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +alter table only renameColumn rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +-- these should work +alter table renameColumn rename column a to d; +alter table renameColumnChild rename column b to a; +-- these should work +alter table if exists doesnt_exist_tab rename column a to d; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +alter table if exists doesnt_exist_tab rename column b to a; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +-- this should work +alter table renameColumn add column w int; +-- this should fail +alter table only renameColumn add column x int; +ERROR: column must be added to child tables too +-- Test corner cases in dropping of inherited columns +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +-- should work +alter table p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +select f1 from c1; + f1 +---- +(0 rows) + +alter table c1 drop column f1; +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: 
Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table p1 drop column f1; +-- c1.f1 is dropped now, since there is no local definition for it +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is NOT dropped, but must now be considered non-inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1(id int, name text); +create table p2(id2 int, name text, height int); +create table c1(age int) inherits(p1,p2); +NOTICE: merging multiple inherited definitions of column "name" +create table gc1() inherits (c1); +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | name | 2 | f + c1 | id2 | 1 | f + c1 | height | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | name | 1 | f + gc1 | id2 | 1 | f + gc1 | height | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p1 | name | 0 | t + p2 | id2 | 0 | t + p2 | name | 0 | t + p2 | height | 0 | t +(15 rows) + +-- should work +alter table only p1 drop column name; +-- should work. Now c1.name is local and inhcount is 0. 
+alter table p2 drop column name; +-- should be rejected since its inherited +alter table gc1 drop column name; +ERROR: cannot drop inherited column "name" +-- should work, and drop gc1.name along +alter table c1 drop column name; +-- should fail: column does not exist +alter table gc1 drop column name; +ERROR: column "name" of relation "gc1" does not exist +-- should work and drop the attribute in all tables +alter table p2 drop column height; +-- IF EXISTS test +create table dropColumnExists (); +alter table dropColumnExists drop column non_existing; --fail +ERROR: column "non_existing" of relation "dropcolumnexists" does not exist +alter table dropColumnExists drop column if exists non_existing; --succeed +NOTICE: column "non_existing" of relation "dropcolumnexists" does not exist, skipping +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | id2 | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | id2 | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p2 | id2 | 0 | t +(8 rows) + +drop table p1, p2 cascade; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table c1 +drop cascades to table gc1 +-- test attinhcount tracking with merged columns +create table depth0(); +create table depth1(c text) inherits (depth0); +create table depth2() inherits (depth1); +alter table depth0 add c text; +NOTICE: merging definition of column "c" for child "depth1" +select attrelid::regclass, attname, attinhcount, attislocal +from pg_attribute +where attnum > 0 and attrelid::regclass in ('depth0', 'depth1', 'depth2') +order by attrelid::regclass::text, attnum; + attrelid | attname | attinhcount | attislocal +----------+---------+-------------+------------ + depth0 | c | 0 | t + depth1 | c | 1 | t + depth2 | c | 1 | f +(3 rows) + +-- test renumbering of child-table columns in inherited operations +create table p1 (f1 int); +create table c1 (f2 text, f3 int) inherits (p1); +alter table p1 add column a1 int check (a1 > 0); +alter table p1 add column f2 text; +NOTICE: merging definition of column "f2" for child "c1" +insert into p1 values (1,2,'abc'); +insert into c1 values(11,'xyz',33,0); -- should fail +ERROR: new row for relation "c1" violates check constraint "p1_a1_check" +DETAIL: Failing row contains (11, xyz, 33, 0). 
+insert into c1 values(11,'xyz',33,22); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 2 | abc + 11 | 22 | xyz +(2 rows) + +update p1 set a1 = a1 + 1, f2 = upper(f2); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 3 | ABC + 11 | 23 | XYZ +(2 rows) + +drop table p1 cascade; +NOTICE: drop cascades to table c1 +-- test that operations with a dropped column do not try to reference +-- its datatype +create domain mytype as text; +create temp table foo (f1 text, f2 mytype, f3 text); +insert into foo values('bb','cc','dd'); +select * from foo; + f1 | f2 | f3 +----+----+---- + bb | cc | dd +(1 row) + +drop domain mytype cascade; +NOTICE: drop cascades to column f2 of table foo +select * from foo; + f1 | f3 +----+---- + bb | dd +(1 row) + +insert into foo values('qq','rr'); +select * from foo; + f1 | f3 +----+---- + bb | dd + qq | rr +(2 rows) + +update foo set f3 = 'zz'; +select * from foo; + f1 | f3 +----+---- + bb | zz + qq | zz +(2 rows) + +select f3,max(f1) from foo group by f3; + f3 | max +----+----- + zz | qq +(1 row) + +-- Simple tests for alter table column type +alter table foo alter f1 TYPE integer; -- fails +ERROR: column "f1" cannot be cast automatically to type integer +HINT: You might need to specify "USING f1::integer". +alter table foo alter f1 TYPE varchar(10); +create table anothertab (atcol1 serial8, atcol2 boolean, + constraint anothertab_chk check (atcol1 <= 3)); +insert into anothertab (atcol1, atcol2) values (default, true); +insert into anothertab (atcol1, atcol2) values (default, false); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +alter table anothertab alter column atcol1 type boolean; -- fails +ERROR: column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to specify "USING atcol1::boolean". +alter table anothertab alter column atcol1 type boolean using atcol1::int; -- fails +ERROR: result of USING clause for column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to add an explicit cast. +alter table anothertab alter column atcol1 type integer; +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +insert into anothertab (atcol1, atcol2) values (45, null); -- fails +ERROR: new row for relation "anothertab" violates check constraint "anothertab_chk" +DETAIL: Failing row contains (45, null). +insert into anothertab (atcol1, atcol2) values (default, null); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f + 3 | +(3 rows) + +alter table anothertab alter column atcol2 type text + using case when atcol2 is true then 'IT WAS TRUE' + when atcol2 is false then 'IT WAS FALSE' + else 'IT WAS NULL!' end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + 1 | IT WAS TRUE + 2 | IT WAS FALSE + 3 | IT WAS NULL! +(3 rows) + +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: default for column "atcol1" cannot be cast automatically to type boolean +alter table anothertab alter column atcol1 drop default; +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: operator does not exist: boolean <= integer +HINT: No operator matches the given name and argument types. You might need to add explicit type casts. 
+alter table anothertab drop constraint anothertab_chk; +alter table anothertab drop constraint anothertab_chk; -- fails +ERROR: constraint "anothertab_chk" of relation "anothertab" does not exist +alter table anothertab drop constraint IF EXISTS anothertab_chk; -- succeeds +NOTICE: constraint "anothertab_chk" of relation "anothertab" does not exist, skipping +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + f | IT WAS TRUE + t | IT WAS FALSE + f | IT WAS NULL! +(3 rows) + +drop table anothertab; +create table another (f1 int, f2 text); +insert into another values(1, 'one'); +insert into another values(2, 'two'); +insert into another values(3, 'three'); +select * from another; + f1 | f2 +----+------- + 1 | one + 2 | two + 3 | three +(3 rows) + +alter table another + alter f1 type text using f2 || ' more', + alter f2 type bigint using f1 * 10; +select * from another; + f1 | f2 +------------+---- + one more | 10 + two more | 20 + three more | 30 +(3 rows) + +drop table another; +-- table's row type +create table tab1 (a int, b text); +create table tab2 (x int, y tab1); +alter table tab1 alter column b type varchar; -- fails +ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | + +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | numeric | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | numeric | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +drop table at_partitioned; +-- Alter column type when no table rewrite is required +-- Also check that comments are preserved +create table at_partitioned(id int, name 
varchar(64), unique (id, name)) + partition by hash(id); +comment on constraint at_partitioned_id_name_key on at_partitioned is 'parent constraint'; +comment on index at_partitioned_id_name_key is 'parent index'; +create table at_partitioned_0 partition of at_partitioned + for values with (modulus 2, remainder 0); +comment on constraint at_partitioned_0_id_name_key on at_partitioned_0 is 'child 0 constraint'; +comment on index at_partitioned_0_id_name_key is 'child 0 index'; +create table at_partitioned_1 partition of at_partitioned + for values with (modulus 2, remainder 1); +comment on constraint at_partitioned_1_id_name_key on at_partitioned_1 is 'child 1 constraint'; +comment on index at_partitioned_1_id_name_key is 'child 1 index'; +insert into at_partitioned values(1, 'foo'); +insert into at_partitioned values(3, 'bar'); +create temp table old_oids as + select relname, oid as oldoid, relfilenode as oldfilenode + from pg_class where relname like 'at_partitioned%'; +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+--------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | t | own | child 0 index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | t | own | child 1 index + at_partitioned_id_name_key | t | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+-------------------- + at_partitioned_0_id_name_key | child 0 constraint + at_partitioned_1_id_name_key | child 1 constraint + at_partitioned_id_name_key | parent constraint +(3 rows) + +alter table at_partitioned alter column name type varchar(127); +-- Note: these tests currently show the wrong behavior for comments :-( +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+-------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | f | own | parent index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | f | own | parent index + at_partitioned_id_name_key | f | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+------------------- + at_partitioned_0_id_name_key | + at_partitioned_1_id_name_key | + at_partitioned_id_name_key | parent constraint +(3 rows) + +-- Don't remove this DROP, it exposes bug #15672 +drop table at_partitioned; +-- disallow recursive containment of row types +create temp table recur1 (f1 int); +alter table recur1 add column f2 recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 
add column f2 recur1[]; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create domain array_of_recur1 as recur1[]; +alter table recur1 add column f2 array_of_recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create temp table recur2 (f1 int, f2 recur1); +alter table recur1 add column f2 recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 int; +alter table recur1 alter column f2 type recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +-- SET STORAGE may need to add a TOAST table +create table test_storage (a text); +alter table test_storage alter a set storage plain; +alter table test_storage add b int default 0; -- rewrite table to remove its TOAST table +alter table test_storage alter a set storage extended; -- re-add TOAST table +select reltoastrelid <> 0 as has_toast_table +from pg_class +where oid = 'test_storage'::regclass; + has_toast_table +----------------- + f +(1 row) + +-- ALTER COLUMN TYPE with a check constraint and a child table (bug #13779) +CREATE TABLE test_inh_check (a float check (a > 10.2), b float); +CREATE TABLE test_inh_check_child() INHERITS(test_inh_check); +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +ALTER TABLE test_inh_check ALTER COLUMN a TYPE numeric; +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +-- also try noinherit, local, and local+inherited cases +ALTER TABLE test_inh_check ADD CONSTRAINT bnoinherit CHECK (b > 100) NO INHERIT; +ALTER TABLE test_inh_check_child ADD CONSTRAINT blocal CHECK (b < 1000); +ALTER TABLE test_inh_check_child ADD CONSTRAINT bmerged CHECK (b > 1); +ALTER TABLE test_inh_check ADD CONSTRAINT bmerged CHECK (b > 1); +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "bmerged" CHECK (b > 1::double precision) + "bnoinherit" CHECK (b > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "blocal" CHECK (b < 1000::double precision) + "bmerged" CHECK (b > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +ALTER TABLE test_inh_check ALTER COLUMN b TYPE numeric; +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "bmerged" CHECK (b::double precision > 1::double precision) + "bnoinherit" CHECK (b::double precision > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "blocal" CHECK (b::double precision < 1000::double precision) + "bmerged" CHECK (b::double precision > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +-- ALTER COLUMN TYPE with different schema in children +-- Bug at https://postgr.es/m/20170102225618.GA10071@telsasoft.com +CREATE TABLE test_type_diff (f1 int); +CREATE TABLE test_type_diff_c (extra smallint) INHERITS (test_type_diff); +ALTER TABLE test_type_diff ADD COLUMN f2 int; +INSERT INTO test_type_diff_c VALUES (1, 2, 3); +ALTER TABLE test_type_diff ALTER COLUMN f2 TYPE bigint USING f2::bigint; +CREATE TABLE test_type_diff2 (int_two int2, int_four int4, int_eight int8); +CREATE TABLE test_type_diff2_c1 (int_four int4, int_eight int8, int_two int2); +CREATE TABLE test_type_diff2_c2 (int_eight int8, int_two int2, int_four int4); +CREATE TABLE test_type_diff2_c3 (int_two int2, int_four int4, int_eight int8); +ALTER TABLE test_type_diff2_c1 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c2 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c3 INHERIT test_type_diff2; +INSERT INTO test_type_diff2_c1 VALUES (1, 2, 3); +INSERT INTO test_type_diff2_c2 VALUES (4, 5, 6); +INSERT INTO test_type_diff2_c3 VALUES (7, 8, 9); +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int8 USING int_four::int8; +-- whole-row references are disallowed +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int4 USING (pg_column_size(test_type_diff2)); +ERROR: cannot convert whole-row table reference +DETAIL: USING expression contains a whole-row table reference. 
+-- check for rollback of ANALYZE corrupting table property flags (bug #11638) +CREATE TABLE check_fk_presence_1 (id int PRIMARY KEY, t text); +CREATE TABLE check_fk_presence_2 (id int REFERENCES check_fk_presence_1, t text); +BEGIN; +ALTER TABLE check_fk_presence_2 DROP CONSTRAINT check_fk_presence_2_id_fkey; +ANALYZE check_fk_presence_2; +ROLLBACK; +\d check_fk_presence_2 + Table "public.check_fk_presence_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + t | text | | | +Foreign-key constraints: + "check_fk_presence_2_id_fkey" FOREIGN KEY (id) REFERENCES check_fk_presence_1(id) + +DROP TABLE check_fk_presence_1, check_fk_presence_2; +-- check column addition within a view (bug #14876) +create table at_base_table(id int, stuff text); +insert into at_base_table values (23, 'skidoo'); +create view at_view_1 as select * from at_base_table bt; +create view at_view_2 as select *, to_json(v1) as j from at_view_1 v1; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | +View definition: + SELECT bt.id, + bt.stuff + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo"} +(1 row) + +create or replace view at_view_1 as select *, 2+2 as more from at_base_table bt; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + more | integer | | | | plain | +View definition: + SELECT bt.id, + bt.stuff, + 2 + 2 AS more + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, NULL)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo","more":null} +(1 row) + +drop view at_view_2; +drop view at_view_1; +drop table at_base_table; +-- +-- lock levels +-- +drop type lockmodes; +ERROR: type "lockmodes" does not exist +create type lockmodes as enum ( + 'SIReadLock' 
+,'AccessShareLock' +,'RowShareLock' +,'RowExclusiveLock' +,'ShareUpdateExclusiveLock' +,'ShareLock' +,'ShareRowExclusiveLock' +,'ExclusiveLock' +,'AccessExclusiveLock' +); +drop view my_locks; +ERROR: view "my_locks" does not exist +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = txid_current()::integer) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname != 'my_locks' +group by c.relname; +create table alterlock (f1 int primary key, f2 text); +insert into alterlock values (1, 'foo'); +create table alterlock2 (f3 int primary key, f1 int); +insert into alterlock2 values (1, 1); +begin; alter table alterlock alter column f2 set statistics 150; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock cluster on alterlock_pkey; +select * from my_locks order by 1; + relname | max_lockmode +----------------+-------------------------- + alterlock | ShareUpdateExclusiveLock + alterlock_pkey | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set without cluster; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (fillfactor = 100); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock reset (fillfactor); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (toast.autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock alter column f2 set (n_distinct = 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +-- test that mixing options with different lock levels works as expected +begin; alter table alterlock set (autovacuum_enabled = off, fillfactor = 80); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock alter column f2 set storage extended; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock alter column f2 set default 'x'; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; +create trigger ttdummy + before delete or update on alterlock + for each 
row + execute procedure + ttdummy (1, 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+----------------------- + alterlock | ShareRowExclusiveLock +(1 row) + +rollback; +begin; +select * from my_locks order by 1; + relname | max_lockmode +---------+-------------- +(0 rows) + +alter table alterlock2 add foreign key (f1) references alterlock (f1); +select * from my_locks order by 1; + relname | max_lockmode +-----------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +begin; +alter table alterlock2 +add constraint alterlock2nv foreign key (f1) references alterlock (f1) NOT VALID; +select * from my_locks order by 1; + relname | max_lockmode +------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock +(2 rows) + +commit; +begin; +alter table alterlock2 validate constraint alterlock2nv; +select * from my_locks order by 1; + relname | max_lockmode +-----------------+-------------------------- + alterlock | RowShareLock + alterlock2 | ShareUpdateExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = txid_current()::integer) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname = 'my_locks' +group by c.relname; +-- raise exception +alter table my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter view my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter table my_locks reset (autovacuum_enabled); +alter view my_locks reset (autovacuum_enabled); +begin; +alter view my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter view my_locks reset (security_barrier); +rollback; +-- this test intentionally applies the ALTER TABLE command against a view, but +-- uses a view option so we expect this to succeed. 
This form of SQL is +-- accepted for historical reasons, as shown in the docs for ALTER VIEW +begin; +alter table my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter table my_locks reset (security_barrier); +rollback; +-- cleanup +drop table alterlock2; +drop table alterlock; +drop view my_locks; +drop type lockmodes; +-- +-- alter function +-- +create function test_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql returns null on null input; +select test_strict(NULL); + test_strict +------------- + +(1 row) + +alter function test_strict(text) called on null input; +select test_strict(NULL); + test_strict +------------------- + got passed a null +(1 row) + +create function non_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql called on null input; +select non_strict(NULL); + non_strict +------------------- + got passed a null +(1 row) + +alter function non_strict(text) returns null on null input; +select non_strict(NULL); + non_strict +------------ + +(1 row) + +-- +-- alter object set schema +-- +create schema alter1; +create schema alter2; +create table alter1.t1(f1 serial primary key, f2 int check (f2 > 0)); +create view alter1.v1 as select * from alter1.t1; +create function alter1.plus1(int) returns int as 'select $1+1' language sql; +create domain alter1.posint integer check (value > 0); +create type alter1.ctype as (f1 int, f2 text); +create function alter1.same(alter1.ctype, alter1.ctype) returns boolean language sql +as 'select $1.f1 is not distinct from $2.f1 and $1.f2 is not distinct from $2.f2'; +create operator alter1.=(procedure = alter1.same, leftarg = alter1.ctype, rightarg = alter1.ctype); +create operator class alter1.ctype_hash_ops default for type alter1.ctype using hash as + operator 1 alter1.=(alter1.ctype, alter1.ctype); +create conversion alter1.ascii_to_utf8 for 'sql_ascii' to 'utf8' from ascii_to_utf8; +create text search parser alter1.prs(start = prsd_start, gettoken = prsd_nexttoken, end = prsd_end, lextypes = prsd_lextype); +create text search configuration alter1.cfg(parser = alter1.prs); +create text search template alter1.tmpl(init = dsimple_init, lexize = dsimple_lexize); +create text search dictionary alter1.dict(template = alter1.tmpl); +insert into alter1.t1(f2) values(11); +insert into alter1.t1(f2) values(12); +alter table alter1.t1 set schema alter1; -- no-op, same schema +alter table alter1.t1 set schema alter2; +alter table alter1.v1 set schema alter2; +alter function alter1.plus1(int) set schema alter2; +alter domain alter1.posint set schema alter2; +alter operator class alter1.ctype_hash_ops using hash set schema alter2; +alter operator family alter1.ctype_hash_ops using hash set schema alter2; +alter operator alter1.=(alter1.ctype, alter1.ctype) set schema alter2; +alter function alter1.same(alter1.ctype, alter1.ctype) set schema alter2; +alter type alter1.ctype set schema alter1; -- no-op, same schema +alter type alter1.ctype set schema alter2; +alter conversion alter1.ascii_to_utf8 set schema alter2; +alter text search parser alter1.prs set schema alter2; +alter text search configuration alter1.cfg set schema alter2; +alter text search template alter1.tmpl set schema alter2; +alter text search dictionary alter1.dict set schema alter2; +-- this should succeed because nothing is left in alter1 +drop schema alter1; +insert into 
alter2.t1(f2) values(13); +insert into alter2.t1(f2) values(14); +select * from alter2.t1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select * from alter2.v1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select alter2.plus1(41); + plus1 +------- + 42 +(1 row) + +-- clean up +drop schema alter2 cascade; +NOTICE: drop cascades to 13 other objects +DETAIL: drop cascades to table alter2.t1 +drop cascades to view alter2.v1 +drop cascades to function alter2.plus1(integer) +drop cascades to type alter2.posint +drop cascades to type alter2.ctype +drop cascades to function alter2.same(alter2.ctype,alter2.ctype) +drop cascades to operator alter2.=(alter2.ctype,alter2.ctype) +drop cascades to operator family alter2.ctype_hash_ops for access method hash +drop cascades to conversion alter2.ascii_to_utf8 +drop cascades to text search parser alter2.prs +drop cascades to text search configuration alter2.cfg +drop cascades to text search template alter2.tmpl +drop cascades to text search dictionary alter2.dict +-- +-- composite types +-- +CREATE TYPE test_type AS (a int); +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE nosuchtype ADD ATTRIBUTE b text; -- fails +ERROR: relation "nosuchtype" does not exist +ALTER TYPE test_type ADD ATTRIBUTE b text; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +ALTER TYPE test_type ADD ATTRIBUTE b text; -- fails +ERROR: column "b" of relation "test_type" already exists +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE varchar; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE integer; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE b; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE c; -- fails +ERROR: column "c" of relation "test_type" does not exist +ALTER TYPE test_type DROP ATTRIBUTE IF EXISTS c; +NOTICE: column "c" of relation "test_type" does not exist, skipping +ALTER TYPE test_type DROP ATTRIBUTE a, ADD ATTRIBUTE d boolean; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + d | boolean | | | + +ALTER TYPE test_type RENAME ATTRIBUTE a TO aa; +ERROR: column "a" does not exist +ALTER TYPE test_type RENAME ATTRIBUTE d TO dd; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + dd | boolean | | | + +DROP TYPE test_type; +CREATE TYPE test_type1 AS (a int, b text); +CREATE TABLE test_tbl1 (x int, y test_type1); +ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type1" because column "test_tbl1.y" uses it +CREATE TYPE test_type2 AS (a int, b 
text); +CREATE TABLE test_tbl2 OF test_type2; +CREATE TABLE test_tbl2_subclass () INHERITS (test_tbl2); +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ADD ATTRIBUTE c text; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ADD ATTRIBUTE c text CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 DROP ATTRIBUTE b; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 DROP ATTRIBUTE b CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) 
+Typed table of type: test_type2 + +\d test_tbl2_subclass + Table "public.test_tbl2_subclass" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Inherits: test_tbl2 + +DROP TABLE test_tbl2_subclass; +CREATE TYPE test_typex AS (a int, b text); +CREATE TABLE test_tblx (x int, y test_typex check ((y).a > 0)); +ALTER TYPE test_typex DROP ATTRIBUTE a; -- fails +ERROR: cannot drop column a of composite type test_typex because other objects depend on it +DETAIL: constraint test_tblx_y_check on table test_tblx depends on column a of composite type test_typex +HINT: Use DROP ... CASCADE to drop the dependent objects too. +ALTER TYPE test_typex DROP ATTRIBUTE a CASCADE; +NOTICE: drop cascades to constraint test_tblx_y_check on table test_tblx +\d test_tblx + Table "public.test_tblx" + Column | Type | Collation | Nullable | Default +--------+------------+-----------+----------+--------- + x | integer | | | + y | test_typex | | | + +DROP TABLE test_tblx; +DROP TYPE test_typex; +-- This test isn't that interesting on its own, but the purpose is to leave +-- behind a table to test pg_upgrade with. The table has a composite type +-- column in it, and the composite type has a dropped attribute. +CREATE TYPE test_type3 AS (a int); +CREATE TABLE test_tbl3 (c) AS SELECT '(1)'::test_type3; +ALTER TYPE test_type3 DROP ATTRIBUTE a, ADD ATTRIBUTE b int; +CREATE TYPE test_type_empty AS (); +DROP TYPE test_type_empty; +-- +-- typed tables: OF / NOT OF +-- +CREATE TYPE tt_t0 AS (z inet, x int, y numeric(8,2)); +ALTER TYPE tt_t0 DROP ATTRIBUTE z; +CREATE TABLE tt0 (x int NOT NULL, y numeric(8,2)); -- OK +CREATE TABLE tt1 (x int, y bigint); -- wrong base type +CREATE TABLE tt2 (x int, y numeric(9,2)); -- wrong typmod +CREATE TABLE tt3 (y numeric(8,2), x int); -- wrong column order +CREATE TABLE tt4 (x int); -- too few columns +CREATE TABLE tt5 (x int, y numeric(8,2), z int); -- too few columns +CREATE TABLE tt6 () INHERITS (tt0); -- can't have a parent +CREATE TABLE tt7 (x int, q text, y numeric(8,2)); +ALTER TABLE tt7 DROP q; -- OK +ALTER TABLE tt0 OF tt_t0; +ALTER TABLE tt1 OF tt_t0; +ERROR: table "tt1" has different type for column "y" +ALTER TABLE tt2 OF tt_t0; +ERROR: table "tt2" has different type for column "y" +ALTER TABLE tt3 OF tt_t0; +ERROR: table has column "y" where type requires "x" +ALTER TABLE tt4 OF tt_t0; +ERROR: table is missing column "y" +ALTER TABLE tt5 OF tt_t0; +ERROR: table has extra column "z" +ALTER TABLE tt6 OF tt_t0; +ERROR: typed tables cannot inherit +ALTER TABLE tt7 OF tt_t0; +CREATE TYPE tt_t1 AS (x int, y numeric(8,2)); +ALTER TABLE tt7 OF tt_t1; -- reassign an already-typed table +ALTER TABLE tt7 NOT OF; +\d tt7 + Table "public.tt7" + Column | Type | Collation | Nullable | Default +--------+--------------+-----------+----------+--------- + x | integer | | | + y | numeric(8,2) | | | + +-- make sure we can drop a constraint on the parent but it remains on the child +CREATE TABLE test_drop_constr_parent (c text CHECK (c IS NOT NULL)); +CREATE TABLE test_drop_constr_child () INHERITS (test_drop_constr_parent); +ALTER TABLE ONLY test_drop_constr_parent DROP CONSTRAINT "test_drop_constr_parent_c_check"; +-- should fail +INSERT INTO test_drop_constr_child (c) VALUES (NULL); +ERROR: new row for relation "test_drop_constr_child" violates check constraint "test_drop_constr_parent_c_check" +DETAIL: Failing row contains (null). 
+DROP TABLE test_drop_constr_parent CASCADE; +NOTICE: drop cascades to table test_drop_constr_child +-- +-- IF EXISTS test +-- +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +NOTICE: relation "tt8" does not exist, skipping +CREATE TABLE tt8(a int); +CREATE SCHEMA alter2; +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +\d alter2.tt8 + Table "alter2.tt8" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + f1 | integer | | not null | 0 +Indexes: + "xxx" PRIMARY KEY, btree (f1) +Check constraints: + "tt8_f_check" CHECK (f1 >= 0 AND f1 <= 10) + +DROP TABLE alter2.tt8; +DROP SCHEMA alter2; +-- +-- Check conflicts between index and CHECK constraint names +-- +CREATE TABLE tt9(c integer); +ALTER TABLE tt9 ADD CHECK(c > 1); +ALTER TABLE tt9 ADD CHECK(c > 2); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 3); +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 4); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD UNIQUE(c); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key UNIQUE(c); -- fail, dup name +ERROR: relation "tt9_c_key" already exists +ALTER TABLE tt9 ADD CONSTRAINT foo UNIQUE(c); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key CHECK(c > 5); -- fail, dup name +ERROR: constraint "tt9_c_key" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key2 CHECK(c > 6); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +\d tt9 + Table "public.tt9" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c | integer | | | +Indexes: + "tt9_c_key" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key1" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key3" UNIQUE CONSTRAINT, btree (c) +Check constraints: + "foo" CHECK (c > 3) + "tt9_c_check" CHECK (c > 1) + "tt9_c_check1" CHECK (c > 2) + "tt9_c_key2" CHECK (c > 6) + +DROP TABLE tt9; +-- Check that comments on constraints and indexes are not lost at ALTER TABLE. 
+CREATE TABLE comment_test ( + id int, + positive_col int CHECK (positive_col > 0), + indexed_col int, + CONSTRAINT comment_test_pk PRIMARY KEY (id)); +CREATE INDEX comment_test_index ON comment_test(indexed_col); +COMMENT ON COLUMN comment_test.id IS 'Column ''id'' on comment_test'; +COMMENT ON INDEX comment_test_index IS 'Simple index on comment_test'; +COMMENT ON CONSTRAINT comment_test_positive_col_check ON comment_test IS 'CHECK constraint on comment_test.positive_col'; +COMMENT ON CONSTRAINT comment_test_pk ON comment_test IS 'PRIMARY KEY constraint of comment_test'; +COMMENT ON INDEX comment_test_pk IS 'Index backing the PRIMARY KEY of comment_test'; +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Change the datatype of all the columns. ALTER TABLE is optimized to not +-- rebuild an index if the new data type is binary compatible with the old +-- one. Check do a dummy ALTER TABLE that doesn't change the datatype +-- first, to test that no-op codepath, and another one that does. +ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE bigint; +-- Check that the comments are intact. +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Check compatibility for foreign keys and comments. This is done +-- separately as rebuilding the column type of the parent leads +-- to an error and would reduce the test scope. 
+CREATE TABLE comment_test_child ( + id text CONSTRAINT comment_test_child_fk REFERENCES comment_test); +CREATE INDEX comment_test_child_fk ON comment_test_child(id); +COMMENT ON COLUMN comment_test_child.id IS 'Column ''id'' on comment_test_child'; +COMMENT ON INDEX comment_test_child_fk IS 'Index backing the FOREIGN KEY of comment_test_child'; +COMMENT ON CONSTRAINT comment_test_child_fk ON comment_test_child IS 'FOREIGN KEY constraint of comment_test_child'; +-- Change column type of parent +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int USING id::integer; +ERROR: foreign key constraint "comment_test_child_fk" cannot be implemented +DETAIL: Key columns "id" and "id" are of incompatible types: text and integer. +-- Comments should be intact +SELECT col_description('comment_test_child'::regclass, 1) as comment; + comment +----------------------------------- + Column 'id' on comment_test_child +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + index | comment +-----------------------+----------------------------------------------------- + comment_test_child_fk | Index backing the FOREIGN KEY of comment_test_child +(1 row) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + constraint | comment +-----------------------+---------------------------------------------- + comment_test_child_fk | FOREIGN KEY constraint of comment_test_child +(1 row) + +-- Check that we map relation oids to filenodes and back correctly. Only +-- display bad mappings so the test output doesn't change all the time. A +-- filenode function call can return NULL for a relation dropped concurrently +-- with the call's surrounding query, so ignore a NULL mapped_oid for +-- relations that no longer exist after all calls finish. +CREATE TEMP TABLE filenode_mapping AS +SELECT + oid, mapped_oid, reltablespace, relfilenode, relname +FROM pg_class, + pg_filenode_relation(reltablespace, pg_relation_filenode(oid)) AS mapped_oid +WHERE relkind IN ('r', 'i', 'S', 't', 'm') AND mapped_oid IS DISTINCT FROM oid; +SELECT m.* FROM filenode_mapping m LEFT JOIN pg_class c ON c.oid = m.oid +WHERE c.oid IS NOT NULL OR m.mapped_oid IS NOT NULL; + oid | mapped_oid | reltablespace | relfilenode | relname +-----+------------+---------------+-------------+--------- +(0 rows) + +-- Checks on creating and manipulation of user defined relations in +-- pg_catalog. +-- +-- XXX: It would be useful to add checks around trying to manipulate +-- catalog tables, but that might have ugly consequences when run +-- against an existing server with allow_system_table_mods = on. +SHOW allow_system_table_mods; + allow_system_table_mods +------------------------- + off +(1 row) + +-- disallowed because of search_path issues with pg_dump +CREATE TABLE pg_catalog.new_system_table(); +ERROR: permission denied to create "pg_catalog.new_system_table" +DETAIL: System catalog modifications are currently disallowed. 
+-- instead create in public first, move to catalog +CREATE TABLE new_system_table(id serial primary key, othercol text); +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table SET SCHEMA public; +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +-- will be ignored -- already there: +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table RENAME TO old_system_table; +CREATE INDEX old_system_table__othercol ON old_system_table (othercol); +INSERT INTO old_system_table(othercol) VALUES ('somedata'), ('otherdata'); +UPDATE old_system_table SET id = -id; +DELETE FROM old_system_table WHERE othercol = 'somedata'; +TRUNCATE old_system_table; +ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey; +ALTER TABLE old_system_table DROP COLUMN othercol; +DROP TABLE old_system_table; +-- set logged +CREATE UNLOGGED TABLE unlogged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of an unlogged table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + unlogged1 | r | u + unlogged1_f1_seq | S | p + unlogged1_pkey | i | u +(3 rows) + +CREATE UNLOGGED TABLE unlogged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged1); -- foreign key +CREATE UNLOGGED TABLE unlogged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged3); -- self-referencing foreign key +ALTER TABLE unlogged3 SET LOGGED; -- skip self-referencing foreign key +ALTER TABLE unlogged2 SET LOGGED; -- fails because a foreign key to an unlogged table exists +ERROR: could not change table "unlogged2" to logged because it references unlogged table "unlogged1" +ALTER TABLE unlogged1 SET LOGGED; +-- check relpersistence of an unlogged table after changing to permanent +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + unlogged1 | r | p + unlogged1_f1_seq | S | p + unlogged1_pkey | i | p +(3 rows) + +ALTER TABLE unlogged1 SET LOGGED; -- silently do nothing +DROP TABLE unlogged3; +DROP TABLE unlogged2; +DROP TABLE unlogged1; +-- set unlogged +CREATE TABLE logged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of a permanent table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON 
ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | p + logged1_f1_seq | S | p + logged1_pkey | i | p +(3 rows) + +CREATE TABLE logged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged1); -- foreign key +CREATE TABLE logged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged3); -- self-referencing foreign key +ALTER TABLE logged1 SET UNLOGGED; -- fails because a foreign key from a permanent table exists +ERROR: could not change table "logged1" to unlogged because it references logged table "logged2" +ALTER TABLE logged3 SET UNLOGGED; -- skip self-referencing foreign key +ALTER TABLE logged2 SET UNLOGGED; +ALTER TABLE logged1 SET UNLOGGED; +-- check relpersistence of a permanent table after changing to unlogged +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | u + logged1_f1_seq | S | p + logged1_pkey | i | u +(3 rows) + +ALTER TABLE logged1 SET UNLOGGED; -- silently do nothing +DROP TABLE logged3; +DROP TABLE logged2; +DROP TABLE logged1; +-- test ADD COLUMN IF NOT EXISTS +CREATE TABLE test_add_column(c1 integer); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +ALTER TABLE ONLY test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +ALTER TABLE ONLY test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer, -- fail because c2 already exists + ADD COLUMN c3 integer; +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | 
integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN c3 integer; -- fail because c3 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer; -- skipping because c3 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer, -- skipping because c3 already exists + ADD COLUMN c4 integer; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | | + c4 | integer | | | + +DROP TABLE test_add_column; +-- unsupported constraint types for partitioned tables +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, (a+b+1)); +ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); + ^ +-- cannot drop column that is part of the partition key +ALTER TABLE partitioned DROP COLUMN a; +ERROR: cannot drop column named in partition key +ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); +ERROR: cannot alter type of column named in partition key +ALTER TABLE partitioned DROP COLUMN b; +ERROR: cannot drop column referenced in partition key expression +ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); +ERROR: cannot alter type of column referenced in partition key expression +-- partitioned table cannot participate in regular inheritance +CREATE TABLE nonpartitioned ( + a int, + b int +); +ALTER TABLE partitioned INHERIT nonpartitioned; +ERROR: cannot change inheritance of partitioned table +ALTER TABLE nonpartitioned INHERIT partitioned; +ERROR: cannot inherit from partitioned table "partitioned" +-- cannot add NO INHERIT constraint to partitioned tables +ALTER TABLE partitioned ADD CONSTRAINT chk_a CHECK (a > 0) NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +DROP TABLE partitioned, nonpartitioned; +-- +-- ATTACH PARTITION +-- +-- check that target table is partitioned +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part (like unparted); +ALTER TABLE unparted ATTACH PARTITION fail_part FOR VALUES IN ('a'); +ERROR: table "unparted" is not partitioned +DROP TABLE unparted, fail_part; +-- check that partition bound is compatible +CREATE TABLE list_parted ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +) PARTITION BY LIST (a); 
+CREATE TABLE fail_part (LIKE list_parted); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) TO (10); +ERROR: invalid bound specification for a list partition +LINE 1: ...list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) T... + ^ +DROP TABLE fail_part; +-- check that the table being attached exists +ALTER TABLE list_parted ATTACH PARTITION nonexistant FOR VALUES IN (1); +ERROR: relation "nonexistant" does not exist +-- check ownership of the source table +CREATE ROLE regress_test_me; +CREATE ROLE regress_test_not_me; +CREATE TABLE not_owned_by_me (LIKE list_parted); +ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; +SET SESSION AUTHORIZATION regress_test_me; +CREATE TABLE owned_by_me ( + a int +) PARTITION BY LIST (a); +ALTER TABLE owned_by_me ATTACH PARTITION not_owned_by_me FOR VALUES IN (1); +ERROR: must be owner of table not_owned_by_me +RESET SESSION AUTHORIZATION; +DROP TABLE owned_by_me, not_owned_by_me; +DROP ROLE regress_test_not_me; +DROP ROLE regress_test_me; +-- check that the table being attached is not part of regular inheritance +CREATE TABLE parent (LIKE list_parted); +CREATE TABLE child () INHERITS (parent); +ALTER TABLE list_parted ATTACH PARTITION child FOR VALUES IN (1); +ERROR: cannot attach inheritance child as partition +ALTER TABLE list_parted ATTACH PARTITION parent FOR VALUES IN (1); +ERROR: cannot attach inheritance parent as partition +DROP TABLE parent CASCADE; +NOTICE: drop cascades to table child +-- check any TEMP-ness +CREATE TEMP TABLE temp_parted (a int) PARTITION BY LIST (a); +CREATE TABLE perm_part (a int); +ALTER TABLE temp_parted ATTACH PARTITION perm_part FOR VALUES IN (1); +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted, perm_part; +-- check that the table being attached is not a typed table +CREATE TYPE mytype AS (a int); +CREATE TABLE fail_part OF mytype; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: cannot attach a typed table as partition +DROP TYPE mytype CASCADE; +NOTICE: drop cascades to table fail_part +-- check that the table being attached has only columns present in the parent +CREATE TABLE fail_part (like list_parted, c int); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: table "fail_part" contains column "c" not found in parent "list_parted" +DETAIL: The new partition may contain only the columns present in parent. 
+DROP TABLE fail_part; +-- check that the table being attached has every column of the parent +CREATE TABLE fail_part (a int NOT NULL); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing column "b" +DROP TABLE fail_part; +-- check that columns match in type, collation and NOT NULL status +CREATE TABLE fail_part ( + b char(3), + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different type for column "b" +ALTER TABLE fail_part ALTER b TYPE char (2) COLLATE "POSIX"; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different collation for column "b" +DROP TABLE fail_part; +-- check that the table being attached has all constraints of the parent +CREATE TABLE fail_part ( + b char(2) COLLATE "C", + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing constraint "check_a" +-- check that the constraint matches in definition with parent's constraint +ALTER TABLE fail_part ADD CONSTRAINT check_a CHECK (a >= 0); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different definition for check constraint "check_a" +DROP TABLE fail_part; +-- check the attributes and constraints after partition is attached +CREATE TABLE part_1 ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +); +ALTER TABLE list_parted ATTACH PARTITION part_1 FOR VALUES IN (1); +-- attislocal and conislocal are always false for merged attributes and constraints respectively. +SELECT attislocal, attinhcount FROM pg_attribute WHERE attrelid = 'part_1'::regclass AND attnum > 0; + attislocal | attinhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::regclass AND conname = 'check_a'; + conislocal | coninhcount +------------+------------- + f | 1 +(1 row) + +-- check that the new partition won't overlap with an existing partition +CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; +ERROR: partition "fail_def_part" conflicts with existing default partition "def_part" +-- check validation when attaching list partitions +CREATE TABLE list_parted2 ( + a int, + b char +) PARTITION BY LIST (a); +-- check that violating rows are correctly reported +CREATE TABLE part_2 (LIKE list_parted2); +INSERT INTO part_2 VALUES (3, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part_2; +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES 
(11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +ERROR: updated partition constraint for default partition would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part_3_4 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IN (3)) +); +-- however, if a list partition does not accept nulls, there should be +-- an explicit NOT NULL constraint on the partition key column for the +-- validation scan to be skipped; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- adding a NOT NULL constraint will cause the scan to be skipped +ALTER TABLE list_parted2 DETACH PARTITION part_3_4; +ALTER TABLE part_3_4 ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +INFO: partition constraint for table "part_3_4" is implied by existing constraints +-- check if default partition scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints +-- check validation when attaching range partitions +CREATE TABLE range_parted ( + a int, + b int +) PARTITION BY RANGE (a, b); +-- check that violating rows are correctly reported +CREATE TABLE part1 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 1 AND b <= 10) +); +INSERT INTO part1 VALUES (1, 10); +-- Remember the TO bound is exclusive +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part1; +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part2 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 10 AND b < 18) +); +ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); +INFO: partition constraint for table "part2" is implied by existing constraints +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; +ERROR: partition "partr_def2" conflicts with existing default partition "partr_def1" +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); +ERROR: updated partition constraint for default partition would be violated by some row +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE part_5 ( + LIKE list_parted2 +) PARTITION BY LIST (b); +-- check 
that violating rows are correctly reported +CREATE TABLE part_5_a PARTITION OF part_5 FOR VALUES IN ('a'); +INSERT INTO part_5_a (a, b) VALUES (6, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +ERROR: partition constraint is violated by some row +-- delete the faulting row and also add a constraint to skip the scan +DELETE FROM part_5_a WHERE a NOT IN (3); +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 5); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +INFO: partition constraint for table "part_5" is implied by existing constraints +ALTER TABLE list_parted2 DETACH PARTITION part_5; +ALTER TABLE part_5 DROP CONSTRAINT check_a; +-- scan should again be skipped, even though NOT NULL is now a column property +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IN (5)), ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +INFO: partition constraint for table "part_5" is implied by existing constraints +-- Check the case where attnos of the partitioning columns in the table being +-- attached differs from the parent. It should not affect the constraint- +-- checking logic that allows to skip the scan. +CREATE TABLE part_6 ( + c int, + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 6) +); +ALTER TABLE part_6 DROP c; +ALTER TABLE list_parted2 ATTACH PARTITION part_6 FOR VALUES IN (6); +INFO: partition constraint for table "part_6" is implied by existing constraints +-- Similar to above, but the table being attached is a partitioned table +-- whose partition has still different attnos for the root partitioning +-- columns. +CREATE TABLE part_7 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +) PARTITION BY LIST (b); +CREATE TABLE part_7_a_null ( + c int, + d int, + e int, + LIKE list_parted2, -- 'a' will have attnum = 4 + CONSTRAINT check_b CHECK (b IS NULL OR b = 'a'), + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +); +ALTER TABLE part_7_a_null DROP c, DROP d, DROP e; +ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null); +INFO: partition constraint for table "part_7_a_null" is implied by existing constraints +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +INFO: partition constraint for table "part_7" is implied by existing constraints +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints +-- Same example, but check this time that the constraint correctly detects +-- violating rows +ALTER TABLE list_parted2 DETACH PARTITION part_7; +ALTER TABLE part_7 DROP CONSTRAINT check_a; -- thusly, scan won't be skipped +INSERT INTO part_7 (a, b) VALUES (8, null), (9, 'a'); +SELECT tableoid::regclass, a, b FROM part_7 order by a; + tableoid | a | b +---------------+---+--- + part_7_a_null | 8 | + part_7_a_null | 9 | a +(2 rows) + +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints +ERROR: partition constraint is violated by some row +-- check that leaf partitions of default partition are scanned when +-- attaching a partitioned table. 
+ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +ERROR: updated partition constraint for default partition would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +-- check that the table being attached is not already a partition +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: "part_2" is already a partition +-- check that circular inheritance is not allowed +ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); +ERROR: circular inheritance not allowed +DETAIL: "part_5" is already a child of "list_parted2". +ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); +ERROR: circular inheritance not allowed +DETAIL: "list_parted2" is already a child of "list_parted2". +-- If a partitioned table being created or an existing table being attached +-- as a partition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. +CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! +CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +INFO: updated partition constraint for default partition "quuux_default1" is implied by existing constraints +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); +INFO: updated partition constraint for default partition "quuux_default1" is implied by existing constraints +DROP TABLE quuux; +-- check validation when attaching hash partitions +-- Use hand-rolled hash functions and operator class to get predictable result +-- on different machines. part_test_int4_ops is defined in insert.sql.
+-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a part_test_int4_ops); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attached has a valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; +-- +-- DETACH PARTITION +-- +-- check that the table is partitioned at all +CREATE TABLE regular_table (a int); +ALTER TABLE regular_table DETACH PARTITION any_name; +ERROR: table "regular_table" is not partitioned +DROP TABLE regular_table; +-- check that the partition being detached exists at all +ALTER TABLE list_parted2 DETACH PARTITION part_4; +ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist +-- check that the partition being detached is actually a partition of the parent +CREATE TABLE not_a_part (a int); +ALTER TABLE list_parted2 DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "list_parted2" +ALTER TABLE list_parted2 DETACH PARTITION part_1; +ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; +-- check that, after being detached, attinhcount/coninhcount is dropped to 0 and +-- attislocal/conislocal is set to true +ALTER TABLE list_parted2 DETACH
PARTITION part_3_4; +SELECT attinhcount, attislocal FROM pg_attribute WHERE attrelid = 'part_3_4'::regclass AND attnum > 0; + attinhcount | attislocal +-------------+------------ + 0 | t + 0 | t +(2 rows) + +SELECT coninhcount, conislocal FROM pg_constraint WHERE conrelid = 'part_3_4'::regclass AND conname = 'check_a'; + coninhcount | conislocal +-------------+------------ + 0 | t +(1 row) + +DROP TABLE part_3_4; +-- check that a detached partition is not dropped on dropping a partitioned table +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE(a); +CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100); +ALTER TABLE range_parted2 DETACH PARTITION part_rp; +DROP TABLE range_parted2; +SELECT * from part_rp; + a +--- +(0 rows) + +DROP TABLE part_rp; +-- Check ALTER TABLE commands for partitioned tables and partitions +-- cannot add/drop column to/from *only* the parent +ALTER TABLE ONLY list_parted2 ADD COLUMN c int; +ERROR: column must be added to child tables too +ALTER TABLE ONLY list_parted2 DROP COLUMN b; +ERROR: cannot drop column from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +-- cannot add a column to partition or drop an inherited one +ALTER TABLE part_2 ADD COLUMN c text; +ERROR: cannot add column to a partition +ALTER TABLE part_2 DROP COLUMN b; +ERROR: cannot drop inherited column "b" +-- Nor rename, alter type +ALTER TABLE part_2 RENAME COLUMN b to c; +ERROR: cannot rename inherited column "b" +ALTER TABLE part_2 ALTER COLUMN b TYPE text; +ERROR: cannot alter inherited column "b" +-- cannot add/drop NOT NULL or check constraints to *only* the parent, when +-- partitions exist +ALTER TABLE ONLY list_parted2 ALTER b SET NOT NULL; +ERROR: constraint must be added to child tables too +DETAIL: Column "b" of relation "part_2" is not already NOT NULL. +HINT: Do not specify the ONLY keyword. +ALTER TABLE ONLY list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ERROR: constraint must be added to child tables too +ALTER TABLE list_parted2 ALTER b SET NOT NULL; +ALTER TABLE ONLY list_parted2 ALTER b DROP NOT NULL; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +ALTER TABLE list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ALTER TABLE ONLY list_parted2 DROP CONSTRAINT check_b; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. 
+-- It's alright though, if no partitions are yet created +CREATE TABLE parted_no_parts (a int) PARTITION BY LIST (a); +ALTER TABLE ONLY parted_no_parts ALTER a SET NOT NULL; +ALTER TABLE ONLY parted_no_parts ADD CONSTRAINT check_a CHECK (a > 0); +ALTER TABLE ONLY parted_no_parts ALTER a DROP NOT NULL; +ALTER TABLE ONLY parted_no_parts DROP CONSTRAINT check_a; +DROP TABLE parted_no_parts; +-- cannot drop inherited NOT NULL or check constraints from partition +ALTER TABLE list_parted2 ALTER b SET NOT NULL, ADD CONSTRAINT check_a2 CHECK (a > 0); +ALTER TABLE part_2 ALTER b DROP NOT NULL; +ERROR: column "b" is marked NOT NULL in parent table +ALTER TABLE part_2 DROP CONSTRAINT check_a2; +ERROR: cannot drop inherited constraint "check_a2" of relation "part_2" +-- Doesn't make sense to add NO INHERIT constraints on partitioned tables +ALTER TABLE list_parted2 add constraint check_b2 check (b <> 'zz') NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "list_parted2" +-- check that a partition cannot participate in regular inheritance +CREATE TABLE inh_test () INHERITS (part_2); +ERROR: cannot inherit from partition "part_2" +CREATE TABLE inh_test (LIKE part_2); +ALTER TABLE inh_test INHERIT part_2; +ERROR: cannot inherit from a partition +ALTER TABLE part_2 INHERIT inh_test; +ERROR: cannot change inheritance of a partition +-- cannot drop or alter type of partition key columns of lower level +-- partitioned tables; for example, part_5, which is list_parted2's +-- partition, is partitioned on b; +ALTER TABLE list_parted2 DROP COLUMN b; +ERROR: cannot drop column named in partition key +ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; +ERROR: cannot alter type of column named in partition key +-- dropping non-partition key columns should be allowed on the parent table. 
+ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + a +--- +(0 rows) + +-- cleanup +DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; +DROP TABLE hash_parted; +-- more tests for certain multi-level partitioning scenarios +create table p (a int, b int) partition by range (a, b); +create table p1 (b int, a int not null) partition by range (b); +create table p11 (like p1); +alter table p11 drop a; +alter table p11 add a int; +alter table p11 drop a; +alter table p11 add a int not null; +-- attnum for key attribute 'a' is different in p, p1, and p11 +select attrelid::regclass, attname, attnum +from pg_attribute +where attname = 'a' + and (attrelid = 'p'::regclass + or attrelid = 'p1'::regclass + or attrelid = 'p11'::regclass) +order by attrelid::regclass::text; + attrelid | attname | attnum +----------+---------+-------- + p | a | 1 + p1 | a | 2 + p11 | a | 4 +(3 rows) + +alter table p1 attach partition p11 for values from (2) to (5); +insert into p1 (a, b) values (2, 3); +-- check that partition validation scan correctly detects violating rows +alter table p attach partition p1 for values from (1, 2) to (1, 10); +ERROR: partition constraint is violated by some row +-- cleanup +drop table p; +drop table p1; +-- validate constraint on partitioned tables should only scan leaf partitions +create table parted_validate_test (a int) partition by list (a); +create table parted_validate_test_1 partition of parted_validate_test for values in (0, 1); +alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; +alter table parted_validate_test validate constraint parted_validate_test_chka; +drop table parted_validate_test; +-- test alter column options +CREATE TABLE attmp(i integer); +INSERT INTO attmp VALUES (1); +ALTER TABLE attmp ALTER COLUMN i SET (n_distinct = 1, n_distinct_inherited = 2); +ALTER TABLE attmp ALTER COLUMN i RESET (n_distinct_inherited); +ANALYZE attmp; +DROP TABLE attmp; +DROP USER regress_alter_table_user1; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; +-- check combinations of temporary and permanent relations when attaching +-- partitions. 
+create table perm_part_parent (a int) partition by list (a); +create temp table temp_part_parent (a int) partition by list (a); +create table perm_part_child (a int); +create temp table temp_part_child (a int); +alter table temp_part_parent attach partition perm_part_child default; -- error +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_part_parent" +alter table perm_part_parent attach partition temp_part_child default; -- error +ERROR: cannot attach a temporary relation as partition of permanent relation "perm_part_parent" +alter table temp_part_parent attach partition temp_part_child default; -- ok +drop table perm_part_parent cascade; +drop table temp_part_parent cascade; +-- check that attaching partitions to a table while it is being used is +-- prevented +create table tab_part_attach (a int) partition by list (a); +create or replace function func_part_attach() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_attach_1 (a int)'; + execute 'alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)'; + return null; + end $$; +create trigger trig_part_attach before insert on tab_part_attach + for each statement execute procedure func_part_attach(); +insert into tab_part_attach values (1); +ERROR: cannot ALTER TABLE "tab_part_attach" because it is being used by active queries in this session +CONTEXT: SQL statement "alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)" +PL/pgSQL function func_part_attach() line 4 at EXECUTE +drop table tab_part_attach; +drop function func_part_attach(); +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/expected/cluster_1.out b/src/test/regress/expected/cluster_1.out new file mode 100644 index 0000000000..a707ea30cb --- /dev/null +++ b/src/test/regress/expected/cluster_1.out @@ -0,0 +1,475 @@ +-- +-- CLUSTER +-- +CREATE TABLE clstr_tst_s (rf_a SERIAL PRIMARY KEY, + b INT); +CREATE TABLE clstr_tst (a SERIAL PRIMARY KEY, + b INT, + c TEXT, + d TEXT, + CONSTRAINT clstr_tst_con FOREIGN KEY (b) REFERENCES clstr_tst_s); +CREATE INDEX clstr_tst_b ON clstr_tst (b); +CREATE INDEX clstr_tst_c ON clstr_tst (c); +CREATE INDEX clstr_tst_c_b ON clstr_tst (c,b); +CREATE INDEX clstr_tst_b_c ON clstr_tst (b,c); +INSERT INTO clstr_tst_s (b) VALUES (0); +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +CREATE TABLE 
clstr_tst_inh () INHERITS (clstr_tst); +INSERT INTO clstr_tst (b, c) VALUES (11, 'once'); +INSERT INTO clstr_tst (b, c) VALUES (10, 'diez'); +INSERT INTO clstr_tst (b, c) VALUES (31, 'treinta y uno'); +INSERT INTO clstr_tst (b, c) VALUES (22, 'veintidos'); +INSERT INTO clstr_tst (b, c) VALUES (3, 'tres'); +INSERT INTO clstr_tst (b, c) VALUES (20, 'veinte'); +INSERT INTO clstr_tst (b, c) VALUES (23, 'veintitres'); +INSERT INTO clstr_tst (b, c) VALUES (21, 'veintiuno'); +INSERT INTO clstr_tst (b, c) VALUES (4, 'cuatro'); +INSERT INTO clstr_tst (b, c) VALUES (14, 'catorce'); +INSERT INTO clstr_tst (b, c) VALUES (2, 'dos'); +INSERT INTO clstr_tst (b, c) VALUES (18, 'dieciocho'); +INSERT INTO clstr_tst (b, c) VALUES (27, 'veintisiete'); +INSERT INTO clstr_tst (b, c) VALUES (25, 'veinticinco'); +INSERT INTO clstr_tst (b, c) VALUES (13, 'trece'); +INSERT INTO clstr_tst (b, c) VALUES (28, 'veintiocho'); +INSERT INTO clstr_tst (b, c) VALUES (32, 'treinta y dos'); +INSERT INTO clstr_tst (b, c) VALUES (5, 'cinco'); +INSERT INTO clstr_tst (b, c) VALUES (29, 'veintinueve'); +INSERT INTO clstr_tst (b, c) VALUES (1, 'uno'); +INSERT INTO clstr_tst (b, c) VALUES (24, 'veinticuatro'); +INSERT INTO clstr_tst (b, c) VALUES (30, 'treinta'); +INSERT INTO clstr_tst (b, c) VALUES (12, 'doce'); +INSERT INTO clstr_tst (b, c) VALUES (17, 'diecisiete'); +INSERT INTO clstr_tst (b, c) VALUES (9, 'nueve'); +INSERT INTO clstr_tst (b, c) VALUES (19, 'diecinueve'); +INSERT INTO clstr_tst (b, c) VALUES (26, 'veintiseis'); +INSERT INTO clstr_tst (b, c) VALUES (15, 'quince'); +INSERT INTO clstr_tst (b, c) VALUES (7, 'siete'); +INSERT INTO clstr_tst (b, c) VALUES (16, 'dieciseis'); +INSERT INTO clstr_tst (b, c) VALUES (8, 'ocho'); +-- This entry is needed to test that TOASTED values are copied correctly. 
+INSERT INTO clstr_tst (b, c, d) VALUES (6, 'seis', repeat('xyzzy', 100000)); +CLUSTER clstr_tst_c ON clstr_tst; +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst; + a | b | c | substring | length +----+----+---------------+--------------------------------+-------- + 10 | 14 | catorce | | + 18 | 5 | cinco | | + 9 | 4 | cuatro | | + 26 | 19 | diecinueve | | + 12 | 18 | dieciocho | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 2 | 10 | diez | | + 23 | 12 | doce | | + 11 | 2 | dos | | + 25 | 9 | nueve | | + 31 | 8 | ocho | | + 1 | 11 | once | | + 28 | 15 | quince | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 15 | 13 | trece | | + 22 | 30 | treinta | | + 17 | 32 | treinta y dos | | + 3 | 31 | treinta y uno | | + 5 | 3 | tres | | + 20 | 1 | uno | | + 6 | 20 | veinte | | + 14 | 25 | veinticinco | | + 21 | 24 | veinticuatro | | + 4 | 22 | veintidos | | + 19 | 29 | veintinueve | | + 16 | 28 | veintiocho | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | +(32 rows) + +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY a; + a | b | c | substring | length +----+----+---------------+--------------------------------+-------- + 1 | 11 | once | | + 2 | 10 | diez | | + 3 | 31 | treinta y uno | | + 4 | 22 | veintidos | | + 5 | 3 | tres | | + 6 | 20 | veinte | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | + 9 | 4 | cuatro | | + 10 | 14 | catorce | | + 11 | 2 | dos | | + 12 | 18 | dieciocho | | + 13 | 27 | veintisiete | | + 14 | 25 | veinticinco | | + 15 | 13 | trece | | + 16 | 28 | veintiocho | | + 17 | 32 | treinta y dos | | + 18 | 5 | cinco | | + 19 | 29 | veintinueve | | + 20 | 1 | uno | | + 21 | 24 | veinticuatro | | + 22 | 30 | treinta | | + 23 | 12 | doce | | + 24 | 17 | diecisiete | | + 25 | 9 | nueve | | + 26 | 19 | diecinueve | | + 27 | 26 | veintiseis | | + 28 | 15 | quince | | + 29 | 7 | siete | | + 30 | 16 | dieciseis | | + 31 | 8 | ocho | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 +(32 rows) + +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY b; + a | b | c | substring | length +----+----+---------------+--------------------------------+-------- + 20 | 1 | uno | | + 11 | 2 | dos | | + 5 | 3 | tres | | + 9 | 4 | cuatro | | + 18 | 5 | cinco | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 31 | 8 | ocho | | + 25 | 9 | nueve | | + 2 | 10 | diez | | + 1 | 11 | once | | + 23 | 12 | doce | | + 15 | 13 | trece | | + 10 | 14 | catorce | | + 28 | 15 | quince | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 12 | 18 | dieciocho | | + 26 | 19 | diecinueve | | + 6 | 20 | veinte | | + 8 | 21 | veintiuno | | + 4 | 22 | veintidos | | + 7 | 23 | veintitres | | + 21 | 24 | veinticuatro | | + 14 | 25 | veinticinco | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 16 | 28 | veintiocho | | + 19 | 29 | veintinueve | | + 22 | 30 | treinta | | + 3 | 31 | treinta y uno | | + 17 | 32 | treinta y dos | | +(32 rows) + +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY c; + a | b | c | substring | length +----+----+---------------+--------------------------------+-------- + 10 | 14 | catorce | | + 18 | 5 | cinco | | + 9 | 4 | cuatro | | + 26 | 19 | diecinueve | | + 12 | 18 | dieciocho | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 2 | 10 | diez | | + 23 | 12 | doce | | + 11 | 2 | dos | | + 25 | 9 | nueve | | + 31 | 8 | ocho | | + 1 | 11 | once | | + 28 | 15 | 
quince | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 15 | 13 | trece | | + 22 | 30 | treinta | | + 17 | 32 | treinta y dos | | + 3 | 31 | treinta y uno | | + 5 | 3 | tres | | + 20 | 1 | uno | | + 6 | 20 | veinte | | + 14 | 25 | veinticinco | | + 21 | 24 | veinticuatro | | + 4 | 22 | veintidos | | + 19 | 29 | veintinueve | | + 16 | 28 | veintiocho | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | +(32 rows) + +-- Verify that inheritance link still works +INSERT INTO clstr_tst_inh VALUES (0, 100, 'in child table'); +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst; + a | b | c | substring | length +----+-----+----------------+--------------------------------+-------- + 10 | 14 | catorce | | + 18 | 5 | cinco | | + 9 | 4 | cuatro | | + 26 | 19 | diecinueve | | + 12 | 18 | dieciocho | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 2 | 10 | diez | | + 23 | 12 | doce | | + 11 | 2 | dos | | + 25 | 9 | nueve | | + 31 | 8 | ocho | | + 1 | 11 | once | | + 28 | 15 | quince | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 15 | 13 | trece | | + 22 | 30 | treinta | | + 17 | 32 | treinta y dos | | + 3 | 31 | treinta y uno | | + 5 | 3 | tres | | + 20 | 1 | uno | | + 6 | 20 | veinte | | + 14 | 25 | veinticinco | | + 21 | 24 | veinticuatro | | + 4 | 22 | veintidos | | + 19 | 29 | veintinueve | | + 16 | 28 | veintiocho | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | + 0 | 100 | in child table | | +(33 rows) + +-- Verify that foreign key link still works +INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail'); +ERROR: insert or update on table "clstr_tst" violates foreign key constraint "clstr_tst_con" +DETAIL: Key (b)=(1111) is not present in table "clstr_tst_s". 
+SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass +ORDER BY 1; + conname +---------------- + clstr_tst_con + clstr_tst_pkey +(2 rows) + +SELECT relname, relkind, + EXISTS(SELECT 1 FROM pg_class WHERE oid = c.reltoastrelid) AS hastoast +FROM pg_class c WHERE relname LIKE 'clstr_tst%' ORDER BY relname; + relname | relkind | hastoast +----------------------+---------+---------- + clstr_tst | r | f + clstr_tst_a_seq | S | f + clstr_tst_b | i | f + clstr_tst_b_c | i | f + clstr_tst_c | i | f + clstr_tst_c_b | i | f + clstr_tst_inh | r | f + clstr_tst_pkey | i | f + clstr_tst_s | r | f + clstr_tst_s_pkey | i | f + clstr_tst_s_rf_a_seq | S | f +(11 rows) + +-- Verify that indisclustered is correctly set +SELECT pg_class.relname FROM pg_index, pg_class, pg_class AS pg_class_2 +WHERE pg_class.oid=indexrelid + AND indrelid=pg_class_2.oid + AND pg_class_2.relname = 'clstr_tst' + AND indisclustered; + relname +------------- + clstr_tst_c +(1 row) + +-- Try changing indisclustered +ALTER TABLE clstr_tst CLUSTER ON clstr_tst_b_c; +SELECT pg_class.relname FROM pg_index, pg_class, pg_class AS pg_class_2 +WHERE pg_class.oid=indexrelid + AND indrelid=pg_class_2.oid + AND pg_class_2.relname = 'clstr_tst' + AND indisclustered; + relname +--------------- + clstr_tst_b_c +(1 row) + +-- Try turning off all clustering +ALTER TABLE clstr_tst SET WITHOUT CLUSTER; +SELECT pg_class.relname FROM pg_index, pg_class, pg_class AS pg_class_2 +WHERE pg_class.oid=indexrelid + AND indrelid=pg_class_2.oid + AND pg_class_2.relname = 'clstr_tst' + AND indisclustered; + relname +--------- +(0 rows) + +-- Verify that clustering all tables does in fact cluster the right ones +CREATE USER regress_clstr_user; +CREATE TABLE clstr_1 (a INT PRIMARY KEY); +CREATE TABLE clstr_2 (a INT PRIMARY KEY); +CREATE TABLE clstr_3 (a INT PRIMARY KEY); +ALTER TABLE clstr_1 OWNER TO regress_clstr_user; +ALTER TABLE clstr_3 OWNER TO regress_clstr_user; +GRANT SELECT ON clstr_2 TO regress_clstr_user; +INSERT INTO clstr_1 VALUES (2); +INSERT INTO clstr_1 VALUES (1); +INSERT INTO clstr_2 VALUES (2); +INSERT INTO clstr_2 VALUES (1); +INSERT INTO clstr_3 VALUES (2); +INSERT INTO clstr_3 VALUES (1); +-- "CLUSTER <tablename>" on a table that hasn't been clustered +CLUSTER clstr_2; +ERROR: there is no previously clustered index for table "clstr_2" +CLUSTER clstr_1_pkey ON clstr_1; +CLUSTER clstr_2 USING clstr_2_pkey; +SELECT * FROM clstr_1 UNION ALL + SELECT * FROM clstr_2 UNION ALL + SELECT * FROM clstr_3; + a +--- + 1 + 2 + 1 + 2 + 2 + 1 +(6 rows) + +-- revert to the original state +DELETE FROM clstr_1; +DELETE FROM clstr_2; +DELETE FROM clstr_3; +INSERT INTO clstr_1 VALUES (2); +INSERT INTO clstr_1 VALUES (1); +INSERT INTO clstr_2 VALUES (2); +INSERT INTO clstr_2 VALUES (1); +INSERT INTO clstr_3 VALUES (2); +INSERT INTO clstr_3 VALUES (1); +-- this user can only cluster clstr_1 and clstr_3, but the latter +-- has not been clustered +SET SESSION AUTHORIZATION regress_clstr_user; +CLUSTER; +SELECT * FROM clstr_1 UNION ALL + SELECT * FROM clstr_2 UNION ALL + SELECT * FROM clstr_3; + a +--- + 1 + 2 + 2 + 1 + 2 + 1 +(6 rows) + +-- cluster a single table using the indisclustered bit previously set +DELETE FROM clstr_1; +INSERT INTO clstr_1 VALUES (2); +INSERT INTO clstr_1 VALUES (1); +CLUSTER clstr_1; +SELECT * FROM clstr_1; + a +--- + 1 + 2 +(2 rows) + +-- Test MVCC-safety of cluster. There isn't much we can do to verify the +-- results with a single backend...
+CREATE TABLE clustertest (key int PRIMARY KEY); +INSERT INTO clustertest VALUES (10); +INSERT INTO clustertest VALUES (20); +INSERT INTO clustertest VALUES (30); +INSERT INTO clustertest VALUES (40); +INSERT INTO clustertest VALUES (50); +-- Use a transaction so that updates are not committed when CLUSTER sees 'em +BEGIN; +-- Test update where the old row version is found first in the scan +UPDATE clustertest SET key = 100 WHERE key = 10; +-- Test update where the new row version is found first in the scan +UPDATE clustertest SET key = 35 WHERE key = 40; +-- Test longer update chain +UPDATE clustertest SET key = 60 WHERE key = 50; +UPDATE clustertest SET key = 70 WHERE key = 60; +UPDATE clustertest SET key = 80 WHERE key = 70; +SELECT * FROM clustertest; + key +----- + 20 + 30 + 100 + 35 + 80 +(5 rows) + +CLUSTER clustertest_pkey ON clustertest; +SELECT * FROM clustertest; + key +----- + 20 + 30 + 35 + 80 + 100 +(5 rows) + +COMMIT; +SELECT * FROM clustertest; + key +----- + 20 + 30 + 35 + 80 + 100 +(5 rows) + +-- check that temp tables can be clustered +create temp table clstr_temp (col1 int primary key, col2 text); +insert into clstr_temp values (2, 'two'), (1, 'one'); +cluster clstr_temp using clstr_temp_pkey; +select * from clstr_temp; + col1 | col2 +------+------ + 1 | one + 2 | two +(2 rows) + +drop table clstr_temp; +RESET SESSION AUTHORIZATION; +-- Check that partitioned tables cannot be clustered +CREATE TABLE clstrpart (a int) PARTITION BY RANGE (a); +CREATE INDEX clstrpart_idx ON clstrpart (a); +ALTER TABLE clstrpart CLUSTER ON clstrpart_idx; +ERROR: cannot mark index clustered in partitioned table +CLUSTER clstrpart USING clstrpart_idx; +ERROR: cannot cluster a partitioned table +DROP TABLE clstrpart; +-- Test CLUSTER with external tuplesorting +create table clstr_4 as select * from tenk1; +create index cluster_sort on clstr_4 (hundred, thousand, tenthous); +-- ensure we don't use the index in CLUSTER nor the checking SELECTs +set enable_indexscan = off; +-- Use external sort: +set maintenance_work_mem = '1MB'; +cluster clstr_4 using cluster_sort; +select * from +(select hundred, lag(hundred) over () as lhundred, + thousand, lag(thousand) over () as lthousand, + tenthous, lag(tenthous) over () as ltenthous from clstr_4) ss +where row(hundred, thousand, tenthous) <= row(lhundred, lthousand, ltenthous); + hundred | lhundred | thousand | lthousand | tenthous | ltenthous +---------+----------+----------+-----------+----------+----------- +(0 rows) + +reset enable_indexscan; +reset maintenance_work_mem; +-- clean up +DROP TABLE clustertest; +DROP TABLE clstr_1; +DROP TABLE clstr_2; +DROP TABLE clstr_3; +DROP TABLE clstr_4; +DROP USER regress_clstr_user; diff --git a/src/test/regress/expected/create_am.out b/src/test/regress/expected/create_am.out index 352959b751..6eae2bab97 100644 --- a/src/test/regress/expected/create_am.out +++ b/src/test/regress/expected/create_am.out @@ -126,11 +126,12 @@ ERROR: function int4in(internal) does not exist CREATE ACCESS METHOD bogus TYPE TABLE HANDLER bthandler; ERROR: function bthandler must return type table_am_handler SELECT amname, amhandler, amtype FROM pg_am where amtype = 't' ORDER BY 1, 2; - amname | amhandler | amtype ---------+----------------------+-------- - heap | heap_tableam_handler | t - heap2 | heap_tableam_handler | t -(2 rows) + amname | amhandler | amtype +----------+--------------------------+-------- + heap | heap_tableam_handler | t + heap2 | heap_tableam_handler | t + zedstore | zedstore_tableam_handler | t +(3 rows) -- 
First create tables employing the new AM using USING -- plain CREATE TABLE diff --git a/src/test/regress/expected/fsm_1.out b/src/test/regress/expected/fsm_1.out new file mode 100644 index 0000000000..9b5f9be13a --- /dev/null +++ b/src/test/regress/expected/fsm_1.out @@ -0,0 +1,73 @@ +-- +-- Free Space Map test +-- +SELECT current_setting('block_size')::integer AS blocksize, +current_setting('block_size')::integer / 8 AS strsize +\gset +CREATE TABLE fsm_check_size (num int, str text); +-- Fill 3 blocks with one record each +ALTER TABLE fsm_check_size SET (fillfactor=15); +INSERT INTO fsm_check_size SELECT i, rpad('', :strsize, 'a') +FROM generate_series(1,3) i; +-- There should be no FSM +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'main') / :blocksize AS heap_nblocks, +pg_relation_size('fsm_check_size', 'fsm') / :blocksize AS fsm_nblocks; + heap_nblocks | fsm_nblocks +--------------+------------- + 5 | 0 +(1 row) + +-- The following operations are for testing the functionality of the local +-- in-memory map. In particular, we want to be able to insert into some +-- other block than the one at the end of the heap, without using a FSM. +-- Fill most of the last block +ALTER TABLE fsm_check_size SET (fillfactor=100); +INSERT INTO fsm_check_size SELECT i, rpad('', :strsize, 'a') +FROM generate_series(101,105) i; +-- Make sure records can go into any block but the last one +ALTER TABLE fsm_check_size SET (fillfactor=30); +-- Insert large record and make sure it does not cause the relation to extend +INSERT INTO fsm_check_size VALUES (111, rpad('', :strsize, 'a')); +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'main') / :blocksize AS heap_nblocks, +pg_relation_size('fsm_check_size', 'fsm') / :blocksize AS fsm_nblocks; + heap_nblocks | fsm_nblocks +--------------+------------- + 5 | 0 +(1 row) + +-- Extend table with enough blocks to exceed the FSM threshold +DO $$ +DECLARE curtid tid; +num int; +BEGIN +num = 11; + LOOP + INSERT INTO fsm_check_size VALUES (num, 'b') RETURNING ctid INTO curtid; + EXIT WHEN curtid >= tid '(4, 0)'; + num = num + 1; + END LOOP; +END; +$$; +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'fsm') / :blocksize AS fsm_nblocks; + fsm_nblocks +------------- + 0 +(1 row) + +-- Add long random string to extend TOAST table to 1 block +INSERT INTO fsm_check_size +VALUES(0, (SELECT string_agg(md5(chr(i)), '') + FROM generate_series(1, :blocksize / 100) i)); +VACUUM fsm_check_size; +SELECT pg_relation_size(reltoastrelid, 'main') / :blocksize AS toast_nblocks, +pg_relation_size(reltoastrelid, 'fsm') / :blocksize AS toast_fsm_nblocks +FROM pg_class WHERE relname = 'fsm_check_size'; + toast_nblocks | toast_fsm_nblocks +---------------+------------------- + | +(1 row) + +DROP TABLE fsm_check_size; diff --git a/src/test/regress/expected/rangefuncs_1.out b/src/test/regress/expected/rangefuncs_1.out new file mode 100644 index 0000000000..78b177ceb0 --- /dev/null +++ b/src/test/regress/expected/rangefuncs_1.out @@ -0,0 +1,2100 @@ +CREATE TABLE rngfunc2(rngfuncid int, f2 int); +INSERT INTO rngfunc2 VALUES(1, 11); +INSERT INTO rngfunc2 VALUES(2, 22); +INSERT INTO rngfunc2 VALUES(1, 111); +CREATE FUNCTION rngfunct(int) returns setof rngfunc2 as 'SELECT * FROM rngfunc2 WHERE rngfuncid = $1 ORDER BY f2;' LANGUAGE SQL; +-- function with ORDINALITY +select * from rngfunct(1) with ordinality as z(a,b,ord); + a | b | ord +---+-----+----- + 1 | 11 | 1 + 1 | 111 | 2 +(2 rows) + +select * from rngfunct(1) with ordinality as z(a,b,ord) 
where b > 100; -- ordinal 2, not 1 + a | b | ord +---+-----+----- + 1 | 111 | 2 +(1 row) + +-- ordinality vs. column names and types +select a,b,ord from rngfunct(1) with ordinality as z(a,b,ord); + a | b | ord +---+-----+----- + 1 | 11 | 1 + 1 | 111 | 2 +(2 rows) + +select a,ord from unnest(array['a','b']) with ordinality as z(a,ord); + a | ord +---+----- + a | 1 + b | 2 +(2 rows) + +select * from unnest(array['a','b']) with ordinality as z(a,ord); + a | ord +---+----- + a | 1 + b | 2 +(2 rows) + +select a,ord from unnest(array[1.0::float8]) with ordinality as z(a,ord); + a | ord +---+----- + 1 | 1 +(1 row) + +select * from unnest(array[1.0::float8]) with ordinality as z(a,ord); + a | ord +---+----- + 1 | 1 +(1 row) + +select row_to_json(s.*) from generate_series(11,14) with ordinality s; + row_to_json +------------------------- + {"s":11,"ordinality":1} + {"s":12,"ordinality":2} + {"s":13,"ordinality":3} + {"s":14,"ordinality":4} +(4 rows) + +-- ordinality vs. views +create temporary view vw_ord as select * from (values (1)) v(n) join rngfunct(1) with ordinality as z(a,b,ord) on (n=ord); +select * from vw_ord; + n | a | b | ord +---+---+----+----- + 1 | 1 | 11 | 1 +(1 row) + +select definition from pg_views where viewname='vw_ord'; + definition +------------------------------------------------------------------------- + SELECT v.n, + + z.a, + + z.b, + + z.ord + + FROM (( VALUES (1)) v(n) + + JOIN rngfunct(1) WITH ORDINALITY z(a, b, ord) ON ((v.n = z.ord))); +(1 row) + +drop view vw_ord; +-- multiple functions +select * from rows from(rngfunct(1),rngfunct(2)) with ordinality as z(a,b,c,d,ord); + a | b | c | d | ord +---+-----+---+----+----- + 1 | 11 | 2 | 22 | 1 + 1 | 111 | | | 2 +(2 rows) + +create temporary view vw_ord as select * from (values (1)) v(n) join rows from(rngfunct(1),rngfunct(2)) with ordinality as z(a,b,c,d,ord) on (n=ord); +select * from vw_ord; + n | a | b | c | d | ord +---+---+----+---+----+----- + 1 | 1 | 11 | 2 | 22 | 1 +(1 row) + +select definition from pg_views where viewname='vw_ord'; + definition +------------------------------------------------------------------------------------------------------- + SELECT v.n, + + z.a, + + z.b, + + z.c, + + z.d, + + z.ord + + FROM (( VALUES (1)) v(n) + + JOIN ROWS FROM(rngfunct(1), rngfunct(2)) WITH ORDINALITY z(a, b, c, d, ord) ON ((v.n = z.ord))); +(1 row) + +drop view vw_ord; +-- expansions of unnest() +select * from unnest(array[10,20],array['foo','bar'],array[1.0]); + unnest | unnest | unnest +--------+--------+-------- + 10 | foo | 1.0 + 20 | bar | +(2 rows) + +select * from unnest(array[10,20],array['foo','bar'],array[1.0]) with ordinality as z(a,b,c,ord); + a | b | c | ord +----+-----+-----+----- + 10 | foo | 1.0 | 1 + 20 | bar | | 2 +(2 rows) + +select * from rows from(unnest(array[10,20],array['foo','bar'],array[1.0])) with ordinality as z(a,b,c,ord); + a | b | c | ord +----+-----+-----+----- + 10 | foo | 1.0 | 1 + 20 | bar | | 2 +(2 rows) + +select * from rows from(unnest(array[10,20],array['foo','bar']), generate_series(101,102)) with ordinality as z(a,b,c,ord); + a | b | c | ord +----+-----+-----+----- + 10 | foo | 101 | 1 + 20 | bar | 102 | 2 +(2 rows) + +create temporary view vw_ord as select * from unnest(array[10,20],array['foo','bar'],array[1.0]) as z(a,b,c); +select * from vw_ord; + a | b | c +----+-----+----- + 10 | foo | 1.0 + 20 | bar | +(2 rows) + +select definition from pg_views where viewname='vw_ord'; + definition 
+---------------------------------------------------------------------------------------- + SELECT z.a, + + z.b, + + z.c + + FROM UNNEST(ARRAY[10, 20], ARRAY['foo'::text, 'bar'::text], ARRAY[1.0]) z(a, b, c); +(1 row) + +drop view vw_ord; +create temporary view vw_ord as select * from rows from(unnest(array[10,20],array['foo','bar'],array[1.0])) as z(a,b,c); +select * from vw_ord; + a | b | c +----+-----+----- + 10 | foo | 1.0 + 20 | bar | +(2 rows) + +select definition from pg_views where viewname='vw_ord'; + definition +---------------------------------------------------------------------------------------- + SELECT z.a, + + z.b, + + z.c + + FROM UNNEST(ARRAY[10, 20], ARRAY['foo'::text, 'bar'::text], ARRAY[1.0]) z(a, b, c); +(1 row) + +drop view vw_ord; +create temporary view vw_ord as select * from rows from(unnest(array[10,20],array['foo','bar']), generate_series(1,2)) as z(a,b,c); +select * from vw_ord; + a | b | c +----+-----+--- + 10 | foo | 1 + 20 | bar | 2 +(2 rows) + +select definition from pg_views where viewname='vw_ord'; + definition +---------------------------------------------------------------------------------------------------------------------- + SELECT z.a, + + z.b, + + z.c + + FROM ROWS FROM(unnest(ARRAY[10, 20]), unnest(ARRAY['foo'::text, 'bar'::text]), generate_series(1, 2)) z(a, b, c); +(1 row) + +drop view vw_ord; +-- ordinality and multiple functions vs. rewind and reverse scan +begin; +declare rf_cur scroll cursor for select * from rows from(generate_series(1,5),generate_series(1,2)) with ordinality as g(i,j,o); +fetch all from rf_cur; + i | j | o +---+---+--- + 1 | 1 | 1 + 2 | 2 | 2 + 3 | | 3 + 4 | | 4 + 5 | | 5 +(5 rows) + +fetch backward all from rf_cur; + i | j | o +---+---+--- + 5 | | 5 + 4 | | 4 + 3 | | 3 + 2 | 2 | 2 + 1 | 1 | 1 +(5 rows) + +fetch all from rf_cur; + i | j | o +---+---+--- + 1 | 1 | 1 + 2 | 2 | 2 + 3 | | 3 + 4 | | 4 + 5 | | 5 +(5 rows) + +fetch next from rf_cur; + i | j | o +---+---+--- +(0 rows) + +fetch next from rf_cur; + i | j | o +---+---+--- +(0 rows) + +fetch prior from rf_cur; + i | j | o +---+---+--- + 5 | | 5 +(1 row) + +fetch absolute 1 from rf_cur; + i | j | o +---+---+--- + 1 | 1 | 1 +(1 row) + +fetch next from rf_cur; + i | j | o +---+---+--- + 2 | 2 | 2 +(1 row) + +fetch next from rf_cur; + i | j | o +---+---+--- + 3 | | 3 +(1 row) + +fetch next from rf_cur; + i | j | o +---+---+--- + 4 | | 4 +(1 row) + +fetch prior from rf_cur; + i | j | o +---+---+--- + 3 | | 3 +(1 row) + +fetch prior from rf_cur; + i | j | o +---+---+--- + 2 | 2 | 2 +(1 row) + +fetch prior from rf_cur; + i | j | o +---+---+--- + 1 | 1 | 1 +(1 row) + +commit; +-- function with implicit LATERAL +select * from rngfunc2, rngfunct(rngfunc2.rngfuncid) z where rngfunc2.f2 = z.f2; + rngfuncid | f2 | rngfuncid | f2 +-----------+-----+-----------+----- + 1 | 11 | 1 | 11 + 2 | 22 | 2 | 22 + 1 | 111 | 1 | 111 +(3 rows) + +-- function with implicit LATERAL and explicit ORDINALITY +select * from rngfunc2, rngfunct(rngfunc2.rngfuncid) with ordinality as z(rngfuncid,f2,ord) where rngfunc2.f2 = z.f2; + rngfuncid | f2 | rngfuncid | f2 | ord +-----------+-----+-----------+-----+----- + 1 | 11 | 1 | 11 | 1 + 2 | 22 | 2 | 22 | 1 + 1 | 111 | 1 | 111 | 2 +(3 rows) + +-- function in subselect +select * from rngfunc2 where f2 in (select f2 from rngfunct(rngfunc2.rngfuncid) z where z.rngfuncid = rngfunc2.rngfuncid) ORDER BY 1,2; + rngfuncid | f2 +-----------+----- + 1 | 11 + 1 | 111 + 2 | 22 +(3 rows) + +-- function in subselect +select * from rngfunc2 where f2 in (select f2 from 
rngfunct(1) z where z.rngfuncid = rngfunc2.rngfuncid) ORDER BY 1,2; + rngfuncid | f2 +-----------+----- + 1 | 11 + 1 | 111 +(2 rows) + +-- function in subselect +select * from rngfunc2 where f2 in (select f2 from rngfunct(rngfunc2.rngfuncid) z where z.rngfuncid = 1) ORDER BY 1,2; + rngfuncid | f2 +-----------+----- + 1 | 11 + 1 | 111 +(2 rows) + +-- nested functions +select rngfunct.rngfuncid, rngfunct.f2 from rngfunct(sin(pi()/2)::int) ORDER BY 1,2; + rngfuncid | f2 +-----------+----- + 1 | 11 + 1 | 111 +(2 rows) + +CREATE TABLE rngfunc (rngfuncid int, rngfuncsubid int, rngfuncname text, primary key(rngfuncid,rngfuncsubid)); +INSERT INTO rngfunc VALUES(1,1,'Joe'); +INSERT INTO rngfunc VALUES(1,2,'Ed'); +INSERT INTO rngfunc VALUES(2,1,'Mary'); +-- sql, proretset = f, prorettype = b +CREATE FUNCTION getrngfunc1(int) RETURNS int AS 'SELECT $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc1(1) AS t1; + t1 +---- + 1 +(1 row) + +SELECT * FROM getrngfunc1(1) WITH ORDINALITY AS t1(v,o); + v | o +---+--- + 1 | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc1(1); +SELECT * FROM vw_getrngfunc; + getrngfunc1 +------------- + 1 +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc1(1) WITH ORDINALITY as t1(v,o); +SELECT * FROM vw_getrngfunc; + v | o +---+--- + 1 | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = t, prorettype = b +CREATE FUNCTION getrngfunc2(int) RETURNS setof int AS 'SELECT rngfuncid FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc2(1) AS t1; + t1 +---- + 1 + 1 +(2 rows) + +SELECT * FROM getrngfunc2(1) WITH ORDINALITY AS t1(v,o); + v | o +---+--- + 1 | 1 + 1 | 2 +(2 rows) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc2(1); +SELECT * FROM vw_getrngfunc; + getrngfunc2 +------------- + 1 + 1 +(2 rows) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc2(1) WITH ORDINALITY AS t1(v,o); +SELECT * FROM vw_getrngfunc; + v | o +---+--- + 1 | 1 + 1 | 2 +(2 rows) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = t, prorettype = b +CREATE FUNCTION getrngfunc3(int) RETURNS setof text AS 'SELECT rngfuncname FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc3(1) AS t1; + t1 +----- + Joe + Ed +(2 rows) + +SELECT * FROM getrngfunc3(1) WITH ORDINALITY AS t1(v,o); + v | o +-----+--- + Joe | 1 + Ed | 2 +(2 rows) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc3(1); +SELECT * FROM vw_getrngfunc; + getrngfunc3 +------------- + Joe + Ed +(2 rows) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc3(1) WITH ORDINALITY AS t1(v,o); +SELECT * FROM vw_getrngfunc; + v | o +-----+--- + Joe | 1 + Ed | 2 +(2 rows) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = f, prorettype = c +CREATE FUNCTION getrngfunc4(int) RETURNS rngfunc AS 'SELECT * FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc4(1) AS t1; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +SELECT * FROM getrngfunc4(1) WITH ORDINALITY AS t1(a,b,c,o); + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc4(1); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc4(1) WITH ORDINALITY AS t1(a,b,c,o); +SELECT * FROM vw_getrngfunc; + a | b | c | o 
+---+---+-----+--- + 1 | 1 | Joe | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = t, prorettype = c +CREATE FUNCTION getrngfunc5(int) RETURNS setof rngfunc AS 'SELECT * FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc5(1) AS t1; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe + 1 | 2 | Ed +(2 rows) + +SELECT * FROM getrngfunc5(1) WITH ORDINALITY AS t1(a,b,c,o); + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 + 1 | 2 | Ed | 2 +(2 rows) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc5(1); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe + 1 | 2 | Ed +(2 rows) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc5(1) WITH ORDINALITY AS t1(a,b,c,o); +SELECT * FROM vw_getrngfunc; + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 + 1 | 2 | Ed | 2 +(2 rows) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = f, prorettype = record +CREATE FUNCTION getrngfunc6(int) RETURNS RECORD AS 'SELECT * FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc6(1) AS t1(rngfuncid int, rngfuncsubid int, rngfuncname text); + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +SELECT * FROM ROWS FROM( getrngfunc6(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text) ) WITH ORDINALITY; + rngfuncid | rngfuncsubid | rngfuncname | ordinality +-----------+--------------+-------------+------------ + 1 | 1 | Joe | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc6(1) AS +(rngfuncid int, rngfuncsubid int, rngfuncname text); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS + SELECT * FROM ROWS FROM( getrngfunc6(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text) ) + WITH ORDINALITY; +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname | ordinality +-----------+--------------+-------------+------------ + 1 | 1 | Joe | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = t, prorettype = record +CREATE FUNCTION getrngfunc7(int) RETURNS setof record AS 'SELECT * FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc7(1) AS t1(rngfuncid int, rngfuncsubid int, rngfuncname text); + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe + 1 | 2 | Ed +(2 rows) + +SELECT * FROM ROWS FROM( getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text) ) WITH ORDINALITY; + rngfuncid | rngfuncsubid | rngfuncname | ordinality +-----------+--------------+-------------+------------ + 1 | 1 | Joe | 1 + 1 | 2 | Ed | 2 +(2 rows) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc7(1) AS +(rngfuncid int, rngfuncsubid int, rngfuncname text); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe + 1 | 2 | Ed +(2 rows) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS + SELECT * FROM ROWS FROM( getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text) ) + WITH ORDINALITY; +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname | ordinality +-----------+--------------+-------------+------------ + 1 | 1 | Joe | 1 + 1 | 2 | Ed | 2 +(2 rows) + +DROP VIEW vw_getrngfunc; +-- plpgsql, proretset = f, 
prorettype = b +CREATE FUNCTION getrngfunc8(int) RETURNS int AS 'DECLARE rngfuncint int; BEGIN SELECT rngfuncid into rngfuncint FROM rngfunc WHERE rngfuncid = $1; RETURN rngfuncint; END;' LANGUAGE plpgsql; +SELECT * FROM getrngfunc8(1) AS t1; + t1 +---- + 1 +(1 row) + +SELECT * FROM getrngfunc8(1) WITH ORDINALITY AS t1(v,o); + v | o +---+--- + 1 | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc8(1); +SELECT * FROM vw_getrngfunc; + getrngfunc8 +------------- + 1 +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc8(1) WITH ORDINALITY AS t1(v,o); +SELECT * FROM vw_getrngfunc; + v | o +---+--- + 1 | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- plpgsql, proretset = f, prorettype = c +CREATE FUNCTION getrngfunc9(int) RETURNS rngfunc AS 'DECLARE rngfunctup rngfunc%ROWTYPE; BEGIN SELECT * into rngfunctup FROM rngfunc WHERE rngfuncid = $1; RETURN rngfunctup; END;' LANGUAGE plpgsql; +SELECT * FROM getrngfunc9(1) AS t1; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +SELECT * FROM getrngfunc9(1) WITH ORDINALITY AS t1(a,b,c,o); + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc9(1); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc9(1) WITH ORDINALITY AS t1(a,b,c,o); +SELECT * FROM vw_getrngfunc; + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- mix 'n match kinds, to exercise expandRTE and related logic +select * from rows from(getrngfunc1(1),getrngfunc2(1),getrngfunc3(1),getrngfunc4(1),getrngfunc5(1), + getrngfunc6(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc8(1),getrngfunc9(1)) + with ordinality as t1(a,b,c,d,e,f,g,h,i,j,k,l,m,o,p,q,r,s,t,u); + a | b | c | d | e | f | g | h | i | j | k | l | m | o | p | q | r | s | t | u +---+---+-----+---+---+-----+---+---+-----+---+---+-----+---+---+-----+---+---+---+-----+--- + 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | 1 | Joe | 1 + | 1 | Ed | | | | 1 | 2 | Ed | | | | 1 | 2 | Ed | | | | | 2 +(2 rows) + +select * from rows from(getrngfunc9(1),getrngfunc8(1), + getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc6(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc5(1),getrngfunc4(1),getrngfunc3(1),getrngfunc2(1),getrngfunc1(1)) + with ordinality as t1(a,b,c,d,e,f,g,h,i,j,k,l,m,o,p,q,r,s,t,u); + a | b | c | d | e | f | g | h | i | j | k | l | m | o | p | q | r | s | t | u +---+---+-----+---+---+---+-----+---+---+-----+---+---+-----+---+---+-----+-----+---+---+--- + 1 | 1 | Joe | 1 | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | Joe | 1 | 1 | 1 + | | | | 1 | 2 | Ed | | | | 1 | 2 | Ed | | | | Ed | 1 | | 2 +(2 rows) + +create temporary view vw_rngfunc as + select * from rows from(getrngfunc9(1), + getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc1(1)) + with ordinality as t1(a,b,c,d,e,f,g,n); +select * from vw_rngfunc; + a | b | c | d | e | f | g | n +---+---+-----+---+---+-----+---+--- + 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 + | | | 1 | 2 | Ed | | 2 +(2 rows) + +select pg_get_viewdef('vw_rngfunc'); + pg_get_viewdef 
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + SELECT t1.a, + + t1.b, + + t1.c, + + t1.d, + + t1.e, + + t1.f, + + t1.g, + + t1.n + + FROM ROWS FROM(getrngfunc9(1), getrngfunc7(1) AS (rngfuncid integer, rngfuncsubid integer, rngfuncname text), getrngfunc1(1)) WITH ORDINALITY t1(a, b, c, d, e, f, g, n); +(1 row) + +drop view vw_rngfunc; +DROP FUNCTION getrngfunc1(int); +DROP FUNCTION getrngfunc2(int); +DROP FUNCTION getrngfunc3(int); +DROP FUNCTION getrngfunc4(int); +DROP FUNCTION getrngfunc5(int); +DROP FUNCTION getrngfunc6(int); +DROP FUNCTION getrngfunc7(int); +DROP FUNCTION getrngfunc8(int); +DROP FUNCTION getrngfunc9(int); +DROP FUNCTION rngfunct(int); +DROP TABLE rngfunc2; +DROP TABLE rngfunc; +-- Rescan tests -- +CREATE TEMPORARY SEQUENCE rngfunc_rescan_seq1; +CREATE TEMPORARY SEQUENCE rngfunc_rescan_seq2; +CREATE TYPE rngfunc_rescan_t AS (i integer, s bigint); +CREATE FUNCTION rngfunc_sql(int,int) RETURNS setof rngfunc_rescan_t AS 'SELECT i, nextval(''rngfunc_rescan_seq1'') FROM generate_series($1,$2) i;' LANGUAGE SQL; +-- plpgsql functions use materialize mode +CREATE FUNCTION rngfunc_mat(int,int) RETURNS setof rngfunc_rescan_t AS 'begin for i in $1..$2 loop return next (i, nextval(''rngfunc_rescan_seq2'')); end loop; end;' LANGUAGE plpgsql; +--invokes ExecReScanFunctionScan - all these cases should materialize the function only once +-- LEFT JOIN on a condition that the planner can't prove to be true is used to ensure the function +-- is on the inner path of a nestloop join +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN rngfunc_sql(11,13) ON (r+i)<100; + r | i | s +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 11 | 1 + 2 | 12 | 2 + 2 | 13 | 3 + 3 | 11 | 1 + 3 | 12 | 2 + 3 | 13 | 3 +(9 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN rngfunc_sql(11,13) WITH ORDINALITY AS f(i,s,o) ON (r+i)<100; + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 1 | 12 | 2 | 2 + 1 | 13 | 3 | 3 + 2 | 11 | 1 | 1 + 2 | 12 | 2 | 2 + 2 | 13 | 3 | 3 + 3 | 11 | 1 | 1 + 3 | 12 | 2 | 2 + 3 | 13 | 3 | 3 +(9 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN rngfunc_mat(11,13) ON (r+i)<100; + r | i | s +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 11 | 1 + 2 | 12 | 2 + 2 | 13 | 3 + 3 | 11 | 1 + 3 | 12 | 2 + 3 | 13 | 3 +(9 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN rngfunc_mat(11,13) WITH ORDINALITY AS f(i,s,o) ON (r+i)<100; + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 1 | 12 | 2 | 2 + 1 | 13 | 3 | 3 + 2 | 11 | 1 | 1 + 2 | 12 | 2 | 2 + 2 | 13 | 3 | 3 + 3 | 11 | 1 | 1 + 3 | 12 | 2 | 2 + 3 | 13 | 3 | 3 +(9 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN ROWS FROM( rngfunc_sql(11,13), rngfunc_mat(11,13) ) WITH 
ORDINALITY AS f(i1,s1,i2,s2,o) ON (r+i1+i2)<100; + r | i1 | s1 | i2 | s2 | o +---+----+----+----+----+--- + 1 | 11 | 1 | 11 | 1 | 1 + 1 | 12 | 2 | 12 | 2 | 2 + 1 | 13 | 3 | 13 | 3 | 3 + 2 | 11 | 1 | 11 | 1 | 1 + 2 | 12 | 2 | 12 | 2 | 2 + 2 | 13 | 3 | 13 | 3 | 3 + 3 | 11 | 1 | 11 | 1 | 1 + 3 | 12 | 2 | 12 | 2 | 2 + 3 | 13 | 3 | 13 | 3 | 3 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN generate_series(11,13) f(i) ON (r+i)<100; + r | i +---+---- + 1 | 11 + 1 | 12 + 1 | 13 + 2 | 11 + 2 | 12 + 2 | 13 + 3 | 11 + 3 | 12 + 3 | 13 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN generate_series(11,13) WITH ORDINALITY AS f(i,o) ON (r+i)<100; + r | i | o +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 11 | 1 + 2 | 12 | 2 + 2 | 13 | 3 + 3 | 11 | 1 + 3 | 12 | 2 + 3 | 13 | 3 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN unnest(array[10,20,30]) f(i) ON (r+i)<100; + r | i +---+---- + 1 | 10 + 1 | 20 + 1 | 30 + 2 | 10 + 2 | 20 + 2 | 30 + 3 | 10 + 3 | 20 + 3 | 30 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN unnest(array[10,20,30]) WITH ORDINALITY AS f(i,o) ON (r+i)<100; + r | i | o +---+----+--- + 1 | 10 | 1 + 1 | 20 | 2 + 1 | 30 | 3 + 2 | 10 | 1 + 2 | 20 | 2 + 2 | 30 | 3 + 3 | 10 | 1 + 3 | 20 | 2 + 3 | 30 | 3 +(9 rows) + +--invokes ExecReScanFunctionScan with chgParam != NULL (using implied LATERAL) +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_sql(10+r,13); + r | i | s +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 12 | 4 + 2 | 13 | 5 + 3 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_sql(10+r,13) WITH ORDINALITY AS f(i,s,o); + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 1 | 12 | 2 | 2 + 1 | 13 | 3 | 3 + 2 | 12 | 4 | 1 + 2 | 13 | 5 | 2 + 3 | 13 | 6 | 1 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_sql(11,10+r); + r | i | s +---+----+--- + 1 | 11 | 1 + 2 | 11 | 2 + 2 | 12 | 3 + 3 | 11 | 4 + 3 | 12 | 5 + 3 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_sql(11,10+r) WITH ORDINALITY AS f(i,s,o); + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 2 | 11 | 2 | 1 + 2 | 12 | 3 | 2 + 3 | 11 | 4 | 1 + 3 | 12 | 5 | 2 + 3 | 13 | 6 | 3 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (11,12),(13,15),(16,20)) v(r1,r2), rngfunc_sql(r1,r2); + r1 | r2 | i | s +----+----+----+---- + 11 | 12 | 11 | 1 + 11 | 12 | 12 | 2 + 13 | 15 | 13 | 3 + 13 | 15 | 14 | 4 + 13 | 15 | 15 | 5 + 16 | 20 | 16 | 6 + 16 | 20 | 17 | 7 + 16 | 20 | 18 | 8 + 16 | 20 | 19 | 9 + 16 | 20 | 20 | 10 +(10 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (11,12),(13,15),(16,20)) v(r1,r2), rngfunc_sql(r1,r2) WITH ORDINALITY AS f(i,s,o); + r1 | r2 | i | s | o +----+----+----+----+--- + 11 | 12 | 11 
| 1 | 1 + 11 | 12 | 12 | 2 | 2 + 13 | 15 | 13 | 3 | 1 + 13 | 15 | 14 | 4 | 2 + 13 | 15 | 15 | 5 | 3 + 16 | 20 | 16 | 6 | 1 + 16 | 20 | 17 | 7 | 2 + 16 | 20 | 18 | 8 | 3 + 16 | 20 | 19 | 9 | 4 + 16 | 20 | 20 | 10 | 5 +(10 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_mat(10+r,13); + r | i | s +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 12 | 4 + 2 | 13 | 5 + 3 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_mat(10+r,13) WITH ORDINALITY AS f(i,s,o); + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 1 | 12 | 2 | 2 + 1 | 13 | 3 | 3 + 2 | 12 | 4 | 1 + 2 | 13 | 5 | 2 + 3 | 13 | 6 | 1 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_mat(11,10+r); + r | i | s +---+----+--- + 1 | 11 | 1 + 2 | 11 | 2 + 2 | 12 | 3 + 3 | 11 | 4 + 3 | 12 | 5 + 3 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_mat(11,10+r) WITH ORDINALITY AS f(i,s,o); + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 2 | 11 | 2 | 1 + 2 | 12 | 3 | 2 + 3 | 11 | 4 | 1 + 3 | 12 | 5 | 2 + 3 | 13 | 6 | 3 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (11,12),(13,15),(16,20)) v(r1,r2), rngfunc_mat(r1,r2); + r1 | r2 | i | s +----+----+----+---- + 11 | 12 | 11 | 1 + 11 | 12 | 12 | 2 + 13 | 15 | 13 | 3 + 13 | 15 | 14 | 4 + 13 | 15 | 15 | 5 + 16 | 20 | 16 | 6 + 16 | 20 | 17 | 7 + 16 | 20 | 18 | 8 + 16 | 20 | 19 | 9 + 16 | 20 | 20 | 10 +(10 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (11,12),(13,15),(16,20)) v(r1,r2), rngfunc_mat(r1,r2) WITH ORDINALITY AS f(i,s,o); + r1 | r2 | i | s | o +----+----+----+----+--- + 11 | 12 | 11 | 1 | 1 + 11 | 12 | 12 | 2 | 2 + 13 | 15 | 13 | 3 | 1 + 13 | 15 | 14 | 4 | 2 + 13 | 15 | 15 | 5 | 3 + 16 | 20 | 16 | 6 | 1 + 16 | 20 | 17 | 7 | 2 + 16 | 20 | 18 | 8 | 3 + 16 | 20 | 19 | 9 | 4 + 16 | 20 | 20 | 10 | 5 +(10 rows) + +-- selective rescan of multiple functions: +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), ROWS FROM( rngfunc_sql(11,11), rngfunc_mat(10+r,13) ); + r | i | s | i | s +---+----+---+----+--- + 1 | 11 | 1 | 11 | 1 + 1 | | | 12 | 2 + 1 | | | 13 | 3 + 2 | 11 | 1 | 12 | 4 + 2 | | | 13 | 5 + 3 | 11 | 1 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), ROWS FROM( rngfunc_sql(10+r,13), rngfunc_mat(11,11) ); + r | i | s | i | s +---+----+---+----+--- + 1 | 11 | 1 | 11 | 1 + 1 | 12 | 2 | | + 1 | 13 | 3 | | + 2 | 12 | 4 | 11 | 1 + 2 | 13 | 5 | | + 3 | 13 | 6 | 11 | 1 +(6 rows) + +SELECT 
setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), ROWS FROM( rngfunc_sql(10+r,13), rngfunc_mat(10+r,13) ); + r | i | s | i | s +---+----+---+----+--- + 1 | 11 | 1 | 11 | 1 + 1 | 12 | 2 | 12 | 2 + 1 | 13 | 3 | 13 | 3 + 2 | 12 | 4 | 12 | 4 + 2 | 13 | 5 | 13 | 5 + 3 | 13 | 6 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM generate_series(1,2) r1, generate_series(r1,3) r2, ROWS FROM( rngfunc_sql(10+r1,13), rngfunc_mat(10+r2,13) ); + r1 | r2 | i | s | i | s +----+----+----+----+----+--- + 1 | 1 | 11 | 1 | 11 | 1 + 1 | 1 | 12 | 2 | 12 | 2 + 1 | 1 | 13 | 3 | 13 | 3 + 1 | 2 | 11 | 4 | 12 | 4 + 1 | 2 | 12 | 5 | 13 | 5 + 1 | 2 | 13 | 6 | | + 1 | 3 | 11 | 7 | 13 | 6 + 1 | 3 | 12 | 8 | | + 1 | 3 | 13 | 9 | | + 2 | 2 | 12 | 10 | 12 | 7 + 2 | 2 | 13 | 11 | 13 | 8 + 2 | 3 | 12 | 12 | 13 | 9 + 2 | 3 | 13 | 13 | | +(13 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), generate_series(10+r,20-r) f(i); + r | i +---+---- + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 12 + 2 | 13 + 2 | 14 + 2 | 15 + 2 | 16 + 2 | 17 + 2 | 18 + 3 | 13 + 3 | 14 + 3 | 15 + 3 | 16 + 3 | 17 +(21 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), generate_series(10+r,20-r) WITH ORDINALITY AS f(i,o); + r | i | o +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 1 | 14 | 4 + 1 | 15 | 5 + 1 | 16 | 6 + 1 | 17 | 7 + 1 | 18 | 8 + 1 | 19 | 9 + 2 | 12 | 1 + 2 | 13 | 2 + 2 | 14 | 3 + 2 | 15 | 4 + 2 | 16 | 5 + 2 | 17 | 6 + 2 | 18 | 7 + 3 | 13 | 1 + 3 | 14 | 2 + 3 | 15 | 3 + 3 | 16 | 4 + 3 | 17 | 5 +(21 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), unnest(array[r*10,r*20,r*30]) f(i); + r | i +---+---- + 1 | 10 + 1 | 20 + 1 | 30 + 2 | 20 + 2 | 40 + 2 | 60 + 3 | 30 + 3 | 60 + 3 | 90 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), unnest(array[r*10,r*20,r*30]) WITH ORDINALITY AS f(i,o); + r | i | o +---+----+--- + 1 | 10 | 1 + 1 | 20 | 2 + 1 | 30 | 3 + 2 | 20 | 1 + 2 | 40 | 2 + 2 | 60 | 3 + 3 | 30 | 1 + 3 | 60 | 2 + 3 | 90 | 3 +(9 rows) + +-- deep nesting +SELECT * FROM (VALUES (1),(2),(3)) v1(r1), + LATERAL (SELECT r1, * FROM (VALUES (10),(20),(30)) v2(r2) + LEFT JOIN generate_series(21,23) f(i) ON ((r2+i)<100) OFFSET 0) s1; + r1 | r1 | r2 | i +----+----+----+---- + 1 | 1 | 10 | 21 + 1 | 1 | 10 | 22 + 1 | 1 | 10 | 23 + 1 | 1 | 20 | 21 + 1 | 1 | 20 | 22 + 1 | 1 | 20 | 23 + 1 | 1 | 30 | 21 + 1 | 1 | 30 | 22 + 1 | 1 | 30 | 23 + 2 | 2 | 10 | 21 + 2 | 2 | 10 | 22 + 2 | 2 | 10 | 23 + 2 | 2 | 20 | 21 + 2 | 2 | 20 | 22 + 2 | 2 | 20 | 23 + 2 | 2 | 30 | 21 + 2 | 2 | 30 | 22 + 2 | 2 | 30 | 23 + 3 | 3 | 10 | 21 + 3 | 3 | 10 | 22 + 3 | 3 | 10 | 23 + 3 | 3 | 20 | 21 + 3 | 3 | 20 | 22 + 3 | 3 | 20 | 23 + 3 | 3 | 30 | 21 + 3 | 3 | 30 | 22 + 3 | 3 | 30 | 23 +(27 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v1(r1), + LATERAL (SELECT r1, * FROM (VALUES (10),(20),(30)) v2(r2) + LEFT JOIN generate_series(20+r1,23) f(i) ON ((r2+i)<100) OFFSET 0) s1; + r1 | r1 | r2 | i +----+----+----+---- + 1 | 1 | 10 | 21 + 1 | 1 | 10 | 22 + 1 | 1 | 10 | 23 + 1 | 1 | 20 | 21 + 1 | 1 | 20 | 22 + 1 | 1 | 20 | 23 + 1 | 1 | 30 | 21 + 1 | 1 | 30 | 22 + 1 | 1 | 30 | 23 + 2 | 2 | 10 | 22 + 2 | 2 | 10 | 23 + 2 | 2 | 20 | 22 + 2 | 2 | 20 | 23 + 2 | 2 | 30 | 22 + 2 | 2 | 30 | 23 + 3 | 3 | 10 | 23 + 3 | 3 | 20 | 23 + 3 | 3 | 30 | 23 +(18 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v1(r1), + 
LATERAL (SELECT r1, * FROM (VALUES (10),(20),(30)) v2(r2) + LEFT JOIN generate_series(r2,r2+3) f(i) ON ((r2+i)<100) OFFSET 0) s1; + r1 | r1 | r2 | i +----+----+----+---- + 1 | 1 | 10 | 10 + 1 | 1 | 10 | 11 + 1 | 1 | 10 | 12 + 1 | 1 | 10 | 13 + 1 | 1 | 20 | 20 + 1 | 1 | 20 | 21 + 1 | 1 | 20 | 22 + 1 | 1 | 20 | 23 + 1 | 1 | 30 | 30 + 1 | 1 | 30 | 31 + 1 | 1 | 30 | 32 + 1 | 1 | 30 | 33 + 2 | 2 | 10 | 10 + 2 | 2 | 10 | 11 + 2 | 2 | 10 | 12 + 2 | 2 | 10 | 13 + 2 | 2 | 20 | 20 + 2 | 2 | 20 | 21 + 2 | 2 | 20 | 22 + 2 | 2 | 20 | 23 + 2 | 2 | 30 | 30 + 2 | 2 | 30 | 31 + 2 | 2 | 30 | 32 + 2 | 2 | 30 | 33 + 3 | 3 | 10 | 10 + 3 | 3 | 10 | 11 + 3 | 3 | 10 | 12 + 3 | 3 | 10 | 13 + 3 | 3 | 20 | 20 + 3 | 3 | 20 | 21 + 3 | 3 | 20 | 22 + 3 | 3 | 20 | 23 + 3 | 3 | 30 | 30 + 3 | 3 | 30 | 31 + 3 | 3 | 30 | 32 + 3 | 3 | 30 | 33 +(36 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v1(r1), + LATERAL (SELECT r1, * FROM (VALUES (10),(20),(30)) v2(r2) + LEFT JOIN generate_series(r1,2+r2/5) f(i) ON ((r2+i)<100) OFFSET 0) s1; + r1 | r1 | r2 | i +----+----+----+--- + 1 | 1 | 10 | 1 + 1 | 1 | 10 | 2 + 1 | 1 | 10 | 3 + 1 | 1 | 10 | 4 + 1 | 1 | 20 | 1 + 1 | 1 | 20 | 2 + 1 | 1 | 20 | 3 + 1 | 1 | 20 | 4 + 1 | 1 | 20 | 5 + 1 | 1 | 20 | 6 + 1 | 1 | 30 | 1 + 1 | 1 | 30 | 2 + 1 | 1 | 30 | 3 + 1 | 1 | 30 | 4 + 1 | 1 | 30 | 5 + 1 | 1 | 30 | 6 + 1 | 1 | 30 | 7 + 1 | 1 | 30 | 8 + 2 | 2 | 10 | 2 + 2 | 2 | 10 | 3 + 2 | 2 | 10 | 4 + 2 | 2 | 20 | 2 + 2 | 2 | 20 | 3 + 2 | 2 | 20 | 4 + 2 | 2 | 20 | 5 + 2 | 2 | 20 | 6 + 2 | 2 | 30 | 2 + 2 | 2 | 30 | 3 + 2 | 2 | 30 | 4 + 2 | 2 | 30 | 5 + 2 | 2 | 30 | 6 + 2 | 2 | 30 | 7 + 2 | 2 | 30 | 8 + 3 | 3 | 10 | 3 + 3 | 3 | 10 | 4 + 3 | 3 | 20 | 3 + 3 | 3 | 20 | 4 + 3 | 3 | 20 | 5 + 3 | 3 | 20 | 6 + 3 | 3 | 30 | 3 + 3 | 3 | 30 | 4 + 3 | 3 | 30 | 5 + 3 | 3 | 30 | 6 + 3 | 3 | 30 | 7 + 3 | 3 | 30 | 8 +(45 rows) + +-- check handling of FULL JOIN with multiple lateral references (bug #15741) +SELECT * +FROM (VALUES (1),(2)) v1(r1) + LEFT JOIN LATERAL ( + SELECT * + FROM generate_series(1, v1.r1) AS gs1 + LEFT JOIN LATERAL ( + SELECT * + FROM generate_series(1, gs1) AS gs2 + LEFT JOIN generate_series(1, gs2) AS gs3 ON TRUE + ) AS ss1 ON TRUE + FULL JOIN generate_series(1, v1.r1) AS gs4 ON FALSE + ) AS ss0 ON TRUE; + r1 | gs1 | gs2 | gs3 | gs4 +----+-----+-----+-----+----- + 1 | | | | 1 + 1 | 1 | 1 | 1 | + 2 | | | | 1 + 2 | | | | 2 + 2 | 1 | 1 | 1 | + 2 | 2 | 1 | 1 | + 2 | 2 | 2 | 1 | + 2 | 2 | 2 | 2 | +(8 rows) + +DROP FUNCTION rngfunc_sql(int,int); +DROP FUNCTION rngfunc_mat(int,int); +DROP SEQUENCE rngfunc_rescan_seq1; +DROP SEQUENCE rngfunc_rescan_seq2; +-- +-- Test cases involving OUT parameters +-- +CREATE FUNCTION rngfunc(in f1 int, out f2 int) +AS 'select $1+1' LANGUAGE sql; +SELECT rngfunc(42); + rngfunc +--------- + 43 +(1 row) + +SELECT * FROM rngfunc(42); + f2 +---- + 43 +(1 row) + +SELECT * FROM rngfunc(42) AS p(x); + x +---- + 43 +(1 row) + +-- explicit spec of return type is OK +CREATE OR REPLACE FUNCTION rngfunc(in f1 int, out f2 int) RETURNS int +AS 'select $1+1' LANGUAGE sql; +-- error, wrong result type +CREATE OR REPLACE FUNCTION rngfunc(in f1 int, out f2 int) RETURNS float +AS 'select $1+1' LANGUAGE sql; +ERROR: function result type must be integer because of OUT parameters +-- with multiple OUT params you must get a RECORD result +CREATE OR REPLACE FUNCTION rngfunc(in f1 int, out f2 int, out f3 text) RETURNS int +AS 'select $1+1' LANGUAGE sql; +ERROR: function result type must be record because of OUT parameters +CREATE OR REPLACE FUNCTION rngfunc(in f1 int, out f2 int, out f3 text) 
+RETURNS record +AS 'select $1+1' LANGUAGE sql; +ERROR: cannot change return type of existing function +HINT: Use DROP FUNCTION rngfunc(integer) first. +CREATE OR REPLACE FUNCTION rngfuncr(in f1 int, out f2 int, out text) +AS $$select $1-1, $1::text || 'z'$$ LANGUAGE sql; +SELECT f1, rngfuncr(f1) FROM int4_tbl; + f1 | rngfuncr +-------------+---------------------------- + 0 | (-1,0z) + 123456 | (123455,123456z) + -123456 | (-123457,-123456z) + 2147483647 | (2147483646,2147483647z) + -2147483647 | (-2147483648,-2147483647z) +(5 rows) + +SELECT * FROM rngfuncr(42); + f2 | column2 +----+--------- + 41 | 42z +(1 row) + +SELECT * FROM rngfuncr(42) AS p(a,b); + a | b +----+----- + 41 | 42z +(1 row) + +CREATE OR REPLACE FUNCTION rngfuncb(in f1 int, inout f2 int, out text) +AS $$select $2-1, $1::text || 'z'$$ LANGUAGE sql; +SELECT f1, rngfuncb(f1, f1/2) FROM int4_tbl; + f1 | rngfuncb +-------------+---------------------------- + 0 | (-1,0z) + 123456 | (61727,123456z) + -123456 | (-61729,-123456z) + 2147483647 | (1073741822,2147483647z) + -2147483647 | (-1073741824,-2147483647z) +(5 rows) + +SELECT * FROM rngfuncb(42, 99); + f2 | column2 +----+--------- + 98 | 42z +(1 row) + +SELECT * FROM rngfuncb(42, 99) AS p(a,b); + a | b +----+----- + 98 | 42z +(1 row) + +-- Can reference function with or without OUT params for DROP, etc +DROP FUNCTION rngfunc(int); +DROP FUNCTION rngfuncr(in f2 int, out f1 int, out text); +DROP FUNCTION rngfuncb(in f1 int, inout f2 int); +-- +-- For my next trick, polymorphic OUT parameters +-- +CREATE FUNCTION dup (f1 anyelement, f2 out anyelement, f3 out anyarray) +AS 'select $1, array[$1,$1]' LANGUAGE sql; +SELECT dup(22); + dup +---------------- + (22,"{22,22}") +(1 row) + +SELECT dup('xyz'); -- fails +ERROR: could not determine polymorphic type because input has type unknown +SELECT dup('xyz'::text); + dup +------------------- + (xyz,"{xyz,xyz}") +(1 row) + +SELECT * FROM dup('xyz'::text); + f2 | f3 +-----+----------- + xyz | {xyz,xyz} +(1 row) + +-- fails, as we are attempting to rename first argument +CREATE OR REPLACE FUNCTION dup (inout f2 anyelement, out f3 anyarray) +AS 'select $1, array[$1,$1]' LANGUAGE sql; +ERROR: cannot change name of input parameter "f1" +HINT: Use DROP FUNCTION dup(anyelement) first. +DROP FUNCTION dup(anyelement); +-- equivalent behavior, though different name exposed for input arg +CREATE OR REPLACE FUNCTION dup (inout f2 anyelement, out f3 anyarray) +AS 'select $1, array[$1,$1]' LANGUAGE sql; +SELECT dup(22); + dup +---------------- + (22,"{22,22}") +(1 row) + +DROP FUNCTION dup(anyelement); +-- fails, no way to deduce outputs +CREATE FUNCTION bad (f1 int, out f2 anyelement, out f3 anyarray) +AS 'select $1, array[$1,$1]' LANGUAGE sql; +ERROR: cannot determine result data type +DETAIL: A function returning a polymorphic type must have at least one polymorphic argument. 
+-- +-- table functions +-- +CREATE OR REPLACE FUNCTION rngfunc() +RETURNS TABLE(a int) +AS $$ SELECT a FROM generate_series(1,5) a(a) $$ LANGUAGE sql; +SELECT * FROM rngfunc(); + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +DROP FUNCTION rngfunc(); +CREATE OR REPLACE FUNCTION rngfunc(int) +RETURNS TABLE(a int, b int) +AS $$ SELECT a, b + FROM generate_series(1,$1) a(a), + generate_series(1,$1) b(b) $$ LANGUAGE sql; +SELECT * FROM rngfunc(3); + a | b +---+--- + 1 | 1 + 1 | 2 + 1 | 3 + 2 | 1 + 2 | 2 + 2 | 3 + 3 | 1 + 3 | 2 + 3 | 3 +(9 rows) + +DROP FUNCTION rngfunc(int); +-- case that causes change of typmod knowledge during inlining +CREATE OR REPLACE FUNCTION rngfunc() +RETURNS TABLE(a varchar(5)) +AS $$ SELECT 'hello'::varchar(5) $$ LANGUAGE sql STABLE; +SELECT * FROM rngfunc() GROUP BY 1; + a +------- + hello +(1 row) + +DROP FUNCTION rngfunc(); +-- +-- some tests on SQL functions with RETURNING +-- +create temp table tt(f1 serial, data text); +create function insert_tt(text) returns int as +$$ insert into tt(data) values($1) returning f1 $$ +language sql; +select insert_tt('foo'); + insert_tt +----------- + 1 +(1 row) + +select insert_tt('bar'); + insert_tt +----------- + 2 +(1 row) + +select * from tt; + f1 | data +----+------ + 1 | foo + 2 | bar +(2 rows) + +-- insert will execute to completion even if function needs just 1 row +create or replace function insert_tt(text) returns int as +$$ insert into tt(data) values($1),($1||$1) returning f1 $$ +language sql; +select insert_tt('fool'); + insert_tt +----------- + 3 +(1 row) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool +(4 rows) + +-- setof does what's expected +create or replace function insert_tt2(text,text) returns setof int as +$$ insert into tt(data) values($1),($2) returning f1 $$ +language sql; +select insert_tt2('foolish','barrish'); + insert_tt2 +------------ + 5 + 6 +(2 rows) + +select * from insert_tt2('baz','quux'); + insert_tt2 +------------ + 7 + 8 +(2 rows) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool + 5 | foolish + 6 | barrish + 7 | baz + 8 | quux +(8 rows) + +-- limit doesn't prevent execution to completion +select insert_tt2('foolish','barrish') limit 1; + insert_tt2 +------------ + 9 +(1 row) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool + 5 | foolish + 6 | barrish + 7 | baz + 8 | quux + 9 | foolish + 10 | barrish +(10 rows) + +-- triggers will fire, too +create function noticetrigger() returns trigger as $$ +begin + raise notice 'noticetrigger % %', new.f1, new.data; + return null; +end $$ language plpgsql; +create trigger tnoticetrigger after insert on tt for each row +execute procedure noticetrigger(); +select insert_tt2('foolme','barme') limit 1; +NOTICE: noticetrigger 11 foolme +NOTICE: noticetrigger 12 barme + insert_tt2 +------------ + 11 +(1 row) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool + 5 | foolish + 6 | barrish + 7 | baz + 8 | quux + 9 | foolish + 10 | barrish + 11 | foolme + 12 | barme +(12 rows) + +-- and rules work +create temp table tt_log(f1 int, data text); +create rule insert_tt_rule as on insert to tt do also + insert into tt_log values(new.*); +select insert_tt2('foollog','barlog') limit 1; +NOTICE: noticetrigger 13 foollog +NOTICE: noticetrigger 14 barlog + insert_tt2 +------------ + 13 +(1 row) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool + 5 | 
foolish + 6 | barrish + 7 | baz + 8 | quux + 9 | foolish + 10 | barrish + 11 | foolme + 12 | barme + 13 | foollog + 14 | barlog +(14 rows) + +-- note that nextval() gets executed a second time in the rule expansion, +-- which is expected. +select * from tt_log; + f1 | data +----+--------- + 15 | foollog + 16 | barlog +(2 rows) + +-- test case for a whole-row-variable bug +create function rngfunc1(n integer, out a text, out b text) + returns setof record + language sql + as $$ select 'foo ' || i, 'bar ' || i from generate_series(1,$1) i $$; +set work_mem='64kB'; +select t.a, t, t.a from rngfunc1(10000) t limit 1; + a | t | a +-------+-------------------+------- + foo 1 | ("foo 1","bar 1") | foo 1 +(1 row) + +reset work_mem; +select t.a, t, t.a from rngfunc1(10000) t limit 1; + a | t | a +-------+-------------------+------- + foo 1 | ("foo 1","bar 1") | foo 1 +(1 row) + +drop function rngfunc1(n integer); +-- test use of SQL functions returning record +-- this is supported in some cases where the query doesn't specify +-- the actual record type ... +create function array_to_set(anyarray) returns setof record as $$ + select i AS "index", $1[i] AS "value" from generate_subscripts($1, 1) i +$$ language sql strict immutable; +select array_to_set(array['one', 'two']); + array_to_set +-------------- + (1,one) + (2,two) +(2 rows) + +select * from array_to_set(array['one', 'two']) as t(f1 int,f2 text); + f1 | f2 +----+----- + 1 | one + 2 | two +(2 rows) + +select * from array_to_set(array['one', 'two']); -- fail +ERROR: a column definition list is required for functions returning "record" +LINE 1: select * from array_to_set(array['one', 'two']); + ^ +create temp table rngfunc(f1 int8, f2 int8); +create function testrngfunc() returns record as $$ + insert into rngfunc values (1,2) returning *; +$$ language sql; +select testrngfunc(); + testrngfunc +------------- + (1,2) +(1 row) + +select * from testrngfunc() as t(f1 int8,f2 int8); + f1 | f2 +----+---- + 1 | 2 +(1 row) + +select * from testrngfunc(); -- fail +ERROR: a column definition list is required for functions returning "record" +LINE 1: select * from testrngfunc(); + ^ +drop function testrngfunc(); +create function testrngfunc() returns setof record as $$ + insert into rngfunc values (1,2), (3,4) returning *; +$$ language sql; +select testrngfunc(); + testrngfunc +------------- + (1,2) + (3,4) +(2 rows) + +select * from testrngfunc() as t(f1 int8,f2 int8); + f1 | f2 +----+---- + 1 | 2 + 3 | 4 +(2 rows) + +select * from testrngfunc(); -- fail +ERROR: a column definition list is required for functions returning "record" +LINE 1: select * from testrngfunc(); + ^ +drop function testrngfunc(); +-- +-- Check some cases involving added/dropped columns in a rowtype result +-- +create temp table users (userid text, seq int, email text, todrop bool, moredrop int, enabled bool); +insert into users values ('id',1,'email',true,11,true); +insert into users values ('id2',2,'email2',true,12,true); +alter table users drop column todrop; +create or replace function get_first_user() returns users as +$$ SELECT * FROM users ORDER BY userid LIMIT 1; $$ +language sql stable; +SELECT get_first_user(); + get_first_user +------------------- + (id,1,email,11,t) +(1 row) + +SELECT * FROM get_first_user(); + userid | seq | email | moredrop | enabled +--------+-----+-------+----------+--------- + id | 1 | email | 11 | t +(1 row) + +create or replace function get_users() returns setof users as +$$ SELECT * FROM users ORDER BY userid; $$ +language sql stable; +SELECT 
get_users(); + get_users +--------------------- + (id,1,email,11,t) + (id2,2,email2,12,t) +(2 rows) + +SELECT * FROM get_users(); + userid | seq | email | moredrop | enabled +--------+-----+--------+----------+--------- + id | 1 | email | 11 | t + id2 | 2 | email2 | 12 | t +(2 rows) + +SELECT * FROM get_users() WITH ORDINALITY; -- make sure ordinality copes + userid | seq | email | moredrop | enabled | ordinality +--------+-----+--------+----------+---------+------------ + id | 1 | email | 11 | t | 1 + id2 | 2 | email2 | 12 | t | 2 +(2 rows) + +-- multiple functions vs. dropped columns +SELECT * FROM ROWS FROM(generate_series(10,11), get_users()) WITH ORDINALITY; + generate_series | userid | seq | email | moredrop | enabled | ordinality +-----------------+--------+-----+--------+----------+---------+------------ + 10 | id | 1 | email | 11 | t | 1 + 11 | id2 | 2 | email2 | 12 | t | 2 +(2 rows) + +SELECT * FROM ROWS FROM(get_users(), generate_series(10,11)) WITH ORDINALITY; + userid | seq | email | moredrop | enabled | generate_series | ordinality +--------+-----+--------+----------+---------+-----------------+------------ + id | 1 | email | 11 | t | 10 | 1 + id2 | 2 | email2 | 12 | t | 11 | 2 +(2 rows) + +-- check that we can cope with post-parsing changes in rowtypes +create temp view usersview as +SELECT * FROM ROWS FROM(get_users(), generate_series(10,11)) WITH ORDINALITY; +select * from usersview; + userid | seq | email | moredrop | enabled | generate_series | ordinality +--------+-----+--------+----------+---------+-----------------+------------ + id | 1 | email | 11 | t | 10 | 1 + id2 | 2 | email2 | 12 | t | 11 | 2 +(2 rows) + +alter table users add column junk text; +select * from usersview; + userid | seq | email | moredrop | enabled | generate_series | ordinality +--------+-----+--------+----------+---------+-----------------+------------ + id | 1 | email | 11 | t | 10 | 1 + id2 | 2 | email2 | 12 | t | 11 | 2 +(2 rows) + +begin; +alter table users drop column moredrop; +select * from usersview; -- expect clean failure +ERROR: attribute 5 of type record has been dropped +rollback; +alter table users alter column seq type numeric; +select * from usersview; -- expect clean failure +ERROR: attribute 2 of type record has wrong type +DETAIL: Table has type numeric, but query expects integer. +drop view usersview; +drop function get_first_user(); +drop function get_users(); +drop table users; +-- this won't get inlined because of type coercion, but it shouldn't fail +create or replace function rngfuncbar() returns setof text as +$$ select 'foo'::varchar union all select 'bar'::varchar ; $$ +language sql stable; +select rngfuncbar(); + rngfuncbar +------------ + foo + bar +(2 rows) + +select * from rngfuncbar(); + rngfuncbar +------------ + foo + bar +(2 rows) + +drop function rngfuncbar(); +-- check handling of a SQL function with multiple OUT params (bug #5777) +create or replace function rngfuncbar(out integer, out numeric) as +$$ select (1, 2.1) $$ language sql; +select * from rngfuncbar(); + column1 | column2 +---------+--------- + 1 | 2.1 +(1 row) + +create or replace function rngfuncbar(out integer, out numeric) as +$$ select (1, 2) $$ language sql; +select * from rngfuncbar(); -- fail +ERROR: function return row and query-specified return row do not match +DETAIL: Returned type integer at ordinal position 2, but query expects numeric. 
+create or replace function rngfuncbar(out integer, out numeric) as +$$ select (1, 2.1, 3) $$ language sql; +select * from rngfuncbar(); -- fail +ERROR: function return row and query-specified return row do not match +DETAIL: Returned row contains 3 attributes, but query expects 2. +drop function rngfuncbar(); +-- check whole-row-Var handling in nested lateral functions (bug #11703) +create function extractq2(t int8_tbl) returns int8 as $$ + select t.q2 +$$ language sql immutable; +explain (verbose, costs off) +select x from int8_tbl, extractq2(int8_tbl) f(x); + QUERY PLAN +------------------------------------ + Nested Loop + Output: f.x + -> Seq Scan on public.int8_tbl + Output: int8_tbl.q2 + -> Function Scan on f + Output: f.x + Function Call: int8_tbl.q2 +(7 rows) + +select x from int8_tbl, extractq2(int8_tbl) f(x); + x +------------------- + 456 + 4567890123456789 + 123 + 4567890123456789 + -4567890123456789 +(5 rows) + +create function extractq2_2(t int8_tbl) returns table(ret1 int8) as $$ + select extractq2(t) offset 0 +$$ language sql immutable; +explain (verbose, costs off) +select x from int8_tbl, extractq2_2(int8_tbl) f(x); + QUERY PLAN +----------------------------------- + Nested Loop + Output: ((int8_tbl.*).q2) + -> Seq Scan on public.int8_tbl + Output: int8_tbl.* + -> Result + Output: (int8_tbl.*).q2 +(6 rows) + +select x from int8_tbl, extractq2_2(int8_tbl) f(x); + x +------------------- + 456 + 4567890123456789 + 123 + 4567890123456789 + -4567890123456789 +(5 rows) + +-- without the "offset 0", this function gets optimized quite differently +create function extractq2_2_opt(t int8_tbl) returns table(ret1 int8) as $$ + select extractq2(t) +$$ language sql immutable; +explain (verbose, costs off) +select x from int8_tbl, extractq2_2_opt(int8_tbl) f(x); + QUERY PLAN +----------------------------- + Seq Scan on public.int8_tbl + Output: int8_tbl.q2 +(2 rows) + +select x from int8_tbl, extractq2_2_opt(int8_tbl) f(x); + x +------------------- + 456 + 4567890123456789 + 123 + 4567890123456789 + -4567890123456789 +(5 rows) + +-- check handling of nulls in SRF results (bug #7808) +create type rngfunc2 as (a integer, b text); +select *, row_to_json(u) from unnest(array[(1,'foo')::rngfunc2, null::rngfunc2]) u; + a | b | row_to_json +---+-----+--------------------- + 1 | foo | {"a":1,"b":"foo"} + | | {"a":null,"b":null} +(2 rows) + +select *, row_to_json(u) from unnest(array[null::rngfunc2, null::rngfunc2]) u; + a | b | row_to_json +---+---+--------------------- + | | {"a":null,"b":null} + | | {"a":null,"b":null} +(2 rows) + +select *, row_to_json(u) from unnest(array[null::rngfunc2, (1,'foo')::rngfunc2, null::rngfunc2]) u; + a | b | row_to_json +---+-----+--------------------- + | | {"a":null,"b":null} + 1 | foo | {"a":1,"b":"foo"} + | | {"a":null,"b":null} +(3 rows) + +select *, row_to_json(u) from unnest(array[]::rngfunc2[]) u; + a | b | row_to_json +---+---+------------- +(0 rows) + +drop type rngfunc2; diff --git a/src/test/regress/expected/reloptions_1.out b/src/test/regress/expected/reloptions_1.out new file mode 100644 index 0000000000..fd0b73a365 --- /dev/null +++ b/src/test/regress/expected/reloptions_1.out @@ -0,0 +1,219 @@ +-- Simple create +CREATE TABLE reloptions_test(i INT) WITH (FiLLFaCToR=30, + autovacuum_enabled = false, autovacuum_analyze_scale_factor = 0.2); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +------------------------------------------------------------------------------ + 
{fillfactor=30,autovacuum_enabled=false,autovacuum_analyze_scale_factor=0.2} +(1 row) + +-- Fail min/max values check +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=2); +ERROR: value 2 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=110); +ERROR: value 110 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_analyze_scale_factor = -10.0); +ERROR: value -10.0 out of bounds for option "autovacuum_analyze_scale_factor" +DETAIL: Valid values are between "0.000000" and "100.000000". +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_analyze_scale_factor = 110.0); +ERROR: value 110.0 out of bounds for option "autovacuum_analyze_scale_factor" +DETAIL: Valid values are between "0.000000" and "100.000000". +-- Fail when option and namespace do not exist +CREATE TABLE reloptions_test2(i INT) WITH (not_existing_option=2); +ERROR: unrecognized parameter "not_existing_option" +CREATE TABLE reloptions_test2(i INT) WITH (not_existing_namespace.fillfactor=2); +ERROR: unrecognized parameter namespace "not_existing_namespace" +-- Fail while setting improper values +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=-30.1); +ERROR: value -30.1 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor='string'); +ERROR: invalid value for integer option "fillfactor": string +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=true); +ERROR: invalid value for integer option "fillfactor": true +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_enabled=12); +ERROR: invalid value for boolean option "autovacuum_enabled": 12 +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_enabled=30.5); +ERROR: invalid value for boolean option "autovacuum_enabled": 30.5 +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_enabled='string'); +ERROR: invalid value for boolean option "autovacuum_enabled": string +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_analyze_scale_factor='string'); +ERROR: invalid value for floating point option "autovacuum_analyze_scale_factor": string +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_analyze_scale_factor=true); +ERROR: invalid value for floating point option "autovacuum_analyze_scale_factor": true +-- Fail if option is specified twice +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=30, fillfactor=40); +ERROR: parameter "fillfactor" specified more than once +-- Specifying name only for a non-Boolean option should fail +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor); +ERROR: invalid value for integer option "fillfactor": true +-- Simple ALTER TABLE +ALTER TABLE reloptions_test SET (fillfactor=31, + autovacuum_analyze_scale_factor = 0.3); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +------------------------------------------------------------------------------ + {autovacuum_enabled=false,fillfactor=31,autovacuum_analyze_scale_factor=0.3} +(1 row) + +-- Set boolean option to true without specifying value +ALTER TABLE reloptions_test SET (autovacuum_enabled, fillfactor=32); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +----------------------------------------------------------------------------- + {autovacuum_analyze_scale_factor=0.3,autovacuum_enabled=true,fillfactor=32} +(1 row) + +-- Check 
that RESET works well +ALTER TABLE reloptions_test RESET (fillfactor); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +--------------------------------------------------------------- + {autovacuum_analyze_scale_factor=0.3,autovacuum_enabled=true} +(1 row) + +-- Resetting all values causes the column to become null +ALTER TABLE reloptions_test RESET (autovacuum_enabled, + autovacuum_analyze_scale_factor); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass AND + reloptions IS NULL; + reloptions +------------ + +(1 row) + +-- RESET fails if a value is specified +ALTER TABLE reloptions_test RESET (fillfactor=12); +ERROR: RESET must not include values for parameters +-- Test vacuum_truncate option +DROP TABLE reloptions_test; +CREATE TABLE reloptions_test(i INT NOT NULL, j text) + WITH (vacuum_truncate=false, + toast.vacuum_truncate=false, + autovacuum_enabled=false); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +-------------------------------------------------- + {vacuum_truncate=false,autovacuum_enabled=false} +(1 row) + +INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL); +ERROR: null value in column "i" violates not-null constraint +DETAIL: Failing row contains (null, null). +VACUUM reloptions_test; +SELECT pg_relation_size('reloptions_test') > 0; + ?column? +---------- + t +(1 row) + +SELECT reloptions FROM pg_class WHERE oid = + (SELECT reltoastrelid FROM pg_class + WHERE oid = 'reloptions_test'::regclass); + reloptions +------------ +(0 rows) + +ALTER TABLE reloptions_test RESET (vacuum_truncate); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +---------------------------- + {autovacuum_enabled=false} +(1 row) + +INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL); +ERROR: null value in column "i" violates not-null constraint +DETAIL: Failing row contains (null, null). +VACUUM reloptions_test; +SELECT pg_relation_size('reloptions_test') = 0; + ?column? 
+---------- + f +(1 row) + +-- Test toast.* options +DROP TABLE reloptions_test; +CREATE TABLE reloptions_test (s VARCHAR) + WITH (toast.autovacuum_vacuum_cost_delay = 23); +SELECT reltoastrelid as toast_oid + FROM pg_class WHERE oid = 'reloptions_test'::regclass \gset +SELECT reloptions FROM pg_class WHERE oid = :toast_oid; + reloptions +------------ +(0 rows) + +ALTER TABLE reloptions_test SET (toast.autovacuum_vacuum_cost_delay = 24); +SELECT reloptions FROM pg_class WHERE oid = :toast_oid; + reloptions +------------ +(0 rows) + +ALTER TABLE reloptions_test RESET (toast.autovacuum_vacuum_cost_delay); +SELECT reloptions FROM pg_class WHERE oid = :toast_oid; + reloptions +------------ +(0 rows) + +-- Fail on non-existent options in toast namespace +CREATE TABLE reloptions_test2 (i int) WITH (toast.not_existing_option = 42); +ERROR: unrecognized parameter "not_existing_option" +-- Mix TOAST & heap +DROP TABLE reloptions_test; +CREATE TABLE reloptions_test (s VARCHAR) WITH + (toast.autovacuum_vacuum_cost_delay = 23, + autovacuum_vacuum_cost_delay = 24, fillfactor = 40); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +------------------------------------------------- + {autovacuum_vacuum_cost_delay=24,fillfactor=40} +(1 row) + +SELECT reloptions FROM pg_class WHERE oid = ( + SELECT reltoastrelid FROM pg_class WHERE oid = 'reloptions_test'::regclass); + reloptions +------------ +(0 rows) + +-- +-- CREATE INDEX, ALTER INDEX for btrees +-- +CREATE INDEX reloptions_test_idx ON reloptions_test (s) WITH (fillfactor=30); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test_idx'::regclass; + reloptions +----------------- + {fillfactor=30} +(1 row) + +-- Fail when option and namespace do not exist +CREATE INDEX reloptions_test_idx ON reloptions_test (s) + WITH (not_existing_option=2); +ERROR: unrecognized parameter "not_existing_option" +CREATE INDEX reloptions_test_idx ON reloptions_test (s) + WITH (not_existing_ns.fillfactor=2); +ERROR: unrecognized parameter namespace "not_existing_ns" +-- Check allowed ranges +CREATE INDEX reloptions_test_idx2 ON reloptions_test (s) WITH (fillfactor=1); +ERROR: value 1 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +CREATE INDEX reloptions_test_idx2 ON reloptions_test (s) WITH (fillfactor=130); +ERROR: value 130 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +-- Check ALTER +ALTER INDEX reloptions_test_idx SET (fillfactor=40); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test_idx'::regclass; + reloptions +----------------- + {fillfactor=40} +(1 row) + +-- Check ALTER on empty reloption list +CREATE INDEX reloptions_test_idx3 ON reloptions_test (s); +ALTER INDEX reloptions_test_idx3 SET (fillfactor=40); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test_idx3'::regclass; + reloptions +----------------- + {fillfactor=40} +(1 row) + diff --git a/src/test/regress/expected/strings_1.out b/src/test/regress/expected/strings_1.out new file mode 100644 index 0000000000..a5c324a8b7 --- /dev/null +++ b/src/test/regress/expected/strings_1.out @@ -0,0 +1,1823 @@ +-- +-- STRINGS +-- Test various data entry syntaxes. 
+-- +-- SQL string continuation syntax +-- E021-03 character string literals +SELECT 'first line' +' - next line' + ' - third line' + AS "Three lines to one"; + Three lines to one +------------------------------------- + first line - next line - third line +(1 row) + +-- illegal string continuation syntax +SELECT 'first line' +' - next line' /* this comment is not allowed here */ +' - third line' + AS "Illegal comment within continuation"; +ERROR: syntax error at or near "' - third line'" +LINE 3: ' - third line' + ^ +-- Unicode escapes +SET standard_conforming_strings TO on; +SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061"; + data +------ + data +(1 row) + +SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*'; + dat\+000061 +------------- + dat\+000061 +(1 row) + +SELECT U&' \' UESCAPE '!' AS "tricky"; + tricky +-------- + \ +(1 row) + +SELECT 'tricky' AS U&"\" UESCAPE '!'; + \ +-------- + tricky +(1 row) + +SELECT U&'wrong: \061'; +ERROR: invalid Unicode escape value at or near "\061'" +LINE 1: SELECT U&'wrong: \061'; + ^ +SELECT U&'wrong: \+0061'; +ERROR: invalid Unicode escape value at or near "\+0061'" +LINE 1: SELECT U&'wrong: \+0061'; + ^ +SELECT U&'wrong: +0061' UESCAPE '+'; +ERROR: invalid Unicode escape character at or near "+'" +LINE 1: SELECT U&'wrong: +0061' UESCAPE '+'; + ^ +SET standard_conforming_strings TO off; +SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061"; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061"; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*'; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061... + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT U&' \' UESCAPE '!' AS "tricky"; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&' \' UESCAPE '!' AS "tricky"; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT 'tricky' AS U&"\" UESCAPE '!'; + \ +-------- + tricky +(1 row) + +SELECT U&'wrong: \061'; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'wrong: \061'; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT U&'wrong: \+0061'; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'wrong: \+0061'; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT U&'wrong: +0061' UESCAPE '+'; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'wrong: +0061' UESCAPE '+'; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. 
+RESET standard_conforming_strings; +-- bytea +SET bytea_output TO hex; +SELECT E'\\xDeAdBeEf'::bytea; + bytea +------------ + \xdeadbeef +(1 row) + +SELECT E'\\x De Ad Be Ef '::bytea; + bytea +------------ + \xdeadbeef +(1 row) + +SELECT E'\\xDeAdBeE'::bytea; +ERROR: invalid hexadecimal data: odd number of digits +LINE 1: SELECT E'\\xDeAdBeE'::bytea; + ^ +SELECT E'\\xDeAdBeEx'::bytea; +ERROR: invalid hexadecimal digit: "x" +LINE 1: SELECT E'\\xDeAdBeEx'::bytea; + ^ +SELECT E'\\xDe00BeEf'::bytea; + bytea +------------ + \xde00beef +(1 row) + +SELECT E'DeAdBeEf'::bytea; + bytea +-------------------- + \x4465416442654566 +(1 row) + +SELECT E'De\\000dBeEf'::bytea; + bytea +-------------------- + \x4465006442654566 +(1 row) + +SELECT E'De\123dBeEf'::bytea; + bytea +-------------------- + \x4465536442654566 +(1 row) + +SELECT E'De\\123dBeEf'::bytea; + bytea +-------------------- + \x4465536442654566 +(1 row) + +SELECT E'De\\678dBeEf'::bytea; +ERROR: invalid input syntax for type bytea +LINE 1: SELECT E'De\\678dBeEf'::bytea; + ^ +SET bytea_output TO escape; +SELECT E'\\xDeAdBeEf'::bytea; + bytea +------------------ + \336\255\276\357 +(1 row) + +SELECT E'\\x De Ad Be Ef '::bytea; + bytea +------------------ + \336\255\276\357 +(1 row) + +SELECT E'\\xDe00BeEf'::bytea; + bytea +------------------ + \336\000\276\357 +(1 row) + +SELECT E'DeAdBeEf'::bytea; + bytea +---------- + DeAdBeEf +(1 row) + +SELECT E'De\\000dBeEf'::bytea; + bytea +------------- + De\000dBeEf +(1 row) + +SELECT E'De\\123dBeEf'::bytea; + bytea +---------- + DeSdBeEf +(1 row) + +-- +-- test conversions between various string types +-- E021-10 implicit casting among the character data types +-- +SELECT CAST(f1 AS text) AS "text(char)" FROM CHAR_TBL; + text(char) +------------ + a + ab + abcd + abcd +(4 rows) + +SELECT CAST(f1 AS text) AS "text(varchar)" FROM VARCHAR_TBL; + text(varchar) +--------------- + a + ab + abcd + abcd +(4 rows) + +SELECT CAST(name 'namefield' AS text) AS "text(name)"; + text(name) +------------ + namefield +(1 row) + +-- since this is an explicit cast, it should truncate w/o error: +SELECT CAST(f1 AS char(10)) AS "char(text)" FROM TEXT_TBL; + char(text) +------------ + doh! + hi de ho n +(2 rows) + +-- note: implicit-cast case is tested in char.sql +SELECT CAST(f1 AS char(20)) AS "char(text)" FROM TEXT_TBL; + char(text) +---------------------- + doh! + hi de ho neighbor +(2 rows) + +SELECT CAST(f1 AS char(10)) AS "char(varchar)" FROM VARCHAR_TBL; + char(varchar) +--------------- + a + ab + abcd + abcd +(4 rows) + +SELECT CAST(name 'namefield' AS char(10)) AS "char(name)"; + char(name) +------------ + namefield +(1 row) + +SELECT CAST(f1 AS varchar) AS "varchar(text)" FROM TEXT_TBL; + varchar(text) +------------------- + doh! 
+ hi de ho neighbor +(2 rows) + +SELECT CAST(f1 AS varchar) AS "varchar(char)" FROM CHAR_TBL; + varchar(char) +--------------- + a + ab + abcd + abcd +(4 rows) + +SELECT CAST(name 'namefield' AS varchar) AS "varchar(name)"; + varchar(name) +--------------- + namefield +(1 row) + +-- +-- test SQL string functions +-- E### and T### are feature reference numbers from SQL99 +-- +-- E021-09 trim function +SELECT TRIM(BOTH FROM ' bunch o blanks ') = 'bunch o blanks' AS "bunch o blanks"; + bunch o blanks +---------------- + t +(1 row) + +SELECT TRIM(LEADING FROM ' bunch o blanks ') = 'bunch o blanks ' AS "bunch o blanks "; + bunch o blanks +------------------ + t +(1 row) + +SELECT TRIM(TRAILING FROM ' bunch o blanks ') = ' bunch o blanks' AS " bunch o blanks"; + bunch o blanks +------------------ + t +(1 row) + +SELECT TRIM(BOTH 'x' FROM 'xxxxxsome Xsxxxxx') = 'some Xs' AS "some Xs"; + some Xs +--------- + t +(1 row) + +-- E021-06 substring expression +SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890"; + 34567890 +---------- + t +(1 row) + +SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; + 456 +----- + t +(1 row) + +-- T581 regular expression substring (with SQL's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; + bcd +----- + bcd +(1 row) + +-- No match should return NULL +SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; + True +------ + t +(1 row) + +-- Null inputs should return NULL +SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True"; + True +------ + t +(1 row) + +SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True"; + True +------ + t +(1 row) + +SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; + True +------ + t +(1 row) + +-- The first and last parts should act non-greedy +SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg"; + abcdefg +--------- + abcdefg +(1 row) + +-- Vertical bar in any part affects only that part +SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +-- Can't have more than two part separators +SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error"; +ERROR: SQL regular expression may not contain more than two escape-double-quote separators +CONTEXT: SQL function "substring" statement 1 +-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty +SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg"; + bcdefg +-------- + bcdefg +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg"; + abcdefg +--------- + abcdefg +(1 row) + +-- substring() with just two arguments is not allowed by SQL spec; +-- we accept it, but we interpret the pattern as a POSIX regexp not SQL +SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; + cde +----- + cde +(1 row) + +-- With a parenthesized subexpression, return only what matches the subexpr +SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; + cde +----- + cde +(1 row) + +-- PostgreSQL extension to allow using back reference in replace string; +SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); + regexp_replace +---------------- + (111) 222-3333 +(1 row) + +SELECT 
regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g'); + regexp_replace +---------------- + AAA BBB CCC +(1 row) + +SELECT regexp_replace('AAA', '^|$', 'Z', 'g'); + regexp_replace +---------------- + ZAAAZ +(1 row) + +SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); + regexp_replace +---------------- + Z Z +(1 row) + +-- invalid regexp option +SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); +ERROR: invalid regular expression option: "z" +-- set so we can tell NULL from empty string +\pset null '\\N' +-- return all matches from regexp +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); + regexp_matches +---------------- + {bar,beque} +(1 row) + +-- test case insensitive +SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); + regexp_matches +---------------- + {bAR,bEqUE} +(1 row) + +-- global option - more than one match +SELECT regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); + regexp_matches +---------------- + {bar,beque} + {bazil,barf} +(2 rows) + +-- empty capture group (matched empty string) +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$); + regexp_matches +---------------- + {bar,"",beque} +(1 row) + +-- no match +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$); + regexp_matches +---------------- +(0 rows) + +-- optional capture group did not match, null entry in array +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$); + regexp_matches +------------------ + {bar,NULL,beque} +(1 row) + +-- no capture groups +SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$); + regexp_matches +---------------- + {barbeque} +(1 row) + +-- start/end-of-line matches are of zero length +SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^', 'mg'); + regexp_matches +---------------- + {""} + {""} + {""} + {""} +(4 rows) + +SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '$', 'mg'); + regexp_matches +---------------- + {""} + {""} + {""} + {""} +(4 rows) + +SELECT regexp_matches('1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '^.?', 'mg'); + regexp_matches +---------------- + {1} + {2} + {3} + {4} + {""} +(5 rows) + +SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '.?$', 'mg'); + regexp_matches +---------------- + {""} + {1} + {""} + {2} + {""} + {3} + {""} + {4} + {""} + {""} +(10 rows) + +SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4', '.?$', 'mg'); + regexp_matches +---------------- + {""} + {1} + {""} + {2} + {""} + {3} + {""} + {4} + {""} +(9 rows) + +-- give me errors +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz'); +ERROR: invalid regular expression option: "z" +SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$); +ERROR: invalid regular expression: parentheses () not balanced +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$); +ERROR: invalid regular expression: invalid repetition count(s) +-- split string on regexp +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s+$re$) AS foo; + foo | length +-------+-------- + the | 3 + quick | 5 + brown | 5 + fox | 3 + jumps | 5 + over | 4 + the | 3 + lazy | 4 + dog | 3 +(9 rows) + +SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', $re$\s+$re$); + regexp_split_to_array +----------------------------------------------- + 
{the,quick,brown,fox,jumps,over,the,lazy,dog} +(1 row) + +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s*$re$) AS foo; + foo | length +-----+-------- + t | 1 + h | 1 + e | 1 + q | 1 + u | 1 + i | 1 + c | 1 + k | 1 + b | 1 + r | 1 + o | 1 + w | 1 + n | 1 + f | 1 + o | 1 + x | 1 + j | 1 + u | 1 + m | 1 + p | 1 + s | 1 + o | 1 + v | 1 + e | 1 + r | 1 + t | 1 + h | 1 + e | 1 + l | 1 + a | 1 + z | 1 + y | 1 + d | 1 + o | 1 + g | 1 +(35 rows) + +SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', $re$\s*$re$); + regexp_split_to_array +------------------------------------------------------------------------- + {t,h,e,q,u,i,c,k,b,r,o,w,n,f,o,x,j,u,m,p,s,o,v,e,r,t,h,e,l,a,z,y,d,o,g} +(1 row) + +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', '') AS foo; + foo | length +-----+-------- + t | 1 + h | 1 + e | 1 + | 1 + q | 1 + u | 1 + i | 1 + c | 1 + k | 1 + | 1 + b | 1 + r | 1 + o | 1 + w | 1 + n | 1 + | 1 + f | 1 + o | 1 + x | 1 + | 1 + j | 1 + u | 1 + m | 1 + p | 1 + s | 1 + | 1 + o | 1 + v | 1 + e | 1 + r | 1 + | 1 + t | 1 + h | 1 + e | 1 + | 1 + l | 1 + a | 1 + z | 1 + y | 1 + | 1 + d | 1 + o | 1 + g | 1 +(43 rows) + +SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', ''); + regexp_split_to_array +--------------------------------------------------------------------------------------------------------- + {t,h,e," ",q,u,i,c,k," ",b,r,o,w,n," ",f,o,x," ",j,u,m,p,s," ",o,v,e,r," ",t,h,e," ",l,a,z,y," ",d,o,g} +(1 row) + +-- case insensitive +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i') AS foo; + foo | length +---------------------------+-------- + th | 2 + QUick bROWn FOx jUMPs ov | 25 + r Th | 4 + lazy dOG | 9 +(4 rows) + +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i'); + regexp_split_to_array +----------------------------------------------------- + {th," QUick bROWn FOx jUMPs ov","r Th"," lazy dOG"} +(1 row) + +-- no match of pattern +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', 'nomatch') AS foo; + foo | length +---------------------------------------------+-------- + the quick brown fox jumps over the lazy dog | 43 +(1 row) + +SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nomatch'); + regexp_split_to_array +------------------------------------------------- + {"the quick brown fox jumps over the lazy dog"} +(1 row) + +-- some corner cases +SELECT regexp_split_to_array('123456','1'); + regexp_split_to_array +----------------------- + {"",23456} +(1 row) + +SELECT regexp_split_to_array('123456','6'); + regexp_split_to_array +----------------------- + {12345,""} +(1 row) + +SELECT regexp_split_to_array('123456','.'); + regexp_split_to_array +------------------------ + {"","","","","","",""} +(1 row) + +SELECT regexp_split_to_array('123456',''); + regexp_split_to_array +----------------------- + {1,2,3,4,5,6} +(1 row) + +SELECT regexp_split_to_array('123456','(?:)'); + regexp_split_to_array +----------------------- + {1,2,3,4,5,6} +(1 row) + +SELECT regexp_split_to_array('1',''); + regexp_split_to_array +----------------------- + {1} +(1 row) + +-- errors +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo; +ERROR: invalid regular expression option: "z" +SELECT regexp_split_to_array('thE QUick bROWn 
FOx jUMPs ovEr The lazy dOG', 'e', 'iz'); +ERROR: invalid regular expression option: "z" +-- global option meaningless for regexp_split +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g') AS foo; +ERROR: regexp_split_to_table() does not support the "global" option +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g'); +ERROR: regexp_split_to_array() does not support the "global" option +-- change NULL-display back +\pset null '' +-- E021-11 position expression +SELECT POSITION('4' IN '1234567890') = '4' AS "4"; + 4 +--- + t +(1 row) + +SELECT POSITION('5' IN '1234567890') = '5' AS "5"; + 5 +--- + t +(1 row) + +-- T312 character overlay function +SELECT OVERLAY('abcdef' PLACING '45' FROM 4) AS "abc45f"; + abc45f +-------- + abc45f +(1 row) + +SELECT OVERLAY('yabadoo' PLACING 'daba' FROM 5) AS "yabadaba"; + yabadaba +---------- + yabadaba +(1 row) + +SELECT OVERLAY('yabadoo' PLACING 'daba' FROM 5 FOR 0) AS "yabadabadoo"; + yabadabadoo +------------- + yabadabadoo +(1 row) + +SELECT OVERLAY('babosa' PLACING 'ubb' FROM 2 FOR 4) AS "bubba"; + bubba +------- + bubba +(1 row) + +-- +-- test LIKE +-- Be sure to form every test as a LIKE/NOT LIKE pair. +-- +-- simplest examples +-- E061-04 like predicate +SELECT 'hawkeye' LIKE 'h%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT LIKE 'h%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' LIKE 'H%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' NOT LIKE 'H%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' LIKE 'indio%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' NOT LIKE 'indio%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' LIKE 'h%eye' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT LIKE 'h%eye' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE '_ndio' AS "true"; + true +------ + t +(1 row) + +SELECT 'indio' NOT LIKE '_ndio' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE 'in__o' AS "true"; + true +------ + t +(1 row) + +SELECT 'indio' NOT LIKE 'in__o' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE 'in_o' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' NOT LIKE 'in_o' AS "true"; + true +------ + t +(1 row) + +-- unused escape character +SELECT 'hawkeye' LIKE 'h%' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT LIKE 'h%' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE 'ind_o' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'indio' NOT LIKE 'ind_o' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +-- escape character +-- E061-05 like predicate with escape clause +SELECT 'h%' LIKE 'h#%' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'h%' NOT LIKE 'h#%' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'h%wkeye' LIKE 'h#%' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'h%wkeye' NOT LIKE 'h#%' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'h%wkeye' LIKE 'h#%%' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'h%wkeye' NOT LIKE 'h#%%' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'h%awkeye' LIKE 'h#%a%k%e' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'h%awkeye' NOT LIKE 'h#%a%k%e' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE '_ndio' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'indio' NOT LIKE 
'_ndio' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +SELECT 'i_dio' LIKE 'i$_d_o' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'i_dio' NOT LIKE 'i$_d_o' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +SELECT 'i_dio' LIKE 'i$_nd_o' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +SELECT 'i_dio' NOT LIKE 'i$_nd_o' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'i_dio' LIKE 'i$_d%o' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'i_dio' NOT LIKE 'i$_d%o' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +-- escape character same as pattern character +SELECT 'maca' LIKE 'm%aca' ESCAPE '%' AS "true"; + true +------ + t +(1 row) + +SELECT 'maca' NOT LIKE 'm%aca' ESCAPE '%' AS "false"; + false +------- + f +(1 row) + +SELECT 'ma%a' LIKE 'm%a%%a' ESCAPE '%' AS "true"; + true +------ + t +(1 row) + +SELECT 'ma%a' NOT LIKE 'm%a%%a' ESCAPE '%' AS "false"; + false +------- + f +(1 row) + +SELECT 'bear' LIKE 'b_ear' ESCAPE '_' AS "true"; + true +------ + t +(1 row) + +SELECT 'bear' NOT LIKE 'b_ear' ESCAPE '_' AS "false"; + false +------- + f +(1 row) + +SELECT 'be_r' LIKE 'b_e__r' ESCAPE '_' AS "true"; + true +------ + t +(1 row) + +SELECT 'be_r' NOT LIKE 'b_e__r' ESCAPE '_' AS "false"; + false +------- + f +(1 row) + +SELECT 'be_r' LIKE '__e__r' ESCAPE '_' AS "false"; + false +------- + f +(1 row) + +SELECT 'be_r' NOT LIKE '__e__r' ESCAPE '_' AS "true"; + true +------ + t +(1 row) + +-- +-- test ILIKE (case-insensitive LIKE) +-- Be sure to form every test as an ILIKE/NOT ILIKE pair. +-- +SELECT 'hawkeye' ILIKE 'h%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT ILIKE 'h%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' ILIKE 'H%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT ILIKE 'H%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' ILIKE 'H%Eye' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT ILIKE 'H%Eye' AS "false"; + false +------- + f +(1 row) + +SELECT 'Hawkeye' ILIKE 'h%' AS "true"; + true +------ + t +(1 row) + +SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false"; + false +------- + f +(1 row) + +-- +-- test %/_ combination cases, cf bugs #4821 and #5478 +-- +SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f; + t | t | f +---+---+--- + t | t | f +(1 row) + +SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f; + t | t | f +---+---+--- + t | t | f +(1 row) + +SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f; + t | t | f +---+---+--- + t | t | f +(1 row) + +SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f; + t | t | f +---+---+--- + t | t | f +(1 row) + +SELECT 'jack' LIKE '%____%' AS t; + t +--- + t +(1 row) + +-- +-- basic tests of LIKE with indexes +-- +CREATE TABLE texttest (a text PRIMARY KEY, b int); +SELECT * FROM texttest WHERE a LIKE '%1%'; + a | b +---+--- +(0 rows) + +CREATE TABLE byteatest (a bytea PRIMARY KEY, b int); +SELECT * FROM byteatest WHERE a LIKE '%1%'; + a | b +---+--- +(0 rows) + +DROP TABLE texttest, byteatest; +-- +-- test implicit type conversion +-- +-- E021-07 character concatenation +SELECT 'unknown' || ' and unknown' AS "Concat unknown types"; + Concat unknown types +---------------------- + unknown and unknown +(1 row) + +SELECT text 'text' || ' and unknown' AS "Concat text to unknown type"; + Concat text to unknown type +----------------------------- + text and unknown +(1 row) + +SELECT char(20) 'characters' || ' and text' AS "Concat char 
to unknown type"; + Concat char to unknown type +----------------------------- + characters and text +(1 row) + +SELECT text 'text' || char(20) ' and characters' AS "Concat text to char"; + Concat text to char +--------------------- + text and characters +(1 row) + +SELECT text 'text' || varchar ' and varchar' AS "Concat text to varchar"; + Concat text to varchar +------------------------ + text and varchar +(1 row) + +-- +-- test substr with toasted text values +-- +CREATE TABLE toasttest(f1 text); +insert into toasttest values(repeat('1234567890',10000)); +insert into toasttest values(repeat('1234567890',10000)); +-- +-- Ensure that some values are uncompressed, to test the faster substring +-- operation used in that case +-- +alter table toasttest alter column f1 set storage external; +insert into toasttest values(repeat('1234567890',10000)); +insert into toasttest values(repeat('1234567890',10000)); +-- If the starting position is zero or less, then return from the start of the string +-- adjusting the length to be consistent with the "negative start" per SQL. +SELECT substr(f1, -1, 5) from toasttest; + substr +-------- + 123 + 123 + 123 + 123 +(4 rows) + +-- If the length is less than zero, an ERROR is thrown. +SELECT substr(f1, 5, -1) from toasttest; +ERROR: negative substring length not allowed +-- If no third argument (length) is provided, the length to the end of the +-- string is assumed. +SELECT substr(f1, 99995) from toasttest; + substr +-------- + 567890 + 567890 + 567890 + 567890 +(4 rows) + +-- If start plus length is > string length, the result is truncated to +-- string length +SELECT substr(f1, 99995, 10) from toasttest; + substr +-------- + 567890 + 567890 + 567890 + 567890 +(4 rows) + +TRUNCATE TABLE toasttest; +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +-- expect >0 blocks +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty + FROM pg_class where relname = 'toasttest'; + is_empty +---------- + +(1 row) + +TRUNCATE TABLE toasttest; +ALTER TABLE toasttest set (toast_tuple_target = 4080); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +-- expect 0 blocks +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty + FROM pg_class where relname = 'toasttest'; + is_empty +---------- + +(1 row) + +DROP TABLE toasttest; +-- +-- test substr with toasted bytea values +-- +CREATE TABLE toasttest(f1 bytea); +insert into toasttest values(decode(repeat('1234567890',10000),'escape')); +insert into toasttest values(decode(repeat('1234567890',10000),'escape')); +-- +-- Ensure that some values are uncompressed, to test the faster substring +-- operation used in that case +-- +alter table toasttest alter column f1 set storage external; +insert into toasttest values(decode(repeat('1234567890',10000),'escape')); +insert into toasttest values(decode(repeat('1234567890',10000),'escape')); +-- If the starting position is zero or less, then return from the start of the string +-- adjusting the length to be consistent with the "negative start" per SQL. +SELECT substr(f1, -1, 5) from toasttest; + substr +-------- + 123 + 123 + 123 + 123 +(4 rows) + +-- If the length is less than zero, an ERROR is thrown. 
+SELECT substr(f1, 5, -1) from toasttest; +ERROR: negative substring length not allowed +-- If no third argument (length) is provided, the length to the end of the +-- string is assumed. +SELECT substr(f1, 99995) from toasttest; + substr +-------- + 567890 + 567890 + 567890 + 567890 +(4 rows) + +-- If start plus length is > string length, the result is truncated to +-- string length +SELECT substr(f1, 99995, 10) from toasttest; + substr +-------- + 567890 + 567890 + 567890 + 567890 +(4 rows) + +DROP TABLE toasttest; +-- test internally compressing datums +-- this tests compressing a datum to a very small size which exercises a +-- corner case in packed-varlena handling: even though small, the compressed +-- datum must be given a 4-byte header because there are no bits to indicate +-- compression in a 1-byte header +CREATE TABLE toasttest (c char(4096)); +INSERT INTO toasttest VALUES('x'); +SELECT length(c), c::text FROM toasttest; + length | c +--------+--- + 1 | x +(1 row) + +SELECT c FROM toasttest; + c +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + x +(1 row) + +DROP TABLE toasttest; +-- +-- test length +-- +SELECT length('abcdef') AS "length_6"; + length_6 +---------- + 6 +(1 row) + +-- +-- test strpos +-- +SELECT strpos('abcdef', 'cd') AS "pos_3"; + pos_3 +------- + 3 +(1 row) + +SELECT strpos('abcdef', 'xy') AS "pos_0"; + pos_0 +------- + 0 +(1 row) + +-- +-- test replace +-- +SELECT replace('abcdef', 'de', '45') AS "abc45f"; + abc45f +-------- + abc45f +(1 row) + +SELECT replace('yabadabadoo', 'ba', '123') AS "ya123da123doo"; + ya123da123doo +--------------- + ya123da123doo +(1 row) + +SELECT replace('yabadoo', 'bad', '') AS "yaoo"; + yaoo +------ + yaoo +(1 row) + +-- +-- test split_part +-- +select split_part('joeuser@mydatabase','@',0) AS "an error"; +ERROR: field position must be greater than zero +select split_part('joeuser@mydatabase','@',1) AS "joeuser"; + joeuser +--------- + joeuser +(1 row) + +select split_part('joeuser@mydatabase','@',2) AS "mydatabase"; + mydatabase +------------ + mydatabase +(1 row) + +select split_part('joeuser@mydatabase','@',3) AS "empty string"; + empty string +-------------- + +(1 row) + +select split_part('@joeuser@mydatabase@','@',2) AS "joeuser"; + joeuser +--------- + joeuser +(1 row) + +-- +-- test to_hex +-- +select to_hex(256*256*256 - 1) AS "ffffff"; + ffffff +-------- + ffffff +(1 row) + +select to_hex(256::bigint*256::bigint*256::bigint*256::bigint - 1) AS "ffffffff"; + ffffffff +---------- + ffffffff +(1 row) + +-- +-- MD5 test suite - from IETF RFC 1321 +-- (see: ftp://ftp.rfc-editor.org/in-notes/rfc1321.txt) +-- +select md5('') = 'd41d8cd98f00b204e9800998ecf8427e' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('a') = '0cc175b9c0f1b6a831c399e269772661' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('abc') = '900150983cd24fb0d6963f7d28e17f72' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('message digest') = 'f96b697d7cb7938d525a2f31aaf161d0' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('abcdefghijklmnopqrstuvwxyz') = 
'c3fcd3d76192e4007dfb496cca67e13b' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789') = 'd174ab98d277d9f5a5611c2c9f419d9f' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('12345678901234567890123456789012345678901234567890123456789012345678901234567890') = '57edf4a22be3c955ac49da2e2107b67a' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5(''::bytea) = 'd41d8cd98f00b204e9800998ecf8427e' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('a'::bytea) = '0cc175b9c0f1b6a831c399e269772661' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('abc'::bytea) = '900150983cd24fb0d6963f7d28e17f72' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('message digest'::bytea) = 'f96b697d7cb7938d525a2f31aaf161d0' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('abcdefghijklmnopqrstuvwxyz'::bytea) = 'c3fcd3d76192e4007dfb496cca67e13b' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'::bytea) = 'd174ab98d277d9f5a5611c2c9f419d9f' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('12345678901234567890123456789012345678901234567890123456789012345678901234567890'::bytea) = '57edf4a22be3c955ac49da2e2107b67a' AS "TRUE"; + TRUE +------ + t +(1 row) + +-- +-- SHA-2 +-- +SET bytea_output TO hex; +SELECT sha224(''); + sha224 +------------------------------------------------------------ + \xd14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f +(1 row) + +SELECT sha224('The quick brown fox jumps over the lazy dog.'); + sha224 +------------------------------------------------------------ + \x619cba8e8e05826e9b8c519c0a5c68f4fb653e8a3d8aa04bb2c8cd4c +(1 row) + +SELECT sha256(''); + sha256 +-------------------------------------------------------------------- + \xe3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +(1 row) + +SELECT sha256('The quick brown fox jumps over the lazy dog.'); + sha256 +-------------------------------------------------------------------- + \xef537f25c895bfa782526529a9b63d97aa631564d5d789c2b765448c8635fb6c +(1 row) + +SELECT sha384(''); + sha384 +---------------------------------------------------------------------------------------------------- + \x38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b +(1 row) + +SELECT sha384('The quick brown fox jumps over the lazy dog.'); + sha384 +---------------------------------------------------------------------------------------------------- + \xed892481d8272ca6df370bf706e4d7bc1b5739fa2177aae6c50e946678718fc67a7af2819a021c2fc34e91bdb63409d7 +(1 row) + +SELECT sha512(''); + sha512 +------------------------------------------------------------------------------------------------------------------------------------ + \xcf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e +(1 row) + +SELECT sha512('The quick brown fox jumps over the lazy dog.'); + sha512 +------------------------------------------------------------------------------------------------------------------------------------ + \x91ea1245f20d46ae9a037a989f54f1f790f0a47607eeb8a14d12890cea77a1bbc6c7ed9cf205e67b7f2b8fd4c7dfd3a7a8617e45f3c463d481c7e586c39ac1ed +(1 row) + +-- +-- test behavior of escape_string_warning and standard_conforming_strings options +-- +set escape_string_warning = off; +set standard_conforming_strings = off; +show escape_string_warning; + escape_string_warning +----------------------- + off 
+(1 row) + +show standard_conforming_strings; + standard_conforming_strings +----------------------------- + off +(1 row) + +set escape_string_warning = on; +set standard_conforming_strings = on; +show escape_string_warning; + escape_string_warning +----------------------- + on +(1 row) + +show standard_conforming_strings; + standard_conforming_strings +----------------------------- + on +(1 row) + +select 'a\bcd' as f1, 'a\b''cd' as f2, 'a\b''''cd' as f3, 'abcd\' as f4, 'ab\''cd' as f5, '\\' as f6; + f1 | f2 | f3 | f4 | f5 | f6 +-------+--------+---------+-------+--------+---- + a\bcd | a\b'cd | a\b''cd | abcd\ | ab\'cd | \\ +(1 row) + +set standard_conforming_strings = off; +select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3, 'abcd\\' as f4, 'ab\\\'cd' as f5, '\\\\' as f6; +WARNING: nonstandard use of \\ in a string literal +LINE 1: select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3,... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3,... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3,... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: ...bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3, 'abcd\\' ... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: ...'cd' as f2, 'a\\b\'''cd' as f3, 'abcd\\' as f4, 'ab\\\'cd'... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: ...'''cd' as f3, 'abcd\\' as f4, 'ab\\\'cd' as f5, '\\\\' as ... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. 
+ f1 | f2 | f3 | f4 | f5 | f6 +-------+--------+---------+-------+--------+---- + a\bcd | a\b'cd | a\b''cd | abcd\ | ab\'cd | \\ +(1 row) + +set escape_string_warning = off; +set standard_conforming_strings = on; +select 'a\bcd' as f1, 'a\b''cd' as f2, 'a\b''''cd' as f3, 'abcd\' as f4, 'ab\''cd' as f5, '\\' as f6; + f1 | f2 | f3 | f4 | f5 | f6 +-------+--------+---------+-------+--------+---- + a\bcd | a\b'cd | a\b''cd | abcd\ | ab\'cd | \\ +(1 row) + +set standard_conforming_strings = off; +select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3, 'abcd\\' as f4, 'ab\\\'cd' as f5, '\\\\' as f6; + f1 | f2 | f3 | f4 | f5 | f6 +-------+--------+---------+-------+--------+---- + a\bcd | a\b'cd | a\b''cd | abcd\ | ab\'cd | \\ +(1 row) + +-- +-- Additional string functions +-- +SET bytea_output TO escape; +SELECT initcap('hi THOMAS'); + initcap +----------- + Hi Thomas +(1 row) + +SELECT lpad('hi', 5, 'xy'); + lpad +------- + xyxhi +(1 row) + +SELECT lpad('hi', 5); + lpad +------- + hi +(1 row) + +SELECT lpad('hi', -5, 'xy'); + lpad +------ + +(1 row) + +SELECT lpad('hello', 2); + lpad +------ + he +(1 row) + +SELECT lpad('hi', 5, ''); + lpad +------ + hi +(1 row) + +SELECT rpad('hi', 5, 'xy'); + rpad +------- + hixyx +(1 row) + +SELECT rpad('hi', 5); + rpad +------- + hi +(1 row) + +SELECT rpad('hi', -5, 'xy'); + rpad +------ + +(1 row) + +SELECT rpad('hello', 2); + rpad +------ + he +(1 row) + +SELECT rpad('hi', 5, ''); + rpad +------ + hi +(1 row) + +SELECT ltrim('zzzytrim', 'xyz'); + ltrim +------- + trim +(1 row) + +SELECT translate('', '14', 'ax'); + translate +----------- + +(1 row) + +SELECT translate('12345', '14', 'ax'); + translate +----------- + a23x5 +(1 row) + +SELECT ascii('x'); + ascii +------- + 120 +(1 row) + +SELECT ascii(''); + ascii +------- + 0 +(1 row) + +SELECT chr(65); + chr +----- + A +(1 row) + +SELECT chr(0); +ERROR: null character not permitted +SELECT repeat('Pg', 4); + repeat +---------- + PgPgPgPg +(1 row) + +SELECT repeat('Pg', -4); + repeat +-------- + +(1 row) + +SELECT trim(E'\\000'::bytea from E'\\000Tom\\000'::bytea); + btrim +------- + Tom +(1 row) + +SELECT btrim(E'\\000trim\\000'::bytea, E'\\000'::bytea); + btrim +------- + trim +(1 row) + +SELECT btrim(''::bytea, E'\\000'::bytea); + btrim +------- + +(1 row) + +SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea); + btrim +-------------- + \000trim\000 +(1 row) + +SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape'); + encode +------------- + TTh\x01omas +(1 row) + +SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape'); + encode +-------------------- + Th\000omas\x02\x03 +(1 row) + +SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape'); + encode +----------------- + Th\000o\x02\x03 +(1 row) + diff --git a/src/test/regress/expected/tsrf_1.out b/src/test/regress/expected/tsrf_1.out new file mode 100644 index 0000000000..a0f7d80c69 --- /dev/null +++ b/src/test/regress/expected/tsrf_1.out @@ -0,0 +1,712 @@ +-- +-- tsrf - targetlist set returning function tests +-- +-- simple srf +SELECT generate_series(1, 3); + generate_series +----------------- + 1 + 2 + 3 +(3 rows) + +-- parallel iteration +SELECT generate_series(1, 3), generate_series(3,5); + generate_series | generate_series +-----------------+----------------- + 1 | 3 + 2 | 4 + 3 | 5 +(3 rows) + +-- parallel iteration, different number of rows +SELECT generate_series(1, 2), generate_series(1,4); + generate_series | generate_series 
+-----------------+----------------- + 1 | 1 + 2 | 2 + | 3 + | 4 +(4 rows) + +-- srf, with SRF argument +SELECT generate_series(1, generate_series(1, 3)); + generate_series +----------------- + 1 + 1 + 2 + 1 + 2 + 3 +(6 rows) + +-- but we've traditionally rejected the same in FROM +SELECT * FROM generate_series(1, generate_series(1, 3)); +ERROR: set-returning functions must appear at top level of FROM +LINE 1: SELECT * FROM generate_series(1, generate_series(1, 3)); + ^ +-- srf, with two SRF arguments +SELECT generate_series(generate_series(1,3), generate_series(2, 4)); + generate_series +----------------- + 1 + 2 + 2 + 3 + 3 + 4 +(6 rows) + +-- check proper nesting of SRFs in different expressions +explain (verbose, costs off) +SELECT generate_series(1, generate_series(1, 3)), generate_series(2, 4); + QUERY PLAN +-------------------------------------------------------------------------------- + ProjectSet + Output: generate_series(1, (generate_series(1, 3))), (generate_series(2, 4)) + -> ProjectSet + Output: generate_series(1, 3), generate_series(2, 4) + -> Result +(5 rows) + +SELECT generate_series(1, generate_series(1, 3)), generate_series(2, 4); + generate_series | generate_series +-----------------+----------------- + 1 | 2 + 1 | 3 + 2 | 3 + 1 | 4 + 2 | 4 + 3 | 4 +(6 rows) + +CREATE TABLE few(id int, dataa text, datab text); +INSERT INTO few VALUES(1, 'a', 'foo'),(2, 'a', 'bar'),(3, 'b', 'bar'); +-- SRF with a provably-dummy relation +explain (verbose, costs off) +SELECT unnest(ARRAY[1, 2]) FROM few WHERE false; + QUERY PLAN +-------------------------------------- + ProjectSet + Output: unnest('{1,2}'::integer[]) + -> Result + One-Time Filter: false +(4 rows) + +SELECT unnest(ARRAY[1, 2]) FROM few WHERE false; + unnest +-------- +(0 rows) + +-- SRF shouldn't prevent upper query from recognizing lower as dummy +explain (verbose, costs off) +SELECT * FROM few f1, + (SELECT unnest(ARRAY[1,2]) FROM few f2 WHERE false OFFSET 0) ss; + QUERY PLAN +------------------------------------------------ + Result + Output: f1.id, f1.dataa, f1.datab, ss.unnest + One-Time Filter: false +(3 rows) + +SELECT * FROM few f1, + (SELECT unnest(ARRAY[1,2]) FROM few f2 WHERE false OFFSET 0) ss; + id | dataa | datab | unnest +----+-------+-------+-------- +(0 rows) + +-- SRF output order of sorting is maintained, if SRF is not referenced +SELECT few.id, generate_series(1,3) g FROM few ORDER BY id DESC; + id | g +----+--- + 3 | 1 + 3 | 2 + 3 | 3 + 2 | 1 + 2 | 2 + 2 | 3 + 1 | 1 + 1 | 2 + 1 | 3 +(9 rows) + +-- but SRFs can be referenced in sort +SELECT few.id, generate_series(1,3) g FROM few ORDER BY id, g DESC; + id | g +----+--- + 1 | 3 + 1 | 2 + 1 | 1 + 2 | 3 + 2 | 2 + 2 | 1 + 3 | 3 + 3 | 2 + 3 | 1 +(9 rows) + +SELECT few.id, generate_series(1,3) g FROM few ORDER BY id, generate_series(1,3) DESC; + id | g +----+--- + 1 | 3 + 1 | 2 + 1 | 1 + 2 | 3 + 2 | 2 + 2 | 1 + 3 | 3 + 3 | 2 + 3 | 1 +(9 rows) + +-- it's weird to have ORDER BYs that increase the number of results +SELECT few.id FROM few ORDER BY id, generate_series(1,3) DESC; + id +---- + 1 + 1 + 1 + 2 + 2 + 2 + 3 + 3 + 3 +(9 rows) + +-- SRFs are computed after aggregation +SET enable_hashagg TO 0; -- stable output order +SELECT few.dataa, count(*), min(id), max(id), unnest('{1,1,3}'::int[]) FROM few WHERE few.id = 1 GROUP BY few.dataa; + dataa | count | min | max | unnest +-------+-------+-----+-----+-------- + a | 1 | 1 | 1 | 1 + a | 1 | 1 | 1 | 1 + a | 1 | 1 | 1 | 3 +(3 rows) + +-- unless referenced in GROUP BY clause +SELECT few.dataa, count(*), min(id), 
max(id), unnest('{1,1,3}'::int[]) FROM few WHERE few.id = 1 GROUP BY few.dataa, unnest('{1,1,3}'::int[]); + dataa | count | min | max | unnest +-------+-------+-----+-----+-------- + a | 2 | 1 | 1 | 1 + a | 1 | 1 | 1 | 3 +(2 rows) + +SELECT few.dataa, count(*), min(id), max(id), unnest('{1,1,3}'::int[]) FROM few WHERE few.id = 1 GROUP BY few.dataa, 5; + dataa | count | min | max | unnest +-------+-------+-----+-----+-------- + a | 2 | 1 | 1 | 1 + a | 1 | 1 | 1 | 3 +(2 rows) + +RESET enable_hashagg; +-- check HAVING works when GROUP BY does [not] reference SRF output +SELECT dataa, generate_series(1,1), count(*) FROM few GROUP BY 1 HAVING count(*) > 1; + dataa | generate_series | count +-------+-----------------+------- + a | 1 | 2 +(1 row) + +SELECT dataa, generate_series(1,1), count(*) FROM few GROUP BY 1, 2 HAVING count(*) > 1; + dataa | generate_series | count +-------+-----------------+------- + a | 1 | 2 +(1 row) + +-- it's weird to have GROUP BYs that increase the number of results +SELECT few.dataa, count(*) FROM few WHERE dataa = 'a' GROUP BY few.dataa ORDER BY 2; + dataa | count +-------+------- + a | 2 +(1 row) + +SELECT few.dataa, count(*) FROM few WHERE dataa = 'a' GROUP BY few.dataa, unnest('{1,1,3}'::int[]) ORDER BY 2; + dataa | count +-------+------- + a | 2 + a | 4 +(2 rows) + +-- SRFs are not allowed if they'd need to be conditionally executed +SELECT q1, case when q1 > 0 then generate_series(1,3) else 0 end FROM int8_tbl; +ERROR: set-returning functions are not allowed in CASE +LINE 1: SELECT q1, case when q1 > 0 then generate_series(1,3) else 0... + ^ +HINT: You might be able to move the set-returning function into a LATERAL FROM item. +SELECT q1, coalesce(generate_series(1,3), 0) FROM int8_tbl; +ERROR: set-returning functions are not allowed in COALESCE +LINE 1: SELECT q1, coalesce(generate_series(1,3), 0) FROM int8_tbl; + ^ +HINT: You might be able to move the set-returning function into a LATERAL FROM item. +-- SRFs are not allowed in aggregate arguments +SELECT min(generate_series(1, 3)) FROM few; +ERROR: aggregate function calls cannot contain set-returning function calls +LINE 1: SELECT min(generate_series(1, 3)) FROM few; + ^ +HINT: You might be able to move the set-returning function into a LATERAL FROM item. +-- ... unless they're within a sub-select +SELECT sum((3 = ANY(SELECT generate_series(1,4)))::int); + sum +----- + 1 +(1 row) + +SELECT sum((3 = ANY(SELECT lag(x) over(order by x) + FROM generate_series(1,4) x))::int); + sum +----- + 1 +(1 row) + +-- SRFs are not allowed in window function arguments, either +SELECT min(generate_series(1, 3)) OVER() FROM few; +ERROR: window function calls cannot contain set-returning function calls +LINE 1: SELECT min(generate_series(1, 3)) OVER() FROM few; + ^ +HINT: You might be able to move the set-returning function into a LATERAL FROM item. 
+-- SRFs are normally computed after window functions +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few; + id | lag | count | generate_series +----+-----+-------+----------------- + 1 | | 3 | 1 + 1 | | 3 | 2 + 1 | | 3 | 3 + 2 | 1 | 3 | 1 + 2 | 1 | 3 | 2 + 2 | 1 | 3 | 3 + 3 | 2 | 3 | 1 + 3 | 2 | 3 | 2 + 3 | 2 | 3 | 3 +(9 rows) + +-- unless referencing SRFs +SELECT SUM(count(*)) OVER(PARTITION BY generate_series(1,3) ORDER BY generate_series(1,3)), generate_series(1,3) g FROM few GROUP BY g; + sum | g +-----+--- + 3 | 1 + 3 | 2 + 3 | 3 +(3 rows) + +-- sorting + grouping +SELECT few.dataa, count(*), min(id), max(id), generate_series(1,3) FROM few GROUP BY few.dataa ORDER BY 5, 1; + dataa | count | min | max | generate_series +-------+-------+-----+-----+----------------- + a | 2 | 1 | 2 | 1 + b | 1 | 3 | 3 | 1 + a | 2 | 1 | 2 | 2 + b | 1 | 3 | 3 | 2 + a | 2 | 1 | 2 | 3 + b | 1 | 3 | 3 | 3 +(6 rows) + +-- grouping sets are a bit special, they produce NULLs in columns not actually NULL +set enable_hashagg = false; +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab); + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | bar | 2 | 1 + a | foo | 1 | 1 + a | foo | 2 | 1 + a | | 1 | 2 + a | | 2 | 2 + b | bar | 1 | 1 + b | bar | 2 | 1 + b | | 1 | 1 + b | | 2 | 1 + | | 1 | 3 + | | 2 | 3 + | bar | 1 | 2 + | bar | 2 | 2 + | foo | 1 | 1 + | foo | 2 | 1 +(16 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab) ORDER BY dataa; + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | bar | 2 | 1 + a | foo | 1 | 1 + a | foo | 2 | 1 + a | | 1 | 2 + a | | 2 | 2 + b | bar | 1 | 1 + b | bar | 2 | 1 + b | | 1 | 1 + b | | 2 | 1 + | | 1 | 3 + | | 2 | 3 + | bar | 1 | 2 + | bar | 2 | 2 + | foo | 1 | 1 + | foo | 2 | 1 +(16 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab) ORDER BY g; + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | foo | 1 | 1 + a | | 1 | 2 + b | bar | 1 | 1 + b | | 1 | 1 + | | 1 | 3 + | bar | 1 | 2 + | foo | 1 | 1 + | foo | 2 | 1 + a | bar | 2 | 1 + b | | 2 | 1 + a | foo | 2 | 1 + | bar | 2 | 2 + a | | 2 | 2 + | | 2 | 3 + b | bar | 2 | 1 +(16 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g); + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | bar | 2 | 1 + a | bar | | 2 + a | foo | 1 | 1 + a | foo | 2 | 1 + a | foo | | 2 + a | | | 4 + b | bar | 1 | 1 + b | bar | 2 | 1 + b | bar | | 2 + b | | | 2 + | | | 6 + | bar | 1 | 2 + | bar | 2 | 2 + | bar | | 4 + | foo | 1 | 1 + | foo | 2 | 1 + | foo | | 2 + a | | 1 | 2 + b | | 1 | 1 + | | 1 | 3 + a | | 2 | 2 + b | | 2 | 1 + | | 2 | 3 +(24 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g) ORDER BY dataa; + dataa | b | g | count +-------+-----+---+------- + a | foo | | 2 + a | | | 4 + a | | 2 | 2 + a | bar | 1 | 1 + a | bar | 2 | 1 + a | bar | | 2 + a | foo | 1 | 1 + a | foo | 2 | 1 + a | | 1 | 2 + b | bar | 1 | 1 + b | | | 2 + b | | 1 | 1 + b | bar | 2 | 1 + b | bar | | 2 + b | | 2 | 1 + | | 2 | 3 + | | | 6 + | bar | 1 | 2 + | bar | 2 | 2 + | bar | | 4 + | foo | 1 | 1 + | foo | 2 | 1 + | foo | | 2 + | | 1 | 3 +(24 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g) ORDER BY g; + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | 
foo | 1 | 1 + b | bar | 1 | 1 + | bar | 1 | 2 + | foo | 1 | 1 + a | | 1 | 2 + b | | 1 | 1 + | | 1 | 3 + a | | 2 | 2 + b | | 2 | 1 + | bar | 2 | 2 + | | 2 | 3 + | foo | 2 | 1 + a | bar | 2 | 1 + a | foo | 2 | 1 + b | bar | 2 | 1 + a | | | 4 + b | bar | | 2 + b | | | 2 + | | | 6 + a | foo | | 2 + a | bar | | 2 + | bar | | 4 + | foo | | 2 +(24 rows) + +reset enable_hashagg; +-- case with degenerate ORDER BY +explain (verbose, costs off) +select 'foo' as f, generate_series(1,2) as g from few order by 1; + QUERY PLAN +------------------------------------------------ + ProjectSet + Output: ('foo'::text), generate_series(1, 2) + -> Seq Scan on public.few + Output: 'foo'::text +(4 rows) + +select 'foo' as f, generate_series(1,2) as g from few order by 1; + f | g +-----+--- + foo | 1 + foo | 2 + foo | 1 + foo | 2 + foo | 1 + foo | 2 +(6 rows) + +-- data modification +CREATE TABLE fewmore AS SELECT generate_series(1,3) AS data; +INSERT INTO fewmore VALUES(generate_series(4,5)); +SELECT * FROM fewmore; + data +------ + 1 + 2 + 3 + 4 + 5 +(5 rows) + +-- SRFs are not allowed in UPDATE (they once were, but it was nonsense) +UPDATE fewmore SET data = generate_series(4,9); +ERROR: set-returning functions are not allowed in UPDATE +LINE 1: UPDATE fewmore SET data = generate_series(4,9); + ^ +-- SRFs are not allowed in RETURNING +INSERT INTO fewmore VALUES(1) RETURNING generate_series(1,3); +ERROR: set-returning functions are not allowed in RETURNING +LINE 1: INSERT INTO fewmore VALUES(1) RETURNING generate_series(1,3)... + ^ +-- nor standalone VALUES (but surely this is a bug?) +VALUES(1, generate_series(1,2)); +ERROR: set-returning functions are not allowed in VALUES +LINE 1: VALUES(1, generate_series(1,2)); + ^ +-- We allow tSRFs that are not at top level +SELECT int4mul(generate_series(1,2), 10); + int4mul +--------- + 10 + 20 +(2 rows) + +SELECT generate_series(1,3) IS DISTINCT FROM 2; + ?column? +---------- + t + f + t +(3 rows) + +-- but SRFs in function RTEs must be at top level (annoying restriction) +SELECT * FROM int4mul(generate_series(1,2), 10); +ERROR: set-returning functions must appear at top level of FROM +LINE 1: SELECT * FROM int4mul(generate_series(1,2), 10); + ^ +-- DISTINCT ON is evaluated before tSRF evaluation if SRF is not +-- referenced either in ORDER BY or in the DISTINCT ON list. The ORDER +-- BY reference can be implicitly generated, if there's no other ORDER BY. 
+-- implicit reference (via implicit ORDER) to all columns +SELECT DISTINCT ON (a) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b); + a | b | g +---+---+--- + 1 | 1 | 1 + 3 | 2 | 1 + 5 | 3 | 1 +(3 rows) + +-- unreferenced in DISTINCT ON or ORDER BY +SELECT DISTINCT ON (a) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b) +ORDER BY a, b DESC; + a | b | g +---+---+--- + 1 | 4 | 1 + 1 | 4 | 2 + 1 | 4 | 3 + 3 | 2 | 1 + 3 | 2 | 2 + 3 | 2 | 3 + 5 | 3 | 1 + 5 | 3 | 2 + 5 | 3 | 3 +(9 rows) + +-- referenced in ORDER BY +SELECT DISTINCT ON (a) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b) +ORDER BY a, b DESC, g DESC; + a | b | g +---+---+--- + 1 | 4 | 3 + 3 | 2 | 3 + 5 | 3 | 3 +(3 rows) + +-- referenced in ORDER BY and DISTINCT ON +SELECT DISTINCT ON (a, b, g) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b) +ORDER BY a, b DESC, g DESC; + a | b | g +---+---+--- + 1 | 4 | 3 + 1 | 4 | 2 + 1 | 4 | 1 + 1 | 1 | 3 + 1 | 1 | 2 + 1 | 1 | 1 + 3 | 2 | 3 + 3 | 2 | 2 + 3 | 2 | 1 + 3 | 1 | 3 + 3 | 1 | 2 + 3 | 1 | 1 + 5 | 3 | 3 + 5 | 3 | 2 + 5 | 3 | 1 + 5 | 1 | 3 + 5 | 1 | 2 + 5 | 1 | 1 +(18 rows) + +-- only SRF mentioned in DISTINCT ON +SELECT DISTINCT ON (g) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b); + a | b | g +---+---+--- + 3 | 2 | 1 + 5 | 1 | 2 + 3 | 1 | 3 +(3 rows) + +-- LIMIT / OFFSET is evaluated after SRF evaluation +SELECT a, generate_series(1,2) FROM (VALUES(1),(2),(3)) r(a) LIMIT 2 OFFSET 2; + a | generate_series +---+----------------- + 2 | 1 + 2 | 2 +(2 rows) + +-- SRFs are not allowed in LIMIT. +SELECT 1 LIMIT generate_series(1,3); +ERROR: set-returning functions are not allowed in LIMIT +LINE 1: SELECT 1 LIMIT generate_series(1,3); + ^ +-- tSRF in correlated subquery, referencing table outside +SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few; + generate_series +----------------- + 2 + 3 + +(3 rows) + +-- tSRF in correlated subquery, referencing SRF outside +SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET g.i) FROM generate_series(0,3) g(i); + generate_series +----------------- + 1 + 2 + 3 + +(4 rows) + +-- Operators can return sets too +CREATE OPERATOR |@| (PROCEDURE = unnest, RIGHTARG = ANYARRAY); +SELECT |@|ARRAY[1,2,3]; + ?column? +---------- + 1 + 2 + 3 +(3 rows) + +-- Some fun cases involving duplicate SRF calls +explain (verbose, costs off) +select generate_series(1,3) as x, generate_series(1,3) + 1 as xp1; + QUERY PLAN +------------------------------------------------------------------ + Result + Output: (generate_series(1, 3)), ((generate_series(1, 3)) + 1) + -> ProjectSet + Output: generate_series(1, 3) + -> Result +(5 rows) + +select generate_series(1,3) as x, generate_series(1,3) + 1 as xp1; + x | xp1 +---+----- + 1 | 2 + 2 | 3 + 3 | 4 +(3 rows) + +explain (verbose, costs off) +select generate_series(1,3)+1 order by generate_series(1,3); + QUERY PLAN +------------------------------------------------------------------------ + Sort + Output: (((generate_series(1, 3)) + 1)), (generate_series(1, 3)) + Sort Key: (generate_series(1, 3)) + -> Result + Output: ((generate_series(1, 3)) + 1), (generate_series(1, 3)) + -> ProjectSet + Output: generate_series(1, 3) + -> Result +(8 rows) + +select generate_series(1,3)+1 order by generate_series(1,3); + ?column? 
+---------- + 2 + 3 + 4 +(3 rows) + +-- Check that SRFs of same nesting level run in lockstep +explain (verbose, costs off) +select generate_series(1,3) as x, generate_series(3,6) + 1 as y; + QUERY PLAN +------------------------------------------------------------------ + Result + Output: (generate_series(1, 3)), ((generate_series(3, 6)) + 1) + -> ProjectSet + Output: generate_series(1, 3), generate_series(3, 6) + -> Result +(5 rows) + +select generate_series(1,3) as x, generate_series(3,6) + 1 as y; + x | y +---+--- + 1 | 4 + 2 | 5 + 3 | 6 + | 7 +(4 rows) + +-- Clean up +DROP TABLE few; +DROP TABLE fewmore; diff --git a/src/test/regress/expected/zedstore.out b/src/test/regress/expected/zedstore.out new file mode 100644 index 0000000000..6041e42a93 --- /dev/null +++ b/src/test/regress/expected/zedstore.out @@ -0,0 +1,599 @@ +-- simple tests to iteratively build the zedstore +-- create and drop works +create table t_zedstore(c1 int, c2 int, c3 int) USING zedstore; +drop table t_zedstore; +-- insert and select works +create table t_zedstore(c1 int, c2 int, c3 int) USING zedstore; +insert into t_zedstore select i,i+1,i+2 from generate_series(1, 10)i; +select * from t_zedstore; + c1 | c2 | c3 +----+----+---- + 1 | 2 | 3 + 2 | 3 | 4 + 3 | 4 | 5 + 4 | 5 | 6 + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 8 | 9 | 10 + 9 | 10 | 11 + 10 | 11 | 12 +(10 rows) + +-- selecting only few columns work +select c1, c3 from t_zedstore; + c1 | c3 +----+---- + 1 | 3 + 2 | 4 + 3 | 5 + 4 | 6 + 5 | 7 + 6 | 8 + 7 | 9 + 8 | 10 + 9 | 11 + 10 | 12 +(10 rows) + +-- only few columns in output and where clause work +select c3 from t_zedstore where c2 > 5; + c3 +---- + 7 + 8 + 9 + 10 + 11 + 12 +(6 rows) + +-- Test abort works +begin; +insert into t_zedstore select i,i+1,i+2 from generate_series(21, 25)i; +abort; +insert into t_zedstore select i,i+1,i+2 from generate_series(31, 35)i; +select * from t_zedstore; + c1 | c2 | c3 +----+----+---- + 1 | 2 | 3 + 2 | 3 | 4 + 3 | 4 | 5 + 4 | 5 | 6 + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 8 | 9 | 10 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 +(15 rows) + +-- +-- Test indexing +-- +create index on t_zedstore (c1); +set enable_seqscan=off; +set enable_indexscan=on; +set enable_bitmapscan=off; +-- index scan +select * from t_zedstore where c1 = 5; + c1 | c2 | c3 +----+----+---- + 5 | 6 | 7 +(1 row) + +-- index-only scan +select c1 from t_zedstore where c1 = 5; + c1 +---- + 5 +(1 row) + +-- bitmap scan +set enable_indexscan=off; +set enable_bitmapscan=on; +select c1, c2 from t_zedstore where c1 between 5 and 10; + c1 | c2 +----+---- + 5 | 6 + 6 | 7 + 7 | 8 + 8 | 9 + 9 | 10 + 10 | 11 +(6 rows) + +-- +-- Test DELETE and UPDATE +-- +delete from t_zedstore where c2 = 5; +select * from t_zedstore; + c1 | c2 | c3 +----+----+---- + 1 | 2 | 3 + 2 | 3 | 4 + 3 | 4 | 5 + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 8 | 9 | 10 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 +(14 rows) + +delete from t_zedstore where c2 < 5; +select * from t_zedstore; + c1 | c2 | c3 +----+----+---- + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 8 | 9 | 10 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 +(11 rows) + +update t_zedstore set c2 = 100 where c1 = 8; +select * from t_zedstore; + c1 | c2 | c3 +----+-----+---- + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 + 8 | 100 | 10 +(11 
rows) + +-- +-- Test page deletion, by deleting a bigger range of values +-- +insert into t_zedstore select i,i+1,i+2 from generate_series(10000, 15000)i; +delete from t_zedstore where c1 >= 10000; +-- +-- Test VACUUM +-- +vacuum t_zedstore; +select * from t_zedstore; + c1 | c2 | c3 +----+-----+---- + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 + 8 | 100 | 10 +(11 rows) + +-- +-- Test toasting +-- +create table t_zedtoast(c1 int, t text) USING zedstore; +insert into t_zedtoast select i, repeat('x', 10000) from generate_series(1, 10) i; +select c1, length(t) from t_zedtoast; + c1 | length +----+-------- + 1 | 10000 + 2 | 10000 + 3 | 10000 + 4 | 10000 + 5 | 10000 + 6 | 10000 + 7 | 10000 + 8 | 10000 + 9 | 10000 + 10 | 10000 +(10 rows) + +-- +-- Test NULL values +-- +create table t_zednullvalues(c1 int, c2 int) USING zedstore; +insert into t_zednullvalues values(1, NULL), (NULL, 2); +select * from t_zednullvalues; + c1 | c2 +----+---- + 1 | + | 2 +(2 rows) + +select c2 from t_zednullvalues; + c2 +---- + + 2 +(2 rows) + +update t_zednullvalues set c1 = 1, c2 = NULL; +select * from t_zednullvalues; + c1 | c2 +----+---- + 1 | + 1 | +(2 rows) + +-- +-- Test COPY +-- +create table t_zedcopy(a serial, b int, c text not null default 'stuff', d text,e text) USING zedstore; +COPY t_zedcopy (a, b, c, d, e) from stdin; +COPY t_zedcopy (b, d) from stdin; +COPY t_zedcopy (b, d) from stdin; +COPY t_zedcopy (a, b, c, d, e) from stdin; +select * from t_zedcopy; + a | b | c | d | e +-------+----+-------+--------+---- + 9999 | | \N | NN | + 10000 | 21 | 31 | 41 | 51 + 1 | 1 | stuff | test_1 | + 2 | 2 | stuff | test_2 | + 3 | 3 | stuff | test_3 | + 4 | 4 | stuff | test_4 | + 5 | 5 | stuff | test_5 | + 10001 | 22 | 32 | 42 | 52 + 10002 | 23 | 33 | 43 | 53 + 10003 | 24 | 34 | 44 | 54 + 10004 | 25 | 35 | 45 | 55 + 10005 | 26 | 36 | 46 | 56 +(12 rows) + +COPY t_zedcopy (a, d, e) to stdout; +9999 NN \N +10000 41 51 +1 test_1 \N +2 test_2 \N +3 test_3 \N +4 test_4 \N +5 test_5 \N +10001 42 52 +10002 43 53 +10003 44 54 +10004 45 55 +10005 46 56 +-- +-- Also test delete and update on the table that was populated with COPY. +-- This exercises splitting the array item. (A table not populated with +-- COPY only contains single items, at the moment.) 
+-- +delete from t_zedcopy where b = 4; +select * from t_zedcopy; + a | b | c | d | e +-------+----+-------+--------+---- + 9999 | | \N | NN | + 10000 | 21 | 31 | 41 | 51 + 1 | 1 | stuff | test_1 | + 2 | 2 | stuff | test_2 | + 3 | 3 | stuff | test_3 | + 5 | 5 | stuff | test_5 | + 10001 | 22 | 32 | 42 | + 10002 | 23 | 33 | 43 | 53 + 10003 | 24 | 34 | 44 | 54 + 10004 | 25 | 35 | 45 | 55 + 10005 | 26 | 36 | 46 | 56 +(11 rows) + +delete from t_zedcopy where b < 3; +select * from t_zedcopy; + a | b | c | d | e +-------+----+-------+--------+---- + 9999 | | \N | NN | + 10000 | 21 | 31 | 41 | 51 + 3 | 3 | stuff | test_3 | + 5 | 5 | stuff | test_5 | + 10001 | 22 | 32 | 42 | + 10002 | 23 | 33 | 43 | 53 + 10003 | 24 | 34 | 44 | 54 + 10004 | 25 | 35 | 45 | 55 + 10005 | 26 | 36 | 46 | 56 +(9 rows) + +update t_zedcopy set b = 100 where b = 5; +select * from t_zedcopy; + a | b | c | d | e +-------+-----+-------+--------+---- + 9999 | | \N | NN | + 10000 | 21 | 31 | 41 | 51 + 3 | 3 | stuff | test_3 | + 10001 | 22 | 32 | 42 | + 10002 | 23 | 33 | 43 | 53 + 10003 | 24 | 34 | 44 | 54 + 10004 | 25 | 35 | 45 | 55 + 10005 | 26 | 36 | 46 | 56 + 5 | 100 | stuff | test_5 | +(9 rows) + +-- +-- Test zero column table +-- +create table t_zwithzerocols() using zedstore; +insert into t_zwithzerocols select t.* from t_zwithzerocols t right join generate_series(1,1) on true; +select count(*) from t_zwithzerocols; + count +------- + 1 +(1 row) + +-- Test for alter table add column +create table t_zaddcol(a int) using zedstore; +insert into t_zaddcol select * from generate_series(1, 3); +-- rewrite case +alter table t_zaddcol add column b int generated always as (a + 1) stored; +select * from t_zaddcol; + a | b +---+--- + 1 | 2 + 2 | 3 + 3 | 4 +(3 rows) + +-- test alter table add column with no default +create table t_zaddcol_simple(a int) using zedstore; +insert into t_zaddcol_simple values (1); +alter table t_zaddcol_simple add b int; +select * from t_zaddcol_simple; + a | b +---+--- + 1 | +(1 row) + +insert into t_zaddcol_simple values(2,3); +select * from t_zaddcol_simple; + a | b +---+--- + 1 | + 2 | 3 +(2 rows) + +-- fixed length default value stored in catalog +alter table t_zaddcol add column c int default 3; +select * from t_zaddcol; + a | b | c +---+---+--- + 1 | 2 | 3 + 2 | 3 | 3 + 3 | 4 | 3 +(3 rows) + +-- variable length default value stored in catalog +alter table t_zaddcol add column d text default 'abcdefgh'; +select d from t_zaddcol; + d +---------- + abcdefgh + abcdefgh + abcdefgh +(3 rows) + +-- insert after add column +insert into t_zaddcol values (2); +select * from t_zaddcol; + a | b | c | d +---+---+---+---------- + 1 | 2 | 3 | abcdefgh + 2 | 3 | 3 | abcdefgh + 3 | 4 | 3 | abcdefgh + 2 | 3 | 3 | abcdefgh +(4 rows) + +insert into t_zaddcol (a, c, d) values (3,5, 'test_insert'); +select b,c,d from t_zaddcol; + b | c | d +---+---+------------- + 2 | 3 | abcdefgh + 3 | 3 | abcdefgh + 4 | 3 | abcdefgh + 3 | 3 | abcdefgh + 4 | 5 | test_insert +(5 rows) + +-- +-- Test TABLESAMPLE +-- +-- regular test tablesample.sql doesn't directly work for zedstore as +-- its using fillfactor to create specific block layout for +-- heap. Hence, output differs between heap and zedstore table while +-- sampling. We need to use many tuples here to have multiple logical +-- blocks as don't have way to force TIDs spread / jump for zedstore. 
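+-- (As a reminder of the semantics being exercised: the SYSTEM method samples
+-- whole blocks while BERNOULLI samples individual rows, hence the "ALL
+-- visible tuples from SOME blocks" vs. "SOME visible tuples but from ALL the
+-- blocks" expectations further down.)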
+-- +CREATE TABLE t_ztablesample (id int, name text) using zedstore; +INSERT INTO t_ztablesample + SELECT i, repeat(i::text, 2) FROM generate_series(0, 299) s(i); +-- lets delete half (even numbered ids) rows to limit the output +DELETE FROM t_ztablesample WHERE id%2 = 0; +-- should return ALL visible tuples from SOME blocks +SELECT ctid,t.id FROM t_ztablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (0); + ctid | id +---------+----- + (1,2) | 129 + (1,4) | 131 + (1,6) | 133 + (1,8) | 135 + (1,10) | 137 + (1,12) | 139 + (1,14) | 141 + (1,16) | 143 + (1,18) | 145 + (1,20) | 147 + (1,22) | 149 + (1,24) | 151 + (1,26) | 153 + (1,28) | 155 + (1,30) | 157 + (1,32) | 159 + (1,34) | 161 + (1,36) | 163 + (1,38) | 165 + (1,40) | 167 + (1,42) | 169 + (1,44) | 171 + (1,46) | 173 + (1,48) | 175 + (1,50) | 177 + (1,52) | 179 + (1,54) | 181 + (1,56) | 183 + (1,58) | 185 + (1,60) | 187 + (1,62) | 189 + (1,64) | 191 + (1,66) | 193 + (1,68) | 195 + (1,70) | 197 + (1,72) | 199 + (1,74) | 201 + (1,76) | 203 + (1,78) | 205 + (1,80) | 207 + (1,82) | 209 + (1,84) | 211 + (1,86) | 213 + (1,88) | 215 + (1,90) | 217 + (1,92) | 219 + (1,94) | 221 + (1,96) | 223 + (1,98) | 225 + (1,100) | 227 + (1,102) | 229 + (1,104) | 231 + (1,106) | 233 + (1,108) | 235 + (1,110) | 237 + (1,112) | 239 + (1,114) | 241 + (1,116) | 243 + (1,118) | 245 + (1,120) | 247 + (1,122) | 249 + (1,124) | 251 + (1,126) | 253 + (1,128) | 255 + (2,2) | 257 + (2,4) | 259 + (2,6) | 261 + (2,8) | 263 + (2,10) | 265 + (2,12) | 267 + (2,14) | 269 + (2,16) | 271 + (2,18) | 273 + (2,20) | 275 + (2,22) | 277 + (2,24) | 279 + (2,26) | 281 + (2,28) | 283 + (2,30) | 285 + (2,32) | 287 + (2,34) | 289 + (2,36) | 291 + (2,38) | 293 + (2,40) | 295 + (2,42) | 297 + (2,44) | 299 +(86 rows) + +-- should return SOME visible tuples but from ALL the blocks +SELECT ctid,id FROM t_ztablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (0); + ctid | id +---------+----- + (0,4) | 3 + (0,6) | 5 + (0,8) | 7 + (0,20) | 19 + (0,30) | 29 + (0,42) | 41 + (0,44) | 43 + (0,48) | 47 + (0,52) | 51 + (0,54) | 53 + (0,56) | 55 + (0,62) | 61 + (0,64) | 63 + (0,66) | 65 + (0,76) | 75 + (0,80) | 79 + (0,82) | 81 + (0,84) | 83 + (0,88) | 87 + (0,90) | 89 + (0,92) | 91 + (0,98) | 97 + (0,106) | 105 + (0,108) | 107 + (0,122) | 121 + (0,126) | 125 + (1,2) | 129 + (1,4) | 131 + (1,6) | 133 + (1,8) | 135 + (1,10) | 137 + (1,12) | 139 + (1,20) | 147 + (1,24) | 151 + (1,26) | 153 + (1,28) | 155 + (1,30) | 157 + (1,32) | 159 + (1,34) | 161 + (1,40) | 167 + (1,44) | 171 + (1,46) | 173 + (1,58) | 185 + (1,66) | 193 + (1,68) | 195 + (1,70) | 197 + (1,78) | 205 + (1,80) | 207 + (1,88) | 215 + (1,92) | 219 + (1,96) | 223 + (1,100) | 227 + (1,102) | 229 + (1,106) | 233 + (1,112) | 239 + (1,116) | 243 + (1,120) | 247 + (1,122) | 249 + (1,126) | 253 + (2,2) | 257 + (2,6) | 261 + (2,8) | 263 + (2,10) | 265 + (2,12) | 267 + (2,16) | 271 + (2,18) | 273 + (2,24) | 279 + (2,26) | 281 + (2,28) | 283 + (2,30) | 285 + (2,34) | 289 + (2,36) | 291 + (2,42) | 297 + (2,44) | 299 +(74 rows) + diff --git a/src/test/regress/output/misc_1.source b/src/test/regress/output/misc_1.source new file mode 100644 index 0000000000..c29c54c414 --- /dev/null +++ b/src/test/regress/output/misc_1.source @@ -0,0 +1,692 @@ +-- +-- MISC +-- +-- +-- BTREE +-- +UPDATE onek + SET unique1 = onek.unique1 + 1; +UPDATE onek + SET unique1 = onek.unique1 - 1; +-- +-- BTREE partial +-- +-- UPDATE onek2 +-- SET unique1 = onek2.unique1 + 1; +--UPDATE onek2 +-- SET unique1 = onek2.unique1 - 1; +-- +-- BTREE shutting out non-functional updates +-- 
+-- the following two tests seem to take a long time on some +-- systems. This non-func update stuff needs to be examined +-- more closely. - jolly (2/22/96) +-- +UPDATE tmp + SET stringu1 = reverse_name(onek.stringu1) + FROM onek + WHERE onek.stringu1 = 'JBAAAA' and + onek.stringu1 = tmp.stringu1; +UPDATE tmp + SET stringu1 = reverse_name(onek2.stringu1) + FROM onek2 + WHERE onek2.stringu1 = 'JCAAAA' and + onek2.stringu1 = tmp.stringu1; +DROP TABLE tmp; +--UPDATE person* +-- SET age = age + 1; +--UPDATE person* +-- SET age = age + 3 +-- WHERE name = 'linda'; +-- +-- copy +-- +COPY onek TO '@abs_builddir@/results/onek.data'; +DELETE FROM onek; +COPY onek FROM '@abs_builddir@/results/onek.data'; +SELECT unique1 FROM onek WHERE unique1 < 2 ORDER BY unique1; + unique1 +--------- + 0 + 1 +(2 rows) + +DELETE FROM onek2; +COPY onek2 FROM '@abs_builddir@/results/onek.data'; +SELECT unique1 FROM onek2 WHERE unique1 < 2 ORDER BY unique1; + unique1 +--------- + 0 + 1 +(2 rows) + +COPY BINARY stud_emp TO '@abs_builddir@/results/stud_emp.data'; +DELETE FROM stud_emp; +COPY BINARY stud_emp FROM '@abs_builddir@/results/stud_emp.data'; +SELECT * FROM stud_emp; + name | age | location | salary | manager | gpa | percent +-------+-----+------------+--------+---------+-----+--------- + jeff | 23 | (8,7.7) | 600 | sharon | 3.5 | + cim | 30 | (10.5,4.7) | 400 | | 3.4 | + linda | 19 | (0.9,6.1) | 100 | | 2.9 | +(3 rows) + +-- COPY aggtest FROM stdin; +-- 56 7.8 +-- 100 99.097 +-- 0 0.09561 +-- 42 324.78 +-- . +-- COPY aggtest TO stdout; +-- +-- inheritance stress test +-- +SELECT * FROM a_star*; + class | a +-------+---- + a | 1 + a | 2 + a | + b | 3 + b | 4 + b | + b | + c | 5 + c | 6 + c | + c | + d | 7 + d | 8 + d | 9 + d | 10 + d | + d | 11 + d | 12 + d | 13 + d | + d | + d | + d | 14 + d | + d | + d | + d | + e | 15 + e | 16 + e | 17 + e | + e | 18 + e | + e | + f | 19 + f | 20 + f | 21 + f | 22 + f | + f | 24 + f | 25 + f | 26 + f | + f | + f | + f | 27 + f | + f | + f | + f | +(50 rows) + +SELECT * + FROM b_star* x + WHERE x.b = text 'bumble' or x.a < 3; + class | a | b +-------+---+-------- + b | | bumble +(1 row) + +SELECT class, a + FROM c_star* x + WHERE x.c ~ text 'hi'; + class | a +-------+---- + c | 5 + c | + d | 7 + d | 8 + d | 10 + d | + d | 12 + d | + d | + d | + e | 15 + e | 16 + e | + e | + f | 19 + f | 20 + f | 21 + f | + f | 24 + f | + f | + f | +(22 rows) + +SELECT class, b, c + FROM d_star* x + WHERE x.a < 100; + class | b | c +-------+---------+------------ + d | grumble | hi sunita + d | stumble | hi koko + d | rumble | + d | | hi kristin + d | fumble | + d | | hi avi + d | | + d | | +(8 rows) + +SELECT class, c FROM e_star* x WHERE x.c NOTNULL; + class | c +-------+------------- + e | hi carol + e | hi bob + e | hi michelle + e | hi elisa + f | hi claire + f | hi mike + f | hi marcel + f | hi keith + f | hi marc + f | hi allison + f | hi jeff + f | hi carl +(12 rows) + +SELECT * FROM f_star* x WHERE x.c ISNULL; + class | a | c | e | f +-------+----+---+-----+------------------------------------------- + f | 22 | | -7 | ((111,555),(222,666),(333,777),(444,888)) + f | 25 | | -9 | + f | 26 | | | ((11111,33333),(22222,44444)) + f | | | -11 | ((1111111,3333333),(2222222,4444444)) + f | 27 | | | + f | | | -12 | + f | | | | ((11111111,33333333),(22222222,44444444)) + f | | | | +(8 rows) + +-- grouping and aggregation on inherited sets have been busted in the past... 
+SELECT sum(a) FROM a_star*; + sum +----- + 355 +(1 row) + +SELECT class, sum(a) FROM a_star* GROUP BY class ORDER BY class; + class | sum +-------+----- + a | 3 + b | 7 + c | 11 + d | 84 + e | 66 + f | 184 +(6 rows) + +ALTER TABLE f_star RENAME COLUMN f TO ff; +ALTER TABLE e_star* RENAME COLUMN e TO ee; +ALTER TABLE d_star* RENAME COLUMN d TO dd; +ALTER TABLE c_star* RENAME COLUMN c TO cc; +ALTER TABLE b_star* RENAME COLUMN b TO bb; +ALTER TABLE a_star* RENAME COLUMN a TO aa; +SELECT class, aa + FROM a_star* x + WHERE aa ISNULL; + class | aa +-------+---- + a | + b | + b | + c | + c | + d | + d | + d | + d | + d | + d | + d | + d | + e | + e | + e | + f | + f | + f | + f | + f | + f | + f | + f | +(24 rows) + +-- As of Postgres 7.1, ALTER implicitly recurses, +-- so this should be same as ALTER a_star* +ALTER TABLE a_star RENAME COLUMN aa TO foo; +SELECT class, foo + FROM a_star* x + WHERE x.foo >= 2; + class | foo +-------+----- + a | 2 + b | 3 + b | 4 + c | 5 + c | 6 + d | 7 + d | 8 + d | 9 + d | 10 + d | 11 + d | 12 + d | 13 + d | 14 + e | 15 + e | 16 + e | 17 + e | 18 + f | 19 + f | 20 + f | 21 + f | 22 + f | 24 + f | 25 + f | 26 + f | 27 +(25 rows) + +ALTER TABLE a_star RENAME COLUMN foo TO aa; +SELECT * + from a_star* + WHERE aa < 1000; + class | aa +-------+---- + a | 1 + a | 2 + b | 3 + b | 4 + c | 5 + c | 6 + d | 7 + d | 8 + d | 9 + d | 10 + d | 11 + d | 12 + d | 13 + d | 14 + e | 15 + e | 16 + e | 17 + e | 18 + f | 19 + f | 20 + f | 21 + f | 22 + f | 24 + f | 25 + f | 26 + f | 27 +(26 rows) + +ALTER TABLE f_star ADD COLUMN f int4; +UPDATE f_star SET f = 10; +ALTER TABLE e_star* ADD COLUMN e int4; +--UPDATE e_star* SET e = 42; +SELECT * FROM e_star*; + class | aa | cc | ee | e +-------+----+-------------+-----+--- + e | 15 | hi carol | -1 | + e | 16 | hi bob | | + e | 17 | | -2 | + e | | hi michelle | -3 | + e | 18 | | | + e | | hi elisa | | + e | | | -4 | + f | 19 | hi claire | -5 | + f | 20 | hi mike | -6 | + f | 21 | hi marcel | | + f | 22 | | -7 | + f | | hi keith | -8 | + f | 24 | hi marc | | + f | 25 | | -9 | + f | 26 | | | + f | | hi allison | -10 | + f | | hi jeff | | + f | | | -11 | + f | 27 | | | + f | | hi carl | | + f | | | -12 | + f | | | | + f | | | | +(23 rows) + +ALTER TABLE a_star* ADD COLUMN a text; +NOTICE: merging definition of column "a" for child "d_star" +-- That ALTER TABLE should have added TOAST tables. +SELECT relname, reltoastrelid <> 0 AS has_toast_table + FROM pg_class + WHERE oid::regclass IN ('a_star', 'c_star') + ORDER BY 1; + relname | has_toast_table +---------+----------------- + a_star | f + c_star | f +(2 rows) + +--UPDATE b_star* +-- SET a = text 'gazpacho' +-- WHERE aa > 4; +SELECT class, aa, a FROM a_star*; + class | aa | a +-------+----+--- + a | 1 | + a | 2 | + a | | + b | 3 | + b | 4 | + b | | + b | | + c | 5 | + c | 6 | + c | | + c | | + d | 7 | + d | 8 | + d | 9 | + d | 10 | + d | | + d | 11 | + d | 12 | + d | 13 | + d | | + d | | + d | | + d | 14 | + d | | + d | | + d | | + d | | + e | 15 | + e | 16 | + e | 17 | + e | | + e | 18 | + e | | + e | | + f | 19 | + f | 20 | + f | 21 | + f | 22 | + f | | + f | 24 | + f | 25 | + f | 26 | + f | | + f | | + f | | + f | 27 | + f | | + f | | + f | | + f | | +(50 rows) + +-- +-- versions +-- +-- +-- postquel functions +-- +-- +-- mike does post_hacking, +-- joe and sally play basketball, and +-- everyone else does nothing. 
+-- +SELECT p.name, name(p.hobbies) FROM ONLY person p; + name | name +-------+------------- + mike | posthacking + joe | basketball + sally | basketball +(3 rows) + +-- +-- as above, but jeff also does post_hacking. +-- +SELECT p.name, name(p.hobbies) FROM person* p; + name | name +-------+------------- + mike | posthacking + joe | basketball + sally | basketball + jeff | posthacking +(4 rows) + +-- +-- the next two queries demonstrate how functions generate bogus duplicates. +-- this is a "feature" .. +-- +SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r + ORDER BY 1,2; + name | name +-------------+--------------- + basketball | hightops + posthacking | advil + posthacking | peet's coffee + skywalking | guts +(4 rows) + +SELECT hobbies_r.name, (hobbies_r.equipment).name FROM hobbies_r; + name | name +-------------+--------------- + posthacking | advil + posthacking | peet's coffee + posthacking | advil + posthacking | peet's coffee + basketball | hightops + basketball | hightops + skywalking | guts +(7 rows) + +-- +-- mike needs advil and peet's coffee, +-- joe and sally need hightops, and +-- everyone else is fine. +-- +SELECT p.name, name(p.hobbies), name(equipment(p.hobbies)) FROM ONLY person p; + name | name | name +-------+-------------+--------------- + mike | posthacking | advil + mike | posthacking | peet's coffee + joe | basketball | hightops + sally | basketball | hightops +(4 rows) + +-- +-- as above, but jeff needs advil and peet's coffee as well. +-- +SELECT p.name, name(p.hobbies), name(equipment(p.hobbies)) FROM person* p; + name | name | name +-------+-------------+--------------- + mike | posthacking | advil + mike | posthacking | peet's coffee + joe | basketball | hightops + sally | basketball | hightops + jeff | posthacking | advil + jeff | posthacking | peet's coffee +(6 rows) + +-- +-- just like the last two, but make sure that the target list fixup and +-- unflattening is being done correctly. 
+-- +SELECT name(equipment(p.hobbies)), p.name, name(p.hobbies) FROM ONLY person p; + name | name | name +---------------+-------+------------- + advil | mike | posthacking + peet's coffee | mike | posthacking + hightops | joe | basketball + hightops | sally | basketball +(4 rows) + +SELECT (p.hobbies).equipment.name, p.name, name(p.hobbies) FROM person* p; + name | name | name +---------------+-------+------------- + advil | mike | posthacking + peet's coffee | mike | posthacking + hightops | joe | basketball + hightops | sally | basketball + advil | jeff | posthacking + peet's coffee | jeff | posthacking +(6 rows) + +SELECT (p.hobbies).equipment.name, name(p.hobbies), p.name FROM ONLY person p; + name | name | name +---------------+-------------+------- + advil | posthacking | mike + peet's coffee | posthacking | mike + hightops | basketball | joe + hightops | basketball | sally +(4 rows) + +SELECT name(equipment(p.hobbies)), name(p.hobbies), p.name FROM person* p; + name | name | name +---------------+-------------+------- + advil | posthacking | mike + peet's coffee | posthacking | mike + hightops | basketball | joe + hightops | basketball | sally + advil | posthacking | jeff + peet's coffee | posthacking | jeff +(6 rows) + +SELECT name(equipment(hobby_construct(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_1a(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_1b(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_1c(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_2a(text 'skywalking')); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_2b(text 'skywalking')); + name +--------------- + advil + peet's coffee + hightops + guts +(4 rows) + +SELECT hobbies_by_name('basketball'); + hobbies_by_name +----------------- + joe +(1 row) + +SELECT name, overpaid(emp.*) FROM emp; + name | overpaid +--------+---------- + sharon | t + sam | t + bill | t + jeff | f + cim | f + linda | f +(6 rows) + +-- +-- Try a few cases with SQL-spec row constructor expressions +-- +SELECT * FROM equipment(ROW('skywalking', 'mer')); + name | hobby +------+------------ + guts | skywalking +(1 row) + +SELECT name(equipment(ROW('skywalking', 'mer'))); + name +------ + guts +(1 row) + +SELECT *, name(equipment(h.*)) FROM hobbies_r h; + name | person | name +-------------+--------+--------------- + posthacking | mike | advil + posthacking | mike | peet's coffee + posthacking | jeff | advil + posthacking | jeff | peet's coffee + basketball | joe | hightops + basketball | sally | hightops + skywalking | | guts +(7 rows) + +SELECT *, (equipment(CAST((h.*) AS hobbies_r))).name FROM hobbies_r h; + name | person | name +-------------+--------+--------------- + posthacking | mike | advil + posthacking | mike | peet's coffee + posthacking | jeff | advil + posthacking | jeff | peet's coffee + basketball | joe | hightops + basketball | sally | hightops + skywalking | | guts +(7 rows) + +-- +-- functional joins +-- +-- +-- instance rules +-- +-- +-- rewrite rules +-- diff --git 
a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index f23fe8d870..aad070d48e 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,7 +78,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview # ---------- # Another group of parallel tests # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan +test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan zedstore # rules cannot run concurrently with any test that creates # a view or rule in the public schema diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index ca200eb599..5ad9d90b58 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -134,6 +134,7 @@ test: misc_functions test: sysviews test: tsrf test: tidscan +test: zedstore test: rules test: psql test: psql_crosstab diff --git a/src/test/regress/sql/zedstore.sql b/src/test/regress/sql/zedstore.sql new file mode 100644 index 0000000000..d987e70c4f --- /dev/null +++ b/src/test/regress/sql/zedstore.sql @@ -0,0 +1,176 @@ +-- simple tests to iteratively build the zedstore +-- create and drop works +create table t_zedstore(c1 int, c2 int, c3 int) USING zedstore; +drop table t_zedstore; +-- insert and select works +create table t_zedstore(c1 int, c2 int, c3 int) USING zedstore; +insert into t_zedstore select i,i+1,i+2 from generate_series(1, 10)i; +select * from t_zedstore; +-- selecting only few columns work +select c1, c3 from t_zedstore; +-- only few columns in output and where clause work +select c3 from t_zedstore where c2 > 5; + +-- Test abort works +begin; +insert into t_zedstore select i,i+1,i+2 from generate_series(21, 25)i; +abort; +insert into t_zedstore select i,i+1,i+2 from generate_series(31, 35)i; +select * from t_zedstore; + +-- +-- Test indexing +-- +create index on t_zedstore (c1); +set enable_seqscan=off; +set enable_indexscan=on; +set enable_bitmapscan=off; + +-- index scan +select * from t_zedstore where c1 = 5; + +-- index-only scan +select c1 from t_zedstore where c1 = 5; + +-- bitmap scan +set enable_indexscan=off; +set enable_bitmapscan=on; +select c1, c2 from t_zedstore where c1 between 5 and 10; + +-- +-- Test DELETE and UPDATE +-- +delete from t_zedstore where c2 = 5; +select * from t_zedstore; +delete from t_zedstore where c2 < 5; +select * from t_zedstore; + +update t_zedstore set c2 = 100 where c1 = 8; +select * from t_zedstore; + +-- +-- Test page deletion, by deleting a bigger range of values +-- +insert into t_zedstore select i,i+1,i+2 from generate_series(10000, 15000)i; +delete from t_zedstore where c1 >= 10000; + +-- +-- Test VACUUM +-- +vacuum t_zedstore; +select * from t_zedstore; + +-- +-- Test toasting +-- +create table t_zedtoast(c1 int, t text) USING zedstore; +insert into t_zedtoast select i, repeat('x', 10000) from generate_series(1, 10) i; + +select c1, length(t) from t_zedtoast; + +-- +-- Test NULL values +-- +create table t_zednullvalues(c1 int, c2 int) USING zedstore; +insert into t_zednullvalues values(1, NULL), (NULL, 2); +select * from t_zednullvalues; +select c2 from t_zednullvalues; +update t_zednullvalues set c1 = 1, c2 = NULL; +select * from t_zednullvalues; + +-- +-- Test COPY +-- +create table t_zedcopy(a serial, b int, c text not null default 'stuff', d text,e text) USING zedstore; + +COPY t_zedcopy (a, b, c, d, e) from stdin; +9999 \N \\N \NN \N +10000 21 31 
41 51 +\. + +COPY t_zedcopy (b, d) from stdin; +1 test_1 +\. + +COPY t_zedcopy (b, d) from stdin; +2 test_2 +3 test_3 +4 test_4 +5 test_5 +\. + +COPY t_zedcopy (a, b, c, d, e) from stdin; +10001 22 32 42 52 +10002 23 33 43 53 +10003 24 34 44 54 +10004 25 35 45 55 +10005 26 36 46 56 +\. + +select * from t_zedcopy; +COPY t_zedcopy (a, d, e) to stdout; + +-- +-- Also test delete and update on the table that was populated with COPY. +-- This exercises splitting the array item. (A table not populated with +-- COPY only contains single items, at the moment.) +-- + +delete from t_zedcopy where b = 4; +select * from t_zedcopy; +delete from t_zedcopy where b < 3; +select * from t_zedcopy; + +update t_zedcopy set b = 100 where b = 5; +select * from t_zedcopy; + +-- +-- Test zero column table +-- +create table t_zwithzerocols() using zedstore; +insert into t_zwithzerocols select t.* from t_zwithzerocols t right join generate_series(1,1) on true; +select count(*) from t_zwithzerocols; + +-- Test for alter table add column +create table t_zaddcol(a int) using zedstore; +insert into t_zaddcol select * from generate_series(1, 3); +-- rewrite case +alter table t_zaddcol add column b int generated always as (a + 1) stored; +select * from t_zaddcol; +-- test alter table add column with no default +create table t_zaddcol_simple(a int) using zedstore; +insert into t_zaddcol_simple values (1); +alter table t_zaddcol_simple add b int; +select * from t_zaddcol_simple; +insert into t_zaddcol_simple values(2,3); +select * from t_zaddcol_simple; +-- fixed length default value stored in catalog +alter table t_zaddcol add column c int default 3; +select * from t_zaddcol; +-- variable length default value stored in catalog +alter table t_zaddcol add column d text default 'abcdefgh'; +select d from t_zaddcol; +-- insert after add column +insert into t_zaddcol values (2); +select * from t_zaddcol; +insert into t_zaddcol (a, c, d) values (3,5, 'test_insert'); +select b,c,d from t_zaddcol; + +-- +-- Test TABLESAMPLE +-- +-- regular test tablesample.sql doesn't directly work for zedstore as +-- its using fillfactor to create specific block layout for +-- heap. Hence, output differs between heap and zedstore table while +-- sampling. We need to use many tuples here to have multiple logical +-- blocks as don't have way to force TIDs spread / jump for zedstore. +-- +CREATE TABLE t_ztablesample (id int, name text) using zedstore; +INSERT INTO t_ztablesample + SELECT i, repeat(i::text, 2) FROM generate_series(0, 299) s(i); +-- lets delete half (even numbered ids) rows to limit the output +DELETE FROM t_ztablesample WHERE id%2 = 0; +-- should return ALL visible tuples from SOME blocks +SELECT ctid,t.id FROM t_ztablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (0); +-- should return SOME visible tuples but from ALL the blocks +SELECT ctid,id FROM t_ztablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (0); diff --git a/src/test/storageperf/driver.sql b/src/test/storageperf/driver.sql new file mode 100644 index 0000000000..73981d1c94 --- /dev/null +++ b/src/test/storageperf/driver.sql @@ -0,0 +1,36 @@ +-- +-- Main script, to run all the tests, and print the results. +-- +-- + +-- First run the tests using heap. 
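+-- (The script is assumed to be run with psql from this directory, for
+-- example "psql -f driver.sql", so that the \i invocations below can find
+-- tests.sql; the schemas are dropped and recreated so that repeated runs
+-- start from scratch.)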
+DROP SCHEMA IF EXISTS storagetest_heap CASCADE; +CREATE SCHEMA storagetest_heap; +SET search_path='storagetest_heap'; + +CREATE TABLE results (testname text, val numeric) USING heap; + +SET default_table_access_method=heap; +\i tests.sql + + +-- Repeat with zedstore + +DROP SCHEMA IF EXISTS storagetest_zedstore CASCADE; +CREATE SCHEMA storagetest_zedstore; +SET search_path='storagetest_zedstore'; + +CREATE TABLE results (testname text, val numeric) USING heap; + +SET default_table_access_method=zedstore; +\i tests.sql + + +SET search_path='public'; + +SELECT COALESCE(h.testname, zs.testname) as testname, + h.val as heap, + zs.val as zedstore, + round(zs.val / h.val, 2) as "heap / zedstore" +FROM storagetest_heap.results h +FULL OUTER JOIN storagetest_zedstore.results zs ON (h.testname = zs.testname); diff --git a/src/test/storageperf/sql/onecol.sql b/src/test/storageperf/sql/onecol.sql new file mode 100644 index 0000000000..5cf18158c9 --- /dev/null +++ b/src/test/storageperf/sql/onecol.sql @@ -0,0 +1,38 @@ +-- Tests with a narrow, single-column table. + +CREATE UNLOGGED TABLE onecol (i int4); + +-- Populate the table with a bunch of INSERT ... SELECT statements. +-- Measure how long it takes, and the resulting table size. +select extract(epoch from now()) as before +\gset + +INSERT INTO onecol SELECT generate_series(1, 100000); +INSERT INTO onecol SELECT generate_series(1, 100000); +INSERT INTO onecol SELECT generate_series(1, 100000); +INSERT INTO onecol SELECT generate_series(1, 100000); +INSERT INTO onecol SELECT generate_series(1, 100000); + +select extract(epoch from now()) as after +\gset + +INSERT INTO results (testname, val) VALUES ('onecol, insert-select, size', pg_total_relation_size('onecol')); +INSERT INTO results (testname, val) VALUES ('onecol, insert-select, time', :after - :before); + +COPY onecol TO '/tmp/onecol.data'; -- dump the data, for COPY test below. + +-- +-- Truncate and populate it again with the same data, but this time using COPY. +-- +TRUNCATE onecol; + +select extract(epoch from now()) as before +\gset + +COPY onecol FROM '/tmp/onecol.data'; + +select extract(epoch from now()) as after +\gset + +INSERT INTO results (testname, val) VALUES ('onecol, COPY, size', pg_total_relation_size('onecol')); +INSERT INTO results (testname, val) VALUES ('onecol, COPY, time', :after - :before); diff --git a/src/test/storageperf/tests.sql b/src/test/storageperf/tests.sql new file mode 100644 index 0000000000..d1f25ed029 --- /dev/null +++ b/src/test/storageperf/tests.sql @@ -0,0 +1,3 @@ +-- Test "schedule". List all the tests you want to run here. + +\i sql/onecol.sql base-commit: db6e2b4c52ade524f3db419d75084728e96e1f9c -- 2.19.1