From b9a152c56d844f0543c2e5c1c09e7b7de4e20bc2 Mon Sep 17 00:00:00 2001 From: Ashwin Agrawal Date: Wed, 22 May 2019 16:32:57 -0700 Subject: [PATCH v2] Zedstore compressed in-core columnar storage. --- configure | 118 + configure.in | 19 + src/backend/access/Makefile | 2 +- src/backend/access/gin/ginbtree.c | 2 +- src/backend/access/gin/ginfast.c | 2 +- src/backend/access/gin/gininsert.c | 4 +- src/backend/access/gist/gist.c | 2 +- src/backend/access/hash/hashinsert.c | 2 +- src/backend/access/heap/heapam.c | 20 +- src/backend/access/heap/heapam_handler.c | 5 +- src/backend/access/index/indexam.c | 4 +- src/backend/access/nbtree/nbtinsert.c | 4 +- src/backend/access/zedstore/Makefile | 21 + src/backend/access/zedstore/README | 295 ++ .../access/zedstore/zedstore_attpage.c | 1589 +++++++ src/backend/access/zedstore/zedstore_btree.c | 632 +++ .../access/zedstore/zedstore_compression.c | 364 ++ .../access/zedstore/zedstore_freepagemap.c | 1076 +++++ .../access/zedstore/zedstore_inspect.c | 448 ++ src/backend/access/zedstore/zedstore_meta.c | 216 + .../access/zedstore/zedstore_tidpage.c | 1774 ++++++++ src/backend/access/zedstore/zedstore_toast.c | 192 + .../access/zedstore/zedstore_tupslot.c | 348 ++ src/backend/access/zedstore/zedstore_undo.c | 918 ++++ src/backend/access/zedstore/zedstore_utils.c | 76 + .../access/zedstore/zedstore_visibility.c | 728 +++ .../access/zedstore/zedstoream_handler.c | 3163 +++++++++++++ src/backend/commands/analyze.c | 7 +- src/backend/commands/copy.c | 22 +- src/backend/commands/tablecmds.c | 14 +- src/backend/commands/trigger.c | 8 + src/backend/executor/execScan.c | 90 + src/backend/executor/nodeIndexonlyscan.c | 16 +- src/backend/executor/nodeIndexscan.c | 20 +- src/backend/executor/nodeSeqscan.c | 18 +- src/backend/optimizer/plan/createplan.c | 3 + src/backend/optimizer/util/plancat.c | 2 + src/backend/partitioning/partbounds.c | 15 +- src/backend/storage/lmgr/predicate.c | 45 +- src/include/access/tableam.h | 41 + src/include/access/zedstore_compression.h | 51 + src/include/access/zedstore_internal.h | 618 +++ src/include/access/zedstore_undo.h | 171 + src/include/catalog/pg_am.dat | 3 + src/include/catalog/pg_proc.dat | 24 + src/include/executor/executor.h | 3 +- src/include/nodes/execnodes.h | 1 + src/include/nodes/pathnodes.h | 1 + src/include/pg_config.h.in | 9 + src/include/storage/predicate.h | 9 +- .../isolation/specs/read-only-anomaly-2.spec | 6 +- src/test/regress/expected/.gitignore | 1 + src/test/regress/expected/alter_table_1.out | 3997 +++++++++++++++++ src/test/regress/expected/cluster_1.out | 475 ++ src/test/regress/expected/create_am.out | 11 +- src/test/regress/expected/fsm_1.out | 73 + src/test/regress/expected/rangefuncs_1.out | 2100 +++++++++ src/test/regress/expected/reloptions_1.out | 219 + src/test/regress/expected/strings_1.out | 1823 ++++++++ src/test/regress/expected/tsrf_1.out | 712 +++ src/test/regress/expected/zedstore.out | 599 +++ src/test/regress/output/misc_1.source | 692 +++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/zedstore.sql | 176 + src/test/storageperf/driver.sql | 36 + src/test/storageperf/sql/onecol.sql | 38 + src/test/storageperf/tests.sql | 3 + 68 files changed, 24118 insertions(+), 61 deletions(-) create mode 100644 src/backend/access/zedstore/Makefile create mode 100644 src/backend/access/zedstore/README create mode 100644 src/backend/access/zedstore/zedstore_attpage.c create mode 100644 src/backend/access/zedstore/zedstore_btree.c create mode 100644 
src/backend/access/zedstore/zedstore_compression.c create mode 100644 src/backend/access/zedstore/zedstore_freepagemap.c create mode 100644 src/backend/access/zedstore/zedstore_inspect.c create mode 100644 src/backend/access/zedstore/zedstore_meta.c create mode 100644 src/backend/access/zedstore/zedstore_tidpage.c create mode 100644 src/backend/access/zedstore/zedstore_toast.c create mode 100644 src/backend/access/zedstore/zedstore_tupslot.c create mode 100644 src/backend/access/zedstore/zedstore_undo.c create mode 100644 src/backend/access/zedstore/zedstore_utils.c create mode 100644 src/backend/access/zedstore/zedstore_visibility.c create mode 100644 src/backend/access/zedstore/zedstoream_handler.c create mode 100644 src/include/access/zedstore_compression.h create mode 100644 src/include/access/zedstore_internal.h create mode 100644 src/include/access/zedstore_undo.h create mode 100644 src/test/regress/expected/alter_table_1.out create mode 100644 src/test/regress/expected/cluster_1.out create mode 100644 src/test/regress/expected/fsm_1.out create mode 100644 src/test/regress/expected/rangefuncs_1.out create mode 100644 src/test/regress/expected/reloptions_1.out create mode 100644 src/test/regress/expected/strings_1.out create mode 100644 src/test/regress/expected/tsrf_1.out create mode 100644 src/test/regress/expected/zedstore.out create mode 100644 src/test/regress/output/misc_1.source create mode 100644 src/test/regress/sql/zedstore.sql create mode 100644 src/test/storageperf/driver.sql create mode 100644 src/test/storageperf/sql/onecol.sql create mode 100644 src/test/storageperf/tests.sql diff --git a/configure b/configure index fd61bf6472..59a8a8080d 100755 --- a/configure +++ b/configure @@ -700,6 +700,7 @@ LDFLAGS_EX ELF_SYS EGREP GREP +with_lz4 with_zlib with_system_tzdata with_libxslt @@ -864,6 +865,7 @@ with_libxml with_libxslt with_system_tzdata with_zlib +with_lz4 with_gnu_ld enable_largefile enable_float4_byval @@ -1570,6 +1572,7 @@ Optional Packages: --with-system-tzdata=DIR use system time zone data in DIR --without-zlib do not use Zlib + --with-lz4 build with LZ4 support --with-gnu-ld assume the C compiler uses GNU ld [default=no] Some influential environment variables: @@ -8306,6 +8309,41 @@ fi +# +# LZ4 +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with LZ4 support" >&5 +$as_echo_n "checking whether to build with LZ4 support... " >&6; } + + + +# Check whether --with-lz4 was given. +if test "${with_lz4+set}" = set; then : + withval=$with_lz4; + case $withval in + yes) + +$as_echo "#define USE_LZ4 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-lz4 option" "$LINENO" 5 + ;; + esac + +else + with_lz4=no + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_lz4" >&5 +$as_echo "$with_lz4" >&6; } + + # # Elf # @@ -11828,6 +11866,56 @@ fi fi +if test "$with_lz4" = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5 +$as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; } +if ${ac_cv_lib_lz4_LZ4_compress_default+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-llz4 $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char LZ4_compress_default (); +int +main () +{ +return LZ4_compress_default (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_lz4_LZ4_compress_default=yes +else + ac_cv_lib_lz4_LZ4_compress_default=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lz4_LZ4_compress_default" >&5 +$as_echo "$ac_cv_lib_lz4_LZ4_compress_default" >&6; } +if test "x$ac_cv_lib_lz4_LZ4_compress_default" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBLZ4 1 +_ACEOF + + LIBS="-llz4 $LIBS" + +else + as_fn_error $? "library 'lz4' is required for LZ4 support" "$LINENO" 5 +fi + +fi + if test "$enable_spinlocks" = yes; then $as_echo "#define HAVE_SPINLOCKS 1" >>confdefs.h @@ -13027,6 +13115,36 @@ Use --without-zlib to disable zlib support." "$LINENO" 5 fi +fi + +if test "$with_lz4" = yes; then + for ac_header in lz4.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "lz4.h" "ac_cv_header_lz4_h" "$ac_includes_default" +if test "x$ac_cv_header_lz4_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LZ4_H 1 +_ACEOF + +else + for ac_header in lz4.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "lz4.h" "ac_cv_header_lz4_h" "$ac_includes_default" +if test "x$ac_cv_header_lz4_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LZ4_H 1 +_ACEOF + +else + as_fn_error $? "lz4.h header file is required for LZ4" "$LINENO" 5 +fi + +done + +fi + +done + fi if test "$with_gssapi" = yes ; then diff --git a/configure.in b/configure.in index 4586a1716c..183fad3462 100644 --- a/configure.in +++ b/configure.in @@ -964,6 +964,16 @@ PGAC_ARG_BOOL(with, zlib, yes, [do not use Zlib]) AC_SUBST(with_zlib) +# +# LZ4 +# +AC_MSG_CHECKING([whether to build with LZ4 support]) +PGAC_ARG_BOOL(with, lz4, no, + [build with LZ4 support], + [AC_DEFINE([USE_LZ4], 1, [Define to 1 to build with LZ4 support. (--with-lz4)])]) +AC_MSG_RESULT([$with_lz4]) +AC_SUBST(with_lz4) + # # Elf # @@ -1174,6 +1184,10 @@ failure. It is possible the compiler isn't looking in the proper directory. Use --without-zlib to disable zlib support.])]) fi +if test "$with_lz4" = yes; then + AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])]) +fi + if test "$enable_spinlocks" = yes; then AC_DEFINE(HAVE_SPINLOCKS, 1, [Define to 1 if you have spinlocks.]) else @@ -1387,6 +1401,11 @@ failure. It is possible the compiler isn't looking in the proper directory. Use --without-zlib to disable zlib support.])]) fi +if test "$with_lz4" = yes; then + AC_CHECK_HEADERS(lz4.h, [], + [AC_CHECK_HEADERS(lz4.h, [], [AC_MSG_ERROR([lz4.h header file is required for LZ4])])]) +fi + if test "$with_gssapi" = yes ; then AC_CHECK_HEADERS(gssapi/gssapi.h, [], [AC_CHECK_HEADERS(gssapi.h, [], [AC_MSG_ERROR([gssapi.h header file is required for GSSAPI])])]) diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 0880e0a8bb..6d36f3bd26 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -9,6 +9,6 @@ top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ - table tablesample transam + table tablesample transam zedstore include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 11a8ed7bbc..e795375495 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -89,7 +89,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode, stack->predictNumber = 1; if (rootConflictCheck) - CheckForSerializableConflictIn(btree->index, NULL, stack->buffer); + CheckForSerializableConflictIn(btree->index, NULL, btree->rootBlkno); for (;;) { diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index 2b3dd1c677..f8ffeb06f8 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -246,7 +246,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) * tree, so it conflicts with all serializable scans. All scans acquire a * predicate lock on the metabuffer to represent that. */ - CheckForSerializableConflictIn(index, NULL, metabuffer); + CheckForSerializableConflictIn(index, NULL, GIN_METAPAGE_BLKNO); if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) { diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 55eab14617..046a20a3d4 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -221,7 +221,7 @@ ginEntryInsert(GinState *ginstate, return; } - CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer); + CheckForSerializableConflictIn(ginstate->index, NULL, BufferGetBlockNumber(stack->buffer)); /* modify an existing leaf entry */ itup = addItemPointersToLeafTuple(ginstate, itup, items, nitem, buildStats, stack->buffer); @@ -230,7 +230,7 @@ ginEntryInsert(GinState *ginstate, } else { - CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer); + CheckForSerializableConflictIn(ginstate->index, NULL, BufferGetBlockNumber(stack->buffer)); /* no match, so construct a new leaf entry */ itup = buildFreshLeafTuple(ginstate, attnum, key, category, items, nitem, buildStats, stack->buffer); diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 45c00aaa87..4f150b02cb 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -1273,7 +1273,7 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, * Check for any rw conflicts (in serializable isolation level) just * before we intend to modify the page */ - CheckForSerializableConflictIn(state->r, NULL, stack->buffer); + CheckForSerializableConflictIn(state->r, NULL, BufferGetBlockNumber(stack->buffer)); /* Insert the tuple(s) to the page, splitting the page if necessary */ is_split = gistplacetopage(state->r, state->freespace, giststate, diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 5321762d5e..e3fb47f9e3 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -88,7 +88,7 @@ restart_insert: &usedmetap); Assert(usedmetap != NULL); - CheckForSerializableConflictIn(rel, NULL, buf); + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(buf)); /* remember the primary bucket buffer to release the pin on it at end. 
*/ bucket_buf = buf; diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 6c342635e8..b09263364e 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -446,7 +446,7 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) else valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, + heap_CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, &loctup, buffer, snapshot); if (valid) @@ -668,7 +668,7 @@ heapgettup(HeapScanDesc scan, snapshot, scan->rs_cbuf); - CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, + heap_CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, tuple, scan->rs_cbuf, snapshot); @@ -1488,7 +1488,7 @@ heap_fetch(Relation relation, if (valid) PredicateLockTuple(relation, tuple, snapshot); - CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); + heap_CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); @@ -1622,7 +1622,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, /* If it's visible per the snapshot, we must return it */ valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer); - CheckForSerializableConflictOut(valid, relation, heapTuple, + heap_CheckForSerializableConflictOut(valid, relation, heapTuple, buffer, snapshot); /* reset to original, non-redirected, tid */ heapTuple->t_self = *tid; @@ -1764,7 +1764,7 @@ heap_get_latest_tid(TableScanDesc sscan, * candidate. */ valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer); - CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); + heap_CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); if (valid) *tid = ctid; @@ -1919,7 +1919,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * lock "gaps" as index page locks do. So we don't need to specify a * buffer when making the call, which makes for a faster check. */ - CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -2173,7 +2173,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, * lock "gaps" as index page locks do. So we don't need to specify a * buffer when making the call, which makes for a faster check. */ - CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); ndone = 0; while (ndone < ntuples) @@ -2364,7 +2364,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, * lock "gaps" as index page locks do. So we don't need to specify a * buffer when making the call. */ - CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); /* * If tuples are cachable, mark them for invalidation from the caches in @@ -2673,7 +2673,7 @@ l1: * being visible to the scan (i.e., an exclusive buffer content lock is * continuously held from this point until the tuple delete is visible). 
*/ - CheckForSerializableConflictIn(relation, &tp, buffer); + CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer)); /* replace cid with a combo cid if necessary */ HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); @@ -3583,7 +3583,7 @@ l2: * will include checking the relation level, there is no benefit to a * separate check for the new tuple. */ - CheckForSerializableConflictIn(relation, &oldtup, buffer); + CheckForSerializableConflictIn(relation, otid, BufferGetBlockNumber(buffer)); /* * At this point newbuf and buffer are both pinned and locked, and newbuf diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index a4a28e88ec..b1643790b9 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2275,7 +2275,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, hscan->rs_vistuples[ntup++] = offnum; PredicateLockTuple(scan->rs_rd, &loctup, snapshot); } - CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, + heap_CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer, snapshot); } } @@ -2463,7 +2463,7 @@ heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, /* in pagemode, heapgetpage did this for us */ if (!pagemode) - CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, + heap_CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, hscan->rs_cbuf, scan->rs_snapshot); /* Try next tuple from same page. */ @@ -2602,6 +2602,7 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, static const TableAmRoutine heapam_methods = { .type = T_TableAmRoutine, + .scans_leverage_column_projection = false, .slot_callbacks = heapam_slot_callbacks, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index aefdd2916d..61ed3167fe 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -180,8 +180,8 @@ index_insert(Relation indexRelation, if (!(indexRelation->rd_indam->ampredlocks)) CheckForSerializableConflictIn(indexRelation, - (HeapTuple) NULL, - InvalidBuffer); + (ItemPointer) NULL, + InvalidBlockNumber); return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, heap_t_ctid, heapRelation, diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 2eccc99023..b905cb1986 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -290,7 +290,7 @@ top: * checkingunique and !heapkeyspace cases, but it's okay to use the * first page the value could be on (with scantid omitted) instead. */ - CheckForSerializableConflictIn(rel, NULL, insertstate.buf); + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf)); /* * Do the insertion. Note that insertstate contains cached binary @@ -533,7 +533,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * otherwise be masked by this unique constraint * violation. */ - CheckForSerializableConflictIn(rel, NULL, insertstate->buf); + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf)); /* * This is a definite conflict. 
Break the tuple down into
diff --git a/src/backend/access/zedstore/Makefile b/src/backend/access/zedstore/Makefile
new file mode 100644
index 0000000000..ae5b939026
--- /dev/null
+++ b/src/backend/access/zedstore/Makefile
@@ -0,0 +1,21 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for access/zedstore
+#
+# IDENTIFICATION
+#    src/backend/access/zedstore/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/zedstore
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = zedstore_btree.o zedstore_tidpage.o zedstore_attpage.o \
+       zedstore_compression.o zedstoream_handler.o \
+       zedstore_meta.o zedstore_undo.o zedstore_toast.o zedstore_visibility.o \
+       zedstore_inspect.o zedstore_freepagemap.o zedstore_utils.o \
+       zedstore_tupslot.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/zedstore/README b/src/backend/access/zedstore/README
new file mode 100644
index 0000000000..85493caf03
--- /dev/null
+++ b/src/backend/access/zedstore/README
@@ -0,0 +1,295 @@
+
+src/backend/access/zedstore/README
+
+ZedStore - compressed column (and row) store for PostgreSQL
+===========================================================
+
+The purpose of this README is to provide an overview of zedstore's
+design, the major requirements/objectives it intends to fulfill, and
+high-level implementation details.
+
+Objectives
+----------
+
+* Performance improvement for queries selecting a subset of columns
+(reduced IO).
+
+* Reduced on-disk footprint compared to a heap table: shorter tuple
+headers, plus compression of similar-typed data.
+
+* Be a first-class citizen in the Postgres architecture (table data
+can simply live in columnar storage) rather than being kept at arm's
+length through an opaque interface.
+
+* Fully MVCC compliant - all operations supported as with the heap:
+update, delete, serializable transactions, etc.
+
+* All indexes supported.
+
+* Hybrid row-column store, where some columns are stored together and
+others separately, with flexible granularity for dividing the
+columns. Columns accessed together can be stored together.
+
+* Provide better control over bloat (using zheap).
+
+* Eliminate the need for separate toast tables.
+
+* Faster ADD/DROP COLUMN and column type changes, by avoiding a full
+rewrite of the table.
+
+High-level design of ZedStore - B-trees for the win!
+----------------------------------------------------
+
+To start simple, let's ignore the column-store aspect and consider
+zedstore as a compressed row store. The column store is a natural
+extension of this concept, explained in the next section.
+
+The basic on-disk data structure is a B-tree, indexed by TID - a fast
+and versatile structure. Note that this does not refer to the
+existing btree index AM; it is a brand new B-tree used for table data
+storage.
+
+TID - used as a logical row identifier:
+A TID is just a 48-bit row identifier. The traditional division into
+block and offset numbers is meaningless. In order to find a tuple
+with a given TID, one must always descend the B-tree. Because the TID
+is logical, tuples can be freely moved to different pages when page
+splits or merges are performed (see the sketch below).
+
+The internal pages of the B-tree are super simple and boring. Each
+internal page just stores an array of TID/downlink pairs.
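+
+As an illustration only (this is not the patch's code; see
+zedstore_internal.h for the real zstid type and macros), a 48-bit
+logical TID can be carried in a 64-bit integer and split into
+"block-like" and "offset-like" parts purely for display, roughly like
+this standalone sketch with hypothetical names:
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    typedef uint64_t my_zstid;                   /* hypothetical */
+
+    #define MY_ZSTID_BITS   48
+    #define MY_ZSTID_MAX    ((UINT64_C(1) << MY_ZSTID_BITS) - 1)
+    #define MY_OFFSET_BITS  16
+
+    /* "block-like" part: the high 32 bits of the 48-bit identifier */
+    static inline uint32_t
+    my_tid_block(my_zstid tid)
+    {
+        return (uint32_t) (tid >> MY_OFFSET_BITS);
+    }
+
+    /* "offset-like" part: the low 16 bits */
+    static inline uint16_t
+    my_tid_offset(my_zstid tid)
+    {
+        return (uint16_t) (tid & ((1U << MY_OFFSET_BITS) - 1));
+    }
+
+    int
+    main(void)
+    {
+        my_zstid    tid = UINT64_C(123456789);
+
+        printf("tid %llu -> (%u, %u), in 48-bit range: %d\n",
+               (unsigned long long) tid,
+               my_tid_block(tid), my_tid_offset(tid),
+               tid <= MY_ZSTID_MAX);
+        return 0;
+    }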
+Let's focus on the leaf level. Leaf blocks have a short uncompressed
+header, followed by btree items. A leaf page contains three kinds of
+items:
+
+- plain item, holds one tuple or one datum, uncompressed payload
+
+- array item, holds multiple datums, with consecutive TIDs and the
+same visibility information. An array item saves space compared to
+multiple single items, by leaving out repetitive UNDO and TID
+fields. An array item cannot mix NULLs and non-NULLs, so the ZSBT_NULL
+flag applies to all elements.
+
+- a "container item", holds multiple plain items, compressed payload
+
++-----------------------------
+| Fixed-size page header:
+|
+|   LSN
+|   TID low and hi key (for Lehman & Yao B-tree operations)
+|   left and right page pointers
+|
+| Items:
+|
+|   TID | size | flags | uncompressed size | lastTID | payload (container item)
+|   TID | size | flags | uncompressed size | lastTID | payload (container item)
+|   TID | size | flags | undo pointer | payload (plain item)
+|   TID | size | flags | undo pointer | payload (plain item)
+|   ...
+|
++----------------------------
+
+Row store
+---------
+
+The tuples are stored one after another, sorted by TID. For each
+tuple, we store its 48-bit TID, an undo record pointer, and the
+actual tuple data, uncompressed.
+
+In uncompressed form, the page can be arbitrarily large. But after
+compression, it must fit into a physical 8k block. If, on insert or
+update of a tuple, the page can no longer be compressed below 8k, the
+page is split. Note that because TIDs are logical rather than
+physical identifiers, we can freely move tuples from one physical
+page to another during a page split. A tuple's TID never changes.
+
+The buffer cache caches compressed blocks. Likewise, WAL-logging,
+full-page images etc. work on compressed blocks. Decompression is
+done on the fly, in backend-private memory, as and when needed for
+reading. For some encodings, like run-length or delta encoding,
+tuples can be constructed directly from the compressed data.
+
+Column store
+------------
+
+A column store uses the same structure, but with *multiple* B-trees:
+one for each column, plus one for storing meta-data
+(a.k.a. the meta-column), all indexed by TID. Imagine zedstore as a
+forest of B-trees. The B-trees for all columns are stored in the same
+physical file.
+
+A metapage at block 0 has links to the roots of the B-trees. Leaf
+pages look the same, but instead of storing whole tuples, they store
+just a single attribute. To reconstruct a row with a given TID, the
+scan descends the B-tree of every column using that TID and fetches
+all the attributes. Likewise, a sequential scan walks all the B-trees
+in lockstep.
+
+The special, first btree for the meta-column is used to allocate TIDs
+for tuples and to track the UNDO location that provides visibility
+information. This special btree always exists, which also makes
+zero-column tables possible (a possible result of ADD COLUMN / DROP
+COLUMN sequences). Storing the meta-data separately from the data
+also gives better compression ratios, and it simplifies the overall
+design/implementation: a delete only needs to edit the meta-column
+and never touches the actual data btrees.
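+
+To make the "forest of B-trees" picture concrete, here is a
+standalone toy model (not zedstore code; the real per-column scans go
+through the zsbt_* routines in zedstore_attpage.c): reconstructing a
+row is just looking up the same logical TID in every column's tree.
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    #define NCOLS 3
+
+    /* stand-in for "descend column col's B-tree and fetch the datum
+     * for this TID"; the real code walks btree pages instead */
+    static int64_t
+    column_lookup(int col, uint64_t tid)
+    {
+        static const int64_t trees[NCOLS][4] = {
+            {10, 11, 12, 13},           /* column 1 */
+            {100, 101, 102, 103},       /* column 2 */
+            {1000, 1001, 1002, 1003},   /* column 3 */
+        };
+
+        return trees[col][tid - 1];     /* toy TIDs start at 1 */
+    }
+
+    int
+    main(void)
+    {
+        uint64_t    tid = 3;
+
+        /* one row = the same TID looked up in every column's tree */
+        printf("row %llu:", (unsigned long long) tid);
+        for (int col = 0; col < NCOLS; col++)
+            printf(" %lld", (long long) column_lookup(col, tid));
+        printf("\n");
+        return 0;
+    }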
+
+
+MVCC
+----
+
+Undo record pointers are used to implement MVCC, like in zheap.
+Hence, transaction information is not stored directly with the data.
+In zheap, there's a small, fixed number of "transaction slots" on
+each page, but zedstore stores an undo pointer with each item
+directly; in normal cases, the compression squeezes this down to
+almost nothing. For bulk loads, the undo record pointer is maintained
+for a whole array of items rather than per item. The undo pointer is
+stored only in the meta-column, and all MVCC operations are performed
+using the meta-column alone.
+
+
+Insert:
+Inserting a new row splits the row into datums. Adding the
+meta-column entry chooses the block to insert into, picks a TID for
+the row, and writes an undo record for it. All the data columns are
+then inserted using that TID.
+
+Toast:
+When an overly large datum is stored, it is divided into chunks, and
+each chunk is stored on a dedicated toast page within the same
+physical file. The toast pages of a datum form a list; each page has
+a next/prev pointer.
+
+Select:
+A property is added to the table AM to convey whether the AM
+leverages column projection for scans. When scanning a table whose AM
+has this property, the executor uses the target list and quals of the
+plan to find the columns required by the query. This list is passed
+down to the AM at beginscan time. Zedstore uses this column
+projection list to pull data only from the selected columns. A
+virtual tuple table slot is used to pass back the datums for the
+subset of columns.
+
+The current table AM API requires enhancement to pass the column
+projection down to the AM. The patch showcases two different ways of
+doing this:
+
+* For sequential scans, a new beginscan_with_column_projection() API
+is added. The executor checks the AM property and uses this new API
+if the AM leverages column projection, and the normal beginscan() API
+otherwise.
+
+* For index scans, instead of modifying the beginscan API, a new API
+is added to pass the column projection list after beginscan has
+populated the scan descriptor, but before any tuples are fetched.
+
+Delete:
+When deleting a tuple, a new undo record is created for the delete,
+and only the meta-column item is updated with it. The new undo record
+points to the tuple's previous undo record (the insert undo record).
+Hence, a delete operates only on the meta-column; no data column is
+edited.
+
+Update:
+An update in zedstore is essentially a delete plus an insert: the
+delete is performed as described above, and a new entry is added with
+the updated values. No in-place update happens.
+
+Index Support:
+Index builds also leverage the columnar storage and scan only the
+columns required to build the index. Indexes work much as they do for
+heap tables: data is inserted into the table, and the tuple's TID is
+stored in the index. On an index scan, the required column B-trees
+are scanned for the given TID, and the datums are passed back in a
+virtual tuple slot. Since only the meta-column is used for the
+visibility check, data is fetched from the remaining B-trees only for
+visible tuples.
+
+Page Format
+-----------
+A ZedStore table contains different kinds of pages, all in the same
+file. Kinds of pages are meta-page, per-attribute btree internal and
+leaf pages, UNDO log page, and toast pages. Each page type has its
+own distinct data storage format.
+
+META Page:
+Block 0 is always a metapage. It contains the block numbers of the
+other data structures stored within the file, like the per-attribute
+B-trees, and the UNDO log.
+
+BTREE Page:
+
+UNDO Page:
+
+TOAST Page:
+
+
+Free Space Map
+--------------
+
+
+Enhancements
+------------
+
+Instead of compressing all the tuples on a page in one batch, store a
+small "dictionary", e.g. in the page header, the meta page, or a
+separate dedicated page, and use it to compress tuple by tuple. That
+could make random reads and updates of individual tuples faster. We
+first need to figure out how to create the dictionary, though.
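+
+For contrast, the baseline scheme described above - compress a whole
+batch of items and require the result to fit in a fixed 8k block -
+can be sketched with the LZ4 functions that the new --with-lz4
+configure option links against. This is an illustration only, not the
+code in zedstore_compression.c:
+
+    /* standalone sketch; build with: cc sketch.c -llz4 */
+    #include <stdio.h>
+    #include <string.h>
+    #include <lz4.h>
+
+    #define MY_BLCKSZ 8192      /* fixed physical block size */
+
+    /*
+     * Does 'len' bytes of page image compress into one physical block?
+     * LZ4_compress_default() returns 0 when the output does not fit in
+     * the destination capacity.
+     */
+    static int
+    fits_one_block(const char *src, int len, char *dst)
+    {
+        return LZ4_compress_default(src, dst, len, MY_BLCKSZ) > 0;
+    }
+
+    int
+    main(void)
+    {
+        static char logical_page[4 * MY_BLCKSZ];    /* > 8k uncompressed */
+        static char physical_block[MY_BLCKSZ];
+
+        memset(logical_page, 'x', sizeof(logical_page));    /* compressible */
+
+        if (fits_one_block(logical_page, sizeof(logical_page), physical_block))
+            printf("page still fits in one %d-byte block\n", MY_BLCKSZ);
+        else
+            printf("page must be split\n");
+        return 0;
+    }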
Need to find how +to create the dictionary first. + +Only cached compressed pages in the page cache. If we want to cache +uncompressed pages instead, or in addition to that, we need to invent +a whole new kind of a buffer cache that can deal with the +variable-size blocks. For a first version, I think we can live without +it. + +Instead of storing all columns in the same file, we could store them +in separate files (separate forks?). That would allow immediate reuse +of space, after dropping a column. It's not clear how to use an FSM in +that case, though. Might have to implement an integrated FSM, +too. (Which might not be a bad idea, anyway). + +Design allows for hybrid row-column store, where some columns are +stored together, and others have a dedicated B-tree. Need to have user +facing syntax to allow specifying how to group the columns. + +Salient points for the design +------------------------------ + +* Layout the data/tuples in mapped fashion instead of keeping the +logical to physical mapping separate from actual data. So, keep all +the meta-data and data logically in single stream of file, avoiding +the need for separate forks/files to store meta-data and data. + +* Handle/treat operations at tuple level and not block level. + +* Stick to fixed size physical blocks. Variable size blocks (for +possibly higher compression ratios) pose need for increased logical to +physical mapping maintenance, plus restrictions on concurrency of +writes and reads to files. Hence adopt compression to fit fixed size +blocks instead of other way round. + + +Predicate locking +----------------- + +Predicate locks, to support SERIALIZABLE transactinons, are taken like +with the heap. From README-SSI: + +* For a table scan, the entire relation will be locked. + +* Each tuple read which is visible to the reading transaction will be +locked, whether or not it meets selection criteria; except that there +is no need to acquire an SIREAD lock on a tuple when the transaction +already holds a write lock on any tuple representing the row, since a +rw-conflict would also create a ww-dependency which has more +aggressive enforcement and thus will prevent any anomaly. + +* Modifying a heap tuple creates a rw-conflict with any transaction +that holds a SIREAD lock on that tuple, or on the page or relation +that contains it. + +* Inserting a new tuple creates a rw-conflict with any transaction +holding a SIREAD lock on the entire relation. It doesn't conflict with +page-level locks, because page-level locks are only used to aggregate +tuple locks. Unlike index page locks, they don't lock "gaps" on the +page. + + +ZedStore isn't block-based, so page-level locks really just mean a +range of TIDs. They're only used to aggregate tuple locks. diff --git a/src/backend/access/zedstore/zedstore_attpage.c b/src/backend/access/zedstore/zedstore_attpage.c new file mode 100644 index 0000000000..10e6517c26 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_attpage.c @@ -0,0 +1,1589 @@ +/* + * zedstore_attpage.c + * Routines for handling attribute leaf pages. + * + * A Zedstore table consists of multiple B-trees, one for each attribute. The + * functions in this file deal with one B-tree at a time, it is the caller's + * responsibility to tie together the scans of each btree. 
+ * + * Operations: + * + * - Sequential scan in TID order + * - must be efficient with scanning multiple trees in sync + * + * - random lookups, by TID (for index scan) + * + * - range scans by TID (for bitmap index scan) + * + * NOTES: + * - Locking order: child before parent, left before right + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_attpage.c + */ +#include "postgres.h" + +#include "access/zedstore_compression.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "utils/datum.h" +#include "utils/rel.h" + +/* prototypes for local functions */ +static void zsbt_attr_recompress_replace(Relation rel, AttrNumber attno, + Buffer oldbuf, List *items); +static ZSSingleBtreeItem *zsbt_attr_fetch(Relation rel, AttrNumber attno, + zstid tid, Buffer *buf_p); +static void zsbt_attr_replace_item(Relation rel, AttrNumber attno, Buffer buf, + zstid oldtid, ZSBtreeItem *replacementitem, + List *newitems); +static Size zsbt_compute_data_size(Form_pg_attribute atti, Datum val, bool isnull); +static ZSBtreeItem *zsbt_attr_create_item(Form_pg_attribute att, zstid tid, + int nelements, Datum *datums, + char *dataptr, Size datasz, bool isnull); + +/* ---------------------------------------------------------------- + * Public interface + * ---------------------------------------------------------------- + */ + +/* + * Begin a scan of the btree. + */ +void +zsbt_attr_begin_scan(Relation rel, TupleDesc tdesc, AttrNumber attno, zstid starttid, + zstid endtid, ZSBtreeScan *scan) +{ + Buffer buf; + + scan->rel = rel; + scan->attno = attno; + scan->tupledesc = tdesc; + + scan->snapshot = NULL; + scan->context = CurrentMemoryContext; + scan->lastoff = InvalidOffsetNumber; + scan->has_decompressed = false; + scan->nexttid = starttid; + scan->endtid = endtid; + memset(&scan->recent_oldest_undo, 0, sizeof(scan->recent_oldest_undo)); + memset(&scan->array_undoptr, 0, sizeof(scan->array_undoptr)); + scan->array_datums = palloc(sizeof(Datum)); + scan->array_datums_allocated_size = 1; + scan->array_elements_left = 0; + + buf = zsbt_descend(rel, attno, starttid, 0, true); + if (!BufferIsValid(buf)) + { + /* completely empty tree */ + scan->active = false; + scan->lastbuf = InvalidBuffer; + return; + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + scan->active = true; + scan->lastbuf = buf; + + zs_decompress_init(&scan->decompressor); + scan->recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); +} + +/* + * Reset the 'next' TID in a scan to the given TID. + */ +void +zsbt_attr_reset_scan(ZSBtreeScan *scan, zstid starttid) +{ + if (starttid < scan->nexttid) + { + /* have to restart from scratch. */ + scan->array_elements_left = 0; + scan->nexttid = starttid; + scan->has_decompressed = false; + if (scan->lastbuf != InvalidBuffer) + ReleaseBuffer(scan->lastbuf); + scan->lastbuf = InvalidBuffer; + } + else + zsbt_scan_skip(scan, starttid); +} + +void +zsbt_attr_end_scan(ZSBtreeScan *scan) +{ + if (!scan->active) + return; + + if (scan->lastbuf != InvalidBuffer) + ReleaseBuffer(scan->lastbuf); + zs_decompress_free(&scan->decompressor); + + scan->active = false; + scan->array_elements_left = 0; +} + +/* + * Helper function of zsbt_attr_scan_next(), to extract Datums from the given + * array item into the scan->array_* fields. 
+ */ +static void +zsbt_attr_scan_extract_array(ZSBtreeScan *scan, ZSArrayBtreeItem *aitem) +{ + int nelements = aitem->t_nelements; + zstid tid = aitem->t_tid; + bool isnull = (aitem->t_flags & ZSBT_NULL) != 0; + char *p = aitem->t_payload; + + /* skip over elements that we are not interested in */ + while (tid < scan->nexttid && nelements > 0) + { + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + if (!isnull) + { + if (attr->attlen > 0) + { + p += att_align_nominal(attr->attlen, attr->attalign); + } + else + { + p = (Pointer) att_align_pointer(p, attr->attalign, attr->attlen, p); + p = att_addlength_pointer(p, attr->attlen, p); + } + } + tid++; + nelements--; + } + + /* leave out elements that are past end of range */ + if (tid + nelements > scan->endtid) + nelements = scan->endtid - tid; + + scan->array_isnull = isnull; + + if (nelements > scan->array_datums_allocated_size) + { + if (scan->array_datums) + pfree(scan->array_datums); + scan->array_datums = palloc(nelements * sizeof(Datum)); + scan->array_datums_allocated_size = nelements; + } + + if (isnull) + { + /* + * For NULLs, clear the Datum array. Not strictly necessary, I think, + * but less confusing when debugging. + */ + memset(scan->array_datums, 0, nelements * sizeof(Datum)); + } + else + { + /* + * Expand the packed array data into an array of Datums. + * + * It would perhaps be more natural to loop through the elements with + * datumGetSize() and fetch_att(), but this is a pretty hot loop, so it's + * better to avoid checking attlen/attbyval in the loop. + * + * TODO: a different on-disk representation might make this better still, + * for varlenas (this is pretty optimal for fixed-lengths already). + * For example, storing an array of sizes or an array of offsets, followed + * by the data itself, might incur fewer pipeline stalls in the CPU. + */ + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + int16 attlen = attr->attlen; + + if (attr->attbyval) + { + if (attlen == sizeof(Datum)) + { + memcpy(scan->array_datums, p, nelements * sizeof(Datum)); + } + else if (attlen == sizeof(int32)) + { + for (int i = 0; i < nelements; i++) + { + scan->array_datums[i] = fetch_att(p, true, sizeof(int32)); + p += sizeof(int32); + } + } + else if (attlen == sizeof(int16)) + { + for (int i = 0; i < nelements; i++) + { + scan->array_datums[i] = fetch_att(p, true, sizeof(int16)); + p += sizeof(int16); + } + } + else if (attlen == 1) + { + for (int i = 0; i < nelements; i++) + { + scan->array_datums[i] = fetch_att(p, true, 1); + p += 1; + } + } + else + Assert(false); + } + else if (attlen > 0) + { + for (int i = 0; i < nelements; i++) + { + scan->array_datums[i] = PointerGetDatum(p); + p += att_align_nominal(attr->attlen, attr->attalign); + } + } + else if (attlen == -1) + { + for (int i = 0; i < nelements; i++) + { + p = (Pointer) att_align_pointer(p, attr->attalign, attr->attlen, p); + scan->array_datums[i] = PointerGetDatum(p); + p = att_addlength_pointer(p, attr->attlen, p); + } + } + else + { + /* TODO: convert cstrings to varlenas before we get here? */ + elog(ERROR, "cstrings not supported"); + } + } + scan->array_undoptr = aitem->t_undo_ptr; + scan->array_next_datum = &scan->array_datums[0]; + scan->array_elements_left = nelements; +} + +/* + * Advance scan to next item. + * + * Return true if there was another item. The Datum/isnull of the item is + * placed in scan->array_* fields. For a pass-by-ref datum, it's a palloc'd + * copy that's valid until the next call. + * + * This is normally not used directly. 
See zsbt_scan_next_tid() and + * zsbt_scan_next_fetch() wrappers, instead. + */ +bool +zsbt_attr_scan_next(ZSBtreeScan *scan) +{ + Buffer buf; + bool buf_is_locked = false; + Page page; + ZSBtreePageOpaque *opaque; + OffsetNumber off; + OffsetNumber maxoff; + BlockNumber next; + + Assert(scan->active); + + /* + * Advance to the next TID >= nexttid. + * + * This advances scan->nexttid as it goes. + */ + while (scan->nexttid < scan->endtid) + { + /* + * If we are still processing an array item, return next element from it. + */ + if (scan->array_elements_left > 0) + { + return true; + } + + /* + * If we are still processing a compressed item, process the next item + * from the it. If it's an array item, we start iterating the array by + * setting the scan->array_* fields, and loop back to top to return the + * first element from the array. + */ + if (scan->has_decompressed) + { + zstid lasttid; + ZSBtreeItem *uitem; + + uitem = zs_decompress_read_item(&scan->decompressor); + + if (uitem == NULL) + { + scan->has_decompressed = false; + continue; + } + + /* a compressed item cannot contain nested compressed items */ + Assert((uitem->t_flags & ZSBT_COMPRESSED) == 0); + + lasttid = zsbt_item_lasttid(uitem); + if (lasttid < scan->nexttid) + continue; + + if (uitem->t_tid >= scan->endtid) + break; + + if ((uitem->t_flags & ZSBT_ARRAY) != 0) + { + /* no need to make a copy, because the uncompressed buffer + * is already a copy */ + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) uitem; + + zsbt_attr_scan_extract_array(scan, aitem); + continue; + } + else + { + /* single item */ + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) uitem; + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + + scan->nexttid = sitem->t_tid; + scan->array_undoptr = sitem->t_undo_ptr; + scan->array_elements_left = 1; + scan->array_next_datum = &scan->array_datums[0]; + if (sitem->t_flags & ZSBT_NULL) + scan->array_isnull = true; + else + { + scan->array_isnull = false; + scan->array_datums[0] = fetch_att(sitem->t_payload, attr->attbyval, attr->attlen); + /* no need to copy, because the uncompression buffer is a copy already */ + /* FIXME: do we need to copy anyway, to make sure it's aligned correctly? */ + } + + if (buf_is_locked) + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + return true; + } + } + + /* + * Scan the page for the next item. + */ + buf = scan->lastbuf; + if (!buf_is_locked) + { + if (BufferIsValid(buf)) + { + LockBuffer(buf, BUFFER_LOCK_SHARE); + buf_is_locked = true; + + /* + * It's possible that the page was concurrently split or recycled by + * another backend (or ourselves). Have to re-check that the page is + * still valid. + */ + if (!zsbt_page_is_expected(scan->rel, scan->attno, scan->nexttid, 0, buf)) + { + /* + * It's not valid for the TID we're looking for, but maybe it was the + * right page for the previous TID. In that case, we don't need to + * restart from the root, we can follow the right-link instead. 
+ */ + if (zsbt_page_is_expected(scan->rel, scan->attno, scan->nexttid - 1, 0, buf)) + { + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + next = opaque->zs_next; + if (next != InvalidBlockNumber) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + buf = ReleaseAndReadBuffer(buf, scan->rel, next); + scan->lastbuf = buf; + continue; + } + } + + UnlockReleaseBuffer(buf); + buf_is_locked = false; + buf = scan->lastbuf = InvalidBuffer; + } + } + + if (!BufferIsValid(buf)) + { + buf = scan->lastbuf = zsbt_descend(scan->rel, scan->attno, scan->nexttid, 0, true); + buf_is_locked = true; + } + } + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + Assert(opaque->zs_page_id == ZS_BTREE_PAGE_ID); + + /* TODO: check the last offset first, as an optimization */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + ZSBtreeItem *item = (ZSBtreeItem *) PageGetItem(page, iid); + zstid lasttid; + + lasttid = zsbt_item_lasttid(item); + + if (scan->nexttid > lasttid) + continue; + + if (item->t_tid >= scan->endtid) + { + scan->nexttid = scan->endtid; + break; + } + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + MemoryContext oldcxt = MemoryContextSwitchTo(scan->context); + + zs_decompress_chunk(&scan->decompressor, citem); + MemoryContextSwitchTo(oldcxt); + scan->has_decompressed = true; + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + break; + } + else + { + if ((item->t_flags & ZSBT_ARRAY) != 0) + { + /* copy the item, because we can't hold a lock on the page */ + ZSArrayBtreeItem *aitem; + + aitem = MemoryContextAlloc(scan->context, item->t_size); + memcpy(aitem, item, item->t_size); + + zsbt_attr_scan_extract_array(scan, aitem); + + if (scan->array_elements_left > 0) + { + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + break; + } + } + else + { + /* single item */ + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) item; + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + + scan->nexttid = sitem->t_tid; + scan->array_undoptr = sitem->t_undo_ptr; + scan->array_elements_left = 1; + scan->array_next_datum = &scan->array_datums[0]; + if (item->t_flags & ZSBT_NULL) + scan->array_isnull = true; + else + { + scan->array_isnull = false; + scan->array_datums[0] = fetch_att(sitem->t_payload, attr->attbyval, attr->attlen); + scan->array_datums[0] = zs_datumCopy(scan->array_datums[0], attr->attbyval, attr->attlen); + } + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + return true; + } + } + } + + if (scan->array_elements_left > 0 || scan->has_decompressed) + continue; + + /* No more items on this page. Walk right, if possible */ + next = opaque->zs_next; + if (next == BufferGetBlockNumber(buf)) + elog(ERROR, "btree page %u next-pointer points to itself", next); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + + if (next == InvalidBlockNumber || scan->nexttid >= scan->endtid) + { + scan->active = false; + scan->array_elements_left = 0; + ReleaseBuffer(scan->lastbuf); + scan->lastbuf = InvalidBuffer; + break; + } + + scan->lastbuf = ReleaseAndReadBuffer(scan->lastbuf, scan->rel, next); + } + + return false; +} + +/* + * Insert a multiple items to the given attribute's btree. + * + * Populates the TIDs of the new tuples. + * + * If 'tid' in list is valid, then that TID is used. It better not be in use already. 
If + * it's invalid, then a new TID is allocated, as we see best. (When inserting the + * first column of the row, pass invalid, and for other columns, pass the TID + * you got for the first column.) + */ +void +zsbt_attr_multi_insert(Relation rel, AttrNumber attno, + Datum *datums, bool *isnulls, zstid *tids, int nitems) +{ + Form_pg_attribute attr; + zstid tid = tids[0]; + Buffer buf; + zstid insert_target_key; + int i; + List *newitems; + + Assert (attno >= 1); + attr = &rel->rd_att->attrs[attno - 1]; + + /* + * Find the right place for the given TID. + */ + insert_target_key = tid; + + buf = zsbt_descend(rel, attno, insert_target_key, 0, false); + + /* Create items to insert. */ + newitems = NIL; + i = 0; + while (i < nitems) + { + Size datasz; + int j; + ZSBtreeItem *newitem; + + /* + * Try to collapse as many items as possible into an Array item. + * The first item in the array is now at tids[i]/datums[i]/isnulls[i]. + * Items can be stored in the same array as long as the TIDs are + * consecutive, they all have the same isnull flag, and the array + * isn't too large to be stored on a single leaf page. Scan the + * arrays, checking those conditions. + */ + datasz = zsbt_compute_data_size(attr, datums[i], isnulls[i]); + for (j = i + 1; j < nitems; j++) + { + if (isnulls[j] != isnulls[i]) + break; + + if (tids[j] != tids[j - 1] + 1) + break; + + /* + * Will the array still fit on a leaf page, if this datum is + * included in it? We actually use 1/4 of the page, to avoid + * making very large arrays, which might be slower to update in + * the future. Also, using an array that completely fills a page + * might cause more fragmentation. (XXX: The 1/4 threshold + * is arbitrary, though, and this probably needs more smarts + * or testing to determine the optimum.) + */ + if (!isnulls[i]) + { + Datum val = datums[j]; + Size datum_sz; + + datum_sz = zsbt_compute_data_size(attr, val, false); + if (datasz + datum_sz < MaxZedStoreDatumSize / 4) + break; + datasz += datum_sz; + } + } + + /* + * 'i' is now the first entry to store in the array, and 'j' is the + * last + 1 elemnt to store. If j == i + 1, then there is only one + * element and zsbt_create_item() will create a 'single' item rather + * than an array. + */ + newitem = zsbt_attr_create_item(attr, tids[i], + j - i, &datums[i], NULL, datasz, isnulls[i]); + + newitems = lappend(newitems, newitem); + i = j; + } + + /* recompress and possibly split the page */ + zsbt_attr_replace_item(rel, attno, buf, + InvalidZSTid, NULL, + newitems); + /* zsbt_replace_item unlocked 'buf' */ + ReleaseBuffer(buf); +} + +void +zsbt_attr_remove(Relation rel, AttrNumber attno, zstid tid) +{ + Buffer buf; + ZSSingleBtreeItem *item; + + /* Find the item to delete. (It could be compressed) */ + item = zsbt_attr_fetch(rel, attno, tid, &buf); + if (item == NULL) + { + elog(WARNING, "could not find tuple to remove with TID (%u, %u) for attribute %d", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid), attno); + return; + } + + /* remove it */ + zsbt_attr_replace_item(rel, attno, buf, + tid, NULL, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item released */ +} + +/* ---------------------------------------------------------------- + * Internal routines + * ---------------------------------------------------------------- + */ + +/* + * Fetch the item with given TID. The page containing the item is kept locked, and + * returned to the caller in *buf_p. This is used to locate a tuple for updating + * or deleting it. 
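+ *
+ * The page comes back locked; zsbt_attr_remove() above shows the typical
+ * calling pattern (illustrative outline):
+ *
+ *     item = zsbt_attr_fetch(rel, attno, tid, &buf);
+ *     if (item != NULL)
+ *     {
+ *         ... update or delete via zsbt_attr_replace_item(), which
+ *             releases the lock ...
+ *         ReleaseBuffer(buf);
+ *     }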
+ */ +static ZSSingleBtreeItem * +zsbt_attr_fetch(Relation rel, AttrNumber attno, zstid tid, Buffer *buf_p) +{ + Buffer buf; + Page page; + ZSBtreeItem *item = NULL; + bool found = false; + OffsetNumber maxoff; + OffsetNumber off; + + buf = zsbt_descend(rel, attno, tid, 0, false); + if (buf == InvalidBuffer) + { + *buf_p = InvalidBuffer; + return NULL; + } + page = BufferGetPage(buf); + + /* Find the item on the page that covers the target TID */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + item = (ZSBtreeItem *) PageGetItem(page, iid); + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + ZSDecompressContext decompressor; + + zs_decompress_init(&decompressor); + zs_decompress_chunk(&decompressor, citem); + + while ((item = zs_decompress_read_item(&decompressor)) != NULL) + { + zstid lasttid = zsbt_item_lasttid(item); + + if (item->t_tid <= tid && lasttid >= tid) + { + found = true; + break; + } + } + if (found) + { + /* FIXME: decompressor is leaked. Can't free it yet, because we still + * need to access the item below + */ + break; + } + zs_decompress_free(&decompressor); + } + else + { + zstid lasttid = zsbt_item_lasttid(item); + + if (item->t_tid <= tid && lasttid >= tid) + { + found = true; + break; + } + } + } + + if (found) + { + ZSSingleBtreeItem *result; + + if ((item->t_flags & ZSBT_ARRAY) != 0) + { + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + int elemno = tid - aitem->t_tid; + char *dataptr = NULL; + int datasz; + int resultsize; + + Assert(elemno < aitem->t_nelements); + + if ((item->t_flags & ZSBT_NULL) == 0) + { + /* + * TODO: Currently, zsbt_fetch() is called from functions + * which don't have Slot, and Relation object can be trusted + * for attlen and attbyval. Ideally, we wish to not rely on + * Relation object and see how to decouple it. Previously, we + * stored these two values in meta-page and get these values + * from it but just storing them for this purpose, seems + * heavy. Ideally, catalog stores those values so shouldn't + * need to duplicate storing the same. + */ + TupleDesc tdesc = RelationGetDescr(rel); + int attlen = tdesc->attrs[attno - 1].attlen; + bool attbyval = tdesc->attrs[attno - 1].attbyval; + + if (attlen > 0) + { + dataptr = aitem->t_payload + elemno * attlen; + datasz = attlen; + } + else + { + dataptr = aitem->t_payload; + for (int i = 0; i < elemno; i++) + { + dataptr += zs_datumGetSize(PointerGetDatum(dataptr), attbyval, attlen); + } + datasz = zs_datumGetSize(PointerGetDatum(dataptr), attbyval, attlen); + } + } + else + datasz = 0; + + resultsize = offsetof(ZSSingleBtreeItem, t_payload) + datasz; + result = palloc(resultsize); + memset(result, 0, offsetof(ZSSingleBtreeItem, t_payload)); /* zero padding */ + result->t_tid = tid; + result->t_flags = item->t_flags & ~ZSBT_ARRAY; + result->t_size = resultsize; + result->t_undo_ptr = aitem->t_undo_ptr; + if (datasz > 0) + memcpy(result->t_payload, dataptr, datasz); + } + else + { + /* single item */ + result = (ZSSingleBtreeItem *) item; + } + + *buf_p = buf; + return result; + } + else + { + UnlockReleaseBuffer(buf); + *buf_p = InvalidBuffer; + return NULL; + } +} + +/* + * Compute the size of a slice of an array, from an array item. 'dataptr' + * points to the packed on-disk representation of the array item's data. + * The elements are stored one after each other. 
+ */ +static Size +zsbt_get_array_slice_len(int16 attlen, bool attbyval, bool isnull, + char *dataptr, int nelements) +{ + Size datasz; + + if (isnull) + datasz = 0; + else + { + /* + * For a fixed-width type, we can just multiply. For variable-length, + * we have to walk through the elements, looking at the length of each + * element. + */ + if (attlen > 0) + { + datasz = attlen * nelements; + } + else + { + char *p = dataptr; + + datasz = 0; + for (int i = 0; i < nelements; i++) + { + Size datumsz; + + datumsz = zs_datumGetSize(PointerGetDatum(p), attbyval, attlen); + + /* + * The array should already use short varlen representation whenever + * possible. + */ + Assert(!VARATT_CAN_MAKE_SHORT(DatumGetPointer(p))); + + datasz += datumsz; + p += datumsz; + } + } + } + return datasz; +} + + +/* Does att's datatype allow packing into the 1-byte-header varlena format? */ +#define ATT_IS_PACKABLE(att) \ + ((att)->attlen == -1 && (att)->attstorage != 'p') +/* Use this if it's already known varlena */ +#define VARLENA_ATT_IS_PACKABLE(att) \ + ((att)->attstorage != 'p') + +/* + * This is very similar to heap_compute_data_size() + */ +static Size +zsbt_compute_data_size(Form_pg_attribute atti, Datum val, bool isnull) +{ + Size data_length = 0; + + if (isnull) + return 0; + + if (ATT_IS_PACKABLE(atti) && + VARATT_CAN_MAKE_SHORT(DatumGetPointer(val))) + { + /* + * we're anticipating converting to a short varlena header, so + * adjust length and don't count any alignment + */ + data_length += VARATT_CONVERTED_SHORT_SIZE(DatumGetPointer(val)); + } + else if (atti->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + /* + * we want to flatten the expanded value so that the constructed + * tuple doesn't depend on it + */ + data_length = att_align_nominal(data_length, atti->attalign); + data_length += EOH_get_flat_size(DatumGetEOHP(val)); + } + else if (atti->attlen == -1 && + VARATT_IS_EXTERNAL(val) && VARTAG_EXTERNAL(val) == VARTAG_ZEDSTORE) + { + data_length += sizeof(varatt_zs_toastptr); + } + else + { + data_length = att_align_datum(data_length, atti->attalign, + atti->attlen, val); + data_length = att_addlength_datum(data_length, atti->attlen, + val); + } + + return data_length; +} + +/* + * Form a ZSBtreeItem out of the given datums, or data that's already in on-disk + * array format, for insertion. + * + * If there's more than one element, an array item is created. Otherwise, a single + * item. 
+ */ +static ZSBtreeItem * +zsbt_attr_create_item(Form_pg_attribute att, zstid tid, + int nelements, Datum *datums, + char *datasrc, Size datasz, bool isnull) +{ + ZSBtreeItem *result; + Size itemsz; + char *databegin; + + Assert(nelements > 0); + + if (nelements > 1) + { + ZSArrayBtreeItem *newitem; + + itemsz = offsetof(ZSArrayBtreeItem, t_payload) + datasz; + + newitem = palloc(itemsz); + memset(newitem, 0, offsetof(ZSArrayBtreeItem, t_payload)); /* zero padding */ + newitem->t_tid = tid; + newitem->t_size = itemsz; + newitem->t_flags = ZSBT_ARRAY; + if (isnull) + newitem->t_flags |= ZSBT_NULL; + newitem->t_nelements = nelements; + ZSUndoRecPtrInitialize(&newitem->t_undo_ptr); + + databegin = newitem->t_payload; + + result = (ZSBtreeItem *) newitem; + } + else + { + ZSSingleBtreeItem *newitem; + + itemsz = offsetof(ZSSingleBtreeItem, t_payload) + datasz; + + newitem = palloc(itemsz); + memset(newitem, 0, offsetof(ZSSingleBtreeItem, t_payload)); /* zero padding */ + newitem->t_tid = tid; + newitem->t_flags = 0; + if (isnull) + newitem->t_flags |= ZSBT_NULL; + newitem->t_size = itemsz; + ZSUndoRecPtrInitialize(&newitem->t_undo_ptr); + + databegin = newitem->t_payload; + + result = (ZSBtreeItem *) newitem; + } + + /* + * Copy the data. + * + * This is largely copied from heaptuple.c's fill_val(). + */ + if (!isnull) + { + char *data = databegin; + + if (datums) + { + for (int i = 0; i < nelements; i++) + { + Datum datum = datums[i]; + Size data_length; + + /* + * XXX we use the att_align macros on the pointer value itself, not on an + * offset. This is a bit of a hack. + */ + if (att->attbyval) + { + /* pass-by-value */ + data = (char *) att_align_nominal(data, att->attalign); + store_att_byval(data, datum, att->attlen); + data_length = att->attlen; + } + else if (att->attlen == -1) + { + /* varlena */ + Pointer val = DatumGetPointer(datum); + + if (VARATT_IS_EXTERNAL(val)) + { + if (VARATT_IS_EXTERNAL_EXPANDED(val)) + { + /* + * we want to flatten the expanded value so that the + * constructed tuple doesn't depend on it + */ + /* FIXME: This should happen earlier, because if the + * datum is very large, it should be toasted, and + * that should happen earlier. + */ + ExpandedObjectHeader *eoh = DatumGetEOHP(datum); + + data = (char *) att_align_nominal(data, + att->attalign); + data_length = EOH_get_flat_size(eoh); + EOH_flatten_into(eoh, data, data_length); + } + else if (VARATT_IS_EXTERNAL(val) && VARTAG_EXTERNAL(val) == VARTAG_ZEDSTORE) + { + data_length = sizeof(varatt_zs_toastptr); + memcpy(data, val, data_length); + } + else + { + /* no alignment, since it's short by definition */ + data_length = VARSIZE_EXTERNAL(val); + memcpy(data, val, data_length); + } + } + else if (VARATT_IS_SHORT(val)) + { + /* no alignment for short varlenas */ + data_length = VARSIZE_SHORT(val); + memcpy(data, val, data_length); + } + else if (VARLENA_ATT_IS_PACKABLE(att) && + VARATT_CAN_MAKE_SHORT(val)) + { + /* convert to short varlena -- no alignment */ + data_length = VARATT_CONVERTED_SHORT_SIZE(val); + SET_VARSIZE_SHORT(data, data_length); + memcpy(data + 1, VARDATA(val), data_length - 1); + } + else + { + /* full 4-byte header varlena */ + data = (char *) att_align_nominal(data, + att->attalign); + data_length = VARSIZE(val); + memcpy(data, val, data_length); + } + } + else if (att->attlen == -2) + { + /* cstring ... 
never needs alignment */ + Assert(att->attalign == 'c'); + data_length = strlen(DatumGetCString(datum)) + 1; + memcpy(data, DatumGetPointer(datum), data_length); + } + else + { + /* fixed-length pass-by-reference */ + data = (char *) att_align_nominal(data, att->attalign); + Assert(att->attlen > 0); + data_length = att->attlen; + memcpy(data, DatumGetPointer(datum), data_length); + } + data += data_length; + } + Assert(data - databegin == datasz); + } + else + memcpy(data, datasrc, datasz); + } + + return result; +} + +/* + * This helper function is used to implement INSERT, UPDATE and DELETE. + * + * If 'olditem' is not NULL, then 'olditem' on the page is replaced with + * 'replacementitem'. 'replacementitem' can be NULL, to remove an old item. + * + * If 'newitems' is not empty, the items in the list are added to the page, + * to the correct position. FIXME: Actually, they're always just added to + * the end of the page, and that better be the correct position. + * + * This function handles decompressing and recompressing items, and splitting + * the page if needed. + */ +static void +zsbt_attr_replace_item(Relation rel, AttrNumber attno, Buffer buf, + zstid oldtid, + ZSBtreeItem *replacementitem, + List *newitems) +{ + Form_pg_attribute attr; + int16 attlen; + bool attbyval; + Page page = BufferGetPage(buf); + OffsetNumber off; + OffsetNumber maxoff; + List *items; + bool found_old_item = false; + /* We might need to decompress up to two previously compressed items */ + ZSDecompressContext decompressor; + bool decompressor_used = false; + bool decompressing; + + if (attno == ZS_META_ATTRIBUTE_NUM) + { + attr = NULL; + attlen = 0; + attbyval = true; + } + else + { + attr = &rel->rd_att->attrs[attno - 1]; + attlen = attr->attlen; + attbyval = attr->attbyval; + } + + if (replacementitem) + Assert(replacementitem->t_tid == oldtid); + + /* + * TODO: It would be good to have a fast path, for the common case that we're + * just adding items to the end. + */ + + /* Loop through all old items on the page */ + items = NIL; + maxoff = PageGetMaxOffsetNumber(page); + decompressing = false; + off = 1; + for (;;) + { + ZSBtreeItem *item; + + /* + * Get the next item to process. If we're decompressing, get the next + * tuple from the decompressor, otherwise get the next item from the page. + */ + if (decompressing) + { + item = zs_decompress_read_item(&decompressor); + if (!item) + { + decompressing = false; + continue; + } + } + else if (off <= maxoff) + { + ItemId iid = PageGetItemId(page, off); + + item = (ZSBtreeItem *) PageGetItem(page, iid); + off++; + + } + else + { + /* out of items */ + break; + } + + /* we now have an item to process, either straight from the page or from + * the decompressor */ + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + zstid item_lasttid = zsbt_item_lasttid(item); + + /* there shouldn't nested compressed items */ + if (decompressing) + elog(ERROR, "nested compressed items on zedstore page not supported"); + + if (oldtid != InvalidZSTid && item->t_tid <= oldtid && oldtid <= item_lasttid) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + + /* Found it, this compressed item covers the target or the new TID. 
*/ + /* We have to decompress it, and recompress */ + Assert(!decompressor_used); + + zs_decompress_init(&decompressor); + zs_decompress_chunk(&decompressor, citem); + decompressor_used = true; + decompressing = true; + continue; + } + else + { + /* keep this compressed item as it is */ + items = lappend(items, item); + } + } + else if ((item->t_flags & ZSBT_ARRAY) != 0) + { + /* array item */ + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + zstid item_lasttid = zsbt_item_lasttid(item); + + if (oldtid != InvalidZSTid && item->t_tid <= oldtid && oldtid <= item_lasttid) + { + /* + * The target TID is currently part of an array item. We have to split + * the array item into two, and put the replacement item in the middle. + */ + int cutoff; + Size olddatalen; + int nelements = aitem->t_nelements; + bool isnull = (aitem->t_flags & ZSBT_NULL) != 0; + char *dataptr; + + cutoff = oldtid - item->t_tid; + + /* Array slice before the target TID */ + dataptr = aitem->t_payload; + if (cutoff > 0) + { + ZSBtreeItem *item1; + Size datalen1; + + datalen1 = zsbt_get_array_slice_len(attlen, attbyval, isnull, + dataptr, cutoff); + item1 = zsbt_attr_create_item(attr, aitem->t_tid, + cutoff, NULL, dataptr, datalen1, isnull); + dataptr += datalen1; + items = lappend(items, item1); + } + + /* + * Skip over the target element, and store the replacement + * item, if any, in its place + */ + olddatalen = zsbt_get_array_slice_len(attlen, attbyval, isnull, + dataptr, 1); + dataptr += olddatalen; + if (replacementitem) + items = lappend(items, replacementitem); + + /* Array slice after the target */ + if (cutoff + 1 < nelements) + { + ZSBtreeItem *item2; + Size datalen2; + + datalen2 = zsbt_get_array_slice_len(attlen, attbyval, isnull, + dataptr, nelements - (cutoff + 1)); + item2 = zsbt_attr_create_item(attr, oldtid + 1, + nelements - (cutoff + 1), NULL, dataptr, datalen2, isnull); + items = lappend(items, item2); + } + + found_old_item = true; + } + else + items = lappend(items, item); + } + else + { + /* single item */ + if (oldtid != InvalidZSTid && item->t_tid == oldtid) + { + Assert(!found_old_item); + found_old_item = true; + if (replacementitem) + items = lappend(items, replacementitem); + } + else + items = lappend(items, item); + } + } + + if (oldtid != InvalidZSTid && !found_old_item) + elog(ERROR, "could not find old item to replace"); + + /* Add any new items to the end */ + if (newitems) + items = list_concat(items, newitems); + + /* Now pass the list to the recompressor. */ + IncrBufferRefCount(buf); + if (items) + { + zsbt_attr_recompress_replace(rel, attno, buf, items); + } + else + { + zs_split_stack *stack; + + stack = zsbt_unlink_page(rel, attno, buf, 0); + + if (!stack) + { + /* failed. */ + Page newpage = PageGetTempPageCopySpecial(BufferGetPage(buf)); + + stack = zs_new_split_stack_entry(buf, newpage); + } + + /* apply the changes */ + zs_apply_split_changes(rel, stack); + } + + /* + * We can now free the decompression contexts. The pointers in the 'items' list + * point to decompression buffers, so we cannot free them until after writing out + * the pages. 
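+ * (The recompressor / zs_apply_split_changes() calls above have already
+ * written the new page contents, so freeing is safe at this point.)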
+ */ + if (decompressor_used) + zs_decompress_free(&decompressor); + list_free(items); +} + +/* + * Recompressor routines + */ +typedef struct +{ + Page currpage; + ZSCompressContext compressor; + int compressed_items; + + /* first page writes over the old buffer, subsequent pages get newly-allocated buffers */ + zs_split_stack *stack_head; + zs_split_stack *stack_tail; + + int total_items; + int total_compressed_items; + int total_already_compressed_items; + + AttrNumber attno; + zstid hikey; +} zsbt_attr_recompress_context; + +static void +zsbt_attr_recompress_newpage(zsbt_attr_recompress_context *cxt, zstid nexttid, int flags) +{ + Page newpage; + ZSBtreePageOpaque *newopaque; + zs_split_stack *stack; + + if (cxt->currpage) + { + /* set the last tid on previous page */ + ZSBtreePageOpaque *oldopaque = ZSBtreePageGetOpaque(cxt->currpage); + + oldopaque->zs_hikey = nexttid; + } + + newpage = (Page) palloc(BLCKSZ); + PageInit(newpage, BLCKSZ, sizeof(ZSBtreePageOpaque)); + + stack = zs_new_split_stack_entry(InvalidBuffer, /* will be assigned later */ + newpage); + if (cxt->stack_tail) + cxt->stack_tail->next = stack; + else + cxt->stack_head = stack; + cxt->stack_tail = stack; + + cxt->currpage = newpage; + + newopaque = ZSBtreePageGetOpaque(newpage); + newopaque->zs_attno = cxt->attno; + newopaque->zs_next = InvalidBlockNumber; /* filled in later */ + newopaque->zs_lokey = nexttid; + newopaque->zs_hikey = cxt->hikey; /* overwritten later, if this is not last page */ + newopaque->zs_level = 0; + newopaque->zs_flags = flags; + newopaque->zs_page_id = ZS_BTREE_PAGE_ID; +} + +static void +zsbt_attr_recompress_add_to_page(zsbt_attr_recompress_context *cxt, ZSBtreeItem *item) +{ + if (PageGetFreeSpace(cxt->currpage) < MAXALIGN(item->t_size)) + zsbt_attr_recompress_newpage(cxt, item->t_tid, 0); + + if (PageAddItemExtended(cxt->currpage, + (Item) item, item->t_size, + PageGetMaxOffsetNumber(cxt->currpage) + 1, + PAI_OVERWRITE) == InvalidOffsetNumber) + elog(ERROR, "could not add item to page while recompressing"); + + cxt->total_items++; +} + +static bool +zsbt_attr_recompress_add_to_compressor(zsbt_attr_recompress_context *cxt, ZSBtreeItem *item) +{ + bool result; + + if (cxt->compressed_items == 0) + zs_compress_begin(&cxt->compressor, PageGetFreeSpace(cxt->currpage)); + + result = zs_compress_add(&cxt->compressor, item); + if (result) + { + cxt->compressed_items++; + + cxt->total_compressed_items++; + } + + return result; +} + +static void +zsbt_attr_recompress_flush(zsbt_attr_recompress_context *cxt) +{ + ZSCompressedBtreeItem *citem; + + if (cxt->compressed_items == 0) + return; + + citem = zs_compress_finish(&cxt->compressor); + + if (citem) + zsbt_attr_recompress_add_to_page(cxt, (ZSBtreeItem *) citem); + else + { + uint16 size = 0; + /* + * compression failed hence add items uncompressed. We should maybe + * note that these items/pattern are not compressible and skip future + * attempts to compress but its possible this clubbed with some other + * future items may compress. So, better avoid recording such info and + * try compression again later if required. + */ + for (int i = 0; i < cxt->compressor.nitems; i++) + { + citem = (ZSCompressedBtreeItem *) (cxt->compressor.uncompressedbuffer + size); + zsbt_attr_recompress_add_to_page(cxt, (ZSBtreeItem *) citem); + + size += MAXALIGN(citem->t_size); + } + } + + cxt->compressed_items = 0; +} + +/* + * Rewrite a leaf page, with given 'items' as the new content. + * + * If there are any uncompressed items in the list, we try to compress them. 
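+ * Consecutive uncompressed items are fed to the compressor until the next one
+ * no longer fits; the compressed chunk is then flushed to the page and a new
+ * chunk is started.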
+ * Any already-compressed items are added as is. + * + * If the items no longer fit on the page, then the page is split. It is + * entirely possible that they don't fit even on two pages; we split the page + * into as many pages as needed. Hopefully not more than a few pages, though, + * because otherwise you might hit limits on the number of buffer pins (with + * tiny shared_buffers). + * + * On entry, 'oldbuf' must be pinned and exclusive-locked. On exit, the lock + * is released, but it's still pinned. + * + * TODO: Try to combine single items, and existing array-items, into new array + * items. + */ +static void +zsbt_attr_recompress_replace(Relation rel, AttrNumber attno, Buffer oldbuf, List *items) +{ + ListCell *lc; + zsbt_attr_recompress_context cxt; + ZSBtreePageOpaque *oldopaque = ZSBtreePageGetOpaque(BufferGetPage(oldbuf)); + ZSUndoRecPtr recent_oldest_undo = { 0 }; + BlockNumber orignextblk; + zs_split_stack *stack; + List *downlinks = NIL; + + orignextblk = oldopaque->zs_next; + + cxt.currpage = NULL; + zs_compress_init(&cxt.compressor); + cxt.compressed_items = 0; + cxt.stack_head = cxt.stack_tail = NULL; + cxt.attno = attno; + cxt.hikey = oldopaque->zs_hikey; + + cxt.total_items = 0; + cxt.total_compressed_items = 0; + cxt.total_already_compressed_items = 0; + + zsbt_attr_recompress_newpage(&cxt, oldopaque->zs_lokey, (oldopaque->zs_flags & ZSBT_ROOT)); + + foreach(lc, items) + { + ZSBtreeItem *item = (ZSBtreeItem *) lfirst(lc); + + /* We can leave out any old-enough DEAD items */ + if ((item->t_flags & ZSBT_DEAD) != 0) + { + ZSBtreeItem *uitem = (ZSBtreeItem *) item; + + if (recent_oldest_undo.counter == 0) + recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + + if (zsbt_item_undoptr(uitem).counter <= recent_oldest_undo.counter) + continue; + } + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + /* already compressed, add as it is. */ + zsbt_attr_recompress_flush(&cxt); + cxt.total_already_compressed_items++; + zsbt_attr_recompress_add_to_page(&cxt, item); + } + else + { + /* try to add this item to the compressor */ + if (!zsbt_attr_recompress_add_to_compressor(&cxt, item)) + { + if (cxt.compressed_items > 0) + { + /* flush, and retry */ + zsbt_attr_recompress_flush(&cxt); + + if (!zsbt_attr_recompress_add_to_compressor(&cxt, item)) + { + /* could not compress, even on its own. Store it uncompressed, then */ + zsbt_attr_recompress_add_to_page(&cxt, item); + } + } + else + { + /* could not compress, even on its own. Store it uncompressed, then */ + zsbt_attr_recompress_add_to_page(&cxt, item); + } + } + } + } + + /* flush the last one, if any */ + zsbt_attr_recompress_flush(&cxt); + + zs_compress_free(&cxt.compressor); + + /* + * Ok, we now have a list of pages, to replace the original page, as private + * in-memory copies. Allocate buffers for them, and write them out. 
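+ * (A downlink is collected for each page after the first, so that the parent
+ * can be updated afterwards if we had to split.)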
+ * + * allocate all the pages before entering critical section, so that + * out-of-disk-space doesn't lead to PANIC + */ + stack = cxt.stack_head; + Assert(stack->buf == InvalidBuffer); + stack->buf = oldbuf; + while (stack->next) + { + Page thispage = stack->page; + ZSBtreePageOpaque *thisopaque = ZSBtreePageGetOpaque(thispage); + ZSBtreeInternalPageItem *downlink; + Buffer nextbuf; + + Assert(stack->next->buf == InvalidBuffer); + + nextbuf = zspage_getnewbuf(rel, InvalidBuffer); + stack->next->buf = nextbuf; + + thisopaque->zs_next = BufferGetBlockNumber(nextbuf); + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = thisopaque->zs_hikey; + downlink->childblk = BufferGetBlockNumber(nextbuf); + downlinks = lappend(downlinks, downlink); + + stack = stack->next; + } + /* last one in the chain */ + ZSBtreePageGetOpaque(stack->page)->zs_next = orignextblk; + + /* If we had to split, insert downlinks for the new pages. */ + if (cxt.stack_head->next) + { + oldopaque = ZSBtreePageGetOpaque(cxt.stack_head->page); + + if ((oldopaque->zs_flags & ZSBT_ROOT) != 0) + { + ZSBtreeInternalPageItem *downlink; + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = MinZSTid; + downlink->childblk = BufferGetBlockNumber(cxt.stack_head->buf); + downlinks = lcons(downlink, downlinks); + + cxt.stack_tail->next = zsbt_newroot(rel, attno, oldopaque->zs_level + 1, downlinks); + + /* clear the ZSBT_ROOT flag on the old root page */ + oldopaque->zs_flags &= ~ZSBT_ROOT; + } + else + { + cxt.stack_tail->next = zsbt_insert_downlinks(rel, attno, + oldopaque->zs_lokey, BufferGetBlockNumber(oldbuf), oldopaque->zs_level + 1, + downlinks); + } + /* note: stack_tail is not the real tail anymore */ + } + + /* Finally, overwrite all the pages we had to modify */ + zs_apply_split_changes(rel, cxt.stack_head); +} diff --git a/src/backend/access/zedstore/zedstore_btree.c b/src/backend/access/zedstore/zedstore_btree.c new file mode 100644 index 0000000000..108170ffee --- /dev/null +++ b/src/backend/access/zedstore/zedstore_btree.c @@ -0,0 +1,632 @@ +/* + * zedstore_btree.c + * Common routines for handling TID and attibute B-tree structures + * + * A Zedstore table consists of multiple B-trees, one to store TIDs and + * visibility information of the rows, and one tree for each attribute, + * to hold the data. The TID and attribute trees differ at the leaf + * level, but the internal pages have the same layout. This file contains + * routines to deal with internal pages, and some other common + * functionality. + * + * When dealing with the TID tree, pass ZS_META_ATTRIBUTE_NUM as the + * attribute number. + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_btree.c + */ +#include "postgres.h" + +#include "access/zedstore_internal.h" +#include "storage/bufmgr.h" +#include "storage/procarray.h" +#include "utils/rel.h" + +/* prototypes for local functions */ +static zs_split_stack *zsbt_split_internal_page(Relation rel, AttrNumber attno, + Buffer leftbuf, OffsetNumber newoff, List *downlinks); +static zs_split_stack *zsbt_merge_pages(Relation rel, AttrNumber attno, Buffer leftbuf, Buffer rightbuf, bool target_is_left); + +static int zsbt_binsrch_internal(zstid key, ZSBtreeInternalPageItem *arr, int arr_elems); + +/* + * Find the page containing the given key TID at the given level. + * + * Level 0 means leaf. 
The returned buffer is exclusive-locked. + */ +Buffer +zsbt_descend(Relation rel, AttrNumber attno, zstid key, int level, bool readonly) +{ + BlockNumber next; + Buffer buf; + Page page; + ZSBtreePageOpaque *opaque; + ZSBtreeInternalPageItem *items; + int nitems; + int itemno; + BlockNumber rootblk; + int nextlevel = -1; + BlockNumber failblk = InvalidBlockNumber; + + /* start from root */ +restart: + rootblk = zsmeta_get_root_for_attribute(rel, attno, readonly); + + if (rootblk == InvalidBlockNumber) + { + /* completely empty tree */ + return InvalidBuffer; + } + + next = rootblk; + for (;;) + { + /* + * If we arrive again to a block that was a dead-end earlier, it seems + * that the tree is corrupt. + * + * XXX: It's theoretically possible that the block was removed, but then + * added back at the same location, and removed again. So perhaps retry + * a few times? + */ + if (next == failblk || next == ZS_META_BLK) + elog(ERROR, "arrived at incorrect block %u while descending zedstore btree", next); + + buf = ReadBuffer(rel, next); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* TODO: shared */ + page = BufferGetPage(buf); + if (!zsbt_page_is_expected(rel, attno, key, nextlevel, buf)) + { + /* + * We arrived at an unexpected page. This can happen with concurrent + * splits, or page deletions. We could try following the right-link, but + * there's no guarantee that's the correct page either, so let's restart + * from the root. If we landed here because of concurrent modifications, + * the next attempt should land on the correct page. Remember that we + * incorrectly ended up on this page, so that if this happens because + * the tree is corrupt, rather than concurrent splits, and we land here + * again, we won't loop forever. + */ + failblk = next; + goto restart; + } + opaque = ZSBtreePageGetOpaque(page); + + if (nextlevel == -1) + nextlevel = opaque->zs_level; + + else if (opaque->zs_level != nextlevel) + elog(ERROR, "unexpected level encountered when descending tree"); + + if (opaque->zs_level == level) + return buf; + + /* Find the downlink and follow it */ + items = ZSBtreeInternalPageGetItems(page); + nitems = ZSBtreeInternalPageGetNumItems(page); + + itemno = zsbt_binsrch_internal(key, items, nitems); + if (itemno < 0) + elog(ERROR, "could not descend tree for tid (%u, %u)", + ZSTidGetBlockNumber(key), ZSTidGetOffsetNumber(key)); + + next = items[itemno].childblk; + nextlevel--; + + UnlockReleaseBuffer(buf); + } +} + +/* + * Check that a page is a valid B-tree page, and covers the given key. + * + * This is used when traversing the tree, to check that e.g. a concurrent page + * split didn't move pages around, so that the page we were walking to isn't + * the correct one anymore. + */ +bool +zsbt_page_is_expected(Relation rel, AttrNumber attno, zstid key, int level, Buffer buf) +{ + Page page = BufferGetPage(buf); + ZSBtreePageOpaque *opaque; + + /* + * The page might have been deleted and even reused as a completely different + * kind of a page, so we must be prepared for anything. + */ + if (PageIsNew(page)) + return false; + + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(ZSBtreePageOpaque))) + return false; + + opaque = ZSBtreePageGetOpaque(page); + if (opaque->zs_page_id != ZS_BTREE_PAGE_ID) + return false; + + if (opaque->zs_attno != attno) + return false; + + if (level != -1 && opaque->zs_level != level) + return false; + + if (opaque->zs_lokey > key || opaque->zs_hikey <= key) + return false; + + return true; +} + +/* + * Create a new btree root page, containing two downlinks. 
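+ * (The 'downlinks' list usually holds two entries, but callers such as the
+ * recompressor can pass more, one per newly created page.)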
+ * + * NOTE: the very first root page of a btree, which is also the leaf, is created + * in zsmeta_get_root_for_attribute(), not here. + * + * XXX: What if there are too many downlinks to fit on a page? Shouldn't happen + * in practice.. + */ +zs_split_stack * +zsbt_newroot(Relation rel, AttrNumber attno, int level, List *downlinks) +{ + Page metapage; + ZSMetaPage *metapg; + Buffer newrootbuf; + Page newrootpage; + ZSBtreePageOpaque *newrootopaque; + ZSBtreeInternalPageItem *items; + Buffer metabuf; + zs_split_stack *stack1; + zs_split_stack *stack2; + ListCell *lc; + int i; + + metabuf = ReadBuffer(rel, ZS_META_BLK); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* allocate a new root page */ + newrootbuf = zspage_getnewbuf(rel, metabuf); + newrootpage = palloc(BLCKSZ); + PageInit(newrootpage, BLCKSZ, sizeof(ZSBtreePageOpaque)); + newrootopaque = ZSBtreePageGetOpaque(newrootpage); + newrootopaque->zs_attno = attno; + newrootopaque->zs_next = InvalidBlockNumber; + newrootopaque->zs_lokey = MinZSTid; + newrootopaque->zs_hikey = MaxPlusOneZSTid; + newrootopaque->zs_level = level; + newrootopaque->zs_flags = ZSBT_ROOT; + newrootopaque->zs_page_id = ZS_BTREE_PAGE_ID; + + items = ZSBtreeInternalPageGetItems(newrootpage); + + /* add all the downlinks */ + i = 0; + foreach (lc, downlinks) + { + ZSBtreeInternalPageItem *downlink = (ZSBtreeInternalPageItem *) lfirst(lc); + + items[i++] = *downlink; + } + ((PageHeader) newrootpage)->pd_lower += i * sizeof(ZSBtreeInternalPageItem); + + /* FIXME: Check that all the downlinks fit on the page. */ + + /* update the metapage */ + metapage = PageGetTempPageCopy(BufferGetPage(metabuf)); + + metapg = (ZSMetaPage *) PageGetContents(metapage); + if ((attno != ZS_META_ATTRIBUTE_NUM) && (attno <= 0 || attno > metapg->nattributes)) + elog(ERROR, "invalid attribute number %d (table \"%s\" has only %d attributes)", + attno, RelationGetRelationName(rel), metapg->nattributes); + + metapg->tree_root_dir[attno].root = BufferGetBlockNumber(newrootbuf); + + stack1 = zs_new_split_stack_entry(metabuf, metapage); + stack2 = zs_new_split_stack_entry(newrootbuf, newrootpage); + stack2->next = stack1; + + return stack2; +} + +/* + * After page split, insert the downlink of 'rightblkno' to the parent. + * + * On entry, 'leftbuf' must be pinned exclusive-locked. + */ +zs_split_stack * +zsbt_insert_downlinks(Relation rel, AttrNumber attno, + zstid leftlokey, BlockNumber leftblkno, int level, + List *downlinks) +{ + int numdownlinks = list_length(downlinks); + ZSBtreeInternalPageItem *items; + int nitems; + int itemno; + Buffer parentbuf; + Page parentpage; + zs_split_stack *split_stack; + ZSBtreeInternalPageItem *firstdownlink; + + /* + * re-find parent + * + * TODO: this is a bit inefficient. Usually, we have just descended the + * tree, and if we just remembered the path we descended, we could just + * walk back up. 
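+ * (nbtree remembers the descent path in a BTStack for exactly this purpose.)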
+ */ + parentbuf = zsbt_descend(rel, attno, leftlokey, level, false); + parentpage = BufferGetPage(parentbuf); + + firstdownlink = (ZSBtreeInternalPageItem *) linitial(downlinks); + + /* Find the position in the parent for the downlink */ + items = ZSBtreeInternalPageGetItems(parentpage); + nitems = ZSBtreeInternalPageGetNumItems(parentpage); + itemno = zsbt_binsrch_internal(firstdownlink->tid, items, nitems); + + /* sanity checks */ + if (itemno < 0 || items[itemno].tid != leftlokey || + items[itemno].childblk != leftblkno) + { + elog(ERROR, "could not find downlink for block %u TID (%u, %u)", + leftblkno, ZSTidGetBlockNumber(leftlokey), + ZSTidGetOffsetNumber(leftlokey)); + } + itemno++; + + if (PageGetExactFreeSpace(parentpage) < numdownlinks * sizeof(ZSBtreeInternalPageItem)) + { + /* split internal page */ + split_stack = zsbt_split_internal_page(rel, attno, parentbuf, itemno, downlinks); + } + else + { + ZSBtreeInternalPageItem *newitems; + Page newpage; + int i; + ListCell *lc; + + newpage = PageGetTempPageCopySpecial(parentpage); + + split_stack = zs_new_split_stack_entry(parentbuf, newpage); + + /* insert the new downlink for the right page. */ + newitems = ZSBtreeInternalPageGetItems(newpage); + memcpy(newitems, items, itemno * sizeof(ZSBtreeInternalPageItem)); + + i = itemno; + foreach(lc, downlinks) + { + ZSBtreeInternalPageItem *downlink = (ZSBtreeInternalPageItem *) lfirst(lc); + + Assert(downlink->childblk != 0); + newitems[i++] = *downlink; + } + + memcpy(&newitems[i], &items[itemno], (nitems - itemno) * sizeof(ZSBtreeInternalPageItem)); + ((PageHeader) newpage)->pd_lower += (nitems + numdownlinks) * sizeof(ZSBtreeInternalPageItem); + } + return split_stack; +} + +/* + * Split an internal page. + * + * The new downlink specified by 'newkey' is inserted to position 'newoff', on 'leftbuf'. + * The page is split. 
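+ * (In the code below the page to split is 'origbuf' and the new downlinks are
+ * passed in the 'newitems' list; all of them are inserted at position
+ * 'newoff'.)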
+ */ +static zs_split_stack * +zsbt_split_internal_page(Relation rel, AttrNumber attno, Buffer origbuf, + OffsetNumber newoff, List *newitems) +{ + Page origpage = BufferGetPage(origbuf); + ZSBtreePageOpaque *origopaque = ZSBtreePageGetOpaque(origpage); + Buffer buf; + Page page; + ZSBtreeInternalPageItem *origitems; + int orignitems; + zs_split_stack *stack_first; + zs_split_stack *stack; + Size splitthreshold; + ListCell *lc; + int origitemno; + List *downlinks = NIL; + + origitems = ZSBtreeInternalPageGetItems(origpage); + orignitems = ZSBtreeInternalPageGetNumItems(origpage); + + page = PageGetTempPageCopySpecial(origpage); + buf = origbuf; + + stack = zs_new_split_stack_entry(buf, page); + stack_first = stack; + + /* XXX: currently, we always do 90/10 splits */ + splitthreshold = PageGetExactFreeSpace(page) * 0.10; + + lc = list_head(newitems); + origitemno = 0; + for (;;) + { + ZSBtreeInternalPageItem *item; + ZSBtreeInternalPageItem *p; + + if (origitemno == newoff && lc) + { + item = lfirst(lc); + lc = lnext(lc); + } + else + { + if (origitemno == orignitems) + break; + item = &origitems[origitemno]; + origitemno++; + } + + if (PageGetExactFreeSpace(page) < splitthreshold) + { + /* have to split to another page */ + ZSBtreePageOpaque *prevopaque = ZSBtreePageGetOpaque(page); + ZSBtreePageOpaque *opaque = ZSBtreePageGetOpaque(page); + BlockNumber blkno; + ZSBtreeInternalPageItem *downlink; + + buf = zspage_getnewbuf(rel, InvalidBuffer); + blkno = BufferGetBlockNumber(buf); + page = palloc(BLCKSZ); + PageInit(page, BLCKSZ, sizeof(ZSBtreePageOpaque)); + + opaque = ZSBtreePageGetOpaque(page); + opaque->zs_attno = attno; + opaque->zs_next = prevopaque->zs_next; + opaque->zs_lokey = item->tid; + opaque->zs_hikey = prevopaque->zs_hikey; + opaque->zs_level = prevopaque->zs_level; + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_BTREE_PAGE_ID; + + prevopaque->zs_next = blkno; + prevopaque->zs_hikey = item->tid; + + stack->next = zs_new_split_stack_entry(buf, page); + stack = stack->next; + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = item->tid; + downlink->childblk = blkno; + downlinks = lappend(downlinks, downlink); + } + + p = (ZSBtreeInternalPageItem *) ((char *) page + ((PageHeader) page)->pd_lower); + *p = *item; + ((PageHeader) page)->pd_lower += sizeof(ZSBtreeInternalPageItem); + } + + /* recurse to insert downlinks, if we had to split. */ + if (downlinks) + { + if ((origopaque->zs_flags & ZSBT_ROOT) != 0) + { + ZSBtreeInternalPageItem *downlink; + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = MinZSTid; + downlink->childblk = BufferGetBlockNumber(origbuf); + downlinks = lcons(downlink, downlinks); + + stack->next = zsbt_newroot(rel, attno, origopaque->zs_level + 1, downlinks); + + /* clear the ZSBT_ROOT flag on the old root page */ + ZSBtreePageGetOpaque(stack_first->page)->zs_flags &= ~ZSBT_ROOT; + } + else + { + stack->next = zsbt_insert_downlinks(rel, attno, + origopaque->zs_lokey, + BufferGetBlockNumber(origbuf), + origopaque->zs_level + 1, + downlinks); + } + } + + return stack_first; +} + + +/* + * Removes the last item from page, and unlinks the page from the tree. + * + * NOTE: you cannot remove the only leaf. Returns NULL if the page could not + * be deleted. 
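+ * (For example, zsbt_merge_pages() refuses to remove the leftmost child of a
+ * parent page, in which case NULL is returned and the caller keeps the page.)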
+ */ +zs_split_stack * +zsbt_unlink_page(Relation rel, AttrNumber attno, Buffer buf, int level) +{ + Page page = BufferGetPage(buf); + ZSBtreePageOpaque *opaque = ZSBtreePageGetOpaque(page); + Buffer leftbuf; + Buffer rightbuf; + zs_split_stack *stack; + + /* cannot currently remove the only page at its level. */ + if (opaque->zs_lokey == MinZSTid && opaque->zs_hikey == MaxPlusOneZSTid) + { + return NULL; + } + + /* + * Find left sibling. + * or if this is leftmost page, find right sibling. + */ + if (opaque->zs_lokey != MinZSTid) + { + rightbuf = buf; + leftbuf = zsbt_descend(rel, attno, opaque->zs_lokey - 1, level, false); + + stack = zsbt_merge_pages(rel, attno, leftbuf, rightbuf, false); + if (!stack) + { + UnlockReleaseBuffer(leftbuf); + return NULL; + } + } + else + { + rightbuf = zsbt_descend(rel, attno, opaque->zs_hikey, level, false); + leftbuf = buf; + stack = zsbt_merge_pages(rel, attno, leftbuf, rightbuf, true); + if (!stack) + { + UnlockReleaseBuffer(rightbuf); + return NULL; + } + } + + return stack; +} + +/* + * Page deletion: + * + * Mark page empty, remove downlink. If parent becomes empty, recursively delete it. + * + * Unlike in the nbtree index, we don't need to worry about concurrent scans. They + * will simply retry if they land on an unexpected page. + */ +static zs_split_stack * +zsbt_merge_pages(Relation rel, AttrNumber attno, Buffer leftbuf, Buffer rightbuf, bool target_is_left) +{ + Buffer parentbuf; + Page origleftpage; + Page leftpage; + Page rightpage; + ZSBtreePageOpaque *leftopaque; + ZSBtreePageOpaque *origleftopaque; + ZSBtreePageOpaque *rightopaque; + ZSBtreeInternalPageItem *parentitems; + int parentnitems; + Page parentpage; + int itemno; + zs_split_stack *stack; + zs_split_stack *stack_head; + zs_split_stack *stack_tail; + + origleftpage = BufferGetPage(leftbuf); + origleftopaque = ZSBtreePageGetOpaque(origleftpage); + rightpage = BufferGetPage(rightbuf); + rightopaque = ZSBtreePageGetOpaque(rightpage); + + /* find downlink for 'rightbuf' in the parent */ + parentbuf = zsbt_descend(rel, attno, rightopaque->zs_lokey, origleftopaque->zs_level + 1, false); + parentpage = BufferGetPage(parentbuf); + + parentitems = ZSBtreeInternalPageGetItems(parentpage); + parentnitems = ZSBtreeInternalPageGetNumItems(parentpage); + itemno = zsbt_binsrch_internal(rightopaque->zs_lokey, parentitems, parentnitems); + if (itemno < 0 || parentitems[itemno].childblk != BufferGetBlockNumber(rightbuf)) + elog(ERROR, "could not find downlink to FPM page %u", BufferGetBlockNumber(rightbuf)); + + if (parentnitems > 1 && itemno == 0) + { + /* + * Don't delete the leftmost child of a parent. That would move the + * keyspace of the parent, so we'd need to adjust the lo/hikey of + * the parent page, and the parent's downlink in the grandparent. + * Maybe later... + */ + UnlockReleaseBuffer(parentbuf); + elog(DEBUG1, "deleting leftmost child of a parent not implemented"); + return NULL; + } + + if (target_is_left) + { + /* move all items from right to left before unlinking the right page */ + leftpage = PageGetTempPageCopy(rightpage); + leftopaque = ZSBtreePageGetOpaque(leftpage); + + memcpy(leftopaque, origleftopaque, sizeof(ZSBtreePageOpaque)); + } + else + { + /* right page is empty. 
*/ + leftpage = PageGetTempPageCopy(origleftpage); + leftopaque = ZSBtreePageGetOpaque(leftpage); + } + + /* update left hikey */ + leftopaque->zs_hikey = ZSBtreePageGetOpaque(rightpage)->zs_hikey; + + Assert(ZSBtreePageGetOpaque(leftpage)->zs_level == ZSBtreePageGetOpaque(rightpage)->zs_level); + + stack = zs_new_split_stack_entry(leftbuf, leftpage); + stack_head = stack_tail = stack; + + /* Mark right page as empty/unused */ + rightpage = palloc0(BLCKSZ); + + stack = zs_new_split_stack_entry(rightbuf, rightpage); + stack->recycle = true; + stack_tail->next = stack; + stack_tail = stack; + + /* remove downlink from parent */ + if (parentnitems > 1) + { + Page newpage = PageGetTempPageCopySpecial(parentpage); + ZSBtreeInternalPageItem *newitems = ZSBtreeInternalPageGetItems(newpage); + + memcpy(newitems, parentitems, itemno * sizeof(ZSBtreeInternalPageItem)); + memcpy(&newitems[itemno], &parentitems[itemno + 1], (parentnitems - itemno -1) * sizeof(ZSBtreeInternalPageItem)); + + ((PageHeader) newpage)->pd_lower += (parentnitems - 1) * sizeof(ZSBtreeInternalPageItem); + + stack = zs_new_split_stack_entry(parentbuf, newpage); + stack_tail->next = stack; + stack_tail = stack; + } + else + { + /* the parent becomes empty as well. Recursively remove it. */ + stack_tail->next = zsbt_unlink_page(rel, attno, parentbuf, leftopaque->zs_level + 1); + if (stack_tail->next == NULL) + { + /* oops, couldn't remove the parent. Back out */ + stack = stack_head; + while (stack) + { + zs_split_stack *next = stack->next; + + pfree(stack->page); + pfree(stack); + stack = next; + } + } + } + + return stack_head; +} + +static int +zsbt_binsrch_internal(zstid key, ZSBtreeInternalPageItem *arr, int arr_elems) +{ + int low, + high, + mid; + + low = 0; + high = arr_elems; + while (high > low) + { + mid = low + (high - low) / 2; + + if (key >= arr[mid].tid) + low = mid + 1; + else + high = mid; + } + return low - 1; +} diff --git a/src/backend/access/zedstore/zedstore_compression.c b/src/backend/access/zedstore/zedstore_compression.c new file mode 100644 index 0000000000..1a1d9a018c --- /dev/null +++ b/src/backend/access/zedstore/zedstore_compression.c @@ -0,0 +1,364 @@ +/* + * zedstore_compression.c + * Routines for compression + * + * There are two implementations at the moment: LZ4, and the Postgres + * pg_lzcompress(). LZ4 support requires that the server was compiled + * with --with-lz4. + * + * The compressor works on ZSUncompressedBtreeItems. + * + * Compression interface + * --------------------- + * + * Call zs_compress_init() to initialize. + * + * Call zs_compress_begin(), to begin compressing a group of items. Pass the + * maximum amount of space it's allowed to use after compression, as argument. + * + * Feed them to the compressor one by one with zs_compress_add(), until it + * returns false. + * + * Finally, call zs_compress_finish(). It returns a ZSCompressedBtreeItem, + * which contains all the plain items that were added (except for the last one + * for which zs_compress_add() returned false) + * + * Decompression interface + * ----------------------- + * + * zs_decompress_chunk() takes a ZSCompressedBtreeItem as argument. It + * initializes a "context" with the given chunk. + * + * Call zs_decompress_read_item() to return the uncompressed items one by one. + * + * + * NOTES: + * + * Currently, the compressor accepts input, until the *uncompressed* size exceeds + * the *compressed* size available. I.e it assumes that the compressed size is never + * larger than uncompressed size. 
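+ *
+ * A typical compression cycle therefore looks roughly like this (a sketch
+ * only: error handling is omitted, and have_more_items()/next_item() stand in
+ * for whatever iteration the caller uses):
+ *
+ *     ZSCompressContext cxt;
+ *     ZSCompressedBtreeItem *citem;
+ *     ZSBtreeItem *item = next_item();
+ *
+ *     zs_compress_init(&cxt);
+ *     zs_compress_begin(&cxt, PageGetFreeSpace(page));
+ *     while (item != NULL && zs_compress_add(&cxt, item))
+ *         item = have_more_items() ? next_item() : NULL;
+ *     citem = zs_compress_finish(&cxt);   (NULL means compression failed)
+ *     ... copy citem onto the target page; it points into cxt's buffer ...
+ *     zs_compress_free(&cxt);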
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_compression.c + */ +#include "postgres.h" + +#ifdef USE_LZ4 +#include +#endif + +#include "access/zedstore_compression.h" +#include "access/zedstore_internal.h" +#include "common/pg_lzcompress.h" +#include "utils/datum.h" + + +/* + * There are two implementations at the moment: LZ4, and the Postgres + * pg_lzcompress(). LZ4 support requires that the server was compiled + * with --with-lz4. + */ +#ifdef USE_LZ4 + +/* + * Begin compression, with given max compressed size. + */ +void +zs_compress_init(ZSCompressContext *context) +{ + context->uncompressedbuffer = palloc(BLCKSZ * 10); // FIXME: arbitrary size + context->buffer = palloc(BLCKSZ); + context->maxCompressedSize = 0; + context->maxUncompressedSize = 0; + context->nitems = 0; + context->rawsize = 0; +} + +void +zs_compress_begin(ZSCompressContext *context, int maxCompressedSize) +{ + context->buffer = repalloc(context->buffer, maxCompressedSize); + + maxCompressedSize -= offsetof(ZSCompressedBtreeItem, t_payload); + if (maxCompressedSize < 0) + maxCompressedSize = 0; + + context->maxCompressedSize = maxCompressedSize; + context->nitems = 0; + context->rawsize = 0; +} + +/* + * Try to add some data to the compressed block. + * + * If it wouldn't fit, return false. + */ +bool +zs_compress_add(ZSCompressContext *context, ZSBtreeItem *item) +{ + ZSCompressedBtreeItem *chunk = (ZSCompressedBtreeItem *) context->buffer; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + Assert(item->t_tid != InvalidZSTid); + + if (LZ4_COMPRESSBOUND(context->rawsize + MAXALIGN(item->t_size)) > context->maxCompressedSize) + return false; + + memcpy(context->uncompressedbuffer + context->rawsize, item, item->t_size); + /* TODO: clear alignment padding */ + if (context->nitems == 0) + chunk->t_tid = item->t_tid; + chunk->t_lasttid = zsbt_item_lasttid(item); + context->nitems++; + context->rawsize += MAXALIGN(item->t_size); + + return true; +} + +ZSCompressedBtreeItem * +zs_compress_finish(ZSCompressContext *context) +{ + ZSCompressedBtreeItem *chunk = (ZSCompressedBtreeItem *) context->buffer; + int32 compressed_size; + + compressed_size = LZ4_compress_default(context->uncompressedbuffer, + chunk->t_payload, + context->rawsize, + context->maxCompressedSize); + if (compressed_size < 0) + return NULL; + + chunk->t_size = offsetof(ZSCompressedBtreeItem, t_payload) + compressed_size; + chunk->t_flags = ZSBT_COMPRESSED; + chunk->t_uncompressedsize = context->rawsize; + + return chunk; +} + +void +zs_compress_free(ZSCompressContext *context) +{ + pfree(context->uncompressedbuffer); + pfree(context->buffer); +} + +void +zs_decompress_init(ZSDecompressContext *context) +{ + context->buffer = NULL; + context->bufsize = 0; + context->uncompressedsize = 0; +} + +void +zs_decompress_chunk(ZSDecompressContext *context, ZSCompressedBtreeItem *chunk) +{ + Assert((chunk->t_flags & ZSBT_COMPRESSED) != 0); + Assert(chunk->t_uncompressedsize > 0); + if (context->bufsize < chunk->t_uncompressedsize) + { + if (context->buffer) + pfree(context->buffer); + context->buffer = palloc(chunk->t_uncompressedsize); + context->bufsize = chunk->t_uncompressedsize; + } + context->uncompressedsize = chunk->t_uncompressedsize; + + if (LZ4_decompress_safe(chunk->t_payload, + context->buffer, + chunk->t_size - offsetof(ZSCompressedBtreeItem, t_payload), + context->uncompressedsize) 
!= context->uncompressedsize) + elog(ERROR, "could not decompress chunk"); + + context->bytesread = 0; +} + +ZSBtreeItem * +zs_decompress_read_item(ZSDecompressContext *context) +{ + ZSBtreeItem *next; + + if (context->bytesread == context->uncompressedsize) + return NULL; + next = (ZSBtreeItem *) (context->buffer + context->bytesread); + if (context->bytesread + MAXALIGN(next->t_size) > context->uncompressedsize) + elog(ERROR, "invalid compressed item"); + context->bytesread += MAXALIGN(next->t_size); + + Assert(next->t_size >= sizeof(ZSBtreeItem)); + Assert(next->t_tid != InvalidZSTid); + + return next; +} + +void +zs_decompress_free(ZSDecompressContext *context) +{ + if (context->buffer) + pfree(context->buffer); + context->buffer = NULL; + context->bufsize = 0; + context->uncompressedsize = 0; +} + + +#else +/* PGLZ imlementation */ + +/* + * In the worst case, pg_lz outputs everything as "literals", and emits one + * "control byte" ever 8 bytes. Also, it requires 4 bytes extra at the end + * of the buffer. And add 10 bytes of slop, for good measure. + */ +#define MAX_COMPRESS_EXPANSION_OVERHEAD (8) +#define MAX_COMPRESS_EXPANSION_BYTES (4 + 10) + +/* + * Begin compression, with given max compressed size. + */ +void +zs_compress_init(ZSCompressContext *context) +{ + context->uncompressedbuffer = palloc(BLCKSZ * 10); // FIXME: arbitrary size + context->buffer = palloc(BLCKSZ); + context->maxCompressedSize = 0; + context->maxUncompressedSize = 0; + context->nitems = 0; + context->rawsize = 0; +} + +void +zs_compress_begin(ZSCompressContext *context, int maxCompressedSize) +{ + int maxUncompressedSize; + + context->buffer = repalloc(context->buffer, maxCompressedSize + 4 /* LZ slop */); + + context->maxCompressedSize = maxCompressedSize; + + /* determine the max uncompressed size */ + maxUncompressedSize = maxCompressedSize; + maxUncompressedSize -= offsetof(ZSCompressedBtreeItem, t_payload); + maxUncompressedSize -= maxUncompressedSize / MAX_COMPRESS_EXPANSION_OVERHEAD; + maxUncompressedSize -= MAX_COMPRESS_EXPANSION_BYTES; + if (maxUncompressedSize < 0) + maxUncompressedSize = 0; + context->maxUncompressedSize = maxUncompressedSize; + context->nitems = 0; + context->rawsize = 0; +} + +/* + * Try to add some data to the compressed block. + * + * If it wouldn't fit, return false. 
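+ * ('Fit' is judged against maxUncompressedSize, which zs_compress_begin()
+ * derived from the worst-case pglz expansion, so anything accepted here
+ * should still fit in the compressed buffer even if pglz cannot shrink it.)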
+ */ +bool +zs_compress_add(ZSCompressContext *context, ZSBtreeItem *item) +{ + ZSCompressedBtreeItem *chunk = (ZSCompressedBtreeItem *) context->buffer; + + Assert ((item->t_flags & ZSBT_COMPRESSED) == 0); + + if (context->rawsize + item->t_size > context->maxUncompressedSize) + return false; + + memcpy(context->uncompressedbuffer + context->rawsize, item, item->t_size); + if (context->nitems == 0) + chunk->t_tid = item->t_tid; + chunk->t_lasttid = zsbt_item_lasttid(item); + context->nitems++; + context->rawsize += MAXALIGN(item->t_size); + + return true; +} + +ZSCompressedBtreeItem * +zs_compress_finish(ZSCompressContext *context) +{ + ZSCompressedBtreeItem *chunk = (ZSCompressedBtreeItem *) context->buffer; + int32 compressed_size; + + compressed_size = pglz_compress(context->uncompressedbuffer, context->rawsize, + chunk->t_payload, + PGLZ_strategy_always); + if (compressed_size < 0) + return NULL; + + chunk->t_size = offsetof(ZSCompressedBtreeItem, t_payload) + compressed_size; + chunk->t_flags = ZSBT_COMPRESSED; + chunk->t_uncompressedsize = context->rawsize; + + return chunk; +} + +void +zs_compress_free(ZSCompressContext *context) +{ + pfree(context->uncompressedbuffer); + pfree(context->buffer); +} + +void +zs_decompress_init(ZSDecompressContext *context) +{ + context->buffer = NULL; + context->bufsize = 0; + context->uncompressedsize = 0; +} + +void +zs_decompress_chunk(ZSDecompressContext *context, ZSCompressedBtreeItem *chunk) +{ + Assert((chunk->t_flags & ZSBT_COMPRESSED) != 0); + Assert(chunk->t_uncompressedsize > 0); + if (context->bufsize < chunk->t_uncompressedsize) + { + if (context->buffer) + pfree(context->buffer); + context->buffer = palloc(chunk->t_uncompressedsize); + context->bufsize = chunk->t_uncompressedsize; + } + context->uncompressedsize = chunk->t_uncompressedsize; + + if (pglz_decompress(chunk->t_payload, + chunk->t_size - offsetof(ZSCompressedBtreeItem, t_payload), + context->buffer, + context->uncompressedsize, true) != context->uncompressedsize) + elog(ERROR, "could not decompress chunk"); + + context->bytesread = 0; +} + +ZSBtreeItem * +zs_decompress_read_item(ZSDecompressContext *context) +{ + ZSBtreeItem *next; + + if (context->bytesread == context->uncompressedsize) + return NULL; + next = (ZSBtreeItem *) (context->buffer + context->bytesread); + if (context->bytesread + MAXALIGN(next->t_size) > context->uncompressedsize) + elog(ERROR, "invalid compressed item"); + context->bytesread += MAXALIGN(next->t_size); + + Assert(next->t_size >= sizeof(ZSBtreeItem)); + Assert(next->t_tid != InvalidZSTid); + + return next; +} + +void +zs_decompress_free(ZSDecompressContext *context) +{ + if (context->buffer) + pfree(context->buffer); + context->buffer = NULL; + context->bufsize = 0; + context->uncompressedsize = 0; +} + +#endif /* !USE_LZ4 */ diff --git a/src/backend/access/zedstore/zedstore_freepagemap.c b/src/backend/access/zedstore/zedstore_freepagemap.c new file mode 100644 index 0000000000..efd01fd6c6 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_freepagemap.c @@ -0,0 +1,1076 @@ +/*------------------------------------------------------------------------- + * + * zedstore_freepagemap.c + * ZedStore free space management + * + * The Free Page Map keeps track of unused pages in the relation. + * + * The FPM is a b-tree, indexed by physical block number. To be more compact, + * it stores "extents", i.e. block ranges, rather than just blocks, when + * possible. + + * Design principles: + * + * - it's ok to have a block incorrectly stored in the FPM. 
Before actually + * reusing a page, we must check that it's safe. + * + * - a deletable page must be simple to detect just by looking at the page, + * and perhaps a few other pages. It should *not* require scanning the + * whole table, or even a whole b-tree. For example, if a column is dropped, + * we can detect if a b-tree page belongs to the dropped column just by + * looking at the information (the attribute number) stored in the page + * header. + * + * - if a page is deletable, it should become immediately reusable. No + * "wait out all possible readers that might be about to follow a link + * to it" business. All code that reads pages need to keep pages locked + * while following a link, or be prepared to retry if they land on an + * unexpected page. + * + * + * TODO: + * + * - Avoid fragmentation. If B-tree page is split, try to hand out a page + * that's close to the old page. When the relation is extended, allocate + * a larger chunk at once. + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_freepagemap.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/zedstore_internal.h" +#include "miscadmin.h" +#include "storage/bufpage.h" +#include "storage/lmgr.h" +#include "utils/rel.h" + +/* + * On-disk format of the Free Page Map. + * + * The FPM is a b-tree, indexed by block number. Each page contains a + * ZSFreePageMapOpaque in the "special area", and an array of + * ZSFreePageMapItems as the content (ie. after the normal page header, + * up to pd_lower). On an internal page, each item contains the starting + * block number, and a pointer to the child FPM page. On a leaf page, + * each entry contains the start and end of the block range that the item + * represents. + * + * The block ranges stored on leaf pages must not overlap! + */ +typedef struct +{ + BlockNumber zs_lokey; /* inclusive */ + BlockNumber zs_hikey; /* exclusive */ + uint16 zs_level; /* 0 = leaf */ + uint16 zs_flags; + char padding[2]; /* padding, to put zs_page_id last */ + uint16 zs_page_id; /* always ZS_FPM_PAGE_ID */ +} ZSFreePageMapOpaque; + +typedef struct +{ + BlockNumber zs_startblk; /* inclusive */ + union { + BlockNumber zs_endblk; /* on a leaf page, end of extent, exclusive */ + BlockNumber zs_downlink; /* on an internal page, pointer to child */ + } u; +} ZSFreePageMapItem; + +#define ZSFreePageMapGetOpaque(page) ((ZSFreePageMapOpaque *) PageGetSpecialPointer(page)) + +/* overlap, or touch? 
*/ +static inline bool +zsextent_overlap(BlockNumber start1, BlockNumber end1, BlockNumber start2, BlockNumber end2) +{ + if (start2 < end1) + return false; + if (start1 < end2) + return false; + return true; +} + +static inline ZSFreePageMapItem * +ZSFreePageMapPageGetItems(Page page) +{ + ZSFreePageMapItem *items; + + items = (ZSFreePageMapItem *) PageGetContents(page); + + return items; +} +static inline int +ZSFreePageMapPageGetNumItems(Page page) +{ + ZSFreePageMapItem *begin; + ZSFreePageMapItem *end; + + begin = (ZSFreePageMapItem *) PageGetContents(page); + end = (ZSFreePageMapItem *) ((char *) page + ((PageHeader) page)->pd_lower); + + return end - begin; +} + +static zs_split_stack *zsfpm_unlink_page(Relation rel, Buffer buf, int level, Buffer metabuf); +static zs_split_stack *zsfpm_merge_pages(Relation rel, Buffer leftbuf, Buffer rightbuf, bool target_is_left, Buffer metabuf); +static BlockNumber zsfpm_consume_page(Relation rel, Buffer metabuf); +static void zsfpm_insert(Relation rel, BlockNumber startblk, BlockNumber endblk); +static zs_split_stack *zsfpm_split(Relation rel, Buffer leftbuf, + int newpos, ZSFreePageMapItem *newitem); +static zs_split_stack *zsfpm_insert_downlink(Relation rel, Buffer leftbuf, + BlockNumber rightlokey, BlockNumber rightblkno); +static zs_split_stack *zsfpm_newroot(Relation rel, Buffer metabuf, int level, + ZSFreePageMapItem *item1, ZSFreePageMapItem *item2); +static Buffer zsfpm_descend(Relation rel, Buffer metabuf, BlockNumber key, int level); +static int zsfpm_binsrch_blkno(BlockNumber key, ZSFreePageMapItem *arr, int arr_elems); + +/* + * zspage_is_recyclable() + * + * Is the current page recyclable? + * + * It can be: + * + * - an empty, all-zeros page, + * - explicitly marked as deleted, + * - an UNDO page older than oldest_undo_ptr + * - a b-tree page belonging to a deleted attribute + * - a TOAST page belonging to a dead item + * + */ +static bool +zspage_is_recyclable(Buffer buf) +{ + if (PageIsNew(BufferGetPage(buf))) + return true; + return false; +} + + +static void +zsfpm_delete_leaf(Relation rel, Buffer buf, Buffer metabuf) +{ + Page page = BufferGetPage(buf); + ZSFreePageMapOpaque *opaque = ZSFreePageMapGetOpaque(page); + + if (opaque->zs_lokey == 0 && opaque->zs_hikey == MaxBlockNumber + 1) + { + /* Don't delete the last leaf page. Just mark it empty */ + START_CRIT_SECTION(); + + ((PageHeader) page)->pd_lower = SizeOfPageHeaderData; + + MarkBufferDirty(buf); + + /* TODO: WAL-log */ + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + + return; + } + else + { + zs_split_stack *stack; + + stack = zsfpm_unlink_page(rel, buf, 0, metabuf); + + /* apply the changes */ + zs_apply_split_changes(rel, stack); + } +} + +/* + * Removes the last item from page, and unlinks the page from the tree. + * + * + * NOTE: you cannot remove the only leaf. + */ +static zs_split_stack * +zsfpm_unlink_page(Relation rel, Buffer buf, int level, Buffer metabuf) +{ + Page page = BufferGetPage(buf); + ZSFreePageMapOpaque *opaque = ZSFreePageMapGetOpaque(page); + Buffer leftbuf; + Buffer rightbuf; + bool target_is_left; + + Assert(opaque->zs_lokey != 0 || opaque->zs_hikey != MaxBlockNumber + 1); + Assert(ZSFreePageMapPageGetNumItems(page) == 1); + + /* + * Find left sibling. + * or if this is leftmost page, find right sibling. 
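+ * (Same approach as zsbt_unlink_page() in zedstore_btree.c, but operating on
+ * the free page map tree.)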
+ */ + if (opaque->zs_lokey != 0) + { + rightbuf = buf; + leftbuf = zsfpm_descend(rel, metabuf, opaque->zs_lokey - 1, level); + target_is_left = false; + } + else + { + rightbuf = zsfpm_descend(rel, metabuf, opaque->zs_hikey, level); + leftbuf = buf; + target_is_left = true; + } + + return zsfpm_merge_pages(rel, leftbuf, rightbuf, target_is_left, metabuf); +} + +/* + * Page deletion: + * + * Mark page empty, remove downlink. If parent becomes empty, recursively delete it. + * + * Unlike in the nbtree index, we don't need to worry about concurrent scans. They + * will simply retry if they land on an unexpected page. + */ +static zs_split_stack * +zsfpm_merge_pages(Relation rel, Buffer leftbuf, Buffer rightbuf, bool target_is_left, Buffer metabuf) +{ + Buffer parentbuf; + Page origleftpage; + Page leftpage; + Page rightpage; + ZSFreePageMapOpaque *leftopaque; + ZSFreePageMapOpaque *rightopaque; + ZSFreePageMapItem *leftitems; + ZSFreePageMapItem *origleftitems; + ZSFreePageMapItem *rightitems; + ZSFreePageMapItem *parentitems; + int origleftnitems; + int rightnitems; + int parentnitems; + Page parentpage; + int itemno; + zs_split_stack *stack; + zs_split_stack *stack_head; + zs_split_stack *stack_tail; + + origleftpage = BufferGetPage(leftbuf); + leftpage = PageGetTempPageCopySpecial(origleftpage); + leftopaque = ZSFreePageMapGetOpaque(leftpage); + + origleftitems = ZSFreePageMapPageGetItems(origleftpage); + origleftnitems = ZSFreePageMapPageGetNumItems(origleftpage); + + leftitems = ZSFreePageMapPageGetItems(leftpage); + + rightpage = BufferGetPage(rightbuf); + rightopaque = ZSFreePageMapGetOpaque(rightpage); + rightitems = ZSFreePageMapPageGetItems(rightpage); + rightnitems = ZSFreePageMapPageGetNumItems(rightpage); + + /* move all items from right to left */ + + if (target_is_left) + { + Assert(origleftnitems == 1); + + memcpy(leftitems, + rightitems, + rightnitems * sizeof(ZSFreePageMapItem)); + ((PageHeader) leftpage)->pd_lower += rightnitems * sizeof(ZSFreePageMapItem); + } + else + { + origleftitems = ZSFreePageMapPageGetItems(origleftpage); + leftitems = ZSFreePageMapPageGetItems(leftpage); + + Assert(rightnitems == 1); + + memcpy(leftitems, + origleftitems, + origleftnitems * sizeof(ZSFreePageMapItem)); + } + + /* update left hikey */ + leftopaque->zs_hikey = ZSFreePageMapGetOpaque(rightpage)->zs_hikey; + + Assert(ZSFreePageMapGetOpaque(leftpage)->zs_level == ZSFreePageMapGetOpaque(rightpage)->zs_level); + + stack = zs_new_split_stack_entry(leftbuf, leftpage); + stack_head = stack_tail = stack; + + /* Mark right page as empty/unused */ + rightpage = palloc0(BLCKSZ); + + stack = zs_new_split_stack_entry(rightbuf, rightpage); + stack->recycle = true; + stack_tail->next = stack; + stack_tail = stack; + + /* find downlink for 'rightbuf' in the parent */ + parentbuf = zsfpm_descend(rel, metabuf, rightopaque->zs_lokey, leftopaque->zs_level + 1); + parentpage = BufferGetPage(parentbuf); + + parentitems = ZSFreePageMapPageGetItems(parentpage); + parentnitems = ZSFreePageMapPageGetNumItems(parentpage); + itemno = zsfpm_binsrch_blkno(rightopaque->zs_lokey, parentitems, parentnitems); + if (itemno < 0 || parentitems[itemno].u.zs_downlink != BufferGetBlockNumber(rightbuf)) + elog(ERROR, "could not find downlink to FPM page %u", BufferGetBlockNumber(rightbuf)); + + /* remove downlink from parent */ + if (parentnitems > 1) + { + Page newpage = PageGetTempPageCopySpecial(parentpage); + ZSFreePageMapItem *newitems = ZSFreePageMapPageGetItems(newpage); + + memcpy(newitems, parentitems, itemno * 
sizeof(ZSFreePageMapItem)); + memcpy(&newitems[itemno], &parentitems[itemno + 1], (parentnitems - itemno -1) * sizeof(ZSFreePageMapItem)); + + ((PageHeader) newpage)->pd_lower += (parentnitems - 1) * sizeof(ZSFreePageMapItem); + + stack = zs_new_split_stack_entry(parentbuf, newpage); + stack_tail->next = stack; + stack_tail = stack; + } + else + { + /* the parent becomes empty as well. Recursively remove it. */ + stack_tail->next = zsfpm_unlink_page(rel, parentbuf, leftopaque->zs_level + 1, metabuf); + } + return stack_head; +} + +/* + * Allocate a new page. + * + * The page is exclusive-locked, but not initialized. + */ +Buffer +zspage_getnewbuf(Relation rel, Buffer metabuf) +{ + bool release_metabuf; + Buffer buf; + BlockNumber blk; + + if (metabuf == InvalidBuffer) + { + metabuf = ReadBuffer(rel, ZS_META_BLK); + LockBuffer(metabuf, BUFFER_LOCK_SHARE); + release_metabuf = true; + } + else + release_metabuf = false; + +retry: + /* Get a block from the FPM. */ + blk = zsfpm_consume_page(rel, metabuf); + if (blk == 0) + { + /* metapage, not expected */ + elog(ERROR, "could not find valid page in FPM"); + } + if (blk == InvalidBlockNumber) + { + /* No free pages. Have to extend the relation. */ + buf = zspage_extendrel_newbuf(rel); + blk = BufferGetBlockNumber(buf); + } + else + { + buf = ReadBuffer(rel, blk); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* Check that the page really is unused. */ + if (!zspage_is_recyclable(buf)) + { + UnlockReleaseBuffer(buf); + goto retry; + } + } + + if (release_metabuf) + UnlockReleaseBuffer(metabuf); + return buf; +} + +/* + * Extend the relation. + * + * Returns the new page, exclusive-locked. + */ +Buffer +zspage_extendrel_newbuf(Relation rel) +{ + Buffer buf; + bool needLock; + + /* + * Extend the relation by one page. + * + * We have to use a lock to ensure no one else is extending the rel at + * the same time, else we will both try to initialize the same new + * page. We can skip locking for new or temp relations, however, + * since no one else could be accessing them. + */ + needLock = !RELATION_IS_LOCAL(rel); + + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + + buf = ReadBuffer(rel, P_NEW); + + /* Acquire buffer lock on new page */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Release the file-extension lock; it's now OK for someone else to + * extend the relation some more. Note that we cannot release this + * lock before we have buffer lock on the new page, or we risk a race + * condition against btvacuumscan --- see comments therein. + */ + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + return buf; +} + + +/* + * Explictly mark a page as deleted and recyclable, and add it to the FPM. + * + * The caller must hold an exclusive-lock on the page. + */ +void +zspage_delete_page(Relation rel, Buffer buf) +{ + BlockNumber blk = BufferGetBlockNumber(buf); + Page page; + + page = BufferGetPage(buf); + memset(page, 0, BLCKSZ); + + zsfpm_insert(rel, blk, blk + 1); +} + +/* + * Remove and return a page from the FPM. + */ +static BlockNumber +zsfpm_consume_page(Relation rel, Buffer metabuf) +{ + /* TODO: add some smarts, to allocate the page nearby old page, etc. */ + /* currently, we just pick the first available page. 
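+	 *
+	 * For example (illustrative contents): if the leftmost FPM leaf holds
+	 * the ranges [10, 14) and [20, 21), we hand out block 10 and shrink the
+	 * first range to [11, 14). Once a range becomes empty it is removed
+	 * from the page, and when the last range is gone the leaf itself is
+	 * deleted via zsfpm_delete_leaf().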
*/ + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber rootblk; + Buffer buf; + Page page; + ZSFreePageMapItem *items; + int nitems; + BlockNumber result; + + metapage = BufferGetPage(metabuf); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + rootblk = metaopaque->zs_fpm_root; + + if (rootblk == InvalidBlockNumber) + return InvalidBlockNumber; + + buf = zsfpm_descend(rel, metabuf, 0, 0); + page = BufferGetPage(buf); + + items = ZSFreePageMapPageGetItems(page); + nitems = ZSFreePageMapPageGetNumItems(page); + + if (nitems == 0) + { + UnlockReleaseBuffer(buf); + return InvalidBlockNumber; + } + + result = items[0].zs_startblk; + items[0].zs_startblk++; + if (items[0].u.zs_endblk == items[0].zs_startblk) + { + if (nitems > 1) + { + memmove(&items[0], + &items[1], + (nitems - 1) * sizeof(ZSFreePageMapItem)); + ((PageHeader) page)->pd_lower -= sizeof(ZSFreePageMapItem); + + UnlockReleaseBuffer(buf); + } + else + { + zsfpm_delete_leaf(rel, buf, metabuf); + /* zsfpm_delete_leaf() released 'buf' */ + } + } + else + { + UnlockReleaseBuffer(buf); + } + return result; +} + +/* + * Add a block range to the FPM. + */ +static void +zsfpm_insert(Relation rel, BlockNumber startblk, BlockNumber endblk) +{ + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber rootblk; + Buffer buf; + Page page; + ZSFreePageMapItem *items; + int nitems; + int pos; + int replacepos_first; + int replacepos_last; + + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* TODO: get shared lock first */ + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + rootblk = metaopaque->zs_fpm_root; + + if (rootblk == InvalidBlockNumber) + { + /* Create a new FPM root page */ + ZSFreePageMapOpaque *opaque; + + buf = zspage_extendrel_newbuf(rel); + page = BufferGetPage(buf); + rootblk = BufferGetBlockNumber(buf); + + PageInit(page, BLCKSZ, sizeof(ZSFreePageMapOpaque)); + opaque = ZSFreePageMapGetOpaque(page); + opaque->zs_lokey = 0; + opaque->zs_hikey = MaxBlockNumber + 1; + opaque->zs_level = 0; + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_FPM_PAGE_ID; + + metaopaque->zs_fpm_root = rootblk; + + items = ZSFreePageMapPageGetItems(page); + Assert(ZSFreePageMapPageGetNumItems(page) == 0); + items[0].zs_startblk = startblk; + items[0].u.zs_endblk = endblk; + + /* TODO: WAL-logging */ + + MarkBufferDirty(metabuf); + MarkBufferDirty(buf); + + UnlockReleaseBuffer(metabuf); + UnlockReleaseBuffer(buf); + return; + } + + /* Descend to the correct leaf page for this block */ + + buf = zsfpm_descend(rel, metabuf, startblk, 0); + + UnlockReleaseBuffer(metabuf); + + page = BufferGetPage(buf); + items = ZSFreePageMapPageGetItems(page); + nitems = ZSFreePageMapPageGetNumItems(page); + + pos = zsfpm_binsrch_blkno(startblk, items, nitems); + + /* FIXME: this merging business won't work correctly if the range crosses + * a b-tree page boundary. Not a problem currently, when we only insert + * individual pages. + */ + + /* Check if this item can be merged with the previous item */ + replacepos_first = -1; + if (pos >= 0 && items[pos].u.zs_endblk >= startblk) + { + replacepos_first = pos; + } + /* If not, can this be merged with the next item? 
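+	 *
+	 * (Illustrative example: inserting [15, 16) when the next item is
+	 * [16, 20) extends that item to cover [15, 20).)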
*/ + else if (pos + 1 < nitems && endblk >= items[pos + 1].zs_startblk) + { + /* yes, merge */ + replacepos_first = pos + 1; + } + + if (replacepos_first >= 0) + { + /* adjust the start block of this item */ + if (startblk < items[replacepos_first].zs_startblk) + { + items[replacepos_first].zs_startblk = startblk; + } + + /* + * The new end block might overlap with any number of existing + * ranges. Replace all overlapping ranges with one range that + * covers them all. + */ + replacepos_last = replacepos_first; + if (endblk > items[replacepos_first].u.zs_endblk) + { + int j; + BlockNumber replace_end; + + replace_end = endblk; + + for (j = replacepos_first + 1; j < nitems; j++) + { + if (items[j].zs_startblk > replace_end) + break; + + /* + * This item will be replaced. Check the end, to see + * if this is the last one that can be replaced. + */ + replacepos_last = j; + + if (items[j].u.zs_endblk > replace_end) + { + replace_end = items[j].u.zs_endblk; + break; + } + } + + items[replacepos_first].u.zs_endblk = replace_end; + } + + /* we already adjusted the item at 'replacepos_first'. Remove the rest. */ + if (replacepos_last > replacepos_first) + { + int move_items = nitems - replacepos_last; + int remain_items = nitems - (replacepos_last - replacepos_first); + + if (move_items > 0) + memmove(&items[replacepos_first + 1], + &items[replacepos_last + 1], + move_items * sizeof(ZSFreePageMapItem)); + + ((PageHeader) page)->pd_lower = SizeOfPageHeaderData + remain_items * sizeof(ZSFreePageMapItem); + + } + + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); + + return; + } + + /* + * No overlap with any existing ranges. Add a new one. This might require + * splitting the page. + */ + pos = pos + 1; + + if (PageGetExactFreeSpace(page) >= sizeof(ZSFreePageMapItem)) + { + START_CRIT_SECTION(); + + memmove(&items[pos + 1], + &items[pos], + (nitems - pos) * sizeof(ZSFreePageMapItem)); + + items[pos].zs_startblk = startblk; + items[pos].u.zs_endblk = endblk; + + ((PageHeader) page)->pd_lower += sizeof(ZSFreePageMapItem); + + /* TODO: WAL-log */ + + MarkBufferDirty(buf); + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + return; + } + else + { + /* last resort: split the page */ + zs_split_stack *split_stack; + ZSFreePageMapItem newitem; + + newitem.zs_startblk = startblk; + newitem.u.zs_endblk = endblk; + split_stack = zsfpm_split(rel, buf, pos, &newitem); + + /* write out the temporary page copies */ + zs_apply_split_changes(rel, split_stack); + } +} + +/* + * Insert a downlink for right page, after splitting 'leftbuf' FPM page. + */ +static zs_split_stack * +zsfpm_insert_downlink(Relation rel, Buffer leftbuf, + BlockNumber rightlokey, BlockNumber rightblkno) +{ + Buffer parentbuf; + Page leftpage = BufferGetPage(leftbuf); + BlockNumber leftblkno = BufferGetBlockNumber(leftbuf); + ZSFreePageMapOpaque *leftopaque = ZSFreePageMapGetOpaque(leftpage); + zstid leftlokey = leftopaque->zs_lokey; + ZSFreePageMapItem downlink; + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber rootblk; + Page parentpage; + ZSFreePageMapItem *items; + int nitems; + int pos; + zs_split_stack *split_stack; + + /* + * First, find the parent of 'leftbuf'. + * + * TODO: this is a bit inefficient. Usually, we have just descended the + * tree, and if we just remembered the path we descended, we could just + * walk back up. 
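+	 *
+	 * Instead, we re-read the root from the metapage and use zsfpm_descend()
+	 * with the left page's lokey, one level above the page that was split;
+	 * if the left page is the root itself, we grow the tree with
+	 * zsfpm_newroot() instead.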
+ */ + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_SHARE); + metapage = BufferGetPage(metabuf); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + rootblk = metaopaque->zs_fpm_root; + + if (rootblk == BufferGetBlockNumber(leftbuf)) + { + /* Root split. Create new root with downlinks for the left and right page. */ + ZSFreePageMapItem downlink1; + ZSFreePageMapItem downlink2; + + /* re-acquire the lock on metapage in exclusive mode */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + /* + * No one should have been able to change the root pointer, because we were + * holding a lock on the root page + */ + Assert(metaopaque->zs_fpm_root == BufferGetBlockNumber(leftbuf)); + + downlink1.zs_startblk = leftlokey; + downlink1.u.zs_downlink = leftblkno; + downlink2.zs_startblk = rightlokey; + downlink2.u.zs_downlink = rightblkno; + + return zsfpm_newroot(rel, metabuf, leftopaque->zs_level + 1, + &downlink1, &downlink2); + } + + UnlockReleaseBuffer(metabuf); + + parentbuf = zsfpm_descend(rel, metabuf, leftlokey, leftopaque->zs_level + 1); + parentpage = BufferGetPage(parentbuf); + + downlink.zs_startblk = rightlokey; + downlink.u.zs_downlink = rightblkno; + + /* insert the item */ + items = ZSFreePageMapPageGetItems(parentpage); + nitems = ZSFreePageMapPageGetNumItems(parentpage); + + pos = zsfpm_binsrch_blkno(rightlokey, items, nitems); + pos = pos + 1; + + if (PageGetExactFreeSpace(parentpage) >= sizeof(ZSFreePageMapItem)) + { + ZSFreePageMapItem *newitems; + Page newpage; + + newpage = PageGetTempPageCopySpecial(parentpage); + + split_stack = zs_new_split_stack_entry(parentbuf, newpage); + + newitems = ZSFreePageMapPageGetItems(newpage); + memcpy(newitems, items, pos * sizeof(ZSFreePageMapItem)); + + newitems[pos] = downlink; + + memcpy(&newitems[pos + 1], &items[pos], (nitems - pos) * sizeof(ZSFreePageMapItem)); + + ((PageHeader) newpage)->pd_lower += (nitems + 1) * sizeof(ZSFreePageMapItem); + + } + else + { + /* have to split the page. */ + split_stack = zsfpm_split(rel, parentbuf, pos, &downlink); + } + return split_stack; +} + +/* + * Split a page for insertion of 'newitem', at 'newpos'. + * + * A page split needs to modify the page being split, the block allocated for + * the new page, and also the downlink in the parent. If the parent needs to + * be split as well, its parent also needs to be recursively updated, all the + * way up to the root page, in the worst case. zsfpm_split() doesn't modify + * any pages directly, but locks them exclusively, and returns a list of + * zs_split_stack structs to represent the modifications. The caller must + * WAL-log and apply all the changes represented by the list. 
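+ *
+ * In this file the caller does that by handing the returned list to
+ * zs_apply_split_changes(); see zsfpm_insert() and zsfpm_delete_leaf() for
+ * examples.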
+ */ +static zs_split_stack * +zsfpm_split(Relation rel, Buffer leftbuf, int newpos, ZSFreePageMapItem *newitem) +{ + Buffer rightbuf; + Page origpage = BufferGetPage(leftbuf); + Page leftpage; + Page rightpage; + BlockNumber rightblkno; + ZSFreePageMapOpaque *leftopaque; + ZSFreePageMapOpaque *rightopaque; + ZSFreePageMapItem *origitems; + ZSFreePageMapItem *leftitems; + ZSFreePageMapItem *rightitems; + int orignitems; + int leftnitems; + int rightnitems; + int splitpoint; + BlockNumber splitkey; + bool newitemonleft; + int i; + zs_split_stack *stack1; + zs_split_stack *stack2; + + leftpage = PageGetTempPageCopySpecial(origpage); + leftopaque = ZSFreePageMapGetOpaque(leftpage); + + /* + * FIXME: can't use the FPM to get a page, because we might deadlock with + * ourself. We could steal a block from the page we're splitting... + */ + rightbuf = zspage_extendrel_newbuf(rel); + rightblkno = BufferGetBlockNumber(rightbuf); + + rightpage = palloc(BLCKSZ); + PageInit(rightpage, BLCKSZ, sizeof(ZSFreePageMapOpaque)); + rightopaque = ZSFreePageMapGetOpaque(rightpage); + + /* + * Figure out the split point. + * + * TODO: currently, always do 90/10 split. + */ + origitems = ZSFreePageMapPageGetItems(origpage); + orignitems = ZSFreePageMapPageGetNumItems(origpage); + splitpoint = orignitems * 0.9; + splitkey = origitems[splitpoint].zs_startblk; + newitemonleft = (newitem->zs_startblk < splitkey); + + /* Set up the page headers */ + rightopaque->zs_lokey = splitkey; + rightopaque->zs_hikey = leftopaque->zs_hikey; + rightopaque->zs_level = leftopaque->zs_level; + rightopaque->zs_flags = 0; + rightopaque->zs_page_id = ZS_FPM_PAGE_ID; + + leftopaque->zs_hikey = splitkey; + + /* copy the items */ + leftitems = ZSFreePageMapPageGetItems(leftpage); + leftnitems = 0; + rightitems = ZSFreePageMapPageGetItems(rightpage); + rightnitems = 0; + + for (i = 0; i < orignitems; i++) + { + if (i == newpos) + { + if (newitemonleft) + leftitems[leftnitems++] = *newitem; + else + rightitems[rightnitems++] = *newitem; + } + + if (i < splitpoint) + leftitems[leftnitems++] = origitems[i]; + else + rightitems[rightnitems++] = origitems[i]; + } + /* cope with possibility that newitem goes at the end */ + if (i <= newpos) + { + Assert(!newitemonleft); + rightitems[rightnitems++] = *newitem; + } + ((PageHeader) leftpage)->pd_lower += leftnitems * sizeof(ZSFreePageMapItem); + ((PageHeader) rightpage)->pd_lower += rightnitems * sizeof(ZSFreePageMapItem); + + Assert(leftnitems + rightnitems == orignitems + 1); + + stack1 = zs_new_split_stack_entry(leftbuf, leftpage); + stack2 = zs_new_split_stack_entry(rightbuf, rightpage); + stack1->next = stack2; + + /* recurse to insert downlink. 
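+	 *
+	 * If the parent overflows as well, zsfpm_insert_downlink() splits it in
+	 * turn and chains those entries onto the same list, so 'stack1' always
+	 * heads the complete set of changes to apply.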
*/ + stack2->next = zsfpm_insert_downlink(rel, leftbuf, splitkey, rightblkno); + + return stack1; +} + +static zs_split_stack * +zsfpm_newroot(Relation rel, Buffer metabuf, int level, + ZSFreePageMapItem *item1, ZSFreePageMapItem *item2) +{ + /* Create a new FPM root page */ + Page metapage; + ZSMetaPageOpaque *metaopaque; + ZSFreePageMapOpaque *opaque; + Buffer buf; + Page page; + BlockNumber rootblk; + ZSFreePageMapItem *items; + zs_split_stack *stack1; + zs_split_stack *stack2; + + metapage = PageGetTempPageCopy(BufferGetPage(metabuf)); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + /* TODO: get the page from the FPM */ + buf = zspage_extendrel_newbuf(rel); + rootblk = BufferGetBlockNumber(buf); + + page = palloc(BLCKSZ); + PageInit(page, BLCKSZ, sizeof(ZSFreePageMapOpaque)); + opaque = ZSFreePageMapGetOpaque(page); + opaque->zs_lokey = 0; + opaque->zs_hikey = MaxBlockNumber + 1; + opaque->zs_level = level; + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_FPM_PAGE_ID; + + items = ZSFreePageMapPageGetItems(page); + items[0] = *item1; + items[1] = *item2; + ((PageHeader) page)->pd_lower += 2 * sizeof(ZSFreePageMapItem); + Assert(ZSFreePageMapPageGetNumItems(page) == 2); + + metaopaque->zs_fpm_root = rootblk; + + stack1 = zs_new_split_stack_entry(metabuf, metapage); + + stack2 = zs_new_split_stack_entry(buf, page); + stack2->next = stack1; + + return stack2; +} + +static Buffer +zsfpm_descend(Relation rel, Buffer metabuf, BlockNumber key, int level) +{ + BlockNumber next; + Buffer buf; + Page page; + ZSFreePageMapOpaque *opaque; + ZSFreePageMapItem *items; + int nitems; + int itemno; + int nextlevel = -1; + BlockNumber failblk = InvalidBlockNumber; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber rootblk; + + metapage = BufferGetPage(metabuf); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + rootblk = metaopaque->zs_fpm_root; + + next = rootblk; + for (;;) + { + /* + * If we arrive again to a block that was a dead-end earlier, it seems + * that the tree is corrupt. + * + * XXX: It's theoretically possible that the block was removed, but then + * added back at the same location, and removed again. So perhaps retry + * a few times? + */ + if (next == failblk) + elog(ERROR, "could not descend to block %u in FPM", key); + + buf = ReadBuffer(rel, next); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* TODO: shared */ + page = BufferGetPage(buf); + opaque = ZSFreePageMapGetOpaque(page); + + if (nextlevel == -1) + nextlevel = opaque->zs_level; + else if (opaque->zs_level != nextlevel) + elog(ERROR, "unexpected level encountered when descending FPM tree"); + + if (opaque->zs_level < level) + elog(ERROR, "unexpected page level encountered"); + + /* + * Do we need to walk right? This could happen if the page was concurrently split. + * + * XXX: actually, we restart from root. We're holding a lock on the metapage, + * so the root cannot change. 
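+		 *
+		 * We remember the dead end in 'failblk' before restarting; arriving
+		 * at the same block a second time is treated as tree corruption and
+		 * raises an error.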
+ */ + if (key >= opaque->zs_hikey) + { + /* Restart from the root */ + failblk = next; + next = rootblk; + nextlevel = -1; + } + else + { + if (opaque->zs_level == level) + return buf; + + /* Find the downlink and follow it */ + items = ZSFreePageMapPageGetItems(page); + nitems = ZSFreePageMapPageGetNumItems(page); + + itemno = zsfpm_binsrch_blkno(key, items, nitems); + + if (itemno < 0) + elog(ERROR, "could not descend FPM tree for key blk %u", key); + + next = items[itemno].u.zs_downlink; + nextlevel--; + } + UnlockReleaseBuffer(buf); + } +} + + +static int +zsfpm_binsrch_blkno(BlockNumber key, ZSFreePageMapItem *arr, int arr_elems) +{ + int low, + high, + mid; + + low = 0; + high = arr_elems; + while (high > low) + { + mid = low + (high - low) / 2; + + if (key >= arr[mid].zs_startblk) + low = mid + 1; + else + high = mid; + } + return low - 1; +} diff --git a/src/backend/access/zedstore/zedstore_inspect.c b/src/backend/access/zedstore/zedstore_inspect.c new file mode 100644 index 0000000000..4992c52102 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_inspect.c @@ -0,0 +1,448 @@ +/*------------------------------------------------------------------------- + * + * zedstoream_inspect.c + * Debugging functions, for viewing ZedStore page contents + * + * These should probably be moved to contrib/, but it's handy to have them + * here during development. + * + * Example queries + * --------------- + * + * How many pages of each type a table has? + * + * select count(*), pg_zs_page_type('t_zedstore', g) + * from generate_series(0, pg_table_size('t_zedstore') / 8192 - 1) g group by 2; + * + * count | pg_zs_page_type + * -------+----------------- + * 1 | META + * 3701 | BTREE + * 6 | UNDO + * (3 rows) + * + * Compression ratio of B-tree leaf pages (other pages are not compressed): + * + * select sum(uncompressedsz::numeric) / sum(totalsz) as compratio + * from pg_zs_btree_pages('t_zedstore') ; + * compratio + * -------------------- + * 3.6623829559208134 + * (1 row) + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstoream_inspect.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "miscadmin.h" + +#include "access/relscan.h" +#include "access/table.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "commands/vacuum.h" +#include "funcapi.h" +#include "utils/builtins.h" +#include "utils/rel.h" + +Datum pg_zs_page_type(PG_FUNCTION_ARGS); +Datum pg_zs_undo_pages(PG_FUNCTION_ARGS); +Datum pg_zs_btree_pages(PG_FUNCTION_ARGS); + +Datum +pg_zs_page_type(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + uint64 pageno = PG_GETARG_INT64(1); + Relation rel; + uint16 zs_page_id; + Buffer buf; + Page page; + char *result; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use zedstore inspection functions")))); + + rel = table_open(relid, AccessShareLock); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + buf = ReadBuffer(rel, pageno); + LockBuffer(buf, BUFFER_LOCK_SHARE); + page = BufferGetPage(buf); + + zs_page_id = *((uint16 *) ((char *) page + BLCKSZ - sizeof(uint16))); + + UnlockReleaseBuffer(buf); + + table_close(rel, AccessShareLock); + + switch (zs_page_id) + { + case ZS_META_PAGE_ID: + result = "META"; + break; + case ZS_BTREE_PAGE_ID: + result = "BTREE"; + break; + case ZS_UNDO_PAGE_ID: + result = "UNDO"; + break; + case ZS_TOAST_PAGE_ID: + result = "TOAST"; + break; + case ZS_FPM_PAGE_ID: + result = "FPM"; + break; + default: + result = psprintf("UNKNOWN 0x%04x", zs_page_id); + } + + PG_RETURN_TEXT_P(cstring_to_text(result)); +} + +/* + * blkno int8 + * nrecords int4 + * freespace int4 + * firstrecptr int8 + * lastrecptr int8 + */ +Datum +pg_zs_undo_pages(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Relation rel; + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber firstblk; + BlockNumber blkno; + char *ptr; + char *endptr; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use zedstore inspection functions")))); + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Switch into long-lived context to construct returned data structures */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + rel = table_open(relid, AccessShareLock); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + /* + * Get the current oldest undo page from the metapage. + */ + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_SHARE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + firstblk = metaopaque->zs_undo_head; + + UnlockReleaseBuffer(metabuf); + + /* + * Loop through UNDO records, starting from the oldest page. 
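+	 *
+	 * Each UNDO page becomes one result row: (blkno, nrecords, freespace,
+	 * firstrecptr, lastrecptr). We follow the chain through each page's
+	 * 'next' pointer until it is InvalidBlockNumber.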
+ */ + blkno = firstblk; + while (blkno != InvalidBlockNumber) + { + Datum values[5]; + bool nulls[5]; + Buffer buf; + Page page; + ZSUndoPageOpaque *opaque; + int nrecords; + ZSUndoRecPtr firstptr = { 0, 0, 0 }; + ZSUndoRecPtr lastptr = { 0, 0, 0 }; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + CHECK_FOR_INTERRUPTS(); + + /* Read the UNDO page */ + buf = ReadBuffer(rel, blkno); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + { + elog(WARNING, "unexpected page id on UNDO page %u", blkno); + break; + } + + /* loop through all records on the page */ + endptr = (char *) page + ((PageHeader) page)->pd_lower; + ptr = (char *) page + SizeOfPageHeaderData; + nrecords = 0; + while (ptr < endptr) + { + ZSUndoRec *undorec = (ZSUndoRec *) ptr; + + Assert(undorec->undorecptr.blkno == blkno); + + lastptr = undorec->undorecptr; + if (nrecords == 0) + firstptr = lastptr; + nrecords++; + + ptr += undorec->size; + } + + values[0] = Int64GetDatum(blkno); + values[1] = Int32GetDatum(nrecords); + values[2] = Int32GetDatum(PageGetExactFreeSpace(page)); + values[3] = Int64GetDatum(firstptr.counter); + values[4] = Int64GetDatum(lastptr.counter); + + blkno = opaque->next; + UnlockReleaseBuffer(buf); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + tuplestore_donestoring(tupstore); + + table_close(rel, AccessShareLock); + + return (Datum) 0; +} + + +/* + * blkno int8 + * nextblk int8 + * attno int4 + * level int4 + * + * lokey int8 + * hikey int8 + + * nitems int4 + * ncompressed int4 + * totalsz int4 + * uncompressedsz int4 + * freespace int4 + */ +Datum +pg_zs_btree_pages(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Relation rel; + BlockNumber blkno; + BlockNumber nblocks; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use zedstore inspection functions")))); + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Switch into long-lived context to construct returned data structures */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + rel = table_open(relid, AccessShareLock); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + nblocks = RelationGetNumberOfBlocks(rel); + + /* scan all blocks in physical order */ + for (blkno = 1; blkno < nblocks; blkno++) + { + Datum values[11]; + bool nulls[11]; + OffsetNumber off; + OffsetNumber maxoff; + Buffer buf; + Page page; + ZSBtreePageOpaque *opaque; + int nitems; + int ncompressed; + int totalsz; + int uncompressedsz; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + CHECK_FOR_INTERRUPTS(); + + /* Read the page */ + buf = ReadBuffer(rel, blkno); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * we're only interested in B-tree pages. (Presumably, most of the + * pages in the relation are b-tree pages, so it makes sense to + * scan the whole relation in physical order) + */ + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(ZSBtreePageOpaque))) + { + UnlockReleaseBuffer(buf); + continue; + } + opaque = (ZSBtreePageOpaque *) PageGetSpecialPointer(page); + if (opaque->zs_page_id != ZS_BTREE_PAGE_ID) + { + UnlockReleaseBuffer(buf); + continue; + } + + nitems = 0; + ncompressed = 0; + totalsz = 0; + uncompressedsz = 0; + if (opaque->zs_level == 0) + { + /* leaf page */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + ZSBtreeItem *item = (ZSBtreeItem *) PageGetItem(page, iid); + + nitems++; + totalsz += item->t_size; + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) PageGetItem(page, iid); + + ncompressed++; + uncompressedsz += citem->t_uncompressedsize; + } + else + uncompressedsz += item->t_size; + } + } + else + { + /* internal page */ + nitems = ZSBtreeInternalPageGetNumItems(page); + } + values[0] = Int64GetDatum(blkno); + values[1] = Int64GetDatum(opaque->zs_next); + values[2] = Int32GetDatum(opaque->zs_attno); + values[3] = Int32GetDatum(opaque->zs_level); + values[4] = Int64GetDatum(opaque->zs_lokey); + values[5] = Int64GetDatum(opaque->zs_hikey); + values[6] = Int32GetDatum(nitems); + if (opaque->zs_level == 0) + { + values[7] = Int32GetDatum(ncompressed); + values[8] = Int32GetDatum(totalsz); + values[9] = Int32GetDatum(uncompressedsz); + } + else + { + nulls[7] = true; + nulls[8] = true; + nulls[9] = true; + } + values[10] = Int32GetDatum(PageGetExactFreeSpace(page)); + + UnlockReleaseBuffer(buf); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + tuplestore_donestoring(tupstore); + + table_close(rel, AccessShareLock); + + return (Datum) 0; +} diff --git a/src/backend/access/zedstore/zedstore_meta.c b/src/backend/access/zedstore/zedstore_meta.c new file mode 100644 index 0000000000..a415645914 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_meta.c @@ -0,0 +1,216 @@ +/* + * zedstore_meta.c + * Routines for handling ZedStore metapage + * + * The metapage holds a directory of B-tree root block numbers, one for each + * column. + * + * TODO: + * - support ALTER TABLE ADD COLUMN. 
+ * - extend the root block dir to an overflow page if there are too many + * attributes to fit on one page + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_meta.c + */ +#include "postgres.h" + +#include "access/itup.h" +#include "access/zedstore_internal.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/rel.h" + +static void +zsmeta_add_root_for_attributes(Relation rel, Page page, bool init) +{ + int natts = RelationGetNumberOfAttributes(rel) + 1; + int cur_natts; + int maxatts; + Size freespace; + ZSMetaPage *metapg; + + /* Initialize the attribute root dir for new attribute */ + freespace = PageGetExactFreeSpace(page); + maxatts = freespace / sizeof(ZSRootDirItem); + if (natts > maxatts) + { + /* + * The root block directory must fit on the metapage. + * + * TODO: We could extend this by overflowing to another page. + */ + elog(ERROR, "too many attributes for zedstore"); + } + + metapg = (ZSMetaPage *) PageGetContents(page); + + if (init) + metapg->nattributes = 0; + + for (cur_natts = metapg->nattributes; cur_natts < natts; cur_natts++) + { + metapg->tree_root_dir[cur_natts].root = InvalidBlockNumber; + } + + metapg->nattributes = natts; + ((PageHeader) page)->pd_lower += sizeof(ZSRootDirItem); +} + +/* + * Initialize the metapage for an empty relation. + */ +void +zsmeta_initmetapage(Relation rel) +{ + Buffer buf; + Page page; + ZSMetaPageOpaque *opaque; + + /* + * It's possible that we error out when building the metapage, if there + * are too many attribute, so work on a temporary copy first, before actually + * allocating the buffer. + */ + page = palloc(BLCKSZ); + PageInit(page, BLCKSZ, sizeof(ZSMetaPageOpaque)); + zsmeta_add_root_for_attributes(rel, page, true); + + opaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(page); + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_META_PAGE_ID; + + /* UNDO-related fields */ + opaque->zs_undo_counter = 1; /* start at 1, so that 0 is always "old" */ + opaque->zs_undo_head = InvalidBlockNumber; + opaque->zs_undo_tail = InvalidBlockNumber; + opaque->zs_undo_oldestptr.counter = 1; + + opaque->zs_fpm_root = InvalidBlockNumber; + + /* Ok, write it out to disk */ + buf = ReadBuffer(rel, P_NEW); + if (BufferGetBlockNumber(buf) != ZS_META_BLK) + elog(ERROR, "index is not empty"); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + PageRestoreTempPage(page, BufferGetPage(buf)); + + MarkBufferDirty(buf); + /* TODO: WAL-log */ + + UnlockReleaseBuffer(buf); +} + +/* + * Get the block number of the b-tree root for given attribute. + * + * If 'readonly' is true, and the root doesn't exist yet (ie. it's an empty + * table), returns InvalidBlockNumber. Otherwise new root is allocated if + * the root doesn't exist. 
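+ *
+ * For example, a read-only caller looking at an empty table simply gets
+ * InvalidBlockNumber back, while an inserting caller (readonly = false)
+ * receives a freshly initialized, empty root leaf.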
+ */ +BlockNumber +zsmeta_get_root_for_attribute(Relation rel, AttrNumber attno, bool readonly) +{ + Buffer metabuf; + ZSMetaPage *metapg; + BlockNumber rootblk; + Page page; + + if (RelationGetNumberOfBlocks(rel) == 0) + { + if (readonly) + return InvalidBlockNumber; + + zsmeta_initmetapage(rel); + } + + metabuf = ReadBuffer(rel, ZS_META_BLK); + + /* TODO: get share lock to begin with */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(metabuf); + metapg = (ZSMetaPage *) PageGetContents(page); + + if ((attno != ZS_META_ATTRIBUTE_NUM) && attno <= 0) + elog(ERROR, "invalid attribute number %d (table has only %d attributes)", attno, metapg->nattributes); + + /* + * file has less number of attributes stored compared to catalog. This + * happens due to add column default value storing value in catalog and + * absent in table. This attribute must be marked with atthasmissing. + */ + if (attno >= metapg->nattributes) + { + if (readonly) + { + UnlockReleaseBuffer(metabuf); + return InvalidBlockNumber; + } + else + { + zsmeta_add_root_for_attributes(rel, page, false); + } + } + + rootblk = metapg->tree_root_dir[attno].root; + + if (!readonly && rootblk == InvalidBlockNumber) + { + /* try to allocate one */ + Buffer rootbuf; + Page rootpage; + ZSBtreePageOpaque *opaque; + + /* TODO: release lock on metapage while we do I/O */ + rootbuf = zspage_getnewbuf(rel, metabuf); + rootblk = BufferGetBlockNumber(rootbuf); + + metapg->tree_root_dir[attno].root = rootblk; + + /* initialize the page to look like a root leaf */ + rootpage = BufferGetPage(rootbuf); + PageInit(rootpage, BLCKSZ, sizeof(ZSBtreePageOpaque)); + opaque = ZSBtreePageGetOpaque(rootpage); + opaque->zs_attno = attno; + opaque->zs_next = InvalidBlockNumber; + opaque->zs_lokey = MinZSTid; + opaque->zs_hikey = MaxPlusOneZSTid; + opaque->zs_level = 0; + opaque->zs_flags = ZSBT_ROOT; + opaque->zs_page_id = ZS_BTREE_PAGE_ID; + + MarkBufferDirty(rootbuf); + MarkBufferDirty(metabuf); + /* TODO: WAL-log both pages */ + + UnlockReleaseBuffer(rootbuf); + } + + UnlockReleaseBuffer(metabuf); + + return rootblk; +} + +/* + * + * Caller is responsible for WAL-logging this. + */ +void +zsmeta_update_root_for_attribute(Relation rel, AttrNumber attno, + Buffer metabuf, BlockNumber rootblk) +{ + ZSMetaPage *metapg; + + metapg = (ZSMetaPage *) PageGetContents(BufferGetPage(metabuf)); + + if ((attno != ZS_META_ATTRIBUTE_NUM) && (attno <= 0 || attno > metapg->nattributes)) + elog(ERROR, "invalid attribute number %d (table \"%s\" has only %d attributes)", + attno, RelationGetRelationName(rel), metapg->nattributes); + + metapg->tree_root_dir[attno].root = rootblk; + + MarkBufferDirty(metabuf); +} diff --git a/src/backend/access/zedstore/zedstore_tidpage.c b/src/backend/access/zedstore/zedstore_tidpage.c new file mode 100644 index 0000000000..cffc5f2a75 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_tidpage.c @@ -0,0 +1,1774 @@ +/* + * zedstore_tidpage.c + * Routines for handling the TID tree. + * + * A Zedstore table consists of multiple B-trees, one for each attribute. The + * functions in this file deal with one B-tree at a time, it is the caller's + * responsibility to tie together the scans of each btree. 
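+ *
+ * (Roughly: a sequential scan first advances the TID tree with
+ * zsbt_tid_scan_next() to find the next visible row, and the caller then
+ * reads the values for that TID from the per-attribute trees.)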
+ * + * Operations: + * + * - Sequential scan in TID order + * - must be efficient with scanning multiple trees in sync + * + * - random lookups, by TID (for index scan) + * + * - range scans by TID (for bitmap index scan) + * + * NOTES: + * - Locking order: child before parent, left before right + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_tidpage.c + */ +#include "postgres.h" + +#include "access/zedstore_compression.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/rel.h" + +/* prototypes for local functions */ +static void zsbt_tid_recompress_replace(Relation rel, Buffer oldbuf, List *items); +static ZSSingleBtreeItem *zsbt_tid_fetch(Relation rel, + ZSUndoRecPtr *recent_oldest_undo, zstid tid, Buffer *buf_p); +static void zsbt_tid_replace_item(Relation rel, Buffer buf, + zstid oldtid, ZSBtreeItem *replacementitem, + List *newitems); +static ZSBtreeItem *zsbt_tid_create_item(zstid tid, ZSUndoRecPtr undo_ptr, int nelements); + +static TM_Result zsbt_tid_update_lock_old(Relation rel, zstid otid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot, + Snapshot crosscheck, bool wait, TM_FailureData *hufd, ZSUndoRecPtr *prevundoptr_p); +static void zsbt_tid_update_insert_new(Relation rel, zstid *newtid, + TransactionId xid, CommandId cid, ZSUndoRecPtr prevundoptr); +static void zsbt_tid_mark_old_updated(Relation rel, zstid otid, zstid newtid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot); + +/* ---------------------------------------------------------------- + * Public interface + * ---------------------------------------------------------------- + */ + +/* + * Begin a scan of the btree. + */ +void +zsbt_tid_begin_scan(Relation rel, zstid starttid, + zstid endtid, Snapshot snapshot, ZSBtreeScan *scan) +{ + Buffer buf; + + scan->rel = rel; + scan->attno = ZS_META_ATTRIBUTE_NUM; + scan->tupledesc = NULL; + + scan->snapshot = snapshot; + scan->context = CurrentMemoryContext; + scan->lastoff = InvalidOffsetNumber; + scan->has_decompressed = false; + scan->nexttid = starttid; + scan->endtid = endtid; + memset(&scan->recent_oldest_undo, 0, sizeof(scan->recent_oldest_undo)); + memset(&scan->array_undoptr, 0, sizeof(scan->array_undoptr)); + scan->array_datums = palloc(sizeof(Datum)); + scan->array_datums_allocated_size = 1; + scan->array_elements_left = 0; + + buf = zsbt_descend(rel, ZS_META_ATTRIBUTE_NUM, starttid, 0, true); + if (!BufferIsValid(buf)) + { + /* completely empty tree */ + scan->active = false; + scan->lastbuf = InvalidBuffer; + return; + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + scan->active = true; + scan->lastbuf = buf; + + zs_decompress_init(&scan->decompressor); + scan->recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); +} + +/* + * Reset the 'next' TID in a scan to the given TID. + */ +void +zsbt_tid_reset_scan(ZSBtreeScan *scan, zstid starttid) +{ + if (starttid < scan->nexttid) + { + /* have to restart from scratch. 
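+		 *
+		 * Throw away any array/decompression state and release the cached
+		 * buffer; the next zsbt_tid_scan_next() call will descend from the
+		 * root again.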
*/ + scan->array_elements_left = 0; + scan->nexttid = starttid; + scan->has_decompressed = false; + if (scan->lastbuf != InvalidBuffer) + ReleaseBuffer(scan->lastbuf); + scan->lastbuf = InvalidBuffer; + } + else + zsbt_scan_skip(scan, starttid); +} + +void +zsbt_tid_end_scan(ZSBtreeScan *scan) +{ + if (!scan->active) + return; + + if (scan->lastbuf != InvalidBuffer) + ReleaseBuffer(scan->lastbuf); + zs_decompress_free(&scan->decompressor); + + scan->active = false; + scan->array_elements_left = 0; +} + +/* + * Helper function of zsbt_scan_next(), to extract Datums from the given + * array item into the scan->array_* fields. + */ +static void +zsbt_tid_scan_extract_array(ZSBtreeScan *scan, ZSArrayBtreeItem *aitem) +{ + int nelements = aitem->t_nelements; + zstid tid = aitem->t_tid; + + /* skip over elements that we are not interested in */ + while (tid < scan->nexttid && nelements > 0) + { + tid++; + nelements--; + } + + /* leave out elements that are past end of range */ + if (tid + nelements > scan->endtid) + nelements = scan->endtid - tid; + + scan->array_undoptr = aitem->t_undo_ptr; + scan->array_elements_left = nelements; +} + +/* + * Advance scan to next item. + * + * Return true if there was another item. The Datum/isnull of the item is + * placed in scan->array_* fields. For a pass-by-ref datum, it's a palloc'd + * copy that's valid until the next call. + * + * This is normally not used directly. See zsbt_scan_next_tid() and + * zsbt_scan_next_fetch() wrappers, instead. + */ +zstid +zsbt_tid_scan_next(ZSBtreeScan *scan) +{ + Buffer buf; + bool buf_is_locked = false; + Page page; + ZSBtreePageOpaque *opaque; + OffsetNumber off; + OffsetNumber maxoff; + BlockNumber next; + bool visible; + + if (!scan->active) + return InvalidZSTid; + + /* + * Process items, until we find something that is visible to the snapshot. + * + * This advances scan->nexttid as it goes. + */ + while (scan->nexttid < scan->endtid) + { + /* + * If we are still processing an array item, return next element from it. + */ + if (scan->array_elements_left > 0) + goto have_array; + + /* + * If we are still processing a compressed item, process the next item + * from the it. If it's an array item, we start iterating the array by + * setting the scan->array_* fields, and loop back to top to return the + * first element from the array. 
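+		 *
+		 * The decompression buffer holds a private copy of the items, so we
+		 * can iterate over it without holding the buffer lock.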
+ */ + if (scan->has_decompressed) + { + zstid lasttid; + ZSBtreeItem *uitem; + TransactionId obsoleting_xid; + + uitem = zs_decompress_read_item(&scan->decompressor); + + if (uitem == NULL) + { + scan->has_decompressed = false; + continue; + } + + /* a compressed item cannot contain nested compressed items */ + Assert((uitem->t_flags & ZSBT_COMPRESSED) == 0); + + lasttid = zsbt_item_lasttid(uitem); + if (lasttid < scan->nexttid) + continue; + + if (uitem->t_tid >= scan->endtid) + break; + + visible = zs_SatisfiesVisibility(scan, uitem, &obsoleting_xid, NULL); + + if (scan->serializable && TransactionIdIsValid(obsoleting_xid)) + CheckForSerializableConflictOut(scan->rel, obsoleting_xid, scan->snapshot); + + if (!visible) + { + scan->nexttid = lasttid + 1; + continue; + } + if ((uitem->t_flags & ZSBT_ARRAY) != 0) + { + /* no need to make a copy, because the uncompressed buffer + * is already a copy */ + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) uitem; + + zsbt_tid_scan_extract_array(scan, aitem); + continue; + } + else + { + /* single item */ + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) uitem; + + scan->nexttid = sitem->t_tid; + scan->array_undoptr = sitem->t_undo_ptr; + scan->array_elements_left = 1; + + if (buf_is_locked) + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + goto have_array; + } + } + + /* + * Scan the page for the next item. + */ + buf = scan->lastbuf; + if (!buf_is_locked) + { + if (BufferIsValid(buf)) + { + LockBuffer(buf, BUFFER_LOCK_SHARE); + buf_is_locked = true; + + /* + * It's possible that the page was concurrently split or recycled by + * another backend (or ourselves). Have to re-check that the page is + * still valid. + */ + if (!zsbt_page_is_expected(scan->rel, scan->attno, scan->nexttid, 0, buf)) + { + /* + * It's not valid for the TID we're looking for, but maybe it was the + * right page for the previous TID. In that case, we don't need to + * restart from the root, we can follow the right-link instead. 
+ */ + if (zsbt_page_is_expected(scan->rel, scan->attno, scan->nexttid - 1, 0, buf)) + { + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + next = opaque->zs_next; + if (next != InvalidBlockNumber) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + buf = ReleaseAndReadBuffer(buf, scan->rel, next); + scan->lastbuf = buf; + continue; + } + } + + UnlockReleaseBuffer(buf); + buf_is_locked = false; + buf = scan->lastbuf = InvalidBuffer; + } + } + + if (!BufferIsValid(buf)) + { + buf = scan->lastbuf = zsbt_descend(scan->rel, scan->attno, scan->nexttid, 0, true); + buf_is_locked = true; + } + } + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + Assert(opaque->zs_page_id == ZS_BTREE_PAGE_ID); + + /* TODO: check the last offset first, as an optimization */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + ZSBtreeItem *item = (ZSBtreeItem *) PageGetItem(page, iid); + zstid lasttid; + + lasttid = zsbt_item_lasttid(item); + + if (scan->nexttid > lasttid) + continue; + + if (item->t_tid >= scan->endtid) + { + scan->nexttid = scan->endtid; + break; + } + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + MemoryContext oldcxt = MemoryContextSwitchTo(scan->context); + + zs_decompress_chunk(&scan->decompressor, citem); + MemoryContextSwitchTo(oldcxt); + scan->has_decompressed = true; + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + break; + } + else + { + TransactionId obsoleting_xid; + + visible = zs_SatisfiesVisibility(scan, item, &obsoleting_xid, NULL); + + if (!visible) + { + if (scan->serializable && TransactionIdIsValid(obsoleting_xid)) + CheckForSerializableConflictOut(scan->rel, obsoleting_xid, scan->snapshot); + scan->nexttid = lasttid + 1; + continue; + } + + if ((item->t_flags & ZSBT_ARRAY) != 0) + { + /* copy the item, because we can't hold a lock on the page */ + ZSArrayBtreeItem *aitem; + + aitem = MemoryContextAlloc(scan->context, item->t_size); + memcpy(aitem, item, item->t_size); + + zsbt_tid_scan_extract_array(scan, aitem); + + if (scan->array_elements_left > 0) + { + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + break; + } + } + else + { + /* single item */ + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) item; + + scan->nexttid = sitem->t_tid; + scan->array_undoptr = sitem->t_undo_ptr; + scan->array_elements_left = 1; + LockBuffer(scan->lastbuf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + goto have_array; + } + } + } + + if (scan->array_elements_left > 0 || scan->has_decompressed) + continue; + + /* No more items on this page. Walk right, if possible */ + next = opaque->zs_next; + if (next == BufferGetBlockNumber(buf)) + elog(ERROR, "btree page %u next-pointer points to itself", next); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf_is_locked = false; + + if (next == InvalidBlockNumber || scan->nexttid >= scan->endtid) + { + scan->active = false; + scan->array_elements_left = 0; + ReleaseBuffer(scan->lastbuf); + scan->lastbuf = InvalidBuffer; + break; + } + + scan->lastbuf = ReleaseAndReadBuffer(scan->lastbuf, scan->rel, next); + } + + return InvalidZSTid; + +have_array: + /* + * If we are still processing an array item, return next element from it. + */ + Assert(scan->array_elements_left > 0); + + scan->array_elements_left--; + return scan->nexttid++; +} + +/* + * Get the last tid (plus one) in the tree. 
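+ *
+ * We descend to the rightmost leaf and look at its last item; if that leaf
+ * is empty, its lokey is returned as the next unused TID.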
+ */ +zstid +zsbt_get_last_tid(Relation rel) +{ + zstid rightmostkey; + zstid tid; + Buffer buf; + Page page; + ZSBtreePageOpaque *opaque; + OffsetNumber maxoff; + + /* Find the rightmost leaf */ + rightmostkey = MaxZSTid; + buf = zsbt_descend(rel, ZS_META_ATTRIBUTE_NUM, rightmostkey, 0, true); + if (!BufferIsValid(buf)) + { + return MinZSTid; + } + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + + /* + * Look at the last item, for its tid. + */ + maxoff = PageGetMaxOffsetNumber(page); + if (maxoff >= FirstOffsetNumber) + { + ItemId iid = PageGetItemId(page, maxoff); + ZSBtreeItem *hitup = (ZSBtreeItem *) PageGetItem(page, iid); + + tid = zsbt_item_lasttid(hitup) + 1; + } + else + { + tid = opaque->zs_lokey; + } + UnlockReleaseBuffer(buf); + + return tid; +} + +/* + * Insert a multiple TIDs. + * + * Populates the TIDs of the new tuples. + * + * If 'tid' in list is valid, then that TID is used. It better not be in use already. If + * it's invalid, then a new TID is allocated, as we see best. (When inserting the + * first column of the row, pass invalid, and for other columns, pass the TID + * you got for the first column.) + */ +void +zsbt_tid_multi_insert(Relation rel, zstid *tids, int nitems, + TransactionId xid, CommandId cid, uint32 speculative_token, ZSUndoRecPtr prevundoptr) +{ + bool assign_tids; + zstid tid = tids[0]; + Buffer buf; + Page page; + ZSBtreePageOpaque *opaque; + OffsetNumber maxoff; + zstid insert_target_key; + ZSUndoRec_Insert undorec; + int i; + List *newitems; + ZSUndoRecPtr undorecptr; + + /* + * If TID was given, find the right place for it. Otherwise, insert to + * the rightmost leaf. + * + * TODO: use a Free Space Map to find suitable target. + */ + assign_tids = (tid == InvalidZSTid); + + if (!assign_tids) + insert_target_key = tid; + else + insert_target_key = MaxZSTid; + + buf = zsbt_descend(rel, ZS_META_ATTRIBUTE_NUM, insert_target_key, 0, false); + page = BufferGetPage(buf); + opaque = ZSBtreePageGetOpaque(page); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Look at the last item, for its tid. + * + * assign TIDS for each item, if needed. + */ + if (assign_tids) + { + zstid lasttid; + + if (maxoff >= FirstOffsetNumber) + { + ItemId iid = PageGetItemId(page, maxoff); + ZSBtreeItem *hitup = (ZSBtreeItem *) PageGetItem(page, iid); + + lasttid = zsbt_item_lasttid(hitup); + tid = lasttid + 1; + } + else + { + lasttid = opaque->zs_lokey; + tid = lasttid; + } + + for (i = 0; i < nitems; i++) + { + tids[i] = tid; + tid++; + } + } + + /* Form an undo record */ + if (xid != FrozenTransactionId) + { + undorec.rec.size = sizeof(ZSUndoRec_Insert); + undorec.rec.type = ZSUNDO_TYPE_INSERT; + undorec.rec.xid = xid; + undorec.rec.cid = cid; + undorec.rec.tid = tids[0]; + undorec.rec.speculative_token = speculative_token; + undorec.rec.prevundorec = prevundoptr; + undorec.endtid = tids[nitems - 1]; + + undorecptr = zsundo_insert(rel, &undorec.rec); + } + else + { + ZSUndoRecPtrInitialize(&undorecptr); + } + + /* Create items to insert. */ + newitems = NIL; + i = 0; + while (i < nitems) + { + int j; + ZSBtreeItem *newitem; + + /* + * Try to collapse as many items as possible into an Array item. + * The first item in the array is now at tids[i]/datums[i]/isnulls[i]. + * Items can be stored in the same array as long as the TIDs are + * consecutive, they all have the same isnull flag, and the array + * isn't too large to be stored on a single leaf page. Scan the + * arrays, checking those conditions. 
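+		 *
+		 * For example (TID values shown as plain numbers for illustration):
+		 * TIDs 1001, 1002 and 1003 collapse into a single array item with
+		 * t_tid = 1001 and t_nelements = 3, while a gap in the sequence
+		 * starts a new item.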
+ * + * FIXME: this math is bogus for TIDs + */ + for (j = i + 1; j < nitems; j++) + { + if (tids[j] != tids[j - 1] + 1) + break; + } + + /* + * 'i' is now the first entry to store in the array, and 'j' is the + * last + 1 elemnt to store. If j == i + 1, then there is only one + * element and zsbt_create_item() will create a 'single' item rather + * than an array. + */ + newitem = zsbt_tid_create_item(tids[i], undorecptr, j - i); + + newitems = lappend(newitems, newitem); + i = j; + } + + /* recompress and possibly split the page */ + zsbt_tid_replace_item(rel, buf, + InvalidZSTid, NULL, + newitems); + /* zsbt_replace_item unlocked 'buf' */ + ReleaseBuffer(buf); +} + +TM_Result +zsbt_tid_delete(Relation rel, zstid tid, + TransactionId xid, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, bool wait, + TM_FailureData *hufd, bool changingPart) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + ZSSingleBtreeItem *item; + TM_Result result; + bool keep_old_undo_ptr = true; + ZSUndoRecPtr undorecptr; + ZSSingleBtreeItem *deleteditem; + Buffer buf; + zstid next_tid; + + /* Find the item to delete. (It could be compressed) */ + item = zsbt_tid_fetch(rel, &recent_oldest_undo, tid, &buf); + if (item == NULL) + { + /* + * or should this be TM_Invisible? The heapam at least just throws + * an error, I think.. + */ + elog(ERROR, "could not find tuple to delete with TID (%u, %u) in TID tree", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + } + + if (snapshot) + { + result = zs_SatisfiesUpdate(rel, snapshot, recent_oldest_undo, + (ZSBtreeItem *) item, LockTupleExclusive, + &keep_old_undo_ptr, hufd, &next_tid); + if (result != TM_Ok) + { + UnlockReleaseBuffer(buf); + /* FIXME: We should fill TM_FailureData *hufd correctly */ + return result; + } + + if (crosscheck != InvalidSnapshot && result == TM_Ok) + { + /* Perform additional check for transaction-snapshot mode RI updates */ + /* FIXME: dummmy scan */ + ZSBtreeScan scan; + TransactionId obsoleting_xid; + + memset(&scan, 0, sizeof(scan)); + scan.rel = rel; + scan.snapshot = crosscheck; + scan.recent_oldest_undo = recent_oldest_undo; + + if (!zs_SatisfiesVisibility(&scan, (ZSBtreeItem *) item, &obsoleting_xid, NULL)) + { + UnlockReleaseBuffer(buf); + /* FIXME: We should fill TM_FailureData *hufd correctly */ + result = TM_Updated; + } + } + } + + /* Create UNDO record. */ + { + ZSUndoRec_Delete undorec; + + undorec.rec.size = sizeof(ZSUndoRec_Delete); + undorec.rec.type = ZSUNDO_TYPE_DELETE; + undorec.rec.xid = xid; + undorec.rec.cid = cid; + undorec.rec.tid = tid; + undorec.changedPart = changingPart; + + if (keep_old_undo_ptr) + undorec.rec.prevundorec = item->t_undo_ptr; + else + ZSUndoRecPtrInitialize(&undorec.rec.prevundorec); + + undorecptr = zsundo_insert(rel, &undorec.rec); + } + + /* Replace the ZSBreeItem with one with the new UNDO pointer. 
*/ + deleteditem = palloc(item->t_size); + memcpy(deleteditem, item, item->t_size); + deleteditem->t_undo_ptr = undorecptr; + + zsbt_tid_replace_item(rel, buf, + item->t_tid, (ZSBtreeItem *) deleteditem, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item unlocked */ + + pfree(deleteditem); + + return TM_Ok; +} + +void +zsbt_find_latest_tid(Relation rel, zstid *tid, Snapshot snapshot) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + ZSSingleBtreeItem *item; + Buffer buf; + /* Just using meta attribute, we can follow the update chain */ + zstid curr_tid = *tid; + + for(;;) + { + zstid next_tid = InvalidZSTid; + if (curr_tid == InvalidZSTid) + break; + + /* Find the item */ + item = zsbt_tid_fetch(rel, &recent_oldest_undo, curr_tid, &buf); + if (item == NULL) + break; + + if (snapshot) + { + /* FIXME: dummmy scan */ + ZSBtreeScan scan; + TransactionId obsoleting_xid; + + memset(&scan, 0, sizeof(scan)); + scan.rel = rel; + scan.snapshot = snapshot; + scan.recent_oldest_undo = recent_oldest_undo; + + if (zs_SatisfiesVisibility(&scan, (ZSBtreeItem *) item, + &obsoleting_xid, &next_tid)) + { + *tid = curr_tid; + } + + curr_tid = next_tid; + UnlockReleaseBuffer(buf); + } + } +} + +/* + * A new TID is allocated, as we see best and returned to the caller. This + * function is only called for META attribute btree. Data columns will use the + * returned tid to insert new items. + */ +TM_Result +zsbt_tid_update(Relation rel, zstid otid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot, + Snapshot crosscheck, bool wait, TM_FailureData *hufd, + zstid *newtid_p) +{ + TM_Result result; + ZSUndoRecPtr prevundoptr; + + /* + * This is currently only used on the meta-attribute. The other attributes + * don't need to carry visibility information, so the caller just inserts + * the new values with (multi_)insert() instead. This will change once we + * start doing the equivalent of HOT updates, where the TID doesn't change. + */ + Assert(*newtid_p == InvalidZSTid); + + /* + * Find and lock the old item. + * + * TODO: If there's free TID space left on the same page, we should keep the + * buffer locked, and use the same page for the new tuple. + */ + result = zsbt_tid_update_lock_old(rel, otid, + xid, cid, key_update, snapshot, + crosscheck, wait, hufd, &prevundoptr); + + if (result != TM_Ok) + return result; + + /* insert new version */ + zsbt_tid_update_insert_new(rel, newtid_p, xid, cid, prevundoptr); + + /* update the old item with the "t_ctid pointer" for the new item */ + zsbt_tid_mark_old_updated(rel, otid, *newtid_p, xid, cid, key_update, snapshot); + + return TM_Ok; +} + +/* + * Subroutine of zsbt_update(): locks the old item for update. + */ +static TM_Result +zsbt_tid_update_lock_old(Relation rel, zstid otid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot, + Snapshot crosscheck, bool wait, TM_FailureData *hufd, ZSUndoRecPtr *prevundoptr_p) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + Buffer buf; + ZSSingleBtreeItem *olditem; + TM_Result result; + bool keep_old_undo_ptr = true; + zstid next_tid; + + /* + * Find the item to delete. + */ + olditem = zsbt_tid_fetch(rel, &recent_oldest_undo, otid, &buf); + if (olditem == NULL) + { + /* + * or should this be TM_Invisible? The heapam at least just throws + * an error, I think.. 
+ */ + elog(ERROR, "could not find old tuple to update with TID (%u, %u) in TID tree", + ZSTidGetBlockNumber(otid), ZSTidGetOffsetNumber(otid)); + } + *prevundoptr_p = olditem->t_undo_ptr; + + /* + * Is it visible to us? + */ + result = zs_SatisfiesUpdate(rel, snapshot, recent_oldest_undo, + (ZSBtreeItem *) olditem, + key_update ? LockTupleExclusive : LockTupleNoKeyExclusive, + &keep_old_undo_ptr, hufd, &next_tid); + if (result != TM_Ok) + { + UnlockReleaseBuffer(buf); + /* FIXME: We should fill TM_FailureData *hufd correctly */ + return result; + } + + if (crosscheck != InvalidSnapshot && result == TM_Ok) + { + /* Perform additional check for transaction-snapshot mode RI updates */ + /* FIXME: dummmy scan */ + ZSBtreeScan scan; + TransactionId obsoleting_xid; + + memset(&scan, 0, sizeof(scan)); + scan.rel = rel; + scan.snapshot = crosscheck; + scan.recent_oldest_undo = recent_oldest_undo; + + if (!zs_SatisfiesVisibility(&scan, (ZSBtreeItem *) olditem, &obsoleting_xid, NULL)) + { + UnlockReleaseBuffer(buf); + /* FIXME: We should fill TM_FailureData *hufd correctly */ + result = TM_Updated; + } + } + + /* + * TODO: tuple-locking not implemented. Pray that there is no competing + * concurrent update! + */ + + UnlockReleaseBuffer(buf); + + return TM_Ok; +} + +/* + * Subroutine of zsbt_update(): inserts the new, updated, item. + */ +static void +zsbt_tid_update_insert_new(Relation rel, + zstid *newtid, + TransactionId xid, CommandId cid, ZSUndoRecPtr prevundoptr) +{ + zsbt_tid_multi_insert(rel, newtid, 1, xid, cid, INVALID_SPECULATIVE_TOKEN, prevundoptr); +} + +/* + * Subroutine of zsbt_update(): mark old item as updated. + */ +static void +zsbt_tid_mark_old_updated(Relation rel, zstid otid, zstid newtid, + TransactionId xid, CommandId cid, bool key_update, Snapshot snapshot) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + Buffer buf; + ZSSingleBtreeItem *olditem; + TM_Result result; + bool keep_old_undo_ptr = true; + TM_FailureData tmfd; + ZSUndoRecPtr undorecptr; + ZSSingleBtreeItem *deleteditem; + zstid next_tid; + + /* + * Find the item to delete. It could be part of a compressed item, + * we let zsbt_fetch() handle that. + */ + olditem = zsbt_tid_fetch(rel, &recent_oldest_undo, otid, &buf); + if (olditem == NULL) + { + /* + * or should this be TM_Invisible? The heapam at least just throws + * an error, I think.. + */ + elog(ERROR, "could not find old tuple to update with TID (%u, %u) in TID tree", + ZSTidGetBlockNumber(otid), ZSTidGetOffsetNumber(otid)); + } + + /* + * Is it visible to us? + */ + result = zs_SatisfiesUpdate(rel, snapshot, recent_oldest_undo, + (ZSBtreeItem *) olditem, + key_update ? LockTupleExclusive : LockTupleNoKeyExclusive, + &keep_old_undo_ptr, &tmfd, &next_tid); + if (result != TM_Ok) + { + UnlockReleaseBuffer(buf); + elog(ERROR, "tuple concurrently updated - not implemented"); + } + + /* Create UNDO record. */ + { + ZSUndoRec_Update undorec; + + undorec.rec.size = sizeof(ZSUndoRec_Update); + undorec.rec.type = ZSUNDO_TYPE_UPDATE; + undorec.rec.xid = xid; + undorec.rec.cid = cid; + undorec.rec.tid = otid; + if (keep_old_undo_ptr) + undorec.rec.prevundorec = olditem->t_undo_ptr; + else + ZSUndoRecPtrInitialize(&undorec.rec.prevundorec); + undorec.newtid = newtid; + undorec.key_update = key_update; + + undorecptr = zsundo_insert(rel, &undorec.rec); + } + + /* Replace the ZSBreeItem with one with the updated undo pointer. 
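+	 * As in zsbt_tid_delete(), the old item is copied verbatim and only its
+	 * t_undo_ptr is updated, so that readers following the undo chain will
+	 * find the UPDATE record and, through it, the TID of the new tuple
+	 * version.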
*/ + deleteditem = palloc(olditem->t_size); + memcpy(deleteditem, olditem, olditem->t_size); + deleteditem->t_undo_ptr = undorecptr; + + zsbt_tid_replace_item(rel, buf, + otid, (ZSBtreeItem *) deleteditem, + NIL); + ReleaseBuffer(buf); /* zsbt_recompress_replace released */ + + pfree(deleteditem); +} + +TM_Result +zsbt_tid_lock(Relation rel, zstid tid, + TransactionId xid, CommandId cid, + LockTupleMode mode, Snapshot snapshot, + TM_FailureData *hufd, zstid *next_tid) +{ + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + Buffer buf; + ZSSingleBtreeItem *item; + TM_Result result; + bool keep_old_undo_ptr = true; + ZSUndoRecPtr undorecptr; + ZSSingleBtreeItem *newitem; + + *next_tid = tid; + + /* Find the item to delete. (It could be compressed) */ + item = zsbt_tid_fetch(rel, &recent_oldest_undo, tid, &buf); + if (item == NULL) + { + /* + * or should this be TM_Invisible? The heapam at least just throws + * an error, I think.. + */ + elog(ERROR, "could not find tuple to lock with TID (%u, %u)", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + } + result = zs_SatisfiesUpdate(rel, snapshot, recent_oldest_undo, + (ZSBtreeItem *) item, mode, + &keep_old_undo_ptr, hufd, next_tid); + if (result != TM_Ok) + { + UnlockReleaseBuffer(buf); + return result; + } + + /* Create UNDO record. */ + { + ZSUndoRec_TupleLock undorec; + + undorec.rec.size = sizeof(ZSUndoRec_TupleLock); + undorec.rec.type = ZSUNDO_TYPE_TUPLE_LOCK; + undorec.rec.xid = xid; + undorec.rec.cid = cid; + undorec.rec.tid = tid; + undorec.lockmode = mode; + if (keep_old_undo_ptr) + undorec.rec.prevundorec = item->t_undo_ptr; + else + ZSUndoRecPtrInitialize(&undorec.rec.prevundorec); + + undorecptr = zsundo_insert(rel, &undorec.rec); + } + + /* Replace the item with an identical one, but with updated undo pointer. */ + newitem = palloc(item->t_size); + memcpy(newitem, item, item->t_size); + newitem->t_undo_ptr = undorecptr; + + zsbt_tid_replace_item(rel, buf, + item->t_tid, (ZSBtreeItem *) newitem, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item unlocked */ + + pfree(newitem); + + return TM_Ok; +} + +/* + * Mark item with given TID as dead. + * + * This is used during VACUUM. + */ +void +zsbt_tid_mark_dead(Relation rel, zstid tid, ZSUndoRecPtr undoptr) +{ + Buffer buf; + ZSSingleBtreeItem *item; + ZSSingleBtreeItem deaditem; + + /* Find the item to delete. (It could be compressed) */ + item = zsbt_tid_fetch(rel, NULL, tid, &buf); + if (item == NULL) + { + elog(WARNING, "could not find tuple to mark dead with TID (%u, %u)", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + return; + } + + /* Replace the ZSBreeItem with a DEAD item. (Unless it's already dead) */ + if ((item->t_flags & ZSBT_DEAD) != 0) + { + UnlockReleaseBuffer(buf); + return; + } + + memset(&deaditem, 0, offsetof(ZSSingleBtreeItem, t_payload)); + deaditem.t_tid = tid; + deaditem.t_size = sizeof(ZSSingleBtreeItem); + deaditem.t_flags = ZSBT_DEAD; + deaditem.t_undo_ptr = undoptr; + + zsbt_tid_replace_item(rel, buf, + tid, (ZSBtreeItem *) &deaditem, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item released */ +} + +/* + * Clear an item's UNDO pointer. + * + * This is used during VACUUM, to clear out aborted deletions. + */ +void +zsbt_tid_undo_deletion(Relation rel, zstid tid, ZSUndoRecPtr undoptr) +{ + Buffer buf; + ZSSingleBtreeItem *item; + ZSSingleBtreeItem *copy; + + /* Find the item to delete. 
(It could be compressed) */ + item = zsbt_tid_fetch(rel, NULL, tid, &buf); + if (item == NULL) + { + elog(WARNING, "could not find aborted tuple to remove with TID (%u, %u)", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + return; + } + + if (ZSUndoRecPtrEquals(item->t_undo_ptr, undoptr)) + { + copy = palloc(item->t_size); + memcpy(copy, item, item->t_size); + ZSUndoRecPtrInitialize(©->t_undo_ptr); + zsbt_tid_replace_item(rel, buf, + tid, (ZSBtreeItem *) copy, + NIL); + ReleaseBuffer(buf); /* zsbt_replace_item unlocked */ + } + else + { + Assert(item->t_undo_ptr.counter > undoptr.counter || + !IsZSUndoRecPtrValid(&item->t_undo_ptr)); + UnlockReleaseBuffer(buf); + } +} + +/* ---------------------------------------------------------------- + * Internal routines + * ---------------------------------------------------------------- + */ + +void +zsbt_tid_clear_speculative_token(Relation rel, zstid tid, uint32 spectoken, bool forcomplete) +{ + Buffer buf; + ZSSingleBtreeItem *item = NULL; + ZSUndoRecPtr recent_oldest_undo; + + item = zsbt_tid_fetch(rel, &recent_oldest_undo, tid, &buf); + + if (item == NULL) + elog(ERROR, "couldn't find item for meta column for inserted tuple with TID (%u, %u) in rel %s", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid), rel->rd_rel->relname.data); + Assert(item->t_tid == tid); + + zsundo_clear_speculative_token(rel, item->t_undo_ptr); + + UnlockReleaseBuffer(buf); +} + +/* + * Fetch the item with given TID. The page containing the item is kept locked, and + * returned to the caller in *buf_p. This is used to locate a tuple for updating + * or deleting it. + */ +static ZSSingleBtreeItem * +zsbt_tid_fetch(Relation rel, ZSUndoRecPtr *recent_oldest_undo, + zstid tid, Buffer *buf_p) +{ + Buffer buf; + Page page; + ZSBtreeItem *item = NULL; + bool found = false; + OffsetNumber maxoff; + OffsetNumber off; + + buf = zsbt_descend(rel, ZS_META_ATTRIBUTE_NUM, tid, 0, false); + if (buf == InvalidBuffer) + { + *buf_p = InvalidBuffer; + return NULL; + } + page = BufferGetPage(buf); + + /* Find the item on the page that covers the target TID */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + item = (ZSBtreeItem *) PageGetItem(page, iid); + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + ZSDecompressContext decompressor; + + zs_decompress_init(&decompressor); + zs_decompress_chunk(&decompressor, citem); + + while ((item = zs_decompress_read_item(&decompressor)) != NULL) + { + zstid lasttid = zsbt_item_lasttid(item); + + if (item->t_tid <= tid && lasttid >= tid) + { + found = true; + break; + } + } + if (found) + { + /* FIXME: decompressor is leaked. 
Can't free it yet, because we still + * need to access the item below + */ + break; + } + zs_decompress_free(&decompressor); + } + else + { + zstid lasttid = zsbt_item_lasttid(item); + + if (item->t_tid <= tid && lasttid >= tid) + { + found = true; + break; + } + } + } + + if (found) + { + ZSSingleBtreeItem *result; + + if ((item->t_flags & ZSBT_ARRAY) != 0) + { + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + int resultsize; + + Assert((tid - aitem->t_tid) < aitem->t_nelements); + + resultsize = offsetof(ZSSingleBtreeItem, t_payload); + result = palloc(resultsize); + memset(result, 0, offsetof(ZSSingleBtreeItem, t_payload)); /* zero padding */ + result->t_tid = tid; + result->t_flags = item->t_flags & ~ZSBT_ARRAY; + result->t_size = resultsize; + result->t_undo_ptr = aitem->t_undo_ptr; + } + else + { + /* single item */ + result = (ZSSingleBtreeItem *) item; + } + + *buf_p = buf; + return result; + } + else + { + UnlockReleaseBuffer(buf); + *buf_p = InvalidBuffer; + return NULL; + } +} + +/* + * Form a ZSBtreeItem out of the given datums, or data that's already in on-disk + * array format, for insertion. + * + * If there's more than one element, an array item is created. Otherwise, a single + * item. + */ +static ZSBtreeItem * +zsbt_tid_create_item(zstid tid, ZSUndoRecPtr undo_ptr, + int nelements) +{ + ZSBtreeItem *result; + Size itemsz; + + Assert(nelements > 0); + + if (nelements > 1) + { + ZSArrayBtreeItem *newitem; + + itemsz = offsetof(ZSArrayBtreeItem, t_payload); + + newitem = palloc(itemsz); + memset(newitem, 0, offsetof(ZSArrayBtreeItem, t_payload)); /* zero padding */ + newitem->t_tid = tid; + newitem->t_size = itemsz; + newitem->t_flags = ZSBT_ARRAY; + newitem->t_nelements = nelements; + newitem->t_undo_ptr = undo_ptr; + + result = (ZSBtreeItem *) newitem; + } + else + { + ZSSingleBtreeItem *newitem; + + itemsz = offsetof(ZSSingleBtreeItem, t_payload); + + newitem = palloc(itemsz); + memset(newitem, 0, offsetof(ZSSingleBtreeItem, t_payload)); /* zero padding */ + newitem->t_tid = tid; + newitem->t_flags = 0; + newitem->t_size = itemsz; + newitem->t_undo_ptr = undo_ptr; + + result = (ZSBtreeItem *) newitem; + } + + return result; +} + +/* + * This helper function is used to implement INSERT, UPDATE and DELETE. + * + * If 'olditem' is not NULL, then 'olditem' on the page is replaced with + * 'replacementitem'. 'replacementitem' can be NULL, to remove an old item. + * + * If 'newitems' is not empty, the items in the list are added to the page, + * to the correct position. FIXME: Actually, they're always just added to + * the end of the page, and that better be the correct position. + * + * This function handles decompressing and recompressing items, and splitting + * the page if needed. + */ +static void +zsbt_tid_replace_item(Relation rel, Buffer buf, + zstid oldtid, + ZSBtreeItem *replacementitem, + List *newitems) +{ + Page page = BufferGetPage(buf); + OffsetNumber off; + OffsetNumber maxoff; + List *items; + bool found_old_item = false; + /* We might need to decompress up to two previously compressed items */ + ZSDecompressContext decompressor; + bool decompressor_used = false; + bool decompressing; + + if (replacementitem) + Assert(replacementitem->t_tid == oldtid); + + /* + * TODO: It would be good to have a fast path, for the common case that we're + * just adding items to the end. 
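+ * As it is, we always rebuild the whole item list: every existing item is
+ * walked (decompressing any compressed item that covers 'oldtid'), the
+ * replacement is spliced in, 'newitems' are appended at the end, and the
+ * result is handed to zsbt_tid_recompress_replace() below.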
+ */ + + /* Loop through all old items on the page */ + items = NIL; + maxoff = PageGetMaxOffsetNumber(page); + decompressing = false; + off = 1; + for (;;) + { + ZSBtreeItem *item; + + /* + * Get the next item to process. If we're decompressing, get the next + * tuple from the decompressor, otherwise get the next item from the page. + */ + if (decompressing) + { + item = zs_decompress_read_item(&decompressor); + if (!item) + { + decompressing = false; + continue; + } + } + else if (off <= maxoff) + { + ItemId iid = PageGetItemId(page, off); + + item = (ZSBtreeItem *) PageGetItem(page, iid); + off++; + + } + else + { + /* out of items */ + break; + } + + /* we now have an item to process, either straight from the page or from + * the decompressor */ + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + zstid item_lasttid = zsbt_item_lasttid(item); + + /* there shouldn't nested compressed items */ + if (decompressing) + elog(ERROR, "nested compressed items on zedstore page not supported"); + + if (oldtid != InvalidZSTid && item->t_tid <= oldtid && oldtid <= item_lasttid) + { + ZSCompressedBtreeItem *citem = (ZSCompressedBtreeItem *) item; + + /* Found it, this compressed item covers the target or the new TID. */ + /* We have to decompress it, and recompress */ + Assert(!decompressor_used); + + zs_decompress_init(&decompressor); + zs_decompress_chunk(&decompressor, citem); + decompressor_used = true; + decompressing = true; + continue; + } + else + { + /* keep this compressed item as it is */ + items = lappend(items, item); + } + } + else if ((item->t_flags & ZSBT_ARRAY) != 0) + { + /* array item */ + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + zstid item_lasttid = zsbt_item_lasttid(item); + + if (oldtid != InvalidZSTid && item->t_tid <= oldtid && oldtid <= item_lasttid) + { + /* + * The target TID is currently part of an array item. We have to split + * the array item into two, and put the replacement item in the middle. + */ + int cutoff; + int nelements = aitem->t_nelements; + + cutoff = oldtid - item->t_tid; + + /* Array slice before the target TID */ + if (cutoff > 0) + { + ZSBtreeItem *item1; + + item1 = zsbt_tid_create_item(aitem->t_tid, aitem->t_undo_ptr, + cutoff); + items = lappend(items, item1); + } + + /* + * Skip over the target element, and store the replacement + * item, if any, in its place + */ + if (replacementitem) + items = lappend(items, replacementitem); + + /* Array slice after the target */ + if (cutoff + 1 < nelements) + { + ZSBtreeItem *item2; + + item2 = zsbt_tid_create_item(oldtid + 1, aitem->t_undo_ptr, + nelements - (cutoff + 1)); + items = lappend(items, item2); + } + + found_old_item = true; + } + else + items = lappend(items, item); + } + else + { + /* single item */ + if (oldtid != InvalidZSTid && item->t_tid == oldtid) + { + Assert(!found_old_item); + found_old_item = true; + if (replacementitem) + items = lappend(items, replacementitem); + } + else + items = lappend(items, item); + } + } + + if (oldtid != InvalidZSTid && !found_old_item) + elog(ERROR, "could not find old item to replace"); + + /* Add any new items to the end */ + if (newitems) + items = list_concat(items, newitems); + + /* Now pass the list to the recompressor. */ + IncrBufferRefCount(buf); + if (items) + { + zsbt_tid_recompress_replace(rel, buf, items); + } + else + { + zs_split_stack *stack; + + stack = zsbt_unlink_page(rel, ZS_META_ATTRIBUTE_NUM, buf, 0); + + if (!stack) + { + /* failed. 
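+			 * Fall back to leaving an empty leaf page in place: take a blank
+			 * temp copy (preserving the special area) and overwrite the old
+			 * page with it via the split-stack machinery below.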
*/ + Page newpage = PageGetTempPageCopySpecial(BufferGetPage(buf)); + + stack = zs_new_split_stack_entry(buf, newpage); + } + + /* apply the changes */ + zs_apply_split_changes(rel, stack); + } + + /* + * We can now free the decompression contexts. The pointers in the 'items' list + * point to decompression buffers, so we cannot free them until after writing out + * the pages. + */ + if (decompressor_used) + zs_decompress_free(&decompressor); + list_free(items); +} + +/* + * Recompressor routines + */ +typedef struct +{ + Page currpage; + ZSCompressContext compressor; + int compressed_items; + + /* first page writes over the old buffer, subsequent pages get newly-allocated buffers */ + zs_split_stack *stack_head; + zs_split_stack *stack_tail; + + int total_items; + int total_compressed_items; + int total_already_compressed_items; + + zstid hikey; +} zsbt_tid_recompress_context; + +static void +zsbt_recompress_newpage(zsbt_tid_recompress_context *cxt, zstid nexttid, int flags) +{ + Page newpage; + ZSBtreePageOpaque *newopaque; + zs_split_stack *stack; + + if (cxt->currpage) + { + /* set the last tid on previous page */ + ZSBtreePageOpaque *oldopaque = ZSBtreePageGetOpaque(cxt->currpage); + + oldopaque->zs_hikey = nexttid; + } + + newpage = (Page) palloc(BLCKSZ); + PageInit(newpage, BLCKSZ, sizeof(ZSBtreePageOpaque)); + + stack = zs_new_split_stack_entry(InvalidBuffer, /* will be assigned later */ + newpage); + if (cxt->stack_tail) + cxt->stack_tail->next = stack; + else + cxt->stack_head = stack; + cxt->stack_tail = stack; + + cxt->currpage = newpage; + + newopaque = ZSBtreePageGetOpaque(newpage); + newopaque->zs_attno = ZS_META_ATTRIBUTE_NUM; + newopaque->zs_next = InvalidBlockNumber; /* filled in later */ + newopaque->zs_lokey = nexttid; + newopaque->zs_hikey = cxt->hikey; /* overwritten later, if this is not last page */ + newopaque->zs_level = 0; + newopaque->zs_flags = flags; + newopaque->zs_page_id = ZS_BTREE_PAGE_ID; +} + +static void +zsbt_recompress_add_to_page(zsbt_tid_recompress_context *cxt, ZSBtreeItem *item) +{ + if (PageGetFreeSpace(cxt->currpage) < MAXALIGN(item->t_size)) + zsbt_recompress_newpage(cxt, item->t_tid, 0); + + if (PageAddItemExtended(cxt->currpage, + (Item) item, item->t_size, + PageGetMaxOffsetNumber(cxt->currpage) + 1, + PAI_OVERWRITE) == InvalidOffsetNumber) + elog(ERROR, "could not add item to page while recompressing"); + + cxt->total_items++; +} + +static bool +zsbt_recompress_add_to_compressor(zsbt_tid_recompress_context *cxt, ZSBtreeItem *item) +{ + bool result; + + if (cxt->compressed_items == 0) + zs_compress_begin(&cxt->compressor, PageGetFreeSpace(cxt->currpage)); + + result = zs_compress_add(&cxt->compressor, item); + if (result) + { + cxt->compressed_items++; + + cxt->total_compressed_items++; + } + + return result; +} + +static void +zsbt_recompress_flush(zsbt_tid_recompress_context *cxt) +{ + ZSCompressedBtreeItem *citem; + + if (cxt->compressed_items == 0) + return; + + citem = zs_compress_finish(&cxt->compressor); + + if (citem) + zsbt_recompress_add_to_page(cxt, (ZSBtreeItem *) citem); + else + { + uint16 size = 0; + /* + * compression failed hence add items uncompressed. We should maybe + * note that these items/pattern are not compressible and skip future + * attempts to compress but its possible this clubbed with some other + * future items may compress. So, better avoid recording such info and + * try compression again later if required. 
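+		 * The items are still available, back to back, in the compressor's
+		 * uncompressed buffer, so the loop below walks that buffer and adds
+		 * each item to the page as it is.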
+ */ + for (int i = 0; i < cxt->compressor.nitems; i++) + { + citem = (ZSCompressedBtreeItem *) (cxt->compressor.uncompressedbuffer + size); + zsbt_recompress_add_to_page(cxt, (ZSBtreeItem *) citem); + + size += MAXALIGN(citem->t_size); + } + } + + cxt->compressed_items = 0; +} + +/* + * Rewrite a leaf page, with given 'items' as the new content. + * + * If there are any uncompressed items in the list, we try to compress them. + * Any already-compressed items are added as is. + * + * If the items no longer fit on the page, then the page is split. It is + * entirely possible that they don't fit even on two pages; we split the page + * into as many pages as needed. Hopefully not more than a few pages, though, + * because otherwise you might hit limits on the number of buffer pins (with + * tiny shared_buffers). + * + * On entry, 'oldbuf' must be pinned and exclusive-locked. On exit, the lock + * is released, but it's still pinned. + * + * TODO: Try to combine single items, and existing array-items, into new array + * items. + */ +static void +zsbt_tid_recompress_replace(Relation rel, Buffer oldbuf, List *items) +{ + ListCell *lc; + zsbt_tid_recompress_context cxt; + ZSBtreePageOpaque *oldopaque = ZSBtreePageGetOpaque(BufferGetPage(oldbuf)); + ZSUndoRecPtr recent_oldest_undo = { 0 }; + BlockNumber orignextblk; + zs_split_stack *stack; + List *downlinks = NIL; + + orignextblk = oldopaque->zs_next; + + cxt.currpage = NULL; + zs_compress_init(&cxt.compressor); + cxt.compressed_items = 0; + cxt.stack_head = cxt.stack_tail = NULL; + cxt.hikey = oldopaque->zs_hikey; + + cxt.total_items = 0; + cxt.total_compressed_items = 0; + cxt.total_already_compressed_items = 0; + + zsbt_recompress_newpage(&cxt, oldopaque->zs_lokey, (oldopaque->zs_flags & ZSBT_ROOT)); + + foreach(lc, items) + { + ZSBtreeItem *item = (ZSBtreeItem *) lfirst(lc); + + /* We can leave out any old-enough DEAD items */ + if ((item->t_flags & ZSBT_DEAD) != 0) + { + ZSBtreeItem *uitem = (ZSBtreeItem *) item; + + if (recent_oldest_undo.counter == 0) + recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + + if (zsbt_item_undoptr(uitem).counter <= recent_oldest_undo.counter) + continue; + } + + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + { + /* already compressed, add as it is. */ + zsbt_recompress_flush(&cxt); + cxt.total_already_compressed_items++; + zsbt_recompress_add_to_page(&cxt, item); + } + else + { + /* try to add this item to the compressor */ + if (!zsbt_recompress_add_to_compressor(&cxt, item)) + { + if (cxt.compressed_items > 0) + { + /* flush, and retry */ + zsbt_recompress_flush(&cxt); + + if (!zsbt_recompress_add_to_compressor(&cxt, item)) + { + /* could not compress, even on its own. Store it uncompressed, then */ + zsbt_recompress_add_to_page(&cxt, item); + } + } + else + { + /* could not compress, even on its own. Store it uncompressed, then */ + zsbt_recompress_add_to_page(&cxt, item); + } + } + } + } + + /* flush the last one, if any */ + zsbt_recompress_flush(&cxt); + + zs_compress_free(&cxt.compressor); + + /* + * Ok, we now have a list of pages, to replace the original page, as private + * in-memory copies. Allocate buffers for them, and write them out. 
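+	 * The first page in the chain is written over 'oldbuf'; each subsequent
+	 * page gets a newly allocated buffer, the pages are linked together
+	 * through their zs_next pointers, and downlinks are collected for
+	 * insertion into the parent level.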
+ * + * allocate all the pages before entering critical section, so that + * out-of-disk-space doesn't lead to PANIC + */ + stack = cxt.stack_head; + Assert(stack->buf == InvalidBuffer); + stack->buf = oldbuf; + while (stack->next) + { + Page thispage = stack->page; + ZSBtreePageOpaque *thisopaque = ZSBtreePageGetOpaque(thispage); + ZSBtreeInternalPageItem *downlink; + Buffer nextbuf; + + Assert(stack->next->buf == InvalidBuffer); + + nextbuf = zspage_getnewbuf(rel, InvalidBuffer); + stack->next->buf = nextbuf; + + thisopaque->zs_next = BufferGetBlockNumber(nextbuf); + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = thisopaque->zs_hikey; + downlink->childblk = BufferGetBlockNumber(nextbuf); + downlinks = lappend(downlinks, downlink); + + stack = stack->next; + } + /* last one in the chain */ + ZSBtreePageGetOpaque(stack->page)->zs_next = orignextblk; + + /* If we had to split, insert downlinks for the new pages. */ + if (cxt.stack_head->next) + { + oldopaque = ZSBtreePageGetOpaque(cxt.stack_head->page); + + if ((oldopaque->zs_flags & ZSBT_ROOT) != 0) + { + ZSBtreeInternalPageItem *downlink; + + downlink = palloc(sizeof(ZSBtreeInternalPageItem)); + downlink->tid = MinZSTid; + downlink->childblk = BufferGetBlockNumber(cxt.stack_head->buf); + downlinks = lcons(downlink, downlinks); + + cxt.stack_tail->next = zsbt_newroot(rel, ZS_META_ATTRIBUTE_NUM, + oldopaque->zs_level + 1, downlinks); + + /* clear the ZSBT_ROOT flag on the old root page */ + oldopaque->zs_flags &= ~ZSBT_ROOT; + } + else + { + cxt.stack_tail->next = zsbt_insert_downlinks(rel, ZS_META_ATTRIBUTE_NUM, + oldopaque->zs_lokey, BufferGetBlockNumber(oldbuf), oldopaque->zs_level + 1, + downlinks); + } + /* note: stack_tail is not the real tail anymore */ + } + + /* Finally, overwrite all the pages we had to modify */ + zs_apply_split_changes(rel, cxt.stack_head); +} diff --git a/src/backend/access/zedstore/zedstore_toast.c b/src/backend/access/zedstore/zedstore_toast.c new file mode 100644 index 0000000000..8e25591b16 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_toast.c @@ -0,0 +1,192 @@ +/* + * zedstore_toast.c + * Routines for Toasting oversized tuples in Zedstore + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_toast.c + */ +#include "postgres.h" + +#include "access/zedstore_compression.h" +#include "access/zedstore_internal.h" +#include "storage/bufmgr.h" +#include "utils/datum.h" +#include "utils/rel.h" + +/* + * Toast a datum, inside the ZedStore file. + * + * This is similar to regular toasting, but instead of using a separate index and + * heap, the datum is stored within the same ZedStore file as all the btrees and + * stuff. A chain of "toast-pages" is allocated for the datum, and each page is filled + * with as much of the datum as possible. + * + * + * Note: You must call zedstore_toast_finish() after this, + * to set the TID in the toast-chain's first block. Otherwise, it's considered recyclable. + */ +Datum +zedstore_toast_datum(Relation rel, AttrNumber attno, Datum value) +{ + varatt_zs_toastptr *toastptr; + BlockNumber firstblk = InvalidBlockNumber; + Buffer buf = InvalidBuffer; + Page page; + ZSToastPageOpaque *opaque; + Buffer prevbuf = InvalidBuffer; + ZSToastPageOpaque *prevopaque = NULL; + char *ptr; + int32 total_size; + int32 offset; + + /* it's possible that this is the very first insertion to the relation. 
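+	 * If so, initialize the metapage before allocating any toast pages.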
*/ + if (RelationGetNumberOfBlocks(rel) == 0) + zsmeta_initmetapage(rel); + + /* TODO: try to compress it in place first. Maybe just call toast_compress_datum? */ + + /* + * If that doesn't reduce it enough, allocate a toast page + * for it. + */ + ptr = VARDATA_ANY(value); + total_size = VARSIZE_ANY_EXHDR(value); + offset = 0; + + while (total_size - offset > 0) + { + Size thisbytes; + + buf = zspage_getnewbuf(rel, InvalidBuffer); + if (prevbuf == InvalidBuffer) + firstblk = BufferGetBlockNumber(buf); + + page = BufferGetPage(buf); + PageInit(page, BLCKSZ, sizeof(ZSToastPageOpaque)); + + thisbytes = Min(total_size - offset, PageGetExactFreeSpace(page)); + + opaque = (ZSToastPageOpaque *) PageGetSpecialPointer(page); + opaque->zs_attno = attno; + opaque->zs_tid = InvalidZSTid; + opaque->zs_total_size = total_size; + opaque->zs_slice_offset = offset; + opaque->zs_prev = BufferIsValid(prevbuf) ? BufferGetBlockNumber(prevbuf) : InvalidBlockNumber; + opaque->zs_next = InvalidBlockNumber; + opaque->zs_flags = 0; + opaque->zs_page_id = ZS_TOAST_PAGE_ID; + + memcpy((char *) page + SizeOfPageHeaderData, ptr, thisbytes); + ((PageHeader) page)->pd_lower += thisbytes; + ptr += thisbytes; + offset += thisbytes; + + if (prevbuf != InvalidBuffer) + { + prevopaque->zs_next = BufferGetBlockNumber(buf); + MarkBufferDirty(prevbuf); + } + + /* TODO: WAL-log */ + MarkBufferDirty(buf); + + if (prevbuf != InvalidBuffer) + UnlockReleaseBuffer(prevbuf); + prevbuf = buf; + prevopaque = opaque; + } + + UnlockReleaseBuffer(buf); + + toastptr = palloc0(sizeof(varatt_zs_toastptr)); + SET_VARTAG_1B_E(toastptr, VARTAG_ZEDSTORE); + toastptr->zst_block = firstblk; + + return PointerGetDatum(toastptr); +} + +void +zedstore_toast_finish(Relation rel, AttrNumber attno, Datum toasted, zstid tid) +{ + varatt_zs_toastptr *toastptr = (varatt_zs_toastptr *) DatumGetPointer(toasted); + Buffer buf; + Page page; + ZSToastPageOpaque *opaque; + + Assert(toastptr->va_tag == VARTAG_ZEDSTORE); + + buf = ReadBuffer(rel, toastptr->zst_block); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + opaque = (ZSToastPageOpaque *) PageGetSpecialPointer(page); + + Assert(opaque->zs_tid == InvalidZSTid); + Assert(opaque->zs_attno == attno); + Assert(opaque->zs_prev == InvalidBlockNumber); + + opaque->zs_tid = tid; + + /* TODO: WAL-log */ + MarkBufferDirty(buf); + + UnlockReleaseBuffer(buf); +} + +Datum +zedstore_toast_flatten(Relation rel, AttrNumber attno, zstid tid, Datum toasted) +{ + varatt_zs_toastptr *toastptr = (varatt_zs_toastptr *) DatumGetPointer(toasted); + BlockNumber nextblk; + BlockNumber prevblk; + char *result = NULL; + char *ptr = NULL; + int32 total_size = 0; + + Assert(toastptr->va_tag == VARTAG_ZEDSTORE); + + prevblk = InvalidBlockNumber; + nextblk = toastptr->zst_block; + + while (nextblk != InvalidBlockNumber) + { + Buffer buf; + Page page; + ZSToastPageOpaque *opaque; + uint32 size; + + buf = ReadBuffer(rel, nextblk); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_SHARE); + + opaque = (ZSToastPageOpaque *) PageGetSpecialPointer(page); + + Assert(opaque->zs_attno == attno); + Assert(opaque->zs_prev == prevblk); + + if (prevblk == InvalidBlockNumber) + { + Assert(opaque->zs_tid == tid); + + total_size = opaque->zs_total_size; + + result = palloc(total_size + VARHDRSZ); + SET_VARSIZE(result, total_size + VARHDRSZ); + ptr = result + VARHDRSZ; + } + + size = ((PageHeader) page)->pd_lower - SizeOfPageHeaderData; + memcpy(ptr, (char *) page + SizeOfPageHeaderData, size); + ptr += size; + + prevblk = 
nextblk; + nextblk = opaque->zs_next; + UnlockReleaseBuffer(buf); + } + Assert(total_size > 0); + Assert(ptr == result + total_size + VARHDRSZ); + + return PointerGetDatum(result); +} diff --git a/src/backend/access/zedstore/zedstore_tupslot.c b/src/backend/access/zedstore/zedstore_tupslot.c new file mode 100644 index 0000000000..8528287d51 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_tupslot.c @@ -0,0 +1,348 @@ +/* + * zedstore_tupslot.c + * Implementation of a TupleTableSlot for zedstore. + * + * This implementation is identical to a Virtual tuple slot + * (TTSOpsVirtual), but it has a slot_getsysattr() implementation + * that can fetch and compute the 'xmin' for the tuple. + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_tupslot.c + */ +#include "postgres.h" + +#include "access/table.h" +#include "access/zedstore_internal.h" +#include "executor/tuptable.h" +#include "utils/expandeddatum.h" + +const TupleTableSlotOps TTSOpsZedstore; + + +typedef struct ZedstoreTupleTableSlot +{ + TupleTableSlot base; + + char *data; /* data for materialized slots */ +} ZedstoreTupleTableSlot; + + +static void +tts_zedstore_init(TupleTableSlot *slot) +{ +} + +static void +tts_zedstore_release(TupleTableSlot *slot) +{ +} + +static void +tts_zedstore_clear(TupleTableSlot *slot) +{ + if (unlikely(TTS_SHOULDFREE(slot))) + { + ZedstoreTupleTableSlot *vslot = (ZedstoreTupleTableSlot *) slot; + + pfree(vslot->data); + vslot->data = NULL; + + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); +} + +/* + * Attribute values are readily available in tts_values and tts_isnull array + * in a ZedstoreTupleTableSlot. So there should be no need to call either of the + * following two functions. + */ +static void +tts_zedstore_getsomeattrs(TupleTableSlot *slot, int natts) +{ + elog(ERROR, "getsomeattrs is not required to be called on a zedstore tuple table slot"); +} + +static void +zs_get_xmin_cmin(Relation rel, ZSUndoRecPtr recent_oldest_undo, zstid tid, ZSUndoRecPtr undo_ptr, + TransactionId *xmin, CommandId *cmin) +{ + TransactionId this_xmin; + CommandId this_cmin; + ZSUndoRec *undorec; + + /* + * Follow the chain of UNDO records for this tuple, to find the + * transaction that originally inserted the row (xmin/cmin). + * + * XXX: this is similar logic to zs_cluster_process_tuple(). Can + * we merge it? + */ + this_xmin = FrozenTransactionId; + this_cmin = InvalidCommandId; + + for (;;) + { + if (undo_ptr.counter < recent_oldest_undo.counter) + { + /* This tuple version is visible to everyone. */ + break; + } + + /* Fetch the next UNDO record. */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + this_xmin = undorec->xid; + this_cmin = undorec->cid; + break; + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK || + undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + undo_ptr = undorec->prevundorec; + continue; + } + } + + *xmin = this_xmin; + *cmin = this_cmin; +} + +/* + * We only support fetching 'xmin', currently. It's needed for referential + * integrity triggers (i.e. foreign keys). 
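+ * ('cmin' is handled the same way below: both are found by walking the
+ * tuple's UNDO chain back to its ZSUNDO_TYPE_INSERT record, see
+ * zs_get_xmin_cmin().)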
+ */ +static Datum +tts_zedstore_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + if (attnum == MinTransactionIdAttributeNumber || + attnum == MinCommandIdAttributeNumber) + { + zstid tid = ZSTidFromItemPointer(slot->tts_tid); + ZSBtreeScan btree_scan; + bool found; + Relation rel; + ZSUndoRecPtr recent_oldest_undo; + TransactionId xmin; + CommandId cmin; + + /* + * We assume that the table OID and TID in the slot are set. We + * fetch the tuple from the table, and follow its UNDO chain to + * find the transaction that inserted it. + * + * XXX: This is very slow compared to e.g. the heap, where we + * always store the xmin in tuple itself. We should probably do + * the same in zedstore, and add extra fields in the slot to hold + * xmin/cmin and fill them in when we fetch the tuple and check its + * visibility for the first time. + */ + if (!OidIsValid(slot->tts_tableOid)) + elog(ERROR, "zedstore tuple table slot does not have a table oid"); + + /* assume the caller is already holding a suitable lock on the table */ + rel = table_open(slot->tts_tableOid, NoLock); + recent_oldest_undo = zsundo_get_oldest_undo_ptr(rel); + + /* Use the meta-data tree for the visibility information. */ + zsbt_tid_begin_scan(rel, tid, tid + 1, SnapshotAny, &btree_scan); + + found = zsbt_tid_scan_next(&btree_scan) != InvalidZSTid; + if (!found) + elog(ERROR, "could not find zedstore tuple (%u, %u)", + ZSTidGetBlockNumber(tid), ZSTidGetOffsetNumber(tid)); + + zs_get_xmin_cmin(rel, recent_oldest_undo, tid, btree_scan.array_undoptr, &xmin, &cmin); + + zsbt_tid_end_scan(&btree_scan); + + table_close(rel, NoLock); + + *isnull = false; + if (attnum == MinTransactionIdAttributeNumber) + return TransactionIdGetDatum(xmin); + else + { + Assert(attnum == MinCommandIdAttributeNumber); + return CommandIdGetDatum(cmin); + } + } + elog(ERROR, "zedstore tuple table slot does not have system attributes (except xmin and cmin)"); + + return 0; /* silence compiler warnings */ +} + +/* + * To materialize a zedstore slot all the datums that aren't passed by value + * have to be copied into the slot's memory context. To do so, compute the + * required size, and allocate enough memory to store all attributes. That's + * good for cache hit ratio, but more importantly requires only memory + * allocation/deallocation. + */ +static void +tts_zedstore_materialize(TupleTableSlot *slot) +{ + ZedstoreTupleTableSlot *vslot = (ZedstoreTupleTableSlot *) slot; + TupleDesc desc = slot->tts_tupleDescriptor; + Size sz = 0; + char *data; + + /* already materialized */ + if (TTS_SHOULDFREE(slot)) + return; + + /* compute size of memory required */ + for (int natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute att = TupleDescAttr(desc, natt); + Datum val; + + if (att->attbyval || slot->tts_isnull[natt]) + continue; + + val = slot->tts_values[natt]; + + if (att->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + /* + * We want to flatten the expanded value so that the materialized + * slot doesn't depend on it. 
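+			 * EOH_get_flat_size() tells us how much room the flattened copy
+			 * will need in the slot's buffer.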
+ */ + sz = att_align_nominal(sz, att->attalign); + sz += EOH_get_flat_size(DatumGetEOHP(val)); + } + else + { + sz = att_align_nominal(sz, att->attalign); + sz = att_addlength_datum(sz, att->attlen, val); + } + } + + /* all data is byval */ + if (sz == 0) + return; + + /* allocate memory */ + vslot->data = data = MemoryContextAlloc(slot->tts_mcxt, sz); + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + /* and copy all attributes into the pre-allocated space */ + for (int natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute att = TupleDescAttr(desc, natt); + Datum val; + + if (att->attbyval || slot->tts_isnull[natt]) + continue; + + val = slot->tts_values[natt]; + + if (att->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + Size data_length; + + /* + * We want to flatten the expanded value so that the materialized + * slot doesn't depend on it. + */ + ExpandedObjectHeader *eoh = DatumGetEOHP(val); + + data = (char *) att_align_nominal(data, + att->attalign); + data_length = EOH_get_flat_size(eoh); + EOH_flatten_into(eoh, data, data_length); + + slot->tts_values[natt] = PointerGetDatum(data); + data += data_length; + } + else + { + Size data_length = 0; + + data = (char *) att_align_nominal(data, att->attalign); + data_length = att_addlength_datum(data_length, att->attlen, val); + + memcpy(data, DatumGetPointer(val), data_length); + + slot->tts_values[natt] = PointerGetDatum(data); + data += data_length; + } + } +} + +static void +tts_zedstore_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + TupleDesc srcdesc = dstslot->tts_tupleDescriptor; + + Assert(srcdesc->natts <= dstslot->tts_tupleDescriptor->natts); + + tts_zedstore_clear(dstslot); + + slot_getallattrs(srcslot); + + for (int natt = 0; natt < srcdesc->natts; natt++) + { + dstslot->tts_values[natt] = srcslot->tts_values[natt]; + dstslot->tts_isnull[natt] = srcslot->tts_isnull[natt]; + } + + dstslot->tts_nvalid = srcdesc->natts; + dstslot->tts_flags &= ~TTS_FLAG_EMPTY; + + /* make sure storage doesn't depend on external memory */ + tts_zedstore_materialize(dstslot); +} + +static HeapTuple +tts_zedstore_copy_heap_tuple(TupleTableSlot *slot) +{ + Assert(!TTS_EMPTY(slot)); + + return heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + +} + +static MinimalTuple +tts_zedstore_copy_minimal_tuple(TupleTableSlot *slot) +{ + Assert(!TTS_EMPTY(slot)); + + return heap_form_minimal_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); +} + + +const TupleTableSlotOps TTSOpsZedstore = { + .base_slot_size = sizeof(ZedstoreTupleTableSlot), + .init = tts_zedstore_init, + .release = tts_zedstore_release, + .clear = tts_zedstore_clear, + .getsomeattrs = tts_zedstore_getsomeattrs, + .getsysattr = tts_zedstore_getsysattr, + .materialize = tts_zedstore_materialize, + .copyslot = tts_zedstore_copyslot, + + /* + * A zedstore tuple table slot can not "own" a heap tuple or a minimal + * tuple. + */ + .get_heap_tuple = NULL, + .get_minimal_tuple = NULL, + .copy_heap_tuple = tts_zedstore_copy_heap_tuple, + .copy_minimal_tuple = tts_zedstore_copy_minimal_tuple +}; diff --git a/src/backend/access/zedstore/zedstore_undo.c b/src/backend/access/zedstore/zedstore_undo.c new file mode 100644 index 0000000000..0767307253 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_undo.c @@ -0,0 +1,918 @@ +/* + * zedstore_undo.c + * Temporary UNDO-logging for zedstore. + * + * XXX: This is hopefully replaced with an upstream UNDO facility later. 
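+ * For now, UNDO records live in a chain of UNDO pages inside the relation
+ * itself: zsundo_insert() appends records to the tail page (tracked in the
+ * metapage), and VACUUM scans from the head to trim away records that are
+ * no longer needed, undoing the effects of aborted transactions as it goes.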
+ * + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_undo.c + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/multixact.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/lsyscache.h" + +/* + * Working area for zsundo_scan(). + */ +typedef struct ZSUndoTrimStats +{ + /* List of TIDs of tuples we intend to delete */ + /* NB: this list is ordered by TID address */ + int num_dead_tuples; /* current # of entries */ + int max_dead_tuples; /* # slots allocated in array */ + ItemPointer dead_tuples; /* array of ItemPointerData */ + bool dead_tuples_overflowed; + + BlockNumber deleted_undo_pages; + + bool can_advance_oldestundorecptr; +} ZSUndoTrimStats; + +/* + * Working area for VACUUM. + */ +typedef struct ZSVacRelStats +{ + int elevel; + BufferAccessStrategy vac_strategy; + + /* hasindex = true means two-pass strategy; false means one-pass */ + bool hasindex; + /* Overall statistics about rel */ + BlockNumber old_rel_pages; /* previous value of pg_class.relpages */ + BlockNumber rel_pages; /* total number of pages */ + BlockNumber scanned_pages; /* number of pages we examined */ + BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */ + BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */ + BlockNumber tupcount_pages; /* pages whose tuples we counted */ + double old_live_tuples; /* previous value of pg_class.reltuples */ + double new_rel_tuples; /* new estimated total # of tuples */ + double new_live_tuples; /* new estimated total # of live tuples */ + double new_dead_tuples; /* new estimated total # of dead tuples */ + BlockNumber pages_removed; + double tuples_deleted; + BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + + ZSUndoTrimStats trimstats; +} ZSVacRelStats; + +/* + * Guesstimation of number of dead tuples per page. This is used to + * provide an upper limit to memory allocated when vacuuming small + * tables. + */ +#define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage + +static int zs_vac_cmp_itemptr(const void *left, const void *right); +static bool zs_lazy_tid_reaped(ItemPointer itemptr, void *state); +static void lazy_space_alloc(ZSVacRelStats *vacrelstats, BlockNumber relblocks); +static void lazy_vacuum_index(Relation indrel, + IndexBulkDeleteResult **stats, + ZSVacRelStats *vacrelstats); +static void lazy_cleanup_index(Relation indrel, + IndexBulkDeleteResult *stats, + ZSVacRelStats *vacrelstats); +static ZSUndoRecPtr zsundo_scan(Relation rel, TransactionId OldestXmin, ZSUndoTrimStats *trimstats, BlockNumber *oldest_undopage, List **unused_pages); +static void zsundo_update_oldest_ptr(Relation rel, ZSUndoRecPtr oldest_undorecptr, BlockNumber oldest_undopage, List *unused_pages); +static void zsundo_record_dead_tuple(ZSUndoTrimStats *trimstats, zstid tid); + +/* + * Insert the given UNDO record to the UNDO log. 
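+ * Returns a ZSUndoRecPtr for the inserted record: a monotonically increasing
+ * counter taken from the metapage, plus the block and offset the record was
+ * written at. Callers store it in the item they are modifying; see
+ * zsbt_tid_delete() for a typical caller.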
+ */ +ZSUndoRecPtr +zsundo_insert(Relation rel, ZSUndoRec *rec) +{ + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber tail_blk; + Buffer tail_buf = InvalidBuffer; + Page tail_pg = NULL; + ZSUndoPageOpaque *tail_opaque = NULL; + char *dst; + ZSUndoRecPtr undorecptr; + int offset; + uint64 undo_counter; + + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + + /* TODO: get share lock to begin with, for more concurrency */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + +retry_lock_tail: + tail_blk = metaopaque->zs_undo_tail; + + /* + * Is there space on the tail page? If not, allocate a new UNDO page. + */ + if (tail_blk != InvalidBlockNumber) + { + tail_buf = ReadBuffer(rel, tail_blk); + LockBuffer(tail_buf, BUFFER_LOCK_EXCLUSIVE); + tail_pg = BufferGetPage(tail_buf); + tail_opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(tail_pg); + } + if (tail_blk == InvalidBlockNumber || PageGetExactFreeSpace(tail_pg) < rec->size) + { + Buffer newbuf; + BlockNumber newblk; + Page newpage; + ZSUndoPageOpaque *newopaque; + + /* + * Release the lock on the metapage while we find a new block, because + * that could take a while. (And accessing the Free Page Map might lock + * the metapage, too, causing self-deadlock.) + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* new page */ + newbuf = zspage_getnewbuf(rel, metabuf); + + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + if (metaopaque->zs_undo_tail != tail_blk) + { + /* + * It should not be possible for another backend to extend the UNDO log + * while we're holding the tail block locked. + */ + if (tail_blk != InvalidBlockNumber) + elog(ERROR, "UNDO tail block pointer was changed unexpectedly"); + + /* + * we don't need the new page, after all. (Or maybe we do, if the new + * tail block is already full, but we're not smart about it.) + */ + zspage_delete_page(rel, newbuf); + goto retry_lock_tail; + } + + newblk = BufferGetBlockNumber(newbuf); + newpage = BufferGetPage(newbuf); + PageInit(newpage, BLCKSZ, sizeof(ZSUndoPageOpaque)); + newopaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(newpage); + newopaque->next = InvalidBlockNumber; + newopaque->zs_page_id = ZS_UNDO_PAGE_ID; + + metaopaque->zs_undo_tail = newblk; + if (tail_blk == InvalidBlockNumber) + metaopaque->zs_undo_head = newblk; + + MarkBufferDirty(metabuf); + + if (tail_blk != InvalidBlockNumber) + { + tail_opaque->next = newblk; + MarkBufferDirty(tail_buf); + UnlockReleaseBuffer(tail_buf); + } + + tail_blk = newblk; + tail_buf = newbuf; + tail_pg = newpage; + tail_opaque = newopaque; + } + + undo_counter = metaopaque->zs_undo_counter++; + MarkBufferDirty(metabuf); + + UnlockReleaseBuffer(metabuf); + + /* insert the record to this page */ + offset = ((PageHeader) tail_pg)->pd_lower; + + undorecptr.counter = undo_counter; + undorecptr.blkno = tail_blk; + undorecptr.offset = offset; + rec->undorecptr = undorecptr; + dst = ((char *) tail_pg) + offset; + memcpy(dst, rec, rec->size); + ((PageHeader) tail_pg)->pd_lower += rec->size; + MarkBufferDirty(tail_buf); + UnlockReleaseBuffer(tail_buf); + + return undorecptr; +} + +/* + * Fetch the UNDO record with the given undo-pointer. + * + * The returned record is a palloc'd copy. 
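+ * Errors out if 'undoptr' does not point into a valid UNDO page, or if the
+ * record found there does not carry the expected undo record pointer.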
+ */ +ZSUndoRec * +zsundo_fetch(Relation rel, ZSUndoRecPtr undoptr) +{ + Buffer buf; + Page page; + PageHeader pagehdr; + ZSUndoPageOpaque *opaque; + ZSUndoRec *undorec; + ZSUndoRec *undorec_copy; + + buf = ReadBuffer(rel, undoptr.blkno); + page = BufferGetPage(buf); + pagehdr = (PageHeader) page; + + LockBuffer(buf, BUFFER_LOCK_SHARE); + if (PageIsNew(page)) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u; not an UNDO page", + undoptr.counter, undoptr.blkno, undoptr.offset); + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u; not an UNDO page", + undoptr.counter, undoptr.blkno, undoptr.offset); + + /* Sanity check that the pointer pointed to a valid place */ + if (undoptr.offset < SizeOfPageHeaderData || + undoptr.offset + sizeof(ZSUndoRec) > pagehdr->pd_lower) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u", + undoptr.counter, undoptr.blkno, undoptr.offset); + + undorec = (ZSUndoRec *) (((char *) page) + undoptr.offset); + + if (memcmp(&undorec->undorecptr, &undoptr, sizeof(ZSUndoRecPtr)) != 0) + elog(ERROR, "could not find UNDO record"); + + undorec_copy = palloc(undorec->size); + memcpy(undorec_copy, undorec, undorec->size); + + UnlockReleaseBuffer(buf); + + return undorec_copy; +} + +void +zsundo_clear_speculative_token(Relation rel, ZSUndoRecPtr undoptr) +{ + Buffer buf; + Page page; + PageHeader pagehdr; + ZSUndoPageOpaque *opaque; + ZSUndoRec *undorec; + + buf = ReadBuffer(rel, undoptr.blkno); + page = BufferGetPage(buf); + pagehdr = (PageHeader) page; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u; not an UNDO page", + undoptr.counter, undoptr.blkno, undoptr.offset); + + /* Sanity check that the pointer pointed to a valid place */ + if (undoptr.offset < SizeOfPageHeaderData || + undoptr.offset + sizeof(ZSUndoRec) > pagehdr->pd_lower) + elog(ERROR, "could not find UNDO record " UINT64_FORMAT " at blk %u offset %u", + undoptr.counter, undoptr.blkno, undoptr.offset); + + undorec = (ZSUndoRec *) (((char *) page) + undoptr.offset); + + if (undorec->type != ZSUNDO_TYPE_INSERT) + elog(ERROR, "unexpected undo record type %d on speculatively inserted row", undorec->type); + + undorec->speculative_token = INVALID_SPECULATIVE_TOKEN; + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); +} + +static bool +zs_lazy_tid_reaped(ItemPointer itemptr, void *state) +{ + ZSVacRelStats *vacrelstats = (ZSVacRelStats *) state; + ItemPointer res; + + res = (ItemPointer) bsearch((void *) itemptr, + (void *) vacrelstats->trimstats.dead_tuples, + vacrelstats->trimstats.num_dead_tuples, + sizeof(ItemPointerData), + zs_vac_cmp_itemptr); + + return (res != NULL); +} + +/* + * Comparator routines for use with qsort() and bsearch(). 
+ */ +static int +zs_vac_cmp_itemptr(const void *left, const void *right) +{ + BlockNumber lblk, + rblk; + OffsetNumber loff, + roff; + + lblk = ItemPointerGetBlockNumber((ItemPointer) left); + rblk = ItemPointerGetBlockNumber((ItemPointer) right); + + if (lblk < rblk) + return -1; + if (lblk > rblk) + return 1; + + loff = ItemPointerGetOffsetNumber((ItemPointer) left); + roff = ItemPointerGetOffsetNumber((ItemPointer) right); + + if (loff < roff) + return -1; + if (loff > roff) + return 1; + + return 0; +} + +void +zsundo_vacuum(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy, + TransactionId OldestXmin) +{ + ZSVacRelStats *vacrelstats; + ZSUndoTrimStats *trimstats; + Relation *Irel; + int nindexes; + IndexBulkDeleteResult **indstats; + BlockNumber nblocks; + + nblocks = RelationGetNumberOfBlocks(rel); + if (nblocks == 0) + return; /* empty table */ + + vacrelstats = (ZSVacRelStats *) palloc0(sizeof(ZSVacRelStats)); + trimstats = &vacrelstats->trimstats; + + if (params->options & VACOPT_VERBOSE) + vacrelstats->elevel = INFO; + else + vacrelstats->elevel = DEBUG2; + vacrelstats->vac_strategy = bstrategy; + + /* Open all indexes of the relation */ + vac_open_indexes(rel, RowExclusiveLock, &nindexes, &Irel); + vacrelstats->hasindex = (nindexes > 0); + indstats = (IndexBulkDeleteResult **) + palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); + + lazy_space_alloc(vacrelstats, nblocks); + + ereport(vacrelstats->elevel, + (errmsg("vacuuming \"%s.%s\"", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)))); + + do + { + ZSUndoRecPtr reaped_upto; + BlockNumber oldest_undopage; + int j; + List *unused_pages = NIL; + + trimstats->dead_tuples_overflowed = false; + trimstats->num_dead_tuples = 0; + trimstats->deleted_undo_pages = 0; + + reaped_upto = zsundo_scan(rel, OldestXmin, trimstats, &oldest_undopage, &unused_pages); + + if (trimstats->num_dead_tuples > 0) + { + pg_qsort(trimstats->dead_tuples, trimstats->num_dead_tuples, + sizeof(ItemPointerData), zs_vac_cmp_itemptr); + /* TODO: currently, we write a separate UNDO record for each attribute, so there will + * be duplicates. Eliminate them. */ + j = 1; + for (int i = 1; i < trimstats->num_dead_tuples; i++) + { + if (!ItemPointerEquals(&trimstats->dead_tuples[j - 1], + &trimstats->dead_tuples[i])) + trimstats->dead_tuples[j++] = trimstats->dead_tuples[i]; + } + trimstats->num_dead_tuples = j; + + /* Remove index entries */ + for (int i = 0; i < nindexes; i++) + lazy_vacuum_index(Irel[i], + &indstats[i], + vacrelstats); + + /* + * Mark the items as dead in the attribute b-trees. + * + * We cannot remove them immediately, because we must prevent the TIDs from + * being reused, until we have trimmed the UNDO records. Otherwise, this might + * happen: + * + * 1. We remove items from all the B-trees. + * 2. An inserter reuses the now-unused TID for a new tuple + * 3. We abort the VACUUM, for some reason + * 4. We start VACUUM again. We will now try to remove the item again, but + * we will remove the new item with the same TID instead. + * + * There would be other ways to deal with it. For example in step #4, we could + * refrain from removing items, whose UNDO pointers are newer than expected. + * But that's tricky, because we scan the indexes first, and we must refrain + * from removing index entries for new items, too. 
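+			 * So for now, only mark the TIDs as dead in the TID tree here;
+			 * they do not become reusable until the corresponding UNDO
+			 * records have been trimmed away.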
+ */ + for (int i = 0; i < trimstats->num_dead_tuples; i++) + zsbt_tid_mark_dead(rel, + ZSTidFromItemPointer(trimstats->dead_tuples[i]), + reaped_upto); + + for (int attno = 1; attno <= RelationGetNumberOfAttributes(rel); attno++) + { + for (int i = 0; i < trimstats->num_dead_tuples; i++) + zsbt_attr_remove(rel, attno, ZSTidFromItemPointer(trimstats->dead_tuples[i])); + } + } + + /* + * The UNDO records for the tuple versions we just removed are no longer + * interesting to anyone. Advance the UNDO tail, so that the UNDO pages + * can be recycled. + */ + zsundo_update_oldest_ptr(rel, reaped_upto, oldest_undopage, unused_pages); + + ereport(vacrelstats->elevel, + (errmsg("\"%s\": removed %d row versions and %d undo pages", + RelationGetRelationName(rel), + trimstats->num_dead_tuples, + trimstats->deleted_undo_pages))); + } while(trimstats->dead_tuples_overflowed); + + /* Do post-vacuum cleanup and statistics update for each index */ + for (int i = 0; i < nindexes; i++) + lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); + + /* Done with indexes */ + vac_close_indexes(nindexes, Irel, NoLock); +} + + +/* + * lazy_space_alloc - space allocation decisions for lazy vacuum + * + * See the comments at the head of this file for rationale. + */ +static void +lazy_space_alloc(ZSVacRelStats *vacrelstats, BlockNumber relblocks) +{ + long maxtuples; + int vac_work_mem = IsAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem : maintenance_work_mem; + + if (vacrelstats->hasindex) + { + maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData); + maxtuples = Min(maxtuples, INT_MAX); + maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData)); + + /* curious coding here to ensure the multiplication can't overflow */ + if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks) + maxtuples = relblocks * LAZY_ALLOC_TUPLES; + + /* stay sane if small maintenance_work_mem */ + maxtuples = Max(maxtuples, MaxHeapTuplesPerPage); + } + else + { + /* + * TODO: In heap vacuum code, this is MaxHeapTuplesPerPage. We have no + * particular reason to size this by that, but the same principle applies: + * without indexes, it's pretty cheap to do multiple iterations, so let's + * avoid making a huge allocation + */ + maxtuples = 1000; + } + + vacrelstats->trimstats.num_dead_tuples = 0; + vacrelstats->trimstats.max_dead_tuples = (int) maxtuples; + vacrelstats->trimstats.dead_tuples = (ItemPointer) + palloc(maxtuples * sizeof(ItemPointerData)); +} + +/* + * lazy_vacuum_index() -- vacuum one index relation. + * + * Delete all the index entries pointing to tuples listed in + * vacrelstats->dead_tuples, and update running statistics. 
+ */ +static void +lazy_vacuum_index(Relation indrel, + IndexBulkDeleteResult **stats, + ZSVacRelStats *vacrelstats) +{ + IndexVacuumInfo ivinfo; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + ivinfo.index = indrel; + ivinfo.analyze_only = false; + ivinfo.estimated_count = true; + ivinfo.message_level = vacrelstats->elevel; + /* We can only provide an approximate value of num_heap_tuples here */ + ivinfo.num_heap_tuples = vacrelstats->old_live_tuples; + ivinfo.strategy = vacrelstats->vac_strategy; + + /* Do bulk deletion */ + *stats = index_bulk_delete(&ivinfo, *stats, + zs_lazy_tid_reaped, (void *) vacrelstats); + + ereport(vacrelstats->elevel, + (errmsg("scanned index \"%s\" to remove %d row versions", + RelationGetRelationName(indrel), + vacrelstats->trimstats.num_dead_tuples), + errdetail_internal("%s", pg_rusage_show(&ru0)))); +} + +/* + * lazy_cleanup_index() -- do post-vacuum cleanup for one index relation. + */ +static void +lazy_cleanup_index(Relation indrel, + IndexBulkDeleteResult *stats, + ZSVacRelStats *vacrelstats) +{ + IndexVacuumInfo ivinfo; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + ivinfo.index = indrel; + ivinfo.analyze_only = false; + ivinfo.estimated_count = (vacrelstats->tupcount_pages < vacrelstats->rel_pages); + ivinfo.message_level = vacrelstats->elevel; + + /* + * Now we can provide a better estimate of total number of surviving + * tuples (we assume indexes are more interested in that than in the + * number of nominally live tuples). + */ + ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples; + ivinfo.strategy = vacrelstats->vac_strategy; + + stats = index_vacuum_cleanup(&ivinfo, stats); + + if (!stats) + return; + + /* + * Now update statistics in pg_class, but only if the index says the count + * is accurate. + */ + if (!stats->estimated_count) + vac_update_relstats(indrel, + stats->num_pages, + stats->num_index_tuples, + 0, + false, + InvalidTransactionId, + InvalidMultiXactId, + false); + + ereport(vacrelstats->elevel, + (errmsg("index \"%s\" now contains %.0f row versions in %u pages", + RelationGetRelationName(indrel), + stats->num_index_tuples, + stats->num_pages), + errdetail("%.0f index row versions were removed.\n" + "%u index pages have been deleted, %u are currently reusable.\n" + "%s.", + stats->tuples_removed, + stats->pages_deleted, stats->pages_free, + pg_rusage_show(&ru0)))); + + pfree(stats); +} + +/* + * Scan the UNDO log, starting from oldest entry. For every tuple that is + * now considered dead, add it to 'dead_tuples'. Records for committed + * transactions can be trimmed away immediately. + * + * Returns the value that the oldest UNDO ptr can be trimmed upto, after + * removing all the dead TIDs. + * + * The caller must initialize ZSUndoTrimStats. This function updates the + * counters, and adds dead TIDs that can be removed to trimstats->dead_tuples. + * If there are more dead TIDs than fit in the dead_tuples array, this + * function sets trimstats->dead_tuples_overflow flag, and stops just before + * the UNDO record for the TID that did not fit. An important special case is + * calling this with trimstats->max_dead_tuples == 0. In that case, we scan + * as much as is possible without scanning the indexes (i.e. only UNDO + * records belonging to committed transactions at the tail of the UNDO log). + * IOW, it returns the oldest UNDO rec pointer that is still needed by + * active snapshots. 
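+ * zsundo_vacuum() calls this in a loop, repeating the scan for as long as
+ * dead_tuples_overflowed is still set after a pass.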
+ */ +static ZSUndoRecPtr +zsundo_scan(Relation rel, TransactionId OldestXmin, ZSUndoTrimStats *trimstats, + BlockNumber *oldest_undopage, List **unused_pages) +{ + /* Scan the undo log from oldest to newest */ + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + BlockNumber firstblk; + BlockNumber lastblk; + ZSUndoRecPtr oldest_undorecptr; + bool can_advance_oldestundorecptr; + char *ptr; + char *endptr; + + /* + * Get the current oldest undo page from the metapage. + */ + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_SHARE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + firstblk = metaopaque->zs_undo_head; + + oldest_undorecptr = metaopaque->zs_undo_oldestptr; + + /* + * If we assume that only one process can call TRIM at a time, then we + * don't need to hold the metapage locked. Alternatively, if multiple + * concurrent trims is possible, we could check after reading the head + * page, that it is the page we expect, and re-read the metapage if it's + * not. + * + * FIXME: Currently this works even if two backends call zsundo_trim() + * concurrently, because we never recycle UNDO pages. + */ + UnlockReleaseBuffer(metabuf); + + /* + * Loop through UNDO records, starting from the oldest page, until we + * hit a record that we cannot remove. + */ + lastblk = firstblk; + can_advance_oldestundorecptr = false; + while (lastblk != InvalidBlockNumber && !trimstats->dead_tuples_overflowed) + { + Buffer buf; + Page page; + ZSUndoPageOpaque *opaque; + + CHECK_FOR_INTERRUPTS(); + + /* Read the UNDO page */ + buf = ReadBuffer(rel, lastblk); + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_SHARE); + + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + elog(ERROR, "unexpected page id on UNDO page"); + + /* loop through all records on the page */ + endptr = (char *) page + ((PageHeader) page)->pd_lower; + ptr = (char *) page + SizeOfPageHeaderData; + while (ptr < endptr && !trimstats->dead_tuples_overflowed) + { + ZSUndoRec *undorec = (ZSUndoRec *) ptr; + bool did_commit; + + Assert(undorec->undorecptr.blkno == lastblk); + + if (undorec->undorecptr.counter < oldest_undorecptr.counter) + { + ptr += undorec->size; + continue; + } + oldest_undorecptr = undorec->undorecptr; + + if (!TransactionIdPrecedes(undorec->xid, OldestXmin)) + { + /* This is still needed. Bail out */ + break; + } + + /* + * No one thinks this transaction is in-progress anymore. If it + * committed, we can just trim away its UNDO record. If it aborted, + * we need to apply the UNDO record first. + */ + did_commit = TransactionIdDidCommit(undorec->xid); + + switch (undorec->type) + { + case ZSUNDO_TYPE_INSERT: + if (!did_commit) + zsundo_record_dead_tuple(trimstats, undorec->tid); + break; + case ZSUNDO_TYPE_DELETE: + if (did_commit) + { + zsundo_record_dead_tuple(trimstats, undorec->tid); + } + else + { + /* + * must clear the item's UNDO pointer, otherwise the deletion + * becomes visible to everyone when the UNDO record is trimmed + * away + */ + /* + * Don't do this if we're called from zsundo_get_oldest_undo_ptr(), + * because we might be holding a lock on the page, and deadlock. 
+ */ + if (trimstats->max_dead_tuples == 0) + trimstats->dead_tuples_overflowed = true; + else + zsbt_tid_undo_deletion(rel, undorec->tid, undorec->undorecptr); + } + break; + case ZSUNDO_TYPE_UPDATE: + if (did_commit) + zsundo_record_dead_tuple(trimstats, undorec->tid); + break; + } + + if (!trimstats->dead_tuples_overflowed) + { + ptr += undorec->size; + + can_advance_oldestundorecptr = true; + } + } + + if (ptr < endptr) + { + UnlockReleaseBuffer(buf); + break; + } + else + { + /* We processed all records on the page. Step to the next one, if any. */ + Assert(ptr == endptr); + *unused_pages = lappend_int(*unused_pages, lastblk); + lastblk = opaque->next; + UnlockReleaseBuffer(buf); + if (lastblk != InvalidBlockNumber) + trimstats->deleted_undo_pages++; + } + } + + if (can_advance_oldestundorecptr && lastblk == InvalidBlockNumber) + { + /* + * We stopped after the last valid record. Advance by one, to the next + * record which hasn't been created yet, and which is still needed + */ + oldest_undorecptr.counter++; + oldest_undorecptr.blkno = InvalidBlockNumber; + oldest_undorecptr.offset = 0; + } + + trimstats->can_advance_oldestundorecptr = can_advance_oldestundorecptr; + *oldest_undopage = lastblk; + return oldest_undorecptr; +} + +/* Update metapage with the oldest value */ +static void +zsundo_update_oldest_ptr(Relation rel, ZSUndoRecPtr oldest_undorecptr, BlockNumber oldest_undopage, List *unused_pages) +{ + /* Scan the undo log from oldest to newest */ + Buffer metabuf; + Page metapage; + ZSMetaPageOpaque *metaopaque; + ListCell *lc; + + metabuf = ReadBuffer(rel, ZS_META_BLK); + metapage = BufferGetPage(metabuf); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + metaopaque = (ZSMetaPageOpaque *) PageGetSpecialPointer(metapage); + + metaopaque->zs_undo_oldestptr = oldest_undorecptr; + if (oldest_undopage == InvalidBlockNumber) + { + metaopaque->zs_undo_head = InvalidBlockNumber; + metaopaque->zs_undo_tail = InvalidBlockNumber; + } + else + metaopaque->zs_undo_head = oldest_undopage; + + /* TODO: WAL-log */ + + MarkBufferDirty(metabuf); + UnlockReleaseBuffer(metabuf); + + foreach(lc, unused_pages) + { + BlockNumber blk = (BlockNumber) lfirst_int(lc); + Buffer buf; + Page page; + ZSUndoPageOpaque *opaque; + + /* check that the page still looks like what we'd expect. */ + buf = ReadBuffer(rel, blk); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + if (PageIsEmpty(page) || + PageGetSpecialSize(page) != MAXALIGN(sizeof(ZSUndoPageOpaque))) + { + UnlockReleaseBuffer(buf); + continue; + } + opaque = (ZSUndoPageOpaque *) PageGetSpecialPointer(page); + if (opaque->zs_page_id != ZS_UNDO_PAGE_ID) + { + UnlockReleaseBuffer(buf); + continue; + } + + /* FIXME: Also check here that the max UndoRecPtr on the page is less + * than the new 'oldest_undorecptr' + */ + + zspage_delete_page(rel, buf); + UnlockReleaseBuffer(buf); + } +} + +/* + * zsundo_record_dead_tuple - remember one deletable tuple + */ +static void +zsundo_record_dead_tuple(ZSUndoTrimStats *trimstats, zstid tid) +{ + /* + * The array shouldn't overflow under normal behavior, but perhaps it + * could if we are given a really small maintenance_work_mem. In that + * case, just forget the last few tuples (we'll get 'em next time). 
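+	 *
+	 * Setting dead_tuples_overflowed makes zsundo_scan() stop before the
+	 * record that did not fit, and the caller's do-while loop then performs
+	 * another index-vacuum pass to pick up the remaining tuples.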
+ */
+	if (trimstats->num_dead_tuples < trimstats->max_dead_tuples)
+	{
+		trimstats->dead_tuples[trimstats->num_dead_tuples] = ItemPointerFromZSTid(tid);
+		trimstats->num_dead_tuples++;
+		pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
+									 trimstats->num_dead_tuples);
+	}
+	else
+		trimstats->dead_tuples_overflowed = true;
+}
+
+/*
+ * Return the current "oldest undo pointer". The effects of any action with
+ * an undo pointer older than this are known to be visible to everyone (i.e.
+ * an inserted tuple is known to be visible, and a deleted tuple is known to
+ * be invisible).
+ */
+ZSUndoRecPtr
+zsundo_get_oldest_undo_ptr(Relation rel)
+{
+	ZSUndoRecPtr result;
+	ZSUndoTrimStats trimstats;
+	BlockNumber oldest_undopage;
+	List	   *unused_pages = NIL;
+
+	if (RelationGetNumberOfBlocks(rel) == 0)
+	{
+		memset(&result, 0, sizeof(ZSUndoRecPtr));
+		return result;
+	}
+
+	/*
+	 * Call zsundo_scan with max_dead_tuples = 0. It scans the UNDO log,
+	 * starting from the oldest record, and advances the oldest UNDO pointer
+	 * past as many committed, visible-to-all transactions as possible.
+	 *
+	 * TODO:
+	 * We could get the latest cached value directly from the metapage, but
+	 * this allows trimming the UNDO log more aggressively, whenever we're
+	 * scanning. Fetching records from the UNDO log is pretty expensive,
+	 * so until that is somehow sped up, it is a good tradeoff to be
+	 * aggressive about that.
+	 */
+	trimstats.num_dead_tuples = 0;
+	trimstats.max_dead_tuples = 0;
+	trimstats.dead_tuples = NULL;
+	trimstats.dead_tuples_overflowed = false;
+	trimstats.deleted_undo_pages = 0;
+	result = zsundo_scan(rel, RecentGlobalXmin, &trimstats, &oldest_undopage, &unused_pages);
+
+	if (trimstats.can_advance_oldestundorecptr)
+		zsundo_update_oldest_ptr(rel, result, oldest_undopage, unused_pages);
+
+	return result;
+}
diff --git a/src/backend/access/zedstore/zedstore_utils.c b/src/backend/access/zedstore/zedstore_utils.c
new file mode 100644
index 0000000000..7673537292
--- /dev/null
+++ b/src/backend/access/zedstore/zedstore_utils.c
@@ -0,0 +1,76 @@
+/*-------------------------------------------------------------------------
+ *
+ * zedstore_utils.c
+ *	  ZedStore utility functions
+ *
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/zedstore/zedstore_utils.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include 
+
+#include "access/zedstore_internal.h"
+#include "miscadmin.h"
+
+/*
+ * Allocate a new zs_split_stack struct.
+ */
+zs_split_stack *
+zs_new_split_stack_entry(Buffer buf, Page page)
+{
+	zs_split_stack *stack;
+
+	stack = palloc(sizeof(zs_split_stack));
+	stack->next = NULL;
+	stack->buf = buf;
+	stack->page = page;
+	stack->recycle = false;		/* caller can change this */
+
+	return stack;
+}
+
+/*
+ * Apply all the changes represented by a list of zs_split_stack
+ * entries.
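+ *
+ * A minimal usage sketch (illustrative only; the variable names are made up):
+ *
+ *		zs_split_stack *stack = zs_new_split_stack_entry(buf, newpage);
+ *		stack->recycle = true;		(if the old page should be recycled)
+ *		zs_apply_split_changes(rel, stack);
+ *
+ * The buffers in the stack must be pinned and exclusively locked by the
+ * caller; they are marked dirty, unlocked and released here.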
+ */ +void +zs_apply_split_changes(Relation rel, zs_split_stack *stack) +{ + zs_split_stack *head = stack; + + START_CRIT_SECTION(); + + while (stack) + { + PageRestoreTempPage(stack->page, BufferGetPage(stack->buf)); + MarkBufferDirty(stack->buf); + stack = stack->next; + } + + /* TODO: WAL-log all the changes */ + + END_CRIT_SECTION(); + + stack = head; + while (stack) + { + zs_split_stack *next; + + /* add this page to the Free Page Map for recycling */ + if (stack->recycle) + zspage_delete_page(rel, stack->buf); + + UnlockReleaseBuffer(stack->buf); + + next = stack->next; + pfree(stack); + stack = next; + } +} diff --git a/src/backend/access/zedstore/zedstore_visibility.c b/src/backend/access/zedstore/zedstore_visibility.c new file mode 100644 index 0000000000..0087991f78 --- /dev/null +++ b/src/backend/access/zedstore/zedstore_visibility.c @@ -0,0 +1,728 @@ +/* + * zedstore_visibility.c + * Routines for MVCC in Zedstore + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstore_visibility.c + */ +#include "postgres.h" + +#include "access/tableam.h" +#include "access/xact.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "storage/procarray.h" + +static bool +zs_tuplelock_compatible(LockTupleMode mode, LockTupleMode newmode) +{ + switch (newmode) + { + case LockTupleKeyShare: + return mode == LockTupleKeyShare || + mode == LockTupleShare || + mode == LockTupleNoKeyExclusive; + + case LockTupleShare: + return mode == LockTupleKeyShare || + mode == LockTupleShare; + + case LockTupleNoKeyExclusive: + return mode == LockTupleKeyShare; + case LockTupleExclusive: + return false; + + default: + elog(ERROR, "unknown tuple lock mode %d", newmode); + } +} + +/* + * Like HeapTupleSatisfiesUpdate. + * + * When returns TM_Ok, this also returns a flag in *undo_record_needed, to indicate + * whether the old UNDO record is still of interest to anyone. If the old record + * belonged to an aborted deleting transaction, for example, it can be ignored. + * + * This does more than HeapTupleSatisfiesUpdate. If HeapTupleSatisfiesUpdate sees + * an updated or locked tuple, it returns TM_BeingUpdated, and the caller has to + * check if the tuple lock is compatible with the update. zs_SatisfiesUpdate + * checks if the new lock mode is compatible with the old one, and returns TM_Ok + * if so. Waiting for conflicting locks is left to the caller. + * + * This is also used for tuple locking (e.g. SELECT FOR UPDATE). 'mode' indicates + * the lock mode. For a genuine UPDATE, pass LockTupleExclusive or + * LockTupleNoKeyExclusive depending on whether key columns are being modified. + * + * If the tuple was UPDATEd, *next_tid is set to the TID of the new row version. + */ +TM_Result +zs_SatisfiesUpdate(Relation rel, Snapshot snapshot, + ZSUndoRecPtr recent_oldest_undo, ZSBtreeItem *item, + LockTupleMode mode, + bool *undo_record_needed, TM_FailureData *tmfd, zstid *next_tid) +{ + ZSUndoRecPtr undo_ptr; + ZSUndoRec *undorec; + int chain_depth = 0; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + + *undo_record_needed = true; + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + chain_depth++; + + /* Is it visible? */ + if (undo_ptr.counter < recent_oldest_undo.counter) + { + /* + * The old UNDO record is no longer visible to anyone, so we don't + * need to keep it. 
If this record was not the one directly referenced + * from the item, then we must keep it, though. For example, if there + * is a chain (item -> LOCK_TUPLE -> INSERT), and the INSERT record is + * no longer needed by anyone, we must still keep the pointer to the LOCK + * record. + */ + if (chain_depth == 1) + *undo_record_needed = false; + return TM_Ok; + } + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + if (undorec->cid >= snapshot->curcid) + return TM_Invisible; /* inserted after scan started */ + } + else if (TransactionIdIsInProgress(undorec->xid)) + return TM_Invisible; /* inserter has not committed yet */ + else if (!TransactionIdDidCommit(undorec->xid)) + { + /* it must have aborted or crashed */ + return TM_Invisible; + } + + /* The tuple is visible to use. But can we lock it? */ + + /* + * No conflict with this lock. Look at the previous UNDO record, there + * might be more locks. + * + * FIXME: Shouldn't we drill down to the INSERT record and check if + * that's visible to us first, before looking at the lockers? + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + ZSUndoRec_TupleLock *lock_undorec = (ZSUndoRec_TupleLock *) undorec; + + /* + * If any subtransaction of the current top transaction already holds + * a lock as strong as or stronger than what we're requesting, we + * effectively hold the desired lock already. We *must* succeed + * without trying to take the tuple lock, else we will deadlock + * against anyone wanting to acquire a stronger lock. + */ + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + if (lock_undorec->lockmode >= mode) + { + *undo_record_needed = true; + return TM_Ok; + } + } + else if (!zs_tuplelock_compatible(lock_undorec->lockmode, mode) && + TransactionIdIsInProgress(undorec->xid)) + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + return TM_BeingModified; + } + + /* + * No conflict with this lock. Look at the previous UNDO record, there + * might be more locks. + * + * FIXME: Shouldn't we drill down to the INSERT record and check if + * that's visible to us first, before looking at the lockers? + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE) + { + ZSUndoRec_Delete *deleterec = (ZSUndoRec_Delete *) undorec; + + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + if (undorec->cid >= snapshot->curcid) + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = undorec->cid; + return TM_SelfModified; /* deleted/updated after scan started */ + } + else + return TM_Invisible; /* deleted before scan started */ + } + + if (TransactionIdIsInProgress(undorec->xid)) + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + + return TM_BeingModified; + } + + if (!TransactionIdDidCommit(undorec->xid)) + { + /* deleter must have aborted or crashed. 
We have to keep following the + * undo chain, in case there are LOCK records that are still visible + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + if (deleterec->changedPart) + { + ItemPointerSet(&tmfd->ctid, MovedPartitionsBlockNumber, MovedPartitionsOffsetNumber); + *next_tid = InvalidZSTid; + return TM_Updated; + } + else + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + return TM_Deleted; + } + } + else if (undorec->type == ZSUNDO_TYPE_UPDATE) + { + /* updated-away tuple */ + ZSUndoRec_Update *updaterec = (ZSUndoRec_Update *) undorec; + LockTupleMode old_lockmode; + + *next_tid = updaterec->newtid; + old_lockmode = updaterec->key_update ? LockTupleExclusive : LockTupleNoKeyExclusive; + + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + if (zs_tuplelock_compatible(old_lockmode, mode)) + return TM_Ok; + + if (undorec->cid >= snapshot->curcid) + { + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = undorec->cid; + return TM_SelfModified; /* deleted/updated after scan started */ + } + else + return TM_Invisible; /* deleted before scan started */ + } + + if (TransactionIdIsInProgress(undorec->xid)) + { + if (zs_tuplelock_compatible(old_lockmode, mode)) + return TM_Ok; + + tmfd->ctid = ItemPointerFromZSTid(item->t_tid); + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + + return TM_BeingModified; + } + + if (!TransactionIdDidCommit(undorec->xid)) + { + /* deleter must have aborted or crashed. We have to keep following the + * undo chain, in case there are LOCK records that are still visible + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + + if (zs_tuplelock_compatible(old_lockmode, mode)) + return TM_Ok; + + tmfd->ctid = ItemPointerFromZSTid(((ZSUndoRec_Update *) undorec)->newtid); + tmfd->xmax = undorec->xid; + tmfd->cmax = InvalidCommandId; + return TM_Updated; + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + + +/* + * Like HeapTupleSatisfiesAny + */ +static bool +zs_SatisfiesAny(ZSBtreeScan *scan, ZSBtreeItem *item) +{ + return true; +} + +/* + * helper function to zs_SatisfiesMVCC(), to check if the given XID + * is visible to the snapshot. + */ +static bool +xid_is_visible(Snapshot snapshot, TransactionId xid, CommandId cid, bool *aborted) +{ + *aborted = false; + if (TransactionIdIsCurrentTransactionId(xid)) + { + if (cid >= snapshot->curcid) + return false; + else + return true; + } + else if (XidInMVCCSnapshot(xid, snapshot)) + return false; + else if (TransactionIdDidCommit(xid)) + { + return true; + } + else + { + /* it must have aborted or crashed */ + *aborted = true; + return false; + } +} + +/* + * Like HeapTupleSatisfiesMVCC + */ +static bool +zs_SatisfiesMVCC(ZSBtreeScan *scan, ZSBtreeItem *item, + TransactionId *obsoleting_xid, zstid *next_tid) +{ + Relation rel = scan->rel; + Snapshot snapshot = scan->snapshot; + ZSUndoRecPtr recent_oldest_undo = scan->recent_oldest_undo; + ZSUndoRecPtr undo_ptr; + ZSUndoRec *undorec; + bool aborted; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + Assert (snapshot->snapshot_type == SNAPSHOT_MVCC); + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + /* If this record is "old", then the record is visible. 
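+	 * ("Old" here means older than the oldest UNDO pointer, i.e. the effects
+	 * of the record are known to be visible to everyone; see
+	 * zsundo_get_oldest_undo_ptr().)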
*/ + if (undo_ptr.counter < recent_oldest_undo.counter) + return true; + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + /* Inserted tuple */ + bool result; + + result = xid_is_visible(snapshot, undorec->xid, undorec->cid, &aborted); + if (!result && !aborted) + *obsoleting_xid = undorec->xid; + return result; + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* we don't care about tuple locks here. Follow the link to the + * previous UNDO record for this tuple. */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + if (undorec->type == ZSUNDO_TYPE_UPDATE) + { + ZSUndoRec_Update *updaterec = (ZSUndoRec_Update *) undorec; + if (next_tid) + *next_tid = updaterec->newtid; + } + + /* + * Deleted or updated-away. They are treated the same in an MVCC snapshot. + * They only need different treatment when updating or locking the row, + * in SatisfiesUpdate(). + */ + if (xid_is_visible(snapshot, undorec->xid, undorec->cid, &aborted)) + { + /* we can see the deletion */ + return false; + } + else + { + if (!aborted) + *obsoleting_xid = undorec->xid; + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + +/* + * Like HeapTupleSatisfiesSelf + */ +static bool +zs_SatisfiesSelf(ZSBtreeScan *scan, ZSBtreeItem *item, zstid *next_tid) +{ + Relation rel = scan->rel; + ZSUndoRecPtr recent_oldest_undo = scan->recent_oldest_undo; + ZSUndoRec *undorec; + ZSUndoRecPtr undo_ptr; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + Assert (scan->snapshot->snapshot_type == SNAPSHOT_SELF); + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + if (undo_ptr.counter < recent_oldest_undo.counter) + return true; + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + /* Inserted tuple */ + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + return true; /* inserted by me */ + else if (TransactionIdIsInProgress(undorec->xid)) + return false; + else if (TransactionIdDidCommit(undorec->xid)) + return true; + else + { + /* it must have aborted or crashed */ + return false; + } + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* we don't care about tuple locks here. Follow the link to the + * previous UNDO record for this tuple. */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + if (undorec->type == ZSUNDO_TYPE_UPDATE) + { + ZSUndoRec_Update *updaterec = (ZSUndoRec_Update *) undorec; + if (next_tid) + *next_tid = updaterec->newtid; + } + + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + /* deleted by me */ + return false; + } + + if (TransactionIdIsInProgress(undorec->xid)) + return true; + + if (!TransactionIdDidCommit(undorec->xid)) + { + /* + * Deleter must have aborted or crashed. But we have to keep following the + * undo chain, to check if the insertion was visible in the first + * place. 
+ */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + + return false; + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + +/* + * Like HeapTupleSatisfiesDirty + */ +static bool +zs_SatisfiesDirty(ZSBtreeScan *scan, ZSBtreeItem *item, zstid *next_tid) +{ + Relation rel = scan->rel; + Snapshot snapshot = scan->snapshot; + ZSUndoRecPtr recent_oldest_undo = scan->recent_oldest_undo; + ZSUndoRecPtr undo_ptr; + ZSUndoRec *undorec; + + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + Assert (snapshot->snapshot_type == SNAPSHOT_DIRTY); + + snapshot->xmin = snapshot->xmax = InvalidTransactionId; + snapshot->speculativeToken = INVALID_SPECULATIVE_TOKEN; + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + if (undo_ptr.counter < recent_oldest_undo.counter) + return true; + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + snapshot->speculativeToken = undorec->speculative_token; + /* Inserted tuple */ + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + return true; /* inserted by me */ + else if (TransactionIdIsInProgress(undorec->xid)) + { + snapshot->xmin = undorec->xid; + return true; + } + else if (TransactionIdDidCommit(undorec->xid)) + { + return true; + } + else + { + /* it must have aborted or crashed */ + return false; + } + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* locked tuple. */ + /* look at the previous UNDO record to find the insert record */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + if (undorec->type == ZSUNDO_TYPE_UPDATE) + { + ZSUndoRec_Update *updaterec = (ZSUndoRec_Update *) undorec; + if (next_tid) + *next_tid = updaterec->newtid; + } + + /* deleted or updated-away tuple */ + if (TransactionIdIsCurrentTransactionId(undorec->xid)) + { + /* deleted by me */ + return false; + } + + if (TransactionIdIsInProgress(undorec->xid)) + { + snapshot->xmax = undorec->xid; + return true; + } + + if (!TransactionIdDidCommit(undorec->xid)) + { + /* + * Deleter must have aborted or crashed. But we have to keep following the + * undo chain, to check if the insertion was visible in the first + * place. + */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + + return false; + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + +/* + * True if tuple might be visible to some transaction; false if it's + * surely dead to everyone, ie, vacuumable. + */ +static bool +zs_SatisfiesNonVacuumable(ZSBtreeScan *scan, ZSBtreeItem *item) +{ + Relation rel = scan->rel; + TransactionId OldestXmin = scan->snapshot->xmin; + ZSUndoRecPtr recent_oldest_undo = scan->recent_oldest_undo; + ZSUndoRecPtr undo_ptr; + ZSUndoRec *undorec; + + Assert (scan->snapshot->snapshot_type == SNAPSHOT_NON_VACUUMABLE); + Assert(TransactionIdIsValid(OldestXmin)); + + undo_ptr = zsbt_item_undoptr(item); + +fetch_undo_record: + + /* Is it visible? 
*/ + if (undo_ptr.counter < recent_oldest_undo.counter) + return true; + + /* have to fetch the UNDO record */ + undorec = zsundo_fetch(rel, undo_ptr); + + if (undorec->type == ZSUNDO_TYPE_INSERT) + { + /* Inserted tuple */ + if (TransactionIdIsInProgress(undorec->xid)) + return true; /* inserter has not committed yet */ + + if (TransactionIdDidCommit(undorec->xid)) + return true; + + /* it must have aborted or crashed */ + return false; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + /* deleted or updated-away tuple */ + ZSUndoRecPtr prevptr; + + if (TransactionIdIsInProgress(undorec->xid)) + return true; /* delete-in-progress */ + else if (TransactionIdDidCommit(undorec->xid)) + { + /* + * Deleter committed. But perhaps it was recent enough that some open + * transactions could still see the tuple. + */ + if (!TransactionIdPrecedes(undorec->xid, OldestXmin)) + return true; + + return false; + } + + /* + * The deleting transaction did not commit. But before concluding + * that the tuple is live, we have to check if the inserting + * XID is live. + */ + do { + prevptr = undorec->prevundorec; + + if (prevptr.counter < recent_oldest_undo.counter) + return true; + undorec = zsundo_fetch(rel, prevptr); + } while(undorec->type == ZSUNDO_TYPE_TUPLE_LOCK); + + Assert(undorec->type == ZSUNDO_TYPE_INSERT); + + if (TransactionIdIsInProgress(undorec->xid)) + return true; /* insert-in-progress */ + else if (TransactionIdDidCommit(undorec->xid)) + return true; /* inserted committed */ + + /* inserter must have aborted or crashed */ + return false; + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* look at the previous UNDO record, to find the Insert record */ + undo_ptr = undorec->prevundorec; + goto fetch_undo_record; + } + else + elog(ERROR, "unexpected UNDO record type: %d", undorec->type); +} + +/* + * Like HeapTupleSatisfiesVisibility + * + * If next_tid is not NULL then gets populated for the tuple if tuple was + * UPDATEd. *next_tid_p is set to the TID of the new row version. + */ +bool +zs_SatisfiesVisibility(ZSBtreeScan *scan, ZSBtreeItem *item, + TransactionId *obsoleting_xid, zstid *next_tid) +{ + ZSUndoRecPtr undo_ptr; + + /* initialize as invalid, if we find valid one populate the same */ + if (next_tid) + *next_tid = InvalidZSTid; + + /* + * This works on a single or array item. Compressed items don't have + * visibility information (the items inside the compressed container + * do) + */ + Assert((item->t_flags & ZSBT_COMPRESSED) == 0); + + /* The caller should've filled in the recent_oldest_undo pointer */ + Assert(scan->recent_oldest_undo.counter != 0); + + *obsoleting_xid = InvalidTransactionId; + + /* dead items are never considered visible. */ + if ((item->t_flags & ZSBT_DEAD) != 0) + return false; + + /* + * Items with invalid undo record are considered visible. Mostly META + * column stores the valid undo record, all other columns stores invalid + * undo pointer. Visibility check is performed based on META column and + * only if visible rest of columns are fetched. For in-place updates, + * columns other than META column may have valid undo record, in which + * case the visibility check needs to be performed for the same. META + * column can sometime also have items with invalid undo, see + * zsbt_undo_item_deletion(). 
+ */ + undo_ptr = zsbt_item_undoptr(item); + if (!IsZSUndoRecPtrValid(&undo_ptr)) + return true; + + switch (scan->snapshot->snapshot_type) + { + case SNAPSHOT_MVCC: + return zs_SatisfiesMVCC(scan, item, obsoleting_xid, next_tid); + + case SNAPSHOT_SELF: + return zs_SatisfiesSelf(scan, item, next_tid); + + case SNAPSHOT_ANY: + return zs_SatisfiesAny(scan, item); + + case SNAPSHOT_TOAST: + elog(ERROR, "SnapshotToast not implemented in zedstore"); + break; + + case SNAPSHOT_DIRTY: + return zs_SatisfiesDirty(scan, item, next_tid); + + case SNAPSHOT_HISTORIC_MVCC: + elog(ERROR, "SnapshotHistoricMVCC not implemented in zedstore yet"); + break; + + case SNAPSHOT_NON_VACUUMABLE: + return zs_SatisfiesNonVacuumable(scan, item); + } + + return false; /* keep compiler quiet */ +} diff --git a/src/backend/access/zedstore/zedstoream_handler.c b/src/backend/access/zedstore/zedstoream_handler.c new file mode 100644 index 0000000000..5a79b7a1fc --- /dev/null +++ b/src/backend/access/zedstore/zedstoream_handler.c @@ -0,0 +1,3163 @@ +/*------------------------------------------------------------------------- + * + * zedstoream_handler.c + * ZedStore table access method code + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/zedstore/zedstoream_handler.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "miscadmin.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/multixact.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/tsmapi.h" +#include "access/tupdesc_details.h" +#include "access/tuptoaster.h" +#include "access/xact.h" +#include "access/zedstore_internal.h" +#include "access/zedstore_undo.h" +#include "catalog/catalog.h" +#include "catalog/index.h" +#include "catalog/storage.h" +#include "catalog/storage_xlog.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "executor/executor.h" +#include "optimizer/plancat.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/rel.h" + + +typedef enum +{ + ZSSCAN_STATE_UNSTARTED, + ZSSCAN_STATE_SCANNING, + ZSSCAN_STATE_FINISHED_RANGE, + ZSSCAN_STATE_FINISHED +} zs_scan_state; + +typedef struct ZedStoreProjectData +{ + int num_proj_atts; + bool *project_columns; + int *proj_atts; + ZSBtreeScan *btree_scans; + MemoryContext context; +} ZedStoreProjectData; + +typedef struct ZedStoreDescData +{ + /* scan parameters */ + TableScanDescData rs_scan; /* */ + ZedStoreProjectData proj_data; + + zs_scan_state state; + zstid cur_range_start; + zstid cur_range_end; + bool finished; + + /* These fields are used for bitmap scans, to hold a "block's" worth of data */ +#define MAX_ITEMS_PER_LOGICAL_BLOCK MaxHeapTuplesPerPage + int bmscan_ntuples; + zstid *bmscan_tids; + Datum **bmscan_datums; + bool **bmscan_isnulls; + int bmscan_nexttuple; + + /* These fields are use for TABLESAMPLE scans */ + zstid max_tid_to_scan; + zstid next_tid_to_scan; + +} ZedStoreDescData; + +typedef struct ZedStoreDescData *ZedStoreDesc; + +typedef struct ZedStoreIndexFetchData +{ + IndexFetchTableData idx_fetch_data; + ZedStoreProjectData proj_data; +} ZedStoreIndexFetchData; + +typedef struct ZedStoreIndexFetchData *ZedStoreIndexFetch; + +typedef struct ParallelZSScanDescData 
*ParallelZSScanDesc; + +static IndexFetchTableData *zedstoream_begin_index_fetch(Relation rel); +static void zedstoream_end_index_fetch(IndexFetchTableData *scan); +static bool zedstoream_fetch_row(ZedStoreIndexFetchData *fetch, + ItemPointer tid_p, + Snapshot snapshot, + TupleTableSlot *slot); + +static Size zs_parallelscan_estimate(Relation rel); +static Size zs_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan); +static void zs_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan); +static bool zs_parallelscan_nextrange(Relation rel, ParallelZSScanDesc pzscan, + zstid *start, zstid *end); +static void zsbt_fill_missing_attribute_value(ZSBtreeScan *scan, Datum *datum, bool *isnull); + +/* ---------------------------------------------------------------- + * storage AM support routines for zedstoream + * ---------------------------------------------------------------- + */ + +static bool +zedstoream_fetch_row_version(Relation rel, + ItemPointer tid_p, + Snapshot snapshot, + TupleTableSlot *slot) +{ + IndexFetchTableData *fetcher; + bool result; + + fetcher = zedstoream_begin_index_fetch(rel); + + result = zedstoream_fetch_row((ZedStoreIndexFetchData *) fetcher, + tid_p, snapshot, slot); + if (result) + { + /* FIXME: heapam acquires the predicate lock first, and then + * calls CheckForSerializableConflictOut(). We do it in the + * opposite order, because CheckForSerializableConflictOut() + * call as done in zsbt_get_last_tid() already. Does it matter? + * I'm not sure. + */ + PredicateLockTID(rel, tid_p, snapshot); + } + ExecMaterializeSlot(slot); + slot->tts_tableOid = RelationGetRelid(rel); + slot->tts_tid = *tid_p; + + zedstoream_end_index_fetch(fetcher); + + return result; +} + +static void +zedstoream_get_latest_tid(TableScanDesc sscan, + ItemPointer tid) +{ + zstid ztid = ZSTidFromItemPointer(*tid); + zsbt_find_latest_tid(sscan->rs_rd, &ztid, sscan->rs_snapshot); + *tid = ItemPointerFromZSTid(ztid); +} + +static inline void +zedstoream_insert_internal(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, struct BulkInsertStateData *bistate, uint32 speculative_token) +{ + AttrNumber attno; + Datum *d; + bool *isnulls; + zstid tid; + TransactionId xid = GetCurrentTransactionId(); + bool isnull; + Datum datum; + ZSUndoRecPtr prevundoptr; + + ZSUndoRecPtrInitialize(&prevundoptr); + + if (slot->tts_tupleDescriptor->natts != relation->rd_att->natts) + elog(ERROR, "slot's attribute count doesn't match relcache entry"); + + slot_getallattrs(slot); + d = slot->tts_values; + isnulls = slot->tts_isnull; + + tid = InvalidZSTid; + + isnull = true; + ZSUndoRecPtrInitialize(&prevundoptr); + zsbt_tid_multi_insert(relation, + &tid, 1, + xid, cid, speculative_token, prevundoptr); + + /* + * We only need to check for table-level SSI locks. Our + * new tuple can't possibly conflict with existing tuple locks, and + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. 
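+	 *
+	 * (That is why CheckForSerializableConflictIn() is called below with a
+	 * NULL TID and InvalidBlockNumber: only relation-level predicate locks
+	 * need to be considered.)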
+ */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + for (attno = 1; attno <= relation->rd_att->natts; attno++) + { + Form_pg_attribute attr = TupleDescAttr(slot->tts_tupleDescriptor, attno - 1); + Datum toastptr = (Datum) 0; + datum = d[attno - 1]; + isnull = isnulls[attno - 1]; + + if (!isnull && attr->attlen < 0 && VARATT_IS_EXTERNAL(datum)) + datum = PointerGetDatum(heap_tuple_fetch_attr((struct varlena *) DatumGetPointer(datum))); + + /* If this datum is too large, toast it */ + if (!isnull && attr->attlen < 0 && + VARSIZE_ANY_EXHDR(datum) > MaxZedStoreDatumSize) + { + toastptr = datum = zedstore_toast_datum(relation, attno, datum); + } + + zsbt_attr_multi_insert(relation, attno, + &datum, &isnull, &tid, 1); + + if (toastptr != (Datum) 0) + zedstore_toast_finish(relation, attno, toastptr, tid); + } + + slot->tts_tableOid = RelationGetRelid(relation); + slot->tts_tid = ItemPointerFromZSTid(tid); + + /* Note: speculative insertions are counted too, even if aborted later */ + pgstat_count_heap_insert(relation, 1); +} + +static void +zedstoream_insert(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, struct BulkInsertStateData *bistate) +{ + zedstoream_insert_internal(relation, slot, cid, options, bistate, INVALID_SPECULATIVE_TOKEN); +} + +static void +zedstoream_insert_speculative(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, BulkInsertState bistate, uint32 specToken) +{ + zedstoream_insert_internal(relation, slot, cid, options, bistate, specToken); +} + +static void +zedstoream_complete_speculative(Relation relation, TupleTableSlot *slot, uint32 spekToken, + bool succeeded) +{ + zstid tid; + + tid = ZSTidFromItemPointer(slot->tts_tid); + zsbt_tid_clear_speculative_token(relation, tid, spekToken, true /* for complete */); + /* + * there is a conflict + */ + if (!succeeded) + elog(ERROR, "zedstoream_complete_speculative abort is not handled"); +} + +static void +zedstoream_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, + CommandId cid, int options, BulkInsertState bistate) +{ + AttrNumber attno; + int i; + bool slotgetandset = true; + TransactionId xid = GetCurrentTransactionId(); + int *tupletoasted; + Datum *datums; + bool *isnulls; + zstid *tids; + ZSUndoRecPtr prevundoptr; + + tupletoasted = palloc(ntuples * sizeof(int)); + datums = palloc0(ntuples * sizeof(Datum)); + isnulls = palloc(ntuples * sizeof(bool)); + tids = palloc0(ntuples * sizeof(zstid)); + + for (i = 0; i < ntuples; i++) + isnulls[i] = true; + + ZSUndoRecPtrInitialize(&prevundoptr); + zsbt_tid_multi_insert(relation, tids, ntuples, + xid, cid, INVALID_SPECULATIVE_TOKEN, prevundoptr); + + /* + * We only need to check for table-level SSI locks. Our + * new tuple can't possibly conflict with existing tuple locks, and + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. 
+ */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + for (attno = 1; attno <= relation->rd_att->natts; attno++) + { + Form_pg_attribute attr = TupleDescAttr((slots[0])->tts_tupleDescriptor, attno - 1); + int ntupletoasted = 0; + + for (i = 0; i < ntuples; i++) + { + Datum datum = slots[i]->tts_values[attno - 1]; + bool isnull = slots[i]->tts_isnull[attno - 1]; + + if (slotgetandset) + { + slot_getallattrs(slots[i]); + } + + /* If this datum is too large, toast it */ + if (!isnull && attr->attlen < 0 && + VARSIZE_ANY_EXHDR(datum) > MaxZedStoreDatumSize) + { + datum = zedstore_toast_datum(relation, attno, datum); + tupletoasted[ntupletoasted++] = i; + } + datums[i] = datum; + isnulls[i] = isnull; + } + + zsbt_attr_multi_insert(relation, attno, + datums, isnulls, tids, ntuples); + + for (i = 0; i < ntupletoasted; i++) + { + int idx = tupletoasted[i]; + + zedstore_toast_finish(relation, attno, datums[idx], tids[idx]); + } + + slotgetandset = false; + } + + for (i = 0; i < ntuples; i++) + { + slots[i]->tts_tableOid = RelationGetRelid(relation); + slots[i]->tts_tid = ItemPointerFromZSTid(tids[i]); + } + + pgstat_count_heap_insert(relation, ntuples); + + pfree(tids); + pfree(tupletoasted); + pfree(datums); + pfree(isnulls); +} + +static TM_Result +zedstoream_delete(Relation relation, ItemPointer tid_p, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, bool wait, + TM_FailureData *hufd, bool changingPart) +{ + zstid tid = ZSTidFromItemPointer(*tid_p); + TransactionId xid = GetCurrentTransactionId(); + TM_Result result = TM_Ok; + +retry: + result = zsbt_tid_delete(relation, tid, xid, cid, + snapshot, crosscheck, wait, hufd, changingPart); + + if (result != TM_Ok) + { + if (result == TM_Invisible) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("attempted to delete invisible tuple"))); + else if (result == TM_BeingModified && wait) + { + TransactionId xwait = hufd->xmax; + + /* TODO: use something like heap_acquire_tuplock() for priority */ + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + XactLockTableWait(xwait, relation, tid_p, XLTW_Delete); + goto retry; + } + } + } + + /* + * Check for SSI conflicts. + */ + CheckForSerializableConflictIn(relation, tid_p, ItemPointerGetBlockNumber(tid_p)); + + if (result == TM_Ok) + pgstat_count_heap_delete(relation); + + return result; +} + + +/* + * Each tuple lock mode has a corresponding heavyweight lock, and one or two + * corresponding MultiXactStatuses (one to merely lock tuples, another one to + * update them). This table (and the macros below) helps us determine the + * heavyweight lock mode and MultiXactStatus values to use for any particular + * tuple lock strength. + * + * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock + * instead. + */ +static const struct +{ + LOCKMODE hwlock; + int lockstatus; + int updstatus; +} + + tupleLockExtraInfo[MaxLockTupleMode + 1] = +{ + { /* LockTupleKeyShare */ + AccessShareLock, + MultiXactStatusForKeyShare, + -1 /* KeyShare does not allow updating tuples */ + }, + { /* LockTupleShare */ + RowShareLock, + MultiXactStatusForShare, + -1 /* Share does not allow updating tuples */ + }, + { /* LockTupleNoKeyExclusive */ + ExclusiveLock, + MultiXactStatusForNoKeyUpdate, + MultiXactStatusNoKeyUpdate + }, + { /* LockTupleExclusive */ + AccessExclusiveLock, + MultiXactStatusForUpdate, + MultiXactStatusUpdate + } +}; + + +/* + * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. 
+ * This is more readable than having every caller translate it to lock.h's + * LOCKMODE. + */ +#define LockTupleTuplock(rel, tup, mode) \ + LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define UnlockTupleTuplock(rel, tup, mode) \ + UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define ConditionalLockTupleTuplock(rel, tup, mode) \ + ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) + +/* + * Acquire heavyweight lock on the given tuple, in preparation for acquiring + * its normal, Xmax-based tuple lock. + * + * have_tuple_lock is an input and output parameter: on input, it indicates + * whether the lock has previously been acquired (and this function does + * nothing in that case). If this function returns success, have_tuple_lock + * has been flipped to true. + * + * Returns false if it was unable to obtain the lock; this can only happen if + * wait_policy is Skip. + * + * XXX: This is identical to heap_acquire_tuplock + */ + +static bool +zs_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, + LockWaitPolicy wait_policy, bool *have_tuple_lock) +{ + if (*have_tuple_lock) + return true; + + switch (wait_policy) + { + case LockWaitBlock: + LockTupleTuplock(relation, tid, mode); + break; + + case LockWaitSkip: + if (!ConditionalLockTupleTuplock(relation, tid, mode)) + return false; + break; + + case LockWaitError: + if (!ConditionalLockTupleTuplock(relation, tid, mode)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + *have_tuple_lock = true; + + return true; +} + + +static TM_Result +zedstoream_lock_tuple(Relation relation, ItemPointer tid_p, Snapshot snapshot, + TupleTableSlot *slot, CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, uint8 flags, + TM_FailureData *tmfd) +{ + zstid tid = ZSTidFromItemPointer(*tid_p); + TransactionId xid = GetCurrentTransactionId(); + TM_Result result; + bool have_tuple_lock = false; + zstid next_tid = tid; + SnapshotData SnapshotDirty; + bool locked_something = false; + + slot->tts_tableOid = RelationGetRelid(relation); + slot->tts_tid = *tid_p; + + tmfd->traversed = false; + /* + * For now, we lock just the first attribute. As long as everyone + * does that, that's enough. + */ +retry: + result = zsbt_tid_lock(relation, tid, xid, cid, + mode, snapshot, tmfd, &next_tid); + + if (result == TM_Invisible) + { + /* + * This is possible, but only when locking a tuple for ON CONFLICT + * UPDATE. We return this value here rather than throwing an error in + * order to give that case the opportunity to throw a more specific + * error. + */ + /* + * This can also happen, if we're locking an UPDATE chain for KEY SHARE mode: + * A tuple has been inserted, and then updated, by a different transaction. + * The updating transaction is still in progress. We can lock the row + * in KEY SHARE mode, assuming the key columns were not updated, and we will + * try to lock all the row version, even the still in-progress UPDATEs. + * It's possible that the UPDATE aborts while we're chasing the update chain, + * so that the updated tuple becomes invisible to us. That's OK. + */ + if (mode == LockTupleKeyShare && locked_something) + return TM_Ok; + else + return TM_Invisible; + } + else if (result == TM_Updated || + (result == TM_SelfModified && tmfd->cmax == cid)) + { + /* + * The other transaction is an update and it already committed. 
+ * + * If the caller asked for the latest version, find it. + */ + if ((flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION) != 0 && next_tid != tid) + { + if (have_tuple_lock) + { + UnlockTupleTuplock(relation, tid_p, mode); + have_tuple_lock = false; + } + + if (ItemPointerIndicatesMovedPartitions(&tmfd->ctid)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update"))); + + /* it was updated, so look at the updated version */ + *tid_p = ItemPointerFromZSTid(next_tid); + + /* signal that a tuple later in the chain is getting locked */ + tmfd->traversed = true; + + /* loop back to fetch next in chain */ + + /* FIXME: In the corresponding code in heapam, we cross-check the xmin/xmax + * of the old and new tuple. Should we do the same here? + */ + + InitDirtySnapshot(SnapshotDirty); + snapshot = &SnapshotDirty; + tid = next_tid; + goto retry; + } + + return result; + } + else if (result == TM_Deleted) + { + /* + * The other transaction is a delete and it already committed. + */ + return result; + } + else if (result == TM_BeingModified) + { + TransactionId xwait = tmfd->xmax; + + /* + * Acquire tuple lock to establish our priority for the tuple, or + * die trying. LockTuple will release us when we are next-in-line + * for the tuple. We must do this even if we are share-locking. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while + * rechecking tuple state. + */ + if (!zs_acquire_tuplock(relation, tid_p, mode, wait_policy, + &have_tuple_lock)) + { + /* + * This can only happen if wait_policy is Skip and the lock + * couldn't be obtained. + */ + return TM_WouldBlock; + } + + /* wait for regular transaction to end, or die trying */ + switch (wait_policy) + { + case LockWaitBlock: + XactLockTableWait(xwait, relation, tid_p, XLTW_Lock); + break; + case LockWaitSkip: + if (!ConditionalXactLockTableWait(xwait)) + { + return TM_WouldBlock; + } + break; + case LockWaitError: + if (!ConditionalXactLockTableWait(xwait)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + + /* + * xwait is done. Retry. + */ + goto retry; + } + if (result == TM_Ok) + locked_something = true; + + /* + * Now that we have successfully marked the tuple as locked, we can + * release the lmgr tuple lock, if we had it. + */ + if (have_tuple_lock) + { + UnlockTupleTuplock(relation, tid_p, mode); + have_tuple_lock = false; + } + + if (mode == LockTupleKeyShare) + { + /* lock all row versions, if it's a KEY SHARE lock */ + if (result == TM_Ok && tid != next_tid && next_tid != InvalidZSTid) + { + tid = next_tid; + goto retry; + } + } + + /* Fetch the tuple, too. */ + if (!zedstoream_fetch_row_version(relation, tid_p, SnapshotAny, slot)) + elog(ERROR, "could not fetch locked tuple"); + + return TM_Ok; +} + +/* like heap_tuple_attr_equals */ +static bool +zs_tuple_attr_equals(int attrnum, TupleTableSlot *slot1, TupleTableSlot *slot2) +{ + TupleDesc tupdesc = slot1->tts_tupleDescriptor; + Datum value1, + value2; + bool isnull1, + isnull2; + Form_pg_attribute att; + + /* + * If it's a whole-tuple reference, say "not equal". It's not really + * worth supporting this case, since it could only succeed after a no-op + * update, which is hardly a case worth optimizing for. 
+ */ + if (attrnum == 0) + return false; + + /* + * Likewise, automatically say "not equal" for any system attribute other + * than tableOID; we cannot expect these to be consistent in a HOT chain, + * or even to be set correctly yet in the new tuple. + */ + if (attrnum < 0) + { + if (attrnum != TableOidAttributeNumber) + return false; + } + + /* + * Extract the corresponding values. XXX this is pretty inefficient if + * there are many indexed columns. Should HeapDetermineModifiedColumns do + * a single heap_deform_tuple call on each tuple, instead? But that + * doesn't work for system columns ... + */ + value1 = slot_getattr(slot1, attrnum, &isnull1); + value2 = slot_getattr(slot2, attrnum, &isnull2); + + /* + * If one value is NULL and other is not, then they are certainly not + * equal + */ + if (isnull1 != isnull2) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (isnull1) + return true; + + /* + * We do simple binary comparison of the two datums. This may be overly + * strict because there can be multiple binary representations for the + * same logical value. But we should be OK as long as there are no false + * positives. Using a type-specific equality operator is messy because + * there could be multiple notions of equality in different operator + * classes; furthermore, we cannot safely invoke user-defined functions + * while holding exclusive buffer lock. + */ + if (attrnum <= 0) + { + /* The only allowed system columns are OIDs, so do this */ + return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); + } + else + { + Assert(attrnum <= tupdesc->natts); + att = TupleDescAttr(tupdesc, attrnum - 1); + return datumIsEqual(value1, value2, att->attbyval, att->attlen); + } +} + +static bool +is_key_update(Relation relation, TupleTableSlot *oldslot, TupleTableSlot *newslot) +{ + Bitmapset *key_attrs; + Bitmapset *interesting_attrs; + Bitmapset *modified_attrs; + int attnum; + + /* + * Fetch the list of attributes to be checked for various operations. + * + * For HOT considerations, this is wasted effort if we fail to update or + * have to put the new tuple on a different page. But we must compute the + * list before obtaining buffer lock --- in the worst case, if we are + * doing an update on one of the relevant system catalogs, we could + * deadlock if we try to fetch the list later. In any case, the relcache + * caches the data so this is usually pretty cheap. + * + * We also need columns used by the replica identity and columns that are + * considered the "key" of rows in the table. + * + * Note that we get copies of each bitmap, so we need not worry about + * relcache flush happening midway through. + */ + key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); + + interesting_attrs = NULL; + interesting_attrs = bms_add_members(interesting_attrs, key_attrs); + + /* Determine columns modified by the update. 
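+	 * (Attribute numbers in the bitmapsets are offset by
+	 * FirstLowInvalidHeapAttributeNumber, so that system attributes, which
+	 * have negative attribute numbers, can be represented; the loop below
+	 * converts back to normal attribute numbers before comparing.)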
*/ + modified_attrs = NULL; + while ((attnum = bms_first_member(interesting_attrs)) >= 0) + { + attnum += FirstLowInvalidHeapAttributeNumber; + + if (!zs_tuple_attr_equals(attnum, oldslot, newslot)) + modified_attrs = bms_add_member(modified_attrs, + attnum - FirstLowInvalidHeapAttributeNumber); + } + + return bms_overlap(modified_attrs, key_attrs); +} + +static TM_Result +zedstoream_update(Relation relation, ItemPointer otid_p, TupleTableSlot *slot, + CommandId cid, Snapshot snapshot, Snapshot crosscheck, + bool wait, TM_FailureData *hufd, + LockTupleMode *lockmode, bool *update_indexes) +{ + zstid otid = ZSTidFromItemPointer(*otid_p); + TransactionId xid = GetCurrentTransactionId(); + AttrNumber attno; + bool key_update; + Datum *d; + bool *isnulls; + TM_Result result; + zstid newtid; + TupleTableSlot *oldslot; + IndexFetchTableData *fetcher; + ZSUndoRecPtr prevundoptr; + + ZSUndoRecPtrInitialize(&prevundoptr); + + *update_indexes = true; + + slot_getallattrs(slot); + d = slot->tts_values; + isnulls = slot->tts_isnull; + + oldslot = table_slot_create(relation, NULL); + fetcher = zedstoream_begin_index_fetch(relation); + + /* + * The meta-attribute holds the visibility information, including the "t_ctid" + * pointer to the updated version. All the real attributes are just inserted, + * as if for a new row. + */ +retry: + newtid = InvalidZSTid; + + /* + * Fetch the old row, so that we can figure out which columns were modified. + * + * FIXME: if we have to follow the update chain, we should look at the + * currently latest tuple version, rather than the one visible to our snapshot. + */ + if (!zedstoream_fetch_row((ZedStoreIndexFetchData *) fetcher, + otid_p, SnapshotAny, oldslot)) + { + return TM_Invisible; + } + key_update = is_key_update(relation, oldslot, slot); + + *lockmode = key_update ? LockTupleExclusive : LockTupleNoKeyExclusive; + + result = zsbt_tid_update(relation, otid, + xid, cid, key_update, snapshot, crosscheck, + wait, hufd, &newtid); + + if (result == TM_Ok) + { + /* + * Check for SSI conflicts. 
+ */ + CheckForSerializableConflictIn(relation, otid_p, ItemPointerGetBlockNumber(otid_p)); + + for (attno = 1; attno <= relation->rd_att->natts; attno++) + { + Form_pg_attribute attr = TupleDescAttr(relation->rd_att, attno - 1); + Datum newdatum = d[attno - 1]; + bool newisnull = isnulls[attno - 1]; + Datum toastptr = (Datum) 0; + + if (!newisnull && attr->attlen < 0 && VARATT_IS_EXTERNAL(newdatum)) + newdatum = PointerGetDatum(heap_tuple_fetch_attr((struct varlena *) DatumGetPointer(newdatum))); + + /* If this datum is too large, toast it */ + if (!newisnull && attr->attlen < 0 && + VARSIZE_ANY_EXHDR(newdatum) > MaxZedStoreDatumSize) + { + toastptr = newdatum = zedstore_toast_datum(relation, attno, newdatum); + } + + zsbt_attr_multi_insert(relation, attno, + &newdatum, &newisnull, &newtid, 1); + + if (toastptr != (Datum) 0) + zedstore_toast_finish(relation, attno, toastptr, newtid); + } + + slot->tts_tid = ItemPointerFromZSTid(newtid); + + pgstat_count_heap_update(relation, false); + } + else + { + if (result == TM_Invisible) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("attempted to update invisible tuple"))); + else if (result == TM_BeingModified && wait) + { + TransactionId xwait = hufd->xmax; + + /* TODO: use something like heap_acquire_tuplock() for priority */ + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + XactLockTableWait(xwait, relation, otid_p, XLTW_Delete); + goto retry; + } + } + } + + zedstoream_end_index_fetch(fetcher); + ExecDropSingleTupleTableSlot(oldslot); + + return result; +} + +static const TupleTableSlotOps * +zedstoream_slot_callbacks(Relation relation) +{ + return &TTSOpsZedstore; +} + +static inline void +zs_initialize_proj_attributes(TupleDesc tupledesc, ZedStoreProjectData *proj_data) +{ + MemoryContext oldcontext; + + if (proj_data->num_proj_atts != 0) + return; + + oldcontext = MemoryContextSwitchTo(proj_data->context); + /* add one for meta-attribute */ + proj_data->proj_atts = palloc((tupledesc->natts + 1) * sizeof(int)); + proj_data->btree_scans = palloc0((tupledesc->natts + 1) * sizeof(ZSBtreeScan)); + + proj_data->proj_atts[proj_data->num_proj_atts++] = ZS_META_ATTRIBUTE_NUM; + + /* + * convert booleans array into an array of the attribute numbers of the + * required columns. + */ + for (int idx = 0; idx < tupledesc->natts; idx++) + { + int att_no = idx + 1; + + /* + * never project dropped columns, null will be returned for them + * in slot by default. 
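+		 *
+		 * For example (illustrative): for a three-column table with
+		 * project_columns = {true, false, true} and no dropped columns,
+		 * proj_atts ends up as {ZS_META_ATTRIBUTE_NUM, 1, 3} and
+		 * num_proj_atts is 3.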
+ */ + if (TupleDescAttr(tupledesc, idx)->attisdropped) + continue; + + /* project_columns empty also conveys need all the columns */ + if (proj_data->project_columns == NULL || proj_data->project_columns[idx]) + proj_data->proj_atts[proj_data->num_proj_atts++] = att_no; + } + + MemoryContextSwitchTo(oldcontext); +} + +static inline void +zs_initialize_proj_attributes_extended(ZedStoreDesc scan, TupleDesc tupledesc) +{ + MemoryContext oldcontext; + ZedStoreProjectData *proj_data = &scan->proj_data; + + /* if already initialized return */ + if (proj_data->num_proj_atts != 0) + return; + + zs_initialize_proj_attributes(tupledesc, proj_data); + + oldcontext = MemoryContextSwitchTo(proj_data->context); + /* Extra setup for bitmap and sample scans */ + if ((scan->rs_scan.rs_flags & SO_TYPE_BITMAPSCAN) || + (scan->rs_scan.rs_flags & SO_TYPE_SAMPLESCAN) || + (scan->rs_scan.rs_flags & SO_TYPE_ANALYZE)) + { + scan->bmscan_ntuples = 0; + scan->bmscan_tids = palloc(MAX_ITEMS_PER_LOGICAL_BLOCK * sizeof(zstid)); + + scan->bmscan_datums = palloc(proj_data->num_proj_atts * sizeof(Datum *)); + scan->bmscan_isnulls = palloc(proj_data->num_proj_atts * sizeof(bool *)); + for (int i = 0; i < proj_data->num_proj_atts; i++) + { + scan->bmscan_datums[i] = palloc(MAX_ITEMS_PER_LOGICAL_BLOCK * sizeof(Datum)); + scan->bmscan_isnulls[i] = palloc(MAX_ITEMS_PER_LOGICAL_BLOCK * sizeof(bool)); + } + } + MemoryContextSwitchTo(oldcontext); +} + +static TableScanDesc +zedstoream_beginscan_with_column_projection(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + ParallelTableScanDesc parallel_scan, + uint32 flags, + bool *project_columns) +{ + ZedStoreDesc scan; + + /* Sample scans have no snapshot, but we need one */ + if (!snapshot) + { + Assert(!(flags & SO_TYPE_SAMPLESCAN)); + snapshot = SnapshotAny; + } + + /* + * allocate and initialize scan descriptor + */ + scan = (ZedStoreDesc) palloc0(sizeof(ZedStoreDescData)); + + scan->rs_scan.rs_rd = relation; + scan->rs_scan.rs_snapshot = snapshot; + scan->rs_scan.rs_nkeys = nkeys; + scan->rs_scan.rs_flags = flags; + scan->rs_scan.rs_parallel = parallel_scan; + + /* + * we can use page-at-a-time mode if it's an MVCC-safe snapshot + */ + scan->state = ZSSCAN_STATE_UNSTARTED; + + /* + * we do this here instead of in initscan() because heap_rescan also calls + * initscan() and we don't want to allocate memory again + */ + if (nkeys > 0) + scan->rs_scan.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); + else + scan->rs_scan.rs_key = NULL; + + scan->proj_data.context = CurrentMemoryContext; + scan->proj_data.project_columns = project_columns; + + /* + * For a seqscan in a serializable transaction, acquire a predicate lock + * on the entire relation. This is required not only to lock all the + * matching tuples, but also to conflict with new insertions into the + * table. In an indexscan, we take page locks on the index pages covering + * the range specified in the scan qual, but in a heap scan there is + * nothing more fine-grained to lock. A bitmap scan is a different story, + * there we have already scanned the index and locked the index pages + * covering the predicate. But in that case we still have to lock any + * matching heap tuples. 
+ */ + if (!(flags & SO_TYPE_BITMAPSCAN) && + !(flags & SO_TYPE_ANALYZE)) + PredicateLockRelation(relation, snapshot); + + /* + * Currently, we don't have a stats counter for bitmap heap scans (but the + * underlying bitmap index scans will be counted) or sample scans (we only + * update stats for tuple fetches there) + */ + if (!(flags & SO_TYPE_BITMAPSCAN) && !(flags & SO_TYPE_SAMPLESCAN)) + pgstat_count_heap_scan(relation); + + return (TableScanDesc) scan; +} + +static TableScanDesc +zedstoream_beginscan(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + ParallelTableScanDesc parallel_scan, + uint32 flags) +{ + return zedstoream_beginscan_with_column_projection(relation, snapshot, + nkeys, key, parallel_scan, flags, NULL); +} + +static void +zedstoream_endscan(TableScanDesc sscan) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + ZedStoreProjectData *proj_data = &scan->proj_data; + + if (proj_data->proj_atts) + pfree(proj_data->proj_atts); + + if (proj_data->num_proj_atts > 0) + { + zsbt_tid_end_scan(&proj_data->btree_scans[0]); + for (int i = 1; i < proj_data->num_proj_atts; i++) + zsbt_attr_end_scan(&proj_data->btree_scans[i]); + } + + if (scan->rs_scan.rs_flags & SO_TEMP_SNAPSHOT) + UnregisterSnapshot(scan->rs_scan.rs_snapshot); + + if (proj_data->btree_scans) + pfree(proj_data->btree_scans); + pfree(scan); +} + +static void +zedstoream_rescan(TableScanDesc sscan, struct ScanKeyData *key, + bool set_params, bool allow_strat, + bool allow_sync, bool allow_pagemode) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + + /* these params don't do much in zedstore yet, but whatever */ + if (set_params) + { + if (allow_strat) + scan->rs_scan.rs_flags |= SO_ALLOW_STRAT; + else + scan->rs_scan.rs_flags &= ~SO_ALLOW_STRAT; + + if (allow_sync) + scan->rs_scan.rs_flags |= SO_ALLOW_SYNC; + else + scan->rs_scan.rs_flags &= ~SO_ALLOW_SYNC; + + if (allow_pagemode && scan->rs_scan.rs_snapshot && + IsMVCCSnapshot(scan->rs_scan.rs_snapshot)) + scan->rs_scan.rs_flags |= SO_ALLOW_PAGEMODE; + else + scan->rs_scan.rs_flags &= ~SO_ALLOW_PAGEMODE; + } + + if (scan->proj_data.num_proj_atts > 0) + { + zsbt_tid_end_scan(&scan->proj_data.btree_scans[0]); + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + zsbt_attr_end_scan(&scan->proj_data.btree_scans[i]); + } + + scan->state = ZSSCAN_STATE_UNSTARTED; +} + +static bool +zedstoream_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + ZedStoreProjectData *scan_proj = &scan->proj_data; + int i; + int slot_natts = slot->tts_tupleDescriptor->natts; + Datum *slot_values = slot->tts_values; + bool *slot_isnull = slot->tts_isnull; + + if (direction != ForwardScanDirection) + elog(ERROR, "backward scan not implemented in zedstore"); + + zs_initialize_proj_attributes(slot->tts_tupleDescriptor, scan_proj); + Assert((scan_proj->num_proj_atts - 1) <= slot_natts); + + /* + * Initialize the slot. + * + * We initialize all columns to NULL. The values for columns that are projected + * will be set to the actual values below, but it's important that non-projected + * columns are NULL. 
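+ *
+ * The loop below is a small state machine: UNSTARTED and FINISHED_RANGE
+ * pick the next TID range to scan (the whole table, or the next chunk of
+ * a parallel scan), SCANNING returns tuples from that range, and
+ * FINISHED ends the scan.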
+ */ + ExecClearTuple(slot); + for (i = 0; i < slot_natts; i++) + slot_isnull[i] = true; + + while (scan->state != ZSSCAN_STATE_FINISHED) + { + zstid this_tid; + Datum datum; + bool isnull; + + if (scan->state == ZSSCAN_STATE_UNSTARTED || + scan->state == ZSSCAN_STATE_FINISHED_RANGE) + { + MemoryContext oldcontext; + + if (scan->rs_scan.rs_parallel) + { + /* Allocate next range of TIDs to scan */ + if (!zs_parallelscan_nextrange(scan->rs_scan.rs_rd, + (ParallelZSScanDesc) scan->rs_scan.rs_parallel, + &scan->cur_range_start, &scan->cur_range_end)) + { + scan->state = ZSSCAN_STATE_FINISHED; + break; + } + } + else + { + if (scan->state == ZSSCAN_STATE_FINISHED_RANGE) + { + scan->state = ZSSCAN_STATE_FINISHED; + break; + } + scan->cur_range_start = MinZSTid; + scan->cur_range_end = MaxPlusOneZSTid; + } + + oldcontext = MemoryContextSwitchTo(scan_proj->context); + zsbt_tid_begin_scan(scan->rs_scan.rs_rd, + scan->cur_range_start, + scan->cur_range_end, + scan->rs_scan.rs_snapshot, + &scan_proj->btree_scans[0]); + scan_proj->btree_scans[0].serializable = true; + for (int i = 1; i < scan_proj->num_proj_atts; i++) + { + int attno = scan_proj->proj_atts[i]; + + zsbt_attr_begin_scan(scan->rs_scan.rs_rd, + slot->tts_tupleDescriptor, + attno, + scan->cur_range_start, + scan->cur_range_end, + &scan_proj->btree_scans[i]); + } + MemoryContextSwitchTo(oldcontext); + scan->state = ZSSCAN_STATE_SCANNING; + } + + /* We now have a range to scan. Find the next visible TID. */ + Assert(scan->state == ZSSCAN_STATE_SCANNING); + + this_tid = zsbt_tid_scan_next(&scan_proj->btree_scans[0]); + if (this_tid == InvalidZSTid) + { + scan->state = ZSSCAN_STATE_FINISHED_RANGE; + } + else + { + Assert (this_tid < scan->cur_range_end); + + /* Note: We don't need to predicate-lock tuples in Serializable mode, + * because in a sequential scan, we predicate-locked the whole table. + */ + + /* Fetch the datums of each attribute for this row */ + for (int i = 1; i < scan_proj->num_proj_atts; i++) + { + ZSBtreeScan *btscan = &scan_proj->btree_scans[i]; + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(btscan); + int natt; + + if (!zsbt_scan_next_fetch(btscan, &datum, &isnull, this_tid)) + zsbt_fill_missing_attribute_value(btscan, &datum, &isnull); + + /* + * flatten any ZS-TOASTed values, because the rest of the system + * doesn't know how to deal with them. 
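+ * (A ZS-TOAST pointer is an external varlena tagged VARTAG_ZEDSTORE;
+ * zedstore_toast_flatten() returns a plain, in-memory copy of the value.)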
+ */ + natt = scan_proj->proj_atts[i]; + + if (!isnull && attr->attlen == -1 && + VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE) + { + datum = zedstore_toast_flatten(scan->rs_scan.rs_rd, natt, this_tid, datum); + } + + /* Check that the values coming out of the b-tree are aligned properly */ + if (!isnull && attr->attlen == -1) + { + Assert (VARATT_IS_1B(datum) || INTALIGN(datum) == datum); + } + + if (natt != ZS_META_ATTRIBUTE_NUM) + { + Assert(natt > 0); + slot_values[natt - 1] = datum; + slot_isnull[natt - 1] = isnull; + } + } + } + + if (scan->state == ZSSCAN_STATE_FINISHED_RANGE) + { + zsbt_tid_end_scan(&scan_proj->btree_scans[0]); + for (int i = 1; i < scan_proj->num_proj_atts; i++) + zsbt_attr_end_scan(&scan_proj->btree_scans[i]); + } + else + { + Assert(scan->state == ZSSCAN_STATE_SCANNING); + slot->tts_tid = ItemPointerFromZSTid(this_tid); + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + slot->tts_flags &= ~TTS_FLAG_EMPTY; + + pgstat_count_heap_getnext(scan->rs_scan.rs_rd); + return true; + } + } + + ExecClearTuple(slot); + return false; +} + +static bool +zedstoream_tuple_tid_valid(TableScanDesc sscan, ItemPointer tid) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + zstid ztid = ZSTidFromItemPointer(*tid); + + if (scan->max_tid_to_scan == InvalidZSTid) + { + /* + * get the max tid once and store it + */ + scan->max_tid_to_scan = zsbt_get_last_tid(sscan->rs_rd); + } + + /* + * FIXME: should we get lowest TID as well to further optimize the check. + */ + if (ztid <= scan->max_tid_to_scan) + return true; + else + return false; +} + +static bool +zedstoream_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, + Snapshot snapshot) +{ + /* + * TODO: we didn't keep any visibility information about the tuple in the + * slot, so we have to fetch it again. A custom slot type might be a + * good idea.. + */ + zstid tid = ZSTidFromItemPointer(slot->tts_tid); + ZSBtreeScan meta_scan; + bool found; + + /* Use the meta-data tree for the visibility information. 
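+ * A row is visible iff its TID item in the TID tree is visible to the
+ * snapshot, so no attribute data needs to be fetched here.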
*/
+ zsbt_tid_begin_scan(rel, tid, tid + 1, snapshot, &meta_scan);
+
+ found = zsbt_tid_scan_next(&meta_scan) != InvalidZSTid;
+
+ zsbt_tid_end_scan(&meta_scan);
+
+ return found;
+}
+
+static TransactionId
+zedstoream_compute_xid_horizon_for_tuples(Relation rel,
+ ItemPointerData *items,
+ int nitems)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("function %s not implemented yet", __func__)));
+
+}
+
+static IndexFetchTableData *
+zedstoream_begin_index_fetch(Relation rel)
+{
+ ZedStoreIndexFetch zscan = palloc0(sizeof(ZedStoreIndexFetchData));
+
+ zscan->idx_fetch_data.rel = rel;
+ zscan->proj_data.context = CurrentMemoryContext;
+
+ return (IndexFetchTableData *) zscan;
+}
+
+static void
+zedstoream_fetch_set_column_projection(struct IndexFetchTableData *scan,
+ bool *project_columns)
+{
+ ZedStoreIndexFetch zscan = (ZedStoreIndexFetch) scan;
+ zscan->proj_data.project_columns = project_columns;
+}
+
+static void
+zedstoream_reset_index_fetch(IndexFetchTableData *scan)
+{
+ /* TODO: we could close the scans here, but currently we don't bother */
+}
+
+static void
+zedstoream_end_index_fetch(IndexFetchTableData *scan)
+{
+ ZedStoreIndexFetch zscan = (ZedStoreIndexFetch) scan;
+ ZedStoreProjectData *zscan_proj = &zscan->proj_data;
+
+ if (zscan_proj->num_proj_atts > 0)
+ {
+ zsbt_tid_end_scan(&zscan_proj->btree_scans[0]);
+ for (int i = 1; i < zscan_proj->num_proj_atts; i++)
+ zsbt_attr_end_scan(&zscan_proj->btree_scans[i]);
+ }
+
+ if (zscan_proj->proj_atts)
+ pfree(zscan_proj->proj_atts);
+
+ if (zscan_proj->btree_scans)
+ pfree(zscan_proj->btree_scans);
+ pfree(zscan);
+}
+
+static bool
+zedstoream_index_fetch_tuple(struct IndexFetchTableData *scan,
+ ItemPointer tid_p,
+ Snapshot snapshot,
+ TupleTableSlot *slot,
+ bool *call_again, bool *all_dead)
+{
+ bool result;
+
+ /*
+ * we don't do in-place updates, so this is essentially the same as
+ * fetch_row_version.
+ */
+ if (call_again)
+ *call_again = false;
+ if (all_dead)
+ *all_dead = false;
+
+ result = zedstoream_fetch_row((ZedStoreIndexFetchData *) scan, tid_p, snapshot, slot);
+ if (result)
+ {
+ /* FIXME: heapam acquires the predicate lock first, and then
+ * calls CheckForSerializableConflictOut(). We do it in the
+ * opposite order, because the CheckForSerializableConflictOut()
+ * call is already done in zsbt_get_last_tid(). Does it matter?
+ * I'm not sure.
+ */
+ PredicateLockTID(scan->rel, tid_p, snapshot);
+ }
+ return result;
+}
+
+/*
+ * Shared implementation of fetch_row_version and index_fetch_tuple callbacks.
+ */
+static bool
+zedstoream_fetch_row(ZedStoreIndexFetchData *fetch,
+ ItemPointer tid_p,
+ Snapshot snapshot,
+ TupleTableSlot *slot)
+{
+ Relation rel = fetch->idx_fetch_data.rel;
+ zstid tid = ZSTidFromItemPointer(*tid_p);
+ bool found = true;
+ ZedStoreProjectData *fetch_proj = &fetch->proj_data;
+
+ /* first time here, initialize */
+ if (fetch_proj->num_proj_atts == 0)
+ zs_initialize_proj_attributes(slot->tts_tupleDescriptor, fetch_proj);
+ else
+ {
+ /* If we have previous fetches still open, close them first */
+ zsbt_tid_end_scan(&fetch_proj->btree_scans[0]);
+ for (int i = 1; i < fetch_proj->num_proj_atts; i++)
+ zsbt_attr_end_scan(&fetch_proj->btree_scans[i]);
+ }
+
+ /*
+ * Initialize the slot.
+ *
+ * If we're not fetching all columns, initialize the unfetched values
+ * in the slot to NULL.
(Actually, this initializes all to NULL, and the + * code below will overwrite them for the columns that are projected) + */ + ExecClearTuple(slot); + for (int i = 0; i < slot->tts_tupleDescriptor->natts; i++) + slot->tts_isnull[i] = true; + + zsbt_tid_begin_scan(rel, tid, tid + 1, snapshot, &fetch_proj->btree_scans[0]); + fetch_proj->btree_scans[0].serializable = true; + found = zsbt_tid_scan_next(&fetch_proj->btree_scans[0]) != InvalidZSTid; + if (found) + { + for (int i = 1; i < fetch_proj->num_proj_atts; i++) + { + int natt = fetch_proj->proj_atts[i]; + ZSBtreeScan *btscan = &fetch_proj->btree_scans[i]; + Form_pg_attribute attr; + Datum datum; + bool isnull; + + zsbt_attr_begin_scan(rel, slot->tts_tupleDescriptor, natt, tid, tid + 1, + btscan); + + attr = ZSBtreeScanGetAttInfo(btscan); + if (zsbt_scan_next_fetch(btscan, &datum, &isnull, tid)) + { + /* + * flatten any ZS-TOASTed values, because the rest of the system + * doesn't know how to deal with them. + */ + if (!isnull && attr->attlen == -1 && + VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE) + { + datum = zedstore_toast_flatten(rel, natt, tid, datum); + } + } + else + zsbt_fill_missing_attribute_value(btscan, &datum, &isnull); + + slot->tts_values[natt - 1] = datum; + slot->tts_isnull[natt - 1] = isnull; + } + } + + if (found) + { + slot->tts_tid = ItemPointerFromZSTid(tid); + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + slot->tts_flags &= ~TTS_FLAG_EMPTY; + return true; + } + + return false; +} + +static void +zedstoream_index_validate_scan(Relation baseRelation, + Relation indexRelation, + IndexInfo *indexInfo, + Snapshot snapshot, + ValidateIndexState *state) +{ + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + bool *proj; + int attno; + TableScanDesc scan; + ItemPointerData idx_ptr; + bool tuplesort_empty = false; + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = table_slot_create(baseRelation, NULL); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. We need just those tuples + * satisfying the passed-in reference snapshot. We must disable syncscan + * here, because it's critical that we read from block zero forward to + * match the sorted TIDs. + */ + + /* + * TODO: It would be very good to fetch only the columns we need. 
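+ *
+ * (The projection built below covers the index key columns plus any
+ * columns referenced by the index predicate and index expressions.)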
+ */ + proj = palloc0(baseRelation->rd_att->natts * sizeof(bool)); + for (attno = 0; attno < indexInfo->ii_NumIndexKeyAttrs; attno++) + { + Assert(indexInfo->ii_IndexAttrNumbers[attno] <= baseRelation->rd_att->natts); + /* skip expressions */ + if (indexInfo->ii_IndexAttrNumbers[attno] > 0) + proj[indexInfo->ii_IndexAttrNumbers[attno] - 1] = true; + } + GetNeededColumnsForNode((Node *)indexInfo->ii_Predicate, proj, + baseRelation->rd_att->natts); + GetNeededColumnsForNode((Node *)indexInfo->ii_Expressions, proj, + baseRelation->rd_att->natts); + + scan = table_beginscan_with_column_projection(baseRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + proj); + + /* + * Scan all tuples matching the snapshot. + */ + ItemPointerSet(&idx_ptr, 0, 0); /* this is less than any real TID */ + while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) + { + ItemPointerData tup_ptr = slot->tts_tid; + HeapTuple heapTuple; + int cmp; + + CHECK_FOR_INTERRUPTS(); + + /* + * TODO: Once we have in-place updates, like HOT, this will need + * to work harder, like heapam's function. + */ + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + if (tuplesort_empty) + cmp = -1; + else + { + while ((cmp = ItemPointerCompare(&tup_ptr, &idx_ptr)) > 0) + { + Datum ts_val; + bool ts_isnull; + + tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, + &ts_val, &ts_isnull, NULL); + if (!tuplesort_empty) + { + Assert(!ts_isnull); + itemptr_decode(&idx_ptr, DatumGetInt64(ts_val)); + + /* If int8 is pass-by-ref, free (encoded) TID Datum memory */ +#ifndef USE_FLOAT8_BYVAL + pfree(DatumGetPointer(ts_val)); +#endif + break; + } + else + { + /* Be tidy */ + ItemPointerSetInvalid(&idx_ptr); + cmp = -1; + } + } + } + if (cmp < 0) + { + /* This item is not in the index */ + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use in + * this index, and note which are null. This also performs evaluation + * of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Call the AM's callback routine to process the tuple */ + heapTuple = ExecCopySlotHeapTuple(slot); + heapTuple->t_self = slot->tts_tid; + index_insert(indexRelation, values, isnull, &tup_ptr, baseRelation, + indexInfo->ii_Unique ? 
+ UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + indexInfo); + pfree(heapTuple); + + state->tups_inserted += 1; + } + } + + table_endscan(scan); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; +} + +static double +zedstoream_index_build_range_scan(Relation baseRelation, + Relation indexRelation, + IndexInfo *indexInfo, + bool allow_sync, + bool anyvisible, + bool progress, + BlockNumber start_blockno, + BlockNumber numblocks, + IndexBuildCallback callback, + void *callback_state, + TableScanDesc scan) +{ + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + double reltuples; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + Snapshot snapshot; + SnapshotData NonVacuumableSnapshot; + bool need_unregister_snapshot = false; + TransactionId OldestXmin; + +#ifdef USE_ASSERT_CHECKING + bool checking_uniqueness; + /* See whether we're verifying uniqueness/exclusion properties */ + checking_uniqueness = (indexInfo->ii_Unique || + indexInfo->ii_ExclusionOps != NULL); + + /* + * "Any visible" mode is not compatible with uniqueness checks; make sure + * only one of those is requested. + */ + Assert(!(anyvisible && checking_uniqueness)); +#endif + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = table_slot_create(baseRelation, NULL); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. In a normal index build, we use + * SnapshotAny because we must retrieve all tuples and do our own time + * qual checks (because we have to index RECENTLY_DEAD tuples). In a + * concurrent build, or during bootstrap, we take a regular MVCC snapshot + * and index whatever's live according to that. + */ + OldestXmin = InvalidTransactionId; + + /* okay to ignore lazy VACUUMs here */ + if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) + OldestXmin = GetOldestXmin(baseRelation, PROCARRAY_FLAGS_VACUUM); + + /* + * TODO: It would be very good to fetch only the columns we need. + */ + if (!scan) + { + bool *proj; + int attno; + + /* + * Serial index build. + * + * Must begin our own zedstore scan in this case. We may also need to + * register a snapshot whose lifetime is under our direct control. 
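+ *
+ * (During bootstrap or a concurrent build we register and use an MVCC
+ * snapshot; otherwise we scan with a non-vacuumable snapshot so that
+ * completely dead rows are skipped.)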
+ */ + if (!TransactionIdIsValid(OldestXmin)) + { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + need_unregister_snapshot = true; + } + else + { + /* leave out completely dead items even with SnapshotAny */ + InitNonVacuumableSnapshot(NonVacuumableSnapshot, OldestXmin); + snapshot = &NonVacuumableSnapshot; + } + + proj = palloc0(baseRelation->rd_att->natts * sizeof(bool)); + for (attno = 0; attno < indexInfo->ii_NumIndexKeyAttrs; attno++) + { + Assert(indexInfo->ii_IndexAttrNumbers[attno] <= baseRelation->rd_att->natts); + /* skip expressions */ + if (indexInfo->ii_IndexAttrNumbers[attno] > 0) + proj[indexInfo->ii_IndexAttrNumbers[attno] - 1] = true; + } + + GetNeededColumnsForNode((Node *)indexInfo->ii_Predicate, proj, + baseRelation->rd_att->natts); + GetNeededColumnsForNode((Node *)indexInfo->ii_Expressions, proj, + baseRelation->rd_att->natts); + + scan = table_beginscan_with_column_projection(baseRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + proj); + + if (start_blockno != 0 || numblocks != InvalidBlockNumber) + { + ZedStoreDesc zscan = (ZedStoreDesc) scan; + ZedStoreProjectData *zscan_proj = &zscan->proj_data; + + zscan->cur_range_start = ZSTidFromBlkOff(start_blockno, 1); + zscan->cur_range_end = ZSTidFromBlkOff(numblocks, 1); + + /* FIXME: when can 'num_proj_atts' be 0? */ + if (zscan_proj->num_proj_atts > 0) + { + zsbt_tid_begin_scan(zscan->rs_scan.rs_rd, + zscan->cur_range_start, + zscan->cur_range_end, + zscan->rs_scan.rs_snapshot, + &zscan_proj->btree_scans[0]); + for (int i = 1; i < zscan_proj->num_proj_atts; i++) + { + int natt = zscan_proj->proj_atts[i]; + + zsbt_attr_begin_scan(zscan->rs_scan.rs_rd, + RelationGetDescr(zscan->rs_scan.rs_rd), + natt, + zscan->cur_range_start, + zscan->cur_range_end, + &zscan_proj->btree_scans[i]); + } + } + zscan->state = ZSSCAN_STATE_SCANNING; + } + } + else + { + /* + * Parallel index build. + * + * Parallel case never registers/unregisters own snapshot. Snapshot + * is taken from parallel zedstore scan, and is SnapshotAny or an MVCC + * snapshot, based on same criteria as serial case. + */ + Assert(!IsBootstrapProcessingMode()); + Assert(allow_sync); + Assert(start_blockno == 0); + Assert(numblocks == InvalidBlockNumber); + snapshot = scan->rs_snapshot; + + if (snapshot == SnapshotAny) + { + /* leave out completely dead items even with SnapshotAny */ + InitNonVacuumableSnapshot(NonVacuumableSnapshot, OldestXmin); + snapshot = &NonVacuumableSnapshot; + } + } + + /* + * Must call GetOldestXmin() with SnapshotAny. Should never call + * GetOldestXmin() with MVCC snapshot. (It's especially worth checking + * this for parallel builds, since ambuild routines that support parallel + * builds must work these details out for themselves.) + */ + Assert(snapshot == &NonVacuumableSnapshot || IsMVCCSnapshot(snapshot)); + Assert(snapshot == &NonVacuumableSnapshot ? TransactionIdIsValid(OldestXmin) : + !TransactionIdIsValid(OldestXmin)); + Assert(snapshot == &NonVacuumableSnapshot || !anyvisible); + + reltuples = 0; + + /* + * Scan all tuples in the base relation. 
+ */ + while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) + { + bool tupleIsAlive; + HeapTuple heapTuple; + + if (numblocks != InvalidBlockNumber && + ItemPointerGetBlockNumber(&slot->tts_tid) >= numblocks) + break; + + CHECK_FOR_INTERRUPTS(); + + /* table_scan_getnextslot did the visibility check */ + tupleIsAlive = true; + reltuples += 1; + + /* + * TODO: Once we have in-place updates, like HOT, this will need + * to work harder, to figure out which tuple version to index. + */ + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use in + * this index, and note which are null. This also performs evaluation + * of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Call the AM's callback routine to process the tuple */ + heapTuple = ExecCopySlotHeapTuple(slot); + heapTuple->t_self = slot->tts_tid; + callback(indexRelation, heapTuple, values, isnull, tupleIsAlive, + callback_state); + pfree(heapTuple); + } + + table_endscan(scan); + + /* we can now forget our snapshot, if set and registered by us */ + if (need_unregister_snapshot) + UnregisterSnapshot(snapshot); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; + + return reltuples; +} + +static void +zedstoream_finish_bulk_insert(Relation relation, int options) +{ + /* + * If we skipped writing WAL, then we need to sync the zedstore (but not + * indexes since those use WAL anyway / don't go through tableam) + */ + if (options & HEAP_INSERT_SKIP_WAL) + heap_sync(relation); +} + +/* ------------------------------------------------------------------------ + * DDL related callbacks for zedstore AM. + * ------------------------------------------------------------------------ + */ + +static void +zedstoream_relation_set_new_filenode(Relation rel, + const RelFileNode *newrnode, + char persistence, + TransactionId *freezeXid, + MultiXactId *minmulti) +{ + SMgrRelation srel; + + /* + * Initialize to the minimum XID that could put tuples in the table. We + * know that no xacts older than RecentXmin are still running, so that + * will do. + */ + *freezeXid = RecentXmin; + + /* + * Similarly, initialize the minimum Multixact to the first value that + * could possibly be stored in tuples in the table. Running transactions + * could reuse values from their local cache, so we are careful to + * consider all currently running multis. + * + * XXX this could be refined further, but is it worth the hassle? + */ + *minmulti = GetOldestMultiXactId(); + + srel = RelationCreateStorage(*newrnode, persistence); + + /* + * If required, set up an init fork for an unlogged table so that it can + * be correctly reinitialized on restart. An immediate sync is required + * even if the page has been logged, because the write did not go through + * shared_buffers and therefore a concurrent checkpoint may have moved the + * redo pointer past our xlog record. Recovery may as well remove it + * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE + * record. Therefore, logging is necessary even if wal_level=minimal. 
+ */
+ if (persistence == RELPERSISTENCE_UNLOGGED)
+ {
+ Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
+ rel->rd_rel->relkind == RELKIND_MATVIEW ||
+ rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+ smgrcreate(srel, INIT_FORKNUM, false);
+ log_smgrcreate(newrnode, INIT_FORKNUM);
+ smgrimmedsync(srel, INIT_FORKNUM);
+ }
+}
+
+static void
+zedstoream_relation_nontransactional_truncate(Relation rel)
+{
+ RelationTruncate(rel, 0);
+}
+
+static void
+zedstoream_relation_copy_data(Relation rel, const RelFileNode *newrnode)
+{
+ SMgrRelation dstrel;
+
+ dstrel = smgropen(*newrnode, rel->rd_backend);
+ RelationOpenSmgr(rel);
+
+ /*
+ * Since we copy the file directly without looking at the shared buffers,
+ * we'd better first flush out any pages of the source relation that are
+ * in shared buffers. We assume no new changes will be made while we are
+ * holding exclusive lock on the rel.
+ */
+ FlushRelationBuffers(rel);
+
+ /*
+ * Create and copy all the relation, and schedule unlinking of the
+ * old physical file.
+ *
+ * NOTE: any conflict in relfilenode value will be caught in
+ * RelationCreateStorage().
+ *
+ * NOTE: There is only the main fork in zedstore. Otherwise
+ * this would need to copy other forks, too.
+ */
+ RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence);
+
+ /* copy main fork */
+ RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM,
+ rel->rd_rel->relpersistence);
+
+ /* drop old relation, and close new one */
+ RelationDropStorage(rel);
+ smgrclose(dstrel);
+}
+
+/*
+ * Subroutine of the zedstoream_relation_copy_for_cluster() callback.
+ *
+ * Creates the TID item with correct visibility information for the
+ * given tuple in the old table. Returns the tid of the tuple in the
+ * new table, or InvalidZSTid if this tuple can be left out completely.
+ *
+ * FIXME: This breaks UPDATE chains. I.e. after this is done, an UPDATE
+ * looks like DELETE + INSERT, instead of an UPDATE, to any transaction that
+ * might try to follow the update chain.
+ */
+static zstid
+zs_cluster_process_tuple(Relation OldHeap, Relation NewHeap,
+ zstid oldtid, ZSUndoRecPtr old_undoptr,
+ ZSUndoRecPtr recent_oldest_undo,
+ TransactionId OldestXmin)
+{
+ TransactionId this_xmin;
+ CommandId this_cmin;
+ TransactionId this_xmax;
+ CommandId this_cmax;
+ bool this_changedPart;
+ ZSUndoRecPtr undo_ptr;
+ ZSUndoRec *undorec;
+
+ /*
+ * Follow the chain of UNDO records for this tuple, to find the
+ * transaction that originally inserted the row (xmin/cmin), and
+ * the transaction that deleted or updated it away, if any (xmax/cmax).
+ */
+ this_xmin = FrozenTransactionId;
+ this_cmin = InvalidCommandId;
+ this_xmax = InvalidTransactionId;
+ this_cmax = InvalidCommandId;
+
+ undo_ptr = old_undoptr;
+ for (;;)
+ {
+ if (undo_ptr.counter < recent_oldest_undo.counter)
+ {
+ /* This tuple version is visible to everyone. */
+ break;
+ }
+
+ /* Fetch the next UNDO record. */
+ undorec = zsundo_fetch(OldHeap, undo_ptr);
+
+ if (undorec->type == ZSUNDO_TYPE_INSERT)
+ {
+ if (!TransactionIdIsCurrentTransactionId(undorec->xid) &&
+ !TransactionIdIsInProgress(undorec->xid) &&
+ !TransactionIdDidCommit(undorec->xid))
+ {
+ /*
+ * inserter aborted or crashed. This row is not visible to
+ * anyone, including any later tuple versions we might have
+ * seen.
+ */
+ this_xmin = InvalidTransactionId;
+ break;
+ }
+ else
+ {
+ /* Inserter committed. */
+ this_xmin = undorec->xid;
+ this_cmin = undorec->cid;
+
+ /* we know everything there is to know about this tuple version.
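+ * (Any xmax/cmax was already collected from a newer UNDO record
+ * earlier in this loop.)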
*/ + break; + } + } + else if (undorec->type == ZSUNDO_TYPE_TUPLE_LOCK) + { + /* Ignore tuple locks for now. + * + * FIXME: we should propagate them to the new copy of the table + */ + undo_ptr = undorec->prevundorec; + continue; + } + else if (undorec->type == ZSUNDO_TYPE_DELETE || + undorec->type == ZSUNDO_TYPE_UPDATE) + { + /* Row was deleted (or updated away). */ + if (!TransactionIdIsCurrentTransactionId(undorec->xid) && + !TransactionIdIsInProgress(undorec->xid) && + !TransactionIdDidCommit(undorec->xid)) + { + /* deleter aborted or crashed. The previous record should + * be an insertion (possibly with some tuple-locking in + * between). We'll remember the tuple when we see the + * insertion. + */ + undo_ptr = undorec->prevundorec; + continue; + } + else + { + /* deleter committed or is still in progress. */ + if (TransactionIdPrecedes(undorec->xid, OldestXmin)) + { + /* the deletion is visible to everyone. We can skip the row completely. */ + this_xmin = InvalidTransactionId; + break; + } + else + { + /* deleter committed or is in progress. Remember that it was + * deleted by this XID. + */ + this_xmax = undorec->xid; + this_cmax = undorec->cid; + if (undorec->type == ZSUNDO_TYPE_DELETE) + this_changedPart = ((ZSUndoRec_Delete *) undorec)->changedPart; + else + this_changedPart = false; + + /* follow the UNDO chain to find information about the inserting + * transaction (xmin/cmin) + */ + undo_ptr = undorec->prevundorec; + continue; + } + } + } + } + + /* + * We now know the visibility of this tuple. Re-create it in the new table. + */ + if (this_xmin != InvalidTransactionId) + { + /* Insert the first version of the row. */ + ZSUndoRecPtr prevundoptr; + zstid newtid = InvalidZSTid; + + /* First, insert the tuple. */ + ZSUndoRecPtrInitialize(&prevundoptr); + zsbt_tid_multi_insert(NewHeap, + &newtid, 1, + this_xmin, + this_cmin, + INVALID_SPECULATIVE_TOKEN, + prevundoptr); + + /* And if the tuple was deleted/updated away, do the same in the new table. */ + if (this_xmax != InvalidTransactionId) + { + TM_Result delete_result; + + /* tuple was deleted. */ + delete_result = zsbt_tid_delete(NewHeap, newtid, + this_xmax, this_cmax, + NULL, NULL, false, NULL, this_changedPart); + if (delete_result != TM_Ok) + elog(ERROR, "tuple deletion failed during table rewrite"); + } + return newtid; + } + else + return InvalidZSTid; +} + + +static void +zedstoream_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, + Relation OldIndex, bool use_sort, + TransactionId OldestXmin, + TransactionId *xid_cutoff, + MultiXactId *multi_cutoff, + double *num_tuples, + double *tups_vacuumed, + double *tups_recently_dead) +{ + TupleDesc olddesc; + ZSBtreeScan meta_scan; + ZSBtreeScan *attr_scans; + ZSUndoRecPtr recent_oldest_undo = zsundo_get_oldest_undo_ptr(OldHeap); + int attno; + IndexScanDesc indexScan; + + olddesc = RelationGetDescr(OldHeap), + + attr_scans = palloc((olddesc->natts + 1) * sizeof(ZSBtreeScan)); + + /* + * Scan the old table. We ignore any old updated-away tuple versions, + * and only stop at the latest tuple version of each row. At the latest + * version, follow the update chain to get all the old versions of that + * row, too. That way, the whole update chain is processed in one go, + * and can be reproduced in the new table. 
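+ *
+ * zs_cluster_process_tuple() does the per-row UNDO chain walk and
+ * recreates the row's visibility information in the new table.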
+ */ + zsbt_tid_begin_scan(OldHeap, MinZSTid, MaxPlusOneZSTid, + SnapshotAny, &meta_scan); + + for (attno = 1; attno <= olddesc->natts; attno++) + { + if (TupleDescAttr(olddesc, attno - 1)->attisdropped) + continue; + + zsbt_attr_begin_scan(OldHeap, + olddesc, + attno, + MinZSTid, + MaxPlusOneZSTid, + &attr_scans[attno]); + } + + /* TODO: sorting not implemented yet. (it would require materializing each + * row into a HeapTuple or something like that, which could carry the xmin/xmax + * information through the sorter). + */ + use_sort = false; + + /* + * Prepare to scan the OldHeap. To ensure we see recently-dead tuples + * that still need to be copied, we scan with SnapshotAny and use + * HeapTupleSatisfiesVacuum for the visibility test. + */ + if (OldIndex != NULL && !use_sort) + { + const int ci_index[] = { + PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_INDEX_RELID + }; + int64 ci_val[2]; + + /* Set phase and OIDOldIndex to columns */ + ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; + ci_val[1] = RelationGetRelid(OldIndex); + pgstat_progress_update_multi_param(2, ci_index, ci_val); + + indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0); + index_rescan(indexScan, NULL, 0, NULL, 0); + } + else + { + /* In scan-and-sort mode and also VACUUM FULL, set phase */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); + + indexScan = NULL; + + /* Set total heap blocks */ + /* TODO */ +#if 0 + pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, + heapScan->rs_nblocks); +#endif + } + + for (;;) + { + zstid old_tid; + ZSUndoRecPtr old_undoptr; + zstid new_tid; + Datum datum; + bool isnull; + zstid fetchtid = InvalidZSTid; + + CHECK_FOR_INTERRUPTS(); + + if (indexScan != NULL) + { + ItemPointer itemptr; + + itemptr = index_getnext_tid(indexScan, ForwardScanDirection); + if (!itemptr) + break; + + /* Since we used no scan keys, should never need to recheck */ + if (indexScan->xs_recheck) + elog(ERROR, "CLUSTER does not support lossy index conditions"); + + fetchtid = ZSTidFromItemPointer(*itemptr); + zsbt_tid_reset_scan(&meta_scan, fetchtid); + old_tid = zsbt_tid_scan_next(&meta_scan); + } + else + { + old_tid = zsbt_tid_scan_next(&meta_scan); + fetchtid = old_tid; + } + if (old_tid == InvalidZSTid) + break; + if (old_tid != fetchtid) + break; + old_undoptr = meta_scan.array_undoptr; + + new_tid = zs_cluster_process_tuple(OldHeap, NewHeap, + old_tid, old_undoptr, + recent_oldest_undo, + OldestXmin); + if (new_tid != InvalidZSTid) + { + /* Fetch the attributes and write them out */ + for (attno = 1; attno <= olddesc->natts; attno++) + { + Form_pg_attribute att = TupleDescAttr(olddesc, attno - 1); + Datum toastptr = (Datum) 0; + + if (att->attisdropped) + { + datum = (Datum) 0; + isnull = true; + } + else + { + if (indexScan) + zsbt_attr_reset_scan(&attr_scans[attno], old_tid); + + if (!zsbt_scan_next_fetch(&attr_scans[attno], &datum, &isnull, old_tid)) + zsbt_fill_missing_attribute_value(&attr_scans[attno], &datum, &isnull); + } + + /* flatten and re-toast any ZS-TOASTed values */ + if (!isnull && att->attlen == -1) + { + if (VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE) + { + datum = zedstore_toast_flatten(OldHeap, attno, old_tid, datum); + } + + if (VARSIZE_ANY_EXHDR(datum) > MaxZedStoreDatumSize) + { + toastptr = datum = zedstore_toast_datum(NewHeap, attno, datum); + } + } + + zsbt_attr_multi_insert(NewHeap, attno, &datum, &isnull, &new_tid, 1); + + if (toastptr != (Datum) 0) + 
zedstore_toast_finish(NewHeap, attno, toastptr, new_tid); + } + } + } + + if (indexScan != NULL) + index_endscan(indexScan); + + zsbt_tid_end_scan(&meta_scan); + for (attno = 1; attno <= olddesc->natts; attno++) + { + if (TupleDescAttr(olddesc, attno - 1)->attisdropped) + continue; + + zsbt_attr_end_scan(&attr_scans[attno]); + } +} + +/* + * FIXME: The ANALYZE API is problematic for us. acquire_sample_rows() calls + * RelationGetNumberOfBlocks() directly on the relation, and chooses the + * block numbers to sample based on that. But the logical block numbers + * have little to do with physical ones in zedstore. + */ +static bool +zedstoream_scan_analyze_next_block(TableScanDesc sscan, BlockNumber blockno, + BufferAccessStrategy bstrategy) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + Relation rel = scan->rs_scan.rs_rd; + int ntuples; + ZSBtreeScan btree_scan; + zstid tid; + + /* TODO: for now, assume that we need all columns */ + zs_initialize_proj_attributes_extended(scan, RelationGetDescr(rel)); + + ntuples = 0; + zsbt_tid_begin_scan(scan->rs_scan.rs_rd, + ZSTidFromBlkOff(blockno, 1), + ZSTidFromBlkOff(blockno + 1, 1), + scan->rs_scan.rs_snapshot, + &btree_scan); + /* + * TODO: it would be good to pass the next expected TID down to zsbt_scan_next, + * so that it could skip over to it more efficiently. + */ + ntuples = 0; + while ((tid = zsbt_tid_scan_next(&btree_scan)) != InvalidZSTid) + { + Assert(ZSTidGetBlockNumber(tid) == blockno); + scan->bmscan_tids[ntuples] = tid; + ntuples++; + } + zsbt_tid_end_scan(&btree_scan); + + if (ntuples) + { + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + { + int natt = scan->proj_data.proj_atts[i]; + ZSBtreeScan btree_scan; + Datum datum; + bool isnull; + Datum *datums = scan->bmscan_datums[i]; + bool *isnulls = scan->bmscan_isnulls[i]; + + zsbt_attr_begin_scan(scan->rs_scan.rs_rd, + RelationGetDescr(scan->rs_scan.rs_rd), + natt, + ZSTidFromBlkOff(blockno, 1), + ZSTidFromBlkOff(blockno + 1, 1), + &btree_scan); + for (int n = 0; n < ntuples; n++) + { + zstid tid = scan->bmscan_tids[n]; + if (zsbt_scan_next_fetch(&btree_scan, &datum, &isnull, tid)) + { + Assert(ZSTidGetBlockNumber(tid) == blockno); + } + else + zsbt_fill_missing_attribute_value(&btree_scan, &datum, &isnull); + + /* + * have to make a copy because we close the scan immediately. + * FIXME: I think this leaks into a too-long-lived context + */ + if (!isnull) + datum = zs_datumCopy(datum, + ZSBtreeScanGetAttInfo(&btree_scan)->attbyval, + ZSBtreeScanGetAttInfo(&btree_scan)->attlen); + datums[n] = datum; + isnulls[n] = isnull; + } + zsbt_attr_end_scan(&btree_scan); + } + } + + scan->bmscan_nexttuple = 0; + scan->bmscan_ntuples = ntuples; + + return true; +} + +static bool +zedstoream_scan_analyze_next_tuple(TableScanDesc sscan, TransactionId OldestXmin, + double *liverows, double *deadrows, + TupleTableSlot *slot) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + zstid tid; + + if (scan->bmscan_nexttuple >= scan->bmscan_ntuples) + return false; + /* + * projection attributes were created based on Relation tuple descriptor + * it better match TupleTableSlot. 
+ */
+ Assert((scan->proj_data.num_proj_atts - 1) <= slot->tts_tupleDescriptor->natts);
+ tid = scan->bmscan_tids[scan->bmscan_nexttuple];
+ for (int i = 1; i < scan->proj_data.num_proj_atts; i++)
+ {
+ int natt = scan->proj_data.proj_atts[i];
+ Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, natt - 1);
+
+ Datum datum;
+ bool isnull;
+
+ datum = (scan->bmscan_datums[i])[scan->bmscan_nexttuple];
+ isnull = (scan->bmscan_isnulls[i])[scan->bmscan_nexttuple];
+
+ /*
+ * flatten any ZS-TOASTed values, because the rest of the system
+ * doesn't know how to deal with them.
+ */
+ if (!isnull && att->attlen == -1 &&
+ VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE)
+ {
+ datum = zedstore_toast_flatten(scan->rs_scan.rs_rd, natt, tid, datum);
+ }
+
+ slot->tts_values[natt - 1] = datum;
+ slot->tts_isnull[natt - 1] = isnull;
+ }
+ slot->tts_tid = ItemPointerFromZSTid(tid);
+ slot->tts_nvalid = slot->tts_tupleDescriptor->natts;
+ slot->tts_flags &= ~TTS_FLAG_EMPTY;
+
+ scan->bmscan_nexttuple++;
+ (*liverows)++;
+
+ return true;
+}
+
+/* ------------------------------------------------------------------------
+ * Miscellaneous callbacks for the zedstore AM
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * FIXME: Implement this function in the way that suits zedstore best. The
+ * return value is, for example, used by ANALYZE to find which blocks to
+ * sample.
+ */
+static uint64
+zedstoream_relation_size(Relation rel, ForkNumber forkNumber)
+{
+ uint64 nblocks = 0;
+
+ /* Open it at the smgr level if not already done */
+ RelationOpenSmgr(rel);
+ nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
+ return nblocks * BLCKSZ;
+}
+
+/*
+ * Zedstore stores TOAST chunks within the table file itself, so it doesn't
+ * need a separate toast table to be created. Returning false from this
+ * callback avoids the creation of a toast table.
+ */
+static bool
+zedstoream_relation_needs_toast_table(Relation rel)
+{
+ return false;
+}
+
+/* ------------------------------------------------------------------------
+ * Planner related callbacks for the zedstore AM
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * Currently this is an exact duplicate of heapam_estimate_rel_size().
+ * TODO: tune it based on zedstore storage.
+ */
+static void
+zedstoream_relation_estimate_size(Relation rel, int32 *attr_widths,
+ BlockNumber *pages, double *tuples,
+ double *allvisfrac)
+{
+ BlockNumber curpages;
+ BlockNumber relpages;
+ double reltuples;
+ BlockNumber relallvisible;
+ double density;
+
+ /* it has storage, ok to call the smgr */
+ curpages = RelationGetNumberOfBlocks(rel);
+
+ /* coerce values in pg_class to more desirable types */
+ relpages = (BlockNumber) rel->rd_rel->relpages;
+ reltuples = (double) rel->rd_rel->reltuples;
+ relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
+
+ /*
+ * HACK: if the relation has never yet been vacuumed, use a minimum size
+ * estimate of 10 pages. The idea here is to avoid assuming a
+ * newly-created table is really small, even if it currently is, because
+ * that may not be true once some data gets loaded into it. Once a vacuum
+ * or analyze cycle has been done on it, it's more reasonable to believe
+ * the size is somewhat stable.
+ *
+ * (Note that this is only an issue if the plan gets cached and used again
+ * after the table has been filled. What we're trying to avoid is using a
+ * nestloop-type plan on a table that has grown substantially since the
+ * plan was made.
Normally, autovacuum/autoanalyze will occur once enough + * inserts have happened and cause cached-plan invalidation; but that + * doesn't happen instantaneously, and it won't happen at all for cases + * such as temporary tables.) + * + * We approximate "never vacuumed" by "has relpages = 0", which means this + * will also fire on genuinely empty relations. Not great, but + * fortunately that's a seldom-seen case in the real world, and it + * shouldn't degrade the quality of the plan too much anyway to err in + * this direction. + * + * If the table has inheritance children, we don't apply this heuristic. + * Totally empty parent tables are quite common, so we should be willing + * to believe that they are empty. + */ + if (curpages < 10 && + relpages == 0 && + !rel->rd_rel->relhassubclass) + curpages = 10; + + /* report estimated # pages */ + *pages = curpages; + /* quick exit if rel is clearly empty */ + if (curpages == 0) + { + *tuples = 0; + *allvisfrac = 0; + return; + } + + /* estimate number of tuples from previous tuple density */ + if (relpages > 0) + density = reltuples / (double) relpages; + else + { + /* + * When we have no data because the relation was truncated, estimate + * tuple width from attribute datatypes. We assume here that the + * pages are completely full, which is OK for tables (since they've + * presumably not been VACUUMed yet) but is probably an overestimate + * for indexes. Fortunately get_relation_info() can clamp the + * overestimate to the parent table's size. + * + * Note: this code intentionally disregards alignment considerations, + * because (a) that would be gilding the lily considering how crude + * the estimate is, and (b) it creates platform dependencies in the + * default plans which are kind of a headache for regression testing. + */ + int32 tuple_width; + + tuple_width = get_rel_data_width(rel, attr_widths); + tuple_width += MAXALIGN(SizeofHeapTupleHeader); + tuple_width += sizeof(ItemIdData); + /* note: integer division is intentional here */ + density = (BLCKSZ - SizeOfPageHeaderData) / tuple_width; + } + *tuples = rint(density * (double) curpages); + + /* + * We use relallvisible as-is, rather than scaling it up like we do for + * the pages and tuples counts, on the theory that any pages added since + * the last VACUUM are most likely not marked all-visible. But costsize.c + * wants it converted to a fraction. + */ + if (relallvisible == 0 || curpages <= 0) + *allvisfrac = 0; + else if ((double) relallvisible >= curpages) + *allvisfrac = 1; + else + *allvisfrac = (double) relallvisible / curpages; +} + +/* ------------------------------------------------------------------------ + * Executor related callbacks for the zedstore AM + * ------------------------------------------------------------------------ + */ + +static bool +zedstoream_scan_bitmap_next_block(TableScanDesc sscan, + TBMIterateResult *tbmres) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + BlockNumber tid_blkno = tbmres->blockno; + int ntuples; + ZSBtreeScan btree_scan; + zstid tid; + int noff = 0; + + zs_initialize_proj_attributes_extended(scan, RelationGetDescr(scan->rs_scan.rs_rd)); + + /* + * Our strategy for a bitmap scan is to scan the tree of each attribute, + * starting at the given logical block number, and store all the datums + * in the scan struct. zedstoream_scan_analyze_next_tuple() then just + * needs to store the datums of the next TID in the slot. + * + * An alternative would be to keep the scans of each attribute open, + * like in a sequential scan. 
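+ * That would avoid re-descending the attribute B-trees for every
+ * bitmap block, at the cost of keeping more scans open.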
I'm not sure which is better. + */ + ntuples = 0; + zsbt_tid_begin_scan(scan->rs_scan.rs_rd, + ZSTidFromBlkOff(tid_blkno, 1), + ZSTidFromBlkOff(tid_blkno + 1, 1), + scan->rs_scan.rs_snapshot, + &btree_scan); + btree_scan.serializable = true; + while ((tid = zsbt_tid_scan_next(&btree_scan)) != InvalidZSTid) + { + ItemPointerData itemptr; + + Assert(ZSTidGetBlockNumber(tid) == tid_blkno); + + ItemPointerSet(&itemptr, tid_blkno, ZSTidGetOffsetNumber(tid)); + + if (tbmres->ntuples != -1) + { + while (ZSTidGetOffsetNumber(tid) > tbmres->offsets[noff] && noff < tbmres->ntuples) + { + /* + * Acquire predicate lock on all tuples that we scan, even those that are + * not visible to the snapshot. + */ + PredicateLockTID(scan->rs_scan.rs_rd, &itemptr, scan->rs_scan.rs_snapshot); + + noff++; + } + + if (noff == tbmres->ntuples) + break; + + if (ZSTidGetOffsetNumber(tid) < tbmres->offsets[noff]) + continue; + } + + Assert(ZSTidGetBlockNumber(tid) == tid_blkno); + + scan->bmscan_tids[ntuples] = tid; + ntuples++; + + /* FIXME: heapam acquires the predicate lock first, and then + * calls CheckForSerializableConflictOut(). We do it in the + * opposite order, because CheckForSerializableConflictOut() + * call as done in zsbt_get_last_tid() already. Does it matter? + * I'm not sure. + */ + PredicateLockTID(scan->rs_scan.rs_rd, &itemptr, scan->rs_scan.rs_snapshot); + } + zsbt_tid_end_scan(&btree_scan); + + if (ntuples) + { + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + { + int natt = scan->proj_data.proj_atts[i]; + ZSBtreeScan btree_scan; + Datum datum; + bool isnull; + Datum *datums = scan->bmscan_datums[i]; + bool *isnulls = scan->bmscan_isnulls[i]; + + zsbt_attr_begin_scan(scan->rs_scan.rs_rd, + RelationGetDescr(scan->rs_scan.rs_rd), + natt, + ZSTidFromBlkOff(tid_blkno, 1), + ZSTidFromBlkOff(tid_blkno + 1, 1), + &btree_scan); + for (int n = 0; n < ntuples; n++) + { + if (!zsbt_scan_next_fetch(&btree_scan, &datum, &isnull, scan->bmscan_tids[n])) + zsbt_fill_missing_attribute_value(&btree_scan, &datum, &isnull); + + /* have to make a copy because we close the scan immediately. */ + if (!isnull) + datum = zs_datumCopy(datum, + ZSBtreeScanGetAttInfo(&btree_scan)->attbyval, + ZSBtreeScanGetAttInfo(&btree_scan)->attlen); + datums[n] = datum; + isnulls[n] = isnull; + } + zsbt_attr_end_scan(&btree_scan); + } + } + scan->bmscan_nexttuple = 0; + scan->bmscan_ntuples = ntuples; + + return ntuples > 0; +} + +static bool +zedstoream_scan_bitmap_next_tuple(TableScanDesc sscan, + TBMIterateResult *tbmres, + TupleTableSlot *slot) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + zstid tid; + + if (scan->bmscan_nexttuple >= scan->bmscan_ntuples) + return false; + /* + * projection attributes were created based on Relation tuple descriptor + * it better match TupleTableSlot. + */ + Assert((scan->proj_data.num_proj_atts - 1) <= slot->tts_tupleDescriptor->natts); + tid = scan->bmscan_tids[scan->bmscan_nexttuple]; + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + { + int natt = scan->proj_data.proj_atts[i]; + Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, natt - 1); + Datum datum; + bool isnull; + + datum = (scan->bmscan_datums[i])[scan->bmscan_nexttuple]; + isnull = (scan->bmscan_isnulls[i])[scan->bmscan_nexttuple]; + + /* + * flatten any ZS-TOASTed values, because the rest of the system + * doesn't know how to deal with them. 
+ */ + if (!isnull && att->attlen == -1 && + VARATT_IS_EXTERNAL(datum) && VARTAG_EXTERNAL(datum) == VARTAG_ZEDSTORE) + { + datum = zedstore_toast_flatten(scan->rs_scan.rs_rd, natt, tid, datum); + } + + slot->tts_values[natt - 1] = datum; + slot->tts_isnull[natt - 1] = isnull; + } + slot->tts_tid = ItemPointerFromZSTid(tid); + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + slot->tts_flags &= ~TTS_FLAG_EMPTY; + + scan->bmscan_nexttuple++; + + pgstat_count_heap_fetch(scan->rs_scan.rs_rd); + + return true; +} + +static bool +zedstoream_scan_sample_next_block(TableScanDesc sscan, SampleScanState *scanstate) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + Relation rel = scan->rs_scan.rs_rd; + TsmRoutine *tsm = scanstate->tsmroutine; + int ntuples; + ZSBtreeScan btree_scan; + zstid tid; + BlockNumber blockno; + + /* TODO: for now, assume that we need all columns */ + zs_initialize_proj_attributes_extended(scan, RelationGetDescr(rel)); + + if (scan->max_tid_to_scan == InvalidZSTid) + { + /* + * get the max tid once and store it, used to calculate max blocks to + * scan either for SYSTEM or BERNOULLI sampling. + */ + scan->max_tid_to_scan = zsbt_get_last_tid(rel); + /* + * TODO: should get lowest tid instead of starting from 0 + */ + scan->next_tid_to_scan = ZSTidFromBlkOff(0, 1); + } + + if (tsm->NextSampleBlock) + { + /* Adding one below to convert block number to number of blocks. */ + blockno = tsm->NextSampleBlock(scanstate, + ZSTidGetBlockNumber(scan->max_tid_to_scan) + 1); + + if (!BlockNumberIsValid(blockno)) + return false; + } + else + { + /* scanning table sequentially */ + if (scan->next_tid_to_scan > scan->max_tid_to_scan) + return false; + + blockno = ZSTidGetBlockNumber(scan->next_tid_to_scan); + /* move on to next block of tids for next iteration of scan */ + scan->next_tid_to_scan = ZSTidFromBlkOff(blockno + 1, 1); + } + + Assert(BlockNumberIsValid(blockno)); + + ntuples = 0; + zsbt_tid_begin_scan(scan->rs_scan.rs_rd, + ZSTidFromBlkOff(blockno, 1), + ZSTidFromBlkOff(blockno + 1, 1), + scan->rs_scan.rs_snapshot, + &btree_scan); + while ((tid = zsbt_tid_scan_next(&btree_scan)) != InvalidZSTid) + { + Assert(ZSTidGetBlockNumber(tid) == blockno); + scan->bmscan_tids[ntuples] = tid; + ntuples++; + } + zsbt_tid_end_scan(&btree_scan); + + scan->bmscan_nexttuple = 0; + scan->bmscan_ntuples = ntuples; + + return true; +} + +static bool +zedstoream_scan_sample_next_tuple(TableScanDesc sscan, SampleScanState *scanstate, + TupleTableSlot *slot) +{ + ZedStoreDesc scan = (ZedStoreDesc) sscan; + TsmRoutine *tsm = scanstate->tsmroutine; + zstid tid; + BlockNumber blockno; + OffsetNumber tupoffset; + bool found; + + /* all tuples on this block are invisible */ + if (scan->bmscan_ntuples == 0) + return false; + + blockno = ZSTidGetBlockNumber(scan->bmscan_tids[0]); + + /* find which visible tuple in this block to sample */ + for (;;) + { + zstid lasttid_for_block = scan->bmscan_tids[scan->bmscan_ntuples - 1]; + OffsetNumber maxoffset = ZSTidGetOffsetNumber(lasttid_for_block); + /* Ask the tablesample method which tuples to check on this page. 
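+ * Offsets that don't match any visible TID collected by
+ * zedstoream_scan_sample_next_block() are skipped, and we ask the TSM
+ * for another one.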
*/ + tupoffset = tsm->NextSampleTuple(scanstate, blockno, maxoffset); + + if (!OffsetNumberIsValid(tupoffset)) + return false; + + tid = ZSTidFromBlkOff(blockno, tupoffset); + + found = false; + for (int n = 0; n < scan->bmscan_ntuples; n++) + { + if (scan->bmscan_tids[n] == tid) + { + /* visible tuple */ + found = true; + break; + } + } + + if (found) + break; + else + continue; + } + + /* + * projection attributes were created based on Relation tuple descriptor + * it better match TupleTableSlot. + */ + Assert((scan->proj_data.num_proj_atts - 1) <= slot->tts_tupleDescriptor->natts); + /* fetch values for tuple pointed by tid to sample */ + for (int i = 1; i < scan->proj_data.num_proj_atts; i++) + { + int natt = scan->proj_data.proj_atts[i]; + ZSBtreeScan btree_scan; + Form_pg_attribute attr; + Datum datum; + bool isnull; + + zsbt_attr_begin_scan(scan->rs_scan.rs_rd, + slot->tts_tupleDescriptor, + natt, + tid, tid + 1, + &btree_scan); + + attr = ZSBtreeScanGetAttInfo(&btree_scan); + if (zsbt_scan_next_fetch(&btree_scan, &datum, &isnull, tid)) + { + Assert(ZSTidGetBlockNumber(tid) == blockno); + } + else + { + zsbt_fill_missing_attribute_value(&btree_scan, &datum, &isnull); + } + + /* + * have to make a copy because we close the scan immediately. + * FIXME: I think this leaks into a too-long-lived context + */ + if (!isnull) + datum = zs_datumCopy(datum, attr->attbyval, attr->attlen); + + slot->tts_values[natt - 1] = datum; + slot->tts_isnull[natt - 1] = isnull; + + zsbt_attr_end_scan(&btree_scan); + } + slot->tts_tid = ItemPointerFromZSTid(tid); + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + slot->tts_flags &= ~TTS_FLAG_EMPTY; + + return true; +} + +static void +zedstoream_vacuum_rel(Relation onerel, VacuumParams *params, + BufferAccessStrategy bstrategy) +{ + zsundo_vacuum(onerel, params, bstrategy, + GetOldestXmin(onerel, PROCARRAY_FLAGS_VACUUM)); +} + +static const TableAmRoutine zedstoream_methods = { + .type = T_TableAmRoutine, + .scans_leverage_column_projection = true, + + .slot_callbacks = zedstoream_slot_callbacks, + + .scan_begin = zedstoream_beginscan, + .scan_begin_with_column_projection = zedstoream_beginscan_with_column_projection, + .scan_end = zedstoream_endscan, + .scan_rescan = zedstoream_rescan, + .scan_getnextslot = zedstoream_getnextslot, + + .parallelscan_estimate = zs_parallelscan_estimate, + .parallelscan_initialize = zs_parallelscan_initialize, + .parallelscan_reinitialize = zs_parallelscan_reinitialize, + + .index_fetch_begin = zedstoream_begin_index_fetch, + .index_fetch_reset = zedstoream_reset_index_fetch, + .index_fetch_end = zedstoream_end_index_fetch, + .index_fetch_set_column_projection = zedstoream_fetch_set_column_projection, + .index_fetch_tuple = zedstoream_index_fetch_tuple, + + .tuple_insert = zedstoream_insert, + .tuple_insert_speculative = zedstoream_insert_speculative, + .tuple_complete_speculative = zedstoream_complete_speculative, + .multi_insert = zedstoream_multi_insert, + .tuple_delete = zedstoream_delete, + .tuple_update = zedstoream_update, + .tuple_lock = zedstoream_lock_tuple, + .finish_bulk_insert = zedstoream_finish_bulk_insert, + + .tuple_fetch_row_version = zedstoream_fetch_row_version, + .tuple_get_latest_tid = zedstoream_get_latest_tid, + .tuple_tid_valid = zedstoream_tuple_tid_valid, + .tuple_satisfies_snapshot = zedstoream_tuple_satisfies_snapshot, + .compute_xid_horizon_for_tuples = zedstoream_compute_xid_horizon_for_tuples, + + .relation_set_new_filenode = zedstoream_relation_set_new_filenode, + 
.relation_nontransactional_truncate = zedstoream_relation_nontransactional_truncate, + .relation_copy_data = zedstoream_relation_copy_data, + .relation_copy_for_cluster = zedstoream_relation_copy_for_cluster, + .relation_vacuum = zedstoream_vacuum_rel, + .scan_analyze_next_block = zedstoream_scan_analyze_next_block, + .scan_analyze_next_tuple = zedstoream_scan_analyze_next_tuple, + + .index_build_range_scan = zedstoream_index_build_range_scan, + .index_validate_scan = zedstoream_index_validate_scan, + + .relation_size = zedstoream_relation_size, + .relation_needs_toast_table = zedstoream_relation_needs_toast_table, + .relation_estimate_size = zedstoream_relation_estimate_size, + + .scan_bitmap_next_block = zedstoream_scan_bitmap_next_block, + .scan_bitmap_next_tuple = zedstoream_scan_bitmap_next_tuple, + .scan_sample_next_block = zedstoream_scan_sample_next_block, + .scan_sample_next_tuple = zedstoream_scan_sample_next_tuple +}; + +Datum +zedstore_tableam_handler(PG_FUNCTION_ARGS) +{ + PG_RETURN_POINTER(&zedstoream_methods); +} + + +/* + * Routines for dividing up the TID range for parallel seq scans + */ + +typedef struct ParallelZSScanDescData +{ + ParallelTableScanDescData base; + + zstid pzs_endtid; /* last tid + 1 in relation at start of scan */ + pg_atomic_uint64 pzs_allocatedtid_blk; /* TID space allocated to workers so far. (in 65536 increments) */ +} ParallelZSScanDescData; +typedef struct ParallelZSScanDescData *ParallelZSScanDesc; + +static Size +zs_parallelscan_estimate(Relation rel) +{ + return sizeof(ParallelZSScanDescData); +} + +static Size +zs_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan) +{ + ParallelZSScanDesc zpscan = (ParallelZSScanDesc) pscan; + + zpscan->base.phs_relid = RelationGetRelid(rel); + zpscan->pzs_endtid = zsbt_get_last_tid(rel); + pg_atomic_init_u64(&zpscan->pzs_allocatedtid_blk, 0); + + return sizeof(ParallelZSScanDescData); +} + +static void +zs_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) +{ + ParallelZSScanDesc bpscan = (ParallelZSScanDesc) pscan; + + pg_atomic_write_u64(&bpscan->pzs_allocatedtid_blk, 0); +} + +/* + * get the next TID range to scan + * + * Returns true if there is more to scan, false otherwise. + * + * Get the next TID range to scan. Even if there are no TIDs left to scan, + * another backend could have grabbed a range to scan and not yet finished + * looking at it, so it doesn't follow that the scan is done when the first + * backend gets 'false' return. + */ +static bool +zs_parallelscan_nextrange(Relation rel, ParallelZSScanDesc pzscan, + zstid *start, zstid *end) +{ + uint64 allocatedtid_blk; + + /* + * zhs_allocatedtid tracks how much has been allocated to workers + * already. When phs_allocatedtid >= rs_lasttid, all TIDs have been + * allocated. + * + * Because we use an atomic fetch-and-add to fetch the current value, the + * phs_allocatedtid counter will exceed rs_lasttid, because workers will + * still increment the value, when they try to allocate the next block but + * all blocks have been allocated already. The counter must be 64 bits + * wide because of that, to avoid wrapping around when rs_lasttid is close + * to 2^32. That's also one reason we do this at granularity of 2^16 TIDs, + * even though zedstore isn't block-oriented. + * + * TODO: we divide the TID space into chunks of 2^16 TIDs each. That's + * pretty inefficient, there's a fair amount of overhead in re-starting + * the B-tree scans between each range. We probably should use much larger + * ranges. 
But this is good for testing. + */ + allocatedtid_blk = pg_atomic_fetch_add_u64(&pzscan->pzs_allocatedtid_blk, 1); + *start = ZSTidFromBlkOff(allocatedtid_blk, 1); + *end = ZSTidFromBlkOff(allocatedtid_blk + 1, 1); + + return *start < pzscan->pzs_endtid; +} + +static void +zsbt_fill_missing_attribute_value(ZSBtreeScan *scan, Datum *datum, bool *isnull) +{ + int attno = scan->attno - 1; + TupleDesc tupleDesc = scan->tupledesc; + Form_pg_attribute attr = ZSBtreeScanGetAttInfo(scan); + + *isnull = true; + *datum = (Datum) 0; + + /* This means catalog doesn't have the default value for this attribute */ + if (!attr->atthasmissing) + return; + + if (tupleDesc->constr && + tupleDesc->constr->missing) + { + AttrMissing *attrmiss = NULL; + /* + * If there are missing values we want to put them into the + * tuple. + */ + attrmiss = tupleDesc->constr->missing; + + if (attrmiss[attno].am_present) + { + *isnull = false; + if (attr->attbyval) + *datum = fetch_att(&attrmiss[attno].am_value, attr->attbyval, attr->attlen); + else + *datum = zs_datumCopy(attrmiss[attno].am_value, attr->attbyval, attr->attlen); + } + } +} diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 6cb545c126..e795a510ae 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1057,7 +1057,11 @@ acquire_sample_rows(Relation onerel, int elevel, * done. */ if (numrows < targrows) - rows[numrows++] = ExecCopySlotHeapTuple(slot); + { + rows[numrows] = ExecCopySlotHeapTuple(slot); + rows[numrows]->t_self = slot->tts_tid; + numrows++; + } else { /* @@ -1079,6 +1083,7 @@ acquire_sample_rows(Relation onerel, int elevel, Assert(k >= 0 && k < targrows); heap_freetuple(rows[k]); rows[k] = ExecCopySlotHeapTuple(slot); + rows[k]->t_self = slot->tts_tid; } rowstoskip -= 1; diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index b00891ffd2..ab9fea881a 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2116,9 +2116,27 @@ CopyTo(CopyState cstate) { TupleTableSlot *slot; TableScanDesc scandesc; + bool *proj = NULL; - scandesc = table_beginscan(cstate->rel, GetActiveSnapshot(), 0, NULL); slot = table_slot_create(cstate->rel, NULL); + if (table_scans_leverage_column_projection(cstate->rel)) + { + proj = palloc0(slot->tts_tupleDescriptor->natts * sizeof(bool)); + foreach(cur, cstate->attnumlist) + { + int attnum = lfirst_int(cur); + Assert(attnum <= slot->tts_tupleDescriptor->natts); + proj[attnum-1] = true; + } + + scandesc = table_beginscan_with_column_projection(cstate->rel, + GetActiveSnapshot(), + 0, NULL, proj); + } + else + { + scandesc = table_beginscan(cstate->rel, GetActiveSnapshot(), 0, NULL); + } processed = 0; while (table_scan_getnextslot(scandesc, ForwardScanDirection, slot)) @@ -2135,6 +2153,8 @@ CopyTo(CopyState cstate) ExecDropSingleTupleTableSlot(slot); table_endscan(scandesc); + if (proj) + pfree(proj); } else { diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 602a8dbd1c..2af39c8fdc 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -9586,6 +9586,7 @@ validateCheckConstraint(Relation rel, HeapTuple constrtup) Form_pg_constraint constrForm; bool isnull; Snapshot snapshot; + bool *proj = NULL; /* * VALIDATE CONSTRAINT is a no-op for foreign tables and partitioned @@ -9618,7 +9619,16 @@ validateCheckConstraint(Relation rel, HeapTuple constrtup) econtext->ecxt_scantuple = slot; snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(rel, 
snapshot, 0, NULL); + if (table_scans_leverage_column_projection(rel)) + { + proj = palloc0(slot->tts_tupleDescriptor->natts * sizeof(bool)); + GetNeededColumnsForNode((Node*)exprstate->expr, proj, slot->tts_tupleDescriptor->natts); + scan = table_beginscan_with_column_projection(rel, snapshot, 0, NULL, proj); + } + else + { + scan = table_beginscan(rel, snapshot, 0, NULL); + } /* * Switch to per-tuple memory context and reset it for each tuple @@ -9643,6 +9653,8 @@ validateCheckConstraint(Relation rel, HeapTuple constrtup) UnregisterSnapshot(snapshot); ExecDropSingleTupleTableSlot(slot); FreeExecutorState(estate); + if (proj) + pfree(proj); } /* diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 3132a13785..db09b3ac9c 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2841,6 +2841,10 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, if (newtuple != trigtuple) heap_freetuple(newtuple); } + + /* Make sure the new slot is not dependent on the original tuple */ + ExecMaterializeSlot(slot); + if (should_free) heap_freetuple(trigtuple); @@ -3125,6 +3129,10 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, newtuple = NULL; } } + + /* Make sure the new slot is not dependent on the original tuple */ + ExecMaterializeSlot(newslot); + if (should_free_trig) heap_freetuple(trigtuple); diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c index 67c4be5108..73483aa835 100644 --- a/src/backend/executor/execScan.c +++ b/src/backend/executor/execScan.c @@ -20,6 +20,7 @@ #include "executor/executor.h" #include "miscadmin.h" +#include "nodes/nodeFuncs.h" #include "utils/memutils.h" @@ -301,3 +302,92 @@ ExecScanReScan(ScanState *node) } } } + +typedef struct neededColumnContext +{ + bool *mask; + int n; +} neededColumnContext; + +static bool +neededColumnContextWalker(Node *node, neededColumnContext *c) +{ + if (node == NULL) + return false; + + if (IsA(node, Var)) + { + Var *var = (Var *)node; + + if (var->varattno > 0) + { + Assert(var->varattno <= c->n); + c->mask[var->varattno - 1] = true; + } + /* + * If all attributes are included, + * set all entries in mask to true. + */ + else if (var->varattno == 0) + memset(c->mask, true, c->n); + + return false; + } + return expression_tree_walker(node, neededColumnContextWalker, (void *) c); +} + +/* + * n specifies the number of allowed entries in mask: we use + * it for bounds-checking in the walker above. + */ +void +GetNeededColumnsForNode(Node *expr, bool *mask, int n) +{ + neededColumnContext c; + + c.mask = mask; + c.n = n; + + neededColumnContextWalker(expr, &c); +} + +bool * +GetNeededColumnsForScan(ScanState *scanstate, int ncol) +{ + Plan *plan = scanstate->ps.plan; + bool *proj; + int i; + + proj = palloc0(ncol * sizeof(bool)); + GetNeededColumnsForNode((Node *) plan->targetlist, proj, ncol); + GetNeededColumnsForNode((Node *) plan->qual, proj, ncol); + + /* + * Some node types have more fields with expressions. FIXME: This list is + * surely very incomplete. Should teach the planner to do this for us.
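 *
 * For example, for an IndexScan the columns referenced only by the index
 * quals appear in indexqualorig rather than in plan->qual (the planner
 * normally strips them from the qual list), yet the recheck expression
 * still evaluates them against the fetched tuple, so they must be marked
 * here as well.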
+ */ + if (IsA(plan, IndexScan)) + { + GetNeededColumnsForNode((Node *) ((IndexScan *) plan)->indexqualorig, proj, ncol); + GetNeededColumnsForNode((Node *) ((IndexScan *) plan)->indexorderbyorig, proj, ncol); + } + else if (IsA(plan, BitmapHeapScan)) + { + GetNeededColumnsForNode((Node *) ((BitmapHeapScan *) plan)->bitmapqualorig, proj, ncol); + } + + for (i = 0; i < ncol; i++) + { + if (proj[i]) + break; + } + + /* + * In some cases (for example, count(*)), no columns are specified. + * We always scan the first column. + */ + if (i == ncol && ncol > 0) + proj[0] = true; + + return proj; +} diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index ee5b1c493b..8a4d795d1a 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -166,10 +166,10 @@ IndexOnlyNext(IndexOnlyScanState *node) * Rats, we have to visit the heap to check visibility. */ InstrCountTuples2(node, 1); - if (!index_fetch_heap(scandesc, slot)) + if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) continue; /* no visible tuple, try next index entry */ - ExecClearTuple(slot); + ExecClearTuple(node->ioss_TableSlot); /* * Only MVCC snapshots are supported here, so there should be no @@ -528,7 +528,17 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) */ tupDesc = ExecTypeFromTL(node->indextlist); ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc, - table_slot_callbacks(currentRelation)); + &TTSOpsVirtual); + + /* + * We need another slot, in a format that's suitable for the table AM, + * for when we need to fetch a tuple from the table for rechecking + * visibility. + */ + indexstate->ioss_TableSlot = + ExecAllocTableSlot(&estate->es_tupleTable, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); /* * Initialize result type and projection info. The node's targetlist will diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index ac7aa81f67..5492816b6d 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -115,6 +115,13 @@ IndexNext(IndexScanState *node) node->iss_NumScanKeys, node->iss_NumOrderByKeys); + if (table_scans_leverage_column_projection(node->ss.ss_currentRelation)) + { + bool *proj; + proj = GetNeededColumnsForScan(&node->ss, node->ss.ss_currentRelation->rd_att->natts); + table_index_fetch_set_column_projection(scandesc->xs_heapfetch, proj); + } + node->iss_ScanDesc = scandesc; /* @@ -897,6 +904,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) { IndexScanState *indexstate; Relation currentRelation; + const TupleTableSlotOps *table_slot_ops; LOCKMODE lockmode; /* @@ -923,11 +931,19 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ /* - * get the scan type from the relation descriptor. + * Initialize the scan slot. + * + * With the reorder queue, we will sometimes use the reorderqueue's slot, + * which uses heap ops, and sometimes the table AM's slot directly. We + * have to set scanopsfixed to false, unless the table AM also uses heap + * ops. */ + table_slot_ops = table_slot_callbacks(currentRelation); ExecInitScanTupleSlot(estate, &indexstate->ss, RelationGetDescr(currentRelation), - table_slot_callbacks(currentRelation)); + table_slot_ops); + if (node->indexorderby && table_slot_ops != &TTSOpsHeapTuple) + indexstate->ss.ps.scanopsfixed = false; /* * Initialize result type and projection. 
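
The hunks in copy.c and tablecmds.c above, and in nodeSeqscan.c and partbounds.c below, all repeat the same begin-scan pattern: build a per-column boolean mask when the table AM advertises scans_leverage_column_projection, and fall back to a plain table_beginscan() otherwise. The sketch below is not part of the patch; it only illustrates how those call sites use the new API. The helper name and the 'expr' parameter are invented for the illustration, and access/tableam.h, executor/executor.h and utils/rel.h are assumed to be included.

static TableScanDesc
begin_scan_with_optional_projection(Relation rel, Snapshot snapshot,
                                    Node *expr, bool **proj_out)
{
    *proj_out = NULL;

    if (table_scans_leverage_column_projection(rel))
    {
        int         natts = RelationGetDescr(rel)->natts;
        bool       *proj = palloc0(natts * sizeof(bool));

        /* mark every column referenced by 'expr'; palloc0 left the rest false */
        GetNeededColumnsForNode(expr, proj, natts);

        *proj_out = proj;       /* caller pfrees this after table_endscan() */
        return table_beginscan_with_column_projection(rel, snapshot,
                                                       0, NULL, proj);
    }

    /* row-oriented AMs such as heap take the ordinary, whole-row path */
    return table_beginscan(rel, snapshot, 0, NULL);
}
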
diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 436b43f8ca..c0922ff823 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -31,6 +31,7 @@ #include "access/tableam.h" #include "executor/execdebug.h" #include "executor/nodeSeqscan.h" +#include "nodes/nodeFuncs.h" #include "utils/rel.h" static TupleTableSlot *SeqNext(SeqScanState *node); @@ -68,9 +69,20 @@ SeqNext(SeqScanState *node) * We reach here if the scan is not parallel, or if we're serially * executing a scan that was planned to be parallel. */ - scandesc = table_beginscan(node->ss.ss_currentRelation, - estate->es_snapshot, - 0, NULL); + if (table_scans_leverage_column_projection(node->ss.ss_currentRelation)) + { + bool *proj; + proj = GetNeededColumnsForScan(&node->ss, node->ss.ss_currentRelation->rd_att->natts); + scandesc = table_beginscan_with_column_projection(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL, proj); + } + else + { + scandesc = table_beginscan(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL); + } node->ss.ss_currentScanDesc = scandesc; } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 608d5adfed..6527e0d5d2 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -822,6 +822,9 @@ use_physical_tlist(PlannerInfo *root, Path *path, int flags) rel->rtekind != RTE_CTE) return false; + if (rel->rtekind == RTE_RELATION && rel->leverage_column_projection) + return false; + /* * Can't do it with inheritance cases either (mainly because Append * doesn't project; this test may be unnecessary now that diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 2405acbf6f..00d125378b 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -123,6 +123,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, */ relation = table_open(relationObjectId, NoLock); + if (relation->rd_tableam) + rel->leverage_column_projection = relation->rd_tableam->scans_leverage_column_projection; /* Temporary and unlogged relations are inaccessible during recovery. */ if (!RelationNeedsWAL(relation) && RecoveryInProgress()) ereport(ERROR, diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 99d26de7e6..b4110e4152 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -1274,6 +1274,7 @@ check_default_partition_contents(Relation parent, Relation default_rel, TableScanDesc scan; MemoryContext oldCxt; TupleTableSlot *tupslot; + bool *proj = NULL; /* Lock already taken above. 
*/ if (part_relid != RelationGetRelid(default_rel)) @@ -1330,7 +1331,16 @@ check_default_partition_contents(Relation parent, Relation default_rel, econtext = GetPerTupleExprContext(estate); snapshot = RegisterSnapshot(GetLatestSnapshot()); tupslot = table_slot_create(part_rel, &estate->es_tupleTable); - scan = table_beginscan(part_rel, snapshot, 0, NULL); + if (table_scans_leverage_column_projection(part_rel)) + { + proj = palloc0(tupslot->tts_tupleDescriptor->natts * sizeof(bool)); + GetNeededColumnsForNode((Node*)partqualstate->expr, proj, tupslot->tts_tupleDescriptor->natts); + scan = table_beginscan_with_column_projection(part_rel, snapshot, 0, NULL, proj); + } + else + { + scan = table_beginscan(part_rel, snapshot, 0, NULL); + } /* * Switch to per-tuple memory context and reset it for each tuple @@ -1360,6 +1370,9 @@ check_default_partition_contents(Relation parent, Relation default_rel, if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel)) table_close(part_rel, NoLock); /* keep the lock until commit */ + + if (proj) + pfree(proj); } } diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 2fedbc4c15..b31c0bfe00 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -2547,8 +2547,6 @@ PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot) void PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot) { - PREDICATELOCKTARGETTAG tag; - ItemPointer tid; TransactionId targetxmin; if (!SerializationNeededForRead(relation, snapshot)) @@ -2579,6 +2577,17 @@ PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot) } } + PredicateLockTID(relation, &(tuple->t_self), snapshot); +} + +void +PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot) +{ + PREDICATELOCKTARGETTAG tag; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + /* * Do quick-but-not-definitive test for a relation lock first. This will * never cause a return when the relation is *not* locked, but will @@ -2591,7 +2600,6 @@ PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot) if (PredicateLockExists(&tag)) return; - tid = &(tuple->t_self); SET_PREDICATELOCKTARGETTAG_TUPLE(tag, relation->rd_node.dbNode, relation->rd_id, @@ -4054,14 +4062,11 @@ XidIsConcurrent(TransactionId xid) * currently no known reason to call this function from an index AM. */ void -CheckForSerializableConflictOut(bool visible, Relation relation, +heap_CheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot) { TransactionId xid; - SERIALIZABLEXIDTAG sxidtag; - SERIALIZABLEXID *sxid; - SERIALIZABLEXACT *sxact; HTSV_Result htsvResult; if (!SerializationNeededForRead(relation, snapshot)) @@ -4125,6 +4130,19 @@ CheckForSerializableConflictOut(bool visible, Relation relation, Assert(TransactionIdIsValid(xid)); Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + return CheckForSerializableConflictOut(relation, xid, snapshot); +} + +void +CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot) +{ + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + SERIALIZABLEXACT *sxact; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + /* * Find top level xid. Bail out if xid is too early to be a conflict, or * if it's our own xid. @@ -4439,8 +4457,7 @@ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag) * tuple itself. 
*/ void -CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, - Buffer buffer) +CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno) { PREDICATELOCKTARGETTAG targettag; @@ -4470,22 +4487,22 @@ CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, * It is not possible to take and hold a lock across the checks for all * granularities because each target could be in a separate partition. */ - if (tuple != NULL) + if (tid != NULL) { SET_PREDICATELOCKTARGETTAG_TUPLE(targettag, relation->rd_node.dbNode, relation->rd_id, - ItemPointerGetBlockNumber(&(tuple->t_self)), - ItemPointerGetOffsetNumber(&(tuple->t_self))); + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); CheckTargetForConflictsIn(&targettag); } - if (BufferIsValid(buffer)) + if (blkno != InvalidBlockNumber) { SET_PREDICATELOCKTARGETTAG_PAGE(targettag, relation->rd_node.dbNode, relation->rd_id, - BufferGetBlockNumber(buffer)); + blkno); CheckTargetForConflictsIn(&targettag); } diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 6f1cd382d8..d914d395c9 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -163,6 +163,7 @@ typedef struct TableAmRoutine { /* this must be set to T_TableAmRoutine */ NodeTag type; + bool scans_leverage_column_projection; /* ------------------------------------------------------------------------ @@ -203,6 +204,13 @@ typedef struct TableAmRoutine ParallelTableScanDesc pscan, uint32 flags); + TableScanDesc (*scan_begin_with_column_projection)(Relation relation, + Snapshot snapshot, + int nkeys, struct ScanKeyData *key, + ParallelTableScanDesc parallel_scan, + uint32 flags, + bool *project_column); + /* * Release resources and deallocate scan. If TableScanDesc.temp_snap, * TableScanDesc.rs_snapshot needs to be unregistered. @@ -278,6 +286,13 @@ typedef struct TableAmRoutine */ void (*index_fetch_end) (struct IndexFetchTableData *data); + /* + * Set column projections for AM which leverage column projections for + * scanning. + */ + void (*index_fetch_set_column_projection) (struct IndexFetchTableData *data, + bool *project_column); + /* * Fetch tuple at `tid` into `slot`, after doing a visibility test * according to `snapshot`. If a tuple was found and passed the visibility @@ -743,6 +758,12 @@ table_beginscan(Relation rel, Snapshot snapshot, return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } +static inline bool +table_scans_leverage_column_projection(Relation relation) +{ + return relation->rd_tableam->scans_leverage_column_projection; +} + /* * Like table_beginscan(), but for scanning catalog. It'll automatically use a * snapshot appropriate for scanning catalog relations. @@ -772,6 +793,19 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } +static inline TableScanDesc +table_beginscan_with_column_projection(Relation relation, Snapshot snapshot, + int nkeys, struct ScanKeyData *key, + bool *project_column) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + Assert(relation->rd_tableam->scans_leverage_column_projection); + return relation->rd_tableam->scan_begin_with_column_projection( + relation, snapshot, nkeys, key, NULL, flags, project_column); +} + /* * table_beginscan_bm is an alternative entry point for setting up a * TableScanDesc for a bitmap heap scan. 
Although that scan technology is @@ -956,6 +990,13 @@ table_index_fetch_end(struct IndexFetchTableData *scan) scan->rel->rd_tableam->index_fetch_end(scan); } +static inline void +table_index_fetch_set_column_projection(struct IndexFetchTableData *scan, + bool *project_column) +{ + scan->rel->rd_tableam->index_fetch_set_column_projection(scan, project_column); +} + /* * Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing * a visibility test according to `snapshot`. If a tuple was found and passed diff --git a/src/include/access/zedstore_compression.h b/src/include/access/zedstore_compression.h new file mode 100644 index 0000000000..f70713a1a7 --- /dev/null +++ b/src/include/access/zedstore_compression.h @@ -0,0 +1,51 @@ +/* + * zedstore_compression.h + * internal declarations for ZedStore compression + * + * Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/access/zedstore_compression.h + */ +#ifndef ZEDSTORE_COMPRESSION_H +#define ZEDSTORE_COMPRESSION_H + +#include "storage/itemptr.h" + +typedef struct ZSDecompressContext +{ + char *buffer; + int bufsize; /* allocated size of 'buffer' */ + int uncompressedsize; + int bytesread; +} ZSDecompressContext; + +typedef struct ZSCompressContext +{ + char *uncompressedbuffer; + + int maxCompressedSize; + int maxUncompressedSize; + char *buffer; + int nitems; + int rawsize; +} ZSCompressContext; + +typedef struct ZSBtreeItem ZSBtreeItem; +typedef struct ZSCompressedBtreeItem ZSCompressedBtreeItem; +typedef struct ZSSingleBtreeItem ZSSingleBtreeItem; + +/* compression functions */ +extern void zs_compress_init(ZSCompressContext *context); +extern void zs_compress_begin(ZSCompressContext *context, int maxCompressedSize); +extern bool zs_compress_add(ZSCompressContext *context, ZSBtreeItem *item); +extern ZSCompressedBtreeItem *zs_compress_finish(ZSCompressContext *context); +extern void zs_compress_free(ZSCompressContext *context); + +/* decompression functions */ +extern void zs_decompress_init(ZSDecompressContext *context); +extern void zs_decompress_chunk(ZSDecompressContext *context, ZSCompressedBtreeItem *chunk); +extern ZSBtreeItem *zs_decompress_read_item(ZSDecompressContext *context); +extern void zs_decompress_free(ZSDecompressContext *context); + +#endif /* ZEDSTORE_COMPRESSION_H */ diff --git a/src/include/access/zedstore_internal.h b/src/include/access/zedstore_internal.h new file mode 100644 index 0000000000..8eb9f74b96 --- /dev/null +++ b/src/include/access/zedstore_internal.h @@ -0,0 +1,618 @@ +/* + * zedstore_internal.h + * internal declarations for ZedStore tables + * + * Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/access/zedstore_internal.h + */ +#ifndef ZEDSTORE_INTERNAL_H +#define ZEDSTORE_INTERNAL_H + +#include "access/tableam.h" +#include "access/zedstore_compression.h" +#include "access/zedstore_undo.h" +#include "storage/bufmgr.h" +#include "utils/datum.h" + +#define ZS_META_ATTRIBUTE_NUM 0 + +/* + * Throughout ZedStore, we pass around TIDs as uint64's, rather than ItemPointers, + * for speed. 
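 *
 * For example, with MaxZSTidOffsetNumber = 129 the conversion routines
 * below give ZSTidFromBlkOff(2, 5) = 2 * 128 + 5 = 261, and the reverse
 * mapping yields ZSTidGetBlockNumber(261) = (261 - 1) / 128 = 2 and
 * ZSTidGetOffsetNumber(261) = (261 - 1) % 128 + 1 = 5. Likewise
 * MinZSTid = ZSTidFromBlkOff(0, 1) = 1, leaving 0 free to serve as
 * InvalidZSTid.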
+ */ +typedef uint64 zstid; + +#define InvalidZSTid 0 +#define MinZSTid 1 /* blk 0, off 1 */ +#define MaxZSTid ((uint64) MaxBlockNumber << 16 | 0xffff) +/* note: if this is converted to ItemPointer, it is invalid */ +#define MaxPlusOneZSTid (MaxZSTid + 1) + +#define MaxZSTidOffsetNumber 129 + +static inline zstid +ZSTidFromBlkOff(BlockNumber blk, OffsetNumber off) +{ + Assert(off != 0); + + return (uint64) blk * (MaxZSTidOffsetNumber - 1) + off; +} + +static inline zstid +ZSTidFromItemPointer(ItemPointerData iptr) +{ + Assert(ItemPointerIsValid(&iptr)); + return ZSTidFromBlkOff(ItemPointerGetBlockNumber(&iptr), + ItemPointerGetOffsetNumber(&iptr)); +} + +static inline ItemPointerData +ItemPointerFromZSTid(zstid tid) +{ + ItemPointerData iptr; + BlockNumber blk; + OffsetNumber off; + + blk = (tid - 1) / (MaxZSTidOffsetNumber - 1); + off = (tid - 1) % (MaxZSTidOffsetNumber - 1) + 1; + + ItemPointerSet(&iptr, blk, off); + Assert(ItemPointerIsValid(&iptr)); + return iptr; +} + +static inline BlockNumber +ZSTidGetBlockNumber(zstid tid) +{ + return (BlockNumber) ((tid - 1) / (MaxZSTidOffsetNumber - 1)); +} + +static inline OffsetNumber +ZSTidGetOffsetNumber(zstid tid) +{ + return (OffsetNumber) ((tid - 1) % (MaxZSTidOffsetNumber - 1) + 1); +} + +/* + * A ZedStore table contains different kinds of pages, all in the same file. + * + * Block 0 is always a metapage. It contains the block numbers of the other + * data structures stored within the file, like the per-attribute B-trees, + * and the UNDO log. In addition, if there are overly large datums in the + * the table, they are chopped into separate "toast" pages. + */ +#define ZS_META_PAGE_ID 0xF083 +#define ZS_BTREE_PAGE_ID 0xF084 +#define ZS_UNDO_PAGE_ID 0xF085 +#define ZS_TOAST_PAGE_ID 0xF086 +#define ZS_FPM_PAGE_ID 0xF087 + +/* flags for zedstore b-tree pages */ +#define ZSBT_ROOT 0x0001 + +typedef struct ZSBtreePageOpaque +{ + AttrNumber zs_attno; + BlockNumber zs_next; + zstid zs_lokey; /* inclusive */ + zstid zs_hikey; /* exclusive */ + uint16 zs_level; /* 0 = leaf */ + uint16 zs_flags; + uint16 padding; /* padding, to put zs_page_id last */ + uint16 zs_page_id; /* always ZS_BTREE_PAGE_ID */ +} ZSBtreePageOpaque; + +#define ZSBtreePageGetOpaque(page) ((ZSBtreePageOpaque *) PageGetSpecialPointer(page)) + +/* + * Internal B-tree page layout. + * + * The "contents" of the page is an array of ZSBtreeInternalPageItem. The number + * of items can be deduced from pd_lower. + */ +typedef struct ZSBtreeInternalPageItem +{ + zstid tid; + BlockNumber childblk; +} ZSBtreeInternalPageItem; + +static inline ZSBtreeInternalPageItem * +ZSBtreeInternalPageGetItems(Page page) +{ + ZSBtreeInternalPageItem *items; + + items = (ZSBtreeInternalPageItem *) PageGetContents(page); + + return items; +} +static inline int +ZSBtreeInternalPageGetNumItems(Page page) +{ + ZSBtreeInternalPageItem *begin; + ZSBtreeInternalPageItem *end; + + begin = (ZSBtreeInternalPageItem *) PageGetContents(page); + end = (ZSBtreeInternalPageItem *) ((char *) page + ((PageHeader) page)->pd_lower); + + return end - begin; +} + +static inline bool +ZSBtreeInternalPageIsFull(Page page) +{ + PageHeader phdr = (PageHeader) page; + + return phdr->pd_upper - phdr->pd_lower < sizeof(ZSBtreeInternalPageItem); +} + +/* + * Leaf B-tree page layout + * + * Leaf pages are packed with ZSBtreeItems. There are three kinds of items: + * + * 1. Single item, holds one tuple (or rather, one datum). + * + * 2. 
"Array item", holds multiple datums, with consecutive TIDs and the same + * visibility information. An array item saves space compared to multiple + * single items, by leaving out repetitive UNDO and TID fields. An array + * item cannot mix NULLs and non-NULLs, so the ZSBT_NULL flag applies to + * all elements. + * + * 3. "Compressed item", which can hold multiple single or array items. + * + * A single or array item can furthermore be marked as DEAD. A dead item + * prevents the TID (or TID range, for an array item) from being reused. It's + * used during VACUUM, to mark items for which there are no index pointers + * anymore. But it cannot be removed until the undo record has been trimmed + * away, because if the TID was reused for a new record, vacuum might remove + * the new tuple version instead. After t_undo_ptr becomes older than "oldest + * undo ptr", the item can be removed and the TID recycled. + * + * TODO: squeeze harder: eliminate padding, use high bits of t_tid for flags or size + */ +typedef struct ZSBtreeItem +{ + zstid t_tid; + uint16 t_size; + uint16 t_flags; +} ZSBtreeItem; + +typedef struct ZSSingleBtreeItem +{ + /* these fields must match ZSBtreeItem */ + zstid t_tid; + uint16 t_size; + uint16 t_flags; + + ZSUndoRecPtr t_undo_ptr; + + char t_payload[FLEXIBLE_ARRAY_MEMBER]; +} ZSSingleBtreeItem; + +typedef struct ZSArrayBtreeItem +{ + /* these fields must match ZSBtreeItem */ + zstid t_tid; + uint16 t_size; + uint16 t_flags; + + uint16 t_nelements; + ZSUndoRecPtr t_undo_ptr; + + char t_payload[FLEXIBLE_ARRAY_MEMBER]; +} ZSArrayBtreeItem; + +typedef struct ZSCompressedBtreeItem +{ + /* these fields must match ZSBtreeItem */ + zstid t_tid; + uint16 t_size; + uint16 t_flags; + + uint16 t_uncompressedsize; + zstid t_lasttid; /* inclusive */ + + char t_payload[FLEXIBLE_ARRAY_MEMBER]; +} ZSCompressedBtreeItem; + +#define ZSBT_COMPRESSED 0x0001 +#define ZSBT_ARRAY 0x0002 +#define ZSBT_NULL 0x0010 +#define ZSBT_DEAD 0x0020 + +/* + * Get the last TID that the given item spans. + * + * For a single item, it's the TID of the item. For an array item, it's the + * TID of the last element. For a compressed item, it's the last TID of the + * last item it contains (which is stored explicitly in the item header). + */ +static inline zstid +zsbt_item_lasttid(ZSBtreeItem *item) +{ + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + return ((ZSCompressedBtreeItem *) item)->t_lasttid; + else if ((item->t_flags & ZSBT_ARRAY) != 0) + { + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + return aitem->t_tid + aitem->t_nelements - 1; + } + else + return item->t_tid; +} + +static inline ZSUndoRecPtr +zsbt_item_undoptr(ZSBtreeItem *item) +{ + if ((item->t_flags & ZSBT_COMPRESSED) != 0) + elog(ERROR, "cannot get undo pointer from compressed item"); + else if ((item->t_flags & ZSBT_ARRAY) != 0) + { + ZSArrayBtreeItem *aitem = (ZSArrayBtreeItem *) item; + return aitem->t_undo_ptr; + } + else + { + ZSSingleBtreeItem *sitem = (ZSSingleBtreeItem *) item; + return sitem->t_undo_ptr; + } +} + +/* + * Toast page layout. + * + * When an overly large datum is stored, it is divided into chunks, and each + * chunk is stored on a dedicated toast page. The toast pages of a datum form + * list, each page has a next/prev pointer. + */ +/* + * Maximum size of an individual untoasted Datum stored in ZedStore. Datums + * larger than this need to be toasted. + * + * A datum needs to fit on a B-tree page, with page and item headers. + * + * XXX: 500 accounts for all the headers. Need to compute this correctly... 
+ */ +#define MaxZedStoreDatumSize (BLCKSZ - 500) + +typedef struct ZSToastPageOpaque +{ + AttrNumber zs_attno; + + /* these are only set on the first page. */ + zstid zs_tid; + uint32 zs_total_size; + + uint32 zs_slice_offset; + BlockNumber zs_prev; + BlockNumber zs_next; + uint16 zs_flags; + uint16 padding1; /* padding, to put zs_page_id last */ + uint16 padding2; /* padding, to put zs_page_id last */ + uint16 zs_page_id; +} ZSToastPageOpaque; + +/* + * "Toast pointer" of a datum that's stored in zedstore toast pages. + * + * This looks somewhat like a normal TOAST pointer, but we mustn't let these + * escape out of zedstore code, because the rest of the system doesn't know + * how to deal with them. + * + * This must look like varattrib_1b_e! + */ +typedef struct varatt_zs_toastptr +{ + /* varattrib_1b_e */ + uint8 va_header; + uint8 va_tag; /* VARTAG_ZEDSTORE in zedstore toast datums */ + + /* first block */ + BlockNumber zst_block; +} varatt_zs_toastptr; + +/* + * va_tag value. this should be distinguishable from the values in + * vartag_external + */ +#define VARTAG_ZEDSTORE 10 + +/* + * Versions of datumGetSize and datumCopy that know about ZedStore-toasted + * datums. + */ +static inline Size +zs_datumGetSize(Datum value, bool typByVal, int typLen) +{ + if (typLen > 0) + return typLen; + else if (typLen == -1) + { + if (VARATT_IS_EXTERNAL(value) && VARTAG_EXTERNAL(value) == VARTAG_ZEDSTORE) + return sizeof(varatt_zs_toastptr); + else + return VARSIZE_ANY(value); + } + else + return datumGetSize(value, typByVal, typLen); +} + +static inline Datum +zs_datumCopy(Datum value, bool typByVal, int typLen) +{ + if (typLen < 0 && VARATT_IS_EXTERNAL(value) && VARTAG_EXTERNAL(value) == VARTAG_ZEDSTORE) + { + char *result = palloc(sizeof(varatt_zs_toastptr)); + + memcpy(result, DatumGetPointer(value), sizeof(varatt_zs_toastptr)); + + return PointerGetDatum(result); + } + else + return datumCopy(value, typByVal, typLen); +} + +/* + * Block 0 on every ZedStore table is a metapage. + * + * It contains a directory of b-tree roots for each attribute, and lots more. + */ +#define ZS_META_BLK 0 + +/* + * The metapage stores one of these for each attribute. + */ +typedef struct ZSRootDirItem +{ + BlockNumber root; +} ZSRootDirItem; + +typedef struct ZSMetaPage +{ + int nattributes; + ZSRootDirItem tree_root_dir[FLEXIBLE_ARRAY_MEMBER]; /* one for each attribute */ +} ZSMetaPage; + +/* + * it's not clear what we should store in the "opaque" special area, and what + * as page contents, on a metapage. But have at least the page_id field here, + * so that tools like pg_filedump can recognize it as a zedstore metapage. + */ +typedef struct ZSMetaPageOpaque +{ + uint64 zs_undo_counter; + BlockNumber zs_undo_head; + BlockNumber zs_undo_tail; + ZSUndoRecPtr zs_undo_oldestptr; + + BlockNumber zs_fpm_root; /* root of the Free Page Map */ + + uint16 zs_flags; + uint16 zs_page_id; +} ZSMetaPageOpaque; + + +/* + * Holds the state of an in-progress scan on a zedstore btree. + */ +typedef struct ZSBtreeScan +{ + Relation rel; + AttrNumber attno; + TupleDesc tupledesc; + + /* + * memory context that should be used for any allocations that go with the scan, + * like the decompression buffers. This isn't a dedicated context, you must still + * free everything to avoid leaking! We need this because the getnext function + * might be called in a short-lived memory context that is reset between calls. 
+ */ + MemoryContext context; + + bool active; + Buffer lastbuf; + OffsetNumber lastoff; + zstid nexttid; + zstid endtid; + Snapshot snapshot; + + /* in the "real" UNDO-log, this would probably be a global variable */ + ZSUndoRecPtr recent_oldest_undo; + + /* should this scan do predicate locking? Or check for conflicts? */ + bool serializable; + bool acquire_predicate_tuple_locks; + + /* + * if we have remaining items from a compressed container tuple, they + * are kept in the decompressor context, and 'has_decompressed' is true. + */ + ZSDecompressContext decompressor; + bool has_decompressed; + + /* + * These fields are used, if the scan is processing an array tuple. + * And also for a single-item tuple - it works just like a single-element + * array tuple. + */ + ZSUndoRecPtr array_undoptr; + int array_datums_allocated_size; + Datum *array_datums; + Datum *array_next_datum; + int array_elements_left; + bool array_isnull; + +} ZSBtreeScan; + +static inline Form_pg_attribute +ZSBtreeScanGetAttInfo(ZSBtreeScan *scan) +{ + return TupleDescAttr(scan->tupledesc, scan->attno - 1); +} + +/* + * zs_split_stack is used during page split, or page merge, to keep track + * of all the modified pages. The page split (or merge) routines don't + * modify pages directly, but they construct a list of 'zs_split_stack' + * entries. Each entry holds a buffer, and a temporary in-memory copy of + * a page that should be written to the buffer, once everything is completed. + * All the buffers are exclusively-locked. + */ +typedef struct zs_split_stack zs_split_stack; + +struct zs_split_stack +{ + zs_split_stack *next; + + Buffer buf; + Page page; /* temp in-memory copy of page */ + bool recycle; /* should the page be added to the FPM? */ +}; + +/* prototypes for functions in zedstore_tidpage.c */ +extern void zsbt_tid_begin_scan(Relation rel, + zstid starttid, zstid endtid, Snapshot snapshot, ZSBtreeScan *scan); +extern void zsbt_tid_reset_scan(ZSBtreeScan *scan, zstid starttid); +extern void zsbt_tid_end_scan(ZSBtreeScan *scan); +extern zstid zsbt_tid_scan_next(ZSBtreeScan *scan); + +extern void zsbt_tid_multi_insert(Relation rel, + zstid *tids, int ndatums, + TransactionId xid, CommandId cid, uint32 speculative_token, ZSUndoRecPtr prevundoptr); +extern TM_Result zsbt_tid_delete(Relation rel, zstid tid, + TransactionId xid, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, bool wait, + TM_FailureData *hufd, bool changingPart); +extern TM_Result zsbt_tid_update(Relation rel, zstid otid, + TransactionId xid, + CommandId cid, bool key_update, Snapshot snapshot, Snapshot crosscheck, + bool wait, TM_FailureData *hufd, zstid *newtid_p); +extern void zsbt_tid_clear_speculative_token(Relation rel, zstid tid, uint32 spectoken, bool forcomplete); +extern void zsbt_tid_mark_dead(Relation rel, zstid tid, ZSUndoRecPtr undoptr); +extern TM_Result zsbt_tid_lock(Relation rel, zstid tid, + TransactionId xid, CommandId cid, + LockTupleMode lockmode, Snapshot snapshot, TM_FailureData *hufd, zstid *next_tid); +extern void zsbt_tid_undo_deletion(Relation rel, zstid tid, ZSUndoRecPtr undoptr); +extern zstid zsbt_get_last_tid(Relation rel); +extern void zsbt_find_latest_tid(Relation rel, zstid *tid, Snapshot snapshot); + +/* prototypes for functions in zedstore_attrpage.c */ +extern void zsbt_attr_begin_scan(Relation rel, TupleDesc tdesc, AttrNumber attno, + zstid starttid, zstid endtid, ZSBtreeScan *scan); +extern void zsbt_attr_reset_scan(ZSBtreeScan *scan, zstid starttid); +extern void zsbt_attr_end_scan(ZSBtreeScan 
*scan); +extern bool zsbt_attr_scan_next(ZSBtreeScan *scan); + +extern void zsbt_attr_multi_insert(Relation rel, AttrNumber attno, + Datum *datums, bool *isnulls, zstid *tids, int ndatums); + +/* prototypes for functions in zedstore_btree.c */ +extern zs_split_stack *zsbt_newroot(Relation rel, AttrNumber attno, int level, List *downlinks); +extern zs_split_stack *zsbt_insert_downlinks(Relation rel, AttrNumber attno, + zstid leftlokey, BlockNumber leftblkno, int level, + List *downlinks); +extern void zsbt_attr_remove(Relation rel, AttrNumber attno, zstid tid); +extern zs_split_stack *zsbt_unlink_page(Relation rel, AttrNumber attno, Buffer buf, int level); +extern Buffer zsbt_descend(Relation rel, AttrNumber attno, zstid key, int level, bool readonly); +extern bool zsbt_page_is_expected(Relation rel, AttrNumber attno, zstid key, int level, Buffer buf); + +static inline void +zsbt_scan_skip(ZSBtreeScan *scan, zstid tid) +{ + if (tid > scan->nexttid) + { + if (scan->array_elements_left > 0) + { + int64 skip = tid - scan->nexttid - 1; + + if (skip < scan->array_elements_left) + { + scan->array_next_datum += skip; + scan->array_elements_left -= skip; + } + else + { + scan->array_elements_left = 0; + } + } + scan->nexttid = tid; + } +} + +/* + * Return the value of row identified with 'tid' in a scan. + * + * 'tid' must be greater than any previously returned item. + * + * Returns true if a matching item is found, false otherwise. After + * a false return, it's OK to call this again with another greater TID. + */ +static inline bool +zsbt_scan_next_fetch(ZSBtreeScan *scan, Datum *datum, bool *isnull, zstid tid) +{ + if (!scan->active) + return false; + + /* skip to the given tid. */ + zsbt_scan_skip(scan, tid); + + /* + * Fetch the next item from the scan. The item we're looking for might + * already be in scan->array_*. + */ + do + { + if (tid < scan->nexttid) + { + /* The next item from this scan is beyond the TID we're looking for. */ + return false; + } + + if (scan->array_elements_left > 0) + { + *isnull = scan->array_isnull; + *datum = *(scan->array_next_datum++); + scan->nexttid++; + scan->array_elements_left--; + return true; + } + /* Advance the scan, and check again. 
*/ + } while (zsbt_attr_scan_next(scan)); + + return false; +} + +extern PGDLLIMPORT const TupleTableSlotOps TTSOpsZedstore; + +/* prototypes for functions in zedstore_meta.c */ +extern void zsmeta_initmetapage(Relation rel); +extern BlockNumber zsmeta_get_root_for_attribute(Relation rel, AttrNumber attno, bool for_update); +extern void zsmeta_update_root_for_attribute(Relation rel, AttrNumber attno, Buffer metabuf, BlockNumber rootblk); +extern void zsmeta_add_root_for_new_attributes(Relation rel, Page page); + +/* prototypes for functions in zedstore_visibility.c */ +extern TM_Result zs_SatisfiesUpdate(Relation rel, Snapshot snapshot, + ZSUndoRecPtr recent_oldest_undo, ZSBtreeItem *item, + LockTupleMode mode, + bool *undo_record_needed, + TM_FailureData *tmfd, zstid *next_tid); +extern bool zs_SatisfiesVisibility(ZSBtreeScan *scan, ZSBtreeItem *item, + TransactionId *obsoleting_xid, zstid *next_tid); + +/* prototypes for functions in zedstore_toast.c */ +extern Datum zedstore_toast_datum(Relation rel, AttrNumber attno, Datum value); +extern void zedstore_toast_finish(Relation rel, AttrNumber attno, Datum toasted, zstid tid); +extern Datum zedstore_toast_flatten(Relation rel, AttrNumber attno, zstid tid, Datum toasted); + +/* prototypes for functions in zedstore_freepagemap.c */ +extern Buffer zspage_getnewbuf(Relation rel, Buffer metabuf); +extern Buffer zspage_extendrel_newbuf(Relation rel); +extern void zspage_delete_page(Relation rel, Buffer buf); + +/* prototypes for functions in zedstore_utils.c */ +extern zs_split_stack *zs_new_split_stack_entry(Buffer buf, Page page); +extern void zs_apply_split_changes(Relation rel, zs_split_stack *stack); + +#endif /* ZEDSTORE_INTERNAL_H */ diff --git a/src/include/access/zedstore_undo.h b/src/include/access/zedstore_undo.h new file mode 100644 index 0000000000..2b0c5406a6 --- /dev/null +++ b/src/include/access/zedstore_undo.h @@ -0,0 +1,171 @@ +/* + * zedstore_undo.h + * internal declarations for ZedStore undo logging + * + * Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/access/zedstore_undo.h + */ +#ifndef ZEDSTORE_UNDO_H +#define ZEDSTORE_UNDO_H + +#include "commands/vacuum.h" +#include "utils/relcache.h" + +/* this must match the definition in zedstore_internal.h */ +typedef uint64 zstid; + +/* + * An UNDO-pointer. + * + * In the "real" UNDO-logging work from EDB, an UndoRecPtr is only 64 bits. + * But we make life easier for us, by encoding more information in it. + * + * 'counter' is a number that's incremented every time a new undo record is + * created. It can be used to determine if an undo pointer is too old to be + * of interest to anyone. + * + * 'blkno' and 'offset' are the physical location of the UNDO record. They + * can be used to easily fetch a given record. + */ +typedef struct +{ + uint64 counter; + BlockNumber blkno; + int32 offset; +} ZSUndoRecPtr; + +/* TODO: assert that blkno and offset match, too, if counter matches */ +#define ZSUndoRecPtrEquals(a, b) ((a).counter == (b).counter) + +#define INVALID_SPECULATIVE_TOKEN 0 + +typedef struct +{ + int16 size; /* size of this record, including header */ + uint8 type; /* ZSUNDO_TYPE_* */ + ZSUndoRecPtr undorecptr; + TransactionId xid; + CommandId cid; + zstid tid; + uint32 speculative_token; /* Only used for INSERT records */ + + /* + * UNDO-record of the inserter. This is needed if a row is inserted, and + * deleted, and there are some snapshots active don't don't consider even + * the insertion as visible. 
+ * + * This is also used in Insert records, if the record represents the + * new tuple version of an UPDATE, rather than an INSERT. It's needed to + * dig into possible KEY SHARE locks held on the row, which didn't prevent + * the tuple from being updated. + */ + ZSUndoRecPtr prevundorec; +} ZSUndoRec; + +#define ZSUNDO_TYPE_INSERT 1 +#define ZSUNDO_TYPE_DELETE 2 +#define ZSUNDO_TYPE_UPDATE 3 +#define ZSUNDO_TYPE_TUPLE_LOCK 4 + +/* + * Type-specific record formats. + * + * We store similar info as zheap for INSERT/UPDATE/DELETE. See zheap README. + */ +typedef struct +{ + ZSUndoRec rec; + zstid endtid; /* inclusive */ + +} ZSUndoRec_Insert; + +typedef struct +{ + ZSUndoRec rec; + + bool changedPart; /* tuple was moved to a different partition by UPDATE */ + + /* + * TODO: It might be good to move the deleted tuple to the undo-log, so + * that the space can immediately be reused. But currently, we don't do + * that. (or even better, move the old tuple to the undo-log lazily, if + * the space is needed for a new insertion, before the old tuple becomes + * recyclable. + */ +} ZSUndoRec_Delete; + +/* + * This is used for an UPDATE, to mark the old tuple version as updated. + * It's the same as a deletion, except this stores the TID of the new tuple + * version, so it can be followed in READ COMMITTED mode. + * + * The ZSUndoRec_Insert record is used for the insertion of the new tuple + * version. + */ +typedef struct +{ + ZSUndoRec rec; + + bool key_update; /* were key columns updated? + * (for conflicting with FOR KEY SHARE) */ + + zstid newtid; + +} ZSUndoRec_Update; + +/* + * This is used when a tuple is locked e.g. with SELECT FOR UPDATE. + * The tuple isn't really changed in any way, but the undo record gives + * a place to store the XID of the locking transaction. + * + * In case of a FOR SHARE lock, there can be multiple lockers. Each locker + * will create a new undo record with its own XID that points to the previous + * record. So the records will form a chain, leading finally to the insertion + * record (or beyond the UNDO horizon, meaning the tuple's insertion is visible + * to everyone) + */ +typedef struct +{ + ZSUndoRec rec; + + /* + * XXX: Is it OK to store this on disk? The enum values could change. Then + * again, no one should care about old locks that were acquired before + * last restart. Except with two-phase commit prepared transactions. 
+ */ + LockTupleMode lockmode; +} ZSUndoRec_TupleLock; + +typedef struct +{ + BlockNumber next; + uint16 padding; /* padding, to put zs_page_id last */ + uint16 zs_page_id; /* ZS_UNDO_PAGE_ID */ +} ZSUndoPageOpaque; + +static inline void +ZSUndoRecPtrInitialize(ZSUndoRecPtr *uptr) +{ + uptr->blkno = InvalidBlockNumber; + uptr->offset = InvalidOffsetNumber; + uptr->counter = 0; +} + +static inline bool +IsZSUndoRecPtrValid(ZSUndoRecPtr *uptr) +{ + return (uptr->blkno != InvalidBlockNumber && + uptr->offset != InvalidOffsetNumber); +} + +/* prototypes for functions in zstore_undo.c */ +extern ZSUndoRecPtr zsundo_insert(Relation rel, ZSUndoRec *rec); +extern ZSUndoRec *zsundo_fetch(Relation rel, ZSUndoRecPtr undorecptr); +extern void zsundo_clear_speculative_token(Relation rel, ZSUndoRecPtr undoptr); +extern void zsundo_vacuum(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy, + TransactionId OldestXmin); +extern ZSUndoRecPtr zsundo_get_oldest_undo_ptr(Relation rel); + +#endif /* ZEDSTORE_UNDO_H */ diff --git a/src/include/catalog/pg_am.dat b/src/include/catalog/pg_am.dat index 393b41dd68..f370f63460 100644 --- a/src/include/catalog/pg_am.dat +++ b/src/include/catalog/pg_am.dat @@ -33,5 +33,8 @@ { oid => '3580', oid_symbol => 'BRIN_AM_OID', descr => 'block range index (BRIN) access method', amname => 'brin', amhandler => 'brinhandler', amtype => 'i' }, +{ oid => '6668', oid_symbol => 'ZEDSTORE_TABLE_AM_OID', + descr => 'zedstore table access method', + amname => 'zedstore', amhandler => 'zedstore_tableam_handler', amtype => 't' }, ] diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 87335248a0..1df6febeca 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -873,6 +873,11 @@ proname => 'heap_tableam_handler', provolatile => 'v', prorettype => 'table_am_handler', proargtypes => 'internal', prosrc => 'heap_tableam_handler' }, +{ oid => '6669', oid_symbol => 'ZEDSTORE_TABLE_AM_HANDLER_OID', + descr => 'column-oriented table access method handler', + proname => 'zedstore_tableam_handler', provolatile => 'v', + prorettype => 'table_am_handler', proargtypes => 'internal', + prosrc => 'zedstore_tableam_handler' }, # Index access method handlers { oid => '330', descr => 'btree index access method handler', @@ -10677,4 +10682,23 @@ proname => 'pg_partition_root', prorettype => 'regclass', proargtypes => 'regclass', prosrc => 'pg_partition_root' }, +# zedstore inspection functions +{ oid => '7000', descr => 'get zedstore page type', + proname => 'pg_zs_page_type', prorettype => 'text', + proargtypes => 'regclass int8', prosrc => 'pg_zs_page_type' }, +{ oid => '7001', descr => 'show stats about active zedstore undo pages', + proname => 'pg_zs_undo_pages', prorows => '1000', proretset => 't', + prorettype => 'record', proargtypes => 'regclass', + proallargtypes => '{regclass,int8,int4,int4,int8,int8}', + proargmodes => '{i,o,o,o,o,o}', + proargnames => '{relid,blkno,nrecords,freespace,firstrecptr,lastrecptr}', + prosrc => 'pg_zs_undo_pages' }, +{ oid => '7002', descr => 'show stats about zedstore btree pages', + proname => 'pg_zs_btree_pages', prorows => '1000', proretset => 't', + prorettype => 'record', proargtypes => 'regclass', + proallargtypes => '{regclass,int8,int8,int4,int4,int8,int8,int4,int4,int4,int4,int4}', + proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{relid,blkno,nextblk,attno,level,lokey,hikey,nitems,ncompressed,totalsz,uncompressedsz,freespace}', + prosrc => 'pg_zs_btree_pages' }, + ] 
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 88134bcc71..2317c688e8 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -597,5 +597,6 @@ extern void CheckCmdReplicaIdentity(Relation rel, CmdType cmd); extern void CheckSubscriptionRelkind(char relkind, const char *nspname, const char *relname); - +extern void GetNeededColumnsForNode(Node *expr, bool *mask, int n); +extern bool *GetNeededColumnsForScan(ScanState *scanstate, int ncol); #endif /* EXECUTOR_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 64122bc1e3..cd5b26118b 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1423,6 +1423,7 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; Buffer ioss_VMBuffer; Size ioss_PscanLen; + TupleTableSlot *ioss_TableSlot; } IndexOnlyScanState; /* ---------------- diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 4b7703d478..b413bb9f78 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -681,6 +681,7 @@ typedef struct RelOptInfo PlannerInfo *subroot; /* if subquery */ List *subplan_params; /* if subquery */ int rel_parallel_workers; /* wanted number of parallel workers */ + bool leverage_column_projection; /* Information about foreign tables and foreign joins */ Oid serverid; /* identifies server for the table or join */ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 6cd4cfed0a..ad7870a0bb 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -350,6 +350,9 @@ /* Define to 1 if you have the `ldap_r' library (-lldap_r). */ #undef HAVE_LIBLDAP_R +/* Define to 1 if you have the `lz4' library (-llz4). */ +#undef HAVE_LIBLZ4 + /* Define to 1 if you have the `m' library (-lm). */ #undef HAVE_LIBM @@ -389,6 +392,9 @@ /* Define to 1 if `long long int' works and is 64 bits. */ #undef HAVE_LONG_LONG_INT_64 +/* Define to 1 if you have the header file. */ +#undef HAVE_LZ4_H + /* Define to 1 if you have the header file. */ #undef HAVE_MBARRIER_H @@ -932,6 +938,9 @@ /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */ #undef USE_LLVM +/* Define to 1 to build with LZ4 support. (--with-lz4) */ +#undef USE_LZ4 + /* Define to select named POSIX semaphores. 
*/ #undef USE_NAMED_POSIX_SEMAPHORES diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h index 376245ecd7..866c3a76f9 100644 --- a/src/include/storage/predicate.h +++ b/src/include/storage/predicate.h @@ -58,15 +58,18 @@ extern void RegisterPredicateLockingXid(TransactionId xid); extern void PredicateLockRelation(Relation relation, Snapshot snapshot); extern void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot); extern void PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot); +extern void PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot); extern void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno); extern void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno); extern void TransferPredicateLocksToHeapRelation(Relation relation); extern void ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe); /* conflict detection (may also trigger rollback) */ -extern void CheckForSerializableConflictOut(bool valid, Relation relation, HeapTuple tuple, - Buffer buffer, Snapshot snapshot); -extern void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer); +extern void heap_CheckForSerializableConflictOut(bool valid, Relation relation, HeapTuple tuple, + Buffer buffer, Snapshot snapshot); +extern void CheckForSerializableConflictOut(Relation relation, TransactionId xid, + Snapshot snapshot); +extern void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno); extern void CheckTableForSerializableConflictIn(Relation relation); /* final rollback checking */ diff --git a/src/test/isolation/specs/read-only-anomaly-2.spec b/src/test/isolation/specs/read-only-anomaly-2.spec index 9812f49ee4..2b17fcb521 100644 --- a/src/test/isolation/specs/read-only-anomaly-2.spec +++ b/src/test/isolation/specs/read-only-anomaly-2.spec @@ -18,13 +18,15 @@ teardown } session "s1" -setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; } +setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; + SET enable_seqscan=off; } step "s1ry" { SELECT balance FROM bank_account WHERE id = 'Y'; } step "s1wy" { UPDATE bank_account SET balance = 20 WHERE id = 'Y'; } step "s1c" { COMMIT; } session "s2" -setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; } +setup { BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; + SET enable_seqscan=off; } step "s2rx" { SELECT balance FROM bank_account WHERE id = 'X'; } step "s2ry" { SELECT balance FROM bank_account WHERE id = 'Y'; } step "s2wx" { UPDATE bank_account SET balance = -11 WHERE id = 'X'; } diff --git a/src/test/regress/expected/.gitignore b/src/test/regress/expected/.gitignore index 93c56c85a0..0eb6984372 100644 --- a/src/test/regress/expected/.gitignore +++ b/src/test/regress/expected/.gitignore @@ -5,5 +5,6 @@ /largeobject.out /largeobject_1.out /misc.out +/misc_1.out /security_label.out /tablespace.out diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out new file mode 100644 index 0000000000..09d60af3b7 --- /dev/null +++ b/src/test/regress/expected/alter_table_1.out @@ -0,0 +1,3997 @@ +-- +-- ALTER_TABLE +-- +-- Clean up in case a prior regression run failed +SET client_min_messages TO 'warning'; +DROP ROLE IF EXISTS regress_alter_table_user1; +RESET client_min_messages; +CREATE USER regress_alter_table_user1; +-- +-- add attribute +-- +CREATE TABLE attmp (initial int4); +COMMENT ON TABLE attmp_wrong IS 'table 
comment'; +ERROR: relation "attmp_wrong" does not exist +COMMENT ON TABLE attmp IS 'table comment'; +COMMENT ON TABLE attmp IS NULL; +ALTER TABLE attmp ADD COLUMN xmin integer; -- fails +ERROR: column name "xmin" conflicts with a system column name +ALTER TABLE attmp ADD COLUMN a int4 default 3; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +DROP TABLE attmp; +-- the wolf bug - schema mods caused inconsistent row descriptors +CREATE TABLE attmp ( + initial int4 +); +ALTER TABLE attmp ADD COLUMN a int4; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', 
'(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +CREATE INDEX attmp_idx ON attmp (a, (d + e), b); +ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; +ERROR: column number must be in range from 1 to 32767 +LINE 1: ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; + ^ +ALTER INDEX attmp_idx ALTER COLUMN 1 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "a" of index "attmp_idx" +HINT: Alter statistics on table column instead. +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS 1000; +\d+ attmp_idx + Index "public.attmp_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+------------------+------+------------+---------+-------------- + a | integer | yes | a | plain | + expr | double precision | yes | (d + e) | plain | 1000 + b | cstring | yes | b | plain | +btree, for table "public.attmp" + +ALTER INDEX attmp_idx ALTER COLUMN 3 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "b" of index "attmp_idx" +HINT: Alter statistics on table column instead. 
+ALTER INDEX attmp_idx ALTER COLUMN 4 SET STATISTICS 1000; +ERROR: column number 4 of relation "attmp_idx" does not exist +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS -1; +DROP TABLE attmp; +-- +-- rename - check on both non-temp and temp tables +-- +CREATE TABLE attmp (regtable int); +CREATE TEMP TABLE attmp (attmptable int); +ALTER TABLE attmp RENAME TO attmp_new; +SELECT * FROM attmp; + regtable +---------- +(0 rows) + +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +ALTER TABLE attmp RENAME TO attmp_new2; +SELECT * FROM attmp; -- should fail +ERROR: relation "attmp" does not exist +LINE 1: SELECT * FROM attmp; + ^ +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +SELECT * FROM attmp_new2; + regtable +---------- +(0 rows) + +DROP TABLE attmp_new; +DROP TABLE attmp_new2; +-- check rename of partitioned tables and indexes also +CREATE TABLE part_attmp (a int primary key) partition by range (a); +CREATE TABLE part_attmp1 PARTITION OF part_attmp FOR VALUES FROM (0) TO (100); +ALTER INDEX part_attmp_pkey RENAME TO part_attmp_index; +ALTER INDEX part_attmp1_pkey RENAME TO part_attmp1_index; +ALTER TABLE part_attmp RENAME TO part_at2tmp; +ALTER TABLE part_attmp1 RENAME TO part_at2tmp1; +SET ROLE regress_alter_table_user1; +ALTER INDEX part_attmp_index RENAME TO fail; +ERROR: must be owner of index part_attmp_index +ALTER INDEX part_attmp1_index RENAME TO fail; +ERROR: must be owner of index part_attmp1_index +ALTER TABLE part_at2tmp RENAME TO fail; +ERROR: must be owner of table part_at2tmp +ALTER TABLE part_at2tmp1 RENAME TO fail; +ERROR: must be owner of table part_at2tmp1 +RESET ROLE; +DROP TABLE part_at2tmp; +-- +-- check renaming to a table's array type's autogenerated name +-- (the array type's name should get out of the way) +-- +CREATE TABLE attmp_array (id int); +CREATE TABLE attmp_array2 (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = 'attmp_array2[]'::regtype; + typname +--------------- + _attmp_array2 +(1 row) + +ALTER TABLE attmp_array2 RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +---------------- + ___attmp_array +(1 row) + +DROP TABLE _attmp_array; +DROP TABLE attmp_array; +-- renaming to table's own array type's name is an interesting corner case +CREATE TABLE attmp_array (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +ALTER TABLE attmp_array RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +DROP TABLE _attmp_array; +-- ALTER TABLE ... 
RENAME on non-table relations +-- renaming indexes (FIXME: this should probably test the index's functionality) +ALTER INDEX IF EXISTS __onek_unique1 RENAME TO attmp_onek_unique1; +NOTICE: relation "__onek_unique1" does not exist, skipping +ALTER INDEX IF EXISTS __attmp_onek_unique1 RENAME TO onek_unique1; +NOTICE: relation "__attmp_onek_unique1" does not exist, skipping +ALTER INDEX onek_unique1 RENAME TO attmp_onek_unique1; +ALTER INDEX attmp_onek_unique1 RENAME TO onek_unique1; +SET ROLE regress_alter_table_user1; +ALTER INDEX onek_unique1 RENAME TO fail; -- permission denied +ERROR: must be owner of index onek_unique1 +RESET ROLE; +-- renaming views +CREATE VIEW attmp_view (unique1) AS SELECT unique1 FROM tenk1; +ALTER TABLE attmp_view RENAME TO attmp_view_new; +SET ROLE regress_alter_table_user1; +ALTER VIEW attmp_view_new RENAME TO fail; -- permission denied +ERROR: must be owner of view attmp_view_new +RESET ROLE; +-- hack to ensure we get an indexscan here +set enable_seqscan to off; +set enable_bitmapscan to off; +-- 5 values, sorted +SELECT unique1 FROM tenk1 WHERE unique1 < 5; + unique1 +--------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +reset enable_seqscan; +reset enable_bitmapscan; +DROP VIEW attmp_view_new; +-- toast-like relation name +alter table stud_emp rename to pg_toast_stud_emp; +alter table pg_toast_stud_emp rename to stud_emp; +-- renaming index should rename constraint as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +ALTER INDEX onek_unique1_constraint RENAME TO onek_unique1_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraint +ALTER TABLE onek ADD CONSTRAINT onek_check_constraint CHECK (unique1 >= 0); +ALTER TABLE onek RENAME CONSTRAINT onek_check_constraint TO onek_check_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_check_constraint_foo; +-- renaming constraint should rename index as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +DROP INDEX onek_unique1_constraint; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint because constraint onek_unique1_constraint on table onek requires it +HINT: You can drop constraint onek_unique1_constraint on table onek instead. +ALTER TABLE onek RENAME CONSTRAINT onek_unique1_constraint TO onek_unique1_constraint_foo; +DROP INDEX onek_unique1_constraint_foo; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint_foo because constraint onek_unique1_constraint_foo on table onek requires it +HINT: You can drop constraint onek_unique1_constraint_foo on table onek instead. +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraints vs. 
inheritance +CREATE TABLE constraint_rename_test (a int CONSTRAINT con1 CHECK (a > 0), b int, c int); +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1" CHECK (a > 0) + +CREATE TABLE constraint_rename_test2 (a int CONSTRAINT con1 CHECK (a > 0), d int) INHERITS (constraint_rename_test); +NOTICE: merging column "a" with inherited definition +NOTICE: merging constraint "con1" with inherited definition +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test2 RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: cannot rename inherited constraint "con1" +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: inherited constraint "con1" must be renamed in child tables too +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con2 CHECK (b > 0) NO INHERIT; +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con2 TO con2foo; -- ok +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con2foo TO con2bar; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con3 PRIMARY KEY (a); +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con3 TO con3foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | +Indexes: + "con3foo" PRIMARY KEY, btree (a) +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +DROP TABLE constraint_rename_test2; +DROP TABLE constraint_rename_test; +ALTER TABLE IF EXISTS constraint_not_exist RENAME CONSTRAINT con3 TO con3foo; -- ok +NOTICE: relation "constraint_not_exist" does not exist, skipping +ALTER TABLE IF EXISTS constraint_rename_test ADD CONSTRAINT con4 UNIQUE (a); +NOTICE: relation "constraint_rename_test" does not exist, skipping +-- renaming constraints with cache reset of target relation +CREATE TABLE constraint_rename_cache (a int, + CONSTRAINT chk_a CHECK (a > 0), + PRIMARY KEY (a)); +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT chk_a TO chk_a_new; +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT constraint_rename_cache_pkey TO constraint_rename_pkey_new; +CREATE TABLE like_constraint_rename_cache + (LIKE constraint_rename_cache INCLUDING ALL); +\d like_constraint_rename_cache + Table "public.like_constraint_rename_cache" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | +Indexes: + "like_constraint_rename_cache_pkey" PRIMARY KEY, btree (a) +Check constraints: + "chk_a_new" CHECK (a > 0) + +DROP TABLE constraint_rename_cache; +DROP TABLE like_constraint_rename_cache; +-- FOREIGN KEY CONSTRAINT adding TEST +CREATE TABLE attmp2 (a int primary key); +CREATE TABLE attmp3 (a int, b int); +CREATE TABLE attmp4 (a int, b int, unique(a,b)); +CREATE TABLE attmp5 (a int, b int); +-- Insert rows into attmp2 (pktable) +INSERT INTO attmp2 values (1); +INSERT INTO attmp2 values (2); +INSERT INTO attmp2 values (3); +INSERT INTO attmp2 values (4); +-- Insert rows into attmp3 +INSERT INTO attmp3 values (1,10); +INSERT INTO attmp3 values (1,20); +INSERT INTO attmp3 values (5,50); +-- Try (and fail) to add constraint due to invalid source columns +ALTER TABLE attmp3 add constraint attmpconstr foreign key(c) references attmp2 match full; +ERROR: column "c" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid destination columns explicitly given +ALTER TABLE attmp3 add constraint attmpconstr foreign key(a) references attmp2(b) match full; +ERROR: column "b" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid data +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". +-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ALTER TABLE attmp3 drop constraint attmpconstr; +INSERT INTO attmp3 values (5,50); +-- Try NOT VALID and then VALIDATE CONSTRAINT, but fails. Delete failure then re-validate +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full NOT VALID; +ALTER TABLE attmp3 validate constraint attmpconstr; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". 
+-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) and repeat to show it works on already valid constraint +ALTER TABLE attmp3 validate constraint attmpconstr; +ALTER TABLE attmp3 validate constraint attmpconstr; +-- Try a non-verified CHECK constraint +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10); -- fail +ERROR: check constraint "b_greater_than_ten" is violated by some row +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10) NOT VALID; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- fails +ERROR: check constraint "b_greater_than_ten" is violated by some row +DELETE FROM attmp3 WHERE NOT b > 10; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +-- Test inherited NOT VALID CHECK constraints +select * from attmp3; + a | b +---+---- + 1 | 20 +(1 row) + +CREATE TABLE attmp6 () INHERITS (attmp3); +CREATE TABLE attmp7 () INHERITS (attmp3); +INSERT INTO attmp6 VALUES (6, 30), (7, 16); +ALTER TABLE attmp3 ADD CONSTRAINT b_le_20 CHECK (b <= 20) NOT VALID; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- fails +ERROR: check constraint "b_le_20" is violated by some row +DELETE FROM attmp6 WHERE b > 20; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- succeeds +-- An already validated constraint must not be revalidated +CREATE FUNCTION boo(int) RETURNS int IMMUTABLE STRICT LANGUAGE plpgsql AS $$ BEGIN RAISE NOTICE 'boo: %', $1; RETURN $1; END; $$; +INSERT INTO attmp7 VALUES (8, 18); +ALTER TABLE attmp7 ADD CONSTRAINT identity CHECK (b = boo(b)); +NOTICE: boo: 18 +ALTER TABLE attmp3 ADD CONSTRAINT IDENTITY check (b = boo(b)) NOT VALID; +NOTICE: merging constraint "identity" with inherited definition +ALTER TABLE attmp3 VALIDATE CONSTRAINT identity; +NOTICE: boo: 16 +NOTICE: boo: 20 +-- A NO INHERIT constraint should not be looked for in children during VALIDATE CONSTRAINT +create table parent_noinh_convalid (a int); +create table child_noinh_convalid () inherits (parent_noinh_convalid); +insert into parent_noinh_convalid values (1); +insert into child_noinh_convalid values (1); +alter table parent_noinh_convalid add constraint check_a_is_2 check (a = 2) no inherit not valid; +-- fail, because of the row in parent +alter table parent_noinh_convalid validate constraint check_a_is_2; +ERROR: check constraint "check_a_is_2" is violated by some row +delete from only parent_noinh_convalid; +-- ok (parent itself contains no violating rows) +alter table parent_noinh_convalid validate constraint check_a_is_2; +select convalidated from pg_constraint where conrelid = 'parent_noinh_convalid'::regclass and conname = 'check_a_is_2'; + convalidated +-------------- + t +(1 row) + +-- cleanup +drop table parent_noinh_convalid, child_noinh_convalid; +-- Try (and fail) to create constraint from attmp5(a) to attmp4(a) - unique constraint on +-- attmp4 is a,b +ALTER TABLE attmp5 add constraint attmpconstr foreign key(a) references attmp4(a) match full; +ERROR: there is no unique constraint matching given keys for referenced table "attmp4" +DROP TABLE attmp7; +DROP TABLE attmp6; +DROP TABLE attmp5; +DROP TABLE attmp4; +DROP TABLE attmp3; +DROP TABLE attmp2; +-- NOT VALID with plan invalidation -- ensure we don't use a constraint for +-- exclusion until validated +set constraint_exclusion TO 'partition'; +create table nv_parent (d date, check (false) no inherit not valid); +-- not valid constraint added at creation time should 
automatically become valid +\d nv_parent + Table "public.nv_parent" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_parent_check" CHECK (false) NO INHERIT + +create table nv_child_2010 () inherits (nv_parent); +create table nv_child_2011 () inherits (nv_parent); +alter table nv_child_2010 add check (d between '2010-01-01'::date and '2010-12-31'::date) not valid; +alter table nv_child_2011 add check (d between '2011-01-01'::date and '2011-12-31'::date) not valid; +explain (costs off) select * from nv_parent where d between '2011-08-01' and '2011-08-31'; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +create table nv_child_2009 (check (d between '2009-01-01'::date and '2009-12-31'::date)) inherits (nv_parent); +explain (costs off) select * from nv_parent where d between '2011-08-01'::date and '2011-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2011 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(9 rows) + +-- after validation, the constraint should be used +alter table nv_child_2011 VALIDATE CONSTRAINT nv_child_2011_d_check; +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(7 rows) + +-- add an inherited NOT VALID constraint +alter table nv_parent add check (d between '2001-01-01'::date and '2099-12-31'::date) not valid; +\d nv_child_2009 + Table "public.nv_child_2009" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_child_2009_d_check" CHECK (d >= '01-01-2009'::date AND d <= '12-31-2009'::date) + "nv_parent_d_check" CHECK (d >= '01-01-2001'::date AND d <= '12-31-2099'::date) NOT VALID +Inherits: nv_parent + +-- we leave nv_parent and children around to help test pg_dump logic +-- Foreign key adding test with mixed types 
+-- Note: these tables are TEMP to avoid name conflicts when this test +-- is run in parallel with foreign_key.sql. +CREATE TEMP TABLE PKTABLE (ptest1 int PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 inet); +-- This next should fail, because int=inet does not exist +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +-- This should also fail for the same reason, but here we +-- give the column name +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable(ptest1); +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +-- This should succeed, even though they are different types, +-- because int=int8 exists and is a member of the integer opfamily +CREATE TEMP TABLE FKTABLE (ftest1 int8); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +-- This should fail, because we'd have to cast numeric to int which is +-- not an implicit coercion (or use numeric=numeric, but that's not part +-- of the integer opfamily) +CREATE TEMP TABLE FKTABLE (ftest1 numeric); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: numeric and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- On the other hand, this should work because int implicitly promotes to +-- numeric, and we allow promotion on the FK side +CREATE TEMP TABLE PKTABLE (ptest1 numeric PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +CREATE TEMP TABLE PKTABLE (ptest1 int, ptest2 inet, + PRIMARY KEY(ptest1, ptest2)); +-- This should fail, because we just chose really odd types +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) references pktable; +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- Again, so should this... +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. 
+DROP TABLE FKTABLE; +-- This fails because we mixed up the column ordering +CREATE TEMP TABLE FKTABLE (ftest1 int, ftest2 inet); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest2, ptest1); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest2" are of incompatible types: integer and inet. +-- As does this... +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest2, ftest1) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest2_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest2" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- Test that ALTER CONSTRAINT updates trigger deferrability properly +CREATE TEMP TABLE PKTABLE (ptest1 int primary key); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD CONSTRAINT fknd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY IMMEDIATE; +ALTER TABLE FKTABLE ADD CONSTRAINT fknd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ALTER CONSTRAINT fknd2 NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdd2 DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdi2 DEFERRABLE INITIALLY IMMEDIATE; +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'pktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+------------------------+--------+--------------+---------------- + fkdd | "RI_FKey_cascade_del" | 9 | f | f + fkdd | "RI_FKey_noaction_upd" | 17 | t | t + fkdd2 | "RI_FKey_cascade_del" | 9 | f | f + fkdd2 | "RI_FKey_noaction_upd" | 17 | t | t + fkdi | "RI_FKey_cascade_del" | 9 | f | f + fkdi | "RI_FKey_noaction_upd" | 17 | t | f + fkdi2 | "RI_FKey_cascade_del" | 9 | f | f + fkdi2 | "RI_FKey_noaction_upd" | 17 | t | f + fknd | "RI_FKey_cascade_del" | 9 | f | f + fknd | "RI_FKey_noaction_upd" | 17 | f | f + fknd2 | "RI_FKey_cascade_del" | 9 | f | f + fknd2 | "RI_FKey_noaction_upd" | 17 | f | f +(12 rows) + +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'fktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+---------------------+--------+--------------+---------------- + fkdd | "RI_FKey_check_ins" | 5 | t | t + fkdd | "RI_FKey_check_upd" | 17 | t | t + fkdd2 | "RI_FKey_check_ins" | 5 | t | t + fkdd2 | "RI_FKey_check_upd" | 17 | t | t + fkdi | "RI_FKey_check_ins" | 5 | t | f + fkdi | "RI_FKey_check_upd" | 17 | t | f + fkdi2 | "RI_FKey_check_ins" | 5 | t | f + fkdi2 | "RI_FKey_check_upd" | 17 | t | f + fknd | "RI_FKey_check_ins" | 5 | f | f + fknd | 
"RI_FKey_check_upd" | 17 | f | f + fknd2 | "RI_FKey_check_ins" | 5 | f | f + fknd2 | "RI_FKey_check_upd" | 17 | f | f +(12 rows) + +-- temp tables should go away by themselves, need not drop them. +-- test check constraint adding +create table atacc1 ( test int ); +-- add a check constraint +alter table atacc1 add constraint atacc_test1 check (test>3); +-- should fail +insert into atacc1 (test) values (2); +ERROR: new row for relation "atacc1" violates check constraint "atacc_test1" +DETAIL: Failing row contains (2). +-- should succeed +insert into atacc1 (test) values (4); +drop table atacc1; +-- let's do one where the check fails when added +create table atacc1 ( test int ); +-- insert a soon to be failing row +insert into atacc1 (test) values (2); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test>3); +ERROR: check constraint "atacc_test1" is violated by some row +insert into atacc1 (test) values (4); +drop table atacc1; +-- let's do one where the check fails because the column doesn't exist +create table atacc1 ( test int ); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test1>3); +ERROR: column "test1" does not exist +HINT: Perhaps you meant to reference the column "atacc1.test". +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int, test3 int); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test+test23), test2 int); +alter table atacc1 add check (test2>test); +-- should fail for $2 +insert into atacc1 (test2, test) values (3, 4); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_check" +DETAIL: Failing row contains (4, 3). +drop table atacc1; +-- inheritance related tests +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc2 add constraint foo check (test2>0); +-- fail and then succeed on atacc2 +insert into atacc2 (test2) values (-3); +ERROR: new row for relation "atacc2" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc2 (test2) values (3); +-- fail and then succeed on atacc3 +insert into atacc3 (test2) values (-3); +ERROR: new row for relation "atacc3" violates check constraint "foo" +DETAIL: Failing row contains (null, -3, null). 
+insert into atacc3 (test2) values (3); +drop table atacc3; +drop table atacc2; +drop table atacc1; +-- same things with one created with INHERIT +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc3 no inherit atacc2; +-- fail +alter table atacc3 no inherit atacc2; +ERROR: relation "atacc2" is not a parent of relation "atacc3" +-- make sure it really isn't a child +insert into atacc3 (test2) values (3); +select test2 from atacc2; + test2 +------- +(0 rows) + +-- fail due to missing constraint +alter table atacc2 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +ERROR: child table is missing constraint "foo" +-- fail due to missing column +alter table atacc3 rename test2 to testx; +alter table atacc3 inherit atacc2; +ERROR: child table is missing column "test2" +-- fail due to mismatched data type +alter table atacc3 add test2 bool; +alter table atacc3 inherit atacc2; +ERROR: child table "atacc3" has different type for column "test2" +alter table atacc3 drop test2; +-- succeed +alter table atacc3 add test2 int; +update atacc3 set test2 = 4 where test2 is null; +alter table atacc3 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +-- fail due to duplicates and circular inheritance +alter table atacc3 inherit atacc2; +ERROR: relation "atacc2" would be inherited from more than once +alter table atacc2 inherit atacc3; +ERROR: circular inheritance not allowed +DETAIL: "atacc3" is already a child of "atacc2". +alter table atacc2 inherit atacc2; +ERROR: circular inheritance not allowed +DETAIL: "atacc2" is already a child of "atacc2". +-- test that we really are a child now (should see 4 not 3 and cascade should go through) +select test2 from atacc2; + test2 +------- + 4 +(1 row) + +drop table atacc2 cascade; +NOTICE: drop cascades to table atacc3 +drop table atacc1; +-- adding only to a parent is allowed as of 9.2 +create table atacc1 (test int); +create table atacc2 (test2 int) inherits (atacc1); +-- ok: +alter table atacc1 add constraint foo check (test>0) no inherit; +-- check constraint is not there on child +insert into atacc2 (test) values (-3); +-- check constraint is there on parent +insert into atacc1 (test) values (-3); +ERROR: new row for relation "atacc1" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc1 (test) values (3); +-- fail, violating row: +alter table atacc2 add constraint foo check (test>0) no inherit; +ERROR: check constraint "foo" is violated by some row +drop table atacc2; +drop table atacc1; +-- test unique constraint adding +create table atacc1 ( test int ) ; +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- try to create duplicates via alter table using - should fail +alter table atacc1 alter column test type integer using 0; +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(0) is duplicated. 
+drop table atacc1; +-- let's do one where the unique constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the unique constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test1); +ERROR: column "test1" named in key does not exist +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test, test2); +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, unique(test)); +alter table atacc1 add unique (test2); +-- should fail for @@ second one @@ +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_test_key" +DETAIL: Key (test)=(3) already exists. +drop table atacc1; +-- test primary key constraint adding +create table atacc1 ( id serial, test int) ; +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- inserting NULL should fail +insert into atacc1 (test) values(NULL); +ERROR: null value in column "test" violates not-null constraint +DETAIL: Failing row contains (4, null). +-- try adding a second primary key (should fail) +alter table atacc1 add constraint atacc_oid1 primary key(id); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- drop first primary key constraint +alter table atacc1 drop constraint atacc_test1 restrict; +-- try adding a primary key on oid (should succeed) +alter table atacc1 add constraint atacc_oid1 primary key(id); +drop table atacc1; +-- let's do one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. 
+insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do another one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing row +insert into atacc1 (test) values (NULL); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: column "test" contains null values +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the primary key constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a primary key constraint (fails) +alter table atacc1 add constraint atacc_test1 primary key (test1); +ERROR: column "test1" of relation "atacc1" does not exist +drop table atacc1; +-- adding a new column as primary key to a non-empty table. +-- should fail unless the column has a non-null default value. +create table atacc1 ( test int ); +insert into atacc1 (test) values (0); +-- add a primary key column without a default (fails). +alter table atacc1 add column test2 int primary key; +ERROR: column "test2" contains null values +-- now add a primary key column with a default (succeeds). +alter table atacc1 add column test2 int default 0 primary key; +drop table atacc1; +-- this combination used to have order-of-execution problems (bug #15580) +create table atacc1 (a int); +insert into atacc1 values(1); +alter table atacc1 + add column b float8 not null default random(), + add primary key(a); +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test, test2); +-- try adding a second primary key - should fail +alter table atacc1 add constraint atacc_test2 primary key (test); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +insert into atacc1 (test,test2) values (NULL,3); +ERROR: null value in column "test" violates not-null constraint +DETAIL: Failing row contains (null, 3). +insert into atacc1 (test,test2) values (3, NULL); +ERROR: null value in column "test2" violates not-null constraint +DETAIL: Failing row contains (3, null). +insert into atacc1 (test,test2) values (NULL,NULL); +ERROR: null value in column "test" violates not-null constraint +DETAIL: Failing row contains (null, null). +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, primary key(test)); +-- only first should succeed +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_pkey" +DETAIL: Key (test)=(3) already exists. +insert into atacc1 (test2, test) values (1, NULL); +ERROR: null value in column "test" violates not-null constraint +DETAIL: Failing row contains (null, 1). 
+drop table atacc1; +-- alter table / alter column [set/drop] not null tests +-- try altering system catalogs, should fail +alter table pg_class alter column relname drop not null; +ERROR: permission denied: "pg_class" is a system catalog +alter table pg_class alter relname set not null; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table non_existent alter column bar set not null; +ERROR: relation "non_existent" does not exist +alter table non_existent alter column bar drop not null; +ERROR: relation "non_existent" does not exist +-- test setting columns to null and not null and vice versa +-- test checking for null values and primary key +create table atacc1 (test int not null); +alter table atacc1 add constraint "atacc1_pkey" primary key (test); +alter table atacc1 alter column test drop not null; +ERROR: column "test" is in a primary key +alter table atacc1 drop constraint "atacc1_pkey"; +alter table atacc1 alter column test drop not null; +insert into atacc1 values (null); +alter table atacc1 alter test set not null; +ERROR: column "test" contains null values +delete from atacc1; +alter table atacc1 alter test set not null; +-- try altering a non-existent column, should fail +alter table atacc1 alter bar set not null; +ERROR: column "bar" of relation "atacc1" does not exist +alter table atacc1 alter bar drop not null; +ERROR: column "bar" of relation "atacc1" does not exist +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +alter table myview alter column test drop not null; +ERROR: "myview" is not a table or foreign table +alter table myview alter column test set not null; +ERROR: "myview" is not a table or foreign table +drop view myview; +drop table atacc1; +-- set not null verified by constraints +create table atacc1 (test_a int, test_b int); +insert into atacc1 values (null, 1); +-- constraint not cover all values, should fail +alter table atacc1 add constraint atacc1_constr_or check(test_a is not null or test_b < 10); +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" contains null values +alter table atacc1 drop constraint atacc1_constr_or; +-- not valid constraint, should fail +alter table atacc1 add constraint atacc1_constr_invalid check(test_a is not null) not valid; +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" contains null values +alter table atacc1 drop constraint atacc1_constr_invalid; +-- with valid constraint +update atacc1 set test_a = 1; +alter table atacc1 add constraint atacc1_constr_a_valid check(test_a is not null); +alter table atacc1 alter test_a set not null; +delete from atacc1; +insert into atacc1 values (2, null); +alter table atacc1 alter test_a drop not null; +-- test multiple set not null at same time +-- test_a checked by atacc1_constr_a_valid, test_b should fail by table scan +alter table atacc1 alter test_a set not null, alter test_b set not null; +ERROR: column "test_b" contains null values +-- commands order has no importance +alter table atacc1 alter test_b set not null, alter test_a set not null; +ERROR: column "test_b" contains null values +-- valid one by table scan, one by check constraints +update atacc1 set test_b = 1; +alter table atacc1 alter test_b set not null, alter test_a set not null; +alter table atacc1 alter test_a drop not null, alter test_b drop not null; +-- both column has check constraints +alter table atacc1 add constraint atacc1_constr_b_valid check(test_b is not 
null); +alter table atacc1 alter test_b set not null, alter test_a set not null; +drop table atacc1; +-- test inheritance +create table parent (a int); +create table child (b varchar(255)) inherits (parent); +alter table parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null). +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null, foo). +alter table parent alter a drop not null; +insert into parent values (NULL); +insert into child (a, b) values (NULL, 'foo'); +alter table only parent alter a set not null; +ERROR: column "a" contains null values +alter table child alter a set not null; +ERROR: column "a" contains null values +delete from parent; +alter table only parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null). +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null, foo). +delete from child; +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" violates not-null constraint +DETAIL: Failing row contains (null, foo). +drop table child; +drop table parent; +-- test setting and removing default values +create table def_test ( + c1 int4 default 5, + c2 text default 'initial_default' +); +insert into def_test default values; +alter table def_test alter column c1 drop default; +insert into def_test default values; +alter table def_test alter column c2 drop default; +insert into def_test default values; +alter table def_test alter column c1 set default 10; +alter table def_test alter column c2 set default 'new_default'; +insert into def_test default values; +select * from def_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default +(4 rows) + +-- set defaults to an incorrect type: this should fail +alter table def_test alter column c1 set default 'wrong_datatype'; +ERROR: invalid input syntax for type integer: "wrong_datatype" +alter table def_test alter column c2 set default 20; +-- set defaults on a non-existent column: this should fail +alter table def_test alter column c3 set default 30; +ERROR: column "c3" of relation "def_test" does not exist +-- set defaults on views: we need to create a view, add a rule +-- to allow insertions into it, and then alter the view to add +-- a default +create view def_view_test as select * from def_test; +create rule def_view_test_ins as + on insert to def_view_test + do instead insert into def_test select new.*; +insert into def_view_test default values; +alter table def_view_test alter column c1 set default 45; +insert into def_view_test default values; +alter table def_view_test alter column c2 set default 'view_default'; +insert into def_view_test default values; +select * from def_view_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default + | + 45 | + 45 | view_default +(7 rows) + +drop rule def_view_test_ins on def_view_test; +drop view def_view_test; +drop table def_test; +-- alter table / drop column tests +-- try altering system catalogs, should fail +alter table pg_class drop column relname; +ERROR: permission denied: "pg_class" is a system catalog 
+-- try altering non-existent table, should fail +alter table nosuchtable drop column bar; +ERROR: relation "nosuchtable" does not exist +-- test dropping columns +create table atacc1 (a int4 not null, b int4, c int4 not null, d int4); +insert into atacc1 values (1, 2, 3, 4); +alter table atacc1 drop a; +alter table atacc1 drop a; +ERROR: column "a" of relation "atacc1" does not exist +-- SELECTs +select * from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select * from atacc1 order by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 order by a; + ^ +select * from atacc1 order by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 order by "........pg.dropped.1........"... + ^ +select * from atacc1 group by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 group by a; + ^ +select * from atacc1 group by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 group by "........pg.dropped.1........"... + ^ +select atacc1.* from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a from atacc1; +ERROR: column "a" does not exist +LINE 1: select a from atacc1; + ^ +select atacc1.a from atacc1; +ERROR: column atacc1.a does not exist +LINE 1: select atacc1.a from atacc1; + ^ +select b,c,d from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a,b,c,d from atacc1; +ERROR: column "a" does not exist +LINE 1: select a,b,c,d from atacc1; + ^ +select * from atacc1 where a = 1; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 where a = 1; + ^ +select "........pg.dropped.1........" from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........" from atacc1; + ^ +select atacc1."........pg.dropped.1........" from atacc1; +ERROR: column atacc1.........pg.dropped.1........ does not exist +LINE 1: select atacc1."........pg.dropped.1........" from atacc1; + ^ +select "........pg.dropped.1........",b,c,d from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........",b,c,d from atacc1; + ^ +select * from atacc1 where "........pg.dropped.1........" = 1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 where "........pg.dropped.1........" = ... + ^ +-- UPDATEs +update atacc1 set a = 3; +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: update atacc1 set a = 3; + ^ +update atacc1 set b = 2 where a = 3; +ERROR: column "a" does not exist +LINE 1: update atacc1 set b = 2 where a = 3; + ^ +update atacc1 set "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: update atacc1 set "........pg.dropped.1........" = 3; + ^ +update atacc1 set b = 2 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: update atacc1 set b = 2 where "........pg.dropped.1........"... 
+ ^ +-- INSERTs +insert into atacc1 values (10, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (10, 11, 12, 13); + ^ +insert into atacc1 values (default, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (default, 11, 12, 13); + ^ +insert into atacc1 values (11, 12, 13); +insert into atacc1 (a) values (10); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (10); + ^ +insert into atacc1 (a) values (default); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (default); + ^ +insert into atacc1 (a,b,c,d) values (10,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (10,11,12,13); + ^ +insert into atacc1 (a,b,c,d) values (default,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (default,11,12,13); + ^ +insert into atacc1 (b,c,d) values (11,12,13); +insert into atacc1 ("........pg.dropped.1........") values (10); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........") values (default); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (10,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (default,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +-- DELETEs +delete from atacc1 where a = 3; +ERROR: column "a" does not exist +LINE 1: delete from atacc1 where a = 3; + ^ +delete from atacc1 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: delete from atacc1 where "........pg.dropped.1........" = 3; + ^ +delete from atacc1; +-- try dropping a non-existent column, should fail +alter table atacc1 drop bar; +ERROR: column "bar" of relation "atacc1" does not exist +-- try removing an oid column, should succeed (as it's nonexistant) +alter table atacc1 SET WITHOUT OIDS; +-- try adding an oid column, should fail (not supported) +alter table atacc1 SET WITH OIDS; +ERROR: syntax error at or near "WITH" +LINE 1: alter table atacc1 SET WITH OIDS; + ^ +-- try dropping the xmin column, should fail +alter table atacc1 drop xmin; +ERROR: cannot drop system column "xmin" +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +select * from myview; + b | c | d +---+---+--- +(0 rows) + +alter table myview drop d; +ERROR: "myview" is not a table, composite type, or foreign table +drop view myview; +-- test some commands to make sure they fail on the dropped column +analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +vacuum analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +vacuum analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +comment on column atacc1.a is 'testing'; +ERROR: column "a" of relation "atacc1" does not exist +comment on column atacc1."........pg.dropped.1........" is 'testing'; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set storage plain; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set storage plain; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set statistics 0; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set statistics 0; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set default 3; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set default 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop default; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop default; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 rename a to x; +ERROR: column "a" does not exist +alter table atacc1 rename "........pg.dropped.1........" to x; +ERROR: column "........pg.dropped.1........" does not exist +alter table atacc1 add primary key(a); +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 add primary key("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 add unique(a); +ERROR: column "a" named in key does not exist +alter table atacc1 add unique("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" named in key does not exist +alter table atacc1 add check (a > 3); +ERROR: column "a" does not exist +alter table atacc1 add check ("........pg.dropped.1........" > 3); +ERROR: column "........pg.dropped.1........" does not exist +create table atacc2 (id int4 unique); +alter table atacc1 add foreign key (a) references atacc2(id); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc1 add foreign key ("........pg.dropped.1........") references atacc2(id); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1(a); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" 
referenced in foreign key constraint does not exist +drop table atacc2; +create index "testing_idx" on atacc1(a); +ERROR: column "a" does not exist +create index "testing_idx" on atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" does not exist +-- test create as and select into +insert into atacc1 values (21, 22, 23); +create table attest1 as select * from atacc1; +select * from attest1; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest1; +select * into attest2 from atacc1; +select * from attest2; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest2; +-- try dropping all columns +alter table atacc1 drop c; +alter table atacc1 drop d; +alter table atacc1 drop b; +select * from atacc1; +-- +(1 row) + +drop table atacc1; +-- test constraint error reporting in presence of dropped columns +create table atacc1 (id serial primary key, value int check (value < 10)); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (1, 100). +alter table atacc1 drop column value; +alter table atacc1 add column value int check (value < 10); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (2, 100). +insert into atacc1(id, value) values (null, 0); +ERROR: null value in column "id" violates not-null constraint +DETAIL: Failing row contains (null, 0). +drop table atacc1; +-- test inheritance +create table parent (a int, b int, c int); +insert into parent values (1, 2, 3); +alter table parent drop a; +create table child (d varchar(255)) inherits (parent); +insert into child values (12, 13, 'testing'); +select * from parent; + b | c +----+---- + 2 | 3 + 12 | 13 +(2 rows) + +select * from child; + b | c | d +----+----+--------- + 12 | 13 | testing +(1 row) + +alter table parent drop c; +select * from parent; + b +---- + 2 + 12 +(2 rows) + +select * from child; + b | d +----+--------- + 12 | testing +(1 row) + +drop table child; +drop table parent; +-- check error cases for inheritance column merging +create table parent (a float8, b numeric(10,4), c text collate "C"); +create table child (a float4) inherits (parent); -- fail +NOTICE: merging column "a" with inherited definition +ERROR: column "a" has a type conflict +DETAIL: double precision versus real +create table child (b decimal(10,7)) inherits (parent); -- fail +NOTICE: moving and merging column "b" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. +ERROR: column "b" has a type conflict +DETAIL: numeric(10,4) versus numeric(10,7) +create table child (c text collate "POSIX") inherits (parent); -- fail +NOTICE: moving and merging column "c" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. 
+ERROR: column "c" has a collation conflict +DETAIL: "C" versus "POSIX" +create table child (a double precision, b decimal(10,4)) inherits (parent); +NOTICE: merging column "a" with inherited definition +NOTICE: merging column "b" with inherited definition +drop table child; +drop table parent; +-- test copy in/out +create table attest (a int4, b int4, c int4); +insert into attest values (1,2,3); +alter table attest drop a; +copy attest to stdout; +2 3 +copy attest(a) to stdout; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") to stdout; +ERROR: column "........pg.dropped.1........" of relation "attest" does not exist +copy attest from stdin; +ERROR: extra data after last expected column +CONTEXT: COPY attest, line 1: "10 11 12" +select * from attest; + b | c +---+--- + 2 | 3 +(1 row) + +copy attest from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 +(2 rows) + +copy attest(a) from stdin; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") from stdin; +ERROR: column "........pg.dropped.1........" of relation "attest" does not exist +copy attest(b,c) from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 + 31 | 32 +(3 rows) + +drop table attest; +-- test inheritance +create table dropColumn (a int, b int, e int); +create table dropColumnChild (c int) inherits (dropColumn); +create table dropColumnAnother (d int) inherits (dropColumnChild); +-- these two should fail +alter table dropColumnchild drop column a; +ERROR: cannot drop inherited column "a" +alter table only dropColumnChild drop column b; +ERROR: cannot drop inherited column "b" +-- these three should work +alter table only dropColumn drop column e; +alter table dropColumnChild drop column c; +alter table dropColumn drop column a; +create table renameColumn (a int); +create table renameColumnChild (b int) inherits (renameColumn); +create table renameColumnAnother (c int) inherits (renameColumnChild); +-- these three should fail +alter table renameColumnChild rename column a to d; +ERROR: cannot rename inherited column "a" +alter table only renameColumnChild rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +alter table only renameColumn rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +-- these should work +alter table renameColumn rename column a to d; +alter table renameColumnChild rename column b to a; +-- these should work +alter table if exists doesnt_exist_tab rename column a to d; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +alter table if exists doesnt_exist_tab rename column b to a; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +-- this should work +alter table renameColumn add column w int; +-- this should fail +alter table only renameColumn add column x int; +ERROR: column must be added to child tables too +-- Test corner cases in dropping of inherited columns +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +-- should work +alter table p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +select f1 from c1; + f1 +---- +(0 rows) + +alter table c1 drop column f1; +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: 
Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table p1 drop column f1; +-- c1.f1 is dropped now, since there is no local definition for it +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is NOT dropped, but must now be considered non-inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1(id int, name text); +create table p2(id2 int, name text, height int); +create table c1(age int) inherits(p1,p2); +NOTICE: merging multiple inherited definitions of column "name" +create table gc1() inherits (c1); +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | name | 2 | f + c1 | id2 | 1 | f + c1 | height | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | name | 1 | f + gc1 | id2 | 1 | f + gc1 | height | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p1 | name | 0 | t + p2 | id2 | 0 | t + p2 | name | 0 | t + p2 | height | 0 | t +(15 rows) + +-- should work +alter table only p1 drop column name; +-- should work. Now c1.name is local and inhcount is 0. 
+alter table p2 drop column name; +-- should be rejected since its inherited +alter table gc1 drop column name; +ERROR: cannot drop inherited column "name" +-- should work, and drop gc1.name along +alter table c1 drop column name; +-- should fail: column does not exist +alter table gc1 drop column name; +ERROR: column "name" of relation "gc1" does not exist +-- should work and drop the attribute in all tables +alter table p2 drop column height; +-- IF EXISTS test +create table dropColumnExists (); +alter table dropColumnExists drop column non_existing; --fail +ERROR: column "non_existing" of relation "dropcolumnexists" does not exist +alter table dropColumnExists drop column if exists non_existing; --succeed +NOTICE: column "non_existing" of relation "dropcolumnexists" does not exist, skipping +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | id2 | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | id2 | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p2 | id2 | 0 | t +(8 rows) + +drop table p1, p2 cascade; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table c1 +drop cascades to table gc1 +-- test attinhcount tracking with merged columns +create table depth0(); +create table depth1(c text) inherits (depth0); +create table depth2() inherits (depth1); +alter table depth0 add c text; +NOTICE: merging definition of column "c" for child "depth1" +select attrelid::regclass, attname, attinhcount, attislocal +from pg_attribute +where attnum > 0 and attrelid::regclass in ('depth0', 'depth1', 'depth2') +order by attrelid::regclass::text, attnum; + attrelid | attname | attinhcount | attislocal +----------+---------+-------------+------------ + depth0 | c | 0 | t + depth1 | c | 1 | t + depth2 | c | 1 | f +(3 rows) + +-- test renumbering of child-table columns in inherited operations +create table p1 (f1 int); +create table c1 (f2 text, f3 int) inherits (p1); +alter table p1 add column a1 int check (a1 > 0); +alter table p1 add column f2 text; +NOTICE: merging definition of column "f2" for child "c1" +insert into p1 values (1,2,'abc'); +insert into c1 values(11,'xyz',33,0); -- should fail +ERROR: new row for relation "c1" violates check constraint "p1_a1_check" +DETAIL: Failing row contains (11, xyz, 33, 0). 
+insert into c1 values(11,'xyz',33,22); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 2 | abc + 11 | 22 | xyz +(2 rows) + +update p1 set a1 = a1 + 1, f2 = upper(f2); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 3 | ABC + 11 | 23 | XYZ +(2 rows) + +drop table p1 cascade; +NOTICE: drop cascades to table c1 +-- test that operations with a dropped column do not try to reference +-- its datatype +create domain mytype as text; +create temp table foo (f1 text, f2 mytype, f3 text); +insert into foo values('bb','cc','dd'); +select * from foo; + f1 | f2 | f3 +----+----+---- + bb | cc | dd +(1 row) + +drop domain mytype cascade; +NOTICE: drop cascades to column f2 of table foo +select * from foo; + f1 | f3 +----+---- + bb | dd +(1 row) + +insert into foo values('qq','rr'); +select * from foo; + f1 | f3 +----+---- + bb | dd + qq | rr +(2 rows) + +update foo set f3 = 'zz'; +select * from foo; + f1 | f3 +----+---- + bb | zz + qq | zz +(2 rows) + +select f3,max(f1) from foo group by f3; + f3 | max +----+----- + zz | qq +(1 row) + +-- Simple tests for alter table column type +alter table foo alter f1 TYPE integer; -- fails +ERROR: column "f1" cannot be cast automatically to type integer +HINT: You might need to specify "USING f1::integer". +alter table foo alter f1 TYPE varchar(10); +create table anothertab (atcol1 serial8, atcol2 boolean, + constraint anothertab_chk check (atcol1 <= 3)); +insert into anothertab (atcol1, atcol2) values (default, true); +insert into anothertab (atcol1, atcol2) values (default, false); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +alter table anothertab alter column atcol1 type boolean; -- fails +ERROR: column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to specify "USING atcol1::boolean". +alter table anothertab alter column atcol1 type boolean using atcol1::int; -- fails +ERROR: result of USING clause for column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to add an explicit cast. +alter table anothertab alter column atcol1 type integer; +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +insert into anothertab (atcol1, atcol2) values (45, null); -- fails +ERROR: new row for relation "anothertab" violates check constraint "anothertab_chk" +DETAIL: Failing row contains (45, null). +insert into anothertab (atcol1, atcol2) values (default, null); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f + 3 | +(3 rows) + +alter table anothertab alter column atcol2 type text + using case when atcol2 is true then 'IT WAS TRUE' + when atcol2 is false then 'IT WAS FALSE' + else 'IT WAS NULL!' end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + 1 | IT WAS TRUE + 2 | IT WAS FALSE + 3 | IT WAS NULL! +(3 rows) + +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: default for column "atcol1" cannot be cast automatically to type boolean +alter table anothertab alter column atcol1 drop default; +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: operator does not exist: boolean <= integer +HINT: No operator matches the given name and argument types. You might need to add explicit type casts. 
+alter table anothertab drop constraint anothertab_chk; +alter table anothertab drop constraint anothertab_chk; -- fails +ERROR: constraint "anothertab_chk" of relation "anothertab" does not exist +alter table anothertab drop constraint IF EXISTS anothertab_chk; -- succeeds +NOTICE: constraint "anothertab_chk" of relation "anothertab" does not exist, skipping +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + f | IT WAS TRUE + t | IT WAS FALSE + f | IT WAS NULL! +(3 rows) + +drop table anothertab; +create table another (f1 int, f2 text); +insert into another values(1, 'one'); +insert into another values(2, 'two'); +insert into another values(3, 'three'); +select * from another; + f1 | f2 +----+------- + 1 | one + 2 | two + 3 | three +(3 rows) + +alter table another + alter f1 type text using f2 || ' more', + alter f2 type bigint using f1 * 10; +select * from another; + f1 | f2 +------------+---- + one more | 10 + two more | 20 + three more | 30 +(3 rows) + +drop table another; +-- table's row type +create table tab1 (a int, b text); +create table tab2 (x int, y tab1); +alter table tab1 alter column b type varchar; -- fails +ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | + +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | numeric | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | numeric | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +drop table at_partitioned; +-- Alter column type when no table rewrite is required +-- Also check that comments are preserved +create table at_partitioned(id int, name 
varchar(64), unique (id, name)) + partition by hash(id); +comment on constraint at_partitioned_id_name_key on at_partitioned is 'parent constraint'; +comment on index at_partitioned_id_name_key is 'parent index'; +create table at_partitioned_0 partition of at_partitioned + for values with (modulus 2, remainder 0); +comment on constraint at_partitioned_0_id_name_key on at_partitioned_0 is 'child 0 constraint'; +comment on index at_partitioned_0_id_name_key is 'child 0 index'; +create table at_partitioned_1 partition of at_partitioned + for values with (modulus 2, remainder 1); +comment on constraint at_partitioned_1_id_name_key on at_partitioned_1 is 'child 1 constraint'; +comment on index at_partitioned_1_id_name_key is 'child 1 index'; +insert into at_partitioned values(1, 'foo'); +insert into at_partitioned values(3, 'bar'); +create temp table old_oids as + select relname, oid as oldoid, relfilenode as oldfilenode + from pg_class where relname like 'at_partitioned%'; +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+--------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | t | own | child 0 index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | t | own | child 1 index + at_partitioned_id_name_key | t | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+-------------------- + at_partitioned_0_id_name_key | child 0 constraint + at_partitioned_1_id_name_key | child 1 constraint + at_partitioned_id_name_key | parent constraint +(3 rows) + +alter table at_partitioned alter column name type varchar(127); +-- Note: these tests currently show the wrong behavior for comments :-( +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+-------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | f | own | parent index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | f | own | parent index + at_partitioned_id_name_key | f | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+------------------- + at_partitioned_0_id_name_key | + at_partitioned_1_id_name_key | + at_partitioned_id_name_key | parent constraint +(3 rows) + +-- Don't remove this DROP, it exposes bug #15672 +drop table at_partitioned; +-- disallow recursive containment of row types +create temp table recur1 (f1 int); +alter table recur1 add column f2 recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 
add column f2 recur1[]; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create domain array_of_recur1 as recur1[]; +alter table recur1 add column f2 array_of_recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create temp table recur2 (f1 int, f2 recur1); +alter table recur1 add column f2 recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 int; +alter table recur1 alter column f2 type recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +-- SET STORAGE may need to add a TOAST table +create table test_storage (a text); +alter table test_storage alter a set storage plain; +alter table test_storage add b int default 0; -- rewrite table to remove its TOAST table +alter table test_storage alter a set storage extended; -- re-add TOAST table +select reltoastrelid <> 0 as has_toast_table +from pg_class +where oid = 'test_storage'::regclass; + has_toast_table +----------------- + f +(1 row) + +-- ALTER COLUMN TYPE with a check constraint and a child table (bug #13779) +CREATE TABLE test_inh_check (a float check (a > 10.2), b float); +CREATE TABLE test_inh_check_child() INHERITS(test_inh_check); +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +ALTER TABLE test_inh_check ALTER COLUMN a TYPE numeric; +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +-- also try noinherit, local, and local+inherited cases +ALTER TABLE test_inh_check ADD CONSTRAINT bnoinherit CHECK (b > 100) NO INHERIT; +ALTER TABLE test_inh_check_child ADD CONSTRAINT blocal CHECK (b < 1000); +ALTER TABLE test_inh_check_child ADD CONSTRAINT bmerged CHECK (b > 1); +ALTER TABLE test_inh_check ADD CONSTRAINT bmerged CHECK (b > 1); +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "bmerged" CHECK (b > 1::double precision) + "bnoinherit" CHECK (b > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "blocal" CHECK (b < 1000::double precision) + "bmerged" CHECK (b > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +ALTER TABLE test_inh_check ALTER COLUMN b TYPE numeric; +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "bmerged" CHECK (b::double precision > 1::double precision) + "bnoinherit" CHECK (b::double precision > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "blocal" CHECK (b::double precision < 1000::double precision) + "bmerged" CHECK (b::double precision > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +-- ALTER COLUMN TYPE with different schema in children +-- Bug at https://postgr.es/m/20170102225618.GA10071@telsasoft.com +CREATE TABLE test_type_diff (f1 int); +CREATE TABLE test_type_diff_c (extra smallint) INHERITS (test_type_diff); +ALTER TABLE test_type_diff ADD COLUMN f2 int; +INSERT INTO test_type_diff_c VALUES (1, 2, 3); +ALTER TABLE test_type_diff ALTER COLUMN f2 TYPE bigint USING f2::bigint; +CREATE TABLE test_type_diff2 (int_two int2, int_four int4, int_eight int8); +CREATE TABLE test_type_diff2_c1 (int_four int4, int_eight int8, int_two int2); +CREATE TABLE test_type_diff2_c2 (int_eight int8, int_two int2, int_four int4); +CREATE TABLE test_type_diff2_c3 (int_two int2, int_four int4, int_eight int8); +ALTER TABLE test_type_diff2_c1 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c2 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c3 INHERIT test_type_diff2; +INSERT INTO test_type_diff2_c1 VALUES (1, 2, 3); +INSERT INTO test_type_diff2_c2 VALUES (4, 5, 6); +INSERT INTO test_type_diff2_c3 VALUES (7, 8, 9); +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int8 USING int_four::int8; +-- whole-row references are disallowed +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int4 USING (pg_column_size(test_type_diff2)); +ERROR: cannot convert whole-row table reference +DETAIL: USING expression contains a whole-row table reference. 
+-- check for rollback of ANALYZE corrupting table property flags (bug #11638) +CREATE TABLE check_fk_presence_1 (id int PRIMARY KEY, t text); +CREATE TABLE check_fk_presence_2 (id int REFERENCES check_fk_presence_1, t text); +BEGIN; +ALTER TABLE check_fk_presence_2 DROP CONSTRAINT check_fk_presence_2_id_fkey; +ANALYZE check_fk_presence_2; +ROLLBACK; +\d check_fk_presence_2 + Table "public.check_fk_presence_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + t | text | | | +Foreign-key constraints: + "check_fk_presence_2_id_fkey" FOREIGN KEY (id) REFERENCES check_fk_presence_1(id) + +DROP TABLE check_fk_presence_1, check_fk_presence_2; +-- check column addition within a view (bug #14876) +create table at_base_table(id int, stuff text); +insert into at_base_table values (23, 'skidoo'); +create view at_view_1 as select * from at_base_table bt; +create view at_view_2 as select *, to_json(v1) as j from at_view_1 v1; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | +View definition: + SELECT bt.id, + bt.stuff + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo"} +(1 row) + +create or replace view at_view_1 as select *, 2+2 as more from at_base_table bt; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + more | integer | | | | plain | +View definition: + SELECT bt.id, + bt.stuff, + 2 + 2 AS more + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, NULL)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo","more":null} +(1 row) + +drop view at_view_2; +drop view at_view_1; +drop table at_base_table; +-- +-- lock levels +-- +drop type lockmodes; +ERROR: type "lockmodes" does not exist +create type lockmodes as enum ( + 'SIReadLock' 
+,'AccessShareLock' +,'RowShareLock' +,'RowExclusiveLock' +,'ShareUpdateExclusiveLock' +,'ShareLock' +,'ShareRowExclusiveLock' +,'ExclusiveLock' +,'AccessExclusiveLock' +); +drop view my_locks; +ERROR: view "my_locks" does not exist +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = txid_current()::integer) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname != 'my_locks' +group by c.relname; +create table alterlock (f1 int primary key, f2 text); +insert into alterlock values (1, 'foo'); +create table alterlock2 (f3 int primary key, f1 int); +insert into alterlock2 values (1, 1); +begin; alter table alterlock alter column f2 set statistics 150; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock cluster on alterlock_pkey; +select * from my_locks order by 1; + relname | max_lockmode +----------------+-------------------------- + alterlock | ShareUpdateExclusiveLock + alterlock_pkey | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set without cluster; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (fillfactor = 100); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock reset (fillfactor); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (toast.autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock alter column f2 set (n_distinct = 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +-- test that mixing options with different lock levels works as expected +begin; alter table alterlock set (autovacuum_enabled = off, fillfactor = 80); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock alter column f2 set storage extended; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock alter column f2 set default 'x'; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; +create trigger ttdummy + before delete or update on alterlock + for each 
row + execute procedure + ttdummy (1, 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+----------------------- + alterlock | ShareRowExclusiveLock +(1 row) + +rollback; +begin; +select * from my_locks order by 1; + relname | max_lockmode +---------+-------------- +(0 rows) + +alter table alterlock2 add foreign key (f1) references alterlock (f1); +select * from my_locks order by 1; + relname | max_lockmode +-----------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +begin; +alter table alterlock2 +add constraint alterlock2nv foreign key (f1) references alterlock (f1) NOT VALID; +select * from my_locks order by 1; + relname | max_lockmode +------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock +(2 rows) + +commit; +begin; +alter table alterlock2 validate constraint alterlock2nv; +select * from my_locks order by 1; + relname | max_lockmode +-----------------+-------------------------- + alterlock | RowShareLock + alterlock2 | ShareUpdateExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = txid_current()::integer) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname = 'my_locks' +group by c.relname; +-- raise exception +alter table my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter view my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter table my_locks reset (autovacuum_enabled); +alter view my_locks reset (autovacuum_enabled); +begin; +alter view my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter view my_locks reset (security_barrier); +rollback; +-- this test intentionally applies the ALTER TABLE command against a view, but +-- uses a view option so we expect this to succeed. 
This form of SQL is +-- accepted for historical reasons, as shown in the docs for ALTER VIEW +begin; +alter table my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter table my_locks reset (security_barrier); +rollback; +-- cleanup +drop table alterlock2; +drop table alterlock; +drop view my_locks; +drop type lockmodes; +-- +-- alter function +-- +create function test_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql returns null on null input; +select test_strict(NULL); + test_strict +------------- + +(1 row) + +alter function test_strict(text) called on null input; +select test_strict(NULL); + test_strict +------------------- + got passed a null +(1 row) + +create function non_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql called on null input; +select non_strict(NULL); + non_strict +------------------- + got passed a null +(1 row) + +alter function non_strict(text) returns null on null input; +select non_strict(NULL); + non_strict +------------ + +(1 row) + +-- +-- alter object set schema +-- +create schema alter1; +create schema alter2; +create table alter1.t1(f1 serial primary key, f2 int check (f2 > 0)); +create view alter1.v1 as select * from alter1.t1; +create function alter1.plus1(int) returns int as 'select $1+1' language sql; +create domain alter1.posint integer check (value > 0); +create type alter1.ctype as (f1 int, f2 text); +create function alter1.same(alter1.ctype, alter1.ctype) returns boolean language sql +as 'select $1.f1 is not distinct from $2.f1 and $1.f2 is not distinct from $2.f2'; +create operator alter1.=(procedure = alter1.same, leftarg = alter1.ctype, rightarg = alter1.ctype); +create operator class alter1.ctype_hash_ops default for type alter1.ctype using hash as + operator 1 alter1.=(alter1.ctype, alter1.ctype); +create conversion alter1.ascii_to_utf8 for 'sql_ascii' to 'utf8' from ascii_to_utf8; +create text search parser alter1.prs(start = prsd_start, gettoken = prsd_nexttoken, end = prsd_end, lextypes = prsd_lextype); +create text search configuration alter1.cfg(parser = alter1.prs); +create text search template alter1.tmpl(init = dsimple_init, lexize = dsimple_lexize); +create text search dictionary alter1.dict(template = alter1.tmpl); +insert into alter1.t1(f2) values(11); +insert into alter1.t1(f2) values(12); +alter table alter1.t1 set schema alter1; -- no-op, same schema +alter table alter1.t1 set schema alter2; +alter table alter1.v1 set schema alter2; +alter function alter1.plus1(int) set schema alter2; +alter domain alter1.posint set schema alter2; +alter operator class alter1.ctype_hash_ops using hash set schema alter2; +alter operator family alter1.ctype_hash_ops using hash set schema alter2; +alter operator alter1.=(alter1.ctype, alter1.ctype) set schema alter2; +alter function alter1.same(alter1.ctype, alter1.ctype) set schema alter2; +alter type alter1.ctype set schema alter1; -- no-op, same schema +alter type alter1.ctype set schema alter2; +alter conversion alter1.ascii_to_utf8 set schema alter2; +alter text search parser alter1.prs set schema alter2; +alter text search configuration alter1.cfg set schema alter2; +alter text search template alter1.tmpl set schema alter2; +alter text search dictionary alter1.dict set schema alter2; +-- this should succeed because nothing is left in alter1 +drop schema alter1; +insert into 
alter2.t1(f2) values(13); +insert into alter2.t1(f2) values(14); +select * from alter2.t1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select * from alter2.v1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select alter2.plus1(41); + plus1 +------- + 42 +(1 row) + +-- clean up +drop schema alter2 cascade; +NOTICE: drop cascades to 13 other objects +DETAIL: drop cascades to table alter2.t1 +drop cascades to view alter2.v1 +drop cascades to function alter2.plus1(integer) +drop cascades to type alter2.posint +drop cascades to type alter2.ctype +drop cascades to function alter2.same(alter2.ctype,alter2.ctype) +drop cascades to operator alter2.=(alter2.ctype,alter2.ctype) +drop cascades to operator family alter2.ctype_hash_ops for access method hash +drop cascades to conversion alter2.ascii_to_utf8 +drop cascades to text search parser alter2.prs +drop cascades to text search configuration alter2.cfg +drop cascades to text search template alter2.tmpl +drop cascades to text search dictionary alter2.dict +-- +-- composite types +-- +CREATE TYPE test_type AS (a int); +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE nosuchtype ADD ATTRIBUTE b text; -- fails +ERROR: relation "nosuchtype" does not exist +ALTER TYPE test_type ADD ATTRIBUTE b text; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +ALTER TYPE test_type ADD ATTRIBUTE b text; -- fails +ERROR: column "b" of relation "test_type" already exists +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE varchar; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE integer; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE b; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE c; -- fails +ERROR: column "c" of relation "test_type" does not exist +ALTER TYPE test_type DROP ATTRIBUTE IF EXISTS c; +NOTICE: column "c" of relation "test_type" does not exist, skipping +ALTER TYPE test_type DROP ATTRIBUTE a, ADD ATTRIBUTE d boolean; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + d | boolean | | | + +ALTER TYPE test_type RENAME ATTRIBUTE a TO aa; +ERROR: column "a" does not exist +ALTER TYPE test_type RENAME ATTRIBUTE d TO dd; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + dd | boolean | | | + +DROP TYPE test_type; +CREATE TYPE test_type1 AS (a int, b text); +CREATE TABLE test_tbl1 (x int, y test_type1); +ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type1" because column "test_tbl1.y" uses it +CREATE TYPE test_type2 AS (a int, b 
text); +CREATE TABLE test_tbl2 OF test_type2; +CREATE TABLE test_tbl2_subclass () INHERITS (test_tbl2); +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ADD ATTRIBUTE c text; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ADD ATTRIBUTE c text CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 DROP ATTRIBUTE b; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 DROP ATTRIBUTE b CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) 
+Typed table of type: test_type2 + +\d test_tbl2_subclass + Table "public.test_tbl2_subclass" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Inherits: test_tbl2 + +DROP TABLE test_tbl2_subclass; +CREATE TYPE test_typex AS (a int, b text); +CREATE TABLE test_tblx (x int, y test_typex check ((y).a > 0)); +ALTER TYPE test_typex DROP ATTRIBUTE a; -- fails +ERROR: cannot drop column a of composite type test_typex because other objects depend on it +DETAIL: constraint test_tblx_y_check on table test_tblx depends on column a of composite type test_typex +HINT: Use DROP ... CASCADE to drop the dependent objects too. +ALTER TYPE test_typex DROP ATTRIBUTE a CASCADE; +NOTICE: drop cascades to constraint test_tblx_y_check on table test_tblx +\d test_tblx + Table "public.test_tblx" + Column | Type | Collation | Nullable | Default +--------+------------+-----------+----------+--------- + x | integer | | | + y | test_typex | | | + +DROP TABLE test_tblx; +DROP TYPE test_typex; +-- This test isn't that interesting on its own, but the purpose is to leave +-- behind a table to test pg_upgrade with. The table has a composite type +-- column in it, and the composite type has a dropped attribute. +CREATE TYPE test_type3 AS (a int); +CREATE TABLE test_tbl3 (c) AS SELECT '(1)'::test_type3; +ALTER TYPE test_type3 DROP ATTRIBUTE a, ADD ATTRIBUTE b int; +CREATE TYPE test_type_empty AS (); +DROP TYPE test_type_empty; +-- +-- typed tables: OF / NOT OF +-- +CREATE TYPE tt_t0 AS (z inet, x int, y numeric(8,2)); +ALTER TYPE tt_t0 DROP ATTRIBUTE z; +CREATE TABLE tt0 (x int NOT NULL, y numeric(8,2)); -- OK +CREATE TABLE tt1 (x int, y bigint); -- wrong base type +CREATE TABLE tt2 (x int, y numeric(9,2)); -- wrong typmod +CREATE TABLE tt3 (y numeric(8,2), x int); -- wrong column order +CREATE TABLE tt4 (x int); -- too few columns +CREATE TABLE tt5 (x int, y numeric(8,2), z int); -- too few columns +CREATE TABLE tt6 () INHERITS (tt0); -- can't have a parent +CREATE TABLE tt7 (x int, q text, y numeric(8,2)); +ALTER TABLE tt7 DROP q; -- OK +ALTER TABLE tt0 OF tt_t0; +ALTER TABLE tt1 OF tt_t0; +ERROR: table "tt1" has different type for column "y" +ALTER TABLE tt2 OF tt_t0; +ERROR: table "tt2" has different type for column "y" +ALTER TABLE tt3 OF tt_t0; +ERROR: table has column "y" where type requires "x" +ALTER TABLE tt4 OF tt_t0; +ERROR: table is missing column "y" +ALTER TABLE tt5 OF tt_t0; +ERROR: table has extra column "z" +ALTER TABLE tt6 OF tt_t0; +ERROR: typed tables cannot inherit +ALTER TABLE tt7 OF tt_t0; +CREATE TYPE tt_t1 AS (x int, y numeric(8,2)); +ALTER TABLE tt7 OF tt_t1; -- reassign an already-typed table +ALTER TABLE tt7 NOT OF; +\d tt7 + Table "public.tt7" + Column | Type | Collation | Nullable | Default +--------+--------------+-----------+----------+--------- + x | integer | | | + y | numeric(8,2) | | | + +-- make sure we can drop a constraint on the parent but it remains on the child +CREATE TABLE test_drop_constr_parent (c text CHECK (c IS NOT NULL)); +CREATE TABLE test_drop_constr_child () INHERITS (test_drop_constr_parent); +ALTER TABLE ONLY test_drop_constr_parent DROP CONSTRAINT "test_drop_constr_parent_c_check"; +-- should fail +INSERT INTO test_drop_constr_child (c) VALUES (NULL); +ERROR: new row for relation "test_drop_constr_child" violates check constraint "test_drop_constr_parent_c_check" +DETAIL: Failing row contains (null). 
+DROP TABLE test_drop_constr_parent CASCADE; +NOTICE: drop cascades to table test_drop_constr_child +-- +-- IF EXISTS test +-- +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +NOTICE: relation "tt8" does not exist, skipping +CREATE TABLE tt8(a int); +CREATE SCHEMA alter2; +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +\d alter2.tt8 + Table "alter2.tt8" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + f1 | integer | | not null | 0 +Indexes: + "xxx" PRIMARY KEY, btree (f1) +Check constraints: + "tt8_f_check" CHECK (f1 >= 0 AND f1 <= 10) + +DROP TABLE alter2.tt8; +DROP SCHEMA alter2; +-- +-- Check conflicts between index and CHECK constraint names +-- +CREATE TABLE tt9(c integer); +ALTER TABLE tt9 ADD CHECK(c > 1); +ALTER TABLE tt9 ADD CHECK(c > 2); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 3); +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 4); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD UNIQUE(c); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key UNIQUE(c); -- fail, dup name +ERROR: relation "tt9_c_key" already exists +ALTER TABLE tt9 ADD CONSTRAINT foo UNIQUE(c); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key CHECK(c > 5); -- fail, dup name +ERROR: constraint "tt9_c_key" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key2 CHECK(c > 6); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +\d tt9 + Table "public.tt9" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c | integer | | | +Indexes: + "tt9_c_key" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key1" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key3" UNIQUE CONSTRAINT, btree (c) +Check constraints: + "foo" CHECK (c > 3) + "tt9_c_check" CHECK (c > 1) + "tt9_c_check1" CHECK (c > 2) + "tt9_c_key2" CHECK (c > 6) + +DROP TABLE tt9; +-- Check that comments on constraints and indexes are not lost at ALTER TABLE. 
+CREATE TABLE comment_test ( + id int, + positive_col int CHECK (positive_col > 0), + indexed_col int, + CONSTRAINT comment_test_pk PRIMARY KEY (id)); +CREATE INDEX comment_test_index ON comment_test(indexed_col); +COMMENT ON COLUMN comment_test.id IS 'Column ''id'' on comment_test'; +COMMENT ON INDEX comment_test_index IS 'Simple index on comment_test'; +COMMENT ON CONSTRAINT comment_test_positive_col_check ON comment_test IS 'CHECK constraint on comment_test.positive_col'; +COMMENT ON CONSTRAINT comment_test_pk ON comment_test IS 'PRIMARY KEY constraint of comment_test'; +COMMENT ON INDEX comment_test_pk IS 'Index backing the PRIMARY KEY of comment_test'; +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Change the datatype of all the columns. ALTER TABLE is optimized to not +-- rebuild an index if the new data type is binary compatible with the old +-- one. Check do a dummy ALTER TABLE that doesn't change the datatype +-- first, to test that no-op codepath, and another one that does. +ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE bigint; +-- Check that the comments are intact. +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Check compatibility for foreign keys and comments. This is done +-- separately as rebuilding the column type of the parent leads +-- to an error and would reduce the test scope. 
+CREATE TABLE comment_test_child ( + id text CONSTRAINT comment_test_child_fk REFERENCES comment_test); +CREATE INDEX comment_test_child_fk ON comment_test_child(id); +COMMENT ON COLUMN comment_test_child.id IS 'Column ''id'' on comment_test_child'; +COMMENT ON INDEX comment_test_child_fk IS 'Index backing the FOREIGN KEY of comment_test_child'; +COMMENT ON CONSTRAINT comment_test_child_fk ON comment_test_child IS 'FOREIGN KEY constraint of comment_test_child'; +-- Change column type of parent +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int USING id::integer; +ERROR: foreign key constraint "comment_test_child_fk" cannot be implemented +DETAIL: Key columns "id" and "id" are of incompatible types: text and integer. +-- Comments should be intact +SELECT col_description('comment_test_child'::regclass, 1) as comment; + comment +----------------------------------- + Column 'id' on comment_test_child +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + index | comment +-----------------------+----------------------------------------------------- + comment_test_child_fk | Index backing the FOREIGN KEY of comment_test_child +(1 row) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + constraint | comment +-----------------------+---------------------------------------------- + comment_test_child_fk | FOREIGN KEY constraint of comment_test_child +(1 row) + +-- Check that we map relation oids to filenodes and back correctly. Only +-- display bad mappings so the test output doesn't change all the time. A +-- filenode function call can return NULL for a relation dropped concurrently +-- with the call's surrounding query, so ignore a NULL mapped_oid for +-- relations that no longer exist after all calls finish. +CREATE TEMP TABLE filenode_mapping AS +SELECT + oid, mapped_oid, reltablespace, relfilenode, relname +FROM pg_class, + pg_filenode_relation(reltablespace, pg_relation_filenode(oid)) AS mapped_oid +WHERE relkind IN ('r', 'i', 'S', 't', 'm') AND mapped_oid IS DISTINCT FROM oid; +SELECT m.* FROM filenode_mapping m LEFT JOIN pg_class c ON c.oid = m.oid +WHERE c.oid IS NOT NULL OR m.mapped_oid IS NOT NULL; + oid | mapped_oid | reltablespace | relfilenode | relname +-----+------------+---------------+-------------+--------- +(0 rows) + +-- Checks on creating and manipulation of user defined relations in +-- pg_catalog. +-- +-- XXX: It would be useful to add checks around trying to manipulate +-- catalog tables, but that might have ugly consequences when run +-- against an existing server with allow_system_table_mods = on. +SHOW allow_system_table_mods; + allow_system_table_mods +------------------------- + off +(1 row) + +-- disallowed because of search_path issues with pg_dump +CREATE TABLE pg_catalog.new_system_table(); +ERROR: permission denied to create "pg_catalog.new_system_table" +DETAIL: System catalog modifications are currently disallowed. 
+-- instead create in public first, move to catalog +CREATE TABLE new_system_table(id serial primary key, othercol text); +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table SET SCHEMA public; +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +-- will be ignored -- already there: +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table RENAME TO old_system_table; +CREATE INDEX old_system_table__othercol ON old_system_table (othercol); +INSERT INTO old_system_table(othercol) VALUES ('somedata'), ('otherdata'); +UPDATE old_system_table SET id = -id; +DELETE FROM old_system_table WHERE othercol = 'somedata'; +TRUNCATE old_system_table; +ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey; +ALTER TABLE old_system_table DROP COLUMN othercol; +DROP TABLE old_system_table; +-- set logged +CREATE UNLOGGED TABLE unlogged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of an unlogged table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + unlogged1 | r | u + unlogged1_f1_seq | S | p + unlogged1_pkey | i | u +(3 rows) + +CREATE UNLOGGED TABLE unlogged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged1); -- foreign key +CREATE UNLOGGED TABLE unlogged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged3); -- self-referencing foreign key +ALTER TABLE unlogged3 SET LOGGED; -- skip self-referencing foreign key +ALTER TABLE unlogged2 SET LOGGED; -- fails because a foreign key to an unlogged table exists +ERROR: could not change table "unlogged2" to logged because it references unlogged table "unlogged1" +ALTER TABLE unlogged1 SET LOGGED; +-- check relpersistence of an unlogged table after changing to permanent +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + unlogged1 | r | p + unlogged1_f1_seq | S | p + unlogged1_pkey | i | p +(3 rows) + +ALTER TABLE unlogged1 SET LOGGED; -- silently do nothing +DROP TABLE unlogged3; +DROP TABLE unlogged2; +DROP TABLE unlogged1; +-- set unlogged +CREATE TABLE logged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of a permanent table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON 
ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | p + logged1_f1_seq | S | p + logged1_pkey | i | p +(3 rows) + +CREATE TABLE logged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged1); -- foreign key +CREATE TABLE logged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged3); -- self-referencing foreign key +ALTER TABLE logged1 SET UNLOGGED; -- fails because a foreign key from a permanent table exists +ERROR: could not change table "logged1" to unlogged because it references logged table "logged2" +ALTER TABLE logged3 SET UNLOGGED; -- skip self-referencing foreign key +ALTER TABLE logged2 SET UNLOGGED; +ALTER TABLE logged1 SET UNLOGGED; +-- check relpersistence of a permanent table after changing to unlogged +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | u + logged1_f1_seq | S | p + logged1_pkey | i | u +(3 rows) + +ALTER TABLE logged1 SET UNLOGGED; -- silently do nothing +DROP TABLE logged3; +DROP TABLE logged2; +DROP TABLE logged1; +-- test ADD COLUMN IF NOT EXISTS +CREATE TABLE test_add_column(c1 integer); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +ALTER TABLE ONLY test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +ALTER TABLE ONLY test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer, -- fail because c2 already exists + ADD COLUMN c3 integer; +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | 
integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN c3 integer; -- fail because c3 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer; -- skipping because c3 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer, -- skipping because c3 already exists + ADD COLUMN c4 integer; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | | + c4 | integer | | | + +DROP TABLE test_add_column; +-- unsupported constraint types for partitioned tables +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, (a+b+1)); +ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); + ^ +-- cannot drop column that is part of the partition key +ALTER TABLE partitioned DROP COLUMN a; +ERROR: cannot drop column named in partition key +ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); +ERROR: cannot alter type of column named in partition key +ALTER TABLE partitioned DROP COLUMN b; +ERROR: cannot drop column referenced in partition key expression +ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); +ERROR: cannot alter type of column referenced in partition key expression +-- partitioned table cannot participate in regular inheritance +CREATE TABLE nonpartitioned ( + a int, + b int +); +ALTER TABLE partitioned INHERIT nonpartitioned; +ERROR: cannot change inheritance of partitioned table +ALTER TABLE nonpartitioned INHERIT partitioned; +ERROR: cannot inherit from partitioned table "partitioned" +-- cannot add NO INHERIT constraint to partitioned tables +ALTER TABLE partitioned ADD CONSTRAINT chk_a CHECK (a > 0) NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +DROP TABLE partitioned, nonpartitioned; +-- +-- ATTACH PARTITION +-- +-- check that target table is partitioned +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part (like unparted); +ALTER TABLE unparted ATTACH PARTITION fail_part FOR VALUES IN ('a'); +ERROR: table "unparted" is not partitioned +DROP TABLE unparted, fail_part; +-- check that partition bound is compatible +CREATE TABLE list_parted ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +) PARTITION BY LIST (a); 
+CREATE TABLE fail_part (LIKE list_parted); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) TO (10); +ERROR: invalid bound specification for a list partition +LINE 1: ...list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) T... + ^ +DROP TABLE fail_part; +-- check that the table being attached exists +ALTER TABLE list_parted ATTACH PARTITION nonexistant FOR VALUES IN (1); +ERROR: relation "nonexistant" does not exist +-- check ownership of the source table +CREATE ROLE regress_test_me; +CREATE ROLE regress_test_not_me; +CREATE TABLE not_owned_by_me (LIKE list_parted); +ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; +SET SESSION AUTHORIZATION regress_test_me; +CREATE TABLE owned_by_me ( + a int +) PARTITION BY LIST (a); +ALTER TABLE owned_by_me ATTACH PARTITION not_owned_by_me FOR VALUES IN (1); +ERROR: must be owner of table not_owned_by_me +RESET SESSION AUTHORIZATION; +DROP TABLE owned_by_me, not_owned_by_me; +DROP ROLE regress_test_not_me; +DROP ROLE regress_test_me; +-- check that the table being attached is not part of regular inheritance +CREATE TABLE parent (LIKE list_parted); +CREATE TABLE child () INHERITS (parent); +ALTER TABLE list_parted ATTACH PARTITION child FOR VALUES IN (1); +ERROR: cannot attach inheritance child as partition +ALTER TABLE list_parted ATTACH PARTITION parent FOR VALUES IN (1); +ERROR: cannot attach inheritance parent as partition +DROP TABLE parent CASCADE; +NOTICE: drop cascades to table child +-- check any TEMP-ness +CREATE TEMP TABLE temp_parted (a int) PARTITION BY LIST (a); +CREATE TABLE perm_part (a int); +ALTER TABLE temp_parted ATTACH PARTITION perm_part FOR VALUES IN (1); +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted, perm_part; +-- check that the table being attached is not a typed table +CREATE TYPE mytype AS (a int); +CREATE TABLE fail_part OF mytype; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: cannot attach a typed table as partition +DROP TYPE mytype CASCADE; +NOTICE: drop cascades to table fail_part +-- check that the table being attached has only columns present in the parent +CREATE TABLE fail_part (like list_parted, c int); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: table "fail_part" contains column "c" not found in parent "list_parted" +DETAIL: The new partition may contain only the columns present in parent. 
+DROP TABLE fail_part; +-- check that the table being attached has every column of the parent +CREATE TABLE fail_part (a int NOT NULL); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing column "b" +DROP TABLE fail_part; +-- check that columns match in type, collation and NOT NULL status +CREATE TABLE fail_part ( + b char(3), + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different type for column "b" +ALTER TABLE fail_part ALTER b TYPE char (2) COLLATE "POSIX"; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different collation for column "b" +DROP TABLE fail_part; +-- check that the table being attached has all constraints of the parent +CREATE TABLE fail_part ( + b char(2) COLLATE "C", + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing constraint "check_a" +-- check that the constraint matches in definition with parent's constraint +ALTER TABLE fail_part ADD CONSTRAINT check_a CHECK (a >= 0); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different definition for check constraint "check_a" +DROP TABLE fail_part; +-- check the attributes and constraints after partition is attached +CREATE TABLE part_1 ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +); +ALTER TABLE list_parted ATTACH PARTITION part_1 FOR VALUES IN (1); +-- attislocal and conislocal are always false for merged attributes and constraints respectively. +SELECT attislocal, attinhcount FROM pg_attribute WHERE attrelid = 'part_1'::regclass AND attnum > 0; + attislocal | attinhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::regclass AND conname = 'check_a'; + conislocal | coninhcount +------------+------------- + f | 1 +(1 row) + +-- check that the new partition won't overlap with an existing partition +CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; +ERROR: partition "fail_def_part" conflicts with existing default partition "def_part" +-- check validation when attaching list partitions +CREATE TABLE list_parted2 ( + a int, + b char +) PARTITION BY LIST (a); +-- check that violating rows are correctly reported +CREATE TABLE part_2 (LIKE list_parted2); +INSERT INTO part_2 VALUES (3, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part_2; +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES 
(11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +ERROR: updated partition constraint for default partition would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part_3_4 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IN (3)) +); +-- however, if a list partition does not accept nulls, there should be +-- an explicit NOT NULL constraint on the partition key column for the +-- validation scan to be skipped; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- adding a NOT NULL constraint will cause the scan to be skipped +ALTER TABLE list_parted2 DETACH PARTITION part_3_4; +ALTER TABLE part_3_4 ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +INFO: partition constraint for table "part_3_4" is implied by existing constraints +-- check if default partition scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints +-- check validation when attaching range partitions +CREATE TABLE range_parted ( + a int, + b int +) PARTITION BY RANGE (a, b); +-- check that violating rows are correctly reported +CREATE TABLE part1 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 1 AND b <= 10) +); +INSERT INTO part1 VALUES (1, 10); +-- Remember the TO bound is exclusive +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part1; +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part2 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 10 AND b < 18) +); +ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); +INFO: partition constraint for table "part2" is implied by existing constraints +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; +ERROR: partition "partr_def2" conflicts with existing default partition "partr_def1" +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); +ERROR: updated partition constraint for default partition would be violated by some row +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE part_5 ( + LIKE list_parted2 +) PARTITION BY LIST (b); +-- check 
that violating rows are correctly reported +CREATE TABLE part_5_a PARTITION OF part_5 FOR VALUES IN ('a'); +INSERT INTO part_5_a (a, b) VALUES (6, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +ERROR: partition constraint is violated by some row +-- delete the faulting row and also add a constraint to skip the scan +DELETE FROM part_5_a WHERE a NOT IN (3); +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 5); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +INFO: partition constraint for table "part_5" is implied by existing constraints +ALTER TABLE list_parted2 DETACH PARTITION part_5; +ALTER TABLE part_5 DROP CONSTRAINT check_a; +-- scan should again be skipped, even though NOT NULL is now a column property +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IN (5)), ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +INFO: partition constraint for table "part_5" is implied by existing constraints +-- Check the case where attnos of the partitioning columns in the table being +-- attached differs from the parent. It should not affect the constraint- +-- checking logic that allows to skip the scan. +CREATE TABLE part_6 ( + c int, + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 6) +); +ALTER TABLE part_6 DROP c; +ALTER TABLE list_parted2 ATTACH PARTITION part_6 FOR VALUES IN (6); +INFO: partition constraint for table "part_6" is implied by existing constraints +-- Similar to above, but the table being attached is a partitioned table +-- whose partition has still different attnos for the root partitioning +-- columns. +CREATE TABLE part_7 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +) PARTITION BY LIST (b); +CREATE TABLE part_7_a_null ( + c int, + d int, + e int, + LIKE list_parted2, -- 'a' will have attnum = 4 + CONSTRAINT check_b CHECK (b IS NULL OR b = 'a'), + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +); +ALTER TABLE part_7_a_null DROP c, DROP d, DROP e; +ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null); +INFO: partition constraint for table "part_7_a_null" is implied by existing constraints +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +INFO: partition constraint for table "part_7" is implied by existing constraints +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints +-- Same example, but check this time that the constraint correctly detects +-- violating rows +ALTER TABLE list_parted2 DETACH PARTITION part_7; +ALTER TABLE part_7 DROP CONSTRAINT check_a; -- thusly, scan won't be skipped +INSERT INTO part_7 (a, b) VALUES (8, null), (9, 'a'); +SELECT tableoid::regclass, a, b FROM part_7 order by a; + tableoid | a | b +---------------+---+--- + part_7_a_null | 8 | + part_7_a_null | 9 | a +(2 rows) + +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints +ERROR: partition constraint is violated by some row +-- check that leaf partitions of default partition are scanned when +-- attaching a partitioned table. 
+ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +ERROR: updated partition constraint for default partition would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +-- check that the table being attached is not already a partition +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: "part_2" is already a partition +-- check that circular inheritance is not allowed +ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); +ERROR: circular inheritance not allowed +DETAIL: "part_5" is already a child of "list_parted2". +ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); +ERROR: circular inheritance not allowed +DETAIL: "list_parted2" is already a child of "list_parted2". +-- If a partitioned table being created or an existing table being attached +-- as a partition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. +CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! +CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +INFO: updated partition constraint for default partition "quuux_default1" is implied by existing constraints +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); +INFO: updated partition constraint for default partition "quuux_default1" is implied by existing constraints +DROP TABLE quuux; +-- check validation when attaching hash partitions +-- Use hand-rolled hash functions and operator class to get predictable result +-- on different machines. part_test_int4_ops is defined in insert.sql.
+-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a part_test_int4_ops); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attached has a valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; +-- +-- DETACH PARTITION +-- +-- check that the table is partitioned at all +CREATE TABLE regular_table (a int); +ALTER TABLE regular_table DETACH PARTITION any_name; +ERROR: table "regular_table" is not partitioned +DROP TABLE regular_table; +-- check that the partition being detached exists at all +ALTER TABLE list_parted2 DETACH PARTITION part_4; +ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist +-- check that the partition being detached is actually a partition of the parent +CREATE TABLE not_a_part (a int); +ALTER TABLE list_parted2 DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "list_parted2" +ALTER TABLE list_parted2 DETACH PARTITION part_1; +ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; +-- check that, after being detached, attinhcount/coninhcount is dropped to 0 and +-- attislocal/conislocal is set to true +ALTER TABLE list_parted2 DETACH
PARTITION part_3_4; +SELECT attinhcount, attislocal FROM pg_attribute WHERE attrelid = 'part_3_4'::regclass AND attnum > 0; + attinhcount | attislocal +-------------+------------ + 0 | t + 0 | t +(2 rows) + +SELECT coninhcount, conislocal FROM pg_constraint WHERE conrelid = 'part_3_4'::regclass AND conname = 'check_a'; + coninhcount | conislocal +-------------+------------ + 0 | t +(1 row) + +DROP TABLE part_3_4; +-- check that a detached partition is not dropped on dropping a partitioned table +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE(a); +CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100); +ALTER TABLE range_parted2 DETACH PARTITION part_rp; +DROP TABLE range_parted2; +SELECT * from part_rp; + a +--- +(0 rows) + +DROP TABLE part_rp; +-- Check ALTER TABLE commands for partitioned tables and partitions +-- cannot add/drop column to/from *only* the parent +ALTER TABLE ONLY list_parted2 ADD COLUMN c int; +ERROR: column must be added to child tables too +ALTER TABLE ONLY list_parted2 DROP COLUMN b; +ERROR: cannot drop column from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +-- cannot add a column to partition or drop an inherited one +ALTER TABLE part_2 ADD COLUMN c text; +ERROR: cannot add column to a partition +ALTER TABLE part_2 DROP COLUMN b; +ERROR: cannot drop inherited column "b" +-- Nor rename, alter type +ALTER TABLE part_2 RENAME COLUMN b to c; +ERROR: cannot rename inherited column "b" +ALTER TABLE part_2 ALTER COLUMN b TYPE text; +ERROR: cannot alter inherited column "b" +-- cannot add/drop NOT NULL or check constraints to *only* the parent, when +-- partitions exist +ALTER TABLE ONLY list_parted2 ALTER b SET NOT NULL; +ERROR: constraint must be added to child tables too +DETAIL: Column "b" of relation "part_2" is not already NOT NULL. +HINT: Do not specify the ONLY keyword. +ALTER TABLE ONLY list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ERROR: constraint must be added to child tables too +ALTER TABLE list_parted2 ALTER b SET NOT NULL; +ALTER TABLE ONLY list_parted2 ALTER b DROP NOT NULL; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +ALTER TABLE list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ALTER TABLE ONLY list_parted2 DROP CONSTRAINT check_b; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. 
+-- It's alright though, if no partitions are yet created +CREATE TABLE parted_no_parts (a int) PARTITION BY LIST (a); +ALTER TABLE ONLY parted_no_parts ALTER a SET NOT NULL; +ALTER TABLE ONLY parted_no_parts ADD CONSTRAINT check_a CHECK (a > 0); +ALTER TABLE ONLY parted_no_parts ALTER a DROP NOT NULL; +ALTER TABLE ONLY parted_no_parts DROP CONSTRAINT check_a; +DROP TABLE parted_no_parts; +-- cannot drop inherited NOT NULL or check constraints from partition +ALTER TABLE list_parted2 ALTER b SET NOT NULL, ADD CONSTRAINT check_a2 CHECK (a > 0); +ALTER TABLE part_2 ALTER b DROP NOT NULL; +ERROR: column "b" is marked NOT NULL in parent table +ALTER TABLE part_2 DROP CONSTRAINT check_a2; +ERROR: cannot drop inherited constraint "check_a2" of relation "part_2" +-- Doesn't make sense to add NO INHERIT constraints on partitioned tables +ALTER TABLE list_parted2 add constraint check_b2 check (b <> 'zz') NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "list_parted2" +-- check that a partition cannot participate in regular inheritance +CREATE TABLE inh_test () INHERITS (part_2); +ERROR: cannot inherit from partition "part_2" +CREATE TABLE inh_test (LIKE part_2); +ALTER TABLE inh_test INHERIT part_2; +ERROR: cannot inherit from a partition +ALTER TABLE part_2 INHERIT inh_test; +ERROR: cannot change inheritance of a partition +-- cannot drop or alter type of partition key columns of lower level +-- partitioned tables; for example, part_5, which is list_parted2's +-- partition, is partitioned on b; +ALTER TABLE list_parted2 DROP COLUMN b; +ERROR: cannot drop column named in partition key +ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; +ERROR: cannot alter type of column named in partition key +-- dropping non-partition key columns should be allowed on the parent table. 
+ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + a +--- +(0 rows) + +-- cleanup +DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; +DROP TABLE hash_parted; +-- more tests for certain multi-level partitioning scenarios +create table p (a int, b int) partition by range (a, b); +create table p1 (b int, a int not null) partition by range (b); +create table p11 (like p1); +alter table p11 drop a; +alter table p11 add a int; +alter table p11 drop a; +alter table p11 add a int not null; +-- attnum for key attribute 'a' is different in p, p1, and p11 +select attrelid::regclass, attname, attnum +from pg_attribute +where attname = 'a' + and (attrelid = 'p'::regclass + or attrelid = 'p1'::regclass + or attrelid = 'p11'::regclass) +order by attrelid::regclass::text; + attrelid | attname | attnum +----------+---------+-------- + p | a | 1 + p1 | a | 2 + p11 | a | 4 +(3 rows) + +alter table p1 attach partition p11 for values from (2) to (5); +insert into p1 (a, b) values (2, 3); +-- check that partition validation scan correctly detects violating rows +alter table p attach partition p1 for values from (1, 2) to (1, 10); +ERROR: partition constraint is violated by some row +-- cleanup +drop table p; +drop table p1; +-- validate constraint on partitioned tables should only scan leaf partitions +create table parted_validate_test (a int) partition by list (a); +create table parted_validate_test_1 partition of parted_validate_test for values in (0, 1); +alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; +alter table parted_validate_test validate constraint parted_validate_test_chka; +drop table parted_validate_test; +-- test alter column options +CREATE TABLE attmp(i integer); +INSERT INTO attmp VALUES (1); +ALTER TABLE attmp ALTER COLUMN i SET (n_distinct = 1, n_distinct_inherited = 2); +ALTER TABLE attmp ALTER COLUMN i RESET (n_distinct_inherited); +ANALYZE attmp; +DROP TABLE attmp; +DROP USER regress_alter_table_user1; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; +-- check combinations of temporary and permanent relations when attaching +-- partitions. 
+create table perm_part_parent (a int) partition by list (a); +create temp table temp_part_parent (a int) partition by list (a); +create table perm_part_child (a int); +create temp table temp_part_child (a int); +alter table temp_part_parent attach partition perm_part_child default; -- error +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_part_parent" +alter table perm_part_parent attach partition temp_part_child default; -- error +ERROR: cannot attach a temporary relation as partition of permanent relation "perm_part_parent" +alter table temp_part_parent attach partition temp_part_child default; -- ok +drop table perm_part_parent cascade; +drop table temp_part_parent cascade; +-- check that attaching partitions to a table while it is being used is +-- prevented +create table tab_part_attach (a int) partition by list (a); +create or replace function func_part_attach() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_attach_1 (a int)'; + execute 'alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)'; + return null; + end $$; +create trigger trig_part_attach before insert on tab_part_attach + for each statement execute procedure func_part_attach(); +insert into tab_part_attach values (1); +ERROR: cannot ALTER TABLE "tab_part_attach" because it is being used by active queries in this session +CONTEXT: SQL statement "alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)" +PL/pgSQL function func_part_attach() line 4 at EXECUTE +drop table tab_part_attach; +drop function func_part_attach(); +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/expected/cluster_1.out b/src/test/regress/expected/cluster_1.out new file mode 100644 index 0000000000..a707ea30cb --- /dev/null +++ b/src/test/regress/expected/cluster_1.out @@ -0,0 +1,475 @@ +-- +-- CLUSTER +-- +CREATE TABLE clstr_tst_s (rf_a SERIAL PRIMARY KEY, + b INT); +CREATE TABLE clstr_tst (a SERIAL PRIMARY KEY, + b INT, + c TEXT, + d TEXT, + CONSTRAINT clstr_tst_con FOREIGN KEY (b) REFERENCES clstr_tst_s); +CREATE INDEX clstr_tst_b ON clstr_tst (b); +CREATE INDEX clstr_tst_c ON clstr_tst (c); +CREATE INDEX clstr_tst_c_b ON clstr_tst (c,b); +CREATE INDEX clstr_tst_b_c ON clstr_tst (b,c); +INSERT INTO clstr_tst_s (b) VALUES (0); +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +INSERT INTO clstr_tst_s (b) SELECT b FROM clstr_tst_s; +CREATE TABLE 
clstr_tst_inh () INHERITS (clstr_tst); +INSERT INTO clstr_tst (b, c) VALUES (11, 'once'); +INSERT INTO clstr_tst (b, c) VALUES (10, 'diez'); +INSERT INTO clstr_tst (b, c) VALUES (31, 'treinta y uno'); +INSERT INTO clstr_tst (b, c) VALUES (22, 'veintidos'); +INSERT INTO clstr_tst (b, c) VALUES (3, 'tres'); +INSERT INTO clstr_tst (b, c) VALUES (20, 'veinte'); +INSERT INTO clstr_tst (b, c) VALUES (23, 'veintitres'); +INSERT INTO clstr_tst (b, c) VALUES (21, 'veintiuno'); +INSERT INTO clstr_tst (b, c) VALUES (4, 'cuatro'); +INSERT INTO clstr_tst (b, c) VALUES (14, 'catorce'); +INSERT INTO clstr_tst (b, c) VALUES (2, 'dos'); +INSERT INTO clstr_tst (b, c) VALUES (18, 'dieciocho'); +INSERT INTO clstr_tst (b, c) VALUES (27, 'veintisiete'); +INSERT INTO clstr_tst (b, c) VALUES (25, 'veinticinco'); +INSERT INTO clstr_tst (b, c) VALUES (13, 'trece'); +INSERT INTO clstr_tst (b, c) VALUES (28, 'veintiocho'); +INSERT INTO clstr_tst (b, c) VALUES (32, 'treinta y dos'); +INSERT INTO clstr_tst (b, c) VALUES (5, 'cinco'); +INSERT INTO clstr_tst (b, c) VALUES (29, 'veintinueve'); +INSERT INTO clstr_tst (b, c) VALUES (1, 'uno'); +INSERT INTO clstr_tst (b, c) VALUES (24, 'veinticuatro'); +INSERT INTO clstr_tst (b, c) VALUES (30, 'treinta'); +INSERT INTO clstr_tst (b, c) VALUES (12, 'doce'); +INSERT INTO clstr_tst (b, c) VALUES (17, 'diecisiete'); +INSERT INTO clstr_tst (b, c) VALUES (9, 'nueve'); +INSERT INTO clstr_tst (b, c) VALUES (19, 'diecinueve'); +INSERT INTO clstr_tst (b, c) VALUES (26, 'veintiseis'); +INSERT INTO clstr_tst (b, c) VALUES (15, 'quince'); +INSERT INTO clstr_tst (b, c) VALUES (7, 'siete'); +INSERT INTO clstr_tst (b, c) VALUES (16, 'dieciseis'); +INSERT INTO clstr_tst (b, c) VALUES (8, 'ocho'); +-- This entry is needed to test that TOASTED values are copied correctly. 
+INSERT INTO clstr_tst (b, c, d) VALUES (6, 'seis', repeat('xyzzy', 100000)); +CLUSTER clstr_tst_c ON clstr_tst; +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst; + a | b | c | substring | length +----+----+---------------+--------------------------------+-------- + 10 | 14 | catorce | | + 18 | 5 | cinco | | + 9 | 4 | cuatro | | + 26 | 19 | diecinueve | | + 12 | 18 | dieciocho | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 2 | 10 | diez | | + 23 | 12 | doce | | + 11 | 2 | dos | | + 25 | 9 | nueve | | + 31 | 8 | ocho | | + 1 | 11 | once | | + 28 | 15 | quince | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 15 | 13 | trece | | + 22 | 30 | treinta | | + 17 | 32 | treinta y dos | | + 3 | 31 | treinta y uno | | + 5 | 3 | tres | | + 20 | 1 | uno | | + 6 | 20 | veinte | | + 14 | 25 | veinticinco | | + 21 | 24 | veinticuatro | | + 4 | 22 | veintidos | | + 19 | 29 | veintinueve | | + 16 | 28 | veintiocho | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | +(32 rows) + +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY a; + a | b | c | substring | length +----+----+---------------+--------------------------------+-------- + 1 | 11 | once | | + 2 | 10 | diez | | + 3 | 31 | treinta y uno | | + 4 | 22 | veintidos | | + 5 | 3 | tres | | + 6 | 20 | veinte | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | + 9 | 4 | cuatro | | + 10 | 14 | catorce | | + 11 | 2 | dos | | + 12 | 18 | dieciocho | | + 13 | 27 | veintisiete | | + 14 | 25 | veinticinco | | + 15 | 13 | trece | | + 16 | 28 | veintiocho | | + 17 | 32 | treinta y dos | | + 18 | 5 | cinco | | + 19 | 29 | veintinueve | | + 20 | 1 | uno | | + 21 | 24 | veinticuatro | | + 22 | 30 | treinta | | + 23 | 12 | doce | | + 24 | 17 | diecisiete | | + 25 | 9 | nueve | | + 26 | 19 | diecinueve | | + 27 | 26 | veintiseis | | + 28 | 15 | quince | | + 29 | 7 | siete | | + 30 | 16 | dieciseis | | + 31 | 8 | ocho | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 +(32 rows) + +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY b; + a | b | c | substring | length +----+----+---------------+--------------------------------+-------- + 20 | 1 | uno | | + 11 | 2 | dos | | + 5 | 3 | tres | | + 9 | 4 | cuatro | | + 18 | 5 | cinco | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 31 | 8 | ocho | | + 25 | 9 | nueve | | + 2 | 10 | diez | | + 1 | 11 | once | | + 23 | 12 | doce | | + 15 | 13 | trece | | + 10 | 14 | catorce | | + 28 | 15 | quince | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 12 | 18 | dieciocho | | + 26 | 19 | diecinueve | | + 6 | 20 | veinte | | + 8 | 21 | veintiuno | | + 4 | 22 | veintidos | | + 7 | 23 | veintitres | | + 21 | 24 | veinticuatro | | + 14 | 25 | veinticinco | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 16 | 28 | veintiocho | | + 19 | 29 | veintinueve | | + 22 | 30 | treinta | | + 3 | 31 | treinta y uno | | + 17 | 32 | treinta y dos | | +(32 rows) + +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY c; + a | b | c | substring | length +----+----+---------------+--------------------------------+-------- + 10 | 14 | catorce | | + 18 | 5 | cinco | | + 9 | 4 | cuatro | | + 26 | 19 | diecinueve | | + 12 | 18 | dieciocho | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 2 | 10 | diez | | + 23 | 12 | doce | | + 11 | 2 | dos | | + 25 | 9 | nueve | | + 31 | 8 | ocho | | + 1 | 11 | once | | + 28 | 15 | 
quince | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 15 | 13 | trece | | + 22 | 30 | treinta | | + 17 | 32 | treinta y dos | | + 3 | 31 | treinta y uno | | + 5 | 3 | tres | | + 20 | 1 | uno | | + 6 | 20 | veinte | | + 14 | 25 | veinticinco | | + 21 | 24 | veinticuatro | | + 4 | 22 | veintidos | | + 19 | 29 | veintinueve | | + 16 | 28 | veintiocho | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | +(32 rows) + +-- Verify that inheritance link still works +INSERT INTO clstr_tst_inh VALUES (0, 100, 'in child table'); +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst; + a | b | c | substring | length +----+-----+----------------+--------------------------------+-------- + 10 | 14 | catorce | | + 18 | 5 | cinco | | + 9 | 4 | cuatro | | + 26 | 19 | diecinueve | | + 12 | 18 | dieciocho | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 2 | 10 | diez | | + 23 | 12 | doce | | + 11 | 2 | dos | | + 25 | 9 | nueve | | + 31 | 8 | ocho | | + 1 | 11 | once | | + 28 | 15 | quince | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 15 | 13 | trece | | + 22 | 30 | treinta | | + 17 | 32 | treinta y dos | | + 3 | 31 | treinta y uno | | + 5 | 3 | tres | | + 20 | 1 | uno | | + 6 | 20 | veinte | | + 14 | 25 | veinticinco | | + 21 | 24 | veinticuatro | | + 4 | 22 | veintidos | | + 19 | 29 | veintinueve | | + 16 | 28 | veintiocho | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | + 0 | 100 | in child table | | +(33 rows) + +-- Verify that foreign key link still works +INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail'); +ERROR: insert or update on table "clstr_tst" violates foreign key constraint "clstr_tst_con" +DETAIL: Key (b)=(1111) is not present in table "clstr_tst_s". 
+SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass +ORDER BY 1; + conname +---------------- + clstr_tst_con + clstr_tst_pkey +(2 rows) + +SELECT relname, relkind, + EXISTS(SELECT 1 FROM pg_class WHERE oid = c.reltoastrelid) AS hastoast +FROM pg_class c WHERE relname LIKE 'clstr_tst%' ORDER BY relname; + relname | relkind | hastoast +----------------------+---------+---------- + clstr_tst | r | f + clstr_tst_a_seq | S | f + clstr_tst_b | i | f + clstr_tst_b_c | i | f + clstr_tst_c | i | f + clstr_tst_c_b | i | f + clstr_tst_inh | r | f + clstr_tst_pkey | i | f + clstr_tst_s | r | f + clstr_tst_s_pkey | i | f + clstr_tst_s_rf_a_seq | S | f +(11 rows) + +-- Verify that indisclustered is correctly set +SELECT pg_class.relname FROM pg_index, pg_class, pg_class AS pg_class_2 +WHERE pg_class.oid=indexrelid + AND indrelid=pg_class_2.oid + AND pg_class_2.relname = 'clstr_tst' + AND indisclustered; + relname +------------- + clstr_tst_c +(1 row) + +-- Try changing indisclustered +ALTER TABLE clstr_tst CLUSTER ON clstr_tst_b_c; +SELECT pg_class.relname FROM pg_index, pg_class, pg_class AS pg_class_2 +WHERE pg_class.oid=indexrelid + AND indrelid=pg_class_2.oid + AND pg_class_2.relname = 'clstr_tst' + AND indisclustered; + relname +--------------- + clstr_tst_b_c +(1 row) + +-- Try turning off all clustering +ALTER TABLE clstr_tst SET WITHOUT CLUSTER; +SELECT pg_class.relname FROM pg_index, pg_class, pg_class AS pg_class_2 +WHERE pg_class.oid=indexrelid + AND indrelid=pg_class_2.oid + AND pg_class_2.relname = 'clstr_tst' + AND indisclustered; + relname +--------- +(0 rows) + +-- Verify that clustering all tables does in fact cluster the right ones +CREATE USER regress_clstr_user; +CREATE TABLE clstr_1 (a INT PRIMARY KEY); +CREATE TABLE clstr_2 (a INT PRIMARY KEY); +CREATE TABLE clstr_3 (a INT PRIMARY KEY); +ALTER TABLE clstr_1 OWNER TO regress_clstr_user; +ALTER TABLE clstr_3 OWNER TO regress_clstr_user; +GRANT SELECT ON clstr_2 TO regress_clstr_user; +INSERT INTO clstr_1 VALUES (2); +INSERT INTO clstr_1 VALUES (1); +INSERT INTO clstr_2 VALUES (2); +INSERT INTO clstr_2 VALUES (1); +INSERT INTO clstr_3 VALUES (2); +INSERT INTO clstr_3 VALUES (1); +-- "CLUSTER <tablename>" on a table that hasn't been clustered +CLUSTER clstr_2; +ERROR: there is no previously clustered index for table "clstr_2" +CLUSTER clstr_1_pkey ON clstr_1; +CLUSTER clstr_2 USING clstr_2_pkey; +SELECT * FROM clstr_1 UNION ALL + SELECT * FROM clstr_2 UNION ALL + SELECT * FROM clstr_3; + a +--- + 1 + 2 + 1 + 2 + 2 + 1 +(6 rows) + +-- revert to the original state +DELETE FROM clstr_1; +DELETE FROM clstr_2; +DELETE FROM clstr_3; +INSERT INTO clstr_1 VALUES (2); +INSERT INTO clstr_1 VALUES (1); +INSERT INTO clstr_2 VALUES (2); +INSERT INTO clstr_2 VALUES (1); +INSERT INTO clstr_3 VALUES (2); +INSERT INTO clstr_3 VALUES (1); +-- this user can only cluster clstr_1 and clstr_3, but the latter +-- has not been clustered +SET SESSION AUTHORIZATION regress_clstr_user; +CLUSTER; +SELECT * FROM clstr_1 UNION ALL + SELECT * FROM clstr_2 UNION ALL + SELECT * FROM clstr_3; + a +--- + 1 + 2 + 2 + 1 + 2 + 1 +(6 rows) + +-- cluster a single table using the indisclustered bit previously set +DELETE FROM clstr_1; +INSERT INTO clstr_1 VALUES (2); +INSERT INTO clstr_1 VALUES (1); +CLUSTER clstr_1; +SELECT * FROM clstr_1; + a +--- + 1 + 2 +(2 rows) + +-- Test MVCC-safety of cluster. There isn't much we can do to verify the +-- results with a single backend...
+CREATE TABLE clustertest (key int PRIMARY KEY); +INSERT INTO clustertest VALUES (10); +INSERT INTO clustertest VALUES (20); +INSERT INTO clustertest VALUES (30); +INSERT INTO clustertest VALUES (40); +INSERT INTO clustertest VALUES (50); +-- Use a transaction so that updates are not committed when CLUSTER sees 'em +BEGIN; +-- Test update where the old row version is found first in the scan +UPDATE clustertest SET key = 100 WHERE key = 10; +-- Test update where the new row version is found first in the scan +UPDATE clustertest SET key = 35 WHERE key = 40; +-- Test longer update chain +UPDATE clustertest SET key = 60 WHERE key = 50; +UPDATE clustertest SET key = 70 WHERE key = 60; +UPDATE clustertest SET key = 80 WHERE key = 70; +SELECT * FROM clustertest; + key +----- + 20 + 30 + 100 + 35 + 80 +(5 rows) + +CLUSTER clustertest_pkey ON clustertest; +SELECT * FROM clustertest; + key +----- + 20 + 30 + 35 + 80 + 100 +(5 rows) + +COMMIT; +SELECT * FROM clustertest; + key +----- + 20 + 30 + 35 + 80 + 100 +(5 rows) + +-- check that temp tables can be clustered +create temp table clstr_temp (col1 int primary key, col2 text); +insert into clstr_temp values (2, 'two'), (1, 'one'); +cluster clstr_temp using clstr_temp_pkey; +select * from clstr_temp; + col1 | col2 +------+------ + 1 | one + 2 | two +(2 rows) + +drop table clstr_temp; +RESET SESSION AUTHORIZATION; +-- Check that partitioned tables cannot be clustered +CREATE TABLE clstrpart (a int) PARTITION BY RANGE (a); +CREATE INDEX clstrpart_idx ON clstrpart (a); +ALTER TABLE clstrpart CLUSTER ON clstrpart_idx; +ERROR: cannot mark index clustered in partitioned table +CLUSTER clstrpart USING clstrpart_idx; +ERROR: cannot cluster a partitioned table +DROP TABLE clstrpart; +-- Test CLUSTER with external tuplesorting +create table clstr_4 as select * from tenk1; +create index cluster_sort on clstr_4 (hundred, thousand, tenthous); +-- ensure we don't use the index in CLUSTER nor the checking SELECTs +set enable_indexscan = off; +-- Use external sort: +set maintenance_work_mem = '1MB'; +cluster clstr_4 using cluster_sort; +select * from +(select hundred, lag(hundred) over () as lhundred, + thousand, lag(thousand) over () as lthousand, + tenthous, lag(tenthous) over () as ltenthous from clstr_4) ss +where row(hundred, thousand, tenthous) <= row(lhundred, lthousand, ltenthous); + hundred | lhundred | thousand | lthousand | tenthous | ltenthous +---------+----------+----------+-----------+----------+----------- +(0 rows) + +reset enable_indexscan; +reset maintenance_work_mem; +-- clean up +DROP TABLE clustertest; +DROP TABLE clstr_1; +DROP TABLE clstr_2; +DROP TABLE clstr_3; +DROP TABLE clstr_4; +DROP USER regress_clstr_user; diff --git a/src/test/regress/expected/create_am.out b/src/test/regress/expected/create_am.out index 352959b751..6eae2bab97 100644 --- a/src/test/regress/expected/create_am.out +++ b/src/test/regress/expected/create_am.out @@ -126,11 +126,12 @@ ERROR: function int4in(internal) does not exist CREATE ACCESS METHOD bogus TYPE TABLE HANDLER bthandler; ERROR: function bthandler must return type table_am_handler SELECT amname, amhandler, amtype FROM pg_am where amtype = 't' ORDER BY 1, 2; - amname | amhandler | amtype ---------+----------------------+-------- - heap | heap_tableam_handler | t - heap2 | heap_tableam_handler | t -(2 rows) + amname | amhandler | amtype +----------+--------------------------+-------- + heap | heap_tableam_handler | t + heap2 | heap_tableam_handler | t + zedstore | zedstore_tableam_handler | t +(3 rows) -- 
First create tables employing the new AM using USING -- plain CREATE TABLE diff --git a/src/test/regress/expected/fsm_1.out b/src/test/regress/expected/fsm_1.out new file mode 100644 index 0000000000..9b5f9be13a --- /dev/null +++ b/src/test/regress/expected/fsm_1.out @@ -0,0 +1,73 @@ +-- +-- Free Space Map test +-- +SELECT current_setting('block_size')::integer AS blocksize, +current_setting('block_size')::integer / 8 AS strsize +\gset +CREATE TABLE fsm_check_size (num int, str text); +-- Fill 3 blocks with one record each +ALTER TABLE fsm_check_size SET (fillfactor=15); +INSERT INTO fsm_check_size SELECT i, rpad('', :strsize, 'a') +FROM generate_series(1,3) i; +-- There should be no FSM +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'main') / :blocksize AS heap_nblocks, +pg_relation_size('fsm_check_size', 'fsm') / :blocksize AS fsm_nblocks; + heap_nblocks | fsm_nblocks +--------------+------------- + 5 | 0 +(1 row) + +-- The following operations are for testing the functionality of the local +-- in-memory map. In particular, we want to be able to insert into some +-- other block than the one at the end of the heap, without using a FSM. +-- Fill most of the last block +ALTER TABLE fsm_check_size SET (fillfactor=100); +INSERT INTO fsm_check_size SELECT i, rpad('', :strsize, 'a') +FROM generate_series(101,105) i; +-- Make sure records can go into any block but the last one +ALTER TABLE fsm_check_size SET (fillfactor=30); +-- Insert large record and make sure it does not cause the relation to extend +INSERT INTO fsm_check_size VALUES (111, rpad('', :strsize, 'a')); +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'main') / :blocksize AS heap_nblocks, +pg_relation_size('fsm_check_size', 'fsm') / :blocksize AS fsm_nblocks; + heap_nblocks | fsm_nblocks +--------------+------------- + 5 | 0 +(1 row) + +-- Extend table with enough blocks to exceed the FSM threshold +DO $$ +DECLARE curtid tid; +num int; +BEGIN +num = 11; + LOOP + INSERT INTO fsm_check_size VALUES (num, 'b') RETURNING ctid INTO curtid; + EXIT WHEN curtid >= tid '(4, 0)'; + num = num + 1; + END LOOP; +END; +$$; +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'fsm') / :blocksize AS fsm_nblocks; + fsm_nblocks +------------- + 0 +(1 row) + +-- Add long random string to extend TOAST table to 1 block +INSERT INTO fsm_check_size +VALUES(0, (SELECT string_agg(md5(chr(i)), '') + FROM generate_series(1, :blocksize / 100) i)); +VACUUM fsm_check_size; +SELECT pg_relation_size(reltoastrelid, 'main') / :blocksize AS toast_nblocks, +pg_relation_size(reltoastrelid, 'fsm') / :blocksize AS toast_fsm_nblocks +FROM pg_class WHERE relname = 'fsm_check_size'; + toast_nblocks | toast_fsm_nblocks +---------------+------------------- + | +(1 row) + +DROP TABLE fsm_check_size; diff --git a/src/test/regress/expected/rangefuncs_1.out b/src/test/regress/expected/rangefuncs_1.out new file mode 100644 index 0000000000..78b177ceb0 --- /dev/null +++ b/src/test/regress/expected/rangefuncs_1.out @@ -0,0 +1,2100 @@ +CREATE TABLE rngfunc2(rngfuncid int, f2 int); +INSERT INTO rngfunc2 VALUES(1, 11); +INSERT INTO rngfunc2 VALUES(2, 22); +INSERT INTO rngfunc2 VALUES(1, 111); +CREATE FUNCTION rngfunct(int) returns setof rngfunc2 as 'SELECT * FROM rngfunc2 WHERE rngfuncid = $1 ORDER BY f2;' LANGUAGE SQL; +-- function with ORDINALITY +select * from rngfunct(1) with ordinality as z(a,b,ord); + a | b | ord +---+-----+----- + 1 | 11 | 1 + 1 | 111 | 2 +(2 rows) + +select * from rngfunct(1) with ordinality as z(a,b,ord) 
where b > 100; -- ordinal 2, not 1 + a | b | ord +---+-----+----- + 1 | 111 | 2 +(1 row) + +-- ordinality vs. column names and types +select a,b,ord from rngfunct(1) with ordinality as z(a,b,ord); + a | b | ord +---+-----+----- + 1 | 11 | 1 + 1 | 111 | 2 +(2 rows) + +select a,ord from unnest(array['a','b']) with ordinality as z(a,ord); + a | ord +---+----- + a | 1 + b | 2 +(2 rows) + +select * from unnest(array['a','b']) with ordinality as z(a,ord); + a | ord +---+----- + a | 1 + b | 2 +(2 rows) + +select a,ord from unnest(array[1.0::float8]) with ordinality as z(a,ord); + a | ord +---+----- + 1 | 1 +(1 row) + +select * from unnest(array[1.0::float8]) with ordinality as z(a,ord); + a | ord +---+----- + 1 | 1 +(1 row) + +select row_to_json(s.*) from generate_series(11,14) with ordinality s; + row_to_json +------------------------- + {"s":11,"ordinality":1} + {"s":12,"ordinality":2} + {"s":13,"ordinality":3} + {"s":14,"ordinality":4} +(4 rows) + +-- ordinality vs. views +create temporary view vw_ord as select * from (values (1)) v(n) join rngfunct(1) with ordinality as z(a,b,ord) on (n=ord); +select * from vw_ord; + n | a | b | ord +---+---+----+----- + 1 | 1 | 11 | 1 +(1 row) + +select definition from pg_views where viewname='vw_ord'; + definition +------------------------------------------------------------------------- + SELECT v.n, + + z.a, + + z.b, + + z.ord + + FROM (( VALUES (1)) v(n) + + JOIN rngfunct(1) WITH ORDINALITY z(a, b, ord) ON ((v.n = z.ord))); +(1 row) + +drop view vw_ord; +-- multiple functions +select * from rows from(rngfunct(1),rngfunct(2)) with ordinality as z(a,b,c,d,ord); + a | b | c | d | ord +---+-----+---+----+----- + 1 | 11 | 2 | 22 | 1 + 1 | 111 | | | 2 +(2 rows) + +create temporary view vw_ord as select * from (values (1)) v(n) join rows from(rngfunct(1),rngfunct(2)) with ordinality as z(a,b,c,d,ord) on (n=ord); +select * from vw_ord; + n | a | b | c | d | ord +---+---+----+---+----+----- + 1 | 1 | 11 | 2 | 22 | 1 +(1 row) + +select definition from pg_views where viewname='vw_ord'; + definition +------------------------------------------------------------------------------------------------------- + SELECT v.n, + + z.a, + + z.b, + + z.c, + + z.d, + + z.ord + + FROM (( VALUES (1)) v(n) + + JOIN ROWS FROM(rngfunct(1), rngfunct(2)) WITH ORDINALITY z(a, b, c, d, ord) ON ((v.n = z.ord))); +(1 row) + +drop view vw_ord; +-- expansions of unnest() +select * from unnest(array[10,20],array['foo','bar'],array[1.0]); + unnest | unnest | unnest +--------+--------+-------- + 10 | foo | 1.0 + 20 | bar | +(2 rows) + +select * from unnest(array[10,20],array['foo','bar'],array[1.0]) with ordinality as z(a,b,c,ord); + a | b | c | ord +----+-----+-----+----- + 10 | foo | 1.0 | 1 + 20 | bar | | 2 +(2 rows) + +select * from rows from(unnest(array[10,20],array['foo','bar'],array[1.0])) with ordinality as z(a,b,c,ord); + a | b | c | ord +----+-----+-----+----- + 10 | foo | 1.0 | 1 + 20 | bar | | 2 +(2 rows) + +select * from rows from(unnest(array[10,20],array['foo','bar']), generate_series(101,102)) with ordinality as z(a,b,c,ord); + a | b | c | ord +----+-----+-----+----- + 10 | foo | 101 | 1 + 20 | bar | 102 | 2 +(2 rows) + +create temporary view vw_ord as select * from unnest(array[10,20],array['foo','bar'],array[1.0]) as z(a,b,c); +select * from vw_ord; + a | b | c +----+-----+----- + 10 | foo | 1.0 + 20 | bar | +(2 rows) + +select definition from pg_views where viewname='vw_ord'; + definition 
+---------------------------------------------------------------------------------------- + SELECT z.a, + + z.b, + + z.c + + FROM UNNEST(ARRAY[10, 20], ARRAY['foo'::text, 'bar'::text], ARRAY[1.0]) z(a, b, c); +(1 row) + +drop view vw_ord; +create temporary view vw_ord as select * from rows from(unnest(array[10,20],array['foo','bar'],array[1.0])) as z(a,b,c); +select * from vw_ord; + a | b | c +----+-----+----- + 10 | foo | 1.0 + 20 | bar | +(2 rows) + +select definition from pg_views where viewname='vw_ord'; + definition +---------------------------------------------------------------------------------------- + SELECT z.a, + + z.b, + + z.c + + FROM UNNEST(ARRAY[10, 20], ARRAY['foo'::text, 'bar'::text], ARRAY[1.0]) z(a, b, c); +(1 row) + +drop view vw_ord; +create temporary view vw_ord as select * from rows from(unnest(array[10,20],array['foo','bar']), generate_series(1,2)) as z(a,b,c); +select * from vw_ord; + a | b | c +----+-----+--- + 10 | foo | 1 + 20 | bar | 2 +(2 rows) + +select definition from pg_views where viewname='vw_ord'; + definition +---------------------------------------------------------------------------------------------------------------------- + SELECT z.a, + + z.b, + + z.c + + FROM ROWS FROM(unnest(ARRAY[10, 20]), unnest(ARRAY['foo'::text, 'bar'::text]), generate_series(1, 2)) z(a, b, c); +(1 row) + +drop view vw_ord; +-- ordinality and multiple functions vs. rewind and reverse scan +begin; +declare rf_cur scroll cursor for select * from rows from(generate_series(1,5),generate_series(1,2)) with ordinality as g(i,j,o); +fetch all from rf_cur; + i | j | o +---+---+--- + 1 | 1 | 1 + 2 | 2 | 2 + 3 | | 3 + 4 | | 4 + 5 | | 5 +(5 rows) + +fetch backward all from rf_cur; + i | j | o +---+---+--- + 5 | | 5 + 4 | | 4 + 3 | | 3 + 2 | 2 | 2 + 1 | 1 | 1 +(5 rows) + +fetch all from rf_cur; + i | j | o +---+---+--- + 1 | 1 | 1 + 2 | 2 | 2 + 3 | | 3 + 4 | | 4 + 5 | | 5 +(5 rows) + +fetch next from rf_cur; + i | j | o +---+---+--- +(0 rows) + +fetch next from rf_cur; + i | j | o +---+---+--- +(0 rows) + +fetch prior from rf_cur; + i | j | o +---+---+--- + 5 | | 5 +(1 row) + +fetch absolute 1 from rf_cur; + i | j | o +---+---+--- + 1 | 1 | 1 +(1 row) + +fetch next from rf_cur; + i | j | o +---+---+--- + 2 | 2 | 2 +(1 row) + +fetch next from rf_cur; + i | j | o +---+---+--- + 3 | | 3 +(1 row) + +fetch next from rf_cur; + i | j | o +---+---+--- + 4 | | 4 +(1 row) + +fetch prior from rf_cur; + i | j | o +---+---+--- + 3 | | 3 +(1 row) + +fetch prior from rf_cur; + i | j | o +---+---+--- + 2 | 2 | 2 +(1 row) + +fetch prior from rf_cur; + i | j | o +---+---+--- + 1 | 1 | 1 +(1 row) + +commit; +-- function with implicit LATERAL +select * from rngfunc2, rngfunct(rngfunc2.rngfuncid) z where rngfunc2.f2 = z.f2; + rngfuncid | f2 | rngfuncid | f2 +-----------+-----+-----------+----- + 1 | 11 | 1 | 11 + 2 | 22 | 2 | 22 + 1 | 111 | 1 | 111 +(3 rows) + +-- function with implicit LATERAL and explicit ORDINALITY +select * from rngfunc2, rngfunct(rngfunc2.rngfuncid) with ordinality as z(rngfuncid,f2,ord) where rngfunc2.f2 = z.f2; + rngfuncid | f2 | rngfuncid | f2 | ord +-----------+-----+-----------+-----+----- + 1 | 11 | 1 | 11 | 1 + 2 | 22 | 2 | 22 | 1 + 1 | 111 | 1 | 111 | 2 +(3 rows) + +-- function in subselect +select * from rngfunc2 where f2 in (select f2 from rngfunct(rngfunc2.rngfuncid) z where z.rngfuncid = rngfunc2.rngfuncid) ORDER BY 1,2; + rngfuncid | f2 +-----------+----- + 1 | 11 + 1 | 111 + 2 | 22 +(3 rows) + +-- function in subselect +select * from rngfunc2 where f2 in (select f2 from 
rngfunct(1) z where z.rngfuncid = rngfunc2.rngfuncid) ORDER BY 1,2; + rngfuncid | f2 +-----------+----- + 1 | 11 + 1 | 111 +(2 rows) + +-- function in subselect +select * from rngfunc2 where f2 in (select f2 from rngfunct(rngfunc2.rngfuncid) z where z.rngfuncid = 1) ORDER BY 1,2; + rngfuncid | f2 +-----------+----- + 1 | 11 + 1 | 111 +(2 rows) + +-- nested functions +select rngfunct.rngfuncid, rngfunct.f2 from rngfunct(sin(pi()/2)::int) ORDER BY 1,2; + rngfuncid | f2 +-----------+----- + 1 | 11 + 1 | 111 +(2 rows) + +CREATE TABLE rngfunc (rngfuncid int, rngfuncsubid int, rngfuncname text, primary key(rngfuncid,rngfuncsubid)); +INSERT INTO rngfunc VALUES(1,1,'Joe'); +INSERT INTO rngfunc VALUES(1,2,'Ed'); +INSERT INTO rngfunc VALUES(2,1,'Mary'); +-- sql, proretset = f, prorettype = b +CREATE FUNCTION getrngfunc1(int) RETURNS int AS 'SELECT $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc1(1) AS t1; + t1 +---- + 1 +(1 row) + +SELECT * FROM getrngfunc1(1) WITH ORDINALITY AS t1(v,o); + v | o +---+--- + 1 | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc1(1); +SELECT * FROM vw_getrngfunc; + getrngfunc1 +------------- + 1 +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc1(1) WITH ORDINALITY as t1(v,o); +SELECT * FROM vw_getrngfunc; + v | o +---+--- + 1 | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = t, prorettype = b +CREATE FUNCTION getrngfunc2(int) RETURNS setof int AS 'SELECT rngfuncid FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc2(1) AS t1; + t1 +---- + 1 + 1 +(2 rows) + +SELECT * FROM getrngfunc2(1) WITH ORDINALITY AS t1(v,o); + v | o +---+--- + 1 | 1 + 1 | 2 +(2 rows) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc2(1); +SELECT * FROM vw_getrngfunc; + getrngfunc2 +------------- + 1 + 1 +(2 rows) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc2(1) WITH ORDINALITY AS t1(v,o); +SELECT * FROM vw_getrngfunc; + v | o +---+--- + 1 | 1 + 1 | 2 +(2 rows) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = t, prorettype = b +CREATE FUNCTION getrngfunc3(int) RETURNS setof text AS 'SELECT rngfuncname FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc3(1) AS t1; + t1 +----- + Joe + Ed +(2 rows) + +SELECT * FROM getrngfunc3(1) WITH ORDINALITY AS t1(v,o); + v | o +-----+--- + Joe | 1 + Ed | 2 +(2 rows) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc3(1); +SELECT * FROM vw_getrngfunc; + getrngfunc3 +------------- + Joe + Ed +(2 rows) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc3(1) WITH ORDINALITY AS t1(v,o); +SELECT * FROM vw_getrngfunc; + v | o +-----+--- + Joe | 1 + Ed | 2 +(2 rows) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = f, prorettype = c +CREATE FUNCTION getrngfunc4(int) RETURNS rngfunc AS 'SELECT * FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc4(1) AS t1; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +SELECT * FROM getrngfunc4(1) WITH ORDINALITY AS t1(a,b,c,o); + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc4(1); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc4(1) WITH ORDINALITY AS t1(a,b,c,o); +SELECT * FROM vw_getrngfunc; + a | b | c | o 
+---+---+-----+--- + 1 | 1 | Joe | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = t, prorettype = c +CREATE FUNCTION getrngfunc5(int) RETURNS setof rngfunc AS 'SELECT * FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc5(1) AS t1; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe + 1 | 2 | Ed +(2 rows) + +SELECT * FROM getrngfunc5(1) WITH ORDINALITY AS t1(a,b,c,o); + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 + 1 | 2 | Ed | 2 +(2 rows) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc5(1); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe + 1 | 2 | Ed +(2 rows) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc5(1) WITH ORDINALITY AS t1(a,b,c,o); +SELECT * FROM vw_getrngfunc; + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 + 1 | 2 | Ed | 2 +(2 rows) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = f, prorettype = record +CREATE FUNCTION getrngfunc6(int) RETURNS RECORD AS 'SELECT * FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc6(1) AS t1(rngfuncid int, rngfuncsubid int, rngfuncname text); + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +SELECT * FROM ROWS FROM( getrngfunc6(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text) ) WITH ORDINALITY; + rngfuncid | rngfuncsubid | rngfuncname | ordinality +-----------+--------------+-------------+------------ + 1 | 1 | Joe | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc6(1) AS +(rngfuncid int, rngfuncsubid int, rngfuncname text); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS + SELECT * FROM ROWS FROM( getrngfunc6(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text) ) + WITH ORDINALITY; +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname | ordinality +-----------+--------------+-------------+------------ + 1 | 1 | Joe | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- sql, proretset = t, prorettype = record +CREATE FUNCTION getrngfunc7(int) RETURNS setof record AS 'SELECT * FROM rngfunc WHERE rngfuncid = $1;' LANGUAGE SQL; +SELECT * FROM getrngfunc7(1) AS t1(rngfuncid int, rngfuncsubid int, rngfuncname text); + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe + 1 | 2 | Ed +(2 rows) + +SELECT * FROM ROWS FROM( getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text) ) WITH ORDINALITY; + rngfuncid | rngfuncsubid | rngfuncname | ordinality +-----------+--------------+-------------+------------ + 1 | 1 | Joe | 1 + 1 | 2 | Ed | 2 +(2 rows) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc7(1) AS +(rngfuncid int, rngfuncsubid int, rngfuncname text); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe + 1 | 2 | Ed +(2 rows) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS + SELECT * FROM ROWS FROM( getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text) ) + WITH ORDINALITY; +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname | ordinality +-----------+--------------+-------------+------------ + 1 | 1 | Joe | 1 + 1 | 2 | Ed | 2 +(2 rows) + +DROP VIEW vw_getrngfunc; +-- plpgsql, proretset = f, 
prorettype = b +CREATE FUNCTION getrngfunc8(int) RETURNS int AS 'DECLARE rngfuncint int; BEGIN SELECT rngfuncid into rngfuncint FROM rngfunc WHERE rngfuncid = $1; RETURN rngfuncint; END;' LANGUAGE plpgsql; +SELECT * FROM getrngfunc8(1) AS t1; + t1 +---- + 1 +(1 row) + +SELECT * FROM getrngfunc8(1) WITH ORDINALITY AS t1(v,o); + v | o +---+--- + 1 | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc8(1); +SELECT * FROM vw_getrngfunc; + getrngfunc8 +------------- + 1 +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc8(1) WITH ORDINALITY AS t1(v,o); +SELECT * FROM vw_getrngfunc; + v | o +---+--- + 1 | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- plpgsql, proretset = f, prorettype = c +CREATE FUNCTION getrngfunc9(int) RETURNS rngfunc AS 'DECLARE rngfunctup rngfunc%ROWTYPE; BEGIN SELECT * into rngfunctup FROM rngfunc WHERE rngfuncid = $1; RETURN rngfunctup; END;' LANGUAGE plpgsql; +SELECT * FROM getrngfunc9(1) AS t1; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +SELECT * FROM getrngfunc9(1) WITH ORDINALITY AS t1(a,b,c,o); + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 +(1 row) + +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc9(1); +SELECT * FROM vw_getrngfunc; + rngfuncid | rngfuncsubid | rngfuncname +-----------+--------------+------------- + 1 | 1 | Joe +(1 row) + +DROP VIEW vw_getrngfunc; +CREATE VIEW vw_getrngfunc AS SELECT * FROM getrngfunc9(1) WITH ORDINALITY AS t1(a,b,c,o); +SELECT * FROM vw_getrngfunc; + a | b | c | o +---+---+-----+--- + 1 | 1 | Joe | 1 +(1 row) + +DROP VIEW vw_getrngfunc; +-- mix 'n match kinds, to exercise expandRTE and related logic +select * from rows from(getrngfunc1(1),getrngfunc2(1),getrngfunc3(1),getrngfunc4(1),getrngfunc5(1), + getrngfunc6(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc8(1),getrngfunc9(1)) + with ordinality as t1(a,b,c,d,e,f,g,h,i,j,k,l,m,o,p,q,r,s,t,u); + a | b | c | d | e | f | g | h | i | j | k | l | m | o | p | q | r | s | t | u +---+---+-----+---+---+-----+---+---+-----+---+---+-----+---+---+-----+---+---+---+-----+--- + 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | 1 | Joe | 1 + | 1 | Ed | | | | 1 | 2 | Ed | | | | 1 | 2 | Ed | | | | | 2 +(2 rows) + +select * from rows from(getrngfunc9(1),getrngfunc8(1), + getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc6(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc5(1),getrngfunc4(1),getrngfunc3(1),getrngfunc2(1),getrngfunc1(1)) + with ordinality as t1(a,b,c,d,e,f,g,h,i,j,k,l,m,o,p,q,r,s,t,u); + a | b | c | d | e | f | g | h | i | j | k | l | m | o | p | q | r | s | t | u +---+---+-----+---+---+---+-----+---+---+-----+---+---+-----+---+---+-----+-----+---+---+--- + 1 | 1 | Joe | 1 | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 | Joe | Joe | 1 | 1 | 1 + | | | | 1 | 2 | Ed | | | | 1 | 2 | Ed | | | | Ed | 1 | | 2 +(2 rows) + +create temporary view vw_rngfunc as + select * from rows from(getrngfunc9(1), + getrngfunc7(1) AS (rngfuncid int, rngfuncsubid int, rngfuncname text), + getrngfunc1(1)) + with ordinality as t1(a,b,c,d,e,f,g,n); +select * from vw_rngfunc; + a | b | c | d | e | f | g | n +---+---+-----+---+---+-----+---+--- + 1 | 1 | Joe | 1 | 1 | Joe | 1 | 1 + | | | 1 | 2 | Ed | | 2 +(2 rows) + +select pg_get_viewdef('vw_rngfunc'); + pg_get_viewdef 
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + SELECT t1.a, + + t1.b, + + t1.c, + + t1.d, + + t1.e, + + t1.f, + + t1.g, + + t1.n + + FROM ROWS FROM(getrngfunc9(1), getrngfunc7(1) AS (rngfuncid integer, rngfuncsubid integer, rngfuncname text), getrngfunc1(1)) WITH ORDINALITY t1(a, b, c, d, e, f, g, n); +(1 row) + +drop view vw_rngfunc; +DROP FUNCTION getrngfunc1(int); +DROP FUNCTION getrngfunc2(int); +DROP FUNCTION getrngfunc3(int); +DROP FUNCTION getrngfunc4(int); +DROP FUNCTION getrngfunc5(int); +DROP FUNCTION getrngfunc6(int); +DROP FUNCTION getrngfunc7(int); +DROP FUNCTION getrngfunc8(int); +DROP FUNCTION getrngfunc9(int); +DROP FUNCTION rngfunct(int); +DROP TABLE rngfunc2; +DROP TABLE rngfunc; +-- Rescan tests -- +CREATE TEMPORARY SEQUENCE rngfunc_rescan_seq1; +CREATE TEMPORARY SEQUENCE rngfunc_rescan_seq2; +CREATE TYPE rngfunc_rescan_t AS (i integer, s bigint); +CREATE FUNCTION rngfunc_sql(int,int) RETURNS setof rngfunc_rescan_t AS 'SELECT i, nextval(''rngfunc_rescan_seq1'') FROM generate_series($1,$2) i;' LANGUAGE SQL; +-- plpgsql functions use materialize mode +CREATE FUNCTION rngfunc_mat(int,int) RETURNS setof rngfunc_rescan_t AS 'begin for i in $1..$2 loop return next (i, nextval(''rngfunc_rescan_seq2'')); end loop; end;' LANGUAGE plpgsql; +--invokes ExecReScanFunctionScan - all these cases should materialize the function only once +-- LEFT JOIN on a condition that the planner can't prove to be true is used to ensure the function +-- is on the inner path of a nestloop join +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN rngfunc_sql(11,13) ON (r+i)<100; + r | i | s +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 11 | 1 + 2 | 12 | 2 + 2 | 13 | 3 + 3 | 11 | 1 + 3 | 12 | 2 + 3 | 13 | 3 +(9 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN rngfunc_sql(11,13) WITH ORDINALITY AS f(i,s,o) ON (r+i)<100; + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 1 | 12 | 2 | 2 + 1 | 13 | 3 | 3 + 2 | 11 | 1 | 1 + 2 | 12 | 2 | 2 + 2 | 13 | 3 | 3 + 3 | 11 | 1 | 1 + 3 | 12 | 2 | 2 + 3 | 13 | 3 | 3 +(9 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN rngfunc_mat(11,13) ON (r+i)<100; + r | i | s +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 11 | 1 + 2 | 12 | 2 + 2 | 13 | 3 + 3 | 11 | 1 + 3 | 12 | 2 + 3 | 13 | 3 +(9 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN rngfunc_mat(11,13) WITH ORDINALITY AS f(i,s,o) ON (r+i)<100; + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 1 | 12 | 2 | 2 + 1 | 13 | 3 | 3 + 2 | 11 | 1 | 1 + 2 | 12 | 2 | 2 + 2 | 13 | 3 | 3 + 3 | 11 | 1 | 1 + 3 | 12 | 2 | 2 + 3 | 13 | 3 | 3 +(9 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN ROWS FROM( rngfunc_sql(11,13), rngfunc_mat(11,13) ) WITH 
ORDINALITY AS f(i1,s1,i2,s2,o) ON (r+i1+i2)<100; + r | i1 | s1 | i2 | s2 | o +---+----+----+----+----+--- + 1 | 11 | 1 | 11 | 1 | 1 + 1 | 12 | 2 | 12 | 2 | 2 + 1 | 13 | 3 | 13 | 3 | 3 + 2 | 11 | 1 | 11 | 1 | 1 + 2 | 12 | 2 | 12 | 2 | 2 + 2 | 13 | 3 | 13 | 3 | 3 + 3 | 11 | 1 | 11 | 1 | 1 + 3 | 12 | 2 | 12 | 2 | 2 + 3 | 13 | 3 | 13 | 3 | 3 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN generate_series(11,13) f(i) ON (r+i)<100; + r | i +---+---- + 1 | 11 + 1 | 12 + 1 | 13 + 2 | 11 + 2 | 12 + 2 | 13 + 3 | 11 + 3 | 12 + 3 | 13 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN generate_series(11,13) WITH ORDINALITY AS f(i,o) ON (r+i)<100; + r | i | o +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 11 | 1 + 2 | 12 | 2 + 2 | 13 | 3 + 3 | 11 | 1 + 3 | 12 | 2 + 3 | 13 | 3 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN unnest(array[10,20,30]) f(i) ON (r+i)<100; + r | i +---+---- + 1 | 10 + 1 | 20 + 1 | 30 + 2 | 10 + 2 | 20 + 2 | 30 + 3 | 10 + 3 | 20 + 3 | 30 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r) LEFT JOIN unnest(array[10,20,30]) WITH ORDINALITY AS f(i,o) ON (r+i)<100; + r | i | o +---+----+--- + 1 | 10 | 1 + 1 | 20 | 2 + 1 | 30 | 3 + 2 | 10 | 1 + 2 | 20 | 2 + 2 | 30 | 3 + 3 | 10 | 1 + 3 | 20 | 2 + 3 | 30 | 3 +(9 rows) + +--invokes ExecReScanFunctionScan with chgParam != NULL (using implied LATERAL) +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_sql(10+r,13); + r | i | s +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 12 | 4 + 2 | 13 | 5 + 3 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_sql(10+r,13) WITH ORDINALITY AS f(i,s,o); + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 1 | 12 | 2 | 2 + 1 | 13 | 3 | 3 + 2 | 12 | 4 | 1 + 2 | 13 | 5 | 2 + 3 | 13 | 6 | 1 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_sql(11,10+r); + r | i | s +---+----+--- + 1 | 11 | 1 + 2 | 11 | 2 + 2 | 12 | 3 + 3 | 11 | 4 + 3 | 12 | 5 + 3 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_sql(11,10+r) WITH ORDINALITY AS f(i,s,o); + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 2 | 11 | 2 | 1 + 2 | 12 | 3 | 2 + 3 | 11 | 4 | 1 + 3 | 12 | 5 | 2 + 3 | 13 | 6 | 3 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (11,12),(13,15),(16,20)) v(r1,r2), rngfunc_sql(r1,r2); + r1 | r2 | i | s +----+----+----+---- + 11 | 12 | 11 | 1 + 11 | 12 | 12 | 2 + 13 | 15 | 13 | 3 + 13 | 15 | 14 | 4 + 13 | 15 | 15 | 5 + 16 | 20 | 16 | 6 + 16 | 20 | 17 | 7 + 16 | 20 | 18 | 8 + 16 | 20 | 19 | 9 + 16 | 20 | 20 | 10 +(10 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (11,12),(13,15),(16,20)) v(r1,r2), rngfunc_sql(r1,r2) WITH ORDINALITY AS f(i,s,o); + r1 | r2 | i | s | o +----+----+----+----+--- + 11 | 12 | 11 
| 1 | 1 + 11 | 12 | 12 | 2 | 2 + 13 | 15 | 13 | 3 | 1 + 13 | 15 | 14 | 4 | 2 + 13 | 15 | 15 | 5 | 3 + 16 | 20 | 16 | 6 | 1 + 16 | 20 | 17 | 7 | 2 + 16 | 20 | 18 | 8 | 3 + 16 | 20 | 19 | 9 | 4 + 16 | 20 | 20 | 10 | 5 +(10 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_mat(10+r,13); + r | i | s +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 2 | 12 | 4 + 2 | 13 | 5 + 3 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_mat(10+r,13) WITH ORDINALITY AS f(i,s,o); + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 1 | 12 | 2 | 2 + 1 | 13 | 3 | 3 + 2 | 12 | 4 | 1 + 2 | 13 | 5 | 2 + 3 | 13 | 6 | 1 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_mat(11,10+r); + r | i | s +---+----+--- + 1 | 11 | 1 + 2 | 11 | 2 + 2 | 12 | 3 + 3 | 11 | 4 + 3 | 12 | 5 + 3 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), rngfunc_mat(11,10+r) WITH ORDINALITY AS f(i,s,o); + r | i | s | o +---+----+---+--- + 1 | 11 | 1 | 1 + 2 | 11 | 2 | 1 + 2 | 12 | 3 | 2 + 3 | 11 | 4 | 1 + 3 | 12 | 5 | 2 + 3 | 13 | 6 | 3 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (11,12),(13,15),(16,20)) v(r1,r2), rngfunc_mat(r1,r2); + r1 | r2 | i | s +----+----+----+---- + 11 | 12 | 11 | 1 + 11 | 12 | 12 | 2 + 13 | 15 | 13 | 3 + 13 | 15 | 14 | 4 + 13 | 15 | 15 | 5 + 16 | 20 | 16 | 6 + 16 | 20 | 17 | 7 + 16 | 20 | 18 | 8 + 16 | 20 | 19 | 9 + 16 | 20 | 20 | 10 +(10 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (11,12),(13,15),(16,20)) v(r1,r2), rngfunc_mat(r1,r2) WITH ORDINALITY AS f(i,s,o); + r1 | r2 | i | s | o +----+----+----+----+--- + 11 | 12 | 11 | 1 | 1 + 11 | 12 | 12 | 2 | 2 + 13 | 15 | 13 | 3 | 1 + 13 | 15 | 14 | 4 | 2 + 13 | 15 | 15 | 5 | 3 + 16 | 20 | 16 | 6 | 1 + 16 | 20 | 17 | 7 | 2 + 16 | 20 | 18 | 8 | 3 + 16 | 20 | 19 | 9 | 4 + 16 | 20 | 20 | 10 | 5 +(10 rows) + +-- selective rescan of multiple functions: +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), ROWS FROM( rngfunc_sql(11,11), rngfunc_mat(10+r,13) ); + r | i | s | i | s +---+----+---+----+--- + 1 | 11 | 1 | 11 | 1 + 1 | | | 12 | 2 + 1 | | | 13 | 3 + 2 | 11 | 1 | 12 | 4 + 2 | | | 13 | 5 + 3 | 11 | 1 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), ROWS FROM( rngfunc_sql(10+r,13), rngfunc_mat(11,11) ); + r | i | s | i | s +---+----+---+----+--- + 1 | 11 | 1 | 11 | 1 + 1 | 12 | 2 | | + 1 | 13 | 3 | | + 2 | 12 | 4 | 11 | 1 + 2 | 13 | 5 | | + 3 | 13 | 6 | 11 | 1 +(6 rows) + +SELECT 
setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), ROWS FROM( rngfunc_sql(10+r,13), rngfunc_mat(10+r,13) ); + r | i | s | i | s +---+----+---+----+--- + 1 | 11 | 1 | 11 | 1 + 1 | 12 | 2 | 12 | 2 + 1 | 13 | 3 | 13 | 3 + 2 | 12 | 4 | 12 | 4 + 2 | 13 | 5 | 13 | 5 + 3 | 13 | 6 | 13 | 6 +(6 rows) + +SELECT setval('rngfunc_rescan_seq1',1,false),setval('rngfunc_rescan_seq2',1,false); + setval | setval +--------+-------- + 1 | 1 +(1 row) + +SELECT * FROM generate_series(1,2) r1, generate_series(r1,3) r2, ROWS FROM( rngfunc_sql(10+r1,13), rngfunc_mat(10+r2,13) ); + r1 | r2 | i | s | i | s +----+----+----+----+----+--- + 1 | 1 | 11 | 1 | 11 | 1 + 1 | 1 | 12 | 2 | 12 | 2 + 1 | 1 | 13 | 3 | 13 | 3 + 1 | 2 | 11 | 4 | 12 | 4 + 1 | 2 | 12 | 5 | 13 | 5 + 1 | 2 | 13 | 6 | | + 1 | 3 | 11 | 7 | 13 | 6 + 1 | 3 | 12 | 8 | | + 1 | 3 | 13 | 9 | | + 2 | 2 | 12 | 10 | 12 | 7 + 2 | 2 | 13 | 11 | 13 | 8 + 2 | 3 | 12 | 12 | 13 | 9 + 2 | 3 | 13 | 13 | | +(13 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), generate_series(10+r,20-r) f(i); + r | i +---+---- + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 12 + 2 | 13 + 2 | 14 + 2 | 15 + 2 | 16 + 2 | 17 + 2 | 18 + 3 | 13 + 3 | 14 + 3 | 15 + 3 | 16 + 3 | 17 +(21 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), generate_series(10+r,20-r) WITH ORDINALITY AS f(i,o); + r | i | o +---+----+--- + 1 | 11 | 1 + 1 | 12 | 2 + 1 | 13 | 3 + 1 | 14 | 4 + 1 | 15 | 5 + 1 | 16 | 6 + 1 | 17 | 7 + 1 | 18 | 8 + 1 | 19 | 9 + 2 | 12 | 1 + 2 | 13 | 2 + 2 | 14 | 3 + 2 | 15 | 4 + 2 | 16 | 5 + 2 | 17 | 6 + 2 | 18 | 7 + 3 | 13 | 1 + 3 | 14 | 2 + 3 | 15 | 3 + 3 | 16 | 4 + 3 | 17 | 5 +(21 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), unnest(array[r*10,r*20,r*30]) f(i); + r | i +---+---- + 1 | 10 + 1 | 20 + 1 | 30 + 2 | 20 + 2 | 40 + 2 | 60 + 3 | 30 + 3 | 60 + 3 | 90 +(9 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v(r), unnest(array[r*10,r*20,r*30]) WITH ORDINALITY AS f(i,o); + r | i | o +---+----+--- + 1 | 10 | 1 + 1 | 20 | 2 + 1 | 30 | 3 + 2 | 20 | 1 + 2 | 40 | 2 + 2 | 60 | 3 + 3 | 30 | 1 + 3 | 60 | 2 + 3 | 90 | 3 +(9 rows) + +-- deep nesting +SELECT * FROM (VALUES (1),(2),(3)) v1(r1), + LATERAL (SELECT r1, * FROM (VALUES (10),(20),(30)) v2(r2) + LEFT JOIN generate_series(21,23) f(i) ON ((r2+i)<100) OFFSET 0) s1; + r1 | r1 | r2 | i +----+----+----+---- + 1 | 1 | 10 | 21 + 1 | 1 | 10 | 22 + 1 | 1 | 10 | 23 + 1 | 1 | 20 | 21 + 1 | 1 | 20 | 22 + 1 | 1 | 20 | 23 + 1 | 1 | 30 | 21 + 1 | 1 | 30 | 22 + 1 | 1 | 30 | 23 + 2 | 2 | 10 | 21 + 2 | 2 | 10 | 22 + 2 | 2 | 10 | 23 + 2 | 2 | 20 | 21 + 2 | 2 | 20 | 22 + 2 | 2 | 20 | 23 + 2 | 2 | 30 | 21 + 2 | 2 | 30 | 22 + 2 | 2 | 30 | 23 + 3 | 3 | 10 | 21 + 3 | 3 | 10 | 22 + 3 | 3 | 10 | 23 + 3 | 3 | 20 | 21 + 3 | 3 | 20 | 22 + 3 | 3 | 20 | 23 + 3 | 3 | 30 | 21 + 3 | 3 | 30 | 22 + 3 | 3 | 30 | 23 +(27 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v1(r1), + LATERAL (SELECT r1, * FROM (VALUES (10),(20),(30)) v2(r2) + LEFT JOIN generate_series(20+r1,23) f(i) ON ((r2+i)<100) OFFSET 0) s1; + r1 | r1 | r2 | i +----+----+----+---- + 1 | 1 | 10 | 21 + 1 | 1 | 10 | 22 + 1 | 1 | 10 | 23 + 1 | 1 | 20 | 21 + 1 | 1 | 20 | 22 + 1 | 1 | 20 | 23 + 1 | 1 | 30 | 21 + 1 | 1 | 30 | 22 + 1 | 1 | 30 | 23 + 2 | 2 | 10 | 22 + 2 | 2 | 10 | 23 + 2 | 2 | 20 | 22 + 2 | 2 | 20 | 23 + 2 | 2 | 30 | 22 + 2 | 2 | 30 | 23 + 3 | 3 | 10 | 23 + 3 | 3 | 20 | 23 + 3 | 3 | 30 | 23 +(18 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v1(r1), + 
LATERAL (SELECT r1, * FROM (VALUES (10),(20),(30)) v2(r2) + LEFT JOIN generate_series(r2,r2+3) f(i) ON ((r2+i)<100) OFFSET 0) s1; + r1 | r1 | r2 | i +----+----+----+---- + 1 | 1 | 10 | 10 + 1 | 1 | 10 | 11 + 1 | 1 | 10 | 12 + 1 | 1 | 10 | 13 + 1 | 1 | 20 | 20 + 1 | 1 | 20 | 21 + 1 | 1 | 20 | 22 + 1 | 1 | 20 | 23 + 1 | 1 | 30 | 30 + 1 | 1 | 30 | 31 + 1 | 1 | 30 | 32 + 1 | 1 | 30 | 33 + 2 | 2 | 10 | 10 + 2 | 2 | 10 | 11 + 2 | 2 | 10 | 12 + 2 | 2 | 10 | 13 + 2 | 2 | 20 | 20 + 2 | 2 | 20 | 21 + 2 | 2 | 20 | 22 + 2 | 2 | 20 | 23 + 2 | 2 | 30 | 30 + 2 | 2 | 30 | 31 + 2 | 2 | 30 | 32 + 2 | 2 | 30 | 33 + 3 | 3 | 10 | 10 + 3 | 3 | 10 | 11 + 3 | 3 | 10 | 12 + 3 | 3 | 10 | 13 + 3 | 3 | 20 | 20 + 3 | 3 | 20 | 21 + 3 | 3 | 20 | 22 + 3 | 3 | 20 | 23 + 3 | 3 | 30 | 30 + 3 | 3 | 30 | 31 + 3 | 3 | 30 | 32 + 3 | 3 | 30 | 33 +(36 rows) + +SELECT * FROM (VALUES (1),(2),(3)) v1(r1), + LATERAL (SELECT r1, * FROM (VALUES (10),(20),(30)) v2(r2) + LEFT JOIN generate_series(r1,2+r2/5) f(i) ON ((r2+i)<100) OFFSET 0) s1; + r1 | r1 | r2 | i +----+----+----+--- + 1 | 1 | 10 | 1 + 1 | 1 | 10 | 2 + 1 | 1 | 10 | 3 + 1 | 1 | 10 | 4 + 1 | 1 | 20 | 1 + 1 | 1 | 20 | 2 + 1 | 1 | 20 | 3 + 1 | 1 | 20 | 4 + 1 | 1 | 20 | 5 + 1 | 1 | 20 | 6 + 1 | 1 | 30 | 1 + 1 | 1 | 30 | 2 + 1 | 1 | 30 | 3 + 1 | 1 | 30 | 4 + 1 | 1 | 30 | 5 + 1 | 1 | 30 | 6 + 1 | 1 | 30 | 7 + 1 | 1 | 30 | 8 + 2 | 2 | 10 | 2 + 2 | 2 | 10 | 3 + 2 | 2 | 10 | 4 + 2 | 2 | 20 | 2 + 2 | 2 | 20 | 3 + 2 | 2 | 20 | 4 + 2 | 2 | 20 | 5 + 2 | 2 | 20 | 6 + 2 | 2 | 30 | 2 + 2 | 2 | 30 | 3 + 2 | 2 | 30 | 4 + 2 | 2 | 30 | 5 + 2 | 2 | 30 | 6 + 2 | 2 | 30 | 7 + 2 | 2 | 30 | 8 + 3 | 3 | 10 | 3 + 3 | 3 | 10 | 4 + 3 | 3 | 20 | 3 + 3 | 3 | 20 | 4 + 3 | 3 | 20 | 5 + 3 | 3 | 20 | 6 + 3 | 3 | 30 | 3 + 3 | 3 | 30 | 4 + 3 | 3 | 30 | 5 + 3 | 3 | 30 | 6 + 3 | 3 | 30 | 7 + 3 | 3 | 30 | 8 +(45 rows) + +-- check handling of FULL JOIN with multiple lateral references (bug #15741) +SELECT * +FROM (VALUES (1),(2)) v1(r1) + LEFT JOIN LATERAL ( + SELECT * + FROM generate_series(1, v1.r1) AS gs1 + LEFT JOIN LATERAL ( + SELECT * + FROM generate_series(1, gs1) AS gs2 + LEFT JOIN generate_series(1, gs2) AS gs3 ON TRUE + ) AS ss1 ON TRUE + FULL JOIN generate_series(1, v1.r1) AS gs4 ON FALSE + ) AS ss0 ON TRUE; + r1 | gs1 | gs2 | gs3 | gs4 +----+-----+-----+-----+----- + 1 | | | | 1 + 1 | 1 | 1 | 1 | + 2 | | | | 1 + 2 | | | | 2 + 2 | 1 | 1 | 1 | + 2 | 2 | 1 | 1 | + 2 | 2 | 2 | 1 | + 2 | 2 | 2 | 2 | +(8 rows) + +DROP FUNCTION rngfunc_sql(int,int); +DROP FUNCTION rngfunc_mat(int,int); +DROP SEQUENCE rngfunc_rescan_seq1; +DROP SEQUENCE rngfunc_rescan_seq2; +-- +-- Test cases involving OUT parameters +-- +CREATE FUNCTION rngfunc(in f1 int, out f2 int) +AS 'select $1+1' LANGUAGE sql; +SELECT rngfunc(42); + rngfunc +--------- + 43 +(1 row) + +SELECT * FROM rngfunc(42); + f2 +---- + 43 +(1 row) + +SELECT * FROM rngfunc(42) AS p(x); + x +---- + 43 +(1 row) + +-- explicit spec of return type is OK +CREATE OR REPLACE FUNCTION rngfunc(in f1 int, out f2 int) RETURNS int +AS 'select $1+1' LANGUAGE sql; +-- error, wrong result type +CREATE OR REPLACE FUNCTION rngfunc(in f1 int, out f2 int) RETURNS float +AS 'select $1+1' LANGUAGE sql; +ERROR: function result type must be integer because of OUT parameters +-- with multiple OUT params you must get a RECORD result +CREATE OR REPLACE FUNCTION rngfunc(in f1 int, out f2 int, out f3 text) RETURNS int +AS 'select $1+1' LANGUAGE sql; +ERROR: function result type must be record because of OUT parameters +CREATE OR REPLACE FUNCTION rngfunc(in f1 int, out f2 int, out f3 text) 
+RETURNS record +AS 'select $1+1' LANGUAGE sql; +ERROR: cannot change return type of existing function +HINT: Use DROP FUNCTION rngfunc(integer) first. +CREATE OR REPLACE FUNCTION rngfuncr(in f1 int, out f2 int, out text) +AS $$select $1-1, $1::text || 'z'$$ LANGUAGE sql; +SELECT f1, rngfuncr(f1) FROM int4_tbl; + f1 | rngfuncr +-------------+---------------------------- + 0 | (-1,0z) + 123456 | (123455,123456z) + -123456 | (-123457,-123456z) + 2147483647 | (2147483646,2147483647z) + -2147483647 | (-2147483648,-2147483647z) +(5 rows) + +SELECT * FROM rngfuncr(42); + f2 | column2 +----+--------- + 41 | 42z +(1 row) + +SELECT * FROM rngfuncr(42) AS p(a,b); + a | b +----+----- + 41 | 42z +(1 row) + +CREATE OR REPLACE FUNCTION rngfuncb(in f1 int, inout f2 int, out text) +AS $$select $2-1, $1::text || 'z'$$ LANGUAGE sql; +SELECT f1, rngfuncb(f1, f1/2) FROM int4_tbl; + f1 | rngfuncb +-------------+---------------------------- + 0 | (-1,0z) + 123456 | (61727,123456z) + -123456 | (-61729,-123456z) + 2147483647 | (1073741822,2147483647z) + -2147483647 | (-1073741824,-2147483647z) +(5 rows) + +SELECT * FROM rngfuncb(42, 99); + f2 | column2 +----+--------- + 98 | 42z +(1 row) + +SELECT * FROM rngfuncb(42, 99) AS p(a,b); + a | b +----+----- + 98 | 42z +(1 row) + +-- Can reference function with or without OUT params for DROP, etc +DROP FUNCTION rngfunc(int); +DROP FUNCTION rngfuncr(in f2 int, out f1 int, out text); +DROP FUNCTION rngfuncb(in f1 int, inout f2 int); +-- +-- For my next trick, polymorphic OUT parameters +-- +CREATE FUNCTION dup (f1 anyelement, f2 out anyelement, f3 out anyarray) +AS 'select $1, array[$1,$1]' LANGUAGE sql; +SELECT dup(22); + dup +---------------- + (22,"{22,22}") +(1 row) + +SELECT dup('xyz'); -- fails +ERROR: could not determine polymorphic type because input has type unknown +SELECT dup('xyz'::text); + dup +------------------- + (xyz,"{xyz,xyz}") +(1 row) + +SELECT * FROM dup('xyz'::text); + f2 | f3 +-----+----------- + xyz | {xyz,xyz} +(1 row) + +-- fails, as we are attempting to rename first argument +CREATE OR REPLACE FUNCTION dup (inout f2 anyelement, out f3 anyarray) +AS 'select $1, array[$1,$1]' LANGUAGE sql; +ERROR: cannot change name of input parameter "f1" +HINT: Use DROP FUNCTION dup(anyelement) first. +DROP FUNCTION dup(anyelement); +-- equivalent behavior, though different name exposed for input arg +CREATE OR REPLACE FUNCTION dup (inout f2 anyelement, out f3 anyarray) +AS 'select $1, array[$1,$1]' LANGUAGE sql; +SELECT dup(22); + dup +---------------- + (22,"{22,22}") +(1 row) + +DROP FUNCTION dup(anyelement); +-- fails, no way to deduce outputs +CREATE FUNCTION bad (f1 int, out f2 anyelement, out f3 anyarray) +AS 'select $1, array[$1,$1]' LANGUAGE sql; +ERROR: cannot determine result data type +DETAIL: A function returning a polymorphic type must have at least one polymorphic argument. 
+-- +-- table functions +-- +CREATE OR REPLACE FUNCTION rngfunc() +RETURNS TABLE(a int) +AS $$ SELECT a FROM generate_series(1,5) a(a) $$ LANGUAGE sql; +SELECT * FROM rngfunc(); + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +DROP FUNCTION rngfunc(); +CREATE OR REPLACE FUNCTION rngfunc(int) +RETURNS TABLE(a int, b int) +AS $$ SELECT a, b + FROM generate_series(1,$1) a(a), + generate_series(1,$1) b(b) $$ LANGUAGE sql; +SELECT * FROM rngfunc(3); + a | b +---+--- + 1 | 1 + 1 | 2 + 1 | 3 + 2 | 1 + 2 | 2 + 2 | 3 + 3 | 1 + 3 | 2 + 3 | 3 +(9 rows) + +DROP FUNCTION rngfunc(int); +-- case that causes change of typmod knowledge during inlining +CREATE OR REPLACE FUNCTION rngfunc() +RETURNS TABLE(a varchar(5)) +AS $$ SELECT 'hello'::varchar(5) $$ LANGUAGE sql STABLE; +SELECT * FROM rngfunc() GROUP BY 1; + a +------- + hello +(1 row) + +DROP FUNCTION rngfunc(); +-- +-- some tests on SQL functions with RETURNING +-- +create temp table tt(f1 serial, data text); +create function insert_tt(text) returns int as +$$ insert into tt(data) values($1) returning f1 $$ +language sql; +select insert_tt('foo'); + insert_tt +----------- + 1 +(1 row) + +select insert_tt('bar'); + insert_tt +----------- + 2 +(1 row) + +select * from tt; + f1 | data +----+------ + 1 | foo + 2 | bar +(2 rows) + +-- insert will execute to completion even if function needs just 1 row +create or replace function insert_tt(text) returns int as +$$ insert into tt(data) values($1),($1||$1) returning f1 $$ +language sql; +select insert_tt('fool'); + insert_tt +----------- + 3 +(1 row) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool +(4 rows) + +-- setof does what's expected +create or replace function insert_tt2(text,text) returns setof int as +$$ insert into tt(data) values($1),($2) returning f1 $$ +language sql; +select insert_tt2('foolish','barrish'); + insert_tt2 +------------ + 5 + 6 +(2 rows) + +select * from insert_tt2('baz','quux'); + insert_tt2 +------------ + 7 + 8 +(2 rows) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool + 5 | foolish + 6 | barrish + 7 | baz + 8 | quux +(8 rows) + +-- limit doesn't prevent execution to completion +select insert_tt2('foolish','barrish') limit 1; + insert_tt2 +------------ + 9 +(1 row) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool + 5 | foolish + 6 | barrish + 7 | baz + 8 | quux + 9 | foolish + 10 | barrish +(10 rows) + +-- triggers will fire, too +create function noticetrigger() returns trigger as $$ +begin + raise notice 'noticetrigger % %', new.f1, new.data; + return null; +end $$ language plpgsql; +create trigger tnoticetrigger after insert on tt for each row +execute procedure noticetrigger(); +select insert_tt2('foolme','barme') limit 1; +NOTICE: noticetrigger 11 foolme +NOTICE: noticetrigger 12 barme + insert_tt2 +------------ + 11 +(1 row) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool + 5 | foolish + 6 | barrish + 7 | baz + 8 | quux + 9 | foolish + 10 | barrish + 11 | foolme + 12 | barme +(12 rows) + +-- and rules work +create temp table tt_log(f1 int, data text); +create rule insert_tt_rule as on insert to tt do also + insert into tt_log values(new.*); +select insert_tt2('foollog','barlog') limit 1; +NOTICE: noticetrigger 13 foollog +NOTICE: noticetrigger 14 barlog + insert_tt2 +------------ + 13 +(1 row) + +select * from tt; + f1 | data +----+---------- + 1 | foo + 2 | bar + 3 | fool + 4 | foolfool + 5 | 
foolish + 6 | barrish + 7 | baz + 8 | quux + 9 | foolish + 10 | barrish + 11 | foolme + 12 | barme + 13 | foollog + 14 | barlog +(14 rows) + +-- note that nextval() gets executed a second time in the rule expansion, +-- which is expected. +select * from tt_log; + f1 | data +----+--------- + 15 | foollog + 16 | barlog +(2 rows) + +-- test case for a whole-row-variable bug +create function rngfunc1(n integer, out a text, out b text) + returns setof record + language sql + as $$ select 'foo ' || i, 'bar ' || i from generate_series(1,$1) i $$; +set work_mem='64kB'; +select t.a, t, t.a from rngfunc1(10000) t limit 1; + a | t | a +-------+-------------------+------- + foo 1 | ("foo 1","bar 1") | foo 1 +(1 row) + +reset work_mem; +select t.a, t, t.a from rngfunc1(10000) t limit 1; + a | t | a +-------+-------------------+------- + foo 1 | ("foo 1","bar 1") | foo 1 +(1 row) + +drop function rngfunc1(n integer); +-- test use of SQL functions returning record +-- this is supported in some cases where the query doesn't specify +-- the actual record type ... +create function array_to_set(anyarray) returns setof record as $$ + select i AS "index", $1[i] AS "value" from generate_subscripts($1, 1) i +$$ language sql strict immutable; +select array_to_set(array['one', 'two']); + array_to_set +-------------- + (1,one) + (2,two) +(2 rows) + +select * from array_to_set(array['one', 'two']) as t(f1 int,f2 text); + f1 | f2 +----+----- + 1 | one + 2 | two +(2 rows) + +select * from array_to_set(array['one', 'two']); -- fail +ERROR: a column definition list is required for functions returning "record" +LINE 1: select * from array_to_set(array['one', 'two']); + ^ +create temp table rngfunc(f1 int8, f2 int8); +create function testrngfunc() returns record as $$ + insert into rngfunc values (1,2) returning *; +$$ language sql; +select testrngfunc(); + testrngfunc +------------- + (1,2) +(1 row) + +select * from testrngfunc() as t(f1 int8,f2 int8); + f1 | f2 +----+---- + 1 | 2 +(1 row) + +select * from testrngfunc(); -- fail +ERROR: a column definition list is required for functions returning "record" +LINE 1: select * from testrngfunc(); + ^ +drop function testrngfunc(); +create function testrngfunc() returns setof record as $$ + insert into rngfunc values (1,2), (3,4) returning *; +$$ language sql; +select testrngfunc(); + testrngfunc +------------- + (1,2) + (3,4) +(2 rows) + +select * from testrngfunc() as t(f1 int8,f2 int8); + f1 | f2 +----+---- + 1 | 2 + 3 | 4 +(2 rows) + +select * from testrngfunc(); -- fail +ERROR: a column definition list is required for functions returning "record" +LINE 1: select * from testrngfunc(); + ^ +drop function testrngfunc(); +-- +-- Check some cases involving added/dropped columns in a rowtype result +-- +create temp table users (userid text, seq int, email text, todrop bool, moredrop int, enabled bool); +insert into users values ('id',1,'email',true,11,true); +insert into users values ('id2',2,'email2',true,12,true); +alter table users drop column todrop; +create or replace function get_first_user() returns users as +$$ SELECT * FROM users ORDER BY userid LIMIT 1; $$ +language sql stable; +SELECT get_first_user(); + get_first_user +------------------- + (id,1,email,11,t) +(1 row) + +SELECT * FROM get_first_user(); + userid | seq | email | moredrop | enabled +--------+-----+-------+----------+--------- + id | 1 | email | 11 | t +(1 row) + +create or replace function get_users() returns setof users as +$$ SELECT * FROM users ORDER BY userid; $$ +language sql stable; +SELECT 
get_users(); + get_users +--------------------- + (id,1,email,11,t) + (id2,2,email2,12,t) +(2 rows) + +SELECT * FROM get_users(); + userid | seq | email | moredrop | enabled +--------+-----+--------+----------+--------- + id | 1 | email | 11 | t + id2 | 2 | email2 | 12 | t +(2 rows) + +SELECT * FROM get_users() WITH ORDINALITY; -- make sure ordinality copes + userid | seq | email | moredrop | enabled | ordinality +--------+-----+--------+----------+---------+------------ + id | 1 | email | 11 | t | 1 + id2 | 2 | email2 | 12 | t | 2 +(2 rows) + +-- multiple functions vs. dropped columns +SELECT * FROM ROWS FROM(generate_series(10,11), get_users()) WITH ORDINALITY; + generate_series | userid | seq | email | moredrop | enabled | ordinality +-----------------+--------+-----+--------+----------+---------+------------ + 10 | id | 1 | email | 11 | t | 1 + 11 | id2 | 2 | email2 | 12 | t | 2 +(2 rows) + +SELECT * FROM ROWS FROM(get_users(), generate_series(10,11)) WITH ORDINALITY; + userid | seq | email | moredrop | enabled | generate_series | ordinality +--------+-----+--------+----------+---------+-----------------+------------ + id | 1 | email | 11 | t | 10 | 1 + id2 | 2 | email2 | 12 | t | 11 | 2 +(2 rows) + +-- check that we can cope with post-parsing changes in rowtypes +create temp view usersview as +SELECT * FROM ROWS FROM(get_users(), generate_series(10,11)) WITH ORDINALITY; +select * from usersview; + userid | seq | email | moredrop | enabled | generate_series | ordinality +--------+-----+--------+----------+---------+-----------------+------------ + id | 1 | email | 11 | t | 10 | 1 + id2 | 2 | email2 | 12 | t | 11 | 2 +(2 rows) + +alter table users add column junk text; +select * from usersview; + userid | seq | email | moredrop | enabled | generate_series | ordinality +--------+-----+--------+----------+---------+-----------------+------------ + id | 1 | email | 11 | t | 10 | 1 + id2 | 2 | email2 | 12 | t | 11 | 2 +(2 rows) + +begin; +alter table users drop column moredrop; +select * from usersview; -- expect clean failure +ERROR: attribute 5 of type record has been dropped +rollback; +alter table users alter column seq type numeric; +select * from usersview; -- expect clean failure +ERROR: attribute 2 of type record has wrong type +DETAIL: Table has type numeric, but query expects integer. +drop view usersview; +drop function get_first_user(); +drop function get_users(); +drop table users; +-- this won't get inlined because of type coercion, but it shouldn't fail +create or replace function rngfuncbar() returns setof text as +$$ select 'foo'::varchar union all select 'bar'::varchar ; $$ +language sql stable; +select rngfuncbar(); + rngfuncbar +------------ + foo + bar +(2 rows) + +select * from rngfuncbar(); + rngfuncbar +------------ + foo + bar +(2 rows) + +drop function rngfuncbar(); +-- check handling of a SQL function with multiple OUT params (bug #5777) +create or replace function rngfuncbar(out integer, out numeric) as +$$ select (1, 2.1) $$ language sql; +select * from rngfuncbar(); + column1 | column2 +---------+--------- + 1 | 2.1 +(1 row) + +create or replace function rngfuncbar(out integer, out numeric) as +$$ select (1, 2) $$ language sql; +select * from rngfuncbar(); -- fail +ERROR: function return row and query-specified return row do not match +DETAIL: Returned type integer at ordinal position 2, but query expects numeric. 
+create or replace function rngfuncbar(out integer, out numeric) as +$$ select (1, 2.1, 3) $$ language sql; +select * from rngfuncbar(); -- fail +ERROR: function return row and query-specified return row do not match +DETAIL: Returned row contains 3 attributes, but query expects 2. +drop function rngfuncbar(); +-- check whole-row-Var handling in nested lateral functions (bug #11703) +create function extractq2(t int8_tbl) returns int8 as $$ + select t.q2 +$$ language sql immutable; +explain (verbose, costs off) +select x from int8_tbl, extractq2(int8_tbl) f(x); + QUERY PLAN +------------------------------------ + Nested Loop + Output: f.x + -> Seq Scan on public.int8_tbl + Output: int8_tbl.q2 + -> Function Scan on f + Output: f.x + Function Call: int8_tbl.q2 +(7 rows) + +select x from int8_tbl, extractq2(int8_tbl) f(x); + x +------------------- + 456 + 4567890123456789 + 123 + 4567890123456789 + -4567890123456789 +(5 rows) + +create function extractq2_2(t int8_tbl) returns table(ret1 int8) as $$ + select extractq2(t) offset 0 +$$ language sql immutable; +explain (verbose, costs off) +select x from int8_tbl, extractq2_2(int8_tbl) f(x); + QUERY PLAN +----------------------------------- + Nested Loop + Output: ((int8_tbl.*).q2) + -> Seq Scan on public.int8_tbl + Output: int8_tbl.* + -> Result + Output: (int8_tbl.*).q2 +(6 rows) + +select x from int8_tbl, extractq2_2(int8_tbl) f(x); + x +------------------- + 456 + 4567890123456789 + 123 + 4567890123456789 + -4567890123456789 +(5 rows) + +-- without the "offset 0", this function gets optimized quite differently +create function extractq2_2_opt(t int8_tbl) returns table(ret1 int8) as $$ + select extractq2(t) +$$ language sql immutable; +explain (verbose, costs off) +select x from int8_tbl, extractq2_2_opt(int8_tbl) f(x); + QUERY PLAN +----------------------------- + Seq Scan on public.int8_tbl + Output: int8_tbl.q2 +(2 rows) + +select x from int8_tbl, extractq2_2_opt(int8_tbl) f(x); + x +------------------- + 456 + 4567890123456789 + 123 + 4567890123456789 + -4567890123456789 +(5 rows) + +-- check handling of nulls in SRF results (bug #7808) +create type rngfunc2 as (a integer, b text); +select *, row_to_json(u) from unnest(array[(1,'foo')::rngfunc2, null::rngfunc2]) u; + a | b | row_to_json +---+-----+--------------------- + 1 | foo | {"a":1,"b":"foo"} + | | {"a":null,"b":null} +(2 rows) + +select *, row_to_json(u) from unnest(array[null::rngfunc2, null::rngfunc2]) u; + a | b | row_to_json +---+---+--------------------- + | | {"a":null,"b":null} + | | {"a":null,"b":null} +(2 rows) + +select *, row_to_json(u) from unnest(array[null::rngfunc2, (1,'foo')::rngfunc2, null::rngfunc2]) u; + a | b | row_to_json +---+-----+--------------------- + | | {"a":null,"b":null} + 1 | foo | {"a":1,"b":"foo"} + | | {"a":null,"b":null} +(3 rows) + +select *, row_to_json(u) from unnest(array[]::rngfunc2[]) u; + a | b | row_to_json +---+---+------------- +(0 rows) + +drop type rngfunc2; diff --git a/src/test/regress/expected/reloptions_1.out b/src/test/regress/expected/reloptions_1.out new file mode 100644 index 0000000000..fd0b73a365 --- /dev/null +++ b/src/test/regress/expected/reloptions_1.out @@ -0,0 +1,219 @@ +-- Simple create +CREATE TABLE reloptions_test(i INT) WITH (FiLLFaCToR=30, + autovacuum_enabled = false, autovacuum_analyze_scale_factor = 0.2); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +------------------------------------------------------------------------------ + 
{fillfactor=30,autovacuum_enabled=false,autovacuum_analyze_scale_factor=0.2} +(1 row) + +-- Fail min/max values check +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=2); +ERROR: value 2 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=110); +ERROR: value 110 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_analyze_scale_factor = -10.0); +ERROR: value -10.0 out of bounds for option "autovacuum_analyze_scale_factor" +DETAIL: Valid values are between "0.000000" and "100.000000". +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_analyze_scale_factor = 110.0); +ERROR: value 110.0 out of bounds for option "autovacuum_analyze_scale_factor" +DETAIL: Valid values are between "0.000000" and "100.000000". +-- Fail when option and namespace do not exist +CREATE TABLE reloptions_test2(i INT) WITH (not_existing_option=2); +ERROR: unrecognized parameter "not_existing_option" +CREATE TABLE reloptions_test2(i INT) WITH (not_existing_namespace.fillfactor=2); +ERROR: unrecognized parameter namespace "not_existing_namespace" +-- Fail while setting improper values +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=-30.1); +ERROR: value -30.1 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor='string'); +ERROR: invalid value for integer option "fillfactor": string +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=true); +ERROR: invalid value for integer option "fillfactor": true +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_enabled=12); +ERROR: invalid value for boolean option "autovacuum_enabled": 12 +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_enabled=30.5); +ERROR: invalid value for boolean option "autovacuum_enabled": 30.5 +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_enabled='string'); +ERROR: invalid value for boolean option "autovacuum_enabled": string +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_analyze_scale_factor='string'); +ERROR: invalid value for floating point option "autovacuum_analyze_scale_factor": string +CREATE TABLE reloptions_test2(i INT) WITH (autovacuum_analyze_scale_factor=true); +ERROR: invalid value for floating point option "autovacuum_analyze_scale_factor": true +-- Fail if option is specified twice +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor=30, fillfactor=40); +ERROR: parameter "fillfactor" specified more than once +-- Specifying name only for a non-Boolean option should fail +CREATE TABLE reloptions_test2(i INT) WITH (fillfactor); +ERROR: invalid value for integer option "fillfactor": true +-- Simple ALTER TABLE +ALTER TABLE reloptions_test SET (fillfactor=31, + autovacuum_analyze_scale_factor = 0.3); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +------------------------------------------------------------------------------ + {autovacuum_enabled=false,fillfactor=31,autovacuum_analyze_scale_factor=0.3} +(1 row) + +-- Set boolean option to true without specifying value +ALTER TABLE reloptions_test SET (autovacuum_enabled, fillfactor=32); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +----------------------------------------------------------------------------- + {autovacuum_analyze_scale_factor=0.3,autovacuum_enabled=true,fillfactor=32} +(1 row) + +-- Check 
that RESET works well +ALTER TABLE reloptions_test RESET (fillfactor); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +--------------------------------------------------------------- + {autovacuum_analyze_scale_factor=0.3,autovacuum_enabled=true} +(1 row) + +-- Resetting all values causes the column to become null +ALTER TABLE reloptions_test RESET (autovacuum_enabled, + autovacuum_analyze_scale_factor); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass AND + reloptions IS NULL; + reloptions +------------ + +(1 row) + +-- RESET fails if a value is specified +ALTER TABLE reloptions_test RESET (fillfactor=12); +ERROR: RESET must not include values for parameters +-- Test vacuum_truncate option +DROP TABLE reloptions_test; +CREATE TABLE reloptions_test(i INT NOT NULL, j text) + WITH (vacuum_truncate=false, + toast.vacuum_truncate=false, + autovacuum_enabled=false); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +-------------------------------------------------- + {vacuum_truncate=false,autovacuum_enabled=false} +(1 row) + +INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL); +ERROR: null value in column "i" violates not-null constraint +DETAIL: Failing row contains (null, null). +VACUUM reloptions_test; +SELECT pg_relation_size('reloptions_test') > 0; + ?column? +---------- + t +(1 row) + +SELECT reloptions FROM pg_class WHERE oid = + (SELECT reltoastrelid FROM pg_class + WHERE oid = 'reloptions_test'::regclass); + reloptions +------------ +(0 rows) + +ALTER TABLE reloptions_test RESET (vacuum_truncate); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +---------------------------- + {autovacuum_enabled=false} +(1 row) + +INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL); +ERROR: null value in column "i" violates not-null constraint +DETAIL: Failing row contains (null, null). +VACUUM reloptions_test; +SELECT pg_relation_size('reloptions_test') = 0; + ?column? 
+---------- + f +(1 row) + +-- Test toast.* options +DROP TABLE reloptions_test; +CREATE TABLE reloptions_test (s VARCHAR) + WITH (toast.autovacuum_vacuum_cost_delay = 23); +SELECT reltoastrelid as toast_oid + FROM pg_class WHERE oid = 'reloptions_test'::regclass \gset +SELECT reloptions FROM pg_class WHERE oid = :toast_oid; + reloptions +------------ +(0 rows) + +ALTER TABLE reloptions_test SET (toast.autovacuum_vacuum_cost_delay = 24); +SELECT reloptions FROM pg_class WHERE oid = :toast_oid; + reloptions +------------ +(0 rows) + +ALTER TABLE reloptions_test RESET (toast.autovacuum_vacuum_cost_delay); +SELECT reloptions FROM pg_class WHERE oid = :toast_oid; + reloptions +------------ +(0 rows) + +-- Fail on non-existent options in toast namespace +CREATE TABLE reloptions_test2 (i int) WITH (toast.not_existing_option = 42); +ERROR: unrecognized parameter "not_existing_option" +-- Mix TOAST & heap +DROP TABLE reloptions_test; +CREATE TABLE reloptions_test (s VARCHAR) WITH + (toast.autovacuum_vacuum_cost_delay = 23, + autovacuum_vacuum_cost_delay = 24, fillfactor = 40); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; + reloptions +------------------------------------------------- + {autovacuum_vacuum_cost_delay=24,fillfactor=40} +(1 row) + +SELECT reloptions FROM pg_class WHERE oid = ( + SELECT reltoastrelid FROM pg_class WHERE oid = 'reloptions_test'::regclass); + reloptions +------------ +(0 rows) + +-- +-- CREATE INDEX, ALTER INDEX for btrees +-- +CREATE INDEX reloptions_test_idx ON reloptions_test (s) WITH (fillfactor=30); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test_idx'::regclass; + reloptions +----------------- + {fillfactor=30} +(1 row) + +-- Fail when option and namespace do not exist +CREATE INDEX reloptions_test_idx ON reloptions_test (s) + WITH (not_existing_option=2); +ERROR: unrecognized parameter "not_existing_option" +CREATE INDEX reloptions_test_idx ON reloptions_test (s) + WITH (not_existing_ns.fillfactor=2); +ERROR: unrecognized parameter namespace "not_existing_ns" +-- Check allowed ranges +CREATE INDEX reloptions_test_idx2 ON reloptions_test (s) WITH (fillfactor=1); +ERROR: value 1 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +CREATE INDEX reloptions_test_idx2 ON reloptions_test (s) WITH (fillfactor=130); +ERROR: value 130 out of bounds for option "fillfactor" +DETAIL: Valid values are between "10" and "100". +-- Check ALTER +ALTER INDEX reloptions_test_idx SET (fillfactor=40); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test_idx'::regclass; + reloptions +----------------- + {fillfactor=40} +(1 row) + +-- Check ALTER on empty reloption list +CREATE INDEX reloptions_test_idx3 ON reloptions_test (s); +ALTER INDEX reloptions_test_idx3 SET (fillfactor=40); +SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test_idx3'::regclass; + reloptions +----------------- + {fillfactor=40} +(1 row) + diff --git a/src/test/regress/expected/strings_1.out b/src/test/regress/expected/strings_1.out new file mode 100644 index 0000000000..a5c324a8b7 --- /dev/null +++ b/src/test/regress/expected/strings_1.out @@ -0,0 +1,1823 @@ +-- +-- STRINGS +-- Test various data entry syntaxes. 
+-- +-- SQL string continuation syntax +-- E021-03 character string literals +SELECT 'first line' +' - next line' + ' - third line' + AS "Three lines to one"; + Three lines to one +------------------------------------- + first line - next line - third line +(1 row) + +-- illegal string continuation syntax +SELECT 'first line' +' - next line' /* this comment is not allowed here */ +' - third line' + AS "Illegal comment within continuation"; +ERROR: syntax error at or near "' - third line'" +LINE 3: ' - third line' + ^ +-- Unicode escapes +SET standard_conforming_strings TO on; +SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061"; + data +------ + data +(1 row) + +SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*'; + dat\+000061 +------------- + dat\+000061 +(1 row) + +SELECT U&' \' UESCAPE '!' AS "tricky"; + tricky +-------- + \ +(1 row) + +SELECT 'tricky' AS U&"\" UESCAPE '!'; + \ +-------- + tricky +(1 row) + +SELECT U&'wrong: \061'; +ERROR: invalid Unicode escape value at or near "\061'" +LINE 1: SELECT U&'wrong: \061'; + ^ +SELECT U&'wrong: \+0061'; +ERROR: invalid Unicode escape value at or near "\+0061'" +LINE 1: SELECT U&'wrong: \+0061'; + ^ +SELECT U&'wrong: +0061' UESCAPE '+'; +ERROR: invalid Unicode escape character at or near "+'" +LINE 1: SELECT U&'wrong: +0061' UESCAPE '+'; + ^ +SET standard_conforming_strings TO off; +SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061"; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061"; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*'; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061... + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT U&' \' UESCAPE '!' AS "tricky"; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&' \' UESCAPE '!' AS "tricky"; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT 'tricky' AS U&"\" UESCAPE '!'; + \ +-------- + tricky +(1 row) + +SELECT U&'wrong: \061'; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'wrong: \061'; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT U&'wrong: \+0061'; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'wrong: \+0061'; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. +SELECT U&'wrong: +0061' UESCAPE '+'; +ERROR: unsafe use of string constant with Unicode escapes +LINE 1: SELECT U&'wrong: +0061' UESCAPE '+'; + ^ +DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. 
+RESET standard_conforming_strings; +-- bytea +SET bytea_output TO hex; +SELECT E'\\xDeAdBeEf'::bytea; + bytea +------------ + \xdeadbeef +(1 row) + +SELECT E'\\x De Ad Be Ef '::bytea; + bytea +------------ + \xdeadbeef +(1 row) + +SELECT E'\\xDeAdBeE'::bytea; +ERROR: invalid hexadecimal data: odd number of digits +LINE 1: SELECT E'\\xDeAdBeE'::bytea; + ^ +SELECT E'\\xDeAdBeEx'::bytea; +ERROR: invalid hexadecimal digit: "x" +LINE 1: SELECT E'\\xDeAdBeEx'::bytea; + ^ +SELECT E'\\xDe00BeEf'::bytea; + bytea +------------ + \xde00beef +(1 row) + +SELECT E'DeAdBeEf'::bytea; + bytea +-------------------- + \x4465416442654566 +(1 row) + +SELECT E'De\\000dBeEf'::bytea; + bytea +-------------------- + \x4465006442654566 +(1 row) + +SELECT E'De\123dBeEf'::bytea; + bytea +-------------------- + \x4465536442654566 +(1 row) + +SELECT E'De\\123dBeEf'::bytea; + bytea +-------------------- + \x4465536442654566 +(1 row) + +SELECT E'De\\678dBeEf'::bytea; +ERROR: invalid input syntax for type bytea +LINE 1: SELECT E'De\\678dBeEf'::bytea; + ^ +SET bytea_output TO escape; +SELECT E'\\xDeAdBeEf'::bytea; + bytea +------------------ + \336\255\276\357 +(1 row) + +SELECT E'\\x De Ad Be Ef '::bytea; + bytea +------------------ + \336\255\276\357 +(1 row) + +SELECT E'\\xDe00BeEf'::bytea; + bytea +------------------ + \336\000\276\357 +(1 row) + +SELECT E'DeAdBeEf'::bytea; + bytea +---------- + DeAdBeEf +(1 row) + +SELECT E'De\\000dBeEf'::bytea; + bytea +------------- + De\000dBeEf +(1 row) + +SELECT E'De\\123dBeEf'::bytea; + bytea +---------- + DeSdBeEf +(1 row) + +-- +-- test conversions between various string types +-- E021-10 implicit casting among the character data types +-- +SELECT CAST(f1 AS text) AS "text(char)" FROM CHAR_TBL; + text(char) +------------ + a + ab + abcd + abcd +(4 rows) + +SELECT CAST(f1 AS text) AS "text(varchar)" FROM VARCHAR_TBL; + text(varchar) +--------------- + a + ab + abcd + abcd +(4 rows) + +SELECT CAST(name 'namefield' AS text) AS "text(name)"; + text(name) +------------ + namefield +(1 row) + +-- since this is an explicit cast, it should truncate w/o error: +SELECT CAST(f1 AS char(10)) AS "char(text)" FROM TEXT_TBL; + char(text) +------------ + doh! + hi de ho n +(2 rows) + +-- note: implicit-cast case is tested in char.sql +SELECT CAST(f1 AS char(20)) AS "char(text)" FROM TEXT_TBL; + char(text) +---------------------- + doh! + hi de ho neighbor +(2 rows) + +SELECT CAST(f1 AS char(10)) AS "char(varchar)" FROM VARCHAR_TBL; + char(varchar) +--------------- + a + ab + abcd + abcd +(4 rows) + +SELECT CAST(name 'namefield' AS char(10)) AS "char(name)"; + char(name) +------------ + namefield +(1 row) + +SELECT CAST(f1 AS varchar) AS "varchar(text)" FROM TEXT_TBL; + varchar(text) +------------------- + doh! 
+ hi de ho neighbor +(2 rows) + +SELECT CAST(f1 AS varchar) AS "varchar(char)" FROM CHAR_TBL; + varchar(char) +--------------- + a + ab + abcd + abcd +(4 rows) + +SELECT CAST(name 'namefield' AS varchar) AS "varchar(name)"; + varchar(name) +--------------- + namefield +(1 row) + +-- +-- test SQL string functions +-- E### and T### are feature reference numbers from SQL99 +-- +-- E021-09 trim function +SELECT TRIM(BOTH FROM ' bunch o blanks ') = 'bunch o blanks' AS "bunch o blanks"; + bunch o blanks +---------------- + t +(1 row) + +SELECT TRIM(LEADING FROM ' bunch o blanks ') = 'bunch o blanks ' AS "bunch o blanks "; + bunch o blanks +------------------ + t +(1 row) + +SELECT TRIM(TRAILING FROM ' bunch o blanks ') = ' bunch o blanks' AS " bunch o blanks"; + bunch o blanks +------------------ + t +(1 row) + +SELECT TRIM(BOTH 'x' FROM 'xxxxxsome Xsxxxxx') = 'some Xs' AS "some Xs"; + some Xs +--------- + t +(1 row) + +-- E021-06 substring expression +SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890"; + 34567890 +---------- + t +(1 row) + +SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; + 456 +----- + t +(1 row) + +-- T581 regular expression substring (with SQL's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; + bcd +----- + bcd +(1 row) + +-- No match should return NULL +SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; + True +------ + t +(1 row) + +-- Null inputs should return NULL +SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True"; + True +------ + t +(1 row) + +SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True"; + True +------ + t +(1 row) + +SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; + True +------ + t +(1 row) + +-- The first and last parts should act non-greedy +SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg"; + abcdefg +--------- + abcdefg +(1 row) + +-- Vertical bar in any part affects only that part +SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +-- Can't have more than two part separators +SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error"; +ERROR: SQL regular expression may not contain more than two escape-double-quote separators +CONTEXT: SQL function "substring" statement 1 +-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty +SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg"; + bcdefg +-------- + bcdefg +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg"; + abcdefg +--------- + abcdefg +(1 row) + +-- substring() with just two arguments is not allowed by SQL spec; +-- we accept it, but we interpret the pattern as a POSIX regexp not SQL +SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; + cde +----- + cde +(1 row) + +-- With a parenthesized subexpression, return only what matches the subexpr +SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; + cde +----- + cde +(1 row) + +-- PostgreSQL extension to allow using back reference in replace string; +SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); + regexp_replace +---------------- + (111) 222-3333 +(1 row) + +SELECT 
regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g'); + regexp_replace +---------------- + AAA BBB CCC +(1 row) + +SELECT regexp_replace('AAA', '^|$', 'Z', 'g'); + regexp_replace +---------------- + ZAAAZ +(1 row) + +SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); + regexp_replace +---------------- + Z Z +(1 row) + +-- invalid regexp option +SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); +ERROR: invalid regular expression option: "z" +-- set so we can tell NULL from empty string +\pset null '\\N' +-- return all matches from regexp +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); + regexp_matches +---------------- + {bar,beque} +(1 row) + +-- test case insensitive +SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); + regexp_matches +---------------- + {bAR,bEqUE} +(1 row) + +-- global option - more than one match +SELECT regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); + regexp_matches +---------------- + {bar,beque} + {bazil,barf} +(2 rows) + +-- empty capture group (matched empty string) +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$); + regexp_matches +---------------- + {bar,"",beque} +(1 row) + +-- no match +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$); + regexp_matches +---------------- +(0 rows) + +-- optional capture group did not match, null entry in array +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$); + regexp_matches +------------------ + {bar,NULL,beque} +(1 row) + +-- no capture groups +SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$); + regexp_matches +---------------- + {barbeque} +(1 row) + +-- start/end-of-line matches are of zero length +SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^', 'mg'); + regexp_matches +---------------- + {""} + {""} + {""} + {""} +(4 rows) + +SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '$', 'mg'); + regexp_matches +---------------- + {""} + {""} + {""} + {""} +(4 rows) + +SELECT regexp_matches('1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '^.?', 'mg'); + regexp_matches +---------------- + {1} + {2} + {3} + {4} + {""} +(5 rows) + +SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '.?$', 'mg'); + regexp_matches +---------------- + {""} + {1} + {""} + {2} + {""} + {3} + {""} + {4} + {""} + {""} +(10 rows) + +SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4', '.?$', 'mg'); + regexp_matches +---------------- + {""} + {1} + {""} + {2} + {""} + {3} + {""} + {4} + {""} +(9 rows) + +-- give me errors +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz'); +ERROR: invalid regular expression option: "z" +SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$); +ERROR: invalid regular expression: parentheses () not balanced +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$); +ERROR: invalid regular expression: invalid repetition count(s) +-- split string on regexp +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s+$re$) AS foo; + foo | length +-------+-------- + the | 3 + quick | 5 + brown | 5 + fox | 3 + jumps | 5 + over | 4 + the | 3 + lazy | 4 + dog | 3 +(9 rows) + +SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', $re$\s+$re$); + regexp_split_to_array +----------------------------------------------- + 
{the,quick,brown,fox,jumps,over,the,lazy,dog} +(1 row) + +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s*$re$) AS foo; + foo | length +-----+-------- + t | 1 + h | 1 + e | 1 + q | 1 + u | 1 + i | 1 + c | 1 + k | 1 + b | 1 + r | 1 + o | 1 + w | 1 + n | 1 + f | 1 + o | 1 + x | 1 + j | 1 + u | 1 + m | 1 + p | 1 + s | 1 + o | 1 + v | 1 + e | 1 + r | 1 + t | 1 + h | 1 + e | 1 + l | 1 + a | 1 + z | 1 + y | 1 + d | 1 + o | 1 + g | 1 +(35 rows) + +SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', $re$\s*$re$); + regexp_split_to_array +------------------------------------------------------------------------- + {t,h,e,q,u,i,c,k,b,r,o,w,n,f,o,x,j,u,m,p,s,o,v,e,r,t,h,e,l,a,z,y,d,o,g} +(1 row) + +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', '') AS foo; + foo | length +-----+-------- + t | 1 + h | 1 + e | 1 + | 1 + q | 1 + u | 1 + i | 1 + c | 1 + k | 1 + | 1 + b | 1 + r | 1 + o | 1 + w | 1 + n | 1 + | 1 + f | 1 + o | 1 + x | 1 + | 1 + j | 1 + u | 1 + m | 1 + p | 1 + s | 1 + | 1 + o | 1 + v | 1 + e | 1 + r | 1 + | 1 + t | 1 + h | 1 + e | 1 + | 1 + l | 1 + a | 1 + z | 1 + y | 1 + | 1 + d | 1 + o | 1 + g | 1 +(43 rows) + +SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', ''); + regexp_split_to_array +--------------------------------------------------------------------------------------------------------- + {t,h,e," ",q,u,i,c,k," ",b,r,o,w,n," ",f,o,x," ",j,u,m,p,s," ",o,v,e,r," ",t,h,e," ",l,a,z,y," ",d,o,g} +(1 row) + +-- case insensitive +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i') AS foo; + foo | length +---------------------------+-------- + th | 2 + QUick bROWn FOx jUMPs ov | 25 + r Th | 4 + lazy dOG | 9 +(4 rows) + +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i'); + regexp_split_to_array +----------------------------------------------------- + {th," QUick bROWn FOx jUMPs ov","r Th"," lazy dOG"} +(1 row) + +-- no match of pattern +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', 'nomatch') AS foo; + foo | length +---------------------------------------------+-------- + the quick brown fox jumps over the lazy dog | 43 +(1 row) + +SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nomatch'); + regexp_split_to_array +------------------------------------------------- + {"the quick brown fox jumps over the lazy dog"} +(1 row) + +-- some corner cases +SELECT regexp_split_to_array('123456','1'); + regexp_split_to_array +----------------------- + {"",23456} +(1 row) + +SELECT regexp_split_to_array('123456','6'); + regexp_split_to_array +----------------------- + {12345,""} +(1 row) + +SELECT regexp_split_to_array('123456','.'); + regexp_split_to_array +------------------------ + {"","","","","","",""} +(1 row) + +SELECT regexp_split_to_array('123456',''); + regexp_split_to_array +----------------------- + {1,2,3,4,5,6} +(1 row) + +SELECT regexp_split_to_array('123456','(?:)'); + regexp_split_to_array +----------------------- + {1,2,3,4,5,6} +(1 row) + +SELECT regexp_split_to_array('1',''); + regexp_split_to_array +----------------------- + {1} +(1 row) + +-- errors +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo; +ERROR: invalid regular expression option: "z" +SELECT regexp_split_to_array('thE QUick bROWn 
FOx jUMPs ovEr The lazy dOG', 'e', 'iz'); +ERROR: invalid regular expression option: "z" +-- global option meaningless for regexp_split +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g') AS foo; +ERROR: regexp_split_to_table() does not support the "global" option +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g'); +ERROR: regexp_split_to_array() does not support the "global" option +-- change NULL-display back +\pset null '' +-- E021-11 position expression +SELECT POSITION('4' IN '1234567890') = '4' AS "4"; + 4 +--- + t +(1 row) + +SELECT POSITION('5' IN '1234567890') = '5' AS "5"; + 5 +--- + t +(1 row) + +-- T312 character overlay function +SELECT OVERLAY('abcdef' PLACING '45' FROM 4) AS "abc45f"; + abc45f +-------- + abc45f +(1 row) + +SELECT OVERLAY('yabadoo' PLACING 'daba' FROM 5) AS "yabadaba"; + yabadaba +---------- + yabadaba +(1 row) + +SELECT OVERLAY('yabadoo' PLACING 'daba' FROM 5 FOR 0) AS "yabadabadoo"; + yabadabadoo +------------- + yabadabadoo +(1 row) + +SELECT OVERLAY('babosa' PLACING 'ubb' FROM 2 FOR 4) AS "bubba"; + bubba +------- + bubba +(1 row) + +-- +-- test LIKE +-- Be sure to form every test as a LIKE/NOT LIKE pair. +-- +-- simplest examples +-- E061-04 like predicate +SELECT 'hawkeye' LIKE 'h%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT LIKE 'h%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' LIKE 'H%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' NOT LIKE 'H%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' LIKE 'indio%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' NOT LIKE 'indio%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' LIKE 'h%eye' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT LIKE 'h%eye' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE '_ndio' AS "true"; + true +------ + t +(1 row) + +SELECT 'indio' NOT LIKE '_ndio' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE 'in__o' AS "true"; + true +------ + t +(1 row) + +SELECT 'indio' NOT LIKE 'in__o' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE 'in_o' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' NOT LIKE 'in_o' AS "true"; + true +------ + t +(1 row) + +-- unused escape character +SELECT 'hawkeye' LIKE 'h%' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT LIKE 'h%' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE 'ind_o' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'indio' NOT LIKE 'ind_o' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +-- escape character +-- E061-05 like predicate with escape clause +SELECT 'h%' LIKE 'h#%' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'h%' NOT LIKE 'h#%' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'h%wkeye' LIKE 'h#%' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'h%wkeye' NOT LIKE 'h#%' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'h%wkeye' LIKE 'h#%%' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'h%wkeye' NOT LIKE 'h#%%' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'h%awkeye' LIKE 'h#%a%k%e' ESCAPE '#' AS "true"; + true +------ + t +(1 row) + +SELECT 'h%awkeye' NOT LIKE 'h#%a%k%e' ESCAPE '#' AS "false"; + false +------- + f +(1 row) + +SELECT 'indio' LIKE '_ndio' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'indio' NOT LIKE 
'_ndio' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +SELECT 'i_dio' LIKE 'i$_d_o' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'i_dio' NOT LIKE 'i$_d_o' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +SELECT 'i_dio' LIKE 'i$_nd_o' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +SELECT 'i_dio' NOT LIKE 'i$_nd_o' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'i_dio' LIKE 'i$_d%o' ESCAPE '$' AS "true"; + true +------ + t +(1 row) + +SELECT 'i_dio' NOT LIKE 'i$_d%o' ESCAPE '$' AS "false"; + false +------- + f +(1 row) + +-- escape character same as pattern character +SELECT 'maca' LIKE 'm%aca' ESCAPE '%' AS "true"; + true +------ + t +(1 row) + +SELECT 'maca' NOT LIKE 'm%aca' ESCAPE '%' AS "false"; + false +------- + f +(1 row) + +SELECT 'ma%a' LIKE 'm%a%%a' ESCAPE '%' AS "true"; + true +------ + t +(1 row) + +SELECT 'ma%a' NOT LIKE 'm%a%%a' ESCAPE '%' AS "false"; + false +------- + f +(1 row) + +SELECT 'bear' LIKE 'b_ear' ESCAPE '_' AS "true"; + true +------ + t +(1 row) + +SELECT 'bear' NOT LIKE 'b_ear' ESCAPE '_' AS "false"; + false +------- + f +(1 row) + +SELECT 'be_r' LIKE 'b_e__r' ESCAPE '_' AS "true"; + true +------ + t +(1 row) + +SELECT 'be_r' NOT LIKE 'b_e__r' ESCAPE '_' AS "false"; + false +------- + f +(1 row) + +SELECT 'be_r' LIKE '__e__r' ESCAPE '_' AS "false"; + false +------- + f +(1 row) + +SELECT 'be_r' NOT LIKE '__e__r' ESCAPE '_' AS "true"; + true +------ + t +(1 row) + +-- +-- test ILIKE (case-insensitive LIKE) +-- Be sure to form every test as an ILIKE/NOT ILIKE pair. +-- +SELECT 'hawkeye' ILIKE 'h%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT ILIKE 'h%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' ILIKE 'H%' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT ILIKE 'H%' AS "false"; + false +------- + f +(1 row) + +SELECT 'hawkeye' ILIKE 'H%Eye' AS "true"; + true +------ + t +(1 row) + +SELECT 'hawkeye' NOT ILIKE 'H%Eye' AS "false"; + false +------- + f +(1 row) + +SELECT 'Hawkeye' ILIKE 'h%' AS "true"; + true +------ + t +(1 row) + +SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false"; + false +------- + f +(1 row) + +-- +-- test %/_ combination cases, cf bugs #4821 and #5478 +-- +SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f; + t | t | f +---+---+--- + t | t | f +(1 row) + +SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f; + t | t | f +---+---+--- + t | t | f +(1 row) + +SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f; + t | t | f +---+---+--- + t | t | f +(1 row) + +SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f; + t | t | f +---+---+--- + t | t | f +(1 row) + +SELECT 'jack' LIKE '%____%' AS t; + t +--- + t +(1 row) + +-- +-- basic tests of LIKE with indexes +-- +CREATE TABLE texttest (a text PRIMARY KEY, b int); +SELECT * FROM texttest WHERE a LIKE '%1%'; + a | b +---+--- +(0 rows) + +CREATE TABLE byteatest (a bytea PRIMARY KEY, b int); +SELECT * FROM byteatest WHERE a LIKE '%1%'; + a | b +---+--- +(0 rows) + +DROP TABLE texttest, byteatest; +-- +-- test implicit type conversion +-- +-- E021-07 character concatenation +SELECT 'unknown' || ' and unknown' AS "Concat unknown types"; + Concat unknown types +---------------------- + unknown and unknown +(1 row) + +SELECT text 'text' || ' and unknown' AS "Concat text to unknown type"; + Concat text to unknown type +----------------------------- + text and unknown +(1 row) + +SELECT char(20) 'characters' || ' and text' AS "Concat char 
to unknown type"; + Concat char to unknown type +----------------------------- + characters and text +(1 row) + +SELECT text 'text' || char(20) ' and characters' AS "Concat text to char"; + Concat text to char +--------------------- + text and characters +(1 row) + +SELECT text 'text' || varchar ' and varchar' AS "Concat text to varchar"; + Concat text to varchar +------------------------ + text and varchar +(1 row) + +-- +-- test substr with toasted text values +-- +CREATE TABLE toasttest(f1 text); +insert into toasttest values(repeat('1234567890',10000)); +insert into toasttest values(repeat('1234567890',10000)); +-- +-- Ensure that some values are uncompressed, to test the faster substring +-- operation used in that case +-- +alter table toasttest alter column f1 set storage external; +insert into toasttest values(repeat('1234567890',10000)); +insert into toasttest values(repeat('1234567890',10000)); +-- If the starting position is zero or less, then return from the start of the string +-- adjusting the length to be consistent with the "negative start" per SQL. +SELECT substr(f1, -1, 5) from toasttest; + substr +-------- + 123 + 123 + 123 + 123 +(4 rows) + +-- If the length is less than zero, an ERROR is thrown. +SELECT substr(f1, 5, -1) from toasttest; +ERROR: negative substring length not allowed +-- If no third argument (length) is provided, the length to the end of the +-- string is assumed. +SELECT substr(f1, 99995) from toasttest; + substr +-------- + 567890 + 567890 + 567890 + 567890 +(4 rows) + +-- If start plus length is > string length, the result is truncated to +-- string length +SELECT substr(f1, 99995, 10) from toasttest; + substr +-------- + 567890 + 567890 + 567890 + 567890 +(4 rows) + +TRUNCATE TABLE toasttest; +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +-- expect >0 blocks +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty + FROM pg_class where relname = 'toasttest'; + is_empty +---------- + +(1 row) + +TRUNCATE TABLE toasttest; +ALTER TABLE toasttest set (toast_tuple_target = 4080); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +INSERT INTO toasttest values (repeat('1234567890',300)); +-- expect 0 blocks +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty + FROM pg_class where relname = 'toasttest'; + is_empty +---------- + +(1 row) + +DROP TABLE toasttest; +-- +-- test substr with toasted bytea values +-- +CREATE TABLE toasttest(f1 bytea); +insert into toasttest values(decode(repeat('1234567890',10000),'escape')); +insert into toasttest values(decode(repeat('1234567890',10000),'escape')); +-- +-- Ensure that some values are uncompressed, to test the faster substring +-- operation used in that case +-- +alter table toasttest alter column f1 set storage external; +insert into toasttest values(decode(repeat('1234567890',10000),'escape')); +insert into toasttest values(decode(repeat('1234567890',10000),'escape')); +-- If the starting position is zero or less, then return from the start of the string +-- adjusting the length to be consistent with the "negative start" per SQL. +SELECT substr(f1, -1, 5) from toasttest; + substr +-------- + 123 + 123 + 123 + 123 +(4 rows) + +-- If the length is less than zero, an ERROR is thrown. 
+SELECT substr(f1, 5, -1) from toasttest; +ERROR: negative substring length not allowed +-- If no third argument (length) is provided, the length to the end of the +-- string is assumed. +SELECT substr(f1, 99995) from toasttest; + substr +-------- + 567890 + 567890 + 567890 + 567890 +(4 rows) + +-- If start plus length is > string length, the result is truncated to +-- string length +SELECT substr(f1, 99995, 10) from toasttest; + substr +-------- + 567890 + 567890 + 567890 + 567890 +(4 rows) + +DROP TABLE toasttest; +-- test internally compressing datums +-- this tests compressing a datum to a very small size which exercises a +-- corner case in packed-varlena handling: even though small, the compressed +-- datum must be given a 4-byte header because there are no bits to indicate +-- compression in a 1-byte header +CREATE TABLE toasttest (c char(4096)); +INSERT INTO toasttest VALUES('x'); +SELECT length(c), c::text FROM toasttest; + length | c +--------+--- + 1 | x +(1 row) + +SELECT c FROM toasttest; + c +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + x +(1 row) + +DROP TABLE toasttest; +-- +-- test length +-- +SELECT length('abcdef') AS "length_6"; + length_6 +---------- + 6 +(1 row) + +-- +-- test strpos +-- +SELECT strpos('abcdef', 'cd') AS "pos_3"; + pos_3 +------- + 3 +(1 row) + +SELECT strpos('abcdef', 'xy') AS "pos_0"; + pos_0 +------- + 0 +(1 row) + +-- +-- test replace +-- +SELECT replace('abcdef', 'de', '45') AS "abc45f"; + abc45f +-------- + abc45f +(1 row) + +SELECT replace('yabadabadoo', 'ba', '123') AS "ya123da123doo"; + ya123da123doo +--------------- + ya123da123doo +(1 row) + +SELECT replace('yabadoo', 'bad', '') AS "yaoo"; + yaoo +------ + yaoo +(1 row) + +-- +-- test split_part +-- +select split_part('joeuser@mydatabase','@',0) AS "an error"; +ERROR: field position must be greater than zero +select split_part('joeuser@mydatabase','@',1) AS "joeuser"; + joeuser +--------- + joeuser +(1 row) + +select split_part('joeuser@mydatabase','@',2) AS "mydatabase"; + mydatabase +------------ + mydatabase +(1 row) + +select split_part('joeuser@mydatabase','@',3) AS "empty string"; + empty string +-------------- + +(1 row) + +select split_part('@joeuser@mydatabase@','@',2) AS "joeuser"; + joeuser +--------- + joeuser +(1 row) + +-- +-- test to_hex +-- +select to_hex(256*256*256 - 1) AS "ffffff"; + ffffff +-------- + ffffff +(1 row) + +select to_hex(256::bigint*256::bigint*256::bigint*256::bigint - 1) AS "ffffffff"; + ffffffff +---------- + ffffffff +(1 row) + +-- +-- MD5 test suite - from IETF RFC 1321 +-- (see: ftp://ftp.rfc-editor.org/in-notes/rfc1321.txt) +-- +select md5('') = 'd41d8cd98f00b204e9800998ecf8427e' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('a') = '0cc175b9c0f1b6a831c399e269772661' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('abc') = '900150983cd24fb0d6963f7d28e17f72' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('message digest') = 'f96b697d7cb7938d525a2f31aaf161d0' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('abcdefghijklmnopqrstuvwxyz') = 
'c3fcd3d76192e4007dfb496cca67e13b' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789') = 'd174ab98d277d9f5a5611c2c9f419d9f' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('12345678901234567890123456789012345678901234567890123456789012345678901234567890') = '57edf4a22be3c955ac49da2e2107b67a' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5(''::bytea) = 'd41d8cd98f00b204e9800998ecf8427e' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('a'::bytea) = '0cc175b9c0f1b6a831c399e269772661' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('abc'::bytea) = '900150983cd24fb0d6963f7d28e17f72' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('message digest'::bytea) = 'f96b697d7cb7938d525a2f31aaf161d0' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('abcdefghijklmnopqrstuvwxyz'::bytea) = 'c3fcd3d76192e4007dfb496cca67e13b' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'::bytea) = 'd174ab98d277d9f5a5611c2c9f419d9f' AS "TRUE"; + TRUE +------ + t +(1 row) + +select md5('12345678901234567890123456789012345678901234567890123456789012345678901234567890'::bytea) = '57edf4a22be3c955ac49da2e2107b67a' AS "TRUE"; + TRUE +------ + t +(1 row) + +-- +-- SHA-2 +-- +SET bytea_output TO hex; +SELECT sha224(''); + sha224 +------------------------------------------------------------ + \xd14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f +(1 row) + +SELECT sha224('The quick brown fox jumps over the lazy dog.'); + sha224 +------------------------------------------------------------ + \x619cba8e8e05826e9b8c519c0a5c68f4fb653e8a3d8aa04bb2c8cd4c +(1 row) + +SELECT sha256(''); + sha256 +-------------------------------------------------------------------- + \xe3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +(1 row) + +SELECT sha256('The quick brown fox jumps over the lazy dog.'); + sha256 +-------------------------------------------------------------------- + \xef537f25c895bfa782526529a9b63d97aa631564d5d789c2b765448c8635fb6c +(1 row) + +SELECT sha384(''); + sha384 +---------------------------------------------------------------------------------------------------- + \x38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b +(1 row) + +SELECT sha384('The quick brown fox jumps over the lazy dog.'); + sha384 +---------------------------------------------------------------------------------------------------- + \xed892481d8272ca6df370bf706e4d7bc1b5739fa2177aae6c50e946678718fc67a7af2819a021c2fc34e91bdb63409d7 +(1 row) + +SELECT sha512(''); + sha512 +------------------------------------------------------------------------------------------------------------------------------------ + \xcf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e +(1 row) + +SELECT sha512('The quick brown fox jumps over the lazy dog.'); + sha512 +------------------------------------------------------------------------------------------------------------------------------------ + \x91ea1245f20d46ae9a037a989f54f1f790f0a47607eeb8a14d12890cea77a1bbc6c7ed9cf205e67b7f2b8fd4c7dfd3a7a8617e45f3c463d481c7e586c39ac1ed +(1 row) + +-- +-- test behavior of escape_string_warning and standard_conforming_strings options +-- +set escape_string_warning = off; +set standard_conforming_strings = off; +show escape_string_warning; + escape_string_warning +----------------------- + off 
+(1 row) + +show standard_conforming_strings; + standard_conforming_strings +----------------------------- + off +(1 row) + +set escape_string_warning = on; +set standard_conforming_strings = on; +show escape_string_warning; + escape_string_warning +----------------------- + on +(1 row) + +show standard_conforming_strings; + standard_conforming_strings +----------------------------- + on +(1 row) + +select 'a\bcd' as f1, 'a\b''cd' as f2, 'a\b''''cd' as f3, 'abcd\' as f4, 'ab\''cd' as f5, '\\' as f6; + f1 | f2 | f3 | f4 | f5 | f6 +-------+--------+---------+-------+--------+---- + a\bcd | a\b'cd | a\b''cd | abcd\ | ab\'cd | \\ +(1 row) + +set standard_conforming_strings = off; +select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3, 'abcd\\' as f4, 'ab\\\'cd' as f5, '\\\\' as f6; +WARNING: nonstandard use of \\ in a string literal +LINE 1: select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3,... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3,... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3,... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: ...bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3, 'abcd\\' ... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: ...'cd' as f2, 'a\\b\'''cd' as f3, 'abcd\\' as f4, 'ab\\\'cd'... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. +WARNING: nonstandard use of \\ in a string literal +LINE 1: ...'''cd' as f3, 'abcd\\' as f4, 'ab\\\'cd' as f5, '\\\\' as ... + ^ +HINT: Use the escape string syntax for backslashes, e.g., E'\\'. 
+ f1 | f2 | f3 | f4 | f5 | f6 +-------+--------+---------+-------+--------+---- + a\bcd | a\b'cd | a\b''cd | abcd\ | ab\'cd | \\ +(1 row) + +set escape_string_warning = off; +set standard_conforming_strings = on; +select 'a\bcd' as f1, 'a\b''cd' as f2, 'a\b''''cd' as f3, 'abcd\' as f4, 'ab\''cd' as f5, '\\' as f6; + f1 | f2 | f3 | f4 | f5 | f6 +-------+--------+---------+-------+--------+---- + a\bcd | a\b'cd | a\b''cd | abcd\ | ab\'cd | \\ +(1 row) + +set standard_conforming_strings = off; +select 'a\\bcd' as f1, 'a\\b\'cd' as f2, 'a\\b\'''cd' as f3, 'abcd\\' as f4, 'ab\\\'cd' as f5, '\\\\' as f6; + f1 | f2 | f3 | f4 | f5 | f6 +-------+--------+---------+-------+--------+---- + a\bcd | a\b'cd | a\b''cd | abcd\ | ab\'cd | \\ +(1 row) + +-- +-- Additional string functions +-- +SET bytea_output TO escape; +SELECT initcap('hi THOMAS'); + initcap +----------- + Hi Thomas +(1 row) + +SELECT lpad('hi', 5, 'xy'); + lpad +------- + xyxhi +(1 row) + +SELECT lpad('hi', 5); + lpad +------- + hi +(1 row) + +SELECT lpad('hi', -5, 'xy'); + lpad +------ + +(1 row) + +SELECT lpad('hello', 2); + lpad +------ + he +(1 row) + +SELECT lpad('hi', 5, ''); + lpad +------ + hi +(1 row) + +SELECT rpad('hi', 5, 'xy'); + rpad +------- + hixyx +(1 row) + +SELECT rpad('hi', 5); + rpad +------- + hi +(1 row) + +SELECT rpad('hi', -5, 'xy'); + rpad +------ + +(1 row) + +SELECT rpad('hello', 2); + rpad +------ + he +(1 row) + +SELECT rpad('hi', 5, ''); + rpad +------ + hi +(1 row) + +SELECT ltrim('zzzytrim', 'xyz'); + ltrim +------- + trim +(1 row) + +SELECT translate('', '14', 'ax'); + translate +----------- + +(1 row) + +SELECT translate('12345', '14', 'ax'); + translate +----------- + a23x5 +(1 row) + +SELECT ascii('x'); + ascii +------- + 120 +(1 row) + +SELECT ascii(''); + ascii +------- + 0 +(1 row) + +SELECT chr(65); + chr +----- + A +(1 row) + +SELECT chr(0); +ERROR: null character not permitted +SELECT repeat('Pg', 4); + repeat +---------- + PgPgPgPg +(1 row) + +SELECT repeat('Pg', -4); + repeat +-------- + +(1 row) + +SELECT trim(E'\\000'::bytea from E'\\000Tom\\000'::bytea); + btrim +------- + Tom +(1 row) + +SELECT btrim(E'\\000trim\\000'::bytea, E'\\000'::bytea); + btrim +------- + trim +(1 row) + +SELECT btrim(''::bytea, E'\\000'::bytea); + btrim +------- + +(1 row) + +SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea); + btrim +-------------- + \000trim\000 +(1 row) + +SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape'); + encode +------------- + TTh\x01omas +(1 row) + +SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape'); + encode +-------------------- + Th\000omas\x02\x03 +(1 row) + +SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape'); + encode +----------------- + Th\000o\x02\x03 +(1 row) + diff --git a/src/test/regress/expected/tsrf_1.out b/src/test/regress/expected/tsrf_1.out new file mode 100644 index 0000000000..a0f7d80c69 --- /dev/null +++ b/src/test/regress/expected/tsrf_1.out @@ -0,0 +1,712 @@ +-- +-- tsrf - targetlist set returning function tests +-- +-- simple srf +SELECT generate_series(1, 3); + generate_series +----------------- + 1 + 2 + 3 +(3 rows) + +-- parallel iteration +SELECT generate_series(1, 3), generate_series(3,5); + generate_series | generate_series +-----------------+----------------- + 1 | 3 + 2 | 4 + 3 | 5 +(3 rows) + +-- parallel iteration, different number of rows +SELECT generate_series(1, 2), generate_series(1,4); + generate_series | generate_series 
+-----------------+----------------- + 1 | 1 + 2 | 2 + | 3 + | 4 +(4 rows) + +-- srf, with SRF argument +SELECT generate_series(1, generate_series(1, 3)); + generate_series +----------------- + 1 + 1 + 2 + 1 + 2 + 3 +(6 rows) + +-- but we've traditionally rejected the same in FROM +SELECT * FROM generate_series(1, generate_series(1, 3)); +ERROR: set-returning functions must appear at top level of FROM +LINE 1: SELECT * FROM generate_series(1, generate_series(1, 3)); + ^ +-- srf, with two SRF arguments +SELECT generate_series(generate_series(1,3), generate_series(2, 4)); + generate_series +----------------- + 1 + 2 + 2 + 3 + 3 + 4 +(6 rows) + +-- check proper nesting of SRFs in different expressions +explain (verbose, costs off) +SELECT generate_series(1, generate_series(1, 3)), generate_series(2, 4); + QUERY PLAN +-------------------------------------------------------------------------------- + ProjectSet + Output: generate_series(1, (generate_series(1, 3))), (generate_series(2, 4)) + -> ProjectSet + Output: generate_series(1, 3), generate_series(2, 4) + -> Result +(5 rows) + +SELECT generate_series(1, generate_series(1, 3)), generate_series(2, 4); + generate_series | generate_series +-----------------+----------------- + 1 | 2 + 1 | 3 + 2 | 3 + 1 | 4 + 2 | 4 + 3 | 4 +(6 rows) + +CREATE TABLE few(id int, dataa text, datab text); +INSERT INTO few VALUES(1, 'a', 'foo'),(2, 'a', 'bar'),(3, 'b', 'bar'); +-- SRF with a provably-dummy relation +explain (verbose, costs off) +SELECT unnest(ARRAY[1, 2]) FROM few WHERE false; + QUERY PLAN +-------------------------------------- + ProjectSet + Output: unnest('{1,2}'::integer[]) + -> Result + One-Time Filter: false +(4 rows) + +SELECT unnest(ARRAY[1, 2]) FROM few WHERE false; + unnest +-------- +(0 rows) + +-- SRF shouldn't prevent upper query from recognizing lower as dummy +explain (verbose, costs off) +SELECT * FROM few f1, + (SELECT unnest(ARRAY[1,2]) FROM few f2 WHERE false OFFSET 0) ss; + QUERY PLAN +------------------------------------------------ + Result + Output: f1.id, f1.dataa, f1.datab, ss.unnest + One-Time Filter: false +(3 rows) + +SELECT * FROM few f1, + (SELECT unnest(ARRAY[1,2]) FROM few f2 WHERE false OFFSET 0) ss; + id | dataa | datab | unnest +----+-------+-------+-------- +(0 rows) + +-- SRF output order of sorting is maintained, if SRF is not referenced +SELECT few.id, generate_series(1,3) g FROM few ORDER BY id DESC; + id | g +----+--- + 3 | 1 + 3 | 2 + 3 | 3 + 2 | 1 + 2 | 2 + 2 | 3 + 1 | 1 + 1 | 2 + 1 | 3 +(9 rows) + +-- but SRFs can be referenced in sort +SELECT few.id, generate_series(1,3) g FROM few ORDER BY id, g DESC; + id | g +----+--- + 1 | 3 + 1 | 2 + 1 | 1 + 2 | 3 + 2 | 2 + 2 | 1 + 3 | 3 + 3 | 2 + 3 | 1 +(9 rows) + +SELECT few.id, generate_series(1,3) g FROM few ORDER BY id, generate_series(1,3) DESC; + id | g +----+--- + 1 | 3 + 1 | 2 + 1 | 1 + 2 | 3 + 2 | 2 + 2 | 1 + 3 | 3 + 3 | 2 + 3 | 1 +(9 rows) + +-- it's weird to have ORDER BYs that increase the number of results +SELECT few.id FROM few ORDER BY id, generate_series(1,3) DESC; + id +---- + 1 + 1 + 1 + 2 + 2 + 2 + 3 + 3 + 3 +(9 rows) + +-- SRFs are computed after aggregation +SET enable_hashagg TO 0; -- stable output order +SELECT few.dataa, count(*), min(id), max(id), unnest('{1,1,3}'::int[]) FROM few WHERE few.id = 1 GROUP BY few.dataa; + dataa | count | min | max | unnest +-------+-------+-----+-----+-------- + a | 1 | 1 | 1 | 1 + a | 1 | 1 | 1 | 1 + a | 1 | 1 | 1 | 3 +(3 rows) + +-- unless referenced in GROUP BY clause +SELECT few.dataa, count(*), min(id), 
max(id), unnest('{1,1,3}'::int[]) FROM few WHERE few.id = 1 GROUP BY few.dataa, unnest('{1,1,3}'::int[]); + dataa | count | min | max | unnest +-------+-------+-----+-----+-------- + a | 2 | 1 | 1 | 1 + a | 1 | 1 | 1 | 3 +(2 rows) + +SELECT few.dataa, count(*), min(id), max(id), unnest('{1,1,3}'::int[]) FROM few WHERE few.id = 1 GROUP BY few.dataa, 5; + dataa | count | min | max | unnest +-------+-------+-----+-----+-------- + a | 2 | 1 | 1 | 1 + a | 1 | 1 | 1 | 3 +(2 rows) + +RESET enable_hashagg; +-- check HAVING works when GROUP BY does [not] reference SRF output +SELECT dataa, generate_series(1,1), count(*) FROM few GROUP BY 1 HAVING count(*) > 1; + dataa | generate_series | count +-------+-----------------+------- + a | 1 | 2 +(1 row) + +SELECT dataa, generate_series(1,1), count(*) FROM few GROUP BY 1, 2 HAVING count(*) > 1; + dataa | generate_series | count +-------+-----------------+------- + a | 1 | 2 +(1 row) + +-- it's weird to have GROUP BYs that increase the number of results +SELECT few.dataa, count(*) FROM few WHERE dataa = 'a' GROUP BY few.dataa ORDER BY 2; + dataa | count +-------+------- + a | 2 +(1 row) + +SELECT few.dataa, count(*) FROM few WHERE dataa = 'a' GROUP BY few.dataa, unnest('{1,1,3}'::int[]) ORDER BY 2; + dataa | count +-------+------- + a | 2 + a | 4 +(2 rows) + +-- SRFs are not allowed if they'd need to be conditionally executed +SELECT q1, case when q1 > 0 then generate_series(1,3) else 0 end FROM int8_tbl; +ERROR: set-returning functions are not allowed in CASE +LINE 1: SELECT q1, case when q1 > 0 then generate_series(1,3) else 0... + ^ +HINT: You might be able to move the set-returning function into a LATERAL FROM item. +SELECT q1, coalesce(generate_series(1,3), 0) FROM int8_tbl; +ERROR: set-returning functions are not allowed in COALESCE +LINE 1: SELECT q1, coalesce(generate_series(1,3), 0) FROM int8_tbl; + ^ +HINT: You might be able to move the set-returning function into a LATERAL FROM item. +-- SRFs are not allowed in aggregate arguments +SELECT min(generate_series(1, 3)) FROM few; +ERROR: aggregate function calls cannot contain set-returning function calls +LINE 1: SELECT min(generate_series(1, 3)) FROM few; + ^ +HINT: You might be able to move the set-returning function into a LATERAL FROM item. +-- ... unless they're within a sub-select +SELECT sum((3 = ANY(SELECT generate_series(1,4)))::int); + sum +----- + 1 +(1 row) + +SELECT sum((3 = ANY(SELECT lag(x) over(order by x) + FROM generate_series(1,4) x))::int); + sum +----- + 1 +(1 row) + +-- SRFs are not allowed in window function arguments, either +SELECT min(generate_series(1, 3)) OVER() FROM few; +ERROR: window function calls cannot contain set-returning function calls +LINE 1: SELECT min(generate_series(1, 3)) OVER() FROM few; + ^ +HINT: You might be able to move the set-returning function into a LATERAL FROM item. 
+-- SRFs are normally computed after window functions +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few; + id | lag | count | generate_series +----+-----+-------+----------------- + 1 | | 3 | 1 + 1 | | 3 | 2 + 1 | | 3 | 3 + 2 | 1 | 3 | 1 + 2 | 1 | 3 | 2 + 2 | 1 | 3 | 3 + 3 | 2 | 3 | 1 + 3 | 2 | 3 | 2 + 3 | 2 | 3 | 3 +(9 rows) + +-- unless referencing SRFs +SELECT SUM(count(*)) OVER(PARTITION BY generate_series(1,3) ORDER BY generate_series(1,3)), generate_series(1,3) g FROM few GROUP BY g; + sum | g +-----+--- + 3 | 1 + 3 | 2 + 3 | 3 +(3 rows) + +-- sorting + grouping +SELECT few.dataa, count(*), min(id), max(id), generate_series(1,3) FROM few GROUP BY few.dataa ORDER BY 5, 1; + dataa | count | min | max | generate_series +-------+-------+-----+-----+----------------- + a | 2 | 1 | 2 | 1 + b | 1 | 3 | 3 | 1 + a | 2 | 1 | 2 | 2 + b | 1 | 3 | 3 | 2 + a | 2 | 1 | 2 | 3 + b | 1 | 3 | 3 | 3 +(6 rows) + +-- grouping sets are a bit special, they produce NULLs in columns not actually NULL +set enable_hashagg = false; +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab); + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | bar | 2 | 1 + a | foo | 1 | 1 + a | foo | 2 | 1 + a | | 1 | 2 + a | | 2 | 2 + b | bar | 1 | 1 + b | bar | 2 | 1 + b | | 1 | 1 + b | | 2 | 1 + | | 1 | 3 + | | 2 | 3 + | bar | 1 | 2 + | bar | 2 | 2 + | foo | 1 | 1 + | foo | 2 | 1 +(16 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab) ORDER BY dataa; + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | bar | 2 | 1 + a | foo | 1 | 1 + a | foo | 2 | 1 + a | | 1 | 2 + a | | 2 | 2 + b | bar | 1 | 1 + b | bar | 2 | 1 + b | | 1 | 1 + b | | 2 | 1 + | | 1 | 3 + | | 2 | 3 + | bar | 1 | 2 + | bar | 2 | 2 + | foo | 1 | 1 + | foo | 2 | 1 +(16 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab) ORDER BY g; + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | foo | 1 | 1 + a | | 1 | 2 + b | bar | 1 | 1 + b | | 1 | 1 + | | 1 | 3 + | bar | 1 | 2 + | foo | 1 | 1 + | foo | 2 | 1 + a | bar | 2 | 1 + b | | 2 | 1 + a | foo | 2 | 1 + | bar | 2 | 2 + a | | 2 | 2 + | | 2 | 3 + b | bar | 2 | 1 +(16 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g); + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | bar | 2 | 1 + a | bar | | 2 + a | foo | 1 | 1 + a | foo | 2 | 1 + a | foo | | 2 + a | | | 4 + b | bar | 1 | 1 + b | bar | 2 | 1 + b | bar | | 2 + b | | | 2 + | | | 6 + | bar | 1 | 2 + | bar | 2 | 2 + | bar | | 4 + | foo | 1 | 1 + | foo | 2 | 1 + | foo | | 2 + a | | 1 | 2 + b | | 1 | 1 + | | 1 | 3 + a | | 2 | 2 + b | | 2 | 1 + | | 2 | 3 +(24 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g) ORDER BY dataa; + dataa | b | g | count +-------+-----+---+------- + a | foo | | 2 + a | | | 4 + a | | 2 | 2 + a | bar | 1 | 1 + a | bar | 2 | 1 + a | bar | | 2 + a | foo | 1 | 1 + a | foo | 2 | 1 + a | | 1 | 2 + b | bar | 1 | 1 + b | | | 2 + b | | 1 | 1 + b | bar | 2 | 1 + b | bar | | 2 + b | | 2 | 1 + | | 2 | 3 + | | | 6 + | bar | 1 | 2 + | bar | 2 | 2 + | bar | | 4 + | foo | 1 | 1 + | foo | 2 | 1 + | foo | | 2 + | | 1 | 3 +(24 rows) + +SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g) ORDER BY g; + dataa | b | g | count +-------+-----+---+------- + a | bar | 1 | 1 + a | 
foo | 1 | 1 + b | bar | 1 | 1 + | bar | 1 | 2 + | foo | 1 | 1 + a | | 1 | 2 + b | | 1 | 1 + | | 1 | 3 + a | | 2 | 2 + b | | 2 | 1 + | bar | 2 | 2 + | | 2 | 3 + | foo | 2 | 1 + a | bar | 2 | 1 + a | foo | 2 | 1 + b | bar | 2 | 1 + a | | | 4 + b | bar | | 2 + b | | | 2 + | | | 6 + a | foo | | 2 + a | bar | | 2 + | bar | | 4 + | foo | | 2 +(24 rows) + +reset enable_hashagg; +-- case with degenerate ORDER BY +explain (verbose, costs off) +select 'foo' as f, generate_series(1,2) as g from few order by 1; + QUERY PLAN +------------------------------------------------ + ProjectSet + Output: ('foo'::text), generate_series(1, 2) + -> Seq Scan on public.few + Output: 'foo'::text +(4 rows) + +select 'foo' as f, generate_series(1,2) as g from few order by 1; + f | g +-----+--- + foo | 1 + foo | 2 + foo | 1 + foo | 2 + foo | 1 + foo | 2 +(6 rows) + +-- data modification +CREATE TABLE fewmore AS SELECT generate_series(1,3) AS data; +INSERT INTO fewmore VALUES(generate_series(4,5)); +SELECT * FROM fewmore; + data +------ + 1 + 2 + 3 + 4 + 5 +(5 rows) + +-- SRFs are not allowed in UPDATE (they once were, but it was nonsense) +UPDATE fewmore SET data = generate_series(4,9); +ERROR: set-returning functions are not allowed in UPDATE +LINE 1: UPDATE fewmore SET data = generate_series(4,9); + ^ +-- SRFs are not allowed in RETURNING +INSERT INTO fewmore VALUES(1) RETURNING generate_series(1,3); +ERROR: set-returning functions are not allowed in RETURNING +LINE 1: INSERT INTO fewmore VALUES(1) RETURNING generate_series(1,3)... + ^ +-- nor standalone VALUES (but surely this is a bug?) +VALUES(1, generate_series(1,2)); +ERROR: set-returning functions are not allowed in VALUES +LINE 1: VALUES(1, generate_series(1,2)); + ^ +-- We allow tSRFs that are not at top level +SELECT int4mul(generate_series(1,2), 10); + int4mul +--------- + 10 + 20 +(2 rows) + +SELECT generate_series(1,3) IS DISTINCT FROM 2; + ?column? +---------- + t + f + t +(3 rows) + +-- but SRFs in function RTEs must be at top level (annoying restriction) +SELECT * FROM int4mul(generate_series(1,2), 10); +ERROR: set-returning functions must appear at top level of FROM +LINE 1: SELECT * FROM int4mul(generate_series(1,2), 10); + ^ +-- DISTINCT ON is evaluated before tSRF evaluation if SRF is not +-- referenced either in ORDER BY or in the DISTINCT ON list. The ORDER +-- BY reference can be implicitly generated, if there's no other ORDER BY. 
+-- implicit reference (via implicit ORDER) to all columns +SELECT DISTINCT ON (a) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b); + a | b | g +---+---+--- + 1 | 1 | 1 + 3 | 2 | 1 + 5 | 3 | 1 +(3 rows) + +-- unreferenced in DISTINCT ON or ORDER BY +SELECT DISTINCT ON (a) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b) +ORDER BY a, b DESC; + a | b | g +---+---+--- + 1 | 4 | 1 + 1 | 4 | 2 + 1 | 4 | 3 + 3 | 2 | 1 + 3 | 2 | 2 + 3 | 2 | 3 + 5 | 3 | 1 + 5 | 3 | 2 + 5 | 3 | 3 +(9 rows) + +-- referenced in ORDER BY +SELECT DISTINCT ON (a) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b) +ORDER BY a, b DESC, g DESC; + a | b | g +---+---+--- + 1 | 4 | 3 + 3 | 2 | 3 + 5 | 3 | 3 +(3 rows) + +-- referenced in ORDER BY and DISTINCT ON +SELECT DISTINCT ON (a, b, g) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b) +ORDER BY a, b DESC, g DESC; + a | b | g +---+---+--- + 1 | 4 | 3 + 1 | 4 | 2 + 1 | 4 | 1 + 1 | 1 | 3 + 1 | 1 | 2 + 1 | 1 | 1 + 3 | 2 | 3 + 3 | 2 | 2 + 3 | 2 | 1 + 3 | 1 | 3 + 3 | 1 | 2 + 3 | 1 | 1 + 5 | 3 | 3 + 5 | 3 | 2 + 5 | 3 | 1 + 5 | 1 | 3 + 5 | 1 | 2 + 5 | 1 | 1 +(18 rows) + +-- only SRF mentioned in DISTINCT ON +SELECT DISTINCT ON (g) a, b, generate_series(1,3) g +FROM (VALUES (3, 2), (3,1), (1,1), (1,4), (5,3), (5,1)) AS t(a, b); + a | b | g +---+---+--- + 3 | 2 | 1 + 5 | 1 | 2 + 3 | 1 | 3 +(3 rows) + +-- LIMIT / OFFSET is evaluated after SRF evaluation +SELECT a, generate_series(1,2) FROM (VALUES(1),(2),(3)) r(a) LIMIT 2 OFFSET 2; + a | generate_series +---+----------------- + 2 | 1 + 2 | 2 +(2 rows) + +-- SRFs are not allowed in LIMIT. +SELECT 1 LIMIT generate_series(1,3); +ERROR: set-returning functions are not allowed in LIMIT +LINE 1: SELECT 1 LIMIT generate_series(1,3); + ^ +-- tSRF in correlated subquery, referencing table outside +SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few; + generate_series +----------------- + 2 + 3 + +(3 rows) + +-- tSRF in correlated subquery, referencing SRF outside +SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET g.i) FROM generate_series(0,3) g(i); + generate_series +----------------- + 1 + 2 + 3 + +(4 rows) + +-- Operators can return sets too +CREATE OPERATOR |@| (PROCEDURE = unnest, RIGHTARG = ANYARRAY); +SELECT |@|ARRAY[1,2,3]; + ?column? +---------- + 1 + 2 + 3 +(3 rows) + +-- Some fun cases involving duplicate SRF calls +explain (verbose, costs off) +select generate_series(1,3) as x, generate_series(1,3) + 1 as xp1; + QUERY PLAN +------------------------------------------------------------------ + Result + Output: (generate_series(1, 3)), ((generate_series(1, 3)) + 1) + -> ProjectSet + Output: generate_series(1, 3) + -> Result +(5 rows) + +select generate_series(1,3) as x, generate_series(1,3) + 1 as xp1; + x | xp1 +---+----- + 1 | 2 + 2 | 3 + 3 | 4 +(3 rows) + +explain (verbose, costs off) +select generate_series(1,3)+1 order by generate_series(1,3); + QUERY PLAN +------------------------------------------------------------------------ + Sort + Output: (((generate_series(1, 3)) + 1)), (generate_series(1, 3)) + Sort Key: (generate_series(1, 3)) + -> Result + Output: ((generate_series(1, 3)) + 1), (generate_series(1, 3)) + -> ProjectSet + Output: generate_series(1, 3) + -> Result +(8 rows) + +select generate_series(1,3)+1 order by generate_series(1,3); + ?column? 
+---------- + 2 + 3 + 4 +(3 rows) + +-- Check that SRFs of same nesting level run in lockstep +explain (verbose, costs off) +select generate_series(1,3) as x, generate_series(3,6) + 1 as y; + QUERY PLAN +------------------------------------------------------------------ + Result + Output: (generate_series(1, 3)), ((generate_series(3, 6)) + 1) + -> ProjectSet + Output: generate_series(1, 3), generate_series(3, 6) + -> Result +(5 rows) + +select generate_series(1,3) as x, generate_series(3,6) + 1 as y; + x | y +---+--- + 1 | 4 + 2 | 5 + 3 | 6 + | 7 +(4 rows) + +-- Clean up +DROP TABLE few; +DROP TABLE fewmore; diff --git a/src/test/regress/expected/zedstore.out b/src/test/regress/expected/zedstore.out new file mode 100644 index 0000000000..6041e42a93 --- /dev/null +++ b/src/test/regress/expected/zedstore.out @@ -0,0 +1,599 @@ +-- simple tests to iteratively build the zedstore +-- create and drop works +create table t_zedstore(c1 int, c2 int, c3 int) USING zedstore; +drop table t_zedstore; +-- insert and select works +create table t_zedstore(c1 int, c2 int, c3 int) USING zedstore; +insert into t_zedstore select i,i+1,i+2 from generate_series(1, 10)i; +select * from t_zedstore; + c1 | c2 | c3 +----+----+---- + 1 | 2 | 3 + 2 | 3 | 4 + 3 | 4 | 5 + 4 | 5 | 6 + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 8 | 9 | 10 + 9 | 10 | 11 + 10 | 11 | 12 +(10 rows) + +-- selecting only few columns work +select c1, c3 from t_zedstore; + c1 | c3 +----+---- + 1 | 3 + 2 | 4 + 3 | 5 + 4 | 6 + 5 | 7 + 6 | 8 + 7 | 9 + 8 | 10 + 9 | 11 + 10 | 12 +(10 rows) + +-- only few columns in output and where clause work +select c3 from t_zedstore where c2 > 5; + c3 +---- + 7 + 8 + 9 + 10 + 11 + 12 +(6 rows) + +-- Test abort works +begin; +insert into t_zedstore select i,i+1,i+2 from generate_series(21, 25)i; +abort; +insert into t_zedstore select i,i+1,i+2 from generate_series(31, 35)i; +select * from t_zedstore; + c1 | c2 | c3 +----+----+---- + 1 | 2 | 3 + 2 | 3 | 4 + 3 | 4 | 5 + 4 | 5 | 6 + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 8 | 9 | 10 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 +(15 rows) + +-- +-- Test indexing +-- +create index on t_zedstore (c1); +set enable_seqscan=off; +set enable_indexscan=on; +set enable_bitmapscan=off; +-- index scan +select * from t_zedstore where c1 = 5; + c1 | c2 | c3 +----+----+---- + 5 | 6 | 7 +(1 row) + +-- index-only scan +select c1 from t_zedstore where c1 = 5; + c1 +---- + 5 +(1 row) + +-- bitmap scan +set enable_indexscan=off; +set enable_bitmapscan=on; +select c1, c2 from t_zedstore where c1 between 5 and 10; + c1 | c2 +----+---- + 5 | 6 + 6 | 7 + 7 | 8 + 8 | 9 + 9 | 10 + 10 | 11 +(6 rows) + +-- +-- Test DELETE and UPDATE +-- +delete from t_zedstore where c2 = 5; +select * from t_zedstore; + c1 | c2 | c3 +----+----+---- + 1 | 2 | 3 + 2 | 3 | 4 + 3 | 4 | 5 + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 8 | 9 | 10 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 +(14 rows) + +delete from t_zedstore where c2 < 5; +select * from t_zedstore; + c1 | c2 | c3 +----+----+---- + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 8 | 9 | 10 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 +(11 rows) + +update t_zedstore set c2 = 100 where c1 = 8; +select * from t_zedstore; + c1 | c2 | c3 +----+-----+---- + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 + 8 | 100 | 10 +(11 
rows) + +-- +-- Test page deletion, by deleting a bigger range of values +-- +insert into t_zedstore select i,i+1,i+2 from generate_series(10000, 15000)i; +delete from t_zedstore where c1 >= 10000; +-- +-- Test VACUUM +-- +vacuum t_zedstore; +select * from t_zedstore; + c1 | c2 | c3 +----+-----+---- + 5 | 6 | 7 + 6 | 7 | 8 + 7 | 8 | 9 + 9 | 10 | 11 + 10 | 11 | 12 + 31 | 32 | 33 + 32 | 33 | 34 + 33 | 34 | 35 + 34 | 35 | 36 + 35 | 36 | 37 + 8 | 100 | 10 +(11 rows) + +-- +-- Test toasting +-- +create table t_zedtoast(c1 int, t text) USING zedstore; +insert into t_zedtoast select i, repeat('x', 10000) from generate_series(1, 10) i; +select c1, length(t) from t_zedtoast; + c1 | length +----+-------- + 1 | 10000 + 2 | 10000 + 3 | 10000 + 4 | 10000 + 5 | 10000 + 6 | 10000 + 7 | 10000 + 8 | 10000 + 9 | 10000 + 10 | 10000 +(10 rows) + +-- +-- Test NULL values +-- +create table t_zednullvalues(c1 int, c2 int) USING zedstore; +insert into t_zednullvalues values(1, NULL), (NULL, 2); +select * from t_zednullvalues; + c1 | c2 +----+---- + 1 | + | 2 +(2 rows) + +select c2 from t_zednullvalues; + c2 +---- + + 2 +(2 rows) + +update t_zednullvalues set c1 = 1, c2 = NULL; +select * from t_zednullvalues; + c1 | c2 +----+---- + 1 | + 1 | +(2 rows) + +-- +-- Test COPY +-- +create table t_zedcopy(a serial, b int, c text not null default 'stuff', d text,e text) USING zedstore; +COPY t_zedcopy (a, b, c, d, e) from stdin; +COPY t_zedcopy (b, d) from stdin; +COPY t_zedcopy (b, d) from stdin; +COPY t_zedcopy (a, b, c, d, e) from stdin; +select * from t_zedcopy; + a | b | c | d | e +-------+----+-------+--------+---- + 9999 | | \N | NN | + 10000 | 21 | 31 | 41 | 51 + 1 | 1 | stuff | test_1 | + 2 | 2 | stuff | test_2 | + 3 | 3 | stuff | test_3 | + 4 | 4 | stuff | test_4 | + 5 | 5 | stuff | test_5 | + 10001 | 22 | 32 | 42 | 52 + 10002 | 23 | 33 | 43 | 53 + 10003 | 24 | 34 | 44 | 54 + 10004 | 25 | 35 | 45 | 55 + 10005 | 26 | 36 | 46 | 56 +(12 rows) + +COPY t_zedcopy (a, d, e) to stdout; +9999 NN \N +10000 41 51 +1 test_1 \N +2 test_2 \N +3 test_3 \N +4 test_4 \N +5 test_5 \N +10001 42 52 +10002 43 53 +10003 44 54 +10004 45 55 +10005 46 56 +-- +-- Also test delete and update on the table that was populated with COPY. +-- This exercises splitting the array item. (A table not populated with +-- COPY only contains single items, at the moment.) 
+-- +delete from t_zedcopy where b = 4; +select * from t_zedcopy; + a | b | c | d | e +-------+----+-------+--------+---- + 9999 | | \N | NN | + 10000 | 21 | 31 | 41 | 51 + 1 | 1 | stuff | test_1 | + 2 | 2 | stuff | test_2 | + 3 | 3 | stuff | test_3 | + 5 | 5 | stuff | test_5 | + 10001 | 22 | 32 | 42 | + 10002 | 23 | 33 | 43 | 53 + 10003 | 24 | 34 | 44 | 54 + 10004 | 25 | 35 | 45 | 55 + 10005 | 26 | 36 | 46 | 56 +(11 rows) + +delete from t_zedcopy where b < 3; +select * from t_zedcopy; + a | b | c | d | e +-------+----+-------+--------+---- + 9999 | | \N | NN | + 10000 | 21 | 31 | 41 | 51 + 3 | 3 | stuff | test_3 | + 5 | 5 | stuff | test_5 | + 10001 | 22 | 32 | 42 | + 10002 | 23 | 33 | 43 | 53 + 10003 | 24 | 34 | 44 | 54 + 10004 | 25 | 35 | 45 | 55 + 10005 | 26 | 36 | 46 | 56 +(9 rows) + +update t_zedcopy set b = 100 where b = 5; +select * from t_zedcopy; + a | b | c | d | e +-------+-----+-------+--------+---- + 9999 | | \N | NN | + 10000 | 21 | 31 | 41 | 51 + 3 | 3 | stuff | test_3 | + 10001 | 22 | 32 | 42 | + 10002 | 23 | 33 | 43 | 53 + 10003 | 24 | 34 | 44 | 54 + 10004 | 25 | 35 | 45 | 55 + 10005 | 26 | 36 | 46 | 56 + 5 | 100 | stuff | test_5 | +(9 rows) + +-- +-- Test zero column table +-- +create table t_zwithzerocols() using zedstore; +insert into t_zwithzerocols select t.* from t_zwithzerocols t right join generate_series(1,1) on true; +select count(*) from t_zwithzerocols; + count +------- + 1 +(1 row) + +-- Test for alter table add column +create table t_zaddcol(a int) using zedstore; +insert into t_zaddcol select * from generate_series(1, 3); +-- rewrite case +alter table t_zaddcol add column b int generated always as (a + 1) stored; +select * from t_zaddcol; + a | b +---+--- + 1 | 2 + 2 | 3 + 3 | 4 +(3 rows) + +-- test alter table add column with no default +create table t_zaddcol_simple(a int) using zedstore; +insert into t_zaddcol_simple values (1); +alter table t_zaddcol_simple add b int; +select * from t_zaddcol_simple; + a | b +---+--- + 1 | +(1 row) + +insert into t_zaddcol_simple values(2,3); +select * from t_zaddcol_simple; + a | b +---+--- + 1 | + 2 | 3 +(2 rows) + +-- fixed length default value stored in catalog +alter table t_zaddcol add column c int default 3; +select * from t_zaddcol; + a | b | c +---+---+--- + 1 | 2 | 3 + 2 | 3 | 3 + 3 | 4 | 3 +(3 rows) + +-- variable length default value stored in catalog +alter table t_zaddcol add column d text default 'abcdefgh'; +select d from t_zaddcol; + d +---------- + abcdefgh + abcdefgh + abcdefgh +(3 rows) + +-- insert after add column +insert into t_zaddcol values (2); +select * from t_zaddcol; + a | b | c | d +---+---+---+---------- + 1 | 2 | 3 | abcdefgh + 2 | 3 | 3 | abcdefgh + 3 | 4 | 3 | abcdefgh + 2 | 3 | 3 | abcdefgh +(4 rows) + +insert into t_zaddcol (a, c, d) values (3,5, 'test_insert'); +select b,c,d from t_zaddcol; + b | c | d +---+---+------------- + 2 | 3 | abcdefgh + 3 | 3 | abcdefgh + 4 | 3 | abcdefgh + 3 | 3 | abcdefgh + 4 | 5 | test_insert +(5 rows) + +-- +-- Test TABLESAMPLE +-- +-- regular test tablesample.sql doesn't directly work for zedstore as +-- its using fillfactor to create specific block layout for +-- heap. Hence, output differs between heap and zedstore table while +-- sampling. We need to use many tuples here to have multiple logical +-- blocks as don't have way to force TIDs spread / jump for zedstore. 
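+-- (As a reminder of the semantics being exercised: the SYSTEM method samples
+-- whole blocks while BERNOULLI samples individual rows, hence the "ALL
+-- visible tuples from SOME blocks" vs. "SOME visible tuples but from ALL the
+-- blocks" expectations further down.)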
+-- +CREATE TABLE t_ztablesample (id int, name text) using zedstore; +INSERT INTO t_ztablesample + SELECT i, repeat(i::text, 2) FROM generate_series(0, 299) s(i); +-- lets delete half (even numbered ids) rows to limit the output +DELETE FROM t_ztablesample WHERE id%2 = 0; +-- should return ALL visible tuples from SOME blocks +SELECT ctid,t.id FROM t_ztablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (0); + ctid | id +---------+----- + (1,2) | 129 + (1,4) | 131 + (1,6) | 133 + (1,8) | 135 + (1,10) | 137 + (1,12) | 139 + (1,14) | 141 + (1,16) | 143 + (1,18) | 145 + (1,20) | 147 + (1,22) | 149 + (1,24) | 151 + (1,26) | 153 + (1,28) | 155 + (1,30) | 157 + (1,32) | 159 + (1,34) | 161 + (1,36) | 163 + (1,38) | 165 + (1,40) | 167 + (1,42) | 169 + (1,44) | 171 + (1,46) | 173 + (1,48) | 175 + (1,50) | 177 + (1,52) | 179 + (1,54) | 181 + (1,56) | 183 + (1,58) | 185 + (1,60) | 187 + (1,62) | 189 + (1,64) | 191 + (1,66) | 193 + (1,68) | 195 + (1,70) | 197 + (1,72) | 199 + (1,74) | 201 + (1,76) | 203 + (1,78) | 205 + (1,80) | 207 + (1,82) | 209 + (1,84) | 211 + (1,86) | 213 + (1,88) | 215 + (1,90) | 217 + (1,92) | 219 + (1,94) | 221 + (1,96) | 223 + (1,98) | 225 + (1,100) | 227 + (1,102) | 229 + (1,104) | 231 + (1,106) | 233 + (1,108) | 235 + (1,110) | 237 + (1,112) | 239 + (1,114) | 241 + (1,116) | 243 + (1,118) | 245 + (1,120) | 247 + (1,122) | 249 + (1,124) | 251 + (1,126) | 253 + (1,128) | 255 + (2,2) | 257 + (2,4) | 259 + (2,6) | 261 + (2,8) | 263 + (2,10) | 265 + (2,12) | 267 + (2,14) | 269 + (2,16) | 271 + (2,18) | 273 + (2,20) | 275 + (2,22) | 277 + (2,24) | 279 + (2,26) | 281 + (2,28) | 283 + (2,30) | 285 + (2,32) | 287 + (2,34) | 289 + (2,36) | 291 + (2,38) | 293 + (2,40) | 295 + (2,42) | 297 + (2,44) | 299 +(86 rows) + +-- should return SOME visible tuples but from ALL the blocks +SELECT ctid,id FROM t_ztablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (0); + ctid | id +---------+----- + (0,4) | 3 + (0,6) | 5 + (0,8) | 7 + (0,20) | 19 + (0,30) | 29 + (0,42) | 41 + (0,44) | 43 + (0,48) | 47 + (0,52) | 51 + (0,54) | 53 + (0,56) | 55 + (0,62) | 61 + (0,64) | 63 + (0,66) | 65 + (0,76) | 75 + (0,80) | 79 + (0,82) | 81 + (0,84) | 83 + (0,88) | 87 + (0,90) | 89 + (0,92) | 91 + (0,98) | 97 + (0,106) | 105 + (0,108) | 107 + (0,122) | 121 + (0,126) | 125 + (1,2) | 129 + (1,4) | 131 + (1,6) | 133 + (1,8) | 135 + (1,10) | 137 + (1,12) | 139 + (1,20) | 147 + (1,24) | 151 + (1,26) | 153 + (1,28) | 155 + (1,30) | 157 + (1,32) | 159 + (1,34) | 161 + (1,40) | 167 + (1,44) | 171 + (1,46) | 173 + (1,58) | 185 + (1,66) | 193 + (1,68) | 195 + (1,70) | 197 + (1,78) | 205 + (1,80) | 207 + (1,88) | 215 + (1,92) | 219 + (1,96) | 223 + (1,100) | 227 + (1,102) | 229 + (1,106) | 233 + (1,112) | 239 + (1,116) | 243 + (1,120) | 247 + (1,122) | 249 + (1,126) | 253 + (2,2) | 257 + (2,6) | 261 + (2,8) | 263 + (2,10) | 265 + (2,12) | 267 + (2,16) | 271 + (2,18) | 273 + (2,24) | 279 + (2,26) | 281 + (2,28) | 283 + (2,30) | 285 + (2,34) | 289 + (2,36) | 291 + (2,42) | 297 + (2,44) | 299 +(74 rows) + diff --git a/src/test/regress/output/misc_1.source b/src/test/regress/output/misc_1.source new file mode 100644 index 0000000000..c29c54c414 --- /dev/null +++ b/src/test/regress/output/misc_1.source @@ -0,0 +1,692 @@ +-- +-- MISC +-- +-- +-- BTREE +-- +UPDATE onek + SET unique1 = onek.unique1 + 1; +UPDATE onek + SET unique1 = onek.unique1 - 1; +-- +-- BTREE partial +-- +-- UPDATE onek2 +-- SET unique1 = onek2.unique1 + 1; +--UPDATE onek2 +-- SET unique1 = onek2.unique1 - 1; +-- +-- BTREE shutting out non-functional updates +-- 
+-- the following two tests seem to take a long time on some +-- systems. This non-func update stuff needs to be examined +-- more closely. - jolly (2/22/96) +-- +UPDATE tmp + SET stringu1 = reverse_name(onek.stringu1) + FROM onek + WHERE onek.stringu1 = 'JBAAAA' and + onek.stringu1 = tmp.stringu1; +UPDATE tmp + SET stringu1 = reverse_name(onek2.stringu1) + FROM onek2 + WHERE onek2.stringu1 = 'JCAAAA' and + onek2.stringu1 = tmp.stringu1; +DROP TABLE tmp; +--UPDATE person* +-- SET age = age + 1; +--UPDATE person* +-- SET age = age + 3 +-- WHERE name = 'linda'; +-- +-- copy +-- +COPY onek TO '@abs_builddir@/results/onek.data'; +DELETE FROM onek; +COPY onek FROM '@abs_builddir@/results/onek.data'; +SELECT unique1 FROM onek WHERE unique1 < 2 ORDER BY unique1; + unique1 +--------- + 0 + 1 +(2 rows) + +DELETE FROM onek2; +COPY onek2 FROM '@abs_builddir@/results/onek.data'; +SELECT unique1 FROM onek2 WHERE unique1 < 2 ORDER BY unique1; + unique1 +--------- + 0 + 1 +(2 rows) + +COPY BINARY stud_emp TO '@abs_builddir@/results/stud_emp.data'; +DELETE FROM stud_emp; +COPY BINARY stud_emp FROM '@abs_builddir@/results/stud_emp.data'; +SELECT * FROM stud_emp; + name | age | location | salary | manager | gpa | percent +-------+-----+------------+--------+---------+-----+--------- + jeff | 23 | (8,7.7) | 600 | sharon | 3.5 | + cim | 30 | (10.5,4.7) | 400 | | 3.4 | + linda | 19 | (0.9,6.1) | 100 | | 2.9 | +(3 rows) + +-- COPY aggtest FROM stdin; +-- 56 7.8 +-- 100 99.097 +-- 0 0.09561 +-- 42 324.78 +-- . +-- COPY aggtest TO stdout; +-- +-- inheritance stress test +-- +SELECT * FROM a_star*; + class | a +-------+---- + a | 1 + a | 2 + a | + b | 3 + b | 4 + b | + b | + c | 5 + c | 6 + c | + c | + d | 7 + d | 8 + d | 9 + d | 10 + d | + d | 11 + d | 12 + d | 13 + d | + d | + d | + d | 14 + d | + d | + d | + d | + e | 15 + e | 16 + e | 17 + e | + e | 18 + e | + e | + f | 19 + f | 20 + f | 21 + f | 22 + f | + f | 24 + f | 25 + f | 26 + f | + f | + f | + f | 27 + f | + f | + f | + f | +(50 rows) + +SELECT * + FROM b_star* x + WHERE x.b = text 'bumble' or x.a < 3; + class | a | b +-------+---+-------- + b | | bumble +(1 row) + +SELECT class, a + FROM c_star* x + WHERE x.c ~ text 'hi'; + class | a +-------+---- + c | 5 + c | + d | 7 + d | 8 + d | 10 + d | + d | 12 + d | + d | + d | + e | 15 + e | 16 + e | + e | + f | 19 + f | 20 + f | 21 + f | + f | 24 + f | + f | + f | +(22 rows) + +SELECT class, b, c + FROM d_star* x + WHERE x.a < 100; + class | b | c +-------+---------+------------ + d | grumble | hi sunita + d | stumble | hi koko + d | rumble | + d | | hi kristin + d | fumble | + d | | hi avi + d | | + d | | +(8 rows) + +SELECT class, c FROM e_star* x WHERE x.c NOTNULL; + class | c +-------+------------- + e | hi carol + e | hi bob + e | hi michelle + e | hi elisa + f | hi claire + f | hi mike + f | hi marcel + f | hi keith + f | hi marc + f | hi allison + f | hi jeff + f | hi carl +(12 rows) + +SELECT * FROM f_star* x WHERE x.c ISNULL; + class | a | c | e | f +-------+----+---+-----+------------------------------------------- + f | 22 | | -7 | ((111,555),(222,666),(333,777),(444,888)) + f | 25 | | -9 | + f | 26 | | | ((11111,33333),(22222,44444)) + f | | | -11 | ((1111111,3333333),(2222222,4444444)) + f | 27 | | | + f | | | -12 | + f | | | | ((11111111,33333333),(22222222,44444444)) + f | | | | +(8 rows) + +-- grouping and aggregation on inherited sets have been busted in the past... 
+SELECT sum(a) FROM a_star*; + sum +----- + 355 +(1 row) + +SELECT class, sum(a) FROM a_star* GROUP BY class ORDER BY class; + class | sum +-------+----- + a | 3 + b | 7 + c | 11 + d | 84 + e | 66 + f | 184 +(6 rows) + +ALTER TABLE f_star RENAME COLUMN f TO ff; +ALTER TABLE e_star* RENAME COLUMN e TO ee; +ALTER TABLE d_star* RENAME COLUMN d TO dd; +ALTER TABLE c_star* RENAME COLUMN c TO cc; +ALTER TABLE b_star* RENAME COLUMN b TO bb; +ALTER TABLE a_star* RENAME COLUMN a TO aa; +SELECT class, aa + FROM a_star* x + WHERE aa ISNULL; + class | aa +-------+---- + a | + b | + b | + c | + c | + d | + d | + d | + d | + d | + d | + d | + d | + e | + e | + e | + f | + f | + f | + f | + f | + f | + f | + f | +(24 rows) + +-- As of Postgres 7.1, ALTER implicitly recurses, +-- so this should be same as ALTER a_star* +ALTER TABLE a_star RENAME COLUMN aa TO foo; +SELECT class, foo + FROM a_star* x + WHERE x.foo >= 2; + class | foo +-------+----- + a | 2 + b | 3 + b | 4 + c | 5 + c | 6 + d | 7 + d | 8 + d | 9 + d | 10 + d | 11 + d | 12 + d | 13 + d | 14 + e | 15 + e | 16 + e | 17 + e | 18 + f | 19 + f | 20 + f | 21 + f | 22 + f | 24 + f | 25 + f | 26 + f | 27 +(25 rows) + +ALTER TABLE a_star RENAME COLUMN foo TO aa; +SELECT * + from a_star* + WHERE aa < 1000; + class | aa +-------+---- + a | 1 + a | 2 + b | 3 + b | 4 + c | 5 + c | 6 + d | 7 + d | 8 + d | 9 + d | 10 + d | 11 + d | 12 + d | 13 + d | 14 + e | 15 + e | 16 + e | 17 + e | 18 + f | 19 + f | 20 + f | 21 + f | 22 + f | 24 + f | 25 + f | 26 + f | 27 +(26 rows) + +ALTER TABLE f_star ADD COLUMN f int4; +UPDATE f_star SET f = 10; +ALTER TABLE e_star* ADD COLUMN e int4; +--UPDATE e_star* SET e = 42; +SELECT * FROM e_star*; + class | aa | cc | ee | e +-------+----+-------------+-----+--- + e | 15 | hi carol | -1 | + e | 16 | hi bob | | + e | 17 | | -2 | + e | | hi michelle | -3 | + e | 18 | | | + e | | hi elisa | | + e | | | -4 | + f | 19 | hi claire | -5 | + f | 20 | hi mike | -6 | + f | 21 | hi marcel | | + f | 22 | | -7 | + f | | hi keith | -8 | + f | 24 | hi marc | | + f | 25 | | -9 | + f | 26 | | | + f | | hi allison | -10 | + f | | hi jeff | | + f | | | -11 | + f | 27 | | | + f | | hi carl | | + f | | | -12 | + f | | | | + f | | | | +(23 rows) + +ALTER TABLE a_star* ADD COLUMN a text; +NOTICE: merging definition of column "a" for child "d_star" +-- That ALTER TABLE should have added TOAST tables. +SELECT relname, reltoastrelid <> 0 AS has_toast_table + FROM pg_class + WHERE oid::regclass IN ('a_star', 'c_star') + ORDER BY 1; + relname | has_toast_table +---------+----------------- + a_star | f + c_star | f +(2 rows) + +--UPDATE b_star* +-- SET a = text 'gazpacho' +-- WHERE aa > 4; +SELECT class, aa, a FROM a_star*; + class | aa | a +-------+----+--- + a | 1 | + a | 2 | + a | | + b | 3 | + b | 4 | + b | | + b | | + c | 5 | + c | 6 | + c | | + c | | + d | 7 | + d | 8 | + d | 9 | + d | 10 | + d | | + d | 11 | + d | 12 | + d | 13 | + d | | + d | | + d | | + d | 14 | + d | | + d | | + d | | + d | | + e | 15 | + e | 16 | + e | 17 | + e | | + e | 18 | + e | | + e | | + f | 19 | + f | 20 | + f | 21 | + f | 22 | + f | | + f | 24 | + f | 25 | + f | 26 | + f | | + f | | + f | | + f | 27 | + f | | + f | | + f | | + f | | +(50 rows) + +-- +-- versions +-- +-- +-- postquel functions +-- +-- +-- mike does post_hacking, +-- joe and sally play basketball, and +-- everyone else does nothing. 
+-- +SELECT p.name, name(p.hobbies) FROM ONLY person p; + name | name +-------+------------- + mike | posthacking + joe | basketball + sally | basketball +(3 rows) + +-- +-- as above, but jeff also does post_hacking. +-- +SELECT p.name, name(p.hobbies) FROM person* p; + name | name +-------+------------- + mike | posthacking + joe | basketball + sally | basketball + jeff | posthacking +(4 rows) + +-- +-- the next two queries demonstrate how functions generate bogus duplicates. +-- this is a "feature" .. +-- +SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r + ORDER BY 1,2; + name | name +-------------+--------------- + basketball | hightops + posthacking | advil + posthacking | peet's coffee + skywalking | guts +(4 rows) + +SELECT hobbies_r.name, (hobbies_r.equipment).name FROM hobbies_r; + name | name +-------------+--------------- + posthacking | advil + posthacking | peet's coffee + posthacking | advil + posthacking | peet's coffee + basketball | hightops + basketball | hightops + skywalking | guts +(7 rows) + +-- +-- mike needs advil and peet's coffee, +-- joe and sally need hightops, and +-- everyone else is fine. +-- +SELECT p.name, name(p.hobbies), name(equipment(p.hobbies)) FROM ONLY person p; + name | name | name +-------+-------------+--------------- + mike | posthacking | advil + mike | posthacking | peet's coffee + joe | basketball | hightops + sally | basketball | hightops +(4 rows) + +-- +-- as above, but jeff needs advil and peet's coffee as well. +-- +SELECT p.name, name(p.hobbies), name(equipment(p.hobbies)) FROM person* p; + name | name | name +-------+-------------+--------------- + mike | posthacking | advil + mike | posthacking | peet's coffee + joe | basketball | hightops + sally | basketball | hightops + jeff | posthacking | advil + jeff | posthacking | peet's coffee +(6 rows) + +-- +-- just like the last two, but make sure that the target list fixup and +-- unflattening is being done correctly. 
+-- +SELECT name(equipment(p.hobbies)), p.name, name(p.hobbies) FROM ONLY person p; + name | name | name +---------------+-------+------------- + advil | mike | posthacking + peet's coffee | mike | posthacking + hightops | joe | basketball + hightops | sally | basketball +(4 rows) + +SELECT (p.hobbies).equipment.name, p.name, name(p.hobbies) FROM person* p; + name | name | name +---------------+-------+------------- + advil | mike | posthacking + peet's coffee | mike | posthacking + hightops | joe | basketball + hightops | sally | basketball + advil | jeff | posthacking + peet's coffee | jeff | posthacking +(6 rows) + +SELECT (p.hobbies).equipment.name, name(p.hobbies), p.name FROM ONLY person p; + name | name | name +---------------+-------------+------- + advil | posthacking | mike + peet's coffee | posthacking | mike + hightops | basketball | joe + hightops | basketball | sally +(4 rows) + +SELECT name(equipment(p.hobbies)), name(p.hobbies), p.name FROM person* p; + name | name | name +---------------+-------------+------- + advil | posthacking | mike + peet's coffee | posthacking | mike + hightops | basketball | joe + hightops | basketball | sally + advil | posthacking | jeff + peet's coffee | posthacking | jeff +(6 rows) + +SELECT name(equipment(hobby_construct(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_1a(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_1b(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_1c(hobby_construct_named(text 'skywalking', text 'mer'))); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_2a(text 'skywalking')); + name +------ + guts +(1 row) + +SELECT name(equipment_named_ambiguous_2b(text 'skywalking')); + name +--------------- + advil + peet's coffee + hightops + guts +(4 rows) + +SELECT hobbies_by_name('basketball'); + hobbies_by_name +----------------- + joe +(1 row) + +SELECT name, overpaid(emp.*) FROM emp; + name | overpaid +--------+---------- + sharon | t + sam | t + bill | t + jeff | f + cim | f + linda | f +(6 rows) + +-- +-- Try a few cases with SQL-spec row constructor expressions +-- +SELECT * FROM equipment(ROW('skywalking', 'mer')); + name | hobby +------+------------ + guts | skywalking +(1 row) + +SELECT name(equipment(ROW('skywalking', 'mer'))); + name +------ + guts +(1 row) + +SELECT *, name(equipment(h.*)) FROM hobbies_r h; + name | person | name +-------------+--------+--------------- + posthacking | mike | advil + posthacking | mike | peet's coffee + posthacking | jeff | advil + posthacking | jeff | peet's coffee + basketball | joe | hightops + basketball | sally | hightops + skywalking | | guts +(7 rows) + +SELECT *, (equipment(CAST((h.*) AS hobbies_r))).name FROM hobbies_r h; + name | person | name +-------------+--------+--------------- + posthacking | mike | advil + posthacking | mike | peet's coffee + posthacking | jeff | advil + posthacking | jeff | peet's coffee + basketball | joe | hightops + basketball | sally | hightops + skywalking | | guts +(7 rows) + +-- +-- functional joins +-- +-- +-- instance rules +-- +-- +-- rewrite rules +-- diff --git 
a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index f23fe8d870..aad070d48e 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,7 +78,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview # ---------- # Another group of parallel tests # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan +test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan zedstore # rules cannot run concurrently with any test that creates # a view or rule in the public schema diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index ca200eb599..5ad9d90b58 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -134,6 +134,7 @@ test: misc_functions test: sysviews test: tsrf test: tidscan +test: zedstore test: rules test: psql test: psql_crosstab diff --git a/src/test/regress/sql/zedstore.sql b/src/test/regress/sql/zedstore.sql new file mode 100644 index 0000000000..d987e70c4f --- /dev/null +++ b/src/test/regress/sql/zedstore.sql @@ -0,0 +1,176 @@ +-- simple tests to iteratively build the zedstore +-- create and drop works +create table t_zedstore(c1 int, c2 int, c3 int) USING zedstore; +drop table t_zedstore; +-- insert and select works +create table t_zedstore(c1 int, c2 int, c3 int) USING zedstore; +insert into t_zedstore select i,i+1,i+2 from generate_series(1, 10)i; +select * from t_zedstore; +-- selecting only few columns work +select c1, c3 from t_zedstore; +-- only few columns in output and where clause work +select c3 from t_zedstore where c2 > 5; + +-- Test abort works +begin; +insert into t_zedstore select i,i+1,i+2 from generate_series(21, 25)i; +abort; +insert into t_zedstore select i,i+1,i+2 from generate_series(31, 35)i; +select * from t_zedstore; + +-- +-- Test indexing +-- +create index on t_zedstore (c1); +set enable_seqscan=off; +set enable_indexscan=on; +set enable_bitmapscan=off; + +-- index scan +select * from t_zedstore where c1 = 5; + +-- index-only scan +select c1 from t_zedstore where c1 = 5; + +-- bitmap scan +set enable_indexscan=off; +set enable_bitmapscan=on; +select c1, c2 from t_zedstore where c1 between 5 and 10; + +-- +-- Test DELETE and UPDATE +-- +delete from t_zedstore where c2 = 5; +select * from t_zedstore; +delete from t_zedstore where c2 < 5; +select * from t_zedstore; + +update t_zedstore set c2 = 100 where c1 = 8; +select * from t_zedstore; + +-- +-- Test page deletion, by deleting a bigger range of values +-- +insert into t_zedstore select i,i+1,i+2 from generate_series(10000, 15000)i; +delete from t_zedstore where c1 >= 10000; + +-- +-- Test VACUUM +-- +vacuum t_zedstore; +select * from t_zedstore; + +-- +-- Test toasting +-- +create table t_zedtoast(c1 int, t text) USING zedstore; +insert into t_zedtoast select i, repeat('x', 10000) from generate_series(1, 10) i; + +select c1, length(t) from t_zedtoast; + +-- +-- Test NULL values +-- +create table t_zednullvalues(c1 int, c2 int) USING zedstore; +insert into t_zednullvalues values(1, NULL), (NULL, 2); +select * from t_zednullvalues; +select c2 from t_zednullvalues; +update t_zednullvalues set c1 = 1, c2 = NULL; +select * from t_zednullvalues; + +-- +-- Test COPY +-- +create table t_zedcopy(a serial, b int, c text not null default 'stuff', d text,e text) USING zedstore; + +COPY t_zedcopy (a, b, c, d, e) from stdin; +9999 \N \\N \NN \N +10000 21 31 
41 51 +\. + +COPY t_zedcopy (b, d) from stdin; +1 test_1 +\. + +COPY t_zedcopy (b, d) from stdin; +2 test_2 +3 test_3 +4 test_4 +5 test_5 +\. + +COPY t_zedcopy (a, b, c, d, e) from stdin; +10001 22 32 42 52 +10002 23 33 43 53 +10003 24 34 44 54 +10004 25 35 45 55 +10005 26 36 46 56 +\. + +select * from t_zedcopy; +COPY t_zedcopy (a, d, e) to stdout; + +-- +-- Also test delete and update on the table that was populated with COPY. +-- This exercises splitting the array item. (A table not populated with +-- COPY only contains single items, at the moment.) +-- + +delete from t_zedcopy where b = 4; +select * from t_zedcopy; +delete from t_zedcopy where b < 3; +select * from t_zedcopy; + +update t_zedcopy set b = 100 where b = 5; +select * from t_zedcopy; + +-- +-- Test zero column table +-- +create table t_zwithzerocols() using zedstore; +insert into t_zwithzerocols select t.* from t_zwithzerocols t right join generate_series(1,1) on true; +select count(*) from t_zwithzerocols; + +-- Test for alter table add column +create table t_zaddcol(a int) using zedstore; +insert into t_zaddcol select * from generate_series(1, 3); +-- rewrite case +alter table t_zaddcol add column b int generated always as (a + 1) stored; +select * from t_zaddcol; +-- test alter table add column with no default +create table t_zaddcol_simple(a int) using zedstore; +insert into t_zaddcol_simple values (1); +alter table t_zaddcol_simple add b int; +select * from t_zaddcol_simple; +insert into t_zaddcol_simple values(2,3); +select * from t_zaddcol_simple; +-- fixed length default value stored in catalog +alter table t_zaddcol add column c int default 3; +select * from t_zaddcol; +-- variable length default value stored in catalog +alter table t_zaddcol add column d text default 'abcdefgh'; +select d from t_zaddcol; +-- insert after add column +insert into t_zaddcol values (2); +select * from t_zaddcol; +insert into t_zaddcol (a, c, d) values (3,5, 'test_insert'); +select b,c,d from t_zaddcol; + +-- +-- Test TABLESAMPLE +-- +-- regular test tablesample.sql doesn't directly work for zedstore as +-- its using fillfactor to create specific block layout for +-- heap. Hence, output differs between heap and zedstore table while +-- sampling. We need to use many tuples here to have multiple logical +-- blocks as don't have way to force TIDs spread / jump for zedstore. +-- +CREATE TABLE t_ztablesample (id int, name text) using zedstore; +INSERT INTO t_ztablesample + SELECT i, repeat(i::text, 2) FROM generate_series(0, 299) s(i); +-- lets delete half (even numbered ids) rows to limit the output +DELETE FROM t_ztablesample WHERE id%2 = 0; +-- should return ALL visible tuples from SOME blocks +SELECT ctid,t.id FROM t_ztablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (0); +-- should return SOME visible tuples but from ALL the blocks +SELECT ctid,id FROM t_ztablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (0); diff --git a/src/test/storageperf/driver.sql b/src/test/storageperf/driver.sql new file mode 100644 index 0000000000..73981d1c94 --- /dev/null +++ b/src/test/storageperf/driver.sql @@ -0,0 +1,36 @@ +-- +-- Main script, to run all the tests, and print the results. +-- +-- + +-- First run the tests using heap. 
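+-- (The script is assumed to be run with psql from this directory, for
+-- example "psql -f driver.sql", so that the \i invocations below can find
+-- tests.sql; the schemas are dropped and recreated so that repeated runs
+-- start from scratch.)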
+DROP SCHEMA IF EXISTS storagetest_heap CASCADE; +CREATE SCHEMA storagetest_heap; +SET search_path='storagetest_heap'; + +CREATE TABLE results (testname text, val numeric) USING heap; + +SET default_table_access_method=heap; +\i tests.sql + + +-- Repeat with zedstore + +DROP SCHEMA IF EXISTS storagetest_zedstore CASCADE; +CREATE SCHEMA storagetest_zedstore; +SET search_path='storagetest_zedstore'; + +CREATE TABLE results (testname text, val numeric) USING heap; + +SET default_table_access_method=zedstore; +\i tests.sql + + +SET search_path='public'; + +SELECT COALESCE(h.testname, zs.testname) as testname, + h.val as heap, + zs.val as zedstore, + round(zs.val / h.val, 2) as "heap / zedstore" +FROM storagetest_heap.results h +FULL OUTER JOIN storagetest_zedstore.results zs ON (h.testname = zs.testname); diff --git a/src/test/storageperf/sql/onecol.sql b/src/test/storageperf/sql/onecol.sql new file mode 100644 index 0000000000..5cf18158c9 --- /dev/null +++ b/src/test/storageperf/sql/onecol.sql @@ -0,0 +1,38 @@ +-- Tests with a narrow, single-column table. + +CREATE UNLOGGED TABLE onecol (i int4); + +-- Populate the table with a bunch of INSERT ... SELECT statements. +-- Measure how long it takes, and the resulting table size. +select extract(epoch from now()) as before +\gset + +INSERT INTO onecol SELECT generate_series(1, 100000); +INSERT INTO onecol SELECT generate_series(1, 100000); +INSERT INTO onecol SELECT generate_series(1, 100000); +INSERT INTO onecol SELECT generate_series(1, 100000); +INSERT INTO onecol SELECT generate_series(1, 100000); + +select extract(epoch from now()) as after +\gset + +INSERT INTO results (testname, val) VALUES ('onecol, insert-select, size', pg_total_relation_size('onecol')); +INSERT INTO results (testname, val) VALUES ('onecol, insert-select, time', :after - :before); + +COPY onecol TO '/tmp/onecol.data'; -- dump the data, for COPY test below. + +-- +-- Truncate and populate it again with the same data, but this time using COPY. +-- +TRUNCATE onecol; + +select extract(epoch from now()) as before +\gset + +COPY onecol FROM '/tmp/onecol.data'; + +select extract(epoch from now()) as after +\gset + +INSERT INTO results (testname, val) VALUES ('onecol, COPY, size', pg_total_relation_size('onecol')); +INSERT INTO results (testname, val) VALUES ('onecol, COPY, time', :after - :before); diff --git a/src/test/storageperf/tests.sql b/src/test/storageperf/tests.sql new file mode 100644 index 0000000000..d1f25ed029 --- /dev/null +++ b/src/test/storageperf/tests.sql @@ -0,0 +1,3 @@ +-- Test "schedule". List all the tests you want to run here. + +\i sql/onecol.sql base-commit: db6e2b4c52ade524f3db419d75084728e96e1f9c -- 2.19.1