From 67f6b01dfdcc394ca025b357c90fec436df6d59a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 10 Sep 2018 19:53:51 -0700 Subject: [PATCH v29 2/3] Teach pageinspect about nbtree posting lists. Add a column for posting list TIDs to bt_page_items(). Also add a column that displays a single heap TID value for each tuple, regardless of whether or not "ctid" is used for heap TID. In the case of posting list tuples, the value is the lowest heap TID in the posting list. Arguably I should have done this when commit dd299df8 went in, since that added a pivot tuple representation that could have a heap TID but didn't use ctid for that purpose. Also add a boolean column that displays the LP_DEAD bit value for each non-pivot tuple. Also add a boolean "safededup" column to bt_metap() that displays the metapage's btm_safededup field. No version bump for the pageinspect extension, since there hasn't been a stable release since the last version bump (see commit 58b4cb30). --- contrib/pageinspect/btreefuncs.c | 118 +++++++++++++++--- contrib/pageinspect/expected/btree.out | 7 ++ contrib/pageinspect/pageinspect--1.7--1.8.sql | 53 ++++++++ doc/src/sgml/pageinspect.sgml | 83 ++++++------ 4 files changed, 206 insertions(+), 55 deletions(-) diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 78cdc69ec7..1b2ea14122 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -31,9 +31,11 @@ #include "access/relation.h" #include "catalog/namespace.h" #include "catalog/pg_am.h" +#include "catalog/pg_type.h" #include "funcapi.h" #include "miscadmin.h" #include "pageinspect.h" +#include "utils/array.h" #include "utils/builtins.h" #include "utils/rel.h" #include "utils/varlena.h" @@ -45,6 +47,8 @@ PG_FUNCTION_INFO_V1(bt_page_stats); #define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX) #define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID) +#define DatumGetItemPointer(X) ((ItemPointer) DatumGetPointer(X)) +#define ItemPointerGetDatum(X) PointerGetDatum(X) /* note: BlockNumber is unsigned, hence can't be negative */ 
#define CHECK_RELATION_BLOCK_RANGE(rel, blkno) { \ @@ -243,6 +247,9 @@ struct user_args { Page page; OffsetNumber offset; + bool leafpage; + bool rightmost; + TupleDesc tupd; }; /*------------------------------------------------------- @@ -252,17 +259,25 @@ struct user_args * ------------------------------------------------------ */ static Datum -bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset) +bt_page_print_tuples(FuncCallContext *fctx, struct user_args *uargs) { - char *values[6]; + Page page = uargs->page; + OffsetNumber offset = uargs->offset; + bool leafpage = uargs->leafpage; + bool rightmost = uargs->rightmost; + bool pivotoffset; + Datum values[9]; + bool nulls[9]; HeapTuple tuple; ItemId id; IndexTuple itup; int j; int off; int dlen; - char *dump; + char *dump, + *datacstring; char *ptr; + ItemPointer htid; id = PageGetItemId(page, offset); @@ -272,18 +287,27 @@ bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset) itup = (IndexTuple) PageGetItem(page, id); j = 0; - values[j++] = psprintf("%d", offset); - values[j++] = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&itup->t_tid), - ItemPointerGetOffsetNumberNoCheck(&itup->t_tid)); - values[j++] = psprintf("%d", (int) IndexTupleSize(itup)); - values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f'); - values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 
't' : 'f'); + memset(nulls, 0, sizeof(nulls)); + values[j++] = Int16GetDatum(offset); /* itemoffset column is smallint */ + values[j++] = ItemPointerGetDatum(&itup->t_tid); + values[j++] = Int32GetDatum((int) IndexTupleSize(itup)); + values[j++] = BoolGetDatum(IndexTupleHasNulls(itup)); + values[j++] = BoolGetDatum(IndexTupleHasVarwidths(itup)); ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info); dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info); + + /* + * Make sure that "data" column does not include posting list or pivot + * tuple representation of heap TID + */ + if (BTreeTupleIsPosting(itup)) + dlen -= IndexTupleSize(itup) - BTreeTupleGetPostingOffset(itup); + else if (BTreeTupleIsPivot(itup) && BTreeTupleGetHeapTID(itup) != NULL) + dlen -= MAXALIGN(sizeof(ItemPointerData)); + dump = palloc0(dlen * 3 + 1); - values[j] = dump; + datacstring = dump; for (off = 0; off < dlen; off++) { if (off > 0) @@ -291,8 +315,57 @@ bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset) sprintf(dump, "%02x", *(ptr + off) & 0xff); dump += 2; } + values[j++] = CStringGetTextDatum(datacstring); + pfree(datacstring); - tuple = BuildTupleFromCStrings(fctx->attinmeta, values); + /* + * Avoid indicating that pivot tuple from !heapkeyspace index (which won't + * have v4+ status bit set) is dead or has a heap TID -- that can only + * happen with non-pivot tuples. (Most backend code can use the + * heapkeyspace field from the metapage to figure out which representation + * to expect, but we have to be a bit creative here.) 
+ */ + pivotoffset = (!leafpage || (!rightmost && offset == P_HIKEY)); + + /* LP_DEAD status bit */ + if (!pivotoffset) + values[j++] = BoolGetDatum(ItemIdIsDead(id)); + else + nulls[j++] = true; + + htid = BTreeTupleGetHeapTID(itup); + if (pivotoffset && !BTreeTupleIsPivot(itup)) + htid = NULL; + + if (htid) + values[j++] = ItemPointerGetDatum(htid); + else + nulls[j++] = true; + + if (BTreeTupleIsPosting(itup)) + { + /* build an array of item pointers */ + ItemPointer tids; + Datum *tids_datum; + int nposting; + + tids = BTreeTupleGetPosting(itup); + nposting = BTreeTupleGetNPosting(itup); + tids_datum = (Datum *) palloc(nposting * sizeof(Datum)); + for (int i = 0; i < nposting; i++) + tids_datum[i] = ItemPointerGetDatum(&tids[i]); + values[j++] = PointerGetDatum(construct_array(tids_datum, + nposting, + TIDOID, + sizeof(ItemPointerData), + false, 's')); + pfree(tids_datum); + } + else + nulls[j++] = true; + + /* Build and return the result tuple */ + tuple = heap_form_tuple(uargs->tupd, values, nulls); return HeapTupleGetDatum(tuple); } @@ -378,12 +451,13 @@ bt_page_items(PG_FUNCTION_ARGS) elog(NOTICE, "page is deleted"); fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + uargs->leafpage = P_ISLEAF(opaque); + uargs->rightmost = P_RIGHTMOST(opaque); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); - - fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc); + uargs->tupd = tupleDesc; fctx->user_fctx = uargs; @@ -395,7 +469,7 @@ bt_page_items(PG_FUNCTION_ARGS) if (fctx->call_cntr < fctx->max_calls) { - result = bt_page_print_tuples(fctx, uargs->page, uargs->offset); + result = bt_page_print_tuples(fctx, uargs); uargs->offset++; SRF_RETURN_NEXT(fctx, result); } @@ -463,12 +537,13 @@ bt_page_items_bytea(PG_FUNCTION_ARGS) elog(NOTICE, "page is deleted"); fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + uargs->leafpage = 
P_ISLEAF(opaque); + uargs->rightmost = P_RIGHTMOST(opaque); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); - - fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc); + uargs->tupd = tupleDesc; fctx->user_fctx = uargs; @@ -480,7 +555,7 @@ bt_page_items_bytea(PG_FUNCTION_ARGS) if (fctx->call_cntr < fctx->max_calls) { - result = bt_page_print_tuples(fctx, uargs->page, uargs->offset); + result = bt_page_print_tuples(fctx, uargs); uargs->offset++; SRF_RETURN_NEXT(fctx, result); } @@ -510,7 +585,7 @@ bt_metap(PG_FUNCTION_ARGS) BTMetaPageData *metad; TupleDesc tupleDesc; int j; - char *values[8]; + char *values[9]; Buffer buffer; Page page; HeapTuple tuple; @@ -557,17 +632,20 @@ bt_metap(PG_FUNCTION_ARGS) /* * Get values of extended metadata if available, use default values - * otherwise. + * otherwise. Note that we rely on the assumption that btm_safededup is + * initialized to zero on databases that were initdb'd before Postgres 13. */ if (metad->btm_version >= BTREE_NOVAC_VERSION) { values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact); values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples); + values[j++] = metad->btm_safededup ? 
"t" : "f"; } else { values[j++] = "0"; values[j++] = "-1"; + values[j++] = "f"; } tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out index 07c2dcd771..92d5c59654 100644 --- a/contrib/pageinspect/expected/btree.out +++ b/contrib/pageinspect/expected/btree.out @@ -12,6 +12,7 @@ fastroot | 1 fastlevel | 0 oldest_xact | 0 last_cleanup_num_tuples | -1 +safededup | t SELECT * FROM bt_page_stats('test1_a_idx', 0); ERROR: block 0 is a meta page @@ -41,6 +42,9 @@ itemlen | 16 nulls | f vars | f data | 01 00 00 00 00 00 00 01 +dead | f +htid | (0,1) +tids | SELECT * FROM bt_page_items('test1_a_idx', 2); ERROR: block number out of range @@ -54,6 +58,9 @@ itemlen | 16 nulls | f vars | f data | 01 00 00 00 00 00 00 01 +dead | f +htid | (0,1) +tids | SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 2)); ERROR: block number 2 is out of range for relation "test1_a_idx" diff --git a/contrib/pageinspect/pageinspect--1.7--1.8.sql b/contrib/pageinspect/pageinspect--1.7--1.8.sql index 2a7c4b3516..93ea37cde3 100644 --- a/contrib/pageinspect/pageinspect--1.7--1.8.sql +++ b/contrib/pageinspect/pageinspect--1.7--1.8.sql @@ -14,3 +14,56 @@ CREATE FUNCTION heap_tuple_infomask_flags( RETURNS record AS 'MODULE_PATHNAME', 'heap_tuple_infomask_flags' LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- bt_metap() +-- +DROP FUNCTION bt_metap(text); +CREATE FUNCTION bt_metap(IN relname text, + OUT magic int4, + OUT version int4, + OUT root int4, + OUT level int4, + OUT fastroot int4, + OUT fastlevel int4, + OUT oldest_xact int4, + OUT last_cleanup_num_tuples real, + OUT safededup boolean) +AS 'MODULE_PATHNAME', 'bt_metap' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- bt_page_items(text, int4) +-- +DROP FUNCTION bt_page_items(text, int4); +CREATE FUNCTION bt_page_items(IN relname text, IN blkno int4, + OUT itemoffset smallint, + OUT ctid tid, + OUT itemlen smallint, + OUT nulls bool, + OUT vars 
bool, + OUT data text, + OUT dead boolean, + OUT htid tid, + OUT tids tid[]) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'bt_page_items' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- bt_page_items(bytea) +-- +DROP FUNCTION bt_page_items(bytea); +CREATE FUNCTION bt_page_items(IN page bytea, + OUT itemoffset smallint, + OUT ctid tid, + OUT itemlen smallint, + OUT nulls bool, + OUT vars bool, + OUT data text, + OUT dead boolean, + OUT htid tid, + OUT tids tid[]) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'bt_page_items_bytea' +LANGUAGE C STRICT PARALLEL SAFE; diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml index 7e2e1487d7..b527daf6ca 100644 --- a/doc/src/sgml/pageinspect.sgml +++ b/doc/src/sgml/pageinspect.sgml @@ -300,13 +300,14 @@ test=# SELECT t_ctid, raw_flags, combined_flags test=# SELECT * FROM bt_metap('pg_cast_oid_index'); -[ RECORD 1 ]-----------+------- magic | 340322 -version | 3 +version | 4 root | 1 level | 0 fastroot | 1 fastlevel | 0 oldest_xact | 582 last_cleanup_num_tuples | 1000 +safededup | f @@ -329,11 +330,11 @@ test=# SELECT * FROM bt_page_stats('pg_cast_oid_index', 1); -[ RECORD 1 ]-+----- blkno | 1 type | l -live_items | 256 +live_items | 224 dead_items | 0 -avg_item_size | 12 +avg_item_size | 16 page_size | 8192 -free_size | 4056 +free_size | 3668 btpo_prev | 0 btpo_next | 0 btpo | 0 @@ -356,33 +357,45 @@ btpo_flags | 3 bt_page_items returns detailed information about all of the items on a B-tree index page. 
For example: -test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1); - itemoffset | ctid | itemlen | nulls | vars | data -------------+---------+---------+-------+------+------------- - 1 | (0,1) | 12 | f | f | 23 27 00 00 - 2 | (0,2) | 12 | f | f | 24 27 00 00 - 3 | (0,3) | 12 | f | f | 25 27 00 00 - 4 | (0,4) | 12 | f | f | 26 27 00 00 - 5 | (0,5) | 12 | f | f | 27 27 00 00 - 6 | (0,6) | 12 | f | f | 28 27 00 00 - 7 | (0,7) | 12 | f | f | 29 27 00 00 - 8 | (0,8) | 12 | f | f | 2a 27 00 00 +regression=# SELECT * FROM bt_page_items('tenk2_unique1', 5); + itemoffset | ctid | itemlen | nulls | vars | data | dead | htid | tids +------------+----------+---------+-------+------+-------------------------+------+----------+------ + 1 | (40,1) | 16 | f | f | b8 05 00 00 00 00 00 00 | | | + 2 | (58,11) | 16 | f | f | 4a 04 00 00 00 00 00 00 | f | (58,11) | + 3 | (266,4) | 16 | f | f | 4b 04 00 00 00 00 00 00 | f | (266,4) | + 4 | (279,25) | 16 | f | f | 4c 04 00 00 00 00 00 00 | f | (279,25) | + 5 | (333,11) | 16 | f | f | 4d 04 00 00 00 00 00 00 | f | (333,11) | + 6 | (87,24) | 16 | f | f | 4e 04 00 00 00 00 00 00 | f | (87,24) | + 7 | (38,22) | 16 | f | f | 4f 04 00 00 00 00 00 00 | f | (38,22) | + 8 | (272,17) | 16 | f | f | 50 04 00 00 00 00 00 00 | f | (272,17) | - In a B-tree leaf page, ctid points to a heap tuple. - In an internal page, the block number part of ctid - points to another page in the index itself, while the offset part - (the second number) is ignored and is usually 1. + In a B-tree leaf page, ctid usually + points to a heap tuple, and dead may + indicate that the item has its LP_DEAD bit + set. In an internal page, the block number part of + ctid points to another page in the + index itself, while the offset part (the second number) encodes + metadata about the tuple. Posting list tuples on leaf pages + also use ctid for metadata. 
+ htid shows a single heap TID + for the tuple, regardless of how it is represented (internal + page tuples may need to store a heap TID when there are many + duplicate tuples on descendant leaf pages); it is NULL when the tuple stores no heap TID. + tids is a list of TIDs that is stored + within posting list tuples (tuples created by deduplication). Note that the first item on any non-rightmost page (any page with a non-zero value in the btpo_next field) is the page's high key, meaning its data serves as an upper bound on all items appearing on the page, while - its ctid field is meaningless. Also, on non-leaf - pages, the first real data item (the first item that is not a high - key) is a minus infinity item, with no actual value - in its data field. Such an item does have a valid - downlink in its ctid field, however. + its ctid field does not point to + another block. Also, on non-leaf pages, the first real data item + (the first item that is not a high key) is a minus + infinity item, with no actual value in its + data field. Such an item does have a + valid downlink in its ctid field, + however. @@ -402,17 +415,17 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1); with get_raw_page should be passed as argument. 
So the last example could also be rewritten like this: -test=# SELECT * FROM bt_page_items(get_raw_page('pg_cast_oid_index', 1)); - itemoffset | ctid | itemlen | nulls | vars | data -------------+---------+---------+-------+------+------------- - 1 | (0,1) | 12 | f | f | 23 27 00 00 - 2 | (0,2) | 12 | f | f | 24 27 00 00 - 3 | (0,3) | 12 | f | f | 25 27 00 00 - 4 | (0,4) | 12 | f | f | 26 27 00 00 - 5 | (0,5) | 12 | f | f | 27 27 00 00 - 6 | (0,6) | 12 | f | f | 28 27 00 00 - 7 | (0,7) | 12 | f | f | 29 27 00 00 - 8 | (0,8) | 12 | f | f | 2a 27 00 00 +regression=# SELECT * FROM bt_page_items(get_raw_page('tenk2_unique1', 5)); + itemoffset | ctid | itemlen | nulls | vars | data | dead | htid | tids +------------+----------+---------+-------+------+-------------------------+------+----------+------ + 1 | (40,1) | 16 | f | f | b8 05 00 00 00 00 00 00 | | | + 2 | (58,11) | 16 | f | f | 4a 04 00 00 00 00 00 00 | f | (58,11) | + 3 | (266,4) | 16 | f | f | 4b 04 00 00 00 00 00 00 | f | (266,4) | + 4 | (279,25) | 16 | f | f | 4c 04 00 00 00 00 00 00 | f | (279,25) | + 5 | (333,11) | 16 | f | f | 4d 04 00 00 00 00 00 00 | f | (333,11) | + 6 | (87,24) | 16 | f | f | 4e 04 00 00 00 00 00 00 | f | (87,24) | + 7 | (38,22) | 16 | f | f | 4f 04 00 00 00 00 00 00 | f | (38,22) | + 8 | (272,17) | 16 | f | f | 50 04 00 00 00 00 00 00 | f | (272,17) | All the other details are the same as explained in the previous item. -- 2.17.1