*** a/src/backend/access/common/heaptuple.c
--- b/src/backend/access/common/heaptuple.c
***************
*** 60,66 ****
--- 60,69 ----
#include "access/sysattr.h"
#include "access/tuptoaster.h"
#include "executor/tuptable.h"
+ #include "utils/datum.h"
+ /* GUC variable: WAL size reduction (in percent) an EWT must achieve to be used */
+ int wal_update_compression_ratio = 25;
/* Does att's datatype allow packing into the 1-byte-header varlena format? */
#define ATT_IS_PACKABLE(att) \
***************
*** 297,308 **** heap_attisnull(HeapTuple tup, int attnum)
}
/* ----------------
! * nocachegetattr
*
! * This only gets called from fastgetattr() macro, in cases where
* we can't use a cacheoffset and the value is not null.
*
! * This caches attribute offsets in the attribute descriptor.
*
* An alternative way to speed things up would be to cache offsets
* with the tuple, but that seems more difficult unless you take
--- 300,312 ----
}
/* ----------------
! * nocachegetattr_with_len
*
! * This only gets called in cases where
* we can't use a cacheoffset and the value is not null.
*
! * This caches attribute offsets in the attribute descriptor and
! * outputs the length of the attribute value.
*
* An alternative way to speed things up would be to cache offsets
* with the tuple, but that seems more difficult unless you take
***************
*** 320,328 **** heap_attisnull(HeapTuple tup, int attnum)
* ----------------
*/
Datum
! nocachegetattr(HeapTuple tuple,
! int attnum,
! TupleDesc tupleDesc)
{
HeapTupleHeader tup = tuple->t_data;
Form_pg_attribute *att = tupleDesc->attrs;
--- 324,333 ----
* ----------------
*/
Datum
! nocachegetattr_with_len(HeapTuple tuple,
! int attnum,
! TupleDesc tupleDesc,
! Size *len)
{
HeapTupleHeader tup = tuple->t_data;
Form_pg_attribute *att = tupleDesc->attrs;
***************
*** 381,386 **** nocachegetattr(HeapTuple tuple,
--- 386,394 ----
*/
if (att[attnum]->attcacheoff >= 0)
{
+ if (len)
+ *len = att_getlength(att[attnum]->attlen,
+ tp + att[attnum]->attcacheoff);
return fetchatt(att[attnum],
tp + att[attnum]->attcacheoff);
}
***************
*** 507,515 **** nocachegetattr(HeapTuple tuple,
--- 515,536 ----
}
}
+ if (len)
+ *len = att_getlength(att[attnum]->attlen, tp + off);
return fetchatt(att[attnum], tp + off);
}
+ /*
+ * nocachegetattr
+ *		Same fetch as nocachegetattr_with_len(), without the length output.
+ */
+ Datum
+ nocachegetattr(HeapTuple tuple, int attnum, TupleDesc tupleDesc)
+ {
+ /* a NULL length pointer makes the callee skip the length computation */
+ return nocachegetattr_with_len(tuple, attnum, tupleDesc, NULL);
+ }
+
/* ----------------
* heap_getsysattr
*
***************
*** 617,622 **** heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest)
--- 638,1061 ----
memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len);
}
+ /* ----------------
+ * heap_attr_get_length_and_check_equals
+ *
+ * Compares the value of attribute 'attrnum' in tup1 and tup2 and returns
+ * true if both are NULL or binary-equal, false otherwise.
+ * Also outputs, through tup1_attr_len and tup2_attr_len, the length of
+ * that attribute's value in each tuple (0 if the attribute is not fetched).
+ * ----------------
+ */
+ bool
+ heap_attr_get_length_and_check_equals(TupleDesc tupdesc, int attrnum,
+ HeapTuple tup1, HeapTuple tup2,
+ Size *tup1_attr_len, Size *tup2_attr_len)
+ {
+ Datum value1,
+ value2;
+ bool isnull1,
+ isnull2;
+ Form_pg_attribute att;
+
+ *tup1_attr_len = 0;
+ *tup2_attr_len = 0;
+
+ /*
+ * If it's a whole-tuple reference, say "not equal". It's not really
+ * worth supporting this case, since it could only succeed after a no-op
+ * update, which is hardly a case worth optimizing for.
+ */
+ if (attrnum == 0)
+ return false;
+
+ /*
+ * Likewise, automatically say "not equal" for any system attribute other
+ * than OID and tableOID; we cannot expect these to be consistent in a HOT
+ * chain, or even to be set correctly yet in the new tuple.
+ */
+ if (attrnum < 0)
+ {
+ if (attrnum != ObjectIdAttributeNumber &&
+ attrnum != TableOidAttributeNumber)
+ return false;
+ }
+
+ /*
+ * Extract the corresponding values and length of values. XXX this is
+ * pretty inefficient if there are many indexed columns. Should
+ * HeapSatisfiesHOTUpdate do a single heap_deform_tuple call on each
+ * tuple, instead? But that doesn't work for system columns ...
+ */
+ value1 = heap_getattr_with_len(tup1, attrnum, tupdesc, &isnull1, tup1_attr_len);
+ value2 = heap_getattr_with_len(tup2, attrnum, tupdesc, &isnull2, tup2_attr_len);
+
+ /*
+ * If one value is NULL and other is not, then they are certainly not
+ * equal
+ */
+ if (isnull1 != isnull2)
+ return false;
+
+ /*
+ * If both are NULL, they can be considered equal.
+ */
+ if (isnull1)
+ return true;
+
+ /*
+ * We do simple binary comparison of the two datums. This may be overly
+ * strict because there can be multiple binary representations for the
+ * same logical value. But we should be OK as long as there are no false
+ * positives. Using a type-specific equality operator is messy because
+ * there could be multiple notions of equality in different operator
+ * classes; furthermore, we cannot safely invoke user-defined functions
+ * while holding exclusive buffer lock.
+ */
+ if (attrnum <= 0)
+ {
+ /* The only allowed system columns are OIDs, so do this */
+ return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
+ }
+ else
+ {
+ Assert(attrnum <= tupdesc->natts);
+ att = tupdesc->attrs[attrnum - 1];
+ return datumIsEqual(value1, value2, att->attbyval, att->attlen);
+ }
+ }
+
+ /* ----------------
+ * heap_delta_encode
+ *
+ * Construct a delta Encoded WAL Tuple (EWT) by comparing old and new
+ * tuple versions w.r.t column boundaries.
+ *
+ * Encoded WAL Tuple Format:
+ * Header + Control byte + history reference (2 - 3)bytes
+ * + New data (1 byte length + variable data)+ ...
+ *
+ * Encode Mechanism:
+ *
+ * Copy the bitmap data from new tuple to the EWT (Encoded WAL Tuple) and
+ * loop for all attributes to find any modifications in the attributes.
+ * The unmodified data is encoded as a History Reference in EWT and
+ * the modified data (if NOT NULL) is encoded as New Data in EWT.
+ *
+ * The offset values are calculated with respect to the tuple t_hoff
+ * value. For each column attribute old and new tuple offsets
+ * are recalculated based on padding in the tuples.
+ * Once the alignment difference is found between old and new tuple
+ * versions, then include alignment difference as New Data in EWT.
+ *
+ * The encoded output must be smaller than (100 -
+ * wal_update_compression_ratio)% of the new tuple data, i.e. 75% with the
+ * default ratio of 25. If it is not, encoding is abandoned and the
+ * original tuple (new tuple version) is stored directly in the WAL tuple.
+ *
+ *
+ * History Reference:
+ * If any column is modified then the unmodified columns data till the
+ * modified column needs to be copied to EWT as a Tag.
+ *
+ * New data (modified data):
+ * First byte represents the length [0-255] of the modified data,
+ * followed by the modified data of corresponding length.
+ *
+ * Returns true, with encdata filled in, if the tuple was encoded, else false.
+ *
+ * For more details about the EWT representation, refer to access/transam/README.
+ * ----------------
+ */
+ bool
+ heap_delta_encode(TupleDesc tupleDesc, HeapTuple oldtup, HeapTuple newtup,
+ PGLZ_Header *encdata)
+ {
+ Form_pg_attribute *att = tupleDesc->attrs;
+ int numberOfAttributes;
+ int32 new_tup_off = 0,
+ old_tup_off = 0,
+ temp_off = 0,
+ match_off = 0,
+ change_off = 0;
+ int attnum;
+ int32 data_len,
+ old_tup_pad_len,
+ new_tup_pad_len;
+ Size old_tup_attr_len,
+ new_tup_attr_len;
+ bool is_attr_equals = true;
+ unsigned char *bp = (unsigned char *) encdata + sizeof(PGLZ_Header);
+ unsigned char *bstart = bp;
+ char *dp = (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits);
+ char *dstart = dp;
+ char *history;
+ unsigned char ctrl_dummy = 0;
+ unsigned char *ctrlp = &ctrl_dummy;
+ unsigned char ctrlb = 0;
+ unsigned char ctrl = 0;
+ int32 len,
+ old_tup_bitmaplen,
+ new_tup_bitmaplen,
+ old_tup_len,
+ new_tup_len;
+ int32 result_size;
+ int32 result_max;
+
+ old_tup_len = oldtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
+
+ /*
+ * Old tuples of length PGLZ_HISTORY_SIZE or more are not allowed for
+ * delta encode as this is the maximum size of history offset.
+ */
+ if (old_tup_len >= PGLZ_HISTORY_SIZE)
+ return false;
+
+ history = (char *) oldtup->t_data + offsetof(HeapTupleHeaderData, t_bits);
+ old_tup_bitmaplen = oldtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits);
+ new_tup_bitmaplen = newtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits);
+ new_tup_len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
+
+ /*
+ * If length of old and new tuple versions vary by more than 50%, include
+ * new as-is
+ */
+ if ((new_tup_len <= (old_tup_len >> 1))
+ || (old_tup_len <= (new_tup_len >> 1)))
+ return false;
+
+ /* Largest encoded size that still meets the required compression ratio */
+ result_max = (new_tup_len * (100 - wal_update_compression_ratio)) / 100;
+ encdata->rawsize = new_tup_len;
+
+ /*
+ * Advance the EWT by adding the approximate length of the operation for
+ * new data as [1 control byte + 1 length byte + data_length] and validate
+ * it with result_max. The same length approximation is used in the
+ * function for New data.
+ */
+ if ((bp + (2 + new_tup_bitmaplen)) - bstart >= result_max)
+ return false;
+
+ /* Copy the bitmap data from new tuple to EWT */
+ pglz_out_add(ctrlp, ctrlb, ctrl, bp, new_tup_bitmaplen, dp);
+ dstart = dp;
+
+ /*
+ * Loop through all attributes, if the attribute is modified by the update
+ * operation, store the [Offset,Length] referring old tuple version till
+ * the last unchanged column in the EWT as History Reference, else store
+ * the [Length,Data] from new tuple version as New Data.
+ */
+ numberOfAttributes = HeapTupleHeaderGetNatts(newtup->t_data);
+ for (attnum = 1; attnum <= numberOfAttributes; attnum++)
+ {
+ if (!heap_attr_get_length_and_check_equals(tupleDesc, attnum, oldtup,
+ newtup, &old_tup_attr_len, &new_tup_attr_len))
+ {
+ is_attr_equals = false;
+ data_len = old_tup_off - match_off;
+
+ len = PGLZ_GET_HIST_CTRL_BIT_LEN(data_len);
+ if ((bp + len) - bstart >= result_max)
+ return false;
+
+ /*
+ * The match_off value is calculated w.r.t to the tuple t_hoff
+ * value, the bit map len needs to be added to match_off to get
+ * the actual start offset from the old/history tuple.
+ */
+ match_off += old_tup_bitmaplen;
+
+ /*
+ * If there is pending unchanged data in the old and new tuples,
+ * encode it as a history reference (length and offset) so that it
+ * is copied from the history tuple.
+ */
+ pglz_out_tag(ctrlp, ctrlb, ctrl, bp, data_len, match_off, history);
+
+ /*
+ * Recalculate the old and new tuple offsets based on padding in
+ * the tuples
+ */
+ if (!HeapTupleHasNulls(oldtup)
+ || !att_isnull((attnum - 1), oldtup->t_data->t_bits))
+ {
+ old_tup_off = att_align_pointer(old_tup_off,
+ att[attnum - 1]->attalign,
+ att[attnum - 1]->attlen,
+ (char *) oldtup->t_data + oldtup->t_data->t_hoff + old_tup_off);
+ }
+
+ if (!HeapTupleHasNulls(newtup)
+ || !att_isnull((attnum - 1), newtup->t_data->t_bits))
+ {
+ new_tup_off = att_align_pointer(new_tup_off,
+ att[attnum - 1]->attalign,
+ att[attnum - 1]->attlen,
+ (char *) newtup->t_data + newtup->t_data->t_hoff + new_tup_off);
+ }
+
+ old_tup_off += old_tup_attr_len;
+ new_tup_off += new_tup_attr_len;
+
+ match_off = old_tup_off;
+ }
+ else
+ {
+ data_len = new_tup_off - change_off;
+ if ((bp + (2 + data_len)) - bstart >= result_max)
+ return false;
+
+ /* Add the modified column data to the EWT */
+ pglz_out_add(ctrlp, ctrlb, ctrl, bp, data_len, dp);
+
+ /*
+ * Calculate the alignment for old and new tuple versions for this
+ * attribute, if the alignment is same, then we continue for next
+ * attribute else 1. stores the [Offset,Length] referring old
+ * tuple version for previous attribute (if previous attr is same
+ * in old and new tuple versions) in the EWT as History Reference,
+ * 2. add the [Length,Data] for alignment from new tuple as New
+ * Data in EWT.
+ */
+ if (!HeapTupleHasNulls(oldtup)
+ || !att_isnull((attnum - 1), oldtup->t_data->t_bits))
+ {
+ temp_off = old_tup_off;
+ old_tup_off = att_align_pointer(old_tup_off,
+ att[attnum - 1]->attalign,
+ att[attnum - 1]->attlen,
+ (char *) oldtup->t_data + oldtup->t_data->t_hoff + old_tup_off);
+
+ old_tup_pad_len = old_tup_off - temp_off;
+
+
+ temp_off = new_tup_off;
+ new_tup_off = att_align_pointer(new_tup_off,
+ att[attnum - 1]->attalign,
+ att[attnum - 1]->attlen,
+ (char *) newtup->t_data + newtup->t_data->t_hoff + new_tup_off);
+ new_tup_pad_len = new_tup_off - temp_off;
+
+ if (old_tup_pad_len != new_tup_pad_len)
+ {
+ /*
+ * If the alignment difference is found between old and
+ * new tuples and previous attribute value of the old and
+ * new tuple versions is same then store until the current
+ * match as history reference Tag in EWT.
+ */
+ if (is_attr_equals)
+ {
+ data_len = old_tup_off - old_tup_pad_len - match_off;
+ len = PGLZ_GET_HIST_CTRL_BIT_LEN(data_len);
+ if ((bp + len) - bstart >= result_max)
+ return false;
+
+ match_off += old_tup_bitmaplen;
+ pglz_out_tag(ctrlp, ctrlb, ctrl, bp, data_len, match_off, history);
+ }
+
+ match_off = old_tup_off;
+
+ /* Alignment data */
+ if ((bp + (2 + new_tup_pad_len)) - bstart >= result_max)
+ return false;
+
+ pglz_out_add(ctrlp, ctrlb, ctrl, bp, new_tup_pad_len, dp);
+ }
+ }
+
+ old_tup_off += old_tup_attr_len;
+ new_tup_off += new_tup_attr_len;
+
+ change_off = new_tup_off;
+
+ /*
+ * Recalculate the destination pointer with the new offset which
+ * is used while copying the modified data.
+ */
+ dp = dstart + new_tup_off;
+ is_attr_equals = true;
+ }
+ }
+
+ /* If any modified column data is still pending, add it to the EWT. */
+ data_len = new_tup_off - change_off;
+ if ((bp + (2 + data_len)) - bstart >= result_max)
+ return false;
+
+ pglz_out_add(ctrlp, ctrlb, ctrl, bp, data_len, dp);
+
+ /*
+ * If any left out old tuple data is present then copy it as history
+ * reference
+ */
+ data_len = old_tup_off - match_off;
+ len = PGLZ_GET_HIST_CTRL_BIT_LEN(data_len);
+ if ((bp + len) - bstart >= result_max)
+ return false;
+
+ match_off += old_tup_bitmaplen;
+ pglz_out_tag(ctrlp, ctrlb, ctrl, bp, data_len, match_off, history);
+
+ /*
+ * Write out the last control byte and check that we haven't overrun the
+ * output size allowed by the strategy.
+ */
+ *ctrlp = ctrlb;
+
+ result_size = bp - bstart;
+ if (result_size >= result_max)
+ return false;
+
+ /* Fill in the actual length of the compressed datum */
+ SET_VARSIZE_COMPRESSED(encdata, result_size + sizeof(PGLZ_Header));
+ return true;
+ }
+
+ /* ----------------
+ * heap_delta_decode
+ *
+ * Decode a tuple using delta-encoded WAL tuple (encdata) and old tuple
+ * version (oldtup); output goes into newtup, starting at its t_bits field.
+ *
+ * Encoded WAL Tuple Format:
+ * Header + Control byte + history reference (2 - 3)bytes
+ * + New data (1 byte length + variable data)+ ...
+ *
+ * Decode Mechanism:
+ * Skip header and Read one control byte and process the next 8 items (or as many as
+ * remain in the compressed input).
+ * Check each control bit, if the bit is set then it is History Reference which
+ * means the next 2 - 3 byte tag provides the offset and length of history match.
+ * Use the offset and corresponding length to copy data from old tuple version
+ * to new tuple.
+ * If the control bit is unset, then it is New Data Reference which means
+ * first byte contains the length [0-255] of the modified data, followed
+ * by the modified data of corresponding length specified in the first byte.
+ *
+ * Tag in History Reference:
+ * 2-3 byte tag -
+ * 2 byte tag is used when length of History data (unchanged data from old tuple version) is less than 18.
+ * 3 byte tag is used when length of History data (unchanged data from old tuple version) is greater than
+ * or equal to 18.
+ * The maximum length that can be represented by one Tag is 273.
+ *
+ * For more details about Encoded WAL Tuple (EWT) representation, refer to access/transam/README
+ *
+ * ----------------
+ */
+ void
+ heap_delta_decode(PGLZ_Header *encdata, HeapTuple oldtup, HeapTuple newtup)
+ {
+ pglz_decompress_with_history((char *) encdata,
+ (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits),
+ &newtup->t_len,
+ (char *) oldtup->t_data + offsetof(HeapTupleHeaderData, t_bits));
+ }
+
/*
* heap_form_tuple
* construct a tuple from the given values[] and isnull[] arrays,
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
***************
*** 85,90 **** static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
--- 85,91 ----
TransactionId xid, CommandId cid, int options);
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
ItemPointerData from, Buffer newbuf, HeapTuple newtup,
+ HeapTuple oldtup,
bool all_visible_cleared, bool new_all_visible_cleared);
static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
HeapTuple oldtup, HeapTuple newtup);
***************
*** 857,862 **** heapgettup_pagemode(HeapScanDesc scan,
--- 858,911 ----
* definition in access/htup.h is maintained.
*/
Datum
+ fastgetattr_with_len(HeapTuple tup, int attnum, TupleDesc tupleDesc,
+ bool *isnull, int32 *len) /* NOTE(review): nocachegetattr_with_len() takes Size *len -- confirm the intended type */
+ {
+ return (
+ (attnum) > 0 ?
+ (
+ (*(isnull) = false),
+ HeapTupleNoNulls(tup) ?
+ (
+ (tupleDesc)->attrs[(attnum) - 1]->attcacheoff >= 0 ?
+ (
+ (*(len) = att_getlength((tupleDesc)->attrs[(attnum - 1)]->attlen,
+ (char *) (tup)->t_data + (tup)->t_data->t_hoff +
+ (tupleDesc)->attrs[(attnum) - 1]->attcacheoff)),
+ fetchatt((tupleDesc)->attrs[(attnum) - 1],
+ (char *) (tup)->t_data + (tup)->t_data->t_hoff +
+ (tupleDesc)->attrs[(attnum) - 1]->attcacheoff)
+ )
+ :
+ /* no cached offset: walk the tuple; the callee also stores *len */
+ nocachegetattr_with_len((tup), (attnum), (tupleDesc), (len))
+ )
+ :
+ (
+ att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
+ (
+ (*(isnull) = true),
+ (*(len) = 0), /* a NULL attribute is reported with length 0 */
+ (Datum) NULL
+ )
+ :
+ (
+ nocachegetattr_with_len((tup), (attnum), (tupleDesc), (len))
+ )
+ )
+ )
+ :
+ (
+ (Datum) NULL /* attnum <= 0: *isnull and *len are left untouched */
+ )
+ );
+ }
+
+ /*
+ * This is formatted so oddly so that the correspondence to the macro
+ * definition in access/htup.h is maintained.
+ */
+ Datum
fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
bool *isnull)
{
***************
*** 873,879 **** fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
(tupleDesc)->attrs[(attnum) - 1]->attcacheoff)
)
:
! nocachegetattr((tup), (attnum), (tupleDesc))
)
:
(
--- 922,929 ----
(tupleDesc)->attrs[(attnum) - 1]->attcacheoff)
)
:
! /* no cached offset: fall back to walking the tuple */
! nocachegetattr((tup), (attnum), (tupleDesc))
)
:
(
***************
*** 3229,3238 **** l2:
/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
! XLogRecPtr recptr = log_heap_update(relation, buffer, oldtup.t_self,
! newbuf, heaptup,
! all_visible_cleared,
! all_visible_cleared_new);
if (newbuf != buffer)
{
--- 3279,3290 ----
/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
! XLogRecPtr recptr;
!
! recptr = log_heap_update(relation, buffer, oldtup.t_self,
! newbuf, heaptup, &oldtup,
! all_visible_cleared,
! all_visible_cleared_new);
if (newbuf != buffer)
{
***************
*** 3299,3372 **** static bool
heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
HeapTuple tup1, HeapTuple tup2)
{
! Datum value1,
! value2;
! bool isnull1,
! isnull2;
! Form_pg_attribute att;
!
! /*
! * If it's a whole-tuple reference, say "not equal". It's not really
! * worth supporting this case, since it could only succeed after a no-op
! * update, which is hardly a case worth optimizing for.
! */
! if (attrnum == 0)
! return false;
!
! /*
! * Likewise, automatically say "not equal" for any system attribute other
! * than OID and tableOID; we cannot expect these to be consistent in a HOT
! * chain, or even to be set correctly yet in the new tuple.
! */
! if (attrnum < 0)
! {
! if (attrnum != ObjectIdAttributeNumber &&
! attrnum != TableOidAttributeNumber)
! return false;
! }
!
! /*
! * Extract the corresponding values. XXX this is pretty inefficient if
! * there are many indexed columns. Should HeapSatisfiesHOTUpdate do a
! * single heap_deform_tuple call on each tuple, instead? But that doesn't
! * work for system columns ...
! */
! value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
! value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
!
! /*
! * If one value is NULL and other is not, then they are certainly not
! * equal
! */
! if (isnull1 != isnull2)
! return false;
!
! /*
! * If both are NULL, they can be considered equal.
! */
! if (isnull1)
! return true;
! /*
! * We do simple binary comparison of the two datums. This may be overly
! * strict because there can be multiple binary representations for the
! * same logical value. But we should be OK as long as there are no false
! * positives. Using a type-specific equality operator is messy because
! * there could be multiple notions of equality in different operator
! * classes; furthermore, we cannot safely invoke user-defined functions
! * while holding exclusive buffer lock.
! */
! if (attrnum <= 0)
! {
! /* The only allowed system columns are OIDs, so do this */
! return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
! }
! else
! {
! Assert(attrnum <= tupdesc->natts);
! att = tupdesc->attrs[attrnum - 1];
! return datumIsEqual(value1, value2, att->attbyval, att->attlen);
! }
}
/*
--- 3351,3361 ----
heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
HeapTuple tup1, HeapTuple tup2)
{
! Size tup1_attr_len,
! tup2_attr_len;
! return heap_attr_get_length_and_check_equals(tupdesc, attrnum, tup1, tup2,
! &tup1_attr_len, &tup2_attr_len);
}
/*
***************
*** 4464,4470 **** log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
*/
static XLogRecPtr
log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
! Buffer newbuf, HeapTuple newtup,
bool all_visible_cleared, bool new_all_visible_cleared)
{
xl_heap_update xlrec;
--- 4453,4459 ----
*/
static XLogRecPtr
log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
! Buffer newbuf, HeapTuple newtup, HeapTuple oldtup,
bool all_visible_cleared, bool new_all_visible_cleared)
{
xl_heap_update xlrec;
***************
*** 4473,4478 **** log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
--- 4462,4477 ----
XLogRecPtr recptr;
XLogRecData rdata[4];
Page page = BufferGetPage(newbuf);
+ char *newtupdata;
+ int newtuplen;
+ bool compressed = false;
+
+ /* Structure which holds EWT */
+ struct
+ {
+ PGLZ_Header pglzheader;
+ char buf[MaxHeapTupleSize];
+ } buf;
/* Caller should not call me on a non-WAL-logged relation */
Assert(RelationNeedsWAL(reln));
***************
*** 4482,4492 **** log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
else
info = XLOG_HEAP_UPDATE;
xlrec.target.node = reln->rd_node;
xlrec.target.tid = from;
! xlrec.all_visible_cleared = all_visible_cleared;
xlrec.newtid = newtup->t_self;
! xlrec.new_all_visible_cleared = new_all_visible_cleared;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfHeapUpdate;
--- 4481,4522 ----
else
info = XLOG_HEAP_UPDATE;
+ newtupdata = ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits);
+ newtuplen = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
+
+ /*
+ * EWT can be generated for all new tuple versions created by Update
+ * operation. Currently we do it when both the old and new tuple versions
+ * are on same page, because during recovery if the page containing old
+ * tuple is corrupt, it should not cascade that corruption to other pages.
+ * Under the general assumption that for long runs most updates tend to
+ * create new tuple version on same page, there should not be significant
+ * impact on WAL reduction or performance.
+ *
+ * We should not generate EWT when we need to backup the whole block in
+ * WAL as in that case there is no saving by reduced WAL size.
+ */
+ if ((oldbuf == newbuf) && !XLogCheckBufferNeedsBackup(newbuf))
+ {
+ /* Delta-encode the new tuple using the old tuple */
+ if (heap_delta_encode(reln->rd_att, oldtup, newtup, &buf.pglzheader))
+ {
+ compressed = true;
+ newtupdata = (char *) &buf.pglzheader;
+ newtuplen = VARSIZE(&buf.pglzheader);
+ }
+ }
+
+ xlrec.flags = 0;
xlrec.target.node = reln->rd_node;
xlrec.target.tid = from;
! if (all_visible_cleared)
! xlrec.flags |= XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED;
xlrec.newtid = newtup->t_self;
! if (new_all_visible_cleared)
! xlrec.flags |= XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED;
! if (compressed)
! xlrec.flags |= XL_HEAP_UPDATE_DELTA_ENCODED;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfHeapUpdate;
***************
*** 4513,4521 **** log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
rdata[2].buffer_std = true;
rdata[2].next = &(rdata[3]);
! /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
! rdata[3].data = (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits);
! rdata[3].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
rdata[3].buffer = newbuf;
rdata[3].buffer_std = true;
rdata[3].next = NULL;
--- 4543,4554 ----
rdata[2].buffer_std = true;
rdata[2].next = &(rdata[3]);
! /*
! * PG73FORMAT: write bitmap [+ padding] [+ oid] + data follows .........
! * OR PG93FORMAT [If encoded]: LZ header + Encoded data follows
! */
! rdata[3].data = newtupdata;
! rdata[3].len = newtuplen;
rdata[3].buffer = newbuf;
rdata[3].buffer_std = true;
rdata[3].next = NULL;
***************
*** 5291,5297 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
--- 5324,5333 ----
Page page;
OffsetNumber offnum;
ItemId lp = NULL;
+ HeapTupleData newtup;
+ HeapTupleData oldtup;
HeapTupleHeader htup;
+ HeapTupleHeader oldtupdata = NULL;
struct
{
HeapTupleHeaderData hdr;
***************
*** 5306,5312 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
! if (xlrec->all_visible_cleared)
{
Relation reln = CreateFakeRelcacheEntry(xlrec->target.node);
BlockNumber block = ItemPointerGetBlockNumber(&xlrec->target.tid);
--- 5342,5348 ----
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
! if (xlrec->flags & XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED)
{
Relation reln = CreateFakeRelcacheEntry(xlrec->target.node);
BlockNumber block = ItemPointerGetBlockNumber(&xlrec->target.tid);
***************
*** 5366,5372 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
elog(PANIC, "heap_update_redo: invalid lp");
! htup = (HeapTupleHeader) PageGetItem(page, lp);
htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
HEAP_XMAX_INVALID |
--- 5402,5408 ----
if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
elog(PANIC, "heap_update_redo: invalid lp");
! oldtupdata = htup = (HeapTupleHeader) PageGetItem(page, lp);
htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
HEAP_XMAX_INVALID |
***************
*** 5385,5391 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
/* Mark the page as a candidate for pruning */
PageSetPrunable(page, record->xl_xid);
! if (xlrec->all_visible_cleared)
PageClearAllVisible(page);
/*
--- 5421,5427 ----
/* Mark the page as a candidate for pruning */
PageSetPrunable(page, record->xl_xid);
! if (xlrec->flags & XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED)
PageClearAllVisible(page);
/*
***************
*** 5410,5416 **** newt:;
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
! if (xlrec->new_all_visible_cleared)
{
Relation reln = CreateFakeRelcacheEntry(xlrec->target.node);
BlockNumber block = ItemPointerGetBlockNumber(&xlrec->newtid);
--- 5446,5452 ----
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
! if (xlrec->flags & XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED)
{
Relation reln = CreateFakeRelcacheEntry(xlrec->target.node);
BlockNumber block = ItemPointerGetBlockNumber(&xlrec->newtid);
***************
*** 5473,5482 **** newsame:;
SizeOfHeapHeader);
htup = &tbuf.hdr;
MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
! /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
! memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
! (char *) xlrec + hsize,
! newlen);
newlen += offsetof(HeapTupleHeaderData, t_bits);
htup->t_infomask2 = xlhdr.t_infomask2;
htup->t_infomask = xlhdr.t_infomask;
--- 5509,5540 ----
SizeOfHeapHeader);
htup = &tbuf.hdr;
MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
!
! /*
! * If the record is EWT then decode it.
! */
! if (xlrec->flags & XL_HEAP_UPDATE_DELTA_ENCODED)
! {
! /*
! * PG93FORMAT: Header + Control byte + history reference (2 - 3)bytes
! * + New data (1 byte length + variable data)+ ...
! */
! PGLZ_Header *encoded_data = (PGLZ_Header *) (((char *) xlrec) + hsize);
!
! oldtup.t_data = oldtupdata;
! newtup.t_data = htup;
!
! heap_delta_decode(encoded_data, &oldtup, &newtup);
! newlen = newtup.t_len;
! }
! else
! {
! /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
! memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
! (char *) xlrec + hsize,
! newlen);
! }
!
newlen += offsetof(HeapTupleHeaderData, t_bits);
htup->t_infomask2 = xlhdr.t_infomask2;
htup->t_infomask = xlhdr.t_infomask;
***************
*** 5491,5497 **** newsame:;
if (offnum == InvalidOffsetNumber)
elog(PANIC, "heap_update_redo: failed to add tuple");
! if (xlrec->new_all_visible_cleared)
PageClearAllVisible(page);
freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
--- 5549,5555 ----
if (offnum == InvalidOffsetNumber)
elog(PANIC, "heap_update_redo: failed to add tuple");
! if (xlrec->flags & XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED)
PageClearAllVisible(page);
freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
*** a/src/backend/access/transam/README
--- b/src/backend/access/transam/README
***************
*** 665,670 **** then restart recovery. This is part of the reason for not writing a WAL
--- 665,778 ----
entry until we've successfully done the original action.
+ Encoded WAL Tuple (EWT)
+ -----------------------
+
+ Delta Encoded WAL Tuple (EWT) eliminates the need for copying entire tuple to WAL for the update operation.
+ EWT is constructed by comparing old and new versions of tuple w.r.t column boundaries. It contains the data
+ from new tuple for modified columns and reference [Offset,Length] of old tuple version for unchanged columns.
+
+
+ EWT Format
+ ----------
+
+ Header + Control byte + History Reference (2 - 3)bytes
+ + New data (1 byte length + variable data) + ...
+
+
+ Header:
+
+ The header is same as PGLZ_Header, which is used to store the compressed length and raw length.
+
+ Control byte:
+
+ The first byte after the header tells what to do the next 8 times. We call this the control byte.
+
+
+ History Reference:
+
+ A set bit in the control byte means, that a tag of 2-3 bytes follows. A tag contains information
+ to copy some bytes from old tuple version to the current location in the output.
+
+ Details about 2-3 byte Tag
+ 2 byte tag is used when length of History data (unchanged data from old tuple version) is less than 18.
+ 3 byte tag is used when length of History data (unchanged data from old tuple version) is greater than
+ or equal to 18.
+ The maximum length that can be represented by one Tag is 273.
+
+ Let's call the three tag bytes T1, T2 and T3. The position of the data to copy is coded as an offset
+ from the old tuple.
+
+ The offset is in the upper nibble of T1 and in T2.
+ The length is in the lower nibble of T1.
+
+ So the 16 bits of a 2 byte tag are coded as
+
+ 7---T1--0 7---T2--0
+ OOOO LLLL OOOO OOOO
+
+ This limits the offset to 1-4095 (12 bits) and the length to 3-18 (4 bits) because 3 is always added to it.
+
+ In the actual implementation, the 2 byte tag's length is limited to 3-17, because the value 0xF
+ in the length nibble has special meaning. It means, that the next following byte (T3) has to be
+ added to the length value of 18. That makes total limits of 1-4095 for offset and 3-273 for length.
+
+
+
+
+ New data:
+
+ An unset bit in the control byte represents modified data of new tuple version.
+ First byte represents the length [0-255] of the modified data, followed by the
+ modified data of corresponding length.
+
+ 7---T1--0 7---T2--0 ...
+ LLLL LLLL DDDD DDDD ...
+
+ Data bytes repeat until the length of the new data.
+
+
+ L - Length
+ O - Offset
+ D - Data
+
+ This encoding is very similar to LZ Compression used in PostgreSQL (pg_lzcompress.c).
+
+
+ Encoding Mechanism for EWT
+ --------------------------
+ Copy the bitmap data from new tuple to the EWT (Encoded WAL Tuple) and loop for all attributes
+ to find any modifications in the attributes. The unmodified data is encoded as a
+ History Reference in EWT and the modified data (if NOT NULL) is encoded as New Data in EWT.
+
+ The offset values are calculated with respect to the tuple t_hoff value. For each column attribute
+ old and new tuple offsets are recalculated based on padding in the tuples.
+ Once the alignment difference is found between old and new tuple versions,
+ then include alignment difference as New Data in EWT.
+
+ Max encoded data length is 75% (default compression rate) of original data; if the encoded output data
+ length is greater than that, the original tuple (new tuple version) is stored directly in the WAL tuple.
+
+
+ Decoding Mechanism for EWT
+ --------------------------
+ Skip header and Read one control byte and process the next 8 items (or as many as remain in the compressed input).
+ Check each control bit, if the bit is set then it is History Reference which means the next 2 - 3 byte tag
+ provides the offset and length of history match.
+ Use the offset and corresponding length to copy data from old tuple version to new tuple.
+ If the control bit is unset, then it is New Data Reference which means first byte contains the
+ length [0-255] of the modified data, followed by the modified data of corresponding length
+ specified in the first byte.
+
+
+ Constraints for EWT
+ --------------------
+ 1. Delta encoding is allowed only when the update goes to the same page and
+ the buffer doesn't need a backup block (relevant when full-page writes are on).
+ 2. Only old tuples with length less than PGLZ_HISTORY_SIZE are allowed for encoding.
+ 3. Only old and new tuple versions that do not vary in length by more than 50% are allowed for encoding.
+
+
Asynchronous Commit
-------------------
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 1204,1209 **** begin:;
--- 1204,1231 ----
}
/*
+ * Report whether a WAL record touching this buffer would currently need a
+ * full-page backup of it. The insert lock is not held here, so
+ * fullPageWrites and forcePageWrites may change afterwards; that is harmless
+ * because the answer only decides whether an update is logged as an EWT.
+ */
+ bool
+ XLogCheckBufferNeedsBackup(Buffer buffer)
+ {
+ Page bufpage = BufferGetPage(buffer);
+ bool pageWritesOn;
+
+ /* unlocked peek at the flags; see the header comment */
+ pageWritesOn = XLogCtl->Insert.fullPageWrites ||
+ XLogCtl->Insert.forcePageWrites;
+
+ /*
+ * backup is needed only with page writes on and page LSN <= RedoRecPtr
+ */
+ return pageWritesOn && PageGetLSN(bufpage) <= RedoRecPtr;
+ }
+
+ /*
* Determine whether the buffer referenced by an XLogRecData item has to
* be backed up, and if so fill a BkpBlock struct for it. In any case
* save the buffer's LSN at *lsn.
*** a/src/backend/utils/adt/pg_lzcompress.c
--- b/src/backend/utils/adt/pg_lzcompress.c
***************
*** 182,190 ****
*/
#define PGLZ_HISTORY_LISTS 8192 /* must be power of 2 */
#define PGLZ_HISTORY_MASK (PGLZ_HISTORY_LISTS - 1)
- #define PGLZ_HISTORY_SIZE 4096
- #define PGLZ_MAX_MATCH 273
-
/* ----------
* PGLZ_HistEntry -
--- 182,187 ----
***************
*** 302,368 **** do { \
} \
} while (0)
-
- /* ----------
- * pglz_out_ctrl -
- *
- * Outputs the last and allocates a new control byte if needed.
- * ----------
- */
- #define pglz_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) \
- do { \
- if ((__ctrl & 0xff) == 0) \
- { \
- *(__ctrlp) = __ctrlb; \
- __ctrlp = (__buf)++; \
- __ctrlb = 0; \
- __ctrl = 1; \
- } \
- } while (0)
-
-
- /* ----------
- * pglz_out_literal -
- *
- * Outputs a literal byte to the destination buffer including the
- * appropriate control bit.
- * ----------
- */
- #define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) \
- do { \
- pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \
- *(_buf)++ = (unsigned char)(_byte); \
- _ctrl <<= 1; \
- } while (0)
-
-
- /* ----------
- * pglz_out_tag -
- *
- * Outputs a backward reference tag of 2-4 bytes (depending on
- * offset and length) to the destination buffer including the
- * appropriate control bit.
- * ----------
- */
- #define pglz_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off) \
- do { \
- pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \
- _ctrlb |= _ctrl; \
- _ctrl <<= 1; \
- if (_len > 17) \
- { \
- (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f); \
- (_buf)[1] = (unsigned char)(((_off) & 0xff)); \
- (_buf)[2] = (unsigned char)((_len) - 18); \
- (_buf) += 3; \
- } else { \
- (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | ((_len) - 3)); \
- (_buf)[1] = (unsigned char)((_off) & 0xff); \
- (_buf) += 2; \
- } \
- } while (0)
-
-
/* ----------
* pglz_find_match -
*
--- 299,304 ----
***************
*** 595,601 **** pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
* Create the tag and add history entries for all matched
* characters.
*/
! pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off);
while (match_len--)
{
pglz_hist_add(hist_start, hist_entries,
--- 531,537 ----
* Create the tag and add history entries for all matched
* characters.
*/
! pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off, dp);
while (match_len--)
{
pglz_hist_add(hist_start, hist_entries,
***************
*** 647,661 **** pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
void
pglz_decompress(const PGLZ_Header *source, char *dest)
{
const unsigned char *sp;
const unsigned char *srcend;
unsigned char *dp;
unsigned char *destend;
sp = ((const unsigned char *) source) + sizeof(PGLZ_Header);
! srcend = ((const unsigned char *) source) + VARSIZE(source);
dp = (unsigned char *) dest;
! destend = dp + source->rawsize;
while (sp < srcend && dp < destend)
{
--- 583,620 ----
void
pglz_decompress(const PGLZ_Header *source, char *dest)
{
+ pglz_decompress_with_history((const char *) source, dest, NULL, NULL); /* no destlen output, no history */
+ }
+
+ /* ----------
+ * pglz_decompress_with_history -
+ *
+ * Decompresses source into dest.
+ * To decompress, it uses history if provided.
+ * ----------
+ */
+ void
+ pglz_decompress_with_history(const char *source, char *dest, uint32 *destlen,
+ const char *history)
+ {
+ PGLZ_Header src;
const unsigned char *sp;
const unsigned char *srcend;
unsigned char *dp;
unsigned char *destend;
+ /* To avoid the unaligned access of PGLZ_Header */
+ memcpy((char *) &src, source, sizeof(PGLZ_Header));
+
sp = ((const unsigned char *) source) + sizeof(PGLZ_Header);
! srcend = ((const unsigned char *) source) + VARSIZE(&src);
dp = (unsigned char *) dest;
! destend = dp + src.rawsize;
!
! if (destlen)
! {
! *destlen = src.rawsize;
! }
while (sp < srcend && dp < destend)
{
***************
*** 699,726 **** pglz_decompress(const PGLZ_Header *source, char *dest)
break;
}
! /*
! * Now we copy the bytes specified by the tag from OUTPUT to
! * OUTPUT. It is dangerous and platform dependent to use
! * memcpy() here, because the copied areas could overlap
! * extremely!
! */
! while (len--)
{
! *dp = dp[-off];
! dp++;
}
}
else
{
! /*
! * An unset control bit means LITERAL BYTE. So we just copy
! * one from INPUT to OUTPUT.
! */
! if (dp >= destend) /* check for buffer overrun */
! break; /* do not clobber memory */
!
! *dp++ = *sp++;
}
/*
--- 658,726 ----
break;
}
! if (history)
! {
! /*
! * Now we copy the bytes specified by the tag from history
! * to OUTPUT.
! */
! memcpy(dp, history + off, len);
! dp += len;
! }
! else
{
! /*
! * Now we copy the bytes specified by the tag from OUTPUT
! * to OUTPUT. It is dangerous and platform dependent to
! * use memcpy() here, because the copied areas could
! * overlap extremely!
! */
! while (len--)
! {
! *dp = dp[-off];
! dp++;
! }
}
}
else
{
! if (history)
! {
! /*
! * The byte at current offset in the source is the length
! * of this literal segment. See pglz_out_add for encoding
! * side.
! */
! int32 len;
!
! len = sp[0];
! sp += 1;
!
! if (dp + len > destend)
! {
! dp += len;
! break;
! }
!
! /*
! * Now we copy the bytes specified by the tag from Source
! * to OUTPUT.
! */
! memcpy(dp, sp, len);
! dp += len;
! sp += len;
! }
! else
! {
! /*
! * An unset control bit means LITERAL BYTE. So we just
! * copy one from INPUT to OUTPUT.
! */
! if (dp >= destend) /* check for buffer overrun */
! break; /* do not clobber memory */
!
! *dp++ = *sp++;
! }
}
/*
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 123,128 **** extern int CommitSiblings;
--- 123,129 ----
extern char *default_tablespace;
extern char *temp_tablespaces;
extern bool synchronize_seqscans;
+ extern int wal_update_compression_ratio;
extern int ssl_renegotiation_limit;
extern char *SSLCipherSuites;
***************
*** 2382,2387 **** static struct config_int ConfigureNamesInt[] =
--- 2383,2399 ----
NULL, NULL, NULL
},
+ {
+ /* Not for general use */
+ {"wal_update_compression_ratio", PGC_USERSET, DEVELOPER_OPTIONS,
+ gettext_noop("Sets the compression ratio of delta record for wal update"),
+ NULL,
+ },
+ &wal_update_compression_ratio,
+ 25, 1, 99,
+ NULL, NULL, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
*** a/src/include/access/heapam_xlog.h
--- b/src/include/access/heapam_xlog.h
***************
*** 142,153 **** typedef struct xl_heap_update
{
xl_heaptid target; /* deleted tuple id */
ItemPointerData newtid; /* new inserted tuple id */
! bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */
! bool new_all_visible_cleared; /* same for the page of newtid */
/* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */
} xl_heap_update;
! #define SizeOfHeapUpdate (offsetof(xl_heap_update, new_all_visible_cleared) + sizeof(bool))
/*
* This is what we need to know about vacuum page cleanup/redirect
--- 142,161 ----
{
xl_heaptid target; /* deleted tuple id */
ItemPointerData newtid; /* new inserted tuple id */
! uint8 flags; /* flag bits, see below; one byte, as SizeOfHeapUpdate assumes */
!
/* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */
} xl_heap_update;
!
! #define XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED 0x01 /* old page's all-visible
! bit was cleared */
! #define XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED 0x02 /* new page's all-visible
! bit was cleared */
! #define XL_HEAP_UPDATE_DELTA_ENCODED 0x04 /* the update record is
! delta encoded */
!
! #define SizeOfHeapUpdate (offsetof(xl_heap_update, flags) + sizeof(uint8))
/*
* This is what we need to know about vacuum page cleanup/redirect
*** a/src/include/access/htup_details.h
--- b/src/include/access/htup_details.h
***************
*** 18,23 ****
--- 18,24 ----
#include "access/tupdesc.h"
#include "access/tupmacs.h"
#include "storage/bufpage.h"
+ #include "utils/pg_lzcompress.h"
/*
* MaxTupleAttributeNumber limits the number of (user) columns in a tuple.
***************
*** 528,533 **** struct MinimalTupleData
--- 529,535 ----
HeapTupleHeaderSetOid((tuple)->t_data, (oid))
+ #if !defined(DISABLE_COMPLEX_MACRO)
/* ----------------
* fastgetattr
*
***************
*** 542,550 **** struct MinimalTupleData
* lookups, and call nocachegetattr() for the rest.
* ----------------
*/
-
- #if !defined(DISABLE_COMPLEX_MACRO)
-
#define fastgetattr(tup, attnum, tupleDesc, isnull) \
( \
AssertMacro((attnum) > 0), \
--- 544,549 ----
***************
*** 572,585 **** struct MinimalTupleData
nocachegetattr((tup), (attnum), (tupleDesc)) \
) \
) \
)
- #else /* defined(DISABLE_COMPLEX_MACRO) */
extern Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
bool *isnull);
#endif /* defined(DISABLE_COMPLEX_MACRO) */
-
/* ----------------
* heap_getattr
*
--- 571,626 ----
nocachegetattr((tup), (attnum), (tupleDesc)) \
) \
) \
+ ) \
+
+ /* ----------------
+ * fastgetattr_with_len
+ *
+ * Like fastgetattr, but also stores the attribute's length in *len (0 when the attribute is null).
+ * NOTE(review): this macro passes len to nocachegetattr_with_len (Size *), while the non-macro prototype declares int32 * -- confirm.
+ * ----------------
+ */
+ #define fastgetattr_with_len(tup, attnum, tupleDesc, isnull, len) \
+ ( \
+ AssertMacro((attnum) > 0), \
+ (*(isnull) = false), \
+ HeapTupleNoNulls(tup) ? \
+ ( \
+ (tupleDesc)->attrs[(attnum)-1]->attcacheoff >= 0 ? \
+ ( \
+ (*(len) = att_getlength( \
+ (tupleDesc)->attrs[(attnum)-1]->attlen, \
+ (char *) (tup)->t_data + (tup)->t_data->t_hoff +\
+ (tupleDesc)->attrs[(attnum)-1]->attcacheoff)), \
+ fetchatt((tupleDesc)->attrs[(attnum)-1], \
+ (char *) (tup)->t_data + (tup)->t_data->t_hoff + \
+ (tupleDesc)->attrs[(attnum)-1]->attcacheoff) \
+ ) \
+ : \
+ nocachegetattr_with_len((tup), (attnum), (tupleDesc), (len))\
+ ) \
+ : \
+ ( \
+ att_isnull((attnum)-1, (tup)->t_data->t_bits) ? \
+ ( \
+ (*(isnull) = true), \
+ (*(len) = 0), \
+ (Datum)NULL \
+ ) \
+ : \
+ ( \
+ nocachegetattr_with_len((tup), (attnum), (tupleDesc), (len))\
+ ) \
+ ) \
)
+ #else /* defined(DISABLE_COMPLEX_MACRO) */
extern Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
bool *isnull);
+ extern Datum fastgetattr_with_len(HeapTuple tup, int attnum,
+ TupleDesc tupleDesc, bool *isnull, int32 *len);
#endif /* defined(DISABLE_COMPLEX_MACRO) */
/* ----------------
* heap_getattr
*
***************
*** 596,616 **** extern Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
* ----------------
*/
#define heap_getattr(tup, attnum, tupleDesc, isnull) \
( \
! ((attnum) > 0) ? \
( \
! ((attnum) > (int) HeapTupleHeaderGetNatts((tup)->t_data)) ? \
! ( \
! (*(isnull) = true), \
! (Datum)NULL \
! ) \
! : \
! fastgetattr((tup), (attnum), (tupleDesc), (isnull)) \
) \
: \
! heap_getsysattr((tup), (attnum), (tupleDesc), (isnull)) \
! )
/* prototypes for functions in common/heaptuple.c */
extern Size heap_compute_data_size(TupleDesc tupleDesc,
--- 637,679 ----
* ----------------
*/
#define heap_getattr(tup, attnum, tupleDesc, isnull) \
+ ( \
+ ((attnum) > 0) ? \
( \
! ((attnum) > (int) HeapTupleHeaderGetNatts((tup)->t_data)) ? \
( \
! (*(isnull) = true), \
! (Datum)NULL \
) \
: \
! fastgetattr((tup), (attnum), (tupleDesc), (isnull)) \
! ) \
! : \
! heap_getsysattr((tup), (attnum), (tupleDesc), (isnull)) \
! )
+ /* ----------------
+ * heap_getattr_with_len
+ *
+ * Like heap_getattr, but also stores the value's length in *len (0 past natts; not set for system attributes, attnum <= 0).
+ * ----------------
+ */
+ #define heap_getattr_with_len(tup, attnum, tupleDesc, isnull, len) \
+ ( \
+ ((attnum) > 0) ? \
+ ( \
+ ((attnum) > (int) HeapTupleHeaderGetNatts((tup)->t_data)) ? \
+ ( \
+ (*(isnull) = true), \
+ (*(len) = 0), \
+ (Datum)NULL \
+ ) \
+ : \
+ fastgetattr_with_len((tup), (attnum), (tupleDesc), (isnull), (len)) \
+ ) \
+ : \
+ heap_getsysattr((tup), (attnum), (tupleDesc), (isnull)) \
+ )
/* prototypes for functions in common/heaptuple.c */
extern Size heap_compute_data_size(TupleDesc tupleDesc,
***************
*** 620,625 **** extern void heap_fill_tuple(TupleDesc tupleDesc,
--- 683,690 ----
char *data, Size data_size,
uint16 *infomask, bits8 *bit);
extern bool heap_attisnull(HeapTuple tup, int attnum);
+ extern Datum nocachegetattr_with_len(HeapTuple tup, int attnum,
+ TupleDesc att, Size *len);
extern Datum nocachegetattr(HeapTuple tup, int attnum,
TupleDesc att);
extern Datum heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
***************
*** 636,641 **** extern HeapTuple heap_modify_tuple(HeapTuple tuple,
--- 701,714 ----
extern void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc,
Datum *values, bool *isnull);
+ extern bool heap_attr_get_length_and_check_equals(TupleDesc tupdesc,
+ int attrnum, HeapTuple tup1, HeapTuple tup2,
+ Size *tup1_attr_len, Size *tup2_attr_len);
+ extern bool heap_delta_encode(TupleDesc tupleDesc, HeapTuple oldtup,
+ HeapTuple newtup, PGLZ_Header *encdata);
+ extern void heap_delta_decode (PGLZ_Header *encdata, HeapTuple oldtup,
+ HeapTuple newtup);
+
/* these three are deprecated versions of the three above: */
extern HeapTuple heap_formtuple(TupleDesc tupleDescriptor,
Datum *values, char *nulls);
*** a/src/include/access/tupmacs.h
--- b/src/include/access/tupmacs.h
***************
*** 187,192 ****
--- 187,214 ----
)
/*
+ * att_getlength -
+ * Yields the length of the attribute value stored at attptr.
+ * attlen is expanded more than once, so it should be free of side
+ * effects.
+ */
+ #define att_getlength(attlen, attptr) \
+ ( \
+ /* a positive attlen is itself the length */ \
+ ((attlen) > 0) ? (attlen) : \
+ ( \
+ /* attlen -1: the length comes from VARSIZE_ANY */ \
+ ((attlen) == -1) ? VARSIZE_ANY(attptr) : \
+ ( \
+ /* only attlen -2 remains: string length plus terminator */ \
+ AssertMacro((attlen) == -2), \
+ (strlen((char *) (attptr)) + 1) \
+ ) \
+ ) \
+ )
+
+
+ /*
* store_att_byval is a partial inverse of fetch_att: store a given Datum
* value into a tuple data area at the specified address. However, it only
* handles the byval case, because in typical usage the caller needs to
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
***************
*** 261,266 **** typedef struct CheckpointStatsData
--- 261,267 ----
extern CheckpointStatsData CheckpointStats;
extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
+ extern bool XLogCheckBufferNeedsBackup(Buffer buffer);
extern void XLogFlush(XLogRecPtr RecPtr);
extern bool XLogBackgroundFlush(void);
extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
*** a/src/include/utils/pg_lzcompress.h
--- b/src/include/utils/pg_lzcompress.h
***************
*** 23,28 **** typedef struct PGLZ_Header
--- 23,31 ----
int32 rawsize;
} PGLZ_Header;
+ /* LZ algorithm can hold only history offset in the range of 1 - 4095. */
+ #define PGLZ_HISTORY_SIZE 4096
+ #define PGLZ_MAX_MATCH 273
/* ----------
* PGLZ_MAX_OUTPUT -
***************
*** 86,91 **** typedef struct PGLZ_Strategy
--- 89,207 ----
int32 match_size_drop;
} PGLZ_Strategy;
+ /*
+ * Approximate number of bytes needed to encode a history reference of _len bytes: 3 when _len < 17, else 4 * (1 + _len / PGLZ_MAX_MATCH).
+ * NOTE(review): pglz_out_tag keeps the 2-byte tag up to length 17, so _len == 17 is over-estimated here -- confirm "< 17" is intended.
+ */
+ #define PGLZ_GET_HIST_CTRL_BIT_LEN(_len) \
+ ( \
+ ((_len) < 17) ? (3) : (4 * (1 + ((_len) / PGLZ_MAX_MATCH))) \
+ )
+
+ /* ----------
+ * pglz_out_ctrl -
+ *
+ * Once the bit mask __ctrl has left the low 8 bits, stores __ctrlb through __ctrlp and takes the next byte of __buf as the new control byte.
+ * ----------
+ */
+ #define pglz_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) \
+ do { \
+ if ((__ctrl & 0xff) == 0) \
+ { \
+ *(__ctrlp) = __ctrlb; \
+ __ctrlp = (__buf)++; \
+ __ctrlb = 0; \
+ __ctrl = 1; \
+ } \
+ } while (0)
+
+ /* ----------
+ * pglz_out_literal -
+ *
+ * Outputs one literal byte to the destination buffer; its control bit
+ * stays unset and the bit mask moves on to the next position.
+ * ----------
+ */
+ #define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) \
+ do { \
+ pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \
+ *(_buf)++ = (unsigned char)(_byte); \
+ _ctrl <<= 1; \
+ } while (0)
+
+ /* ----------
+ * pglz_out_tag -
+ *
+ * Outputs a backward/history reference tag of 2-3 bytes (3 bytes when
+ * the chunk is longer than 17) to the destination buffer, setting the
+ * corresponding control bit.
+ *
+ * A match longer than PGLZ_MAX_MATCH is emitted as several tags, each
+ * covering at most PGLZ_MAX_MATCH bytes; _off is advanced past every
+ * chunk, so it must be a modifiable lvalue and is changed on return.
+ *
+ * A remaining chunk shorter than 3 bytes is not emitted as a tag; its
+ * bytes are read from _byte + _off and output as new data through
+ * pglz_out_add instead. This occurs only while framing EWT.
+ * ----------
+ */
+ #define pglz_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off,_byte) \
+ do { \
+ int _mtaglen; \
+ int _tagtotal_len = (_len); \
+ while (_tagtotal_len > 0) \
+ { \
+ _mtaglen = _tagtotal_len > PGLZ_MAX_MATCH ? PGLZ_MAX_MATCH : _tagtotal_len; \
+ if (_mtaglen < 3) \
+ { \
+ char *_data = (char *)(_byte) + (_off); \
+ pglz_out_add(_ctrlp,_ctrlb,_ctrl,_buf,_mtaglen,_data); \
+ break; \
+ } \
+ pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \
+ _ctrlb |= _ctrl; \
+ _ctrl <<= 1; \
+ if (_mtaglen > 17) \
+ { \
+ (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f); \
+ (_buf)[1] = (unsigned char)(((_off) & 0xff)); \
+ (_buf)[2] = (unsigned char)((_mtaglen) - 18); \
+ (_buf) += 3; \
+ } else { \
+ (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | ((_mtaglen) - 3)); \
+ (_buf)[1] = (unsigned char)((_off) & 0xff); \
+ (_buf) += 2; \
+ } \
+ _tagtotal_len -= _mtaglen; \
+ (_off) += _mtaglen; \
+ } \
+ } while (0)
+
+ /* ----------
+ * pglz_out_add -
+ *
+ * Outputs new data as chunks of at most 255 bytes, each preceded by a 1-byte
+ * length, leaving the control bit unset. _byte is advanced past the copied data.
+ * ----------
+ */
+ #define pglz_out_add(_ctrlp,_ctrlb,_ctrl,_buf,_len,_byte) \
+ do { \
+ int32 _maddlen; \
+ int32 _addtotal_len = (_len); \
+ while (_addtotal_len > 0) \
+ { \
+ _maddlen = _addtotal_len > 255 ? 255 : _addtotal_len; \
+ pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \
+ _ctrl <<= 1; \
+ (_buf)[0] = (unsigned char)(_maddlen); \
+ (_buf) += 1; \
+ memcpy((_buf), (_byte), _maddlen); \
+ (_buf) += _maddlen; \
+ (_byte) += _maddlen; \
+ _addtotal_len -= _maddlen; \
+ } \
+ } while (0)
+
/* ----------
* The standard strategies
***************
*** 108,112 **** extern const PGLZ_Strategy *const PGLZ_strategy_always;
extern bool pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
const PGLZ_Strategy *strategy);
extern void pglz_decompress(const PGLZ_Header *source, char *dest);
!
#endif /* _PG_LZCOMPRESS_H_ */
--- 224,229 ----
extern bool pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
const PGLZ_Strategy *strategy);
extern void pglz_decompress(const PGLZ_Header *source, char *dest);
! extern void pglz_decompress_with_history(const char *source, char *dest,
! uint32 *destlen, const char *history);
#endif /* _PG_LZCOMPRESS_H_ */
*** a/src/test/regress/expected/update.out
--- b/src/test/regress/expected/update.out
***************
*** 97,99 **** SELECT a, b, char_length(c) FROM update_test;
--- 97,169 ----
(2 rows)
DROP TABLE update_test;
+ --
+ -- Test to update continuos and non continuos columns
+ --
+ DROP TABLE IF EXISTS update_test;
+ NOTICE: table "update_test" does not exist, skipping
+ CREATE TABLE update_test (
+ bser bigserial,
+ bln boolean,
+ ename VARCHAR(25),
+ perf_f float(8),
+ grade CHAR,
+ dept CHAR(5) NOT NULL,
+ dob DATE,
+ idnum INT,
+ addr VARCHAR(30) NOT NULL,
+ destn CHAR(6),
+ Gend CHAR,
+ samba BIGINT,
+ hgt float,
+ ctime TIME
+ );
+ INSERT INTO update_test VALUES (
+ nextval('update_test_bser_seq'::regclass),
+ TRUE,
+ 'Test',
+ 7.169,
+ 'B',
+ 'CSD',
+ '2000-01-01',
+ 520,
+ 'road2,
+ streeeeet2,
+ city2',
+ 'dcy2',
+ 'M',
+ 12000,
+ 50.4,
+ '00:00:00.0'
+ );
+ SELECT * from update_test;
+ bser | bln | ename | perf_f | grade | dept | dob | idnum | addr | destn | gend | samba | hgt | ctime
+ ------+-----+-------+--------+-------+-------+------------+-------+-----------------------------+--------+------+-------+------+----------
+ 1 | t | Test | 7.169 | B | CSD | 01-01-2000 | 520 | road2, +| dcy2 | M | 12000 | 50.4 | 00:00:00
+ | | | | | | | | streeeeet2,+| | | | |
+ | | | | | | | | city2 | | | | |
+ (1 row)
+
+ -- update first column
+ UPDATE update_test SET bser = bser - 1 + 1;
+ -- update middle column
+ UPDATE update_test SET perf_f = 8.9;
+ -- update last column
+ UPDATE update_test SET ctime = '00:00:00.1';
+ -- update 3 continuos columns
+ UPDATE update_test SET destn = 'dcy2', samba = 0 WHERE Gend = 'M' and dept = 'CSD';
+ -- update two non continuos columns
+ UPDATE update_test SET destn = 'moved', samba = 0;
+ UPDATE update_test SET bln = FALSE, hgt = 10.1;
+ -- update causing some column alignment difference
+ UPDATE update_test SET ename = 'Tes';
+ UPDATE update_test SET dept = 'Test';
+ SELECT * from update_test;
+ bser | bln | ename | perf_f | grade | dept | dob | idnum | addr | destn | gend | samba | hgt | ctime
+ ------+-----+-------+--------+-------+-------+------------+-------+-----------------------------+--------+------+-------+------+------------
+ 1 | f | Tes | 8.9 | B | Test | 01-01-2000 | 520 | road2, +| moved | M | 0 | 10.1 | 00:00:00.1
+ | | | | | | | | streeeeet2,+| | | | |
+ | | | | | | | | city2 | | | | |
+ (1 row)
+
+ DROP TABLE update_test;
*** a/src/test/regress/sql/update.sql
--- b/src/test/regress/sql/update.sql
***************
*** 59,61 **** UPDATE update_test SET c = repeat('x', 10000) WHERE c = 'car';
--- 59,128 ----
SELECT a, b, char_length(c) FROM update_test;
DROP TABLE update_test;
+
+
+ --
+ -- Test to update continuos and non continuos columns
+ --
+
+ DROP TABLE IF EXISTS update_test;
+ CREATE TABLE update_test (
+ bser bigserial,
+ bln boolean,
+ ename VARCHAR(25),
+ perf_f float(8),
+ grade CHAR,
+ dept CHAR(5) NOT NULL,
+ dob DATE,
+ idnum INT,
+ addr VARCHAR(30) NOT NULL,
+ destn CHAR(6),
+ Gend CHAR,
+ samba BIGINT,
+ hgt float,
+ ctime TIME
+ );
+
+ INSERT INTO update_test VALUES (
+ nextval('update_test_bser_seq'::regclass),
+ TRUE,
+ 'Test',
+ 7.169,
+ 'B',
+ 'CSD',
+ '2000-01-01',
+ 520,
+ 'road2,
+ streeeeet2,
+ city2',
+ 'dcy2',
+ 'M',
+ 12000,
+ 50.4,
+ '00:00:00.0'
+ );
+
+ SELECT * from update_test;
+
+ -- update first column
+ UPDATE update_test SET bser = bser - 1 + 1;
+
+ -- update middle column
+ UPDATE update_test SET perf_f = 8.9;
+
+ -- update last column
+ UPDATE update_test SET ctime = '00:00:00.1';
+
+ -- update 3 continuos columns
+ UPDATE update_test SET destn = 'dcy2', samba = 0 WHERE Gend = 'M' and dept = 'CSD';
+
+ -- update two non continuos columns
+ UPDATE update_test SET destn = 'moved', samba = 0;
+ UPDATE update_test SET bln = FALSE, hgt = 10.1;
+
+ -- update causing some column alignment difference
+ UPDATE update_test SET ename = 'Tes';
+ UPDATE update_test SET dept = 'Test';
+
+ SELECT * from update_test;
+ DROP TABLE update_test;