*** a/src/backend/access/common/heaptuple.c --- b/src/backend/access/common/heaptuple.c *************** *** 60,66 **** --- 60,69 ---- #include "access/sysattr.h" #include "access/tuptoaster.h" #include "executor/tuptable.h" + #include "utils/datum.h" + /* guc variable for EWT compression ratio*/ + int wal_update_compression_ratio = 25; /* Does att's datatype allow packing into the 1-byte-header varlena format? */ #define ATT_IS_PACKABLE(att) \ *************** *** 297,308 **** heap_attisnull(HeapTuple tup, int attnum) } /* ---------------- ! * nocachegetattr * ! * This only gets called from fastgetattr() macro, in cases where * we can't use a cacheoffset and the value is not null. * ! * This caches attribute offsets in the attribute descriptor. * * An alternative way to speed things up would be to cache offsets * with the tuple, but that seems more difficult unless you take --- 300,312 ---- } /* ---------------- ! * nocachegetattr_with_len * ! * This only gets called in cases where * we can't use a cacheoffset and the value is not null. * ! * This caches attribute offsets in the attribute descriptor and ! * outputs the length of the attribute value. * * An alternative way to speed things up would be to cache offsets * with the tuple, but that seems more difficult unless you take *************** *** 320,328 **** heap_attisnull(HeapTuple tup, int attnum) * ---------------- */ Datum ! nocachegetattr(HeapTuple tuple, ! int attnum, ! TupleDesc tupleDesc) { HeapTupleHeader tup = tuple->t_data; Form_pg_attribute *att = tupleDesc->attrs; --- 324,333 ---- * ---------------- */ Datum ! nocachegetattr_with_len(HeapTuple tuple, ! int attnum, ! TupleDesc tupleDesc, ! 
Size *len) { HeapTupleHeader tup = tuple->t_data; Form_pg_attribute *att = tupleDesc->attrs; *************** *** 381,386 **** nocachegetattr(HeapTuple tuple, --- 386,394 ---- */ if (att[attnum]->attcacheoff >= 0) { + if (len) + *len = att_getlength(att[attnum]->attlen, + tp + att[attnum]->attcacheoff); return fetchatt(att[attnum], tp + att[attnum]->attcacheoff); } *************** *** 507,515 **** nocachegetattr(HeapTuple tuple, --- 515,536 ---- } } + if (len) + *len = att_getlength(att[attnum]->attlen, tp + off); return fetchatt(att[attnum], tp + off); } + /* + * nocachegetattr + */ + Datum + nocachegetattr(HeapTuple tuple, + int attnum, + TupleDesc tupleDesc) + { + return nocachegetattr_with_len(tuple, attnum, tupleDesc, NULL); + } + /* ---------------- * heap_getsysattr * *************** *** 617,622 **** heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest) --- 638,1061 ---- memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len); } + /* ---------------- + * heap_attr_get_length_and_check_equals + * + * returns the result of comparison of specified attribute's value for + * input tuples. + * outputs the length of specified attribute's value for + * input tuples. + * ---------------- + */ + bool + heap_attr_get_length_and_check_equals(TupleDesc tupdesc, int attrnum, + HeapTuple tup1, HeapTuple tup2, + Size *tup1_attr_len, Size *tup2_attr_len) + { + Datum value1, + value2; + bool isnull1, + isnull2; + Form_pg_attribute att; + + *tup1_attr_len = 0; + *tup2_attr_len = 0; + + /* + * If it's a whole-tuple reference, say "not equal". It's not really + * worth supporting this case, since it could only succeed after a no-op + * update, which is hardly a case worth optimizing for. + */ + if (attrnum == 0) + return false; + + /* + * Likewise, automatically say "not equal" for any system attribute other + * than OID and tableOID; we cannot expect these to be consistent in a HOT + * chain, or even to be set correctly yet in the new tuple. 
+ */ + if (attrnum < 0) + { + if (attrnum != ObjectIdAttributeNumber && + attrnum != TableOidAttributeNumber) + return false; + } + + /* + * Extract the corresponding values and length of values. XXX this is + * pretty inefficient if there are many indexed columns. Should + * HeapSatisfiesHOTUpdate do a single heap_deform_tuple call on each + * tuple, instead? But that doesn't work for system columns ... + */ + value1 = heap_getattr_with_len(tup1, attrnum, tupdesc, &isnull1, tup1_attr_len); + value2 = heap_getattr_with_len(tup2, attrnum, tupdesc, &isnull2, tup2_attr_len); + + /* + * If one value is NULL and other is not, then they are certainly not + * equal + */ + if (isnull1 != isnull2) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (isnull1) + return true; + + /* + * We do simple binary comparison of the two datums. This may be overly + * strict because there can be multiple binary representations for the + * same logical value. But we should be OK as long as there are no false + * positives. Using a type-specific equality operator is messy because + * there could be multiple notions of equality in different operator + * classes; furthermore, we cannot safely invoke user-defined functions + * while holding exclusive buffer lock. + */ + if (attrnum <= 0) + { + /* The only allowed system columns are OIDs, so do this */ + return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); + } + else + { + Assert(attrnum <= tupdesc->natts); + att = tupdesc->attrs[attrnum - 1]; + return datumIsEqual(value1, value2, att->attbyval, att->attlen); + } + } + + /* ---------------- + * heap_delta_encode + * + * Construct a delta Encoded WAL Tuple (EWT) by comparing old and new + * tuple versions w.r.t column boundaries. + * + * Encoded WAL Tuple Format: + * Header + Control byte + history reference (2 - 3)bytes + * + New data (1 byte length + variable data)+ ... 
+ * + * Encode Mechanism: + * + * Copy the bitmap data from new tuple to the EWT (Encoded WAL Tuple) and + * loop for all attributes to find any modifications in the attributes. + * The unmodified data is encoded as a History Reference in EWT and + * the modified data (if NOT NULL) is encoded as New Data in EWT. + * + * The offset values are calculated with respect to the tuple t_hoff + * value. For each column attribute old and new tuple offsets + * are recalculated based on padding in the tuples. + * Once the alignment difference is found between old and new tuple + * versions, then include alignment difference as New Data in EWT. + * + * Max encoded data length is 75% (default compression rate) + * of original data; if encoded output data length is greater than + * that, original tuple (new tuple version) will be directly stored in + * WAL Tuple. + * + * + * History Reference: + * If any column is modified then the unmodified columns data till the + * modified column needs to be copied to EWT as a Tag. + * + * + * New data (modified data): + * First byte represents the length [0-255] of the modified data, + * followed by the modified data of corresponding length. 
+ * + * For more details about Encoded WAL Tuple (EWT) representation, + * refer transam\README + * ---------------- + */ + bool + heap_delta_encode(TupleDesc tupleDesc, HeapTuple oldtup, HeapTuple newtup, + PGLZ_Header *encdata) + { + Form_pg_attribute *att = tupleDesc->attrs; + int numberOfAttributes; + int32 new_tup_off = 0, + old_tup_off = 0, + temp_off = 0, + match_off = 0, + change_off = 0; + int attnum; + int32 data_len, + old_tup_pad_len, + new_tup_pad_len; + Size old_tup_attr_len, + new_tup_attr_len; + bool is_attr_equals = true; + unsigned char *bp = (unsigned char *) encdata + sizeof(PGLZ_Header); + unsigned char *bstart = bp; + char *dp = (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits); + char *dstart = dp; + char *history; + unsigned char ctrl_dummy = 0; + unsigned char *ctrlp = &ctrl_dummy; + unsigned char ctrlb = 0; + unsigned char ctrl = 0; + int32 len, + old_tup_bitmaplen, + new_tup_bitmaplen, + old_tup_len, + new_tup_len; + int32 result_size; + int32 result_max; + + old_tup_len = oldtup->t_len - offsetof(HeapTupleHeaderData, t_bits); + + /* + * Tuples of length greater than PGLZ_HISTORY_SIZE are not allowed for + * delta encode as this is the maximum size of history offset. 
+ */ + if (old_tup_len >= PGLZ_HISTORY_SIZE) + return false; + + history = (char *) oldtup->t_data + offsetof(HeapTupleHeaderData, t_bits); + old_tup_bitmaplen = oldtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits); + new_tup_bitmaplen = newtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits); + new_tup_len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits); + + /* + * If length of old and new tuple versions vary by more than 50%, include + * new as-is + */ + if ((new_tup_len <= (old_tup_len >> 1)) + || (old_tup_len <= (new_tup_len >> 1))) + return false; + + /* Required compression ratio for EWT */ + result_max = (new_tup_len * (100 - wal_update_compression_ratio)) / 100; + encdata->rawsize = new_tup_len; + + /* + * Advance the EWT by adding the approximate length of the operation for + * new data as [1 control byte + 1 length byte + data_length] and validate + * it with result_max. The same length approximation is used in the + * function for New data. + */ + if ((bp + (2 + new_tup_bitmaplen)) - bstart >= result_max) + return false; + + /* Copy the bitmap data from new tuple to EWT */ + pglz_out_add(ctrlp, ctrlb, ctrl, bp, new_tup_bitmaplen, dp); + dstart = dp; + + /* + * Loop through all attributes, if the attribute is modified by the update + * operation, store the [Offset,Length] reffering old tuple version till + * the last unchanged column in the EWT as History Reference, else store + * the [Length,Data] from new tuple version as New Data. 
+ */ + numberOfAttributes = HeapTupleHeaderGetNatts(newtup->t_data); + for (attnum = 1; attnum <= numberOfAttributes; attnum++) + { + if (!heap_attr_get_length_and_check_equals(tupleDesc, attnum, oldtup, + newtup, &old_tup_attr_len, &new_tup_attr_len)) + { + is_attr_equals = false; + data_len = old_tup_off - match_off; + + len = PGLZ_GET_HIST_CTRL_BIT_LEN(data_len); + if ((bp + len) - bstart >= result_max) + return false; + + /* + * The match_off value is calculated w.r.t to the tuple t_hoff + * value, the bit map len needs to be added to match_off to get + * the actual start offset from the old/history tuple. + */ + match_off += old_tup_bitmaplen; + + /* + * If any unchanged data presents in the old and new tuples then + * encode the data as it needs to copy from history tuple with len + * and offset. + */ + pglz_out_tag(ctrlp, ctrlb, ctrl, bp, data_len, match_off, history); + + /* + * Recalculate the old and new tuple offsets based on padding in + * the tuples + */ + if (!HeapTupleHasNulls(oldtup) + || !att_isnull((attnum - 1), oldtup->t_data->t_bits)) + { + old_tup_off = att_align_pointer(old_tup_off, + att[attnum - 1]->attalign, + att[attnum - 1]->attlen, + (char *) oldtup->t_data + oldtup->t_data->t_hoff + old_tup_off); + } + + if (!HeapTupleHasNulls(newtup) + || !att_isnull((attnum - 1), newtup->t_data->t_bits)) + { + new_tup_off = att_align_pointer(new_tup_off, + att[attnum - 1]->attalign, + att[attnum - 1]->attlen, + (char *) newtup->t_data + newtup->t_data->t_hoff + new_tup_off); + } + + old_tup_off += old_tup_attr_len; + new_tup_off += new_tup_attr_len; + + match_off = old_tup_off; + } + else + { + data_len = new_tup_off - change_off; + if ((bp + (2 + data_len)) - bstart >= result_max) + return false; + + /* Add the modified column data to the EWT */ + pglz_out_add(ctrlp, ctrlb, ctrl, bp, data_len, dp); + + /* + * Calculate the alignment for old and new tuple versions for this + * attribute, if the alignment is same, then we continue for next + * 
attribute else 1. stores the [Offset,Length] reffering old + * tuple version for previous attribute (if previous attr is same + * in old and new tuple versions) in the EWT as History Reference, + * 2. add the [Length,Data] for alignment from new tuple as New + * Data in EWT. + */ + if (!HeapTupleHasNulls(oldtup) + || !att_isnull((attnum - 1), oldtup->t_data->t_bits)) + { + temp_off = old_tup_off; + old_tup_off = att_align_pointer(old_tup_off, + att[attnum - 1]->attalign, + att[attnum - 1]->attlen, + (char *) oldtup->t_data + oldtup->t_data->t_hoff + old_tup_off); + + old_tup_pad_len = old_tup_off - temp_off; + + + temp_off = new_tup_off; + new_tup_off = att_align_pointer(new_tup_off, + att[attnum - 1]->attalign, + att[attnum - 1]->attlen, + (char *) newtup->t_data + newtup->t_data->t_hoff + new_tup_off); + new_tup_pad_len = new_tup_off - temp_off; + + if (old_tup_pad_len != new_tup_pad_len) + { + /* + * If the alignment difference is found between old and + * new tuples and previous attribute value of the old and + * new tuple versions is same then store until the current + * match as history reference Tag in EWT. + */ + if (is_attr_equals) + { + data_len = old_tup_off - old_tup_pad_len - match_off; + len = PGLZ_GET_HIST_CTRL_BIT_LEN(data_len); + if ((bp + len) - bstart >= result_max) + return false; + + match_off += old_tup_bitmaplen; + pglz_out_tag(ctrlp, ctrlb, ctrl, bp, data_len, match_off, history); + } + + match_off = old_tup_off; + + /* Alignment data */ + if ((bp + (2 + new_tup_pad_len)) - bstart >= result_max) + return false; + + pglz_out_add(ctrlp, ctrlb, ctrl, bp, new_tup_pad_len, dp); + } + } + + old_tup_off += old_tup_attr_len; + new_tup_off += new_tup_attr_len; + + change_off = new_tup_off; + + /* + * Recalculate the destination pointer with the new offset which + * is used while copying the modified data. + */ + dp = dstart + new_tup_off; + is_attr_equals = true; + } + } + + /* If any modified column data presents then add it in EWT. 
*/ + data_len = new_tup_off - change_off; + if ((bp + (2 + data_len)) - bstart >= result_max) + return false; + + pglz_out_add(ctrlp, ctrlb, ctrl, bp, data_len, dp); + + /* + * If any left out old tuple data is present then copy it as history + * reference + */ + data_len = old_tup_off - match_off; + len = PGLZ_GET_HIST_CTRL_BIT_LEN(data_len); + if ((bp + len) - bstart >= result_max) + return false; + + match_off += old_tup_bitmaplen; + pglz_out_tag(ctrlp, ctrlb, ctrl, bp, data_len, match_off, history); + + /* + * Write out the last control byte and check that we haven't overrun the + * output size allowed by the strategy. + */ + *ctrlp = ctrlb; + + result_size = bp - bstart; + if (result_size >= result_max) + return false; + + /* Fill in the actual length of the compressed datum */ + SET_VARSIZE_COMPRESSED(encdata, result_size + sizeof(PGLZ_Header)); + return true; + } + + /* ---------------- + * heap_delta_decode + * + * Decode a tuple using delta-encoded WAL tuple and old tuple version + * + * Encoded WAL Tuple Format: + * Header + Control byte + history reference (2 - 3)bytes + * + New data (1 byte length + variable data)+ ... + * + * + * Decode Mechanism: + * Skip header and Read one control byte and process the next 8 items (or as many as + * remain in the compressed input). + * Check each control bit, if the bit is set then it is History Reference which + * means the next 2 - 3 byte tag provides the offset and length of history match. + * Use the offset and corresponding length to copy data from old tuple version + * to new tuple. + * If the control bit is unset, then it is New Data Reference which means + * first byte contains the length [0-255] of the modified data, followed + * by the modified data of corresponding length specified in the first byte. + * + * Tag in History Reference: + * 2-3 byte tag - + * 2 byte tag is used when length of History data (unchanged data from old tuple version) is less than 18. 
+ * 3 byte tag is used when length of History data (unchanged data from old tuple version) is greater than or + * equal to 18. + * The maximum length that can be represented by one Tag is 273. + * + * For more details about Encoded WAL Tuple (EWT) representation, refer to transam/README + * + * ---------------- + */ + void + heap_delta_decode(PGLZ_Header *encdata, HeapTuple oldtup, HeapTuple newtup) + { + pglz_decompress_with_history((char *) encdata, + (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits), + &newtup->t_len, + (char *) oldtup->t_data + offsetof(HeapTupleHeaderData, t_bits)); + } + /* * heap_form_tuple * construct a tuple from the given values[] and isnull[] arrays, *** a/src/backend/access/heap/heapam.c --- b/src/backend/access/heap/heapam.c *************** *** 85,90 **** static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, --- 85,91 ---- TransactionId xid, CommandId cid, int options); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, Buffer newbuf, HeapTuple newtup, + HeapTuple oldtup, bool all_visible_cleared, bool new_all_visible_cleared); static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, HeapTuple oldtup, HeapTuple newtup); *************** *** 857,862 **** heapgettup_pagemode(HeapScanDesc scan, --- 858,911 ---- * definition in access/htup.h is maintained. */ Datum + fastgetattr_with_len(HeapTuple tup, int attnum, TupleDesc tupleDesc, + bool *isnull, int32 *len) + { + return ( + (attnum) > 0 ? + ( + (*(isnull) = false), + HeapTupleNoNulls(tup) ? + ( + (tupleDesc)->attrs[(attnum) - 1]->attcacheoff >= 0 ? 
+ ( + (*(len) = att_getlength((tupleDesc)->attrs[(attnum - 1)]->attlen, + (char *) (tup)->t_data + (tup)->t_data->t_hoff + + (tupleDesc)->attrs[(attnum) - 1]->attcacheoff)), + fetchatt((tupleDesc)->attrs[(attnum) - 1], + (char *) (tup)->t_data + (tup)->t_data->t_hoff + + (tupleDesc)->attrs[(attnum) - 1]->attcacheoff) + ) + : + ( + nocachegetattr_with_len((tup), (attnum), (tupleDesc), (len))) + ) + : + ( + att_isnull((attnum) - 1, (tup)->t_data->t_bits) ? + ( + (*(isnull) = true), + (*(len) = 0), + (Datum) NULL + ) + : + ( + nocachegetattr_with_len((tup), (attnum), (tupleDesc), (len)) + ) + ) + ) + : + ( + (Datum) NULL + ) + ); + } + + /* + * This is formatted so oddly so that the correspondence to the macro + * definition in access/htup.h is maintained. + */ + Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) { *************** *** 873,879 **** fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, (tupleDesc)->attrs[(attnum) - 1]->attcacheoff) ) : ! nocachegetattr((tup), (attnum), (tupleDesc)) ) : ( --- 922,929 ---- (tupleDesc)->attrs[(attnum) - 1]->attcacheoff) ) : ! ( ! nocachegetattr((tup), (attnum), (tupleDesc))) ) : ( *************** *** 3229,3238 **** l2: /* XLOG stuff */ if (RelationNeedsWAL(relation)) { ! XLogRecPtr recptr = log_heap_update(relation, buffer, oldtup.t_self, ! newbuf, heaptup, ! all_visible_cleared, ! all_visible_cleared_new); if (newbuf != buffer) { --- 3279,3290 ---- /* XLOG stuff */ if (RelationNeedsWAL(relation)) { ! XLogRecPtr recptr; ! ! recptr = log_heap_update(relation, buffer, oldtup.t_self, ! newbuf, heaptup, &oldtup, ! all_visible_cleared, ! all_visible_cleared_new); if (newbuf != buffer) { *************** *** 3299,3372 **** static bool heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, HeapTuple tup1, HeapTuple tup2) { ! Datum value1, ! value2; ! bool isnull1, ! isnull2; ! Form_pg_attribute att; ! ! /* ! * If it's a whole-tuple reference, say "not equal". It's not really ! 
* worth supporting this case, since it could only succeed after a no-op ! * update, which is hardly a case worth optimizing for. ! */ ! if (attrnum == 0) ! return false; ! ! /* ! * Likewise, automatically say "not equal" for any system attribute other ! * than OID and tableOID; we cannot expect these to be consistent in a HOT ! * chain, or even to be set correctly yet in the new tuple. ! */ ! if (attrnum < 0) ! { ! if (attrnum != ObjectIdAttributeNumber && ! attrnum != TableOidAttributeNumber) ! return false; ! } ! ! /* ! * Extract the corresponding values. XXX this is pretty inefficient if ! * there are many indexed columns. Should HeapSatisfiesHOTUpdate do a ! * single heap_deform_tuple call on each tuple, instead? But that doesn't ! * work for system columns ... ! */ ! value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1); ! value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2); ! ! /* ! * If one value is NULL and other is not, then they are certainly not ! * equal ! */ ! if (isnull1 != isnull2) ! return false; ! ! /* ! * If both are NULL, they can be considered equal. ! */ ! if (isnull1) ! return true; ! /* ! * We do simple binary comparison of the two datums. This may be overly ! * strict because there can be multiple binary representations for the ! * same logical value. But we should be OK as long as there are no false ! * positives. Using a type-specific equality operator is messy because ! * there could be multiple notions of equality in different operator ! * classes; furthermore, we cannot safely invoke user-defined functions ! * while holding exclusive buffer lock. ! */ ! if (attrnum <= 0) ! { ! /* The only allowed system columns are OIDs, so do this */ ! return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); ! } ! else ! { ! Assert(attrnum <= tupdesc->natts); ! att = tupdesc->attrs[attrnum - 1]; ! return datumIsEqual(value1, value2, att->attbyval, att->attlen); ! 
} } /* --- 3351,3361 ---- heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, HeapTuple tup1, HeapTuple tup2) { ! Size tup1_attr_len, ! tup2_attr_len; ! return heap_attr_get_length_and_check_equals(tupdesc, attrnum, tup1, tup2, ! &tup1_attr_len, &tup2_attr_len); } /* *************** *** 4464,4470 **** log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer, */ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, ! Buffer newbuf, HeapTuple newtup, bool all_visible_cleared, bool new_all_visible_cleared) { xl_heap_update xlrec; --- 4453,4459 ---- */ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, ! Buffer newbuf, HeapTuple newtup, HeapTuple oldtup, bool all_visible_cleared, bool new_all_visible_cleared) { xl_heap_update xlrec; *************** *** 4473,4478 **** log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, --- 4462,4477 ---- XLogRecPtr recptr; XLogRecData rdata[4]; Page page = BufferGetPage(newbuf); + char *newtupdata; + int newtuplen; + bool compressed = false; + + /* Structure which holds EWT */ + struct + { + PGLZ_Header pglzheader; + char buf[MaxHeapTupleSize]; + } buf; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); *************** *** 4482,4492 **** log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, else info = XLOG_HEAP_UPDATE; xlrec.target.node = reln->rd_node; xlrec.target.tid = from; ! xlrec.all_visible_cleared = all_visible_cleared; xlrec.newtid = newtup->t_self; ! xlrec.new_all_visible_cleared = new_all_visible_cleared; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapUpdate; --- 4481,4522 ---- else info = XLOG_HEAP_UPDATE; + newtupdata = ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits); + newtuplen = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits); + + /* + * EWT can be generated for all new tuple versions created by Update + * operation. 
Currently we do it when both the old and new tuple versions + * are on same page, because during recovery if the page containing old + * tuple is corrupt, it should not cascade that corruption to other pages. + * Under the general assumption that for long runs most updates tend to + * create new tuple version on same page, there should not be significant + * impact on WAL reduction or performance. + * + * We should not generate EWT when we need to backup the whole bolck in + * WAL as in that case there is no saving by reduced WAL size. + */ + if ((oldbuf == newbuf) && !XLogCheckBufferNeedsBackup(newbuf)) + { + /* Delta-encode the new tuple using the old tuple */ + if (heap_delta_encode(reln->rd_att, oldtup, newtup, &buf.pglzheader)) + { + compressed = true; + newtupdata = (char *) &buf.pglzheader; + newtuplen = VARSIZE(&buf.pglzheader); + } + } + + xlrec.flags = 0; xlrec.target.node = reln->rd_node; xlrec.target.tid = from; ! if (all_visible_cleared) ! xlrec.flags |= XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED; xlrec.newtid = newtup->t_self; ! if (new_all_visible_cleared) ! xlrec.flags |= XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED; ! if (compressed) ! xlrec.flags |= XL_HEAP_UPDATE_DELTA_ENCODED; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapUpdate; *************** *** 4513,4521 **** log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, rdata[2].buffer_std = true; rdata[2].next = &(rdata[3]); ! /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ ! rdata[3].data = (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits); ! rdata[3].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits); rdata[3].buffer = newbuf; rdata[3].buffer_std = true; rdata[3].next = NULL; --- 4543,4554 ---- rdata[2].buffer_std = true; rdata[2].next = &(rdata[3]); ! /* ! * PG73FORMAT: write bitmap [+ padding] [+ oid] + data follows ......... ! * OR PG93FORMAT [If encoded]: LZ header + Encoded data follows ! */ ! rdata[3].data = newtupdata; ! 
rdata[3].len = newtuplen; rdata[3].buffer = newbuf; rdata[3].buffer_std = true; rdata[3].next = NULL; *************** *** 5291,5297 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) --- 5324,5333 ---- Page page; OffsetNumber offnum; ItemId lp = NULL; + HeapTupleData newtup; + HeapTupleData oldtup; HeapTupleHeader htup; + HeapTupleHeader oldtupdata = NULL; struct { HeapTupleHeaderData hdr; *************** *** 5306,5312 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ ! if (xlrec->all_visible_cleared) { Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); BlockNumber block = ItemPointerGetBlockNumber(&xlrec->target.tid); --- 5342,5348 ---- * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ ! if (xlrec->flags & XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); BlockNumber block = ItemPointerGetBlockNumber(&xlrec->target.tid); *************** *** 5366,5372 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) elog(PANIC, "heap_update_redo: invalid lp"); ! htup = (HeapTupleHeader) PageGetItem(page, lp); htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | --- 5402,5408 ---- if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) elog(PANIC, "heap_update_redo: invalid lp"); ! oldtupdata = htup = (HeapTupleHeader) PageGetItem(page, lp); htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | *************** *** 5385,5391 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) /* Mark the page as a candidate for pruning */ PageSetPrunable(page, record->xl_xid); ! 
if (xlrec->all_visible_cleared) PageClearAllVisible(page); /* --- 5421,5427 ---- /* Mark the page as a candidate for pruning */ PageSetPrunable(page, record->xl_xid); ! if (xlrec->flags & XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); /* *************** *** 5410,5416 **** newt:; * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ ! if (xlrec->new_all_visible_cleared) { Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); BlockNumber block = ItemPointerGetBlockNumber(&xlrec->newtid); --- 5446,5452 ---- * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ ! if (xlrec->flags & XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); BlockNumber block = ItemPointerGetBlockNumber(&xlrec->newtid); *************** *** 5473,5482 **** newsame:; SizeOfHeapHeader); htup = &tbuf.hdr; MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData)); ! /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ ! memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits), ! (char *) xlrec + hsize, ! newlen); newlen += offsetof(HeapTupleHeaderData, t_bits); htup->t_infomask2 = xlhdr.t_infomask2; htup->t_infomask = xlhdr.t_infomask; --- 5509,5540 ---- SizeOfHeapHeader); htup = &tbuf.hdr; MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData)); ! ! /* ! * If the record is EWT then decode it. ! */ ! if (xlrec->flags & XL_HEAP_UPDATE_DELTA_ENCODED) ! { ! /* ! * PG93FORMAT: Header + Control byte + history reference (2 - 3)bytes ! * + New data (1 byte length + variable data)+ ... ! */ ! PGLZ_Header *encoded_data = (PGLZ_Header *) (((char *) xlrec) + hsize); ! ! oldtup.t_data = oldtupdata; ! newtup.t_data = htup; ! ! heap_delta_decode(encoded_data, &oldtup, &newtup); ! newlen = newtup.t_len; ! } ! else ! { ! /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ ! memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits), ! (char *) xlrec + hsize, ! 
newlen); ! } ! newlen += offsetof(HeapTupleHeaderData, t_bits); htup->t_infomask2 = xlhdr.t_infomask2; htup->t_infomask = xlhdr.t_infomask; *************** *** 5491,5497 **** newsame:; if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); ! if (xlrec->new_all_visible_cleared) PageClearAllVisible(page); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ --- 5549,5555 ---- if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); ! if (xlrec->flags & XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ *** a/src/backend/access/transam/README --- b/src/backend/access/transam/README *************** *** 665,670 **** then restart recovery. This is part of the reason for not writing a WAL --- 665,778 ---- entry until we've successfully done the original action. + Encoded WAL Tuple (EWT) + ----------------------- + + Delta Encoded WAL Tuple (EWT) eliminates the need for copying entire tuple to WAL for the update operation. + EWT is constructed by comparing old and new versions of tuple w.r.t column boundaries. It contains the data + from new tuple for modified columns and reference [Offset,Length] of old tuple verion for un-changed columns. + + + EWT Format + ---------- + + Header + Control byte + History Reference (2 - 3)bytes + + New data (1 byte length + variable data) + ... + + + Header: + + The header is same as PGLZ_Header, which is used to store the compressed length and raw length. + + Control byte: + + The first byte after the header tells what to do the next 8 times. We call this the control byte. + + + History Reference: + + A set bit in the control byte means, that a tag of 2-3 bytes follows. A tag contains information + to copy some bytes from old tuple version to the current location in the output. 
+ + Details about 2-3 byte Tag + 2 byte tag is used when length of History data (unchanged data from old tuple version) is less than 18. + 3 byte tag is used when length of History data (unchanged data from old tuple version) is greater than or + equal to 18. + The maximum length that can be represented by one Tag is 273. + + Let's call the three tag bytes T1, T2 and T3. The position of the data to copy is coded as an offset + from the old tuple. + + The offset is in the upper nibble of T1 and in T2. + The length is in the lower nibble of T1. + + So the 16 bits of a 2 byte tag are coded as + + 7---T1--0 7---T2--0 + OOOO LLLL OOOO OOOO + + This limits the offset to 1-4095 (12 bits) and the length to 3-18 (4 bits) because 3 is always added to it. + + In the actual implementation, the 2 byte tag's length is limited to 3-17, because the value 0xF + in the length nibble has special meaning. It means that the following byte (T3) has to be + added to the length value of 18. That makes total limits of 1-4095 for offset and 3-273 for length. + + + + + New data: + + An unset bit in the control byte represents modified data of new tuple version. + First byte represents the length [0-255] of the modified data, followed by the + modified data of corresponding length. + + 7---T1--0 7---T2--0 ... + LLLL LLLL DDDD DDDD ... + + Data bytes repeat until the length of the new data. + + + L - Length + O - Offset + D - Data + + This encoding is very similar to LZ Compression used in PostgreSQL (pg_lzcompress.c). + + + Encoding Mechanism for EWT + -------------------------- + Copy the bitmap data from new tuple to the EWT (Encoded WAL Tuple) and loop for all attributes + to find any modifications in the attributes. The unmodified data is encoded as a + History Reference in EWT and the modified data (if NOT NULL) is encoded as New Data in EWT. + + The offset values are calculated with respect to the tuple t_hoff value. 
For each column attribute + old and new tuple offsets are recalculated based on padding in the tuples. + Once the alignment difference is found between old and new tuple versions, + then include alignment difference as New Data in EWT. + + Max encoded data length is 75% (default compression rate) of original data, if encoded output data + length is greater than that, original tuple (new tuple version) will be directly stored in WAL Tuple. + + + Decoding Mechanism for EWT + -------------------------- + Skip header and Read one control byte and process the next 8 items (or as many as remain in the compressed input). + Check each control bit, if the bit is set then it is History Reference which means the next 2 - 3 byte tag + provides the offset and length of history match. + Use the offset and corresponding length to copy data from old tuple version to new tuple. + If the control bit is unset, then it is New Data Reference which means first byte contains the + length [0-255] of the modified data, followed by the modified data of corresponding length + specified in the first byte. + + + Constraints for EWT + -------------------- + 1. Delta encoding is allowed when the update is going to the same page and + buffer doesn't need a backup block when full-page writes are on. + 2. Old Tuples with length less than PGLZ_HISTORY_SIZE are allowed for encoding. + 3. Only Old and New tuple versions that do not vary in length by more than 50% are allowed for encoding. + + Asynchronous Commit ------------------- *** a/src/backend/access/transam/xlog.c --- b/src/backend/access/transam/xlog.c *************** *** 1204,1209 **** begin:; --- 1204,1231 ---- } /* + * Determine whether the buffer referenced has to be backed up. Since we don't + * yet have the insert lock, fullPageWrites and forcePageWrites could change + * later, but will not cause any problem because this function is used only to + * identify whether EWT is required for WAL update. 
+ */ + bool + XLogCheckBufferNeedsBackup(Buffer buffer) + { + bool doPageWrites; + Page page; + + page = BufferGetPage(buffer); + + doPageWrites = XLogCtl->Insert.fullPageWrites || XLogCtl->Insert.forcePageWrites; + + if (doPageWrites && PageGetLSN(page) <= RedoRecPtr) + return true; /* buffer requires backup */ + + return false; /* buffer does not need to be backed up */ + } + + /* * Determine whether the buffer referenced by an XLogRecData item has to * be backed up, and if so fill a BkpBlock struct for it. In any case * save the buffer's LSN at *lsn. *** a/src/backend/utils/adt/pg_lzcompress.c --- b/src/backend/utils/adt/pg_lzcompress.c *************** *** 182,190 **** */ #define PGLZ_HISTORY_LISTS 8192 /* must be power of 2 */ #define PGLZ_HISTORY_MASK (PGLZ_HISTORY_LISTS - 1) - #define PGLZ_HISTORY_SIZE 4096 - #define PGLZ_MAX_MATCH 273 - /* ---------- * PGLZ_HistEntry - --- 182,187 ---- *************** *** 302,368 **** do { \ } \ } while (0) - - /* ---------- - * pglz_out_ctrl - - * - * Outputs the last and allocates a new control byte if needed. - * ---------- - */ - #define pglz_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) \ - do { \ - if ((__ctrl & 0xff) == 0) \ - { \ - *(__ctrlp) = __ctrlb; \ - __ctrlp = (__buf)++; \ - __ctrlb = 0; \ - __ctrl = 1; \ - } \ - } while (0) - - - /* ---------- - * pglz_out_literal - - * - * Outputs a literal byte to the destination buffer including the - * appropriate control bit. - * ---------- - */ - #define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) \ - do { \ - pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ - *(_buf)++ = (unsigned char)(_byte); \ - _ctrl <<= 1; \ - } while (0) - - - /* ---------- - * pglz_out_tag - - * - * Outputs a backward reference tag of 2-4 bytes (depending on - * offset and length) to the destination buffer including the - * appropriate control bit. 
- * ---------- - */ - #define pglz_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off) \ - do { \ - pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ - _ctrlb |= _ctrl; \ - _ctrl <<= 1; \ - if (_len > 17) \ - { \ - (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f); \ - (_buf)[1] = (unsigned char)(((_off) & 0xff)); \ - (_buf)[2] = (unsigned char)((_len) - 18); \ - (_buf) += 3; \ - } else { \ - (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | ((_len) - 3)); \ - (_buf)[1] = (unsigned char)((_off) & 0xff); \ - (_buf) += 2; \ - } \ - } while (0) - - /* ---------- * pglz_find_match - * --- 299,304 ---- *************** *** 595,601 **** pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, * Create the tag and add history entries for all matched * characters. */ ! pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off); while (match_len--) { pglz_hist_add(hist_start, hist_entries, --- 531,537 ---- * Create the tag and add history entries for all matched * characters. */ ! pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off, dp); while (match_len--) { pglz_hist_add(hist_start, hist_entries, *************** *** 647,661 **** pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, void pglz_decompress(const PGLZ_Header *source, char *dest) { const unsigned char *sp; const unsigned char *srcend; unsigned char *dp; unsigned char *destend; sp = ((const unsigned char *) source) + sizeof(PGLZ_Header); ! srcend = ((const unsigned char *) source) + VARSIZE(source); dp = (unsigned char *) dest; ! destend = dp + source->rawsize; while (sp < srcend && dp < destend) { --- 583,620 ---- void pglz_decompress(const PGLZ_Header *source, char *dest) { + pglz_decompress_with_history((char *) source, dest, NULL, NULL); + } + + /* ---------- + * pglz_decompress_with_history - + * + * Decompresses source into dest. + * To decompress, it uses history if provided. 
+ * ---------- + */ + void + pglz_decompress_with_history(const char *source, char *dest, uint32 *destlen, + const char *history) + { + PGLZ_Header src; const unsigned char *sp; const unsigned char *srcend; unsigned char *dp; unsigned char *destend; + /* To avoid the unaligned access of PGLZ_Header */ + memcpy((char *) &src, source, sizeof(PGLZ_Header)); + sp = ((const unsigned char *) source) + sizeof(PGLZ_Header); ! srcend = ((const unsigned char *) source) + VARSIZE(&src); dp = (unsigned char *) dest; ! destend = dp + src.rawsize; ! ! if (destlen) ! { ! *destlen = src.rawsize; ! } while (sp < srcend && dp < destend) { *************** *** 699,726 **** pglz_decompress(const PGLZ_Header *source, char *dest) break; } ! /* ! * Now we copy the bytes specified by the tag from OUTPUT to ! * OUTPUT. It is dangerous and platform dependent to use ! * memcpy() here, because the copied areas could overlap ! * extremely! ! */ ! while (len--) { ! *dp = dp[-off]; ! dp++; } } else { ! /* ! * An unset control bit means LITERAL BYTE. So we just copy ! * one from INPUT to OUTPUT. ! */ ! if (dp >= destend) /* check for buffer overrun */ ! break; /* do not clobber memory */ ! ! *dp++ = *sp++; } /* --- 658,726 ---- break; } ! if (history) ! { ! /* ! * Now we copy the bytes specified by the tag from history ! * to OUTPUT. ! */ ! memcpy(dp, history + off, len); ! dp += len; ! } ! else { ! /* ! * Now we copy the bytes specified by the tag from OUTPUT ! * to OUTPUT. It is dangerous and platform dependent to ! * use memcpy() here, because the copied areas could ! * overlap extremely! ! */ ! while (len--) ! { ! *dp = dp[-off]; ! dp++; ! } } } else { ! if (history) ! { ! /* ! * The byte at current offset in the source is the length ! * of this literal segment. See pglz_out_add for encoding ! * side. ! */ ! int32 len; ! ! len = sp[0]; ! sp += 1; ! ! if (dp + len > destend) ! { ! dp += len; ! break; ! } ! ! /* ! * Now we copy the bytes specified by the tag from Source ! * to OUTPUT. ! */ ! 
memcpy(dp, sp, len); ! dp += len; ! sp += len; ! } ! else ! { ! /* ! * An unset control bit means LITERAL BYTE. So we just ! * copy one from INPUT to OUTPUT. ! */ ! if (dp >= destend) /* check for buffer overrun */ ! break; /* do not clobber memory */ ! ! *dp++ = *sp++; ! } } /* *** a/src/backend/utils/misc/guc.c --- b/src/backend/utils/misc/guc.c *************** *** 123,128 **** extern int CommitSiblings; --- 123,129 ---- extern char *default_tablespace; extern char *temp_tablespaces; extern bool synchronize_seqscans; + extern int wal_update_compression_ratio; extern int ssl_renegotiation_limit; extern char *SSLCipherSuites; *************** *** 2382,2387 **** static struct config_int ConfigureNamesInt[] = --- 2383,2399 ---- NULL, NULL, NULL }, + { + /* Not for general use */ + {"wal_update_compression_ratio", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Sets the compression ratio of delta record for wal update"), + NULL, + }, + &wal_update_compression_ratio, + 25, 1, 99, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL *** a/src/include/access/heapam_xlog.h --- b/src/include/access/heapam_xlog.h *************** *** 142,153 **** typedef struct xl_heap_update { xl_heaptid target; /* deleted tuple id */ ItemPointerData newtid; /* new inserted tuple id */ ! bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ ! bool new_all_visible_cleared; /* same for the page of newtid */ /* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_update; ! #define SizeOfHeapUpdate (offsetof(xl_heap_update, new_all_visible_cleared) + sizeof(bool)) /* * This is what we need to know about vacuum page cleanup/redirect --- 142,161 ---- { xl_heaptid target; /* deleted tuple id */ ItemPointerData newtid; /* new inserted tuple id */ ! int flags; /* flag bits, see below */ ! /* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_update; ! ! 
#define XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED 0x01 /* Indicates as old page's ! all visible bit is cleared */ ! #define XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED 0x02 /* Indicates as new page's ! all visible bit is cleared */ ! #define XL_HEAP_UPDATE_DELTA_ENCODED 0x04 /* Indicates as the update ! operation is delta encoded */ ! ! #define SizeOfHeapUpdate (offsetof(xl_heap_update, flags) + sizeof(char)) /* * This is what we need to know about vacuum page cleanup/redirect *** a/src/include/access/htup_details.h --- b/src/include/access/htup_details.h *************** *** 18,23 **** --- 18,24 ---- #include "access/tupdesc.h" #include "access/tupmacs.h" #include "storage/bufpage.h" + #include "utils/pg_lzcompress.h" /* * MaxTupleAttributeNumber limits the number of (user) columns in a tuple. *************** *** 528,533 **** struct MinimalTupleData --- 529,535 ---- HeapTupleHeaderSetOid((tuple)->t_data, (oid)) + #if !defined(DISABLE_COMPLEX_MACRO) /* ---------------- * fastgetattr * *************** *** 542,550 **** struct MinimalTupleData * lookups, and call nocachegetattr() for the rest. * ---------------- */ - - #if !defined(DISABLE_COMPLEX_MACRO) - #define fastgetattr(tup, attnum, tupleDesc, isnull) \ ( \ AssertMacro((attnum) > 0), \ --- 544,549 ---- *************** *** 572,585 **** struct MinimalTupleData nocachegetattr((tup), (attnum), (tupleDesc)) \ ) \ ) \ ) - #else /* defined(DISABLE_COMPLEX_MACRO) */ extern Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull); #endif /* defined(DISABLE_COMPLEX_MACRO) */ - /* ---------------- * heap_getattr * --- 571,626 ---- nocachegetattr((tup), (attnum), (tupleDesc)) \ ) \ ) \ + ) \ + + /* ---------------- + * fastgetattr_with_len + * + * Similar to fastgetattr and fetches the length of the given attribute + * also. + * ---------------- + */ + #define fastgetattr_with_len(tup, attnum, tupleDesc, isnull, len) \ + ( \ + AssertMacro((attnum) > 0), \ + (*(isnull) = false), \ + HeapTupleNoNulls(tup) ? 
\ + ( \ + (tupleDesc)->attrs[(attnum)-1]->attcacheoff >= 0 ? \ + ( \ + (*(len) = att_getlength( \ + (tupleDesc)->attrs[(attnum)-1]->attlen, \ + (char *) (tup)->t_data + (tup)->t_data->t_hoff +\ + (tupleDesc)->attrs[(attnum)-1]->attcacheoff)), \ + fetchatt((tupleDesc)->attrs[(attnum)-1], \ + (char *) (tup)->t_data + (tup)->t_data->t_hoff + \ + (tupleDesc)->attrs[(attnum)-1]->attcacheoff) \ + ) \ + : \ + nocachegetattr_with_len((tup), (attnum), (tupleDesc), (len))\ + ) \ + : \ + ( \ + att_isnull((attnum)-1, (tup)->t_data->t_bits) ? \ + ( \ + (*(isnull) = true), \ + (*(len) = 0), \ + (Datum)NULL \ + ) \ + : \ + ( \ + nocachegetattr_with_len((tup), (attnum), (tupleDesc), (len))\ + ) \ + ) \ ) + #else /* defined(DISABLE_COMPLEX_MACRO) */ extern Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull); + extern Datum fastgetattr_with_len(HeapTuple tup, int attnum, + TupleDesc tupleDesc, bool *isnull, int32 *len); #endif /* defined(DISABLE_COMPLEX_MACRO) */ /* ---------------- * heap_getattr * *************** *** 596,616 **** extern Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, * ---------------- */ #define heap_getattr(tup, attnum, tupleDesc, isnull) \ ( \ ! ((attnum) > 0) ? \ ( \ ! ((attnum) > (int) HeapTupleHeaderGetNatts((tup)->t_data)) ? \ ! ( \ ! (*(isnull) = true), \ ! (Datum)NULL \ ! ) \ ! : \ ! fastgetattr((tup), (attnum), (tupleDesc), (isnull)) \ ) \ : \ ! heap_getsysattr((tup), (attnum), (tupleDesc), (isnull)) \ ! ) /* prototypes for functions in common/heaptuple.c */ extern Size heap_compute_data_size(TupleDesc tupleDesc, --- 637,679 ---- * ---------------- */ #define heap_getattr(tup, attnum, tupleDesc, isnull) \ + ( \ + ((attnum) > 0) ? \ ( \ ! ((attnum) > (int) HeapTupleHeaderGetNatts((tup)->t_data)) ? \ ( \ ! (*(isnull) = true), \ ! (Datum)NULL \ ) \ : \ ! fastgetattr((tup), (attnum), (tupleDesc), (isnull)) \ ! ) \ ! : \ ! heap_getsysattr((tup), (attnum), (tupleDesc), (isnull)) \ ! 
) + /* ---------------- + * heap_getattr_with_len + * + * Similar to heap_getattr and outputs the length of the given attribute. + * ---------------- + */ + #define heap_getattr_with_len(tup, attnum, tupleDesc, isnull, len) \ + ( \ + ((attnum) > 0) ? \ + ( \ + ((attnum) > (int) HeapTupleHeaderGetNatts((tup)->t_data)) ? \ + ( \ + (*(isnull) = true), \ + (*(len) = 0), \ + (Datum)NULL \ + ) \ + : \ + fastgetattr_with_len((tup), (attnum), (tupleDesc), (isnull), (len)) \ + ) \ + : \ + heap_getsysattr((tup), (attnum), (tupleDesc), (isnull)) \ + ) /* prototypes for functions in common/heaptuple.c */ extern Size heap_compute_data_size(TupleDesc tupleDesc, *************** *** 620,625 **** extern void heap_fill_tuple(TupleDesc tupleDesc, --- 683,690 ---- char *data, Size data_size, uint16 *infomask, bits8 *bit); extern bool heap_attisnull(HeapTuple tup, int attnum); + extern Datum nocachegetattr_with_len(HeapTuple tup, int attnum, + TupleDesc att, Size *len); extern Datum nocachegetattr(HeapTuple tup, int attnum, TupleDesc att); extern Datum heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, *************** *** 636,641 **** extern HeapTuple heap_modify_tuple(HeapTuple tuple, --- 701,714 ---- extern void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull); + extern bool heap_attr_get_length_and_check_equals(TupleDesc tupdesc, + int attrnum, HeapTuple tup1, HeapTuple tup2, + Size *tup1_attr_len, Size *tup2_attr_len); + extern bool heap_delta_encode(TupleDesc tupleDesc, HeapTuple oldtup, + HeapTuple newtup, PGLZ_Header *encdata); + extern void heap_delta_decode (PGLZ_Header *encdata, HeapTuple oldtup, + HeapTuple newtup); + /* these three are deprecated versions of the three above: */ extern HeapTuple heap_formtuple(TupleDesc tupleDescriptor, Datum *values, char *nulls); *** a/src/include/access/tupmacs.h --- b/src/include/access/tupmacs.h *************** *** 187,192 **** --- 187,214 ---- ) /* + * att_getlength - + * Gets the length 
of the attribute. + */ + #define att_getlength(attlen, attptr) \ + ( \ + ((attlen) > 0) ? \ + ( \ + (attlen) \ + ) \ + : (((attlen) == -1) ? \ + ( \ + VARSIZE_ANY(attptr) \ + ) \ + : \ + ( \ + AssertMacro((attlen) == -2), \ + (strlen((char *) (attptr)) + 1) \ + )) \ + ) + + + /* * store_att_byval is a partial inverse of fetch_att: store a given Datum * value into a tuple data area at the specified address. However, it only * handles the byval case, because in typical usage the caller needs to *** a/src/include/access/xlog.h --- b/src/include/access/xlog.h *************** *** 261,266 **** typedef struct CheckpointStatsData --- 261,267 ---- extern CheckpointStatsData CheckpointStats; extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata); + extern bool XLogCheckBufferNeedsBackup(Buffer buffer); extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); *** a/src/include/utils/pg_lzcompress.h --- b/src/include/utils/pg_lzcompress.h *************** *** 23,28 **** typedef struct PGLZ_Header --- 23,31 ---- int32 rawsize; } PGLZ_Header; + /* LZ algorithm can hold only history offset in the range of 1 - 4095. */ + #define PGLZ_HISTORY_SIZE 4096 + #define PGLZ_MAX_MATCH 273 /* ---------- * PGLZ_MAX_OUTPUT - *************** *** 86,91 **** typedef struct PGLZ_Strategy --- 89,207 ---- int32 match_size_drop; } PGLZ_Strategy; + /* + * calculate the approximate length required for history reference tag for the + * given length + */ + #define PGLZ_GET_HIST_CTRL_BIT_LEN(_len) \ + ( \ + ((_len) < 17) ? (3) : (4 * (1 + ((_len) / PGLZ_MAX_MATCH))) \ + ) + + /* ---------- + * pglz_out_ctrl - + * + * Outputs the last and allocates a new control byte if needed. 
+ * ---------- + */ + #define pglz_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) \ + do { \ + if ((__ctrl & 0xff) == 0) \ + { \ + *(__ctrlp) = __ctrlb; \ + __ctrlp = (__buf)++; \ + __ctrlb = 0; \ + __ctrl = 1; \ + } \ + } while (0) + + /* ---------- + * pglz_out_literal - + * + * Outputs a literal byte to the destination buffer including the + * appropriate control bit. + * ---------- + */ + #define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) \ + do { \ + pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ + *(_buf)++ = (unsigned char)(_byte); \ + _ctrl <<= 1; \ + } while (0) + + /* ---------- + * pglz_out_tag - + * + * Outputs a backward/history reference tag of 2-3 bytes (depending on + * offset and length) to the destination buffer including the + * appropriate control bit. + * + * Split the process of backward/history reference as different chunks, + * if the given length is more than max match and repeats the process + * until the given length is processed. + * + * If the matched history length is less than 3 bytes then add it as a + * new data only during encoding instead of history reference. This occurs + * only while framing EWT. + * ---------- + */ + #define pglz_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off,_byte) \ + do { \ + int _mtaglen; \ + int _tagtotal_len = (_len); \ + while (_tagtotal_len > 0) \ + { \ + _mtaglen = _tagtotal_len > PGLZ_MAX_MATCH ? 
PGLZ_MAX_MATCH : _tagtotal_len; \ + if (_mtaglen < 3) \ + { \ + char *_data = (char *)(_byte) + (_off); \ + pglz_out_add(_ctrlp,_ctrlb,_ctrl,_buf,_mtaglen,_data); \ + break; \ + } \ + pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ + _ctrlb |= _ctrl; \ + _ctrl <<= 1; \ + if (_mtaglen > 17) \ + { \ + (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f); \ + (_buf)[1] = (unsigned char)(((_off) & 0xff)); \ + (_buf)[2] = (unsigned char)((_mtaglen) - 18); \ + (_buf) += 3; \ + } else { \ + (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | ((_mtaglen) - 3)); \ + (_buf)[1] = (unsigned char)((_off) & 0xff); \ + (_buf) += 2; \ + } \ + _tagtotal_len -= _mtaglen; \ + (_off) += _mtaglen; \ + } \ + } while (0) + + /* ---------- + * pglz_out_add - + * + * Outputs a reference tag of 1 byte with length and the new data + * to the destination buffer, including the appropriate control bit. + * ---------- + */ + #define pglz_out_add(_ctrlp,_ctrlb,_ctrl,_buf,_len,_byte) \ + do { \ + int32 _maddlen; \ + int32 _addtotal_len = (_len); \ + while (_addtotal_len > 0) \ + { \ + _maddlen = _addtotal_len > 255 ? 255 : _addtotal_len; \ + pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ + _ctrl <<= 1; \ + (_buf)[0] = (unsigned char)(_maddlen); \ + (_buf) += 1; \ + memcpy((_buf), (_byte), _maddlen); \ + (_buf) += _maddlen; \ + (_byte) += _maddlen; \ + _addtotal_len -= _maddlen; \ + } \ + } while (0) + /* ---------- * The standard strategies *************** *** 108,112 **** extern const PGLZ_Strategy *const PGLZ_strategy_always; extern bool pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, const PGLZ_Strategy *strategy); extern void pglz_decompress(const PGLZ_Header *source, char *dest); ! #endif /* _PG_LZCOMPRESS_H_ */ --- 224,229 ---- extern bool pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, const PGLZ_Strategy *strategy); extern void pglz_decompress(const PGLZ_Header *source, char *dest); ! 
extern void pglz_decompress_with_history(const char *source, char *dest, ! uint32 *destlen, const char *history); #endif /* _PG_LZCOMPRESS_H_ */ *** a/src/test/regress/expected/update.out --- b/src/test/regress/expected/update.out *************** *** 97,99 **** SELECT a, b, char_length(c) FROM update_test; --- 97,169 ---- (2 rows) DROP TABLE update_test; + -- + -- Test to update continuos and non continuos columns + -- + DROP TABLE IF EXISTS update_test; + NOTICE: table "update_test" does not exist, skipping + CREATE TABLE update_test ( + bser bigserial, + bln boolean, + ename VARCHAR(25), + perf_f float(8), + grade CHAR, + dept CHAR(5) NOT NULL, + dob DATE, + idnum INT, + addr VARCHAR(30) NOT NULL, + destn CHAR(6), + Gend CHAR, + samba BIGINT, + hgt float, + ctime TIME + ); + INSERT INTO update_test VALUES ( + nextval('update_test_bser_seq'::regclass), + TRUE, + 'Test', + 7.169, + 'B', + 'CSD', + '2000-01-01', + 520, + 'road2, + streeeeet2, + city2', + 'dcy2', + 'M', + 12000, + 50.4, + '00:00:00.0' + ); + SELECT * from update_test; + bser | bln | ename | perf_f | grade | dept | dob | idnum | addr | destn | gend | samba | hgt | ctime + ------+-----+-------+--------+-------+-------+------------+-------+-----------------------------+--------+------+-------+------+---------- + 1 | t | Test | 7.169 | B | CSD | 01-01-2000 | 520 | road2, +| dcy2 | M | 12000 | 50.4 | 00:00:00 + | | | | | | | | streeeeet2,+| | | | | + | | | | | | | | city2 | | | | | + (1 row) + + -- update first column + UPDATE update_test SET bser = bser - 1 + 1; + -- update middle column + UPDATE update_test SET perf_f = 8.9; + -- update last column + UPDATE update_test SET ctime = '00:00:00.1'; + -- update 3 continuos columns + UPDATE update_test SET destn = 'dcy2', samba = 0 WHERE Gend = 'M' and dept = 'CSD'; + -- update two non continuos columns + UPDATE update_test SET destn = 'moved', samba = 0; + UPDATE update_test SET bln = FALSE, hgt = 10.1; + -- update causing some column alignment difference + 
UPDATE update_test SET ename = 'Tes'; + UPDATE update_test SET dept = 'Test'; + SELECT * from update_test; + bser | bln | ename | perf_f | grade | dept | dob | idnum | addr | destn | gend | samba | hgt | ctime + ------+-----+-------+--------+-------+-------+------------+-------+-----------------------------+--------+------+-------+------+------------ + 1 | f | Tes | 8.9 | B | Test | 01-01-2000 | 520 | road2, +| moved | M | 0 | 10.1 | 00:00:00.1 + | | | | | | | | streeeeet2,+| | | | | + | | | | | | | | city2 | | | | | + (1 row) + + DROP TABLE update_test; *** a/src/test/regress/sql/update.sql --- b/src/test/regress/sql/update.sql *************** *** 59,61 **** UPDATE update_test SET c = repeat('x', 10000) WHERE c = 'car'; --- 59,128 ---- SELECT a, b, char_length(c) FROM update_test; DROP TABLE update_test; + + + -- + -- Test to update continuos and non continuos columns + -- + + DROP TABLE IF EXISTS update_test; + CREATE TABLE update_test ( + bser bigserial, + bln boolean, + ename VARCHAR(25), + perf_f float(8), + grade CHAR, + dept CHAR(5) NOT NULL, + dob DATE, + idnum INT, + addr VARCHAR(30) NOT NULL, + destn CHAR(6), + Gend CHAR, + samba BIGINT, + hgt float, + ctime TIME + ); + + INSERT INTO update_test VALUES ( + nextval('update_test_bser_seq'::regclass), + TRUE, + 'Test', + 7.169, + 'B', + 'CSD', + '2000-01-01', + 520, + 'road2, + streeeeet2, + city2', + 'dcy2', + 'M', + 12000, + 50.4, + '00:00:00.0' + ); + + SELECT * from update_test; + + -- update first column + UPDATE update_test SET bser = bser - 1 + 1; + + -- update middle column + UPDATE update_test SET perf_f = 8.9; + + -- update last column + UPDATE update_test SET ctime = '00:00:00.1'; + + -- update 3 continuos columns + UPDATE update_test SET destn = 'dcy2', samba = 0 WHERE Gend = 'M' and dept = 'CSD'; + + -- update two non continuos columns + UPDATE update_test SET destn = 'moved', samba = 0; + UPDATE update_test SET bln = FALSE, hgt = 10.1; + + -- update causing some column alignment difference + 
UPDATE update_test SET ename = 'Tes'; + UPDATE update_test SET dept = 'Test'; + + SELECT * from update_test; + DROP TABLE update_test;