*** a/src/backend/access/common/heaptuple.c --- b/src/backend/access/common/heaptuple.c *************** *** 60,66 **** --- 60,70 ---- #include "access/sysattr.h" #include "access/tuptoaster.h" #include "executor/tuptable.h" + #include "utils/datum.h" + #include "utils/pg_lzcompress.h" + /* guc variable for EWT compression ratio*/ + int wal_update_compression_ratio = 25; /* Does att's datatype allow packing into the 1-byte-header varlena format? */ #define ATT_IS_PACKABLE(att) \ *************** *** 69,74 **** --- 73,80 ---- #define VARLENA_ATT_IS_PACKABLE(att) \ ((att)->attstorage != 'p') + static void heap_get_attr_offsets(TupleDesc tupleDesc, HeapTuple Tuple, + int32 **offsets, int *noffsets); /* ---------------------------------------------------------------- * misc support routines *************** *** 617,622 **** heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest) --- 623,775 ---- memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len); } + /* ---------------- + * heap_get_attr_offsets + * + * Given a tuple, extract each attribute's starting offset and return + * it as an array of offsets for a heap tuple. + * If the attribute has null value, the offset for it will be end of + * previous attribute offset. + * ---------------- + */ + static void + heap_get_attr_offsets(TupleDesc tupleDesc, HeapTuple Tuple, + int32 **offsets, int *noffsets) + { + HeapTupleHeader tup = Tuple->t_data; + Form_pg_attribute *att = tupleDesc->attrs; + bool hasnulls = HeapTupleHasNulls(Tuple); + bits8 *bp = Tuple->t_data->t_bits; /* ptr to null bitmap in tuple */ + bool slow = false; /* can we use/set attcacheoff? 
*/ + char *tp; /* ptr to tuple data */ + long off; /* offset in tuple data */ + int natts; + int attnum; + + natts = HeapTupleHeaderGetNatts(Tuple->t_data); + + *offsets = palloc(natts * sizeof(int32)); + + *noffsets = 0; + + /* copied from heap_deform_tuple */ + tp = (char *) tup + tup->t_hoff; + off = 0; + for (attnum = 0; attnum < natts; attnum++) + { + Form_pg_attribute thisatt = att[attnum]; + + if (hasnulls && att_isnull(attnum, bp)) + { + slow = true; /* can't use attcacheoff anymore */ + (*offsets)[(*noffsets)++] = off; + continue; + } + + if (!slow && thisatt->attcacheoff >= 0) + off = thisatt->attcacheoff; + else if (thisatt->attlen == -1) + { + /* + * We can only cache the offset for a varlena attribute if the + * offset is already suitably aligned, so that there would be no + * pad bytes in any case: then the offset will be valid for either + * an aligned or unaligned value. + */ + if (!slow && + off == att_align_nominal(off, thisatt->attalign)) + thisatt->attcacheoff = off; + else + { + off = att_align_pointer(off, thisatt->attalign, -1, + tp + off); + slow = true; + } + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, thisatt->attalign); + + if (!slow) + thisatt->attcacheoff = off; + } + + (*offsets)[(*noffsets)++] = off; + + off = att_addlength_pointer(off, thisatt->attlen, tp + off); + + if (thisatt->attlen <= 0) + slow = true; /* can't use attcacheoff anymore */ + + } + + } + + /* ---------------- + * heap_delta_encode + * + * Calculate the delta between two tuples, using pglz. The result is + * stored in *encdata. *encdata must point to a PGLZ_header buffer, with at + * least PGLZ_MAX_OUTPUT(newtup->t_len) bytes. 
+ * ---------------- + */ + bool + heap_delta_encode(TupleDesc tupleDesc, HeapTuple oldtup, HeapTuple newtup, + char *encdata) + { + int32 *hoffsets, + *newoffsets; + int noffsets; + PGLZ_Strategy strategy; + int32 newbitmaplen, + hbitmpalen; + + /* + * If length of old and new tuple versions vary by more than 50%, include + * new as-is + */ + if ((newtup->t_len <= (oldtup->t_len >> 1)) + || (oldtup->t_len <= (newtup->t_len >> 1))) + return false; + + newbitmaplen = newtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits); + hbitmpalen = oldtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits); + + /* + * Deform and get the attribute offsets for old and new tuple which will + * be used for calculating delta between old and new tuples. + */ + heap_get_attr_offsets(tupleDesc, oldtup, &hoffsets, &noffsets); + heap_get_attr_offsets(tupleDesc, newtup, &newoffsets, &noffsets); + + strategy = *PGLZ_strategy_always; + strategy.min_comp_rate = wal_update_compression_ratio; + + return pglz_compress_with_history((char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits), + newtup->t_len - offsetof(HeapTupleHeaderData, t_bits), + (char *) oldtup->t_data + offsetof(HeapTupleHeaderData, t_bits), + oldtup->t_len - offsetof(HeapTupleHeaderData, t_bits), + newoffsets, hoffsets, noffsets, + newbitmaplen, hbitmpalen, + (PGLZ_Header *) encdata, &strategy); + } + + /* ---------------- + * heap_delta_decode + * + * Decode a tuple using delta-encoded WAL tuple and old tuple version. 
+ * ---------------- + */ + void + heap_delta_decode(char *encdata, HeapTuple oldtup, HeapTuple newtup) + { + return pglz_decompress_with_history((char *) encdata, + (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits), + &newtup->t_len, + (char *) oldtup->t_data + offsetof(HeapTupleHeaderData, t_bits)); + } + /* * heap_form_tuple * construct a tuple from the given values[] and isnull[] arrays, *** a/src/backend/access/heap/heapam.c --- b/src/backend/access/heap/heapam.c *************** *** 70,75 **** --- 70,76 ---- #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" + #include "utils/pg_lzcompress.h" /* GUC variable */ *************** *** 5765,5770 **** log_heap_update(Relation reln, Buffer oldbuf, --- 5766,5781 ---- XLogRecPtr recptr; XLogRecData rdata[4]; Page page = BufferGetPage(newbuf); + char *newtupdata; + int newtuplen; + bool compressed = false; + + /* Structure which holds EWT */ + struct + { + PGLZ_Header pglzheader; + char buf[MaxHeapTupleSize]; + } buf; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); *************** *** 5774,5788 **** log_heap_update(Relation reln, Buffer oldbuf, else info = XLOG_HEAP_UPDATE; xlrec.target.node = reln->rd_node; xlrec.target.tid = oldtup->t_self; xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, oldtup->t_data->t_infomask2); xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); ! xlrec.all_visible_cleared = all_visible_cleared; xlrec.newtid = newtup->t_self; ! 
xlrec.new_all_visible_cleared = new_all_visible_cleared; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapUpdate; --- 5785,5830 ---- else info = XLOG_HEAP_UPDATE; + newtupdata = ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits); + newtuplen = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits); + + /* + * EWT can be generated for all new tuple versions created by Update + * operation. Currently we do it when both the old and new tuple versions + * are on same page, because during recovery if the page containing old + * tuple is corrupt, it should not cascade that corruption to other pages. + * Under the general assumption that for long runs most updates tend to + * create new tuple version on same page, there should not be significant + * impact on WAL reduction or performance. + * + * We should not generate EWT when we need to backup the whole bolck in + * WAL as in that case there is no saving by reduced WAL size. + */ + if ((oldbuf == newbuf) && !XLogCheckBufferNeedsBackup(newbuf)) + { + /* Delta-encode the new tuple using the old tuple */ + if (heap_delta_encode(reln->rd_att, oldtup, newtup, (char *) &buf.pglzheader)) + { + compressed = true; + newtupdata = (char *) &buf.pglzheader; + newtuplen = VARSIZE(&buf.pglzheader); + } + } + + xlrec.flags = 0; xlrec.target.node = reln->rd_node; xlrec.target.tid = oldtup->t_self; xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, oldtup->t_data->t_infomask2); xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); ! if (all_visible_cleared) ! xlrec.flags |= XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED; xlrec.newtid = newtup->t_self; ! if (new_all_visible_cleared) ! xlrec.flags |= XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED; ! if (compressed) ! 
xlrec.flags |= XL_HEAP_UPDATE_DELTA_ENCODED; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapUpdate; *************** *** 5809,5817 **** log_heap_update(Relation reln, Buffer oldbuf, rdata[2].buffer_std = true; rdata[2].next = &(rdata[3]); ! /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ ! rdata[3].data = (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits); ! rdata[3].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits); rdata[3].buffer = newbuf; rdata[3].buffer_std = true; rdata[3].next = NULL; --- 5851,5862 ---- rdata[2].buffer_std = true; rdata[2].next = &(rdata[3]); ! /* ! * PG73FORMAT: write bitmap [+ padding] [+ oid] + data follows ......... ! * OR PG93FORMAT [If encoded]: LZ header + Encoded data follows ! */ ! rdata[3].data = newtupdata; ! rdata[3].len = newtuplen; rdata[3].buffer = newbuf; rdata[3].buffer_std = true; rdata[3].next = NULL; *************** *** 6614,6620 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) --- 6659,6668 ---- Page page; OffsetNumber offnum; ItemId lp = NULL; + HeapTupleData newtup; + HeapTupleData oldtup; HeapTupleHeader htup; + HeapTupleHeader oldtupdata = NULL; struct { HeapTupleHeaderData hdr; *************** *** 6629,6635 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ ! if (xlrec->all_visible_cleared) { Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); BlockNumber block = ItemPointerGetBlockNumber(&xlrec->target.tid); --- 6677,6683 ---- * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ ! 
if (xlrec->flags & XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); BlockNumber block = ItemPointerGetBlockNumber(&xlrec->target.tid); *************** *** 6689,6695 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) elog(PANIC, "heap_update_redo: invalid lp"); ! htup = (HeapTupleHeader) PageGetItem(page, lp); htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; --- 6737,6743 ---- if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) elog(PANIC, "heap_update_redo: invalid lp"); ! oldtupdata = htup = (HeapTupleHeader) PageGetItem(page, lp); htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; *************** *** 6707,6713 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) /* Mark the page as a candidate for pruning */ PageSetPrunable(page, record->xl_xid); ! if (xlrec->all_visible_cleared) PageClearAllVisible(page); /* --- 6755,6761 ---- /* Mark the page as a candidate for pruning */ PageSetPrunable(page, record->xl_xid); ! if (xlrec->flags & XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); /* *************** *** 6732,6738 **** newt:; * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ ! if (xlrec->new_all_visible_cleared) { Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); BlockNumber block = ItemPointerGetBlockNumber(&xlrec->newtid); --- 6780,6786 ---- * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ ! 
if (xlrec->flags & XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); BlockNumber block = ItemPointerGetBlockNumber(&xlrec->newtid); *************** *** 6795,6804 **** newsame:; SizeOfHeapHeader); htup = &tbuf.hdr; MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData)); ! /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ ! memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits), ! (char *) xlrec + hsize, ! newlen); newlen += offsetof(HeapTupleHeaderData, t_bits); htup->t_infomask2 = xlhdr.t_infomask2; htup->t_infomask = xlhdr.t_infomask; --- 6843,6874 ---- SizeOfHeapHeader); htup = &tbuf.hdr; MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData)); ! ! /* ! * If the record is EWT then decode it. ! */ ! if (xlrec->flags & XL_HEAP_UPDATE_DELTA_ENCODED) ! { ! /* ! * PG93FORMAT: Header + Control byte + history reference (2 - 3)bytes ! * + New data (1 byte length + variable data)+ ... ! */ ! PGLZ_Header *encoded_data = (PGLZ_Header *) (((char *) xlrec) + hsize); ! ! oldtup.t_data = oldtupdata; ! newtup.t_data = htup; ! ! heap_delta_decode((char *) encoded_data, &oldtup, &newtup); ! newlen = newtup.t_len; ! } ! else ! { ! /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ ! memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits), ! (char *) xlrec + hsize, ! newlen); ! } ! newlen += offsetof(HeapTupleHeaderData, t_bits); htup->t_infomask2 = xlhdr.t_infomask2; htup->t_infomask = xlhdr.t_infomask; *************** *** 6814,6820 **** newsame:; if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); ! if (xlrec->new_all_visible_cleared) PageClearAllVisible(page); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ --- 6884,6890 ---- if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); ! 
if (xlrec->flags & XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ *** a/src/backend/access/transam/README --- b/src/backend/access/transam/README *************** *** 665,670 **** then restart recovery. This is part of the reason for not writing a WAL --- 665,784 ---- entry until we've successfully done the original action. + Encoded WAL Tuple (EWT) + ----------------------- + + Delta Encoded WAL Tuple (EWT) eliminates the need for copying the entire tuple + to WAL for the update operation. EWT is constructed using pglz by comparing + old and new versions of the tuple w.r.t. column boundaries. It contains the data + from the new tuple for modified columns and a reference [Offset,Length] to the old tuple + version for unchanged columns. + + + EWT Format + ---------- + + Header + Control byte + History Reference (2 - 3)bytes + + New data (1 byte length + variable data) + ... + + + Header: + + The header is the same as PGLZ_Header, which is used to store the compressed length + and raw length. + + Control byte: + + The first byte after the header tells what to do the next 8 times. We call this + the control byte. + + + History Reference: + + A set bit in the control byte means that a tag of 2-3 bytes follows. + A tag contains information to copy some bytes from the old tuple version to + the current location in the output. + + Details about 2-3 byte Tag + A 2 byte tag is used when the length of History data + (unchanged data from old tuple version) is less than 18. + A 3 byte tag is used when the length of History data + (unchanged data from old tuple version) is greater than or equal to 18. + The maximum length that can be represented by one Tag is 273. + + Let's call the three tag bytes T1, T2 and T3. The position of the data + to copy is coded as an offset from the old tuple. + + The offset is in the upper nibble of T1 and in T2. + The length is in the lower nibble of T1. 
+ + So the 16 bits of a 2 byte tag are coded as + + 7---T1--0 7---T2--0 + OOOO LLLL OOOO OOOO + + This limits the offset to 1-4095 (12 bits) and the length to 3-18 (4 bits) + because 3 is always added to it. + + In the actual implementation, the 2 byte tag's length is limited to 3-17, + because the value 0xF in the length nibble has special meaning. It means + that the following byte (T3) has to be added to the length value of 18. + That makes total limits of 1-4095 for offset and 3-273 for length. + + + New data: + + An unset bit in the control byte represents modified data of new tuple version. + The first byte represents the length [0-255] of the modified data, followed by the + modified data of corresponding length. + + 7---T1--0 7---T2--0 ... + LLLL LLLL DDDD DDDD ... + + Data bytes repeat until the length of the new data. + + + L - Length + O - Offset + D - Data + + + Encoding Mechanism for EWT + -------------------------- + Copy the bitmap data from new tuple to the EWT (Encoded WAL Tuple) + and loop for all attributes to find any modifications in the attributes. + The unmodified data is encoded as a History Reference in EWT and the + modified data (if NOT NULL) is encoded as New Data in EWT. + + The offset values are calculated with respect to the tuple t_hoff value. + The maximum encoded data length is 75% of the original data (at the default + compression ratio of 25%); if the encoded output is longer than that, the original + tuple (new tuple version) is stored directly in the WAL record. + + + Decoding Mechanism for EWT + -------------------------- + Skip the header, read one control byte, and process the next 8 items + (or as many as remain in the compressed input). Check each control bit: + if the bit is set, it is a History Reference, which means the next + 2 - 3 byte tag provides the offset and length of the history match. + + Use the offset and corresponding length to copy data from old tuple + version to new tuple. 
If the control bit is unset, then it is + New Data, which means the first byte contains the length [0-255] + of the modified data, followed by the modified data of the length + specified in the first byte. + + + Constraints for EWT + -------------------- + 1. Delta encoding is allowed only when the update stays on the same page and + the buffer doesn't need a backup block when full-page writes are on. + 2. Only old tuples with length less than PGLZ_HISTORY_SIZE are allowed for encoding. + 3. Old and new tuple versions must not differ in length by more than 50% + to be allowed for encoding. + + Asynchronous Commit ------------------- *** a/src/backend/access/transam/xlog.c --- b/src/backend/access/transam/xlog.c *************** *** 1209,1214 **** begin:; --- 1209,1236 ---- } /* + * Determine whether the buffer referenced has to be backed up. Since we don't + * yet have the insert lock, fullPageWrites and forcePageWrites could change + * later, but will not cause any problem because this function is used only to + * identify whether EWT is required for WAL update. + */ + bool + XLogCheckBufferNeedsBackup(Buffer buffer) + { + bool doPageWrites; + Page page; + + page = BufferGetPage(buffer); + + doPageWrites = XLogCtl->Insert.fullPageWrites || XLogCtl->Insert.forcePageWrites; + + if (doPageWrites && PageGetLSN(page) <= RedoRecPtr) + return true; /* buffer requires backup */ + + return false; /* buffer does not need to be backed up */ + } + + /* * Determine whether the buffer referenced by an XLogRecData item has to * be backed up, and if so fill a BkpBlock struct for it. In any case * save the buffer's LSN at *lsn. *** a/src/backend/utils/adt/pg_lzcompress.c --- b/src/backend/utils/adt/pg_lzcompress.c *************** *** 362,367 **** do { \ --- 362,391 ---- } \ } while (0) + /* ---------- + * pglz_out_add - + * + * Outputs a reference tag of 1 byte with length and the new data + * to the destination buffer, including the appropriate control bit. 
+ * ---------- + */ + #define pglz_out_add(_ctrlp,_ctrlb,_ctrl,_buf,_len,_byte) \ + do { \ + int32 _maddlen; \ + int32 _addtotal_len = (_len); \ + while (_addtotal_len > 0) \ + { \ + _maddlen = _addtotal_len > 255 ? 255 : _addtotal_len; \ + pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ + _ctrl <<= 1; \ + (_buf)[0] = (unsigned char)(_maddlen); \ + (_buf) += 1; \ + memcpy((_buf), (_byte), _maddlen); \ + (_buf) += _maddlen; \ + (_byte) += _maddlen; \ + _addtotal_len -= _maddlen; \ + } \ + } while (0) /* ---------- * pglz_find_match - *************** *** 471,476 **** pglz_find_match(PGLZ_HistEntry **hstart, const char *input, const char *end, --- 495,539 ---- return 0; } + /* ---------- + * pglz_find_match - + * + * Lookup the history table if the actual input stream matches + * another sequence of characters, starting somewhere earlier + * in the input buffer. + * ---------- + */ + static inline int + pglz_find_match_with_history(const char *input, const char *end, + const char *history, const char *hend, int *lenp) + { + const char *ip = input; + const char *hp = history; + + /* + * Determine length of match. A better match must be larger than the best + * so far. And if we already have a match of 16 or more bytes, it's worth + * the call overhead to use memcmp() to check if this match is equal for + * the same size. After that we must fallback to character by character + * comparison to know the exact position where the diff occurred. + */ + while (ip < end && hp < hend && *ip == *hp && *lenp < PGLZ_MAX_MATCH) + { + (*lenp)++; + ip++; + hp++; + } + + /* + * Return match information only if it results at least in one byte + * reduction. 
+ */ + if (*lenp > 2) + return 1; + + return 0; + } + /* ---------- * pglz_compress - *************** *** 637,642 **** pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, --- 700,895 ---- return true; } + /* ---------- + * pglz_compress_with_history + * + * Like pglz_compress, but performs delta encoding rather than compression. + * The references are offsets from the start of history data, rather + * than current output position. 'hoffsets' and 'newoffsets' are array of + * offsets in the history and source to consider. We scan the history + * string based on attribute offsets for possible matches with source string. + * + * For attributes having NULL value, the offset will be same as next attribute + * offset. When old tuple contains NULL and new tuple has non-NULL value, + * it will copy it as New Data in Encoded WAL Tuple. When new tuple has NULL + * value and old tuple has non-NULL value, the old tuple value will be ignored. + * ---------- + */ + bool + pglz_compress_with_history(const char *source, int32 slen, + const char *history, int32 hlen, + int32 *newoffsets, int32 *hoffsets, int32 noffsets, + int32 newbitmaplen, int32 hbitmaplen, + PGLZ_Header *dest, const PGLZ_Strategy *strategy) + { + unsigned char *bp = ((unsigned char *) dest) + sizeof(PGLZ_Header); + unsigned char *bstart = bp; + const char *dp = source; + const char *dend = source + slen; + unsigned char ctrl_dummy = 0; + unsigned char *ctrlp = &ctrl_dummy; + unsigned char ctrlb = 0; + unsigned char ctrl = 0; + bool found_match = false; + int32 match_len = 0; + int32 match_off; + int32 result_size; + int32 result_max; + int i, + len; + int32 need_rate; + const char *hp = history; + const char *hend = history + hlen; + + /* + * Tuples of length greater than PGLZ_HISTORY_SIZE are not allowed for + * delta encode as this is the maximum size of history offset. + */ + if (hlen >= PGLZ_HISTORY_SIZE) + return false; + + /* + * Our fallback strategy is the default. 
+ */ + if (strategy == NULL) + strategy = PGLZ_strategy_default; + + /* + * If the strategy forbids compression (at all or if source chunk size out + * of range), fail. + */ + if (strategy->match_size_good <= 0 || + slen < strategy->min_input_size || + slen > strategy->max_input_size) + return false; + + /* + * Save the original source size in the header. + */ + dest->rawsize = slen; + + need_rate = strategy->min_comp_rate; + if (need_rate < 0) + need_rate = 0; + else if (need_rate > 99) + need_rate = 99; + + /* + * Compute the maximum result size allowed by the strategy, namely the + * input size minus the minimum wanted compression rate. This had better + * be <= slen, else we might overrun the provided output buffer. + */ + if (slen > (INT_MAX / 100)) + { + /* Approximate to avoid overflow */ + result_max = (slen / 100) * (100 - need_rate); + } + else + result_max = (slen * (100 - need_rate)) / 100; + + /* + * Compress the source directly into the output buffer until bitmaplen. + */ + if ((bp + newbitmaplen + 2) - bstart >= result_max) + return false; + + pglz_out_add(ctrlp, ctrlb, ctrl, bp, newbitmaplen, dp); + + /* + * Loop through all attributes offsets, if the attribute data differs with + * history referring offsets, store the [Offset,Length] reffering history + * version till the match and store the changed data as New data. We need + * to accumulate all the matched attributes till an unmatched one is + * found. For the last attribute if it is matched, directly store its + * Offset. It can be improved for accumulation of unmatched attributes. + */ + match_off = hbitmaplen; + hp = history + hbitmaplen; + for (i = 0; i < noffsets; i++) + { + dend = source + ((i + 1 == noffsets) ? slen : newoffsets[i + 1] + newbitmaplen); + hend = history + ((i + 1 == noffsets) ? hlen : hoffsets[i + 1] + hbitmaplen); + + MATCH_AGAIN: + + /* If we already exceeded the maximum result size, fail. 
*/ + if (bp - bstart >= result_max) + return false; + + /* + * Try to find a match in the history. It can match maximum + * PGLZ_MAX_MATCH in one pass as history tag can be of 3 bytes. For + * match greater than PGLZ_MAX_MATCH, it need to do it in multiple + * passes (MATCH_AGAIN). + */ + if (pglz_find_match_with_history(dp + match_len, dend, hp + match_len, + hend, &match_len)) + { + found_match = true; + + /* Finding the maximum match across the offsets */ + if ((i + 1 == noffsets) + || ((dp + match_len) < dend) + || ((hp + match_len < hend))) + { + /* + * Create the tag and add history entries for all matched + * characters. + */ + pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off); + match_off += match_len; + dp += match_len; + hp += match_len; + + if (match_len == PGLZ_MAX_MATCH) + { + match_len = 0; + goto MATCH_AGAIN; + } + else + { + hp = hend; + match_off = hend - history; + match_len = 0; + } + } + } + else + { + hp = hend; + match_off = hend - history; + match_len = 0; + } + + /* copy the unmatched data to output buffer directly from source */ + len = dend - (dp + match_len); + if ((bp + len + 2) - bstart >= result_max) + return false; + + pglz_out_add(ctrlp, ctrlb, ctrl, bp, len, dp); + } + + if (!found_match) + return false; + + /* + * Write out the last control byte and check that we haven't overrun the + * output size allowed by the strategy. + */ + *ctrlp = ctrlb; + result_size = bp - bstart; + + #ifdef DELTA_DEBUG + elog(LOG, "old %d new %d compressed %d", hlen, slen, result_size); + #endif + + /* + * Success - need only fill in the actual length of the compressed datum. 
+ */ + SET_VARSIZE_COMPRESSED(dest, result_size + sizeof(PGLZ_Header)); + + return true; + } /* ---------- * pglz_decompress - *************** *** 647,661 **** pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, void pglz_decompress(const PGLZ_Header *source, char *dest) { const unsigned char *sp; const unsigned char *srcend; unsigned char *dp; unsigned char *destend; sp = ((const unsigned char *) source) + sizeof(PGLZ_Header); ! srcend = ((const unsigned char *) source) + VARSIZE(source); dp = (unsigned char *) dest; ! destend = dp + source->rawsize; while (sp < srcend && dp < destend) { --- 900,937 ---- void pglz_decompress(const PGLZ_Header *source, char *dest) { + pglz_decompress_with_history((char *) source, dest, NULL, NULL); + } + + /* ---------- + * pglz_decompress_with_history - + * + * Decompresses source into dest. + * To decompress, it uses history if provided. + * ---------- + */ + void + pglz_decompress_with_history(const char *source, char *dest, uint32 *destlen, + const char *history) + { + PGLZ_Header src; const unsigned char *sp; const unsigned char *srcend; unsigned char *dp; unsigned char *destend; + /* To avoid the unaligned access of PGLZ_Header */ + memcpy((char *) &src, source, sizeof(PGLZ_Header)); + sp = ((const unsigned char *) source) + sizeof(PGLZ_Header); ! srcend = ((const unsigned char *) source) + VARSIZE(&src); dp = (unsigned char *) dest; ! destend = dp + src.rawsize; ! ! if (destlen) ! { ! *destlen = src.rawsize; ! } while (sp < srcend && dp < destend) { *************** *** 665,670 **** pglz_decompress(const PGLZ_Header *source, char *dest) --- 941,947 ---- */ unsigned char ctrl = *sp++; int ctrlc; + int32 len; for (ctrlc = 0; ctrlc < 8 && sp < srcend; ctrlc++) { *************** *** 677,683 **** pglz_decompress(const PGLZ_Header *source, char *dest) * coded as 18, another extension tag byte tells how much * longer the match really was (0-255). 
*/ - int32 len; int32 off; len = (sp[0] & 0x0f) + 3; --- 954,959 ---- *************** *** 699,726 **** pglz_decompress(const PGLZ_Header *source, char *dest) break; } ! /* ! * Now we copy the bytes specified by the tag from OUTPUT to ! * OUTPUT. It is dangerous and platform dependent to use ! * memcpy() here, because the copied areas could overlap ! * extremely! ! */ ! while (len--) { ! *dp = dp[-off]; ! dp++; } } else { ! /* ! * An unset control bit means LITERAL BYTE. So we just copy ! * one from INPUT to OUTPUT. ! */ ! if (dp >= destend) /* check for buffer overrun */ ! break; /* do not clobber memory */ ! *dp++ = *sp++; } /* --- 975,1030 ---- break; } ! if (history) { ! /* ! * Now we copy the bytes specified by the tag from history ! * to OUTPUT. ! */ ! memcpy(dp, history + off, len); ! dp += len; ! } ! else ! { ! /* ! * Now we copy the bytes specified by the tag from OUTPUT ! * to OUTPUT. It is dangerous and platform dependent to ! * use memcpy() here, because the copied areas could ! * overlap extremely! ! */ ! while (len--) ! { ! *dp = dp[-off]; ! dp++; ! } } } else { ! if (history) ! { ! len = sp[0]; ! sp++; ! /* ! * Now we copy the bytes specified by the len from source ! * to OUTPUT. ! */ ! memcpy(dp, sp, len); ! sp += len; ! dp += len; ! } ! else ! { ! /* ! * An unset control bit means LITERAL BYTE. So we just ! * copy one from INPUT to OUTPUT. ! */ ! if (dp >= destend) /* check for buffer overrun */ ! break; /* do not clobber memory */ ! ! *dp++ = *sp++; ! 
} } /* *** a/src/backend/utils/misc/guc.c --- b/src/backend/utils/misc/guc.c *************** *** 123,128 **** extern int CommitSiblings; --- 123,129 ---- extern char *default_tablespace; extern char *temp_tablespaces; extern bool synchronize_seqscans; + extern int wal_update_compression_ratio; extern int ssl_renegotiation_limit; extern char *SSLCipherSuites; *************** *** 2382,2387 **** static struct config_int ConfigureNamesInt[] = --- 2383,2399 ---- NULL, NULL, NULL }, + { + /* Not for general use */ + {"wal_update_compression_ratio", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Sets the compression ratio of delta record for wal update"), + NULL, + }, + &wal_update_compression_ratio, + 25, 1, 99, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL *** a/src/include/access/heapam_xlog.h --- b/src/include/access/heapam_xlog.h *************** *** 147,159 **** typedef struct xl_heap_update TransactionId old_xmax; /* xmax of the old tuple */ TransactionId new_xmax; /* xmax of the new tuple */ ItemPointerData newtid; /* new inserted tuple id */ ! uint8 old_infobits_set; /* infomask bits to set on old tuple */ ! bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ ! bool new_all_visible_cleared; /* same for the page of newtid */ /* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_update; ! #define SizeOfHeapUpdate (offsetof(xl_heap_update, new_all_visible_cleared) + sizeof(bool)) /* * This is what we need to know about vacuum page cleanup/redirect --- 147,168 ---- TransactionId old_xmax; /* xmax of the old tuple */ TransactionId new_xmax; /* xmax of the new tuple */ ItemPointerData newtid; /* new inserted tuple id */ ! uint8 old_infobits_set; /* infomask bits to set on old tuple */ ! int flags; /* flag bits, see below */ /* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_update; ! 
#define XL_HEAP_UPDATE_ALL_VISIBLE_CLEARED 0x01 /* Indicates that the old ! * page's all-visible ! * bit was cleared */ ! #define XL_HEAP_UPDATE_NEW_ALL_VISIBLE_CLEARED 0x02 /* Indicates that the new ! * page's all-visible ! * bit was cleared */ ! #define XL_HEAP_UPDATE_DELTA_ENCODED 0x04 /* Indicates that the ! * update record is ! * delta encoded */ ! ! #define SizeOfHeapUpdate (offsetof(xl_heap_update, flags) + sizeof(int)) /* * This is what we need to know about vacuum page cleanup/redirect *** a/src/include/access/htup_details.h --- b/src/include/access/htup_details.h *************** *** 687,692 **** extern HeapTuple heap_modify_tuple(HeapTuple tuple, --- 687,697 ---- extern void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull); + extern bool heap_delta_encode(TupleDesc tupleDesc, HeapTuple oldtup, + HeapTuple newtup, char *encdata); + extern void heap_delta_decode(char *encdata, HeapTuple oldtup, + HeapTuple newtup); + /* these three are deprecated versions of the three above: */ extern HeapTuple heap_formtuple(TupleDesc tupleDescriptor, Datum *values, char *nulls); *** a/src/include/access/xlog.h --- b/src/include/access/xlog.h *************** *** 261,266 **** typedef struct CheckpointStatsData --- 261,267 ---- extern CheckpointStatsData CheckpointStats; extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata); + extern bool XLogCheckBufferNeedsBackup(Buffer buffer); extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); *** a/src/include/utils/pg_lzcompress.h --- b/src/include/utils/pg_lzcompress.h *************** *** 107,112 **** extern const PGLZ_Strategy *const PGLZ_strategy_always; --- 107,119 ---- */ extern bool pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, const PGLZ_Strategy *strategy); + extern bool pglz_compress_with_history(const char *source, int32 slen, + const char *history, int32 hlen, + int32 *newoffsets, int32 
*hoffsets, int32 noffsets, + int32 newbitmaplen, int32 hbitmaplen, + PGLZ_Header *dest, const PGLZ_Strategy *strategy); extern void pglz_decompress(const PGLZ_Header *source, char *dest); + extern void pglz_decompress_with_history(const char *source, char *dest, + uint32 *destlen, const char *history); #endif /* _PG_LZCOMPRESS_H_ */ *** a/src/test/regress/expected/update.out --- b/src/test/regress/expected/update.out *************** *** 97,99 **** SELECT a, b, char_length(c) FROM update_test; --- 97,169 ---- (2 rows) DROP TABLE update_test; + -- + -- Test to update contiguous and non-contiguous columns + -- + DROP TABLE IF EXISTS update_test; + NOTICE: table "update_test" does not exist, skipping + CREATE TABLE update_test ( + bser bigserial, + bln boolean, + ename VARCHAR(25), + perf_f float(8), + grade CHAR, + dept CHAR(5) NOT NULL, + dob DATE, + idnum INT, + addr VARCHAR(30) NOT NULL, + destn CHAR(6), + Gend CHAR, + samba BIGINT, + hgt float, + ctime TIME + ); + INSERT INTO update_test VALUES ( + nextval('update_test_bser_seq'::regclass), + TRUE, + 'Test', + 7.169, + 'B', + 'CSD', + '2000-01-01', + 520, + 'road2, + streeeeet2, + city2', + 'dcy2', + 'M', + 12000, + 50.4, + '00:00:00.0' + ); + SELECT * from update_test; + bser | bln | ename | perf_f | grade | dept | dob | idnum | addr | destn | gend | samba | hgt | ctime + ------+-----+-------+--------+-------+-------+------------+-------+-----------------------------+--------+------+-------+------+---------- + 1 | t | Test | 7.169 | B | CSD | 01-01-2000 | 520 | road2, +| dcy2 | M | 12000 | 50.4 | 00:00:00 + | | | | | | | | streeeeet2,+| | | | | + | | | | | | | | city2 | | | | | + (1 row) + + -- update first column + UPDATE update_test SET bser = bser - 1 + 1; + -- update middle column + UPDATE update_test SET perf_f = 8.9; + -- update last column + UPDATE update_test SET ctime = '00:00:00.1'; + -- update 3 contiguous columns + UPDATE update_test SET destn = 'dcy2', samba = 0 WHERE Gend = 'M' and dept = 'CSD'; + -- 
update two non-contiguous columns + UPDATE update_test SET destn = 'moved', samba = 0; + UPDATE update_test SET bln = FALSE, hgt = 10.1; + -- update causing some column alignment difference + UPDATE update_test SET ename = 'Tes'; + UPDATE update_test SET dept = 'Test'; + SELECT * from update_test; + bser | bln | ename | perf_f | grade | dept | dob | idnum | addr | destn | gend | samba | hgt | ctime + ------+-----+-------+--------+-------+-------+------------+-------+-----------------------------+--------+------+-------+------+------------ + 1 | f | Tes | 8.9 | B | Test | 01-01-2000 | 520 | road2, +| moved | M | 0 | 10.1 | 00:00:00.1 + | | | | | | | | streeeeet2,+| | | | | + | | | | | | | | city2 | | | | | + (1 row) + + DROP TABLE update_test; *** a/src/test/regress/sql/update.sql --- b/src/test/regress/sql/update.sql *************** *** 59,61 **** UPDATE update_test SET c = repeat('x', 10000) WHERE c = 'car'; --- 59,128 ---- SELECT a, b, char_length(c) FROM update_test; DROP TABLE update_test; + + + -- + -- Test to update contiguous and non-contiguous columns + -- + + DROP TABLE IF EXISTS update_test; + CREATE TABLE update_test ( + bser bigserial, + bln boolean, + ename VARCHAR(25), + perf_f float(8), + grade CHAR, + dept CHAR(5) NOT NULL, + dob DATE, + idnum INT, + addr VARCHAR(30) NOT NULL, + destn CHAR(6), + Gend CHAR, + samba BIGINT, + hgt float, + ctime TIME + ); + + INSERT INTO update_test VALUES ( + nextval('update_test_bser_seq'::regclass), + TRUE, + 'Test', + 7.169, + 'B', + 'CSD', + '2000-01-01', + 520, + 'road2, + streeeeet2, + city2', + 'dcy2', + 'M', + 12000, + 50.4, + '00:00:00.0' + ); + + SELECT * from update_test; + + -- update first column + UPDATE update_test SET bser = bser - 1 + 1; + + -- update middle column + UPDATE update_test SET perf_f = 8.9; + + -- update last column + UPDATE update_test SET ctime = '00:00:00.1'; + + -- update 3 contiguous columns + UPDATE update_test SET destn = 'dcy2', samba = 0 WHERE Gend = 'M' and dept = 'CSD'; + + -- 
update two non-contiguous columns + UPDATE update_test SET destn = 'moved', samba = 0; + UPDATE update_test SET bln = FALSE, hgt = 10.1; + + -- update causing some column alignment difference + UPDATE update_test SET ename = 'Tes'; + UPDATE update_test SET dept = 'Test'; + + SELECT * from update_test; + DROP TABLE update_test;