diff --git src/backend/utils/adt/pg_lzcompress.c src/backend/utils/adt/pg_lzcompress.c index 466982e..3980334 100644 --- src/backend/utils/adt/pg_lzcompress.c +++ src/backend/utils/adt/pg_lzcompress.c @@ -62,8 +62,10 @@ * Otherwise the first byte after the header tells what to do * the next 8 times. We call this the control byte. * - * An unset bit in the control byte means, that one uncompressed - * byte follows, which is copied from input to output. + * An unset bit in the control byte means, that one byte header + * and uncompressed 1-255 literals follow. The header includes + * the length of the literals, and the literals are just copied + * from input to output. * * A set bit in the control byte means, that a tag of 2-3 bytes * follows. A tag contains information to copy some bytes, that @@ -184,6 +186,8 @@ #define PGLZ_HISTORY_MASK (PGLZ_HISTORY_LISTS - 1) #define PGLZ_HISTORY_SIZE 4096 #define PGLZ_MAX_MATCH 273 +#define PGLZ_SKIP_SIZE 3 +#define PGLZ_HASH_GAP 8 /* ---------- @@ -322,16 +326,20 @@ do { \ /* ---------- - * pglz_out_literal - + * pglz_out_literals - * * Outputs a literal byte to the destination buffer including the * appropriate control bit. + * Outputs multiple literals from the source to the destination + * buffer including the appropriate control bit. * ---------- */ -#define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) \ +#define pglz_out_literals(_ctrlp,_ctrlb,_ctrl,_buf,_sp,_len) \ do { \ pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ - *(_buf)++ = (unsigned char)(_byte); \ + *(_buf)++ = (unsigned char)(_len); \ + memcpy(_buf, _sp, _len); \ + _buf += _len; \ _ctrl <<= 1; \ } while (0) @@ -468,6 +476,12 @@ pglz_find_match(PGLZ_HistEntry **hstart, const char *input, const char *end, return 1; } + /* + * This match information decides not to output a backward + * reference tag. + */ + *lenp = 0; + return 0; } @@ -586,37 +600,81 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, return false; /* - * Try to find a match in the history + * Remember a base position. A loop below skips some literals + * that might be incompressible, and the literals from the + * base to a next match are just copied to the output buffer. */ - if (pglz_find_match(hist_start, dp, dend, &match_len, - &match_off, good_match, good_drop)) + const char *anchor = dp; + + do { + /* + * Try to find a match in the history + */ + if (pglz_find_match(hist_start, dp, dend, &match_len, + &match_off, good_match, good_drop)) + { + /* + * Check literals backward to find as many matches + * as possible. + */ + while (dp > anchor && dp[-1] == dp[-match_off - 1] && + match_off < 0x0fff && match_len < PGLZ_MAX_MATCH) + { + dp--; + match_len++; + } + + found_match = true; + break; + } + + pglz_hist_add(hist_start, hist_entries, + hist_next, hist_recycle, + dp, dend); + + /* + * Skip PGLZ_SKIP_SIZE literals. + */ + dp += PGLZ_SKIP_SIZE; + + if (dp >= dend) { + dp = dend; + break; + } + } while (dp - anchor < CHAR_MAX - PGLZ_SKIP_SIZE); + + if (dp != anchor) + { + /* + * No match found. Copy skipped multiple literals. + */ + int32 len = dp - anchor; + pglz_out_literals(ctrlp, ctrlb, ctrl, bp, anchor, len); + } + + if (match_len > 0) { /* * Create the tag and add history entries for all matched * characters. */ pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off); - while (match_len--) + + /* + * Skip hash updates by PGLZ_HASH_GAP. As the updates eat + * much processor time, the large skip improves compression + * speeds with little loss of compression ratios. + */ + const char *hashp = dp; + + for (; hashp < dp + match_len; hashp += PGLZ_HASH_GAP) { pglz_hist_add(hist_start, hist_entries, hist_next, hist_recycle, - dp, dend); - dp++; /* Do not do this ++ in the line above! */ - /* The macro would do it four times - Jan. */ + hashp, dend); } - found_match = true; - } - else - { - /* - * No match found. Copy one literal byte. - */ - pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp); - pglz_hist_add(hist_start, hist_entries, - hist_next, hist_recycle, - dp, dend); - dp++; /* Do not do this ++ in the line above! */ - /* The macro would do it four times - Jan. */ + + dp += match_len; } } @@ -720,7 +778,10 @@ pglz_decompress(const PGLZ_Header *source, char *dest) if (dp >= destend) /* check for buffer overrun */ break; /* do not clobber memory */ - *dp++ = *sp++; + int32 len = *sp++; + memcpy(dp, sp, len); + sp += len; + dp += len; } /* diff --git src/include/utils/pg_lzcompress.h src/include/utils/pg_lzcompress.h index 4af24a3..5e5ab59 100644 --- src/include/utils/pg_lzcompress.h +++ src/include/utils/pg_lzcompress.h @@ -28,10 +28,10 @@ typedef struct PGLZ_Header * PGLZ_MAX_OUTPUT - * * Macro to compute the buffer size required by pglz_compress(). - * We allow 4 bytes for overrun before detecting compression failure. + * We allow 259 bytes for overrun before detecting compression failure. * ---------- */ -#define PGLZ_MAX_OUTPUT(_dlen) ((_dlen) + 4 + sizeof(PGLZ_Header)) +#define PGLZ_MAX_OUTPUT(_dlen) ((_dlen) + 259 + sizeof(PGLZ_Header)) /* ---------- * PGLZ_RAW_SIZE -