Re: svn commit: r1223035 - /subversion/trunk/subversion/libsvn_delta/xdelta.c

From: Hyrum K Wright <hyrum.wright_at_wandisco.com>
Date: Mon, 26 Dec 2011 14:27:46 -0600

On Sat, Dec 24, 2011 at 6:26 PM, <stefan2_at_apache.org> wrote:
> Author: stefan2
> Date: Sun Dec 25 00:26:14 2011
> New Revision: 1223035
>
> URL: http://svn.apache.org/viewvc?rev=1223035&view=rev
> Log:
> Store 32 bit offsets in our hash table even under 64 bits
> (our delta window size much much smaller then 4GB).
> That reduces the hash table size by 50% from 32to 16KB
> and makes it easier to fit into the CPU's L1 cache.

Perhaps a comment to this effect should be in the code somewhere? I
can envision the day where the delta window size changes (or becomes
variable) and this could be a large "gotcha" when that happens.

-Hyrum

> Also, use a proper define instead of casted -1 all over the place.
>
> * subversion/libsvn_delta/xdelta.c
> (NO_POSITION): new define
> (block, blocks, hash_func, add_block, find_block):
> replace apr_size_t with apr_uint32_t for offsets within delta windows
> (init_blocks_table, find_match): use NO_POSITION define instead of -1
>
> Modified:
> subversion/trunk/subversion/libsvn_delta/xdelta.c
>
> Modified: subversion/trunk/subversion/libsvn_delta/xdelta.c
> URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_delta/xdelta.c?rev=1223035&r1=1223034&r2=1223035&view=diff
> ==============================================================================
> --- subversion/trunk/subversion/libsvn_delta/xdelta.c (original)
> +++ subversion/trunk/subversion/libsvn_delta/xdelta.c Sun Dec 25 00:26:14 2011
> @@ -44,6 +44,10 @@
> */
> #define MATCH_BLOCKSIZE 64
>
> +/* "no" / "invalid" / "unused" value for positions within the detla windows
> + */
> +#define NO_POSITION ((apr_uint32_t)-1)
> +
> /* Feed C_IN into the adler32 checksum and remove C_OUT at the same time.
> * This function may (and will) only be called for characters that are
> * MATCH_BLOCKSIZE positions apart.
> @@ -96,17 +100,17 @@ init_adler32(const char *data)
> struct block
> {
> apr_uint32_t adlersum;
> - apr_size_t pos;
> + apr_uint32_t pos; /* NO_POSITION -> block is not used */
> };
>
> /* A hash table, using open addressing, of the blocks of the source. */
> struct blocks
> {
> /* The largest valid index of slots. */
> - apr_size_t max;
> + apr_uint32_t max;
> /* Source buffer that the positions in SLOTS refer to. */
> const char* data;
> - /* The vector of blocks. A pos value of (apr_size_t)-1 represents an unused
> + /* The vector of blocks. A pos value of NO_POSITION represents an unused
> slot. */
> struct block *slots;
> };
> @@ -114,7 +118,7 @@ struct blocks
>
> /* Return a hash value calculated from the adler32 SUM, suitable for use with
> our hash table. */
> -static apr_size_t hash_func(apr_uint32_t sum)
> +static apr_uint32_t hash_func(apr_uint32_t sum)
> {
> /* Since the adl32 checksum have a bad distribution for the 11th to 16th
> bits when used for our small block size, we add some bits from the
> @@ -126,12 +130,12 @@ static apr_size_t hash_func(apr_uint32_t
> data into the table BLOCKS. Ignore true duplicates, i.e. blocks with
> actually the same content. */
> static void
> -add_block(struct blocks *blocks, apr_uint32_t adlersum, apr_size_t pos)
> +add_block(struct blocks *blocks, apr_uint32_t adlersum, apr_uint32_t pos)
> {
> - apr_size_t h = hash_func(adlersum) & blocks->max;
> + apr_uint32_t h = hash_func(adlersum) & blocks->max;
>
> /* This will terminate, since we know that we will not fill the table. */
> - for (; blocks->slots[h].pos != (apr_size_t)-1; h = (h + 1) & blocks->max)
> + for (; blocks->slots[h].pos != NO_POSITION; h = (h + 1) & blocks->max)
> if (blocks->slots[h].adlersum == adlersum)
> if (memcmp(blocks->data + blocks->slots[h].pos, blocks->data + pos,
> MATCH_BLOCKSIZE) == 0)
> @@ -143,21 +147,21 @@ add_block(struct blocks *blocks, apr_uin
>
> /* Find a block in BLOCKS with the checksum ADLERSUM and matching the content
> at DATA, returning its position in the source data. If there is no such
> - block, return (apr_size_t)-1. */
> -static apr_size_t
> + block, return NO_POSITION. */
> +static apr_uint32_t
> find_block(const struct blocks *blocks,
> apr_uint32_t adlersum,
> const char* data)
> {
> - apr_size_t h = hash_func(adlersum) & blocks->max;
> + apr_uint32_t h = hash_func(adlersum) & blocks->max;
>
> - for (; blocks->slots[h].pos != (apr_size_t)-1; h = (h + 1) & blocks->max)
> + for (; blocks->slots[h].pos != NO_POSITION; h = (h + 1) & blocks->max)
> if (blocks->slots[h].adlersum == adlersum)
> if (memcmp(blocks->data + blocks->slots[h].pos, data,
> MATCH_BLOCKSIZE) == 0)
> return blocks->slots[h].pos;
>
> - return (apr_size_t)-1;
> + return NO_POSITION;
> }
>
> /* Initialize the matches table from DATA of size DATALEN. This goes
> @@ -187,7 +191,7 @@ init_blocks_table(const char *data,
> {
> /* Avoid using an indeterminate value in the lookup. */
> blocks->slots[i].adlersum = 0;
> - blocks->slots[i].pos = (apr_size_t)-1;
> + blocks->slots[i].pos = NO_POSITION;
> }
>
> /* If there is an odd block at the end of the buffer, we will
> @@ -252,7 +256,7 @@ find_match(const struct blocks *blocks,
> apos = find_block(blocks, rolling, b + bpos);
>
> /* See if we have a match. */
> - if (apos == (apr_size_t)-1)
> + if (apos == NO_POSITION)
> return 0;
>
> /* Extend the match forward as far as possible */
>
>

-- 
uberSVN: Apache Subversion Made Easy
http://www.uberSVN.com/

Received on 2011-12-26 21:28:33 CET

This message: [ Message body ]
Next message: Stefan Fuhrmann: "Directory deltification"
Previous message: neels_at_apache.org: "[svnbench] Revision: 1224666 compiled Dec 26 2011, 00:21:25"

Contemporary messages sorted: [ by date ] [ by thread ] [ by subject ] [ by author ] [ by messages with attachments ]