[svn.haxx.se] · SVN Dev · SVN Users · SVN Org · TSVN Dev · TSVN Users · Subclipse Dev · Subclipse Users · this month's index

Re: svn commit: r1223035 - /subversion/trunk/subversion/libsvn_delta/xdelta.c

From: Hyrum K Wright <hyrum.wright_at_wandisco.com>
Date: Mon, 26 Dec 2011 14:27:46 -0600

On Sat, Dec 24, 2011 at 6:26 PM, <stefan2_at_apache.org> wrote:
> Author: stefan2
> Date: Sun Dec 25 00:26:14 2011
> New Revision: 1223035
>
> URL: http://svn.apache.org/viewvc?rev=1223035&view=rev
> Log:
> Store 32 bit offsets in our hash table even under 64 bits
> (our delta window size much much smaller then 4GB).
> That reduces the hash table size by 50% from 32to 16KB
> and makes it easier to fit into the CPU's L1 cache.

Perhaps a comment to this effect should be in the code somewhere? I
can envision the day where the delta window size changes (or becomes
variable) and this could be a large "gotcha" when that happens.

-Hyrum

> Also, use a proper define instead of casted -1 all over the place.
>
> * subversion/libsvn_delta/xdelta.c
>  (NO_POSITION): new define
>  (block, blocks, hash_func, add_block, find_block):
>   replace apr_size_t with apr_uint32_t for offsets within delta windows
>  (init_blocks_table, find_match): use NO_POSITION define instead of -1
>
> Modified:
>    subversion/trunk/subversion/libsvn_delta/xdelta.c
>
> Modified: subversion/trunk/subversion/libsvn_delta/xdelta.c
> URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_delta/xdelta.c?rev=1223035&r1=1223034&r2=1223035&view=diff
> ==============================================================================
> --- subversion/trunk/subversion/libsvn_delta/xdelta.c (original)
> +++ subversion/trunk/subversion/libsvn_delta/xdelta.c Sun Dec 25 00:26:14 2011
> @@ -44,6 +44,10 @@
>  */
>  #define MATCH_BLOCKSIZE 64
>
> +/* "no" / "invalid" / "unused" value for positions within the detla windows
> + */
> +#define NO_POSITION ((apr_uint32_t)-1)
> +
>  /* Feed C_IN into the adler32 checksum and remove C_OUT at the same time.
>  * This function may (and will) only be called for characters that are
>  * MATCH_BLOCKSIZE positions apart.
> @@ -96,17 +100,17 @@ init_adler32(const char *data)
>  struct block
>  {
>   apr_uint32_t adlersum;
> -  apr_size_t pos;
> +  apr_uint32_t pos;                    /* NO_POSITION -> block is not used */
>  };
>
>  /* A hash table, using open addressing, of the blocks of the source. */
>  struct blocks
>  {
>   /* The largest valid index of slots. */
> -  apr_size_t max;
> +  apr_uint32_t max;
>   /* Source buffer that the positions in SLOTS refer to. */
>   const char* data;
> -  /* The vector of blocks.  A pos value of (apr_size_t)-1 represents an unused
> +  /* The vector of blocks.  A pos value of NO_POSITION represents an unused
>      slot. */
>   struct block *slots;
>  };
> @@ -114,7 +118,7 @@ struct blocks
>
>  /* Return a hash value calculated from the adler32 SUM, suitable for use with
>    our hash table. */
> -static apr_size_t hash_func(apr_uint32_t sum)
> +static apr_uint32_t hash_func(apr_uint32_t sum)
>  {
>   /* Since the adl32 checksum have a bad distribution for the 11th to 16th
>      bits when used for our small block size, we add some bits from the
> @@ -126,12 +130,12 @@ static apr_size_t hash_func(apr_uint32_t
>    data into the table BLOCKS.  Ignore true duplicates, i.e. blocks with
>    actually the same content. */
>  static void
> -add_block(struct blocks *blocks, apr_uint32_t adlersum, apr_size_t pos)
> +add_block(struct blocks *blocks, apr_uint32_t adlersum, apr_uint32_t pos)
>  {
> -  apr_size_t h = hash_func(adlersum) & blocks->max;
> +  apr_uint32_t h = hash_func(adlersum) & blocks->max;
>
>   /* This will terminate, since we know that we will not fill the table. */
> -  for (; blocks->slots[h].pos != (apr_size_t)-1; h = (h + 1) & blocks->max)
> +  for (; blocks->slots[h].pos != NO_POSITION; h = (h + 1) & blocks->max)
>     if (blocks->slots[h].adlersum == adlersum)
>       if (memcmp(blocks->data + blocks->slots[h].pos, blocks->data + pos,
>                  MATCH_BLOCKSIZE) == 0)
> @@ -143,21 +147,21 @@ add_block(struct blocks *blocks, apr_uin
>
>  /* Find a block in BLOCKS with the checksum ADLERSUM and matching the content
>    at DATA, returning its position in the source data.  If there is no such
> -   block, return (apr_size_t)-1. */
> -static apr_size_t
> +   block, return NO_POSITION. */
> +static apr_uint32_t
>  find_block(const struct blocks *blocks,
>            apr_uint32_t adlersum,
>            const char* data)
>  {
> -  apr_size_t h = hash_func(adlersum) & blocks->max;
> +  apr_uint32_t h = hash_func(adlersum) & blocks->max;
>
> -  for (; blocks->slots[h].pos != (apr_size_t)-1; h = (h + 1) & blocks->max)
> +  for (; blocks->slots[h].pos != NO_POSITION; h = (h + 1) & blocks->max)
>     if (blocks->slots[h].adlersum == adlersum)
>       if (memcmp(blocks->data + blocks->slots[h].pos, data,
>                  MATCH_BLOCKSIZE) == 0)
>         return blocks->slots[h].pos;
>
> -  return (apr_size_t)-1;
> +  return NO_POSITION;
>  }
>
>  /* Initialize the matches table from DATA of size DATALEN.  This goes
> @@ -187,7 +191,7 @@ init_blocks_table(const char *data,
>     {
>       /* Avoid using an indeterminate value in the lookup. */
>       blocks->slots[i].adlersum = 0;
> -      blocks->slots[i].pos = (apr_size_t)-1;
> +      blocks->slots[i].pos = NO_POSITION;
>     }
>
>   /* If there is an odd block at the end of the buffer, we will
> @@ -252,7 +256,7 @@ find_match(const struct blocks *blocks,
>   apos = find_block(blocks, rolling, b + bpos);
>
>   /* See if we have a match.  */
> -  if (apos == (apr_size_t)-1)
> +  if (apos == NO_POSITION)
>     return 0;
>
>   /* Extend the match forward as far as possible */
>
>

-- 
uberSVN: Apache Subversion Made Easy
http://www.uberSVN.com/
Received on 2011-12-26 21:28:33 CET

This is an archived mail posted to the Subversion Dev mailing list.

This site is subject to the Apache Privacy Policy and the Apache Public Forum Archive Policy.