Index: subversion/libsvn_delta/vdelta.c
===================================================================
--- subversion/libsvn_delta/vdelta.c	(revision 12966)
+++ subversion/libsvn_delta/vdelta.c	(working copy)
@@ -20,10 +20,10 @@
 #include
 #include /* for APR_INLINE */
+#include /* NOTE(review): the header names appear stripped (angle brackets lost) in this copy of the patch; presumably <apr_hash.h> here given the apr_hash_* use below -- restore before applying */
 #include "svn_delta.h"
 #include "delta.h"
-
 /* ==================================================================== */
 
 /* Hash table for vdelta hashing.
@@ -302,7 +302,302 @@
     }
 }
 
+/* This is pseudo-adler32, and xdelta, as monotone does it.  The
+   adler32 has no prime modulus.  This is a direct translation of the
+   C++ code, hence the reason it looks a bit weird. */
+#define adler32_mask 0x0000ffff
+#define adler32_char_mask 0x000000ff
+
+/* Structure to store the state of our adler32 checksum.  S1 and S2 are
+   the two running sums and LEN is the number of bytes currently in the
+   window. */
+struct adler32
+{
+  apr_uint32_t s1;
+  apr_uint32_t s2;
+  apr_uint32_t len;
+  apr_uint32_t mask; /* NOTE(review): written by init_adler32 but never read anywhere in this patch; the adler32_mask macro is used directly instead -- confirm the field can go */
+};
+
+/* Feed C into the adler32 checksum. */
+
+static inline void
+adler32_in (struct adler32 *ad, const char c)
+{
+  ad->s1 += ((apr_uint32_t) (c)) & adler32_char_mask;
+  ad->s1 &= adler32_mask;
+  ad->s2 += ad->s1;
+  ad->s2 &= adler32_mask;
+  ad->len++;
+}
+
+/* Remove the result of byte C from the adler32 checksum. */
+
+static inline void
+adler32_out (struct adler32 *ad, const char c)
+{
+  ad->s1 -= ((apr_uint32_t) (c)) & adler32_char_mask;
+  ad->s1 &= adler32_mask;
+  ad->s2 -= (ad->len * (((apr_uint32_t) c) & adler32_char_mask)) + 1; /* the +1 undoes the contribution the s1 seed of 1 (see init_adler32) made to s2 for this byte */
+  ad->s2 &= adler32_mask;
+  --ad->len;
+}
+
+/* Return the current adler32 checksum in the adler32 structure. */
+
+static inline apr_uint32_t
+adler32_sum (const struct adler32 *ad)
+{
+  return (ad->s2 << 16) | (ad->s1);
+}
+
+/* Initialize an adler32 checksum structure with DATA, which has length
+   DATALEN.  Return the initialized structure.
*/
+
+static inline struct adler32 *
+init_adler32 (struct adler32 *ad, const char *data, apr_uint32_t datalen)
+{
+  ad->s1 = 1;
+  ad->s2 = 0;
+  ad->mask = adler32_mask;
+  ad->len = 0;
+  while (datalen--)
+    adler32_in (ad, *(data++));
+  return ad;
+}
+
+/* Match structure used in the xdelta matches hashtable.  This contains the
+   position and length of the initial match. */
+
+struct match
+{
+  apr_uint32_t pos;
+  apr_uint32_t len;
+};
+
+/* Initialize the matches table from DATA.  This goes through every
+   block of BLOCKSIZE bytes in the source and checksums it, inserting the
+   result into the MATCHES table. */
+
+static void
+init_matches_table (const char *data,
+                    apr_uint32_t datalen,
+                    apr_uint32_t blocksize,
+                    apr_hash_t *matches,
+                    apr_pool_t *pool)
+{
+  apr_uint32_t i = 0;
+  struct adler32 adler;
+  for (i = 0; i < datalen; i += blocksize)
+    {
+      apr_uint32_t step = ((i + blocksize) >= datalen) ? (datalen - i) : blocksize; /* last block may be shorter than BLOCKSIZE */
+      apr_uint32_t adlersum = adler32_sum (init_adler32 (&adler, data + i, step));
+      if (apr_hash_get (matches, &adlersum, sizeof (adlersum)) == NULL) /* first block with a given checksum wins: keep the earliest occurrence */
+        {
+          struct match *newmatch = apr_palloc (pool, sizeof (struct match));
+          apr_uint32_t *key = apr_palloc (pool, sizeof (apr_uint32_t));
+          newmatch->pos = i;
+          newmatch->len = step;
+          *key = adlersum;
+          apr_hash_set (matches, key, sizeof (*key),
+                        newmatch);
+        }
+    }
+}
+
+/* Try to find a match for the target data B in MATCHES, and then extend the
+   match as long as data in A and B at the match position continues to match.
+   We set the position in a we ended up in (in case we extended it backwards)
+   in APOSP, the length of the match in ALENP, and the amount to advance B in
+   BADVANCEP.  PENDING_INSERT is a pointer to a stringbuf pointer that is the
+   last insert operation that has not been committed yet to the delta stream,
+   if any.  This is used when extending the matches backwards, possibly
+   alleviating the need for the insert entirely.
*/
+
+static void
+find_match (apr_hash_t *matches,
+            const struct adler32 *rolling,
+            const char *a,
+            apr_uint32_t asize,
+            const char *b,
+            apr_uint32_t bsize,
+            apr_uint32_t bpos,
+            apr_uint32_t *aposp,
+            apr_uint32_t *alenp,
+            apr_uint32_t *badvancep,
+            svn_stringbuf_t **pending_insert)
+{
+  apr_uint32_t sum = adler32_sum (rolling);
+  apr_uint32_t alen, badvance, apos;
+  struct match *match;
+  apr_uint32_t tpos, tlen;
+
+  /* See if we have a match. */
+  match = apr_hash_get (matches, &sum, sizeof (sum));
+  if (match == NULL)
+    return;
+
+  /* See where our match started. */
+  tpos = match->pos;
+  tlen = match->len;
+
+  /* Make sure it's not a false match. */
+  if (memcmp (a + tpos, b + bpos, tlen) != 0) /* NOTE(review): if the rolling window near the end of B is shorter than TLEN, this reads past B + BSIZE -- verify bounds against the caller's window handling */
+    return;
+
+  apos = tpos;
+  alen = tlen;
+  badvance = tlen;
+
+  /* Extend the match forward as far as possible */
+  while ((apos + alen >= 0) /* NOTE(review): operands are unsigned, so both >= 0 tests are always true */
+         && (bpos + badvance >= 0)
+         && (apos + alen < asize)
+         && (bpos + badvance < bsize)
+         && (a[apos + alen] == b[bpos + badvance]))
+    {
+      ++alen;
+      ++badvance;
+    }
+
+  /* See if we can extend backwards into a previous insert hunk.  Each byte
+     reclaimed here removes one byte from the pending insert and grows the
+     source copy instead (badvance is NOT changed: B's cursor was already
+     past those bytes). */
+  if (*pending_insert)
+    {
+      while (apos > 0
+             && bpos > 0
+             && a[apos - 1] == b[bpos - 1]
+             && (*pending_insert)->len != 0)
+        {
+          svn_stringbuf_chop (*pending_insert, 1);
+          --apos;
+          --bpos;
+          ++alen;
+        }
+      /* If we completely consumed the entire insert, delete it. */
+      if ((*pending_insert)->len == 0)
+        *pending_insert = NULL;
+    }
+
+  *aposp = apos;
+  *alenp = alen;
+  *badvancep = badvance;
+}
+
+/* Size of the blocks we compute checksums for. */
+#define MATCH_BLOCKSIZE 64
+
+
+/* Compute a delta from A to B using xdelta.
+
+The basic xdelta algorithm is as follows:
+
+1. Go through the source data, checksumming every BLOCKSIZE block of bytes
+   using adler32, and inserting the checksum into a match table with the
+   position of the match.
+2. Go through the target byte by byte, seeing if that byte starts a match that
+   we have in the match table.
+  2a.
If so, try to extend the match as far as possible both forwards and
+      backwards, and then insert a source copy operation into the delta ops
+      builder for the match.
+  2b. If not, insert the byte as new data using an insert delta op.
+
+Our implementation doesn't immediately insert "insert" operations, it waits
+until we have another copy, or we are done.  The reasoning is twofold:
+
+1. Otherwise, we would just be building a ton of 1 byte insert operations
+2. So that we can extend a source match backwards into a pending insert
+operation, and possibly remove the need for the insert entirely.  This can
+happen due to stream alignment.
+
+*/
+static void
+compute_delta (svn_txdelta__ops_baton_t *build_baton,
+               const char *a,
+               apr_uint32_t asize,
+               const char *b,
+               apr_uint32_t bsize,
+               apr_pool_t *pool)
+{
+  apr_hash_t *matches = apr_hash_make(pool);
+  struct adler32 rolling;
+  apr_uint32_t sz, lo, hi;
+  svn_stringbuf_t *pending_insert = NULL;
+
+  /* Initialize the matches table. */
+  init_matches_table (a, asize, MATCH_BLOCKSIZE, matches, pool);
+
+  /* If the size of the target is smaller than the match blocksize, just
+     insert the entire target. */
+  if (bsize < MATCH_BLOCKSIZE)
+    {
+      svn_txdelta__insert_op (build_baton, svn_txdelta_new,
+                              0, bsize, b, pool);
+      return;
+    }
+
+  /* Initialize our rolling checksum over the first window of B. */
+  init_adler32 (&rolling, b, MATCH_BLOCKSIZE);
+  for (sz = bsize, lo = 0, hi = MATCH_BLOCKSIZE; lo < sz;) /* NOTE(review): hi is only ever written, never read -- confirm it can go */
+    {
+
+      apr_uint32_t apos = 0;
+      apr_uint32_t alen = 1;
+      apr_uint32_t badvance = 1;
+      apr_uint32_t next;
+
+      find_match (matches, &rolling, a, asize, b, bsize, lo, &apos, &alen,
+                  &badvance, &pending_insert);
+
+      /* If we didn't find a real match, insert the byte at the target
+         position into the pending insert.
*/
+      if (alen < MATCH_BLOCKSIZE && /* NOTE(review): a short match ending exactly at A's end falls through to the copy branch; but when NO match was found alen==1, so this also emits a copy whenever asize <= 1 -- confirm intent */
+          (apos + alen < asize))
+        {
+          if (pending_insert != NULL)
+            svn_stringbuf_appendbytes (pending_insert, b + lo, 1);
+          else
+            pending_insert = svn_stringbuf_ncreate (b + lo, 1, pool);
+        }
+      else
+        {
+          /* Flush any pending insert before the source copy so ops stay in
+             target order. */
+          if (pending_insert)
+            {
+              svn_txdelta__insert_op (build_baton, svn_txdelta_new,
+                                      0, pending_insert->len,
+                                      pending_insert->data, pool);
+              pending_insert = NULL;
+            }
+          svn_txdelta__insert_op (build_baton, svn_txdelta_source,
+                                  apos, alen, NULL, pool);
+        }
+      /* Roll the checksum window forward by badvance bytes; near the end of
+         B there is no incoming byte, so the window just shrinks. */
+      next = lo;
+      for (; next < lo + badvance; ++next)
+        {
+
+          adler32_out (&rolling, b[next]);
+          if (next + MATCH_BLOCKSIZE < bsize)
+            adler32_in (&rolling, b[next + MATCH_BLOCKSIZE]);
+        }
+      lo = next;
+      hi = lo + MATCH_BLOCKSIZE;
+    }
+
+  /* If we still have an insert pending at the end, throw it in. */
+  if (pending_insert)
+    {
+      svn_txdelta__insert_op (build_baton, svn_txdelta_new,
+                              0, pending_insert->len,
+                              pending_insert->data, pool);
+      pending_insert = NULL;
+    }
+
+}
+
+/* Define one of these to switch between the various delta algorithms
+   below. */
+#undef NOOP
+#undef VDELTA
+#define XDELTA
+
 void
 svn_txdelta__vdelta (svn_txdelta__ops_baton_t *build_baton,
                      const char *data,
@@ -310,12 +605,33 @@
                      apr_size_t target_len,
                      apr_pool_t *pool)
 {
+#if defined(XDELTA)
+  /* When source_len is 0, compress the target against itself using vdelta in
+     order to get some compression.
*/
+  if (source_len == 0)
+    {
+      hash_table_t *table = create_hash_table (source_len + target_len, pool);
+      vdelta (build_baton, data, data + source_len,
+              data + source_len + target_len,
+              TRUE, table, pool);
+    }
+  else
+    compute_delta (build_baton,
+                   data, source_len,
+                   data + source_len, target_len,
+                   pool);
+
+#elif defined(NOOP)
+
+  svn_txdelta__insert_op (build_baton, svn_txdelta_new,
+                          0, target_len, data + source_len, pool);
+#elif defined(VDELTA)
   hash_table_t *table = create_hash_table (source_len + target_len, pool);
-  vdelta (build_baton, data, data, data + source_len, FALSE, table, pool);
-  vdelta (build_baton, data, data + source_len, data + source_len + target_len,
-         TRUE, table, pool);
+  vdelta (build_baton, data, data + source_len,
+          data + source_len + target_len, TRUE, table, pool);
+#endif
 
 #if 0
   /* This bit of code calculates the hash load and the number
      collisions.  Please note that a the number