On Fri, 2010-05-21 at 00:47 +0200, Stefan Fuhrmann wrote:
> Hi there,
>
> this is an improved version of the patch posted here:
>
> http://svn.haxx.se/dev/archive-2010-05/0002.shtml
>
> The improvements address the issues listed there:
>
> http://svn.haxx.se/dev/archive-2010-05/0216.shtml
>
> -- Stefan^2.
>
>
> [[[
> svn_txdelta_apply_instructions is relatively slow for long
> instruction sequences copying small pieces of data. This
> seems to be particularly visible in non-packed FSFS
> repositories.
>
> This patch extracts invariants out from the 'for' loop,
> optimizes overlapping copies as well as small data copy
> runtime.
>
> * subversion/libsvn_delta/text_delta.c
> (fast_memcpy, patterning_copy): new functions,
> optimized for our specific workload
> (svn_txdelta_apply_instructions): reduce loop overhead,
> use fast_memcpy and patterning_copy
>
> patch by stefanfuhrmann < at > alice-dsl.de
> ]]]
>
> plain text document attachment (PerformanceApplyDeltas.v2.patch)
> Index: subversion/libsvn_delta/text_delta.c
> ===================================================================
> --- subversion/libsvn_delta/text_delta.c (revision 944296)
> +++ subversion/libsvn_delta/text_delta.c (working copy)
> @@ -564,61 +564,147 @@
> return SVN_NO_ERROR;
> }
>
> +/* memcpy() is hard to tune for a wide range of buffer lengths.
> + * Therefore, it is often tuned for high throughput on large
> + * buffers and relatively low latency for mid-sized buffers
> + * (tens of bytes). However, the overhead for very small buffers
> + * (<10 bytes) is still high. Even passing the parameters, for
> + * instance, may take as long as copying 3 bytes.
> + *
> + * Because short copy sequences seem to be a common case in
> + * non-packed FSFS repositories, we copy them directly. Larger
> + * buffer sizes aren't hurt measurably by the exta 'if' clause.
> + *
> + * As a further optimization, we return a pointer to the first
> + * byte after the copied target range.
> + */
> +static APR_INLINE char*
> +fast_memcpy(char *target, const char *source, apr_size_t len)
> +{
> + if (len > 7)
> + {
> + memcpy(target, source, len);
> + target += len;
> + }
> + else
> + {
> + /* memcpy is not exactly fast for small block sizes.
> + Since they are common, let's run optimized code for them. */
> + const char *end = source + len;
> + for (; source != end; source++)
> + *(target++) = *source;
> + }
>
> + return target;
> +}
> +
> +/* Unlike memmove() or memcpy(), create repeating patterns when
> + * source and target range overlap. Returns a pointer to the first
> + * byte after the copied target range.
> + */
> +static APR_INLINE char*
> +patterning_copy(char *target, const char *source, apr_size_t len)
> +{
> + const char *end = source + len;
> +
> + /* Copy in larger chunks if source and target don't overlap
> + * closer than the size of the chunks (or don't overlap at all).
> + * Use the natural machine word as chunk size
> + * (for some architectures size_t is even a bit larger).
> + */
> + if (end + sizeof(apr_size_t) <= target)
> + for (; source + sizeof (apr_size_t) <= end;
> + source += sizeof (apr_size_t),
> + target += sizeof (apr_size_t))
> + *(apr_size_t*)(target) = *(apr_size_t*)(source);
> +
> + /* Copy trailing bytes */
> + for (; source != end; source++)
> + *(target++) = *source;
> +
> + return target;
> +}
> +
> void
> svn_txdelta_apply_instructions(svn_txdelta_window_t *window,
> const char *sbuf, char *tbuf,
> apr_size_t *tlen)
> {
> - const svn_txdelta_op_t *op;
> - apr_size_t i, j, tpos = 0;
> + /* The main loop is run quite often.
> + * Pre-calculate as much data as possible.
> + */
> + const svn_txdelta_op_t *op = window->ops;
> + const svn_txdelta_op_t *last_op = window->ops + window->num_ops;
>
> - for (op = window->ops; op < window->ops + window->num_ops; op++)
> + /* target buffer range is limited by target buffer length
> + * as well as the window length
> + */
> + apr_size_t to_fill = *tlen < window->tview_len ? *tlen : window->tview_len;
> +
> + /* we copy data until left is exactly 0 */
> + apr_size_t left = to_fill;
> +
> + /* where to copy to */
> + char *target = tbuf;
> +
> + /* a temporary */
> + apr_size_t buf_len;
> +
> + /* Process the window copy ops until we filled the target buffer. */
> + while (left > 0)
> {
> - const apr_size_t buf_len = (op->length < *tlen - tpos
> - ? op->length : *tlen - tpos);
> + /* If there were to few window ops, the value for window->tview_len
> + * would be inconsistent with the sum of op->length values.
> + */
> + assert(op < last_op);
>
> /* Check some invariants common to all instructions. */
> - assert(tpos + op->length <= window->tview_len);
> + assert(to_fill - left + op->length <= window->tview_len);
>
> + /* under no circumstance will we write beyond the end
> + * of the target buffer.
> + */
> + buf_len = op->length < left ? op->length : left;
> + left -= buf_len;
> +
> switch (op->action_code)
> {
> case svn_txdelta_source:
> /* Copy from source area. */
> assert(op->offset + op->length <= window->sview_len);
> - memcpy(tbuf + tpos, sbuf + op->offset, buf_len);
> + target = fast_memcpy(target, sbuf + op->offset, buf_len);
> break;
>
> case svn_txdelta_target:
> - /* Copy from target area. Don't use memcpy() since its
> - semantics aren't guaranteed for overlapping memory areas,
> - and target copies are allowed to overlap to generate
> - repeated data. */
> - assert(op->offset < tpos);
> - for (i = op->offset, j = tpos; i < op->offset + buf_len; i++)
> - tbuf[j++] = tbuf[i];
> + /* Copy from target area. We can't use memcpy() or the like
> + since we need a specific semantics for overlapping copies:
> + they must result in repeating patterns.
> + */
> + assert(op->offset < target - *tbuf);
> + target = patterning_copy(target, tbuf + op->offset, buf_len);
> break;
>
> case svn_txdelta_new:
> /* Copy from window new area. */
> assert(op->offset + op->length <= window->new_data->len);
> - memcpy(tbuf + tpos,
> - window->new_data->data + op->offset,
> - buf_len);
> + target = fast_memcpy(target,
> + window->new_data->data + op->offset,
> + buf_len);
> break;
>
> default:
> assert(!"Invalid delta instruction code");
> }
>
> - tpos += op->length;
> - if (tpos >= *tlen)
> - return; /* The buffer is full. */
> + /* next operation */
> + ++op;
> }
>
> - /* Check that we produced the right amount of data. */
> - assert(tpos == window->tview_len);
The original code looped through 'window->num_ops' operations, and
afterwards asserted that the amount of target data generated by them was
the expected amount.
The new code loops until the expected amount of target data has been
generated by (some of) the operations. I think, to preserve the
equivalent self-checking, it should then assert that exactly
'window->num_ops' operations have been used:
assert(op == last_op);
Would that seem right to you?
> - *tlen = tpos;
> + /* We always fill the whole target buffer, i.e. left gets exactly 0,
"The whole target buffer" implies to me the buffer 'tbuf' passed in by
the caller which is of length '*tlen' (as passed in, before we modify
it). We are not (necessarily) filling the whole of that buffer:
instead, we are here telling the caller how much we did fill.
So can we replace this comment with something like:
/* Tell the caller how much data we wrote */
or, since that's pretty clearly what it's doing, just remove it?
> + * unless we stumble across an invalid instruction (caught by debug
> + * assertions only).
> + */
> + *tlen = to_fill;
> }
I'm happy to make the above changes and commit it if you're happy with
them.
- Julian
Received on 2010-05-21 13:54:43 CEST