[svn.haxx.se] · SVN Dev · SVN Users · SVN Org · TSVN Dev · TSVN Users · Subclipse Dev · Subclipse Users · this month's index

Re: [PATCH] Speed-up of libsvn_diff using token counts

From: Morten Kloster <morklo_at_gmail.com>
Date: Thu, 19 May 2011 13:38:56 +0200

Here is a version without the whitespace changes in diff3.c. I have also kept
the original indentation level of the loop in lcs.c that I have wrapped in
another loop.

The indentations in Daniel's version had little or nothing to do with
my patch. :)

[[[
Index: subversion/libsvn_diff/diff.c
===================================================================
--- subversion/libsvn_diff/diff.c (revision 1104713)
+++ subversion/libsvn_diff/diff.c (working copy)
@@ -105,6 +105,10 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[2];
+ apr_uint32_t num_tokens;
+ apr_uint32_t token_index;
+ svn_diff__position_t *current;
+ apr_uint32_t *token_counts[2];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified};
   svn_diff__lcs_t *lcs;
@@ -138,6 +142,8 @@
                                prefix_lines,
                                subpool));

+ num_tokens = svn_diff__get_node_count(tree);
+
   /* The cool part is that we don't need the tokens anymore.
    * Allow the app to clean them up if it wants to.
    */
@@ -147,8 +153,35 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);

+ token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ token_counts[0][token_index] = token_counts[1][token_index] = 0;
+
+ current = position_list[0];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[0][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[0]);
+ }
+ current = position_list[1];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[1][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[1]);
+ }
+
   /* Get the lcs */
- lcs = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+ lcs = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+ token_counts[1], num_tokens, prefix_lines,
                       suffix_lines, subpool);

   /* Produce the diff */
Index: subversion/libsvn_diff/diff.h
===================================================================
--- subversion/libsvn_diff/diff.h (revision 1104713)
+++ subversion/libsvn_diff/diff.h (working copy)
@@ -62,7 +62,7 @@
 struct svn_diff__position_t
 {
   svn_diff__position_t *next;
- svn_diff__node_t *node;
+ apr_int32_t node;
   apr_off_t offset;
 };

@@ -91,12 +91,21 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to
tail (ring) */
               svn_diff__position_t *position_list2, /* pointer to
tail (ring) */
+ apr_uint32_t *token_counts_list1, /* array of counts */
+ apr_uint32_t *token_counts_list2, /* array of counts */
+ apr_uint32_t num_tokens,
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool);

 /*
+ * Returns number of tokens in a tree
+ */
+apr_uint32_t
+svn_diff__get_node_count(svn_diff__tree_t *tree);
+
+/*
  * Support functions to build a tree of token positions
  */
 void
@@ -128,6 +137,7 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+ apr_uint32_t num_tokens,
                            apr_pool_t *pool);

Index: subversion/libsvn_diff/diff3.c
===================================================================
--- subversion/libsvn_diff/diff3.c (revision 1104713)
+++ subversion/libsvn_diff/diff3.c (working copy)
@@ -38,6 +38,7 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+ apr_uint32_t num_tokens,
                            apr_pool_t *pool)
 {
     apr_off_t modified_start = hunk->modified_start + 1;
@@ -47,6 +48,9 @@
     apr_off_t latest_length = hunk->latest_length;
     svn_diff__position_t *start_position[2];
     svn_diff__position_t *position[2];
+ apr_uint32_t token_index;
+ svn_diff__position_t *current;
+ apr_uint32_t *token_counts[2];
     svn_diff__lcs_t *lcs = NULL;
     svn_diff__lcs_t **lcs_ref = &lcs;
     svn_diff_t **diff_ref = &hunk->resolved_diff;
@@ -173,9 +177,29 @@
         position[1]->next = start_position[1];
       }

- *lcs_ref = svn_diff__lcs(position[0], position[1], 0, 0,
- subpool);
+ token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ token_counts[0][token_index] = token_counts[1][token_index] = 0;

+ current = position[0];
+ do
+ {
+ token_counts[0][current->node]++;
+ current = current->next;
+ }
+ while (current != position[0]);
+ current = position[1];
+ do
+ {
+ token_counts[1][current->node]++;
+ current = current->next;
+ }
+ while (current != position[1]);
+
+ *lcs_ref = svn_diff__lcs(position[0], position[1], token_counts[0],
+ token_counts[1], num_tokens, 0, 0, subpool);
+
     /* Fix up the EOF lcs element in case one of
      * the two sequences was NULL.
      */
@@ -251,6 +275,10 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[3];
+ apr_uint32_t num_tokens;
+ apr_uint32_t token_index;
+ svn_diff__position_t *current;
+ apr_uint32_t *token_counts[3];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest};
@@ -292,6 +320,8 @@
                                prefix_lines,
                                subpool));

+ num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -299,10 +329,52 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);

+ token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[2] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ {
+ token_counts[0][token_index] = token_counts[1][token_index] = 0;
+ token_counts[2][token_index] = 0;
+ }
+
+ current = position_list[0];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[0][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[0]);
+ }
+ current = position_list[1];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[1][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[1]);
+ }
+ current = position_list[2];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[2][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[2]);
+ }
+
   /* Get the lcs for original-modified and original-latest */
- lcs_om = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+ lcs_om = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+ token_counts[1], num_tokens, prefix_lines,
                          suffix_lines, subpool);
- lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+ lcs_ol = svn_diff__lcs(position_list[0], position_list[2], token_counts[0],
+ token_counts[2], num_tokens, prefix_lines,
                          suffix_lines, subpool);

   /* Produce a merged diff */
@@ -435,6 +507,7 @@
                 svn_diff__resolve_conflict(*diff_ref,
                                            &position_list[1],
                                            &position_list[2],
+ num_tokens,
                                            pool);
               }
             else if (is_modified)
Index: subversion/libsvn_diff/diff4.c
===================================================================
--- subversion/libsvn_diff/diff4.c (revision 1104713)
+++ subversion/libsvn_diff/diff4.c (working copy)
@@ -173,6 +173,10 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[4];
+ apr_uint32_t num_tokens;
+ apr_uint32_t token_index;
+ svn_diff__position_t *current;
+ apr_uint32_t *token_counts[4];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest,
@@ -227,6 +231,8 @@
                                prefix_lines,
                                subpool2));

+ num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -234,8 +240,61 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_clear(subpool3);

+ token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[2] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[3] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ {
+ token_counts[0][token_index] = token_counts[1][token_index] = 0;
+ token_counts[2][token_index] = token_counts[3][token_index] = 0;
+ }
+
+ current = position_list[0];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[0][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[0]);
+ }
+ current = position_list[1];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[1][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[1]);
+ }
+ current = position_list[2];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[2][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[2]);
+ }
+ current = position_list[3];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[3][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[3]);
+ }
+
   /* Get the lcs for original - latest */
- lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+ lcs_ol = svn_diff__lcs(position_list[0], position_list[2],
+ token_counts[0], token_counts[2],
+ num_tokens, prefix_lines,
                          suffix_lines, subpool3);
   diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);

@@ -257,7 +316,9 @@
   /* Get the lcs for common ancestor - original
    * Do reverse adjustements
    */
- lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
+ lcs_adjust = svn_diff__lcs(position_list[3], position_list[2],
+ token_counts[3], token_counts[2],
+ num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -267,7 +328,9 @@
   /* Get the lcs for modified - common ancestor
    * Do forward adjustments
    */
- lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
+ lcs_adjust = svn_diff__lcs(position_list[1], position_list[3],
+ token_counts[1], token_counts[3],
+ num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -283,7 +346,7 @@
       if (hunk->type == svn_diff__type_conflict)
         {
           svn_diff__resolve_conflict(hunk, &position_list[1],
- &position_list[2], pool);
+ &position_list[2], num_tokens, pool);
         }
     }

Index: subversion/libsvn_diff/lcs.c
===================================================================
--- subversion/libsvn_diff/lcs.c (revision 1104713)
+++ subversion/libsvn_diff/lcs.c (working copy)
@@ -51,6 +51,7 @@
 svn_diff__snake(apr_off_t k,
                 svn_diff__snake_t *fp,
                 int idx,
+ apr_uint32_t *token_counts[2],
                 svn_diff__lcs_t **freelist,
                 apr_pool_t *pool)
 {
@@ -92,6 +93,11 @@
     }

+ if (previous_lcs)
+ {
+ previous_lcs->refcount++;
+ }
+
   /* ### Optimization, skip all positions that don't have matchpoints
    * ### anyway. Beware of the sentinel, don't skip it!
    */
@@ -99,6 +105,8 @@
   position[0] = start_position[0];
   position[1] = start_position[1];

+ while (1)
+ {
   while (position[0]->node == position[1]->node)
     {
       position[0] = position[0]->next;
@@ -122,18 +130,19 @@
       lcs->length = position[1]->offset - start_position[1]->offset;
       lcs->next = previous_lcs;
       lcs->refcount = 1;
- fp[k].lcs = lcs;
+ previous_lcs = lcs;
+ start_position[0] = position[0];
+ start_position[1] = position[1];
     }
- else
- {
- fp[k].lcs = previous_lcs;
+ if (position[0]->node >= 0 && token_counts[1][position[0]->node] == 0)
+ start_position[0] = position[0] = position[0]->next;
+ else if (position[1]->node >= 0 &&
token_counts[0][position[1]->node] == 0)
+ start_position[1] = position[1] = position[1]->next;
+ else
+ break;
     }

- if (previous_lcs)
- {
- previous_lcs->refcount++;
- }
-
+ fp[k].lcs = previous_lcs;
   fp[k].position[0] = position[0];
   fp[k].position[1] = position[1];

@@ -188,12 +197,18 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to
tail (ring) */
               svn_diff__position_t *position_list2, /* pointer to
tail (ring) */
+ apr_uint32_t *token_counts_list1, /* array of counts */
+ apr_uint32_t *token_counts_list2, /* array of counts */
+ apr_uint32_t num_tokens,
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool)
 {
   int idx;
   apr_off_t length[2];
+ apr_uint32_t *token_counts[2];
+ apr_uint32_t unique_count[2];
+ apr_uint32_t token_index;
   svn_diff__snake_t *fp;
   apr_off_t d;
   apr_off_t k;
@@ -231,10 +246,19 @@
       return lcs;
     }

+ unique_count[1] = unique_count[0] = 0;
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ {
+ if (token_counts_list1[token_index] == 0)
+ unique_count[1] += token_counts_list2[token_index];
+ if (token_counts_list2[token_index] == 0)
+ unique_count[0] += token_counts_list1[token_index];
+ }
+
   /* Calculate length of both sequences to be compared */
   length[0] = position_list1->offset - position_list1->next->offset + 1;
   length[1] = position_list2->offset - position_list2->next->offset + 1;
- idx = length[0] > length[1] ? 1 : 0;
+ idx = length[0] - unique_count[0] > length[1] - unique_count[1] ? 1 : 0;

   /* strikerXXX: here we allocate the furthest point array, which is
    * strikerXXX: sized M + N + 3 (!)
@@ -246,18 +270,20 @@
   sentinel_position[idx].next = position_list1->next;
   position_list1->next = &sentinel_position[idx];
   sentinel_position[idx].offset = position_list1->offset + 1;
+ token_counts[idx] = token_counts_list1;

   sentinel_position[abs(1 - idx)].next = position_list2->next;
   position_list2->next = &sentinel_position[abs(1 - idx)];
   sentinel_position[abs(1 - idx)].offset = position_list2->offset + 1;
+ token_counts[abs(1 - idx)] = token_counts_list2;

- /* These are never dereferenced, only compared by value, so we
- * can safely fake these up and the void* cast is OK.
+ /* Negative indices will not be used elsewhere
    */
- sentinel_position[0].node = (void*)&sentinel_position[0];
- sentinel_position[1].node = (void*)&sentinel_position[1];
+ sentinel_position[0].node = -1;
+ sentinel_position[1].node = -2;

- d = length[abs(1 - idx)] - length[idx];
+ d = length[abs(1 - idx)] - unique_count[abs(1 - idx)]
+ - (length[idx] - unique_count[idx]);

   /* k = -1 will be the first to be used to get previous
    * position information from, make sure it holds sane
@@ -272,12 +298,12 @@
       /* Forward */
       for (k = -p; k < d; k++)
         {
- svn_diff__snake(k, fp, idx, &lcs_freelist, pool);
+ svn_diff__snake(k, fp, idx, token_counts, &lcs_freelist, pool);
         }

       for (k = d + p; k >= d; k--)
         {
- svn_diff__snake(k, fp, idx, &lcs_freelist, pool);
+ svn_diff__snake(k, fp, idx, token_counts, &lcs_freelist, pool);
         }

       p++;
Index: subversion/libsvn_diff/token.c
===================================================================
--- subversion/libsvn_diff/token.c (revision 1104713)
+++ subversion/libsvn_diff/token.c (working copy)
@@ -46,6 +46,7 @@
   svn_diff__node_t *right;

   apr_uint32_t hash;
+ apr_int32_t index;
   void *token;
 };

@@ -53,10 +54,20 @@
 {
   svn_diff__node_t *root[SVN_DIFF__HASH_SIZE];
   apr_pool_t *pool;
+ apr_uint32_t node_count;
 };

 /*
+ * Returns number of tokens in a tree
+ */
+apr_uint32_t
+svn_diff__get_node_count(svn_diff__tree_t *tree)
+{
+ return tree->node_count;
+}
+
+/*
  * Support functions to build a tree of token positions
  */

@@ -65,6 +76,7 @@
 {
   *tree = apr_pcalloc(pool, sizeof(**tree));
   (*tree)->pool = pool;
+ (*tree)->node_count = 0;
 }

@@ -122,6 +134,7 @@
   new_node->right = NULL;
   new_node->hash = hash;
   new_node->token = token;
+ new_node->index = tree->node_count++;

   *node = *node_ref = new_node;

@@ -168,7 +181,7 @@
       /* Create a new position */
       position = apr_palloc(pool, sizeof(*position));
       position->next = NULL;
- position->node = node;
+ position->node = node->index;
       position->offset = offset;

       *position_ref = position;
]]]

On Thu, May 19, 2011 at 11:38 AM, Stefan Sperling <stsp_at_elego.de> wrote:
> On Wed, May 18, 2011 at 07:31:54PM +0200, Morten Kloster wrote:
>> >
>> > I'm attaching a version of the patch re-generated with -x-pw.
>> >
>> > [[[
>> > Index: subversion/libsvn_diff/diff.c
>> > ===================================================================
>> > --- subversion/libsvn_diff/diff.c       (revision 1104340)
>> > +++ subversion/libsvn_diff/diff.c       (working copy)
>> > @@ -105,6 +105,10 @@ svn_diff_diff_2(svn_diff_t **diff,
>> >  {
>> >   svn_diff__tree_t *tree;
>> >   svn_diff__position_t *position_list[2];
>> > +  apr_uint32_t num_tokens;
>> > +  apr_uint32_t token_index;
>> > +  svn_diff__position_t *current;
>> > +  apr_uint32_t *token_counts[2];
>
> The indentation above looks wrong and in some other places, too.
>
> Morten, your patch is very interesting. But I don't want to review it
> as posted. Can you please provide a fresh patch that includes only
> your functional changes and uses correct code formatting as per our
> coding guidelines? See the community guide at
> http://subversion.apache.org/docs/community-guide/conventions.html
>
> This would speed up processing your patch a lot. Any overhead spent
> separating functional from non-functional changes and checking the
> coding style is time better spent by the patch submitter than the
> patch reviewers. It's a question of one person investing the time
> vs. N people investing the time. If only one person spends this time
> the community as a whole wins time. Thank you :)
>
Received on 2011-05-19 13:39:25 CEST

This is an archived mail posted to the Subversion Dev mailing list.

This site is subject to the Apache Privacy Policy and the Apache Public Forum Archive Policy.