[svn.haxx.se] · SVN Dev · SVN Users · SVN Org · TSVN Dev · TSVN Users · Subclipse Dev · Subclipse Users · this month's index

[PATCH] Skip-deltas, for review

From: Greg Hudson <ghudson_at_MIT.EDU>
Date: 2002-07-27 01:21:29 CEST

It turns out none of my skip-delta implementations were actually
creating skip-deltas; svn_fs_rep_deltify() was throwing out the new
deltas because they were bigger than the old ones. With that fixed, I
got the following results from a dump of the svn repository:

                Delta combiner Delta combiner plus skip-deltas
                -------------- -------------------------------
svnadmin load 949 994
svnadmin dump 516 371
svn co -r500 42 36
svn up -r2500 345 368
svn up -r1500 110 111
Repos db size 38712K 40776K

To summarize, on our repository, the effect of this skip-delta
implementation is:

  * Load times: Up 5%
  * Dump times: Down 30%
  * Day to day operations: Drowned in the noise
  * Repository size: Up 5%

Of course, with more extreme data sets, you get more extreme benefits.
To show this, I artificially constructed a ~50MB "repository dump"
where the repository consists of a single text file which shrank
steadily from 3000 lines to 0 lines over 1000 revisions. With this
repository, skip-deltas increased the load time by 17% but decreased
the dump time by 94%, and the repository size still only went up by a
small fraction.

Here is the patch, for review. It's against branches/issue-531-dev,
and should wait until the delta combiner is integrated into the trunk.

        * tree.c (txn_deltify, deltify_if_mutable_under_txn_id):
          Implement skip-deltas, in a low-overhead way.

        * reps-strings.c (svn_fs__rep_deltify): Only check for
          increasing rep size when replacing a plain text. When
          replacing a delta, the goal is speed, not size.

        * structure,
          fs.h (svn_fs__node_revision_t)
          tree.c (update_ancestry, merge, txn_body_merge),
          dag.c, dag.h (svn_fs__dag_get_predecessor_count,
            copy_node_revision, svn_fs__dag_clone_child,
            svn_fs__dag_clone_root, svn_fs__dag_copy),
          util/fs_skels.c (is_valid_node_revision_header_skel,
            svn_fs__parse_node_revision_skel,
            svn_fs__unparse_node_revision_skel):

          Maintain a predecessor count in node-revision header skels.
          The new code will work against old repositories
          (predecessor_count will be set to -1 for node-revisions in
          existing nodes, which means no skip deltas will be done for
          those nodes); old fs code will barf against new
          repositories, but that's normal.

Index: ./subversion/libsvn_fs/tree.c
===================================================================
--- ./subversion/libsvn_fs/tree.c
+++ ./subversion/libsvn_fs/tree.c Fri Jul 26 10:45:10 2002
@@ -1129,6 +1129,65 @@
 };
 
 
+/* Redeltify predecessor node-revisions of the one we added. The idea
+ is to require at most 2*lg(N) deltas to be applied to get to any
+ node-revision in a chain of N predecessors. We do this using a
+ technique derived from skip lists:
+
+ Always redeltify the immediate parent
+ If the number of predecessors is divisible by 2, deltify
+ the revision two predecessors back
+ If the number of predecessors is divisible by 4, deltify
+ the revision four predecessors back
+ etc.
+
+ That's the theory, anyway. Unfortunately, if we strictly follow
+ that theory we get a bunch of overhead up front and no great
+ benefit until the number of predecessors gets large. So, stop at
+ redeltifying the parent if the number of predecessors is less than
+ 32 or is not divisible by four. */
+static svn_error_t *
+txn_deltify (dag_node_t *node, int pred_count, int props_only, trail_t *trail)
+{
+ int nlevels, lev, count;
+ dag_node_t *prednode;
+ svn_fs_t *fs;
+
+ /* Decide how many predecessors to redeltify. */
+ nlevels = 1;
+ if (pred_count >= 32 && pred_count % 4 == 0)
+ {
+ while (pred_count % 2 == 0)
+ {
+ pred_count /= 2;
+ nlevels++;
+ }
+ }
+
+ /* Redeltify the desired number of predecessors. */
+ count = 0;
+ prednode = node;
+ fs = svn_fs__dag_get_fs (node);
+ for (lev = 0; lev < nlevels; lev++)
+ {
+ while (count < (1 << lev))
+ {
+ const svn_fs_id_t *pred_id;
+
+ SVN_ERR (svn_fs__dag_get_predecessor_id (&pred_id, prednode, trail));
+ if (pred_id == NULL)
+ return svn_error_create (SVN_ERR_FS_CORRUPT, 0, 0, trail->pool,
+ "faulty predecessor count");
+ SVN_ERR (svn_fs__dag_get_node (&prednode, fs, pred_id, trail));
+ count++;
+ }
+ SVN_ERR (svn_fs__dag_deltify (prednode, node, props_only, trail));
+ }
+
+ return SVN_NO_ERROR;
+}
+
+
 /* Deltify ID's predecessor iff ID is mutable under TXN_ID in FS. If
    ID is a mutable directory, recurse. Do this as part of TRAIL. */
 static svn_error_t *
@@ -1141,7 +1200,6 @@
   dag_node_t *node;
   apr_hash_t *entries;
   int is_dir;
- const svn_fs_id_t *pred_id;
 
   /* Not mutable? Go no further. This is safe to do because for
      items in the tree to be mutable, their parent dirs must also be
@@ -1180,14 +1238,8 @@
         }
     }
 
- /* If we have have a predecessor, deltify it against the current
- node revision. */
- if ((pred_id = noderev->predecessor_id))
- {
- dag_node_t *pred_node;
- SVN_ERR (svn_fs__dag_get_node (&pred_node, fs, pred_id, trail));
- SVN_ERR (svn_fs__dag_deltify (pred_node, node, is_dir, trail));
- }
+ if (noderev->predecessor_id)
+ SVN_ERR (txn_deltify (node, noderev->predecessor_count, is_dir, trail));
 
   return SVN_NO_ERROR;
 }
@@ -1252,6 +1304,7 @@
                  const svn_fs_id_t *target_id,
                  const char *txn_id,
                  const char *target_path,
+ int source_pred_count,
                  trail_t *trail)
 {
   svn_fs__node_revision_t *noderev;
@@ -1263,6 +1316,9 @@
        "unexpected immutable node at \"%s\"", target_path);
   SVN_ERR (svn_fs__get_node_revision (&noderev, fs, target_id, trail));
   noderev->predecessor_id = source_id;
+ noderev->predecessor_count = source_pred_count;
+ if (noderev->predecessor_count != -1)
+ noderev->predecessor_count++;
   return svn_fs__put_node_revision (fs, target_id, noderev, trail);
 }
 
@@ -1662,6 +1718,7 @@
                 {
                   dag_node_t *s_ent_node, *t_ent_node, *a_ent_node;
                   const char *new_tpath;
+ int pred_count;
                       
                   SVN_ERR (svn_fs__dag_get_node (&s_ent_node, fs,
                                                  s_entry->id, trail));
@@ -1689,13 +1746,17 @@
                                   t_ent_node, s_ent_node, a_ent_node,
                                   txn_id, trail));
 
+ SVN_ERR (svn_fs__dag_get_predecessor_count (&pred_count,
+ s_ent_node,
+ trail));
+
                   /* If target is an immediate descendant of ancestor,
                      and source is also a descendant of ancestor, we
                      need to point target's predecessor-id to
                      source. */
                   SVN_ERR (update_ancestry (fs, s_entry->id,
                                             t_entry->id, txn_id,
- new_tpath, trail));
+ new_tpath, pred_count, trail));
                 }
               /* Else target entry has changed since ancestor entry,
                  but it changed either to source entry or to a
@@ -1928,16 +1989,21 @@
     }
   else
     {
+ int pred_count;
+
       SVN_ERR (merge (args->conflict, "/", txn_root_node,
                       source_node, ancestor_node, txn_id, trail));
 
+ SVN_ERR (svn_fs__dag_get_predecessor_count (&pred_count, source_node,
+ trail));
+
       /* After the merge, txn's new "ancestor" is now really the node
          at source_id, so record that fact. Think of this as
          ratcheting the txn forward in time, so it can't backslide and
          forget the merging work that's already been done. */
       SVN_ERR (update_ancestry (fs, source_id,
                                 svn_fs__dag_get_id (txn_root_node),
- txn_id, "/", trail));
+ txn_id, "/", pred_count, trail));
       SVN_ERR (svn_fs__set_txn_base (fs, txn_id, source_id, trail));
     }
   
Index: ./subversion/libsvn_fs/reps-strings.c
===================================================================
--- ./subversion/libsvn_fs/reps-strings.c
+++ ./subversion/libsvn_fs/reps-strings.c Fri Jul 26 17:05:10 2002
@@ -1303,55 +1303,43 @@
        "svn_fs__rep_deltify: failed to calculate MD5 digest for %s",
        source);
 
- /* Get the size of the target's original string data. Note that we
- don't use svn_fs__rep_contents_size() for this; that function
- always returns the fulltext size, whereas we need to know the
- actual amount of storage used by this representation. Check the
- size of the new string. If it is larger than the old one, this
- whole deltafication might not be such a bright idea. While we're
- at it, we might as well figure out all the strings currently used
- by REP so we can potentially delete them later. */
+ /* Construct a list of the strings used by the old representation so
+ that we can delete them later. While we are here, if the old
+ representation was a fulltext, check to make sure the delta we're
+ replacing it with is actually smaller. (Don't perform this check
+ if we're replacing a delta; in that case, we're going for a time
+ optimization, not a space optimization.) */
   {
     svn_fs__representation_t *old_rep;
- apr_size_t old_size = 0;
     const char *str_key;
 
     SVN_ERR (svn_fs__read_rep (&old_rep, fs, target, trail));
     if (old_rep->kind == svn_fs__rep_kind_fulltext)
       {
+ apr_size_t old_size = 0;
+
         str_key = old_rep->contents.fulltext.string_key;
         SVN_ERR (svn_fs__string_size (&old_size, fs, str_key, trail));
         orig_str_keys = apr_array_make (pool, 1, sizeof (str_key));
         (*((const char **)(apr_array_push (orig_str_keys)))) = str_key;
- }
- else if (old_rep->kind == svn_fs__rep_kind_delta)
- {
- int i;
- apr_size_t my_size;
-
- SVN_ERR (delta_string_keys (&orig_str_keys, old_rep, pool));
- for (i = 0; i < orig_str_keys->nelts; i++)
+
+ /* If the new data is NOT an space optimization, destroy the
+ string(s) we created, and get outta here. */
+ if (diffsize >= old_size)
           {
- str_key = ((const char **) orig_str_keys->elts)[i];
- SVN_ERR (svn_fs__string_size (&my_size, fs, str_key, trail));
- old_size += my_size;
+ int i;
+ for (i = 0; i < windows->nelts; i++)
+ {
+ ww = ((window_write_t **) windows->elts)[i];
+ SVN_ERR (svn_fs__string_delete (fs, ww->key, trail));
+ }
+ return SVN_NO_ERROR;
           }
       }
+ else if (old_rep->kind == svn_fs__rep_kind_delta)
+ SVN_ERR (delta_string_keys (&orig_str_keys, old_rep, pool));
     else /* unknown kind */
       abort ();
-
- /* If the new data is NOT an space optimization, destroy the
- string(s) we created, and get outta here. */
- if (diffsize >= old_size)
- {
- int i;
- for (i = 0; i < windows->nelts; i++)
- {
- ww = ((window_write_t **) windows->elts)[i];
- SVN_ERR (svn_fs__string_delete (fs, ww->key, trail));
- }
- return SVN_NO_ERROR;
- }
   }
 
   /* Hook the new strings we wrote into the rest of the filesystem by
Index: ./subversion/libsvn_fs/dag.h
===================================================================
--- ./subversion/libsvn_fs/dag.h
+++ ./subversion/libsvn_fs/dag.h Fri Jul 26 04:05:09 2002
@@ -109,6 +109,13 @@
                                              trail_t *trail);
 
 
+/* Set *COUNT to the number of predecessors NODE has (recursively), or
+ -1 if not known, as part of TRAIL. */
+svn_error_t *svn_fs__dag_get_predecessor_count (int *count,
+ dag_node_t *node,
+ trail_t *trail);
+
+
 /* Callback function type for svn_fs__dag_walk_predecessors() */
 typedef svn_error_t *(*svn_fs__dag_pred_func_t) (void *baton,
                                                  dag_node_t *node,
Index: ./subversion/libsvn_fs/fs.h
===================================================================
--- ./subversion/libsvn_fs/fs.h
+++ ./subversion/libsvn_fs/fs.h Fri Jul 26 04:03:37 2002
@@ -142,6 +142,10 @@
      for this node revision */
   const svn_fs_id_t *predecessor_id;
 
+ /* number of predecessors this node revision has (recursively), or
+ -1 if not known (for backward compatibility). */
+ int predecessor_count;
+
   /* representation key for this node's properties. may be NULL if
      there are no properties. */
   const char *prop_key;
Index: ./subversion/libsvn_fs/structure
===================================================================
--- ./subversion/libsvn_fs/structure
+++ ./subversion/libsvn_fs/structure Fri Jul 26 18:46:46 2002
@@ -116,7 +116,7 @@
 
 HEADER has the form:
 
- (KIND PREDECESSOR)
+ (KIND PREDECESSOR-ID PREDECESSOR-COUNT)
 
 where:
 
@@ -128,6 +128,9 @@
    * PREDECESSOR-ID, if present, indicates the node revision which is
      the immediate ancestor of this node.
 
+ * PREDECESSOR-COUNT, if present, indicates the number of
+ predecessors the node revision has (recursively).
+
 Note that a node cannot change its kind from one revision to the next.
 A directory node is always a directory; a file node is always a file;
 etc. The fact that the node's kind is stored in each node revision,
@@ -773,9 +776,10 @@
                    FILE ::= (HEADER PROP-KEY DATA-KEY [EDIT-DATA-KEY]) ;
                     DIR ::= (HEADER PROP-KEY ENTRIES-KEY) ;
 
- HEADER ::= (KIND [PREDECESSOR-ID]) ;
+ HEADER ::= (KIND [PREDECESSOR-ID [PREDECESSOR-COUNT]]) ;
                    KIND ::= "file" | "dir" ;
          PREDECESSOR-ID ::= NODE-REV-ID ;
+ PREDECESSOR-COUNT ::= number ;
                PROP-KEY ::= atom ;
                 REP-KEY ::= atom ;
 
Index: ./subversion/libsvn_fs/dag.c
===================================================================
--- ./subversion/libsvn_fs/dag.c
+++ ./subversion/libsvn_fs/dag.c Fri Jul 26 04:29:05 2002
@@ -132,6 +132,7 @@
   nr->kind = noderev->kind;
   if (noderev->predecessor_id)
     nr->predecessor_id = svn_fs__id_copy (noderev->predecessor_id, pool);
+ nr->predecessor_count = noderev->predecessor_count;
   if (noderev->prop_key)
     nr->prop_key = apr_pstrdup (pool, noderev->prop_key);
   if (noderev->data_key)
@@ -298,6 +299,19 @@
 
 
 svn_error_t *
+svn_fs__dag_get_predecessor_count (int *count,
+ dag_node_t *node,
+ trail_t *trail)
+{
+ svn_fs__node_revision_t *noderev;
+
+ SVN_ERR (get_node_revision (&noderev, node, trail));
+ *count = noderev->predecessor_count;
+ return SVN_NO_ERROR;
+}
+
+
+svn_error_t *
 svn_fs__dag_walk_predecessors (dag_node_t *node,
                                svn_fs__dag_pred_func_t callback,
                                void *baton,
@@ -833,6 +847,8 @@
       
       /* Do the clone thingy here. */
       noderev->predecessor_id = svn_fs__id_copy (cur_entry->id, trail->pool);
+ if (noderev->predecessor_count != -1)
+ noderev->predecessor_count++;
       SVN_ERR (svn_fs__create_successor (&new_node_id, fs, cur_entry->id,
                                          noderev, copy_id, txn_id, trail));
       
@@ -875,6 +891,8 @@
          the root node? That is, does this function need a copy_id
          passed in? */
       noderev->predecessor_id = svn_fs__id_copy (base_root_id, trail->pool);
+ if (noderev->predecessor_count != -1)
+ noderev->predecessor_count++;
       SVN_ERR (svn_fs__create_successor (&root_id, fs, base_root_id,
                                          noderev,
                                          svn_fs__id_copy_id (base_root_id),
@@ -1426,6 +1444,8 @@
       /* Create a successor with its predecessor pointing at the copy
          source. */
       to_noderev->predecessor_id = svn_fs__id_copy (src_id, trail->pool);
+ if (to_noderev->predecessor_count != -1)
+ to_noderev->predecessor_count++;
       SVN_ERR (svn_fs__create_successor (&id, fs, src_id, to_noderev,
                                          copy_id, txn_id, trail));
 
Index: ./subversion/libsvn_fs/util/fs_skels.c
===================================================================
--- ./subversion/libsvn_fs/util/fs_skels.c
+++ ./subversion/libsvn_fs/util/fs_skels.c Fri Jul 26 04:36:30 2002
@@ -204,6 +204,13 @@
       && skel->children->next->is_atom)
     return 1;
 
+ /* or with predecessor and predecessor count... */
+ if ((len == 3)
+ && skel->children->is_atom
+ && skel->children->next->is_atom
+ && skel->children->next->next->is_atom)
+ return 1;
+
   return 0;
 }
 
@@ -550,6 +557,17 @@
       noderev->predecessor_id
         = svn_fs_parse_id (header_skel->children->next->data,
                            header_skel->children->next->len, pool);
+
+ /* PREDECESSOR-COUNT */
+ if (header_skel->children->next->next)
+ {
+ noderev->predecessor_count =
+ atoi (apr_pstrmemdup (pool,
+ header_skel->children->next->next->data,
+ header_skel->children->next->next->len));
+ }
+ else if (noderev->predecessor_id)
+ noderev->predecessor_count = -1;
     }
       
   /* PROP-KEY */
@@ -973,6 +991,14 @@
   skel = svn_fs__make_empty_list (pool);
   header_skel = svn_fs__make_empty_list (pool);
 
+ /* PREDECESSOR-COUNT */
+ if (noderev->predecessor_count != -1)
+ {
+ const char *count_str = apr_psprintf(pool, "%d",
+ noderev->predecessor_count);
+ svn_fs__prepend (svn_fs__str_atom (count_str, pool), header_skel);
+ }
+
   /* PREDECESSOR-ID */
   if (noderev->predecessor_id)
     {

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org
Received on Sat Jul 27 01:22:01 2002

This is an archived mail posted to the Subversion Dev mailing list.

This site is subject to the Apache Privacy Policy and the Apache Public Forum Archive Policy.