It turns out none of my skip-delta implementations were actually
creating skip-deltas; svn_fs_rep_deltify() was throwing out the new
deltas because they were bigger than the old ones.  With that fixed, I
got the following results from a dump of the svn repository:
                Delta combiner		Delta combiner plus skip-deltas
                --------------		-------------------------------
svnadmin load	  949			  994
svnadmin dump	  516			  371
svn co -r500	   42			   36
svn up -r2500	  345			  368
svn up -r1500	  110			  111
Repos db size	38712K			40776K
To summarize, on our repository, the effect of this skip-delta
implementation is:
  * Load times:             Up 5%
  * Dump times:             Down 30%
  * Day to day operations:  Drowned in the noise
  * Repository size:	    Up 5%
Of course, with more extreme data sets, you get more extreme benefits.
To show this, I artificially constructed a ~50MB "repository dump"
where the repository consists of a single text file which shrank
steadily from 3000 lines to 0 lines over 1000 revisions.  With this
repository, skip-deltas increased the load time by 17% but decreased
the dump time by 94%, and the repository size still only went up by a
small fraction.
Here is the patch, for review.  It's against branches/issue-531-dev,
and should wait until the delta combiner is integrated into the trunk.
        * tree.c (txn_deltify, deltify_if_mutable_under_txn_id):
          Implement skip-deltas, in a low-overhead way.
        * reps-strings.c (svn_fs__rep_deltify): Only check for
          increasing rep size when replacing a plain text.  When
          replacing a delta, the goal is speed, not size.
        * structure,
          fs.h (svn_fs__node_revision_t)
          tree.c (update_ancestry, merge, txn_body_merge),
          dag.c, dag.h (svn_fs__dag_get_predecessor_count,
            copy_node_revision, svn_fs__dag_clone_child,
            svn_fs__dag_clone_root, svn_fs__dag_copy),
          util/fs_skels.c (is_valid_node_revision_header_skel,
            svn_fs__parse_node_revision_skel,
            svn_fs__unparse_node_revision_skel):
          Maintain a predecessor count in node-revision header skels.
          The new code will work against old repositories
          (predecessor_count will be set to -1 for node-revisions in
          existing nodes, which means no skip deltas will be done for
          those nodes); old fs code will barf against new
          repositories, but that's normal.
Index: ./subversion/libsvn_fs/tree.c
===================================================================
--- ./subversion/libsvn_fs/tree.c
+++ ./subversion/libsvn_fs/tree.c	Fri Jul 26 10:45:10 2002
@@ -1129,6 +1129,65 @@
 };
 
 
+/* Redeltify predecessor node-revisions of the one we added.  The idea
+   is to require at most 2*lg(N) deltas to be applied to get to any
+   node-revision in a chain of N predecessors.  We do this using a
+   technique derived from skip lists:
+
+     Always redeltify the immediate parent
+     If the number of predecessors is divisible by 2, deltify
+      the revision two predecessors back
+     If the number of predecessors is divisible by 4, deltify
+      the revision four predecessors back
+     etc.
+
+   That's the theory, anyway.  Unfortunately, if we strictly follow
+   that theory we get a bunch of overhead up front and no great
+   benefit until the number of predecessors gets large.  So, stop at
+   redeltifying the parent if the number of predecessors is less than
+   32 or is not divisible by four. */
+static svn_error_t *
+txn_deltify (dag_node_t *node, int pred_count, int props_only, trail_t *trail)
+{
+  int nlevels, lev, count;
+  dag_node_t *prednode;
+  svn_fs_t *fs;
+
+  /* Decide how many predecessors to redeltify. */
+  nlevels = 1;
+  if (pred_count >= 32 && pred_count % 4 == 0)
+    {
+      while (pred_count % 2 == 0)
+        {
+          pred_count /= 2;
+          nlevels++;
+        }
+    }
+
+  /* Redeltify the desired number of predecessors. */
+  count = 0;
+  prednode = node;
+  fs = svn_fs__dag_get_fs (node);
+  for (lev = 0; lev < nlevels; lev++)
+    {
+      while (count < (1 << lev))
+        {
+          const svn_fs_id_t *pred_id;
+
+          SVN_ERR (svn_fs__dag_get_predecessor_id (&pred_id, prednode, trail));
+          if (pred_id == NULL)
+            return svn_error_create (SVN_ERR_FS_CORRUPT, 0, 0, trail->pool,
+                                     "faulty predecessor count");
+          SVN_ERR (svn_fs__dag_get_node (&prednode, fs, pred_id, trail));
+          count++;
+        }
+      SVN_ERR (svn_fs__dag_deltify (prednode, node, props_only, trail));
+    }
+
+  return SVN_NO_ERROR;
+}
+
+
 /* Deltify ID's predecessor iff ID is mutable under TXN_ID in FS.  If
    ID is a mutable directory, recurse.  Do this as part of TRAIL. */
 static svn_error_t *
@@ -1141,7 +1200,6 @@
   dag_node_t *node;
   apr_hash_t *entries;
   int is_dir;
-  const svn_fs_id_t *pred_id;
 
   /* Not mutable?  Go no further.  This is safe to do because for
      items in the tree to be mutable, their parent dirs must also be
@@ -1180,14 +1238,8 @@
         }
     }
 
-  /* If we have have a predecessor, deltify it against the current
-     node revision. */
-  if ((pred_id = noderev->predecessor_id))
-    {
-      dag_node_t *pred_node;
-      SVN_ERR (svn_fs__dag_get_node (&pred_node, fs, pred_id, trail));
-      SVN_ERR (svn_fs__dag_deltify (pred_node, node, is_dir, trail));
-    }
+  if (noderev->predecessor_id)
+    SVN_ERR (txn_deltify (node, noderev->predecessor_count, is_dir, trail));
 
   return SVN_NO_ERROR;
 }
@@ -1252,6 +1304,7 @@
                  const svn_fs_id_t *target_id,
                  const char *txn_id,
                  const char *target_path,
+                 int source_pred_count,
                  trail_t *trail)
 {
   svn_fs__node_revision_t *noderev;
@@ -1263,6 +1316,9 @@
        "unexpected immutable node at \"%s\"", target_path);
   SVN_ERR (svn_fs__get_node_revision (&noderev, fs, target_id, trail));
   noderev->predecessor_id = source_id;
+  noderev->predecessor_count = source_pred_count;
+  if (noderev->predecessor_count != -1)
+    noderev->predecessor_count++;
   return svn_fs__put_node_revision (fs, target_id, noderev, trail);
 }
 
@@ -1662,6 +1718,7 @@
                 {
                   dag_node_t *s_ent_node, *t_ent_node, *a_ent_node;
                   const char *new_tpath;
+                  int pred_count;
                       
                   SVN_ERR (svn_fs__dag_get_node (&s_ent_node, fs,
                                                  s_entry->id, trail));
@@ -1689,13 +1746,17 @@
                                   t_ent_node, s_ent_node, a_ent_node,
                                   txn_id, trail));
 
+                  SVN_ERR (svn_fs__dag_get_predecessor_count (&pred_count,
+                                                              s_ent_node,
+                                                              trail));
+
                   /* If target is an immediate descendant of ancestor,
                      and source is also a descendant of ancestor, we
                      need to point target's predecessor-id to
                      source. */
                   SVN_ERR (update_ancestry (fs, s_entry->id,
                                             t_entry->id, txn_id, 
-                                            new_tpath, trail));
+                                            new_tpath, pred_count, trail));
                 }
               /* Else target entry has changed since ancestor entry,
                  but it changed either to source entry or to a
@@ -1928,16 +1989,21 @@
     }
   else
     {
+      int pred_count;
+
       SVN_ERR (merge (args->conflict, "/", txn_root_node,
                       source_node, ancestor_node, txn_id, trail));
 
+      SVN_ERR (svn_fs__dag_get_predecessor_count (&pred_count, source_node,
+                                                  trail));
+
       /* After the merge, txn's new "ancestor" is now really the node
          at source_id, so record that fact.  Think of this as
          ratcheting the txn forward in time, so it can't backslide and
          forget the merging work that's already been done. */
       SVN_ERR (update_ancestry (fs, source_id, 
                                 svn_fs__dag_get_id (txn_root_node),
-                                txn_id, "/", trail));
+                                txn_id, "/", pred_count, trail));
       SVN_ERR (svn_fs__set_txn_base (fs, txn_id, source_id, trail));
     }
   
Index: ./subversion/libsvn_fs/reps-strings.c
===================================================================
--- ./subversion/libsvn_fs/reps-strings.c
+++ ./subversion/libsvn_fs/reps-strings.c	Fri Jul 26 17:05:10 2002
@@ -1303,55 +1303,43 @@
        "svn_fs__rep_deltify: failed to calculate MD5 digest for %s",
        source);
 
-  /* Get the size of the target's original string data.  Note that we
-     don't use svn_fs__rep_contents_size() for this; that function
-     always returns the fulltext size, whereas we need to know the
-     actual amount of storage used by this representation.  Check the
-     size of the new string.  If it is larger than the old one, this
-     whole deltafication might not be such a bright idea.  While we're
-     at it, we might as well figure out all the strings currently used
-     by REP so we can potentially delete them later. */
+  /* Construct a list of the strings used by the old representation so
+     that we can delete them later.  While we are here, if the old
+     representation was a fulltext, check to make sure the delta we're
+     replacing it with is actually smaller.  (Don't perform this check
+     if we're replacing a delta; in that case, we're going for a time
+     optimization, not a space optimization.)  */
   {
     svn_fs__representation_t *old_rep;
-    apr_size_t old_size = 0;
     const char *str_key;
 
     SVN_ERR (svn_fs__read_rep (&old_rep, fs, target, trail));
     if (old_rep->kind == svn_fs__rep_kind_fulltext)
       {
+        apr_size_t old_size = 0;
+
         str_key = old_rep->contents.fulltext.string_key;
         SVN_ERR (svn_fs__string_size (&old_size, fs, str_key, trail));
         orig_str_keys = apr_array_make (pool, 1, sizeof (str_key));
         (*((const char **)(apr_array_push (orig_str_keys)))) = str_key;
-      }
-    else if (old_rep->kind == svn_fs__rep_kind_delta)
-      {
-        int i;
-        apr_size_t my_size;
-        
-        SVN_ERR (delta_string_keys (&orig_str_keys, old_rep, pool));
-        for (i = 0; i < orig_str_keys->nelts; i++)
+
+        /* If the new data is NOT an space optimization, destroy the
+           string(s) we created, and get outta here. */
+        if (diffsize >= old_size)
           {
-            str_key = ((const char **) orig_str_keys->elts)[i];
-            SVN_ERR (svn_fs__string_size (&my_size, fs, str_key, trail));
-            old_size += my_size;
+            int i;
+            for (i = 0; i < windows->nelts; i++)
+              {
+                ww = ((window_write_t **) windows->elts)[i];
+                SVN_ERR (svn_fs__string_delete (fs, ww->key, trail));
+              }
+            return SVN_NO_ERROR;
           }
       }
+    else if (old_rep->kind == svn_fs__rep_kind_delta)
+      SVN_ERR (delta_string_keys (&orig_str_keys, old_rep, pool));
     else /* unknown kind */
       abort ();
-
-    /* If the new data is NOT an space optimization, destroy the
-       string(s) we created, and get outta here. */
-    if (diffsize >= old_size)
-      {
-        int i;
-        for (i = 0; i < windows->nelts; i++)
-          {
-            ww = ((window_write_t **) windows->elts)[i];
-            SVN_ERR (svn_fs__string_delete (fs, ww->key, trail));
-          }
-        return SVN_NO_ERROR;
-      }
   }
 
   /* Hook the new strings we wrote into the rest of the filesystem by
Index: ./subversion/libsvn_fs/dag.h
===================================================================
--- ./subversion/libsvn_fs/dag.h
+++ ./subversion/libsvn_fs/dag.h	Fri Jul 26 04:05:09 2002
@@ -109,6 +109,13 @@
                                              trail_t *trail);
 
 
+/* Set *COUNT to the number of predecessors NODE has (recursively), or
+   -1 if not known, as part of TRAIL.  */
+svn_error_t *svn_fs__dag_get_predecessor_count (int *count,
+                                                dag_node_t *node,
+                                                trail_t *trail);
+
+
 /* Callback function type for svn_fs__dag_walk_predecessors() */
 typedef svn_error_t *(*svn_fs__dag_pred_func_t) (void *baton,
                                                  dag_node_t *node,
Index: ./subversion/libsvn_fs/fs.h
===================================================================
--- ./subversion/libsvn_fs/fs.h
+++ ./subversion/libsvn_fs/fs.h	Fri Jul 26 04:03:37 2002
@@ -142,6 +142,10 @@
      for this node revision */
   const svn_fs_id_t *predecessor_id;
 
+  /* number of predecessors this node revision has (recursively), or
+     -1 if not known (for backward compatibility). */
+  int predecessor_count;
+
   /* representation key for this node's properties.  may be NULL if
      there are no properties.  */
   const char *prop_key;
Index: ./subversion/libsvn_fs/structure
===================================================================
--- ./subversion/libsvn_fs/structure
+++ ./subversion/libsvn_fs/structure	Fri Jul 26 18:46:46 2002
@@ -116,7 +116,7 @@
 
 HEADER has the form:
 
-    (KIND PREDECESSOR)
+    (KIND PREDECESSOR-ID PREDECESSOR-COUNT)
 
 where:
 
@@ -128,6 +128,9 @@
    * PREDECESSOR-ID, if present, indicates the node revision which is
      the immediate ancestor of this node.
 
+   * PREDECESSOR-COUNT, if present, indicates the number of
+     predecessors the node revision has (recursively).
+
 Note that a node cannot change its kind from one revision to the next.
 A directory node is always a directory; a file node is always a file;
 etc.  The fact that the node's kind is stored in each node revision,
@@ -773,9 +776,10 @@
                    FILE ::= (HEADER PROP-KEY DATA-KEY [EDIT-DATA-KEY]) ;
                     DIR ::= (HEADER PROP-KEY ENTRIES-KEY) ;
 
-                 HEADER ::= (KIND [PREDECESSOR-ID]) ;
+                 HEADER ::= (KIND [PREDECESSOR-ID [PREDECESSOR-COUNT]]) ;
                    KIND ::= "file" | "dir" ;
          PREDECESSOR-ID ::= NODE-REV-ID ;
+      PREDECESSOR-COUNT ::= number ;
                PROP-KEY ::= atom ;
                 REP-KEY ::= atom ;
 
Index: ./subversion/libsvn_fs/dag.c
===================================================================
--- ./subversion/libsvn_fs/dag.c
+++ ./subversion/libsvn_fs/dag.c	Fri Jul 26 04:29:05 2002
@@ -132,6 +132,7 @@
   nr->kind = noderev->kind;
   if (noderev->predecessor_id)
     nr->predecessor_id = svn_fs__id_copy (noderev->predecessor_id, pool);
+  nr->predecessor_count = noderev->predecessor_count;
   if (noderev->prop_key)
     nr->prop_key = apr_pstrdup (pool, noderev->prop_key);
   if (noderev->data_key)
@@ -298,6 +299,19 @@
 
 
 svn_error_t *
+svn_fs__dag_get_predecessor_count (int *count,
+                                   dag_node_t *node,
+                                   trail_t *trail)
+{
+  svn_fs__node_revision_t *noderev;
+  
+  SVN_ERR (get_node_revision (&noderev, node, trail));
+  *count = noderev->predecessor_count;
+  return SVN_NO_ERROR;
+}
+
+
+svn_error_t *
 svn_fs__dag_walk_predecessors (dag_node_t *node,
                                svn_fs__dag_pred_func_t callback,
                                void *baton,
@@ -833,6 +847,8 @@
       
       /* Do the clone thingy here. */
       noderev->predecessor_id = svn_fs__id_copy (cur_entry->id, trail->pool);
+      if (noderev->predecessor_count != -1)
+        noderev->predecessor_count++;
       SVN_ERR (svn_fs__create_successor (&new_node_id, fs, cur_entry->id, 
                                          noderev, copy_id, txn_id, trail));
       
@@ -875,6 +891,8 @@
          the root node?  That is, does this function need a copy_id
          passed in?  */
       noderev->predecessor_id = svn_fs__id_copy (base_root_id, trail->pool);
+      if (noderev->predecessor_count != -1)
+        noderev->predecessor_count++;
       SVN_ERR (svn_fs__create_successor (&root_id, fs, base_root_id, 
                                          noderev, 
                                          svn_fs__id_copy_id (base_root_id),
@@ -1426,6 +1444,8 @@
       /* Create a successor with its predecessor pointing at the copy
          source. */
       to_noderev->predecessor_id = svn_fs__id_copy (src_id, trail->pool);
+      if (to_noderev->predecessor_count != -1)
+        to_noderev->predecessor_count++;
       SVN_ERR (svn_fs__create_successor (&id, fs, src_id, to_noderev,
                                          copy_id, txn_id, trail));
 
Index: ./subversion/libsvn_fs/util/fs_skels.c
===================================================================
--- ./subversion/libsvn_fs/util/fs_skels.c
+++ ./subversion/libsvn_fs/util/fs_skels.c	Fri Jul 26 04:36:30 2002
@@ -204,6 +204,13 @@
       && skel->children->next->is_atom)
     return 1;
 
+  /* or with predecessor and predecessor count... */
+  if ((len == 3)
+      && skel->children->is_atom
+      && skel->children->next->is_atom
+      && skel->children->next->next->is_atom)
+    return 1;
+
   return 0;
 }
 
@@ -550,6 +557,17 @@
       noderev->predecessor_id 
         = svn_fs_parse_id (header_skel->children->next->data,
                            header_skel->children->next->len, pool);
+
+      /* PREDECESSOR-COUNT */
+      if (header_skel->children->next->next)
+        {
+          noderev->predecessor_count =
+            atoi (apr_pstrmemdup (pool,
+                                  header_skel->children->next->next->data,
+                                  header_skel->children->next->next->len));
+        }
+      else if (noderev->predecessor_id)
+        noderev->predecessor_count = -1;
     }
       
   /* PROP-KEY */
@@ -973,6 +991,14 @@
   skel = svn_fs__make_empty_list (pool);
   header_skel = svn_fs__make_empty_list (pool);
 
+  /* PREDECESSOR-COUNT */
+  if (noderev->predecessor_count != -1)
+    {
+      const char *count_str = apr_psprintf(pool, "%d",
+                                           noderev->predecessor_count);
+      svn_fs__prepend (svn_fs__str_atom (count_str, pool), header_skel);
+    }
+
   /* PREDECESSOR-ID */
   if (noderev->predecessor_id)
     {
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org
Received on Sat Jul 27 01:22:01 2002