It turns out none of my skip-delta implementations were actually
creating skip-deltas; svn_fs_rep_deltify() was throwing out the new
deltas because they were bigger than the old ones. With that fixed, I
got the following results from a dump of the svn repository:
Delta combiner Delta combiner plus skip-deltas
-------------- -------------------------------
svnadmin load 949 994
svnadmin dump 516 371
svn co -r500 42 36
svn up -r2500 345 368
svn up -r1500 110 111
Repos db size 38712K 40776K
To summarize, on our repository, the effect of this skip-delta
implementation is:
* Load times: Up 5%
* Dump times: Down 30%
* Day to day operations: Drowned in the noise
* Repository size: Up 5%
Of course, with more extreme data sets, you get more extreme benefits.
To show this, I artificially constructed a ~50MB "repository dump"
where the repository consists of a single text file which shrank
steadily from 3000 lines to 0 lines over 1000 revisions. With this
repository, skip-deltas increased the load time by 17% but decreased
the dump time by 94%, and the repository size still only went up by a
small fraction.
Here is the patch, for review. It's against branches/issue-531-dev,
and should wait until the delta combiner is integrated into the trunk.
* tree.c (txn_deltify, deltify_if_mutable_under_txn_id):
Implement skip-deltas, in a low-overhead way.
* reps-strings.c (svn_fs__rep_deltify): Only check for
increasing rep size when replacing a plain text. When
replacing a delta, the goal is speed, not size.
* structure,
fs.h (svn_fs__node_revision_t)
tree.c (update_ancestry, merge, txn_body_merge),
dag.c, dag.h (svn_fs__dag_get_predecessor_count,
copy_node_revision, svn_fs__dag_clone_child,
svn_fs__dag_clone_root, svn_fs__dag_copy),
util/fs_skels.c (is_valid_node_revision_header_skel,
svn_fs__parse_node_revision_skel,
svn_fs__unparse_node_revision_skel):
Maintain a predecessor count in node-revision header skels.
The new code will work against old repositories
(predecessor_count will be set to -1 for node-revisions in
existing nodes, which means no skip deltas will be done for
those nodes); old fs code will barf against new
repositories, but that's normal.
Index: ./subversion/libsvn_fs/tree.c
===================================================================
--- ./subversion/libsvn_fs/tree.c
+++ ./subversion/libsvn_fs/tree.c Fri Jul 26 10:45:10 2002
@@ -1129,6 +1129,65 @@
};
+/* Redeltify predecessor node-revisions of the one we added. The idea
+ is to require at most 2*lg(N) deltas to be applied to get to any
+ node-revision in a chain of N predecessors. We do this using a
+ technique derived from skip lists:
+
+ Always redeltify the immediate parent
+ If the number of predecessors is divisible by 2, deltify
+ the revision two predecessors back
+ If the number of predecessors is divisible by 4, deltify
+ the revision four predecessors back
+ etc.
+
+ That's the theory, anyway. Unfortunately, if we strictly follow
+ that theory we get a bunch of overhead up front and no great
+ benefit until the number of predecessors gets large. So, stop at
+ redeltifying the parent if the number of predecessors is less than
+ 32 or is not divisible by four. */
+static svn_error_t *
+txn_deltify (dag_node_t *node, int pred_count, int props_only, trail_t *trail)
+{
+ int nlevels, lev, count;
+ dag_node_t *prednode;
+ svn_fs_t *fs;
+
+ /* Decide how many predecessors to redeltify. */
+ nlevels = 1;
+ if (pred_count >= 32 && pred_count % 4 == 0)
+ {
+ while (pred_count % 2 == 0)
+ {
+ pred_count /= 2;
+ nlevels++;
+ }
+ }
+
+ /* Redeltify the desired number of predecessors. */
+ count = 0;
+ prednode = node;
+ fs = svn_fs__dag_get_fs (node);
+ for (lev = 0; lev < nlevels; lev++)
+ {
+ while (count < (1 << lev))
+ {
+ const svn_fs_id_t *pred_id;
+
+ SVN_ERR (svn_fs__dag_get_predecessor_id (&pred_id, prednode, trail));
+ if (pred_id == NULL)
+ return svn_error_create (SVN_ERR_FS_CORRUPT, 0, 0, trail->pool,
+ "faulty predecessor count");
+ SVN_ERR (svn_fs__dag_get_node (&prednode, fs, pred_id, trail));
+ count++;
+ }
+ SVN_ERR (svn_fs__dag_deltify (prednode, node, props_only, trail));
+ }
+
+ return SVN_NO_ERROR;
+}
+
+
/* Deltify ID's predecessor iff ID is mutable under TXN_ID in FS. If
ID is a mutable directory, recurse. Do this as part of TRAIL. */
static svn_error_t *
@@ -1141,7 +1200,6 @@
dag_node_t *node;
apr_hash_t *entries;
int is_dir;
- const svn_fs_id_t *pred_id;
/* Not mutable? Go no further. This is safe to do because for
items in the tree to be mutable, their parent dirs must also be
@@ -1180,14 +1238,8 @@
}
}
- /* If we have have a predecessor, deltify it against the current
- node revision. */
- if ((pred_id = noderev->predecessor_id))
- {
- dag_node_t *pred_node;
- SVN_ERR (svn_fs__dag_get_node (&pred_node, fs, pred_id, trail));
- SVN_ERR (svn_fs__dag_deltify (pred_node, node, is_dir, trail));
- }
+ if (noderev->predecessor_id)
+ SVN_ERR (txn_deltify (node, noderev->predecessor_count, is_dir, trail));
return SVN_NO_ERROR;
}
@@ -1252,6 +1304,7 @@
const svn_fs_id_t *target_id,
const char *txn_id,
const char *target_path,
+ int source_pred_count,
trail_t *trail)
{
svn_fs__node_revision_t *noderev;
@@ -1263,6 +1316,9 @@
"unexpected immutable node at \"%s\"", target_path);
SVN_ERR (svn_fs__get_node_revision (&noderev, fs, target_id, trail));
noderev->predecessor_id = source_id;
+ noderev->predecessor_count = source_pred_count;
+ if (noderev->predecessor_count != -1)
+ noderev->predecessor_count++;
return svn_fs__put_node_revision (fs, target_id, noderev, trail);
}
@@ -1662,6 +1718,7 @@
{
dag_node_t *s_ent_node, *t_ent_node, *a_ent_node;
const char *new_tpath;
+ int pred_count;
SVN_ERR (svn_fs__dag_get_node (&s_ent_node, fs,
s_entry->id, trail));
@@ -1689,13 +1746,17 @@
t_ent_node, s_ent_node, a_ent_node,
txn_id, trail));
+ SVN_ERR (svn_fs__dag_get_predecessor_count (&pred_count,
+ s_ent_node,
+ trail));
+
/* If target is an immediate descendant of ancestor,
and source is also a descendant of ancestor, we
need to point target's predecessor-id to
source. */
SVN_ERR (update_ancestry (fs, s_entry->id,
t_entry->id, txn_id,
- new_tpath, trail));
+ new_tpath, pred_count, trail));
}
/* Else target entry has changed since ancestor entry,
but it changed either to source entry or to a
@@ -1928,16 +1989,21 @@
}
else
{
+ int pred_count;
+
SVN_ERR (merge (args->conflict, "/", txn_root_node,
source_node, ancestor_node, txn_id, trail));
+ SVN_ERR (svn_fs__dag_get_predecessor_count (&pred_count, source_node,
+ trail));
+
/* After the merge, txn's new "ancestor" is now really the node
at source_id, so record that fact. Think of this as
ratcheting the txn forward in time, so it can't backslide and
forget the merging work that's already been done. */
SVN_ERR (update_ancestry (fs, source_id,
svn_fs__dag_get_id (txn_root_node),
- txn_id, "/", trail));
+ txn_id, "/", pred_count, trail));
SVN_ERR (svn_fs__set_txn_base (fs, txn_id, source_id, trail));
}
Index: ./subversion/libsvn_fs/reps-strings.c
===================================================================
--- ./subversion/libsvn_fs/reps-strings.c
+++ ./subversion/libsvn_fs/reps-strings.c Fri Jul 26 17:05:10 2002
@@ -1303,55 +1303,43 @@
"svn_fs__rep_deltify: failed to calculate MD5 digest for %s",
source);
- /* Get the size of the target's original string data. Note that we
- don't use svn_fs__rep_contents_size() for this; that function
- always returns the fulltext size, whereas we need to know the
- actual amount of storage used by this representation. Check the
- size of the new string. If it is larger than the old one, this
- whole deltafication might not be such a bright idea. While we're
- at it, we might as well figure out all the strings currently used
- by REP so we can potentially delete them later. */
+ /* Construct a list of the strings used by the old representation so
+ that we can delete them later. While we are here, if the old
+ representation was a fulltext, check to make sure the delta we're
+ replacing it with is actually smaller. (Don't perform this check
+ if we're replacing a delta; in that case, we're going for a time
+ optimization, not a space optimization.) */
{
svn_fs__representation_t *old_rep;
- apr_size_t old_size = 0;
const char *str_key;
SVN_ERR (svn_fs__read_rep (&old_rep, fs, target, trail));
if (old_rep->kind == svn_fs__rep_kind_fulltext)
{
+ apr_size_t old_size = 0;
+
str_key = old_rep->contents.fulltext.string_key;
SVN_ERR (svn_fs__string_size (&old_size, fs, str_key, trail));
orig_str_keys = apr_array_make (pool, 1, sizeof (str_key));
(*((const char **)(apr_array_push (orig_str_keys)))) = str_key;
- }
- else if (old_rep->kind == svn_fs__rep_kind_delta)
- {
- int i;
- apr_size_t my_size;
-
- SVN_ERR (delta_string_keys (&orig_str_keys, old_rep, pool));
- for (i = 0; i < orig_str_keys->nelts; i++)
+
+ /* If the new data is NOT an space optimization, destroy the
+ string(s) we created, and get outta here. */
+ if (diffsize >= old_size)
{
- str_key = ((const char **) orig_str_keys->elts)[i];
- SVN_ERR (svn_fs__string_size (&my_size, fs, str_key, trail));
- old_size += my_size;
+ int i;
+ for (i = 0; i < windows->nelts; i++)
+ {
+ ww = ((window_write_t **) windows->elts)[i];
+ SVN_ERR (svn_fs__string_delete (fs, ww->key, trail));
+ }
+ return SVN_NO_ERROR;
}
}
+ else if (old_rep->kind == svn_fs__rep_kind_delta)
+ SVN_ERR (delta_string_keys (&orig_str_keys, old_rep, pool));
else /* unknown kind */
abort ();
-
- /* If the new data is NOT an space optimization, destroy the
- string(s) we created, and get outta here. */
- if (diffsize >= old_size)
- {
- int i;
- for (i = 0; i < windows->nelts; i++)
- {
- ww = ((window_write_t **) windows->elts)[i];
- SVN_ERR (svn_fs__string_delete (fs, ww->key, trail));
- }
- return SVN_NO_ERROR;
- }
}
/* Hook the new strings we wrote into the rest of the filesystem by
Index: ./subversion/libsvn_fs/dag.h
===================================================================
--- ./subversion/libsvn_fs/dag.h
+++ ./subversion/libsvn_fs/dag.h Fri Jul 26 04:05:09 2002
@@ -109,6 +109,13 @@
trail_t *trail);
+/* Set *COUNT to the number of predecessors NODE has (recursively), or
+ -1 if not known, as part of TRAIL. */
+svn_error_t *svn_fs__dag_get_predecessor_count (int *count,
+ dag_node_t *node,
+ trail_t *trail);
+
+
/* Callback function type for svn_fs__dag_walk_predecessors() */
typedef svn_error_t *(*svn_fs__dag_pred_func_t) (void *baton,
dag_node_t *node,
Index: ./subversion/libsvn_fs/fs.h
===================================================================
--- ./subversion/libsvn_fs/fs.h
+++ ./subversion/libsvn_fs/fs.h Fri Jul 26 04:03:37 2002
@@ -142,6 +142,10 @@
for this node revision */
const svn_fs_id_t *predecessor_id;
+ /* number of predecessors this node revision has (recursively), or
+ -1 if not known (for backward compatibility). */
+ int predecessor_count;
+
/* representation key for this node's properties. may be NULL if
there are no properties. */
const char *prop_key;
Index: ./subversion/libsvn_fs/structure
===================================================================
--- ./subversion/libsvn_fs/structure
+++ ./subversion/libsvn_fs/structure Fri Jul 26 18:46:46 2002
@@ -116,7 +116,7 @@
HEADER has the form:
- (KIND PREDECESSOR)
+ (KIND PREDECESSOR-ID PREDECESSOR-COUNT)
where:
@@ -128,6 +128,9 @@
* PREDECESSOR-ID, if present, indicates the node revision which is
the immediate ancestor of this node.
+ * PREDECESSOR-COUNT, if present, indicates the number of
+ predecessors the node revision has (recursively).
+
Note that a node cannot change its kind from one revision to the next.
A directory node is always a directory; a file node is always a file;
etc. The fact that the node's kind is stored in each node revision,
@@ -773,9 +776,10 @@
FILE ::= (HEADER PROP-KEY DATA-KEY [EDIT-DATA-KEY]) ;
DIR ::= (HEADER PROP-KEY ENTRIES-KEY) ;
- HEADER ::= (KIND [PREDECESSOR-ID]) ;
+ HEADER ::= (KIND [PREDECESSOR-ID [PREDECESSOR-COUNT]]) ;
KIND ::= "file" | "dir" ;
PREDECESSOR-ID ::= NODE-REV-ID ;
+ PREDECESSOR-COUNT ::= number ;
PROP-KEY ::= atom ;
REP-KEY ::= atom ;
Index: ./subversion/libsvn_fs/dag.c
===================================================================
--- ./subversion/libsvn_fs/dag.c
+++ ./subversion/libsvn_fs/dag.c Fri Jul 26 04:29:05 2002
@@ -132,6 +132,7 @@
nr->kind = noderev->kind;
if (noderev->predecessor_id)
nr->predecessor_id = svn_fs__id_copy (noderev->predecessor_id, pool);
+ nr->predecessor_count = noderev->predecessor_count;
if (noderev->prop_key)
nr->prop_key = apr_pstrdup (pool, noderev->prop_key);
if (noderev->data_key)
@@ -298,6 +299,19 @@
svn_error_t *
+svn_fs__dag_get_predecessor_count (int *count,
+ dag_node_t *node,
+ trail_t *trail)
+{
+ svn_fs__node_revision_t *noderev;
+
+ SVN_ERR (get_node_revision (&noderev, node, trail));
+ *count = noderev->predecessor_count;
+ return SVN_NO_ERROR;
+}
+
+
+svn_error_t *
svn_fs__dag_walk_predecessors (dag_node_t *node,
svn_fs__dag_pred_func_t callback,
void *baton,
@@ -833,6 +847,8 @@
/* Do the clone thingy here. */
noderev->predecessor_id = svn_fs__id_copy (cur_entry->id, trail->pool);
+ if (noderev->predecessor_count != -1)
+ noderev->predecessor_count++;
SVN_ERR (svn_fs__create_successor (&new_node_id, fs, cur_entry->id,
noderev, copy_id, txn_id, trail));
@@ -875,6 +891,8 @@
the root node? That is, does this function need a copy_id
passed in? */
noderev->predecessor_id = svn_fs__id_copy (base_root_id, trail->pool);
+ if (noderev->predecessor_count != -1)
+ noderev->predecessor_count++;
SVN_ERR (svn_fs__create_successor (&root_id, fs, base_root_id,
noderev,
svn_fs__id_copy_id (base_root_id),
@@ -1426,6 +1444,8 @@
/* Create a successor with its predecessor pointing at the copy
source. */
to_noderev->predecessor_id = svn_fs__id_copy (src_id, trail->pool);
+ if (to_noderev->predecessor_count != -1)
+ to_noderev->predecessor_count++;
SVN_ERR (svn_fs__create_successor (&id, fs, src_id, to_noderev,
copy_id, txn_id, trail));
Index: ./subversion/libsvn_fs/util/fs_skels.c
===================================================================
--- ./subversion/libsvn_fs/util/fs_skels.c
+++ ./subversion/libsvn_fs/util/fs_skels.c Fri Jul 26 04:36:30 2002
@@ -204,6 +204,13 @@
&& skel->children->next->is_atom)
return 1;
+ /* or with predecessor and predecessor count... */
+ if ((len == 3)
+ && skel->children->is_atom
+ && skel->children->next->is_atom
+ && skel->children->next->next->is_atom)
+ return 1;
+
return 0;
}
@@ -550,6 +557,17 @@
noderev->predecessor_id
= svn_fs_parse_id (header_skel->children->next->data,
header_skel->children->next->len, pool);
+
+ /* PREDECESSOR-COUNT */
+ if (header_skel->children->next->next)
+ {
+ noderev->predecessor_count =
+ atoi (apr_pstrmemdup (pool,
+ header_skel->children->next->next->data,
+ header_skel->children->next->next->len));
+ }
+ else if (noderev->predecessor_id)
+ noderev->predecessor_count = -1;
}
/* PROP-KEY */
@@ -973,6 +991,14 @@
skel = svn_fs__make_empty_list (pool);
header_skel = svn_fs__make_empty_list (pool);
+ /* PREDECESSOR-COUNT */
+ if (noderev->predecessor_count != -1)
+ {
+ const char *count_str = apr_psprintf(pool, "%d",
+ noderev->predecessor_count);
+ svn_fs__prepend (svn_fs__str_atom (count_str, pool), header_skel);
+ }
+
/* PREDECESSOR-ID */
if (noderev->predecessor_id)
{
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org
Received on Sat Jul 27 01:22:01 2002