[PATCH] Use serialized hashes for node-origins records in FSFS [unfinished]

From: C. Michael Pilato <cmpilato_at_collab.net>
Date: Fri, 01 Feb 2008 14:08:02 -0500

The following patch was done just to see what kind of space savings storing
several node-origins records in a single file would provide for FSFS. (Yes,
it's still along the lines of trying to let folks avoid an unnecessary
dump/load.) Because it was simple to do, I decided to use our serialized
hash file format with filenames that of:

db/node-origins/node_id_minus_last_char

(and for single-character node IDs, db/node-origins/0).

I don't have a monstrously large repository to play with -- closest thing is
my sync of our own source code repository. So here's the before and after:

BEFORE

    $ du -sk /usr/local/svn/subversion
    496360 /usr/local/svn/subversion
    $ du -sk /usr/local/svn/subversion/db/node-origins/
    20108 /usr/local/svn/subversion/db/node-origins/

AFTER

    $ du -sk /usr/local/svn/subversion
    476816 /usr/local/svn/subversion
    $ du -sk /usr/local/svn/subversion/db/node-origins/
    564 /usr/local/svn/subversion/db/node-origins/

20M to 564k? Not bad... After the rebuild of the node-origins cache, I had
only two underutilized files (the first-created one, and the most recent one).

That said, my patch is incomplete in that it doesn't quite do the concurrent
access thing correctly. Fixing that might be simple, but it remains to be
done nonetheless.

I think Glasser's approach of using node revisions IDs to store origins is a
very good one, and would encourage him to go ahead with that approach. I'd
still be happiest if he consider a hybrid model -- I don't think the
"switching code" that would have to bridge the two approaches would really
be that hard to deal with. svn_fs_fs__set_node_origin() use the current
approach; at commit time the logic would use the old approach;
svn_fs_fs__get_node_origin() would check the node_id for an origin, and
failing that check the disk for a record. Seems purty straightforward.

But I need to hop off of this task for the time being, so I'm selfishly
using all of your Inboxes as my personal patch backup system. :-)

-- 
C. Michael Pilato <cmpilato_at_collab.net>
CollabNet   <>   www.collab.net   <>   Distributed Development On Demand

Teach the FSFS implementation of node-origins indexing to use
serialized hash files to house 36 records at a time instead of a
separate file for each record.

###
### WARNING: This does *not* deal with concurrency properly!
### (Would making set_node_origins_for_file() into an
### svn_fs_fs__with_write_lock()-wrapped function fix this?)
###

* subversion/libsvn_fs_fs/fs_fs.c
  (path_node_origins): Tweak to calculate the path to a node-origins
    file based on the node-id minus it's last character.
  (get_node_origins_from_file, set_node_origins_for_file): New helper
    functions.
  (svn_fs_fs__get_node_origin): Now use get_node_origins_from_file()
    to get a hash of node origins, and find what it seeks in the hash.
  (svn_fs_fs__set_node_origins, svn_fs_fs__set_node_origin): Now use
    set_node_origins_for_file(), fiddling with data as necessary to do so.

Index: subversion/libsvn_fs_fs/fs_fs.c
===================================================================
--- subversion/libsvn_fs_fs/fs_fs.c (revision 29107)
+++ subversion/libsvn_fs_fs/fs_fs.c (working copy)
@@ -306,8 +306,11 @@
static APR_INLINE const char *
path_node_origin(svn_fs_t *fs, const char *node_id, apr_pool_t *pool)
{
+ int len = strlen(node_id);
+ const char *node_id_minus_last_char =
+ (len == 1) ? "0" : apr_pstrmemdup(pool, node_id, len - 1);
   return svn_path_join_many(pool, fs->path, PATH_NODE_ORIGINS_DIR,
- node_id, NULL);
+ node_id_minus_last_char, NULL);
}

@@ -5711,18 +5714,20 @@
   return SVN_NO_ERROR;
}

-svn_error_t *
-svn_fs_fs__get_node_origin(const svn_fs_id_t **origin_id,
- svn_fs_t *fs,
- const char *node_id,
+/* Set *NODE_ORIGINS to a hash mapping 'const char *' node IDs to
+ 'svn_string_t *' node revision IDs. Use POOL for allocations. */
+static svn_error_t *
+get_node_origins_from_file(svn_fs_t *fs,
+ apr_hash_t **node_origins,
+ const char *node_origins_file,
                            apr_pool_t *pool)
{
   apr_file_t *fd;
- svn_stringbuf_t *origin_stringbuf;
   svn_error_t *err;
+ svn_stream_t *stream;

- *origin_id = NULL;
- err = svn_io_file_open(&fd, path_node_origin(fs, node_id, pool),
+ *node_origins = NULL;
+ err = svn_io_file_open(&fd, node_origins_file,
                          APR_READ, APR_OS_DEFAULT, pool);
   if (err && APR_STATUS_IS_ENOENT(err->apr_err))
     {
@@ -5731,43 +5736,80 @@
     }
   SVN_ERR(err);

- SVN_ERR(svn_stringbuf_from_aprfile(&origin_stringbuf, fd, pool));
+ stream = svn_stream_from_aprfile2(fd, FALSE, pool);
+ *node_origins = apr_hash_make(pool);
+ SVN_ERR(svn_hash_read2(*node_origins, stream, SVN_HASH_TERMINATOR, pool));
+ return svn_stream_close(stream);
+}

- *origin_id = svn_fs_fs__id_parse(origin_stringbuf->data,
- origin_stringbuf->len, pool);
+svn_error_t *
+svn_fs_fs__get_node_origin(const svn_fs_id_t **origin_id,
+ svn_fs_t *fs,
+ const char *node_id,
+ apr_pool_t *pool)
+{
+ apr_hash_t *node_origins;

- SVN_ERR(svn_io_file_close(fd, pool));
-
+ *origin_id = NULL;
+ SVN_ERR(get_node_origins_from_file(fs, &node_origins,
+ path_node_origin(fs, node_id, pool),
+ pool));
+ if (node_origins)
+ {
+ svn_string_t *origin_id_str =
+ apr_hash_get(node_origins, node_id, APR_HASH_KEY_STRING);
+ if (origin_id_str)
+ *origin_id = svn_fs_fs__id_parse(origin_id_str->data,
+ origin_id_str->len, pool);
+ }
   return SVN_NO_ERROR;
}

-/* Helper for svn_fs_fs__set_node_origin[s]. Exactly like
- svn_fs_fs__set_node_origin, except that it throws an error if the
- file can't be written. */
+
+/* Helper for svn_fs_fs__set_node_origin[s]. Takes a hash of
+ NODE_ORIGINS records -- all destined for the same NODE_ORIGINS_PATH
+ file -- and merges them with any records already present in that
+ file. */
static svn_error_t *
-set_node_origin(svn_fs_t *fs,
- const char *node_id,
- const svn_fs_id_t *node_rev_id,
- apr_pool_t *pool)
+set_node_origins_for_file(svn_fs_t *fs,
+ const char *node_origins_path,
+ apr_hash_t *node_origins,
+ apr_pool_t *pool)
{
- apr_file_t *file;
- svn_string_t *node_rev_id_string;
+ apr_file_t *fd;
+ const char *path_tmp;
+ svn_stream_t *stream;
+ apr_hash_t *old_origins;

   SVN_ERR(svn_fs_fs__ensure_dir_exists(svn_path_join(fs->path,
                                                      PATH_NODE_ORIGINS_DIR,
                                                      pool),
                                        fs, pool));

- node_rev_id_string = svn_fs_fs__id_unparse(node_rev_id, pool);
+ /* Read the previously existing origins (if any), and merge our
+ updates with it. */
+ SVN_ERR(get_node_origins_from_file(fs, &old_origins,
+ node_origins_path, pool));
+ if (old_origins)
+ node_origins = apr_hash_overlay(pool, old_origins, node_origins);

- SVN_ERR(svn_io_file_open(&file, path_node_origin(fs, node_id, pool),
- APR_WRITE | APR_CREATE | APR_TRUNCATE
- | APR_BUFFERED, APR_OS_DEFAULT, pool));
- SVN_ERR(svn_io_file_write_full(file,
- node_rev_id_string->data,
- node_rev_id_string->len, NULL, pool));
- SVN_ERR(svn_io_file_close(file, pool));

+ /* ###
+ * ### LA DEE DAH -- Did anybody else spot the race condition here?
+ * ###
+ */
+
+
+ /* Create a temporary file, write out our hash, and close the file. */
+ SVN_ERR(svn_io_open_unique_file2(&fd, &path_tmp, node_origins_path, ".tmp",
+ svn_io_file_del_none, pool));
+ stream = svn_stream_from_aprfile2(fd, FALSE, pool);
+ SVN_ERR(svn_hash_write2(node_origins, stream, SVN_HASH_TERMINATOR, pool));
+ SVN_ERR(svn_stream_close(stream));
+
+ /* Rename the temp file as the real destination */
+ SVN_ERR(svn_io_file_rename(path_tmp, node_origins_path, pool));
+
   return SVN_NO_ERROR;
}

@@ -5779,7 +5821,11 @@
{
   apr_hash_index_t *hi;
   apr_pool_t *iterpool = svn_pool_create(pool);
+ apr_hash_t *hash_of_hashes = apr_hash_make(pool);

+ /* Split the hash of node IDs and origin node revision IDs into a
+ hash of hashes keyed on the filenames in which each of these bad
+ boys is supposed to be stored. */
   for (hi = apr_hash_first(pool, node_origins);
        hi != NULL;
        hi = apr_hash_next(hi))
@@ -5787,27 +5833,46 @@
       const void *key;
       void *val;
       const char *node_id;
- const svn_fs_id_t *node_rev_id;
+ svn_string_t *node_rev_id_str;
+ apr_hash_t *file_hash;
+ const char *filename;
+
+ apr_hash_this(hi, &key, NULL, &val);
+ node_id = key;
+ node_rev_id_str = svn_fs_fs__id_unparse(val, pool);
+ filename = path_node_origin(fs, node_id, pool);
+ file_hash = apr_hash_get(hash_of_hashes, filename, APR_HASH_KEY_STRING);
+ if (! file_hash)
+ {
+ file_hash = apr_hash_make(pool);
+ apr_hash_set(hash_of_hashes, filename, APR_HASH_KEY_STRING,
+ file_hash);
+ }
+ apr_hash_set(file_hash, node_id, APR_HASH_KEY_STRING, node_rev_id_str);
+ }
+
+ /* Now, iterate over the hash of hashes, calling
+ set_node_origins_for_file on each set. */
+ for (hi = apr_hash_first(pool, hash_of_hashes);
+ hi != NULL;
+ hi = apr_hash_next(hi))
+ {
+ const void *key;
+ void *val;
       svn_error_t *err;

       svn_pool_clear(iterpool);

       apr_hash_this(hi, &key, NULL, &val);
- node_id = key;
- node_rev_id = val;
-
- err = set_node_origin(fs, node_id, node_rev_id, iterpool);
+ err = set_node_origins_for_file(fs, key, val, iterpool);
       if (err && APR_STATUS_IS_EACCES(err->apr_err))
         {
- /* It's just a cache; stop trying if I can't write. */
           svn_error_clear(err);
- err = NULL;
- goto cleanup;
+ err = SVN_NO_ERROR;
         }
       SVN_ERR(err);
     }

- cleanup:
   svn_pool_destroy(iterpool);
   return SVN_NO_ERROR;
}
@@ -5818,7 +5883,14 @@
                            const svn_fs_id_t *node_rev_id,
                            apr_pool_t *pool)
{
- svn_error_t *err = set_node_origin(fs, node_id, node_rev_id, pool);
+ svn_error_t *err;
+ apr_hash_t *file_hash = apr_hash_make(pool);
+ const char *filename = path_node_origin(fs, node_id, pool);
+
+ apr_hash_set(file_hash, node_id, APR_HASH_KEY_STRING,
+ svn_fs_fs__id_unparse(node_rev_id, pool));
+
+ err = set_node_origins_for_file(fs, filename, file_hash, pool);
   if (err && APR_STATUS_IS_EACCES(err->apr_err))
     {
       /* It's just a cache; stop trying if I can't write. */

application/pgp-signature attachment: OpenPGP digital signature

Received on 2008-02-01 20:08:16 CET

This message: [ Message body ]
Next message: Augie Fackler: "Re: svn commit: r29127 - in trunk/subversion: bindings/swig include libsvn_subr libsvn_wc svn tests/cmdline"
Previous message: Hyrum K. Wright: "Re: svn commit: r29136 - trunk/subversion/libsvn_subr"
Next in thread: David Glasser: "Re: [PATCH] Use serialized hashes for node-origins records in FSFS [unfinished]"
Reply: David Glasser: "Re: [PATCH] Use serialized hashes for node-origins records in FSFS [unfinished]"

Contemporary messages sorted: [ by date ] [ by thread ] [ by subject ] [ by author ] [ by messages with attachments ]