[svn.haxx.se] · SVN Dev · SVN Users · SVN Org · TSVN Dev · TSVN Users · Subclipse Dev · Subclipse Users · this month's index

Re: Remaining issues with FSFS reshard script?

From: Eric Gillespie <epg_at_pretzelnet.org>
Date: 2007-08-15 23:43:10 CEST

"David Glasser" <glasser@davidglasser.net> writes:

> Are we confident now that the script is ready to be used on live data?
> If not, can we decide why (Malcolm?) and create an milestone-1.5
> issue for finishing it in the tracker?

It needs a lot more testing, if nothing else. I think only
Malcolm and I have ever used it. I sent Malcolm this patch
directly a couple months ago, but he hasn't had time to look at
it. Maybe you do :).

* tools/server-side/fsfs-reshard.py
  (check_fs_format): Return bool whether the filesystem is sharded.
  (main): Use check_fs_format's return value to skip the linearise call if
    the repository is not sharded.

* tools/server-side/fsfs-reshard.py
  (current_file): New function to return the contents of the current file.
  (shard): Take new start and end arguments, specifying the start and end
    revisions to shard. Use xrange to iterate from start to end rather
    than listing the directories.
  (main): Accept optional start/end revisions from the command-line; if
    they're missing, use 0 for start and HEAD for end (using current_file).
    Pass start and end to shard.

* tools/server-side/fsfs-reshard.py
  (shard): Create temporary .reshard directories directly under db, so that
    repositories which have already reached the limit on directory size can
    be resharded; don't complain if they already exist, to allow parallel
    reshard jobs.

Index: fsfs-reshard.py
===================================================================
--- fsfs-reshard.py (revision 25551)
+++ fsfs-reshard.py (working copy)
@@ -48,9 +48,11 @@
 
 import os, stat, sys
 
+from errno import EEXIST
+
 def usage():
   """Print a usage message and exit."""
- print """usage: %s REPOS_PATH MAX_FILES_PER_SHARD
+ print """usage: %s REPOS_PATH MAX_FILES_PER_SHARD [START END]
 
 Perform an offline conversion of an FSFS repository between linear
 (readable by Subversion 1.4 or later) and sharded (readable by
@@ -60,6 +62,9 @@
 files that will be stored in each shard (directory), or zero to
 specify a linear layout. Subversion 1.5 uses a default value of
 1000 files per shard.
+
+Convert revisions START through END inclusive if specified, or all
+revisions if unspecified.
 """ % sys.argv[0]
   sys.exit(1)
 
@@ -143,7 +148,8 @@
 def check_fs_format(repos_path):
   """Check that REPOS_PATH contains a filesystem with a suitable format,
   or that it contains no format file; print a message and exit if neither
- is true."""
+ is true. Return bool whether the filesystem is sharded."""
+ sharded = False
   db_path = os.path.join(repos_path, 'db')
   format_path = os.path.join(db_path, 'format')
   try:
@@ -169,8 +175,10 @@
         unexpected_fs_format_options(repos_path)
 
       line = line.rstrip('\n')
- if line == 'layout linear' or line.startswith('layout sharded '):
+ if line == 'layout linear':
         pass
+ elif line.startswith('layout sharded '):
+ sharded = True
       else:
         incompatible_fs_format_option(repos_path, line)
 
@@ -182,6 +190,13 @@
     # compatible.
     pass
 
+ return sharded
+
+def current_file(repos_path):
+ """Return triple of (revision, next_node_id, next_copy_id) from
+ REPOS_PATH/db/current ."""
+ return open(os.path.join(repos_path, 'db', 'current')).readline().split()
+
 def remove_fs_format(repos_path):
   """Remove the filesystem format file for repository REPOS_PATH.
   Do not raise an error if the file is already missing."""
@@ -238,50 +253,60 @@
       os.rename(from_path, to_path)
     os.rmdir(root_path)
 
-def shard(path, max_files_per_shard):
- """Move all the files in PATH into subdirectories of PATH named such that
- subdirectory '0' contains at most MAX_FILES_PER_SHARD files, those named
- [0, MAX_FILES_PER_SHARD). Abort if PATH is found to contain any entries
- with non-numeric names."""
+def shard(path, max_files_per_shard, start, end):
+ """Move the files for revisions START to END inclusive in PATH into
+ subdirectories of PATH named such that subdirectory '0' contains at most
+ MAX_FILES_PER_SHARD files, those named [0, MAX_FILES_PER_SHARD). Abort if
+ PATH is found to contain any entries with non-numeric names."""
 
+ tmp = path + '.reshard'
+ try:
+ os.mkdir(tmp)
+ except OSError, e:
+ if e.errno != EEXIST:
+ raise
+
   # Move all entries into shards named N.shard.
- for name in os.listdir(path):
- try:
- rev = int(name)
- except ValueError, OverflowError:
- print >> sys.stderr, "error: file '%s' does not represent a revision." \
- % os.path.join(path, name)
- sys.exit(1)
-
+ for rev in xrange(start, end + 1):
+ name = str(rev)
     shard = rev // max_files_per_shard
     shard_name = str(shard) + '.shard'
 
     from_path = os.path.join(path, name)
- to_path = os.path.join(path, shard_name, name)
+ to_path = os.path.join(tmp, shard_name, name)
     try:
       os.rename(from_path, to_path)
     except OSError:
       # The most likely explanation is that the shard directory doesn't
       # exist. Let's create it and retry the rename.
- os.mkdir(os.path.join(path, shard_name))
+ os.mkdir(os.path.join(tmp, shard_name))
       os.rename(from_path, to_path)
 
   # Now rename all the shards to remove the suffix.
- for name in os.listdir(path):
+ skipped = 0
+ for name in os.listdir(tmp):
     if not name.endswith('.shard'):
       print >> sys.stderr, "warning: ignoring unexpected subdirectory '%s'." \
- % os.path.join(path, name)
+ % os.path.join(tmp, name)
+ skipped += 1
       continue
- from_path = os.path.join(path, name)
- to_path = from_path[:-6]
+ from_path = os.path.join(tmp, name)
+ to_path = os.path.join(path, os.path.basename(from_path)[:-6])
     os.rename(from_path, to_path)
+ skipped == 0 and os.rmdir(tmp)
 
 def main():
- if len(sys.argv) != 3:
+ if len(sys.argv) < 3:
     usage()
 
   repos_path = sys.argv[1]
   max_files_per_shard = sys.argv[2]
+ try:
+ start = int(sys.argv[3])
+ end = int(sys.argv[4])
+ except IndexError:
+ start = 0
+ end = int(current_file(repos_path)[0])
 
   # Validate the command-line arguments.
   db_path = os.path.join(repos_path, 'db')
@@ -308,13 +333,14 @@
 
   # Check the format of the repository.
   check_repos_format(repos_path)
- check_fs_format(repos_path)
+ sharded = check_fs_format(repos_path)
 
   # Let the user know what's going on.
   if max_files_per_shard > 0:
     print "Converting '%s' to a sharded structure with %d files per directory" \
       % (repos_path, max_files_per_shard)
- print '(will convert to a linear structure first)'
+ if sharded:
+ print '(will convert to a linear structure first)'
   else:
     print "Converting '%s' to a linear structure" % repos_path
 
@@ -326,10 +352,11 @@
 
   # First, convert to a linear scheme (this makes recovery easier because
   # it's easier to reason about the behaviour on restart).
- print '- linearising db/revs'
- linearise(os.path.join(repos_path, 'db', 'revs'))
- print '- linearising db/revprops'
- linearise(os.path.join(repos_path, 'db', 'revprops'))
+ if sharded:
+ print '- linearising db/revs'
+ linearise(os.path.join(repos_path, 'db', 'revs'))
+ print '- linearising db/revprops'
+ linearise(os.path.join(repos_path, 'db', 'revprops'))
 
   if max_files_per_shard == 0:
     # We're done. Stamp the filesystem with a format 2 db/format file.
@@ -337,9 +364,11 @@
     write_fs_format(repos_path, '2\n')
   else:
     print '- sharding db/revs'
- shard(os.path.join(repos_path, 'db', 'revs'), max_files_per_shard)
+ shard(os.path.join(repos_path, 'db', 'revs'), max_files_per_shard,
+ start, end)
     print '- sharding db/revprops'
- shard(os.path.join(repos_path, 'db', 'revprops'), max_files_per_shard)
+ shard(os.path.join(repos_path, 'db', 'revprops'), max_files_per_shard,
+ start, end)
 
     # We're done. Stamp the filesystem with a format 3 db/format file.
     print '- marking the repository as a valid sharded repository'

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org
Received on Wed Aug 15 23:41:02 2007

This is an archived mail posted to the Subversion Dev mailing list.

This site is subject to the Apache Privacy Policy and the Apache Public Forum Archive Policy.