kfogel@collab.net writes:
> Robert Pluim <rpluim@bigfoot.com> writes:
> > Thinking about this some more, pass4 & pass5 can be combined with a
> > pipe between pass4 and 'svnadmin load', with the pipe taking the place
> > of the dumpfile. Karl, I have a quick & dirty implementation of this
> > that seems to work; would you like a cleaned-up version?
>
> Oh! Wow. Cool idea.
>
> Yes, can you post it? I'm very curious to see how you managed it,
> considering that we go back in the dumpfile to write out checksums and
> stuff (i.e., you're streaming something that's been seekable up till
> now, I'm quite intrigued :-) ).
I cheated, of course, by using a temporary file to hold the output of
the rcs co command while we calculate the checksum, then writing the
checksums, lengths, etc. to the dump stream. Patch attached; it needs
more work. BTW, doing it this way will probably be slower than using
a dumpfile, since we process each file's data twice, but you never
know — it might be faster on a multi-processor machine.
Robert
--- cvs2svn.py Wed May 21 00:25:00 2003
+++ cvs2svn-pipe.py Thu May 22 00:09:24 2003
@@ -420,11 +420,10 @@
class Dump:
- def __init__(self, dumpfile_path, revision):
+ def __init__(self, dumpfile_path, revision, repo_pipe):
'Open DUMPFILE_PATH, and initialize revision to REVISION.'
- self.dumpfile_path = dumpfile_path
self.revision = revision
- self.dumpfile = open(dumpfile_path, 'wb')
+ self.dumpfile = repo_pipe
self.head_mirror = TreeMirror()
# Initialize the dumpfile with the standard headers:
@@ -558,21 +557,8 @@
'Text-content-length: '
% (svn_path, action, props_len))
- pos = self.dumpfile.tell()
-
- self.dumpfile.write('0000000000000000\n'
- 'Text-content-md5: 00000000000000000000000000000000\n'
- 'Content-length: 0000000000000000\n'
- '\n')
-
- if is_executable:
- self.dumpfile.write('K 14\n'
- 'svn:executable\n'
- 'V 1\n'
- '*\n')
-
- self.dumpfile.write('PROPS-END\n')
-
+ tempfile = open(DUMPFILE,'w+')
+
# Insert the rev contents, calculating length and checksum as we go.
checksum = md5.new()
length = 0
@@ -580,26 +566,31 @@
while buf:
checksum.update(buf)
length = length + len(buf)
- self.dumpfile.write(buf)
+ tempfile.write(buf)
buf = pipe.read()
pipe.close()
- # Go back to patch up the length and checksum headers:
- self.dumpfile.seek(pos, 0)
- # We left 16 zeros for the text length; replace them with the real
- # length, padded on the left with spaces:
- self.dumpfile.write('%16d' % length)
- # 16... + 1 newline + len('Text-content-md5: ') == 35
- self.dumpfile.seek(pos + 35, 0)
+ self.dumpfile.write('%16d\n' % length)
+ self.dumpfile.write('Text-content-md5: ')
self.dumpfile.write(checksum.hexdigest())
- # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
- self.dumpfile.seek(pos + 84, 0)
- # The content length is the length of property data, text data,
- # and any metadata around/inside around them.
- self.dumpfile.write('%16d' % (length + props_len))
- # Jump back to the end of the stream
- self.dumpfile.seek(0, 2)
+ self.dumpfile.write('\nContent-length: %16d\n\n' % (length + props_len))
+
+ if is_executable:
+ self.dumpfile.write('K 14\n'
+ 'svn:executable\n'
+ 'V 1\n'
+ '*\n')
+
+ self.dumpfile.write('PROPS-END\n')
+
+ tempfile.seek(0)
+ tempbuf = tempfile.read()
+ while tempbuf:
+ self.dumpfile.write(tempbuf)
+ tempbuf = tempfile.read()
+ tempfile.close()
+
# This record is done.
self.dumpfile.write('\n')
@@ -1114,7 +1105,94 @@
os.system('%s load --ignore-uuid %s < %s'
% (ctx.svnadmin, ctx.target, ctx.dumpfile))
-_passes = [
+def pass4_and_5(ctx):
+ # create the target repository
+ if not ctx.dry_run:
+ if ctx.create_repos:
+ os.system('%s create %s' % (ctx.svnadmin, ctx.target))
+ else:
+ t_fs = t_repos = None
+
+ repo_pipe = os.popen('%s load --ignore-uuid %s'
+ % (ctx.svnadmin, ctx.target), 'w', 102400)
+
+ # A dictionary of Commit objects, keyed by digest. Each object
+ # represents one logical commit, which may involve multiple files.
+ #
+ # The reason this is a dictionary, not a single object, is that
+ # there may be multiple commits interleaved in time. A commit can
+ # span up to COMMIT_THRESHOLD seconds, which leaves plenty of time
+ # for parts of some other commit to occur. Since the s-revs file is
+ # sorted by timestamp first, then by digest within each timestamp,
+ # it's quite easy to have interleaved commits.
+ commits = { }
+
+ # The number of separate commits processed in a given flush. This
+ # is used only for printing statistics, it does not affect the
+ # results in the repository.
+ count = 0
+
+ # Start the dumpfile object.
+ dump = Dump(ctx.dumpfile, ctx.initial_revision, repo_pipe)
+
+ # process the logfiles, creating the target
+ for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
+ timestamp, id, op, rev, fname, branch_name, tags, branches = \
+ parse_revs_line(line)
+
+ ### for now, only handle changes on the trunk until we get the tag
+ ### and branch processing to stop making so many copies
+ if not trunk_rev.match(rev):
+ ### note this could/should have caused a flush, but the next item
+ ### will take care of that for us
+ continue
+
+ # Each time we read a new line, we scan the commits we've
+ # accumulated so far to see if any are ready for processing now.
+ process = [ ]
+ for scan_id, scan_c in commits.items():
+
+ # ### ISSUE: the has_file() check below is not optimal.
+ # It does fix the dataloss bug where revisions would get lost
+ # if checked in too quickly, but it can also break apart the
+ # commits. The correct fix would require tracking the dependencies
+ # between change sets and committing them in proper order.
+ if scan_c.t_max + COMMIT_THRESHOLD < timestamp or \
+ scan_c.has_file(fname):
+ process.append((scan_c.t_max, scan_c))
+ del commits[scan_id]
+
+ # If there are any elements in 'process' at this point, they need
+ # to be committed, because this latest rev couldn't possibly be
+ # part of any of them. Sort them into time-order, then commit 'em.
+ process.sort()
+ for t_max, c in process:
+ c.commit(dump, ctx)
+ count = count + len(process)
+
+ # Add this item into the set of still-available commits.
+ if commits.has_key(id):
+ c = commits[id]
+ else:
+ c = commits[id] = Commit()
+ c.add(timestamp, op, fname, rev, branch_name, tags, branches)
+
+ # End of the sorted revs file. Flush any remaining commits:
+ if commits:
+ process = [ ]
+ for id, c in commits.items():
+ process.append((c.t_max, c))
+ process.sort()
+ for t_max, c in process:
+ c.commit(dump, ctx)
+ count = count + len(process)
+
+ dump.close()
+
+ if ctx.verbose:
+ print count, 'commits processed.'
+
+_passes_nopipe = [
pass1,
pass2,
pass3,
@@ -1122,6 +1200,13 @@
pass5,
]
+_passes_pipe = [
+ pass1,
+ pass2,
+ pass3,
+ pass4_and_5,
+ ]
+
class _ctx:
pass
@@ -1155,10 +1240,12 @@
# print ' --branches=PATH path for branches (default: %s)' % ctx.branches_base
# print ' --tags=PATH path for tags (default: %s)' % ctx.tags_base
print ' --encoding=ENC encoding of log messages in CVS repos (default: %s)' % ctx.encoding
+ print ' --pipe use a pipe rather than a file for the dumpfile'
sys.exit(1)
def main():
# prepare the operation context
+ global _passes
ctx = _ctx()
ctx.cvsroot = None
ctx.target = SVNROOT
@@ -1173,11 +1260,14 @@
ctx.branches_base = "branches"
ctx.encoding = "ascii"
ctx.svnadmin = "svnadmin"
-
+ ctx.pipe = 0
+ _passes = _passes_nopipe
+
try:
opts, args = getopt.getopt(sys.argv[1:], 'p:s:vn',
[ "create", "trunk=",
- "branches=", "tags=", "encoding=" ])
+ "branches=", "tags=", "encoding=",
+ "pipe"])
except getopt.GetoptError:
usage(ctx)
if len(args) != 1:
@@ -1213,6 +1303,9 @@
ctx.tags_base = value
elif opt == '--encoding':
ctx.encoding = value
+ elif opt == '--pipe':
+ ctx.pipe = 1
+ _passes = _passes_pipe
convert(ctx, start_pass=start_pass)
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org
Received on Thu May 22 00:10:10 2003