#!/usr/bin/env python2.2
#
# cvs2svn: ...
#
# With a small number of experimental patches applied...
import os
import sys
import sha
import re
import time
import fileinput
import string
import getopt
import statcache
import xreadlines
# I keep rcsparse somewhere else...
#import rcsparse
from svn import rcsparse
from svn import fs, util, _delta, _repos
### these should go somewhere else. should have SWIG export them.
svn_node_none = 0
svn_node_file = 1
svn_node_dir = 2
svn_node_unknown = 3

# classify CVS revision numbers:
#   trunk_rev:  plain trunk revisions, e.g. "1.5"
#   branch_tag: magic branch tags, e.g. "1.5.0.2" (the ".0" marks a branch)
#   vendor_tag: vendor-branch revisions, e.g. "1.1.1"
trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$') # XXX?

# base name and suffixes of the intermediate files passed between passes
DATAFILE = 'cvs2svn-data'
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
TAGS_SUFFIX = '.tags'
RESYNC_SUFFIX = '.resync'
COMMANDS_SUFFIX = '.commands'

# default target repository path
SVNROOT = 'svnroot'
# deleted RCS files live under an 'Attic' subdirectory
ATTIC = os.sep + 'Attic'

COMMIT_THRESHOLD = 3 * 60	# flush a commit if a 3 minute gap occurs

OP_DELETE = 'D'
OP_CHANGE = 'C'

# a .revs/.resync line starts with an 8-hex-digit timestamp, a space, then
# the hex SHA digest; this index marks the end of the digest field
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# module-level verbosity for per-file progress output
verbose = 1
class CollectData(rcsparse.Sink):
  """Pass 1 rcsparse Sink: collect revision/tag/branch data per RCS file.

  Writes one line per revision to the .revs file, every symbol definition
  to the .tags file, and timestamp corrections to the .resync file.
  """

  def __init__(self, cvsroot, log_fname_base):
    self.cvsroot = cvsroot
    # the three pass-1 output files
    self.revs = open(log_fname_base + '.revs', 'w')
    self.tags = open(log_fname_base + '.tags', 'w')
    self.resync = open(log_fname_base + '.resync', 'w')

  def set_fname(self, fname):
    "Prepare to receive data for a new file."
    self.fname = fname

    # revision -> [timestamp, author, operation, old-timestamp]
    self.rev_data = { }
    # revision -> its predecessor revision (for timestamp sanity checks)
    self.prev = { }
    # branch number prefix -> symbolic branch name
    self.branch_names = {}
    # revision -> list of tag names attached to it
    self.taglist = {}
    # branch-point revision -> list of branch names sprouting there
    self.branchlist = { }

  def set_branch_name(self, revision, name):
    "Record that branch number 'revision' carries symbolic name 'name'."
    self.branch_names[revision] = name

  def get_branch_name(self, revision):
    "Return the branch name for 'revision', or None when not on a branch."
    # drop the last component to obtain the branch number prefix
    brev = revision[:revision.rindex(".")];
    if not self.branch_names.has_key(brev):
      return None
    return self.branch_names[brev]

  def add_branch_point(self, revision, branch_name):
    "Record that 'branch_name' sprouts from branch-point 'revision'."
    if not self.branchlist.has_key(revision):
      self.branchlist[revision] = []
    self.branchlist[revision].append(branch_name)

  def add_cvs_branch(self, revision, branch_name):
    """Record a branch given its magic branch tag revision.

    CVS stores a branch tag as e.g. 1.3.0.2; the real branch number is
    1.3.2 (the '.0' component dropped) and the branch sprouts from 1.3.
    """
    last_dot = revision.rfind(".");
    branch_rev = revision[:last_dot];
    last2_dot = branch_rev.rfind(".");
    branch_rev = branch_rev[:last2_dot] + revision[last_dot:];
    self.set_branch_name(branch_rev, branch_name)
    self.add_branch_point(branch_rev[:last2_dot], branch_name)

  def get_tags(self, revision):
    "Return the (possibly empty) list of tags attached to 'revision'."
    if self.taglist.has_key(revision):
      return self.taglist[revision]
    else:
      return []

  def get_branches(self, revision):
    "Return the (possibly empty) list of branches sprouting at 'revision'."
    if self.branchlist.has_key(revision):
      return self.branchlist[revision]
    else:
      return []

  def define_tag(self, name, revision):
    "rcsparse callback: classify a symbol as branch, vendor branch, or tag."
    self.tags.write('%s %s %s\n' % (name, revision, self.fname))
    if branch_tag.match(revision):
      self.add_cvs_branch(revision, name)
    elif vendor_tag.match(revision):
      self.set_branch_name(revision, name)
    else:
      if not self.taglist.has_key(revision):
        self.taglist[revision] = [];
      self.taglist[revision].append(name)

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    "rcsparse callback: record one revision's metadata and its predecessor."
    ### what else?
    if state == 'dead':
      op = OP_DELETE
    else:
      op = OP_CHANGE

    # store the rev_data as a list in case we have to jigger the timestamp
    self.rev_data[revision] = [int(timestamp), author, op, None]

    # record the previous revision for sanity checking later.
    # on the trunk, RCS deltas run backwards, so 'next' is the older rev;
    # on branches 'next' is the younger rev.
    if trunk_rev.match(revision):
      self.prev[revision] = next
    elif next:
      self.prev[next] = revision
    for b in branches:
      self.prev[b] = revision

  def tree_completed(self):
    "The revision tree has been parsed. Analyze it for consistency."

    # Our algorithm depends upon the timestamps on the revisions occuring
    # monotonically over time. That is, we want to see rev 1.34 occur in
    # time before rev 1.35. If we inserted 1.35 *first* (due to the time-
    # sorting), and then tried to insert 1.34, we'd be screwed.

    # to perform the analysis, we'll simply visit all of the 'previous'
    # links that we have recorded and validate that the timestamp on the
    # previous revision is before the specified revision

    # if we have to resync some nodes, then we restart the scan. just keep
    # looping as long as we need to restart.
    while 1:
      for current, prev in self.prev.items():
        if not prev:
          # no previous revision exists (i.e. the initial revision)
          continue
        t_c = self.rev_data[current][0]
        t_p = self.rev_data[prev][0]
        if t_p >= t_c:
          # the previous revision occurred later than the current revision.
          # shove the previous revision back in time (and any before it that
          # may need to shift).
          while t_p >= t_c:
            self.rev_data[prev][0] = t_c - 1	# new timestamp
            self.rev_data[prev][3] = t_p	# old timestamp

            print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
                  % (relative_name(self.cvsroot, self.fname),
                     prev, time.ctime(t_p), time.ctime(t_c - 1))

            # walk further back along the predecessor chain
            current = prev
            prev = self.prev[current]
            if not prev:
              break
            t_c = t_c - 1		# self.rev_data[current][0]
            t_p = self.rev_data[prev][0]

          # break from the for-loop
          break
      else:
        # finished the for-loop (no resyncing was performed)
        return

  def set_revision_info(self, revision, log, text):
    "rcsparse callback: emit the final .revs line for one revision."
    timestamp, author, op, old_ts = self.rev_data[revision]
    # the digest identifies a logical commit: same author + same log message
    digest = sha.new(log + '\0' + author).hexdigest()
    if old_ts:
      # the timestamp on this revision was changed. log it for later
      # resynchronization of other files's revisions that occurred for
      # this time and log message.
      self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
    branch_name = self.get_branch_name(revision)
    write_revs_line(self.revs,
                    timestamp, digest, op, revision, self.fname, branch_name,
                    self.get_tags(revision), self.get_branches(revision))
def branch_path(ctx, branch_name = None):
  """Return the repository base path (with a trailing '/') for a branch.

  With no branch_name the trunk base is used; otherwise the named
  directory under the branches base.
  """
  if branch_name is None:
    base = ctx.trunk_base
  else:
    base = ctx.branches_base + '/' + branch_name
  return base + '/'
def relative_name(cvsroot, fname):
  """Return fname relative to cvsroot, with any leading '/' stripped.

  If fname does not live under cvsroot it is returned unchanged.  The
  previous code returned len(cvsroot) (an int) in that case, which would
  crash every caller -- they all concatenate the result into a path.
  """
  l = len(cvsroot)
  if fname[:l] == cvsroot:
    # use a slice (not fname[l]) so fname == cvsroot does not IndexError
    if fname[l:l+1] == '/':
      return fname[l+1:]
    return fname[l:]
  return fname
def visit_file(arg, dirname, files):
cd, p, stats = arg
for fname in files:
if fname[-2:] != ',v':
continue
pathname = os.path.join(dirname, fname)
if dirname[-6:] == ATTIC:
# drop the 'Attic' portion from the pathname
### we should record this so we can easily insert it back in
cd.set_fname(os.path.join(dirname[:-6], fname))
else:
cd.set_fname(pathname)
if verbose:
print pathname
p.parse(open(pathname), cd)
stats[0] = stats[0] + 1
class RevInfoParser(rcsparse.Sink):
  """rcsparse Sink that records only the author and log of each revision."""

  def __init__(self):
    self.authors = { }	# revision -> author
    self.logs = { }	# revision -> log message

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    self.authors[revision] = author

  def set_revision_info(self, revision, log, text):
    self.logs[revision] = log

  def parse_cvs_file(self, rcs_pathname):
    """Parse rcs_pathname, falling back to its Attic location.

    Raises RuntimeError if the file cannot be opened in either place.
    """
    try:
      rcsfile = open(rcs_pathname, 'r')
    except IOError:
      # deleted files are moved into an 'Attic' subdirectory; retry there.
      # (was a bare 'except:', which also swallowed KeyboardInterrupt etc.)
      try:
        dirname, fname = os.path.split(rcs_pathname)
        rcs_pathname = os.path.join(dirname, "Attic", fname)
        rcsfile = open(rcs_pathname, 'r')
      except IOError:
        ### should use a better error
        raise RuntimeError('error: %s appeared to be under CVS control, '
                           'but the RCS file is inaccessible.'
                           % rcs_pathname)
    rcsparse.Parser().parse(rcsfile, self)
class BuildRevision(rcsparse.Sink):
  """rcsparse Sink that reconstructs the text of one revision of a file.

  RCS deltas are applied in the streaming order they appear in the ,v
  file; after parsing, self.result holds the file's lines.
  """

  # RCS diff edit commands: "dSTART COUNT" deletes, "aSTART COUNT" adds.
  # These were referenced as self.d_command/self.a_command but never
  # defined anywhere in the class -- a guaranteed AttributeError.
  d_command = re.compile('^d(\\d+)\\s(\\d+)')
  a_command = re.compile('^a(\\d+)\\s(\\d+)')

  def __init__(self, rev, get_metadata=0):
    self.rev = rev
    self.get_metadata = get_metadata
    self.result = None
    # revision -> the revision whose delta is based on it; was never
    # initialized, so define_revision() raised AttributeError immediately
    self.prev_delta = { }

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    "Record the delta ordering so the chain to self.rev can be computed."
    for branch in branches:
      self.prev_delta[branch] = revision
    if next:
      self.prev_delta[next] = revision
    if self.get_metadata and revision == self.rev:
      self.author = author

  def tree_completed(self):
    "Compute the chain of deltas leading from the head down to self.rev."
    path = [ ]
    revision = self.rev
    while revision:
      path.append(revision)
      revision = self.prev_delta.get(revision)
    path.reverse()
    self.collect = path

  def set_revision_info(self, revision, log, text):
    if not self.collect:
      # nothing more to do
      ### would be nice to halt the file parsing...
      return

    # NOTE: we assume that the deltas appear in the proper order within
    # the RCS file, for streaming application. Thus, our max size is the
    # largest revision of all involved (rather than the revision plus all
    # diff entries).
    if revision != self.collect[0]:
      # not something we are interested in
      return
    # NOTE(review): collect[0] is never removed here, so only the head
    # delta can ever match -- confirm whether 'del self.collect[0]' was
    # intended at this point.

    if self.get_metadata and revision == self.rev:
      self.log = log

    if self.result is None:
      # the head revision carries the full text, not a delta
      self.result = string.split(text, '\n')
    else:
      adjust = 0
      # was never initialized, so the first loop iteration raised NameError
      add_lines_remaining = 0
      diffs = string.split(text, '\n')
      for command in diffs:
        if add_lines_remaining > 0:
          # Insertion lines from a prior "a" command
          self.result.insert(start_line + adjust, command)
          add_lines_remaining = add_lines_remaining - 1
          adjust = adjust + 1
        else:
          dmatch = self.d_command.match(command)
          amatch = self.a_command.match(command)
          if dmatch:
            # "d" - Delete command
            start_line = string.atoi(dmatch.group(1))
            count = string.atoi(dmatch.group(2))
            begin = start_line + adjust - 1
            del self.result[begin:begin + count]
            adjust = adjust - count
          elif amatch:
            # "a" - Add command
            start_line = string.atoi(amatch.group(1))
            count = string.atoi(amatch.group(2))
            add_lines_remaining = count
          else:
            raise RuntimeError('Error parsing diff commands')
# since we don't have a repository, we should keep
# track of the files that exist.
class Created:
  """Tracks the repository paths created so far by the command builder."""

  def __init__(self):
    # a plain list preserves the original behavior; duplicates added via
    # add_file are harmless for membership testing
    self.files = []

  def add_file( self, file ):
    "Record that 'file' now exists in the repository."
    self.files.append( file )

  def check_path( self, file ):
    "Return true if 'file' was previously recorded via add_file()."
    # membership test replaces the original hand-rolled linear scan
    return file in self.files
# This class decides what operates to do on SVN repository
class OperationBuilder:
  """Accumulates the files of one logical CVS commit and emits the
  corresponding command stream (begin/mkdir/mkfile/mkdelta/cp/commit/end)
  that pass 5 replays against the Subversion repository."""

  def __init__(self, created, output, pool ):
    self.changes = []	# (file, rev, branch, tags, branches) to change
    self.deletes = []	# (file, rev, branch, tags, branches) to delete
    self.loghash = {}	# log message -> list of per-file summary lines
    self.created = created	# shared Created tracker of existing paths
    self.output = output	# command-stream output file
    self.t_min = 1 << 30	# earliest timestamp in this commit
    self.t_max = 0		# latest timestamp in this commit
    self.s_pool = pool

  def ready_to_process( self, tstamp, fname ):
    """Return true when this pending commit should be flushed: either a
    COMMIT_THRESHOLD gap has passed, or 'fname' already appears in the
    commit (one file can only occur once per commit)."""
    if self.t_max + COMMIT_THRESHOLD < tstamp:
      return 1
    for x in self.changes:
      if x[0] == fname:
        return 1
    for x in self.deletes:
      if x[0] == fname:
        return 1
    return 0

  def add(self, t, op, file, rev, branch_name, tags, branches):
    "Record one file revision (change or delete) into this commit."
    # record the time range of this commit
    if t < self.t_min:
      self.t_min = t
    if t > self.t_max:
      self.t_max = t

    # Keep track of all files...
    if op == OP_CHANGE:
      self.changes.append((file, rev, branch_name, tags, branches))
    else:
      # OP_DELETE
      self.deletes.append((file, rev, branch_name, tags, branches))

  def get_parent( self, ctx, br, f, tag = None ):
    """Map RCS file 'f' to (repos_path, parent_dir).

    With 'tag' the path lives under the tags base, otherwise under the
    trunk/branch base chosen by 'br'; the ,v suffix is dropped.
    """
    repos_path = ''
    if tag:
      repos_path = ctx.tags_base + '/' + tag + '/' + relative_name(ctx.cvsroot, f[:-2])
    else:
      repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
    parent_dir = ''
    dirname = os.path.dirname(repos_path)
    if dirname != '/':
      # get the components of the path (skipping the leading '/')
      # NOTE(review): splitting a repository path on os.sep only works
      # where os.sep == '/' -- confirm before running on Windows.
      parts = string.split(dirname[1:], os.sep)
      # NOTE(review): only the final iteration's value survives, so this
      # loop just reassembles the full dirname; the intermediate joins
      # are discarded.
      for i in range(1, len(parts) + 1):
        # reassemble the pieces, adding a leading slash
        parent_dir = '/' + string.join(parts[:i], '/')
    return repos_path, parent_dir

  def get_logmessages(self, mods, ctx, modifier):
    """Collect per-file summary lines ('M path (rev)') into self.loghash,
    keyed by each file's stripped log message.

    'modifier' is the action letter ('M' or 'D').  Returns the author of
    the last entry processed; callers only invoke this with a non-empty
    'mods' list.
    """
    # get the log messages for those mods (change or delete)
    authorhash = {}
    currlog = ''
    lastlog = ''
    rip = RevInfoParser()
    for file, rev, br, tags, branches in mods:
      repos_path = relative_name(ctx.cvsroot, file[:-2])
      # now, fetch the author/log from the ,v file
      rip.parse_cvs_file(file)
      author = rip.authors[rev]
      authorhash[author] = 1
      currlog = rip.logs[rev].strip()
      tmp = self.loghash.get(currlog, [])
      # ugly hack
      if 'M' == modifier and '1.1' == rev:
        # revision 1.1 is the file's creation, so present it as an add
        tmp.append('A %s (%s)' % (repos_path, rev))
      else:
        tmp.append('%s %s (%s)' % (modifier, repos_path, rev))
      self.loghash[currlog] = tmp
    return author

  def get_metadata(self, ctx):
    """Return (author, log, date) for this commit.

    With ctx.logcvsversion set, the log is synthesized from the per-file
    summaries in loghash; otherwise the author/log of an arbitrary member
    file is used.
    """
    # by definition, the author and log message must be the same for all
    # items that went into this commit. therefore, just grab any item from
    # our record of changes/deletes.
    log = ''
    author = ''
    # now, fetch the author/log from the ,v file
    if ctx.logcvsversion:
      if self.changes:
        author = self.get_logmessages(self.changes, ctx, 'M')
      if self.deletes:
        author = self.get_logmessages(self.deletes, ctx, 'D')
      for msg, files in self.loghash.items():
        log += '\n'.join(files)
        log += ":\n " + msg + '\n'
    else:
      if self.changes:
        file, rev, br, tags, branches = self.changes[0]
      else:
        # there better be one...
        file, rev, br, tags, branches = self.deletes[0]
      # just pick one...
      rip = RevInfoParser()
      rip.parse_cvs_file( file )
      author = rip.authors[rev]
      log = rip.logs[rev]

    # format the date properly
    date = self.t_max
    return author, log, date

  # --- low-level emitters for the pass-4 command stream ---

  def begin( self ):
    "Open a new transaction in the command stream."
    self.output.write( 'begin\n' )

  def end( self ):
    "Close the current transaction in the command stream."
    self.output.write( 'end\n' )

  def comment( self, line ):
    "Emit a comment line (echoed by pass 5)."
    self.output.write( '# %s\n' % line );

  def make_dir( self, path ):
    "Emit a mkdir command and record the new path."
    self.output.write( 'mkdir %s\n' % path )
    self.created.add_file( path )
    self.comment( "adding path %s" % path )
    print 'Adding path %s' % path

  def make_file( self, path, r, f ):
    "Emit a mkfile command (new file: full text) and record the path."
    self.output.write( 'mkfile %s %s %s\n' % (path, r, f) )
    self.created.add_file( path )

  def copy( self, a, b ):
    "Emit a copy command (used for tags and branch creation)."
    self.output.write( 'cp %s %s\n' % ( a, b ) )

  def apply_delta( self, path, r, f ):
    "Emit a mkdelta command (existing file: send a text delta)."
    self.output.write( 'mkdelta %s %s %s\n' % (path, r, f) )

  def delete( self, file ):
    "Emit a delete command."
    self.output.write( 'rm %s\n' % file )

  def rev_meta_data( self, prop, value ):
    "Emit a revision-property command (value terminated by a blank line)."
    self.output.write( 'rev_metadata %s\n\n%s\n\n' % ( prop, value ) )

  def txn_meta_data( self, prop, value ):
    "Emit a transaction-property command (value terminated by a blank line)."
    self.output.write( 'txn_metadata %s\n\n%s\n\n' % ( prop, value ) )

  def commit( self, ctx ):
    "Emit the commit command and bump the running commit counter."
    self.output.write( 'commit %d \n' % ctx.commits )
    ctx.commits = ctx.commits + 1

  #TODO: Clean this up a bit.
  def make_all_dirs( self, dir ):
    "Emit mkdir commands for 'dir' and any missing ancestor directories."
    if not self.created.check_path( dir ):
      first, second = os.path.split( dir )
      # recurse on the parent unless we have reached the root
      if second != '' and first != '/' and first != '' :
        self.make_all_dirs( first )
      self.make_dir( dir )

  # Actually do the work here...
  def write( self, ctx ):
    """Emit the full command stream for this commit, plus (when the commit
    carries tags or branch points) a second transaction that copies the
    affected paths into the tags/branches areas."""
    # commit this transaction
    print 'creating: rev %d %s, over %d seconds' % ( ctx.commits + 1,
                                                     time.ctime(self.t_min),
                                                     self.t_max - self.t_min)
    self.begin()
    n = ctx.commits + 1
    self.comment( 'creating revision %d' % n )
    # for each change:
    for f, r, br, tags, branches in self.changes:
      # compute a repository path. ensure we have a leading "/" and drop
      # the ,v from the file name
      repos_path, parent_dir = self.get_parent( ctx, br, f )
      created_file = 0
      if not self.created.check_path( parent_dir ):
        self.make_all_dirs( parent_dir )
      if not self.created.check_path( repos_path ):
        created_file = 1

      # figure out the real file path for "co"
      try:
        statcache.stat(f)
      except os.error:
        # the ,v file has moved into the Attic (deleted on trunk)
        dirname, fname = os.path.split(f)
        f = os.path.join(dirname, 'Attic', fname)
        statcache.stat(f)

      if created_file:
        self.make_file( repos_path, r, f )
      else:
        self.apply_delta( repos_path, r, f )

    for f, r, br, tags, branches in self.deletes:
      repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
      # Do we really process deletes?
      # self.delete( repos_path )

    author, log, date = self.get_metadata( ctx )
    # todo: Get metadata.
    self.txn_meta_data( 'svn:author', author )
    self.txn_meta_data( 'svn:log', log )
    self.commit( ctx )
    self.rev_meta_data( 'svn:date', '%d' % date )
    self.end()

    # don't do tags if we don't need to.
    dotags = 0
    for f, r, br, tags, branches in self.changes:
      dotags += len( tags )
      dotags += len( branches )
    if not dotags:
      return

    # make a new transaction for the tags
    self.begin()
    for f, r, br, tags, branches in self.changes:
      for tag in tags:
        # copy the file from its branch location into the tag area
        tag_path, parent_dir = self.get_parent( ctx, br, f, tag )
        repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
        if not self.created.check_path( parent_dir ):
          self.make_all_dirs( parent_dir )
        self.copy( repos_path, tag_path )
      for br2 in branches:
        # copy the file onto each branch that sprouts at this revision
        new_branch_path, parent_dir = self.get_parent( ctx, br2, f )
        repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
        if not self.created.check_path( parent_dir ):
          self.make_all_dirs( parent_dir )
        self.copy( repos_path, new_branch_path )
    for f, r, br, tags, branches in self.deletes:
      for br2 in branches:
        new_branch_path = branch_path(ctx, br2) + relative_name(ctx.cvsroot, f[:-2])
        # Do we really process deletes?
        # self.delete( new_branch_path )
    self.txn_meta_data( 'svn:author', author )
    self.txn_meta_data( 'svn:log', log )
    self.commit( ctx )
    self.rev_meta_data( 'svn:date', date )
    self.end()
# This class actually operates on the SVN repository
class OperationCommiter:
  """Pass 5 interpreter: replays one transaction of the command stream
  against the Subversion filesystem via the SWIG bindings.

  Property values in the stream are multi-line, terminated by a blank
  line; in_metadata tracks that state (0 = not in metadata, 1 = collecting
  a txn property, 2 = collecting a revision property).
  """

  def __init__( self, ctx, t_fs ):
    self.ctx = ctx
    self.t_fs = t_fs	# target Subversion filesystem
    # data for individual operations.
    self.c_pool = None	# per-transaction pool
    self.f_pool = None	# per-operation subpool, cleared after each command
    self.txn = None
    self.fs = None
    self.rev = None	# youngest revision when the txn began
    self.new_rev = None	# revision created by the last commit
    self.root = None	# txn root for fs operations
    # placeholders for metadata stuff.
    self.in_metadata = 0
    self.metadata = ''		# property name being collected
    self.metadata_value = ''	# accumulated property value lines

  def handle_metadata( self, line ):
    """Accumulate one raw metadata value line; a blank line ends the value.

    NOTE(review): the second blank-line branch below is unreachable (the
    first test already matched), so the leading blank separator emitted by
    OperationBuilder terminates the value immediately -- confirm whether a
    leading blank line was meant to be skipped instead.
    """
    if self.in_metadata:
      if line == '\n':
        self.handle_metadata_done()
        self.in_metadata = 0
      elif line == '\n':
        pass
      else:
        self.metadata_value += line
    elif not self.in_metadata:
      self.metadata = line
      self.in_metadata = 1

  def handle_metadata_txn( self, line ):
    # Same state machine as handle_metadata; in_metadata = 1 marks the
    # value as a *transaction* property (see handle_metadata_done).
    if self.in_metadata:
      if line == '\n':
        self.handle_metadata_done()
        self.in_metadata = 0
      elif line == '\n':
        pass
      else:
        self.metadata_value += line
    elif not self.in_metadata:
      self.metadata = line
      self.in_metadata = 1

  def handle_metadata_rev( self, line ):
    # Same state machine again; in_metadata = 2 marks the value as a
    # *revision* property (see handle_metadata_done).
    if self.in_metadata:
      if line == '\n':
        self.handle_metadata_done()
        self.in_metadata = 0
      elif line == '\n':
        pass
      else:
        self.metadata_value += line
    elif not self.in_metadata:
      self.metadata = line
      self.in_metadata = 2

  def handle_begin( self, line ):
    "Start a new txn based on the current youngest revision."
    self.c_pool = util.svn_pool_create( self.ctx.pool )
    self.f_pool = util.svn_pool_create( self.c_pool )
    self.rev = fs.youngest_rev( self.t_fs, self.c_pool )
    self.txn = fs.begin_txn( self.t_fs, self.rev, self.c_pool)
    self.root = fs.txn_root( self.txn, self.c_pool )

  # end of a transaction. cleanup.
  def handle_end( self, line ):
    if self.f_pool:
      # f_pool is a subpool of c_pool, so destroying c_pool below releases
      # it as well; the explicit destroy stays disabled.
      # util.svn_pool_destroy( self.f_pool )
      if self.c_pool:
        util.svn_pool_destroy( self.c_pool )
    self.c_pool = None
    self.f_pool = None
    self.txn = None
    self.root = None
    self.rev = None
    self.new_rev = None
    print '.'

  # write out the metadata.
  def handle_metadata_done( self ):
    "Flush the collected property onto the txn (state 1) or rev (state 2)."
    value = string.rstrip( self.metadata_value )
    print 'METADATA', self.metadata, ' ', value
    if self.metadata == 'svn:date':
      # svn:date must be converted into Subversion's time representation
      t_max = int( value )
      a_t = util.apr_time_ansi_put( t_max )[1]
      value = util.svn_time_to_nts( a_t, self.c_pool )
    if self.in_metadata == 2:
      fs.change_rev_prop( self.t_fs, self.new_rev, self.metadata, value, self.c_pool )
    else:
      fs.change_txn_prop( self.txn, self.metadata, value, self.c_pool )
    self.metadata = ''
    self.metadata_value = ''

  def handle_commit( self, line ):
    "Commit the open txn and remember the newly created revision."
    conflicts, self.new_rev = fs.commit_txn( self.txn )
    self.txn = None
    print 'Committed revision %d ' % self.new_rev
    # presumably 'conflicts' is a conflict-path string and a length of one
    # means "no real conflict" -- TODO confirm against the fs bindings
    if len( conflicts ) > 1:
      print 'CONFLICTS %s' % conflicts

  def handle_mkdir( self, line ):
    "Create a directory at the path given in 'line'."
    fs.make_dir( self.root, line, self.f_pool )
    util.svn_pool_clear( self.f_pool )

  def handle_mkfile( self, line ):
    "Create a new file and fill it with the full text of a CVS revision."
    repo_file, cvs_rev, cvs_file = string.split( line, None, 2 )
    fs.make_file( self.root, repo_file, self.f_pool )
    handler, baton = fs.apply_textdelta( self.root, repo_file, self.f_pool )
    # run RCS 'co' to check out the revision text
    pipe = os.popen('co -q -p%s \'%s\'' % (cvs_rev, cvs_file), 'r', 102400)
    _delta.svn_txdelta_send_string( pipe.read(), handler, baton, self.f_pool )
    # shut down the current-rev pipe
    pipe.close()
    util.svn_pool_clear(self.f_pool)

  def handle_mkdelta( self, line ):
    "Update an existing file by sending a delta against its current text."
    repo_file, cvs_rev, cvs_file = string.split( line, None, 2 )
    handler, baton = fs.apply_textdelta( self.root, repo_file, self.f_pool )
    pipe = os.popen('co -q -p%s \'%s\'' % (cvs_rev, cvs_file), 'r', 102400)
    # open an SVN stream onto the pipe
    stream2 = util.svn_stream_from_stdio( pipe, self.f_pool )
    # Get current contents of the repo. ( we dissallow multiple files per checkin. )
    stream1 = fs.file_contents(self.root, repo_file, self.f_pool )
    # get the diff stream, and apply it to the SVN pipe.
    txstream = _delta.svn_txdelta( stream1, stream2, self.f_pool )
    _delta.svn_txdelta_send_txstream( txstream, handler, baton, self.f_pool )
    # shut down the current-rev pipe
    pipe.close()
    util.svn_pool_clear( self.f_pool )

  def handle_cp( self, line ):
    "Copy a path from the pre-txn revision root (used for tags/branches)."
    repo_file, dest_file = string.split( line, None, 1 )
    o_root = fs.revision_root( self.t_fs, self.rev, self.f_pool );
    fs.copy( o_root, repo_file, self.root, dest_file, self.f_pool )
    util.svn_pool_clear( self.f_pool )

  def handle_rm( self, line ):
    "Delete the path given in 'line'."
    fs.delete( self.root, line, self.f_pool )
    util.svn_pool_clear( self.f_pool )

  def handle_command( self, line ):
    "Dispatch one rstripped command line; returns 0 at 'end', 1 otherwise."
    if not self.ctx.dry_run:
      if line == 'begin':
        print line
        self.handle_begin( '' )
        return 1
      elif line == 'end':
        print line
        self.handle_end( '' )
        return 0
    op = string.split( line, None, 1 )
    if self.ctx.verbose:
      print op
    if self.ctx.dry_run:
      # dry run: commands are parsed/printed but never executed
      return 1
    if op[0] == 'mkdir':
      self.handle_mkdir( op[1] )
    elif op[0] == 'mkfile':
      self.handle_mkfile( op[1] )
    elif op[0] == 'mkdelta':
      self.handle_mkdelta( op[1] )
    elif op[0] == 'cp':
      self.handle_cp( op[1] )
    elif op[0] == 'rm':
      self.handle_rm( op[1] )
    elif op[0] == 'metadata' or op[0] == 'txn_metadata':
      self.handle_metadata_txn( op[1] )
    elif op[0] == 'rev_metadata':
      self.handle_metadata_rev( op[1] )
    elif op[0] == 'commit':
      # NOTE(review): passes the split list rather than op[1]; harmless,
      # since handle_commit ignores its argument.
      self.handle_commit( op )
    return 1

  def run( self, line ):
    """Process one raw line of the command stream.

    Returns 0 when the transaction has ended (the caller then discards
    this committer), 1 otherwise.
    """
    if self.in_metadata:
      # still collecting a multi-line property value
      self.handle_metadata( line )
      return 1
    else:
      l = string.rstrip( line )
      # NOTE(review): rstrip never returns None, so this test is dead code;
      # a fully blank line outside metadata would IndexError on l[0] below.
      if l == None:
        return 1
      if l[0] == '#':
        # echo comment lines from the command stream
        print l
        return 1
      else:
        return self.handle_command( l )
def read_resync(fname):
  """Read the .resync file into memory.

  Returns a mapping: digest -> [ [old_time_lower, old_time_upper, new_time], ... ]

  A digest maps to a sequence of lists which specify a lower and upper
  time bound for matching up the commit. We keep a sequence of these
  because a number of checkins with the same log message (e.g. an empty
  log message) could need to be remapped. We also make them a list because
  we will dynamically expand the lower/upper bound as we find commits
  that fall into a particular msg and time range.

  ### note that we assume that we can hold the entire resync file in
  ### memory. really large repositories with whacky timestamps could
  ### bust this assumption. should that ever happen, then it is possible
  ### to split the resync file into pieces and make multiple passes,
  ### using each piece.
  """
  resync = { }
  for line in fileinput.FileInput(fname):
    # line layout: OLDTIME(8 hex) DIGEST NEWTIME(8 hex)
    t1 = int(line[:8], 16)
    digest = line[9:DIGEST_END_IDX]
    t2 = int(line[DIGEST_END_IDX+1:], 16)
    t1_l = t1 - COMMIT_THRESHOLD/2
    t1_u = t1 + COMMIT_THRESHOLD/2
    # setdefault replaces the original has_key branch
    resync.setdefault(digest, []).append([t1_l, t1_u, t2])
  return resync
def write_revs_line(output,
                    timestamp, digest, op, revision, fname,
                    branch_name, tags, branches):
  """Write one revision record as a single space-separated line.

  Layout: TIME DIGEST OP REV BRANCH NTAGS TAGS... NBRANCHES BRANCHES... FNAME
  A missing branch_name is stored as '*' so parse_revs_line can detect it.
  """
  fields = ['%08lx' % timestamp, digest, op, revision, branch_name or '*']
  fields.append('%d' % len(tags))
  fields.extend(tags)
  fields.append('%d' % len(branches))
  fields.extend(branches)
  fields.append(fname)
  output.write(' '.join(fields) + '\n')
def parse_revs_line(line):
  """Parse a line produced by write_revs_line back into its components.

  Returns (timestamp, digest, op, rev, fname, branch_name, tags, branches);
  branch_name is None when the record holds the '*' placeholder.
  """
  head = line.split(' ', 6)
  timestamp = int(head[0], 16)
  digest, op, rev, branch_name = head[1], head[2], head[3], head[4]
  if branch_name == '*':
    branch_name = None
  ntags = int(head[5])
  # the tail still holds: TAG... NBRANCHES BRANCH... FNAME
  rest = head[6].split(' ', ntags + 1)
  tags = rest[:ntags]
  nbranches = int(rest[ntags])
  rest = rest[ntags + 1].split(' ', nbranches + 1)
  branches = rest[:nbranches]
  fname = rest[nbranches][:-1]  # strip the trailing newline
  return timestamp, digest, op, rev, fname, branch_name, tags, branches
def pass1(ctx):
cd = CollectData(ctx.cvsroot, DATAFILE)
p = rcsparse.Parser()
stats = [ 0 ]
os.path.walk(ctx.cvsroot, visit_file, (cd, p, stats))
if ctx.verbose:
print 'processed', stats[0], 'files'
def pass2(ctx):
  "Pass 2: clean up the revision information."

  # We may have recorded some changes in revisions' timestamp. We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  resync = read_resync(ctx.log_fname_base + RESYNC_SUFFIX)

  output = open(ctx.log_fname_base + CLEAN_REVS_SUFFIX, 'w')

  # process the revisions file, looking for items to clean up
  for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
    timestamp, digest, op, rev, fname, branch_name, tags, branches = \
               parse_revs_line(line)
    if not resync.has_key(digest):
      # no resync record for this commit's log message; keep line as-is
      output.write(line)
      continue

    # we have a hit. see if this is "near" any of the resync records we
    # have recorded for this digest [of the log message].
    for record in resync[digest]:
      if record[0] <= timestamp <= record[1]:
        # bingo! remap the time on this (record[2] is the new time).
        write_revs_line(output,
                        record[2], digest, op, rev, fname, branch_name,
                        tags, branches)

        print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
              % (relative_name(ctx.cvsroot, fname),
                 rev, time.ctime(timestamp), time.ctime(record[2]))

        # adjust the time range. we want the COMMIT_THRESHOLD from the
        # bounds of the earlier/latest commit in this group.
        record[0] = min(record[0], timestamp - COMMIT_THRESHOLD/2)
        record[1] = max(record[1], timestamp + COMMIT_THRESHOLD/2)

        # stop looking for hits
        break
    else:
      # the file/rev did not need to have its time changed.
      output.write(line)
def pass3(ctx):
  """Pass 3: sort the cleaned-up revision records into time order."""
  # NOTE(review): the paths are interpolated into a shell command without
  # quoting; assumes they contain no shell metacharacters -- confirm for
  # unusual repository paths.
  infile = ctx.log_fname_base + CLEAN_REVS_SUFFIX
  outfile = ctx.log_fname_base + SORTED_REVS_SUFFIX
  os.system('sort %s > %s' % (infile, outfile))
def pass4(ctx):
  "Pass 4: group sorted revisions into commits and emit the command stream."
  # create a command stream for the different transactions.
  txn_list = { }	# digest -> pending OperationBuilder
  process = []
  count = 0
  created = Created()
  output = open(ctx.log_fname_base + COMMANDS_SUFFIX, 'w')
  s_pool = util.svn_pool_create(ctx.pool)
  for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
    timestamp, id, op, rev, fname, branch_name, tags, branches = \
               parse_revs_line(line)

    # flush any pending commit that is ready (time gap passed, or the same
    # file shows up again).
    # NOTE: items() returns a list copy under Python 2, so deleting from
    # txn_list while iterating is safe here (it would not be in Python 3).
    for scan_id, scan_c in txn_list.items():
      if scan_c.ready_to_process( timestamp, fname ):
        process.append((scan_c.t_max, scan_c))
        del txn_list[scan_id]

    # sort the commits into time-order, then commit 'em
    process.sort()
    for t_max, c in process:
      c.write( ctx )
    count = count + len(process)
    process = []

    # group revisions into commits by the digest of their author+log
    if txn_list.has_key(id):
      c = txn_list[id]
    else:
      c = txn_list[id] = OperationBuilder( created, output, s_pool )
    c.add(timestamp, op, fname, rev, branch_name, tags, branches)

  # if there are any pending commits left, then flush them
  if txn_list:
    process = [ ]
    for id, c in txn_list.items():
      process.append((c.t_max, c))
    process.sort()
    for t_max, c in process:
      c.write(ctx)
    count = count + len(process)

  # destroy the pool
  util.svn_pool_destroy(s_pool)

  if ctx.verbose:
    print count, 'commits processed.'
def pass5(ctx):
  "Pass 5: replay the command stream against the target SVN repository."
  # create the target repository
  t_fs = None
  t_repos = None
  if not ctx.dry_run:
    if ctx.create_repos:
      t_repos = _repos.svn_repos_create( ctx.target, ctx.pool )
    else:
      t_repos = _repos.svn_repos_open( ctx.target, ctx.pool )
    t_fs = _repos.svn_repos_fs( t_repos )
  x = None
  # for line in fileinput.FileInput(ctx.log_fname_base + COMMANDS_SUFFIX):
  f = file( ctx.log_fname_base + COMMANDS_SUFFIX , 'r' )
  for line in xreadlines.xreadlines( f ):
    # Create a new OperationCommiter after each end.
    if not x:
      x = OperationCommiter( ctx, t_fs )
    # run() returns 0 at the end of a transaction; discard the committer
    if not x.run( line ):
      x = None
# the ordered list of conversion passes; convert() indexes into this
_passes = [
  pass1,
  pass2,
  pass3,
  pass4,
  pass5,
  ]

class _ctx:
  # plain attribute bag holding the conversion options; main() fills it in
  pass
def convert(pool, ctx, start_pass=1, stop_pass=len(_passes)):
  "Convert a CVS repository to an SVN repository."
  ctx.pool = pool
  # one slot per pass boundary: times[i] is when pass i+1 started
  times = [ None ] * ( len(_passes ) + 1 )
  for i in range(start_pass - 1, stop_pass ):
    times[i] = time.time()
    if verbose:
      print '----- pass %d -----' % (i + 1)
    _passes[i](ctx)
    times[i+1] = time.time()
  # NOTE(review): this extra slot is never read by the report below
  times.append(time.time())
  if verbose:
    for i in range( start_pass, stop_pass+1 ):
      print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
    print ' total:', int(times[stop_pass] - times[start_pass-1]), 'seconds'
def usage(ctx):
print 'USAGE: %s [-n] [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
% os.path.basename(sys.argv[0])
print ' -n dry run. parse CVS repos, but do not construct SVN repos.'
print ' -v verbose.'
print ' -s PATH path for SVN repos.'
print ' -p NUM start at pass NUM of %d.' % len(_passes)
print ' -q NUM stop after pass NUM of %d.' % len(_passes)
print ' --create create a new SVN repository'
print ' --trunk=PATH path for trunk (default: %s)' % ctx.trunk_base
print ' --branches=PATH path for branches (default: %s)' % ctx.branches_base
print ' --tags=PATH path for tags (default: %s)' % ctx.tags_base
print ' --logcvsrevision include the cvs revision in the log (default: %s)' % ctx.logcvsversion
sys.exit(1)
def main():
  "Parse the command line, build the conversion context, and run the passes."
  # prepare the operation context
  ctx = _ctx()
  ctx.cvsroot = None
  ctx.target = SVNROOT
  ctx.log_fname_base = DATAFILE
  ctx.verbose = 0
  ctx.dry_run = 0
  ctx.create_repos = 0
  ctx.trunk_base = "/trunk"
  ctx.tags_base = "/tags"
  ctx.branches_base = "/branches"
  ctx.logcvsversion = 0
  ctx.commits = 0
  ctx.pool = None

  try:
    opts, args = getopt.getopt(sys.argv[1:], 'p:s:q:vn',
                               [ "create", "logcvsrevision", "trunk=", "branches=", "tags=" ])
  except getopt.GetoptError:
    usage(ctx)
  if len(args) != 1:
    # exactly one positional argument: the CVS repository path
    usage(ctx)
  ctx.cvsroot = args[0]
  start_pass = 1
  stop_pass = len(_passes)

  for opt, value in opts:
    if opt == '-p':
      # start at the given pass (1-based)
      start_pass = int(value)
      if start_pass < 1 or start_pass > len(_passes):
        print 'ERROR: illegal value (%d) for starting pass. ' \
              'must be 1 through %d.' % (start_pass, len(_passes))
        sys.exit(1)
    elif opt == '-q':
      # stop after the given pass (1-based)
      stop_pass = int(value)
      if stop_pass < 1 or stop_pass > len(_passes):
        print 'ERROR: illegal value (%d) for stoping pass. ' \
              'must be 1 through %d.' % (stop_pass, len(_passes))
        sys.exit(1)
    elif opt == '-v':
      ctx.verbose = 1
    elif opt == '-n':
      ctx.dry_run = 1
    elif opt == '-s':
      ctx.target = value
    elif opt == '--create':
      ctx.create_repos = 1
    elif opt == '--trunk':
      ctx.trunk_base = value
    elif opt == '--branches':
      ctx.branches_base = value
    elif opt == '--tags':
      ctx.tags_base = value
    elif opt == '--logcvsrevision':
      ctx.logcvsversion = 1

  # run_app creates the top-level APR pool and passes it to convert()
  util.run_app(convert, ctx, start_pass=start_pass, stop_pass=stop_pass)
# standard script entry point
if __name__ == '__main__':
  main()