
[PATCH] cvs2svn.py - new test release

From: Marko Macek <Marko.Macek_at_gmx.net>
Date: 2002-11-07 22:02:59 CET

Hi!

This is my latest test patch for cvs2svn, adding support for tags and branches.

ChangeLog (relative to rev 3685):
1) includes a bug fix that prevents separate commits to the same file
    from being combined
2) remembers the branch for each file in the .*revs files
3) remembers the tags and branch points for each file in the .*revs files
    (the extended record format is sketched right after this list)
4) NEW: a new pass to determine branch dependencies (it only handles
    trees, not DAGs, for now)
5) NEW: branches are copied recursively, starting from the trunk
    (or the vendor branch)
6) NEW: unlike before, tags and branch points are now copied in a
    single new revision (still a file-by-file copy, though - work is in
    progress to optimize this).
    This revision now has the "svn:author" and "svn:log" properties set.
7) NEW: --vendor=vendor-branch-tag-name starts the conversion
    from the vendor branch.
    TODO: the trunk should be a copy of the vendor branch start tag,
    not a new checkin.
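
For reviewers, here is the record layout that write_revs_line() now emits
and parse_revs_line() reads back. The snippet below is only a simplified,
standalone illustration, not code from the patch: the names
format_revs_line/split_revs_line and all sample values are invented, and
unlike the real parser it assumes the trailing filename contains no spaces
(the patch itself uses bounded splits, so filenames with spaces still work).

# Each .*revs line now also records the branch the revision lives on,
# plus the tags and branch points attached to it:
#
#   TIMESTAMP DIGEST OP REV BRANCH NTAGS TAG... NBRANCHES BRANCH... FNAME
#
# BRANCH is "*" for trunk revisions.

def format_revs_line(timestamp, digest, op, rev, fname,
                     branch_name, tags, branches):
  fields = ['%08lx' % timestamp, digest, op, rev, branch_name or '*']
  fields = fields + [str(len(tags))] + list(tags)
  fields = fields + [str(len(branches))] + list(branches) + [fname]
  return ' '.join(fields) + '\n'

def split_revs_line(line):
  data = line.split(' ')
  timestamp = int(data[0], 16)
  digest, op, rev = data[1], data[2], data[3]
  branch_name = data[4]
  if branch_name == '*':
    branch_name = None
  ntags = int(data[5])
  tags = data[6:6 + ntags]
  nbranches = int(data[6 + ntags])
  branches = data[7 + ntags:7 + ntags + nbranches]
  fname = data[7 + ntags + nbranches].rstrip('\n')
  return timestamp, digest, op, rev, fname, branch_name, tags, branches

# round-trip check (all values are made up)
line = format_revs_line(0x3dcaf293, '0123456789abcdef0123456789abcdef',
                        'C', '1.2.2.1', 'proj/foo.c,v',
                        'RELENG_1', ['REL_1_0'], ['RELENG_1_1'])
assert split_revs_line(line)[5:] == ('RELENG_1', ['REL_1_0'], ['RELENG_1_1'])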

Please test and report any problems, especially with the branch
conversion.
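
For anyone digging into the branch conversion, this is the idea behind the
new pass4/pass5, shown as a simplified standalone sketch rather than the
patch code itself (record() and conversion_order() are invented names, and
error handling is left out): pass4 records, for every tag and branch name,
which branch it sprouts from and at what depth, and pass5 works through a
list of branches, putting a branch back on the list until the branch it is
copied from has been converted.

copy_from = {None: [None, 0]}        # None stands for the trunk

def record(branch_name, tags, branches):
  # pass4: called once per .*revs line
  if branch_name not in copy_from:
    copy_from[branch_name] = [None, 0]
  level = copy_from[branch_name][1]
  for name in tags + branches:
    if name not in copy_from or copy_from[name][1] <= level:
      copy_from[name] = [branch_name, level + 1]

def conversion_order(found_branches):
  # pass5: convert a branch only after the branch it was copied from
  done = {None: 1}
  order = [None]
  queue = list(found_branches)
  while queue:
    br = queue.pop(0)
    if br in done:
      continue
    if copy_from[br][0] not in done:
      queue.append(br)               # parent not converted yet; retry
      continue
    done[br] = 1
    order.append(br)
  return order

record(None, ['REL_1_0'], ['RELENG_1'])
record('RELENG_1', [], ['RELENG_1_FIX'])
assert conversion_order(['RELENG_1_FIX', 'RELENG_1']) == \
       [None, 'RELENG_1', 'RELENG_1_FIX']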

I have tried converting the Emacs repository, and it seems that Subversion
gets really slow with several tens of thousands of revisions.

        Mark

--- /home/mark/cvs2svn/cvs2svn/cvs2svn.py 2002-10-28 06:58:47.000000000 +0100
+++ cvs2svn.py 2002-11-07 21:49:52.000000000 +0100
@@ -24,6 +24,8 @@
 
 
 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
+branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
+vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
 
 DATAFILE = 'cvs2svn-data'
 REVS_SUFFIX = '.revs'
@@ -44,6 +46,8 @@
 
 verbose = 1
 
+copy_from = { None: [ None, 0 ] }
+first_branch = None
 
 class CollectData(rcsparse.Sink):
   def __init__(self, cvsroot, log_fname_base):
@@ -59,9 +63,54 @@
     # revision -> [timestamp, author, operation, old-timestamp]
     self.rev_data = { }
     self.prev = { }
+    self.branch_names = {}
+    self.taglist = {}
+    self.branchlist = {}
+
+  def set_branch_name(self, revision, name):
+    self.branch_names[revision] = name
+
+  def get_branch_name(self, revision):
+    brev = revision[:revision.rindex(".")];
+    if not self.branch_names.has_key(brev):
+      return None
+    return self.branch_names[brev]
+
+  def add_branch_point(self, revision, branch_name):
+    if not self.branchlist.has_key(revision):
+      self.branchlist[revision] = []
+    self.branchlist[revision].append(branch_name)
+
+  def add_cvs_branch(self, revision, branch_name):
+    last_dot = revision.rfind(".");
+    branch_rev = revision[:last_dot];
+    last2_dot = branch_rev.rfind(".");
+    branch_rev = branch_rev[:last2_dot] + revision[last_dot:];
+    self.set_branch_name(branch_rev, branch_name)
+    self.add_branch_point(branch_rev[:last2_dot], branch_name)
+
+  def get_tags(self, revision):
+    if self.taglist.has_key(revision):
+      return self.taglist[revision]
+    else:
+      return []
+
+  def get_branches(self, revision):
+    if self.branchlist.has_key(revision):
+      return self.branchlist[revision]
+    else:
+      return []
 
   def define_tag(self, name, revision):
     self.tags.write('%s %s %s\n' % (name, revision, self.fname))
+    if branch_tag.match(revision):
+      self.add_cvs_branch(revision, name)
+    elif vendor_tag.match(revision):
+      self.set_branch_name(revision, name)
+    else:
+      if not self.taglist.has_key(revision):
+        self.taglist[revision] = [];
+      self.taglist[revision].append(name)
 
   def define_revision(self, revision, timestamp, author, state,
                       branches, next):
@@ -137,8 +186,10 @@
       # for this time and log message.
       self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
 
-    self.revs.write('%08lx %s %s %s %s\n' % (timestamp, digest,
-                                             op, revision, self.fname))
+    branch_name = self.get_branch_name(revision)
+
+    write_revs_line(self.revs, timestamp, digest, op, revision, self.fname,
+                    branch_name, self.get_tags(revision), self.get_branches(revision))
 
 def branch_path(ctx, branch_name = None):
   if branch_name:
@@ -146,6 +197,9 @@
   else:
      return ctx.trunk_base + '/'
 
+def get_tag_path(ctx, tag_name):
+  return ctx.tags_base + '/' + tag_name + '/'
+
 def relative_name(cvsroot, fname):
   l = len(cvsroot)
   if fname[:l] == cvsroot:
@@ -154,6 +208,19 @@
     return fname[l:]
   return l
 
+def make_path(fs, root, repos_path, f_pool):
+  ### hmm. need to clarify OS path separators vs FS path separators
+  dirname = os.path.dirname(repos_path)
+  if dirname != '/':
+    # get the components of the path (skipping the leading '/')
+    parts = string.split(dirname[1:], os.sep)
+    for i in range(1, len(parts) + 1):
+      # reassemble the pieces, adding a leading slash
+      parent_dir = '/' + string.join(parts[:i], '/')
+      if fs.check_path(root, parent_dir, f_pool) == svn_node_none:
+        print ' making dir:', parent_dir
+        fs.make_dir(root, parent_dir, f_pool)
+
 def visit_file(arg, dirname, files):
   cd, p, stats = arg
   for fname in files:
@@ -272,12 +339,16 @@
 
 class Commit:
   def __init__(self):
+    self.files = { }
     self.changes = [ ]
     self.deletes = [ ]
     self.t_min = 1<<30
     self.t_max = 0
 
-  def add(self, t, op, file, rev):
+  def has_file(self, fname):
+    return self.files.has_key(fname)
+
+  def add(self, t, op, file, rev, branch_name, tags, branches):
     # record the time range of this commit
     if t < self.t_min:
       self.t_min = t
@@ -285,20 +356,21 @@
       self.t_max = t
 
     if op == OP_CHANGE:
-      self.changes.append((file, rev))
+      self.changes.append((file, rev, branch_name, tags, branches))
     else:
       # OP_DELETE
-      self.deletes.append((file, rev))
+      self.deletes.append((file, rev, branch_name, tags, branches))
+    self.files[file] = 1
 
   def get_metadata(self, pool):
     # by definition, the author and log message must be the same for all
     # items that went into this commit. therefore, just grab any item from
     # our record of changes/deletes.
     if self.changes:
-      file, rev = self.changes[0]
+      file, rev, br, tags, branches = self.changes[0]
     else:
       # there better be one...
-      file, rev = self.deletes[0]
+      file, rev, br, tags, branches = self.deletes[0]
 
     # now, fetch the author/log from the ,v file
     rip = RevInfoParser()
@@ -312,21 +384,21 @@
 
     return author, log, date
 
-  def commit(self, t_fs, ctx):
+  def commit(self, t_fs, ctx, target_branch, tag_copies, branch_copies, found_branches):
     # commit this transaction
     print 'committing: %s, over %d seconds' % (time.ctime(self.t_min),
                                                self.t_max - self.t_min)
 
     if ctx.dry_run:
-      for f, r in self.changes:
+      for f, r, br, tags, branches in self.changes:
         # compute a repository path. ensure we have a leading "/" and drop
         # the ,v from the file name
-        repos_path = branch_path(ctx) + relative_name(ctx.cvsroot, f[:-2])
+        repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
         print ' changing %s : %s' % (r, repos_path)
-      for f, r in self.deletes:
+      for f, r, br, tags, branches in self.deletes:
         # compute a repository path. ensure we have a leading "/" and drop
         # the ,v from the file name
-        repos_path = branch_path(ctx) + relative_name(ctx.cvsroot, f[:-2])
+        repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
         print ' deleting %s : %s' % (r, repos_path)
       print ' (skipped; dry run enabled)'
       return
@@ -343,25 +415,15 @@
     # create a pool for each file; it will be cleared on each iteration
     f_pool = util.svn_pool_create(c_pool)
 
-    for f, r in self.changes:
+    for f, r, br, tags, branches in self.changes:
       # compute a repository path. ensure we have a leading "/" and drop
       # the ,v from the file name
-      repos_path = branch_path(ctx) + relative_name(ctx.cvsroot, f[:-2])
+      repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
       #print 'DEBUG:', repos_path
 
       print ' changing %s : %s' % (r, repos_path)
 
-      ### hmm. need to clarify OS path separators vs FS path separators
-      dirname = os.path.dirname(repos_path)
-      if dirname != '/':
-        # get the components of the path (skipping the leading '/')
-        parts = string.split(dirname[1:], os.sep)
-        for i in range(1, len(parts) + 1):
-          # reassemble the pieces, adding a leading slash
-          parent_dir = '/' + string.join(parts[:i], '/')
-          if fs.check_path(root, parent_dir, f_pool) == svn_node_none:
-            print ' making dir:', parent_dir
-            fs.make_dir(root, parent_dir, f_pool)
+      make_path(fs, root, repos_path, f_pool)
 
       if fs.check_path(root, repos_path, f_pool) == svn_node_none:
         created_file = 1
@@ -418,10 +480,10 @@
       # remember what we just did, for the next iteration
       lastcommit = (repos_path, r)
 
-    for f, r in self.deletes:
+    for f, r, br, tags, branches in self.deletes:
       # compute a repository path. ensure we have a leading "/" and drop
       # the ,v from the file name
-      repos_path = branch_path(ctx) + relative_name(ctx.cvsroot, f[:-2])
+      repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
 
       print ' deleting %s : %s' % (r, repos_path)
 
@@ -450,6 +512,43 @@
       print ' CONFLICTS:', `conflicts`
     print ' new revision:', new_rev
 
+    for f, r, br, tags, branches in self.changes:
+      rel_name = relative_name(ctx.cvsroot, f[:-2])
+      for tag in tags:
+        rel_name = relative_name(ctx.cvsroot, f[:-2])
+        tag_path = get_tag_path(ctx, tag) + rel_name
+        repos_path = branch_path(ctx, br) + rel_name
+
+        t_root = fs.revision_root(t_fs, rev, f_pool);
+        if not tag_copies.has_key(tag):
+          tag_copies[tag] = []
+        tag_copies[tag].append([ br, rel_name, new_rev ])
+
+        # clear the pool after each copy
+        util.svn_pool_clear(f_pool)
+
+    for f, r, br, tags, branches in self.changes:
+      rel_name = relative_name(ctx.cvsroot, f[:-2])
+      for br2 in branches:
+        new_branch_path = branch_path(ctx, br2) + rel_name
+        repos_path = branch_path(ctx, br) + rel_name
+
+        t_root = fs.revision_root(t_fs, rev, f_pool);
+        if not branch_copies.has_key(br2):
+          branch_copies[br2] = []
+        branch_copies[br2].append([ br, rel_name, new_rev ])
+        found_branches[br2] = 1
+
+        # clear the pool after each copy
+        util.svn_pool_clear(f_pool)
+
+    for f, r, br, tags, branches in self.deletes:
+      rel_name = relative_name(ctx.cvsroot, f[:-2])
+      for br2 in branches:
+        new_branch_path = branch_path(ctx, br2) + rel_name
+        print "file:", f, "created on branch:", br2, "revision:", r
+        found_branches[br2] = 1
+
     # done with the commit and file pools
     util.svn_pool_destroy(c_pool)
 
@@ -487,15 +586,37 @@
   return resync
 
 def parse_revs_line(line):
-  timestamp = int(line[:8], 16)
-  id = line[9:DIGEST_END_IDX]
-  op = line[DIGEST_END_IDX + 1]
-  idx = string.find(line, ' ', DIGEST_END_IDX + 3)
-  rev = line[DIGEST_END_IDX+3:idx]
-  fname = line[idx+1:-1]
-
-  return timestamp, id, op, rev, fname
-
+  data = line.split(' ', 6)
+  timestamp = int(data[0], 16)
+  id = data[1]
+  op = data[2]
+  rev = data[3]
+  branch_name = data[4]
+  if branch_name == "*":
+    branch_name = None
+  ntags = int(data[5])
+  tags = data[6].split(' ', ntags + 1)
+  nbranches = int(tags[ntags])
+  branches = tags[ntags + 1].split(' ', nbranches)
+  fname = branches[nbranches][:-1] # strip \n
+  tags = tags[:ntags]
+  branches = branches[:nbranches]
+
+  return timestamp, id, op, rev, fname, branch_name, tags, branches
+
+def write_revs_line(output, timestamp, digest, op, revision, fname,
+                    branch_name, tags, branches):
+  output.write('%08lx %s %s %s ' % (timestamp, digest, op, revision))
+  if not branch_name:
+    branch_name = "*"
+  output.write('%s ' % branch_name);
+  output.write('%d ' % (len(tags)));
+  for tag in tags:
+    output.write('%s ' % (tag));
+  output.write('%d ' % (len(branches)));
+  for branch in branches:
+    output.write('%s ' % (branch));
+  output.write('%s\n' % fname);
 
 def pass1(ctx):
   cd = CollectData(ctx.cvsroot, DATAFILE)
@@ -519,7 +640,8 @@
 
   # process the revisions file, looking for items to clean up
   for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
-    timestamp, digest, op, rev, fname = parse_revs_line(line)
+    timestamp, digest, op, rev, fname, branch_name, tags, branches = \
+        parse_revs_line(line)
     if not resync.has_key(digest):
       output.write(line)
       continue
@@ -529,8 +651,8 @@
     for record in resync[digest]:
       if record[0] <= timestamp <= record[1]:
         # bingo! remap the time on this (record[2] is the new time).
-        output.write('%08lx %s %s %s %s\n'
-                     % (record[2], digest, op, rev, fname))
+        write_revs_line(output, record[2], digest, op, rev, fname,
+                        branch_name, tags, branches)
 
         print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
               % (relative_name(ctx.cvsroot, fname),
@@ -552,7 +674,34 @@
   os.system('sort %s > %s' % (ctx.log_fname_base + CLEAN_REVS_SUFFIX,
                               ctx.log_fname_base + SORTED_REVS_SUFFIX))
 
+# determine the branch hierarchy.
+# TODO: only handles tree hierarchies for now, no DAGs yet
 def pass4(ctx):
+  global copy_from
+  global first_branch
+  if ctx.vendor_branch != None:
+    copy_from = { ctx.vendor_branch: [ ctx.vendor_branch, 0 ],
+                  None: [ ctx.vendor_branch, 1] }
+    first_branch = ctx.vendor_branch
+
+  for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
+    timestamp, id, op, rev, fname, branch_name, tags, branches = \
+        parse_revs_line(line)
+
+    for tag in tags:
+
+      if not copy_from.has_key(branch_name):
+        copy_from[branch_name] = [ None, 0 ]
+        #print branch_name, tag
+        #raise "branch error"
+
+    level = copy_from[branch_name][1]
+
+    for tag in tags + branches:
+      if not copy_from.has_key(tag) or copy_from[tag][1] <= level:
+        copy_from[tag] = (branch_name, level + 1)
+
+def pass5(ctx):
   # create the target repository
   if not ctx.dry_run:
     if ctx.create_repos:
@@ -563,15 +712,107 @@
   else:
     t_fs = t_repos = None
 
+  branch_copies = { }
+  tag_copies = { }
+  branches_done = { }
+  copies_done = { }
+
+  branches = [ None ] # start with the trunk
+  if ctx.vendor_branch != None:
+    branches = [ ctx.vendor_branch, None ]
+  while len(branches) > 0:
+    br = branches[0]
+    branches = branches[1:]
+
+    if not branches_done.has_key(br):
+      print "CONVERTING BRANCH:", br
+
+      if br != first_branch:
+        if not branches_done.has_key(copy_from[br][0]):
+          print "COPY FROM", copy_from
+          print "COPIES DONE", copies_done
+          print "SKIPPING:", br, "NEED:", copy_from[br][0]
+          branches.append(br)
+          continue
+
+      new_branches = convert_branch(ctx, t_repos, t_fs, br, tag_copies, branch_copies, copies_done)
+      branches_done[br] = 1
+
+      print "FOUND BRANCHES:", repr(new_branches)
+      branches += new_branches
+
+  do_copy(ctx, t_fs, 1, branch_copies, copy_from, copies_done, branches_done)
+  do_copy(ctx, t_fs, 0, tag_copies, copy_from, copies_done, branches_done)
+
+  if ctx.verbose:
+    print count, 'commits processed.'
+
+def do_copy(ctx, t_fs, is_branch, do_copies, copy_from, copies_done, branches_done):
+  if is_branch:
+    action = "BRANCHING:"
+  else:
+    action = "TAGGING:"
+
+  c_pool = util.svn_pool_create(ctx.pool)
+
+  for tag in do_copies.keys():
+    if copies_done.has_key(tag):
+      continue
+    if not branches_done.has_key(copy_from[tag][0]):
+      continue
+
+    rev = fs.youngest_rev(t_fs, c_pool)
+    txn = fs.begin_txn(t_fs, rev, c_pool)
+    root = fs.txn_root(txn, c_pool)
+    f_pool = util.svn_pool_create(c_pool)
+
+    make_path(fs, root, ctx.tags_base, f_pool)
+    make_path(fs, root, ctx.branches_base, f_pool)
+
+    fileset = do_copies[tag]
+    for br, f, rev in fileset:
+      rel_name = f # relative_name(ctx.cvsroot, f)
+      if is_branch:
+        copy_path = branch_path(ctx, tag) + rel_name
+      else:
+        copy_path = get_tag_path(ctx, tag) + rel_name
+      repos_path = branch_path(ctx, br) + rel_name
+
+      print "tag", tag, "to", copy_path, "from", repos_path, "revision", rev
+      t_root = fs.revision_root(t_fs, rev, f_pool);
+      make_path(fs, root, copy_path, f_pool)
+      fs.copy(t_root, repos_path, root, copy_path, f_pool)
+
+      # clear the pool after each copy
+      util.svn_pool_clear(f_pool)
+      pass
+
+    log_msg = "%s %s\n" % (action, tag)
+    fs.change_txn_prop(txn, 'svn:author', "cvs2svn", c_pool)
+    fs.change_txn_prop(txn, 'svn:log', log_msg, c_pool)
+
+    conflicts, new_rev = fs.commit_txn(txn)
+    if conflicts != '\n':
+      print ' CONFLICTS:', `conflicts`
+    print ' new revision:', new_rev
+    del do_copies[tag]
+    copies_done[tag] = 1
+
+  util.svn_pool_destroy(c_pool)
+
+def convert_branch(ctx, t_repos, t_fs, target_branch, tag_copies, branch_copies, copies_done):
   # process the logfiles, creating the target
   commits = { }
+  found_branches = { }
   count = 0
 
   for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
-    timestamp, id, op, rev, fname = parse_revs_line(line)
+    timestamp, id, op, rev, fname, branch_name, tags, branches = \
+        parse_revs_line(line)
 
-    ### only handle changes on the trunk for now
-    if not trunk_rev.match(rev):
+    # ignore changes not on current branch
+    if branch_name != target_branch:
+      found_branches[branch_name] = 1
       ### technically, the timestamp on this could/should cause a flush.
       ### don't worry about it; the next item will handle it
       continue
@@ -579,14 +820,15 @@
     # scan for commits to process
     process = [ ]
     for scan_id, scan_c in commits.items():
-      if scan_c.t_max + COMMIT_THRESHOLD < timestamp:
+      if scan_c.t_max + COMMIT_THRESHOLD < timestamp or \
+         scan_c.has_file(fname):
         process.append((scan_c.t_max, scan_c))
         del commits[scan_id]
 
     # sort the commits into time-order, then commit 'em
     process.sort()
     for t_max, c in process:
-      c.commit(t_fs, ctx)
+      latest_rev = c.commit(t_fs, ctx, target_branch, tag_copies, branch_copies, found_branches)
     count = count + len(process)
 
     # add this item into the set of commits we're assembling
@@ -594,7 +836,7 @@
       c = commits[id]
     else:
       c = commits[id] = Commit()
-    c.add(timestamp, op, fname, rev)
+    c.add(timestamp, op, fname, rev, branch_name, tags, branches)
 
   # if there are any pending commits left, then flush them
   if commits:
@@ -603,17 +845,17 @@
       process.append((c.t_max, c))
     process.sort()
     for t_max, c in process:
-      c.commit(t_fs, ctx)
+      latest_rev = c.commit(t_fs, ctx, target_branch, tag_copies, branch_copies, found_branches)
     count = count + len(process)
 
-  if ctx.verbose:
-    print count, 'commits processed.'
+  return found_branches.keys()
 
 _passes = [
   pass1,
   pass2,
   pass3,
   pass4,
+  pass5,
   ]
 
 class _ctx:
@@ -662,10 +904,11 @@
   ctx.trunk_base = "/trunk"
   ctx.tags_base = "/tags"
   ctx.branches_base = "/branches"
+  ctx.vendor_branch = None
 
   try:
     opts, args = getopt.getopt(sys.argv[1:], 'p:s:vn',
-                               [ "create", "trunk=", "branches=", "tags=" ])
+                               [ "create", "trunk=", "branches=", "tags=", "vendor=" ])
   except getopt.GetoptError:
     usage(ctx)
   if len(args) != 1:
@@ -695,6 +938,8 @@
       ctx.branches_base = value
     elif opt == '--tags':
       ctx.tags_base = value
+    elif opt == '--vendor':
+      ctx.vendor_branch = value
 
   util.run_app(convert, ctx, start_pass=start_pass)
 

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org