"Laramie Leavitt" <lar@leavitt.us> writes:
> Diff against revision 3843 of:
> http://svn.collab.net/repos/svn/trunk/tools/cvs2svn/cvs2svn.py
> 
> Note that the diff is a little dirty--some of the changes are
> definitely spurious.
Thanks!  Can you write a log message for the change?  (The HACKING
file gives more guidance on this.)  It's harder to review without a
log message.
-K
> --- cvs2svn.py.orig	2002-11-20 22:14:45.000000000 -0800
> +++ cvs2svn.py	2002-11-20 22:15:02.000000000 -0800
> @@ -1,9 +1,10 @@
> -#!/usr/bin/env python
> +#!/usr/bin/env python2.2
>  #
>  # cvs2svn: ...
>  #
> 
> -import rcsparse
> +# With a small number of experimental patches applied...
> +
>  import os
>  import sys
>  import sha
> @@ -13,17 +14,22 @@
>  import string
>  import getopt
>  import statcache
> +import rcsparse
> 
>  from svn import fs, util, _delta, _repos
> 
>  ### these should go somewhere else. should have SWIG export them.
> -svn_node_none = 0
> -svn_node_file = 1
> -svn_node_dir = 2
> +svn_node_none    = 0
> +svn_node_file    = 1
> +svn_node_dir     = 2
>  svn_node_unknown = 3
> 
> 
> -trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
> +trunk_rev  = re.compile('^[0-9]+\\.[0-9]+$')
> +branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
> +vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$') # XXX?
> +
> +
> 
>  DATAFILE = 'cvs2svn-data'
>  REVS_SUFFIX = '.revs'
> @@ -35,7 +41,7 @@
>  SVNROOT = 'svnroot'
>  ATTIC = os.sep + 'Attic'
> 
> -COMMIT_THRESHOLD = 5 * 60	# flush a commit if a 5 minute gap occurs
> +COMMIT_THRESHOLD = 3 * 60        # flush a commit if a 3 minute gap occurs
> 
>  OP_DELETE = 'D'
>  OP_CHANGE = 'C'
> @@ -59,9 +65,54 @@
>      # revision -> [timestamp, author, operation, old-timestamp]
>      self.rev_data = { }
>      self.prev = { }
> +    self.branch_names = {}
> +    self.taglist = {}
> +    self.branchlist = { }
> +
> +  def set_branch_name(self, revision, name):
> +    self.branch_names[revision] = name
> +
> +  def get_branch_name(self, revision):
> +    brev = revision[:revision.rindex(".")];
> +    if not self.branch_names.has_key(brev):
> +      return None
> +    return self.branch_names[brev]
> +
> +  def add_branch_point(self, revision, branch_name):
> +    if not self.branchlist.has_key(revision):
> +      self.branchlist[revision] = []
> +    self.branchlist[revision].append(branch_name)
> +
> +  def add_cvs_branch(self, revision, branch_name):
> +    last_dot = revision.rfind(".");
> +    branch_rev = revision[:last_dot];
> +    last2_dot = branch_rev.rfind(".");
> +    branch_rev = branch_rev[:last2_dot] + revision[last_dot:];
> +    self.set_branch_name(branch_rev, branch_name)
> +    self.add_branch_point(branch_rev[:last2_dot], branch_name)
> +
> +  def get_tags(self, revision):
> +    if self.taglist.has_key(revision):
> +      return self.taglist[revision]
> +    else:
> +      return []
> 
> +  def get_branches(self, revision):
> +    if self.branchlist.has_key(revision):
> +      return self.branchlist[revision]
> +    else:
> +      return []
> +
>    def define_tag(self, name, revision):
>      self.tags.write('%s %s %s\n' % (name, revision, self.fname))
> +    if branch_tag.match(revision):
> +      self.add_cvs_branch(revision, name)
> +    elif vendor_tag.match(revision):
> +      self.set_branch_name(revision, name)
> +    else:
> +      if not self.taglist.has_key(revision):
> +        self.taglist[revision] = [];
> +      self.taglist[revision].append(name)
> 
>    def define_revision(self, revision, timestamp, author, state,
>                        branches, next):
> @@ -108,8 +159,8 @@
>            # shove the previous revision back in time (and any before it that
>            # may need to shift).
>            while t_p >= t_c:
> -            self.rev_data[prev][0] = t_c - 1	# new timestamp
> -            self.rev_data[prev][3] = t_p	# old timestamp
> +            self.rev_data[prev][0] = t_c - 1        # new timestamp
> +            self.rev_data[prev][3] = t_p        # old timestamp
> 
>              print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
>                    % (relative_name(self.cvsroot, self.fname),
> @@ -119,7 +170,7 @@
>              prev = self.prev[current]
>              if not prev:
>                break
> -            t_c = t_c - 1		# self.rev_data[current][0]
> +            t_c = t_c - 1                # self.rev_data[current][0]
>              t_p = self.rev_data[prev][0]
> 
>            # break from the for-loop
> @@ -137,14 +188,19 @@
>        # for this time and log message.
>        self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
> 
> -    self.revs.write('%08lx %s %s %s %s\n' % (timestamp, digest,
> -                                             op, revision, self.fname))
> +    branch_name = self.get_branch_name(revision)
> +
> +    write_revs_line(self.revs,
> +                    timestamp, digest, op, revision, self.fname,
> branch_name,
> +                    self.get_tags(revision), self.get_branches(revision))
> +
> +
> 
>  def branch_path(ctx, branch_name = None):
> -  if branch_name:
> -     return ctx.branches_base + '/' + branch_name + '/'
> -  else:
> +  if branch_name == None:
>       return ctx.trunk_base + '/'
> +  else:
> +     return ctx.branches_base + '/' + branch_name + '/'
> 
>  def relative_name(cvsroot, fname):
>    l = len(cvsroot)
> @@ -171,10 +227,11 @@
>      p.parse(open(pathname), cd)
>      stats[0] = stats[0] + 1
> 
> +
>  class RevInfoParser(rcsparse.Sink):
>    def __init__(self):
> -    self.authors = { }	# revision -> author
> -    self.logs = { }	# revision -> log message
> +    self.authors = { }        # revision -> author
> +    self.logs = { }        # revision -> log message
> 
>    def define_revision(self, revision, timestamp, author, state,
>                        branches, next):
> @@ -274,37 +331,83 @@
>    def __init__(self):
>      self.changes = [ ]
>      self.deletes = [ ]
> +    self.loghash = {}
> +
>      self.t_min = 1<<30
>      self.t_max = 0
> 
> -  def add(self, t, op, file, rev):
> +  def ready_to_process( self, tstamp, fname ):
> +    if self.t_max + COMMIT_THRESHOLD < tstamp:
> +      return 1
> +    for x in self.changes:
> +      if x[0] == fname:
> +	return 1
> +    for x in self.deletes:
> +      if x[0] == fname:
> +	return 1
> +    return 0
> +
> +  def add(self, t, op, file, rev, branch_name, tags, branches):
>      # record the time range of this commit
>      if t < self.t_min:
>        self.t_min = t
>      if t > self.t_max:
>        self.t_max = t
> 
> +    # Keep track of all files...
>      if op == OP_CHANGE:
> -      self.changes.append((file, rev))
> +      self.changes.append((file, rev, branch_name, tags, branches))
>      else:
>        # OP_DELETE
> -      self.deletes.append((file, rev))
> +      self.deletes.append((file, rev, branch_name, tags, branches))
> +
> 
> -  def get_metadata(self, pool):
> +  def get_logmessages(self, mods, ctx, modifier):
> +    # get the log messages for those mods (change or delete)
> +    authorhash = {}
> +    currlog = ''
> +    lastlog = ''
> +
> +    rip = RevInfoParser()
> +
> +    for file, rev, br, tags, branches in mods:
> +      repos_path = relative_name(ctx.cvsroot, file[:-2])
> +      # now, fetch the author/log from the ,v file
> +      rip.parse_cvs_file(file)
> +      author = rip.authors[rev]
> +      authorhash[author] = 1
> +      currlog = rip.logs[rev].strip()
> +
> +      tmp = self.loghash.get(currlog, [])
> +      # ugly hack
> +      if 'M' == modifier and '1.1' == rev:
> +        tmp.append('A %s (%s)' % (repos_path, rev))
> +      else:
> +        tmp.append('%s %s (%s)' % (modifier, repos_path, rev))
> +      self.loghash[currlog] = tmp
> +    return author
> +
> +  def get_metadata(self, pool, ctx):
>      # by definition, the author and log message must be the same for all
>      # items that went into this commit. therefore, just grab any item from
>      # our record of changes/deletes.
>      if self.changes:
> -      file, rev = self.changes[0]
> +      file, rev, br, tags, branches = self.changes[0]
>      else:
> -      # there better be one...
> -      file, rev = self.deletes[0]
> +       # there better be one...
> +      file, rev, br, tags, branches = self.deletes[0]
> 
> +    log = ''
> +
>      # now, fetch the author/log from the ,v file
> -    rip = RevInfoParser()
> -    rip.parse_cvs_file(file)
> -    author = rip.authors[rev]
> -    log = rip.logs[rev]
> +    if self.changes:
> +      author = self.get_logmessages(self.changes, ctx, 'M')
> +    if self.deletes:
> +      author = self.get_logmessages(self.deletes, ctx, 'D')
> +
> +    for msg, files in self.loghash.items():
> +      log += '\n'.join(files)
> +      log += ":\n  " + msg + '\n'
> 
>      # format the date properly
>      a_t = util.apr_time_ansi_put(self.t_max)[1]
> @@ -312,21 +415,22 @@
> 
>      return author, log, date
> 
> +
>    def commit(self, t_fs, ctx):
>      # commit this transaction
>      print 'committing: %s, over %d seconds' % (time.ctime(self.t_min),
>                                                 self.t_max - self.t_min)
> -
> +
>      if ctx.dry_run:
> -      for f, r in self.changes:
> +      for f, r, br, tags, branches in self.changes:
>          # compute a repository path. ensure we have a leading "/" and drop
>          # the ,v from the file name
> -        repos_path = branch_path(ctx) + relative_name(ctx.cvsroot, f[:-2])
> +        repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot,
> f[:-2])
>          print '    changing %s : %s' % (r, repos_path)
> -      for f, r in self.deletes:
> +      for f, r, br, tags, branches in self.deletes:
>          # compute a repository path. ensure we have a leading "/" and drop
>          # the ,v from the file name
> -        repos_path = branch_path(ctx) + relative_name(ctx.cvsroot, f[:-2])
> +        repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot,
> f[:-2])
>          print '    deleting %s : %s' % (r, repos_path)
>        print '    (skipped; dry run enabled)'
>        return
> @@ -334,8 +438,8 @@
>      # create a pool for the entire commit
>      c_pool = util.svn_pool_create(ctx.pool)
> 
> -    rev = fs.youngest_rev(t_fs, c_pool)
> -    txn = fs.begin_txn(t_fs, rev, c_pool)
> +    rev  = fs.youngest_rev(t_fs, c_pool)
> +    txn  = fs.begin_txn(t_fs, rev, c_pool)
>      root = fs.txn_root(txn, c_pool)
> 
>      lastcommit = (None, None)
> @@ -343,10 +447,10 @@
>      # create a pool for each file; it will be cleared on each iteration
>      f_pool = util.svn_pool_create(c_pool)
> 
> -    for f, r in self.changes:
> +    for f, r, br, tags, branches in self.changes:
>        # compute a repository path. ensure we have a leading "/" and drop
>        # the ,v from the file name
> -      repos_path = branch_path(ctx) + relative_name(ctx.cvsroot, f[:-2])
> +      repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
>        #print 'DEBUG:', repos_path
> 
>        print '    changing %s : %s' % (r, repos_path)
> @@ -418,10 +522,10 @@
>        # remember what we just did, for the next iteration
>        lastcommit = (repos_path, r)
> 
> -    for f, r in self.deletes:
> +    for f, r, br, tags, branches in self.deletes:
>        # compute a repository path. ensure we have a leading "/" and drop
>        # the ,v from the file name
> -      repos_path = branch_path(ctx) + relative_name(ctx.cvsroot, f[:-2])
> +      repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot, f[:-2])
> 
>        print '    deleting %s : %s' % (r, repos_path)
> 
> @@ -436,7 +540,7 @@
>        util.svn_pool_clear(f_pool)
> 
>      # get the metadata for this commit
> -    author, log, date = self.get_metadata(c_pool)
> +    author, log, date = self.get_metadata(c_pool, ctx )
>      fs.change_txn_prop(txn, 'svn:author', author, c_pool)
>      fs.change_txn_prop(txn, 'svn:log', log, c_pool)
> 
> @@ -446,6 +550,84 @@
>      fs.change_rev_prop(t_fs, new_rev, 'svn:date', date, c_pool)
> 
>      ### how come conflicts is a newline?
> +    if len( conflicts ) > 1:
> +      print '    CONFLICTS:', conflicts
> +    print '    new revision:', new_rev
> +
> +
> +    # don't do tags if we don't need to.
> +    dotags = 0
> +    for f, r, br, tags, branches in self.changes:
> +      dotags += len( tags )
> +      dotags += len( branches )
> +
> +    if not dotags:
> +      util.svn_pool_destroy(c_pool)
> +      return
> +
> +    # make a new transaction for the tags
> +    rev = fs.youngest_rev(t_fs, c_pool)
> +    txn = fs.begin_txn(t_fs, rev, c_pool)
> +    root = fs.txn_root(txn, c_pool)
> +
> +    for f, r, br, tags, branches in self.changes:
> +      for tag in tags:
> +        tag_path = ctx.tags_base + '/' + tag + '/' +
> relative_name(ctx.cvsroot, f[:-2])
> +        repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot,
> f[:-2])
> +
> +        print "tagging", tag, "to", tag_path, "from", repos_path
> +
> +        t_root = fs.revision_root(t_fs, rev, f_pool);
> +
> +        ### hmm. need to clarify OS path separators vs FS path separators
> +        dirname = os.path.dirname(tag_path)
> +        if dirname != '/':
> +          # get the components of the path (skipping the leading '/')
> +          parts = string.split(dirname[1:], os.sep)
> +          for i in range(1, len(parts) + 1):
> +            # reassemble the pieces, adding a leading slash
> +            parent_dir = '/' + string.join(parts[:i], '/')
> +            if fs.check_path(root, parent_dir, f_pool) == svn_node_none:
> +              print '    making dir:', parent_dir
> +              fs.make_dir(root, parent_dir, f_pool) ### XXX COPY FROM
> BRANCH?
> +
> +        fs.copy(t_root, repos_path, root, tag_path, f_pool)
> +
> +        # clear the pool after each copy
> +        util.svn_pool_clear(f_pool)
> +
> +    for f, r, br, tags, branches in self.changes:
> +      for br2 in branches:
> +        new_branch_path = branch_path(ctx, br2) +
> relative_name(ctx.cvsroot, f[:-2])
> +        repos_path = branch_path(ctx, br) + relative_name(ctx.cvsroot,
> f[:-2])
> +
> +        print "branching", r, "to", new_branch_path, "from", repos_path
> +
> +        t_root = fs.revision_root(t_fs, rev, f_pool);
> +
> +        ### hmm. need to clarify OS path separators vs FS path separators
> +        dirname = os.path.dirname(new_branch_path)
> +        if dirname != '/':
> +          # get the components of the path (skipping the leading '/')
> +          parts = string.split(dirname[1:], os.sep)
> +          for i in range(1, len(parts) + 1):
> +            # reassemble the pieces, adding a leading slash
> +            parent_dir = '/' + string.join(parts[:i], '/')
> +            if fs.check_path(root, parent_dir, f_pool) == svn_node_none:
> +              print '    making dir:', parent_dir
> +              fs.make_dir(root, parent_dir, f_pool) ### XXX COPY FROM
> BRANCH?
> +
> +        fs.copy(t_root, repos_path, root, new_branch_path, f_pool)
> +
> +        # clear the pool after each copy
> +        util.svn_pool_clear(f_pool)
> +
> +    for f, r, br, tags, branches in self.deletes:
> +      for br2 in branches:
> +        new_branch_path = branch_path(ctx, br2) +
> relative_name(ctx.cvsroot, f[:-2])
> +        print "file:", f, "created on branch:", br2, "revision:", r
> +
> +    conflicts, new_rev = fs.commit_txn(txn)
>      if conflicts != '\n':
>        print '    CONFLICTS:', `conflicts`
>      print '    new revision:', new_rev
> @@ -486,15 +668,39 @@
>        resync[digest] = [ [t1_l, t1_u, t2] ]
>    return resync
> 
> -def parse_revs_line(line):
> -  timestamp = int(line[:8], 16)
> -  id = line[9:DIGEST_END_IDX]
> -  op = line[DIGEST_END_IDX + 1]
> -  idx = string.find(line, ' ', DIGEST_END_IDX + 3)
> -  rev = line[DIGEST_END_IDX+3:idx]
> -  fname = line[idx+1:-1]
> +def write_revs_line(output,
> +                    timestamp, digest, op, revision, fname,
> +                    branch_name, tags, branches):
> +  if not branch_name:
> +    branch_name = "*"
> +  output.write('%08lx %s %s %s %s' % (timestamp, digest,
> +                                      op, revision, branch_name))
> +  output.write(' %d ' % (len(tags)));
> +  for tag in tags:
> +    output.write('%s ' % (tag));
> +  output.write('%d ' % (len(branches)));
> +  for branch in branches:
> +    output.write('%s ' % (branch));
> +  output.write('%s\n' % fname);
> 
> -  return timestamp, id, op, rev, fname
> +def parse_revs_line(line):
> +  data = line.split(' ', 6)
> +  ##print "DATA", repr(data)
> +  timestamp = int(data[0], 16)
> +  id = data[1]
> +  op = data[2]
> +  rev = data[3]
> +  branch_name = data[4]
> +  if branch_name == "*":
> +    branch_name = None
> +  ntags = int(data[5])
> +  tags = data[6].split(' ', ntags + 1)
> +  nbranches = int(tags[ntags])
> +  branches = tags[ntags + 1].split(' ', nbranches + 1)
> +  fname = branches[nbranches][:-1]
> +  tags = tags[:ntags]
> +  branches = branches[:nbranches]
> +  return timestamp, id, op, rev, fname, branch_name, tags, branches
> 
> 
>  def pass1(ctx):
> @@ -519,7 +725,9 @@
> 
>    # process the revisions file, looking for items to clean up
>    for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
> -    timestamp, digest, op, rev, fname = parse_revs_line(line)
> +    timestamp, digest, op, rev, fname, branch_name, tags, branches = \
> +      parse_revs_line(line)
> +
>      if not resync.has_key(digest):
>        output.write(line)
>        continue
> @@ -529,8 +737,9 @@
>      for record in resync[digest]:
>        if record[0] <= timestamp <= record[1]:
>          # bingo! remap the time on this (record[2] is the new time).
> -        output.write('%08lx %s %s %s %s\n'
> -                     % (record[2], digest, op, rev, fname))
> +        write_revs_line(output,
> +                        record[2], digest, op, rev, fname, branch_name,
> +                        tags, branches)
> 
>          print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
>                % (relative_name(ctx.cvsroot, fname),
> @@ -568,18 +777,13 @@
>    count = 0
> 
>    for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
> -    timestamp, id, op, rev, fname = parse_revs_line(line)
> -
> -    ### only handle changes on the trunk for now
> -    if not trunk_rev.match(rev):
> -      ### technically, the timestamp on this could/should cause a flush.
> -      ### don't worry about it; the next item will handle it
> -      continue
> +    timestamp, id, op, rev, fname, branch_name, tags, branches = \
> +               parse_revs_line(line)
> 
>      # scan for commits to process
>      process = [ ]
>      for scan_id, scan_c in commits.items():
> -      if scan_c.t_max + COMMIT_THRESHOLD < timestamp:
> +      if scan_c.ready_to_process( timestamp, fname ):
>          process.append((scan_c.t_max, scan_c))
>          del commits[scan_id]
> 
> @@ -588,13 +792,16 @@
>      for t_max, c in process:
>        c.commit(t_fs, ctx)
>      count = count + len(process)
> +    process = []
> 
>      # add this item into the set of commits we're assembling
>      if commits.has_key(id):
>        c = commits[id]
>      else:
>        c = commits[id] = Commit()
> -    c.add(timestamp, op, fname, rev)
> +    c.add(timestamp, op, fname, rev, branch_name, tags, branches)
> +
> +  print 'flushing final commits...'
> 
>    # if there are any pending commits left, then flush them
>    if commits:
> @@ -616,9 +823,11 @@
>    pass4,
>    ]
> 
> +
>  class _ctx:
>    pass
> 
> +
>  def convert(pool, ctx, start_pass=1):
>    "Convert a CVS repository to an SVN repository."
> 
> @@ -700,3 +909,4 @@
> 
>  if __name__ == '__main__':
>    main()
> +
> 
> 
> 
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
> For additional commands, e-mail: dev-help@subversion.tigris.org
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org
Received on Thu Nov 21 07:50:39 2002