#!/usr/bin/perl -w
#
# this takes one argument: the working directory path.
# if omitted, defaults to "."
#
# it then reads the list of files under version control from
#   svn st -qv,
# and compares each (plain) $file with its
#   subdir/.svn/text-base/$file.svn-base
# counterpart. If they are identical, $file is unlinked,
# then hardlinked to its text-base
#
# this could be run after each "svn ci" or "svn up",
# or more lazily just when you feel like consolidating some disk space.
#
# Copyleft 2004-10-26, Lars Ellenberg
#
#
#| WARNING
#| WARNING
#| WARNING
#|
#| if you want to use this with vim, make sure you have a recent vim!
#| and that you have set backupcopy=auto,breakhardlink.
#|
#| Patch 6.2.481
#| Problem:    When writing a file it is not possible to specify that hard and/or
#|             symlinks are to be broken instead of preserved.
#| Solution:   Add the "breaksymlink" and "breakhardlink" values to 'backupcopy'.
#|             (Simon Ekstrand)
#| Files:      runtime/doc/options.txt, src/fileio.c, src/option.c, src/option.h
#
#
# this script can easily be changed to work for arbitrary trees, just do two
# passes: one recursive find-all, hashing by file size and the first
# bytes (and probably the permissions!), storing the attributes, then do a
# second pass and compare the contents of those files in the same bucket, and
# hard-link them if identical (and not yet linked). scales with N*(N-1) ...
#
# for svn, I do the "educated guess" that most likely comparison of two files
# with different origin is wasted effort. only compare each file with its text-base.
#
use strict;
use warnings;
use List::Util qw(min);

# Number of bytes read per comparison step in file_content_is_identical().
use constant CHUNK_SIZE => 0x4000;

# Counters for the summary line printed at the end.
my ($total, $linked, $linked_size) = (0, 0, 0);

# The pair currently under examination.  The main loop sets $left to the
# working-copy file and $right to its text-base counterpart; the helper subs
# below read them.
my ($left, $right);

# Selected lstat() fields of $left and $right, filled in by
# link_is_possible() and used afterwards by the main loop.
my ($l_dev, $l_ino, $l_size, $l_mtime);
my ($r_dev, $r_ino, $r_size, $r_mtime);

# link_is_possible()
#
# Returns true when $left may be replaced by a hard link to $right: both
# exist and are plain files (lstat is used, so a symlink does not qualify),
# they live on the same device, they are not already the same inode, and
# they have the same size.  As a side effect the $l_* / $r_* variables are
# refreshed from lstat().
sub link_is_possible() {
    # left is there, and is a plain file
    # ("-f _" re-uses the stat buffer of the lstat just performed)
    ($l_dev, $l_ino, $l_size, $l_mtime) = (lstat $left)[0, 1, 7, 9]
        or return 0;
    -f _ or return 0;

    # right is there, and is a plain file
    ($r_dev, $r_ino, $r_size, $r_mtime) = (lstat $right)[0, 1, 7, 9]
        or return 0;
    -f _ or return 0;

    # hard links cannot cross devices, and inode numbers are only
    # comparable within one device; refusing here keeps the caller from
    # unlinking $left and then failing in link().
    return 0 if $l_dev != $r_dev;

    # they are not hardlinks of the same inode yet
    return 0 if $l_ino == $r_ino;

    # they have the same size
    return $l_size == $r_size ? 1 : 0;
}

# file_content_is_identical()
#
# Compares $left and $right chunk by chunk and returns 1 when both reach
# end-of-file together without any differing chunk, 0 otherwise (a read
# error also yields 0).  Dies if either file cannot be opened.
sub file_content_is_identical() {
    open my $lfh, '<', $left  or die "open $left: $!";
    open my $rfh, '<', $right or die "open $right: $!";

    # compare raw bytes, without any I/O layer translation
    binmode $_ for $lfh, $rfh;

    my $identical = 0;
    while (1) {
        my $lc = read $lfh, my $lbuf, CHUNK_SIZE;
        last unless defined $lc;                    # read error on left
        my $rc = read $rfh, my $rbuf, CHUNK_SIZE;
        last unless defined $rc;                    # read error on right
        last if $lc != $rc or $lbuf ne $rbuf;       # contents differ
        if ($lc == 0) {                             # both hit EOF together
            $identical = 1;
            last;
        }
    }

    close $lfh;
    close $rfh;
    return $identical;
}

#
# MAIN
#

(my $PROG = $0) =~ s,^.*/,,;
my $LEFT = shift || ".";
die "Usage: $PROG [working directory]\n" if (@ARGV);

chdir $LEFT or die "chdir $LEFT: $!\n";

# list form: no shell involved
system(qw(svn info .)) == 0
    or die "'svn info .' in $LEFT failed -- too scared to continue.\n";

# Re-open STDIN as a pipe reading the output of "svn st -qv"
# (list form again, so no shell is involved).
open STDIN, '-|', qw(svn st -qv)
    or die "open(STDIN,'svn st -qv|'): $!\n";

while (defined(my $line = <STDIN>)) {
    chomp $line;

    # strip the status columns, the two revision numbers and the author,
    # leaving only the path
    $line =~ s/^.{8}\s*\d+\s+\d+\s+\S+\s+//;
    $left = "./$line";
    -f $left or next;
    ++$total;

    # dir/file  ->  dir/.svn/text-base/file.svn-base
    ($right = $left) =~ s{^(.*/)([^/]*)$}{$1.svn/text-base/$2.svn-base};
    # print "?? $left <=> $right\n";
    next unless link_is_possible() and file_content_is_identical();

    # unlink working copy, since it can be restored by svn revert
    # I don't want to end up losing the base file after a crash between
    # unlink and link...
    unlink $left        or die "unlink $left: $!";
    link $right, $left  or die "link $right => $left: $!";
    print "ln $right $left\n";

    # both names now share one inode; give it the older of the two mtimes
    utime time, min($l_mtime, $r_mtime), $left
        or warn "utime $left: $!\n";

    ++$linked;
    $linked_size += $l_size;
}

# closing a pipe handle reports a failing child: $! is set for a system
# error, otherwise $? holds the child's wait status
close STDIN
    or warn $!
        ? "close 'svn st -qv' pipe: $!\n"
        : "'svn st -qv' exited with status " . ($? >> 8) . "\n";

print "hardlinked $linked of $total files, ($linked_size bytes)\n";