#!/usr/bin/env python
# Copyright (c) 2006, 2007 by John Szakmeister
# Copyright (c) 2008 by James Knight
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
"""Blank out the text of selected files inside FSFS revision files.

Usage: fsfs_obliterate REPO earliest_revision end_revision [FILES]*

Every revision file "REPO/db/revs/N" with
earliest_revision <= N < end_revision is walked.  File nodes whose cpath is
listed in FILES, nodes that share a text representation with such a node, and
nodes whose text is a delta against such a representation are overwritten IN
PLACE with an empty representation.  This is destructive: work on a copy.

The script targets Python 2 (revision files are read as byte strings and
compared against str literals).
"""

import os
import optparse
import sys
import re


# A handy constant for referring to the NULL digest (one that
# matches every digest).
NULL_DIGEST = '00000000000000000000000000000000'

# MD5 digest of zero bytes of data; written into truncated node revisions.
EMPTY_MD5_DIGEST = 'd41d8cd98f00b204e9800998ecf8427e'


def _readHashItem(f, header, tag):
    """Read the payload announced by a "K <len>" / "V <len>" header line.

    `header` is the already-read header line (including its newline) and
    `tag` the expected leading letter.  Returns the payload string.

    Raises ValueError when the header or the newline after the payload is
    not what the hash-dump format requires.
    """
    if header[:1] != tag:
        raise ValueError("Expected a '%s' line, got %r" % (tag, header))
    length = int(header[2:-1])
    payload = f.read(length)
    # The terminator must be consumed unconditionally.  It used to be read
    # inside an assert, which python -O strips, desynchronising the parser.
    if f.read(1) != '\n':
        raise ValueError("Missing newline after '%s' payload (%d)"
                         % (tag, f.tell()))
    return payload


def getDirHash(f):
    """Parse a PLAIN directory representation starting at f's position.

    Returns a dict mapping entry name -> (node type, NodeId).

    Raises ValueError if the representation is not PLAIN or is malformed.
    """
    l = f.readline()
    if l != 'PLAIN\n':
        raise ValueError("Expected a PLAIN representation (%d)" % f.tell())

    entries = {}
    while True:
        header = f.readline()
        if header == 'END\n':
            break
        name = _readHashItem(f, header, 'K')
        value = _readHashItem(f, f.readline(), 'V')
        (kind, txn) = value.split(' ', 1)
        entries[name] = (NodeType(kind), NodeId(txn))
    return entries


def getTextRepType(f, textrep):
    """Read the header line of the representation at textrep.offset.

    Returns ("PLAIN", None, None, None), ("DELTA", None, None, None) for a
    DELTA header without a base, or ("DELTA", a, b, c) where a, b and c are
    the three integers following "DELTA" on the header line (the caller
    treats (a, b) as the (revision, offset) of the delta base).

    Raises ValueError for any other header.
    """
    f.seek(textrep.offset)
    rep = f.readline()[:-1]
    if rep == "PLAIN":
        return "PLAIN", None, None, None
    if rep[0:5] == "DELTA":
        pieces = rep.split(' ')
        if len(pieces) == 1:
            return "DELTA", None, None, None
        return "DELTA", int(pieces[1]), int(pieces[2]), int(pieces[3])
    # String exceptions are not raisable on Python >= 2.6; use a real one.
    raise ValueError("WTFBBQ %s" % rep)


def _checkHashLine(header, tag, payload):
    """Validate that `header` ("K <len>" / "V <len>") describes `payload`."""
    if header[:1] != tag:
        raise ValueError("Expected a '%s' line, got %r" % (tag, header))
    if int(header[2:]) != len(payload):
        raise ValueError("Length mismatch for %r: header says %r"
                         % (payload, header))


def getDirHash2(f, textrep):
    """Parse the PLAIN directory representation described by `textrep`.

    Seeks to textrep.offset, reads textrep.length bytes after the header
    line and splits them into lines.  Returns a dict mapping entry name ->
    (node type, NodeId).

    Raises ValueError if the representation is not PLAIN, is malformed, or
    ends before the END marker.  (A StopIteration used to escape here, which
    silently terminates any generator this is called from.)
    """
    f.seek(textrep.offset)
    rep = f.readline()
    if rep != 'PLAIN\n':
        raise ValueError("Expected a PLAIN representation (%d)" % f.tell())

    lines = f.read(textrep.length).splitlines()
    count = len(lines)
    entries = {}
    pos = 0
    while True:
        if pos >= count:
            raise ValueError("Directory listing ended without END")
        if lines[pos] == 'END':
            break
        if pos + 4 > count:
            raise ValueError("Truncated directory entry")
        keyHeader, name, valueHeader, value = lines[pos:pos + 4]
        pos += 4
        _checkHashLine(keyHeader, 'K', name)
        _checkHashLine(valueHeader, 'V', value)
        (kind, txn) = value.split(' ', 1)
        entries[name] = (NodeType(kind), NodeId(txn))
    return entries


class Rep(object):
    """The fields of a node revision's "text:" or "props:" line.

    `type` is 'text' or 'prop'; `noderev` is the NodeRev the line belongs to.
    """
    __slots__ = ['type', 'rev', 'offset', 'length', 'size', 'digest',
                 'noderev']

    def __init__(self, type, rev, offset, length, size, digest, noderev):
        self.type = type
        self.rev = rev
        self.offset = offset
        self.length = length
        self.size = size
        self.digest = digest
        self.noderev = noderev

    def __repr__(self):
        return '%s: %d %d %d %d %s' % (self.type, self.rev, self.offset,
                                       self.length, self.size, self.digest)


class NodeId(object):
    """A node-revision id of the form "<txn_name>/<offset>".

    `rev` is parsed from the third dot-separated component of txn_name,
    skipping its first character (e.g. "0.0.r5/120" -> rev 5).
    """
    __slots__ = ['txn_name', 'offset', 'rev']

    def __init__(self, nodeid):
        (self.txn_name, offset) = nodeid.split('/')
        self.offset = int(offset)
        self.rev = int(self.txn_name.split('.')[2][1:])

    def __repr__(self):
        return '%s/%d' % (self.txn_name, self.offset)

    def __eq__(self, other):
        if not isinstance(other, NodeId):
            return NotImplemented
        return (self.txn_name == other.txn_name
                and self.offset == other.offset)

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        return hash(self.txn_name) ^ hash(self.offset)


def NodeType(t):
    """Validate a node kind; returns 'file' or 'dir', else ValueError."""
    if t == 'file':
        return 'file'
    if t == 'dir':
        return 'dir'
    raise ValueError('Invalid Node type received: "%s"' % t)


class NodeRev(object):
    """A node-revision header parsed from a revision file.

    Reads "field: value" lines starting at `offset` in `f` until a blank
    line.  For a directory whose text representation lives in `currentRev`,
    the entries are loaded into `self.dir` (name -> (type, NodeId)); in all
    other cases `self.dir` stays an empty tuple.

    Raises IOError on premature end of file and ValueError on a header line
    without a colon.
    """

    def __init__(self, f, currentRev, offset):
        self.id = None
        self.type = None
        self.pred = None
        self.text = None
        self.props = None
        self.cpath = None
        self.copyroot = None
        self.copyfrom = None
        self.dir = ()

        self.nodeOffset = offset

        f.seek(offset)

        while True:
            line = f.readline()
            if line == '':
                raise IOError("Unexpected end of file")
            if line == '\n':
                break

            # break apart the line
            try:
                (field, value) = line.split(':', 1)
            except ValueError:
                # Dump some context before propagating the parse error.
                print(repr(line))
                print(self.nodeOffset)
                print(f.tell())
                raise

            # pull off the leading space and trailing new line
            value = value[1:-1]

            if field == 'id':
                self.id = NodeId(value)
            elif field == 'type':
                self.type = NodeType(value)
            elif field == 'pred':
                self.pred = NodeId(value)
            elif field == 'text':
                (rev, offset, length, size, digest) = value.split(' ')
                self.text = Rep('text', int(rev), int(offset), int(length),
                                int(size), digest, self)
            elif field == 'props':
                (rev, offset, length, size, digest) = value.split(' ')
                self.props = Rep('prop', int(rev), int(offset), int(length),
                                 int(size), digest, self)
            elif field == 'cpath':
                self.cpath = value
            elif field == 'copyroot':
                self.copyroot = value
            elif field == 'copyfrom':
                self.copyfrom = value

        # Directory listings stored in a different revision file are
        # deliberately not loaded.
        if (self.type == 'dir' and self.text is not None
                and currentRev == self.text.rev):
            self.dir = getDirHash2(f, self.text)

    def __repr__(self):
        parts = ['NodeRev Id: %s\n type: %s' % (repr(self.id),
                                                repr(self.type))]
        if self.pred:
            parts.append(' pred: %s' % repr(self.pred))
        if self.text:
            parts.append(' %s' % repr(self.text))
        if self.props:
            parts.append(' %s' % repr(self.props))
        if self.cpath:
            parts.append(' cpath: %s' % self.cpath)
        if self.copyroot:
            parts.append(' copyroot: %s' % self.copyroot)
        if self.copyfrom:
            parts.append(' copyfrom: %s' % self.copyfrom)
        if self.dir:
            parts.append(' dir contents:')
            for k in self.dir:
                parts.append(' %s: %s' % (k, self.dir[k]))
        return '\n'.join(parts)


def getRootAndChangedPaths(revFile):
    """Return the two integers on the last line of `revFile`.

    The file is scanned backwards from just before its final byte to the
    preceding newline; the line after it is split on spaces into
    (rootNode, changedPaths).
    """
    offset = -2
    while True:
        revFile.seek(offset, 2)
        if revFile.read(1) == '\n':
            # The file position is now at the start of the last line.
            break
        offset = offset - 1

    (rootNode, changedPaths) = map(int, revFile.readline().split(' '))
    return (rootNode, changedPaths)


def rev_filename(rev):
    """Path of the revision file for `rev` under the global `svn_repo`."""
    return "%s/db/revs/%d" % (svn_repo, rev)


class NodeWalkerAll(object):
    """Walks a tree, following entries into earlier revision files.

    Entries older than the global `earliest_revision` are skipped.
    `visited_nodes` records visited directory ids; filtering on it is
    currently disabled, so nodes may be yielded more than once.
    """

    def __init__(self):
        self.visited_nodes = {}

    def iterate_revs(self, f, currentRev, offset):
        """Yield the NodeRev at `offset` and, recursively, its entries."""
        noderev = NodeRev(f, currentRev, offset)
        self.visited_nodes[noderev.id] = 1
        yield noderev

        if noderev.type != 'dir':
            return

        for e in noderev.dir:
            nodetype, nodekey = noderev.dir[e]
            opened = None
            if nodekey.rev == currentRev:
                f2 = f
            elif nodekey.rev >= earliest_revision:
                print("Reopening earlier rev!")
                f2 = opened = open(rev_filename(nodekey.rev), "rb")
            else:
                continue

            try:
                if nodetype == 'file':
                    yield NodeRev(f2, nodekey.rev, nodekey.offset)
                else:
                    for x in self.iterate_revs(f2, nodekey.rev,
                                               nodekey.offset):
                        yield x
            finally:
                # Only close handles this loop opened itself.
                if opened is not None:
                    opened.close()


class NodeWalker(object):
    """Walks only the nodes that were created in the current revision."""

    def iterate_revs(self, f, currentRev, offset):
        """Yield the NodeRev at `offset` and its same-revision descendants."""
        noderev = NodeRev(f, currentRev, offset)
        yield noderev

        if noderev.type != 'dir':
            return

        for e in noderev.dir:
            nodetype, nodekey = noderev.dir[e]
            if nodekey.rev != currentRev:
                continue
            if nodetype == 'file':
                yield NodeRev(f, nodekey.rev, nodekey.offset)
            else:
                for x in self.iterate_revs(f, nodekey.rev, nodekey.offset):
                    yield x


def _blankTextRep(revFile, textRep):
    """Overwrite the representation at textRep.offset with an empty one."""
    emptyRep = "PLAIN\nENDREP\n"
    paddedRep = "PLAIN\nENDREP\nPLAIN\n"

    offset = textRep.offset
    revFile.seek(offset, 0)
    revFile.readline()
    revFile.seek(textRep.length, 1)
    endByte = revFile.tell()

    # Refuse to write unless the data really ends where the node revision
    # says it does.  This must not be an assert: python -O would strip it.
    checkLine = revFile.readline()
    if checkLine != "ENDREP\n":
        raise ValueError("Expected ENDREP at offset %d, found %r"
                         % (endByte, checkLine))

    revFile.seek(offset, 0)
    lengthAvailable = endByte - offset
    if lengthAvailable > len(paddedRep):
        # Empty rep, then a second PLAIN header whose body is NUL padding
        # running up to the original ENDREP line.
        revFile.write(paddedRep)
        revFile.write('\x00' * (lengthAvailable - len(paddedRep)))
    else:
        if lengthAvailable != 6:
            print("WARNING: not enough space to make nicely formatted file.")
        # The output will look something like: "PLAIN\nENDREP\nREP\n",
        # because we're overlapping the previously present ENDREP which
        # wasn't counted in the available space.  That should be okay, though.
        revFile.write(emptyRep)


def _zeroTextLine(revFile, nodeOffset):
    """Rewrite the "text:" line of the node revision header at nodeOffset.

    The length and size fields are replaced by zeros of the same width and
    the digest by the MD5 of empty data, so the line keeps its byte length.
    """
    revFile.seek(nodeOffset, 0)
    textLine = None
    while True:
        savedOffset = revFile.tell()
        s = revFile.readline()
        if s == '':
            # Without this check a truncated file would loop forever.
            raise IOError("Unexpected end of file")
        if s == '\n':
            break
        if s.startswith('text:'):
            if textLine is not None:
                raise ValueError("Found two 'text:' lines!")
            textLine = savedOffset

    # Compare against None: an offset of 0 is a valid position.
    if textLine is None:
        raise ValueError("No text: line found!")

    revFile.seek(textLine, 0)
    line = revFile.readline()
    revFile.seek(textLine, 0)

    fields = line.split(' ')
    overallLength = len(line)
    fields[3] = '0' * len(fields[3])
    fields[4] = '0' * len(fields[4])
    fields[5] = EMPTY_MD5_DIGEST
    newTextRep = ' '.join(fields) + '\x0a'
    # An in-place rewrite must not change the line length.
    if len(newTextRep) != overallLength:
        raise ValueError("Rewritten text: line has length %d, expected %d"
                         % (len(newTextRep), overallLength))
    revFile.write(newTextRep)


def truncate(noderev):
    """Empty the text of `noderev` inside its revision file, in place.

    If the text representation is stored in the node's own revision file it
    is overwritten with an empty PLAIN representation.  The node revision's
    "text:" line is always rewritten to length 0, size 0 and the empty
    digest.

    Raises ValueError if the file contents do not match what the node
    revision describes, and IOError on a truncated file.
    """
    txnId = noderev.id
    revFile = open(rev_filename(noderev.id.rev), 'rb+', 512)
    try:
        print("Truncating node %s (%s)" % (txnId, noderev.cpath))

        # Grab the text rep
        textRep = noderev.text
        if textRep.rev != noderev.id.rev:
            print(" - No text rep present in this revision.")
        else:
            # Fix the text rep contents
            _blankTextRep(revFile, textRep)

        # Fix the node rev
        _zeroTextLine(revFile, noderev.nodeOffset)
    finally:
        # Close explicitly so buffered writes are flushed even on error.
        revFile.close()


if __name__ == '__main__':
    # Drop the script name so the positions match the usage string
    # (previously the script path was used as REPO and int(REPO) crashed).
    args = sys.argv[1:]
    if len(args) < 3:
        print("Usage: fsfs_obliterate REPO earliest_revision end_revision [FILES]*")
        sys.exit(1)

    svn_repo = args[0]
    earliest_revision = int(args[1])
    end_revision = int(args[2])

    # Make stderr the same as stdout.  This helps when trying to catch all of
    # the output from a run.
    sys.stderr = sys.stdout

    node_iterator = NodeWalker()
    interesting_files = set(args[3:])

    # (rev, offset) of every text representation that has been blanked or is
    # about to be; kept as a dict so the final dump keeps its format.
    interesting_locations = {}

    # end_revision itself is not processed (half-open range).
    for revision in xrange(earliest_revision, end_revision):
        if revision % 2000 == 0:
            print(revision)
        revFile = open(rev_filename(revision), 'rb', 512)
        try:
            (root, changed) = getRootAndChangedPaths(revFile)

            for noderev in node_iterator.iterate_revs(revFile, revision,
                                                      root):
                if noderev.type != 'file' or noderev.text is None:
                    continue

                truncated = False
                location = (noderev.text.rev, noderev.text.offset)

                if noderev.cpath in interesting_files:
                    print("marked node %s %s matches, %s/%s"
                          % (noderev.id, noderev.cpath,
                             noderev.text.rev, noderev.text.offset))
                    interesting_locations[location] = 1

                if location in interesting_locations:
                    print("node %s %s refs %s/%s"
                          % (noderev.id, noderev.cpath,
                             noderev.text.rev, noderev.text.offset))
                    truncate(noderev)
                    truncated = True

                if noderev.text.rev == revision:
                    textreptype = getTextRepType(revFile, noderev.text)
                    base = (textreptype[1], textreptype[2])
                    if base in interesting_locations:
                        print('nodetext of %s %s (%s/%s) refs %s/%s'
                              % (noderev.id, noderev.cpath,
                                 noderev.text.rev, noderev.text.offset,
                                 textreptype[1], textreptype[2]))
                        # Record THIS representation, not its base (which is
                        # already recorded), so that later nodes sharing it
                        # or stored as deltas against it are handled too.
                        interesting_locations[location] = 1
                        if not truncated:
                            truncate(noderev)
        finally:
            revFile.close()

    print(interesting_locations)