#!/usr/bin/env python
# Copyright (c) 2006, 2007 by John Szakmeister <john at szakmeister dot net>
# Copyright (c) 2008 by James Knight <foom@fuhm.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

import os
import optparse
import sys
import re


# A handy constant for referring to the NULL digest (one that
# matches every digest).
NULL_DIGEST = '00000000000000000000000000000000'


def getDirHash(f):
  """Parse a PLAIN directory listing from the current position of F.

  F must be positioned on the representation header line and must
  return str objects from readline() and read().  After the "PLAIN"
  header, each entry is read as four pieces:

      K <name length>
      <name>
      V <value length>
      <node type> <node id>

  and the list is terminated by an "END" line.

  Returns a dict mapping each entry name to a
  (NodeType(type), NodeId(id)) tuple.

  Raises ValueError if the header is not "PLAIN", or if the listing is
  malformed or ends before the "END" line.
  """
  header = f.readline()
  if header != 'PLAIN\n':
    raise ValueError("Expected a PLAIN representation (%d)" % f.tell())

  entries = {}

  while True:
    key_header = f.readline()
    if key_header == 'END\n':
      break
    # An empty string here means end-of-file; it fails this check too.
    if not key_header.startswith('K'):
      raise ValueError("Expected a 'K' line or END, got %r (%d)"
                       % (key_header, f.tell()))
    name = f.read(int(key_header[2:-1]))
    # The separator is consumed outside of an assert statement so that
    # the read still happens when Python runs with -O.
    if f.read(1) != '\n':
      raise ValueError("Missing newline after entry name %r (%d)"
                       % (name, f.tell()))

    value_header = f.readline()
    if not value_header.startswith('V'):
      raise ValueError("Expected a 'V' line, got %r (%d)"
                       % (value_header, f.tell()))
    value = f.read(int(value_header[2:-1]))
    if f.read(1) != '\n':
      raise ValueError("Missing newline after value of entry %r (%d)"
                       % (name, f.tell()))

    (kind, node_id) = value.split(' ', 1)
    entries[name] = (NodeType(kind), NodeId(node_id))

  return entries


def getTextRepType(f, textrep):
  """Read the header line of the representation TEXTREP in file F.

  F is repositioned to textrep.offset and one line is read from it.

  Returns a 4-tuple whose first item is "PLAIN" or "DELTA":
    - ("PLAIN", None, None, None) for a "PLAIN" header;
    - ("DELTA", None, None, None) for a bare "DELTA" header;
    - ("DELTA", a, b, c) for "DELTA a b c", where a, b and c are the
      three space-separated integers following the keyword.

  Raises ValueError if the header is neither of those, or if a DELTA
  header carries fewer than three fields.
  """
  f.seek(textrep.offset)
  rep = f.readline()[:-1]
  if rep == "PLAIN":
    return "PLAIN", None, None, None
  if rep[0:5] == "DELTA":
    pieces = rep.split(' ')
    if len(pieces) == 1:
      return "DELTA", None, None, None
    if len(pieces) < 4:
      raise ValueError("Malformed DELTA representation header %r" % rep)
    return "DELTA", int(pieces[1]), int(pieces[2]), int(pieces[3])
  # A string is not a valid exception object, so raise a real one.
  raise ValueError("Unrecognized representation header %r" % rep)
    
def getDirHash2(f, textrep):
  """Parse the PLAIN directory listing described by TEXTREP from file F.

  F is repositioned to textrep.offset, the header line is checked, and
  textrep.length characters of listing data are read in one go.  Every
  entry occupies four lines:

      K <name length>
      <name>
      V <value length>
      <node type> <node id>

  and the listing is terminated by an "END" line.  Because the data is
  split into lines, a name that itself contains a newline cannot be
  parsed and is reported as malformed.

  Returns a dict mapping each entry name to a
  (NodeType(type), NodeId(id)) tuple.

  Raises ValueError if the header is not "PLAIN", if the listing ends
  before "END", or if an entry is malformed.
  """
  f.seek(textrep.offset)
  header = f.readline()
  if header != 'PLAIN\n':
    raise ValueError("Expected a PLAIN representation (%d)" % f.tell())

  data = f.read(textrep.length)
  # Split on '\n' only: the stored lengths are verified below, so any
  # other character must be kept as part of the name or value.
  lines = data.split('\n')
  count = len(lines)

  entries = {}
  pos = 0

  while True:
    if pos < count and lines[pos] == 'END':
      break
    if pos + 4 > count:
      raise ValueError("Directory listing at offset %d ends before END"
                       % textrep.offset)
    key_header, name, value_header, value = lines[pos:pos + 4]
    pos += 4

    if not key_header.startswith('K') or int(key_header[2:]) != len(name):
      raise ValueError("Malformed directory entry name: %r / %r"
                       % (key_header, name))
    if (not value_header.startswith('V')
        or int(value_header[2:]) != len(value)):
      raise ValueError("Malformed value for directory entry %r: %r / %r"
                       % (name, value_header, value))

    (kind, node_id) = value.split(' ', 1)
    entries[name] = (NodeType(kind), NodeId(node_id))

  return entries

class Rep(object):
  """Plain record describing one representation referenced by a node.

  Attributes (all stored exactly as passed to the constructor):
    type    -- label for the kind of representation
    rev     -- revision number, formatted with %d by repr()
    offset  -- offset, formatted with %d by repr()
    length  -- length, formatted with %d by repr()
    size    -- size, formatted with %d by repr()
    digest  -- digest string
    noderev -- the object that owns this representation
  """
  __slots__ = ['type', 'rev', 'offset', 'length', 'size', 'digest', 'noderev']

  def __init__(self, type, rev, offset, length, size, digest,
               noderev):
    # The argument order matches the slot order, so pair them up.
    values = (type, rev, offset, length, size, digest, noderev)
    for slot, value in zip(Rep.__slots__, values):
      setattr(self, slot, value)

  def __repr__(self):
    # Everything except the back-reference to the owner is shown.
    numbers = ' '.join(['%d' % number for number in
                        (self.rev, self.offset, self.length, self.size)])
    return '%s: %s %s' % (self.type, numbers, self.digest)


class NodeId(object):
  """A node revision id parsed from its "<txn_name>/<offset>" string form.

  Attributes:
    txn_name -- the part of the id before the '/'
    offset   -- the integer after the '/'
    rev      -- the integer found in the third dot-separated component
                of txn_name after skipping that component's first
                character (presumably an 'r' prefix, as in "0.0.r5")

  Two ids are equal when their txn_name and offset are equal; the hash
  is consistent with that.
  """
  __slots__ = ['txn_name', 'offset', 'rev']

  def __init__(self, nodeid):
    (self.txn_name, offset) = nodeid.split('/')
    self.offset = int(offset)
    self.rev = int(self.txn_name.split('.')[2][1:])

  def __repr__(self):
    return '%s/%d' % (self.txn_name, self.offset)

  def __eq__(self, other):
    # Let Python fall back to its default comparison for foreign types
    # instead of failing on a missing attribute.
    if not isinstance(other, NodeId):
      return NotImplemented
    return self.txn_name == other.txn_name and self.offset == other.offset

  def __ne__(self, other):
    # Python 2 does not derive != from ==, so define it explicitly.
    result = self.__eq__(other)
    if result is NotImplemented:
      return result
    return not result

  def __hash__(self):
    return hash(self.txn_name) ^ hash(self.offset)

def NodeType(t):
  """Validate the node type string T and return it.

  Returns 'file' or 'dir' when T equals one of those strings.

  Raises ValueError for any other value.
  """
  if t == 'file':
    return 'file'
  if t == 'dir':
    return 'dir'
  # Wrap T in a tuple so that a tuple argument is reported with this
  # ValueError, not with a string-formatting TypeError.
  raise ValueError('Invalid Node type received: "%s"' % (t,))

class NodeRev(object):
  """A node revision record parsed from a revision file.

  The record is a sequence of "field: value" lines terminated by an
  empty line.  The recognised fields populate these attributes:

    id       -- NodeId built from the 'id' field (required)
    type     -- 'file' or 'dir', from the 'type' field (required)
    pred     -- NodeId from the 'pred' field, or None
    text     -- Rep built from the 'text' field, or None
    props    -- Rep built from the 'props' field, or None
    cpath    -- value of the 'cpath' field, or None
    copyroot -- value of the 'copyroot' field, or None
    copyfrom -- value of the 'copyfrom' field, or None
    dir      -- for a directory whose text representation belongs to the
                revision being read, the dict returned by getDirHash2();
                otherwise an empty tuple
    nodeOffset -- the offset the record was read from

  Unrecognised fields are ignored.
  """
  def __init__(self, f, currentRev, offset):
    """Read the record found at OFFSET in the open revision file F.

    CURRENTREV is the revision number F belongs to; it decides whether
    the directory contents can be loaded from F.

    Raises IOError if F ends before the record does, and ValueError if
    a line has no ':' separator, a field value is malformed, or the
    record lacks an 'id' or 'type' field.
    """
    self.id = None
    self.type = None
    self.pred = None
    self.text = None
    self.props = None
    self.cpath = None
    self.copyroot = None
    self.copyfrom = None
    self.dir = ()

    self.nodeOffset = offset
    f.seek(offset)

    while True:
      line = f.readline()
      if line == '':
        raise IOError("Unexpected end of file")
      if line == '\n':
        break

      # break apart the line
      try:
        (field, value) = line.split(':', 1)
      except ValueError:
        raise ValueError("Malformed line %r in node revision at offset %d "
                         "(file position %d)"
                         % (line, self.nodeOffset, f.tell()))

      # pull off the leading space and trailing new line
      value = value[1:-1]

      if field == 'id':
        self.id = NodeId(value)
      elif field == 'type':
        self.type = NodeType(value)
      elif field == 'pred':
        self.pred = NodeId(value)
      elif field == 'text':
        (rev, repOffset, length, size, digest) = value.split(' ')
        self.text = Rep('text', int(rev), int(repOffset), int(length),
                        int(size), digest, self)
      elif field == 'props':
        (rev, repOffset, length, size, digest) = value.split(' ')
        self.props = Rep('prop', int(rev), int(repOffset), int(length),
                         int(size), digest, self)
      elif field == 'cpath':
        self.cpath = value
      elif field == 'copyroot':
        self.copyroot = value
      elif field == 'copyfrom':
        self.copyfrom = value

    if self.id is None or self.type is None:
      raise ValueError("Node revision at offset %d has no 'id' or 'type' "
                       "field" % self.nodeOffset)

    # Directory contents are only loaded when they are stored in F
    # itself; a listing kept in another revision file is left unread
    # and self.dir stays empty.
    if self.type == 'dir' and self.text and currentRev == self.text.rev:
      self.dir = getDirHash2(f, self.text)

  def __repr__(self):
    """Return a multi-line description listing every field that is set."""
    parts = ['NodeRev Id: %s' % repr(self.id),
             ' type: %s' % repr(self.type)]
    if self.pred:
      parts.append(' pred: %s' % repr(self.pred))
    if self.text:
      parts.append(' %s' % repr(self.text))
    if self.props:
      parts.append(' %s' % repr(self.props))
    if self.cpath:
      parts.append(' cpath: %s' % self.cpath)
    if self.copyroot:
      parts.append(' copyroot: %s' % self.copyroot)
    if self.copyfrom:
      parts.append(' copyfrom: %s' % self.copyfrom)
    if self.dir:
      parts.append(' dir contents:')
      for k in self.dir:
        parts.append('  %s: %s' % (k, self.dir[k]))
    return '\n'.join(parts)


def getRootAndChangedPaths(revFile):
  """Read the two integers stored on the last line of REVFILE.

  The file is scanned backwards from its second-to-last character until
  a newline is found; the line that follows it is then split on single
  spaces and both pieces are converted to int.

  Returns the pair as a (rootNode, changedPaths) tuple.
  """
  distance = 2
  while True:
    # Whence 2 means "relative to the end of the file".
    revFile.seek(-distance, 2)
    if revFile.read(1) == '\n':
      break
    distance += 1

  # The read above left the file positioned at the start of the last line.
  trailer = revFile.readline()
  rootNode, changedPaths = [int(piece) for piece in trailer.split(' ')]

  return (rootNode, changedPaths)


def rev_filename(rev):
  """Return the path of the file holding revision REV.

  The file is looked up directly under "db/revs" (no intermediate
  subdirectory) inside the repository named by the module-level
  svn_repo variable, which the script sets from its command line.
  """
  revs_dir = "%s/db/revs" % (svn_repo,)
  return "%s/%d" % (revs_dir, rev)

class NodeWalkerAll(object):
  """Walk a node revision tree, following entries into earlier revisions.

  Unlike NodeWalker, entries that belong to a revision other than the
  one being read are followed too, as long as that revision is not
  older than the module-level earliest_revision.

  Attributes:
    visited_nodes -- dict keyed by the id of every node revision that
                     iterate_revs() itself parsed (file entries yielded
                     directly from a directory listing are not recorded)
  """
  def __init__(self):
    self.visited_nodes = {}

  def iterate_revs(self, f, currentRev, offset):
    """Yield the NodeRev at OFFSET in F, then everything reachable from it.

    F is the open file of revision CURRENTREV.  Entries stored in an
    earlier revision are read from that revision's own file, which is
    opened on demand and closed again once the entry has been walked.
    """
    noderev = NodeRev(f, currentRev, offset)
    self.visited_nodes[noderev.id] = 1
    yield noderev

    if noderev.type != 'dir':
      return

    for name in noderev.dir:
      nodetype, nodekey = noderev.dir[name]

      reopened = None
      if nodekey.rev == currentRev:
        f2 = f
      elif nodekey.rev >= earliest_revision:
        print("Reopening earlier rev!")
        f2 = reopened = open(rev_filename(nodekey.rev), "rb")
      else:
        continue

      try:
        if nodetype == 'file':
          yield NodeRev(f2, nodekey.rev, nodekey.offset)
        else:
          for x in self.iterate_revs(f2, nodekey.rev, nodekey.offset):
            yield x
      finally:
        # NodeRev objects keep no reference to the file they were read
        # from, so a file opened here can be closed as soon as its
        # subtree has been walked.
        if reopened is not None:
          reopened.close()

class NodeWalker(object):
  """Walk the node revisions of a single revision file."""

  def iterate_revs(self, f, currentRev, offset):
    """Yield the NodeRev at OFFSET in F, then its same-revision descendants.

    F is the open file of revision CURRENTREV.  Directory entries whose
    node id belongs to any other revision are skipped; file entries are
    yielded directly and directory entries are walked recursively.
    """
    node = NodeRev(f, currentRev, offset)
    yield node

    if node.type != 'dir':
      return

    for name in node.dir:
      kind, child = node.dir[name]
      if child.rev == currentRev:
        if kind == 'file':
          yield NodeRev(f, child.rev, child.offset)
        else:
          for descendant in self.iterate_revs(f, child.rev, child.offset):
            yield descendant

def truncate(noderev):
  """Destructively empty the text of NODEREV in its revision file.

  The revision file of noderev.id.rev is edited in place, in two steps:

  1. If the text representation is stored in that same revision, its
     bytes are overwritten with an empty PLAIN representation.  Any
     space left over is filled with a dummy "PLAIN" header followed by
     NUL bytes, so that the original ENDREP line still terminates it
     and no later offset in the file moves.
  2. The 'text:' line of the node revision record is rewritten with
     zeroed length and size fields and the MD5 digest of empty input,
     keeping the line exactly as long as it was.

  NODEREV must have a text representation (noderev.text is not None).

  Raises ValueError if the file does not look as expected: no ENDREP
  after the representation, a missing or duplicated 'text:' line, end
  of file inside the record, or a rewritten line of a different length.
  """
  txnId = noderev.id
  revFile = open(rev_filename(noderev.id.rev), 'rb+', 512)
  try:
    print("Truncating node %s (%s)" % (txnId, noderev.cpath))

    # Grab the text rep
    textRep = noderev.text

    if textRep.rev != noderev.id.rev:
      print(" - No text rep present in this revision.")
    else:
      # Fix the text rep contents
      offset = textRep.offset
      revFile.seek(offset, 0)
      revFile.readline()
      revFile.seek(textRep.length, 1)
      endByte = revFile.tell()
      checkLine = revFile.readline()
      # This guards a destructive write, so it must not be an assert
      # (asserts are stripped when Python runs with -O).
      if checkLine != "ENDREP\n":
        raise ValueError("Expected ENDREP at offset %d, found %r"
                         % (endByte, checkLine))

      revFile.seek(offset, 0)

      lengthAvailable = endByte - offset

      filler = 'PLAIN\nENDREP\nPLAIN\n'
      if lengthAvailable > len(filler):
        revFile.write(filler)
        revFile.write('\x00' * (lengthAvailable - len(filler)))
      else:
        if lengthAvailable != 6:
          print("WARNING: not enough space to make nicely formatted file.")
        # The output will look something like: "PLAIN\nENDREP\nREP\n",
        # because we're overlapping the previously present ENDREP which
        # wasn't counted in the available space. That should be okay, though.
        revFile.write("PLAIN\nENDREP\n")

    # Fix the node rev
    revFile.seek(noderev.nodeOffset, 0)

    textLine = None
    while True:
      savedOffset = revFile.tell()
      s = revFile.readline()
      if s == '':
        # Without this check the loop would spin forever at end of file.
        raise ValueError("Unexpected end of file in node %s" % txnId)
      if s == '\n':
        break
      if s.startswith('text:'):
        if textLine is not None:
          raise ValueError("Found two 'text:' lines!")
        textLine = savedOffset

    if textLine is None:
      raise ValueError("No text: line found!")

    revFile.seek(textLine, 0)

    line = revFile.readline()
    revFile.seek(textLine, 0)
    fields = line.split(' ')
    overallLength = len(line)

    # fields: 'text:', rev, offset, length, size, digest + newline.
    # Length and size are zeroed with the same number of digits.
    fields[3] = '0' * len(fields[3])
    fields[4] = '0' * len(fields[4])
    # MD5 digest of empty input.
    fields[5] = 'd41d8cd98f00b204e9800998ecf8427e'
    newTextRep = ' '.join(fields) + '\x0a'
    if len(newTextRep) != overallLength:
      raise ValueError("Rewritten text: line is %d bytes long, expected %d"
                       % (len(newTextRep), overallLength))
    revFile.write(newTextRep)
  finally:
    # Closing also flushes the buffered writes made above.
    revFile.close()


if __name__ == '__main__':
  # Command line: fsfs_obliterate REPO earliest_revision end_revision [FILES]*
  #
  # Every revision from earliest_revision up to, but not including,
  # end_revision is scanned.  The text of each file node whose cpath is
  # one of FILES is truncated, and so is the text of every file node
  # that shares a truncated text representation or is stored as a delta
  # against one.

  # Drop the program name so that the indices below line up with the
  # usage string.
  args = sys.argv[1:]

  if len(args) < 3:
    print("Usage: fsfs_obliterate REPO earliest_revision end_revision [FILES]*")
    sys.exit(1)

  # svn_repo and earliest_revision are module globals read by
  # rev_filename() and NodeWalkerAll.
  svn_repo = args[0]
  earliest_revision = int(args[1])
  end_revision = int(args[2])

  # Make stderr the same as stdout.  This helps when trying to catch all of the
  # output from a run.
  sys.stderr = sys.stdout

  node_iterator = NodeWalker()

  interesting_files = dict.fromkeys(args[3:], 1)

  # Keys are the (revision, offset) pairs of text representations that
  # have been obliterated.
  interesting_locations = {}
  for revision in xrange(earliest_revision, end_revision):
    if revision % 2000 == 0:
      print(revision)
    revFile = open(rev_filename(revision), 'rb', 512)
    try:
      (root, changed) = getRootAndChangedPaths(revFile)

      for noderev in node_iterator.iterate_revs(revFile, revision, root):
        truncated = 0

        if noderev.type == 'file' and noderev.text is not None:
          if noderev.cpath in interesting_files:
            print("marked node %s %s matches, %s/%s" % (noderev.id, noderev.cpath, noderev.text.rev, noderev.text.offset))
            interesting_locations[(noderev.text.rev, noderev.text.offset)] = 1

          if (noderev.text.rev, noderev.text.offset) in interesting_locations:
            print("node %s %s refs %s/%s" % (noderev.id, noderev.cpath, noderev.text.rev, noderev.text.offset))
            truncate(noderev)
            truncated = 1

          if noderev.text.rev == revision:
            textreptype = getTextRepType(revFile, noderev.text)
            if (textreptype[1], textreptype[2]) in interesting_locations:
              print('nodetext of %s %s (%s/%s) refs %s/%s' % (noderev.id, noderev.cpath, noderev.text.rev, noderev.text.offset, textreptype[1], textreptype[2]))
              # NOTE(review): this key is already present (see the test
              # just above), so the assignment changes nothing.  It
              # looks like (noderev.text.rev, noderev.text.offset) may
              # have been intended, so that deltas against this node
              # are caught too -- confirm before changing.
              interesting_locations[(textreptype[1], textreptype[2])] = 1
              if not truncated:
                truncate(noderev)
    finally:
      revFile.close()

  print(interesting_locations)

