[svn.haxx.se] · SVN Dev · SVN Users · SVN Org · TSVN Dev · TSVN Users · Subclipse Dev · Subclipse Users · this month's index

Stripping 'charset=' from po files [the sequel]

From: Erik Huelsmann <e.huelsmann_at_gmx.net>
Date: 2004-05-13 20:13:52 CEST

In order to prevent charset conversion by 'smart' gettext implementations
our build system has to strip out the 'charset=UTF-8' string in the
administrative section of po files. The Makefile based system currently
does this by ripping out the entire 'Content-Type' line using 'sed'.

The Windows (python based) build system does not provide sed. To work
around that I wrote the general python based po parser included below. A
separate script does the real stripping. This also provides the (cleaner)
solution to only examine the admin section.

There are several questions to be answered before proceeding:

1) We don't want to use the same script for the Makefile build (adding a new
dependency), do we?

2)
 a) Do we want the po parser in the Subversion repository?
 b) If so: where?

3) Do you have any comments to either script? (the strip charset script has
to be extended to include plural support before this code can be committed)

bye,

Erik.

start of the parser ===============
import string


class PoSink:
  """Base receiver for the pieces of a po file.

  Every callback except recv_domain() is a no-op here; subclasses
  override the ones they are interested in.
  """

  def __init__(self):
    # No domain has been received yet.
    self.domain = None

  def recv_domain(self, domain):
    """Remember `domain` in self.domain."""
    self.domain = domain

  def recv_simple_msg(self, pre_comment, msgid, msgstr):
    """Receive a message without plural forms; ignored by default."""
    return None

  def recv_plural_msg(self, pre_comment, msgid, plural, msgstr_order,
                      msgstrs):
    """Receive a message with plural forms; ignored by default."""
    return None

  def finish_parse(self):
    """End-of-parse hook; does nothing by default."""
    return None


# implement a token-parser
#
# the tokens are defined as follows (<EOL> is the end of a line,
# <EOF> the end of the input)
#
# COMMENT : #(.*)(<EOL>|<EOF>)
# STRING : "<any character including escaped ">*"
# INDEX : '[' NUMBER ']'
# NUMBER : [0-9]+
# other : [a-zA-Z0-9_]+

TOKEN_CHUNK_SIZE = 100 * 1024 # 100kiB
# string.ascii_letters matches the [a-zA-Z0-9_] definition of the "other"
# token; string.letters is locale dependent and only exists in Python 2.
OTHER_TOKEN_CHARS = string.ascii_letters + string.digits + '_'

class PoTokens:
  """Split the text of a po file into tokens.

  `inp` is any object with a read(size) method that returns text and
  returns "" once the input is exhausted.  The input is read in chunks
  of TOKEN_CHUNK_SIZE characters; tokens may span chunk boundaries.

  get() returns one token per call:

    comment  '#' and everything up to (not including) the end of line
    string   a double quoted string, quotes and escapes left untouched
    index    '[' NUMBER ']', returned without any embedded whitespace
    other    a run of letters, digits and underscores

  and "" when the end of the input has been reached (repeatedly, if
  called again).  Malformed input raises ValueError.
  """

  def __init__(self, inp):
    self.inp = inp
    self.buf = inp.read(TOKEN_CHUNK_SIZE)
    self.idx = 0       # position in self.buf of the next unread character
    self.ungot = []    # tokens handed back through unget(), newest last

  def _peek(self):
    """Return the next unread character without consuming it.

    Refills the buffer when it is used up; returns "" at end of input.
    """
    if self.idx >= len(self.buf):
      if not self.buf:
        # an empty buffer means the end of input was already seen
        return ""

      self.buf = self.inp.read(TOKEN_CHUNK_SIZE)
      self.idx = 0
      if not self.buf:
        return ""

    return self.buf[self.idx]

  def _take_while(self, chars):
    """Consume and return the longest run of characters found in `chars`."""
    taken = []
    ch = self._peek()
    # `ch and` is required: "" (EOF) is 'in' every string
    while ch and ch in chars:
      taken.append(ch)
      self.idx += 1
      ch = self._peek()

    return "".join(taken)

  def _get_string(self):
    """Consume a quoted string; the opening quote is the next character."""
    taken = [ '"' ]
    self.idx += 1

    while 1:
      ch = self._peek()
      if not ch:
        raise ValueError("Unexpected EOF; unterminated string.")

      taken.append(ch)
      self.idx += 1

      if ch == "\\":
        # A backslash always escapes exactly the next character, so an
        # escaped backslash in front of the closing quote ("...\\") does
        # not hide that quote.
        ch = self._peek()
        if not ch:
          raise ValueError("Unexpected EOF; unterminated string.")

        taken.append(ch)
        self.idx += 1

      elif ch == '"':
        return "".join(taken)

  def _get_comment(self):
    """Consume a comment up to, but not including, the end of the line."""
    taken = []
    ch = self._peek()
    while ch and ch != "\n":
      taken.append(ch)
      self.idx += 1
      ch = self._peek()

    return "".join(taken)

  def _get_index(self):
    """Consume a msgstr "[INDEX]"; the '[' is the next character."""
    self.idx += 1
    self._take_while(string.whitespace)
    number = self._take_while(string.digits)
    self._take_while(string.whitespace)

    ch = self._peek()
    if not ch:
      raise ValueError("Unexpected EOF while parsing a msgstr INDEX")
    if ch != ']' or not number:
      raise ValueError("Unexpected character while parsing a msgstr INDEX")

    self.idx += 1
    return "[" + number + "]"

  def get(self):
    """Return the next token, or "" when the input is exhausted."""
    if self.ungot:
      return self.ungot.pop()

    self._take_while(string.whitespace)

    ch = self._peek()
    if not ch:
      return ""

    if ch == '"':
      return self._get_string()

    if ch == "#":
      return self._get_comment()

    if ch == '[':
      return self._get_index()

    if ch in OTHER_TOKEN_CHARS:
      # Stops in front of the first character that is not part of the
      # token, so a directly following '[' (as in msgstr[0]) is kept.
      return self._take_while(OTHER_TOKEN_CHARS)

    # unknown token starting character
    raise ValueError("Unexpected character in input stream (%s)" % ch)

  def unget(self, token):
    """Hand `token` back; the next get() returns it again.

    Tokens handed back more than once in a row come out in reverse
    order (last in, first out).
    """
    self.ungot.append(token)


def parse(inp, sink):
  """Parse the tokens delivered by `inp` and report the result to `sink`.

  `inp` provides get(), returning the next token ("" at end of input),
  and unget(token), which makes the next get() return `token` again.

  `sink` receives, in input order:

    recv_domain(name)                   for every 'domain' directive
    recv_simple_msg(comments, msgid, msgstr)
                                        for every msgid/msgstr pair
    recv_plural_msg(comments, msgid, msgid_plural, indices, msgstrs)
                                        for every message with plurals
    finish_parse()                      once, when the input is exhausted

  msgid, msgid_plural and msgstr are lists of string tokens (quotes
  included), `comments` is the list of comment tokens seen since the
  previous message, `indices` lists the msgstr indices (as strings) in
  order of appearance and `msgstrs` maps each index to its list of
  string tokens.

  Raises ValueError when the tokens do not form a valid po file.
  """

  # characters allowed at the start of a domain name
  word_chars = string.ascii_letters + string.digits + '_'

  def get_msg_argument(arg_to):
    """Collect the consecutive string tokens that form one argument."""
    rv = []
    token = inp.get()
    # startswith() instead of token[0]: the EOF token is the empty string
    while token.startswith('"'):
      rv.append(token)
      token = inp.get()

    inp.unget(token)

    if not rv:
      raise ValueError("Expected %s argument found other token instead"
                       % arg_to)

    return rv

  comment = []
  while 1:
    token = inp.get()

    if not token: # EOF
      sink.finish_parse()
      return

    if token.startswith('#'):
      comment.append(token)
      continue

    keyword = token.lower()

    if keyword == 'domain':
      token = inp.get()

      if not token or token[0] not in word_chars:
        raise ValueError("Invalid token where domain name expected")

      sink.recv_domain(token)
      continue

    if keyword != 'msgid':
      raise ValueError("Unknown token (%s)" % token)

    msgid = get_msg_argument('msgid')
    token = inp.get()

    if token.lower() != 'msgid_plural':
      # simple message: exactly one msgstr has to follow
      if token.lower() != 'msgstr':
        raise ValueError("Unexpected token where 'msgstr' expected (%s)"
                         % token)

      sink.recv_simple_msg(comment, msgid, get_msg_argument('msgstr'))

      comment = []
      continue

    # plural message: one or more 'msgstr[INDEX]' arguments follow
    msgid_plural = get_msg_argument('msgid_plural')
    msgstr_indices = []
    msgstrs = {}

    token = inp.get()
    while token.lower() == 'msgstr':
      token = inp.get()

      if not token.startswith('['):
        raise ValueError("msgstr INDEX expected when msgid_plural defined")

      index = token[1:-1]
      msgstr_indices.append(index)
      msgstrs[index] = get_msg_argument('msgstr[INDEX]')

      token = inp.get()

    if not msgstr_indices:
      raise ValueError("msgstr expected after msgid_plural")

    # the token that ended the msgstr list belongs to the next entry
    inp.unget(token)

    sink.recv_plural_msg(comment, msgid, msgid_plural,
                         msgstr_indices, msgstrs)

    # start collecting comments for the next message
    comment = []

end of the parser ===============

start of the strip script ===============
#!/usr/bin/env python

import sys, poparse
import getopt

class CharsetStrippingSink(poparse.PoSink):
  """Sink that writes every received message to `out` in po syntax.

  The header entry (the message with the empty msgid) is written without
  the msgstr line that contains 'charset='; all other messages, plural
  ones included, are written unchanged.
  """

  def __init__(self, out):
    poparse.PoSink.__init__(self)
    self.out = out

  def _write_comments(self, pre_comment):
    """Write the comment lines preceding a message."""
    for line in pre_comment:
      self.out.write("%s\n" % line)

  def _write_arg(self, keyword, lines):
    """Write `keyword` followed by its string lines.

    Only the first line carries the keyword.  When no lines are left an
    empty string is written, so the entry stays syntactically valid.
    """
    prefix = keyword + " "
    for line in lines or [ '""' ]:
      self.out.write("%s%s\n" % (prefix, line))
      prefix = ""

  def recv_simple_msg(self, pre_comment, msgid, msgstr):
    """Write a simple message, stripping the charset from the header."""
    if msgid == [ '""' ]:
      # Drop the last line mentioning the charset.  Work on a copy so the
      # caller's list is left alone.
      # note that the charset could be split over lines
      for i in range(len(msgstr) - 1, -1, -1):
        if "charset=" in msgstr[i]:
          msgstr = msgstr[:i] + msgstr[i+1:]
          break

    self._write_comments(pre_comment)
    self._write_arg("msgid", msgid)
    self._write_arg("msgstr", msgstr)

  def recv_plural_msg(self, pre_comment, msgid, plural, msgstr_order,
                      msgstrs):
    """Write a message with plural forms unchanged."""
    self._write_comments(pre_comment)
    self._write_arg("msgid", msgid)
    self._write_arg("msgid_plural", plural)
    for index in msgstr_order:
      self._write_arg("msgstr[%s]" % index, msgstrs[index])

  def finish_parse(self):
    pass

def strip_it(infile, outfile):
  """Parse the po data in `infile` and write it, with the charset
  stripped from the header, to `outfile`."""
  tokens = poparse.PoTokens(infile)
  sink = CharsetStrippingSink(outfile)
  poparse.parse(tokens, sink)

def main():
  """Command line entry point: strip the charset from a po file.

  Arguments (taken from sys.argv): INFILE [OUTFILE].  '-' stands for
  standard input respectively standard output; OUTFILE defaults to
  standard output.  Exits with status 2 on a usage error.
  """

  usage = ("usage: %s INFILE|- [OUTFILE|-]\n"
           "Copy the po file INFILE to OUTFILE, stripping 'charset=' "
           "from its header.\n" % sys.argv[0])

  try:
    opts, args = getopt.getopt(sys.argv[1:], '', [])
  except getopt.GetoptError:
    args = []

  if len(args) < 1:
    # stderr, because stdout may be the destination of the po data
    sys.stderr.write(usage)
    sys.exit(2)

  infile = sys.stdin
  outfile = sys.stdout
  try:
    if args[0] != '-':
      infile = open(args[0],'r')

    if len(args) >= 2 and args[1] != '-':
      outfile = open(args[1],'w')

    strip_it(infile, outfile)
  finally:
    # close only what was opened here, never the standard streams
    if outfile is not sys.stdout:
      outfile.close()
    if infile is not sys.stdin:
      infile.close()

# Run the command line interface only when executed as a script, not
# when imported as a module.
if __name__ == '__main__':
  main()
end of the strip script ===============

-- 
NEU : GMX Internet.FreeDSL
Ab sofort DSL-Tarif ohne Grundgebühr: http://www.gmx.net/dsl
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org
Received on Thu May 13 20:14:23 2004

This is an archived mail posted to the Subversion Dev mailing list.

This site is subject to the Apache Privacy Policy and the Apache Public Forum Archive Policy.