Stripping 'charset=' from po files [the sequel]

From: Erik Huelsmann <e.huelsmann_at_gmx.net>
Date: 2004-05-13 20:13:52 CEST

In order to prevent charset conversion by 'smart' gettext implementations
our build system has to strip out the 'charset=UTF-8' string in the
administrative section of po files. The Makefile based system currently
does this by ripping out the entire 'Content-Type' line using 'sed'.

The Windows (python based) build system does not provide sed. To work
around that I wrote the general python based po parser included below. A
separate script does the real stripping. This also provides the (cleaner)
solution of examining only the admin section.

There are several questions to be answered before proceeding:

1) We don't want to use the same script for the Makefile build (adding a new
dependency), do we?

 a) Do we want the po parser in the Subversion repository?
 b) If so: where?

3) Do you have any comments on either script? (the strip charset script has
to be extended to include plural support before this code can be committed)



start of the parser ===============
import string

class PoSink:
  """Receiver for the events reported while a po file is being parsed.

  This base class only remembers the most recently announced domain and
  ignores every other event.  Subclasses override the callbacks they are
  interested in.
  """

  def __init__(self):
    # Name given to the last recv_domain() call; None until there is one.
    self.domain = None

  def recv_domain(self, domain):
    """Remember `domain` as the current message domain."""
    self.domain = domain

  def recv_simple_msg(self, pre_comment, msgid, msgstr):
    """Handle a message without plural forms; ignored by default.

    pre_comment -- list of comment tokens that preceded the message
    msgid       -- list of string tokens making up the msgid
    msgstr      -- list of string tokens making up the msgstr
    """
    pass

  def recv_plural_msg(self, pre_comment, msgid, plural, msgstr_order,
                      msgstrs):
    """Handle a message with plural forms; ignored by default.

    pre_comment  -- list of comment tokens that preceded the message
    msgid        -- list of string tokens making up the msgid
    plural       -- list of string tokens making up the msgid_plural
    msgstr_order -- msgstr indices, in the order they appeared
    msgstrs      -- dictionary mapping each index to its list of string
                    tokens
    """
    pass

  def finish_parse(self):
    """Hook for end-of-input processing; does nothing by default."""
    pass

# Tokenizer for po files.  The recognised tokens are:
#
#   COMMENT : '#' and everything up to (not including) <EOL> or <EOF>
#   STRING  : '"' <any characters, including backslash escaped ones> '"'
#   INDEX   : '[' NUMBER ']'   (whitespace inside the brackets is dropped)
#   NUMBER  : [0-9]+
#   other   : [a-zA-Z0-9_]+

TOKEN_CHUNK_SIZE = 100 * 1024 # 100kiB
# ascii_letters instead of the locale dependent string.letters, so that the
# set really is [a-zA-Z0-9_] as documented above.
OTHER_TOKEN_CHARS = string.ascii_letters + string.digits + '_'

class PoTokens:
  """Split the text of a po file into tokens, reading it chunk by chunk.

  `inp` must provide a read(size) method which returns text and returns
  an empty string at the end of the input.  `chunk_size` is the size
  passed to every read() call; it defaults to TOKEN_CHUNK_SIZE.

  Tokens may span any number of chunks.
  """

  def __init__(self, inp, chunk_size=None):
    if chunk_size is None:
      chunk_size = TOKEN_CHUNK_SIZE

    self.inp = inp
    self.chunk_size = chunk_size
    self.buf = inp.read(chunk_size)
    self.idx = 0       # position in self.buf of the next unread character
    self._ungot = []   # tokens handed back through unget(), newest last

  def _fill(self):
    """Make sure self.buf[self.idx] exists; return False at end of input."""
    if self.idx < len(self.buf):
      return True

    # An empty buffer means read() already reported the end of the input;
    # don't call read() again after that.
    if self.buf:
      self.buf = self.inp.read(self.chunk_size)
      self.idx = 0

    return len(self.buf) > 0

  def _take_while(self, chars):
    """Consume and return the longest run of characters found in `chars`."""
    pieces = []
    while self._fill():
      end = self.idx
      while end < len(self.buf) and self.buf[end] in chars:
        end += 1

      pieces.append(self.buf[self.idx:end])
      self.idx = end

      if end < len(self.buf):
        # stopped on a character which is not part of the run
        break

    return "".join(pieces)

  def _get_string(self):
    """Consume a STRING token; self.buf[self.idx] is the opening quote."""
    pieces = []
    start = self.idx
    pos = start + 1
    while 1:
      if pos >= len(self.buf):
        # Out of buffered data.  `pos` can be one past the end when a
        # backslash was the last character of the chunk; carry that over
        # so the escaped character in the next chunk is still skipped.
        pos -= len(self.buf)
        pieces.append(self.buf[start:])
        self.idx = len(self.buf)
        if not self._fill():
          raise ValueError("Unexpected EOF; unterminated string.")
        start = 0
        continue

      char = self.buf[pos]
      if char == '"':
        self.idx = pos + 1
        pieces.append(self.buf[start:self.idx])
        return "".join(pieces)

      if char == "\\":
        # skip the escaped character, whatever it is (it may be a quote
        # or another backslash)
        pos += 2
      else:
        pos += 1

  def _get_comment(self):
    """Consume a COMMENT token; self.buf[self.idx] is the '#'."""
    pieces = []
    while self._fill():
      end = self.buf.find("\n", self.idx)
      if end >= 0:
        # The newline itself is left in the buffer; the next get() skips
        # it together with the other whitespace.
        pieces.append(self.buf[self.idx:end])
        self.idx = end
        break

      pieces.append(self.buf[self.idx:])
      self.idx = len(self.buf)

    return "".join(pieces)

  def _get_index(self):
    """Consume an INDEX token; self.buf[self.idx] is the '['."""
    self.idx += 1
    self._take_while(string.whitespace)
    number = self._take_while(string.digits)
    self._take_while(string.whitespace)

    if not self._fill():
      raise ValueError("Unexpected EOF while parsing a msgstr INDEX")
    if not number:
      raise ValueError("Number expected while parsing a msgstr INDEX")
    if self.buf[self.idx] != ']':
      raise ValueError("Unexpected character while parsing a msgstr INDEX")

    self.idx += 1
    return "[" + number + "]"

  def get(self):
    """Return the next token, or "" when the input is exhausted.

    STRING tokens are returned with their quotes and with their escape
    sequences untouched, COMMENT tokens start with '#' and do not include
    the line terminator, INDEX tokens are returned as '[' NUMBER ']'
    without any whitespace.

    Raises ValueError for an unterminated string, a malformed INDEX or a
    character which cannot start any token.
    """
    if self._ungot:
      return self._ungot.pop()

    # skip initial whitespace
    self._take_while(string.whitespace)
    if not self._fill():
      return ""

    first = self.buf[self.idx]

    # string "token"
    if first == '"':
      return self._get_string()

    # comment "token"
    if first == "#":
      return self._get_comment()

    # msgstr "[INDEX]" "token"
    if first == "[":
      return self._get_index()

    # character series token
    if first in OTHER_TOKEN_CHARS:
      return self._take_while(OTHER_TOKEN_CHARS)

    # unknown token starting character
    raise ValueError("Unexpected character in input stream (%s)" % first)

  def unget(self, token):
    """Hand `token` back; the next get() call returns it again.

    Tokens handed back by several unget() calls in a row are returned
    most recent first.
    """
    self._ungot.append(token)

def parse(inp, sink):
  """Parse the token stream `inp` and report what is found to `sink`.

  inp  -- token source providing get(), which returns the next token or ""
          at the end of the input, and unget(token), which makes the next
          get() return `token` again (a PoTokens object, for example)
  sink -- object receiving the results through its recv_domain(),
          recv_simple_msg(), recv_plural_msg() and finish_parse() methods

  Comment tokens are collected and handed over together with the first
  message that follows them.  finish_parse() is called once, when the end
  of the input is reached.

  Raises ValueError when the tokens do not form a valid po file.
  """

  name_chars = string.ascii_letters + string.digits + '_'

  def get_msg_argument(arg_to):
    # Collect the (one or more) string tokens following a keyword.
    rv = []
    token = inp.get()
    while token.startswith('"'):
      rv.append(token)
      token = inp.get()

    # The token which ended the run belongs to whatever comes next.  This
    # includes the empty end-of-input token, so that the input is not read
    # again after it reported its end.
    inp.unget(token)

    if not rv:
      raise ValueError("Expected %s argument found other token instead"
                       % arg_to)

    return rv

  comment = []
  while 1:
    token = inp.get()

    if not token: # EOF
      sink.finish_parse()
      return

    if token[0] == '#':
      comment.append(token)
      continue

    keyword = token.lower()

    if keyword == 'domain':
      token = inp.get()

      if not token or token[0] not in name_chars:
        raise ValueError("Invalid token where domain name expected")

      sink.recv_domain(token)
      continue

    if keyword == 'msgid':
      msgid = get_msg_argument('msgid')
      msgid_plural = []

      token = inp.get()
      if token.lower() == 'msgid_plural':
        msgid_plural = get_msg_argument('msgid_plural')
        token = inp.get()

      if msgid_plural:
        msgstr_indices = []   # indices in order of appearance
        msgstrs = {}          # index -> list of string tokens
        while token.lower() == 'msgstr':
          token = inp.get()

          if not token.startswith('['):
            raise ValueError("msgstr INDEX expected when msgid_plural "
                             "defined")

          index = token[1:-1]
          msgstr_indices.append(index)
          msgstrs[index] = get_msg_argument('msgstr[INDEX]')

          token = inp.get()

        # the token which ended the msgstr list starts the next entry
        inp.unget(token)

        if not msgstr_indices:
          raise ValueError("msgstr expected after msgid_plural")

        sink.recv_plural_msg(comment, msgid, msgid_plural,
                             msgstr_indices, msgstrs)

      else: # not msgid_plural
        if token.lower() != "msgstr":
          raise ValueError("Unexpected token where 'msgstr' expected")

        sink.recv_simple_msg(comment, msgid, get_msg_argument('msgstr'))

      # Start a fresh list instead of emptying the old one: the sink may
      # have kept a reference to the list it was given.
      comment = []
      continue

    raise ValueError("Unknown token (%s)" % token)

end of the parser ===============

start of the strip script ===============
#!/usr/bin/env python

import sys, poparse
import getopt

class CharsetStrippingSink(poparse.PoSink):
  """Sink which writes every message it receives to `out`.

  The messages are written unchanged, except for the header entry (the
  one with the empty msgid): every msgstr line of that entry which
  contains 'charset=' is left out.
  """

  def __init__(self, out):
    poparse.PoSink.__init__(self)
    self.out = out

  def _write_comments(self, pre_comment):
    """Write the comment lines preceding a message."""
    for l in pre_comment:
      self.out.write("%s\n" % l)

  def _write_field(self, keyword, lines):
    """Write `lines`, with `keyword` in front of the first line only.

    An empty list is written as an empty string, so that the keyword
    never goes missing from the output.
    """
    prefix = keyword + " "
    for l in lines or [ '""' ]:
      self.out.write("%s%s\n" % (prefix, l))
      prefix = ""

  def recv_simple_msg(self, pre_comment, msgid, msgstr):
    """Write a message without plural forms, stripping the header charset."""
    if msgid == [ '""' ]:
      # Build a new list instead of deleting from the one being walked:
      # deleting shifts the remaining indices and made the old loop skip
      # the line in front of every deleted one.
      # note that the charset could be split over lines
      msgstr = [l for l in msgstr if l.find("charset=") < 0]

    self._write_comments(pre_comment)
    self._write_field("msgid", msgid)
    self._write_field("msgstr", msgstr)

  def recv_plural_msg(self, pre_comment, msgid, plural, msgstr_order,
                      msgstrs):
    """Write a message with plural forms; nothing is stripped from it."""
    self._write_comments(pre_comment)
    self._write_field("msgid", msgid)
    self._write_field("msgid_plural", plural)
    for index in msgstr_order:
      self._write_field("msgstr[%s]" % index, msgstrs[index])

  def finish_parse(self):
    """Nothing to do: every message is written as soon as it arrives."""
    pass

def strip_it(infile, outfile):
  """Copy the po file read from `infile` to `outfile`, minus the charset.

  infile  -- object with a read(size) method providing the po file text
  outfile -- object with a write(text) method receiving the result
  """
  # NOTE(review): the body of this function was missing from this copy of
  # the script; this is the straightforward wiring of parser and sink.
  poparse.parse(poparse.PoTokens(infile), CharsetStrippingSink(outfile))

def main():
  """Strip the charset from the po file named on the command line.

  The first argument is the input file, the optional second argument the
  output file.  '-' selects stdin / stdout; stdout is also used when no
  output file is given.  Without arguments a usage message is written to
  stderr and the script exits with status 1.
  """

  opts, args = getopt.getopt(sys.argv[1:], '', [])

  if len(args) < 1:
    sys.stderr.write("Usage: %s INFILE|- [OUTFILE|-]\n" % sys.argv[0])
    sys.exit(1)

  if args[0] == '-':
    infile = sys.stdin
  else:
    infile = open(args[0],'r')

  try:
    if len(args) < 2 or args[1] == '-':
      outfile = sys.stdout
    else:
      outfile = open(args[1],'w')

    try:
      strip_it(infile, outfile)
    finally:
      # only close what was opened here
      if outfile is not sys.stdout:
        outfile.close()
  finally:
    if infile is not sys.stdin:
      infile.close()

# Run the command line interface only when executed as a script, not when
# the file is imported as a module.
if __name__ == '__main__':
  main()
end of the strip script ===============

Received on Thu May 13 20:14:23 2004

