Stripping 'charset=' from po files [the sequal]

From: Erik Huelsmann <e.huelsmann_at_gmx.net>
Date: 2004-05-13 20:13:52 CEST

In order to prevent charset conversion by 'smart' gettext implementations
our build system has to strip out the 'charset=UTF-8' string in the
administrative section of po files. The Makefile based system currently
does this by ripping out the entire 'Content-Type' line using 'sed'.

The Windows (python based) build system does not provide sed. To work
around that I wrote the general python based po parser included below. A
separate script does the real stripping. This also provides the (cleaner)
solution to only examine the admin section.

There are several questions to be answered before proceeding:

1) We don't want to use the same script for the Makefile build (adding a new
dependency), do we?

2)
a) Do we want the po parser in the Subversion repository?
b) If so: where?

3) Do you have any comments to either script? (the strip charset script has
to be extended to include plural support before this code can be committed)

bye,

Erik.

start of the parser ===============
import string

class PoSink:
  """Receiver for the events produced while a po file is parsed.

  This base implementation only remembers the domain it was last told
  about and ignores every message.  Derive from it and override the
  recv_* methods of interest.
  """

  def __init__(self):
    # Value handed to the most recent recv_domain() call; None until then.
    self.domain = None

  def recv_domain(self, domain):
    """Remember DOMAIN as the current domain."""
    self.domain = domain

  def recv_simple_msg(self, pre_comment, msgid, msgstr):
    """Receive a message that has no plural forms (ignored here).

    PRE_COMMENT holds the comments preceding the message; MSGID and
    MSGSTR hold the arguments of the respective keywords.
    """
    pass

  def recv_plural_msg(self, pre_comment, msgid, plural, msgstr_order,
                      msgstrs):
    """Receive a message that has plural forms (ignored here).

    PLURAL holds the msgid_plural argument, MSGSTR_ORDER the msgstr
    indices and MSGSTRS the msgstr arguments belonging to them.
    """
    pass

  def finish_parse(self):
    """Hook for the end of the input (does nothing here)."""
    pass

# implement a token-parser
#
# the tokens will be defined as (with '$' == EOL)
#
# COMMENT : #(.*)(<EOL>|<EOF>)
# STRING : "<any character including escaped ">*"
# INDEX : '[' NUMBER ']'
# NUMBER : [0-9]+
# other : [a-zA-Z0-9_]+

TOKEN_CHUNK_SIZE = 100 * 1024 # 100kiB
# ascii_letters instead of 'letters': the latter depends on the locale
# and no longer exists in newer Python versions.
OTHER_TOKEN_CHARS = string.ascii_letters + string.digits + '_'

class PoTokens:
  """Tokenizer for po files which reads its input in chunks.

  get() returns one token per call:

    COMMENT : '#' up to, but not including, the end of the line
    STRING  : a double quoted string, quotes and escapes left untouched
    INDEX   : '[' NUMBER ']', returned without any inner whitespace
    other   : a run of letters, digits and underscores

  and the empty string once the input is exhausted.  Malformed input
  raises ValueError.
  """

  def __init__(self, inp, chunk_size=TOKEN_CHUNK_SIZE):
    """Tokenize INP, an object whose read(size) method returns text.

    CHUNK_SIZE is the number of characters requested per read() call.
    """
    self.inp = inp
    self.chunk_size = chunk_size
    self.buf = inp.read(chunk_size)
    # Position in self.buf of the next character to be consumed.
    self.idx = 0
    # Tokens handed back through unget(); the last one is returned first.
    self.ungot = []

  def _fill(self):
    """Make sure self.buf[self.idx] exists; return False at EOF."""
    if self.idx < len(self.buf):
      return True
    # An empty buffer marks EOF, so the input is not read again after it.
    if self.buf:
      self.buf = self.inp.read(self.chunk_size)
      self.idx = 0
    return len(self.buf) > 0

  def _take_while(self, chars):
    """Consume and return the longest run of characters out of CHARS."""
    pieces = []
    while self._fill():
      buf = self.buf
      end = self.idx
      while end < len(buf) and buf[end] in chars:
        end += 1
      pieces.append(buf[self.idx:end])
      self.idx = end
      if end < len(buf):
        # Stopped on a character outside CHARS, not on the chunk end.
        break
    return "".join(pieces)

  def _read_string(self):
    """Consume a quoted string; self.idx is at its opening quote."""
    pieces = ['"']
    self.idx += 1
    # Tracking the escape state per character (instead of looking at the
    # character before a quote) gets '\\"' right and keeps working when
    # an escape sequence straddles two chunks.
    escaped = False
    while self._fill():
      buf = self.buf
      start = pos = self.idx
      while pos < len(buf):
        ch = buf[pos]
        pos += 1
        if escaped:
          escaped = False
        elif ch == "\\":
          escaped = True
        elif ch == '"':
          pieces.append(buf[start:pos])
          self.idx = pos
          return "".join(pieces)
      pieces.append(buf[start:])
      self.idx = pos
    raise ValueError("Unexpected EOF; unterminated string.")

  def _read_comment(self):
    """Consume a comment line; self.idx is at its '#'."""
    pieces = []
    while self._fill():
      end = self.buf.find("\n", self.idx)
      if end == -1:
        pieces.append(self.buf[self.idx:])
        self.idx = len(self.buf)
        continue
      pieces.append(self.buf[self.idx:end])
      # The newline is consumed, but not part of the token.
      self.idx = end + 1
      break
    return "".join(pieces)

  def _read_index(self):
    """Consume a msgstr INDEX; self.idx is at its '['."""
    self.idx += 1
    self._take_while(string.whitespace)
    digits = self._take_while(string.digits)
    self._take_while(string.whitespace)
    if not self._fill():
      raise ValueError("Unexpected EOF while parsing a msgstr INDEX")
    if not digits or self.buf[self.idx] != ']':
      raise ValueError("Unexpected character while parsing a msgstr INDEX")
    self.idx += 1
    return "[" + digits + "]"

  def get(self):
    """Return the next token, or "" when there are no more tokens."""
    if self.ungot:
      return self.ungot.pop()

    self._take_while(string.whitespace)
    if not self._fill():
      return ""

    first = self.buf[self.idx]
    if first == '"':
      return self._read_string()
    if first == "#":
      return self._read_comment()
    if first == '[':
      return self._read_index()
    if first in OTHER_TOKEN_CHARS:
      return self._take_while(OTHER_TOKEN_CHARS)

    # unknown token starting character
    raise ValueError("Unexpected character in input stream (%s)" % first)

  def unget(self, token):
    """Hand TOKEN back; the next get() call returns it again."""
    self.ungot.append(token)

def parse(inp, sink):
  """Parse the po tokens delivered by INP and report them to SINK.

  INP has to provide get(), returning the next token or "" at the end
  of the input, and unget(token) to push one token back.  SINK is told
  about the input through recv_domain(), recv_simple_msg() and
  recv_plural_msg(); finish_parse() is called once the input ends.

  Comments are collected and passed along with the first message
  following them.  Input which does not follow the expected structure
  raises ValueError.
  """

  domain_chars = string.ascii_letters + string.digits + '_'

  def get_msg_argument(arg_to):
    """Return the list of string tokens which form one argument."""
    rv = []
    token = inp.get()
    # startswith() also copes with the empty token signalling EOF, which
    # is what follows the last msgstr of every file.
    while token.startswith('"'):
      rv.append(token)
      token = inp.get()

    inp.unget(token)

    if not rv:
      raise ValueError("Expected %s argument found other token instead"
                       % arg_to)

    return rv

  comment = []
  while 1:
    token = inp.get()

    if not token: # EOF
      sink.finish_parse()
      return

    if token.startswith('#'):
      comment.append(token)
      continue

    keyword = token.lower()

    if keyword == 'domain':
      token = inp.get()

      if token and token[0] in domain_chars:
        sink.recv_domain(token)
      else:
        raise ValueError("Invalid token where domain name expected")

      continue

    if keyword == 'msgid':
      msgid = get_msg_argument('msgid')
      msgid_plural = []

      token = inp.get()
      if token.lower() == 'msgid_plural':
        msgid_plural = get_msg_argument('msgid_plural')
        token = inp.get()

      if msgid_plural:
        msgstr_indices = []  # indices in the order they were found
        msgstrs = {}         # index -> list of string tokens

        while token.lower() == 'msgstr':
          token = inp.get()

          if not token.startswith('['):
            raise ValueError(
              "msgid INDEX expected when msgid_plural defined")

          index = token[1:-1]
          msgstr_indices.append(index)
          msgstrs[index] = get_msg_argument('msgstr[INDEX]')

          token = inp.get()

        if not msgstr_indices:
          raise ValueError("msgstr expected after msgid_plural")

        # The token ending the msgstr list belongs to the next item.
        inp.unget(token)

        sink.recv_plural_msg(comment, msgid, msgid_plural,
                             msgstr_indices, msgstrs)

      else: # not msgid_plural
        if token.lower() != 'msgstr':
          raise ValueError("Unexpected token where 'msgstr' expected")

        sink.recv_simple_msg(comment, msgid, get_msg_argument('msgstr'))

      # The comments have been handed to the sink with this message.
      comment = []
      continue

    raise ValueError("Unknown token (%s)" % token)

end of the parser ===============

start of the strip script ===============
#!/usr/bin/env python

import sys, poparse
import getopt

class CharsetStrippingSink(poparse.PoSink):
  """Sink which writes the messages it receives to a file object.

  Messages are written back unchanged, apart from the administrative
  entry (the one with the empty msgid): the last line of its msgstr
  containing 'charset=' is left out.
  """

  def __init__(self, out):
    """Write the output to OUT, an object with a write() method."""
    poparse.PoSink.__init__(self)
    self.out = out

  def _strip_charset(self, msgstr):
    """Return a copy of MSGSTR without its last 'charset=' line."""
    stripped = list(msgstr)
    # note that the charset could be split over lines
    for pos in range(len(stripped) - 1, -1, -1):
      if stripped[pos].find("charset=") >= 0:
        del stripped[pos]
        break

    # A msgstr keyword needs at least one (possibly empty) string.
    if not stripped:
      stripped = ['""']
    return stripped

  def _write_comments(self, pre_comment):
    """Write every comment of PRE_COMMENT on a line of its own."""
    for l in pre_comment:
      self.out.write("%s\n" % l)

  def _write_argument(self, keyword, lines):
    """Write KEYWORD and the first of LINES; other lines follow bare."""
    msg = keyword + " "
    for l in lines:
      self.out.write("%s%s\n" % (msg, l))
      msg = ""

  def recv_simple_msg(self, pre_comment, msgid, msgstr):
    """Write a message without plural forms, stripping the charset
    when it is the administrative entry."""
    if msgid == [ '""' ]:
      msgstr = self._strip_charset(msgstr)

    self._write_comments(pre_comment)
    self._write_argument("msgid", msgid)
    self._write_argument("msgstr", msgstr)

  def recv_plural_msg(self, pre_comment, msgid, plural, msgstr_order,
                      msgstrs):
    """Write a message with plural forms back unchanged.

    MSGSTR_ORDER lists the msgstr indices in output order; MSGSTRS maps
    each of them to the lines of that msgstr.
    """
    self._write_comments(pre_comment)
    self._write_argument("msgid", msgid)
    self._write_argument("msgid_plural", plural)
    for index in msgstr_order:
      self._write_argument("msgstr[%s]" % index, msgstrs[index])

  def finish_parse(self):
    """Nothing is left to be written at the end of the input."""
    pass

def strip_it(infile, outfile):
  """Parse the po data read from INFILE and write it to OUTFILE through
  a CharsetStrippingSink.

  INFILE has to provide read(), OUTFILE write().
  """
  tokens = poparse.PoTokens(infile)
  sink = CharsetStrippingSink(outfile)
  poparse.parse(tokens, sink)

def main():
  """Strip the charset from the po file named on the command line.

  The first argument names the input file, the optional second one the
  output file; '-' (or a missing second argument) selects stdin and
  stdout respectively.  Exits with status 2 when the arguments are
  unusable.
  """

  usage = "usage: %s INFILE|- [OUTFILE|-]\n" % sys.argv[0]

  try:
    opts, args = getopt.getopt(sys.argv[1:], '', [])
  except getopt.GetoptError:
    # No options are defined; treat any as a usage error.
    args = []

  if len(args) < 1:
    # stderr, since stdout may be where the po data is expected.
    sys.stderr.write(usage)
    sys.exit(2)

  infile = sys.stdin
  outfile = sys.stdout
  try:
    if args[0] != '-':
      infile = open(args[0],'r')

    if len(args) >= 2 and args[1] != '-':
      outfile = open(args[1],'w')

    strip_it(infile, outfile)
  finally:
    # Only close what was opened here, never the standard streams.
    if outfile is not sys.stdout:
      outfile.close()
    if infile is not sys.stdin:
      infile.close()

if __name__ == '__main__':
  main()
end of the strip script ===============

-- 
NEU : GMX Internet.FreeDSL
Ab sofort DSL-Tarif ohne Grundgebühr: http://www.gmx.net/dsl
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org

Received on Thu May 13 20:14:23 2004

This message: [ Message body ]
Next message: Peter N. Lundblad: "Re: [PATCH] sprintf elimination, take 2"
Previous message: Branko Čibej: "Re: [PATCH] Re: replacing SVN_REVNUM_T_FMT for translation"
Next in thread: Ben Reser: "Re: Stripping 'charset=' from po files [the sequal]"
Reply: Ben Reser: "Re: Stripping 'charset=' from po files [the sequal]"
Reply: Branko Čibej: "Re: Stripping 'charset=' from po files [the sequal]"
Reply: Greg Hudson: "Re: Stripping 'charset=' from po files [the sequal]"

Contemporary messages sorted: [ By Date ] [ By Thread ] [ By Subject ] [ By Author ] [ By messages with attachments ]