Re: [PATCH] Include offending XML in "Malformed XML" error message
From: Charles Bailey <bailey.charles_at_gmail.com>
Date: 2005-04-21 21:43:21 CEST
On 2/28/05, Peter N. Lundblad <peter@famlundblad.se> wrote:
Well, after umpteen interrupts from the rest of life, I finally got a
While it's certainly more complex than just coding another
BTW, on the topic of common behaviors, what led to the use of decimal
Comments welcomed, before I invest any more time along this path.
--
Regards,
Charles Bailey
Lists: bailey _dot_ charles _at_ gmail _dot_ com
Other: bailey _at_ newman _dot_ upenn _dot_ edu
/*
* Quick prototype of "string escaping" common routine and UTF-8 task.
* Builds with
* gcc -I/usr/local/include/subversion-1 -I/usr/local/apr/include/apr-0 \
* -L/usr/local/apr/lib -lsvn_subr-1 -lapr-0 -o esctest esctest.c
*/
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "svn_pools.h"
#include "svn_string.h"
/** Scan @a instr of length @a len bytes, copying to stringbuf @a *outsbuf,
 * escaping bytes as indicated by the lookup array @a isok and the mapping
 * function @a mapper.  Memory is allocated from @a pool.  If @a *outsbuf
 * is NULL, a new stringbuf is created in @a pool.
 *
 * This is designed to be the common pathway for various string "escaping"
 * functions scattered through subversion.  The basic approach is to scan
 * an input and decide whether each byte is OK as it stands, needs to be
 * "escaped" using subversion's "?\uuu" default format (a question mark, a
 * backslash and the byte value as three decimal digits), or needs to be
 * transformed in some other way.  The decision is made using a two step
 * process, which is designed to handle the simple cases quickly but allow
 * for more complex mappings.  The two steps used are:
 *
 * 1. The value of a byte from the input string ("test byte") is used as an
 *    index into @a isok.  Because any byte value may be used as the index,
 *    @a isok must have 256 entries.
 *    - If the value of the appropriate array element is 0xff (255),
 *      then the test byte is escaped as a "?\uuu" string in the output.
 *    - If the value of the appropriate element is otherwise non-zero,
 *      that many bytes are copied verbatim from the input to the output
 *      (clamped to the number of bytes remaining in the input).
 * 2. If the array yields a 0 value, then @a mapper is called to allow for
 *    more complex evaluation.  Any verbatim bytes still pending are written
 *    to the output first, so the mapper sees an up-to-date stringbuf.  The
 *    mapper receives five arguments:
 *    - a pointer to the pointer used by svn_do_char_escape to
 *      mark the test byte in the input string
 *    - a pointer to the start of the input string
 *    - the length of the input string
 *    - a pointer to the output stringbuf
 *    - the ever-helpful pool.
 *    The mapping function may return a nonzero value, which is interpreted
 *    as described in step 1 above, or zero, indicating that the test byte
 *    should be ignored.  In the latter case, this is generally because the
 *    mapping function has done the necessary work itself; it's free to
 *    modify the output stringbuf and adjust the pointer to the test byte
 *    as it sees fit (within the bounds of the input string).  At a minimum,
 *    it should at least increment the pointer to the test byte before
 *    returning 0, in order to avoid an infinite loop.
 */
static void
svn_do_char_escape (svn_stringbuf_t **outsbuf,
                    unsigned char *instr,
                    apr_size_t len,
                    unsigned char *isok,
                    int (*mapper)(),
                    apr_pool_t *pool)
{
  unsigned char *end = instr + len;
  unsigned char *base;   /* start of the verbatim run not yet written out */
  unsigned char *c;      /* current test byte */

  if (*outsbuf == NULL)
    *outsbuf = svn_stringbuf_create ("", pool);

  for (c = base = instr; c < end; )
    {
      apr_size_t count = isok[*c];

      if (count == 0)
        {
          /* Write out what we have so far, so that anything the mapper
             appends lands in the right place. */
          if (c > base)
            svn_stringbuf_appendbytes (*outsbuf, (const char *) base,
                                       c - base);
          count = mapper (&c, instr, len, *outsbuf, pool);

          /* Everything before the (possibly moved) test byte has now been
             dealt with; restart the verbatim run here so that it is not
             appended a second time later on. */
          base = c;
        }

      if (count == 255)
        {
          char esc[6];   /* '?', '\\', three digits, terminating NUL */

          if (c > base)
            svn_stringbuf_appendbytes (*outsbuf, (const char *) base,
                                       c - base);
          sprintf (esc, "?\\%03u", (unsigned int) *c);
          svn_stringbuf_appendcstr (*outsbuf, esc);
          c++;
          base = c;
        }
      else
        {
          /* Never step beyond the end of the input, whatever the table or
             the mapper claims. */
          if (c < end && count > (apr_size_t) (end - c))
            count = end - c;
          c += count;
        }
    }

  /* Flush the trailing verbatim run, if any. */
  if (c > base)
    svn_stringbuf_appendbytes (*outsbuf, (const char *) base, c - base);
}
/* Return nonzero if BYTE is a UTF-8 continuation byte (0x80 - 0xbf). */
static int
utf8_is_continuation (unsigned char byte)
{
  return byte >= 0x80 && byte <= 0xbf;
}

/** Determine whether the (presumably high-half) byte pointed to by
 * @a *cur is the start of a legal UTF-8 sequence in @a str (of length
 * @a len bytes), and tell the caller to either copy the legal sequence
 * or escape the current byte as illegal.
 *
 * Returns the length in bytes (1 to 4) of the legal sequence starting at
 * @a *cur, or 255 if the byte at @a *cur does not start a legal sequence
 * that fits entirely within the input.  Overlong encodings, UTF-16
 * surrogates (0xed 0xa0 - 0xed 0xbf) and code points above U+10FFFF are
 * rejected.  @a *cur is never modified; @a target and @a pool are unused.
 */
static int
svn_utf8_mapper (char **cur,
                 unsigned char *str,
                 apr_size_t len,
                 svn_stringbuf_t *target,
                 apr_pool_t *pool)
{
  const unsigned char *c;
  const unsigned char *end = str + len;
  apr_size_t avail;   /* bytes readable starting at c */

  (void) target;
  (void) pool;

  if (!cur || !*cur)
    return 255;                       /* Can't help you; sorry */

  c = (const unsigned char *) *cur;
  if (c >= end)
    return 255;                       /* Nothing readable at this position */
  avail = end - c;

  if (c[0] < 0x80)
    return 1;                         /* Shouldn't happen */

  /* Two-byte sequence: lead byte 0xc2 - 0xdf inclusive. */
  if (c[0] >= 0xc2 && c[0] <= 0xdf)
    {
      if (avail >= 2 && utf8_is_continuation (c[1]))
        return 2;
      return 255;
    }

  /* Three-byte sequence: lead byte 0xe0 - 0xef. */
  if (c[0] >= 0xe0 && c[0] <= 0xef)
    {
      if (avail < 3
          || !utf8_is_continuation (c[1])
          || !utf8_is_continuation (c[2])
          || (c[0] == 0xe0 && c[1] < 0xa0)     /* overlong encoding */
          || (c[0] == 0xed && c[1] > 0x9f))    /* UTF-16 surrogate */
        return 255;
      return 3;
    }

  /* Four-byte sequence: lead byte 0xf0 - 0xf4. */
  if (c[0] >= 0xf0 && c[0] <= 0xf4)
    {
      if (avail < 4
          || !utf8_is_continuation (c[1])
          || !utf8_is_continuation (c[2])
          || !utf8_is_continuation (c[3])
          || (c[0] == 0xf0 && c[1] < 0x90)     /* overlong encoding */
          || (c[0] == 0xf4 && c[1] > 0x8f))    /* beyond U+10FFFF */
        return 255;
      return 4;
    }

  /* Stray continuation byte, or 0xc0, 0xc1, 0xf5 - 0xff. */
  return 255;
}
/** Copy @a str of length @a len to @a *outsbuf, escaping bytes which are
 * not legal UTF-8 sequences.  Memory is allocated from @a pool as needed.
 * If @a *outsbuf is NULL, a new stringbuf is created.
 *
 * This is the "public" entry point to the common escaping engine for
 * building UTF-8-safe strings.
 *
 * The lookup table is built on the first call and kept in static storage;
 * that first-call initialization is not synchronized between threads.
 */
static void
svn_escape_utf8 (svn_stringbuf_t **outsbuf,
                 unsigned char *str,
                 apr_size_t len,
                 apr_pool_t *pool)
{
  /* One entry per possible byte value, so 256 entries: a 255-entry table
     would be overrun both by the setup below and by a lookup of 0xff. */
  static unsigned char utf8safe[256];

  /* Entry 0 is set to 1 by the setup, so it doubles as the "done" flag. */
  if (!utf8safe[0])
    {
      memset (utf8safe, 1, 0x80);           /* 0x00 - 0x7f always legal */
      memset (utf8safe + 0x80, 0, 0x80);    /* 0x80 - 0xff need check, except */
      memset (utf8safe + 0xc0, 255, 2);     /* 0xc0 - 0xc1 always illegal */
      memset (utf8safe + 0xf5, 255, 11);    /* 0xf5 - 0xff always illegal */
    }

  svn_do_char_escape (outsbuf, str, len, utf8safe, svn_utf8_mapper, pool);
}
/* Quick hack to run some simple tests */
int main() {
char test[4][6] = { { 'A', 'S', 'C', 'I', 'I', '\0' },
{ 'B', 'E', 'L', 0x07, '!', '\0' },
{ 0xe2, 0x98, 0xba, 'O', 'K', '\0' },
{ 0xe2, 0xff, 0xba, 'N', 'O', '\0' }, };
int i;
svn_stringbuf_t *outsbuf;
apr_pool_t *pool;
apr_initialize();
apr_pool_create(&pool,NULL);
for (i = 0; i < 4; i++) {
outsbuf = NULL;
printf("Original: \"%s\"\n", test[i]);
svn_escape_utf8(&outsbuf, test[i], 5, pool);
printf("Escaped: \"%s\"\n", outsbuf->data);
}
return 0;
}
/* End of message */
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org
Received on Thu Apr 21 21:44:21 2005
|
This is an archived mail posted to the Subversion Dev mailing list.
This site is subject to the Apache Privacy Policy and the Apache Public Forum Archive Policy.