Re: [PATCH] Include offending XML in "Malformed XML" error message
From: Charles Bailey <bailey.charles_at_gmail.com>
Date: 2005-04-21 21:43:21 CEST
On 2/28/05, Peter N. Lundblad <peter@famlundblad.se> wrote:
Well, after umpteen interrupts from the rest of life, I finally got a
While it's certainly more complex than just coding another
BTW, on the topic of common behaviors, what led to the use of decimal
Comments welcomed, before I invest any more time along this path.
-- Regards, Charles Bailey Lists: bailey _dot_ charles _at_ gmail _dot_ com Other: bailey _at_ newman _dot_ upenn _dot_ edu /* * Quick prototype of "string escaping" common routine and UTF-8 task. * Builds with * gcc -I/usr/local/include/subversion-1 -I/usr/local/apr/include/apr-0 \ * -L/usr/local/apr/lib -lsvn_subr-1 -lapr-0 -o esctest esctest.c */ #include <stdio.h> #include <string.h> #include <assert.h> #include "svn_pools.h" #include "svn_string.h" /** Scan @a instr of length @a len bytes, copying to stringbuf @a outsbuf, * escaping bytes as indicated by the lookup array @a isok and the mapping * function @a mapper. Memory is allocated from @a pool. * * This is designed to be the common pathway for various string "escaping" * functions scattered through subversion. The basic approach is to scan * an input and decide whether each byte is OK as it stands, needs to be * "escaped" using subversion's "?\uuu" default format, or needs to be * transformed in some other way. The decision is made using a two step * process, which is designed to handle the simple cases quickly but allow * for more complex mappings. Since the typical string will (we hope) * comprise mostly simple cases, this shouldn't require much code * complexity or loss of efficiency. The two steps used are: * * 1. The value of a byte from the input string ("test byte") is used as an * index into a (usually 256 byte) array passed in by the caller. * - If the value of the appropriate array element is 0xff, * then the test byte is escaped as a "?\uuu" string in the output. * - If the value of the appropriate element is otherwise non-zero, * that many bytes are copied verbatim from the input to the output. * 2. If the array yields a 0 value, then a mapping function provided by * the caller is used to allow for more complex evaluation.
This function * receives five arguments: * - a pointer to the pointer used by svn_do_char_escape to * mark the test byte in the input string * - a pointer to the start of the input string * - the length of the input string * - a pointer to the output stringbuf * - the ever-helpful pool. * The mapping function may return an nonzero value, which is interpreted * as described in step 1 above, or zero, indicating that the test byte * should be ignored. In the latter case, this is generally because the * mapping function has done the necessary work itself; it's free to * modify the output stringbuf and adjust the pointer to the test byte * as it sees fit (within the bounds of the input string). At a minimum, * it should at least increment the pointer to the test byte before * returning 0, in order to avoid an infinite loop. */ static void svn_do_char_escape (svn_stringbuf_t **outsbuf, unsigned char *instr, apr_size_t len, unsigned char *isok, int (*mapper)(), apr_pool_t *pool) { unsigned char *base, *c; if (*outsbuf == NULL) *outsbuf = svn_stringbuf_create ("", pool); for (c = base = instr; c < instr + len; ) { apr_size_t count = isok[*c]; if (count == 0) { if (c > base) svn_stringbuf_appendbytes (*outsbuf, base, c - base); count = mapper (&c,instr,len,*outsbuf,pool); } if (count == 255) { char esc[6]; if (c > base) svn_stringbuf_appendbytes (*outsbuf, base, c - base); sprintf (esc,"?\\%03u",*c); svn_stringbuf_appendcstr (*outsbuf, esc); c++; base = c; } else c += count; } if (c > base) svn_stringbuf_appendbytes (*outsbuf, base, c - base); } /** Determine whether the (presumably high-half) byte pointed to by * @a *cur is the start of a legal UTF-8 sequence in @a str, and tell * the caller to either copy the legal sequence or escape the current * byte as illegal. 
*/
/* Mapper callback in the form svn_do_char_escape expects.
 *
 * Returns the length in bytes (1 to 4) of the well-formed UTF-8 sequence
 * that starts at @a *cur, or 255 if the byte at @a *cur does not start a
 * well-formed sequence that fits entirely inside the @a len bytes of
 * @a str.  Overlong encodings, UTF-16 surrogates (0xED 0xA0..0xBF ...) and
 * code points above U+10FFFF are rejected.  Never returns 0, never moves
 * @a *cur, and never touches @a target or @a pool.
 */
static int
svn_utf8_mapper (char **cur,
                 unsigned char *str,
                 apr_size_t len,
                 svn_stringbuf_t *target,
                 apr_pool_t *pool)
{
  const unsigned char *c;
  const unsigned char *end = str + len;
  apr_size_t avail;

  /* Part of the mapper signature, but not needed for this check. */
  (void) target;
  (void) pool;

  if (!cur || !*cur)
    return 255;                 /* Can't help you; sorry */

  /* Work on unsigned bytes so the range comparisons below are valid
     regardless of the signedness of plain char. */
  c = (const unsigned char *) *cur;
  if (c >= end)
    return 255;                 /* Nothing left to examine */

  /* Number of bytes readable starting at c; c[i] is valid iff i < avail. */
  avail = (apr_size_t) (end - c);

  if (c[0] < 0x80)
    return 1;                   /* Shouldn't happen */

  /* Two-byte sequence: lead 0xC2..0xDF (0xC0/0xC1 would be overlong). */
  if (c[0] >= 0xc2 && c[0] <= 0xdf)
    {
      if (avail < 2
          || c[1] < 0x80 || c[1] > 0xbf)
        return 255;
      return 2;
    }

  /* Three-byte sequence: lead 0xE0..0xEF. */
  if (c[0] >= 0xe0 && c[0] <= 0xef)
    {
      if (avail < 3
          || c[1] < 0x80 || c[1] > 0xbf
          || c[2] < 0x80 || c[2] > 0xbf
          || (c[0] == 0xe0 && c[1] < 0xa0)    /* overlong */
          || (c[0] == 0xed && c[1] > 0x9f))   /* surrogate range */
        return 255;
      return 3;
    }

  /* Four-byte sequence: lead 0xF0..0xF4. */
  if (c[0] >= 0xf0 && c[0] <= 0xf4)
    {
      if (avail < 4
          || c[1] < 0x80 || c[1] > 0xbf
          || c[2] < 0x80 || c[2] > 0xbf
          || c[3] < 0x80 || c[3] > 0xbf
          || (c[0] == 0xf0 && c[1] < 0x90)    /* overlong */
          || (c[0] == 0xf4 && c[1] > 0x8f))   /* above U+10FFFF */
        return 255;
      return 4;
    }

  /* Stray continuation byte (0x80..0xBF) or a lead byte that is never
     legal (0xC0, 0xC1, 0xF5..0xFF). */
  return 255;
}

/** Copy @a str of length @a len to @a *outsbuf, escaping bytes which are
 * not legal UTF-8 sequences.  Memory is allocated from @a pool as needed.
 * If @a *outsbuf is NULL, a new stringbuf is created.
 *
 * This is the "public" entry point to the common escaping engine for
 * building UTF-8-safe strings.
*/ static void svn_escape_utf8 (svn_stringbuf_t **outsbuf, unsigned char *str, apr_size_t len, apr_pool_t *pool) { static unsigned char utf8safe[255]; if (!utf8safe[0]) { memset (utf8safe, 1, 0x7f); /* 0x00 - 0x7f always legal */ memset (utf8safe + 0x80, 0, 0x7f); /* 0x80 - 0xff need check, except */ memset (utf8safe + 0xc0, 255, 2); /* 0xc0 - 0xc1 always illegal */ memset (utf8safe + 0xf5, 255, 11); /* oxf5 - 0xff always illegal */ } svn_do_char_escape (outsbuf, str, len, utf8safe, svn_utf8_mapper, pool); } /* Quick hack to run some simple tests */ int main() { char test[4][6] = { { 'A', 'S', 'C', 'I', 'I', '\0' }, { 'B', 'E', 'L', 0x07, '!', '\0' }, { 0xe2, 0x98, 0xba, 'O', 'K', '\0' }, { 0xe2, 0xff, 0xba, 'N', 'O', '\0' }, }; int i; svn_stringbuf_t *outsbuf; apr_pool_t *pool; apr_initialize(); apr_pool_create(&pool,NULL); for (i = 0; i < 4; i++) { outsbuf = NULL; printf("Original: \"%s\"\n", test[i]); svn_escape_utf8(&outsbuf, test[i], 5, pool); printf("Escaped: \"%s\"\n", outsbuf->data); } return 0; } /* End of message */ --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org For additional commands, e-mail: dev-help@subversion.tigris.orgReceived on Thu Apr 21 21:44:21 2005 |
This is an archived mail posted to the Subversion Dev mailing list.
This site is subject to the Apache Privacy Policy and the Apache Public Forum Archive Policy.