On Thu, Sep 6, 2012 at 11:30 PM, <stefan2_at_apache.org> wrote:
> Author: stefan2
> Date: Thu Sep 6 21:30:40 2012
> New Revision: 1381766
>
> URL: http://svn.apache.org/viewvc?rev=1381766&view=rev
> Log:
> Many strings we need to convert to UTF-8 (paths, even log messages)
> contain large ASCII-only sections. Add utility functions
> to very efficiently skip those sections at the beginning of the strings.
>
> * subversion/libsvn_subr/utf_validate.c
> (first_non_ascii_char,
> first_non_ascii_char_cstring): new utility functions
> (svn_utf__last_valid,
> svn_utf__cstring_is_valid,
> svn_utf__is_valid,
> svn_utf__last_valid2): use the new functions to quickly
> skip ASCII-only sections at the head of the strings
>
>
> Modified:
> subversion/trunk/subversion/libsvn_subr/utf_validate.c
>
> Modified: subversion/trunk/subversion/libsvn_subr/utf_validate.c
> URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf_validate.c?rev=1381766&r1=1381765&r2=1381766&view=diff
> ==============================================================================
> --- subversion/trunk/subversion/libsvn_subr/utf_validate.c (original)
> +++ subversion/trunk/subversion/libsvn_subr/utf_validate.c Thu Sep 6 21:30:40 2012
> @@ -57,6 +57,7 @@
> */
>
> #include "private/svn_utf_private.h"
> +#include "private/svn_eol_private.h"
>
> /* Lookup table to categorise each octet in the string. */
> static const char octet_category[256] = {
> @@ -249,12 +250,90 @@ static const char machine [9][14] = {
> FSM_ERROR}, /* 0xf5-0xff */
> };
>
> +/* Scan MAX_LEN bytes in *DATA for non-ASCII chars. Return the position
> + * of the first non-ASCII char or DATA + MAX_LEN if all were ASCII.
> + */
> +static const char *
> +first_non_ascii_char(const char *data, apr_size_t max_len)
> +{
> +#if !SVN_UNALIGNED_ACCESS_IS_OK
> +
> + /* On some systems, we need to make sure that buf is properly aligned
> + * for chunky data access.
> + */
> + if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1))
> + {
> + apr_size_t len = (~(apr_uintptr_t)data) & (sizeof(apr_uintptr_t)-1);
> + if (len > max_len)
> + len = max_len;
> + max_len -= len;
> +
> + for (; len > 0; ++data, --len)
> + if (*data < 0)
> + return data;
> + }
> +
> +#endif
> +
> + /* Scan the input one machine word at a time. */
> + for (; max_len > sizeof(apr_uintptr_t)
> + ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
> + if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
> + break;
> +
> + /* The remaining odd bytes will be examined the naive way: */
> + for (; max_len > 0; ++data, --max_len)
> + if (*data < 0)
> + return data;
> +
> + return data;
> +}
> +
> +/* Scan the C string in *DATA for non-ASCII chars. Return the position
> + * of either the first non-ASCII char or the terminating NUL.
> + */
> +static const char *
> +first_non_ascii_char_cstring(const char *data)
> +{
> + /* We need to make sure that BUF is properly aligned for chunky data
> + * access because we don't know the string's length. Unaligned chunk
> + * read access beyond the NUL terminator could therefore result in a
> + * segfault.
> + */
> + for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
> + if (*data <= 0)
> + return data;
> +
> + /* Scan the input one machine word at a time. */
> + for (; ; data += sizeof(apr_uintptr_t))
> + {
> + /* Check for non-ASCII chars: */
> + apr_uintptr_t chunk = *(const apr_uintptr_t *)data;
> + if (chunk & SVN__BIT_7_SET)
> + break;
> +
> + /* This is the well-known strlen test: */
> + chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET;
> + if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET)
> + break;
> + }
> +
> + /* The remaining odd bytes will be examined the naive way: */
> + for (; ; ++data)
> + if (*data <= 0)
> + return data;
> +
> + return data;
I get a compiler warning here (VS 2010):
c:\research\svn\client_build\trunk2\subversion\libsvn_subr\utf_validate.c(328):
warning C4702: unreachable code
It seems that the last "return data;" is superfluous, since the loop above
it can only end by returning?
--
Johan
Received on 2012-09-10 00:59:21 CEST