Index: subversion/include/private/svn_utf_private.h =================================================================== --- subversion/include/private/svn_utf_private.h (revision 1733434) +++ subversion/include/private/svn_utf_private.h (working copy) @@ -150,22 +150,27 @@ svn_utf__normalize(const char **result, const char *str, apr_size_t len, svn_membuf_t *buf); -/* Normalize the UTF-8 string STR to form C and remove case distinctions - * with Unicode's Default Caseless Matching algorithm. Use BUF as a - * temporary storage. If LEN is SVN_UTF__UNKNOWN_LENGTH, assume STR - * is null-terminated; otherwise, consider the string only up to the - * given length. +/* Transform the UTF-8 string to a shape suitable for comparison with + * strcmp(). The transformation is defined by CASE_INSENSITIVE and + * ACCENT_INSENSITIVE arguments. If CASE_INSENSITIVE is non-zero, + * remove case distinctions from the string. If ACCENT_INSENSITIVE + * is non-zero, remove diacritical marks from the string. * - * Return the resulting string in *RESULT, which shares storage with - * BUF and is valid only until the next time BUF is modified. + * Use BUF as a temporary storage. If LEN is SVN_UTF__UNKNOWN_LENGTH, + * assume STR is null-terminated; otherwise, consider the string only + * up to the given length. Place the transformed string in *RESULT, which + * shares storage with BUF and is valid only until the next time BUF is + * modified. * * A returned error may indicate that STRING contains invalid UTF-8 or * invalid Unicode codepoints. */ svn_error_t * -svn_utf__casefold(const char **result, - const char *str, apr_size_t len, - svn_membuf_t *buf); +svn_utf__xfrm(const char **result, + const char *str, apr_size_t len, + svn_boolean_t case_insensitive, + svn_boolean_t accent_insensitive, + svn_membuf_t *buf); /* Check if STRING is a valid, NFC-normalized UTF-8 string. 
Note that * a FALSE return value may indicate that STRING is not valid UTF-8 at Index: subversion/libsvn_subr/utf8proc.c =================================================================== --- subversion/libsvn_subr/utf8proc.c (revision 1733434) +++ subversion/libsvn_subr/utf8proc.c (working copy) @@ -127,7 +127,8 @@ decompose_normalized(apr_size_t *result_length, * of UTF-8 characters. * * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for - * case-insensitive string comparison. + * case-insensitive string comparison. If STRIPMARK is non-zero, strip + * all diacritical marks (e.g., accents) from the string. * * A returned error may indicate that STRING contains invalid UTF-8 or * invalid Unicode codepoints. Any error message comes from utf8proc. @@ -136,10 +137,19 @@ static svn_error_t * normalize_cstring(apr_size_t *result_length, const char *string, apr_size_t length, svn_boolean_t casefold, + svn_boolean_t stripmark, svn_membuf_t *buffer) { - ssize_t result = unicode_decomposition(casefold ? 
UTF8PROC_CASEFOLD : 0, - string, length, buffer); + int flags = 0; + ssize_t result; + + if (casefold) + flags |= UTF8PROC_CASEFOLD; + + if (stripmark) + flags |= UTF8PROC_STRIPMARK; + + result = unicode_decomposition(flags, string, length, buffer); if (result >= 0) { svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1); @@ -207,18 +217,21 @@ svn_utf__normalize(const char **result, svn_membuf_t *buf) { apr_size_t result_length; - SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, buf)); + SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, FALSE, buf)); *result = (const char*)(buf->data); return SVN_NO_ERROR; } svn_error_t * -svn_utf__casefold(const char **result, - const char *str, apr_size_t len, - svn_membuf_t *buf) +svn_utf__xfrm(const char **result, + const char *str, apr_size_t len, + svn_boolean_t case_insensitive, + svn_boolean_t accent_insensitive, + svn_membuf_t *buf) { apr_size_t result_length; - SVN_ERR(normalize_cstring(&result_length, str, len, TRUE, buf)); + SVN_ERR(normalize_cstring(&result_length, str, len, + case_insensitive, accent_insensitive, buf)); *result = (const char*)(buf->data); return SVN_NO_ERROR; } @@ -375,7 +388,8 @@ svn_utf__is_normalized(const char *string, apr_poo apr_size_t result_length; const apr_size_t length = strlen(string); svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool); - err = normalize_cstring(&result_length, string, length, FALSE, &buffer); + err = normalize_cstring(&result_length, string, length, + FALSE, FALSE, &buffer); if (err) { svn_error_clear(err); Index: subversion/svn/log-cmd.c =================================================================== --- subversion/svn/log-cmd.c (revision 1733434) +++ subversion/svn/log-cmd.c (working copy) @@ -112,14 +112,14 @@ display_diff(const svn_log_entry_t *log_entry, } /* Return TRUE if STR matches PATTERN. Else, return FALSE. Assumes that - * PATTERN is a UTF-8 string normalized to form C with case folding - * applied. 
Use BUF for temporary allocations. */ + * PATTERN is a UTF-8 string prepared for case- and accent-insensitive + * comparison via svn_utf__xfrm(). */ static svn_boolean_t match(const char *pattern, const char *str, svn_membuf_t *buf) { svn_error_t *err; - err = svn_utf__casefold(&str, str, strlen(str), buf); + err = svn_utf__xfrm(&str, str, strlen(str), TRUE, TRUE, buf); if (err) { /* Can't match invalid data. */ Index: subversion/svn/svn.c =================================================================== --- subversion/svn/svn.c (revision 1733434) +++ subversion/svn/svn.c (working copy) @@ -2397,8 +2397,8 @@ sub_main(int *exit_code, int argc, const char *arg break; case opt_search: SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool)); - SVN_ERR(svn_utf__casefold(&utf8_opt_arg, utf8_opt_arg, - strlen(utf8_opt_arg), &buf)); + SVN_ERR(svn_utf__xfrm(&utf8_opt_arg, utf8_opt_arg, + strlen(utf8_opt_arg), TRUE, TRUE, &buf)); add_search_pattern_group(&opt_state, apr_pstrdup(pool, utf8_opt_arg), pool); @@ -2405,8 +2405,8 @@ sub_main(int *exit_code, int argc, const char *arg break; case opt_search_and: SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool)); - SVN_ERR(svn_utf__casefold(&utf8_opt_arg, utf8_opt_arg, - strlen(utf8_opt_arg), &buf)); + SVN_ERR(svn_utf__xfrm(&utf8_opt_arg, utf8_opt_arg, + strlen(utf8_opt_arg), TRUE, TRUE, &buf)); add_search_pattern_to_latest_group(&opt_state, apr_pstrdup(pool, utf8_opt_arg), pool); Index: subversion/tests/libsvn_subr/utf-test.c =================================================================== --- subversion/tests/libsvn_subr/utf-test.c (revision 1733434) +++ subversion/tests/libsvn_subr/utf-test.c (working copy) @@ -898,87 +898,76 @@ test_utf_normalize(apr_pool_t *pool) static svn_error_t * -test_utf_casefold(apr_pool_t *pool) +test_utf_xfrm(apr_pool_t *pool) { - /* Normalized: NFC */ - static const char nfc[] = - "\xe1\xb9\xa8" /* S with dot above and below */ - "\xc5\xaf" /* u with ring */ - "\xe1\xb8\x87" /* 
b with macron below */ - "\xe1\xb9\xbd" /* v with tilde */ - "\xe1\xb8\x9d" /* e with breve and cedilla */ - "\xc8\x91" /* r with double grave */ - "\xc5\xa1" /* s with caron */ - "\xe1\xb8\xaf" /* i with diaeresis and acute */ - "\xe1\xbb\x9d" /* o with grave and hook */ - "\xe1\xb9\x8b"; /* n with circumflex below */ + const char *str; + const char *result; + svn_membuf_t buf; - /* Normalized: NFC, case folded */ - static const char nfc_casefold[] = - "\xe1\xb9\xa9" /* s with dot above and below */ - "\xc5\xaf" /* u with ring */ - "\xe1\xb8\x87" /* b with macron below */ - "\xe1\xb9\xbd" /* v with tilde */ - "\xe1\xb8\x9d" /* e with breve and cedilla */ - "\xc8\x91" /* r with double grave */ - "\xc5\xa1" /* s with caron */ - "\xe1\xb8\xaf" /* i with diaeresis and acute */ - "\xe1\xbb\x9d" /* o with grave and hook */ - "\xe1\xb9\x8b"; /* n with circumflex below */ + svn_membuf__create(&buf, 0, pool); - /* Normalized: NFD */ - static const char nfd[] = - "S\xcc\xa3\xcc\x87" /* S with dot above and below */ - "u\xcc\x8a" /* u with ring */ - "b\xcc\xb1" /* b with macron below */ - "v\xcc\x83" /* v with tilde */ - "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */ - "r\xcc\x8f" /* r with double grave */ - "s\xcc\x8c" /* s with caron */ - "i\xcc\x88\xcc\x81" /* i with diaeresis and acute */ - "o\xcc\x9b\xcc\x80" /* o with grave and hook */ - "n\xcc\xad"; /* n with circumflex below */ + /* ASCII string */ + str = "Subversion"; + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf)); + SVN_TEST_STRING_ASSERT(result, "Subversion"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf)); + SVN_TEST_STRING_ASSERT(result, "subversion"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf)); + SVN_TEST_STRING_ASSERT(result, "Subversion"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf)); + SVN_TEST_STRING_ASSERT(result, "subversion"); - /* Mixed, denormalized */ - static const char mixup[] = - 
"S\xcc\x87\xcc\xa3" /* S with dot above and below */ - "\xc5\xaf" /* u with ring */ - "b\xcc\xb1" /* b with macron below */ - "\xe1\xb9\xbd" /* v with tilde */ - "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */ - "\xc8\x91" /* r with double grave */ - "s\xcc\x8c" /* s with caron */ - "\xe1\xb8\xaf" /* i with diaeresis and acute */ - "o\xcc\x80\xcc\x9b" /* o with grave and hook */ - "\xe1\xb9\x8b"; /* n with circumflex below */ + /* M (u with diaeresis) (sharp s) en */ + str = "M" "\xc3\xbc" "\xc3\x9f" "en"; + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf)); + SVN_TEST_STRING_ASSERT(result, "M" "\xc3\xbc" "\xc3\x9f" "en"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf)); + SVN_TEST_STRING_ASSERT(result, "m" "\xc3\xbc" "ssen"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf)); + SVN_TEST_STRING_ASSERT(result, "Mu" "\xc3\x9f" "en"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf)); + SVN_TEST_STRING_ASSERT(result, "mussen"); - /* Invalid UTF-8 */ - static const char invalid[] = - "\xe1\xb9\xa8" /* S with dot above and below */ - "\xc5\xaf" /* u with ring */ - "\xe1\xb8\x87" /* b with macron below */ - "\xe1\xb9\xbd" /* v with tilde */ - "\xe1\xb8\x9d" /* e with breve and cedilla */ - "\xc8\x91" /* r with double grave */ - "\xc5\xa1" /* s with caron */ - "\xe1\xb8\xaf" /* i with diaeresis and acute */ - "\xe6" /* Invalid byte */ - "\xe1\xb9\x8b"; /* n with circumflex below */ + /* Na (i with diaeresis) vet (e with acute), decomposed */ + str = "Nai" "\xcc\x88" "vete" "\xcc\x81"; + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf)); + SVN_TEST_STRING_ASSERT(result, "Na" "\xc3\xaf" "vet" "\xc3\xa9"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf)); + SVN_TEST_STRING_ASSERT(result, "na" "\xc3\xaf" "vet" "\xc3\xa9"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf)); + SVN_TEST_STRING_ASSERT(result, "Naivete"); + 
SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf)); + SVN_TEST_STRING_ASSERT(result, "naivete"); - const char *result; - svn_membuf_t buf; + /* (I with dot above) stanbul */ + str = "\xc4\xb0" "stanbul"; + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf)); + SVN_TEST_STRING_ASSERT(result, "\xc4\xb0" "stanbul"); - svn_membuf__create(&buf, 0, pool); - SVN_ERR(svn_utf__casefold(&result, nfc, strlen(nfc), &buf)); - SVN_TEST_STRING_ASSERT(result, nfc_casefold); - SVN_ERR(svn_utf__casefold(&result, nfd, strlen(nfd), &buf)); - SVN_TEST_STRING_ASSERT(result, nfc_casefold); - SVN_ERR(svn_utf__casefold(&result, mixup, strlen(mixup), &buf)); - SVN_TEST_STRING_ASSERT(result, nfc_casefold); + /* The Latin Capital Letter I with Dot Above (0130) should fold into + Latin Small Letter I (0069) with Combining Dot Above (0307) per full + mapping in http://www.unicode.org/Public/UNIDATA/CaseFolding.txt */ + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf)); + SVN_TEST_STRING_ASSERT(result, "i" "\xcc\x87" "stanbul"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf)); + SVN_TEST_STRING_ASSERT(result, "Istanbul"); + SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf)); + SVN_TEST_STRING_ASSERT(result, "istanbul"); - SVN_TEST_ASSERT_ERROR(svn_utf__casefold(&result, invalid, strlen(invalid), - &buf), + /* Invalid UTF-8 */ + str = "a" "\xe6" "bc"; + SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str), + FALSE, FALSE, &buf), SVN_ERR_UTF8PROC_ERROR); + SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str), + TRUE, FALSE, &buf), + SVN_ERR_UTF8PROC_ERROR); + SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str), + FALSE, TRUE, &buf), + SVN_ERR_UTF8PROC_ERROR); + SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str), + TRUE, TRUE, &buf), + SVN_ERR_UTF8PROC_ERROR); return SVN_NO_ERROR; } @@ -1011,8 +1000,8 @@ static struct svn_test_descriptor_t test_funcs[] = "test 
svn_utf__utf{16,32}_to_utf8"), SVN_TEST_PASS2(test_utf_normalize, "test svn_utf__normalize"), - SVN_TEST_PASS2(test_utf_casefold, - "test svn_utf__casefold"), + SVN_TEST_PASS2(test_utf_xfrm, + "test svn_utf__xfrm"), SVN_TEST_NULL };