[svn.haxx.se] · SVN Dev · SVN Users · SVN Org · TSVN Dev · TSVN Users · Subclipse Dev · Subclipse Users · this month's index

Re: [PATCH] Optimize svn log --limit

From: Peter N. Lundblad <peter_at_famlundblad.se>
Date: 2005-05-11 21:27:52 CEST

On Tue, 10 May 2005, D.J. Heap wrote:

> This patch optimizes 'svn log --limit' using the method proposed by Greg
> Hudson on the previous 'Quick Analysis' thread. For me, on a 10k revision
> repo a 'svn log --limit 10 svn://localhost/svn-repo/trunk' went from around
> 10 seconds to less than a second. It passes ra_local tests on Window and
> Linux; I would appreciate a thorough review.
>
Cool! See below.

> However, I could not think of a way to optimize the forward-thru-history use
> case very much since there is no reasonable way to read through history
> forward. That is, 'svn log --limit 10 -r 0:HEAD
> svn://localhost/svn-repo/trunk'.
>
Neither can I.

> Index: subversion/libsvn_repos/log.c
> ===================================================================
> --- subversion/libsvn_repos/log.c (revision 14678)
> +++ subversion/libsvn_repos/log.c (working copy)
> +/* This helper to svn_repos_get_logs3 is used to get the path's
> + history. */
> static svn_error_t *
> -history_to_revs_array (void *baton,
> - const char *path,
> - svn_revnum_t revision,
> - apr_pool_t *pool)
> +get_history (struct path_info *info,
> + svn_fs_t *fs,
> + svn_boolean_t cross_copies,
> + svn_repos_authz_func_t authz_read_func,
> + void *authz_read_baton,
> + svn_revnum_t start)
> {
> - apr_array_header_t *revs_array = baton;
> - APR_ARRAY_PUSH (revs_array, svn_revnum_t) = revision;
> + apr_pool_t* temppool;
> +
> + SVN_ERR (svn_fs_history_prev (&info->hist, info->hist, cross_copies,
> + info->newpool));
> + if (! info->hist)
> + return SVN_NO_ERROR;
> +
> + /* Fetch the location information for this history step. */
> + SVN_ERR (svn_fs_history_location (&info->path, &info->history_rev,
> + info->hist, info->newpool));
> +
> + /* Is the history item readable? If not, done with path. */
> + if (authz_read_func)
> + {
> + svn_boolean_t readable;
> + svn_fs_root_t *history_root;
> + SVN_ERR (svn_fs_revision_root (&history_root, fs,
> + info->history_rev,
> + info->newpool));
> + SVN_ERR (authz_read_func (&readable, history_root,
> + info->path,
> + authz_read_baton,
> + info->newpool));
> + if (! readable)
> + info->hist = NULL;
> + }
> +
> + /* Now we can clear the old pool. */
> + temppool = info->oldpool;
> + info->oldpool = info->newpool;
> + svn_pool_clear (temppool);
> + info->newpool = temppool;
> +
> + /* If this history item predates our START revision then
> + don't fetch any more for this path. */
> + if (info->history_rev < start)
> + info->hist = NULL;

Two spaces for indentation.

> return SVN_NO_ERROR;
> }
>
> +/* This helper to svn_repos_get_logs3 is used to advance the path's
> + history to the next one *if* the revision it is at is equal to
> + or greater than current. The changed flag is only touched if
> + this path has history in the current rev. */

Use capitals for argument names in internal docstrings.

> +static svn_error_t *
> +check_history (svn_boolean_t *changed,
> + struct path_info *info,
> + svn_fs_t *fs,
> + svn_revnum_t current,
> + svn_boolean_t cross_copies,
> + svn_repos_authz_func_t authz_read_func,
> + void *authz_read_baton,
> + svn_revnum_t start)
...

> svn_error_t *
> svn_repos_get_logs3 (svn_repos_t *repos,
> const apr_array_header_t *paths,
> @@ -221,7 +327,9 @@
> apr_pool_t *subpool = svn_pool_create (pool);
> svn_fs_t *fs = repos->fs;
> apr_array_header_t *revs = NULL;
> - int count = 0;
> + svn_revnum_t hist_end = end;
> + svn_revnum_t hist_start = start;
> + int i;
>
> SVN_ERR (svn_fs_youngest_rev (&head, fs, pool));
>
> @@ -241,6 +349,13 @@
> (SVN_ERR_FS_NO_SUCH_REVISION, 0,
> _("No such revision %ld"), end);
>
> + /* Get an ordered copy of the start and end revisions. */
> + if (start > end)
> + {
> + hist_start = end;
> + hist_end = start;
> + }
> +
> /* If paths were specified, then we only really care about revisions
> in which those paths were changed. So we ask the filesystem for
> all the revisions in which any of the paths was changed.
> @@ -258,107 +373,102 @@
> && (! svn_path_is_empty (APR_ARRAY_IDX (paths, 0, const char *))))
> || (paths->nelts > 1)))
> {
> - /* If there is only one path, we'll just get its sorted changed
> - revisions. Else, we'll be combining all our findings into a
> - hash (to remove duplicates) and then generating a sorted
> - array from that hash. */
> - if (paths->nelts == 1)
> + svn_revnum_t current;
> + apr_array_header_t *histories;
> + svn_boolean_t any_histories_left = TRUE;
> +
> + revs = apr_array_make (pool, limit ? limit : 10,
> + sizeof (svn_revnum_t));
> + histories = apr_array_make (subpool, paths->nelts,
> + sizeof(struct path_info *));

Missing space before paren.

> + /* Create a history object for each path so we can walk through
> + them all at the same time until we have all changes or limit
> + is reached.
> +
> + There is some pool fun going on due to the fact that we have
> + to hold on to the old pool with the history before we can
> + get the next history. */
> + for (i = 0; i < paths->nelts; i++)
> {
> - /* Get the changed revisions for this path. */
> - const char *this_path = APR_ARRAY_IDX (paths, 0, const char *);
> - revs = apr_array_make (pool, 64, sizeof (svn_revnum_t));
> - SVN_ERR (svn_repos_history2 (fs, this_path,
> - history_to_revs_array, revs,
> - authz_read_func, authz_read_baton,
> - start, end,
> - strict_node_history ? FALSE : TRUE,
> - pool));
> + const char *this_path = APR_ARRAY_IDX (paths, i, const char *);
> + svn_fs_root_t *root;
> + struct path_info *info = apr_palloc (subpool,
> + sizeof(struct path_info));

Space.

> +
> + SVN_ERR (path_history_root (&root, fs, this_path, hist_end,
> + authz_read_func, authz_read_baton,
> + pool));

Why not subpool?

> + info->root = root;
> + info->path = this_path;
> + info->oldpool = svn_pool_create (subpool);
> + info->newpool = svn_pool_create (subpool);
> + SVN_ERR (svn_fs_node_history (&info->hist, info->root, info->path,
> + info->oldpool));
> + SVN_ERR (get_history (info, fs,
> + strict_node_history ? FALSE : TRUE,
> + authz_read_func, authz_read_baton,
> + hist_start));
> + *((struct path_info **) apr_array_push (histories)) = info;
> }
> - else
> +
> + /* Loop through all the revisions in the range and add any
> + where a path was changed to the array. */
> + for (current = hist_end;
> + current >= hist_start && any_histories_left;
> + --current)

Instead of walking through all 150000 revisions, couldn't you keep
track of the next interesting revision (the largest revision of the
revisions in the histories array and jump directly to it? Seems like a
not so important improvement, but anyway.

> {
> - int i;
> - apr_hash_t *all_revs = apr_hash_make (pool);
> - apr_hash_index_t *hi;
> -
> - /* And the search is on... */
> - for (i = 0; i < paths->nelts; i++)
> + svn_boolean_t changed = FALSE;
> + any_histories_left = FALSE;
> + for (i = 0; i < histories->nelts; i++)
> {
> - const char *this_path = APR_ARRAY_IDX (paths, i, const char *);
> - apr_array_header_t *changed_revs =
> - apr_array_make (pool, 64, sizeof (svn_revnum_t));
> - int j;
> + struct path_info *info = APR_ARRAY_IDX (histories, i,
> + struct path_info *);
>
> - /* Get the changed revisions for this path, and add them to
> - the hash (this will eliminate duplicates). */
> - SVN_ERR (svn_repos_history2 (fs, this_path,
> - history_to_revs_array, changed_revs,
> - authz_read_func, authz_read_baton,
> - start, end,
> - strict_node_history ? FALSE : TRUE,
> - pool));
> - for (j = 0; j < changed_revs->nelts; j++)
> - {
> - /* We're re-using the memory allocated for the array
> - here in order to avoid more allocations. */
> - svn_revnum_t *chrev =
> - (((svn_revnum_t *)(changed_revs)->elts) + j);
> - apr_hash_set (all_revs, chrev, sizeof (chrev), (void *)1);
> - }
> + /* Check history for this path in current rev. */
> + SVN_ERR (check_history (&changed, info, fs, current,
> + strict_node_history ? FALSE : TRUE,
> + authz_read_func, authz_read_baton,
> + hist_start));
> + if ( info->hist != NULL )

Spacing around condition expression.

> + any_histories_left = TRUE;
> }
>
> - /* Now that we have a hash of all the revisions in which any of
> - our paths changed, we can convert that back into a sorted
> - array. */
> - revs = apr_array_make (pool, apr_hash_count (all_revs),
> - sizeof (svn_revnum_t));
> - for (hi = apr_hash_first (pool, all_revs);
> - hi;
> - hi = apr_hash_next (hi))
> - {
> - const void *key;
> - svn_revnum_t revision;
> -
> - apr_hash_this (hi, &key, NULL, NULL);
> - revision = *((const svn_revnum_t *)key);
> - (*((svn_revnum_t *) apr_array_push (revs))) = revision;
> - }
> + /* If any of the paths changed in this rev then add it. */
> + if (changed)
> + *(svn_revnum_t*) apr_array_push (revs) = current;
>

And here comes the thing that makes this review worth reading: if the
user wanted the log in reverse order (youngest to oldest, as is the
default), instead of adding all revisions to an array, why not just
send the revision to the caller right here? This would make log really
streamy, even without a limit argument.

> - /* Now sort the array */
> - qsort ((revs)->elts, (revs)->nelts, (revs)->elt_size,
> - svn_sort_compare_revisions);
> + /* See if we've got enough yet -- we can only stop early if
> + they wanted history in normal order and specified a limit. */

I assume "normal order" means reverse order? Might be better to be
explicit?

> + if (limit && start > end && revs->nelts >= limit)
> + break;
> }
> -
> - /* If no revisions were found for these entries, we have nothing
> - to show. Just return now before we break a sweat. */
> - if (! (revs && revs->nelts))
> - return SVN_NO_ERROR;
> }
> + else
> + {
> + /* Build a rev list with all the revs in the range specified. */
> + int count = hist_end - hist_start + 1;
> + if ( limit && start > end && count > limit )

Spacing.

> + count = limit;
> + revs = apr_array_make (pool, count, sizeof (svn_revnum_t));
> + for (i = 0; i < count; ++i)
> + *(svn_revnum_t*) apr_array_push (revs) = hist_end - i;
> + }
>
> - for (this_rev = start;
> - ((start >= end) ? (this_rev >= end) : (this_rev <= end));
> - ((start >= end) ? this_rev-- : this_rev++))
> + /* Work loop for processing the revisions we found. */
> + for (i = 0; i < revs->nelts; ++i)

If you follow my suggestion above, skip this if start > end.

> {
> svn_string_t *author, *date, *message;
> apr_hash_t *r_props, *changed_paths = NULL;
>
> svn_pool_clear (subpool);
>
> - /* If we have a list of revs for use, check to make sure this is
> - one of them. */
> - if (revs)
> - {
> - int i, matched = 0;
> - for (i = 0; ((i < revs->nelts) && (! matched)); i++)
> - {
> - if (this_rev == ((svn_revnum_t *)(revs->elts))[i])
> - matched = 1;
> - }
> + /* Reverse the order we read the list of revs if needed. */
> + if ( start > end )

Spacing, if this doesn't go away.

> + this_rev = APR_ARRAY_IDX (revs, i, svn_revnum_t);
> + else
> + this_rev = APR_ARRAY_IDX (revs, revs->nelts - i - 1, svn_revnum_t);
>
> - if (! matched)
> - continue;
> - }
> -
> - SVN_ERR (svn_fs_revision_proplist (&r_props, fs, this_rev, pool));
> + SVN_ERR (svn_fs_revision_proplist (&r_props, fs, this_rev, subpool));
> author = apr_hash_get (r_props, SVN_PROP_REVISION_AUTHOR,
> APR_HASH_KEY_STRING);
> date = apr_hash_get (r_props, SVN_PROP_REVISION_DATE,
> @@ -416,7 +526,9 @@
> message ? message->data : NULL,
> subpool));
>
> - if (++count == limit)
> + /* We still need to check limit here in case range was specified in
> + forward time order and we couldn't optimize as well. */
> + if ( limit && i >= limit )

Spacing.

> break;
> }
>
>

Regards,
//Peter

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org
Received on Wed May 11 23:38:06 2005

This is an archived mail posted to the Subversion Dev mailing list.

This site is subject to the Apache Privacy Policy and the Apache Public Forum Archive Policy.