[svn.haxx.se] · SVN Dev · SVN Users · SVN Org · TSVN Dev · TSVN Users · Subclipse Dev · Subclipse Users · this month's index

work-in-progress unidiff parsing diff

From: Stefan Sperling <stsp_at_elego.de>
Date: Mon, 13 Apr 2009 22:23:09 +0100

Hi,

Greg said I should post this here for review before it
becomes in any way mature...

This is part of an attempt to get rid of our dependency on
GNU patch. It teaches Subversion to parse unidiffs.

The long-term plan is to compute the original and modified texts
of individual diff hunks (original is context and - lines,
modified is context and + lines), getting the corresponding
latest hunk text from the target file of the patch (possibly
with offset and fuzz), and using svn_diff_mem_string_diff3()
and svn_diff_mem_string_output_merge2() to do a 3-way merge of
each hunk.

With some additional magic we could then replace the original
text hunks in the target file with the 3way merge result.

This way we don't rely on an external patch tool, and we get real
conflict markers inside the target file, instead of .rej files as
patch produces them.

That's the idea, anyway.

So the first patch below is a first draft of the unidiff parser
for you to rip apart.

The second diff is just for debugging and makes
'svn patch /some/patchfile' not do any work for unidiffs,
but show how the parser currently interprets unidiffs instead.
Example output:

   $ cat /tmp/DIFF
   Index: alpha
   ===================================================================
   --- alpha (revision 2)
   +++ alpha (working copy)
   @@ -1 +1,4 @@
   -alpha
   + alpha
   +
   +
   +a
   $ svn patch /tmp/DIFF
   svn: patch old: alpha
   svn: patch new: alpha
   svn: patch text
   ===
   -alpha
   + alpha
   +
   +
   +a
   ===
   svn: patch original_start: 1
   svn: patch original_length: 1
   svn: patch modified_start: 1
   svn: patch modified_length: 4

Comments appreciated.
When parsing strings in C additional eyes can only help.

BTW, http://www.artima.com/weblogs/viewpost.jsp?thread=4293
was helpful.

Thanks,
Stefan

Index: subversion/libsvn_subr/patch.c
===================================================================
--- subversion/libsvn_subr/patch.c (revision 37195)
+++ subversion/libsvn_subr/patch.c (working copy)
@@ -18,6 +18,7 @@

 #include <apr_lib.h>
 #include <apr_pools.h>
+#include <apr_errno.h>

 #include "svn_types.h"
 #include "svn_error.h"
@@ -529,3 +530,297 @@ svn_patch__read_tuple(svn_stream_t *from,
   va_end(ap);
   return err;
 }
+
+/* Functions for parsing unidiffs */
+
+svn_error_t *
+svn_patch__get_next_patch(svn_patch_t **patch,
+ apr_file_t *patch_file,
+ const char *eol_str,
+ apr_pool_t *scratch_pool,
+ apr_pool_t *result_pool)
+{
+ const char *minus = "--- ";
+ const char *plus = "+++ ";
+ const char *indicator;
+ svn_stream_t *s;
+ apr_off_t pos;
+ svn_boolean_t eof, in_header;
+ apr_pool_t *iterpool;
+
+ if (apr_file_eof(patch_file) == APR_EOF)
+ {
+ /* No more patches here. */
+ *patch = NULL;
+ return SVN_NO_ERROR;
+ }
+
+ /* Get current seek position -- APR has no ftell() :( */
+ pos = 0;
+ apr_file_seek(patch_file, APR_CUR, &pos);
+
+ /* Record what we already know about the patch. */
+ *patch = apr_pcalloc(result_pool, sizeof(**patch));
+ (*patch)->patch_file = patch_file;
+ (*patch)->eol_str = eol_str;
+
+ /* Get a stream to read lines from the patch file.
+ * The file should not be closed when we close the stream so
+ * make sure it is disowned. */
+ s = svn_stream_from_aprfile2(patch_file, TRUE, scratch_pool);
+
+ indicator = minus;
+ in_header = FALSE;
+ iterpool = svn_pool_create(scratch_pool);
+ do
+ {
+ svn_stringbuf_t *line;
+
+ svn_pool_clear(iterpool);
+
+ /* Read a line from the stream. */
+ SVN_ERR(svn_stream_readline(s, &line, eol_str, &eof, iterpool));
+
+ /* See if we have a diff header. */
+ if (line->len > strlen(indicator) &&
+ strncmp(line->data, indicator, strlen(indicator)) == 0)
+ {
+ /* Looks like it, try to find the filename. */
+ apr_size_t tab = svn_stringbuf_find_char_backward(line, '\t');
+ if (tab >= line->len)
+ /* Not found... */
+ continue;
+
+ line->data[tab] = '\0';
+
+ if ((! in_header) && strcmp(indicator, minus) == 0)
+ {
+ /* First line of header. */
+ (*patch)->old_filename =
+ svn_string_create(line->data + strlen(indicator), result_pool);
+ indicator = plus;
+ in_header = TRUE;
+ }
+ else if (in_header && strcmp(indicator, plus) == 0)
+ {
+ /* Second line of header. */
+ (*patch)->new_filename =
+ svn_string_create(line->data + strlen(indicator), result_pool);
+ in_header = FALSE;
+ break; /* All good! */
+ }
+ else
+ in_header = FALSE;
+ }
+ }
+ while (! eof);
+ svn_pool_destroy(iterpool);
+
+ if ((*patch)->old_filename == NULL || (*patch)->new_filename == NULL)
+ /* Something went wrong, just discard the result. */
+ *patch = NULL;
+
+ SVN_ERR(svn_stream_close(s));
+
+ return SVN_NO_ERROR;
+}
+
+static svn_boolean_t
+parse_offset_t(apr_off_t *offset, const char *number)
+{
+ apr_int64_t parsed_offset;
+
+ errno = 0; /* clear errno for safety */
+ parsed_offset = apr_atoi64(number);
+ if (errno == ERANGE)
+ return FALSE;
+
+ /* apr_off_t is not the same size on all platforms.
+ * We assume it is either 4 or 8 bytes. */
+ if (sizeof(*offset) < sizeof(parsed_offset) && parsed_offset > APR_UINT32_MAX)
+ /* will overflow */
+ return FALSE;
+
+ *offset = parsed_offset;
+ return TRUE;
+}
+
+static svn_boolean_t
+parse_range(apr_off_t *start, apr_off_t *length, char *range)
+{
+ char *comma;
+
+ if (strlen(range) == 0)
+ return FALSE;
+
+ comma = strstr(range, ",");
+ if (comma)
+ {
+ if (strlen(comma + 1) > 0)
+ {
+ /* Try to parse the length. */
+ if (! parse_offset_t(length, comma + 1))
+ return FALSE;
+
+ /* Snip off the end of the string,
+ * so we can comfortably parse the line
+ * number the hunk starts at. */
+ *comma = '\0';
+ }
+ else
+ /* A comma but no length? */
+ return FALSE;
+ }
+ else
+ {
+ *length = 1;
+ }
+
+ /* Try to parse the line number the hunk starts at. */
+ return parse_offset_t(start, range);
+}
+
+svn_error_t *
+svn_patch__get_next_hunk(svn_hunk_t **hunk,
+ svn_patch_t *patch,
+ apr_pool_t *scratch_pool,
+ apr_pool_t *result_pool)
+{
+ const char *atat = "@@";
+ svn_boolean_t eof, in_hunk, hunk_seen;
+ apr_off_t pos, last_line;
+ svn_stringbuf_t *diff_text;
+ svn_stream_t *s;
+ apr_pool_t *iterpool;
+
+ diff_text = svn_stringbuf_create("", scratch_pool);
+ in_hunk = FALSE;
+ hunk_seen = FALSE;
+ *hunk = apr_pcalloc(result_pool, sizeof(**hunk));
+
+ /* Get a stream to read lines from the patch file.
+ * The file should not be closed when we close the stream so
+ * make sure it is disowned. */
+ s = svn_stream_from_aprfile2(patch->patch_file, TRUE, scratch_pool);
+
+ iterpool = svn_pool_create(scratch_pool);
+ do
+ {
+ svn_stringbuf_t *line;
+
+ svn_pool_clear(iterpool);
+
+ /* Remember the current line's offset, and read the line. */
+ last_line = pos;
+ SVN_ERR(svn_stream_readline(s, &line, patch->eol_str, &eof, iterpool));
+ if (! eof)
+ {
+ /* Update line offset for next iteration.
+ * APR has no ftell() :( */
+ pos = 0;
+ apr_file_seek(patch->patch_file, APR_CUR, &pos);
+ }
+
+ if (in_hunk)
+ {
+ char c = line->data[0];
+ if (c == ' ' || c == '-' || c == '+')
+ {
+ svn_stringbuf_appendbytes(diff_text, line->data, line->len);
+ svn_stringbuf_appendbytes(diff_text, patch->eol_str,
+ strlen(patch->eol_str));
+ hunk_seen = TRUE;
+ }
+ else
+ {
+ in_hunk = FALSE;
+ break; /* Hunk was empty or has been read. */
+ }
+ }
+ else if ((! in_hunk) && strncmp(line->data, atat, strlen(atat)) == 0)
+ {
+ /* Looks like we have a hunk header, let's try to rip it apart. */
+ char *p;
+ svn_stringbuf_t *range;
+
+ p = line->data + strlen(atat);
+ if (*p != ' ')
+ /* No. */
+ continue;
+ p++;
+ if (*p != '-')
+ /* Nah... */
+ continue;
+ /* OK, this may be worth allocating some memory for... */
+ range = svn_stringbuf_create_ensure(31, iterpool);
+ p++;
+ while (*p && *p != ' ')
+ {
+ svn_stringbuf_appendbytes(range, p, 1);
+ p++;
+ }
+ if (*p != ' ')
+ /* No no no... */
+ continue;
+
+ /* Try to parse the first range. */
+ if (! parse_range(&(*hunk)->original_start, &(*hunk)->original_length,
+ range->data))
+ continue;
+
+ /* Clear the stringbuf so we can reuse it for the second range. */
+ svn_stringbuf_setempty(range);
+ p++;
+ if (*p != '+')
+ /* Eeek! */
+ continue;
+ /* OK, this may be worth copying... */
+ p++;
+ while (*p && *p != ' ')
+ {
+ svn_stringbuf_appendbytes(range, p, 1);
+ p++;
+ }
+ if (*p != ' ')
+ /* No no no... */
+ continue;
+
+ /* Check for trailing @@ */
+ p++;
+ if (strcmp(p, atat) != 0)
+ continue;
+
+ /* Try to parse the second range. */
+ if (! parse_range(&(*hunk)->modified_start, &(*hunk)->modified_length,
+ range->data))
+ continue;
+
+ /* Hunk header is good. */
+ in_hunk = TRUE;
+ }
+ }
+ while (! eof);
+ svn_pool_destroy(iterpool);
+
+ SVN_ERR(svn_stream_close(s));
+
+ if (! eof)
+ /* Rewind to the start of the line just read, so subsequent calls
+ * to this function or svn_patch__get_next_patch() don't end
+ * up skipping the line -- it may contain a patch or hunk header. */
+ apr_file_seek(patch->patch_file, APR_SET, &last_line);
+
+ if (hunk_seen)
+ {
+ /* Set the hunk's diff text. */
+ (*hunk)->diff_text = svn_string_create(diff_text->data, result_pool);
+
+ /* Compute original and modified texts. */
+ /* TODO */
+ }
+ else
+ /* Something went wrong, just discard the result. */
+ *hunk = NULL;
+
+ return SVN_NO_ERROR;
+}
Index: subversion/include/private/svn_patch.h
===================================================================
--- subversion/include/private/svn_patch.h (revision 37195)
+++ subversion/include/private/svn_patch.h (working copy)
@@ -22,12 +22,67 @@
 #include <apr_pools.h>
 #include <apr_tables.h>

+#include "svn_types.h"
 #include "svn_io.h"

 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */

+/* A single hunk inside a patch */
+typedef struct svn_hunk_t {
+ /* The hunk's text as it appeared in the patch file,
+ * without range information. */
+ svn_string_t *diff_text;
+
+ /* The original and modified texts in the hunk range.
+ * Derived from the diff text. */
+ svn_string_t *original_text;
+ svn_string_t *modified_text;
+
+ /* Hunk ranges as they appeared in the patch file. */
+ apr_off_t original_start;
+ apr_off_t original_length;
+ apr_off_t modified_start;
+ apr_off_t modified_length;
+} svn_hunk_t;
+
+/* Data type to manage parsing of patches. */
+/* TODO: Should be made opaque when done with testing. */
+typedef struct svn_patch_t {
+ /* The patch file itself. */
+ apr_file_t *patch_file;
+
+ /* The old and new file names as retrieved from the patch file. */
+ svn_string_t *old_filename;
+ svn_string_t *new_filename;
+
+ /* EOL string used in patch file. */
+ const char *eol_str;
+} svn_patch_t;
+
+/* Return the next *PATCH in PATCH_FILE. The patch file is assumed to
+ * have consistent EOL-markers as specified in EOL_STR.
+ * If no patch can be found, set *PATCH to NULL.
+ * Allocate results in RESULT_POOL.
+ * Use SCRATCH_POOL for all other allocations. */
+svn_error_t *
+svn_patch__get_next_patch(svn_patch_t **patch,
+ apr_file_t *patch_file,
+ const char *eol_str,
+ apr_pool_t *scratch_pool,
+ apr_pool_t *result_pool);
+
+/* Return the next *HUNK from a PATCH.
+ * If no hunk can be found, set *HUNK to NULL.
+ * Allocate results in RESULT_POOL.
+ * Use SCRATCH_POOL for all other allocations. */
+svn_error_t *
+svn_patch__get_next_hunk(svn_hunk_t **hunk,
+ svn_patch_t *patch,
+ apr_pool_t *scratch_pool,
+ apr_pool_t *result_pool);
+
 /* Output -- Writing */

 /* Append a command into @a target in a printf-like fashion.

Index: subversion/libsvn_client/patch.c
===================================================================
--- subversion/libsvn_client/patch.c (revision 37195)
+++ subversion/libsvn_client/patch.c (working copy)
@@ -38,6 +38,7 @@
 #include <assert.h>

 #include "svn_private_config.h"
+#include "private/svn_patch.h"

 
 /*** Code. ***/
@@ -1800,8 +1801,49 @@ svn_client_patch(const char *patch_path,
     }

   /* Now proceed with the unidiff bytes. */
+#if 0
   SVN_ERR(svn_wc_apply_unidiff(patch_path, force, outfile, errfile,
                                ctx->config, pool));
+#endif
+ {
+ svn_patch_t *p;
+ svn_hunk_t *h;
+ apr_file_t *f;
+ apr_status_t status;

+ status = apr_file_open(&f, patch_path, APR_READ | APR_BINARY, 0, pool);
+ if (status)
+ return svn_error_wrap_apr(status, "Cannot open %s", patch_path);
+
+ do
+ {
+ SVN_ERR(svn_patch__get_next_patch(&p, f, APR_EOL_STR, pool, pool));
+ if (p)
+ {
+ printf("svn: patch old: %s\nsvn: patch new: %s\n",
+ p->old_filename->data, p->new_filename->data);
+ do
+ {
+ SVN_ERR(svn_patch__get_next_hunk(&h, p, pool, pool));
+ if (h)
+ {
+ printf("svn: patch text\n===\n%s===\n"
+ "svn: patch original_start: %lld\n"
+ "svn: patch original_length: %lld\n"
+ "svn: patch modified_start: %lld\n"
+ "svn: patch modified_length: %lld\n",
+ h->diff_text->data,
+ h->original_start,
+ h->original_length,
+ h->modified_start,
+ h->modified_length);
+ }
+ }
+ while (h);
+ }
+ }
+ while (p);
+ }
+
   return SVN_NO_ERROR;
 }
Received on 2009-04-13 23:26:41 CEST

This is an archived mail posted to the Subversion Dev mailing list.

This site is subject to the Apache Privacy Policy and the Apache Public Forum Archive Policy.