git-find-base: rewritten to use newer design

Instead of just checking a few extra headers, add support for all header types that git can output, and be a bit more robust about renames and deletes. For now, assume a rename won't attempt to overwrite an existing file. Ideally we should do a full check on the base as well. This model should allow support of all git commit types, not just simple ones. In addition, in order to support a certain patch format, remove duplicate diff chunks by default, and add an option (--keep) to keep them. This is primarily useful for patches which contain the same diff both inline and as an attachment. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
author: Jacob Keller <jacob.e.keller@intel.com> 2014-04-04 15:06:52 -0700
committer: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> 2014-04-07 13:20:02 +0300
commit: 2e61508db63a8a523cad78a931782e938d05eb9a (patch)
tree: 41f6a29996f0c4cd90235b3f7601f18855d1d923
parent: 00f5aa1988db9671c6958ab6683f10ec4c66725a (diff)
download: aiaiai-2e61508db63a8a523cad78a931782e938d05eb9a.tar.gz
1 files changed, 193 insertions, 113 deletions
diff --git a/helpers/git-find-base b/helpers/git-find-base
index cf4b939..db7900f 100755
--- a/helpers/git-find-base
+++ b/helpers/git-find-base
@@ -55,157 +55,237 @@ standard out will be a single commit id. If nothing was found, no standard
 output will be generated, and this utility will exit with a non-zero exit code.
 
 Options:
+    -k, --keep  Keep duplicate diff chunks.
     -?, -h      Show this text and exit.
 END
 }
 
-# subroutine to check whether two blob indexes match, (ie: one
-# contains the other regardless of which one is larger)
 sub match_index {
-    my ( $x, $y ) = @_;
+    my ($x, $y) = @_;
 
-    my $lx = length $x;
-    my $ly = length $y;
+    return ( index $x,$y ) == 0 or ( index $y,$x ) == 0;
+}
+
+sub hash_comp(\%\%) {
+    my %x = %{ shift @_ };
+    my %y = %{ shift @_ };
 
-    # Find which length is shortest
-    my $l = $lx >= $ly ? $ly : $lx;
+    ( grep { not ( ( exists $y{$_} ) and $x{$_} eq $y{$_} ) } keys %x ) == 0;
+}
 
-    # Truncate the indexes to the shortest
-    my $tx = substr $lx,0,$l;
-    my $ty = substr $ly,0,$l;
+sub path_exists(\%$) {
+    my %tree = %{ shift @_ };
+    my $path = shift @_;
 
-    # Return the match
-    return $tx == $ty;
+    return exists $tree{$path} and $tree{$path}->{status} eq "";
 }
 
+my $duplicates = '';
+
 Getopt::Long::Configure("pass_through");
-GetOptions('h|?' => sub { show_usage; exit 0; });
+GetOptions('h|?' => sub { show_usage; exit 0; },
+           'keep!' => \$duplicates );
 
 # Slurp the contents into $mbox for processing
 my $mbox = do { local $/; <STDIN> };
 
-# Hash of file-index relations
-my %files = ();
-
-# Split mbox apart by diff lines, preserving the filename we matched against,
-# as well as the full index line. This should handle even the rename case from
-# git diff output. Note, we assume that mbox has correct ordering of patches.
-while ($mbox =~ /^diff --git [iwcoab]\/(?<oldfile>\S+) [iwcoab]\/(?<newfile>\S+)\n(?<new>new file mode [0-7]+\n)?(?<rename>^similarity index .*\n)?(?<from>^rename from \g{oldfile}\n)?(?<to>^rename to \g{newfile}\n)?(?<index>^index .*$)?\n/gm) {
-    my $file = $+{oldfile};
-    my $rename = $+{similarity};
-    my $new = $+{new};
-    my $index = $+{index};
-    $file or die "Could not parse file from diff context.";
-
-    # If we get a rename without an index, simply note that a file was renamed,
-    # and ignore it, since there were no real changes.
-    if ( $rename and not $index ) {
-        print STDERR "Found rename of $file\n";
-        next;
-    }
+# Array of hrefs to chunk contexts
+my @chunks = ();
 
-    # Check the index line for proper formatting.
-    $index =~ /^index ([0-9a-f]+)[.]{2}([0-9a-f]+) [0-7]{6}$/;
-    my $initialshortblob = $1;
-    my $modifiedshortblob = $2;
-    $initialshortblob or die "Could not parse short blob index from diff context. Is the mbox corrupted?";
-
-    # If we have a new file, store the initial setting as "new", and keep the
-    # modified blob for checking future changes in this series.
-    if ($new) {
-        print STDERR "Found new file at $file\n";
-        $files{$file}{"initial"} = "new";
-        $files{$file}{"modified"} = $modifiedshortblob;
-        next;
-    };
-
-    # If we already have this file, simply update the modified blob index
-    if (exists $files{$file}) {
-        # Check if the blob matches the last known result of the file
-        if (match_index($initialshortblob, $files{$file}{"modified"})) {
-            print STDERR "Found further modification of $file, ($initialshortblob -> $modifiedshortblob).\n";
-            $files{$file}{"modified"} = $modifiedshortblob;
-            next;
-        } elsif (match_index($modifiedshortblob, $files{$file}{"modified"}) and match_index($initialshortblob, $files{$file}{"initial"})) {
-            print STDERR "Found duplicate modification of $file. Possible duplicate patch blob, or an incorrect patch format? Ignoring for now.\n";
-        } else {
-            die "Found futher modification of $file that does not match expected index, ($initialshortblob -> $modifiedshortblob). Is the patch sequence out of order?";
+# The possible list of extended headers supported by git-diff output
+my $extended_headers = qr/(old mode|new mode|deleted file mode|new file mode|copy from|copy to|rename from|rename to|similarity index|dissimilarity index|index)/;
+
+# Split mbox apart by diff header chunks, finding a diff line followed by any number of extended header lines
+while ($mbox =~ /^(?<chunk>diff (?s:.*?))(?=^(?!$extended_headers))/gm) {
+
+    # Capture the block
+    my $rawchunk = $+{chunk};
+
+    print STDERR "Found a diff chunk\n";
+    print STDERR $rawchunk;
+
+    # Check whether it has expected format
+    if ( $rawchunk =~ /^diff --git [iwcoab]\/(?<oldpath>\S+) [iwcoab]\/(?<newpath>\S+)$/m ) {
+        # We have a standard git diff chunk. Now, we need to parse the extended
+        # headers from the section.
+
+        my %chunk = ();
+        $chunk{oldpath} = $+{oldpath};
+        $chunk{newpath} = $+{newpath};
+        $chunk{oldindex} = "";
+        $chunk{newindex} = "";
+        $chunk{action} = "none";
+
+        if ( $rawchunk =~ /^index (?<oldindex>[0-9a-fA-F]+)[.]{2}(?<newindex>[0-9a-fA-F]+)( (?<mode>[0-7]{6}))?$/m ) {
+            $chunk{oldindex} = $+{oldindex};
+            $chunk{newindex} = $+{newindex};
+            $chunk{oldmode} = $+{mode};
+            $chunk{newmode} = $+{mode};
         }
-    }
 
-    print STDERR "Found modification to $file, ($initialshortblob -> $modifiedshortblob).\n";
 
-    # We have to process the short blob index into a full index value using
-    # git-rev-parse, otherwise the lookup will fail.
-    open my $rev_parse, '-|', 'git' => 'rev-parse' => '--verify', $initialshortblob
-        or die "Couldn't open pipe to git-rev-parse: ", $!;
+        if ( $rawchunk =~ /^old mode (?<mode>[0-7]{6})$/m ) {
+            $chunk{oldmode} = $+{mode};
+        }
 
-    my $initialblob = <$rev_parse>;
-    close $rev_parse or die "Couldn't expand the blob index: ", $? >> 8;
-    chomp $initialblob;
+        if ( $rawchunk =~ /^new mode (?<mode>[0-7]{6})$/m ) {
+            $chunk{newmode} = $+{mode};
+        }
+
+        if ( $rawchunk =~ /^deleted file mode (?<mode>[0-7]{6})$/m ) {
+            $chunk{oldmode} = $+{mode};
+            $chunk{action} = "delete";
+        }
 
-    # Store the initial blob, as well as the index after modification
-    $files{$file}{"initial"} = $initialblob;
-    $files{$file}{"modified"} = $modifiedshortblob;
+        if ( $rawchunk =~ /^new file mode (?<mode>[0-7]{6})$/m ) {
+            $chunk{newmode} = $+{mode};
+            $chunk{action} = "create";
+        }
+
+        if ( $rawchunk =~ /^rename from \Q$chunk{oldpath}\E$/m ) {
+            $chunk{action} = "rename";
+        }
+
+        if ( $rawchunk =~ /^rename to \Q$chunk{newpath}\E$/m ) {
+            $chunk{action} = "rename";
+        }
+
+        if ( $rawchunk =~ /^similarity index (?<similarity>[0-9]{1,3}%)$/m ) {
+            $chunk{similarity} = $+{similarity};
+        }
+
+        if ( $rawchunk =~ /^dissimilarity index (?<dissimilarity>[0-9]{1,3}%)$/m ) {
+            $chunk{similarity} = 100 - $+{dissimilarity};
+        }
+
+        if ( not $duplicates and ( grep { hash_comp ( %$_, %chunk ) } @chunks ) > 0 ) {
+            print STDERR "Skipping duplicate diff chunk. Disable this behavior with --keep.\n";
+        } else {
+            push (@chunks, \%chunk);
+        }
+
+    } elsif ( $rawchunk =~ /^diff --(combined|cc) (?<newfile>\S+)$/m ) {
+        # We can't use combined diff formats, since these are used for multiple
+        # parents, and are not suitable for this process
+        print STDERR "Found a combined diff format, indicating a merge. We can't find a base commit for a merge!\n";
+        exit 1;
+    } else {
+        # Non git-formats are not supported, as we need the index information
+        print STDERR "Found a diff chunk, but it does not have a recognized format.\n";
+        exit 1;
+    }
 }
 
-# Subroutine to check a commit treeish, ensuring that every blob is present at
-# the correct path. This allows us to determine whether the commit is "good",
-# ie: has all the blobs required to cleanly apply the patch, or not.
+# We have collated all the chunks. Now we need to loop over a series of commits
+# based on user input. For each commit, we will try to build up the list of
+# changes and see if it is applicable.
 sub check_commit {
     my ( $commit ) = @_;
 
-    # Loop through every blob/path combination from the mbox, and check if the
-    # ls-tree on that path matches the blob we need.
-    for my $path ( keys %files) {
-        my $blob = $files{$path}{"initial"};
-
-        # We shouldn't try to find a new file, as it won't exist yet
-        continue if $blob eq "new";
-
-        # Fail with die on the pipe since this should always work.
-        open my $ls_tree, '-|', 'git' => 'ls-tree' => '--full-tree' => $commit => '--', $path
-            or die "Couldn't open pipe to git-ls-tree: ", $!;
-
-        # Return here if we fail to find the file, because it might not yet
-        # exist.
-        my $tree = <$ls_tree>;
-        close $ls_tree or do {
-            print STDERR "Couldn't find matching tree: ", $? >> 8;
-            return;
-        };
-        chomp $tree;
-
-        # Check the output formatting to ensure we didn't get any errors
-        $tree =~ /\A[0-7]{6} (\S+) (\S+)/ or do {
-            print STDERR "Unexpected git-ls-tree output.\n";
-            return;
-        };
-
-        # Return undef if they don't match. This will ensure we bail at the
-        # first conflicting blob, without forcing extra checks.
-        return if $2 ne $blob;
+    # Our current view of the tree
+    my %tree = ();
+
+    # For each chunk, we need to build up the tree, looking up a path via
+    # git-ls-tree the first time we see it. We want to see if our patch could cleanly apply to the given commit.
+    for my $chunk ( @chunks ) {
+
+        # If the path doesn't exist yet, just fill in some information about it
+        # from the real tree
+        if ( not exists $tree{$chunk->{oldpath}} ) {
+            open my $ls_tree, '-|', 'git', => 'ls-tree' => '--full-tree' => $commit => '--' => $chunk->{oldpath}
+                or die "Couldn't open pipe to git-ls-tree: ", $!;
+
+            my $ls_tree_output = <$ls_tree>;
+            close $ls_tree or do {
+                print STDERR "git-ls-tree failed: ", $? >> 8;
+                return 0;
+            };
+
+            # Only add the tree object if we actually have output
+            if ( defined $ls_tree_output ) {
+                chomp $ls_tree_output;
+                $ls_tree_output =~ /\A([0-7]{6}) (blob|tree|commit) (\S+)/ or do {
+                    print STDERR "Unexpected git-ls-tree output.\n";
+                    return 0;
+                };
+
+                $tree{$chunk->{oldpath}} = {
+                    mode => $1,
+                    index => $3,
+                    status => "",
+                };
+            }
+        }
+
+        # We have now added any known information about this path to the tree.
+        # We will now attempt to modify the tree based on the contents of the
+        # chunk.
+
+        if ( $chunk->{action} eq "create" ) {
+            if ( path_exists( %tree, $chunk->{oldpath} ) ) {
+                # This path already exists, so we can't add it!
+                print STDERR "$chunk->{oldpath} already exists.\n";
+                return 0;
+            } else {
+                # We found a path that either doesn't exist, or has already
+                # been renamed or deleted. We can simply add it here now.
+                $tree{$chunk->{oldpath}}->{mode} = $chunk->{mode};
+                $tree{$chunk->{oldpath}}->{index} = $chunk->{newindex};
+                $tree{$chunk->{oldpath}}->{status} = "";
+            }
+        } else {
+            if ( not path_exists( %tree, $chunk->{oldpath} ) ) {
+                # This path no longer exists, we can't modify it.
+                print STDERR "$chunk->{oldpath} does not exist.\n";
+                return 0;
+            } else {
+                if ( not match_index( $tree{$chunk->{oldpath}}->{index}, $chunk->{oldindex} ) ) {
+                    print STDERR "$chunk->{oldpath} does not have matching index.\n";
+                    return 0;
+                }
+
+                if ( $chunk->{newindex} ) {
+                    $tree{$chunk->{oldpath}}->{index} = $chunk->{newindex};
+                }
+
+                if ( $chunk->{newmode} ) {
+                    $tree{$chunk->{oldpath}}->{mode} = $chunk->{newmode};
+                }
+
+                # Handle special case here for rename and delete actions
+                if ( $chunk->{action} eq "rename" ) {
+                    if ( path_exists( %tree, $chunk->{newpath} ) ) {
+                        print STDERR "$chunk->{newpath} already exists.\n";
+                        return 0;
+                    }
+
+                    $tree{$chunk->{newpath}} = $tree{$chunk->{oldpath}};
+                    $tree{$chunk->{oldpath}}->{status} = "renamed";
+                } elsif ( $chunk->{action} eq "delete" ) {
+                    $tree{$chunk->{oldpath}}->{status} = "deleted";
+                }
+            }
+        }
     }
 
-    # If we get here, then everything matched above, so we can return true.
+    # If we get here, that means we had no issues verifying each chunk, and we
+    # can exit true.
     return 1;
 }
 
-# Open the log pipe. Pass all of our ARGV directly to the log command
-open my $log, '-|', git => log => @ARGV, '--pretty=format:%T %H'
+# Open the git-log pipe. Pass all of our ARGV directly to the git-log command.
+open my $log, '-|', 'git' => 'log' => @ARGV => '--pretty=%H'
     or die "Couldn't open pipe to git-log: ", $!;
 
-# Loop through each commit in the log, checking if it's tree and hash have all
-# the valid blobs. User can easily modify the log command via options to limit
-# the scope, or reverse ordering. By default we find the most recent commit
-# which has the required blobs.
+# Loop through each commit in the list, checking if the diff chunks can apply
+# cleanly to the commit. Easily allow modifying which commits are checked via
+# options to the git-log command, which allows limiting what can be checked.
 while ( <$log> ) {
     chomp;
-    my ($tree, $commit) = split " ", $_;
 
-    if (check_commit $commit) {
+    if (check_commit $_) {
         # Print the commit hash we found, and exit with a good return status.
-        print "$commit\n";
+        print "$_\n";
         exit 0;
     }
 }
author	Jacob Keller <jacob.e.keller@intel.com>	2014-04-04 15:06:52 -0700
committer	Artem Bityutskiy <artem.bityutskiy@linux.intel.com>	2014-04-07 13:20:02 +0300
commit	2e61508db63a8a523cad78a931782e938d05eb9a (patch)
tree	41f6a29996f0c4cd90235b3f7601f18855d1d923
parent	00f5aa1988db9671c6958ab6683f10ec4c66725a (diff)
download	aiaiai-2e61508db63a8a523cad78a931782e938d05eb9a.tar.gz