about | summary | refs | log | tree | commit | diff | stats
path: root/git-p4.py
diff options
context:
space:
mode:
author Joel Holdsworth <jholdsworth@nvidia.com> 2021-12-16 13:46:19 +0000
committer Junio C Hamano <gitster@pobox.com> 2021-12-16 14:06:36 -0800
commit 70c0d55349a50707166f9fb9a9720ac1c0530217 (patch)
tree a014fa9b5833bb0f6639117ce1ed57ed501f0e8b /git-p4.py
parent 4cf67ae1b6c80eb8a63cc8dd752bd3951cffa104 (diff)
download git-70c0d55349a50707166f9fb9a9720ac1c0530217.tar.gz
git-p4: resolve RCS keywords in bytes not utf-8
RCS keywords are strings that are replaced with information from Perforce. Examples include $Date$, $Author$, $File$, $Change$, etc. Perforce resolves these by replacing them with their expanded values when files are synced, but Git's data model requires these expanded values to be converted back into their unexpanded form. Previously, git-p4.py would implement this behaviour through the use of regular expressions. However, the regular expression substitution was applied to decoded strings, i.e. the content of incoming commit diffs was first decoded from bytes as UTF-8 text, processed with regular expressions, then encoded back to bytes. Not only is this behaviour inefficient, but it is also the cause of a common failure on text files containing invalid UTF-8 data. For files created in Windows, CP1252 Smart Quote Characters (0x93 and 0x94) are seen fairly frequently. These codes are invalid in UTF-8, so if the script encountered any file containing them, on Python 2 the symbols would be corrupted, and on Python 3 the script would fail with an exception. This patch replaces this decoding/encoding with bytes object regular expressions, so that the substitution is performed directly upon the source data with no conversions. A test for smart quote handling has been added to the t9810-git-p4-rcs.sh test suite.
Signed-off-by: Joel Holdsworth <jholdsworth@nvidia.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'git-p4.py')
-rwxr-xr-x git-p4.py 15
1 files changed, 8 insertions, 7 deletions
diff --git a/git-p4.py b/git-p4.py
index 7845210e69..986595bef0 100755
--- a/git-p4.py
+++ b/git-p4.py
@@ -56,8 +56,8 @@ defaultBlockSize = 1<<20
p4_access_checked = False
-re_ko_keywords = re.compile(r'\$(Id|Header)(:[^$\n]+)?\$')
-re_k_keywords = re.compile(r'\$(Id|Header|Author|Date|DateTime|Change|File|Revision)(:[^$\n]+)?\$')
+re_ko_keywords = re.compile(br'\$(Id|Header)(:[^$\n]+)?\$')
+re_k_keywords = re.compile(br'\$(Id|Header|Author|Date|DateTime|Change|File|Revision)(:[^$\n]+)?\$')
def p4_build_cmd(cmd):
"""Build a suitable p4 command line.
@@ -1754,9 +1754,9 @@ class P4Submit(Command, P4UserMap):
# Attempt to zap the RCS keywords in a p4 controlled file matching the given regex
(handle, outFileName) = tempfile.mkstemp(dir='.')
try:
- with os.fdopen(handle, "w") as outFile, open(file, "r") as inFile:
+ with os.fdopen(handle, "wb") as outFile, open(file, "rb") as inFile:
for line in inFile.readlines():
- outFile.write(regexp.sub(r'$\1$', line))
+ outFile.write(regexp.sub(br'$\1$', line))
# Forcibly overwrite the original file
os.unlink(file)
shutil.move(outFileName, file)
@@ -2089,7 +2089,9 @@ class P4Submit(Command, P4UserMap):
regexp = p4_keywords_regexp_for_file(file)
if regexp:
# this file is a possibility...look for RCS keywords.
- for line in read_pipe_lines(["git", "diff", "%s^..%s" % (id, id), file]):
+ for line in read_pipe_lines(
+ ["git", "diff", "%s^..%s" % (id, id), file],
+ raw=True):
if regexp.search(line):
if verbose:
print("got keyword match on %s in %s in %s" % (regex.pattern, line, file))
@@ -3020,8 +3022,7 @@ class P4Sync(Command, P4UserMap):
# even though in theory somebody may want that.
regexp = p4_keywords_regexp_for_type(type_base, type_mods)
if regexp:
- contents = [encode_text_stream(regexp.sub(
- r'$\1$', ''.join(decode_text_stream(c) for c in contents)))]
+ contents = [regexp.sub(br'$\1$', c) for c in contents]
if self.largeFileSystem:
(git_mode, contents) = self.largeFileSystem.processContent(git_mode, relPath, contents)