aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKonstantin Ryabitsev <konstantin@linuxfoundation.org>2020-11-04 13:51:07 -0500
committerKonstantin Ryabitsev <konstantin@linuxfoundation.org>2020-11-04 17:17:41 -0500
commit61ac811b2b2d6b4e223db18c9261975185363e29 (patch)
tree28f6603dfe31b1f86e0f06363b6402b6e94ab31f
parentccf6f244970e281acad5c4375250844bdf1f578b (diff)
downloadgrokmirror-61ac811b2b2d6b4e223db18c9261975185363e29.tar.gz
Add initial support for objstore preload bundles
When performing initial cloning of large repository collections that contain a lot of forks, we end up spending a lot of time waiting on objstore operations. If we are able to preload objstore repositories from the mirror we're cloning from, then we can avoid a lot of unnecessary disk churn. Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r--grokmirror.conf10
-rw-r--r--grokmirror/__init__.py20
-rwxr-xr-xgrokmirror/fsck.py35
-rwxr-xr-xgrokmirror/pull.py59
4 files changed, 108 insertions, 16 deletions
diff --git a/grokmirror.conf b/grokmirror.conf
index 966d630..52c91d5 100644
--- a/grokmirror.conf
+++ b/grokmirror.conf
@@ -103,6 +103,10 @@ manifest = ${site}/manifest.js.gz
# even if it hasn't changed.
# See contrib/gitolite/* for example commands to use with gitolite.
#manifest_command = /usr/local/bin/grok-get-gl-manifest.sh
+#
+# If the remote is providing pre-generated preload bundles, list their URL
+# here; otherwise we'll try ${site}/objstore/preload/
+#preload_bundle_url = https://some-cdn-site.com/preload/
# Used by grok-pull
[pull]
@@ -300,6 +304,12 @@ baselines = */kernel/git/next/linux-next.git
# which will give it priority when creating packs.
islandcores = */kernel/git/torvalds/linux.git
#
+# Generate preload bundles for objstore repos and put them into this
+# location. Unless you are running a major mirroring hub site, you
+# do not want this enabled. See corresponding preload_bundle_url
+# entry in the [remote] section.
+#preload_bundle_outdir = /some/http/accessible/path
+#
# If there are any critical errors, the report will be sent to root. You
# can change the settings below to configure report delivery to suit
# your needs:
diff --git a/grokmirror/__init__.py b/grokmirror/__init__.py
index 66f76b1..9f1b732 100644
--- a/grokmirror/__init__.py
+++ b/grokmirror/__init__.py
@@ -423,6 +423,15 @@ def objstore_virtref(fullpath):
return vh.hexdigest()[:12]
+def objstore_trim_virtref(obstrepo, virtref):
+ args = ['for-each-ref', '--format', 'delete %(refname)', f'refs/virtual/{virtref}']
+ ecode, out, err = run_git_command(obstrepo, args)
+ if ecode == 0 and len(out):
+ out += '\n'
+ args = ['update-ref', '--stdin']
+ run_git_command(obstrepo, args, stdin=out.encode())
+
+
def remove_from_objstore(obstrepo, fullpath):
# is fullpath still using us?
altrepo = get_altrepo(fullpath)
@@ -436,16 +445,7 @@ def remove_from_objstore(obstrepo, fullpath):
os.unlink(os.path.join(fullpath, 'objects', 'info', 'alternates'))
virtref = objstore_virtref(fullpath)
- # Find all refs we have from this sibling
- args = ['for-each-ref', '--format', 'delete %(refname)', 'refs/virtual/%s' % virtref]
- ecode, out, err = run_git_command(obstrepo, args)
- if ecode == 0 and len(out):
- out += '\n'
- args = ['update-ref', '--stdin']
- ecode, out, err = run_git_command(obstrepo, args, stdin=out.encode())
- # Remove the remote
- if ecode > 0:
- return False
+ objstore_trim_virtref(obstrepo, virtref)
args = ['remote', 'remove', virtref]
run_git_command(obstrepo, args)
diff --git a/grokmirror/fsck.py b/grokmirror/fsck.py
index 0f1c34a..e800d55 100755
--- a/grokmirror/fsck.py
+++ b/grokmirror/fsck.py
@@ -29,6 +29,8 @@ import fnmatch
import io
import smtplib
+from pathlib import Path
+
from email.message import EmailMessage
from fcntl import lockf, LOCK_EX, LOCK_UN, LOCK_NB
@@ -52,6 +54,15 @@ def log_errors(fullpath, cmdargs, lines):
break
+def gen_preload_bundle(fullpath, config):
+ outdir = config['fsck'].get('preload_bundle_outdir')
+ Path(outdir).mkdir(parents=True, exist_ok=True)
+ bname = '%s.bundle' % os.path.basename(fullpath)[:-4]
+ args = ['bundle', 'create', os.path.join(outdir, bname), '--all']
+ logger.info(' bundling: %s', bname)
+ grokmirror.run_git_command(fullpath, args)
+
+
def get_blob_set(fullpath):
bset = set()
size = 0
@@ -1043,6 +1054,7 @@ def fsck_mirror(config, force=False, repack_only=False, conn_only=False,
set_baseline = False
set_islandcore = False
new_islandcore = False
+ valid_virtrefs = set()
for virtref, childpath in my_remotes:
# Is it still relevant?
if childpath not in amap[obstrepo]:
@@ -1050,6 +1062,7 @@ def fsck_mirror(config, force=False, repack_only=False, conn_only=False,
grokmirror.remove_from_objstore(obstrepo, childpath)
logger.info('%s: removed remote %s (no longer used)', os.path.basename(obstrepo), childpath)
continue
+ valid_virtrefs.add(virtref)
# Does it need fetching?
fetch = True
@@ -1123,7 +1136,24 @@ def fsck_mirror(config, force=False, repack_only=False, conn_only=False,
if os.path.exists(os.path.join(obstrepo, 'grokmirror.repack')):
repack_requested = True
- if obstrepo not in status or new_islandcore or repack_requested:
+ # Go through all our refs and find all stale virtrefs
+ args = ['for-each-ref', '--format=%(refname)', 'refs/virtual/']
+ trimmed_virtrefs = set()
+ ecode, out, err = grokmirror.run_git_command(obstrepo, args)
+ if ecode == 0 and out:
+ for line in out.split('\n'):
+ chunks = line.split('/')
+ if len(chunks) < 3:
+ # Where did this come from?
+ logger.debug('Weird ref %s in objstore repo %s', line, obstrepo)
+ continue
+ virtref = chunks[2]
+ if virtref not in valid_virtrefs and virtref not in trimmed_virtrefs:
+ logger.info(' trim: stale virtref %s', virtref)
+ grokmirror.objstore_trim_virtref(obstrepo, virtref)
+ trimmed_virtrefs.add(virtref)
+
+ if obstrepo not in status or new_islandcore or trimmed_virtrefs or repack_requested:
# We don't use obstrepo fingerprints, so we set it to None
status[obstrepo] = {
'lastcheck': 'never',
@@ -1200,6 +1230,9 @@ def fsck_mirror(config, force=False, repack_only=False, conn_only=False,
status[fullpath]['lastfullrepack'] = todayiso
status[fullpath]['lastcheck'] = todayiso
status[fullpath]['nextcheck'] = nextcheck.strftime('%F')
+ # Do we need to generate a preload bundle?
+ if config['fsck'].get('preload_bundle_outdir') and grokmirror.is_obstrepo(fullpath, obstdir):
+ gen_preload_bundle(fullpath, config)
logger.info(' next: %s', status[fullpath]['nextcheck'])
else:
logger.warning('Repacking %s was unsuccessful', fullpath)
diff --git a/grokmirror/pull.py b/grokmirror/pull.py
index 39da4cc..d3d9260 100755
--- a/grokmirror/pull.py
+++ b/grokmirror/pull.py
@@ -236,6 +236,43 @@ def spa_worker(config, q_spa, pauseonload):
logger.info(' spa: %s (done: %s)', gitdir, ', '.join(done))
+def objstore_repo_preload(config, obstrepo):
+ bname = os.path.basename(obstrepo)[:-4]
+ obstdir = os.path.realpath(config['core'].get('objstore'))
+ purl = config['remote'].get('preload_bundle_url')
+ if not purl:
+ # wing it -- it only costs us a single http request check
+ site = config['remote'].get('site')
+ if site and site.startswith('http'):
+ purl = '%s/objstore/preload' % site.rstrip('/')
+ else:
+ return
+ burl = '%s/%s.bundle' % (purl.rstrip('/'), bname)
+ bfile = os.path.join(obstdir, '%s.bundle' % bname)
+ sess = grokmirror.get_requests_session()
+ resp = sess.get(burl, stream=True)
+ resp.raise_for_status()
+ logger.info(' objstore: getting preload bundle for %s', bname)
+ with open(bfile, 'wb') as fh:
+ for chunk in resp.iter_content(chunk_size=8192):
+ fh.write(chunk)
+ resp.close()
+
+ # Now we clone from it into the objstore repo
+ ecode, out, err = grokmirror.run_git_command(obstrepo, ['remote', 'add', '--mirror=fetch', '_preload', bfile])
+ if ecode == 0:
+ logger.info(' objstore: preloading %s from the bundle', bname)
+ args = ['remote', 'update', '_preload']
+ ecode, out, err = grokmirror.run_git_command(obstrepo, args)
+ if ecode > 0:
+ logger.info(' objstore: not able to preload, will clone repo-by-repo')
+ else:
+ logger.info(' objstore: successful preload')
+ # Regardless of what happened, we remove _preload and the bundle, then move on
+ grokmirror.run_git_command(obstrepo, ['remote', 'rm', '_preload'])
+ os.unlink(bfile)
+
+
def pull_worker(config, q_pull, q_spa, q_done):
toplevel = os.path.realpath(config['core'].get('toplevel'))
obstdir = os.path.realpath(config['core'].get('objstore'))
@@ -263,6 +300,11 @@ def pull_worker(config, q_pull, q_spa, q_done):
q_pull.put((gitdir, repoinfo, action, q_action))
continue
+ altrepo = grokmirror.get_altrepo(fullpath)
+ obstrepo = None
+ if altrepo and grokmirror.is_obstrepo(altrepo, obstdir):
+ obstrepo = altrepo
+
if action == 'purge':
# Is it a symlink?
if os.path.islink(fullpath):
@@ -308,6 +350,14 @@ def pull_worker(config, q_pull, q_spa, q_done):
if action in ('pull', 'objstore_migrate'):
r_fp = repoinfo.get('fingerprint')
my_fp = grokmirror.get_repo_fingerprint(toplevel, gitdir, force=True)
+ if obstrepo:
+ o_obj_info = grokmirror.get_repo_obj_info(obstrepo)
+ if o_obj_info.get('count') == '0' and o_obj_info.get('in-pack') == '0' and not my_fp:
+ # Try to preload the objstore repo directly
+ try:
+ objstore_repo_preload(config, obstrepo)
+ except: # noqa
+ pass
if r_fp != my_fp:
# Make sure we have the remote set up
@@ -317,7 +367,7 @@ def pull_worker(config, q_pull, q_spa, q_done):
logger.info(' fetch: %s', gitdir)
retries = 1
while True:
- success = pull_repo(toplevel, gitdir, remotename)
+ success = pull_repo(fullpath, remotename)
if success:
break
retries += 1
@@ -534,8 +584,7 @@ def run_post_update_hook(toplevel, gitdir, hookscripts):
logger.info('Hook Stdout (%s): %s', gitdir, output)
-def pull_repo(toplevel, gitdir, remotename):
- fullpath = os.path.join(toplevel, gitdir.lstrip('/'))
+def pull_repo(fullpath, remotename):
args = ['remote', 'update', remotename, '--prune']
retcode, output, error = grokmirror.run_git_command(fullpath, args)
@@ -562,9 +611,9 @@ def pull_repo(toplevel, gitdir, remotename):
else:
debug.append(line)
if debug:
- logger.debug('Stderr (%s): %s', gitdir, '\n'.join(debug))
+ logger.debug('Stderr (%s): %s', fullpath, '\n'.join(debug))
if warn:
- logger.warning('Stderr (%s): %s', gitdir, '\n'.join(warn))
+ logger.warning('Stderr (%s): %s', fullpath, '\n'.join(warn))
return success