diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-11-04 13:51:07 -0500 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-11-04 17:17:41 -0500 |
commit | 61ac811b2b2d6b4e223db18c9261975185363e29 (patch) | |
tree | 28f6603dfe31b1f86e0f06363b6402b6e94ab31f | |
parent | ccf6f244970e281acad5c4375250844bdf1f578b (diff) | |
download | grokmirror-61ac811b2b2d6b4e223db18c9261975185363e29.tar.gz |
Add initial support for objstore preload bundles
When performing initial cloning of large repository collections that
contain a lot of forks, we end up spending a lot of time waiting on
objstore operations. If we are able to preload objstore repositories
from the mirror we're cloning from, then we can avoid a lot of
unnecessary disk churn.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r-- | grokmirror.conf | 10 | ||||
-rw-r--r-- | grokmirror/__init__.py | 20 | ||||
-rwxr-xr-x | grokmirror/fsck.py | 35 | ||||
-rwxr-xr-x | grokmirror/pull.py | 59 |
4 files changed, 108 insertions, 16 deletions
diff --git a/grokmirror.conf b/grokmirror.conf index 966d630..52c91d5 100644 --- a/grokmirror.conf +++ b/grokmirror.conf @@ -103,6 +103,10 @@ manifest = ${site}/manifest.js.gz # even if it hasn't changed. # See contrib/gitolite/* for example commands to use with gitolite. #manifest_command = /usr/local/bin/grok-get-gl-manifest.sh +# +# If the remote is providing pre-generated preload bundles, list the path +# here, otherwise we'll try {site}/objstore/preload/ +#preload_bundle_url = https://some-cdn-site.com/preload/ # Used by grok-pull [pull] @@ -300,6 +304,12 @@ baselines = */kernel/git/next/linux-next.git # which will give it priority when creating packs. islandcores = */kernel/git/torvalds/linux.git # +# Generate preload bundles for objstore repos and put them into this +# location. Unless you are running a major mirroring hub site, you +# do not want this enabled. See corresponding preload_bundle_url +# entry in the [remote] section. +#preload_bundle_outdir = /some/http/accessible/path +# # If there are any critical errors, the report will be sent to root. You # can change the settings below to configure report delivery to suit # your needs: diff --git a/grokmirror/__init__.py b/grokmirror/__init__.py index 66f76b1..9f1b732 100644 --- a/grokmirror/__init__.py +++ b/grokmirror/__init__.py @@ -423,6 +423,15 @@ def objstore_virtref(fullpath): return vh.hexdigest()[:12] +def objstore_trim_virtref(obstrepo, virtref): + args = ['for-each-ref', '--format', 'delete %(refname)', f'refs/virtual/{virtref}'] + ecode, out, err = run_git_command(obstrepo, args) + if ecode == 0 and len(out): + out += '\n' + args = ['update-ref', '--stdin'] + run_git_command(obstrepo, args, stdin=out.encode()) + + def remove_from_objstore(obstrepo, fullpath): # is fullpath still using us? 
altrepo = get_altrepo(fullpath) @@ -436,16 +445,7 @@ def remove_from_objstore(obstrepo, fullpath): os.unlink(os.path.join(fullpath, 'objects', 'info', 'alternates')) virtref = objstore_virtref(fullpath) - # Find all refs we have from this sibling - args = ['for-each-ref', '--format', 'delete %(refname)', 'refs/virtual/%s' % virtref] - ecode, out, err = run_git_command(obstrepo, args) - if ecode == 0 and len(out): - out += '\n' - args = ['update-ref', '--stdin'] - ecode, out, err = run_git_command(obstrepo, args, stdin=out.encode()) - # Remove the remote - if ecode > 0: - return False + objstore_trim_virtref(obstrepo, virtref) args = ['remote', 'remove', virtref] run_git_command(obstrepo, args) diff --git a/grokmirror/fsck.py b/grokmirror/fsck.py index 0f1c34a..e800d55 100755 --- a/grokmirror/fsck.py +++ b/grokmirror/fsck.py @@ -29,6 +29,8 @@ import fnmatch import io import smtplib +from pathlib import Path + from email.message import EmailMessage from fcntl import lockf, LOCK_EX, LOCK_UN, LOCK_NB @@ -52,6 +54,15 @@ def log_errors(fullpath, cmdargs, lines): break +def gen_preload_bundle(fullpath, config): + outdir = config['fsck'].get('preload_bundle_outdir') + Path(outdir).mkdir(parents=True, exist_ok=True) + bname = '%s.bundle' % os.path.basename(fullpath)[:-4] + args = ['bundle', 'create', os.path.join(outdir, bname), '--all'] + logger.info(' bundling: %s', bname) + grokmirror.run_git_command(fullpath, args) + + def get_blob_set(fullpath): bset = set() size = 0 @@ -1043,6 +1054,7 @@ def fsck_mirror(config, force=False, repack_only=False, conn_only=False, set_baseline = False set_islandcore = False new_islandcore = False + valid_virtrefs = set() for virtref, childpath in my_remotes: # Is it still relevant? 
if childpath not in amap[obstrepo]: @@ -1050,6 +1062,7 @@ def fsck_mirror(config, force=False, repack_only=False, conn_only=False, grokmirror.remove_from_objstore(obstrepo, childpath) logger.info('%s: removed remote %s (no longer used)', os.path.basename(obstrepo), childpath) continue + valid_virtrefs.add(virtref) # Does it need fetching? fetch = True @@ -1123,7 +1136,24 @@ def fsck_mirror(config, force=False, repack_only=False, conn_only=False, if os.path.exists(os.path.join(obstrepo, 'grokmirror.repack')): repack_requested = True - if obstrepo not in status or new_islandcore or repack_requested: + # Go through all our refs and find all stale virtrefs + args = ['for-each-ref', '--format=%(refname)', 'refs/virtual/'] + trimmed_virtrefs = set() + ecode, out, err = grokmirror.run_git_command(obstrepo, args) + if ecode == 0 and out: + for line in out.split('\n'): + chunks = line.split('/') + if len(chunks) < 3: + # Where did this come from? + logger.debug('Weird ref %s in objstore repo %s', line, obstrepo) + continue + virtref = chunks[2] + if virtref not in valid_virtrefs and virtref not in trimmed_virtrefs: + logger.info(' trim: stale virtref %s', virtref) + grokmirror.objstore_trim_virtref(obstrepo, virtref) + trimmed_virtrefs.add(virtref) + + if obstrepo not in status or new_islandcore or trimmed_virtrefs or repack_requested: # We don't use obstrepo fingerprints, so we set it to None status[obstrepo] = { 'lastcheck': 'never', @@ -1200,6 +1230,9 @@ def fsck_mirror(config, force=False, repack_only=False, conn_only=False, status[fullpath]['lastfullrepack'] = todayiso status[fullpath]['lastcheck'] = todayiso status[fullpath]['nextcheck'] = nextcheck.strftime('%F') + # Do we need to generate a preload bundle? 
+ if config['fsck'].get('preload_bundle_outdir') and grokmirror.is_obstrepo(fullpath, obstdir): + gen_preload_bundle(fullpath, config) logger.info(' next: %s', status[fullpath]['nextcheck']) else: logger.warning('Repacking %s was unsuccessful', fullpath) diff --git a/grokmirror/pull.py b/grokmirror/pull.py index 39da4cc..d3d9260 100755 --- a/grokmirror/pull.py +++ b/grokmirror/pull.py @@ -236,6 +236,43 @@ def spa_worker(config, q_spa, pauseonload): logger.info(' spa: %s (done: %s)', gitdir, ', '.join(done)) +def objstore_repo_preload(config, obstrepo): + bname = os.path.basename(obstrepo)[:-4] + obstdir = os.path.realpath(config['core'].get('objstore')) + purl = config['remote'].get('preload_bundle_url') + if not purl: + # wing it -- it only costs us a single http request check + site = config['remote'].get('site') + if site and site.startswith('http'): + purl = '%s/objstore/preload' % site.rstrip('/') + else: + return + burl = '%s/%s.bundle' % (purl.rstrip('/'), bname) + bfile = os.path.join(obstdir, '%s.bundle' % bname) + sess = grokmirror.get_requests_session() + resp = sess.get(burl, stream=True) + resp.raise_for_status() + logger.info(' objstore: getting preload bundle for %s', bname) + with open(bfile, 'wb') as fh: + for chunk in resp.iter_content(chunk_size=8192): + fh.write(chunk) + resp.close() + + # Now we clone from it into the objstore repo + ecode, out, err = grokmirror.run_git_command(obstrepo, ['remote', 'add', '--mirror=fetch', '_preload', bfile]) + if ecode == 0: + logger.info(' objstore: preloading %s from the bundle', bname) + args = ['remote', 'update', '_preload'] + ecode, out, err = grokmirror.run_git_command(obstrepo, args) + if ecode > 0: + logger.info(' objstore: not able to preload, will clone repo-by-repo') + else: + logger.info(' objstore: successful preload') + # Regardless of what happened, we remove _preload and the bundle, then move on + grokmirror.run_git_command(obstrepo, ['remote', 'rm', '_preload']) + os.unlink(bfile) + + def 
pull_worker(config, q_pull, q_spa, q_done): toplevel = os.path.realpath(config['core'].get('toplevel')) obstdir = os.path.realpath(config['core'].get('objstore')) @@ -263,6 +300,11 @@ def pull_worker(config, q_pull, q_spa, q_done): q_pull.put((gitdir, repoinfo, action, q_action)) continue + altrepo = grokmirror.get_altrepo(fullpath) + obstrepo = None + if altrepo and grokmirror.is_obstrepo(altrepo, obstdir): + obstrepo = altrepo + if action == 'purge': # Is it a symlink? if os.path.islink(fullpath): @@ -308,6 +350,14 @@ def pull_worker(config, q_pull, q_spa, q_done): if action in ('pull', 'objstore_migrate'): r_fp = repoinfo.get('fingerprint') my_fp = grokmirror.get_repo_fingerprint(toplevel, gitdir, force=True) + if obstrepo: + o_obj_info = grokmirror.get_repo_obj_info(obstrepo) + if o_obj_info.get('count') == '0' and o_obj_info.get('in-pack') == '0' and not my_fp: + # Try to preload the objstore repo directly + try: + objstore_repo_preload(config, obstrepo) + except: # noqa + pass if r_fp != my_fp: # Make sure we have the remote set up @@ -317,7 +367,7 @@ def pull_worker(config, q_pull, q_spa, q_done): logger.info(' fetch: %s', gitdir) retries = 1 while True: - success = pull_repo(toplevel, gitdir, remotename) + success = pull_repo(fullpath, remotename) if success: break retries += 1 @@ -534,8 +584,7 @@ def run_post_update_hook(toplevel, gitdir, hookscripts): logger.info('Hook Stdout (%s): %s', gitdir, output) -def pull_repo(toplevel, gitdir, remotename): - fullpath = os.path.join(toplevel, gitdir.lstrip('/')) +def pull_repo(fullpath, remotename): args = ['remote', 'update', remotename, '--prune'] retcode, output, error = grokmirror.run_git_command(fullpath, args) @@ -562,9 +611,9 @@ def pull_repo(toplevel, gitdir, remotename): else: debug.append(line) if debug: - logger.debug('Stderr (%s): %s', gitdir, '\n'.join(debug)) + logger.debug('Stderr (%s): %s', fullpath, '\n'.join(debug)) if warn: - logger.warning('Stderr (%s): %s', gitdir, '\n'.join(warn)) + 
logger.warning('Stderr (%s): %s', fullpath, '\n'.join(warn)) return success |