diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2021-09-27 17:11:38 -0400 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2021-09-27 17:11:38 -0400 |
commit | 5e22d743b9be7ddc8ad2d07f09b215016c179f0e (patch) | |
tree | 962a89b0d134fe5b35a47fae347dd7b51bc32c19 | |
parent | dbab40565644f6c26b0059c2036d16ecf791351b (diff) | |
download | korg-helpers-5e22d743b9be7ddc8ad2d07f09b215016c179f0e.tar.gz |
Add bugzilla-junker
This is a helper script I use to keep tabs on spam posted to
bugzilla.kernel.org.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r-- | bugzilla-junker.example.conf | 6 | ||||
-rw-r--r-- | bugzilla-junker.py | 380 |
2 files changed, 386 insertions, 0 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# A helper script to go through the latest comments added to a bugzilla
# to see if any of them link to external sites. If the reviewer deems them
# spammy, the script will tag them as such.
#
# Caution: work in progress
#
# Example configuration (bugzilla-junker.example.conf):
#
#   [main]
#   url = https://bugzilla.kernel.org
#   apikey = [apikey here]
#   spamtag = spam
#   logfile = /home/user/work/temp/bz-comment-junker.log
#   cache = /home/user/.cache/bz-comment-junker.cache
#
__author__ = 'Konstantin Ryabitsev <konstantin@linuxfoundation.org>'

import sys
import requests
import argparse
import logging
import re
import shelve
import datetime
import notify2
import time

from urllib.parse import urlparse
from configparser import ConfigParser

logger = logging.getLogger('default')

# Populated from the configuration file in main()
APIKEY = None
BZURL = None

# Shared requests session, created lazily by get_session()
REQSESSION = None

# In-memory copy of the on-disk shelve cache, loaded by load_cache()
CACHEDATA = None


def notify_desktop(message):
    """Pop up a persistent desktop notification with the given message."""
    notify2.init('bugjunker')
    n = notify2.Notification('bugjunker', message)
    # Never auto-expire: the operator must see and dismiss the alert
    n.set_timeout(notify2.EXPIRES_NEVER)
    n.show()


def get_session():
    """Return the shared requests session, creating it on first use."""
    global REQSESSION
    if REQSESSION is None:
        REQSESSION = requests.session()
        REQSESSION.headers.update({'User-Agent': 'bugjunker'})
    return REQSESSION


def ban_hammer(spammers):
    """Disable email and logins for every bugzilla account in spammers."""
    params = {}
    for spammer in set(spammers):
        path = 'user/{spammer}'.format(spammer=spammer)
        logger.info('Banning %s', spammer)
        payload = {
            'email_enabled': False,
            'login_denied_text': 'Spammer',
        }
        bz_put(path, params, payload)


def tag_hammer(spamcids, spamtag):
    """Add the spam tag to every comment id in spamcids."""
    params = {}
    for cid in set(spamcids):
        logger.info('Tagging comment %s', cid)
        path = 'bug/comment/{cid}/tags'.format(cid=cid)
        payload = {
            'comment_id': cid,
            'add': [spamtag],
        }
        bz_put(path, params, payload)


def bug_hammer(spambugs, args):
    """Close every bug in spambugs and hide it in the junk group.

    Status, resolution and group names come from the command-line args.
    """
    params = {}
    for bugid in set(spambugs):
        logger.info('Junking bug %s', bugid)
        path = 'bug/{bugid}'.format(bugid=bugid)
        payload = {
            'groups': {'add': [args.group]},
            'status': args.status,
            'resolution': args.resolution,
        }
        bz_put(path, params, payload)


def bz_get(path, params):
    """Perform a GET against the bugzilla REST API and return parsed JSON."""
    url = '{BZURL}/rest/{path}'.format(BZURL=BZURL, path=path)
    params['api_key'] = APIKEY
    ses = get_session()
    res = ses.get(url, params=params)
    return res.json()


def bz_put(path, params, payload):
    """Perform a PUT against the bugzilla REST API and return parsed JSON."""
    url = '{BZURL}/rest/{path}'.format(BZURL=BZURL, path=path)
    params['api_key'] = APIKEY
    ses = get_session()
    res = ses.put(url, params=params, json=payload)
    return res.json()


def load_cache(cachefile):
    """Load cached junker state from cachefile.

    Always returns a (lastrun, cachedata) tuple. lastrun is the cached
    timestamp of the previous run, or '24h' when no cache exists yet.

    Fix: the original returned the bare CACHEDATA dict on the
    already-loaded early-return path, which broke the
    ``lastrun, c = load_cache(...)`` unpacking on any repeat call.
    """
    global CACHEDATA
    if CACHEDATA is None:
        # noinspection PyBroadException
        try:
            with shelve.open(cachefile, 'r') as wc:
                logger.info('Loading cache from %s', cachefile)
                CACHEDATA = dict(wc)
        except Exception:
            # Best-effort: a missing or unreadable cache just means we
            # start with a fresh, empty state
            CACHEDATA = {
                'seencids': list(),
                'seenaids': list(),
                'okdomains': list(),
                'okfolks': list(),
            }

    if 'lastrun' in CACHEDATA:
        lastrun = CACHEDATA['lastrun']
    else:
        lastrun = '24h'

    return lastrun, CACHEDATA


def save_cache(cachefile, cachedata):
    """Persist cachedata to the shelve file at cachefile."""
    with shelve.open(cachefile, 'c') as wc:
        for key, val in cachedata.items():
            wc[key] = val
        wc.sync()


def check_bad_urls(urls, okdomains):
    """Return (url, domain) for the first URL not in okdomains.

    Returns (url, None) for an unparseable URL and (None, None) when
    every URL's domain is whitelisted.
    """
    for url in urls:
        try:
            up = urlparse(url)
        except ValueError:
            return url, None
        isok = False
        for okd in okdomains:
            if okd == up.netloc:
                isok = True
                break
        if not isok:
            return url, up.netloc

    return None, None


def is_junk_attachment(attid):
    """Check attachment attid for junkiness; neuter and return True if junk.

    text/html attachments are considered almost certainly junk and are
    converted in place to a private text/plain 'caution.txt'.
    """
    attid = str(attid)
    logger.info(' checking attachment %s', attid)
    path = 'bug/attachment/{attid}'.format(attid=attid)
    attinfo = bz_get(path, {})
    if attid not in attinfo['attachments']:
        return False
    attdata = attinfo['attachments'][attid]
    if attdata['is_patch'] or attdata['content_type'] in ('text/plain',):
        return False
    if attdata['content_type'] == 'text/html':
        # Almost certainly junk
        logger.info(' junking attachment %s', attid)
        payload = {
            'content_type': 'text/plain',
            'filename': 'caution.txt',
            'is_private': True,
        }
        bz_put(path, {}, payload)
        return True
    return False


def process_bugs(cmdargs, cachefile, c, bugs, spamtag):
    """Walk recently-changed bugs looking for spammy comments.

    Returns (spammers, spamcids, spambugs): account names to ban,
    comment ids to tag, and bug ids to junk wholesale. In
    non-interactive mode, the first suspicious find sends a desktop
    alert, flags the cache with needsinput, and returns early so a
    human can make the call on the next interactive run.
    """
    spammers = list()
    spamcids = list()
    spambugs = list()

    for bug in bugs:
        logger.info('Analyzing [%s]: %s', bug['id'], bug['summary'])
        params = {}
        bugid = bug['id']
        path = 'bug/{bugid}/comment'.format(bugid=bugid)
        comments = bz_get(path, params)
        for f1, f2 in comments['bugs'].items():
            # c_count tracks the comment's ordinal within the bug;
            # comment #0 being spam means the whole bug is spam
            c_count = -1
            for comment in f2['comments']:
                c_count += 1
                cid = comment['id']
                if cid in c['seencids']:
                    # already seen, skip
                    continue

                c['seencids'].append(cid)

                creator = comment['creator']
                if creator in c['okfolks']:
                    # Known good person
                    continue

                tags = comment['tags']
                if spamtag in tags:
                    # already marked as spammy
                    continue

                if creator in spammers:
                    # Made by a known spammer, ban it
                    spamcids.append(cid)
                    logger.info(' auto-tagging comment by %s: %s', creator, cid)
                    continue

                if cmdargs.checkatt:
                    attid = comment['attachment_id']
                    if attid is not None and attid not in c['seenaids']:
                        c['seenaids'].append(attid)
                        if is_junk_attachment(attid):
                            logger.info(' check if spammer: %s', creator)

                # Look for remote URLs in the comment
                if bug['url'].find('http') > -1 or comment['text'].find('http') > -1:
                    urls = re.findall(r'(https?://\S+)', comment['text'])
                    if len(bug['url']):
                        urls.append(bug['url'])
                    badurl, baddomain = check_bad_urls(urls, c['okdomains'])

                    if badurl is not None:
                        if cmdargs.noninteractive:
                            notify_desktop('Spam in %s: %s' % (bug['id'], baddomain))
                            when = datetime.datetime.strptime(bug['last_change_time'], '%Y-%m-%dT%H:%M:%SZ')
                            # Un-see this comment so the interactive run
                            # picks it up again, and rewind lastrun
                            c['seencids'].remove(cid)
                            c['lastrun'] = when.strftime('%Y-%m-%d %H:%M:%S')
                            c['needsinput'] = True
                            return spammers, spamcids, spambugs

                        logger.info(' ---')
                        logger.info(' suspish URL: %s', badurl)
                        logger.info(' checkit out: %s/show_bug.cgi?id=%s#c%s',
                                    BZURL, bugid, c_count)
                        baw = input(' (b)an, (a)llow, (w)hitelist %s: ' % baddomain)

                        if baw == 'a':
                            c['okfolks'].append(creator)
                            save_cache(cachefile, c)
                            continue

                        if baw == 'w':
                            logger.info(' whitelisted %s', baddomain)
                            c['okdomains'].append(baddomain)
                            c['okfolks'].append(creator)
                            save_cache(cachefile, c)
                            continue

                        logger.info(' spamcid: %s', cid)
                        spamcids.append(cid)

                        # If it's a comment #0, then the whole bug needs junking
                        if c_count == 0:
                            spambugs.append(bug['id'])

                        if creator not in spammers:
                            logger.info(' spammer: %s', creator)
                            spammers.append(creator)

    return spammers, spamcids, spambugs


def main(args):
    """Set up logging and config, then run the junking loop.

    With --sleep, keeps polling forever; otherwise exits after one pass.
    """
    global BZURL
    global APIKEY

    logger.setLevel(logging.DEBUG)

    ch = logging.StreamHandler()
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)

    if args.quiet:
        ch.setLevel(logging.CRITICAL)
    elif args.debug:
        ch.setLevel(logging.DEBUG)
    else:
        ch.setLevel(logging.INFO)

    logger.addHandler(ch)

    logger.info('Loading configuration file %s', args.config)
    config = ConfigParser()
    config.read(args.config)
    BZURL = config.get('main', 'url')
    APIKEY = config.get('main', 'apikey')

    spamtag = config.get('main', 'spamtag')

    if config.get('main', 'logfile'):
        ch = logging.FileHandler(config.get('main', 'logfile'))
        fmt = '[%(process)d] %(asctime)s - %(message)s'
        ch.setFormatter(logging.Formatter(fmt))
        ch.setLevel(logging.INFO)
        logger.addHandler(ch)

    cachefile = config.get('main', 'cache')
    lastrun, c = load_cache(cachefile)

    if args.lookback is not None:
        lastrun = args.lookback

    while True:
        if args.noninteractive and 'needsinput' in c and c['needsinput']:
            # A previous non-interactive run found something that needs
            # a human decision; refuse to proceed until resolved
            logger.info('Need to run interactively to make some decisions')
            sys.exit(0)

        params = {
            'chfieldfrom': lastrun,
            'include_fields': 'id,summary,last_change_time,url',
        }
        logger.info('Querying %s for changes since %s', BZURL, lastrun)

        # NOTE(review): lastrun is not advanced between sleep iterations,
        # so each pass re-queries from the same point; the seencids cache
        # prevents re-processing — confirm this is intentional
        unow = datetime.datetime.utcnow()
        json = bz_get('bug', params)
        c['lastrun'] = unow.strftime('%Y-%m-%d %H:%M:%S')
        c['needsinput'] = False
        if len(json['bugs']):
            spammers, spamcids, spambugs = process_bugs(args, cachefile, c, json['bugs'], spamtag)

            if len(spammers) or len(spamcids) or len(spambugs):
                ban_hammer(spammers)
                tag_hammer(spamcids, spamtag)
                bug_hammer(spambugs, args)
            else:
                logger.info('No new spam found')
        else:
            logger.info('No changes since %s', lastrun)

        save_cache(cachefile, c)

        if not args.sleep:
            sys.exit(0)

        logger.info('Sleeping %d seconds', args.sleep)
        time.sleep(args.sleep)


def cmd():
    """Parse command-line arguments and invoke main()."""
    description = 'Junk spammy bugzilla comments and ban their authors'
    parser = argparse.ArgumentParser(description=description, prog='bz-comment-junker.py')
    parser.add_argument('-c', '--config', required=True,
                        help='Configuration file')
    parser.add_argument('-q', '--quiet', action='store_true', default=False,
                        help='Output only errors')
    parser.add_argument('-d', '--debug', action='store_true', default=False,
                        help='Add debugging info')
    parser.add_argument('-l', '--lookback', default=None,
                        help='How far back to look (default: since last run, or 24h if no cached data)')
    parser.add_argument('-n', '--noninteractive', action='store_true', default=False,
                        help='Run non-interactively and send an alert when potential spam is found')
    parser.add_argument('-a', '--check-attachments', action='store_true', dest='checkatt', default=False,
                        help='Check attachments for junkiness')
    parser.add_argument('--sleep', type=int, default=0,
                        help='After the run, sleep N seconds and then run again')
    parser.add_argument('--status', default='RESOLVED',
                        help='Status value for junked bugs')
    parser.add_argument('--resolution', default='INVALID',
                        help='Resolution value for junked bugs')
    parser.add_argument('--product', default='Other',
                        help='Product value for junked bugs')
    parser.add_argument('--component', default='Spam',
                        help='Component value for junked bugs')
    parser.add_argument('--group', default='Junk',
                        help='Private group name for junked bugs')

    args = parser.parse_args()
    main(args)


if __name__ == '__main__':
    cmd()