author     Konstantin Ryabitsev <konstantin@linuxfoundation.org>  2021-09-27 17:11:38 -0400
committer  Konstantin Ryabitsev <konstantin@linuxfoundation.org>  2021-09-27 17:11:38 -0400
commit     5e22d743b9be7ddc8ad2d07f09b215016c179f0e (patch)
tree       962a89b0d134fe5b35a47fae347dd7b51bc32c19
parent     dbab40565644f6c26b0059c2036d16ecf791351b (diff)
download   korg-helpers-5e22d743b9be7ddc8ad2d07f09b215016c179f0e.tar.gz
Add bugzilla-junker
This is a helper script I use to keep tabs on spam posted to
bugzilla.kernel.org.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r--  bugzilla-junker.example.conf    6
-rw-r--r--  bugzilla-junker.py            380
2 files changed, 386 insertions, 0 deletions
diff --git a/bugzilla-junker.example.conf b/bugzilla-junker.example.conf
new file mode 100644
index 0000000..c6cb810
--- /dev/null
+++ b/bugzilla-junker.example.conf
@@ -0,0 +1,6 @@
+[main]
+url = https://bugzilla.kernel.org
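+# an API key can be generated in Bugzilla under Preferences -> API Keys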
+apikey = [apikey here]
+spamtag = spam
+logfile = /home/user/work/temp/bz-comment-junker.log
+cache = /home/user/.cache/bz-comment-junker.cache
diff --git a/bugzilla-junker.py b/bugzilla-junker.py
new file mode 100644
index 0000000..413f39e
--- /dev/null
+++ b/bugzilla-junker.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# A helper script to go through the latest comments added to a bugzilla
+# to see if any of them link to external sites. If the reviewer deems them
+# spammy, the script will tag them as such.
+#
+# Caution: work in progress
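+#
+# Example invocation (the config path here is illustrative):
+#
+#   bugzilla-junker.py -c ~/.config/bugzilla-junker.conf --sleep 600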
+#
+__author__ = 'Konstantin Ryabitsev <konstantin@linuxfoundation.org>'
+
+import sys
+import requests
+import argparse
+import logging
+import re
+import shelve
+import datetime
+import notify2
+import time
+
+from urllib.parse import urlparse
+from configparser import ConfigParser
+
+logger = logging.getLogger('default')
+
+APIKEY = None
+BZURL = None
+
+REQSESSION = None
+
+CACHEDATA = None
+
+
+def notify_desktop(message):
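+    # Show a persistent desktop notification via notify2 (used in non-interactive mode)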
+ notify2.init('bugjunker')
+ n = notify2.Notification('bugjunker', message)
+ n.set_timeout(notify2.EXPIRES_NEVER)
+ n.show()
+
+
+def get_session():
+ global REQSESSION
+ if REQSESSION is None:
+ REQSESSION = requests.session()
+ REQSESSION.headers.update({'User-Agent': 'bugjunker'})
+ return REQSESSION
+
+
+def ban_hammer(spammers):
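+    # Disable the account's email and deny login, effectively banning the spammer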
+ params = {}
+ for spammer in set(spammers):
+ path = 'user/{spammer}'.format(spammer=spammer)
+ logger.info('Banning %s', spammer)
+ payload = {
+ 'email_enabled': False,
+ 'login_denied_text': 'Spammer',
+ }
+ bz_put(path, params, payload)
+
+
+def tag_hammer(spamcids, spamtag):
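+    # Apply the configured spam tag to each offending comment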
+ params = {}
+ for cid in set(spamcids):
+ logger.info('Tagging comment %s', cid)
+ path = 'bug/comment/{cid}/tags'.format(cid=cid)
+ payload = {
+ 'comment_id': cid,
+ 'add': [spamtag],
+ }
+ bz_put(path, params, payload)
+
+
+def bug_hammer(spambugs, args):
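+    # Reclassify the bug as spam, close it, and hide it in the private group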
+ params = {}
+ for bugid in set(spambugs):
+ logger.info('Junking bug %s', bugid)
+ path = 'bug/{bugid}'.format(bugid=bugid)
+        payload = {
+            'product': args.product,
+            'component': args.component,
+            'groups': {'add': [args.group]},
+            'status': args.status,
+            'resolution': args.resolution,
+        }
+ bz_put(path, params, payload)
+
+
+def bz_get(path, params):
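+    # Thin GET wrapper around the Bugzilla REST API; auth is via the api_key query param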
+ url = '{BZURL}/rest/{path}'.format(BZURL=BZURL, path=path)
+ params['api_key'] = APIKEY
+ ses = get_session()
+ res = ses.get(url, params=params)
+ return res.json()
+
+
+def bz_put(path, params, payload):
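+    # Thin PUT wrapper; the payload is sent as JSON in the request body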
+ url = '{BZURL}/rest/{path}'.format(BZURL=BZURL, path=path)
+ params['api_key'] = APIKEY
+ ses = get_session()
+ res = ses.put(url, params=params, json=payload)
+ return res.json()
+
+
+def load_cache(cachefile):
+ global CACHEDATA
+    if CACHEDATA is None:
+        # noinspection PyBroadException
+        try:
+            with shelve.open(cachefile, 'r') as wc:
+                logger.info('Loading cache from %s', cachefile)
+                CACHEDATA = dict(wc)
+        except Exception:
+            # No usable cache yet; start with a fresh one
+            CACHEDATA = {
+                'seencids': list(),
+                'seenaids': list(),
+                'okdomains': list(),
+                'okfolks': list(),
+            }
+
+ if 'lastrun' in CACHEDATA:
+ lastrun = CACHEDATA['lastrun']
+ else:
+ lastrun = '24h'
+
+ return lastrun, CACHEDATA
+
+
+def save_cache(cachefile, cachedata):
+ with shelve.open(cachefile, 'c') as wc:
+ for key, val in cachedata.items():
+ wc[key] = val
+ wc.sync()
+
+
+def check_bad_urls(urls, okdomains):
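+    # Return the first suspicious (url, domain) pair; an unparseable URL
+    # yields (url, None), and (None, None) means all URLs checked out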
+ for url in urls:
+ try:
+ up = urlparse(url)
+ except ValueError:
+ return url, None
+        if up.netloc not in okdomains:
+            return url, up.netloc
+
+ return None, None
+
+
+def is_junk_attachment(attid):
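+    # Return True if the attachment looks like spam; HTML attachments are
+    # neutered in place (converted to text/plain and marked private)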
+ attid = str(attid)
+ logger.info(' checking attachment %s', attid)
+ path = 'bug/attachment/{attid}'.format(attid=attid)
+ attinfo = bz_get(path, {})
+ if attid not in attinfo['attachments']:
+ return False
+ attdata = attinfo['attachments'][attid]
+ if attdata['is_patch'] or attdata['content_type'] in ('text/plain',):
+ return False
+ if attdata['content_type'] == 'text/html':
+ # Almost certainly junk
+ logger.info(' junking attachment %s', attid)
+ payload = {
+ 'content_type': 'text/plain',
+ 'filename': 'caution.txt',
+ 'is_private': True,
+ }
+ bz_put(path, {}, payload)
+ return True
+ return False
+
+
+def process_bugs(cmdargs, cachefile, c, bugs, spamtag):
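+    # Review all new comments on the changed bugs; returns (spammers, spamcids, spambugs)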
+ spammers = list()
+ spamcids = list()
+ spambugs = list()
+
+ for bug in bugs:
+ logger.info('Analyzing [%s]: %s', bug['id'], bug['summary'])
+ params = {}
+ bugid = bug['id']
+ path = 'bug/{bugid}/comment'.format(bugid=bugid)
+ comments = bz_get(path, params)
+        for _bug_id, bugdata in comments['bugs'].items():
+            c_count = -1
+            for comment in bugdata['comments']:
+ c_count += 1
+ cid = comment['id']
+ if cid in c['seencids']:
+ # already seen, skip
+ continue
+
+ c['seencids'].append(cid)
+
+ creator = comment['creator']
+ if creator in c['okfolks']:
+ # Known good person
+ continue
+
+ tags = comment['tags']
+ if spamtag in tags:
+ # already marked as spammy
+ continue
+
+ if creator in spammers:
+                    # Posted by a known spammer, so auto-tag it
+ spamcids.append(cid)
+ logger.info(' auto-tagging comment by %s: %s', creator, cid)
+ continue
+
+ if cmdargs.checkatt:
+ attid = comment['attachment_id']
+ if attid is not None and attid not in c['seenaids']:
+ c['seenaids'].append(attid)
+ if is_junk_attachment(attid):
+ logger.info(' check if spammer: %s', creator)
+
+ # Look for remote URLs in the comment
+                if 'http' in bug['url'] or 'http' in comment['text']:
+ urls = re.findall(r'(https?://\S+)', comment['text'])
+ if len(bug['url']):
+ urls.append(bug['url'])
+ badurl, baddomain = check_bad_urls(urls, c['okdomains'])
+
+ if badurl is not None:
+ if cmdargs.noninteractive:
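+                            # Can't prompt for a decision here; rewind the
+                            # cache state so an interactive run revisits this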
+ notify_desktop('Spam in %s: %s' % (bug['id'], baddomain))
+ when = datetime.datetime.strptime(bug['last_change_time'], '%Y-%m-%dT%H:%M:%SZ')
+ c['seencids'].remove(cid)
+ c['lastrun'] = when.strftime('%Y-%m-%d %H:%M:%S')
+ c['needsinput'] = True
+ return spammers, spamcids, spambugs
+
+ logger.info(' ---')
+ logger.info(' suspish URL: %s', badurl)
+                        logger.info('   check it out: %s/show_bug.cgi?id=%s#c%s',
+ BZURL, bugid, c_count)
+ baw = input(' (b)an, (a)llow, (w)hitelist %s: ' % baddomain)
+
+ if baw == 'a':
+ c['okfolks'].append(creator)
+ save_cache(cachefile, c)
+ continue
+
+ if baw == 'w':
+ logger.info(' whitelisted %s', baddomain)
+ c['okdomains'].append(baddomain)
+ c['okfolks'].append(creator)
+ save_cache(cachefile, c)
+ continue
+
+ logger.info(' spamcid: %s', cid)
+ spamcids.append(cid)
+
+                    # If it's comment #0, the whole bug needs junking
+ if c_count == 0:
+ spambugs.append(bug['id'])
+
+ if creator not in spammers:
+ logger.info(' spammer: %s', creator)
+ spammers.append(creator)
+
+ return spammers, spamcids, spambugs
+
+
+def main(args):
+ global BZURL
+ global APIKEY
+
+ logger.setLevel(logging.DEBUG)
+
+ ch = logging.StreamHandler()
+ formatter = logging.Formatter('%(message)s')
+ ch.setFormatter(formatter)
+
+ if args.quiet:
+ ch.setLevel(logging.CRITICAL)
+ elif args.debug:
+ ch.setLevel(logging.DEBUG)
+ else:
+ ch.setLevel(logging.INFO)
+
+ logger.addHandler(ch)
+
+ logger.info('Loading configuration file %s', args.config)
+ config = ConfigParser()
+ config.read(args.config)
+ BZURL = config.get('main', 'url')
+ APIKEY = config.get('main', 'apikey')
+
+ spamtag = config.get('main', 'spamtag')
+
+ if config.get('main', 'logfile'):
+ ch = logging.FileHandler(config.get('main', 'logfile'))
+ fmt = '[%(process)d] %(asctime)s - %(message)s'
+ ch.setFormatter(logging.Formatter(fmt))
+ ch.setLevel(logging.INFO)
+ logger.addHandler(ch)
+
+ cachefile = config.get('main', 'cache')
+ lastrun, c = load_cache(cachefile)
+
+ if args.lookback is not None:
+ lastrun = args.lookback
+
+ while True:
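+        # One polling pass: fetch bugs changed since lastrun, review them, persist state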
+ if args.noninteractive and 'needsinput' in c and c['needsinput']:
+ logger.info('Need to run interactively to make some decisions')
+ sys.exit(0)
+
+ params = {
+ 'chfieldfrom': lastrun,
+ 'include_fields': 'id,summary,last_change_time,url',
+ }
+ logger.info('Querying %s for changes since %s', BZURL, lastrun)
+
+ unow = datetime.datetime.utcnow()
+ json = bz_get('bug', params)
+ c['lastrun'] = unow.strftime('%Y-%m-%d %H:%M:%S')
+ c['needsinput'] = False
+ if len(json['bugs']):
+ spammers, spamcids, spambugs = process_bugs(args, cachefile, c, json['bugs'], spamtag)
+
+ if len(spammers) or len(spamcids) or len(spambugs):
+ ban_hammer(spammers)
+ tag_hammer(spamcids, spamtag)
+ bug_hammer(spambugs, args)
+ else:
+ logger.info('No new spam found')
+ else:
+ logger.info('No changes since %s', lastrun)
+
+ save_cache(cachefile, c)
+
+ if not args.sleep:
+ sys.exit(0)
+
+ logger.info('Sleeping %d seconds', args.sleep)
+ time.sleep(args.sleep)
+
+
+def cmd():
+ description = 'Junk spammy bugzilla comments and ban their authors'
+    parser = argparse.ArgumentParser(description=description, prog='bugzilla-junker.py')
+ parser.add_argument('-c', '--config', required=True,
+ help='Configuration file')
+ parser.add_argument('-q', '--quiet', action='store_true', default=False,
+ help='Output only errors')
+ parser.add_argument('-d', '--debug', action='store_true', default=False,
+ help='Add debugging info')
+ parser.add_argument('-l', '--lookback', default=None,
+ help='How far back to look (default: since last run, or 24h if no cached data)')
+ parser.add_argument('-n', '--noninteractive', action='store_true', default=False,
+ help='Run non-interactively and send an alert when potential spam is found')
+ parser.add_argument('-a', '--check-attachments', action='store_true', dest='checkatt', default=False,
+ help='Check attachments for junkiness')
+ parser.add_argument('--sleep', type=int, default=0,
+ help='After the run, sleep N seconds and then run again')
+ parser.add_argument('--status', default='RESOLVED',
+ help='Status value for junked bugs')
+ parser.add_argument('--resolution', default='INVALID',
+ help='Resolution value for junked bugs')
+ parser.add_argument('--product', default='Other',
+ help='Product value for junked bugs')
+ parser.add_argument('--component', default='Spam',
+ help='Component value for junked bugs')
+ parser.add_argument('--group', default='Junk',
+ help='Private group name for junked bugs')
+
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cmd()