aboutsummaryrefslogtreecommitdiffstats
path: root/isolation.c
blob: 56aa27fa5846bac4d062f85e144f23e765a2232c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <assert.h>
#include <string.h>
#include <errno.h>
#include <sched.h>
#include <pthread.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/prctl.h>

// Fallback definitions of the task-isolation prctl() interface, used only
// when <sys/prctl.h> does not already provide PR_SET_TASK_ISOLATION.
// The flags word passed to prctl(PR_SET_TASK_ISOLATION, flags) is laid out
// by these macros as:
//   bit 0      - PR_TASK_ISOLATION_ENABLE
//   bit 1      - PR_TASK_ISOLATION_USERSIG (a signal number is supplied)
//   bits 8-14  - the signal number (7 bits), see SET_SIG/GET_SIG
// PR_TASK_ISOLATION_NOSIG is USERSIG combined with signal number 0.
#ifndef PR_SET_TASK_ISOLATION   // Not in system headers yet?
# define PR_SET_TASK_ISOLATION		48
# define PR_GET_TASK_ISOLATION		49
# define PR_TASK_ISOLATION_ENABLE	(1 << 0)
# define PR_TASK_ISOLATION_USERSIG	(1 << 1)
# define PR_TASK_ISOLATION_SET_SIG(sig)	(((sig) & 0x7f) << 8)
# define PR_TASK_ISOLATION_GET_SIG(bits) (((bits) >> 8) & 0x7f)
# define PR_TASK_ISOLATION_NOSIG \
    (PR_TASK_ISOLATION_USERSIG | PR_TASK_ISOLATION_SET_SIG(0))
#endif

// The cpu we are using for isolation tests.  main() sets this to the first
// cpu number listed in /sys/devices/system/cpu/task_isolation.
static int task_isolation_cpu;

// Overall status, maintained as tests run.  Starts as EXIT_SUCCESS, is set
// to EXIT_FAILURE by any failing test, and is what main() finally returns.
static int exit_status = EXIT_SUCCESS;

// Restrict the caller's cpu affinity to the single cpu CPU.
// Aborts via assert() if sched_setaffinity() rejects the mask.
void set_my_cpu(int cpu)
{
	cpu_set_t only_cpu;
	int err;

	// Build a mask containing nothing but the requested cpu.
	CPU_ZERO(&only_cpu);
	CPU_SET(cpu, &only_cpu);

	// A pid of 0 applies the mask to the caller itself.
	err = sched_setaffinity(0, sizeof(only_cpu), &only_cpu);
	assert(err == 0);
}

// Run a child process in task isolation mode and report its status.
// The child does mlockall() and moves itself to the task isolation cpu.
// It then runs SETUP_FUNC (if specified), calls prctl(PR_SET_TASK_ISOLATION, )
// with FLAGS (if non-zero), and then invokes TEST_FUNC and exits
// with its status.
//
// Returns (in the parent only) the raw wait status of the child as filled
// in by waitpid(); callers decode it with WIFSIGNALED()/WTERMSIG() or
// compare it against EXIT_SUCCESS.  The child never returns from here.
static int run_test(void (*setup_func)(), int (*test_func)(), int flags)
{
	// Flush before forking so pending stdio output is not emitted twice.
	fflush(stdout);
	pid_t pid = fork();
	assert(pid >= 0);
	if (pid != 0) {
		// In parent; wait for child and return its status.
		// Retry if interrupted by a signal, and insist that the wait
		// succeeded so we never hand back an unset status word.
		int status = 0;
		pid_t waited;
		do
			waited = waitpid(pid, &status, 0);
		while (waited < 0 && errno == EINTR);
		assert(waited == pid);
		return status;
	}

	// In child.
	int rc = mlockall(MCL_CURRENT);
	assert(rc == 0);
	set_my_cpu(task_isolation_cpu);
	if (setup_func)
		setup_func();
	if (flags) {
		// EAGAIN is treated as "not ready yet": keep retrying until
		// the prctl either succeeds or fails for another reason.
		do
			rc = prctl(PR_SET_TASK_ISOLATION, flags);
		while (rc != 0 && errno == EAGAIN);
		if (rc != 0) {
			printf("couldn't enable isolation (%d): FAIL\n", errno);
			exit(EXIT_FAILURE);
		}
	}
	rc = test_func();
	exit(rc);
}

// Run TEST_FUNC twice in isolation mode and check how the child dies.
// First with the default settings, where the misbehaving child must be
// terminated by SIGKILL; then again asking for SIGUSR1 instead, where it
// must be terminated by SIGUSR1.  Each pass prints its own OK/FAIL line
// and a failure marks the overall exit status as failed.
static void test_killed(const char *testname, void (*setup_func)(),
			int (*test_func)())
{
	const int usr1_flags = PR_TASK_ISOLATION_ENABLE |
			       PR_TASK_ISOLATION_USERSIG |
			       PR_TASK_ISOLATION_SET_SIG(SIGUSR1);
	int wstatus;

	// Pass 1: default signal.
	wstatus = run_test(setup_func, test_func, PR_TASK_ISOLATION_ENABLE);
	if (!WIFSIGNALED(wstatus) || WTERMSIG(wstatus) != SIGKILL) {
		printf("%s: FAIL (%#x)\n", testname, wstatus);
		exit_status = EXIT_FAILURE;
	} else {
		printf("%s: OK\n", testname);
	}

	// Pass 2: user-selected signal.
	wstatus = run_test(setup_func, test_func, usr1_flags);
	if (!WIFSIGNALED(wstatus) || WTERMSIG(wstatus) != SIGUSR1) {
		printf("%s (SIGUSR1): FAIL (%#x)\n", testname, wstatus);
		exit_status = EXIT_FAILURE;
	} else {
		printf("%s (SIGUSR1): OK\n", testname);
	}
}

// Run TEST_FUNC with isolation enabled (default signal) and require that
// the child's wait status equals EXIT_SUCCESS.  Prints an OK/FAIL line and
// records a failure in exit_status.
static void test_ok(const char *testname, void (*setup_func)(),
		    int (*test_func)())
{
	const int wstatus =
		run_test(setup_func, test_func, PR_TASK_ISOLATION_ENABLE);

	if (wstatus != EXIT_SUCCESS) {
		printf("%s: FAIL (%#x)\n", testname, wstatus);
		exit_status = EXIT_FAILURE;
		return;
	}
	printf("%s: OK\n", testname);
}

// Run TEST_FUNC with isolation enabled in "no signal" mode and require
// that the child's wait status equals EXIT_SUCCESS.  Prints an OK/FAIL
// line and records a failure in exit_status.
static void test_nosig(const char *testname, void (*setup_func)(),
		       int (*test_func)())
{
	const int nosig_flags =
		PR_TASK_ISOLATION_ENABLE | PR_TASK_ISOLATION_NOSIG;
	const int wstatus = run_test(setup_func, test_func, nosig_flags);

	if (wstatus != EXIT_SUCCESS) {
		printf("%s: FAIL (%#x)\n", testname, wstatus);
		exit_status = EXIT_FAILURE;
		return;
	}
	printf("%s: OK\n", testname);
}

// Mapping address passed from setup function to test function:
// written by setup_fault(), dereferenced by do_fault().
static char *fault_file_mapping;

// Create a one-page temporary file and map it shared and writable, leaving
// the address in fault_file_mapping for do_fault() to touch later.  The
// descriptor is closed and the file unlinked right away; only the mapping
// is kept.  Any failure aborts via assert().
static void setup_fault(void)
{
	char path[] = "/tmp/isolation_XXXXXX";
	int backing_fd;
	int err;

	backing_fd = mkstemp(path);
	assert(backing_fd >= 0);

	// Grow the empty file to exactly one page so there is something to map.
	err = ftruncate(backing_fd, getpagesize());
	assert(err == 0);

	fault_file_mapping = mmap(NULL, getpagesize(),
				  PROT_READ | PROT_WRITE, MAP_SHARED,
				  backing_fd, 0);
	assert(fault_file_mapping != MAP_FAILED);

	// The mapping is all we need from here on.
	close(backing_fd);
	unlink(path);
}

// Write to the first byte of the page mapped by setup_fault().  The test
// expects the process to be killed by this access; returning at all
// therefore reports failure.
static int do_fault(void)
{
	fault_file_mapping[0] = 1;
	return EXIT_FAILURE;
}

// Make a syscall (and be killed).
// The test expects the process to die because of the write(); if we get
// as far as returning, that is reported as a failure.
static int do_syscall(void)
{
	// Take the length from the string itself so the whole message,
	// including the trailing newline, is written.
	static const char msg[] = "goodbye, world\n";

	write(STDOUT_FILENO, msg, sizeof(msg) - 1);
	return EXIT_FAILURE;
}

// Disable task isolation again, then make a syscall.  With isolation off
// the write is expected to be harmless, so reaching the end is success.
static int do_syscall_off(void)
{
	static const char greeting[] = "==> hello, world\n";

	prctl(PR_SET_TASK_ISOLATION, 0);
	write(STDOUT_FILENO, greeting, sizeof(greeting) - 1);
	return EXIT_SUCCESS;
}

// Used in "no signal" mode: issue two write() syscalls back to back and
// report success if we are still alive afterwards.
static int do_syscall_multi(void)
{
	static const char first[] = "==> hello, world 1\n";
	static const char second[] = "==> hello, world 2\n";

	write(STDOUT_FILENO, first, sizeof(first) - 1);
	write(STDOUT_FILENO, second, sizeof(second) - 1);
	return EXIT_SUCCESS;
}

#ifdef __aarch64__
// ARM64 uses tlbi instructions so doesn't need to interrupt the remote core.
// The munmap test is therefore compiled out on aarch64: this stub keeps
// the call in main() valid while doing nothing.
static void test_munmap(void) {}
#else

// Thread body for the munmap test.  Sleeps for half a second and then
// unmaps the single page at P; the unmap is meant to deliver a TLB flush
// to the task isolation core.  Always returns a null pointer.
static void *start_munmap(void *p)
{
	enum { MUNMAP_DELAY_USEC = 500000 };   // 0.5s

	usleep(MUNMAP_DELAY_USEC);
	munmap(p, getpagesize());
	return NULL;
}

// Prepare the munmap test: while running on cpu 0, map one populated
// anonymous page and start a thread (start_munmap) that will unmap it
// after a delay, then move the caller back onto the task isolation cpu.
// Any failure aborts via assert().
static void setup_munmap(void)
{
	pthread_t unmapper;
	void *victim;
	int err;

	// Do the allocation and thread creation from cpu 0.
	set_my_cpu(0);
	victim = mmap(0, getpagesize(), PROT_READ|PROT_WRITE,
		      MAP_ANONYMOUS|MAP_POPULATE|MAP_PRIVATE, 0, 0);
	assert(victim != MAP_FAILED);

	// The new thread sleeps for half a second and then unmaps the page.
	err = pthread_create(&unmapper, NULL, start_munmap, victim);
	assert(err == 0);

	// Return to the cpu under test.
	set_my_cpu(task_isolation_cpu);
}

// Global variable to avoid the compiler outsmarting us: do_munmap() counts
// in this volatile so its busy loop cannot be optimized away.
volatile int munmap_spin;

// Busy-loop in userspace, counting munmap_spin up to one billion, so that
// the process is still running when the helper thread unmaps its page.
// The test expects to be killed before the loop ends; finishing the loop
// is reported as failure.
static int do_munmap(void)
{
	for (;;) {
		if (munmap_spin >= 1000000000)
			break;
		++munmap_spin;
	}
	return EXIT_FAILURE;
}

// Run the munmap test: setup_munmap() arranges a delayed munmap() on
// another thread, do_munmap() spins in userspace, and test_killed()
// checks that the spinning process is killed by the expected signal.
static void test_munmap(void)
{
	test_killed("test_munmap", setup_munmap, do_munmap);
}
#endif

#ifdef __tilegx__
// Make an unaligned access (and be killed).
// Only for tilegx, since other platforms don't do in-kernel fixups.
// Returns EXIT_FAILURE if the access did not terminate the process.
static int
do_unaligned(void)
{
	static int buf[2];
	// Point one byte into buf so the int load below is misaligned.
	volatile int* addr = (volatile int *)((char *)buf + 1);

	// Volatile read: the value is discarded, only the access matters.
	*addr;

	asm("nop");
	return EXIT_FAILURE;
}

// Run the unaligned-access test (tilegx only): no setup step, and
// test_killed() checks that do_unaligned() gets the process killed.
static void test_unaligned(void)
{
	test_killed("test_unaligned", NULL, do_unaligned);
}
#else
// Not tilegx: the unaligned-access test is compiled out, and this empty
// stub keeps the call in main() valid.
static void test_unaligned(void) {}
#endif

// Fork a process that will spin annoyingly on the same core
// for half a second.  Since prctl() won't work if the spinning task is
// already running on that core, we follow this handshake sequence, using
// two ints in shared memory (*statep for the handshake itself, and
// *childstate as a progress marker that is printed on timeout):
//
// 1. The test process (in setup_quiesce, here) moves to cpu 0, maps the
//    shared page, records the start time and forks.  The forked child
//    stays on cpu 0 and spins waiting for *statep to become non-zero,
//    while the test process moves to the task isolation cpu and returns.
// 2. The test process (in do_quiesce, below) enables isolation with
//    prctl(), sets *statep to 1, and spins in userspace waiting for
//    *statep to change.
// 3. The child sees *statep become non-zero, migrates itself to the task
//    isolation cpu, sets *statep to 2, and spins for half a second
//    before exiting.
// 4. The test process sees *statep become 2 and makes one syscall.  With
//    the child hogging the cpu, that syscall should not return to
//    userspace until the child is done, so the time measured since the
//    start should be about half a second.

// Shared state for the quiesce test, all set up by setup_quiesce():
// statep and childstate point at two adjacent ints in a MAP_SHARED page
// (childstate == statep + 1), quiesce_start/quiesce_end hold the timestamps
// used to time the test, and child_pid is the pid of the forked spinner.
volatile int *statep, *childstate;
struct timeval quiesce_start, quiesce_end;
int child_pid;

// Setup step for the quiesce test.  Maps a shared page for the handshake
// variables, records quiesce_start, and forks a helper child.
//
// In the original process this returns once it is back on the task
// isolation cpu.  In the forked child it never returns: the child waits
// for *statep to become non-zero, migrates to the task isolation cpu,
// publishes *statep = 2, spins for half a second and then exits.
// *childstate is stepped 1..4 along the way purely as a progress marker.
static void setup_quiesce(void)
{
	// First, go back to cpu 0 and allocate some shared memory.
	set_my_cpu(0);
	statep = mmap(0, getpagesize(), PROT_READ|PROT_WRITE,
		      MAP_ANONYMOUS|MAP_SHARED, 0, 0);
	assert(statep != MAP_FAILED);
	childstate = statep + 1;

	// Timestamp taken before the fork; do_quiesce() measures from here.
	gettimeofday(&quiesce_start, NULL);

	// Fork and fault in all memory in both.
	child_pid = fork();
	assert(child_pid >= 0);
	if (child_pid == 0)
		*childstate = 1;
	int rc = mlockall(MCL_CURRENT);
	assert(rc == 0);
	if (child_pid != 0) {
		// Parent: go to the cpu under test and hand back to run_test().
		set_my_cpu(task_isolation_cpu);
		return;
	}

	// In child.  Wait until parent notifies us that it has completed
	// its prctl, then jump to its cpu and let it know.
	*childstate = 2;
	while (*statep == 0)
		;
	*childstate = 3;
	//  printf("child: jumping to cpu %d\n", task_isolation_cpu);
	set_my_cpu(task_isolation_cpu);
	//  printf("child: jumped to cpu %d\n", task_isolation_cpu);
	*statep = 2;
	*childstate = 4;

	// Now we are competing for the runqueue on task_isolation_cpu.
	// Spin for half a second to ensure the parent gets caught in kernel
	// space.
	struct timeval start, tv;
	gettimeofday(&start, NULL);
	while (1) {
		gettimeofday(&tv, NULL);
		double time = (tv.tv_sec - start.tv_sec) +
			(tv.tv_usec - start.tv_usec) / 1000000.0;
		if (time >= 0.5)
			exit(0);
	}
}

// Test body for the quiesce test; runs on the task isolation cpu after
// setup_quiesce() has forked the helper child.
//
// Enables isolation in "no signal" mode, tells the child to come and
// disturb us (*statep = 1), waits for the child to report that it has
// arrived (*statep == 2), and then makes one syscall.  The time from
// quiesce_start to the return of that syscall must fall between 0.4s and
// 0.6s, matching the half second the child spins for.
//
// Returns EXIT_SUCCESS if the timing is in range, EXIT_FAILURE otherwise.
// The helper child is sent SIGKILL on every path.
static int do_quiesce(void)
{
	double elapsed;
	int rc;

	rc = prctl(PR_SET_TASK_ISOLATION,
		   PR_TASK_ISOLATION_ENABLE | PR_TASK_ISOLATION_NOSIG);
	if (rc != 0) {
		// Save errno before the next prctl() can overwrite it.
		int saved_errno = errno;
		prctl(PR_SET_TASK_ISOLATION, 0);
		printf("prctl failed: rc %d (errno %d)\n", rc, saved_errno);
		goto fail;
	}
	*statep = 1;

	// Wait for child to come disturb us.
	while (*statep == 1) {
		gettimeofday(&quiesce_end, NULL);
		elapsed = (quiesce_end.tv_sec - quiesce_start.tv_sec) +
			(quiesce_end.tv_usec - quiesce_start.tv_usec)/1000000.0;
		if (elapsed > 0.1 && *statep == 1) {
			// Turn isolation off before making more syscalls.
			prctl(PR_SET_TASK_ISOLATION, 0);
			printf("timed out at %gs in child migrate loop (%d)\n",
			       elapsed, *childstate);
			// Debugging aid: show where the child is stuck.
			char buf[100];
			snprintf(buf, sizeof(buf), "cat /proc/%d/stack",
				 child_pid);
			system(buf);
			goto fail;
		}
	}
	assert(*statep == 2);

	// At this point the child is spinning, so any interrupt will keep us
	// in kernel space.  Make a syscall to make sure it happens at least
	// once during the half second that the child is spinning.
	kill(0, 0);
	gettimeofday(&quiesce_end, NULL);
	prctl(PR_SET_TASK_ISOLATION, 0);
	elapsed = (quiesce_end.tv_sec - quiesce_start.tv_sec) +
		(quiesce_end.tv_usec - quiesce_start.tv_usec) / 1000000.0;
	if (elapsed < 0.4 || elapsed > 0.6) {
		printf("expected ~0.5s wait after quiesce: was %g\n", elapsed);
		goto fail;
	}
	kill(child_pid, SIGKILL);
	return EXIT_SUCCESS;

fail:
	kill(child_pid, SIGKILL);
	return EXIT_FAILURE;
}

#ifdef __tile__
#include <arch/spr_def.h>
#endif

// Read a free-running hardware counter for the current architecture and
// return it as an unsigned long.  Supported: x86-64, tile and aarch64;
// any other architecture is rejected at compile time.
static inline unsigned long get_cycle_count(void)
{
#ifdef __x86_64__
	// rdtsc returns the counter split across eax (low) and edx (high).
	unsigned int lower, upper;
	__asm__ __volatile__("rdtsc" : "=a"(lower), "=d"(upper));
	return lower | ((unsigned long)upper << 32);
#elif defined(__tile__)
	// Read the SPR_CYCLE special-purpose register.
	return __insn_mfspr(SPR_CYCLE);
#elif defined(__aarch64__)
	// Read the cntvct_el0 system register.
	unsigned long vtick;
	__asm__ volatile("mrs %0, cntvct_el0" : "=r" (vtick));
	return vtick;
#else
#error Unsupported architecture
#endif
}

// Histogram of cycle counts up to HISTSIZE cycles: test_jitter() increments
// hist[d] each time one pass of its measuring loop took d cycles.
#define HISTSIZE 500
long hist[HISTSIZE];

// Information on loss of control of the cpu (loop passes too long to be
// recorded in hist[]).
struct jitter_info {
	unsigned long at;      // cycle of jitter event
	long cycles;           // how long we lost the cpu for
};
// At most MAX_EVENTS events are recorded; later ones are counted as a
// failure but not stored.
#define MAX_EVENTS 100
volatile struct jitter_info jitter[MAX_EVENTS];
unsigned int count;            // index into jitter[]

// Print the results gathered by test_jitter(): every non-empty histogram
// bucket as "<cycles> x <occurrences>", followed by the recorded jitter
// events (if any).  A trailing "... more" line is printed when the event
// table is full, since further events may have gone unrecorded.
void jitter_summarize(void)
{
	printf("INFO: loop times:\n");
	unsigned int i;
	for (i = 0; i < HISTSIZE; ++i)
		if (hist[i])
			// i is unsigned, so print it with %u.
			printf("  %u x %ld\n", i, hist[i]);

	if (count)
		printf("ERROR: jitter:\n");
	for (i = 0; i < count; ++i)
		// .at is an unsigned long, .cycles a long.
		printf("  %lu: %ld cycles\n", jitter[i].at, jitter[i].cycles);
	if (count == sizeof(jitter)/sizeof(jitter[0]))
		printf("  ... more\n");
}

// SIGINT handler installed by test_jitter(): end the current output line,
// print the jitter summary collected so far, and terminate the process
// with the overall exit status.  The signal number itself is not used.
void jitter_sigint(int sig)
{
	(void)sig;

	putchar('\n');
	jitter_summarize();
	exit(exit_status);
}

// Measure userspace jitter on the task isolation cpu for WAITTICKS ticks
// of get_cycle_count().
//
// After pinning to the isolation cpu, locking memory and enabling
// isolation, this spins reading the cycle counter.  The gap between
// consecutive reads is recorded in hist[] when it is below HISTSIZE;
// larger gaps mark the run as failed and are recorded in jitter[] (up to
// its capacity).  A summary is printed at the end, or from the SIGINT
// handler if interrupted.  Exits with EXIT_FAILURE if isolation cannot be
// enabled.
void test_jitter(unsigned long waitticks)
{
	printf("testing task isolation jitter for %lu ticks\n", waitticks);

	signal(SIGINT, jitter_sigint);
	set_my_cpu(task_isolation_cpu);
	int rc = mlockall(MCL_CURRENT);
	assert(rc == 0);

	// EAGAIN is treated as transient: retry until success or a real error.
	do
		rc = prctl(PR_SET_TASK_ISOLATION, PR_TASK_ISOLATION_ENABLE);
	while (rc != 0 && errno == EAGAIN);
	if (rc != 0) {
		printf("couldn't enable isolation (%d): FAIL\n", errno);
		exit(EXIT_FAILURE);
	}

	unsigned long start = get_cycle_count();
	unsigned long last = start;
	unsigned long elapsed;
	do {
		unsigned long next = get_cycle_count();
		unsigned long delta = next - last;
		elapsed = next - start;
		// hist[] only has slots 0..HISTSIZE-1, so a delta of exactly
		// HISTSIZE must be treated as a jitter event too.
		if (__builtin_expect(delta >= HISTSIZE, 0)) {
			exit_status = EXIT_FAILURE;
			if (count < sizeof(jitter)/sizeof(jitter[0])) {
				jitter[count].cycles = delta;
				jitter[count].at = elapsed;
				++count;
			}
		} else {
			hist[delta]++;
		}
		last = next;

	} while (elapsed < waitticks);

	// Leave isolation mode before doing any more syscalls.
	prctl(PR_SET_TASK_ISOLATION, 0);
	jitter_summarize();
}

// Entry point: "isolation [gigaticks]".
//
// Reads the first task isolation cpu from sysfs, checks that enabling
// isolation is refused (EINVAL) when not affinitized to such a cpu, runs
// the individual tests, and finally - only if everything passed - runs the
// jitter measurement for GIGATICKS billion ticks (default 10).
// Returns EXIT_SUCCESS if every test passed, EXIT_FAILURE otherwise.
int main(int argc, char **argv)
{
	// How many billion ticks to wait after running the other tests?
	unsigned long gigaticks = 10;
	if (argc == 2) {
		// Reject non-numeric input, trailing junk, negative values,
		// and values whose tick count would overflow unsigned long.
		char *argend;
		errno = 0;
		long parsed = strtol(argv[1], &argend, 10);
		if (errno != 0 || argend == argv[1] || *argend != '\0' ||
		    parsed < 0 ||
		    (unsigned long)parsed > ~0UL / 1000000000UL) {
			printf("invalid gigaticks value: %s\n", argv[1]);
			printf("syntax: isolation [gigaticks]\n");
			exit(EXIT_FAILURE);
		}
		gigaticks = (unsigned long)parsed;
	} else if (argc != 1) {
		printf("syntax: isolation [gigaticks]\n");
		exit(EXIT_FAILURE);
	}
	unsigned long waitticks = gigaticks * 1000000000UL;

	// Test that the /sys device is present and pick a cpu.
	FILE *f = fopen("/sys/devices/system/cpu/task_isolation", "r");
	if (f == NULL) {
		printf("/sys device: FAIL (%s)\n", strerror(errno));
		exit(EXIT_FAILURE);
	}
	char buf[100];
	char *result = fgets(buf, sizeof(buf), f);
	assert(result == buf);
	fclose(f);
	if (*buf == '\n') {
		printf("No task_isolation cores configured; please reboot with task_isolation=NNN\n");
		exit(EXIT_FAILURE);
	}
	// The file holds a cpu list; use the first number in it.
	char *end;
	task_isolation_cpu = strtol(buf, &end, 10);
	assert(end != buf);
	assert(*end == ',' || *end == '-' || *end == '\n');
	assert(task_isolation_cpu >= 0);
	printf("/sys device : OK (using task isolation cpu %d)\n",
	       task_isolation_cpu);

	// Test to see if with no mask set, we fail.
	if (prctl(PR_SET_TASK_ISOLATION, PR_TASK_ISOLATION_ENABLE) == 0 ||
	    errno != EINVAL) {
		printf("prctl unaffinitized: FAIL\n");
		exit_status = EXIT_FAILURE;
	} else {
		printf("prctl unaffinitized: OK\n");
	}

	// Or if affinitized to the wrong cpu.
	set_my_cpu(0);
	if (prctl(PR_SET_TASK_ISOLATION, PR_TASK_ISOLATION_ENABLE) == 0 ||
	    errno != EINVAL) {
		printf("prctl on cpu 0: FAIL\n");
		exit_status = EXIT_FAILURE;
	} else {
		printf("prctl on cpu 0: OK\n");
	}

	// Run the tests.
	test_killed("test_fault", setup_fault, do_fault);
	test_killed("test_syscall", NULL, do_syscall);
	test_munmap();
	test_unaligned();
	test_ok("test_off", NULL, do_syscall_off);
	test_nosig("test_multi", NULL, do_syscall_multi);
	test_nosig("test_quiesce", setup_quiesce, do_quiesce);

	// Exit failure if any test failed.
	if (exit_status != EXIT_SUCCESS) {
		printf("Skipping jitter testing due to test failures\n");
		return exit_status;
	}

	test_jitter(waitticks);

	return exit_status;
}