/* * I/O monitor based on block queue trace data * * Copyright IBM Corp. 2008 * * Author(s): Martin Peschke * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "blktrace.h" #include "rbtree.h" #include "jhash.h" #include "blkiomon.h" struct trace { struct blk_io_trace bit; struct rb_node node; struct trace *next; long sequence; }; struct rb_search { struct rb_node **node_ptr; struct rb_node *parent; }; struct dstat_msg { long mtype; struct blkiomon_stat stat; }; struct dstat { struct dstat_msg msg; struct rb_node node; struct dstat *next; }; struct output { char *fn; FILE *fp; char *buf; int pipe; }; static char blkiomon_version[] = "0.3"; static FILE *ifp; static int interval = -1; static struct trace *vacant_traces_list = NULL; static int vacant_traces = 0; #define TRACE_HASH_SIZE 128 struct trace *thash[TRACE_HASH_SIZE] = {}; static struct dstat *vacant_dstats_list = NULL; static struct rb_root dstat_tree[2] = { RB_ROOT, RB_ROOT }; static struct dstat *dstat_list[2] = {}; static int dstat_curr = 0; static struct output drvdata, human, binary, debug; static char *msg_q_name = NULL; static int msg_q_id = -1, msg_q = -1; static long msg_id = -1; static pthread_t interval_thread; static pthread_mutex_t dstat_mutex = PTHREAD_MUTEX_INITIALIZER; int data_is_native = -1; static int up = 1; /* debugging */ static long leftover = 0, driverdata = 0, match = 0, mismatch = 0, sequence = 0; static void dump_bit(struct trace *t, const char *descr) { struct blk_io_trace *bit = &t->bit; if (!debug.fn) return; fprintf(debug.fp, "--- %s ---\n", descr); fprintf(debug.fp, "magic %16d\n", bit->magic); fprintf(debug.fp, "sequence %16d\n", bit->sequence); fprintf(debug.fp, "time %16ld\n", (unsigned long)bit->time); fprintf(debug.fp, "sector %16ld\n", (unsigned long)bit->sector); fprintf(debug.fp, "bytes %16d\n", bit->bytes); fprintf(debug.fp, "action %16x\n", bit->action); fprintf(debug.fp, "pid %16d\n", bit->pid); fprintf(debug.fp, "device %16d\n", bit->device); fprintf(debug.fp, "cpu %16d\n", bit->cpu); fprintf(debug.fp, "error %16d\n", bit->error); fprintf(debug.fp, "pdu_len %16d\n", bit->pdu_len); fprintf(debug.fp, "order %16ld\n", t->sequence); } static void dump_bits(struct trace *t1, struct trace *t2, const char *descr) { struct blk_io_trace *bit1 = &t1->bit; struct blk_io_trace *bit2 = &t2->bit; if (!debug.fn) return; fprintf(debug.fp, "--- %s ---\n", descr); fprintf(debug.fp, "magic %16d %16d\n", bit1->magic, bit2->magic); fprintf(debug.fp, "sequence %16d %16d\n", bit1->sequence, bit2->sequence); fprintf(debug.fp, "time %16ld %16ld\n", (unsigned long)bit1->time, (unsigned long)bit2->time); fprintf(debug.fp, "sector %16ld %16ld\n", (unsigned long)bit1->sector, (unsigned long)bit2->sector); fprintf(debug.fp, "bytes %16d %16d\n", bit1->bytes, bit2->bytes); fprintf(debug.fp, "action %16x %16x\n", bit1->action, bit2->action); fprintf(debug.fp, "pid %16d %16d\n", bit1->pid, bit2->pid); fprintf(debug.fp, "device %16d %16d\n", bit1->device, bit2->device); fprintf(debug.fp, "cpu %16d %16d\n", bit1->cpu, bit2->cpu); fprintf(debug.fp, "error %16d %16d\n", bit1->error, bit2->error); fprintf(debug.fp, "pdu_len %16d %16d\n", bit1->pdu_len, bit2->pdu_len); fprintf(debug.fp, "order %16ld %16ld\n", t1->sequence, t2->sequence); } static struct dstat *blkiomon_alloc_dstat(void) { struct dstat *dstat; if (vacant_dstats_list) { dstat = vacant_dstats_list; vacant_dstats_list = dstat->next; } else dstat = malloc(sizeof(*dstat)); if (!dstat) { fprintf(stderr, "blkiomon: could not allocate device statistic"); return NULL; } blkiomon_stat_init(&dstat->msg.stat); return dstat; } static struct dstat *blkiomon_find_dstat(struct rb_search *search, __u32 device) { struct rb_node **p = &(dstat_tree[dstat_curr].rb_node); struct rb_node *parent = NULL; struct dstat *dstat; while (*p) { parent = *p; dstat = rb_entry(parent, struct dstat, node); if (dstat->msg.stat.device < device) p = &(*p)->rb_left; else if (dstat->msg.stat.device > device) p = &(*p)->rb_right; else return dstat; } search->node_ptr = p; search->parent = parent; return NULL; } static struct dstat *blkiomon_get_dstat(__u32 device) { struct dstat *dstat; struct rb_search search = { 0, }; pthread_mutex_lock(&dstat_mutex); dstat = blkiomon_find_dstat(&search, device); if (dstat) goto out; dstat = blkiomon_alloc_dstat(); if (!dstat) goto out; dstat->msg.stat.device = device; rb_link_node(&dstat->node, search.parent, search.node_ptr); rb_insert_color(&dstat->node, &dstat_tree[dstat_curr]); dstat->next = dstat_list[dstat_curr]; dstat_list[dstat_curr] = dstat; out: pthread_mutex_unlock(&dstat_mutex); return dstat; } static int blkiomon_output_msg_q(struct dstat *dstat) { if (!msg_q_name) return 0; dstat->msg.mtype = msg_id; return msgsnd(msg_q, &dstat->msg, sizeof(struct blkiomon_stat), 0); } static int blkiomon_output_binary(struct dstat *dstat) { struct blkiomon_stat *p = &dstat->msg.stat; if (!binary.fn) return 0; if (fwrite(p, sizeof(*p), 1, binary.fp) != 1) goto failed; if (binary.pipe && fflush(binary.fp)) goto failed; return 0; failed: fprintf(stderr, "blkiomon: could not write to %s\n", binary.fn); fclose(binary.fp); binary.fn = NULL; return 1; } static struct dstat *blkiomon_output(struct dstat *head, struct timespec *ts) { struct dstat *dstat, *tail = NULL; for (dstat = head; dstat; dstat = dstat->next) { dstat->msg.stat.time = ts->tv_sec; blkiomon_stat_print(human.fp, &dstat->msg.stat); blkiomon_stat_to_be(&dstat->msg.stat); blkiomon_output_binary(dstat); blkiomon_output_msg_q(dstat); tail = dstat; } return tail; } static void *blkiomon_interval(void *data) { struct timespec wake, r; struct dstat *head, *tail; int finished; clock_gettime(CLOCK_REALTIME, &wake); while (1) { wake.tv_sec += interval; if (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &wake, &r)) { fprintf(stderr, "blkiomon: interrupted sleep"); continue; } /* grab tree and make data gatherer build up another tree */ pthread_mutex_lock(&dstat_mutex); finished = dstat_curr; dstat_curr = dstat_curr ? 0 : 1; pthread_mutex_unlock(&dstat_mutex); head = dstat_list[finished]; if (!head) continue; dstat_list[finished] = NULL; dstat_tree[finished] = RB_ROOT; tail = blkiomon_output(head, &wake); pthread_mutex_lock(&dstat_mutex); tail->next = vacant_dstats_list; vacant_dstats_list = head; pthread_mutex_unlock(&dstat_mutex); } return data; } #define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE)) static int blkiomon_account(struct blk_io_trace *bit_d, struct blk_io_trace *bit_c) { struct dstat *dstat; struct blkiomon_stat *p; __u64 d2c = (bit_c->time - bit_d->time) / 1000; /* ns -> us */ __u32 size = bit_d->bytes; __u64 thrput = size * 1000 / d2c; dstat = blkiomon_get_dstat(bit_d->device); if (!dstat) return 1; p = &dstat->msg.stat; if (BLK_DATADIR(bit_c->action) & BLK_TC_READ) { minmax_account(&p->thrput_r, thrput); minmax_account(&p->size_r, size); minmax_account(&p->d2c_r, d2c); } else if (BLK_DATADIR(bit_c->action) & BLK_TC_WRITE) { minmax_account(&p->thrput_w, thrput); minmax_account(&p->size_w, size); minmax_account(&p->d2c_w, d2c); } else p->bidir++; histlog2_account(p->size_hist, size, &size_hist); histlog2_account(p->d2c_hist, d2c, &d2c_hist); return 0; } static struct trace *blkiomon_alloc_trace(void) { struct trace *t = vacant_traces_list; if (t) { vacant_traces_list = t->next; vacant_traces--; } else t = malloc(sizeof(*t)); memset(t, 0, sizeof(*t)); return t; } static void blkiomon_free_trace(struct trace *t) { if (vacant_traces < 256) { t->next = vacant_traces_list; vacant_traces_list = t; vacant_traces++; } else free(t); } static int action(int a) { int bits = BLK_TC_WRITE | BLK_TC_READ | BLK_TC_FS | BLK_TC_PC; return a & (BLK_TC_ACT(bits)); } static void blkiomon_store_trace(struct trace *t) { int i = t->bit.sector % TRACE_HASH_SIZE; t->next = thash[i]; thash[i] = t; } static struct trace *blkiomon_fetch_trace(struct blk_io_trace *bit) { int i = bit->sector % TRACE_HASH_SIZE; struct trace *t, *prev = NULL; for (t = thash[i]; t; t = t->next) { if (t->bit.device == bit->device && t->bit.sector == bit->sector && action(t->bit.action) == action(bit->action)) { if (prev) prev->next = t->next; else thash[i] = t->next; return t; } prev = t; } return NULL; } static struct trace *blkiomon_do_trace(struct trace *t) { struct trace *t_stored, *t_old, *t_young; /* store trace if there is no match yet */ t_stored = blkiomon_fetch_trace(&t->bit); if (!t_stored) { blkiomon_store_trace(t); return blkiomon_alloc_trace(); } /* figure out older trace and younger trace */ if (t_stored->bit.time < t->bit.time) { t_old = t_stored; t_young = t; } else { t_old = t; t_young = t_stored; } /* we need an older D trace and a younger C trace */ if (t_old->bit.action & BLK_TC_ACT(BLK_TC_ISSUE) && t_young->bit.action & BLK_TC_ACT(BLK_TC_COMPLETE)) { /* matching D and C traces - update statistics */ match++; blkiomon_account(&t_old->bit, &t_young->bit); blkiomon_free_trace(t_stored); return t; } /* no matching D and C traces - keep more recent trace */ dump_bits(t_old, t_young, "mismatch"); mismatch++; blkiomon_store_trace(t_young); return t_old; } static int blkiomon_dump_drvdata(struct blk_io_trace *bit, void *pdu_buf) { if (!drvdata.fn) return 0; if (fwrite(bit, sizeof(*bit), 1, drvdata.fp) != 1) goto failed; if (fwrite(pdu_buf, bit->pdu_len, 1, drvdata.fp) != 1) goto failed; if (drvdata.pipe && fflush(drvdata.fp)) goto failed; return 0; failed: fprintf(stderr, "blkiomon: could not write to %s\n", drvdata.fn); fclose(drvdata.fp); drvdata.fn = NULL; return 1; } static int blkiomon_do_fifo(void) { struct trace *t; struct blk_io_trace *bit; void *pdu_buf = NULL; t = blkiomon_alloc_trace(); if (!t) return 1; bit = &t->bit; while (up) { if (fread(bit, sizeof(*bit), 1, ifp) != 1) { if (!feof(ifp)) fprintf(stderr, "blkiomon: could not read trace"); break; } if (ferror(ifp)) { clearerr(ifp); fprintf(stderr, "blkiomon: error while reading trace"); break; } if (data_is_native == -1 && check_data_endianness(bit->magic)) { fprintf(stderr, "blkiomon: endianess problem\n"); break; } /* endianess */ trace_to_cpu(bit); if (verify_trace(bit)) { fprintf(stderr, "blkiomon: bad trace\n"); break; } /* read additional trace payload */ if (bit->pdu_len) { pdu_buf = realloc(pdu_buf, bit->pdu_len); if (fread(pdu_buf, bit->pdu_len, 1, ifp) != 1) { clearerr(ifp); fprintf(stderr, "blkiomon: could not read payload\n"); break; } } t->sequence = sequence++; /* forward low-level device driver trace to other tool */ if (bit->action & BLK_TC_ACT(BLK_TC_DRV_DATA)) { driverdata++; if (blkiomon_dump_drvdata(bit, pdu_buf)) { fprintf(stderr, "blkiomon: could not send trace\n"); break; } continue; } if (!(bit->action & BLK_TC_ACT(BLK_TC_ISSUE | BLK_TC_COMPLETE))) continue; /* try to find matching trace and update statistics */ t = blkiomon_do_trace(t); if (!t) { fprintf(stderr, "blkiomon: could not alloc trace\n"); break; } bit = &t->bit; /* t and bit will be recycled for next incoming trace */ } blkiomon_free_trace(t); free(pdu_buf); return 0; } static int blkiomon_open_output(struct output *out) { int mode, vbuf_size; if (!out->fn) return 0; if (!strcmp(out->fn, "-")) { out->fp = fdopen(STDOUT_FILENO, "w"); mode = _IOLBF; vbuf_size = 4096; out->pipe = 1; } else { out->fp = fopen(out->fn, "w"); mode = _IOFBF; vbuf_size = 128 * 1024; out->pipe = 0; } if (!out->fp) goto failed; out->buf = malloc(128 * 1024); if (setvbuf(out->fp, out->buf, mode, vbuf_size)) goto failed; return 0; failed: fprintf(stderr, "blkiomon: could not write to %s\n", out->fn); out->fn = NULL; free(out->buf); return 1; } static int blkiomon_open_msg_q(void) { key_t key; if (!msg_q_name) return 0; if (!msg_q_id || msg_id <= 0) return 1; key = ftok(msg_q_name, msg_q_id); if (key == -1) return 1; while (up) { msg_q = msgget(key, S_IRWXU); if (msg_q >= 0) break; } return (msg_q >= 0 ? 0 : -1); } static void blkiomon_debug(void) { int i; struct trace *t; if (!debug.fn) return; for (i = 0; i < TRACE_HASH_SIZE; i++) for (t = thash[i]; t; t = t->next) { dump_bit(t, "leftover"); leftover++; } fprintf(debug.fp, "%ld leftover, %ld match, %ld mismatch, " "%ld driverdata, %ld overall\n", leftover, match, mismatch, driverdata, sequence); } #define S_OPTS "b:d:D:h:I:Q:q:m:V" static char usage_str[] = "\n\nblkiomon " \ "-I | --interval=\n" \ "[ -h | --human-readable= ]\n" \ "[ -b | --binary= ]\n" \ "[ -d | --dump-lldd= ]\n" \ "[ -D | --debug= ]\n" \ "[ -Q | --msg-queue=]\n" \ "[ -q | --msg-queue-id=]\n" \ "[ -m | --msg-id=]\n" \ "[ -V | --version ]\n\n" \ "\t-I Sample interval.\n" \ "\t-h Human-readable output file.\n" \ "\t-b Binary output file.\n" \ "\t-d Output file for data emitted by low level device driver.\n" \ "\t-D Output file for debugging data.\n" \ "\t-Qqm Output to message queue using given ID for messages.\n" \ "\t-V Print program version.\n\n"; static struct option l_opts[] = { { .name = "human-readable", .has_arg = required_argument, .flag = NULL, .val = 'h' }, { .name = "binary", .has_arg = required_argument, .flag = NULL, .val = 'b' }, { .name = "dump-lldd", .has_arg = required_argument, .flag = NULL, .val = 'd' }, { .name = "debug", .has_arg = required_argument, .flag = NULL, .val = 'D' }, { .name = "interval", .has_arg = required_argument, .flag = NULL, .val = 'I' }, { .name = "msg-queue", .has_arg = required_argument, .flag = NULL, .val = 'Q' }, { .name = "msg-queue-id", .has_arg = required_argument, .flag = NULL, .val = 'q' }, { .name = "msg-id", .has_arg = required_argument, .flag = NULL, .val = 'm' }, { .name = "version", .has_arg = no_argument, .flag = NULL, .val = 'V' }, { .name = NULL, } }; static void blkiomon_signal(int signal) { fprintf(stderr, "blkiomon: terminated by signal\n"); up = signal & 0; } int main(int argc, char *argv[]) { int c; signal(SIGALRM, blkiomon_signal); signal(SIGINT, blkiomon_signal); signal(SIGTERM, blkiomon_signal); signal(SIGQUIT, blkiomon_signal); while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) { switch (c) { case 'h': human.fn = optarg; break; case 'b': binary.fn = optarg; break; case 'd': drvdata.fn = optarg; break; case 'D': debug.fn = optarg; break; case 'I': interval = atoi(optarg); break; case 'Q': msg_q_name = optarg; break; case 'q': msg_q_id = atoi(optarg); break; case 'm': msg_id = atoi(optarg); break; case 'V': printf("%s version %s\n", argv[0], blkiomon_version); return 0; default: fprintf(stderr, "Usage: %s", usage_str); return 1; } } if (interval <= 0) { fprintf(stderr, "Usage: %s", usage_str); return 1; } ifp = fdopen(STDIN_FILENO, "r"); if (!ifp) { perror("blkiomon: could not open stdin for reading"); return 1; } if (blkiomon_open_output(&human)) return 1; if (blkiomon_open_output(&binary)) return 1; if (blkiomon_open_output(&drvdata)) return 1; if (blkiomon_open_output(&debug)) return 1; if (blkiomon_open_msg_q()) return 1; if (pthread_create(&interval_thread, NULL, blkiomon_interval, NULL)) { fprintf(stderr, "blkiomon: could not create thread"); return 1; } blkiomon_do_fifo(); blkiomon_debug(); return 0; }