From 6dc88282bb83994ab2c25d73681d84b73a66cc77 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Jan 22 2016 20:41:20 +0000 Subject: sanlock: improve logging for renewals --- diff --git a/src/delta_lease.c b/src/delta_lease.c index ef1cb3c..3d5189c 100644 --- a/src/delta_lease.c +++ b/src/delta_lease.c @@ -464,6 +464,7 @@ int delta_lease_renew(struct task *task, struct delta_extra *extra, int prev_result, int *read_result, + int log_renewal_level, struct leader_record *leader_last, struct leader_record *leader_ret) { @@ -473,7 +474,7 @@ int delta_lease_renew(struct task *task, char **p_wbuf; char *wbuf; uint32_t checksum; - uint64_t host_id, id_offset, new_ts; + uint64_t host_id, id_offset, new_ts, now; int rv, iobuf_len, sector_size; if (!leader_last) { @@ -569,14 +570,21 @@ int delta_lease_renew(struct task *task, } } + if (log_renewal_level != -1) + log_level(sp->space_id, 0, NULL, log_renewal_level, "delta_renew begin read"); + rv = read_iobuf(disk->fd, disk->offset, task->iobuf, iobuf_len, task, sp->io_timeout); if (rv) { /* the next time delta_lease_renew() is called, prev_result will be this rv. If this rv is SANLK_AIO_TIMEOUT, we'll try to reap the event */ - log_erros(sp, "delta_renew read rv %d offset %llu %s", - rv, (unsigned long long)disk->offset, disk->path); + if (rv == SANLK_AIO_TIMEOUT) + log_erros(sp, "delta_renew read timeout %u sec offset %llu %s", + sp->io_timeout, (unsigned long long)disk->offset, disk->path); + else + log_erros(sp, "delta_renew read rv %d offset %llu %s", + rv, (unsigned long long)disk->offset, disk->path); return rv; } @@ -625,9 +633,11 @@ int delta_lease_renew(struct task *task, new_ts = monotime(); - if (leader.timestamp >= new_ts) { + if (log_renewal_level != -1) + log_level(sp->space_id, 0, NULL, log_renewal_level, "delta_renew begin write for new ts %llu", (unsigned long long)new_ts); + + if (leader.timestamp >= new_ts) log_erros(sp, "delta_renew timestamp too small"); - } leader.timestamp = new_ts; leader.checksum = 0; /* set below */ @@ -670,11 +680,18 @@ int delta_lease_renew(struct task *task, if (rv != SANLK_AIO_TIMEOUT) free(wbuf); + now = monotime(); + if (rv < 0) { - log_erros(sp, "delta_renew write error %d", rv); + log_erros(sp, "delta_renew write time %llu error %d", + (unsigned long long)(now - new_ts), rv); return rv; } + if (now - new_ts >= sp->io_timeout) + log_erros(sp, "delta_renew long write time %llu sec", + (unsigned long long)(now - new_ts)); + /* the paper shows doing a delay and another read here, but it seems unnecessary since we do the same at the beginning of the next renewal */ diff --git a/src/delta_lease.h b/src/delta_lease.h index 968c7c7..38c469a 100644 --- a/src/delta_lease.h +++ b/src/delta_lease.h @@ -33,6 +33,7 @@ int delta_lease_renew(struct task *task, struct delta_extra *extra, int prev_result, int *read_result, + int log_renewal_level, struct leader_record *leader_last, struct leader_record *leader_ret); diff --git a/src/direct.c b/src/direct.c index 2dbb098..c39988c 100644 --- a/src/direct.c +++ b/src/direct.c @@ -251,6 +251,7 @@ static int do_delta_action(int action, NULL, -1, &read_result, + 0, &leader, &leader); break; diff --git a/src/diskio.c b/src/diskio.c index 7647094..218fab1 100644 --- a/src/diskio.c +++ b/src/diskio.c @@ -347,9 +347,17 @@ static struct aicb *find_callback_slot(struct task *task, int ioto) struct iocb *ev_iocb = event.obj; struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb); int op = ev_iocb ? ev_iocb->aio_lio_opcode : -1; + const char *op_str; - log_taskw(task, "aio collect %d %p:%p:%p result %ld:%ld old free", - op, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); + if (op == IO_CMD_PREAD) + op_str = "RD"; + else if (op == IO_CMD_PWRITE) + op_str = "WR"; + else + op_str = "UK"; + + log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld old free", + op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); ev_aicb->used = 0; free(ev_aicb->buf); ev_aicb->buf = NULL; @@ -373,6 +381,7 @@ static int do_linux_aio(int fd, uint64_t offset, char *buf, int len, struct aicb *aicb; struct iocb *iocb; struct io_event event; + const char *op_str; int rv; if (!ioto) { @@ -426,24 +435,31 @@ static int do_linux_aio(int fd, uint64_t offset, char *buf, int len, struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb); int op = ev_iocb ? ev_iocb->aio_lio_opcode : -1; + if (op == IO_CMD_PREAD) + op_str = "RD"; + else if (op == IO_CMD_PWRITE) + op_str = "WR"; + else + op_str = "UK"; + ev_aicb->used = 0; if (ev_iocb != iocb) { - log_taskw(task, "aio collect %d %p:%p:%p result %ld:%ld other free", - op, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); + log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld other free", + op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); free(ev_aicb->buf); ev_aicb->buf = NULL; goto retry; } if ((int)event.res < 0) { - log_taskw(task, "aio collect %d %p:%p:%p result %ld:%ld match res", - op, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); + log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match res", + op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); rv = event.res; goto out; } if (event.res != len) { - log_taskw(task, "aio collect %d %p:%p:%p result %ld:%ld match len %d", - op, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2, len); + log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match len %d", + op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2, len); rv = -EMSGSIZE; goto out; } @@ -466,8 +482,15 @@ static int do_linux_aio(int fd, uint64_t offset, char *buf, int len, task->to_count++; - log_taskw(task, "aio timeout %d %p:%p:%p ioto %d to_count %d", - cmd, aicb, iocb, buf, ioto, task->to_count); + if (cmd == IO_CMD_PREAD) + op_str = "RD"; + else if (cmd == IO_CMD_PWRITE) + op_str = "WR"; + else + op_str = "UK"; + + log_taskw(task, "aio timeout %s %p:%p:%p ioto %d to_count %d", + op_str, aicb, iocb, buf, ioto, task->to_count); rv = io_cancel(task->aio_ctx, iocb, &event); if (!rv) { @@ -798,31 +821,39 @@ int read_iobuf_reap(int fd, uint64_t offset, char *iobuf, int iobuf_len, struct iocb *ev_iocb = event.obj; struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb); int op = ev_iocb ? ev_iocb->aio_lio_opcode : -1; + const char *op_str; + + if (op == IO_CMD_PREAD) + op_str = "RD"; + else if (op == IO_CMD_PWRITE) + op_str = "WR"; + else + op_str = "UK"; ev_aicb->used = 0; if (ev_iocb != iocb) { - log_taskw(task, "aio collect %d %p:%p:%p result %ld:%ld other free r", - op, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); + log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld other free r", + op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); free(ev_aicb->buf); ev_aicb->buf = NULL; goto retry; } if ((int)event.res < 0) { - log_taskw(task, "aio collect %d %p:%p:%p result %ld:%ld match res r", - op, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); + log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match res r", + op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); rv = event.res; goto out; } if (event.res != iobuf_len) { - log_taskw(task, "aio collect %d %p:%p:%p result %ld:%ld match len %d r", - op, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2, iobuf_len); + log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match len %d r", + op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2, iobuf_len); rv = -EMSGSIZE; goto out; } - log_taskw(task, "aio collect %d %p:%p:%p result %ld:%ld match reap", - op, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); + log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match reap", + op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); rv = 0; goto out; diff --git a/src/lockspace.c b/src/lockspace.c index 24f75d6..bc8849f 100644 --- a/src/lockspace.c +++ b/src/lockspace.c @@ -547,6 +547,7 @@ static void *lockspace_thread(void *arg_in) struct space *sp; struct leader_record leader; uint64_t delta_begin, last_success = 0; + int log_renewal_level = -1; int rv, delta_length, renewal_interval = 0; int id_renewal_seconds, id_renewal_fail_seconds; int acquire_result, delta_result, read_result; @@ -554,6 +555,9 @@ static void *lockspace_thread(void *arg_in) int stop = 0; int wd_con; + if (com.debug_renew) + log_renewal_level = LOG_DEBUG; + sp = (struct space *)arg_in; memset(&task, 0, sizeof(struct task)); @@ -652,7 +656,6 @@ static void *lockspace_thread(void *arg_in) if (stop) break; - /* * wait between each renewal */ @@ -681,6 +684,7 @@ static void *lockspace_thread(void *arg_in) delta_result = delta_lease_renew(&task, sp, &sp->host_id_disk, sp->space_name, bitmap, &extra, delta_result, &read_result, + log_renewal_level, &leader, &leader); delta_length = monotime() - delta_begin; @@ -732,9 +736,11 @@ static void *lockspace_thread(void *arg_in) } else if (delta_length > id_renewal_seconds) { log_erros(sp, "renewed %llu delta_length %d too long", (unsigned long long)last_success, delta_length); - } else if (com.debug_renew) { - log_space(sp, "renewed %llu delta_length %d interval %d", - (unsigned long long)last_success, delta_length, renewal_interval); + } else { + if (com.debug_renew) { + log_space(sp, "renewed %llu delta_length %d interval %d", + (unsigned long long)last_success, delta_length, renewal_interval); + } } }