From 90481ecfde3e7cec35b34d2431820d77a182d16b Mon Sep 17 00:00:00 2001 From: David Teigland Date: Jun 14 2013 18:27:41 +0000 Subject: sanlock: add get_hosts api Returns info about hosts in a lockspace. Use this to display host status from the command line: sanlock client gets -h 1 Signed-off-by: David Teigland --- diff --git a/src/client.c b/src/client.c index 1cb81ee..a1b1ee3 100644 --- a/src/client.c +++ b/src/client.c @@ -234,6 +234,94 @@ int sanlock_get_lockspaces(struct sanlk_lockspace **lss, int *lss_count, return rv; } +int sanlock_get_hosts(const char *ls_name, uint64_t host_id, + struct sanlk_host **hss, int *hss_count, + uint32_t flags) +{ + struct sm_header h; + struct sanlk_lockspace ls; + struct sanlk_host *hsbuf, *hs; + int rv, fd, i, ret, recv_count; + + if (!ls_name) + return -EINVAL; + + memset(&ls, 0, sizeof(struct sanlk_lockspace)); + strncpy(ls.name, ls_name, SANLK_NAME_LEN); + ls.host_id = host_id; + + rv = connect_socket(&fd); + if (rv < 0) + return rv; + + rv = send_header(fd, SM_CMD_GET_HOSTS, flags, + sizeof(struct sanlk_lockspace), + 0, 0); + if (rv < 0) + goto out; + + rv = send(fd, &ls, sizeof(struct sanlk_lockspace), 0); + if (rv < 0) { + rv = -errno; + goto out; + } + + /* receive result and ls structs */ + + memset(&h, 0, sizeof(struct sm_header)); + + rv = recv(fd, &h, sizeof(h), MSG_WAITALL); + if (rv < 0) { + rv = -errno; + goto out; + } + + if (rv != sizeof(h)) { + rv = -1; + goto out; + } + + /* -ENOSPC means that the daemon's send buffer ran out of space */ + + rv = (int)h.data; + if (rv < 0 && rv != -ENOSPC) + goto out; + + *hss_count = h.data2; + recv_count = h.data2; + + if (!hss) + goto out; + + hsbuf = malloc(recv_count * sizeof(struct sanlk_host)); + if (!hsbuf) + goto out; + + hs = hsbuf; + + for (i = 0; i < recv_count; i++) { + ret = recv(fd, hs, sizeof(struct sanlk_host), MSG_WAITALL); + if (ret < 0) { + rv = -errno; + free(hsbuf); + goto out; + } + + if (ret != sizeof(struct sanlk_host)) { + rv = -1; + free(hsbuf); + 
goto out; + } + + hs++; + } + + *hss = hsbuf; + out: + close(fd); + return rv; +} + int sanlock_align(struct sanlk_disk *disk) { int rv, fd; diff --git a/src/cmd.c b/src/cmd.c index 8ad72b7..5cb02f4 100644 --- a/src/cmd.c +++ b/src/cmd.c @@ -1931,6 +1931,34 @@ static void cmd_get_lockspaces(int fd, struct sm_header *h_recv) send(fd, send_data_buf, len, MSG_NOSIGNAL); } +static void cmd_get_hosts(int fd, struct sm_header *h_recv) +{ + struct sm_header h; + struct sanlk_lockspace lockspace; + int count = 0, len = 0, rv; + + memset(&h, 0, sizeof(h)); + memcpy(&h, h_recv, sizeof(struct sm_header)); + h.length = sizeof(h); + h.data = 0; + + rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); + if (rv != sizeof(struct sanlk_lockspace)) { + h.data = -ENOTCONN; + goto out; + } + + rv = get_hosts(&lockspace, send_data_buf, &len, &count, LOG_DUMP_SIZE); + + h.length = sizeof(struct sm_header) + len; + h.data = rv; + h.data2 = count; +out: + send(fd, &h, sizeof(struct sm_header), MSG_NOSIGNAL); + if (len) + send(fd, send_data_buf, len, MSG_NOSIGNAL); +} + static void cmd_restrict(int ci, int fd, struct sm_header *h_recv) { log_debug("cmd_restrict ci %d fd %d pid %d flags %x", @@ -2007,6 +2035,10 @@ void call_cmd_daemon(int ci, struct sm_header *h_recv, int client_maxi) strcpy(client[ci].owner_name, "get_lockspaces"); cmd_get_lockspaces(fd, h_recv); break; + case SM_CMD_GET_HOSTS: + strcpy(client[ci].owner_name, "get_hosts"); + cmd_get_hosts(fd, h_recv); + break; }; if (auto_close) diff --git a/src/delta_lease.c b/src/delta_lease.c index e0fe372..19c342b 100644 --- a/src/delta_lease.c +++ b/src/delta_lease.c @@ -655,6 +655,9 @@ int delta_lease_init(struct task *task, if (!max_hosts) max_hosts = DEFAULT_MAX_HOSTS; + if (max_hosts > DEFAULT_MAX_HOSTS) + return -E2BIG; + if (!io_timeout) io_timeout = DEFAULT_IO_TIMEOUT; diff --git a/src/lockspace.c b/src/lockspace.c index 2940549..b5576ea 100644 --- a/src/lockspace.c +++ b/src/lockspace.c @@ -907,6 +907,182 @@ 
int get_lockspaces(char *buf, int *len, int *count, int maxlen) } /* + * After the lockspace starts, there is a limited amount of + * time that we've been watching the other hosts. This means + * we can't make an accurate assessment of their state, because + * the state is based on monitoring the hosts for host_fail_seconds + * and host_dead_seconds, or seeing a renewal. When none of + * those are true (not enough time monitoring and not seeing a + * renewal), we return UNKNOWN. + * + * (Example number of seconds below are based on hosts using the + * default 10 second io timeout.) + * + * * For hosts that are alive when we start, we return: + * UNKNOWN then LIVE + * + * UNKNOWN would typically last for 10-20 seconds, but it's possible that + * UNKNOWN could persist for up to 80 seconds before LIVE is returned. + * LIVE is returned after we see the timestamp change once. + * + * * For hosts that are dead when we start, we'd return: + * UNKNOWN then FAIL then DEAD + * + * UNKNOWN would last for 80 seconds before we return FAIL. + * FAIL would last for 60 more seconds before we return DEAD. + * + * * Hosts that are failing and don't recover would be the same as prev. + * + * * For hosts that are failing but recover, we'd return: + * UNKNOWN then FAIL then LIVE + * + * + * For another host that is alive when we start, + * the sequence of values is: + * + * 0: we have not yet called check_other_leases() + * first_check = 0, last_check = 0, last_live = 0 + * + * other host renews its lease + * + * 10: we call check_other_leases() for the first time, + * first_check = 10, last_check = 10, last_live = 10 + * + * other host renews its lease + * + * 20: we call check_other_leases() for the second time, + * first_check = 10, last_check = 20, last_live = 20 + * + * At 10, we have not yet seen a renewal from the other host, i.e. we have + * not seen its timestamp change (we only have one sample). The host could + * be dead or alive, so we set the state to UNKNOWN. 
The way we know + * that we have not yet observed the timestamp change is that + * first_check == last_live, (10 == 10). + * + * At 20, we have seen a renewal, i.e. the timestamp changed between checks, + * so we return LIVE. + * + * In the other case, if the host was actually dead, not alive, it would not + * have renewed between 10 and 20. So at 20 we would continue to see + * first_check == last_live, and would return UNKNOWN. If the host remains + * dead, we'd continue to report UNKNOWN for the first 80 seconds. + * After 80 seconds, we'd return FAIL. After 140 seconds we'd return DEAD. + */ + +/* Also see host_live() */ + +static uint32_t get_host_flag(struct space *sp, struct host_status *hs) +{ + uint64_t now, last; + uint32_t flags; + uint32_t other_io_timeout; + int other_host_fail_seconds, other_host_dead_seconds; + + now = monotime(); + other_io_timeout = hs->io_timeout; + other_host_fail_seconds = calc_id_renewal_fail_seconds(other_io_timeout); + other_host_dead_seconds = calc_host_dead_seconds(other_io_timeout); + + flags = 0; + + if (!hs->timestamp) { + flags = SANLK_HOST_FREE; + goto out; + } + + if (!hs->last_live) + last = hs->first_check; + else + last = hs->last_live; + + if (sp->host_id == hs->owner_id) { + /* we are alive */ + flags = SANLK_HOST_LIVE; + + } else if ((now - last <= other_host_fail_seconds) && + (hs->first_check == hs->last_live)) { + /* we haven't seen the timestamp change yet */ + flags = SANLK_HOST_UNKNOWN; + + } else if (now - last <= other_host_fail_seconds) { + flags = SANLK_HOST_LIVE; + + } else if (now - last > other_host_dead_seconds) { + flags = SANLK_HOST_DEAD; + + } else if (now - last > other_host_fail_seconds) { + flags = SANLK_HOST_FAIL; + } +out: + return flags; +} + +int get_hosts(struct sanlk_lockspace *ls, char *buf, int *len, int *count, int maxlen) +{ + struct space *sp; + struct host_status *hs; + struct sanlk_host *host; + int host_count = 0; + int i, rv; + + rv = 0; + *len = 0; + *count = 0; + host = 
(struct sanlk_host *)buf; + + pthread_mutex_lock(&spaces_mutex); + sp = _search_space(ls->name, NULL, 0, &spaces, NULL, NULL); + if (!sp) { + rv = -ENOENT; + goto out; + } + + /* + * Between add_lockspace completing and the first + * time we call check_other_leases, we don't have + * any data on other hosts, so return this error + * to indicate this to the caller. + */ + if (!sp->host_status[0].last_check) { + rv = -EAGAIN; + goto out; + } + + for (i = 0; i < DEFAULT_MAX_HOSTS; i++) { + hs = &sp->host_status[i]; + + if (ls->host_id && ls->host_id != i) + continue; + + if (!ls->host_id && !hs->timestamp) + continue; + + host_count++; + + if (*len + sizeof(struct sanlk_host) > maxlen) { + rv = -ENOSPC; + continue; + } + + host->host_id = i + 1; + host->generation = hs->owner_generation; + host->timestamp = hs->timestamp; + host->io_timeout = hs->io_timeout; + host->flags = get_host_flag(sp, hs); + + *len += sizeof(struct sanlk_host); + + host++; + } + out: + pthread_mutex_unlock(&spaces_mutex); + + *count = host_count; + + return rv; +} + +/* * we call stop_host_id() when all pids are gone and we're in a safe state, so * it's safe to unlink the watchdog right away here. 
We want to sp the unlink * as soon as it's safe, so we can reduce the chance we get killed by the diff --git a/src/lockspace.h b/src/lockspace.h index 47ce5ba..3ffe17f 100644 --- a/src/lockspace.h +++ b/src/lockspace.h @@ -26,5 +26,6 @@ int rem_lockspace_start(struct sanlk_lockspace *ls, unsigned int *space_id); int rem_lockspace_wait(struct sanlk_lockspace *ls, unsigned int space_id); void free_lockspaces(int wait); int get_lockspaces(char *buf, int *len, int *count, int maxlen); +int get_hosts(struct sanlk_lockspace *ls, char *buf, int *len, int *count, int maxlen); #endif diff --git a/src/main.c b/src/main.c index 28f21da..cfbb9a4 100644 --- a/src/main.c +++ b/src/main.c @@ -1153,6 +1153,7 @@ static void process_connection(int ci) case SM_CMD_HOST_STATUS: case SM_CMD_LOG_DUMP: case SM_CMD_GET_LOCKSPACES: + case SM_CMD_GET_HOSTS: call_cmd_daemon(ci, &h, client_maxi); break; case SM_CMD_ADD_LOCKSPACE: @@ -1782,7 +1783,7 @@ static void print_usage(void) printf("\n"); printf("sanlock client [options]\n"); printf("sanlock client status [-D] [-o p|s]\n"); - printf("sanlock client gets\n"); + printf("sanlock client gets [-h 0|1]\n"); printf("sanlock client host_status -s LOCKSPACE [-D]\n"); printf("sanlock client log_dump\n"); printf("sanlock client shutdown [-f 0|1]\n"); @@ -2015,7 +2016,10 @@ static int read_command_line(int argc, char *argv[]) com.use_watchdog = atoi(optionarg); break; case 'h': - com.high_priority = atoi(optionarg); + if (com.action == ACT_GETS) + com.get_hosts = atoi(optionarg); + else + com.high_priority = atoi(optionarg); break; case 'l': com.mlock_level = atoi(optionarg); @@ -2143,15 +2147,96 @@ static char *lsf_to_str(uint32_t flags) return lsf_str; } +static const char *host_state_str(uint32_t flags) +{ + int val = flags & SANLK_HOST_MASK; + + if (val == SANLK_HOST_FREE) + return "FREE"; + if (val == SANLK_HOST_LIVE) + return "LIVE"; + if (val == SANLK_HOST_FAIL) + return "FAIL"; + if (val == SANLK_HOST_DEAD) + return "DEAD"; + if (val == 
SANLK_HOST_UNKNOWN) + return "UNKNOWN"; + return "ERROR"; +} + +static int do_client_gets(void) +{ + struct sanlk_lockspace *lss = NULL, *ls; + struct sanlk_host *hss = NULL, *hs; + int ls_count = 0, hs_count = 0; + int i, j, rv; + + rv = sanlock_get_lockspaces(&lss, &ls_count, 0); + if (rv < 0) + log_tool("gets error %d", rv); + + if (rv < 0 && rv != -ENOSPC) { + if (lss) + free(lss); + return rv; + } + + if (!lss) + return 0; + + ls = lss; + + for (i = 0; i < ls_count; i++) { + log_tool("s %.48s:%llu:%s:%llu %s", + ls->name, + (unsigned long long)ls->host_id, + ls->host_id_disk.path, + (unsigned long long)ls->host_id_disk.offset, + !ls->flags ? "" : lsf_to_str(ls->flags)); + + if (!com.get_hosts) + goto next; + + rv = sanlock_get_hosts(ls->name, 0, &hss, &hs_count, 0); + if (rv == -EAGAIN) { + log_tool("hosts not ready"); + goto next; + } + if (rv < 0) { + log_tool("hosts error %d", rv); + goto next; + } + + if (!hss) + goto next; + + hs = hss; + + for (j = 0; j < hs_count; j++) { + log_tool("h %llu gen %llu timestamp %llu %s", + (unsigned long long)hs->host_id, + (unsigned long long)hs->generation, + (unsigned long long)hs->timestamp, + host_state_str(hs->flags)); + hs++; + } + free(hss); + next: + ls++; + } + + free(lss); + return 0; +} + static int do_client(void) { struct sanlk_resource **res_args = NULL; struct sanlk_resource *res; - struct sanlk_lockspace *lss, *ls; char *res_state = NULL; char *res_str = NULL; uint32_t io_timeout = 0; - int i, fd, count; + int i, fd; int rv = 0; if (com.action == ACT_COMMAND || com.action == ACT_ACQUIRE) { @@ -2174,36 +2259,10 @@ static int do_client(void) break; case ACT_GETS: - lss = NULL; - - rv = sanlock_get_lockspaces(&lss, &count, 0); - if (rv < 0) - log_tool("gets error %d", rv); - - if (rv < 0 && rv != -ENOSPC) { - if (lss) - free(lss); - break; - } - - if (!lss) - break; - - ls = lss; - - for (i = 0; i < count; i++) { - log_tool("s %.48s:%llu:%s:%llu %s", - ls->name, - (unsigned long long)ls->host_id, - 
ls->host_id_disk.path, - (unsigned long long)ls->host_id_disk.offset, - !ls->flags ? "" : lsf_to_str(ls->flags)); - ls++; - } - - free(lss); + rv = do_client_gets(); break; + case ACT_LOG_DUMP: rv = sanlock_log_dump(LOG_DUMP_SIZE); break; diff --git a/src/paxos_lease.c b/src/paxos_lease.c index 577ed0b..3ce4815 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -1697,6 +1697,9 @@ int paxos_lease_init(struct task *task, if (!max_hosts) max_hosts = DEFAULT_MAX_HOSTS; + if (max_hosts > DEFAULT_MAX_HOSTS) + return -E2BIG; + sector_size = token->disks[0].sector_size; align_size = direct_align(&token->disks[0]); diff --git a/src/sanlock.8 b/src/sanlock.8 index edcaf35..7386173 100644 --- a/src/sanlock.8 +++ b/src/sanlock.8 @@ -262,7 +262,7 @@ Add -D to show extra internal daemon status for debugging. Print lockspaces being managed by the sanlock daemon. The LOCKSPACE string will be followed by ADD or REM if the lockspace is currently being -added or removed. +added or removed. Add -h 1 to also show hosts in each lockspace. 
.B sanlock client log_dump diff --git a/src/sanlock.h b/src/sanlock.h index 8446ebe..136faa1 100644 --- a/src/sanlock.h +++ b/src/sanlock.h @@ -92,6 +92,14 @@ struct sanlk_lockspace { struct sanlk_disk host_id_disk; }; +struct sanlk_host { + uint64_t host_id; + uint64_t generation; + uint64_t timestamp; + uint32_t io_timeout; + uint32_t flags; +}; + size_t sanlock_path_export(char *dst, const char *src, size_t dstlen); size_t sanlock_path_import(char *dst, const char *src, size_t dstlen); diff --git a/src/sanlock_admin.h b/src/sanlock_admin.h index 5d3d5af..9e01b3c 100644 --- a/src/sanlock_admin.h +++ b/src/sanlock_admin.h @@ -24,6 +24,14 @@ #define SANLK_LSF_ADD 0x00000001 #define SANLK_LSF_REM 0x00000002 +/* host status returned in low byte of sanlk_host.flags by get */ +#define SANLK_HOST_UNKNOWN 0x00000001 +#define SANLK_HOST_FREE 0x00000002 +#define SANLK_HOST_LIVE 0x00000003 +#define SANLK_HOST_FAIL 0x00000004 +#define SANLK_HOST_DEAD 0x00000005 +#define SANLK_HOST_MASK 0x0000000F /* select SANLK_HOST_ from flags */ + /* * add_lockspace returns: * 0: the lockspace has been added successfully @@ -75,6 +83,50 @@ int sanlock_get_lockspaces(struct sanlk_lockspace **lss, int *lss_count, uint32_t flags); /* + * When host_id is > 0, returns the sanlk_host info about the + * specified host_id. + * + * When host_id is 0, returns sanlk_host info about all hosts + * that have been seen alive. + * + * host status returned by sanlk_host.flags & SANLK_HOST_MASK: + * + * UNKNOWN: after adding lockspace, there has not yet been + * enough time monitoring other hosts to make an accurate + * assessment. 
+ * + * FREE: delta lease not held + * the delta lease timestamp is zero + * + * LIVE: the host is alive + * now - last <= other_host_fail_seconds + * + * FAIL: the host is failing and may be in recovery (killing pids) + * now - last > other_host_fail_seconds + * + * DEAD: the host is dead, its watchdog has fired + * now - last > other_host_dead_seconds + * + * now: local monotonic time + * + * last: if we have never seen the host's timestamp change, then + * last is the local monotime when we first checked it, otherwise + * last is the local monotime when we last saw the timestamp change + * (which would be some time after it was written by the host.) + * + * other_host_fail_seconds: based on the host's io_timeout, + * the number of seconds after which it would begin recovery + * (killing pids) if still alive and unable to renew its lease. + * + * other_host_dead_seconds: based on the host's io_timeout, + * the number of seconds after which its watchdog has fired. + */ + +int sanlock_get_hosts(const char *ls_name, uint64_t host_id, + struct sanlk_host **hss, int *hss_count, + uint32_t flags); + +/* * Returns the alignment in bytes required by sanlock_init() * (1MB for disks with 512 sectors, 8MB for disks with 4096 sectors) */ diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index f0153c6..09d15b3 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -258,7 +258,8 @@ struct command_line { int debug_renew; int quiet_fail; int use_watchdog; - int high_priority; + int high_priority; /* -h */ + int get_hosts; /* -h */ int mlock_level; int max_worker_threads; int aio_arg; diff --git a/src/sanlock_sock.h b/src/sanlock_sock.h index bbffa7d..d4d6ef6 100644 --- a/src/sanlock_sock.h +++ b/src/sanlock_sock.h @@ -40,6 +40,7 @@ enum { SM_CMD_READ_LOCKSPACE = 20, SM_CMD_READ_RESOURCE = 21, SM_CMD_GET_LOCKSPACES = 22, + SM_CMD_GET_HOSTS = 23, }; struct sm_header {