From 933a474339ded8d025c99356f583d228b6a2b9ba Mon Sep 17 00:00:00 2001 From: David Teigland Date: Nov 29 2017 19:21:50 +0000 Subject: sanlock: fix release interference with paxos The process of releasing a lease on disk involved first zeroing our own dblock values prior to zeroing the leader record. Clearing dblock values can interfere with an in-progress paxos ballot, and can result in a second host owning the same lease version that we released. The incorrect zeroing of dblocks was added in commit d6bef45b9716c581d99466280a52a01c9ebe3bf7 The reason for zeroing was to fix an issue with shared leases here: e40e1f6e22f9b10f08d53fc7da94f5158d9e4ae8 That issue is now resolved by setting a new flag in the dblock to indicate we are releasing the lease. The new flag is set, along with the unchanged dblock values, before clearing the leader block. --- diff --git a/src/leader.h b/src/leader.h index b95866a..d67c243 100644 --- a/src/leader.h +++ b/src/leader.h @@ -18,7 +18,7 @@ #define PAXOS_DISK_MAGIC 0x06152010 #define PAXOS_DISK_CLEAR 0x11282016 #define PAXOS_DISK_VERSION_MAJOR 0x00060000 -#define PAXOS_DISK_VERSION_MINOR 0x00000002 +#define PAXOS_DISK_VERSION_MINOR 0x00000003 #define DELTA_DISK_MAGIC 0x12212010 #define DELTA_DISK_VERSION_MAJOR 0x00030000 diff --git a/src/ondisk.c b/src/ondisk.c index 83b6832..21f9b64 100644 --- a/src/ondisk.c +++ b/src/ondisk.c @@ -92,6 +92,7 @@ void paxos_dblock_in(struct paxos_dblock *end, struct paxos_dblock *pd) pd->inp3 = le64_to_cpu(end->inp3); pd->lver = le64_to_cpu(end->lver); pd->checksum = le32_to_cpu(end->checksum); + pd->flags = le32_to_cpu(end->flags); } void paxos_dblock_out(struct paxos_dblock *pd, struct paxos_dblock *end) @@ -104,6 +105,7 @@ void paxos_dblock_out(struct paxos_dblock *pd, struct paxos_dblock *end) end->lver = cpu_to_le64(pd->lver); /* N.B. the checksum must be computed after the byte swapping */ /* paxos_dblock_out(pd, end); checksum = compute(end), end->checksum = cpu_to_le32(checksum); */ + end->flags = cpu_to_le32(pd->flags); } void mode_block_in(struct mode_block *end, struct mode_block *mb) diff --git a/src/paxos_dblock.h b/src/paxos_dblock.h index a07abc4..7d08c95 100644 --- a/src/paxos_dblock.h +++ b/src/paxos_dblock.h @@ -12,6 +12,8 @@ #define DBLOCK_CHECKSUM_LEN 48 /* ends before checksum field */ +#define DBLOCK_FL_RELEASED 0x00000001 + struct paxos_dblock { uint64_t mbal; uint64_t bal; @@ -20,6 +22,12 @@ struct paxos_dblock { uint64_t inp3; /* host_id's timestamp */ uint64_t lver; uint32_t checksum; + uint32_t flags; /* DBLOCK_FL_ */ }; +/* + * This struct cannot grow any larger than MBLOCK_OFFSET (128) + * because the mode_block starts at that offset in the same sector. + */ + #endif diff --git a/src/paxos_lease.c b/src/paxos_lease.c index 726056f..c13c5a2 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -1627,9 +1627,9 @@ int paxos_lease_acquire(struct task *task, if (cur_leader.write_id != cur_leader.owner_id) { rv = read_dblock(task, token, &token->disks[0], cur_leader.owner_id, &owner_dblock); - if (!rv && !owner_dblock.lver) { + if (!rv && (owner_dblock.flags & DBLOCK_FL_RELEASED)) { /* not an error, but interesting to see */ - log_errot(token, "paxos_acquire owner %llu %llu %llu writer %llu owner dblock free", + log_errot(token, "paxos_acquire owner %llu %llu %llu writer %llu owner dblock released", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, @@ -1956,28 +1956,68 @@ int paxos_lease_release(struct task *task, last = leader_last; if (leader.lver != last->lver) { - log_errot(token, "paxos_release %llu other lver %llu", + log_errot(token, "paxos_release other lver " + "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " + "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, - (unsigned long long)leader.lver); + (unsigned long long)last->owner_id, + (unsigned long long)last->owner_generation, + (unsigned long long)last->timestamp, + (unsigned long long)last->write_id, + (unsigned long long)last->write_generation, + (unsigned long long)last->write_timestamp, + (unsigned long long)leader.lver, + (unsigned long long)leader.owner_id, + (unsigned long long)leader.owner_generation, + (unsigned long long)leader.timestamp, + (unsigned long long)leader.write_id, + (unsigned long long)leader.write_generation, + (unsigned long long)leader.write_timestamp); return SANLK_RELEASE_LVER; } if (leader.timestamp == LEASE_FREE) { - log_errot(token, "paxos_release %llu already free %llu %llu %llu", + log_errot(token, "paxos_release already free " + "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " + "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, + (unsigned long long)last->owner_id, + (unsigned long long)last->owner_generation, + (unsigned long long)last->timestamp, + (unsigned long long)last->write_id, + (unsigned long long)last->write_generation, + (unsigned long long)last->write_timestamp, + (unsigned long long)leader.lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, - (unsigned long long)leader.timestamp); + (unsigned long long)leader.timestamp, + (unsigned long long)leader.write_id, + (unsigned long long)leader.write_generation, + (unsigned long long)leader.write_timestamp); + return SANLK_RELEASE_OWNER; } if (leader.owner_id != token->host_id || leader.owner_generation != token->host_generation) { - log_errot(token, "paxos_release %llu other owner %llu %llu %llu", + log_errot(token, "paxos_release other owner " + "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " + "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, + (unsigned long long)last->owner_id, + (unsigned long long)last->owner_generation, + (unsigned long long)last->timestamp, + (unsigned long long)last->write_id, + (unsigned long long)last->write_generation, + (unsigned long long)last->write_timestamp, + (unsigned long long)leader.lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, - (unsigned long long)leader.timestamp); + (unsigned long long)leader.timestamp, + (unsigned long long)leader.write_id, + (unsigned long long)leader.write_generation, + (unsigned long long)leader.write_timestamp); + return SANLK_RELEASE_OWNER; } @@ -1993,19 +2033,45 @@ int paxos_lease_release(struct task *task, * perfectly fine (use log_error since it's interesting * to see when this happens.) */ - log_errot(token, "paxos_release %llu leader different " - "write %llu %llu %llu vs %llu %llu %llu", + log_errot(token, "paxos_release different vals " + "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " + "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, + (unsigned long long)last->owner_id, + (unsigned long long)last->owner_generation, + (unsigned long long)last->timestamp, (unsigned long long)last->write_id, (unsigned long long)last->write_generation, (unsigned long long)last->write_timestamp, + (unsigned long long)leader.lver, + (unsigned long long)leader.owner_id, + (unsigned long long)leader.owner_generation, + (unsigned long long)leader.timestamp, (unsigned long long)leader.write_id, (unsigned long long)leader.write_generation, (unsigned long long)leader.write_timestamp); + /* log_leader_error(0, token, &token->disks[0], last, "paxos_release"); log_leader_error(0, token, &token->disks[0], &leader, "paxos_release"); */ + + /* + * If another host was the writer and committed us as the + * owner, then we don't zero the leader record when we release, + * we just release our dblock (by setting the release flag, + * already done prior to calling paxos_lease_release). This is + * because other hosts will ignore our leader record if we were + * not the writer once we release our dblock. Those other + * hosts will then run a ballot and commit/write a new leader. + * If we are also zeroing the leader, that can race with + * another host writing a new leader, and we could clobber the + * new leader. + */ + if (leader.write_id != token->host_id) { + log_token(token, "paxos_release %llu skip leader write", (unsigned long long)last->lver); + return 0; + } } if (resrename) diff --git a/src/resource.c b/src/resource.c index 6309b9c..81b6d99 100644 --- a/src/resource.c +++ b/src/resource.c @@ -361,8 +361,21 @@ static int write_host_block(struct task *task, struct token *token, log_errot(token, "write_host_block host_id %llu flags %x gen %llu rv %d", (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen, rv); } else { - log_token(token, "write_host_block host_id %llu flags %x gen %llu", - (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen); + if (pd) + log_token(token, "write_host_block host_id %llu flags %x gen %llu dblock %llu:%llu:%llu:%llu:%llu:%llu%s", + (unsigned long long)host_id, + mb_flags, + (unsigned long long)mb_gen, + (unsigned long long)pd->mbal, + (unsigned long long)pd->bal, + (unsigned long long)pd->inp, + (unsigned long long)pd->inp2, + (unsigned long long)pd->inp3, + (unsigned long long)pd->lver, + (pd->flags & DBLOCK_FL_RELEASED) ? ":RELEASED." : "."); + else + log_token(token, "write_host_block host_id %llu flags %x gen %llu dblock 0", + (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen); } if (rv != SANLK_AIO_TIMEOUT) @@ -370,6 +383,29 @@ static int write_host_block(struct task *task, struct token *token, return rv; } +static int write_mblock_zero_dblock_release(struct task *task, struct token *token) +{ + struct paxos_dblock dblock; + + memcpy(&dblock, &token->resource->dblock, sizeof(dblock)); + + dblock.flags = DBLOCK_FL_RELEASED; + + return write_host_block(task, token, token->host_id, 0, 0, &dblock); +} + +static int write_mblock_shared_dblock_release(struct task *task, struct token *token) +{ + struct paxos_dblock dblock; + + memcpy(&dblock, &token->resource->dblock, sizeof(dblock)); + + dblock.flags = DBLOCK_FL_RELEASED; + + return write_host_block(task, token, token->host_id, token->host_generation, + MBLOCK_SHARED, &dblock); +} + static int read_mode_block(struct task *task, struct token *token, uint64_t host_id, struct mode_block *mb_out) { @@ -737,6 +773,11 @@ static int release_disk(struct task *task, struct token *token, * leader says we own the lease, but our dblock is cleared, then our * leader write in release was clobbered, and other hosts will run a * ballot to set a new owner. + * UPDATE to above: we no longer clear our dblock values because that + * can interfere with other hosts running a paxos ballot at the same time, + * instead we now set the DBLOCK_FL_RELEASED flag in our dblock, leaving our + * other dblock values intact, and other hosts look for this flag to indicate + * that we have released. * * [**] For ERASE_ALL we don't want another host running the ballot to select * our dblock values and commit them, making us the owner after we've aborted @@ -838,7 +879,7 @@ static int _release_token(struct task *task, struct token *token, */ if (r_flags & R_ERASE_ALL) { - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) { log_errot(token, "release_token erase all write_host_block %d", rv); ret = rv; @@ -868,7 +909,7 @@ static int _release_token(struct task *task, struct token *token, (unsigned long long)lver, rv); } else if (r_flags & R_UNDO_SHARED) { - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) { log_errot(token, "release_token undo shared write_host_block %d", rv); ret = rv; @@ -889,7 +930,7 @@ static int _release_token(struct task *task, struct token *token, } else if (r_flags & R_SHARED) { /* normal release of sh lease */ - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) { log_errot(token, "release_token shared write_host_block %d", rv); ret = rv; @@ -919,7 +960,7 @@ static int _release_token(struct task *task, struct token *token, } /* Failure here is not a big deal and can be ignored. */ - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release_token write_host_block %d", rv); @@ -1179,6 +1220,7 @@ static int convert_sh2ex_token(struct task *task, struct resource *r, struct tok } memcpy(&r->leader, &leader, sizeof(struct leader_record)); + memcpy(&r->dblock, &dblock, sizeof(dblock)); token->r.lver = leader.lver; /* paxos_lease_acquire set token->shared_count to the number of @@ -1300,8 +1342,7 @@ static int convert_ex2sh_token(struct task *task, struct resource *r, struct tok if (r->flags & R_LVB_WRITE_RELEASE) write_lvb_block(task, r, token); - /* FIXME: preserve dblock */ - rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED, NULL); + rv = write_mblock_shared_dblock_release(task, token); if (rv < 0) { log_errot(token, "convert_ex2sh write_host_block error %d", rv); return rv; @@ -1663,6 +1704,7 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, } memcpy(&r->leader, &leader, sizeof(struct leader_record)); + memcpy(&r->dblock, &dblock, sizeof(dblock)); /* copy lver into token because inquire looks there for it */ if (!(token->acquire_flags & SANLK_RES_SHARED)) @@ -1674,7 +1716,7 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, */ if (token->acquire_flags & SANLK_RES_SHARED) { - rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED, &dblock); + rv = write_mblock_shared_dblock_release(task, token); if (rv < 0) { log_errot(token, "acquire_token sh write_host_block error %d", rv); r->flags &= ~R_SHARED; @@ -2033,7 +2075,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc log_token(token, "release async r_flags %x", r_flags); if (r_flags & R_ERASE_ALL) { - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async erase all write_host_block %d", rv); @@ -2058,7 +2100,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc (unsigned long long)r->leader.lver, rv); } else if (r_flags & R_UNDO_SHARED) { - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async undo shared write_host_block %d", rv); @@ -2075,7 +2117,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc } else if (r_flags & R_SHARED) { /* normal release of sh lease */ - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async shared write_host_block %d", rv); @@ -2094,7 +2136,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc } /* Failure here is not a big deal and can be ignored. */ - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async write_host_block %d", rv); @@ -2256,6 +2298,7 @@ static void *resource_thread(void *arg GNUC_UNUSED) tt->host_generation = r->host_generation; tt->token_id = r->release_token_id; tt->io_timeout = r->io_timeout; + tt->resource = r; /* * Set the time after which we should try to release this diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 472380f..5d799b9 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -130,6 +130,7 @@ struct resource { char killpath[SANLK_HELPER_PATH_LEN]; /* copied from client */ char killargs[SANLK_HELPER_ARGS_LEN]; /* copied from client */ struct leader_record leader; /* copy of last leader_record we wrote */ + struct paxos_dblock dblock; /* copy of last paxos_dblock we wrote */ struct sanlk_resource r; };