From d5e4def0d087dda942e7ac4330b288715603cc61 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Dec 15 2017 19:47:34 +0000 Subject: sanlock: add flags to specify sector size Add flags for specifying lease alignment and sector size to sanlock_write_lockspace() and sanlock_write_resource(): ALIGN1M - create 1MB lockspace/resource which uses 512 byte sector size i/o. ALIGN8M - create 8MB lockspace/resource which uses 4096 byte sector size i/o. When neither flag is set, sanlock uses the existing method for determining alignment and sector size: . On block devices, the sector size reported by libblkid. . On files, always choose 512 byte sector size. The new flags are returned by read_lockspace/read_resource, regardless of whether they were used with write. The sanlock_align() call continues to report the same values as before, which is the size that will be used when no ALIGN flag is given. sanlock_align() can no longer be used to determine the alignment of an existing lockspace/resource. If a lockspace/resource is created with ALIGN8M on a file, it will not be usable by previous versions of sanlock. The sector size is included in the ondisk structures for lockspaces/resources as before. sanlock now reads this value to determine the sector size to use with a lockspace/resource. (In some cases, this requires an extra disk read.) 
--- diff --git a/src/cmd.c b/src/cmd.c index c090819..c946591 100644 --- a/src/cmd.c +++ b/src/cmd.c @@ -369,6 +369,8 @@ static void cmd_acquire(struct task *task, struct cmd_args *ca) token->host_generation = spi.host_generation; token->pid = cl_pid; token->io_timeout = spi.io_timeout; + token->sector_size = spi.sector_size; + token->align_size = spi.align_size; if (cl->restricted & SANLK_RESTRICT_SIGKILL) token->flags |= T_RESTRICT_SIGKILL; if (cl->restricted & SANLK_RESTRICT_SIGTERM) @@ -1028,6 +1030,8 @@ static void cmd_request(struct task *task, struct cmd_args *ca) } token->io_timeout = spi.io_timeout; + token->sector_size = spi.sector_size; + token->align_size = spi.align_size; error = request_token(task, token, force_mode, &owner_id, (ca->header.cmd_flags & SANLK_REQUEST_NEXT_LVER)); @@ -1495,6 +1499,7 @@ static void cmd_read_lockspace(struct task *task, struct cmd_args *ca) struct sanlk_lockspace lockspace; struct sync_disk sd; uint64_t host_id; + int sector_size; int io_timeout = 0; int fd, rv, result; @@ -1534,12 +1539,28 @@ static void cmd_read_lockspace(struct task *task, struct cmd_args *ca) goto reply; } + if (lockspace.flags & SANLK_LSF_ALIGN1M) + sector_size = 512; + else if (lockspace.flags & SANLK_LSF_ALIGN8M) + sector_size = 4096; + else { + /* reads the first leader record to get sector size */ + result = delta_read_lockspace_sector_size(task, &sd, DEFAULT_IO_TIMEOUT, &sector_size); + if (result < 0) + goto out_close; + if ((sector_size != 512) && (sector_size != 4096)) { + result = -EINVAL; + goto out_close; + } + } + /* sets ls->name and io_timeout */ - result = delta_read_lockspace(task, &sd, host_id, &lockspace, + result = delta_read_lockspace(task, &sd, sector_size, host_id, &lockspace, DEFAULT_IO_TIMEOUT, &io_timeout); if (result == SANLK_OK) result = 0; + out_close: close_disks(&sd, 1); reply: log_debug("cmd_read_lockspace %d,%d done %d", ca->ci_in, fd, result); @@ -1626,6 +1647,14 @@ static void cmd_read_resource(struct task *task, struct 
cmd_args *ca) token->io_timeout = DEFAULT_IO_TIMEOUT; + if (res.flags & SANLK_RES_ALIGN1M) + token->sector_size = 512; + else if (res.flags & SANLK_RES_ALIGN8M) + token->sector_size = 4096; + + if (token->sector_size) + token->align_size = sector_size_to_align_size(token->sector_size); + /* sets res.lockspace_name, res.name, res.lver */ result = paxos_read_resource(task, token, &res); if (result == SANLK_OK) @@ -1720,6 +1749,12 @@ static void cmd_read_resource_owners(struct task *task, struct cmd_args *ca) token->io_timeout = DEFAULT_IO_TIMEOUT; + if (res.flags & SANLK_RES_ALIGN1M) + token->sector_size = 512; + else if (res.flags & SANLK_RES_ALIGN8M) + token->sector_size = 4096; + token->align_size = sector_size_to_align_size(token->sector_size); + send_buf = NULL; send_len = 0; @@ -1791,7 +1826,7 @@ static void cmd_write_lockspace(struct task *task, struct cmd_args *ca) if (ca->header.data2) io_timeout = ca->header.data2; - result = delta_lease_init(task, io_timeout, &sd, lockspace.name, max_hosts); + result = delta_lease_init(task, &lockspace, io_timeout, &sd, max_hosts); close_disks(&sd, 1); reply: @@ -1882,6 +1917,15 @@ static void cmd_write_resource(struct task *task, struct cmd_args *ca) token->io_timeout = DEFAULT_IO_TIMEOUT; + if (token->r.flags & SANLK_RES_ALIGN1M) + token->sector_size = 512; + else if (token->r.flags & SANLK_RES_ALIGN8M) + token->sector_size = 4096; + else + token->sector_size = token->disks[0].sector_size; + + token->align_size = sector_size_to_align_size(token->sector_size); + result = paxos_lease_init(task, token, num_hosts, max_hosts, write_clear); close_disks(token->disks, token->r.num_disks); @@ -2168,6 +2212,7 @@ static int print_state_lockspace(struct space *sp, char *str, const char *list_n "list=%s " "space_id=%u " "io_timeout=%d " + "sector_size=%d " "host_generation=%llu " "renew_fail=%d " "space_dead=%d " @@ -2186,6 +2231,7 @@ static int print_state_lockspace(struct space *sp, char *str, const char *list_n list_name, 
sp->space_id, sp->io_timeout, + sp->sector_size, (unsigned long long)sp->host_generation, sp->renew_fail, sp->space_dead, @@ -2213,10 +2259,12 @@ static int print_state_resource(struct resource *r, char *str, const char *list_ snprintf(str, SANLK_STATE_MAXSTR-1, "list=%s " "flags=%x " + "sector_size=%d " "lver=%llu " "token_id=%u", list_name, r->flags, + r->sector_size, (unsigned long long)r->leader.lver, token_id); diff --git a/src/delta_lease.c b/src/delta_lease.c index 93aa418..9b3a4a8 100644 --- a/src/delta_lease.c +++ b/src/delta_lease.c @@ -23,6 +23,7 @@ #include #include "sanlock_internal.h" +#include "sanlock.h" #include "diskio.h" #include "ondisk.h" #include "direct.h" @@ -102,14 +103,6 @@ static int verify_leader(struct sync_disk *disk, goto fail; } - if (lr->sector_size != disk->sector_size) { - log_error("verify_leader %llu wrong sector size %d %d %s", - (unsigned long long)host_id, - lr->sector_size, disk->sector_size, disk->path); - result = SANLK_LEADER_SECTORSIZE; - goto fail; - } - if (strncmp(lr->space_name, space_name, NAME_ID_SIZE)) { log_error("verify_leader %llu wrong space name %.48s %.48s %s", (unsigned long long)host_id, @@ -138,7 +131,7 @@ static int verify_leader(struct sync_disk *disk, memset(&leader_end, 0, sizeof(leader_end)); - rv = read_sectors(disk, host_id - 1, 1, (char *)&leader_end, + rv = read_sectors(disk, lr->sector_size, host_id - 1, 1, (char *)&leader_end, sizeof(struct leader_record), NULL, "delta_verify"); @@ -155,6 +148,7 @@ static int verify_leader(struct sync_disk *disk, int delta_read_lockspace(struct task *task, struct sync_disk *disk, + int sector_size, uint64_t host_id, struct sanlk_lockspace *ls, int io_timeout, @@ -170,7 +164,7 @@ int delta_read_lockspace(struct task *task, memset(&leader_end, 0, sizeof(struct leader_record)); - rv = read_sectors(disk, host_id - 1, 1, (char *)&leader_end, sizeof(struct leader_record), + rv = read_sectors(disk, sector_size, host_id - 1, 1, (char *)&leader_end, sizeof(struct 
leader_record), task, io_timeout, "read_lockspace"); if (rv < 0) return rv; @@ -191,12 +185,52 @@ int delta_read_lockspace(struct task *task, memcpy(ls->name, leader.space_name, SANLK_NAME_LEN); ls->host_id = host_id; *io_timeout_ret = leader.io_timeout; + + if (leader.sector_size == 512) + ls->flags |= SANLK_LSF_ALIGN1M; + else if (leader.sector_size == 4096) + ls->flags |= SANLK_LSF_ALIGN8M; } return error; } -int delta_lease_leader_read(struct task *task, int io_timeout, +int delta_read_lockspace_sector_size(struct task *task, + struct sync_disk *disk, + int io_timeout, + int *sector_size) +{ + struct leader_record leader_end; + struct leader_record leader; + int rv; + + memset(&leader_end, 0, sizeof(struct leader_record)); + + /* + * read the first 4k, which either includes one 4k delta lease or 8 512b + * delta leases. In either case, we only look at the initial leader + * record to get to the sector size. + */ + + rv = read_sectors(disk, 4096, 0, 1, (char *)&leader_end, sizeof(struct leader_record), + task, io_timeout, "read_lockspace_sector_size"); + if (rv < 0) + return rv; + + leader_record_in(&leader_end, &leader); + + if (leader.magic != DELTA_DISK_MAGIC) + return SANLK_LEADER_MAGIC; + + if ((leader.version & 0xFFFF0000) != DELTA_DISK_VERSION_MAJOR) + return SANLK_LEADER_VERSION; + + *sector_size = leader.sector_size; + + return SANLK_OK; +} + +int delta_lease_leader_read(struct task *task, int sector_size, int io_timeout, struct sync_disk *disk, char *space_name, uint64_t host_id, @@ -208,12 +242,17 @@ int delta_lease_leader_read(struct task *task, int io_timeout, uint32_t checksum; int rv, error; + if (!sector_size) { + log_error("delta_lease_leader_read with zero sector_size %s", space_name); + return -EINVAL; + } + /* host_id N is block offset N-1 */ memset(&leader_end, 0, sizeof(struct leader_record)); memset(leader_ret, 0, sizeof(struct leader_record)); - rv = read_sectors(disk, host_id - 1, 1, (char *)&leader_end, sizeof(struct leader_record), + 
rv = read_sectors(disk, sector_size, host_id - 1, 1, (char *)&leader_end, sizeof(struct leader_record), task, io_timeout, "delta_leader"); if (rv < 0) return rv; @@ -246,7 +285,7 @@ int delta_lease_leader_clobber(struct task *task, int io_timeout, leader_record_out(leader, &leader_end); - rv = write_sector(disk, host_id - 1, (char *)&leader_end, sizeof(struct leader_record), + rv = write_sector(disk, leader->sector_size, host_id - 1, (char *)&leader_end, sizeof(struct leader_record), task, io_timeout, caller); if (rv < 0) return rv; @@ -287,7 +326,7 @@ int delta_lease_acquire(struct task *task, log_space(sp, "delta_acquire begin %.48s:%llu", sp->space_name, (unsigned long long)host_id); - error = delta_lease_leader_read(task, sp->io_timeout, disk, space_name, host_id, &leader, + error = delta_lease_leader_read(task, sp->sector_size, sp->io_timeout, disk, space_name, host_id, &leader, "delta_acquire_begin"); if (error < 0) { log_space(sp, "delta_acquire leader_read1 error %d", error); @@ -363,7 +402,7 @@ int delta_lease_acquire(struct task *task, sleep(1); } - error = delta_lease_leader_read(task, sp->io_timeout, disk, space_name, host_id, + error = delta_lease_leader_read(task, sp->sector_size, sp->io_timeout, disk, space_name, host_id, &leader, "delta_acquire_wait"); if (error < 0) { log_space(sp, "delta_acquire leader_read2 error %d", error); @@ -409,7 +448,7 @@ int delta_lease_acquire(struct task *task, leader.checksum = checksum; leader_end.checksum = cpu_to_le32(checksum); - rv = write_sector(disk, host_id - 1, (char *)&leader_end, sizeof(struct leader_record), + rv = write_sector(disk, sp->sector_size, host_id - 1, (char *)&leader_end, sizeof(struct leader_record), task, sp->io_timeout, "delta_leader"); if (rv < 0) { log_space(sp, "delta_acquire write error %d", rv); @@ -430,7 +469,7 @@ int delta_lease_acquire(struct task *task, sleep(1); } - error = delta_lease_leader_read(task, sp->io_timeout, disk, space_name, host_id, &leader, + error = 
delta_lease_leader_read(task, sp->sector_size, sp->io_timeout, disk, space_name, host_id, &leader, "delta_acquire_check"); if (error < 0) { log_space(sp, "delta_acquire leader_read3 error %d", error); @@ -494,7 +533,7 @@ int delta_lease_renew(struct task *task, iobuf_len = sp->align_size; - sector_size = disk->sector_size; + sector_size = sp->sector_size; /* offset of our leader_record */ id_offset = (host_id - 1) * sector_size; @@ -755,7 +794,7 @@ int delta_lease_release(struct task *task, leader.checksum = checksum; leader_end.checksum = cpu_to_le32(checksum); - rv = write_sector(disk, host_id - 1, (char *)&leader_end, sizeof(struct leader_record), + rv = write_sector(disk, sp->sector_size, host_id - 1, (char *)&leader_end, sizeof(struct leader_record), task, sp->io_timeout, "delta_leader"); if (rv < 0) { log_space(sp, "delta_release write error %d", rv); @@ -775,9 +814,9 @@ int delta_lease_release(struct task *task, block device disk->path */ int delta_lease_init(struct task *task, + struct sanlk_lockspace *ls, int io_timeout, struct sync_disk *disk, - char *space_name, int max_hosts) { struct leader_record leader_first; @@ -785,6 +824,7 @@ int delta_lease_init(struct task *task, struct leader_record leader; char *iobuf, **p_iobuf; int iobuf_len; + int sector_size; int align_size; int i, rv; uint32_t checksum; @@ -798,11 +838,16 @@ int delta_lease_init(struct task *task, if (!io_timeout) io_timeout = DEFAULT_IO_TIMEOUT; - align_size = direct_align(disk); - if (align_size < 0) - return align_size; + if (ls->flags & SANLK_LSF_ALIGN1M) + sector_size = 512; + else if (ls->flags & SANLK_LSF_ALIGN8M) + sector_size = 4096; + else + sector_size = disk->sector_size; + + align_size = sector_size_to_align_size(sector_size); - if (disk->sector_size * max_hosts > align_size) + if (sector_size * max_hosts > align_size) return -E2BIG; iobuf_len = align_size; @@ -821,11 +866,11 @@ int delta_lease_init(struct task *task, memset(&leader, 0, sizeof(struct leader_record)); 
leader.magic = DELTA_DISK_MAGIC; leader.version = DELTA_DISK_VERSION_MAJOR | DELTA_DISK_VERSION_MINOR; - leader.sector_size = disk->sector_size; + leader.sector_size = sector_size; leader.max_hosts = 1; leader.timestamp = LEASE_FREE; leader.io_timeout = io_timeout; - strncpy(leader.space_name, space_name, NAME_ID_SIZE); + strncpy(leader.space_name, ls->name, NAME_ID_SIZE); leader.checksum = 0; /* set below */ /* make the first record invalid so we can do a single atomic @@ -844,7 +889,7 @@ int delta_lease_init(struct task *task, leader.checksum = checksum; leader_end.checksum = cpu_to_le32(checksum); - memcpy(iobuf + (i * disk->sector_size), &leader_end, sizeof(struct leader_record)); + memcpy(iobuf + (i * sector_size), &leader_end, sizeof(struct leader_record)); } rv = write_iobuf(disk->fd, disk->offset, iobuf, iobuf_len, task, io_timeout, NULL); @@ -867,7 +912,7 @@ int delta_lease_init(struct task *task, memcpy(iobuf, &leader_end, sizeof(struct leader_record)); - rv = write_iobuf(disk->fd, disk->offset, iobuf, disk->sector_size, task, io_timeout, NULL); + rv = write_iobuf(disk->fd, disk->offset, iobuf, sector_size, task, io_timeout, NULL); out: if (rv != SANLK_AIO_TIMEOUT) free(iobuf); diff --git a/src/delta_lease.h b/src/delta_lease.h index 0cfca29..fcaebb0 100644 --- a/src/delta_lease.h +++ b/src/delta_lease.h @@ -10,6 +10,7 @@ #define __DELTA_LEASE_H__ int delta_lease_leader_read(struct task *task, + int sector_size, int io_timeout, struct sync_disk *disk, char *space_name, @@ -46,18 +47,24 @@ int delta_lease_release(struct task *task, struct leader_record *leader_ret); int delta_lease_init(struct task *task, + struct sanlk_lockspace *ls, int io_timeout, struct sync_disk *disk, - char *space_name, int max_hosts); int delta_read_lockspace(struct task *task, struct sync_disk *disk, + int sector_sze, uint64_t host_id, struct sanlk_lockspace *ls, int io_timeout, int *io_timeout_ret); +int delta_read_lockspace_sector_size(struct task *task, + struct sync_disk 
*disk, + int io_timeout, + int *sector_size); + int delta_lease_leader_clobber(struct task *task, int io_timeout, struct sync_disk *disk, uint64_t host_id, diff --git a/src/direct.c b/src/direct.c index 3d4832a..0633f75 100644 --- a/src/direct.c +++ b/src/direct.c @@ -32,6 +32,38 @@ #include "delta_lease.h" #include "timeouts.h" +static int direct_read_leader_sector_size(struct task *task, struct sync_disk *sd) +{ + struct leader_record *lr_end; + struct leader_record lr_in; + char *data; + int sector_size = 0; + int datalen; + int rv; + + datalen = 4096; + data = malloc(datalen); + + if (!data) + return 0; + + memset(data, 0, datalen); + + rv = read_sectors(sd, 4096, 0, 1, data, datalen, task, DEFAULT_IO_TIMEOUT, "read_sector_size"); + if (rv < 0) + goto out; + + lr_end = (struct leader_record *)data; + + leader_record_in(lr_end, &lr_in); + + if ((lr_in.magic == DELTA_DISK_MAGIC) || (lr_in.magic == PAXOS_DISK_MAGIC)) + sector_size = lr_in.sector_size; + out: + free(data); + return sector_size; +} + /* * cli: sanlock direct init * cli: sanlock direct read_leader @@ -81,6 +113,7 @@ static int do_paxos_action(int action, struct task *task, int io_timeout, struct struct token *token; struct leader_record leader; struct paxos_dblock dblock; + int sector_size; int disks_len, token_len; int j, rv = 0; @@ -117,10 +150,20 @@ static int do_paxos_action(int action, struct task *task, int io_timeout, struct switch (action) { case ACT_DIRECT_INIT: + sector_size = com.sector_size ? com.sector_size : + token->disks[0].sector_size; + token->sector_size = sector_size; + token->align_size = sector_size_to_align_size(sector_size); + rv = paxos_lease_init(task, token, num_hosts, max_hosts, write_clear); break; case ACT_ACQUIRE: + sector_size = com.sector_size ? 
com.sector_size : + direct_read_leader_sector_size(task, &token->disks[0]); + token->sector_size = sector_size; + token->align_size = sector_size_to_align_size(sector_size); + token->host_id = local_host_id; token->host_generation = local_host_generation; @@ -128,17 +171,34 @@ static int do_paxos_action(int action, struct task *task, int io_timeout, struct break; case ACT_RELEASE: + sector_size = com.sector_size ? com.sector_size : 4096; + token->sector_size = sector_size; + token->align_size = sector_size_to_align_size(sector_size); + rv = paxos_lease_leader_read(task, token, &leader, "direct_release"); if (rv < 0) break; + + sector_size = leader.sector_size; + token->sector_size = sector_size; + token->align_size = sector_size_to_align_size(sector_size); + rv = paxos_lease_release(task, token, NULL, &leader, leader_ret); break; case ACT_READ_LEADER: + sector_size = com.sector_size ? com.sector_size : 4096; + token->sector_size = sector_size; + token->align_size = sector_size_to_align_size(sector_size); + rv = paxos_lease_leader_read(task, token, &leader, "direct_read_leader"); break; case ACT_WRITE_LEADER: + sector_size = leader_in->sector_size; + token->sector_size = sector_size; + token->align_size = sector_size_to_align_size(sector_size); + rv = paxos_lease_leader_clobber(task, token, leader_in, "direct_clobber"); break; } @@ -196,6 +256,7 @@ static int do_delta_action(int action, struct sync_disk sd; struct space space; char bitmap[HOSTID_BITMAP_SIZE]; + int sector_size; int read_result, rv; int rd_ms, wr_ms; @@ -204,6 +265,8 @@ static int do_delta_action(int action, if (!io_timeout) io_timeout = DEFAULT_IO_TIMEOUT; + memset(&leader, 0, sizeof(leader)); + /* for log_space in delta functions */ memset(&space, 0, sizeof(space)); space.io_timeout = io_timeout; @@ -224,10 +287,24 @@ static int do_delta_action(int action, switch (action) { case ACT_DIRECT_INIT: - rv = delta_lease_init(task, io_timeout, &sd, ls->name, max_hosts); + sector_size = com.sector_size ? 
com.sector_size : sd.sector_size; + + if (sector_size == 512) + ls->flags |= SANLK_LSF_ALIGN1M; + else if (sector_size == 4096) + ls->flags |= SANLK_LSF_ALIGN8M; + + rv = delta_lease_init(task, ls, io_timeout, &sd, max_hosts); break; case ACT_ACQUIRE_ID: + sector_size = direct_read_leader_sector_size(task, &sd); + if (!sector_size) + return rv; + + space.sector_size = sector_size; + space.align_size = sector_size_to_align_size(sector_size); + rv = delta_lease_acquire(task, &space, &sd, ls->name, our_host_name, @@ -235,13 +312,21 @@ static int do_delta_action(int action, &leader); break; case ACT_RENEW_ID: - rv = delta_lease_leader_read(task, io_timeout, &sd, + sector_size = direct_read_leader_sector_size(task, &sd); + if (!sector_size) + return rv; + + space.sector_size = sector_size; + space.align_size = sector_size_to_align_size(sector_size); + + rv = delta_lease_leader_read(task, sector_size, io_timeout, &sd, ls->name, ls->host_id, &leader, "direct_renew"); if (rv < 0) return rv; + rv = delta_lease_renew(task, &space, &sd, ls->name, bitmap, @@ -254,20 +339,32 @@ static int do_delta_action(int action, &rd_ms, &wr_ms); break; case ACT_RELEASE_ID: - rv = delta_lease_leader_read(task, io_timeout, &sd, + sector_size = direct_read_leader_sector_size(task, &sd); + if (!sector_size) + return rv; + + space.sector_size = sector_size; + space.align_size = sector_size_to_align_size(leader.sector_size); + + rv = delta_lease_leader_read(task, sector_size, io_timeout, &sd, ls->name, ls->host_id, &leader, "direct_release"); if (rv < 0) return rv; + rv = delta_lease_release(task, &space, &sd, ls->name, &leader, &leader); break; case ACT_READ_LEADER: - rv = delta_lease_leader_read(task, io_timeout, &sd, + sector_size = direct_read_leader_sector_size(task, &sd); + if (!sector_size) + return rv; + + rv = delta_lease_leader_read(task, sector_size, io_timeout, &sd, ls->name, ls->host_id, &leader, @@ -411,7 +508,7 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) 
uint64_t sector_nr; uint64_t dump_size = 0; uint64_t end_sector_nr; - int sector_count, datalen, align_size; + int sector_size, sector_count, datalen, align_size; int i, rv, b; memset(&sd, 0, sizeof(struct sync_disk)); @@ -435,19 +532,18 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) if (rv < 0) return -ENODEV; - rv = direct_align(&sd); - if (rv < 0) - goto out_close; - - align_size = rv; + sector_size = com.sector_size ? com.sector_size : + direct_read_leader_sector_size(task, &sd); + align_size = sector_size_to_align_size(sector_size); + sector_count = align_size / sector_size; datalen = align_size; - sector_count = align_size / sd.sector_size; data = malloc(datalen); if (!data) { rv = -ENOMEM; goto out_close; } + memset(data, 0, datalen); printf("%8s %36s %48s %10s %4s %4s %s", "offset", @@ -471,7 +567,7 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) memset(rname, 0, sizeof(rname)); memset(data, 0, sd.sector_size); - rv = read_sectors(&sd, sector_nr, sector_count, data, datalen, + rv = read_sectors(&sd, sector_size, sector_nr, sector_count, data, datalen, task, DEFAULT_IO_TIMEOUT, "dump"); lr_end = (struct leader_record *)data; @@ -481,7 +577,7 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) if (lr->magic == DELTA_DISK_MAGIC) { for (i = 0; i < sector_count; i++) { - lr_end = (struct leader_record *)(data + (i * sd.sector_size)); + lr_end = (struct leader_record *)(data + (i * sector_size)); if (!lr_end->magic) continue; @@ -497,7 +593,7 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) strncpy(rname, lr->resource_name, NAME_ID_SIZE); printf("%08llu %36s %48s %010llu %04llu %04llu", - (unsigned long long)((sector_nr + i) * sd.sector_size), + (unsigned long long)((sector_nr + i) * sector_size), sname, rname, (unsigned long long)lr->timestamp, (unsigned long long)lr->owner_id, @@ -517,7 +613,7 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) 
strncpy(rname, lr->resource_name, NAME_ID_SIZE); printf("%08llu %36s %48s %010llu %04llu %04llu %llu", - (unsigned long long)(sector_nr * sd.sector_size), + (unsigned long long)(sector_nr * sector_size), sname, rname, (unsigned long long)lr->timestamp, (unsigned long long)lr->owner_id, @@ -533,7 +629,7 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) printf("\n"); for (i = 0; i < lr->num_hosts; i++) { - char *pd_end = data + ((2 + i) * sd.sector_size); + char *pd_end = data + ((2 + i) * sector_size); struct mode_block *mb_end = (struct mode_block *)(pd_end + MBLOCK_OFFSET); if (force_mode > 1) { @@ -583,7 +679,7 @@ int direct_next_free(struct task *task, char *path) struct leader_record lr; struct sync_disk sd; uint64_t sector_nr; - int sector_count, datalen, align_size; + int sector_size, sector_count, datalen, align_size; int rv; memset(&sd, 0, sizeof(struct sync_disk)); @@ -602,13 +698,13 @@ int direct_next_free(struct task *task, char *path) if (rv < 0) return -ENODEV; - rv = direct_align(&sd); - if (rv < 0) - goto out_close; + sector_size = direct_read_leader_sector_size(task, &sd); + if (!sector_size) + return -EINVAL; - align_size = rv; - datalen = sd.sector_size; - sector_count = align_size / sd.sector_size; + align_size = sector_size_to_align_size(sector_size); + sector_count = align_size / sector_size; + datalen = sector_size; data = malloc(datalen); if (!data) { @@ -620,9 +716,9 @@ int direct_next_free(struct task *task, char *path) rv = -ENOSPC; while (1) { - memset(data, 0, sd.sector_size); + memset(data, 0, sector_size); - rv = read_sectors(&sd, sector_nr, 1, data, datalen, + rv = read_sectors(&sd, sector_size, sector_nr, 1, data, datalen, task, DEFAULT_IO_TIMEOUT, "next_free"); lr_end = (struct leader_record *)data; @@ -630,7 +726,7 @@ int direct_next_free(struct task *task, char *path) leader_record_in(lr_end, &lr); if (lr.magic != DELTA_DISK_MAGIC && lr.magic != PAXOS_DISK_MAGIC) { - printf("%llu\n", (unsigned long 
long)(sector_nr * sd.sector_size)); + printf("%llu\n", (unsigned long long)(sector_nr * sector_size)); rv = 0; goto out_free; } diff --git a/src/diskio.c b/src/diskio.c index 57fcaa5..95c567c 100644 --- a/src/diskio.c +++ b/src/diskio.c @@ -393,6 +393,8 @@ static int do_linux_aio(int fd, uint64_t offset, char *buf, int len, struct io_event event; struct timespec begin, end, diff; const char *op_str; + const char *len_str; + char ms_str[8]; int rv; if (!ioto) { @@ -415,6 +417,27 @@ static int do_linux_aio(int fd, uint64_t offset, char *buf, int len, iocb->u.c.nbytes = len; iocb->u.c.offset = offset; + if (cmd == IO_CMD_PREAD) + op_str = "RD"; + else if (cmd == IO_CMD_PWRITE) + op_str = "WR"; + else + op_str = "UK"; + + if (com.debug_io_submit) { + if (len == ONEMB) + len_str = "1MB"; + else if (len == (8 * ONEMB)) + len_str = "8MB"; + else + len_str = NULL; + + if (len_str) + log_taskd(task, "%s %s at %llu", op_str, len_str, (unsigned long long)offset); + else + log_taskd(task, "%s %d at %llu", op_str, len, (unsigned long long)offset); + } + if (ms) clock_gettime(CLOCK_MONOTONIC_RAW, &begin); @@ -449,12 +472,6 @@ static int do_linux_aio(int fd, uint64_t offset, char *buf, int len, struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb); int op = ev_iocb ? 
ev_iocb->aio_lio_opcode : -1; - if (ms) { - clock_gettime(CLOCK_MONOTONIC_RAW, &end); - ts_diff(&begin, &end, &diff); - *ms = (diff.tv_sec * 1000) + (diff.tv_nsec / 1000000); - } - if (op == IO_CMD_PREAD) op_str = "RD"; else if (op == IO_CMD_PWRITE) @@ -462,6 +479,12 @@ static int do_linux_aio(int fd, uint64_t offset, char *buf, int len, else op_str = "UK"; + if (ms) { + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + ts_diff(&begin, &end, &diff); + *ms = (diff.tv_sec * 1000) + (diff.tv_nsec / 1000000); + } + ev_aicb->used = 0; if (ev_iocb != iocb) { @@ -485,6 +508,28 @@ static int do_linux_aio(int fd, uint64_t offset, char *buf, int len, } /* standard success case */ + + if (com.debug_io_complete) { + if (len == ONEMB) + len_str = "1MB"; + else if (len == (8 * ONEMB)) + len_str = "8MB"; + else + len_str = NULL; + + if (ms) { + memset(ms_str, 0, sizeof(ms_str)); + snprintf(ms_str, 7, "%u", *ms); + } + + if (len_str) + log_taskd(task, "%s %s at %llu done %s", + op_str, len_str, (unsigned long long)offset, ms ? ms_str : ""); + else + log_taskd(task, "%s %d at %llu done %s", + op_str, len, (unsigned long long)offset, ms ? 
ms_str : ""); + } + rv = 0; goto out; } @@ -651,7 +696,7 @@ int write_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len, return do_write(fd, offset, iobuf, iobuf_len, task); } -static int _write_sectors(const struct sync_disk *disk, uint64_t sector_nr, +static int _write_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr, uint32_t sector_count GNUC_UNUSED, const char *data, int data_len, int iobuf_len, struct task *task, int ioto, @@ -661,10 +706,7 @@ static int _write_sectors(const struct sync_disk *disk, uint64_t sector_nr, uint64_t offset; int rv; - if (!disk->sector_size) - return -EINVAL; - - offset = disk->offset + (sector_nr * disk->sector_size); + offset = disk->offset + (sector_nr * sector_size); p_iobuf = &iobuf; @@ -696,12 +738,17 @@ static int _write_sectors(const struct sync_disk *disk, uint64_t sector_nr, the start of the block device identified by disk->path, data_len must be <= sector_size */ -int write_sector(const struct sync_disk *disk, uint64_t sector_nr, +int write_sector(const struct sync_disk *disk, int sector_size, uint64_t sector_nr, const char *data, int data_len, struct task *task, int ioto, const char *blktype) { - int iobuf_len = disk->sector_size; + int iobuf_len = sector_size; + + if ((sector_size != 4096) && (sector_size != 512)) { + log_error("write_sector bad sector_size %d", sector_size); + return -EINVAL; + } if (data_len > iobuf_len) { log_error("write_sector %s data_len %d max %d %s", @@ -709,26 +756,31 @@ int write_sector(const struct sync_disk *disk, uint64_t sector_nr, return -1; } - return _write_sectors(disk, sector_nr, 1, data, data_len, + return _write_sectors(disk, sector_size, sector_nr, 1, data, data_len, iobuf_len, task, ioto, blktype); } /* write multiple complete sectors, data_len must be multiple of sector size */ -int write_sectors(const struct sync_disk *disk, uint64_t sector_nr, +int write_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr, uint32_t 
sector_count, const char *data, int data_len, struct task *task, int ioto, const char *blktype) { int iobuf_len = data_len; - if (data_len != sector_count * disk->sector_size) { + if ((sector_size != 4096) && (sector_size != 512)) { + log_error("write_sectors bad sector_size %d", sector_size); + return -EINVAL; + } + + if (data_len != sector_count * sector_size) { log_error("write_sectors %s data_len %d sector_count %d %s", blktype, data_len, sector_count, disk->path); return -1; } - return _write_sectors(disk, sector_nr, sector_count, data, data_len, + return _write_sectors(disk, sector_size, sector_nr, sector_count, data, data_len, iobuf_len, task, ioto, blktype); } @@ -751,7 +803,7 @@ int read_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len, when reading multiple sectors, data_len will generally equal iobuf_len, but when reading one sector, data_len may be less than iobuf_len. */ -int read_sectors(const struct sync_disk *disk, uint64_t sector_nr, +int read_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr, uint32_t sector_count, char *data, int data_len, struct task *task, int ioto, const char *blktype) @@ -761,13 +813,13 @@ int read_sectors(const struct sync_disk *disk, uint64_t sector_nr, int iobuf_len; int rv; - if (!disk->sector_size) { - log_error("read_sectors %s zero sector_size", blktype); + if ((sector_size != 512) && (sector_size != 4096)) { + log_error("read_sectors %s bad sector_size %d", blktype, sector_size); return -EINVAL; } - iobuf_len = sector_count * disk->sector_size; - offset = disk->offset + (sector_nr * disk->sector_size); + iobuf_len = sector_count * sector_size; + offset = disk->offset + (sector_nr * sector_size); p_iobuf = &iobuf; diff --git a/src/diskio.h b/src/diskio.h index 6cb1b4f..a7ba50d 100644 --- a/src/diskio.h +++ b/src/diskio.h @@ -34,17 +34,17 @@ int read_iobuf_reap(int fd, uint64_t offset, char *iobuf, int iobuf_len, * for io, copy out of it for write, and free it */ -int write_sector(const 
struct sync_disk *disk, uint64_t sector_nr, +int write_sector(const struct sync_disk *disk, int sector_size, uint64_t sector_nr, const char *data, int data_len, struct task *task, int ioto, const char *blktype); -int write_sectors(const struct sync_disk *disk, uint64_t sector_nr, +int write_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr, uint32_t sector_count, const char *data, int data_len, struct task *task, int ioto, const char *blktype); -int read_sectors(const struct sync_disk *disk, uint64_t sector_nr, +int read_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr, uint32_t sector_count, char *data, int data_len, struct task *task, int ioto, const char *blktype); diff --git a/src/leader.h b/src/leader.h index d67c243..2704299 100644 --- a/src/leader.h +++ b/src/leader.h @@ -74,6 +74,9 @@ struct leader_record { #define HOSTID_BITMAP_OFFSET 256 #define HOSTID_BITMAP_SIZE 256 +/* the request record is in the sector following the leader record + for a paxos lease. 
*/ + #define REQ_DISK_MAGIC 0x08292011 #define REQ_DISK_VERSION_MAJOR 0x00010000 #define REQ_DISK_VERSION_MINOR 0x00000001 diff --git a/src/lockspace.c b/src/lockspace.c index e9bba20..5ab2be4 100644 --- a/src/lockspace.c +++ b/src/lockspace.c @@ -104,6 +104,8 @@ int _lockspace_info(const char *space_name, struct space_info *spi) spi->space_id = sp->space_id; spi->io_timeout = sp->io_timeout; + spi->sector_size = sp->sector_size; + spi->align_size = sp->align_size; spi->host_id = sp->host_id; spi->host_generation = sp->host_generation; spi->killing_pids = sp->killing_pids; @@ -124,7 +126,7 @@ int lockspace_info(const char *space_name, struct space_info *spi) return rv; } -int lockspace_disk(char *space_name, struct sync_disk *disk) +int lockspace_disk(char *space_name, struct sync_disk *disk, int *sector_size) { struct space *sp; int rv = -1; @@ -135,6 +137,7 @@ int lockspace_disk(char *space_name, struct sync_disk *disk) continue; memcpy(disk, &sp->host_id_disk, sizeof(struct sync_disk)); + *sector_size = sp->sector_size; disk->fd = -1; rv = 0; } @@ -287,15 +290,12 @@ void check_other_leases(struct space *sp, char *buf) struct leader_record leader_in; struct leader_record *leader_end; struct leader_record *leader; - struct sync_disk *disk; struct host_status *hs; struct sanlk_host_event he; char *bitmap; uint64_t now; int i, new; - disk = &sp->host_id_disk; - now = monotime(); new = 0; @@ -306,7 +306,7 @@ void check_other_leases(struct space *sp, char *buf) if (!hs->first_check) hs->first_check = now; - leader_end = (struct leader_record *)(buf + (i * disk->sector_size)); + leader_end = (struct leader_record *)(buf + (i * sp->sector_size)); leader_record_in(leader_end, &leader_in); leader = &leader_in; @@ -617,12 +617,26 @@ static void *lockspace_thread(void *arg_in) } opened = 1; - sp->align_size = direct_align(&sp->host_id_disk); - if (sp->align_size < 0) { - log_erros(sp, "direct_align error"); - acquire_result = sp->align_size; - delta_result = -1; - goto 
set_status; + if (!sp->sector_size) { + int ss = 0; + + rv = delta_read_lockspace_sector_size(&task, &sp->host_id_disk, sp->io_timeout, &ss); + if (rv < 0) { + log_erros(sp, "failed to read device to find sector size error %d %s", rv, sp->host_id_disk.path); + acquire_result = rv; + delta_result = -1; + goto set_status; + } + + if ((ss != 512) && (ss != 4096)) { + log_erros(sp, "failed to get valid sector size %d %s", ss, sp->host_id_disk.path); + acquire_result = SANLK_LEADER_SECTORSIZE; + delta_result = -1; + goto set_status; + } + + sp->sector_size = ss; + sp->align_size = sector_size_to_align_size(ss); } sp->lease_status.renewal_read_buf = malloc(sp->align_size); diff --git a/src/lockspace.h b/src/lockspace.h index 6398b6d..f833efe 100644 --- a/src/lockspace.h +++ b/src/lockspace.h @@ -21,7 +21,7 @@ int _lockspace_info(const char *space_name, struct space_info *spi); int lockspace_info(const char *space_name, struct space_info *spi); /* locks spaces_mutex */ -int lockspace_disk(char *space_name, struct sync_disk *disk); +int lockspace_disk(char *space_name, struct sync_disk *disk, int *sector_size); /* locks spaces_mutex */ int host_info(char *space_name, uint64_t host_id, struct host_status *hs_out); diff --git a/src/main.c b/src/main.c index 0117183..962486f 100644 --- a/src/main.c +++ b/src/main.c @@ -56,8 +56,6 @@ #include "timeouts.h" #include "paxos_lease.h" -#define ONEMB 1048576 - #define SIGRUNPATH 100 /* anything that's not SIGTERM/SIGKILL */ struct thread_pool { @@ -1835,7 +1833,7 @@ static void print_usage(void) printf("sanlock client set_config -s LOCKSPACE [-u 0|1] [-O 0|1]\n"); printf("sanlock client log_dump\n"); printf("sanlock client shutdown [-f 0|1] [-w 0|1]\n"); - printf("sanlock client init -s LOCKSPACE | -r RESOURCE [-z 0|1]\n"); + printf("sanlock client init -s LOCKSPACE | -r RESOURCE [-z 0|1] [-Z 512|4096]\n"); printf("sanlock client read -s LOCKSPACE | -r RESOURCE\n"); printf("sanlock client align -s LOCKSPACE\n"); printf("sanlock 
client add_lockspace -s LOCKSPACE\n"); @@ -1849,7 +1847,7 @@ static void print_usage(void) printf("sanlock client request -r RESOURCE -f \n"); printf("sanlock client examine -r RESOURCE | -s LOCKSPACE\n"); printf("\n"); - printf("sanlock direct [-a 0|1] [-o 0|1]\n"); + printf("sanlock direct [-a 0|1] [-o 0|1] [-Z 512|4096]\n"); printf("sanlock direct init -s LOCKSPACE | -r RESOURCE\n"); printf("sanlock direct read_leader -s LOCKSPACE | -r RESOURCE\n"); printf("sanlock direct dump [:[:]]\n"); @@ -2168,6 +2166,13 @@ static int read_command_line(int argc, char *argv[]) case 'c': begin_command = 1; break; + + case 'Z': + com.sector_size = atoi(optionarg); + if ((com.sector_size != 512) && (com.sector_size != 4096)) + com.sector_size = 0; + break; + default: log_tool("unknown option: %c", optchar); exit(EXIT_FAILURE); @@ -2355,6 +2360,14 @@ static void read_config_file(void) } else if (!strcmp(str, "paxos_debug_all")) { get_val_int(line, &val); com.paxos_debug_all = val; + + } else if (!strcmp(str, "debug_io")) { + memset(str, 0, sizeof(str)); + get_val_str(line, str); + if (strstr(str, "submit")) + com.debug_io_submit = 1; + if (strstr(str, "complete")) + com.debug_io_complete = 1; } } @@ -2470,8 +2483,18 @@ static int do_client_read(void) int rv, i, hss_count = 0; if (com.lockspace.host_id_disk.path[0]) { + if (com.sector_size == 512) + com.lockspace.flags |= SANLK_LSF_ALIGN1M; + else if (com.sector_size == 4096) + com.lockspace.flags |= SANLK_LSF_ALIGN8M; + rv = sanlock_read_lockspace(&com.lockspace, 0, &io_timeout); } else { + if (com.sector_size == 512) + com.res_args[0]->flags |= SANLK_RES_ALIGN1M; + else if (com.sector_size == 4096) + com.res_args[0]->flags |= SANLK_RES_ALIGN8M; + if (!com.get_hosts) { rv = sanlock_read_resource(com.res_args[0], 0); } else { @@ -2761,15 +2784,27 @@ static int do_client(void) case ACT_CLIENT_INIT: log_tool("init"); - if (com.lockspace.host_id_disk.path[0]) + if (com.lockspace.host_id_disk.path[0]) { + if (com.sector_size == 512) + 
com.lockspace.flags |= SANLK_LSF_ALIGN1M; + else if (com.sector_size == 4096) + com.lockspace.flags |= SANLK_LSF_ALIGN8M; + rv = sanlock_write_lockspace(&com.lockspace, com.max_hosts, 0, com.io_timeout_arg); - else + } else { + if (com.sector_size == 512) + com.res_args[0]->flags |= SANLK_RES_ALIGN1M; + else if (com.sector_size == 4096) + com.res_args[0]->flags |= SANLK_RES_ALIGN8M; + rv = sanlock_write_resource(com.res_args[0], com.max_hosts, com.num_hosts, com.clear_arg ? SANLK_WRITE_CLEAR : 0); + } + log_tool("init done %d", rv); break; diff --git a/src/paxos_dblock.h b/src/paxos_dblock.h index 7d08c95..da80a19 100644 --- a/src/paxos_dblock.h +++ b/src/paxos_dblock.h @@ -10,6 +10,10 @@ #ifndef __PAXOS_DBLOCK_H__ #define __PAXOS_DBLOCK_H__ +/* The first dblock (for host_id 1) is in the third sector of a paxos lease. + The first sector holds the leader record, and the second sector holds the + request record. */ + #define DBLOCK_CHECKSUM_LEN 48 /* ends before checksum field */ #define DBLOCK_FL_RELEASED 0x00000001 diff --git a/src/paxos_lease.c b/src/paxos_lease.c index a1f43e3..09e4f59 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -81,7 +81,7 @@ int paxos_lease_request_read(struct task *task, struct token *token, /* 1 = request record is second sector */ - rv = read_sectors(&token->disks[0], 1, 1, (char *)&rr_end, + rv = read_sectors(&token->disks[0], token->sector_size, 1, 1, (char *)&rr_end, sizeof(struct request_record), task, token->io_timeout, "request"); if (rv < 0) @@ -100,7 +100,7 @@ int paxos_lease_request_write(struct task *task, struct token *token, request_record_out(rr, &rr_end); - rv = write_sector(&token->disks[0], 1, (char *)&rr_end, + rv = write_sector(&token->disks[0], token->sector_size, 1, (char *)&rr_end, sizeof(struct request_record), task, token->io_timeout, "request"); if (rv < 0) @@ -165,13 +165,15 @@ static int write_dblock_mblock_sh(struct task *task, char *iobuf, **p_iobuf; uint64_t offset; uint32_t checksum; - int 
iobuf_len, rv; + int iobuf_len, rv, sector_size; memset(&mb, 0, sizeof(mb)); mb.flags = MBLOCK_SHARED; mb.generation = token->host_generation; - iobuf_len = disk->sector_size; + sector_size = token->sector_size; + + iobuf_len = sector_size; if (!iobuf_len) return -EINVAL; @@ -181,7 +183,7 @@ static int write_dblock_mblock_sh(struct task *task, if (rv) return -ENOMEM; - offset = disk->offset + ((2 + host_id - 1) * disk->sector_size); + offset = disk->offset + ((2 + host_id - 1) * sector_size); paxos_dblock_out(pd, &pd_end); @@ -238,7 +240,7 @@ static int write_dblock(struct task *task, pd->checksum = checksum; pd_end.checksum = cpu_to_le32(checksum); - rv = write_sector(disk, 2 + host_id - 1, (char *)&pd_end, sizeof(struct paxos_dblock), + rv = write_sector(disk, token->sector_size, 2 + host_id - 1, (char *)&pd_end, sizeof(struct paxos_dblock), task, token->io_timeout, "dblock"); return rv; } @@ -261,7 +263,7 @@ static int write_leader(struct task *task, lr->checksum = checksum; lr_end.checksum = cpu_to_le32(checksum); - rv = write_sector(disk, 0, (char *)&lr_end, sizeof(struct leader_record), + rv = write_sector(disk, token->sector_size, 0, (char *)&lr_end, sizeof(struct leader_record), task, token->io_timeout, "leader"); return rv; } @@ -290,7 +292,7 @@ int paxos_lease_leader_clobber(struct task *task, leader->checksum = checksum; lr_end.checksum = cpu_to_le32(checksum); - rv = write_sector(&token->disks[0], 0, (char *)&lr_end, sizeof(struct leader_record), + rv = write_sector(&token->disks[0], token->sector_size, 0, (char *)&lr_end, sizeof(struct leader_record), task, token->io_timeout, caller); return rv; } @@ -306,7 +308,7 @@ static int read_dblock(struct task *task, /* 1 leader block + 1 request block; host_id N is block offset N-1 */ - rv = read_sectors(disk, 2 + host_id - 1, 1, (char *)&pd_end, sizeof(struct paxos_dblock), + rv = read_sectors(disk, token->sector_size, 2 + host_id - 1, 1, (char *)&pd_end, sizeof(struct paxos_dblock), task, token->io_timeout, 
"dblock"); paxos_dblock_in(&pd_end, pd); @@ -324,7 +326,7 @@ static int read_dblocks(struct task *task, char *data; int data_len, rv, i; - data_len = pds_count * disk->sector_size; + data_len = pds_count * sector_size; data = malloc(data_len); if (!data) { @@ -335,7 +337,7 @@ static int read_dblocks(struct task *task, /* 2 = 1 leader block + 1 request block */ - rv = read_sectors(disk, 2, pds_count, data, data_len, + rv = read_sectors(disk, token->sector_size, 2, pds_count, data, data_len, task, "dblocks"); if (rv < 0) goto out_free; @@ -344,7 +346,7 @@ static int read_dblocks(struct task *task, paxos_dblock */ for (i = 0; i < pds_count; i++) { - memcpy(&pd_end, data + (i * disk->sector_size), + memcpy(&pd_end, data + (i * sector_size), sizeof(struct paxos_dblock)); paxos_dblock_in(&pd_end, &pd); @@ -369,9 +371,14 @@ static int read_leader(struct task *task, struct leader_record lr_end; int rv; + if (!token->sector_size) { + log_errot(token, "paxos read_leader with zero sector_size"); + return -EINVAL; + } + /* 0 = leader record is first sector */ - rv = read_sectors(disk, 0, 1, (char *)&lr_end, sizeof(struct leader_record), + rv = read_sectors(disk, token->sector_size, 0, 1, (char *)&lr_end, sizeof(struct leader_record), task, token->io_timeout, "leader"); /* N.B. checksum is computed while the data is in ondisk format. 
*/ @@ -461,7 +468,7 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags, uint32_t checksum; int num_disks = token->r.num_disks; int num_writes, num_reads; - int sector_size = token->disks[0].sector_size; + int sector_size = token->sector_size; int sector_count; int iobuf_len; int phase2 = 0; @@ -965,13 +972,6 @@ static int verify_leader(struct token *token, goto fail; } - if (lr->sector_size != disk->sector_size) { - log_errot(token, "verify_leader wrong sector size %d %d %s", - lr->sector_size, disk->sector_size, disk->path); - result = SANLK_LEADER_SECTORSIZE; - goto fail; - } - if (strncmp(lr->space_name, token->r.lockspace_name, NAME_ID_SIZE)) { log_errot(token, "verify_leader wrong space name %.48s %.48s %s", lr->space_name, token->r.lockspace_name, disk->path); @@ -1008,7 +1008,7 @@ static int verify_leader(struct token *token, memset(&leader_end, 0, sizeof(struct leader_record)); - rv = read_sectors(disk, 0, 1, (char *)&leader_end, + rv = read_sectors(disk, token->sector_size, 0, 1, (char *)&leader_end, sizeof(struct leader_record), NULL, 1, "paxos_verify"); @@ -1047,10 +1047,24 @@ int paxos_read_resource(struct task *task, memset(&leader, 0, sizeof(struct leader_record)); + /* + * We don't know the sector size, so we don't know if we should read + * 512 or 4k, but it doesn't matter since the leader record is all that + * we need. It's probably better to read 4k on a 512 disk than to read 512 + * on a 4k disk, so always do a 4k read. 
+ */ + if (!token->sector_size) { + token->sector_size = 4096; + token->align_size = sector_size_to_align_size(4096); + } + rv = read_leader(task, token, &token->disks[0], &leader, &checksum); if (rv < 0) return rv; + token->sector_size = leader.sector_size; + token->align_size = sector_size_to_align_size(leader.sector_size); + if (!res->lockspace_name[0]) memcpy(token->r.lockspace_name, leader.space_name, NAME_ID_SIZE); @@ -1063,6 +1077,11 @@ int paxos_read_resource(struct task *task, memcpy(res->lockspace_name, leader.space_name, NAME_ID_SIZE); memcpy(res->name, leader.resource_name, NAME_ID_SIZE); res->lver = leader.lver; + + if (leader.sector_size == 512) + res->flags |= SANLK_RES_ALIGN1M; + else if (leader.sector_size == 4096) + res->flags |= SANLK_RES_ALIGN8M; } return rv; @@ -1076,7 +1095,13 @@ int paxos_read_buf(struct task *task, struct sync_disk *disk = &token->disks[0]; int rv, iobuf_len; - iobuf_len = direct_align(disk); + if (!token->sector_size || !token->align_size) { + log_errot(token, "paxos_read_buf with sector_size %d align_size %d", + token->sector_size, token->align_size); + return -EINVAL; + } + + iobuf_len = token->align_size; if (iobuf_len < 0) return iobuf_len; @@ -1262,13 +1287,13 @@ static int _lease_read_one(struct task *task, struct paxos_dblock bk; char *iobuf, **p_iobuf; uint32_t host_id = token->host_id; - uint32_t sector_size = disk->sector_size; + uint32_t sector_size = token->sector_size; uint32_t checksum; struct paxos_dblock *bk_end; uint64_t tmp_mbal = 0; int q, tmp_q = -1, rv, iobuf_len; - iobuf_len = direct_align(disk); + iobuf_len = token->align_size; if (iobuf_len < 0) return iobuf_len; @@ -1588,6 +1613,7 @@ int paxos_lease_acquire(struct task *task, int copy_cur_leader; int disk_open = 0; int error, rv, us; + int ls_sector_size; int other_io_timeout, other_host_dead_seconds; memset(&dblock, 0, sizeof(dblock)); /* shut up compiler */ @@ -1595,6 +1621,11 @@ int paxos_lease_acquire(struct task *task, log_token(token, 
"paxos_acquire begin %x %llu %d", flags, (unsigned long long)acquire_lver, new_num_hosts); + if (!token->sector_size) { + log_errot(token, "paxos_acquire with zero sector_size"); + return -EINVAL; + } + restart: memset(&tmp_leader, 0, sizeof(tmp_leader)); copy_cur_leader = 0; @@ -1604,6 +1635,18 @@ int paxos_lease_acquire(struct task *task, if (error < 0) goto out; + /* + * It's unusual but possible that the paxos lease was created with a + * different sector size than the lockspace. There could be a reason + * to do this if they are on different disks. + */ + if (cur_leader.sector_size != token->sector_size) { + log_token(token, "paxos_acquire restart with different sector size %d", cur_leader.sector_size); + token->sector_size = cur_leader.sector_size; + token->align_size = sector_size_to_align_size(cur_leader.sector_size); + goto restart; + } + if (flags & PAXOS_ACQUIRE_FORCE) { copy_cur_leader = 1; goto run; @@ -1662,7 +1705,7 @@ int paxos_lease_acquire(struct task *task, if (!disk_open) { memset(&host_id_disk, 0, sizeof(host_id_disk)); - rv = lockspace_disk(cur_leader.space_name, &host_id_disk); + rv = lockspace_disk(cur_leader.space_name, &host_id_disk, &ls_sector_size); if (rv < 0) { log_errot(token, "paxos_acquire no lockspace info %.48s", cur_leader.space_name); @@ -1702,20 +1745,20 @@ int paxos_lease_acquire(struct task *task, (unsigned long long)wait_start); while (1) { - error = delta_lease_leader_read(task, token->io_timeout, &host_id_disk, + error = delta_lease_leader_read(task, ls_sector_size, token->io_timeout, + &host_id_disk, cur_leader.space_name, cur_leader.owner_id, &host_id_leader, "paxos_acquire"); if (error < 0) { log_errot(token, "paxos_acquire owner %llu %llu %llu " - "delta read %d fd %d path %s off %llu ss %u", + "delta read %d fd %d path %s off %llu", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, error, host_id_disk.fd, host_id_disk.path, - (unsigned 
long long)host_id_disk.offset, - host_id_disk.sector_size); + (unsigned long long)host_id_disk.offset); goto out; } @@ -2332,11 +2375,11 @@ int paxos_lease_init(struct task *task, if (num_hosts > max_hosts) return -EINVAL; - sector_size = token->disks[0].sector_size; + if (!token->sector_size || !token->align_size) + return -EINVAL; - align_size = direct_align(&token->disks[0]); - if (align_size < 0) - return align_size; + sector_size = token->sector_size; + align_size = token->align_size; if (sector_size * (2 + max_hosts) > align_size) return -E2BIG; diff --git a/src/resource.c b/src/resource.c index 05bcc61..cd0cb1b 100644 --- a/src/resource.c +++ b/src/resource.c @@ -108,8 +108,18 @@ int read_resource_owners(struct task *task, struct token *token, disk = &token->disks[0]; - /* we could in-line paxos_read_buf here like we do in read_mode_block */ + /* + * We don't know the sector_size of the resource until the leader + * record has been read, so go with the larger size. + */ + + if (!token->sector_size) { + token->sector_size = 4096; + token->align_size = sector_size_to_align_size(4096); + } + /* we could in-line paxos_read_buf here like we do in read_mode_block */ + retry: rv = paxos_read_buf(task, token, &lease_buf); if (rv < 0) { log_errot(token, "read_resource_owners read_buf rv %d", rv); @@ -125,6 +135,18 @@ int read_resource_owners(struct task *task, struct token *token, leader_record_in(&leader_end, &leader); + if ((token->sector_size == 512) && (leader.sector_size == 4096)) { + /* user flag was wrong */ + token->sector_size = 4096; + token->align_size = sector_size_to_align_size(4096); + free(lease_buf); + lease_buf = NULL; + goto retry; + } + + token->sector_size = leader.sector_size; + token->align_size = sector_size_to_align_size(leader.sector_size); + rv = paxos_verify_leader(token, disk, &leader, checksum, "read_resource_owners"); if (rv < 0) goto out; @@ -135,7 +157,7 @@ int read_resource_owners(struct task *task, struct token *token, host_count++; 
for (i = 0; i < leader.num_hosts; i++) { - lease_buf_dblock = lease_buf + ((2 + i) * disk->sector_size); + lease_buf_dblock = lease_buf + ((2 + i) * token->sector_size); mb_end = (struct mode_block *)(lease_buf_dblock + MBLOCK_OFFSET); mode_block_in(mb_end, &mb); @@ -187,7 +209,7 @@ int read_resource_owners(struct task *task, struct token *token, } for (i = 0; i < leader.num_hosts; i++) { - lease_buf_dblock = lease_buf + ((2 + i) * disk->sector_size); + lease_buf_dblock = lease_buf + ((2 + i) * token->sector_size); mb_end = (struct mode_block *)(lease_buf_dblock + MBLOCK_OFFSET); mode_block_in(mb_end, &mb); @@ -313,7 +335,7 @@ static int write_host_block(struct task *task, struct token *token, disk = &token->disks[0]; - iobuf_len = disk->sector_size; + iobuf_len = token->sector_size; if (!iobuf_len) return -EINVAL; @@ -350,7 +372,7 @@ static int write_host_block(struct task *task, struct token *token, for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; - offset = disk->offset + ((2 + host_id - 1) * disk->sector_size); + offset = disk->offset + ((2 + host_id - 1) * token->sector_size); rv = write_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout, NULL); if (rv < 0) @@ -419,7 +441,7 @@ static int read_mode_block(struct task *task, struct token *token, disk = &token->disks[0]; - iobuf_len = disk->sector_size; + iobuf_len = token->sector_size; if (!iobuf_len) return -EINVAL; @@ -432,7 +454,7 @@ static int read_mode_block(struct task *task, struct token *token, for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; - offset = disk->offset + ((2 + host_id - 1) * disk->sector_size); + offset = disk->offset + ((2 + host_id - 1) * token->sector_size); rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout, NULL); if (rv < 0) @@ -537,9 +559,9 @@ static int read_lvb_block(struct task *task, struct token *token) r = token->resource; disk = &token->disks[0]; - iobuf_len = disk->sector_size; + iobuf_len = token->sector_size; iobuf 
= r->lvb; - offset = disk->offset + (LVB_SECTOR * disk->sector_size); + offset = disk->offset + (LVB_SECTOR * token->sector_size); if (!r->lvb) return 0; @@ -557,9 +579,9 @@ static int write_lvb_block(struct task *task, struct resource *r, struct token * int iobuf_len, rv; disk = &token->disks[0]; - iobuf_len = disk->sector_size; + iobuf_len = token->sector_size; iobuf = r->lvb; - offset = disk->offset + (LVB_SECTOR * disk->sector_size); + offset = disk->offset + (LVB_SECTOR * token->sector_size); if (!r->lvb) return 0; @@ -1647,23 +1669,14 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, copy_disks(&r->r.disks, &token->r.disks, token->r.num_disks); - if (cmd_flags & SANLK_ACQUIRE_LVB) { - char *iobuf, **p_iobuf; - p_iobuf = &iobuf; - - rv = posix_memalign((void *)p_iobuf, getpagesize(), token->disks[0].sector_size); - if (rv) - log_errot(token, "acquire_token lvb size %d memalign error %d", - token->disks[0].sector_size, rv); - else - r->lvb = iobuf; - } - retry: memset(&leader, 0, sizeof(struct leader_record)); rv = acquire_disk(task, token, acquire_lver, new_num_hosts, owner_nowait, &leader, &dblock); + /* token sector_size starts as ls sector_size, but can change in paxos acquire */ + r->sector_size = token->sector_size; + if (rv == SANLK_ACQUIRE_IDLIVE || rv == SANLK_ACQUIRE_OWNED || rv == SANLK_ACQUIRE_OTHER) { /* * Another host owns the lease. They may be holding for @@ -1796,11 +1809,22 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, out: if (cmd_flags & SANLK_ACQUIRE_LVB) { - rv = read_lvb_block(task, token); - if (rv < 0) { - /* TODO: we should probably notify the caller somehow about - lvb read/write independent of the lease results. */ - log_errot(token, "acquire_token read_lvb error %d", rv); + char *iobuf, **p_iobuf; + p_iobuf = &iobuf; + + /* TODO: we should probably notify the caller somehow about + lvb read/write independent of the lease results. 
*/ + + rv = posix_memalign((void *)p_iobuf, getpagesize(), token->sector_size); + if (rv) { + log_errot(token, "acquire_token lvb size %d memalign error %d", + token->sector_size, rv); + } else { + r->lvb = iobuf; + + rv = read_lvb_block(task, token); + if (rv < 0) + log_errot(token, "acquire_token read_lvb error %d", rv); } } @@ -1835,6 +1859,13 @@ int request_token(struct task *task, struct token *token, uint32_t force_mode, if (rv < 0) goto out; + if (leader.sector_size != token->sector_size) { + /* token sector_size starts with lockspace sector_size, + but it could be different. */ + token->sector_size = leader.sector_size; + token->align_size = sector_size_to_align_size(leader.sector_size); + } + if (leader.timestamp == LEASE_FREE) { *owner_id = 0; rv = SANLK_OK; @@ -2311,6 +2342,8 @@ static void *resource_thread(void *arg GNUC_UNUSED) tt->host_generation = r->host_generation; tt->token_id = r->release_token_id; tt->io_timeout = r->io_timeout; + tt->sector_size = r->sector_size; + tt->align_size = sector_size_to_align_size(r->sector_size); tt->resource = r; /* @@ -2346,6 +2379,8 @@ static void *resource_thread(void *arg GNUC_UNUSED) tt->host_id = r->host_id; tt->host_generation = r->host_generation; tt->io_timeout = r->io_timeout; + tt->sector_size = r->sector_size; + tt->align_size = sector_size_to_align_size(r->sector_size); pid = r->pid; lver = r->leader.lver; diff --git a/src/sanlock.8 b/src/sanlock.8 index 69c4244..370526c 100644 --- a/src/sanlock.8 +++ b/src/sanlock.8 @@ -583,11 +583,13 @@ exist (command fails). Tell the sanlock daemon to initialize a lockspace on disk. The -o option can be used to specify the io timeout to be written in the host_id leases. +The -Z option can be used to specify the sector size. (Also see sanlock direct init.) .BR "sanlock client init -r" " RESOURCE" Tell the sanlock daemon to initialize a resource lease on disk. +The -Z option can be used to specify the sector size. (Also see sanlock direct init.) 
.BR "sanlock client read -s" " LOCKSPACE" @@ -727,6 +729,8 @@ host_ids can be changed for special cases using the -n num_hosts and -m max_hosts options.) With -s, the -o option specifies the io timeout to be written in the host_id leases. With -r, the -z 1 option invalidates the resource lease on disk so it cannot be used until reinitialized normally. +The -Z option can be used to specify the sector size (and corresponding +1MB/8MB size.) .BR "sanlock direct read_leader -s" " LOCKSPACE" .br diff --git a/src/sanlock.conf b/src/sanlock.conf index 4debc6b..cd562d1 100644 --- a/src/sanlock.conf +++ b/src/sanlock.conf @@ -46,4 +46,6 @@ # # paxos_debug_all = 0 # command line: n/a - +# +# debug_io = +# command line: n/a diff --git a/src/sanlock.h b/src/sanlock.h index 1ebe745..b7feddc 100644 --- a/src/sanlock.h +++ b/src/sanlock.h @@ -84,10 +84,12 @@ struct sanlk_disk { * host if the lockspace lease is cleanly released. */ -#define SANLK_RES_LVER 0x1 /* lver field is set */ -#define SANLK_RES_NUM_HOSTS 0x2 /* data32 field is new num_hosts */ -#define SANLK_RES_SHARED 0x4 -#define SANLK_RES_PERSISTENT 0x8 +#define SANLK_RES_LVER 0x00000001 /* lver field is set */ +#define SANLK_RES_NUM_HOSTS 0x00000002 /* data32 field is new num_hosts */ +#define SANLK_RES_SHARED 0x00000004 +#define SANLK_RES_PERSISTENT 0x00000008 +#define SANLK_RES_ALIGN1M 0x00000010 /* uses 512 sectors */ +#define SANLK_RES_ALIGN8M 0x00000020 /* uses 4k sectors */ struct sanlk_resource { char lockspace_name[SANLK_NAME_LEN]; /* terminating \0 not required */ @@ -113,10 +115,17 @@ struct sanlk_options { char str[0]; }; +#define SANLK_LSF_ADD 0x00000001 +#define SANLK_LSF_REM 0x00000002 + +/* make these values match the RES equivalent in case of typos */ +#define SANLK_LSF_ALIGN1M 0x00000010 /* uses 512 sectors */ +#define SANLK_LSF_ALIGN8M 0x00000020 /* uses 4k sectors */ + struct sanlk_lockspace { char name[SANLK_NAME_LEN]; uint64_t host_id; - uint32_t flags; + uint32_t flags; /* SANLK_LSF_ */ struct 
sanlk_disk host_id_disk; }; diff --git a/src/sanlock_admin.h b/src/sanlock_admin.h index d017fed..c4711e9 100644 --- a/src/sanlock_admin.h +++ b/src/sanlock_admin.h @@ -23,10 +23,6 @@ /* write flags */ #define SANLK_WRITE_CLEAR 0x00000001 /* subsequent read will return error */ -/* sanlk_lockspace.flags returned by get */ -#define SANLK_LSF_ADD 0x00000001 -#define SANLK_LSF_REM 0x00000002 - /* host status returned in low byte of sanlk_host.flags by get */ #define SANLK_HOST_UNKNOWN 0x00000001 #define SANLK_HOST_FREE 0x00000002 @@ -175,6 +171,38 @@ int sanlock_init(struct sanlk_lockspace *ls, int max_hosts, int num_hosts); /* + * Alignment and sector size + * + * When ALIGN1M or ALIGN8M is set in sanlk_lockspace | sanlk_resource + * and passed to sanlock_write_lockspace() | sanlock_write_resource(), + * it causes sanlock to create 1M or 8M aligned (and sized) leases, + * which use 512 or 4K sector ios, respectively, for the lockspace | resource. + * + * (A lockspace and its associated resources will typically use the + * same align and sector size, but it's conceivable they would not, e.g. + * if they were placed on different storage with different sector sizes.) + * + * The ALIGN flag overrides sanlock's detection of sector size for disks, + * and overrides the default 512 sector assumption for files. + * + * sanlock_read_lockspace() | sanlock_read_resource() will return + * ALIGN1M or ALIGN8M to indicate the lockspace | resource alignment. + * These flags are returned whether or not they were passed to + * sanlock_write_lockspace() | sanlock_write_resource(). + * (The ALIGN flag can be passed to sanlock_read_lockspace() to avoid + * an extra read to discover the sector size.) + * + * Prior to the addition of ALIGN flags, sanlock will return neither from + * read. The alignment of the lockspace | resource can then be determined + * with sanlock_align().
After the addition of ALIGN flags, sanlock_align() + * no longer correctly indicates the alignment of the lockspace | resource. + * + * With the addition of ALIGN flags, sanlock_align() still reports the + * *default* alignment that sanlock will use for disks or files if an + * ALIGN flag is not passed to write. + */ + +/* * write a lockspace to disk * * the sanlock daemon writes max_hosts lockspace leader records to disk diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 4cee6c1..5140497 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -99,6 +99,8 @@ struct token { int pid; uint32_t flags; /* be careful to avoid using this from different threads */ uint32_t token_id; /* used to refer to this token instance in log messages */ + int sector_size; + int align_size; int space_dead; /* copied from sp->space_dead, set by main thread */ int shared_count; /* set during ballot by paxos_lease_acquire */ char shared_bitmap[HOSTID_BITMAP_SIZE]; /* bit set for host_id with SH */ @@ -123,6 +125,7 @@ struct resource { uint64_t host_generation; uint32_t io_timeout; int pid; /* copied from token when ex */ + int sector_size; uint32_t flags; uint32_t release_token_id; /* copy to temp token (tt) for log messages */ uint64_t thread_release_retry; @@ -188,6 +191,7 @@ struct space { uint32_t flags; /* SP_ */ uint32_t used_retries; uint32_t renewal_read_extend_sec; /* defaults to io_timeout */ + int sector_size; int align_size; int renew_fail; int space_dead; @@ -216,6 +220,8 @@ struct space_info { uint32_t io_timeout; uint64_t host_id; uint64_t host_generation; + int sector_size; + int align_size; int killing_pids; }; @@ -303,8 +309,10 @@ struct command_line { int action; /* ACT_ */ int debug; int debug_renew; - int quiet_fail; + int debug_io_submit; + int debug_io_complete; int paxos_debug_all; + int quiet_fail; int wait; int use_watchdog; int high_priority; /* -h */ @@ -321,6 +329,7 @@ struct command_line { int used; int all; int clear_arg; + int 
sector_size; char *uname; /* -U */ int uid; /* -U */ char *gname; /* -G */ @@ -412,5 +421,16 @@ EXTERN uint8_t sanlock_version_patch; EXTERN uint8_t sanlock_version_build; EXTERN uint32_t sanlock_version_combined; +#define ONEMB 1048576 + +static inline int sector_size_to_align_size(int sector_size) +{ + if (sector_size == 512) + return ONEMB; + if (sector_size == 4096) + return 8 * ONEMB; + return 0; +} + #endif