#2 journal: Backport compact mode
Merged 3 years ago by daandemeyer. Opened 3 years ago by daandemeyer.
centos-sig-hyperscale/ daandemeyer/systemd journal-compact-entry-item  into  fb-v250.3

@@ -57,8 +57,10 @@ 

              MESON_ARGS=(--optimization=1)

  

              if [[ "$phase" = "RUN_CLANG_ASAN_UBSAN" ]]; then

-                 export CC=clang

-                 export CXX=clang++

+                 # Explicitly use clang-11, since with the default clang-10

+                 # we might trigger some UBSan false-positives. See https://github.com/systemd/systemd/pull/21183

+                 export CC=clang-11

+                 export CXX=clang++-11

                  # Build fuzzer regression tests only with clang (for now),

                  # see: https://github.com/systemd/systemd/pull/15886#issuecomment-632689604

                  # -Db_lundef=false: See https://github.com/mesonbuild/meson/issues/764

file modified
+11
@@ -979,6 +979,17 @@ 

          above.</para></listitem>

        </varlistentry>

  

+       <varlistentry>

+         <term><option>--convert=</option></term>

+ 

+         <listitem><para>Converts the specified journal files to the latest supported journal format. Takes

+         the path to store the converted journal files. The path should include the filename to be used for

+         the converted files, with the <literal>.journal</literal> extension (e.g.

+         <filename>/a/b/c/converted.journal</filename> will store the journal files in the

+         <filename>/a/b/c</filename> directory using <filename>converted.journal</filename> as the filename).

+         </para></listitem>

+       </varlistentry>

+ 

        <xi:include href="standard-options.xml" xpointer="help" />

        <xi:include href="standard-options.xml" xpointer="version" />

        <xi:include href="standard-options.xml" xpointer="no-pager" />

file modified
+2 -1
@@ -2085,7 +2085,8 @@ 

          'journalctl',

          journalctl_sources,

          include_directories : includes,

-         link_with : [libshared],

+         link_with : [libjournal_core,

+                      libshared],

          dependencies : [threads,

                          libdl,

                          libxz,

file modified
+64
@@ -48,6 +48,7 @@ 

  #include "locale-util.h"

  #include "log.h"

  #include "logs-show.h"

+ #include "managed-journal-file.h"

  #include "memory-util.h"

  #include "mkdir.h"

  #include "mount-util.h"
@@ -130,6 +131,7 @@ 

  static uint64_t arg_vacuum_n_files = 0;

  static usec_t arg_vacuum_time = 0;

  static char **arg_output_fields = NULL;

+ static const char *arg_convert = NULL;

  #if HAVE_PCRE2

  static const char *arg_pattern = NULL;

  static pcre2_code *arg_compiled_pattern = NULL;
@@ -155,6 +157,7 @@ 

          ACTION_ROTATE_AND_VACUUM,

          ACTION_LIST_FIELDS,

          ACTION_LIST_FIELD_NAMES,

+         ACTION_CONVERT,

  } arg_action = ACTION_SHOW;

  

  typedef struct BootId {
@@ -402,6 +405,7 @@ 

                 "     --dump-catalog          Show entries in the message catalog\n"

                 "     --update-catalog        Update the message catalog database\n"

                 "     --setup-keys            Generate a new FSS key pair\n"

+                "     --convert=PATH          Convert the journal to the latest journal format\n"

                 "\nSee the %2$s for details.\n",

                 program_invocation_short_name,

                 link,
@@ -456,6 +460,7 @@ 

                  ARG_NO_HOSTNAME,

                  ARG_OUTPUT_FIELDS,

                  ARG_NAMESPACE,

+                 ARG_CONVERT,

          };

  

          static const struct option options[] = {
@@ -523,6 +528,7 @@ 

                  { "no-hostname",          no_argument,       NULL, ARG_NO_HOSTNAME          },

                  { "output-fields",        required_argument, NULL, ARG_OUTPUT_FIELDS        },

                  { "namespace",            required_argument, NULL, ARG_NAMESPACE            },

+                 { "convert",              required_argument, NULL, ARG_CONVERT              },

                  {}

          };

  
@@ -1050,6 +1056,11 @@ 

                          break;

                  }

  

+                 case ARG_CONVERT:

+                         arg_action = ACTION_CONVERT;

+                         arg_convert = optarg;

+                         break;

+ 

                  case '?':

                          return -EINVAL;

  
@@ -2127,6 +2138,54 @@ 

          return 0;

  }

  

+ /* Copies the entries iterated by SD_JOURNAL_FOREACH() on 'j' into a journal file opened (and created if

+  * necessary) at the path given with --convert= (arg_convert).

+  *

+  * If appending an entry fails with an error that journal_shall_try_append_again() reports as retryable,

+  * the destination file is rotated once and the same entry is appended again.

+  *

+  * Returns 0 on success, or a negative errno-style value on failure (already logged, except for -ENOMEM

+  * from the mmap cache allocation). */

+ static int journal_convert(sd_journal *j) {

+         _cleanup_(managed_journal_file_closep) ManagedJournalFile *to = NULL;

+         _cleanup_(mmap_cache_unrefp) MMapCache *mmap = NULL;

+         int r;

+ 

+         assert(j);

+         assert(arg_convert);

+ 

+         mmap = mmap_cache_new();

+         if (!mmap)

+                 return -ENOMEM;

+ 

+         r = managed_journal_file_open(-1, arg_convert, O_RDWR | O_CREAT, 0640, true, UINT64_MAX, false,

+                                       &(JournalMetrics) { -1, -1, -1, -1, -1, -1 }, mmap, NULL, NULL, &to);

+         if (r < 0)

+                 return log_error_errno(r, "Failed to open journal: %m");

+ 

+         SD_JOURNAL_FOREACH(j) {

+                 Object *o;

+                 JournalFile *from;

+ 

+                 /* The source file and offset of the entry the iteration currently points at. */

+                 from = j->current_file;

+                 assert(from && from->current_offset > 0);

+ 

+                 r = journal_file_move_to_object(from, OBJECT_ENTRY, from->current_offset, &o);

+                 if (r < 0)

+                         return log_error_errno(r, "Can't read entry: %m");

+ 

+                 r = journal_file_copy_entry(from, to->file, o, from->current_offset);

+                 if (r >= 0)

+                         continue;

+ 

+                 if (!journal_shall_try_append_again(to->file, r))

+                         return log_error_errno(r, "Can't write entry: %m");

+ 

+                 log_info("Rotating journal.");

+ 

+                 r = managed_journal_file_rotate(&to, mmap, true, UINT64_MAX, false, NULL);

+                 if (r < 0)

+                         /* Rotation replaces 'to'; when it fails 'to' may be left NULL, so it must not be

+                          * dereferenced here. arg_convert is the path the file was opened with. */

+                         return log_error_errno(r, "Failed to rotate %s: %m", arg_convert);

+ 

+                 /* Retry exactly once on the freshly rotated file; a second failure is fatal. */

+                 r = journal_file_copy_entry(from, to->file, o, from->current_offset);

+                 if (r < 0)

+                         return log_error_errno(r, "Can't write entry: %m");

+         }

+ 

+         return 0;

+ }

+ 

  int main(int argc, char *argv[]) {

          _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;

          _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
@@ -2238,6 +2297,7 @@ 

          case ACTION_ROTATE_AND_VACUUM:

          case ACTION_LIST_FIELDS:

          case ACTION_LIST_FIELD_NAMES:

+         case ACTION_CONVERT:

                  /* These ones require access to the journal files, continue below. */

                  break;

  
@@ -2392,6 +2452,10 @@ 

          case ACTION_LIST_FIELDS:

                  break;

  

+         case ACTION_CONVERT:

+                 r = journal_convert(j);

+                 goto finish;

+ 

          default:

                  assert_not_reached();

          }

file modified
+3 -51
@@ -30,6 +30,7 @@ 

  #include "io-util.h"

  #include "journal-authenticate.h"

  #include "journal-internal.h"

+ #include "journal-util.h"

  #include "journal-vacuum.h"

  #include "journald-audit.h"

  #include "journald-context.h"
@@ -744,55 +745,6 @@ 

          free_and_replace(s->hostname_field, x);

  }

  

- static bool shall_try_append_again(JournalFile *f, int r) {

-         switch(r) {

- 

-         case -E2BIG:           /* Hit configured limit          */

-         case -EFBIG:           /* Hit fs limit                  */

-         case -EDQUOT:          /* Quota limit hit               */

-         case -ENOSPC:          /* Disk full                     */

-                 log_debug("%s: Allocation limit reached, rotating.", f->path);

-                 return true;

- 

-         case -EIO:             /* I/O error of some kind (mmap) */

-                 log_warning("%s: IO error, rotating.", f->path);

-                 return true;

- 

-         case -EHOSTDOWN:       /* Other machine                 */

-                 log_info("%s: Journal file from other machine, rotating.", f->path);

-                 return true;

- 

-         case -EBUSY:           /* Unclean shutdown              */

-                 log_info("%s: Unclean shutdown, rotating.", f->path);

-                 return true;

- 

-         case -EPROTONOSUPPORT: /* Unsupported feature           */

-                 log_info("%s: Unsupported feature, rotating.", f->path);

-                 return true;

- 

-         case -EBADMSG:         /* Corrupted                     */

-         case -ENODATA:         /* Truncated                     */

-         case -ESHUTDOWN:       /* Already archived              */

-                 log_warning("%s: Journal file corrupted, rotating.", f->path);

-                 return true;

- 

-         case -EIDRM:           /* Journal file has been deleted */

-                 log_warning("%s: Journal file has been deleted, rotating.", f->path);

-                 return true;

- 

-         case -ETXTBSY:         /* Journal file is from the future */

-                 log_warning("%s: Journal file is from the future, rotating.", f->path);

-                 return true;

- 

-         case -EAFNOSUPPORT:

-                 log_warning("%s: underlying file system does not support memory mapping or another required file system feature.", f->path);

-                 return false;

- 

-         default:

-                 return false;

-         }

- }

- 

  static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, size_t n, int priority) {

          bool vacuumed = false, rotate = false;

          struct dual_timestamp ts;
@@ -847,7 +799,7 @@ 

                  return;

          }

  

-         if (vacuumed || !shall_try_append_again(f->file, r)) {

+         if (vacuumed || !journal_shall_try_append_again(f->file, r)) {

                  log_error_errno(r, "Failed to write entry (%zu items, %zu bytes), ignoring: %m", n, IOVEC_TOTAL_SIZE(iovec, n));

                  return;

          }
@@ -1176,7 +1128,7 @@ 

                  if (r >= 0)

                          continue;

  

-                 if (!shall_try_append_again(s->system_journal->file, r)) {

+                 if (!journal_shall_try_append_again(s->system_journal->file, r)) {

                          log_error_errno(r, "Can't write entry: %m");

                          goto finish;

                  }

@@ -50,7 +50,7 @@ 

                  if (r < 0)

                          return r;

  

-                 n_items += journal_file_entry_array_n_items(&o);

+                 n_items += journal_file_entry_array_n_items(f, &o);

                  p = q;

          }

  
@@ -66,8 +66,8 @@ 

          if (n_unused == 0)

                  return 0;

  

-         offset = p + offsetof(Object, entry_array.items) +

-                 (journal_file_entry_array_n_items(&o) - n_unused) * sizeof(le64_t);

+         offset = p + journal_file_entry_array_items_offset(f) +

+                 (journal_file_entry_array_n_items(f, &o) - n_unused) * journal_file_entry_array_item_size(f);

          sz = p + le64toh(o.object.size) - offset;

  

          if (sz < MINIMUM_HOLE_SIZE)
@@ -78,10 +78,10 @@ 

  

                  o.object.size = htole64(offset - p);

  

-                 n = pwrite(f->fd, &o, sizeof(EntryArrayObject), p);

+                 n = pwrite(f->fd, &o, journal_file_entry_array_items_offset(f), p);

                  if (n < 0)

                          return log_debug_errno(errno, "Failed to modify entry array object size: %m");

-                 if ((size_t) n != sizeof(EntryArrayObject))

+                 if ((size_t) n != journal_file_entry_array_items_offset(f))

                          return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short pwrite() while modifying entry array object size.");

  

                  f->header->arena_size = htole64(ALIGN64(offset) - le64toh(f->header->header_size));

@@ -13,7 +13,7 @@ 

  #include "path-util.h"

  #include "string-util.h"

  

- int main(int argc, char *argv[]) {

+ static void run_test(int argc, char *argv[]) {

          _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL;

          _cleanup_free_ char *fn = NULL;

          char dn[] = "/var/tmp/test-journal-flush.XXXXXX";
@@ -70,6 +70,12 @@ 

  

          unlink(fn);

          assert_se(rmdir(dn) == 0);

+ }

+ 

+ int main(int argc, char *argv[]) {

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);

+         run_test(argc, argv);

  

-         return 0;

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);

+         run_test(argc, argv);

  }

@@ -299,6 +299,10 @@ 

          test_skip(setup_sequential);

          test_skip(setup_interleaved);

  

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);

+         test_sequence_numbers();

+ 

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);

          test_sequence_numbers();

  

          return 0;

@@ -184,11 +184,18 @@ 

  

          test_setup_logging(LOG_DEBUG);

  

-         /* Run this test twice. Once with old hashing and once with new hashing */

+         /* Run this test multiple times with different configurations of features. */

+ 

+         assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "0", 1) >= 0);

+         run_test();

+ 

          assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "1", 1) >= 0);

          run_test();

  

-         assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "0", 1) >= 0);

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);

+         run_test();

+ 

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);

          run_test();

  

          return 0;

@@ -56,7 +56,7 @@ 

          return r;

  }

  

- int main(int argc, char *argv[]) {

+ static int run_test(int argc, char *argv[]) {

          _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL;

          char t[] = "/var/tmp/journal-XXXXXX";

          unsigned n;
@@ -141,3 +141,11 @@ 

  

          return 0;

  }

+ 

+ /* Runs run_test() twice: first with SYSTEMD_JOURNAL_COMPACT=0, then with SYSTEMD_JOURNAL_COMPACT=1.

+  *

+  * run_test() is the former main() of this test, so its return value used to be the process exit status.

+  * Keep that contract: the first non-zero result is returned as-is instead of being discarded. */

+ int main(int argc, char *argv[]) {

+         int r;

+ 

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);

+         r = run_test(argc, argv);

+         if (r != 0)

+                 return r;

+ 

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);

+         return run_test(argc, argv);

+ }

file modified
+13 -2
@@ -31,7 +31,7 @@ 

          static const char test[] = "TEST1=1", test2[] = "TEST2=2";

          Object *o, *d;

          uint64_t p;

-         sd_id128_t fake_boot_id;

+         sd_id128_t fake_boot_id, boot_id;

          char t[] = "/var/tmp/journal-XXXXXX";

  

          test_setup_logging(LOG_DEBUG);
@@ -68,7 +68,8 @@ 

  

          assert_se(journal_file_next_entry(f->file, p, DIRECTION_DOWN, &o, &p) == 1);

          assert_se(le64toh(o->entry.seqnum) == 3);

-         assert_se(sd_id128_equal(o->entry.boot_id, fake_boot_id));

+         assert_se(journal_file_entry_boot_id(f->file, o, &boot_id) == 0);

+         assert_se(sd_id128_equal(boot_id, fake_boot_id));

  

          assert_se(journal_file_next_entry(f->file, p, DIRECTION_DOWN, &o, &p) == 0);

  
@@ -258,6 +259,16 @@ 

          if (access("/etc/machine-id", F_OK) != 0)

                  return log_tests_skipped("/etc/machine-id not found");

  

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);

+ 

+         test_non_empty();

+         test_empty();

+ #if HAVE_COMPRESSION

+         test_min_compress_size();

+ #endif

+ 

+         assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);

+ 

          test_non_empty();

          test_empty();

  #if HAVE_COMPRESSION

@@ -248,7 +248,7 @@ 

          case OBJECT_DATA:

                  /* All but hash and payload are mutable */

                  gcry_md_write(f->hmac, &o->data.hash, sizeof(o->data.hash));

-                 gcry_md_write(f->hmac, o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));

+                 gcry_md_write(f->hmac, journal_file_data_payload_field(f, o), le64toh(o->object.size) - journal_file_data_payload_offset(f));

                  break;

  

          case OBJECT_FIELD:

@@ -23,6 +23,8 @@ 

  typedef struct HashTableObject HashTableObject;

  typedef struct EntryArrayObject EntryArrayObject;

  typedef struct TagObject TagObject;

+ typedef struct TrieNodeObject TrieNodeObject;

+ typedef struct BootIdObject BootIdObject;

  

  typedef struct EntryItem EntryItem;

  typedef struct HashItem HashItem;
@@ -39,16 +41,21 @@ 

          OBJECT_FIELD_HASH_TABLE,

          OBJECT_ENTRY_ARRAY,

          OBJECT_TAG,

+         OBJECT_TRIE_NODE,

+         OBJECT_TRIE_HASH_TABLE,

+         OBJECT_BOOT_ID,

          _OBJECT_TYPE_MAX

  } ObjectType;

  

  /* Object flags */

  enum {

-         OBJECT_COMPRESSED_XZ   = 1 << 0,

-         OBJECT_COMPRESSED_LZ4  = 1 << 1,

-         OBJECT_COMPRESSED_ZSTD = 1 << 2,

+         OBJECT_COMPRESSED_XZ    = 1 << 0,

+         OBJECT_COMPRESSED_LZ4   = 1 << 1,

+         OBJECT_COMPRESSED_ZSTD  = 1 << 2,

          OBJECT_COMPRESSION_MASK = (OBJECT_COMPRESSED_XZ | OBJECT_COMPRESSED_LZ4 | OBJECT_COMPRESSED_ZSTD),

-         _OBJECT_COMPRESSED_MAX = OBJECT_COMPRESSION_MASK,

+         FIELD_UNIQUE            = 1 << 3,

+         FIELD_INDEXED           = 1 << 4,

+         _OBJECT_COMPRESSED_MAX  = OBJECT_COMPRESSION_MASK,

  };

  

  struct ObjectHeader {
@@ -67,8 +74,15 @@ 

          le64_t entry_offset; /* the first array entry we store inline */ \

          le64_t entry_array_offset;                                      \

          le64_t n_entries;                                               \

-         uint8_t payload[];                                              \

-         }

+         union {                                                         \

+                 uint8_t payload[0];                                     \

+                 struct {                                                \

+                         le32_t tail_entry_array_offset;                 \

+                         le32_t tail_entry_array_n_entries;              \

+                         uint8_t compact[0];                             \

+                 };                                                      \

+         };                                                              \

+ }

  

  struct DataObject DataObject__contents;

  struct DataObject__packed DataObject__contents _packed_;
@@ -91,15 +105,34 @@ 

          le64_t hash;

  } _packed_;

  

- #define EntryObject__contents { \

-         ObjectHeader object;    \

-         le64_t seqnum;          \

-         le64_t realtime;        \

-         le64_t monotonic;       \

-         sd_id128_t boot_id;     \

-         le64_t xor_hash;        \

-         EntryItem items[];      \

-         }

+ /* Extended version of EntryItem that stores extra information that we don't store in the journal file. */

+ typedef struct {

+         uint64_t object_offset;

+         uint64_t hash;

+         /* The hash used to calculate the Entry object's XOR hash field. */

+         uint64_t xor_hash;

+         bool indexed;

+ } EntryItemEx;

+ 

+ #define EntryObject__contents {                  \

+         ObjectHeader object;                     \

+         le64_t seqnum;                           \

+         le64_t realtime;                         \

+         le64_t monotonic;                        \

+         union {                                  \

+                 struct {                         \

+                         sd_id128_t boot_id;      \

+                         le64_t xor_hash;         \

+                         EntryItem items[];       \

+                 };                               \

+                 struct {                         \

+                         le64_t xor_hash_compact; \

+                         le32_t boot_id_offset;   \

+                         le32_t trie_offset;      \

+                         uint8_t payload[];       \

+                 };                               \

+         };                                       \

+ }

  

  struct EntryObject EntryObject__contents;

  struct EntryObject__packed EntryObject__contents _packed_;
@@ -118,7 +151,10 @@ 

  struct EntryArrayObject {

          ObjectHeader object;

          le64_t next_entry_array_offset;

-         le64_t items[];

+         union {

+                 le64_t items[0];

+                 le32_t compact[0];

+         };

  } _packed_;

  

  #define TAG_LENGTH (256/8)
@@ -130,6 +166,27 @@ 

          uint8_t tag[TAG_LENGTH]; /* SHA-256 HMAC */

  } _packed_;

  

+ #define TrieNodeObject__contents { \

+         ObjectHeader object;       \

+         le64_t hash;               \

+         le32_t parent_offset;      \

+         le32_t object_offset;      \

+         le64_t next_hash_offset;   \

+ }

+ 

+ struct TrieNodeObject TrieNodeObject__contents;

+ struct TrieNodeObject__packed TrieNodeObject__contents _packed_;

+ assert_cc(sizeof(struct TrieNodeObject) == sizeof(struct TrieNodeObject__packed));

+ 

+ #define BootIdObject__contents { \

+         ObjectHeader object;     \

+         sd_id128_t value;        \

+ }

+ 

+ struct BootIdObject BootIdObject__contents;

+ struct BootIdObject__packed BootIdObject__contents _packed_;

+ assert_cc(sizeof(struct BootIdObject) == sizeof(struct BootIdObject__packed));

+ 

  union Object {

          ObjectHeader object;

          DataObject data;
@@ -138,6 +195,8 @@ 

          HashTableObject hash_table;

          EntryArrayObject entry_array;

          TagObject tag;

+         TrieNodeObject trie_node;

+         BootIdObject boot_id;

  };

  

  enum {
@@ -153,19 +212,22 @@ 

          HEADER_INCOMPATIBLE_COMPRESSED_LZ4  = 1 << 1,

          HEADER_INCOMPATIBLE_KEYED_HASH      = 1 << 2,

          HEADER_INCOMPATIBLE_COMPRESSED_ZSTD = 1 << 3,

+         HEADER_INCOMPATIBLE_COMPACT         = 1 << 4,

  };

  

  #define HEADER_INCOMPATIBLE_ANY               \

          (HEADER_INCOMPATIBLE_COMPRESSED_XZ |  \

           HEADER_INCOMPATIBLE_COMPRESSED_LZ4 | \

           HEADER_INCOMPATIBLE_KEYED_HASH |     \

-          HEADER_INCOMPATIBLE_COMPRESSED_ZSTD)

+          HEADER_INCOMPATIBLE_COMPRESSED_ZSTD | \

+          HEADER_INCOMPATIBLE_COMPACT)

  

  #define HEADER_INCOMPATIBLE_SUPPORTED                            \

          ((HAVE_XZ ? HEADER_INCOMPATIBLE_COMPRESSED_XZ : 0) |     \

           (HAVE_LZ4 ? HEADER_INCOMPATIBLE_COMPRESSED_LZ4 : 0) |   \

           (HAVE_ZSTD ? HEADER_INCOMPATIBLE_COMPRESSED_ZSTD : 0) | \

-          HEADER_INCOMPATIBLE_KEYED_HASH)

+          HEADER_INCOMPATIBLE_KEYED_HASH |                        \

+          HEADER_INCOMPATIBLE_COMPACT)

  

  enum {

          HEADER_COMPATIBLE_SEALED = 1 << 0,
@@ -215,12 +277,20 @@ 

          /* Added in 246 */                              \

          le64_t data_hash_chain_depth;                   \

          le64_t field_hash_chain_depth;                  \

+         /* Added in 251 */                              \

+         le64_t trie_hash_table_offset;                  \

+         le64_t trie_hash_table_size;                    \

+         le64_t n_trie_nodes;                            \

+         le64_t trie_hash_chain_depth;                   \

+         le64_t boot_id_offset;                          \

+         le32_t tail_entry_array_offset;                 \

+         le32_t tail_entry_array_n_entries;              \

          }

  

  struct Header struct_Header__contents;

  struct Header__packed struct_Header__contents _packed_;

  assert_cc(sizeof(struct Header) == sizeof(struct Header__packed));

- assert_cc(sizeof(struct Header) == 256);

+ assert_cc(sizeof(struct Header) == 304);

  

  #define FSS_HEADER_SIGNATURE                                            \

          ((const char[]) { 'K', 'S', 'H', 'H', 'R', 'H', 'L', 'P' })

@@ -34,6 +34,7 @@ 

  #include "string-util.h"

  #include "strv.h"

  #include "sync-util.h"

+ #include "unaligned.h"

  #include "xattr-util.h"

  

  #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
@@ -299,7 +300,8 @@ 

                  f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |

                  f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4 |

                  f->compress_zstd * HEADER_INCOMPATIBLE_COMPRESSED_ZSTD |

-                 f->keyed_hash * HEADER_INCOMPATIBLE_KEYED_HASH);

+                 f->keyed_hash * HEADER_INCOMPATIBLE_KEYED_HASH |

+                 f->compact * HEADER_INCOMPATIBLE_COMPACT);

  

          h.compatible_flags = htole32(

                  f->seal * HEADER_COMPATIBLE_SEALED);
@@ -324,7 +326,34 @@ 

          return 0;

  }

  

+ /* Records 'boot_id' as the boot ID of journal file 'f'.

+  *

+  * Without the compact header flag only the header's boot_id field is updated. With it, a BOOT_ID object

+  * holding the ID is appended and its offset is stored in the header's boot_id_offset, unless the header

+  * already carries the same boot ID together with a non-zero boot_id_offset.

+  *

+  * Returns 0 on success, or the negative error returned by journal_file_append_object(). */

+ static int journal_file_refresh_boot_id(JournalFile *f, const sd_id128_t *boot_id) {

+         Object *o;

+         uint64_t p;

+         int r;

+ 

+         assert(f);

+         assert(f->header);

+         assert(boot_id);

+ 

+         if (!JOURNAL_HEADER_COMPACT(f->header)) {

+                 f->header->boot_id = *boot_id;

+                 return 0;

+         }

+ 

+         /* Nothing to do if an object for this very boot ID was already recorded in the header. */

+         if (sd_id128_equal(*boot_id, f->header->boot_id) && le64toh(f->header->boot_id_offset) > 0)

+                 return 0;

+ 

+         r = journal_file_append_object(f, OBJECT_BOOT_ID, sizeof(BootIdObject), &o, &p);

+         if (r < 0)

+                 return r;

+ 

+         o->boot_id.value = *boot_id;

+         f->header->boot_id_offset = htole64(p);

+         f->header->boot_id = *boot_id;

+ 

+         return 0;

+ }

+ 

  static int journal_file_refresh_header(JournalFile *f) {

+         sd_id128_t boot_id;

          int r;

  

          assert(f);
@@ -337,7 +366,11 @@ 

          else if (r < 0)

                  return r;

  

-         r = sd_id128_get_boot(&f->header->boot_id);

+         r = sd_id128_get_boot(&boot_id);

+         if (r < 0)

+                 return r;

+ 

+         r = journal_file_refresh_boot_id(f, &boot_id);

          if (r < 0)

                  return r;

  
@@ -364,7 +397,7 @@ 

                                    f->path, type, flags & ~any);

                  flags = (flags & any) & ~supported;

                  if (flags) {

-                         const char* strv[5];

+                         const char* strv[6];

                          size_t n = 0;

                          _cleanup_free_ char *t = NULL;

  
@@ -380,6 +413,8 @@ 

                                          strv[n++] = "zstd-compressed";

                                  if (flags & HEADER_INCOMPATIBLE_KEYED_HASH)

                                          strv[n++] = "keyed-hash";

+                                 if (flags & HEADER_INCOMPATIBLE_COMPACT)

+                                         strv[n++] = "compact";

                          }

                          strv[n] = NULL;

                          assert(n < ELEMENTSOF(strv));
@@ -553,6 +588,10 @@ 

          if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)

                  return -E2BIG;

  

+         /* Refuse to go over 4G in compact mode so offsets can be stored in 32-bit. */

+         if (JOURNAL_HEADER_COMPACT(f->header) && offset + size > UINT32_MAX)

+                 return -E2BIG;

+ 

          if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {

                  struct statvfs svfs;

  
@@ -625,7 +664,7 @@ 

          return mmap_cache_fd_get(f->cache_fd, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);

  }

  

- static uint64_t minimum_header_size(Object *o) {

+ static uint64_t minimum_header_size(JournalFile *f, Object *o) {

  

          static const uint64_t table[] = {

                  [OBJECT_DATA] = sizeof(DataObject),
@@ -635,8 +674,17 @@ 

                  [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),

                  [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),

                  [OBJECT_TAG] = sizeof(TagObject),

+                 [OBJECT_TRIE_NODE] = sizeof(TrieNodeObject),

+                 [OBJECT_TRIE_HASH_TABLE] = sizeof(HashTableObject),

          };

  

+         if (o->object.type == OBJECT_ENTRY)

+                 return JOURNAL_HEADER_COMPACT(f->header) ? offsetof(Object, entry.payload) :

+                                                            offsetof(Object, entry.items);

+ 

+         if (o->object.type == OBJECT_DATA)

+                 return journal_file_data_payload_offset(f);

+ 

          if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)

                  return sizeof(ObjectHeader);

  
@@ -658,10 +706,10 @@ 

                                                 le64toh(o->data.n_entries),

                                                 offset);

  

-                 if (le64toh(o->object.size) <= offsetof(Object, data.payload))

+                 if (le64toh(o->object.size) <= journal_file_data_payload_offset(f))

                          return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),

                                                 "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,

-                                                offsetof(Object, data.payload),

+                                                journal_file_data_payload_offset(f),

                                                 le64toh(o->object.size),

                                                 offset);

  
@@ -700,19 +748,31 @@ 

                  uint64_t sz;

  

                  sz = le64toh(READ_NOW(o->object.size));

-                 if (sz < offsetof(Object, entry.items) ||

-                     (sz - offsetof(Object, entry.items)) % sizeof(EntryItem) != 0)

-                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),

-                                                "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,

-                                                offsetof(Object, entry.items),

-                                                sz,

-                                                offset);

- 

-                 if ((sz - offsetof(Object, entry.items)) / sizeof(EntryItem) <= 0)

-                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),

-                                                "Invalid number items in entry: %" PRIu64 ": %" PRIu64,

-                                                (sz - offsetof(Object, entry.items)) / sizeof(EntryItem),

-                                                offset);

+                 if (JOURNAL_HEADER_COMPACT(f->header)) {

+                         if (sz < offsetof(Object, entry.payload))

+                                 return log_debug_errno(

+                                         SYNTHETIC_ERRNO(EBADMSG),

+                                         "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,

+                                         offsetof(Object, entry.payload),

+                                         sz,

+                                         offset);

+                 } else {

+                         if (sz < offsetof(Object, entry.items) ||

+                             (sz - offsetof(Object, entry.items)) % sizeof(EntryItem) != 0)

+                                 return log_debug_errno(

+                                         SYNTHETIC_ERRNO(EBADMSG),

+                                         "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,

+                                         offsetof(Object, entry.items),

+                                         sz,

+                                         offset);

+ 

+                         if ((sz - offsetof(Object, entry.items)) / sizeof(EntryItem) <= 0)

+                                 return log_debug_errno(

+                                         SYNTHETIC_ERRNO(EBADMSG),

+                                         "Invalid number items in entry: %" PRIu64 ": %" PRIu64,

+                                         (sz - offsetof(Object, entry.items)) / sizeof(EntryItem),

+                                         offset);

+                 }

  

                  if (le64toh(o->entry.seqnum) <= 0)

                          return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
@@ -736,7 +796,8 @@ 

          }

  

          case OBJECT_DATA_HASH_TABLE:

-         case OBJECT_FIELD_HASH_TABLE: {

+         case OBJECT_FIELD_HASH_TABLE:

+         case OBJECT_TRIE_HASH_TABLE: {

                  uint64_t sz;

  

                  sz = le64toh(READ_NOW(o->object.size));
@@ -744,10 +805,9 @@ 

                      (sz - offsetof(Object, hash_table.items)) % sizeof(HashItem) != 0 ||

                      (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem) <= 0)

                          return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),

-                                                "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,

-                                                o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",

-                                                sz,

-                                                offset);

+                                                "Invalid %s size: %" PRIu64 ": %" PRIu64,

+                                                journal_object_type_to_string(o->object.type),

+                                                sz, offset);

  

                  break;

          }
@@ -756,9 +816,9 @@ 

                  uint64_t sz;

  

                  sz = le64toh(READ_NOW(o->object.size));

-                 if (sz < offsetof(Object, entry_array.items) ||

-                     (sz - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||

-                     (sz - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0)

+                 if (sz < journal_file_entry_array_items_offset(f) ||

+                     (sz - journal_file_entry_array_items_offset(f)) % journal_file_entry_array_item_size(f) != 0 ||

+                     (sz - journal_file_entry_array_items_offset(f)) / journal_file_entry_array_item_size(f) <= 0)

                          return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),

                                                 "Invalid object entry array size: %" PRIu64 ": %" PRIu64,

                                                 sz,
@@ -786,6 +846,26 @@ 

                                                 le64toh(o->tag.epoch), offset);

  

                  break;

+ 

+         case OBJECT_TRIE_NODE:

+                 if (le64toh(o->object.size) != sizeof(TrieNodeObject))

+                         return log_debug_errno(

+                                 SYNTHETIC_ERRNO(EBADMSG),

+                                 "Invalid object trie node size: %" PRIu64,

+                                 le64toh(o->object.size));

+ 

+                 if (!VALID64(le64toh(o->trie_node.next_hash_offset)) ||

+                     !VALID64(le32toh(o->trie_node.object_offset)) ||

+                     !VALID64(le32toh(o->trie_node.parent_offset)))

+                         return log_debug_errno(

+                                 SYNTHETIC_ERRNO(EBADMSG),

+                                 "Invalid offset (next_hash_offset=" OFSfmt ", object_offset=" OFSfmt32

+                                 ", parent_offset=" OFSfmt32,

+                                 le64toh(o->trie_node.next_hash_offset),

+                                 le32toh(o->trie_node.object_offset),

+                                 le32toh(o->trie_node.parent_offset));

+ 

+                 break;

          }

  

          return 0;
@@ -832,7 +912,7 @@ 

                                         "Attempt to move to object with invalid type: %" PRIu64,

                                         offset);

  

-         if (s < minimum_header_size(o))

+         if (s < minimum_header_size(f, o))

                  return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),

                                         "Attempt to move to truncated object: %" PRIu64,

                                         offset);
@@ -904,12 +984,12 @@ 

                                         "Attempt to read object with invalid type: %" PRIu64,

                                         offset);

  

-         if (s < minimum_header_size(&o))

+         if (s < minimum_header_size(f, &o))

                  return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),

                                         "Attempt to read truncated object: %" PRIu64,

                                         offset);

  

-         if ((size_t) n < minimum_header_size(&o))

+         if ((size_t) n < minimum_header_size(f, &o))

                  return log_debug_errno(SYNTHETIC_ERRNO(EIO),

                                         "Short read while reading object: %" PRIu64,

                                         offset);
@@ -1011,6 +1091,22 @@ 

          return 0;

  }

  

+ static uint64_t default_data_hash_table_size(JournalFile *f) {

+         uint64_t s, d;

+ 

+         /* Returns the default size (in bytes) of the data hash table for this file.

+          *

+          * We estimate that we need 1 hash table entry per 768 bytes of journal file and we want to make sure

+          * we never get beyond 75% fill level. Calculate the hash table size for the maximum file size based

+          * on these metrics. In compact mode, we estimate we need 1 hash table entry per 1152 bytes of

+          * journal file.

+          *

+          * The result is returned as uint64_t (not int): it is computed in 64 bits from max_size and

+          * the callers store it in a uint64_t, so narrowing it through int could truncate large values. */

+ 

+         d = JOURNAL_HEADER_COMPACT(f->header) ? 1152 : 768;

+         s = (f->metrics.max_size * 4 / d / 3) * sizeof(HashItem);

+         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)

+                 s = DEFAULT_DATA_HASH_TABLE_SIZE;

+ 

+         return s;

+ }

+ 

  static int journal_file_setup_data_hash_table(JournalFile *f) {

          uint64_t s, p;

          Object *o;
@@ -1019,14 +1115,7 @@ 

          assert(f);

          assert(f->header);

  

-         /* We estimate that we need 1 hash table entry per 768 bytes

-            of journal file and we want to make sure we never get

-            beyond 75% fill level. Calculate the hash table size for

-            the maximum file size based on these metrics. */

- 

-         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);

-         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)

-                 s = DEFAULT_DATA_HASH_TABLE_SIZE;

+         s = default_data_hash_table_size(f);

  

          log_debug("Reserving %"PRIu64" entries in data hash table.", s / sizeof(HashItem));

  
@@ -1074,6 +1163,48 @@ 

          return 0;

  }

  

+ static int journal_file_setup_trie_hash_table(JournalFile *f) {

+         uint64_t s, p;

+         Object *o;

+         int r;

+ 

+         assert(f);

+         assert(f->header);

+ 

+         /* Based on the following results from converting a non-compact system journal to compact mode, we

+          * use "1.5 * default data hash table size" as the default trie hash table size.

+          *

+          * OBJECT TYPE      ENTRIES SIZE

+          * Unused           0       0B

+          * Data             963284  89.8M

+          * Field            2544    137.2K

+          * Entry            3269815 574.1M

+          * Data Hash Table  11      39.1M

+          * Field Hash Table 11      57.4K

+          * Entry Array      458484  539.1M

+          * Tag              0       0B

+          * Trie Node        1660978 76.0M

+          * Trie Hash Table  11      39.1M

+          * Boot ID          58      1.8K

+          */

+         s = default_data_hash_table_size(f);

+         s += ALIGN_TO(s / 2, sizeof(HashItem));

+ 

+         log_debug("Reserving %"PRIu64" entries in trie hash table.", s / sizeof(HashItem));

+ 

+         r = journal_file_append_object(

+                 f, OBJECT_TRIE_HASH_TABLE, offsetof(Object, hash_table.items) + s, &o, &p);

+         if (r < 0)

+                 return r;

+ 

+         memzero(o->hash_table.items, s);

+ 

+         f->header->trie_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));

+         f->header->trie_hash_table_size = htole64(s);

+ 

+         return 0;

+ }

+ 

  int journal_file_map_data_hash_table(JournalFile *f) {

          uint64_t s, p;

          void *t;
@@ -1126,6 +1257,29 @@ 

          return 0;

  }

  

+ static int journal_file_map_trie_hash_table(JournalFile *f) {

+         uint64_t s, p;

+         void *t;

+         int r;

+ 

+         assert(f);

+         assert(f->header);

+ 

+         if (f->trie_hash_table)

+                 return 0;

+ 

+         p = le64toh(f->header->trie_hash_table_offset);

+         s = le64toh(f->header->trie_hash_table_size);

+ 

+         r = journal_file_move_to(f, OBJECT_TRIE_HASH_TABLE, true, p, s, &t);

+         if (r < 0)

+                 return r;

+ 

+         f->trie_hash_table = t;

+         return 0;

+ }

+ 

+ 

  static int journal_file_link_field(

                  JournalFile *f,

                  Object *o,
@@ -1222,6 +1376,52 @@ 

          return 0;

  }

  

+ static int journal_file_link_trie_node(

+                 JournalFile *f,

+                 Object *o,

+                 uint64_t offset,

+                 uint64_t hash) {

+ 

+         uint64_t p, h, m;

+         int r;

+ 

+         assert(f);

+         assert(f->header);

+         assert(o);

+         assert(offset > 0);

+ 

+         if (o->object.type != OBJECT_TRIE_NODE)

+                 return -EINVAL;

+ 

+         m = le64toh(READ_NOW(f->header->trie_hash_table_size)) / sizeof(HashItem);

+         if (m <= 0)

+                 return -EBADMSG;

+ 

+         /* This might alter the window we are looking at */

+         o->trie_node.next_hash_offset = 0;

+ 

+         h = hash % m;

+         p = le64toh(f->trie_hash_table[h].tail_hash_offset);

+         if (p == 0)

+                 /* Only entry in the hash table is easy */

+                 f->trie_hash_table[h].head_hash_offset = htole64(offset);

+         else {

+                 /* Move back to the previous trie node, to patch in

+                  * pointer */

+ 

+                 r = journal_file_move_to_object(f, OBJECT_TRIE_NODE, p, &o);

+                 if (r < 0)

+                         return r;

+ 

+                 o->trie_node.next_hash_offset = htole64(offset);

+         }

+ 

+         f->trie_hash_table[h].tail_hash_offset = htole64(offset);

+         f->header->n_trie_nodes = htole64(le64toh(f->header->n_trie_nodes) + 1);

+ 

+         return 0;

+ }

+ 

  static int next_hash_offset(

                  JournalFile *f,

                  uint64_t *p,
@@ -1346,7 +1546,7 @@ 

                  const void *data, uint64_t size, uint64_t hash,

                  Object **ret, uint64_t *ret_offset) {

  

-         uint64_t p, osize, h, m, depth = 0;

+         uint64_t p, h, m, depth = 0;

          int r;

  

          assert(f);
@@ -1362,8 +1562,6 @@ 

          if (r < 0)

                  return r;

  

-         osize = offsetof(Object, data.payload) + size;

- 

          m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);

          if (m <= 0)

                  return -EBADMSG;
@@ -1373,6 +1571,8 @@ 

  

          while (p > 0) {

                  Object *o;

+                 void *d;

+                 size_t rsize;

  

                  r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);

                  if (r < 0)
@@ -1381,40 +1581,13 @@ 

                  if (le64toh(o->data.hash) != hash)

                          goto next;

  

-                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {

- #if HAVE_COMPRESSION

-                         uint64_t l;

-                         size_t rsize = 0;

- 

-                         l = le64toh(READ_NOW(o->object.size));

-                         if (l <= offsetof(Object, data.payload))

-                                 return -EBADMSG;

- 

-                         l -= offsetof(Object, data.payload);

- 

-                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,

-                                             o->data.payload, l, &f->compress_buffer, &rsize, 0);

-                         if (r < 0)

-                                 return r;

- 

-                         if (rsize == size &&

-                             memcmp(f->compress_buffer, data, size) == 0) {

- 

-                                 if (ret)

-                                         *ret = o;

- 

-                                 if (ret_offset)

-                                         *ret_offset = p;

- 

-                                 return 1;

-                         }

- #else

-                         return -EPROTONOSUPPORT;

- #endif

-                 } else if (le64toh(o->object.size) == osize &&

-                            memcmp(o->data.payload, data, size) == 0) {

+                 r = journal_file_data_payload(f, o, p, NULL, 0, 0, &d, &rsize);

+                 if (r < 0)

+                         return r;

+                 assert(r > 0); /* journal_file_data_payload() always returns > 0 if no field is provided. */

  

-                         if (ret)

+                 if (memcmp_nn(data, size, d, rsize) == 0) {

+                          if (ret)

                                  *ret = o;

  

                          if (ret_offset)
@@ -1542,7 +1715,23 @@ 

          if (ret_offset)

                  *ret_offset = p;

  

-         return 0;

+         return 1;

+ }

+ 

+ static int maybe_compress_payload(JournalFile *f, uint8_t *dst, const uint8_t *src, size_t size, size_t *rsize) {

+         int compression = 0;

+ 

+ #if HAVE_COMPRESSION

+         if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {

+                 compression = compress_blob(src, size, dst, size - 1, rsize);

+ 

+                 if (compression > 0)

+                         log_debug("Compressed data object %zu -> %zu using %s", size, *rsize,

+                                   object_compressed_to_string(compression));

+         }

+ #endif

+ 

+         return compression;

  }

  

  static int journal_file_append_data(
@@ -1550,10 +1739,11 @@ 

                  const void *data, uint64_t size,

                  Object **ret, uint64_t *ret_offset) {

  

-         uint64_t hash, p, fp, osize;

-         Object *o, *fo;

-         int r, compression = 0;

-         const void *eq;

+         uint64_t hash, p, osize;

+         Object *o;

+         size_t rsize = 0;

+         int compression = 0;

+         int r;

  

          assert(f);

  
@@ -1568,86 +1758,505 @@ 

          if (r > 0)

                  return 0;

  

-         eq = memchr(data, '=', size);

-         if (!eq)

-                 return -EINVAL;

- 

-         osize = offsetof(Object, data.payload) + size;

+         osize = journal_file_data_payload_offset(f) + size;

          r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);

          if (r < 0)

                  return r;

  

          o->data.hash = htole64(hash);

  

- #if HAVE_COMPRESSION

-         if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {

-                 size_t rsize = 0;

+         compression = maybe_compress_payload(f, journal_file_data_payload_field(f, o), data, size, &rsize);

  

-                 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);

+         if (compression > 0) {

+                 o->object.size = htole64(journal_file_data_payload_offset(f) + rsize);

+                 o->object.flags |= compression;

+         } else

+                 memcpy_safe(journal_file_data_payload_field(f, o), data, size);

  

-                 if (compression >= 0) {

-                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);

-                         o->object.flags |= compression;

+         r = journal_file_link_data(f, o, p, hash);

+         if (r < 0)

+                 return r;

  

-                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",

-                                   size, rsize, object_compressed_to_string(compression));

-                 } else

-                         /* Compression didn't work, we don't really care why, let's continue without compression */

-                         compression = 0;

-         }

+         /* The linking might have altered the window, so let's only pass the offset to hmac which will

+          * move to the object again if needed. */

+ 

+ #if HAVE_GCRYPT

+         r = journal_file_hmac_put_object(f, OBJECT_DATA, NULL, p);

+         if (r < 0)

+                 return r;

  #endif

  

-         if (compression == 0)

-                 memcpy_safe(o->data.payload, data, size);

+         if (ret) {

+                 r = journal_file_move_to_object(f, OBJECT_DATA, p, ret);

+                 if (r < 0)

+                         return r;

+         }

+ 

+         if (ret_offset)

+                 *ret_offset = p;

  

-         r = journal_file_link_data(f, o, p, hash);

+         return 1;

+ }

+ 

+ static int journal_file_append_trie_node(

+                 JournalFile *f,

+                 uint64_t hash,

+                 uint64_t parent_offset,

+                 uint64_t object_offset,

+                 Object **ret,

+                 uint64_t *ret_offset) {

+ 

+         /* Appends a new trie node object with the given hash, parent offset and object offset, and links

+          * it into the trie hash table. On success returns 0; if `ret` is non-NULL it is set to the new

+          * object, and if `ret_offset` is non-NULL it is set to the new object's offset. Returns a negative

+          * errno-style value on failure. */

+ 

+         Object *o;

+         uint64_t p;

+         int r;

+ 

+         /* Map the trie hash table, if it isn't mapped yet. */

+         r = journal_file_map_trie_hash_table(f);

+         if (r < 0)

+                 return r;

+ 

+         r = journal_file_append_object(f, OBJECT_TRIE_NODE, sizeof(TrieNodeObject), &o, &p);

+         if (r < 0)

+                 return r;

+ 

+         o->trie_node.hash = htole64(hash);

+         o->trie_node.parent_offset = htole32(parent_offset);

+         o->trie_node.object_offset = htole32(object_offset);

+ 

+         r = journal_file_link_trie_node(f, o, p, hash);

+         if (r < 0)

+                 return r;

+ 

+         /* The linking may move to another trie node and hence might have altered the window, so don't

+          * hand out the old pointer: move to the object again by offset, the same way

+          * journal_file_append_data() does after linking. */

+         if (ret) {

+                 r = journal_file_move_to_object(f, OBJECT_TRIE_NODE, p, ret);

+                 if (r < 0)

+                         return r;

+         }

+ 

+         if (ret_offset)

+                 *ret_offset = p;

+ 

+         return 0;

+ }

+ 

+ static int maybe_decompress_payload(

+                 JournalFile *f,

+                 uint8_t *payload,

+                 uint64_t size,

+                 int compression,

+                 const char *field,

+                 size_t field_length,

+                 size_t data_threshold,

+                 void **ret_data,

+                 size_t *ret_size) {

+ 

+         int r;

+ 

+         /* We can't read objects larger than 4G on a 32bit machine */

+         if ((uint64_t) (size_t) size != size)

+                 return -E2BIG;

+ 

+         if (compression != 0) {

+ #if HAVE_COMPRESSION

+                 size_t rsize;

+ 

+                 if (field) {

+                         r = decompress_startswith(

+                                 compression, payload, size, &f->compress_buffer, field, field_length, '=');

+                         if (r < 0)

+                                 return log_debug_errno(

+                                         r,

+                                         "Cannot decompress %s object of length %" PRIu64 ": %m",

+                                         object_compressed_to_string(compression),

+                                         size);

+                         if (r == 0)

+                                 return 0;

+                 }

+ 

+                 r = decompress_blob(compression, payload, size, &f->compress_buffer, &rsize, 0);

+                 if (r < 0)

+                         return r;

+ 

+                 if (ret_data)

+                         *ret_data = f->compress_buffer;

+                 if (ret_size)

+                         *ret_size = rsize;

+ #else

+                 return -EPROTONOSUPPORT;

  #endif

+         } else {

+                 if (field && (size < field_length + 1 || memcmp(payload, field, field_length) != 0 || payload[field_length] != '='))

+                         return 0;

+ 

+                 if (ret_data)

+                         *ret_data = payload;

+                 if (ret_size)

+                         *ret_size = (size_t) size;

+         }

  

-         /* Create field object ... */

-         r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);

+         return 1;

+ }

+ 

+ int journal_file_data_payload(

+                 JournalFile *f,

+                 Object *o,

+                 uint64_t offset,

+                 const char *field,

+                 size_t field_length,

+                 size_t data_threshold,

+                 void **ret_data,

+                 size_t *ret_size) {

+ 

+         uint64_t size;

+         int r;

+ 

+         assert(!field == (field_length == 0)); /* These must be specified together. */

+ 

+         /* If the caller doesn't provide a field or any of the output arguments, let's short-circuit the

+          * execution of this function. */

+         if (!field && !ret_data && !ret_size)

+                 return 1;

+ 

+         if (!o) {

+                 r = journal_file_move_to_object(f, OBJECT_DATA, offset, &o);

+                 if (r < 0)

+                         return r;

+         }

+ 

+         size = le64toh(READ_NOW(o->object.size));

+         if (size < journal_file_data_payload_offset(f))

+                 return -EBADMSG;

+ 

+         size -= journal_file_data_payload_offset(f);

+ 

+         return maybe_decompress_payload(

+                 f,

+                 journal_file_data_payload_field(f, o),

+                 size,

+                 o->object.flags & OBJECT_COMPRESSION_MASK,

+                 field,

+                 field_length,

+                 data_threshold,

+                 ret_data,

+                 ret_size);

+ }

+ 

+ static int journal_file_entry_item_next_trie(

+                 JournalFile *f,

+                 Object *e,

+                 uint64_t offset,

+                 uint64_t *i,

+                 const char *field,

+                 size_t field_length,

+                 size_t data_threshold,

+                 uint64_t *ret_offset,

+                 void **ret_data,

+                 size_t *ret_size) {

+ 

+         uint64_t p;

+         int r;

+ 

+         for (p = *i; p != 0;) {

+                 Object *o;

+                 uint64_t q;

+ 

+                 r = journal_file_move_to_object(f, OBJECT_TRIE_NODE, p, &o);

+                 if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) {

+                         log_debug_errno(r, "Bad trie node at %"PRIu64", skipping remaining entry items: %m", p);

+                         break;

+                 }

+                 if (r < 0)

+                         return r;

+ 

+                 p = le32toh(o->trie_node.parent_offset);

+                 q = le32toh(o->trie_node.object_offset);

+ 

+                 r = journal_file_data_payload(

+                         f,

+                         NULL,

+                         q,

+                         field,

+                         field_length,

+                         data_threshold,

+                         ret_data,

+                         ret_size);

+                 if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) {

+                         log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", q);

+                         continue;

+                 }

+                 if (r < 0)

+                         return r;

+                 if (r == 0)

+                         continue;

+ 

+                 if (ret_offset)

+                         *ret_offset = q;

+ 

+                 *i = p;

+ 

+                 return 1;

+         }

+ 

+         *i = p;

+ 

+         return 0;

+ }

+ 

+ static int journal_file_entry_item_next_inline(

+                JournalFile *f,

+                Object *e,

+                uint64_t offset,

+                uint64_t *i,

+                const char *field,

+                size_t field_length,

+                size_t data_threshold,

+                uint64_t *ret_offset,

+                void **ret_data,

+                size_t *ret_size) {

+ 

+         uint64_t p, sz;

+         int r;

+ 

+         sz = le64toh(READ_NOW(e->object.size));

+         if (sz < offsetof(Object, entry.payload))

+                 return -EBADMSG;

+ 

+         for (p = *i; p < offset + sz;) {

+                 uint8_t *d;

+                 uint8_t flags;

+                 uint64_t isz;

+ 

+                 /* `i` stores the absolute offset of the current inline entry item. We convert it to an

+                  * offset relative to the `payload` field of the entry object and add it to the `payload`

+                  * field to get a pointer to the current inline entry item. */

+                 d = e->entry.payload + p - offset - offsetof(Object, entry.payload);

+ 

+                 p += sizeof(uint8_t) + sizeof(uint32_t);

+                 if (p >= offset + sz)

+                         return -EBADMSG;

+ 

+                 flags = *d++;

+                 isz = unaligned_read_le32(d);

+                 d += sizeof(uint32_t);

+ 

+                 p += isz;

+                 if (p > offset + sz)

+                         return -EBADMSG;

+ 

+                 r = maybe_decompress_payload(

+                         f,

+                         d,

+                         isz,

+                         flags & OBJECT_COMPRESSION_MASK,

+                         field,

+                         field_length,

+                         data_threshold,

+                         ret_data,

+                         ret_size);

+                 if (r == -EBADMSG) {

+                         log_debug("Inline entry item has bad payload, skipping over it.");

+                         continue;

+                 }

+                 if (r < 0)

+                         return r;

+                 if (r == 0)

+                         continue;

+ 

+                 if (ret_offset)

+                         *ret_offset = 0;

+ 

+                 *i = p;

+ 

+                 return 1;

+         }

+ 

+         *i = p;

+ 

+         return 0;

+ }

+ 

+ static int journal_file_entry_item_next_compact(

+                 JournalFile *f,

+                 Object *e,

+                 uint64_t offset,

+                 uint64_t *i,

+                 const char *field,

+                 size_t field_length,

+                 size_t data_threshold,

+                 uint64_t *ret_offset,

+                 void **ret_data,

+                 size_t *ret_size) {

+ 

+         uint64_t p, sz;

+         int r;

+ 

+         if (*i == UINT64_MAX)

+                 return 0;

+ 

+         sz = le64toh(READ_NOW(e->object.size));

+         if (sz < offsetof(Object, entry.payload))

+                 return -EBADMSG;

+ 

+         p = *i == 0 ? le32toh(READ_NOW(e->entry.trie_offset)) : *i;

+ 

+         /* All of an entry's trie and data nodes are located before the entry object in the journal file. */

+         if (p > offset + sz)

+                 return -EBADMSG;

+ 

+         /* If the iterator is located inside the entry object's payload, we're already iterating the inline

+          * entry items so we skip the trie node logic. */

+         if (p < offset) {

+                 r = journal_file_entry_item_next_trie(

+                         f, e, offset, &p, field, field_length, data_threshold, ret_offset, ret_data, ret_size);

+                 if (r < 0)

+                         return r;

+ 

+                 /* If we've iterated all the trie nodes, set the iterator to the start of the inline entry

+                  * items. */

+                 if (p == 0)

+                         p = offset + offsetof(Object, entry.payload);

+ 

+                 if (r > 0) {

+                         *i = p;

+                         return r;

+                 }

+         }

+ 

+         r = journal_file_entry_item_next_inline(

+                 f, e, offset, &p, field, field_length, data_threshold, ret_offset, ret_data, ret_size);

          if (r < 0)

                  return r;

  

-         /* ... and link it in. */

-         o->data.next_field_offset = fo->field.head_data_offset;

-         fo->field.head_data_offset = le64toh(p);

+         /* If we finished with all the inline entry items, set the iterator to UINT64_MAX to indicate that

+          * we've finished iterating all the entry items. */

+         if (p == offset + sz)

+                 p = UINT64_MAX;

+ 

+         *i = p;

+ 

+         return r;

+ }

+ 

+ static int journal_file_entry_item_next_non_compact(

+                 JournalFile *f,

+                 Object *e,

+                 uint64_t offset,

+                 uint64_t *i,

+                 const char *field,

+                 size_t field_length,

+                 size_t data_threshold,

+                 uint64_t *ret_offset,

+                 void **ret_data,

+                 size_t *ret_size) {

+ 

+         uint64_t p, sz;

+         int r;

+ 

+         sz = le64toh(READ_NOW(e->object.size));

+         if (sz < offsetof(Object, entry.items))

+                 return -EBADMSG;

+ 

+         for (p = *i; p < (sz - offsetof(Object, entry.items)) / sizeof(EntryItem); p++) {

+                 uint64_t q;

+ 

+                 q = le64toh(e->entry.items[p].object_offset);

+ 

+                 r = journal_file_data_payload(

+                         f, NULL, q, field, field_length, data_threshold, ret_data, ret_size);

+                 if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) {

+                         log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", q);

+                         continue;

+                 }

+                 if (r < 0)

+                         return r;

+                 if (r == 0)

+                         continue;

+ 

+                 if (ret_offset)

+                         *ret_offset = q;

+ 

+                 *i = ++p;

+ 

+                 return 1;

+         }

+ 

+         *i = p;

+ 

+         return 0;

+ }

+ 

+ int journal_file_entry_item_next(

+                 JournalFile *f,

+                 Object *e,

+                 uint64_t offset,

+                 uint64_t *i,

+                 const char *field,

+                 size_t field_length,

+                 size_t data_threshold,

+                 uint64_t *ret_offset,

+                 void **ret_data,

+                 size_t *ret_size) {

+ 

+         /* Iterates over the entry items of the given entry. The output parameters return data about the Data

+          * object pointed at by the next entry item if requested.

+          *

+          * - If `ret_offset` is not NULL, it is set to the offset of the Data object. If the data is stored

+          *   inline in the entry object, `ret_offset` is set to 0.

+          * - If `ret_data` is not NULL, it is set to a pointer to the decompressed payload of the Data object

+          * - If `ret_size` is not NULL, it is set to the size of the decompressed payload of the Data object

+          *

+          * The iterator is stored in `i`. To start iterating from the start of the entry items, set `i` to

+          * zero. It is automatically updated by this function and should not be touched again unless you want

+          * to restart iterating over the entry items.

+          *

+          * If `field` and `field_length` are given, this function keeps iterating until it finds an entry

+          * item whose Data object payload starts with the given field, followed by the '=' character.

+          *

+          * If `data_threshold` is larger than zero, the decompressed payload is limited to `data_threshold`

+          * amount of bytes.

+          *

+          * This function returns a positive number if it successfully managed to find the next entry item. If

+          * no more entry items were available, or none of the remaining entry items were of the given field,

+          * it returns zero. If an error occurred, it returns a negative errno value.

+          */

+ 

+         int r;

+ 

+         assert(!e || e->object.type == OBJECT_ENTRY);

+         assert(offset);

+         assert(i);

+         assert(!field == (field_length == 0));

  

-         if (ret)

-                 *ret = o;

+         if (!e) {

+                 r = journal_file_move_to_object(f, OBJECT_ENTRY, offset, &e);

+                 if (r < 0)

+                         return r;

+         }

  

-         if (ret_offset)

-                 *ret_offset = p;

+         return JOURNAL_HEADER_COMPACT(f->header) ?

+                 journal_file_entry_item_next_compact(

+                         f, e, offset, i, field, field_length, data_threshold, ret_offset, ret_data, ret_size) :

+                 journal_file_entry_item_next_non_compact(

+                         f, e, offset, i, field, field_length, data_threshold, ret_offset, ret_data, ret_size);

+ }

  

-         return 0;

+ uint64_t journal_file_entry_xor_hash(JournalFile *f, Object *o) {

+         return JOURNAL_HEADER_COMPACT(f->header) ? le64toh(o->entry.xor_hash_compact) :

+                                                    le64toh(o->entry.xor_hash);

  }

  

- uint64_t journal_file_entry_n_items(Object *o) {

-         uint64_t sz;

-         assert(o);

+ int journal_file_entry_boot_id(JournalFile *f, Object *o, sd_id128_t *ret_boot_id) {

+         int r;

  

-         if (o->object.type != OBJECT_ENTRY)

-                 return 0;

+         if (JOURNAL_HEADER_COMPACT(f->header)) {

+                 r = journal_file_move_to_object(f, OBJECT_BOOT_ID, le32toh(o->entry.boot_id_offset), &o);

+                 if (r < 0)

+                         return r;

  

-         sz = le64toh(READ_NOW(o->object.size));

-         if (sz < offsetof(Object, entry.items))

-                 return 0;

+                 *ret_boot_id = o->boot_id.value;

+         } else

+                 *ret_boot_id = o->entry.boot_id;

  

-         return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem);

+         return 0;

  }

  

- uint64_t journal_file_entry_array_n_items(Object *o) {

+ uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) {

          uint64_t sz;

  

          assert(o);
@@ -1659,7 +2268,15 @@ 

          if (sz < offsetof(Object, entry_array.items))

                  return 0;

  

-         return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t);

+         return (sz - journal_file_entry_array_items_offset(f)) / journal_file_entry_array_item_size(f);

+ }

+ 

+ /* Returns item i of entry array object o in host byte order. Compact files store 32-bit little-endian

+  * items (widened to 64 bits here), all other files 64-bit ones. i is not bounds-checked here. */

+ uint64_t journal_file_entry_array_item(JournalFile *f, Object *o, size_t i) {

+         assert(o);

+         assert(o->object.type == OBJECT_ENTRY_ARRAY);

+ 

+         if (!JOURNAL_HEADER_COMPACT(f->header))

+                 return le64toh(o->entry_array.items[i]);

+ 

+         return (uint64_t) le32toh(o->entry_array.compact[i]);

  }

  

  uint64_t journal_file_hash_table_n_items(Object *o) {
@@ -1677,9 +2294,96 @@ 

          return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem);

  }

  

+ /* Looks up the trie node that represents exactly the item sequence items[0..n_items-1].

+  *

+  * The trie hash table bucket for 'hash' is walked; for every chained node whose stored hash equals 'hash'

+  * the parent links are followed upwards and compared against the items from last to first. A node matches

+  * if the parent chain ends (parent offset 0) exactly when all items have been consumed.

+  *

+  * Returns 1 and fills in *ret / *ret_offset (both optional) on a match, 0 if there is no such node or no

+  * trie hash table, and a negative error if an object cannot be accessed or the table size is invalid. */

+ static int journal_file_find_trie_object(

+                 JournalFile *f,

+                 uint64_t hash,

+                 const EntryItemEx *items,

+                 size_t n_items,

+                 Object **ret,

+                 uint64_t *ret_offset) {

+ 

+         uint64_t p, h, m, depth = 0;

+         int r;

+ 

+         assert(f);

+         assert(f->header);

+         assert(items);

+         assert(n_items > 0);

+ 

+         /* If there's no trie hash table, then there's no entry. */

+         if (le64toh(f->header->trie_hash_table_size) <= 0)

+                 return 0;

+ 

+         /* Map the trie hash table, if it isn't mapped yet. */

+         r = journal_file_map_trie_hash_table(f);

+         if (r < 0)

+                 return r;

+ 

+         m = le64toh(READ_NOW(f->header->trie_hash_table_size)) / sizeof(HashItem);

+         if (m <= 0)

+                 return -EBADMSG;

+ 

+         h = hash % m;

+         p = le64toh(f->trie_hash_table[h].head_hash_offset);

+ 

+         while (p > 0) {

+                 Object *o;

+ 

+                 r = journal_file_move_to_object(f, OBJECT_TRIE_NODE, p, &o);

+                 if (r < 0)

+                         return r;

+ 

+                 if (le64toh(o->trie_node.hash) == hash) {

+                         uint64_t q = p;

+                         size_t i = n_items - 1;

+ 

+                         /* Walk towards the root, comparing against the items back to front. i wraps to

+                          * SIZE_MAX once items[0] has been matched. */

+                         for (; i != SIZE_MAX && q != 0; i--) {

+                                 Object *t;

+ 

+                                 r = journal_file_move_to_object(f, OBJECT_TRIE_NODE, q, &t);

+                                 if (r < 0)

+                                         return r;

+ 

+                                 if (le32toh(t->trie_node.object_offset) != items[i].object_offset)

+                                         break;

+ 

+                                 q = le32toh(t->trie_node.parent_offset);

+                         }

+ 

+                         /* The walk above moved to other objects of the same type, which may have

+                          * invalidated 'o'. Re-acquire the chain node before touching it again. */

+                         r = journal_file_move_to_object(f, OBJECT_TRIE_NODE, p, &o);

+                         if (r < 0)

+                                 return r;

+ 

+                         if (i == SIZE_MAX && q == 0) {

+                                 if (ret)

+                                         *ret = o;

+ 

+                                 if (ret_offset)

+                                         *ret_offset = p;

+ 

+                                 return 1;

+                         }

+                 }

+ 

+                 r = next_hash_offset(

+                         f, &p, &o->trie_node.next_hash_offset, &depth, &f->header->trie_hash_chain_depth);

+                 if (r < 0)

+                         return r;

+         }

+ 

+         return 0;

+ }

+ 

+ /* Stores offset p in slot i of entry array object o. Compact files use the 32-bit little-endian item

+  * layout (p is passed through htole32()), all other files the 64-bit one. i is not bounds-checked here. */

+ static void write_entry_array_item(JournalFile *f, Object *o, uint64_t i, uint64_t p) {

+         assert(f);

+         assert(o);

+ 

+         if (!JOURNAL_HEADER_COMPACT(f->header)) {

+                 o->entry_array.items[i] = htole64(p);

+                 return;

+         }

+ 

+         o->entry_array.compact[i] = htole32(p);

+ }

+ 

  static int link_entry_into_array(JournalFile *f,

                                   le64_t *first,

                                   le64_t *idx,

+                                  le32_t *tail,

+                                  le32_t *tidx,

                                   uint64_t p) {

          int r;

          uint64_t n = 0, ap = 0, q, i, a, hidx;
@@ -1691,18 +2395,21 @@ 

          assert(idx);

          assert(p > 0);

  

-         a = le64toh(*first);

-         i = hidx = le64toh(READ_NOW(*idx));

+         a = tail ? le32toh(*tail) : le64toh(*first);

+         hidx = le64toh(READ_NOW(*idx));

+         i = tidx ? le32toh(READ_NOW(*tidx)) : hidx;

          while (a > 0) {

  

                  r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);

                  if (r < 0)

                          return r;

  

-                 n = journal_file_entry_array_n_items(o);

+                 n = journal_file_entry_array_n_items(f, o);

                  if (i < n) {

-                         o->entry_array.items[i] = htole64(p);

+                         write_entry_array_item(f, o, i, p);

                          *idx = htole64(hidx + 1);

+                         if (tidx)

+                                 *tidx = htole32(le32toh(*tidx) + 1);

                          return 0;

                  }

  
@@ -1720,7 +2427,7 @@ 

                  n = 4;

  

          r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,

-                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),

+                                        journal_file_entry_array_items_offset(f) + n * journal_file_entry_array_item_size(f),

                                         &o, &q);

          if (r < 0)

                  return r;
@@ -1731,7 +2438,7 @@ 

                  return r;

  #endif

  

-         o->entry_array.items[i] = htole64(p);

+         write_entry_array_item(f, o, i, p);

  

          if (ap == 0)

                  *first = htole64(q);
@@ -1743,10 +2450,15 @@ 

                  o->entry_array.next_entry_array_offset = htole64(q);

          }

  

+         if (tail)

+                 *tail = htole32(q);

+ 

          if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))

                  f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);

  

          *idx = htole64(hidx + 1);

+         if (tidx)

+                 *tidx = htole32(1);

  

          return 0;

  }
@@ -1755,6 +2467,8 @@ 

                                            le64_t *extra,

                                            le64_t *first,

                                            le64_t *idx,

+                                           le32_t *tail,

+                                           le32_t *tidx,

                                            uint64_t p) {

  

          uint64_t hidx;
@@ -1775,7 +2489,7 @@ 

                  le64_t i;

  

                  i = htole64(hidx - 1);

-                 r = link_entry_into_array(f, first, &i, p);

+                 r = link_entry_into_array(f, first, &i, tail, tidx, p);

                  if (r < 0)

                          return r;

          }
@@ -1784,15 +2498,13 @@ 

          return 0;

  }

  

- static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {

-         uint64_t p;

+ static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t p) {

          int r;

  

          assert(f);

          assert(o);

          assert(offset > 0);

  

-         p = le64toh(o->entry.items[i].object_offset);

          r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);

          if (r < 0)

                  return r;
@@ -1801,11 +2513,13 @@ 

                                                &o->data.entry_offset,

                                                &o->data.entry_array_offset,

                                                &o->data.n_entries,

+                                               JOURNAL_HEADER_COMPACT(f->header) ? &o->data.tail_entry_array_offset : NULL,

+                                               JOURNAL_HEADER_COMPACT(f->header) ? &o->data.tail_entry_array_n_entries : NULL,

                                                offset);

  }

  

- static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {

-         uint64_t n;

+ static int journal_file_link_entry(

+                 JournalFile *f, Object *o, uint64_t offset, const EntryItemEx items[], size_t n_items) {

          int r;

  

          assert(f);
@@ -1822,6 +2536,8 @@ 

          r = link_entry_into_array(f,

                                    &f->header->entry_array_offset,

                                    &f->header->n_entries,

+                                   JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_offset) ? &f->header->tail_entry_array_offset : NULL,

+                                   JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_n_entries) ? &f->header->tail_entry_array_n_entries : NULL,

                                    offset);

          if (r < 0)

                  return r;
@@ -1835,15 +2551,17 @@ 

          f->header->tail_entry_monotonic = o->entry.monotonic;

  

          /* Link up the items */

-         n = journal_file_entry_n_items(o);

-         for (uint64_t i = 0; i < n; i++) {

+         for (uint64_t i = 0; i < n_items; i++) {

                  int k;

  

+                 if (!items[i].indexed)

+                         continue;

+ 

                  /* If we fail to link an entry item because we can't allocate a new entry array, don't fail

                   * immediately but try to link the other entry items since it might still be possible to link

                   * those if they don't require a new entry array to be allocated. */

  

-                 k = journal_file_link_entry_item(f, o, offset, i);

+                 k = journal_file_link_entry_item(f, o, offset, items[i].object_offset);

                  if (k == -E2BIG)

                          r = k;

                  else if (k < 0)
@@ -1857,12 +2575,11 @@ 

                  JournalFile *f,

                  const dual_timestamp *ts,

                  const sd_id128_t *boot_id,

-                 uint64_t xor_hash,

-                 const EntryItem items[], unsigned n_items,

+                 const EntryItemEx items[], size_t n_items,

+                 const struct iovec inlined[], size_t n_inlined,

                  uint64_t *seqnum,

                  Object **ret, uint64_t *ret_offset) {

-         uint64_t np;

-         uint64_t osize;

+         uint64_t np, osize, parent_offset = 0, xor_hash = 0;

          Object *o;

          int r;

  
@@ -1871,20 +2588,103 @@ 

          assert(items || n_items == 0);

          assert(ts);

  

-         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));

+         if (!JOURNAL_HEADER_COMPACT(f->header))

+                 assert(n_inlined == 0);

+ 

+         for (uint64_t i = 0; i < n_items; i++)

+                 xor_hash ^= items[i].xor_hash;

+ 

+         if (JOURNAL_HEADER_COMPACT(f->header)) {

+                 size_t i;

+ 

+                 for (i = n_items - 1; i != SIZE_MAX; i--) {

+                         r = journal_file_find_trie_object(f, xor_hash, items, i + 1, NULL, &parent_offset);

+                         if (r < 0)

+                                 return r;

+                         if (r > 0)

+                                 break;

+ 

+                         xor_hash ^= items[i].xor_hash; /* Remove hash from XOR hash. */

+                 }

+ 

+                 for (i += 1; i < n_items; i++) {

+                         uint64_t p;

+ 

+                         xor_hash ^= items[i].xor_hash; /* Add hash back to XOR hash. */

+ 

+                         r = journal_file_append_trie_node(

+                                 f, xor_hash, parent_offset, items[i].object_offset, NULL, &p);

+                         if (r < 0)

+                                 return r;

+ 

+                         parent_offset = p;

+                 }

+         }

+ 

+         osize = JOURNAL_HEADER_COMPACT(f->header) ?

+                 offsetof(Object, entry.payload) :

+                 offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));

+ 

+         for (unsigned i = 0; i < n_inlined; i++) {

+                 xor_hash ^= jenkins_hash64(inlined[i].iov_base, inlined[i].iov_len);

+                 osize += sizeof(uint8_t) + sizeof(uint32_t) + inlined[i].iov_len;

+         }

  

          r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);

          if (r < 0)

                  return r;

  

          o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));

-         memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));

+ 

+         if (JOURNAL_HEADER_COMPACT(f->header)) {

+                 uint8_t *p = o->entry.payload;

+                 o->entry.trie_offset = htole32(parent_offset);

+ 

+                 for (unsigned i = 0; i < n_inlined; i++) {

+                         int compression = 0;

+                         size_t rsize;

+ 

+                         /* The format per inlined item is: flags (8-bit), size (32-bit), data (optionally

+                          * compressed). */

+ 

+                         compression = maybe_compress_payload(f, p + sizeof(uint8_t) + sizeof(uint32_t),

+                                                              inlined[i].iov_base, inlined[i].iov_len, &rsize);

+ 

+                         if (compression > 0) {

+                                 *p++ = compression;

+                                 unaligned_write_le32(p, rsize);

+                                 p += sizeof(uint32_t) + rsize;

+                         } else {

+                                 *p++ = 0;

+                                 unaligned_write_le32(p, inlined[i].iov_len);

+                                 p += sizeof(uint32_t);

+                                 memcpy_safe(p, inlined[i].iov_base, inlined[i].iov_len);

+                                 p += inlined[i].iov_len;

+                         }

+                 }

+ 

+                 o->object.size = htole64(offsetof(Object, entry.payload) + (p - o->entry.payload));

+         } else

+                 for (size_t i = 0; i < n_items; i++)

+                         o->entry.items[i] = (EntryItem){ .object_offset = htole64(items[i].object_offset),

+                                                          .hash = htole64(items[i].hash) };

+ 

          o->entry.realtime = htole64(ts->realtime);

          o->entry.monotonic = htole64(ts->monotonic);

-         o->entry.xor_hash = htole64(xor_hash);

-         if (boot_id)

-                 f->header->boot_id = *boot_id;

-         o->entry.boot_id = f->header->boot_id;

+ 

+         if (boot_id) {

+                 r = journal_file_refresh_boot_id(f, boot_id);

+                 if (r < 0)

+                         return r;

+         }

+ 

+         if (JOURNAL_HEADER_COMPACT(f->header)) {

+                 o->entry.xor_hash_compact = htole64(xor_hash);

+                 o->entry.boot_id_offset = htole32(le64toh(f->header->boot_id_offset));

+         } else {

+                 o->entry.xor_hash = htole64(xor_hash);

+                 o->entry.boot_id = f->header->boot_id;

+         }

  

  #if HAVE_GCRYPT

          r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
@@ -1892,7 +2692,7 @@ 

                  return r;

  #endif

  

-         r = journal_file_link_entry(f, o, np);

+         r = journal_file_link_entry(f, o, np, items, n_items);

          if (r < 0)

                  return r;

  
@@ -1987,13 +2787,11 @@ 

          return r;

  }

  

- static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {

-         return CMP(le64toh(a->object_offset), le64toh(b->object_offset));

+ static int entry_item_cmp(const EntryItemEx *a, const EntryItemEx *b) { /* Sort comparator: orders entry items by their object offset. */

+         return CMP(a->object_offset, b->object_offset); /* offsets are compared directly, with no byte-order conversion */

  }

  

- static size_t remove_duplicate_entry_items(EntryItem items[], size_t n) {

- 

-         /* This function relies on the items array being sorted. */

+ static size_t remove_duplicate_entry_items(EntryItemEx items[], size_t n) {

          size_t j = 1;

  

          if (n <= 1)
@@ -2006,6 +2804,30 @@ 

          return j;

  }

  

+ /* Appends (or looks up) the field object for a "FIELD=value" payload.

+  *

+  * The field name is everything in data[0..size) before the first '='; -EINVAL is returned if there is no

+  * '='. The return value of journal_file_append_field() is passed through. In compact mode a positive

+  * return (treated here as "field newly added") gets the FIELD_INDEXED flag set on the field object.

+  *

+  * ret and ret_offset are both optional. */

+ static int journal_file_append_field_from_data(

+                 JournalFile *f,

+                 const char *data,

+                 size_t size,

+                 Object **ret,

+                 uint64_t *ret_offset) {

+         const void *eq;

+         Object *fo;

+         int r;

+ 

+         assert(f);

+ 

+         eq = memchr(data, '=', size);

+         if (!eq)

+                 return -EINVAL;

+ 

+         /* Always fetch the object into a local, so that the flag update below does not depend on the

+          * caller having asked for the object. */

+         r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, ret_offset);

+         if (r < 0)

+                 return r;

+ 

+         /* In compact mode, only index newly added fields. */

+         if (JOURNAL_HEADER_COMPACT(f->header) && r > 0)

+                 fo->object.flags |= FIELD_INDEXED;

+ 

+         if (ret)

+                 *ret = fo;

+ 

+         return r;

+ }

+ 

  int journal_file_append_entry(

                  JournalFile *f,

                  const dual_timestamp *ts,
@@ -2014,10 +2836,11 @@ 

                  uint64_t *seqnum,

                  Object **ret, uint64_t *ret_offset) {

  

-         EntryItem *items;

-         int r;

-         uint64_t xor_hash = 0;

+         EntryItemEx *items;

+         struct iovec *inlined;

+         size_t n_items = 0, n_inlined = 0;

          struct dual_timestamp _ts;

+         int r;

  

          assert(f);

          assert(f->header);
@@ -2043,16 +2866,32 @@ 

                  return r;

  #endif

  

-         items = newa(EntryItem, n_iovec);

+         items = newa(EntryItemEx, n_iovec);

+         inlined = newa(struct iovec, n_iovec);

  

          for (size_t i = 0; i < n_iovec; i++) {

                  uint64_t p;

-                 Object *o;

+                 Object *o, *fo;

+ 

+                 r = journal_file_append_field_from_data(f, iovec[i].iov_base, iovec[i].iov_len, &fo, NULL);

+                 if (r < 0)

+                         return r;

+ 

+                 if (FLAGS_SET(fo->object.flags, FIELD_UNIQUE)) {

+                         inlined[n_inlined++] = iovec[i];

+                         continue;

+                 }

  

                  r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);

                  if (r < 0)

                          return r;

  

+                 if (r > 0) {

+                         /* Link data object into the field object. */

+                         o->data.next_field_offset = fo->field.head_data_offset;

+                         fo->field.head_data_offset = le64toh(p);

+                 }

+ 

                  /* When calculating the XOR hash field, we need to take special care if the "keyed-hash"

                   * journal file flag is on. We use the XOR hash field to quickly determine the identity of a

                   * specific record, and give records with otherwise identical position (i.e. match in seqno,
@@ -2062,23 +2901,24 @@ 

                   * are completely identical (they include the XOR hash after all). For classic Jenkins-hash

                   * files things are easier, we can just take the value from the stored record directly. */

  

-                 if (JOURNAL_HEADER_KEYED_HASH(f->header))

-                         xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len);

-                 else

-                         xor_hash ^= le64toh(o->data.hash);

- 

-                 items[i] = (EntryItem) {

-                         .object_offset = htole64(p),

-                         .hash = o->data.hash,

+                 items[n_items++] = (EntryItemEx){

+                         .object_offset = p,

+                         .hash = le64toh(o->data.hash),

+                         .xor_hash = JOURNAL_HEADER_KEYED_HASH(f->header) ?

+                                 jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len) :

+                                 le64toh(o->data.hash),

+                         .indexed = !JOURNAL_HEADER_COMPACT(f->header) ||

+                                 FLAGS_SET(fo->object.flags, FIELD_INDEXED),

                  };

          }

  

          /* Order by the position on disk, in order to improve seek

           * times for rotating media. */

-         typesafe_qsort(items, n_iovec, entry_item_cmp);

-         n_iovec = remove_duplicate_entry_items(items, n_iovec);

+         typesafe_qsort(items, n_items, entry_item_cmp);

+         n_items = remove_duplicate_entry_items(items, n_items);

  

-         r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, ret_offset);

+         r = journal_file_append_entry_internal(

+                 f, ts, boot_id, items, n_items, inlined, n_inlined, seqnum, ret, ret_offset);

  

          /* If the memory mapping triggered a SIGBUS then we return an

           * IO error and ignore the error code passed down to us, since
@@ -2245,7 +3085,7 @@ 

                  if (r < 0)

                          return r;

  

-                 k = journal_file_entry_array_n_items(o);

+                 k = journal_file_entry_array_n_items(f, o);

                  if (i < k)

                          break;

  
@@ -2265,7 +3105,7 @@ 

                          if (r < 0)

                                  return r;

  

-                         k = journal_file_entry_array_n_items(o);

+                         k = journal_file_entry_array_n_items(f, o);

                          if (k == 0)

                                  break;

  
@@ -2273,12 +3113,12 @@ 

                  }

  

                  do {

-                         p = le64toh(o->entry_array.items[i]);

+                         p = journal_file_entry_array_item(f, o, i);

  

                          r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);

                          if (r >= 0) {

                                  /* Let's cache this item for the next invocation */

-                                 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);

+                                 chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, o, 0), t, i);

  

                                  if (ret_offset)

                                          *ret_offset = p;
@@ -2393,13 +3233,13 @@ 

                  if (r < 0)

                          return r;

  

-                 k = journal_file_entry_array_n_items(array);

+                 k = journal_file_entry_array_n_items(f, array);

                  right = MIN(k, n);

                  if (right <= 0)

                          return 0;

  

                  i = right - 1;

-                 lp = p = le64toh(array->entry_array.items[i]);

+                 lp = p = journal_file_entry_array_item(f, array, i);

                  if (p <= 0)

                          r = -EBADMSG;

                  else
@@ -2432,7 +3272,7 @@ 

                                  if (last_index > 0) {

                                          uint64_t x = last_index - 1;

  

-                                         p = le64toh(array->entry_array.items[x]);

+                                         p = journal_file_entry_array_item(f, array, x);

                                          if (p <= 0)

                                                  return -EBADMSG;

  
@@ -2452,7 +3292,7 @@ 

                                  if (last_index < right) {

                                          uint64_t y = last_index + 1;

  

-                                         p = le64toh(array->entry_array.items[y]);

+                                         p = journal_file_entry_array_item(f, array, y);

                                          if (p <= 0)

                                                  return -EBADMSG;

  
@@ -2482,7 +3322,7 @@ 

                                  assert(left < right);

                                  i = (left + right) / 2;

  

-                                 p = le64toh(array->entry_array.items[i]);

+                                 p = journal_file_entry_array_item(f, array, i);

                                  if (p <= 0)

                                          r = -EBADMSG;

                                  else
@@ -2530,14 +3370,14 @@ 

                  return 0;

  

          /* Let's cache this item for the next invocation */

-         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);

+         chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, array, 0), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);

  

          if (subtract_one && i == 0)

                  p = last_p;

          else if (subtract_one)

-                 p = le64toh(array->entry_array.items[i-1]);

+                 p = journal_file_entry_array_item(f, array, i - 1);

          else

-                 p = le64toh(array->entry_array.items[i]);

+                 p = journal_file_entry_array_item(f, array, i);

  

          if (ret) {

                  r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
@@ -2810,14 +3650,21 @@ 

          f->current_xor_hash = 0;

  }

  

- void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {

+ int journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) { /* Records entry o (located at offset) as the file's current location; returns 0, or the negative error of the boot ID lookup. */

+         int r;

+ 

+         r = journal_file_entry_boot_id(f, o, &f->current_boot_id); /* done first: this is the only step that can fail, so on error the other location fields stay untouched */

+         if (r < 0)

+                 return r;

+ 

          f->location_type = LOCATION_SEEK;

          f->current_offset = offset;

          f->current_seqnum = le64toh(o->entry.seqnum);

          f->current_realtime = le64toh(o->entry.realtime);

          f->current_monotonic = le64toh(o->entry.monotonic);

-         f->current_boot_id = o->entry.boot_id;

-         f->current_xor_hash = le64toh(o->entry.xor_hash);

+         f->current_xor_hash = journal_file_entry_xor_hash(f, o); /* picks the compact or regular hash field depending on the file header */

+ 

+         return 0;

  }

  

  int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
@@ -3204,11 +4051,12 @@ 

                 "Sequential number ID: %s\n"

                 "State: %s\n"

                 "Compatible flags:%s%s\n"

-                "Incompatible flags:%s%s%s%s%s\n"

+                "Incompatible flags:%s%s%s%s%s%s\n"

                 "Header size: %"PRIu64"\n"

                 "Arena size: %"PRIu64"\n"

                 "Data hash table size: %"PRIu64"\n"

                 "Field hash table size: %"PRIu64"\n"

+                "Trie hash table size: %"PRIu64"\n"

                 "Rotate suggested: %s\n"

                 "Head sequential number: %"PRIu64" (%"PRIx64")\n"

                 "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
@@ -3231,11 +4079,13 @@ 

                 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",

                 JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "",

                 JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "",

+                JOURNAL_HEADER_COMPACT(f->header) ? " COMPACT" : "",

                 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",

                 le64toh(f->header->header_size),

                 le64toh(f->header->arena_size),

                 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),

                 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),

+                le64toh(f->header->trie_hash_table_size) / sizeof(HashItem),

                 yes_no(journal_file_rotate_suggested(f, 0, LOG_DEBUG)),

                 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),

                 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
@@ -3257,6 +4107,12 @@ 

                         le64toh(f->header->n_fields),

                         100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));

  

+         if (JOURNAL_HEADER_CONTAINS(f->header, n_trie_nodes))

+                 printf("Trie Node objects: %"PRIu64"\n"

+                        "Trie Node hash table fill: %.1f%%\n",

+                        le64toh(f->header->n_trie_nodes),

+                        100.0 * (double) le64toh(f->header->n_trie_nodes) / ((double) (le64toh(f->header->trie_hash_table_size) / sizeof(HashItem))));

+ 

          if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))

                  printf("Tag objects: %"PRIu64"\n",

                         le64toh(f->header->n_tags));
@@ -3272,6 +4128,10 @@ 

                  printf("Deepest data hash chain: %" PRIu64"\n",

                         f->header->data_hash_chain_depth);

  

+         if (JOURNAL_HEADER_CONTAINS(f->header, trie_hash_chain_depth))

+                 printf("Deepest trie hash chain: %" PRIu64"\n",

+                        f->header->trie_hash_chain_depth);

+ 

          if (fstat(f->fd, &st) >= 0)

                  printf("Disk usage: %s\n", FORMAT_BYTES((uint64_t) st.st_blocks * 512ULL));

  }
@@ -3311,6 +4171,80 @@ 

          return 1;

  }

  

+ /* Creates a field object for every name listed in $SYSTEMD_JOURNAL_UNIQUE_FIELDS and flags it with

+  * FIELD_UNIQUE. If the variable is unset, the list defaults to "MESSAGE".

+  *

+  * Invalid field names are skipped with a debug message. A list that cannot be parsed stops processing and

+  * is otherwise ignored (0 is returned). Returns 0 once the list is exhausted, or a negative error on OOM

+  * or if a field object cannot be appended. */

+ static int add_unique_fields(JournalFile *f) {

+         const char *e;

+         int r;

+ 

+         e = getenv("SYSTEMD_JOURNAL_UNIQUE_FIELDS");

+         if (!e)

+                 e = "MESSAGE";

+ 

+         for (const char *p = e;;) {

+                 Object *o;

+                 _cleanup_free_ char *word = NULL;

+ 

+                 r = extract_first_word(&p, &word, NULL, 0);

+                 if (r == 0)

+                         return 0;

+                 if (r == -ENOMEM)

+                         return log_oom();

+                 if (r < 0) {

+                         /* The message names the variable that is actually read above. */

+                         log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_UNIQUE_FIELDS environment variable, ignoring: %m");

+                         return 0;

+                 }

+ 

+                 if (!journal_field_valid(word, strlen(word), true)) {

+                         log_debug("Invalid field name in $SYSTEMD_JOURNAL_UNIQUE_FIELDS environment variable, ignoring: %s", word);

+                         continue;

+                 }

+ 

+                 r = journal_file_append_field(f, word, strlen(word), &o, NULL);

+                 if (r < 0)

+                         return r;

+ 

+                 o->object.flags |= FIELD_UNIQUE;

+         }

+ 

+         return 0;

+ }

+ 

+ /* Creates a field object for every name listed in $SYSTEMD_JOURNAL_NON_INDEXED_FIELDS, without setting

+  * FIELD_INDEXED on it. Does nothing if the variable is unset.

+  *

+  * Invalid field names are skipped with a debug message. A list that cannot be parsed stops processing and

+  * is otherwise ignored (0 is returned). Returns 0 once the list is exhausted, or a negative error on OOM

+  * or if a field object cannot be appended. */

+ static int add_non_indexed_fields(JournalFile *f) {

+         const char *e;

+         int r;

+ 

+         e = getenv("SYSTEMD_JOURNAL_NON_INDEXED_FIELDS");

+         if (!e)

+                 return 0;

+ 

+         for (const char *p = e;;) {

+                 _cleanup_free_ char *word = NULL;

+ 

+                 r = extract_first_word(&p, &word, NULL, 0);

+                 if (r == 0)

+                         return 0;

+                 if (r == -ENOMEM)

+                         return log_oom();

+                 if (r < 0) {

+                         /* The message names the variable that is actually read above. */

+                         log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_NON_INDEXED_FIELDS environment variable, ignoring: %m");

+                         return 0;

+                 }

+ 

+                 if (!journal_field_valid(word, strlen(word), true)) {

+                         log_debug("Invalid field name in $SYSTEMD_JOURNAL_NON_INDEXED_FIELDS environment variable, ignoring: %s", word);

+                         continue;

+                 }

+ 

+                 /* By default, all fields are created with the FIELD_INDEXED flag, indicating they should be

+                  * indexed. By creating the fields here but not setting the FIELD_INDEXED flag, we make sure

+                  * they aren't indexed. */

+                 r = journal_file_append_field(f, word, strlen(word), NULL, NULL);

+                 if (r < 0)

+                         return r;

+         }

+ 

+         return 0;

+ }

+ 

  int journal_file_open(

                  int fd,

                  const char *fname,
@@ -3375,6 +4309,14 @@ 

          } else

                  f->keyed_hash = r;

  

+         r = getenv_bool("SYSTEMD_JOURNAL_COMPACT");

+         if (r < 0) {

+                 if (r != -ENXIO)

+                         log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_COMPACT environment variable, ignoring: %m");

+                 f->compact = true;

+         } else

+                 f->compact = r;

+ 

          if (DEBUG_LOGGING) {

                  static int last_seal = -1, last_compress = -1, last_keyed_hash = -1;

                  static uint64_t last_bytes = UINT64_MAX;
@@ -3536,6 +4478,20 @@ 

                  if (r < 0)

                          goto fail;

  

+                 if (JOURNAL_HEADER_COMPACT(f->header)) {

+                         r = journal_file_setup_trie_hash_table(f);

+                         if (r < 0)

+                                 goto fail;

+ 

+                         r = add_unique_fields(f);

+                         if (r < 0)

+                                 goto fail;

+ 

+                         r = add_non_indexed_fields(f);

+                         if (r < 0)

+                                 goto fail;

+                 }

+ 

  #if HAVE_GCRYPT

                  r = journal_file_append_first_tag(f);

                  if (r < 0)
@@ -3647,10 +4603,12 @@ 

  }

  

  int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {

-         uint64_t q, n, xor_hash = 0;

-         const sd_id128_t *boot_id;

+         size_t n = 0;

+         sd_id128_t boot_id;

          dual_timestamp ts;

-         EntryItem *items;

+         EntryItemEx *items;

+         struct iovec *inlined;

+         size_t n_items = 0, n_inlined = 0;

          int r;

  

          assert(from);
@@ -3665,77 +4623,84 @@ 

                  .monotonic = le64toh(o->entry.monotonic),

                  .realtime = le64toh(o->entry.realtime),

          };

-         boot_id = &o->entry.boot_id;

- 

-         n = journal_file_entry_n_items(o);

-         items = newa(EntryItem, n);

  

-         for (uint64_t i = 0; i < n; i++) {

-                 uint64_t l, h;

-                 size_t t;

-                 void *data;

-                 Object *u;

- 

-                 q = le64toh(o->entry.items[i].object_offset);

+         r = journal_file_entry_boot_id(from, o, &boot_id);

+         if (r < 0)

+                 return r;

  

-                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);

+         for (uint64_t i = 0;;) {

+                 r = journal_file_entry_item_next(from, o, p, &i, NULL, 0, 0, NULL, NULL, NULL);

                  if (r < 0)

                          return r;

+                 if (r == 0)

+                         break;

  

-                 l = le64toh(READ_NOW(o->object.size));

-                 if (l < offsetof(Object, data.payload))

-                         return -EBADMSG;

- 

-                 l -= offsetof(Object, data.payload);

-                 t = (size_t) l;

- 

-                 /* We hit the limit on 32bit machines */

-                 if ((uint64_t) t != l)

-                         return -E2BIG;

+                 n++;

+         }

  

-                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {

- #if HAVE_COMPRESSION

-                         size_t rsize = 0;

+         items = newa(EntryItemEx, n);

+         inlined = newa(struct iovec, n);

  

-                         r = decompress_blob(

-                                         o->object.flags & OBJECT_COMPRESSION_MASK,

-                                         o->data.payload, l,

-                                         &from->compress_buffer, &rsize,

-                                         0);

-                         if (r < 0)

-                                 return r;

+         for (uint64_t i = 0, j = 0;; j++) {

+                 uint64_t h;

+                 void *data;

+                 size_t l;

+                 Object *u, *fo;

  

-                         data = from->compress_buffer;

-                         l = rsize;

- #else

-                         return -EPROTONOSUPPORT;

- #endif

-                 } else

-                         data = o->data.payload;

+                 r = journal_file_entry_item_next(from, o, p, &i, NULL, 0, 0, NULL, &data, &l);

+                 if (r < 0)

+                         goto finish;

+                 if (r == 0)

+                         break;

  

                  if (l == 0)

                          return -EBADMSG;

  

-                 r = journal_file_append_data(to, data, l, &u, &h);

+                 r = journal_file_append_field_from_data(to, data, l, &fo, NULL);

                  if (r < 0)

-                         return r;

+                         goto finish;

  

-                 if (JOURNAL_HEADER_KEYED_HASH(to->header))

-                         xor_hash ^= jenkins_hash64(data, l);

-                 else

-                         xor_hash ^= le64toh(u->data.hash);

+                 if (FLAGS_SET(fo->object.flags, FIELD_UNIQUE)) {

+                         struct iovec iovec = {

+                                 .iov_base = memdup(data, l),

+                                 .iov_len = l,

+                         };

  

-                 items[i] = (EntryItem) {

-                         .object_offset = htole64(h),

-                         .hash = u->data.hash,

-                 };

+                         if (!iovec.iov_base) {

+                                 r = -ENOMEM;

+                                 goto finish;

+                         }

+ 

+                         inlined[n_inlined++] = iovec;

+                         continue;

+                 }

  

-                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);

+                 r = journal_file_append_data(to, data, l, &u, &h);

                  if (r < 0)

-                         return r;

+                         goto finish;

+ 

+                 if (r > 0) {

+                         /* Link data object into the field object. */

+                         u->data.next_field_offset = fo->field.head_data_offset;

+                         fo->field.head_data_offset = le64toh(h);

+                 }

+ 

+                 items[n_items++] = (EntryItemEx){

+                         .object_offset = h,

+                         .hash = le64toh(u->data.hash),

+                         .xor_hash = JOURNAL_HEADER_KEYED_HASH(to->header) ? jenkins_hash64(data, l) :

+                                                                             le64toh(u->data.hash),

+                         .indexed = !JOURNAL_HEADER_COMPACT(to->header) ||

+                                 FLAGS_SET(fo->object.flags, FIELD_INDEXED),

+                 };

          }

  

-         r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, NULL, NULL, NULL);

+         r = journal_file_append_entry_internal(

+                 to, &ts, &boot_id, items, n_items, inlined, n_inlined, NULL, NULL, NULL);

+ 

+ finish:

+         for (size_t i = 0; i < n_inlined; i++)

+                 free(inlined[i].iov_base);

  

          if (mmap_cache_fd_got_sigbus(to->cache_fd))

                  return -EIO;
@@ -3958,6 +4923,14 @@ 

                  return true;

          }

  

+         if (JOURNAL_HEADER_CONTAINS(f->header, trie_hash_chain_depth) &&

+             le64toh(f->header->trie_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {

+                 log_full(log_level,

+                          "Trie hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.",

+                          f->path, le64toh(f->header->trie_hash_chain_depth));

+                 return true;

+         }

+ 

          /* Are the data objects properly indexed by field objects? */

          if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&

              JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
@@ -3995,6 +4968,9 @@ 

          [OBJECT_FIELD_HASH_TABLE] = "field hash table",

          [OBJECT_ENTRY_ARRAY] = "entry array",

          [OBJECT_TAG] = "tag",

+         [OBJECT_TRIE_NODE] = "trie node",

+         [OBJECT_TRIE_HASH_TABLE] = "trie hash table",

+         [OBJECT_BOOT_ID] = "boot id",

  };

  

  DEFINE_STRING_TABLE_LOOKUP_TO_STRING(journal_object_type, ObjectType);

@@ -71,6 +71,7 @@ 

          bool close_fd:1;

          bool archive:1;

          bool keyed_hash:1;

+         bool compact:1;

  

          direction_t last_direction;

          LocationType location_type;
@@ -83,6 +84,7 @@ 

          Header *header;

          HashItem *data_hash_table;

          HashItem *field_hash_table;

+         HashItem *trie_hash_table;

  

          uint64_t current_offset;

          uint64_t current_seqnum;
@@ -150,6 +152,7 @@ 

  /* Use six characters to cover the offsets common in smallish journal

   * files without adding too many zeros. */

  #define OFSfmt "%06"PRIx64

+ #define OFSfmt32 "%06"PRIx32

  

  static inline bool VALID_REALTIME(uint64_t u) {

          /* This considers timestamps until the year 3112 valid. That should be plenty room... */
@@ -184,14 +187,50 @@ 

  #define JOURNAL_HEADER_KEYED_HASH(h) \

          FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_KEYED_HASH)

  

+ #define JOURNAL_HEADER_COMPACT(h) \

+         FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPACT) /* true if header h has the HEADER_INCOMPATIBLE_COMPACT bit set in its (little-endian) incompatible_flags */

+ 

  int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret);

  int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret);

  

  int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset);

  int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset);

  

- uint64_t journal_file_entry_n_items(Object *o) _pure_;

- uint64_t journal_file_entry_array_n_items(Object *o) _pure_;

+ int journal_file_data_payload(

+                 JournalFile *f,

+                 Object *o,

+                 uint64_t offset,

+                 const char *field,

+                 size_t field_length,

+                 size_t data_threshold,

+                 void **ret_data,

+                 size_t *ret_size);

+ 

+ static inline size_t journal_file_data_payload_offset(JournalFile *f) { /* Byte offset of the payload inside a data Object for file f. */

+         return JOURNAL_HEADER_COMPACT(f->header) ? offsetof(Object, data.compact) : offsetof(Object, data.payload); /* compact files use the data.compact member, others data.payload */

+ }

+ 

+ static inline uint8_t* journal_file_data_payload_field(JournalFile *f, Object *o) { /* Pointer to the payload bytes of data object o, as laid out in file f. */

+         return JOURNAL_HEADER_COMPACT(f->header) ? o->data.compact : o->data.payload; /* member choice mirrors journal_file_data_payload_offset() */

+ }

+ 

+ int journal_file_entry_item_next(

+                 JournalFile *f,

+                 Object *e,

+                 uint64_t offset,

+                 uint64_t *i,

+                 const char *field,

+                 size_t field_length,

+                 size_t data_threshold,

+                 uint64_t *ret_offset,

+                 void **ret_data,

+                 size_t *ret_size);

+ 

+ uint64_t journal_file_entry_xor_hash(JournalFile *f, Object *o);

+ int journal_file_entry_boot_id(JournalFile *f, Object *o, sd_id128_t *ret_boot_id);

+ 

+ uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) _pure_;

+ uint64_t journal_file_entry_array_item(JournalFile *f, Object *o, size_t i) _pure_;

  uint64_t journal_file_hash_table_n_items(Object *o) _pure_;

  

  int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *ret_offset);
@@ -204,6 +243,14 @@ 

                  Object **ret,

                  uint64_t *ret_offset);

  

+ static inline size_t journal_file_entry_array_items_offset(JournalFile *f) { /* Byte offset of the first item inside an entry array Object for file f. */

+         return JOURNAL_HEADER_COMPACT(f->header) ? offsetof(Object, entry_array.compact) : offsetof(Object, entry_array.items); /* compact files use entry_array.compact, others entry_array.items */

+ }

+ 

+ static inline size_t journal_file_entry_array_item_size(JournalFile *f) { /* Size in bytes of one entry array item for file f. */

+         return JOURNAL_HEADER_COMPACT(f->header) ? sizeof(le32_t) : sizeof(le64_t); /* le32_t items in compact files, le64_t items otherwise */

+ }

+ 

  int journal_file_find_data_object(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *ret_offset);

  int journal_file_find_data_object_with_hash(JournalFile *f, const void *data, uint64_t size, uint64_t hash, Object **ret, uint64_t *ret_offset);

  
@@ -211,7 +258,7 @@ 

  int journal_file_find_field_object_with_hash(JournalFile *f, const void *field, uint64_t size, uint64_t hash, Object **ret, uint64_t *ret_offset);

  

  void journal_file_reset_location(JournalFile *f);

- void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset);

+ int journal_file_save_location(JournalFile *f, Object *o, uint64_t offset);

  int journal_file_compare_locations(JournalFile *af, JournalFile *bf);

  int journal_file_next_entry(JournalFile *f, uint64_t p, direction_t direction, Object **ret, uint64_t *ret_offset);

  

@@ -159,7 +159,7 @@ 

                  uint64_t h1, h2;

                  int r;

  

-                 if (le64toh(o->data.entry_offset) == 0)

+                 if (!JOURNAL_HEADER_COMPACT(f->header) && le64toh(o->data.entry_offset) == 0)

                          warning(offset, "Unused data (entry_offset==0)");

  

                  if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
@@ -167,16 +167,16 @@ 

                          return -EBADMSG;

                  }

  

-                 if (le64toh(o->object.size) - offsetof(Object, data.payload) <= 0) {

+                 if (le64toh(o->object.size) - journal_file_data_payload_offset(f) <= 0) {

                          error(offset, "Bad object size (<= %zu): %"PRIu64,

-                               offsetof(Object, data.payload),

+                               journal_file_data_payload_offset(f),

                                le64toh(o->object.size));

                          return -EBADMSG;

                  }

  

                  h1 = le64toh(o->data.hash);

-                 r = hash_payload(f, o, offset, o->data.payload,

-                                  le64toh(o->object.size) - offsetof(Object, data.payload),

+                 r = hash_payload(f, o, offset, journal_file_data_payload_field(f, o),

+                                  le64toh(o->object.size) - journal_file_data_payload_offset(f),

                                   &h2);

                  if (r < 0)

                          return r;
@@ -237,19 +237,30 @@ 

          }

  

          case OBJECT_ENTRY:

-                 if ((le64toh(o->object.size) - offsetof(Object, entry.items)) % sizeof(EntryItem) != 0) {

-                         error(offset,

-                               "Bad entry size (<= %zu): %"PRIu64,

-                               offsetof(Object, entry.items),

-                               le64toh(o->object.size));

-                         return -EBADMSG;

-                 }

+                 if (JOURNAL_HEADER_COMPACT(f->header)) {

+                         if (le64toh(o->object.size) < offsetof(Object, entry.payload)) {

+                                 error(offset,

+                                       "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,

+                                       offsetof(Object, entry.payload),

+                                       le64toh(o->object.size),

+                                       offset);

+                                 return -EBADMSG;

+                         }

+                 } else {

+                         if ((le64toh(o->object.size) - offsetof(Object, entry.items)) % sizeof(EntryItem) != 0) {

+                                 error(offset,

+                                       "Bad entry size (<= %zu): %" PRIu64,

+                                       offsetof(Object, entry.items),

+                                       le64toh(o->object.size));

+                                 return -EBADMSG;

+                         }

  

-                 if ((le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem) <= 0) {

-                         error(offset,

-                               "Invalid number items in entry: %"PRIu64,

-                               (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem));

-                         return -EBADMSG;

+                         if ((le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem) <= 0) {

+                                 error(offset,

+                                       "Invalid number items in entry: %" PRIu64,

+                                       (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem));

+                                 return -EBADMSG;

+                         }

                  }

  

                  if (le64toh(o->entry.seqnum) <= 0) {
@@ -273,13 +284,20 @@ 

                          return -EBADMSG;

                  }

  

-                 for (uint64_t i = 0; i < journal_file_entry_n_items(o); i++) {

-                         if (le64toh(o->entry.items[i].object_offset) == 0 ||

-                             !VALID64(le64toh(o->entry.items[i].object_offset))) {

-                                 error(offset,

-                                       "Invalid entry item (%"PRIu64"/%"PRIu64" offset: "OFSfmt,

-                                       i, journal_file_entry_n_items(o),

-                                       le64toh(o->entry.items[i].object_offset));

+                 for (uint64_t i = 0;;) {

+                         uint64_t p;

+                         int r;

+ 

+                         r = journal_file_entry_item_next(f, o, offset, &i, NULL, 0, 0, &p, NULL, NULL);

+                         if (r < 0) {

+                                 error_errno(offset, r, "Invalid entry item (%"PRIu64"): %m", i);

+                                 return r;

+                         }

+                         if (r == 0)

+                                 break;

+ 

+                         if (!VALID64(p)) {

+                                 error(offset, "Invalid entry item (%"PRIu64" offset: "OFSfmt, i, p);

                                  return -EBADMSG;

                          }

                  }
@@ -288,6 +306,7 @@ 

  

          case OBJECT_DATA_HASH_TABLE:

          case OBJECT_FIELD_HASH_TABLE:

+         case OBJECT_TRIE_HASH_TABLE:

                  if ((le64toh(o->object.size) - offsetof(Object, hash_table.items)) % sizeof(HashItem) != 0 ||

                      (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem) <= 0) {

                          error(offset,
@@ -332,8 +351,8 @@ 

                  break;

  

          case OBJECT_ENTRY_ARRAY:

-                 if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||

-                     (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0) {

+                 if ((le64toh(o->object.size) - journal_file_entry_array_items_offset(f)) % journal_file_entry_array_item_size(f) != 0 ||

+                     (le64toh(o->object.size) - journal_file_entry_array_items_offset(f)) / journal_file_entry_array_item_size(f) <= 0) {

                          error(offset,

                                "Invalid object entry array size: %"PRIu64,

                                le64toh(o->object.size));
@@ -347,15 +366,15 @@ 

                          return -EBADMSG;

                  }

  

-                 for (uint64_t i = 0; i < journal_file_entry_array_n_items(o); i++)

-                         if (le64toh(o->entry_array.items[i]) != 0 &&

-                             !VALID64(le64toh(o->entry_array.items[i]))) {

+                 for (uint64_t i = 0; i < journal_file_entry_array_n_items(f, o); i++) {

+                         uint64_t q = journal_file_entry_array_item(f, o, i);

+                         if (q != 0 && !VALID64(q)) {

                                  error(offset,

                                        "Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt,

-                                       i, journal_file_entry_array_n_items(o),

-                                       le64toh(o->entry_array.items[i]));

+                                       i, journal_file_entry_array_n_items(f, o), q);

                                  return -EBADMSG;

                          }

+                 }

  

                  break;

  
@@ -375,6 +394,25 @@ 

                  }

  

                  break;

+ 

+         case OBJECT_TRIE_NODE:

+                 if (le64toh(o->object.size) != sizeof(TrieNodeObject)) {

+                         error(offset, "Invalid object trie node size: %"PRIu64, le64toh(o->object.size));

+                         return -EBADMSG;

+                 }

+ 

+                 if (!VALID64(le64toh(o->trie_node.next_hash_offset)) ||

+                     !VALID64(le32toh(o->trie_node.object_offset)) ||

+                     !VALID64(le32toh(o->trie_node.parent_offset))) {

+                         error(offset,

+                               "Invalid offset (next_hash_offset="OFSfmt", object_offset="OFSfmt32", parent_offset="OFSfmt32,

+                               le64toh(o->trie_node.next_hash_offset),

+                               le32toh(o->trie_node.object_offset),

+                               le32toh(o->trie_node.parent_offset));

+                         return -EBADMSG;

+                 }

+ 

+                 break;

          }

  

          return 0;
@@ -487,10 +525,10 @@ 

                          return -EBADMSG;

                  }

  

-                 m = journal_file_entry_array_n_items(o);

+                 m = journal_file_entry_array_n_items(f, o);

                  for (j = 0; i < n && j < m; i++, j++) {

  

-                         q = le64toh(o->entry_array.items[j]);

+                         q = journal_file_entry_array_item(f, o, j);

                          if (q <= last) {

                                  error(p, "Data object's entry array not sorted (%"PRIu64" <= %"PRIu64")", q, last);

                                  return -EBADMSG;
@@ -636,19 +674,23 @@ 

                  MMapFileDescriptor *cache_data_fd, uint64_t n_data,

                  bool last) {

  

-         uint64_t i, n;

          int r;

  

          assert(f);

          assert(o);

          assert(cache_data_fd);

  

-         n = journal_file_entry_n_items(o);

-         for (i = 0; i < n; i++) {

+         for (uint64_t i = 0;;) {

                  uint64_t q;

                  Object *u;

  

-                 q = le64toh(o->entry.items[i].object_offset);

+                 r = journal_file_entry_item_next(f, o, p, &i, NULL, 0, 0, &q, NULL, NULL);

+                 if (r < 0) {

+                         error_errno(p, r, "Invalid entry item of entry");

+                         return r;

+                 }

+                 if (r == 0 || q == 0)

+                         break;

  

                  if (!contains_uint64(cache_data_fd, n_data, q)) {

                          error(p, "Invalid data object of entry");
@@ -729,11 +771,11 @@ 

                          return -EBADMSG;

                  }

  

-                 m = journal_file_entry_array_n_items(o);

+                 m = journal_file_entry_array_n_items(f, o);

                  for (j = 0; i < n && j < m; i++, j++) {

                          uint64_t p;

  

-                         p = le64toh(o->entry_array.items[j]);

+                         p = journal_file_entry_array_item(f, o, j);

                          if (p <= last) {

                                  error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n);

                                  return -EBADMSG;
@@ -814,7 +856,7 @@ 

          uint64_t entry_seqnum = 0, entry_monotonic = 0, entry_realtime = 0;

          sd_id128_t entry_boot_id;

          bool entry_seqnum_set = false, entry_monotonic_set = false, entry_realtime_set = false, found_main_entry_array = false;

-         uint64_t n_weird = 0, n_objects = 0, n_entries = 0, n_data = 0, n_fields = 0, n_data_hash_tables = 0, n_field_hash_tables = 0, n_entry_arrays = 0, n_tags = 0;

+         uint64_t n_weird = 0, n_objects = 0, n_entries = 0, n_data = 0, n_fields = 0, n_data_hash_tables = 0, n_field_hash_tables = 0, n_trie_hash_tables = 0, n_entry_arrays = 0, n_tags = 0;

          usec_t last_usec = 0;

          _cleanup_close_ int data_fd = -1, entry_fd = -1, entry_array_fd = -1;

          _cleanup_fclose_ FILE *data_fp = NULL, *entry_fp = NULL, *entry_array_fp = NULL;
@@ -992,7 +1034,9 @@ 

                          n_fields++;

                          break;

  

-                 case OBJECT_ENTRY:

+                 case OBJECT_ENTRY: {

+                         sd_id128_t boot_id;

+ 

                          if (JOURNAL_HEADER_SEALED(f->header) && n_tags <= 0) {

                                  error(p, "First entry before first tag");

                                  r = -EBADMSG;
@@ -1035,8 +1079,12 @@ 

                          entry_seqnum = le64toh(o->entry.seqnum);

                          entry_seqnum_set = true;

  

+                         r = journal_file_entry_boot_id(f, o, &boot_id);

+                         if (r < 0)

+                                 return r;

+ 

                          if (entry_monotonic_set &&

-                             sd_id128_equal(entry_boot_id, o->entry.boot_id) &&

+                             sd_id128_equal(entry_boot_id, boot_id) &&

                              entry_monotonic > le64toh(o->entry.monotonic)) {

                                  error(p,

                                        "Entry timestamp out of synchronization (%"PRIu64" > %"PRIu64")",
@@ -1047,7 +1095,7 @@ 

                          }

  

                          entry_monotonic = le64toh(o->entry.monotonic);

-                         entry_boot_id = o->entry.boot_id;

+                         entry_boot_id = boot_id;

                          entry_monotonic_set = true;

  

                          if (!entry_realtime_set &&
@@ -1065,6 +1113,7 @@ 

  

                          n_entries++;

                          break;

+                 }

  

                  case OBJECT_DATA_HASH_TABLE:

                          r = verify_hash_table(o, p, &n_data_hash_tables,
@@ -1083,6 +1132,15 @@ 

  

                          break;

  

+                 case OBJECT_TRIE_HASH_TABLE:

+                         r = verify_hash_table(o, p, &n_trie_hash_tables,

+                                               le64toh(f->header->trie_hash_table_offset),

+                                               le64toh(f->header->trie_hash_table_size));

+                         if (r < 0)

+                                 goto fail;

+ 

+                         break;

+ 

                  case OBJECT_ENTRY_ARRAY:

                          r = write_uint64(entry_array_fp, p);

                          if (r < 0)

@@ -5,7 +5,7 @@ 

  #include <sys/stat.h>

  

  /* One context per object type, plus one of the header, plus one "additional" one */

- #define MMAP_CACHE_MAX_CONTEXTS 9

+ #define MMAP_CACHE_MAX_CONTEXTS 12

  

  typedef struct MMapCache MMapCache;

  typedef struct MMapFileDescriptor MMapFileDescriptor;

@@ -111,32 +111,45 @@ 

                  journal_file_reset_location(f);

  }

  

- static void init_location(Location *l, LocationType type, JournalFile *f, Object *o) {

+ static int init_location(Location *l, LocationType type, JournalFile *f, Object *o) { /* Fills *l from entry object o of file f; returns 0, or the negative error from journal_file_entry_boot_id(). */

+         sd_id128_t boot_id;

+         int r;

+ 

          assert(l);

          assert(IN_SET(type, LOCATION_DISCRETE, LOCATION_SEEK));

          assert(f);

  

+         r = journal_file_entry_boot_id(f, o, &boot_id); /* the boot ID is obtained through the helper instead of reading o->entry.boot_id directly */

+         if (r < 0)

+                 return r; /* *l is left untouched on failure */

+ 

          *l = (Location) {

                  .type = type,

                  .seqnum = le64toh(o->entry.seqnum),

                  .seqnum_id = f->header->seqnum_id,

                  .realtime = le64toh(o->entry.realtime),

                  .monotonic = le64toh(o->entry.monotonic),

-                 .boot_id = o->entry.boot_id,

-                 .xor_hash = le64toh(o->entry.xor_hash),

+                 .boot_id = boot_id,

+                 .xor_hash = journal_file_entry_xor_hash(f, o), /* likewise obtained through a helper rather than from o->entry.xor_hash */

                  .seqnum_set = true,

                  .realtime_set = true,

                  .monotonic_set = true,

                  .xor_hash_set = true,

          };

+ 

+         return 0;

  }

  

- static void set_location(sd_journal *j, JournalFile *f, Object *o) {

+ static int set_location(sd_journal *j, JournalFile *f, Object *o) {

+         int r;

+ 

          assert(j);

          assert(f);

          assert(o);

  

-         init_location(&j->current_location, LOCATION_DISCRETE, f, o);

+         r = init_location(&j->current_location, LOCATION_DISCRETE, f, o);

+         if (r < 0)

+                 return r;

  

          j->current_file = f;

          j->current_field = 0;
@@ -144,6 +157,8 @@ 

          /* Let f know its candidate entry was picked. */

          assert(f->location_type == LOCATION_SEEK);

          f->location_type = LOCATION_DISCRETE;

+ 

+         return 0;

  }

  

  static int match_is_valid(const void *data, size_t size) {
@@ -781,7 +796,9 @@ 

                          if (r <= 0)

                                  return r;

  

-                         journal_file_save_location(f, c, cp);

+                         r = journal_file_save_location(f, c, cp);

+                         if (r < 0)

+                                 return r;

                  }

          } else {

                  f->last_direction = direction;
@@ -790,7 +807,9 @@ 

                  if (r <= 0)

                          return r;

  

-                 journal_file_save_location(f, c, cp);

+                 r = journal_file_save_location(f, c, cp);

+                 if (r < 0)

+                         return r;

          }

  

          /* OK, we found the spot, now let's advance until an entry
@@ -871,7 +890,9 @@ 

          if (r < 0)

                  return r;

  

-         set_location(j, new_file, o);

+         r = set_location(j, new_file, o);

+         if (r < 0)

+                 return r;

  

          return 1;

  }
@@ -927,6 +948,7 @@ 

  }

  

  _public_ int sd_journal_get_cursor(sd_journal *j, char **cursor) {

+         sd_id128_t boot_id;

          Object *o;

          int r;

  
@@ -941,12 +963,16 @@ 

          if (r < 0)

                  return r;

  

+         r = journal_file_entry_boot_id(j->current_file, o, &boot_id);

+         if (r < 0)

+                 return r;

+ 

          if (asprintf(cursor,

                       "s=%s;i=%"PRIx64";b=%s;m=%"PRIx64";t=%"PRIx64";x=%"PRIx64,

                       SD_ID128_TO_STRING(j->current_file->header->seqnum_id), le64toh(o->entry.seqnum),

-                      SD_ID128_TO_STRING(o->entry.boot_id), le64toh(o->entry.monotonic),

+                      SD_ID128_TO_STRING(boot_id), le64toh(o->entry.monotonic),

                       le64toh(o->entry.realtime),

-                      le64toh(o->entry.xor_hash)) < 0)

+                      journal_file_entry_xor_hash(j->current_file, o)) < 0)

                  return -ENOMEM;

  

          return 0;
@@ -1054,8 +1080,9 @@ 

  }

  

  _public_ int sd_journal_test_cursor(sd_journal *j, const char *cursor) {

-         int r;

+         sd_id128_t boot_id;

          Object *o;

+         int r;

  

          assert_return(j, -EINVAL);

          assert_return(!journal_pid_changed(j), -ECHILD);
@@ -1068,6 +1095,10 @@ 

          if (r < 0)

                  return r;

  

+         r = journal_file_entry_boot_id(j->current_file, o, &boot_id);

+         if (r < 0)

+                 return r;

+ 

          for (;;) {

                  _cleanup_free_ char *item = NULL;

                  unsigned long long ll;
@@ -1105,7 +1136,7 @@ 

                          k = sd_id128_from_string(item+2, &id);

                          if (k < 0)

                                  return k;

-                         if (!sd_id128_equal(id, o->entry.boot_id))

+                         if (!sd_id128_equal(id, boot_id))

                                  return 0;

                          break;

  
@@ -1126,7 +1157,7 @@ 

                  case 'x':

                          if (sscanf(item+2, "%llx", &ll) != 1)

                                  return -EINVAL;

-                         if (ll != le64toh(o->entry.xor_hash))

+                         if (ll != journal_file_entry_xor_hash(j->current_file, o))

                                  return 0;

                          break;

                  }
@@ -2225,16 +2256,22 @@ 

          if (r < 0)

                  return r;

  

-         if (ret_boot_id)

-                 *ret_boot_id = o->entry.boot_id;

-         else {

-                 sd_id128_t id;

+         if (ret_boot_id) {

+                 r = journal_file_entry_boot_id(f, o, ret_boot_id);

+                 if (r < 0)

+                         return r;

+         } else {

+                 sd_id128_t id, boot_id;

  

                  r = sd_id128_get_boot(&id);

                  if (r < 0)

                          return r;

  

-                 if (!sd_id128_equal(id, o->entry.boot_id))

+                 r = journal_file_entry_boot_id(f, o, &boot_id);

+                 if (r < 0)

+                         return r;

+ 

+                 if (!sd_id128_equal(id, boot_id))

                          return -ESTALE;

          }

  
@@ -2274,10 +2311,10 @@ 

  

  _public_ int sd_journal_get_data(sd_journal *j, const char *field, const void **data, size_t *size) {

          JournalFile *f;

-         uint64_t i, n;

-         size_t field_length;

+         size_t l;

+         uint64_t i = 0;

+         void *d;

          int r;

-         Object *o;

  

          assert_return(j, -EINVAL);

          assert_return(!journal_pid_changed(j), -ECHILD);
@@ -2293,136 +2330,23 @@ 

          if (f->current_offset <= 0)

                  return -EADDRNOTAVAIL;

  

-         r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o);

+         r = journal_file_entry_item_next(

+                 f, NULL, f->current_offset, &i, field, strlen(field), j->data_threshold, NULL, &d, &l);

          if (r < 0)

                  return r;

+         if (r == 0)

+                 return -ENOENT;

  

-         field_length = strlen(field);

- 

-         n = journal_file_entry_n_items(o);

-         for (i = 0; i < n; i++) {

-                 Object *d;

-                 uint64_t p, l;

-                 size_t t;

-                 int compression;

- 

-                 p = le64toh(o->entry.items[i].object_offset);

-                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &d);

-                 if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) {

-                         log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", i);

-                         continue;

-                 }

-                 if (r < 0)

-                         return r;

- 

-                 l = le64toh(d->object.size) - offsetof(Object, data.payload);

- 

-                 compression = d->object.flags & OBJECT_COMPRESSION_MASK;

-                 if (compression) {

- #if HAVE_COMPRESSION

-                         r = decompress_startswith(compression,

-                                                   d->data.payload, l,

-                                                   &f->compress_buffer,

-                                                   field, field_length, '=');

-                         if (r < 0)

-                                 log_debug_errno(r, "Cannot decompress %s object of length %"PRIu64" at offset "OFSfmt": %m",

-                                                 object_compressed_to_string(compression), l, p);

-                         else if (r > 0) {

- 

-                                 size_t rsize;

- 

-                                 r = decompress_blob(compression,

-                                                     d->data.payload, l,

-                                                     &f->compress_buffer, &rsize,

-                                                     j->data_threshold);

-                                 if (r < 0)

-                                         return r;

- 

-                                 *data = f->compress_buffer;

-                                 *size = (size_t) rsize;

- 

-                                 return 0;

-                         }

- #else

-                         return -EPROTONOSUPPORT;

- #endif

-                 } else if (l >= field_length+1 &&

-                            memcmp(d->data.payload, field, field_length) == 0 &&

-                            d->data.payload[field_length] == '=') {

- 

-                         t = (size_t) l;

- 

-                         if ((uint64_t) t != l)

-                                 return -E2BIG;

- 

-                         *data = d->data.payload;

-                         *size = t;

- 

-                         return 0;

-                 }

-         }

- 

-         return -ENOENT;

- }

- 

- static int return_data(

-                 sd_journal *j,

-                 JournalFile *f,

-                 Object *o,

-                 const void **ret_data,

-                 size_t *ret_size) {

- 

-         size_t t;

-         uint64_t l;

-         int compression;

- 

-         assert(j);

-         assert(f);

- 

-         l = le64toh(READ_NOW(o->object.size));

-         if (l < offsetof(Object, data.payload))

-                 return -EBADMSG;

-         l -= offsetof(Object, data.payload);

- 

-         /* We can't read objects larger than 4G on a 32bit machine */

-         t = (size_t) l;

-         if ((uint64_t) t != l)

-                 return -E2BIG;

- 

-         compression = o->object.flags & OBJECT_COMPRESSION_MASK;

-         if (compression) {

- #if HAVE_COMPRESSION

-                 size_t rsize;

-                 int r;

- 

-                 r = decompress_blob(

-                                 compression,

-                                 o->data.payload, l,

-                                 &f->compress_buffer, &rsize,

-                                 j->data_threshold);

-                 if (r < 0)

-                         return r;

- 

-                 if (ret_data)

-                         *ret_data = f->compress_buffer;

-                 if (ret_size)

-                         *ret_size = (size_t) rsize;

- #else

-                 return -EPROTONOSUPPORT;

- #endif

-         } else {

-                 if (ret_data)

-                         *ret_data = o->data.payload;

-                 if (ret_size)

-                         *ret_size = t;

-         }

+         *data = d;

+         *size = l;

  

          return 0;

  }

  

  _public_ int sd_journal_enumerate_data(sd_journal *j, const void **data, size_t *size) {

          JournalFile *f;

-         Object *o;

+         void *d;

+         size_t l;

          int r;

  

          assert_return(j, -EINVAL);
@@ -2437,36 +2361,15 @@ 

          if (f->current_offset <= 0)

                  return -EADDRNOTAVAIL;

  

-         r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o);

-         if (r < 0)

+         r = journal_file_entry_item_next(

+                 f, NULL, f->current_offset, &j->current_field, NULL, 0, j->data_threshold, NULL, &d, &l);

+         if (r <= 0)

                  return r;

  

-         for (uint64_t n = journal_file_entry_n_items(o); j->current_field < n; j->current_field++) {

-                 uint64_t p;

- 

-                 p = le64toh(o->entry.items[j->current_field].object_offset);

-                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);

-                 if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) {

-                         log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", j->current_field);

-                         continue;

-                 }

-                 if (r < 0)

-                         return r;

- 

-                 r = return_data(j, f, o, data, size);

-                 if (r == -EBADMSG) {

-                         log_debug("Entry item %"PRIu64" data payload is bad, skipping over it.", j->current_field);

-                         continue;

-                 }

-                 if (r < 0)

-                         return r;

- 

-                 j->current_field++;

+         *data = d;

+         *size = l;

  

-                 return 1;

-         }

- 

-         return 0;

+         return 1;

  }

  

  _public_ int sd_journal_enumerate_available_data(sd_journal *j, const void **data, size_t *size) {
@@ -2478,7 +2381,21 @@ 

                          return r;

                  if (!JOURNAL_ERRNO_IS_UNAVAILABLE_FIELD(r))

                          return r;

-                 j->current_field++; /* Try with the next field */

+ 

+                 /* Try with the next field */

+                 r = journal_file_entry_item_next(

+                         j->current_file,

+                         NULL,

+                         j->current_file->current_offset,

+                         &j->current_field,

+                         NULL,

+                         0,

+                         0,

+                         NULL,

+                         NULL,

+                         NULL);

+                 if (r <= 0)

+                         return r;

          }

  }

  
@@ -2932,7 +2849,7 @@ 

          for (;;) {

                  JournalFile *of;

                  Object *o;

-                 const void *odata;

+                 void *odata;

                  size_t ol;

                  bool found;

                  int r;
@@ -2976,7 +2893,8 @@ 

                                                 j->unique_offset,

                                                 o->object.type, OBJECT_DATA);

  

-                 r = return_data(j, j->unique_file, o, &odata, &ol);

+                 r = journal_file_data_payload(

+                         j->unique_file, o, j->unique_offset, NULL, 0, j->data_threshold, &odata, &ol);

                  if (r < 0)

                          return r;

  
@@ -3023,9 +2941,8 @@ 

                  if (found)

                          continue;

  

-                 r = return_data(j, j->unique_file, o, ret_data, ret_size);

-                 if (r < 0)

-                         return r;

+                 *ret_data = odata;

+                 *ret_size = ol;

  

                  return 1;

          }

file modified
+49
@@ -137,3 +137,52 @@ 

  

          return r;

  }

+ 

+ bool journal_shall_try_append_again(JournalFile *f, int r) {

+         switch(r) {

+ 

+         case -E2BIG:           /* Hit configured limit          */

+         case -EFBIG:           /* Hit fs limit                  */

+         case -EDQUOT:          /* Quota limit hit               */

+         case -ENOSPC:          /* Disk full                     */

+                 log_debug("%s: Allocation limit reached, rotating.", f->path);

+                 return true;

+ 

+         case -EIO:             /* I/O error of some kind (mmap) */

+                 log_warning("%s: IO error, rotating.", f->path);

+                 return true;

+ 

+         case -EHOSTDOWN:       /* Other machine                 */

+                 log_info("%s: Journal file from other machine, rotating.", f->path);

+                 return true;

+ 

+         case -EBUSY:           /* Unclean shutdown              */

+                 log_info("%s: Unclean shutdown, rotating.", f->path);

+                 return true;

+ 

+         case -EPROTONOSUPPORT: /* Unsupported feature           */

+                 log_info("%s: Unsupported feature, rotating.", f->path);

+                 return true;

+ 

+         case -EBADMSG:         /* Corrupted                     */

+         case -ENODATA:         /* Truncated                     */

+         case -ESHUTDOWN:       /* Already archived              */

+                 log_warning("%s: Journal file corrupted, rotating.", f->path);

+                 return true;

+ 

+         case -EIDRM:           /* Journal file has been deleted */

+                 log_warning("%s: Journal file has been deleted, rotating.", f->path);

+                 return true;

+ 

+         case -ETXTBSY:         /* Journal file is from the future */

+                 log_warning("%s: Journal file is from the future, rotating.", f->path);

+                 return true;

+ 

+         case -EAFNOSUPPORT:

+                 log_warning("%s: underlying file system does not support memory mapping or another required file system feature.", f->path);

+                 return false;

+ 

+         default:

+                 return false;

+         }

+ }

@@ -6,5 +6,9 @@ 

  

  #include "sd-journal.h"

  

+ #include "journal-file.h"

+ 

  int journal_access_blocked(sd_journal *j);

  int journal_access_check_and_warn(sd_journal *j, bool quiet, bool want_other_users);

+ 

+ bool journal_shall_try_append_again(JournalFile *f, int r);

Upstream PR: https://github.com/systemd/systemd/pull/21183

One remaining question is whether compact mode should be enabled by default. Do we want to disable it by default and enable it via an environment variable? That might be the safer option to go for.

Compatibility features should be disabled by default to encourage people to migrate.

EDIT: Ignore me, I misread the word. I think it makes sense to follow upstream systemd behavior.

Upstream PR: https://github.com/systemd/systemd/pull/21183

One remaining question is whether compact mode should be enabled by default. Do we want to disable it by default and enable it via an environment variable? That might be the safer option to go for.

I think it's usually the other way around upstream: enable by default and disable with an env var.

rebased onto 0331352

3 years ago

Pull-Request has been merged by daandemeyer

3 years ago