From 3d7e2ce10a6a3f05c786bc56d73413df70d7c55d Mon Sep 17 00:00:00 2001
From: Will Woods
Date: Jun 02 2020 04:47:30 +0000
Subject: Refactoring! Yeah!

Now we have a 'countme' module so we can share code between
parse-access-log.py and countme-totals.py! Wow, cool!

Also - thanks to that - countme-totals.py can now output csv, json,
or sqlite. Fun!

---

diff --git a/countme-totals.py b/countme-totals.py
index cf78cc0..e1bb912 100755
--- a/countme-totals.py
+++ b/countme-totals.py
@@ -2,12 +2,9 @@
 
 import sys
 import argparse
-import datetime
 from collections import Counter
 from typing import NamedTuple
-
-COUNTME_OFFSET = 345600 # 00:00:00 Mon Jan 5 00:00:00 1970
-COUNTME_WINDOW = 7*24*60*60 # Exactly 7 days
+from countme import CountmeItem, weeknum, guessreader, autoreader, make_writer
 
 # NOTE: log timestamps do not move monotonically forward, but they don't
 # seem to ever jump backwards more than 241 seconds. I assume this is
@@ -22,49 +19,15 @@ COUNTME_WINDOW = 7*24*60*60 # Exactly 7 days
 # in 24-hour chunks, any window that extends into the next day means we have to
 # wait 24 hours until we can be sure we have all the data for the previous
 # week, so the effect would be the same if this was 3600 or 43200 or whatever.
+# TODO: this should probably move into the module somewhere..
 LOG_JITTER_WINDOW = 600
 
-def weektuple(ts):
-    '''Return (week_num, week_secs) for a given timestamp'''
-    return divmod(int(ts)-COUNTME_OFFSET, COUNTME_WINDOW)
-
-def week_start_ts(ts):
-    '''Return the timestamp of the start of the week containing ts'''
-    weeksecs = (ts-COUNTME_OFFSET) % COUNTME_WINDOW
-    return ts - weeksecs
-
-def week_start(ts):
-    '''Return an ISO-formatted date string of the Monday that starts the week
-    that contains the given timestamp.'''
-    ts = int(ts)
-    weeksecs = (ts-COUNTME_OFFSET) % COUNTME_WINDOW
-    weekstart = datetime.datetime.utcfromtimestamp(ts - weeksecs)
-    return weekstart.date().isoformat()
-
-# Here's the items we expect to be reading from our input file.
-# TODO: we should be importing this from a 'countme' module or something
-# rather than duplicating it between parse-access-log.py and here
-class CountmeItem(NamedTuple):
-    '''
-    A "countme" match item.
-    Includes the countme value and libdnf User-Agent fields.
-    '''
-    timestamp: int
-    host: str
-    os_name: str
-    os_version: str
-    os_variant: str
-    os_arch: str
-    countme: int
-    repo_tag: str
-    repo_arch: str
-
-# And here's the "bucket" we sort each item into.
+# Here's the "bucket" we sort each item into.
 class CountmeBucket(NamedTuple):
     '''
     This defines the fields that we use to group/aggregate CountmeItems.
     '''
-    week_start: str
+    weeknum: int
     os_name: str
     os_version: str
     os_variant: str
@@ -75,101 +38,7 @@ class CountmeBucket(NamedTuple):
 
     @classmethod
     def from_item(cls, item):
-        return cls._make((week_start(item.timestamp),) + item[2:])
-
-
-# ===========================================================================
-# ====== ItemReader classes =================================================
-# ===========================================================================
-
-class ReaderError(RuntimeError):
-    pass
-
-class ItemReader:
-    def __init__(self, fp, itemtuple, **kwargs):
-        self._fp = fp
-        self._itemtuple = itemtuple
-        self._itemfields = itemtuple._fields
-        self._itemfactory = itemtuple._make
-        self._filefields = None
-        self._get_reader(**kwargs)
-        if not self._filefields:
-            raise ReaderError("no field names found")
-        if self._filefields != self._itemfields:
-            raise ReaderError(f"field mismatch: expected {self._itemfields}, got {self._filefields}")
-    def _get_reader(self):
-        '''Set up the ItemReader.
-        Should set self._filefields to a tuple of the fields found in fp.'''
-        raise NotImplementedError
-    def _iter_rows(self):
-        '''Return an iterator/generator that produces a row for each item.'''
-        raise NotImplementedError
-    def __iter__(self):
-        for item in self._iter_rows():
-            yield self._itemfactory(item)
-
-class CSVReader(ItemReader):
-    def _get_reader(self, **kwargs):
-        import csv
-        self._reader = csv.reader(self._fp)
-        self._filefields = tuple(next(self._reader))
-        # If we have numbers in our fieldnames, probably there was no header
-        if any(name.isnumeric() for name in self._filefields):
-            header = ','.join(fields)
-            raise ReaderError(f"header bad/missing, got: {header}")
-    def _iter_rows(self):
-        return self._reader
-
-# TODO: AWKReader, JSONReader
-
-class SQLiteReader(ItemReader):
-    def _get_reader(self, tablename='countme_raw', **kwargs):
-        import sqlite3
-        self._con = sqlite3.connect(self._fp.name)
-        # TODO: self._con.set_progress_handler(handler, call_interval)
-        self._cur = self._con.cursor()
-        self._tablename = tablename
-        if False and sqlite3.sqlite_version_info >= (3,16,0):
-            fields_sql = f"SELECT name FROM pragma_table_info(?)"
-            self._filefields = tuple(r[0] for r in self._cur.execute(fields_sql, (tablename,)))
-        else:
-            fields_sql = f"PRAGMA table_info('{tablename}')"
-            self._filefields = tuple(r[1] for r in self._cur.execute(fields_sql))
-    def _iter_rows(self):
-        fields = ",".join(self._itemfields)
-        return self._cur.execute(f"SELECT {fields} FROM {self._tablename}")
-
-# BUCKET COUNTER YOOOOOOO
-# TODO: finish/clean this up
-# TODO: If we're doing sqlite->sqlite we can probably do the count in pure SQL,
-# which is probably much faster? Complicated tho.
-class BucketCounterBase:
-    itemtuple = NotImplemented
-    buckettuple = NotImplemented
-    def __init__(self, item_filter=None, **kwargs):
-        self._count = Counter()
-        self.item_filter = item_filter
-    @classmethod
-    def item_bucket(cls, item):
-        raise NotImplementedError
-    def bucket_count(self, reader):
-        if reader._itemtuple != self.itemtuple:
-            raise ValueError(f"Reader item {reader._itemtuple!r}"
-                             f" does not match expected item {self.itemtuple!r}")
-        # Iterate through (maybe filtered) items and count 'em up
-        itemiter = filter(self.item_filter, reader) if callable(self.item_filter) else iter(reader)
-        count = Counter(self.item_bucket(item) for item in itemiter)
-        # Add the new counts to the total
-        self._count += count
-        # Return the new counts
-        return count
-
-class CountmeBucketCounter:
-    itemtuple = CountmeItem
-    buckettuple = CountmeBucket
-    @classmethod
-    def item_bucket(cls, item):
-        return cls.buckettuple._make((week_start(item.timestamp),) + item[2:])
+        return cls._make((weeknum(item.timestamp),) + item[2:])
 
 # ===========================================================================
 # ====== CLI parser & main() ================================================
@@ -186,39 +55,24 @@ def parse_args(argv=None):
                    type=argparse.FileType('rt', encoding='utf-8'), nargs='+',
                    help="Data to parse (from parse-access-log.py)")
 
-    # TODO: atomic creation of output file
+    # TODO: atomic creation/update of output file?
     p.add_argument("-o", "--output",
                    type=argparse.FileType('at', encoding='utf-8'),
                    help="output file (default: stdout)",
                    default=sys.stdout)
 
-    # TODO: refuse to overwrite existing files, unless..
-    #p.add_argument("--force"
-    # Or perhaps..
-    #p.add_argument("--update")
+    # TODO: flag to write prelim data to a different file/table; otherwise,
+    #       don't include prelim data
 
     p.add_argument("-f", "--format",
-                   choices=("csv", "json", "awk"),
+                   choices=("csv", "json", "awk", "sqlite"),
                    help="output format (default: csv)",
                    default="csv")
 
-    # TODO: sqlite output, wheeee.
-    # SQLite counting could probably all be done in pure SQL, tbh, so maybe
-    # that's a totally different script?
-    #p.add_argument("--sqlite", metavar="DB",
-    #               help="sqlite database to write to")
-
-    # TODO: use this..
-    p.add_argument("--progress", action="store_true",
-                   help="print some progress info while counting")
-
     p.add_argument("--input-format", choices=("csv", "sqlite", "auto"),
                    help="input file format (default: guess from extension)",
                    default="auto")
 
-    # TODO: allow specifying cutoff times so we don't double-count?
-    # Also: cutoff time/date for "preliminary" data?
-
     args = p.parse_args(argv)
 
     # Pick the right reader factory
@@ -227,77 +81,61 @@ def parse_args(argv=None):
     elif args.input_format == "sqlite":
         args.reader = SQLiteReader
     elif args.input_format == "auto":
-        args.reader = autoreader
         # Check that we can figure out the right reader(s) before we start..
        for fp in args.infiles:
            if guessreader(fp) is None:
                raise argparse.ArgumentTypeError(
                    "Can't guess input format for {fp.name!r}. "
                    "Try '--input-format=FMT'.")
+        args.reader = autoreader
     else:
        raise argparse.ArgumentTypeError("unknown input format {args.input_format!r}")
 
-    return args
+    # TODO: if writing to existing file, check & bail out if field mismatch
 
-# Guess the right reader based on the filename.
-def guessreader(fp):
-    if fp.name.endswith(".csv"):
-        reader = CSVReader
-    elif fp.name.endswith(".db"):
-        reader = SQLiteReader
-    else:
-        # FIXME: better format detection!!
-        # TODO: if fp is seekable, peek and figure out filetype
-        reader = None
-    return reader
-
-def autoreader(fp, itemtuple, **kwargs):
-    '''Convenience function to guess & instantiate the right writer'''
-    reader = guessreader(fp)
-    return reader(fp, itemtuple, **kwargs)
+    return args
 
-# FIXME: probably want the ItemWriters from parse-access-logs.py here
 class CountWriter:
-    def __init__(self, fp):
-        import csv
+    def __init__(self, outformat, fp, bucketclass):
         self._fp = fp
-        self._writer = csv.writer(fp)
-    def write(self, bucket, count):
-        self._writer.writerow((count,)+bucket)
+        self._bucketclass = bucketclass
+        self._countclass = NamedTuple(bucketclass.__name__ + "Count",
+            [("count", int)] + list(bucketclass.__annotations__.items()))
+        # TODO: countme_prelim "table" for prelim output
+        self._writer = make_writer(outformat, self._fp, self._countclass, timefield='weeknum', tablename='countme_totals')
     @staticmethod
     def sortkey(bucketcount):
         bucket, count = bucketcount
         # Sort by date (old->new), then count (high->low), then other fields.
-        return (bucket.week_start, -count) + bucket
+        return (bucket.weeknum, -count) + bucket
     def writecounts(self, counts):
+        self._writer.write_header()
         for bucket, count in sorted(counts.items(), key=self.sortkey):
-            self.write(bucket, count)
+            countitem = self._countclass._make((count,)+bucket)
+            self._writer.write_item(countitem)
+        self._writer.write_footer()
 
 def main():
     args = parse_args()
 
+    # Just a plain ol' Counter
     count = Counter()
-    # Set up our counter and bucket-maker.
-    # TODO: CountmeBucketCounter is half-assed; either full-ass it or just
-    # go with a simple item_bucket function.
-    #counter = CountmeBucketCounter()
 
     # Here's the function that finds the bucket for a given item.
     item_bucket = CountmeBucket.from_item
 
-    # Set up writer.
-    # FIXME: proper writers, append/update mode, etc.
-    countwriter = CountWriter(args.output)
-
+    # Okay, start reading our inputs and doing counts
     for inf in args.infiles:
         for item in args.reader(inf, CountmeItem):
             bucket = item_bucket(item)
             count[bucket] += 1
-        # TODO: how do we tell preliminary counts from final ones?
-        # if bucket.week_start in prelim_weeks: ...
-        countwriter.writecounts(count)
+    # TODO: how do we split preliminary counts from final ones?
+
+    # Write the counts.
+    writer = CountWriter(args.format, args.output, CountmeBucket)
+    writer.writecounts(count)
 
 if __name__ == '__main__':

diff --git a/countme/__init__.py b/countme/__init__.py
new file mode 100644
index 0000000..2fe6860
--- /dev/null
+++ b/countme/__init__.py
@@ -0,0 +1,388 @@
+# countme - parsing Fedora httpd access_log files to structured data.
+#
+# Copyright (C) 2020, Red Hat Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+# Author: Will Woods
+#
+# The main point of this script, as it says above, is parsing access_log to
+# structured data. I'm trying to avoid packing Fedora-specific data-massaging
+# into this; tools further down the pipeline can be responsible for figuring
+# out how to group "updates-released-f32" and "fedora-modular-source-32".
+
+import os
+import re
+from datetime import date, time, datetime, timezone
+from urllib.parse import parse_qsl
+from typing import NamedTuple, Optional
+
+from .regex import COUNTME_LOG_RE, MIRRORS_LOG_RE
+
+# TODO: clean this up so it only exports the common/needed bits
+__all__ = (
+    'weeknum', 'parse_logtime', 'parse_querydict',
+
+    'ItemWriter', 'CSVWriter', 'JSONWriter', 'AWKWriter', 'SQLiteWriter',
+    'ItemReader', 'CSVReader', 'SQLiteReader',
+
+    'make_writer', 'guessreader', 'autoreader',
+
+    'LogItem', 'MirrorItem', 'CountmeItem',
+    'LogMatcher', 'MirrorMatcher', 'CountmeMatcher',
+)
+
+# ===========================================================================
+# ====== Output item definitions and helpers ================================
+# ===========================================================================
+
+DAY_LEN = 24*60*60
+WEEK_LEN = 7*DAY_LEN
+COUNTME_EPOCH = 345600 # =00:00:00 Mon Jan 5 00:00:00 1970 (UTC)
+MONTHIDX = {
+    'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
+    'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12
+}
+
+def weeknum(timestamp):
+    return (int(timestamp) - COUNTME_EPOCH) // WEEK_LEN
+
+def parse_logtime(logtime):
+    # Equivalent to - but faster than:
+    #   return datetime.strptime(logtime, "%d/%b/%Y:%H:%M:%S %z")
+    # It's like ~1.2usec vs 12usec, which might seem trivial but in my tests
+    # the regex parser can handle like ~200k lines/sec - or 5usec/line - so
+    # an extra ~10usec to parse the time field is not insignificant.
+    # logtime: '29/Mar/2020:16:04:28 +0000'
+    # ISO8601: '2020-03-29T16:04:28+00:00'
+    y = logtime[7:11]
+    m = MONTHIDX[logtime[3:6]]
+    d = logtime[0:2]
+    time = logtime[12:20]
+    offh = logtime[21:24]
+    offm = logtime[24:26]
+    return datetime.fromisoformat(f"{y}-{m:02}-{d}T{time}{offh}:{offm}")
+
+def parse_logdate(logtime):
+    y = int(logtime[7:11])
+    m = MONTHIDX[logtime[3:6]]
+    d = int(logtime[0:2])
+    return date(y,m,d)
+
+def parse_querydict(querystr):
+    '''Parse request query the way mirrormanager does (last value wins)'''
+    return dict(parse_qsl(querystr))
+
+class LogItem(NamedTuple):
+    '''
+    Generic access.log data holder.
+    '''
+    host: str
+    identity: str
+    time: str
+    method: str
+    path: str
+    query: Optional[str]
+    protocol: str
+    status: int
+    nbytes: Optional[int]
+    referrer: str
+    user_agent: str
+
+    def datetime(self):
+        return parse_logtime(self.time)
+
+    def timestamp(self):
+        return parse_logtime(self.time).timestamp()
+
+    def queryitems(self):
+        return parse_qsl(self.query)
+
+    def querydict(self):
+        return parse_querydict(self.query)
+
+# TODO: would be kinda nice if there was a clear subclass / translation
+# between item classes... or if compile_log_regex made the class for you?
+# Or something? It feels like these things should be more closely bound.
+
+
+class MirrorItem(NamedTuple):
+    '''
+    A basic mirrorlist/metalink metadata item.
+    Each item has a timestamp, IP, and the requested repo= and arch= values.
+    '''
+    timestamp: int
+    host: str
+    repo_tag: Optional[str]
+    repo_arch: Optional[str]
+
+class CountmeItem(NamedTuple):
+    '''
+    A "countme" match item.
+    Includes the countme value and libdnf User-Agent fields.
+    '''
+    timestamp: int
+    host: str
+    os_name: str
+    os_version: str
+    os_variant: str
+    os_arch: str
+    countme: int
+    repo_tag: str
+    repo_arch: str
+
+class LogMatcher:
+    '''Base class for a LogMatcher, which iterates through a log file'''
+    regex = NotImplemented
+    itemtuple = NotImplemented
+    def __init__(self, fileobj):
+        self.fileobj = fileobj
+    def iteritems(self):
+        # TODO: at this point we're single-threaded and CPU-bound;
+        # multithreading would speed things up here.
+        for line in self.fileobj:
+            match = self.regex.match(line)
+            if match:
+                yield self.make_item(match)
+    __iter__ = iteritems
+    @classmethod
+    def make_item(cls, match):
+        raise NotImplementedError
+
+class MirrorMatcher(LogMatcher):
+    '''Match all mirrorlist/metalink items, like mirrorlist.py does.'''
+    regex = MIRRORS_LOG_RE
+    itemtuple = MirrorItem
+    @classmethod
+    def make_item(cls, match):
+        timestamp = parse_logtime(match['time']).timestamp()
+        query = parse_querydict(match['query'])
+        return cls.itemtuple(timestamp = int(timestamp),
+                             host      = match['host'],
+                             repo_tag  = query.get('repo'),
+                             repo_arch = query.get('arch'))
+
+class CountmeMatcher(LogMatcher):
+    '''Match the libdnf-style "countme" requests.'''
+    regex = COUNTME_LOG_RE
+    itemtuple = CountmeItem
+    @classmethod
+    def make_item(cls, match):
+        timestamp = parse_logtime(match['time']).timestamp()
+        query = parse_querydict(match['query'])
+        return cls.itemtuple(timestamp  = int(timestamp),
+                             host       = match['host'],
+                             os_name    = match['os_name'],
+                             os_version = match['os_version'],
+                             os_variant = match['os_variant'],
+                             os_arch    = match['os_arch'],
+                             countme    = int(query.get('countme')),
+                             repo_tag   = query.get('repo'),
+                             repo_arch  = query.get('arch'))
+
+# ===========================================================================
+# ====== ItemWriters - output formatting classes ============================
+# ===========================================================================
+
+class ItemWriter:
+    def __init__(self, fp, itemtuple, timefield='timestamp', **kwargs):
+        self._fp = fp
+        self._itemtuple = itemtuple
+        self._fields = itemtuple._fields
+        assert timefield in self._fields, f"{itemtuple.__name__!r} has no time field {timefield!r}"
+        self._timefield = timefield
+        self._get_writer(**kwargs)
+    def _get_writer(self, **kwargs):
+        raise NotImplementedError
+    def write_item(self, item):
+        raise NotImplementedError
+    def write_header(self):
+        pass
+    def write_footer(self):
+        pass
+
+class JSONWriter(ItemWriter):
+    def _get_writer(self, **kwargs):
+        import json
+        self._dump = json.dump
+    def write_item(self, item):
+        self._dump(item._asdict(), self._fp)
+
+class CSVWriter(ItemWriter):
+    def _get_writer(self, **kwargs):
+        import csv
+        self._writer = csv.writer(self._fp)
+    def write_header(self):
+        self._writer.writerow(self._fields)
+    def write_item(self, item):
+        self._writer.writerow(item)
+
+class AWKWriter(ItemWriter):
+    def _get_writer(self, field_separator='\t', **kwargs):
+        self._fieldsep = field_separator
+    def _write_row(self, vals):
+        self._fp.write(self._fieldsep.join(str(v) for v in vals) + '\n')
+    def write_header(self):
+        self._write_row(self._fields)
+    def write_item(self, item):
+        self._write_row(item)
+
+class SQLiteWriter(ItemWriter):
+    '''Write each item as a new row in a SQLite database table.'''
+    # We have to get a little fancier with types here since SQL tables expect
+    # typed values. Good thing Python has types now, eh?
+    SQL_TYPE = {
+        int:             "INTEGER NOT NULL",
+        str:             "TEXT NOT NULL",
+        float:           "REAL NOT NULL",
+        bytes:           "BLOB NOT NULL",
+        Optional[int]:   "INTEGER",
+        Optional[str]:   "TEXT",
+        Optional[float]: "REAL",
+        Optional[bytes]: "BLOB",
+    }
+    def _sqltype(self, fieldname):
+        typehint = self._itemtuple.__annotations__[fieldname]
+        return self.SQL_TYPE.get(typehint, "TEXT")
+    def _get_writer(self, tablename='countme_raw'):
+        self._tablename = tablename
+        import sqlite3
+        self._con = sqlite3.connect(self._fp.name)
+        self._cur = self._con.cursor()
+        # Generate SQL commands so we can use them later.
+        # self._create_table creates the table, with column names and types
+        # matching the names and types of the fields in self._itemtuple.
+        self._create_table = (
+            "CREATE TABLE IF NOT EXISTS {table} ({coldefs})".format(
+                table=tablename,
+                coldefs=",".join(f"{f} {self._sqltype(f)}" for f in self._fields),
+            )
+        )
+        # self._insert_item is an "INSERT" command with '?' placeholders.
+        self._insert_item = (
+            "INSERT INTO {table} ({colnames}) VALUES ({colvals})".format(
+                table=tablename,
+                colnames=",".join(self._fields),
+                colvals=",".join("?" for f in self._fields),
+            )
+        )
+        # self._create_time_index creates an index on 'timestamp' or whatever
+        # the time-series field is.
+        self._create_time_index = (
+            "CREATE INDEX IF NOT EXISTS {timefield}_idx on {table} ({timefield})".format(
+                table=tablename,
+                timefield=self._timefield
+            )
+        )
+    def write_header(self):
+        self._cur.execute(self._create_table)
+    def write_item(self, item):
+        self._cur.execute(self._insert_item, item)
+    def write_footer(self):
+        self._cur.execute(self._create_time_index)
+        self._con.commit()
+
+def make_writer(name, *args, **kwargs):
+    '''Convenience function to grab/instantiate the right writer'''
+    if name == "csv":
+        writer = CSVWriter
+    elif name == "json":
+        writer = JSONWriter
+    elif name == "awk":
+        writer = AWKWriter
+    elif name == "sqlite":
+        writer = SQLiteWriter
+    else:
+        raise ValueError(f"Unknown writer '{name}'")
+    return writer(*args, **kwargs)
+
+# ===========================================================================
+# ====== ItemReaders - counterpart to ItemWriter ============================
+# ===========================================================================
+
+class ReaderError(RuntimeError):
+    pass
+
+class ItemReader:
+    def __init__(self, fp, itemtuple, **kwargs):
+        self._fp = fp
+        self._itemtuple = itemtuple
+        self._itemfields = itemtuple._fields
+        self._itemfactory = itemtuple._make
+        self._filefields = None
+        self._get_reader(**kwargs)
+        if not self._filefields:
+            raise ReaderError("no field names found")
+        if self._filefields != self._itemfields:
+            raise ReaderError(f"field mismatch: expected {self._itemfields}, got {self._filefields}")
+    def _get_reader(self):
+        '''Set up the ItemReader.
+        Should set self._filefields to a tuple of the fields found in fp.'''
+        raise NotImplementedError
+    def _iter_rows(self):
+        '''Return an iterator/generator that produces a row for each item.'''
+        raise NotImplementedError
+    def __iter__(self):
+        for item in self._iter_rows():
+            yield self._itemfactory(item)
+
+class CSVReader(ItemReader):
+    def _get_reader(self, **kwargs):
+        import csv
+        self._reader = csv.reader(self._fp)
+        self._filefields = tuple(next(self._reader))
+        # If we have numbers in our fieldnames, probably there was no header
+        if any(name.isnumeric() for name in self._filefields):
+            header = ','.join(self._filefields)
+            raise ReaderError(f"header bad/missing, got: {header}")
+    def _iter_rows(self):
+        return self._reader
+
+# TODO: AWKReader, JSONReader
+
+class SQLiteReader(ItemReader):
+    def _get_reader(self, tablename='countme_raw', **kwargs):
+        import sqlite3
+        self._con = sqlite3.connect(self._fp.name)
+        # TODO: self._con.set_progress_handler(handler, call_interval)
+        self._cur = self._con.cursor()
+        self._tablename = tablename
+        if False and sqlite3.sqlite_version_info >= (3,16,0):
+            fields_sql = f"SELECT name FROM pragma_table_info(?)"
+            self._filefields = tuple(r[0] for r in self._cur.execute(fields_sql, (tablename,)))
+        else:
+            fields_sql = f"PRAGMA table_info('{tablename}')"
+            self._filefields = tuple(r[1] for r in self._cur.execute(fields_sql))
+    def _iter_rows(self):
+        fields = ",".join(self._itemfields)
+        return self._cur.execute(f"SELECT {fields} FROM {self._tablename}")
+
+# Guess the right reader based on the filename.
+def guessreader(fp):
+    if fp.name.endswith(".csv"):
+        reader = CSVReader
+    elif fp.name.endswith(".db"):
+        reader = SQLiteReader
+    else:
+        # FIXME: better format detection!!
+        # TODO: if fp is seekable, peek and figure out filetype
+        reader = None
+    return reader
+
+# TODO: should have name/args more like make_writer...
+def autoreader(fp, itemtuple, **kwargs):
+    '''Convenience function to guess & instantiate the right reader'''
+    reader = guessreader(fp)
+    return reader(fp, itemtuple, **kwargs)

diff --git a/countme/progress.py b/countme/progress.py
new file mode 100644
index 0000000..67a871f
--- /dev/null
+++ b/countme/progress.py
@@ -0,0 +1,164 @@
+# countme.progress: progress meters for CLI output
+#
+# Copyright (C) 2020, Red Hat Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+# Author: Will Woods
+
+import os
+from .regex import compile_log_regex, LOG_DATE_RE
+
+__all__ = (
+    'ReadProgress', 'TQDMReadProgress', 'DIYReadProgress',
+)
+
+# ===========================================================================
+# ====== Progress meters & helpers ==========================================
+# ===========================================================================
+
+def log_date(line):
+    match = LOG_DATE_RE.match(line)
+    if match:
+        return match['date']
+    return "??/??/????"
+
+class ReadProgressBase:
+    def __init__(self, logs, display=True):
+        '''logs should be a sequence of line-iterable file-like objects.
+        if display is False, no progress output will be printed.'''
+        self.logs = logs
+        self.display = display
+
+    def __iter__(self):
+        '''Iterator for ReadProgress; yields a sequence of line-iterable
+        file-like objects (one for each log in logs).'''
+        for num, logf in enumerate(self.logs):
+            yield self._iter_log_lines(logf, num)
+
+    def _iter_log_lines(self, logf, lognum):
+        raise NotImplementedError
+
+
+# If we have the tqdm module available then hooray
+class TQDMReadProgress(ReadProgressBase):
+    def _iter_log_lines(self, logf, num):
+        # Make a progress meter for this file
+        prog = tqdm(unit="B", unit_scale=True, unit_divisor=1024,
+                    total=os.stat(logf.name).st_size,
+                    disable=True if not self.display else None,
+                    desc=f"log {num+1}/{len(self.logs)}")
+        # Get the first line manually so we can get logdate
+        line = next(logf)
+        prog.set_description(f"{prog.desc}, date={log_date(line)}")
+        # Update bar and yield the first line
+        prog.update(len(line))
+        yield line
+        # And now we do the rest of the file
+        for line in logf:
+            prog.update(len(line))
+            yield line
+        prog.close()
+
+class DIYReadProgress(ReadProgressBase):
+    def _iter_log_lines(self, logf, num):
+        # Make a progress meter for this file
+        prog = diyprog(total=os.stat(logf.name).st_size,
+                       disable=True if not self.display else None,
+                       desc=f"log {num+1}/{len(self.logs)}")
+        # Get the first line manually so we can get logdate
+        line = next(logf)
+        prog.set_description(f"{prog.desc}, date={log_date(line)}")
+        # Update bar and yield the first line
+        prog.update(len(line))
+        yield line
+        # And now we do the rest of the file
+        for line in logf:
+            prog.update(len(line))
+            yield line
+        prog.close()
+
+class diyprog:
+    def __init__(self, desc=None, total=None, file=None, disable=False,
+                 unit='b', unit_scale=True, barchar='_-=#'):
+        self.desc = desc
+        self.total = total
+        self.file = file
+        self.disable = disable
+        self.unit = unit
+        self.unit_scale = unit_scale
+        #self.unit_divisor = unit_divisor
+        self.count = 0
+        self.showat = 0
+        self.barchar = barchar
+
+    def set_description(self, desc=None, refresh=True):
+        self.desc = desc
+        if refresh:
+            self.display()
+
+    def update(self, n=1):
+        if self.disable: return
+        self.count += n
+        if self.count >= self.showat:
+            self.showat += self.total // 100
+            self.display()
+
+    @staticmethod
+    def hrsize(n):
+        for suffix in 'kmgtp':
+            n /= 1000
+            if n < 1000:
+                break
+        return f"{n:.1f}{suffix}"
+
+    @staticmethod
+    def hrtime(nsecs):
+        m, s = divmod(int(nsecs), 60)
+        if m > 60:
+            h, m = divmod(m, 60)
+            return f"{h:02d}h{m:02d}m{s:02d}s"
+        elif m:
+            return f"{m:02d}m{s:02d}s"
+        else:
+            return f"{s:02d}s"
+
+    def display(self):
+        unit = self.unit
+        desc = self.desc
+        if self.unit_scale:
+            count = self.hrsize(self.count) + unit
+            total = self.hrsize(self.total) + unit
+        else:
+            count = str(self.count) + unit
+            total = str(self.total) + unit
+        pct = (self.count * 100) // self.total
+        bar = (pct // 4) * self.barchar[-1]
+        if pct < 100:
+            bar += self.barchar[pct % 4]
+        print(f"{desc}: {pct:>3}% [{bar:<25}] {count:>7}/{total:<7}",
+              flush=True, file=self.file, end='\r')
+
+    def close(self):
+        if self.disable: return
+        print(flush=True, file=self.file)
+
+# Default ReadProgress: use tqdm if possible, else use the DIY one
+try:
+    # TODO: make this work with a local tqdm checkout/git submodule
+    from tqdm import tqdm
+    ReadProgress = TQDMReadProgress
+except ImportError:
+    ReadProgress = DIYReadProgress
+

diff --git a/countme/regex.py b/countme/regex.py
new file mode 100644
index 0000000..28c8214
--- /dev/null
+++ b/countme/regex.py
@@ -0,0 +1,172 @@
+# countme.regex - regexes for log matching and parsing
+#
+# Copyright (C) 2020, Red Hat Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+# Author: Will Woods
+
+import re
+
+__all__ = (
+    'compile_log_regex',
+    'LOG_RE', 'LIBDNF_USER_AGENT_RE',
+    'MIRRORS_LOG_RE', 'COUNTME_LOG_RE', 'LOG_DATE_RE',
+)
+
+# ===========================================================================
+# ====== Regexes! Get your regexes here! ====================================
+# ===========================================================================
+
+# Log format, according to ansible/roles/httpd/proxy/templates/httpd.conf.j2:
+#   LogFormat "%a %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""
+# That's the standard Combined Log Format, with numeric IPs (%a).
+#
+# Example log line:
+#   240.159.140.173 - - [29/Mar/2020:16:04:28 +0000] "GET /metalink?repo=fedora-modular-32&arch=x86_64&countme=1 HTTP/2.0" 200 18336 "-" "libdnf (Fedora 32; workstation; Linux.x86_64)"
+#
+# Here it is as a Python regex, with a format placeholder for the actual field
+# contents. Default field regexes are in LOG_PATTERN_FIELDS, below, and
+# compile_log_regex() lets you construct more interesting regexes that only
+# match the lines you care about.
+# The request target is split into 'path' and 'query'; 'path' is always
+# present but 'query' may be absent, depending on the value of 'query_match'.
+# 'query_match' should be '?' (optional), '' (required), or '{0}' (absent).
+LOG_PATTERN_FORMAT = (
+    r'^'
+    r'(?P<host>{host})\s'
+    r'(?P<identity>{identity})\s'
+    r'(?P<user>{user})\s'
+    r'\[(?P
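
Not part of the patch: a minimal usage sketch of how the new countme module's reader and writer helpers are meant to fit together, mirroring what countme-totals.py does with autoreader and make_writer. The file names "countme-raw.csv" and "countme-raw.db" are made-up examples, not anything from the commit.

    from countme import CountmeItem, autoreader, make_writer

    # Copy CountmeItem rows from a CSV (as produced by parse-access-log.py)
    # into a SQLite table, letting autoreader() guess the input format from
    # the ".csv" extension and SQLiteWriter handle table creation/indexing.
    with open("countme-raw.csv", "rt", encoding="utf-8") as infile, \
         open("countme-raw.db", "at", encoding="utf-8") as outfile:
        writer = make_writer("sqlite", outfile, CountmeItem, tablename="countme_raw")
        writer.write_header()
        for item in autoreader(infile, CountmeItem):
            writer.write_item(item)
        writer.write_footer()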