From 3d7e2ce10a6a3f05c786bc56d73413df70d7c55d Mon Sep 17 00:00:00 2001
From: Will Woods
Date: Jun 02 2020 04:47:30 +0000
Subject: Refactoring! Yeah!

Now we have a 'countme' module so we can share code between
parse-access-log.py and countme-totals.py! Wow, cool!

Also - thanks to that - countme-totals.py can now output csv, json,
or sqlite. Fun!

---

diff --git a/countme-totals.py b/countme-totals.py
index cf78cc0..e1bb912 100755
--- a/countme-totals.py
+++ b/countme-totals.py
@@ -2,12 +2,9 @@
 
 import sys
 import argparse
-import datetime
 from collections import Counter
 from typing import NamedTuple
-
-COUNTME_OFFSET = 345600 # 00:00:00 Mon Jan 5 00:00:00 1970
-COUNTME_WINDOW = 7*24*60*60 # Exactly 7 days
+from countme import CountmeItem, weeknum, guessreader, autoreader, make_writer
 
 # NOTE: log timestamps do not move monotonically forward, but they don't
 # seem to ever jump backwards more than 241 seconds. I assume this is
@@ -22,49 +19,15 @@ COUNTME_WINDOW = 7*24*60*60 # Exactly 7 days
 # in 24-hour chunks, any window that extends into the next day means we have to
 # wait 24 hours until we can be sure we have all the data for the previous
 # week, so the effect would be the same if this was 3600 or 43200 or whatever.
+# TODO: this should probably move into the module somewhere..
 LOG_JITTER_WINDOW = 600
 
-def weektuple(ts):
-    '''Return (week_num, week_secs) for a given timestamp'''
-    return divmod(int(ts)-COUNTME_OFFSET, COUNTME_WINDOW)
-
-def week_start_ts(ts):
-    '''Return the timestamp of the start of the week containing ts'''
-    weeksecs = (ts-COUNTME_OFFSET) % COUNTME_WINDOW
-    return ts - weeksecs
-
-def week_start(ts):
-    '''Return an ISO-formatted date string of the Monday that starts the week
-    that contains the given timestamp.'''
-    ts = int(ts)
-    weeksecs = (ts-COUNTME_OFFSET) % COUNTME_WINDOW
-    weekstart = datetime.datetime.utcfromtimestamp(ts - weeksecs)
-    return weekstart.date().isoformat()
-
-# Here's the items we expect to be reading from our input file.
-# TODO: we should be importing this from a 'countme' module or something
-# rather than duplicating it between parse-access-log.py and here
-class CountmeItem(NamedTuple):
-    '''
-    A "countme" match item.
-    Includes the countme value and libdnf User-Agent fields.
-    '''
-    timestamp: int
-    host: str
-    os_name: str
-    os_version: str
-    os_variant: str
-    os_arch: str
-    countme: int
-    repo_tag: str
-    repo_arch: str
-
-# And here's the "bucket" we sort each item into.
+# Here's the "bucket" we sort each item into.
 class CountmeBucket(NamedTuple):
     '''
     This defines the fields that we use to group/aggregate CountmeItems.
     '''
-    week_start: str
+    weeknum: int
     os_name: str
     os_version: str
     os_variant: str
@@ -75,101 +38,7 @@ class CountmeBucket(NamedTuple):
 
     @classmethod
     def from_item(cls, item):
-        return cls._make((week_start(item.timestamp),) + item[2:])
-
-
-# ===========================================================================
-# ====== ItemReader classes =================================================
-# ===========================================================================
-
-class ReaderError(RuntimeError):
-    pass
-
-class ItemReader:
-    def __init__(self, fp, itemtuple, **kwargs):
-        self._fp = fp
-        self._itemtuple = itemtuple
-        self._itemfields = itemtuple._fields
-        self._itemfactory = itemtuple._make
-        self._filefields = None
-        self._get_reader(**kwargs)
-        if not self._filefields:
-            raise ReaderError("no field names found")
-        if self._filefields != self._itemfields:
-            raise ReaderError(f"field mismatch: expected {self._itemfields}, got {self._filefields}")
-    def _get_reader(self):
-        '''Set up the ItemReader.
-        Should set self._filefields to a tuple of the fields found in fp.'''
-        raise NotImplementedError
-    def _iter_rows(self):
-        '''Return an iterator/generator that produces a row for each item.'''
-        raise NotImplementedError
-    def __iter__(self):
-        for item in self._iter_rows():
-            yield self._itemfactory(item)
-
-class CSVReader(ItemReader):
-    def _get_reader(self, **kwargs):
-        import csv
-        self._reader = csv.reader(self._fp)
-        self._filefields = tuple(next(self._reader))
-        # If we have numbers in our fieldnames, probably there was no header
-        if any(name.isnumeric() for name in self._filefields):
-            header = ','.join(fields)
-            raise ReaderError(f"header bad/missing, got: {header}")
-    def _iter_rows(self):
-        return self._reader
-
-# TODO: AWKReader, JSONReader
-
-class SQLiteReader(ItemReader):
-    def _get_reader(self, tablename='countme_raw', **kwargs):
-        import sqlite3
-        self._con = sqlite3.connect(self._fp.name)
-        # TODO: self._con.set_progress_handler(handler, call_interval)
-        self._cur = self._con.cursor()
-        self._tablename = tablename
-        if False and sqlite3.sqlite_version_info >= (3,16,0):
-            fields_sql = f"SELECT name FROM pragma_table_info(?)"
-            self._filefields = tuple(r[0] for r in self._cur.execute(fields_sql, (tablename,)))
-        else:
-            fields_sql = f"PRAGMA table_info('{tablename}')"
-            self._filefields = tuple(r[1] for r in self._cur.execute(fields_sql))
-    def _iter_rows(self):
-        fields = ",".join(self._itemfields)
-        return self._cur.execute(f"SELECT {fields} FROM {self._tablename}")
-
-# BUCKET COUNTER YOOOOOOO
-# TODO: finish/clean this up
-# TODO: If we're doing sqlite->sqlite we can probably do the count in pure SQL,
-# which is probably much faster? Complicated tho.
-class BucketCounterBase:
-    itemtuple = NotImplemented
-    buckettuple = NotImplemented
-    def __init__(self, item_filter=None, **kwargs):
-        self._count = Counter()
-        self.item_filter = item_filter
-    @classmethod
-    def item_bucket(cls, item):
-        raise NotImplementedError
-    def bucket_count(self, reader):
-        if reader._itemtuple != self.itemtuple:
-            raise ValueError(f"Reader item {reader._itemtuple!r}"
-                             f" does not match expected item {self.itemtuple!r}")
-        # Iterate through (maybe filtered) items and count 'em up
-        itemiter = filter(self.item_filter, reader) if callable(self.item_filter) else iter(reader)
-        count = Counter(self.item_bucket(item) for item in itemiter)
-        # Add the new counts to the total
-        self._count += count
-        # Return the new counts
-        return count
-
-class CountmeBucketCounter:
-    itemtuple = CountmeItem
-    buckettuple = CountmeBucket
-    @classmethod
-    def item_bucket(cls, item):
-        return cls.buckettuple._make((week_start(item.timestamp),) + item[2:])
+        return cls._make((weeknum(item.timestamp),) + item[2:])
 
 # ===========================================================================
 # ====== CLI parser & main() ================================================
@@ -186,39 +55,24 @@ def parse_args(argv=None):
                    type=argparse.FileType('rt', encoding='utf-8'), nargs='+',
                    help="Data to parse (from parse-access-log.py)")
 
-    # TODO: atomic creation of output file
+    # TODO: atomic creation/update of output file?
     p.add_argument("-o", "--output",
                    type=argparse.FileType('at', encoding='utf-8'),
                    help="output file (default: stdout)",
                    default=sys.stdout)
 
-    # TODO: refuse to overwrite existing files, unless..
-    #p.add_argument("--force"
-    # Or perhaps..
-    #p.add_argument("--update")
+    # TODO: flag to write prelim data to a different file/table; otherwise,
+    #       don't include prelim data
 
     p.add_argument("-f", "--format",
-                   choices=("csv", "json", "awk"),
+                   choices=("csv", "json", "awk", "sqlite"),
                    help="output format (default: csv)",
                    default="csv")
 
-    # TODO: sqlite output, wheeee.
-    # SQLite counting could probably all be done in pure SQL, tbh, so maybe
-    # that's a totally different script?
-    #p.add_argument("--sqlite", metavar="DB",
-    #               help="sqlite database to write to")
-
-    # TODO: use this..
-    p.add_argument("--progress", action="store_true",
-                   help="print some progress info while counting")
-
     p.add_argument("--input-format", choices=("csv", "sqlite", "auto"),
                    help="input file format (default: guess from extension)",
                    default="auto")
 
-    # TODO: allow specifying cutoff times so we don't double-count?
-    # Also: cutoff time/date for "preliminary" data?
-
     args = p.parse_args(argv)
 
     # Pick the right reader factory
@@ -227,77 +81,61 @@ def parse_args(argv=None):
     elif args.input_format == "sqlite":
         args.reader = SQLiteReader
     elif args.input_format == "auto":
-        args.reader = autoreader
         # Check that we can figure out the right reader(s) before we start..
        for fp in args.infiles:
            if guessreader(fp) is None:
                raise argparse.ArgumentTypeError(
                    "Can't guess input format for {fp.name!r}. "
                    "Try '--input-format=FMT'.")
+        args.reader = autoreader
     else:
        raise argparse.ArgumentTypeError("unknown input format {args.input_format!r}")
 
-    return args
+    # TODO: if writing to existing file, check & bail out if field mismatch
 
-# Guess the right reader based on the filename.
-def guessreader(fp):
-    if fp.name.endswith(".csv"):
-        reader = CSVReader
-    elif fp.name.endswith(".db"):
-        reader = SQLiteReader
-    else:
-        # FIXME: better format detection!!
-        # TODO: if fp is seekable, peek and figure out filetype
-        reader = None
-    return reader
-
-def autoreader(fp, itemtuple, **kwargs):
-    '''Convenience function to guess & instantiate the right writer'''
-    reader = guessreader(fp)
-    return reader(fp, itemtuple, **kwargs)
+    return args
 
-# FIXME: probably want the ItemWriters from parse-access-logs.py here
 class CountWriter:
-    def __init__(self, fp):
-        import csv
+    def __init__(self, outformat, fp, bucketclass):
         self._fp = fp
-        self._writer = csv.writer(fp)
-    def write(self, bucket, count):
-        self._writer.writerow((count,)+bucket)
+        self._bucketclass = bucketclass
+        self._countclass = NamedTuple(bucketclass.__name__ + "Count",
+            [("count", int)] + list(bucketclass.__annotations__.items()))
+        # TODO: countme_prelim "table" for prelim output
+        self._writer = make_writer(outformat, self._fp, self._countclass, timefield='weeknum', tablename='countme_totals')
     @staticmethod
     def sortkey(bucketcount):
         bucket, count = bucketcount
         # Sort by date (old->new), then count (high->low), then other fields.
-        return (bucket.week_start, -count) + bucket
+        return (bucket.weeknum, -count) + bucket
     def writecounts(self, counts):
+        self._writer.write_header()
         for bucket, count in sorted(counts.items(), key=self.sortkey):
-            self.write(bucket, count)
+            countitem = self._countclass._make((count,)+bucket)
+            self._writer.write_item(countitem)
+        self._writer.write_footer()
 
 def main():
     args = parse_args()
 
+    # Just a plain ol' Counter
     count = Counter()
-    # Set up our counter and bucket-maker.
-    # TODO: CountmeBucketCounter is half-assed; either full-ass it or just
-    # go with a simple item_bucket function.
-    #counter = CountmeBucketCounter()
 
     # Here's the function that finds the bucket for a given item.
     item_bucket = CountmeBucket.from_item
 
-    # Set up writer.
-    # FIXME: proper writers, append/update mode, etc.
-    countwriter = CountWriter(args.output)
-
+    # Okay, start reading our inputs and doing counts
     for inf in args.infiles:
         for item in args.reader(inf, CountmeItem):
             bucket = item_bucket(item)
             count[bucket] += 1
-        # TODO: how do we tell preliminary counts from final ones?
-        # if bucket.week_start in prelim_weeks: ...
-        countwriter.writecounts(count)
+    # TODO: how do we split preliminary counts from final ones?
+
+    # Write the counts.
+    writer = CountWriter(args.format, args.output, CountmeBucket)
+    writer.writecounts(count)
 
 if __name__ == '__main__':

diff --git a/countme/__init__.py b/countme/__init__.py
new file mode 100644
index 0000000..2fe6860
--- /dev/null
+++ b/countme/__init__.py
@@ -0,0 +1,388 @@
+# countme - parsing Fedora httpd access_log files to structured data.
+#
+# Copyright (C) 2020, Red Hat Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+# Author: Will Woods
+#
+# The main point of this script, as it says above, is parsing access_log to
+# structured data. I'm trying to avoid packing Fedora-specific data-massaging
+# into this; tools further down the pipeline can be responsible for figuring
+# out how to group "updates-released-f32" and "fedora-modular-source-32".
+
+import os
+import re
+from datetime import date, time, datetime, timezone
+from urllib.parse import parse_qsl
+from typing import NamedTuple, Optional
+
+from .regex import COUNTME_LOG_RE, MIRRORS_LOG_RE
+
+# TODO: clean this up so it only exports the common/needed bits
+__all__ = (
+    'weeknum', 'parse_logtime', 'parse_querydict',
+
+    'ItemWriter', 'CSVWriter', 'JSONWriter', 'AWKWriter', 'SQLiteWriter',
+    'ItemReader', 'CSVReader', 'SQLiteReader',
+
+    'make_writer', 'guessreader', 'autoreader',
+
+    'LogItem', 'MirrorItem', 'CountmeItem',
+    'LogMatcher', 'MirrorMatcher', 'CountmeMatcher',
+)
+
+# ===========================================================================
+# ====== Output item definitions and helpers ================================
+# ===========================================================================
+
+DAY_LEN = 24*60*60
+WEEK_LEN = 7*DAY_LEN
+COUNTME_EPOCH = 345600 # =00:00:00 Mon Jan 5 00:00:00 1970 (UTC)
+MONTHIDX = {
+    'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
+    'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12
+}
+
+def weeknum(timestamp):
+    return (int(timestamp) - COUNTME_EPOCH) // WEEK_LEN
+
+def parse_logtime(logtime):
+    # Equivalent to - but faster than:
+    #   return datetime.strptime(logtime, "%d/%b/%Y:%H:%M:%S %z")
+    # It's like ~1.2usec vs 12usec, which might seem trivial but in my tests
+    # the regex parser can handle like ~200k lines/sec - or 5usec/line - so
+    # an extra ~10usec to parse the time field is not insignificant.
+    # logtime: '29/Mar/2020:16:04:28 +0000'
+    # ISO8601: '2020-03-29T16:04:28+00:00'
+    y = logtime[7:11]
+    m = MONTHIDX[logtime[3:6]]
+    d = logtime[0:2]
+    time = logtime[12:20]
+    offh = logtime[21:24]
+    offm = logtime[24:26]
+    return datetime.fromisoformat(f"{y}-{m:02}-{d}T{time}{offh}:{offm}")
+
+def parse_logdate(logtime):
+    y = int(logtime[7:11])
+    m = MONTHIDX[logtime[3:6]]
+    d = int(logtime[0:2])
+    return date(y,m,d)
+
+def parse_querydict(querystr):
+    '''Parse request query the way mirrormanager does (last value wins)'''
+    return dict(parse_qsl(querystr))
+
+class LogItem(NamedTuple):
+    '''
+    Generic access.log data holder.
+    '''
+    host: str
+    identity: str
+    time: str
+    method: str
+    path: str
+    query: Optional[str]
+    protocol: str
+    status: int
+    nbytes: Optional[int]
+    referrer: str
+    user_agent: str
+
+    def datetime(self):
+        return parse_logtime(self.time)
+
+    def timestamp(self):
+        return parse_logtime(self.time).timestamp()
+
+    def queryitems(self):
+        return parse_qsl(self.query)
+
+    def querydict(self):
+        return parse_querydict(self.query)
+
+# TODO: would be kinda nice if there was a clear subclass / translation
+# between item classes... or if compile_log_regex made the class for you?
+# Or something? It feels like these things should be more closely bound.
+
+
+class MirrorItem(NamedTuple):
+    '''
+    A basic mirrorlist/metalink metadata item.
+    Each item has a timestamp, IP, and the requested repo= and arch= values.
+    '''
+    timestamp: int
+    host: str
+    repo_tag: Optional[str]
+    repo_arch: Optional[str]
+
+class CountmeItem(NamedTuple):
+    '''
+    A "countme" match item.
+    Includes the countme value and libdnf User-Agent fields.
+    '''
+    timestamp: int
+    host: str
+    os_name: str
+    os_version: str
+    os_variant: str
+    os_arch: str
+    countme: int
+    repo_tag: str
+    repo_arch: str
+
+class LogMatcher:
+    '''Base class for a LogMatcher, which iterates through a log file'''
+    regex = NotImplemented
+    itemtuple = NotImplemented
+    def __init__(self, fileobj):
+        self.fileobj = fileobj
+    def iteritems(self):
+        # TODO: at this point we're single-threaded and CPU-bound;
+        # multithreading would speed things up here.
+        for line in self.fileobj:
+            match = self.regex.match(line)
+            if match:
+                yield self.make_item(match)
+    __iter__ = iteritems
+    @classmethod
+    def make_item(cls, match):
+        raise NotImplementedError
+
+class MirrorMatcher(LogMatcher):
+    '''Match all mirrorlist/metalink items, like mirrorlist.py does.'''
+    regex = MIRRORS_LOG_RE
+    itemtuple = MirrorItem
+    @classmethod
+    def make_item(cls, match):
+        timestamp = parse_logtime(match['time']).timestamp()
+        query = parse_querydict(match['query'])
+        return cls.itemtuple(timestamp = int(timestamp),
+                             host      = match['host'],
+                             repo_tag  = query.get('repo'),
+                             repo_arch = query.get('arch'))
+
+class CountmeMatcher(LogMatcher):
+    '''Match the libdnf-style "countme" requests.'''
+    regex = COUNTME_LOG_RE
+    itemtuple = CountmeItem
+    @classmethod
+    def make_item(cls, match):
+        timestamp = parse_logtime(match['time']).timestamp()
+        query = parse_querydict(match['query'])
+        return cls.itemtuple(timestamp  = int(timestamp),
+                             host       = match['host'],
+                             os_name    = match['os_name'],
+                             os_version = match['os_version'],
+                             os_variant = match['os_variant'],
+                             os_arch    = match['os_arch'],
+                             countme    = int(query.get('countme')),
+                             repo_tag   = query.get('repo'),
+                             repo_arch  = query.get('arch'))
+
+# ===========================================================================
+# ====== ItemWriters - output formatting classes ============================
+# ===========================================================================
+
+class ItemWriter:
+    def __init__(self, fp, itemtuple, timefield='timestamp', **kwargs):
+        self._fp = fp
+        self._itemtuple = itemtuple
+        self._fields = itemtuple._fields
+        assert timefield in self._fields, f"{itemtuple.__name__!r} has no time field {timefield!r}"
+        self._timefield = timefield
+        self._get_writer(**kwargs)
+    def _get_writer(self, **kwargs):
+        raise NotImplementedError
+    def write_item(self, item):
+        raise NotImplementedError
+    def write_header(self):
+        pass
+    def write_footer(self):
+        pass
+
+class JSONWriter(ItemWriter):
+    def _get_writer(self, **kwargs):
+        import json
+        self._dump = json.dump
+    def write_item(self, item):
+        self._dump(item._asdict(), self._fp)
+
+class CSVWriter(ItemWriter):
+    def _get_writer(self, **kwargs):
+        import csv
+        self._writer = csv.writer(self._fp)
+    def write_header(self):
+        self._writer.writerow(self._fields)
+    def write_item(self, item):
+        self._writer.writerow(item)
+
+class AWKWriter(ItemWriter):
+    def _get_writer(self, field_separator='\t', **kwargs):
+        self._fieldsep = field_separator
+    def _write_row(self, vals):
+        self._fp.write(self._fieldsep.join(str(v) for v in vals) + '\n')
+    def write_header(self):
+        self._write_row(self._fields)
+    def write_item(self, item):
+        self._write_row(item)
+
+class SQLiteWriter(ItemWriter):
+    '''Write each item as a new row in a SQLite database table.'''
+    # We have to get a little fancier with types here since SQL tables expect
+    # typed values. Good thing Python has types now, eh?
+    SQL_TYPE = {
+        int:             "INTEGER NOT NULL",
+        str:             "TEXT NOT NULL",
+        float:           "REAL NOT NULL",
+        bytes:           "BLOB NOT NULL",
+        Optional[int]:   "INTEGER",
+        Optional[str]:   "TEXT",
+        Optional[float]: "REAL",
+        Optional[bytes]: "BLOB",
+    }
+    def _sqltype(self, fieldname):
+        typehint = self._itemtuple.__annotations__[fieldname]
+        return self.SQL_TYPE.get(typehint, "TEXT")
+    def _get_writer(self, tablename='countme_raw'):
+        self._tablename = tablename
+        import sqlite3
+        self._con = sqlite3.connect(self._fp.name)
+        self._cur = self._con.cursor()
+        # Generate SQL commands so we can use them later.
+        # self._create_table creates the table, with column names and types
+        # matching the names and types of the fields in self._itemtuple.
+        self._create_table = (
+            "CREATE TABLE IF NOT EXISTS {table} ({coldefs})".format(
+                table=tablename,
+                coldefs=",".join(f"{f} {self._sqltype(f)}" for f in self._fields),
+            )
+        )
+        # self._insert_item is an "INSERT" command with '?' placeholders.
+        self._insert_item = (
+            "INSERT INTO {table} ({colnames}) VALUES ({colvals})".format(
+                table=tablename,
+                colnames=",".join(self._fields),
+                colvals=",".join("?" for f in self._fields),
+            )
+        )
+        # self._create_time_index creates an index on 'timestamp' or whatever
+        # the time-series field is.
+        self._create_time_index = (
+            "CREATE INDEX IF NOT EXISTS {timefield}_idx on {table} ({timefield})".format(
+                table=tablename,
+                timefield=self._timefield
+            )
+        )
+    def write_header(self):
+        self._cur.execute(self._create_table)
+    def write_item(self, item):
+        self._cur.execute(self._insert_item, item)
+    def write_footer(self):
+        self._cur.execute(self._create_time_index)
+        self._con.commit()
+
+def make_writer(name, *args, **kwargs):
+    '''Convenience function to grab/instantiate the right writer'''
+    if name == "csv":
+        writer = CSVWriter
+    elif name == "json":
+        writer = JSONWriter
+    elif name == "awk":
+        writer = AWKWriter
+    elif name == "sqlite":
+        writer = SQLiteWriter
+    else:
+        raise ValueError(f"Unknown writer '{name}'")
+    return writer(*args, **kwargs)
+
+# ===========================================================================
+# ====== ItemReaders - counterpart to ItemWriter ============================
+# ===========================================================================
+
+class ReaderError(RuntimeError):
+    pass
+
+class ItemReader:
+    def __init__(self, fp, itemtuple, **kwargs):
+        self._fp = fp
+        self._itemtuple = itemtuple
+        self._itemfields = itemtuple._fields
+        self._itemfactory = itemtuple._make
+        self._filefields = None
+        self._get_reader(**kwargs)
+        if not self._filefields:
+            raise ReaderError("no field names found")
+        if self._filefields != self._itemfields:
+            raise ReaderError(f"field mismatch: expected {self._itemfields}, got {self._filefields}")
+    def _get_reader(self):
+        '''Set up the ItemReader.
+        Should set self._filefields to a tuple of the fields found in fp.'''
+        raise NotImplementedError
+    def _iter_rows(self):
+        '''Return an iterator/generator that produces a row for each item.'''
+        raise NotImplementedError
+    def __iter__(self):
+        for item in self._iter_rows():
+            yield self._itemfactory(item)
+
+class CSVReader(ItemReader):
+    def _get_reader(self, **kwargs):
+        import csv
+        self._reader = csv.reader(self._fp)
+        self._filefields = tuple(next(self._reader))
+        # If we have numbers in our fieldnames, probably there was no header
+        if any(name.isnumeric() for name in self._filefields):
+            header = ','.join(self._filefields)
+            raise ReaderError(f"header bad/missing, got: {header}")
+    def _iter_rows(self):
+        return self._reader
+
+# TODO: AWKReader, JSONReader
+
+class SQLiteReader(ItemReader):
+    def _get_reader(self, tablename='countme_raw', **kwargs):
+        import sqlite3
+        self._con = sqlite3.connect(self._fp.name)
+        # TODO: self._con.set_progress_handler(handler, call_interval)
+        self._cur = self._con.cursor()
+        self._tablename = tablename
+        if False and sqlite3.sqlite_version_info >= (3,16,0):
+            fields_sql = f"SELECT name FROM pragma_table_info(?)"
+            self._filefields = tuple(r[0] for r in self._cur.execute(fields_sql, (tablename,)))
+        else:
+            fields_sql = f"PRAGMA table_info('{tablename}')"
+            self._filefields = tuple(r[1] for r in self._cur.execute(fields_sql))
+    def _iter_rows(self):
+        fields = ",".join(self._itemfields)
+        return self._cur.execute(f"SELECT {fields} FROM {self._tablename}")
+
+# Guess the right reader based on the filename.
+def guessreader(fp):
+    if fp.name.endswith(".csv"):
+        reader = CSVReader
+    elif fp.name.endswith(".db"):
+        reader = SQLiteReader
+    else:
+        # FIXME: better format detection!!
+        # TODO: if fp is seekable, peek and figure out filetype
+        reader = None
+    return reader
+
+# TODO: should have name/args more like make_writer...
+def autoreader(fp, itemtuple, **kwargs):
+    '''Convenience function to guess & instantiate the right reader'''
+    reader = guessreader(fp)
+    return reader(fp, itemtuple, **kwargs)

diff --git a/countme/progress.py b/countme/progress.py
new file mode 100644
index 0000000..67a871f
--- /dev/null
+++ b/countme/progress.py
@@ -0,0 +1,164 @@
+# countme.progress: progress meters for CLI output
+#
+# Copyright (C) 2020, Red Hat Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+# Author: Will Woods
+
+import os
+from .regex import compile_log_regex, LOG_DATE_RE
+
+__all__ = (
+    'ReadProgress', 'TQDMReadProgress', 'DIYReadProgress',
+)
+
+# ===========================================================================
+# ====== Progress meters & helpers ==========================================
+# ===========================================================================
+
+def log_date(line):
+    match = LOG_DATE_RE.match(line)
+    if match:
+        return match['date']
+    return "??/??/????"
+
+class ReadProgressBase:
+    def __init__(self, logs, display=True):
+        '''logs should be a sequence of line-iterable file-like objects.
+        if display is False, no progress output will be printed.'''
+        self.logs = logs
+        self.display = display
+
+    def __iter__(self):
+        '''Iterator for ReadProgress; yields a sequence of line-iterable
+        file-like objects (one for each log in logs).'''
+        for num, logf in enumerate(self.logs):
+            yield self._iter_log_lines(logf, num)
+
+    def _iter_log_lines(self, logf, lognum):
+        raise NotImplementedError
+
+
+# If we have the tqdm module available then hooray
+class TQDMReadProgress(ReadProgressBase):
+    def _iter_log_lines(self, logf, num):
+        # Make a progress meter for this file
+        prog = tqdm(unit="B", unit_scale=True, unit_divisor=1024,
+                    total=os.stat(logf.name).st_size,
+                    disable=True if not self.display else None,
+                    desc=f"log {num+1}/{len(self.logs)}")
+        # Get the first line manually so we can get logdate
+        line = next(logf)
+        prog.set_description(f"{prog.desc}, date={log_date(line)}")
+        # Update bar and yield the first line
+        prog.update(len(line))
+        yield line
+        # And now we do the rest of the file
+        for line in logf:
+            prog.update(len(line))
+            yield line
+        prog.close()
+
+class DIYReadProgress(ReadProgressBase):
+    def _iter_log_lines(self, logf, num):
+        # Make a progress meter for this file
+        prog = diyprog(total=os.stat(logf.name).st_size,
+                       disable=True if not self.display else None,
+                       desc=f"log {num+1}/{len(self.logs)}")
+        # Get the first line manually so we can get logdate
+        line = next(logf)
+        prog.set_description(f"{prog.desc}, date={log_date(line)}")
+        # Update bar and yield the first line
+        prog.update(len(line))
+        yield line
+        # And now we do the rest of the file
+        for line in logf:
+            prog.update(len(line))
+            yield line
+        prog.close()
+
+class diyprog:
+    def __init__(self, desc=None, total=None, file=None, disable=False,
+                 unit='b', unit_scale=True, barchar='_-=#'):
+        self.desc = desc
+        self.total = total
+        self.file = file
+        self.disable = disable
+        self.unit = unit
+        self.unit_scale = unit_scale
+        #self.unit_divisor = unit_divisor
+        self.count = 0
+        self.showat = 0
+        self.barchar = barchar
+
+    def set_description(self, desc=None, refresh=True):
+        self.desc = desc
+        if refresh:
+            self.display()
+
+    def update(self, n=1):
+        if self.disable: return
+        self.count += n
+        if self.count >= self.showat:
+            self.showat += self.total // 100
+            self.display()
+
+    @staticmethod
+    def hrsize(n):
+        for suffix in 'kmgtp':
+            n /= 1000
+            if n < 1000:
+                break
+        return f"{n:.1f}{suffix}"
+
+    @staticmethod
+    def hrtime(nsecs):
+        m, s = divmod(int(nsecs), 60)
+        if m > 60:
+            h, m = divmod(m, 60)
+            return f"{h:02d}h{m:02d}m{s:02d}s"
+        elif m:
+            return f"{m:02d}m{s:02d}s"
+        else:
+            return f"{s:02d}s"
+
+    def display(self):
+        unit = self.unit
+        desc = self.desc
+        if self.unit_scale:
+            count = self.hrsize(self.count) + unit
+            total = self.hrsize(self.total) + unit
+        else:
+            count = str(self.count) + unit
+            total = str(self.total) + unit
+        pct = (self.count * 100) // self.total
+        bar = (pct // 4) * self.barchar[-1]
+        if pct < 100:
+            bar += self.barchar[pct % 4]
+        print(f"{desc}: {pct:>3}% [{bar:<25}] {count:>7}/{total:<7}",
+              flush=True, file=self.file, end='\r')
+
+    def close(self):
+        if self.disable: return
+        print(flush=True, file=self.file)
+
+# Default ReadProgress: use tqdm if possible, else use the DIY one
+try:
+    # TODO: make this work with a local tqdm checkout/git submodule
+    from tqdm import tqdm
+    ReadProgress = TQDMReadProgress
+except ImportError:
+    ReadProgress = DIYReadProgress
+

diff --git a/countme/regex.py b/countme/regex.py
new file mode 100644
index 0000000..28c8214
--- /dev/null
+++ b/countme/regex.py
@@ -0,0 +1,172 @@
+# countme.regex - regexes for log matching and parsing
+#
+# Copyright (C) 2020, Red Hat Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+# Author: Will Woods
+
+import re
+
+__all__ = (
+    'compile_log_regex',
+    'LOG_RE', 'LIBDNF_USER_AGENT_RE',
+    'MIRRORS_LOG_RE', 'COUNTME_LOG_RE', 'LOG_DATE_RE',
+)
+
+# ===========================================================================
+# ====== Regexes! Get your regexes here! ====================================
+# ===========================================================================
+
+# Log format, according to ansible/roles/httpd/proxy/templates/httpd.conf.j2:
+#   LogFormat "%a %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""
+# That's the standard Combined Log Format, with numeric IPs (%a).
+#
+# Example log line:
+#   240.159.140.173 - - [29/Mar/2020:16:04:28 +0000] "GET /metalink?repo=fedora-modular-32&arch=x86_64&countme=1 HTTP/2.0" 200 18336 "-" "libdnf (Fedora 32; workstation; Linux.x86_64)"
+#
+# Here it is as a Python regex, with a format placeholder for the actual field
+# contents. Default field regexes are in LOG_PATTERN_FIELDS, below, and
+# compile_log_regex() lets you construct more interesting regexes that only
+# match the lines you care about.
+# The request target is split into 'path' and 'query'; 'path' is always
+# present but 'query' may be absent, depending on the value of 'query_match'.
+# 'query_match' should be '?' (optional), '' (required), or '{0}' (absent).
+LOG_PATTERN_FORMAT = (
+    r'^'
+    r'(?P<host>{host})\s'
+    r'(?P<identity>{identity})\s'
+    r'(?P<user>{user})\s'
+    r'\[(?P
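
Not part of the patch: a minimal usage sketch of how the new countme module's reader and writer helpers are meant to fit together, mirroring what countme-totals.py does with autoreader and make_writer. The file names "countme-raw.csv" and "countme-raw.db" are made-up examples, not anything from the commit.

    from countme import CountmeItem, autoreader, make_writer

    # Copy CountmeItem rows from a CSV (as produced by parse-access-log.py)
    # into a SQLite table, letting autoreader() guess the input format from
    # the ".csv" extension and SQLiteWriter handle table creation/indexing.
    with open("countme-raw.csv", "rt", encoding="utf-8") as infile, \
         open("countme-raw.db", "at", encoding="utf-8") as outfile:
        writer = make_writer("sqlite", outfile, CountmeItem, tablename="countme_raw")
        writer.write_header()
        for item in autoreader(infile, CountmeItem):
            writer.write_item(item)
        writer.write_footer()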