#6 Cache: Use database to cache fetched data
Merged 4 years ago by frantisekz. Opened 4 years ago by frantisekz.

file added
+1
@@ -0,0 +1,1 @@ 

+ Generic single-database configuration. 

\ No newline at end of file

file modified
+23 -17
@@ -1,8 +1,10 @@ 

- from __future__ import with_statement

- from alembic import context

- from sqlalchemy import engine_from_config, pool

  from logging.config import fileConfig

  

+ from sqlalchemy import engine_from_config

+ from sqlalchemy import pool

+ 

+ from alembic import context

+ 

  # add '.' to the pythonpath to support migration inside development env

  import sys

  sys.path.append('.')
@@ -18,6 +20,7 @@ 

  # add your model's MetaData object here

  # for 'autogenerate' support

  from oraculum import db

+ # target_metadata = mymodel.Base.metadata

  target_metadata = db.metadata

  

  # other values from the config, defined by the needs of env.py,
@@ -38,8 +41,14 @@ 

      script output.

  

      """

-     url = config.get_main_option("sqlalchemy.url")

-     context.configure(url=url)

+     #url = config.get_main_option("sqlalchemy.url")

+     from oraculum import app

+     context.configure(

+         url=app.config['SQLALCHEMY_DATABASE_URI'],

+         target_metadata=target_metadata,

+         literal_binds=True,

+         dialect_opts={"paramstyle": "named"},

+     )

  

      with context.begin_transaction():

          context.run_migrations()
@@ -52,27 +61,24 @@ 

      and associate a connection with the context.

  

      """

- 

      alembic_config = config.get_section(config.config_ini_section)

      from oraculum import app

      alembic_config['sqlalchemy.url'] = app.config['SQLALCHEMY_DATABASE_URI']

  

-     engine = engine_from_config(

+     connectable = engine_from_config(

          alembic_config,

-         prefix='sqlalchemy.',

-         poolclass=pool.NullPool)

- 

-     connection = engine.connect()

-     context.configure(

-         connection=connection,

-         target_metadata=target_metadata

+         prefix="sqlalchemy.",

+         poolclass=pool.NullPool,

      )

  

-     try:

+     with connectable.connect() as connection:

+         context.configure(

+             connection=connection, target_metadata=target_metadata

+         )

+ 

          with context.begin_transaction():

              context.run_migrations()

-     finally:

-         connection.close()

+ 

  

  if context.is_offline_mode():

      run_migrations_offline()

file modified
+3 -3
@@ -5,6 +5,9 @@ 

  Create Date: ${create_date}

  

  """

+ from alembic import op

+ import sqlalchemy as sa

+ ${imports if imports else ""}

  

  # revision identifiers, used by Alembic.

  revision = ${repr(up_revision)}
@@ -12,9 +15,6 @@ 

  branch_labels = ${repr(branch_labels)}

  depends_on = ${repr(depends_on)}

  

- from alembic import op

- import sqlalchemy as sa

- ${imports if imports else ""}

  

  def upgrade():

      ${upgrades if upgrades else "pass"}

@@ -1,21 +0,0 @@ 

- """Initial revision

- 

- Revision ID: 15f5eeb9f635

- Revises:

- Create Date: 2015-04-29 13:43:16.481727

- 

- """

- 

- # revision identifiers, used by Alembic.

- revision = '15f5eeb9f635'

- down_revision = None

- branch_labels = None

- depends_on = None

- 

- 

- def upgrade():

-     pass

- 

- 

- def downgrade():

-     pass

@@ -0,0 +1,35 @@ 

+ """DB Cache

+ 

+ Revision ID: f37edffe0265

+ Revises: 

+ Create Date: 2020-03-09 14:13:11.323459

+ 

+ """

+ from alembic import op

+ import sqlalchemy as sa

+ 

+ 

+ # revision identifiers, used by Alembic.

+ revision = 'f37edffe0265'

+ down_revision = None

+ branch_labels = None

+ depends_on = None

+ 

+ 

+ def upgrade():

+     # ### commands auto generated by Alembic - please adjust! ###

+     op.create_table('cached_data',

+     sa.Column('id', sa.Integer(), nullable=False),

+     sa.Column('provider', sa.Text(), nullable=True),

+     sa.Column('time_created', sa.DateTime(), nullable=True),

+     sa.Column('raw_text', sa.Text(), nullable=True),

+     sa.PrimaryKeyConstraint('id'),

+     sa.UniqueConstraint('provider')

+     )

+     # ### end Alembic commands ###

+ 

+ 

+ def downgrade():

+     # ### commands auto generated by Alembic - please adjust! ###

+     op.drop_table('cached_data')

+     # ### end Alembic commands ###

@@ -3,6 +3,8 @@ 

  SQLALCHEMY_DATABASE_URI = 'postgresql+psycopg2://dbuser:dbpassword@dbhost:dbport/dbname'

  SHOW_DB_URI = False

  PRODUCTION = True

+ MAX_DB_AGE = 1800 # Max cache age allowed in seconds

+ SKIP_CACHE_AGE_CHECK = False # Skip checking cache age at runtime; make sure to set up cron with "run_cli.py sync" if set to True

  

  FILE_LOGGING = False

  SYSLOG_LOGGING = False

file modified
+4 -4
@@ -1,13 +1,13 @@ 

  #!/usr/bin/bash

  # this is a simple script to aid in the setup of a new db for F18

  

- echo "No database required at the moment. DONE"

- exit 0

+ #echo "No database required at the moment. DONE"

+ #exit 0

  

  

  # init db

- python run_cli.py init_db

+ python3 run_cli.py init_db

  

  # insert mock data

- python run_cli.py mock_data

+ python3 run_cli.py mock_data

  

file modified
+2 -2
@@ -18,7 +18,7 @@ 

  #    Josef Skladanka <jskladan@redhat.com>

  

  from flask import Flask

- #from flask_sqlalchemy import SQLAlchemy

+ from flask_sqlalchemy import SQLAlchemy

  

  from flask_caching import Cache

  from flask_cors import CORS
@@ -108,7 +108,7 @@ 

  app.config['JSON_AS_ASCII'] = False

  

  ## database

- #db = SQLAlchemy(app)

+ db = SQLAlchemy(app)

  #

  # register blueprints

  

file modified
+78 -113
@@ -17,121 +17,86 @@ 

  # Authors:

  #    Josef Skladanka <jskladan@redhat.com>

  

- #import os

- #import sys

- #from optparse import OptionParser

- #

- #from alembic.config import Config

- #from alembic import command as al_command

- #from alembic.migration import MigrationContext

- #

- #from oraculum import db

- #from oraculum.models.user import User

- #

- #

- #def get_alembic_config():

- #    # the location of the alembic ini file and alembic scripts changes when

- #    # installed via package

- #    if os.path.exists("./alembic.ini"):

- #        alembic_cfg = Config("./alembic.ini")

- #    else:

- #        alembic_cfg = Config("/usr/share/oraculum/alembic.ini",

- #                             ini_section='alembic-packaged')

- #    return alembic_cfg

- #

- #

- #def upgrade_db(*args):

- #    alembic_cfg = get_alembic_config()

- #

- #    context = MigrationContext.configure(db.engine.connect())

- #    current_rev = context.get_current_revision()

- #    print "Upgrading Database to `head` from `%s`" % current_rev

- #

- #    al_command.upgrade(alembic_cfg, "head")

- #

- #

- #def init_alembic(*args):

- #    alembic_cfg = get_alembic_config()

- #

- #    # check to see if the db has already been initialized by checking for an

- #    # alembic revision

- #    context = MigrationContext.configure(db.engine.connect())

- #    current_rev = context.get_current_revision()

- #

- #    if not current_rev:

- #        print "Initializing alembic"

- #        print " - Setting the version to the first revision"

- #        al_command.stamp(alembic_cfg, "15f5eeb9f635")

- #    else:

- #        print "Alembic already initialized"

- #

- #

- #def initialize_db(destructive):

- #    alembic_cfg = get_alembic_config()

- #

- #    print "Initializing database"

- #    if destructive:

- #        print " - Dropping all tables"

- #        db.drop_all()

- #        print " - Creating tables"

- #        db.create_all()

- #        print " - Stamping alembic's current version to 'head'"

- #        al_command.stamp(alembic_cfg, "head")

- #

- #    init_alembic()

- #    upgrade_db()

- #

- #

- #def mock_data(destructive):

- #    print "Populating tables with mock-data"

- #

- #    if destructive or not db.session.query(User).count():

- #        print " - User"

- #        data_users = [('admin', 'admin'), ('user', 'user')]

- #

- #        for d in data_users:

- #            u = User(*d)

- #            db.session.add(u)

- #

- #        db.session.commit()

- #    else:

- #        print " - skipped User"

- #

- #

- #def main():

- #    possible_commands = ['init_db', 'mock_data', 'upgrade_db', 'init_alembic']

- #

- #    usage = 'usage: [DEV=true] %prog ' + "(%s)" % ' | '.join(possible_commands)

- #    parser = OptionParser(usage=usage)

- #    parser.add_option("-d", "--destructive",

- #                      action="store_true", dest="destructive", default=False,

- #                      help="Drop tables in `init_db`; Store data in `mock_data` "

- #                      "even if the tables are not empty")

- #

- #    (options, args) = parser.parse_args()

- #

- #    if len(args) != 1 or args[0] not in possible_commands:

- #        print usage

- #        print

- #        print 'Please use one of the following commands: %s' % str(possible_commands)

- #        sys.exit(1)

- #

- #    command = {

- #        'init_db': initialize_db,

- #        'mock_data': mock_data,

- #        'upgrade_db': upgrade_db,

- #        'init_alembic': init_alembic,

- #    }[args[0]]

- #

- #    if not options.destructive:

- #        print "Proceeding with non-destructive init. To perform destructive "\

- #            "steps use -d option."

- #

- #    command(options.destructive)

- #

+ import os

+ import sys

+ from optparse import OptionParser

+ 

+ from alembic.config import Config

+ from alembic import command as al_command

+ from alembic.migration import MigrationContext

+ 

+ from oraculum import app, db, controllers

+ from oraculum.utils import db_utils

+ 

+ def get_alembic_config():

+     # the location of the alembic ini file and alembic scripts changes when

+     # installed via package

+     if os.path.exists("./alembic.ini"):

+         alembic_cfg = Config("./alembic.ini")

+     else:

+         alembic_cfg = Config("/usr/share/blockerbugs/alembic.ini",

+                              ini_section='alembic-packaged')

+     return alembic_cfg

+ 

+ 

+ def initialize_db():

+     alembic_cfg = get_alembic_config()

+ 

+     # check to see if the db has already been initialized by checking for an

+     # alembic revision

+     context = MigrationContext.configure(db.engine.connect())

+     current_rev = context.get_current_revision()

+ 

+     if current_rev:

+         print("Database already initialized and at rev %s - not re-initializing" % current_rev)

+     else:

+         print("Initializing Database")

+         db.drop_all()

+         db.create_all()

+ 

+         print("Initializing alembic version")

+ 

+         al_command.stamp(alembic_cfg, "head")

+ 

+ 

+ def upgrade_db():

+     print("Upgrading Database to Latest Revision")

+     alembic_cfg = get_alembic_config()

+     al_command.upgrade(alembic_cfg, "head")

+ 

+ def sync():

+     print("Refreshing DB Cache")

+     app.config['MAX_DB_AGE'] = 0

+     app.config['SKIP_CACHE_AGE_CHECK'] = False

+     db_utils.refresh_data("get_actions", controllers.main.get_actions())

+     #db_utils.refresh_data("api_v0_meetings", controllers.main.api_v0_meetings()) FIXME

  

  def main():

-     print("No cli options available")

+     possible_commands = ['init_db', 'generate_config', 'upgrade_db', 'sync']

+ 

+     usage = 'usage: %prog [options] command \n  Possible Commands: ' + ' '.join(

+         possible_commands)

+     parser = OptionParser(usage=usage)

+     parser.add_option('-d', '--dburi', dest='dburi', help='dburi to use')

+ 

+     (options, args) = parser.parse_args()

+ 

+     if len(args) < 1:

+         print("need to have at least 1 command")

+         sys.exit(1)

+ 

+     command = args[0]

+     if command not in possible_commands:

+         print("Invalid command: %s" % command)

+         print("Please use one of the following commands: %s" % str(possible_commands))

+         sys.exit(1)

+ 

+     elif command == 'upgrade_db':

+         upgrade_db()

+     elif command == 'init_db':

+         initialize_db()

+     elif command == 'sync':

+         sync()

  

  if __name__ == '__main__':

      main()

file modified
+26 -21
@@ -17,33 +17,35 @@ 

  # Authors:

  #    Josef Skladanka <jskladan@redhat.com>

  

- from oraculum import app, cache

- from oraculum.data_providers import PROVIDERS

- from oraculum.utils import fedocal, schedule, blockerbugs, meetbot

- 

- import flask

- from flask import request, url_for, jsonify

- from werkzeug.contrib.cache import SimpleCache

- 

- import re

  import urllib

  import itertools

+ import re

  import requests

  

+ import flask

+ from flask import request, url_for, jsonify

+ from werkzeug.contrib.cache import SimpleCache

+ from oraculum import app, cache

+ from oraculum.data_providers import PROVIDERS

+ from oraculum.utils import fedocal, schedule, blockerbugs, meetbot, db_utils

  

  # FIXME: Move this into a proper config file

  CACHE_TIMEOUT = 3600

+ # MAX_DB_AGE defined in oraculum.utils.db_utils

  

  @app.route('/api/v1/landing_page')

- @cache.cached(timeout=CACHE_TIMEOUT)

+ #@cache.cached(timeout=CACHE_TIMEOUT)

  def api_v0_meetings():

+     cached = db_utils.get_db_data("api_v0_meetings")

+     if cached:

+         return cached

+ 

      mtgs = fedocal.get_qa_meetings()

      last_qa_meeting = meetbot.get_last_qa_meeting()

      sched = schedule.get_schedule()

      blockers = blockerbugs.get_blockerbugs()

      stable = schedule.current_stable()

      devel = schedule.current_devel()

- 

      resp = {

          'meetings': mtgs,

          'last_qa_meeting': last_qa_meeting,
@@ -52,17 +54,20 @@ 

          'stable': stable,

          'devel': devel,

        }

-     return jsonify(resp)

+     json_resp = jsonify(resp)

+     db_utils.refresh_data("api_v0_meetings", resp)

+     return json_resp

  

- CACHE = SimpleCache()

  def get_actions(provider=None, tags=None):

-     actions = CACHE.get('api_v1_actions')

-     TIMEOUT = 3600 # FIXME - make the caching configurable (ideally per-provider)

-     if actions is None:

-         actions = []

-         for p_name, p_module in PROVIDERS.items():

-             actions.extend(p_module.get_actions())

-         CACHE.set('api_v1_actions', actions, timeout=TIMEOUT)

+     actions = []

+     for p_name, p_module in PROVIDERS.items():

+         cached = db_utils.get_db_data(p_name)

+         if cached:

+             actions.extend(cached)

+         else:

+             p_actions = p_module.get_actions()

+             actions.extend(p_actions)

+             db_utils.refresh_data(p_name, p_actions)

  

      if provider:

          actions = [a for a in actions if a['provider'] == provider]
@@ -73,7 +78,7 @@ 

  

  

  @app.route('/api/v1/actions')

- def api_v1_actons():

+ def api_v1_actions():

      provider = request.args.get('provider')

      tags = request.args.get('tags')

  

@@ -0,0 +1,35 @@ 

+ #

+ # landing_page.py - Database model for landing page

+ #

+ # Copyright 2020, Red Hat, Inc

+ #

+ # This program is free software; you can redistribute it and/or modify

+ # it under the terms of the GNU General Public License as published by

+ # the Free Software Foundation; either version 2 of the License, or

+ # (at your option) any later version.

+ #

+ # This program is distributed in the hope that it will be useful,

+ # but WITHOUT ANY WARRANTY; without even the implied warranty of

+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

+ # GNU General Public License for more details.

+ #

+ # You should have received a copy of the GNU General Public License along

+ # with this program; if not, write to the Free Software Foundation, Inc.,

+ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

+ #

+ # Authors:

+ #   Frantisek Zatloukal <fzatlouk@redhat.com>

+ 

+ from oraculum import db

+ from datetime import datetime

+ 

+ class CachedData(db.Model):

+     id = db.Column(db.Integer, primary_key=True)

+     provider = db.Column(db.Text, unique=True)

+     time_created = db.Column(db.DateTime, unique=False)

+     raw_text = db.Column(db.Text, unique=False)

+ 

+     def __init__(self, provider, raw_text):

+         self.provider = provider

+         self.time_created = datetime.utcnow()

+         self.raw_text = raw_text

@@ -0,0 +1,66 @@ 

+ #

+ # db_utils.py - Database helper functions landing page cache

+ #

+ # Copyright 2020, Red Hat, Inc

+ #

+ # This program is free software; you can redistribute it and/or modify

+ # it under the terms of the GNU General Public License as published by

+ # the Free Software Foundation; either version 2 of the License, or

+ # (at your option) any later version.

+ #

+ # This program is distributed in the hope that it will be useful,

+ # but WITHOUT ANY WARRANTY; without even the implied warranty of

+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

+ # GNU General Public License for more details.

+ #

+ # You should have received a copy of the GNU General Public License along

+ # with this program; if not, write to the Free Software Foundation, Inc.,

+ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

+ #

+ # Authors:

+ #   Frantisek Zatloukal <fzatlouk@redhat.com>

+ import json

+ 

+ from datetime import datetime, timedelta

+ from oraculum import app, db

+ from oraculum.models.db_cache import CachedData

+ 

+ def is_new_enough(db_time):

+     """

+     Checks if given db_time is new enough according to MAX_DB_AGE

+     Skips check if SKIP_CACHE_AGE_CHECK is set to True

+     """

+     if app.config['SKIP_CACHE_AGE_CHECK']:

+         return True

+     if not db_time:

+         return False

+     if (datetime.utcnow() - timedelta(seconds=app.config['MAX_DB_AGE'])) >= db_time:

+         return False

+     return True

+ 

+ def refresh_data(provider, data):

+     """

+     Refreshes given data for given provider in the db

+     Returns immediately when SKIP_CACHE_AGE_CHECK is set

+     """

+     if app.config['SKIP_CACHE_AGE_CHECK']:

+         return True

+     row = CachedData.query.filter_by(provider=provider).first()

+     if row:

+         row.time_created = datetime.utcnow()

+         row.raw_text = json.dumps(data)

+     else:

+         db.session.add(CachedData(provider, json.dumps(data)))

+     db.session.commit()

+ 

+ def get_db_data(provider):

+     """

+     Returns cached data for API serving

+     Parses raw_text from the db if it is new enough; returns False on a cache miss

+     """

+     row = CachedData.query.filter_by(provider=provider).first()

+     if row and is_new_enough(row.time_created):

+         app.logger.debug("DB Cache hit for provider: %s" % provider)

+         return json.loads(row.raw_text)

+     app.logger.debug("DB Cache miss for provider: %s" % provider)

+     return False

@@ -20,4 +20,3 @@ 

          }

  

      return data

-      

\ No newline at end of file

file modified
+8 -2
@@ -37,7 +37,6 @@ 

                  summary = c.get('summary')

                  date = c.get('dtstart').dt.astimezone(pytz.UTC)

                  if _is_relevant(summary):

-                     print(c)

                      data.append({

                          'summary': summary,

                          'date': date.strftime("%d %b %Y"),
@@ -63,5 +62,12 @@ 

          'current': not any([d['current'] for d in data]),

      }

      data = [prebranch] + data

+     data_cleaned = []

+     for schedule_event in data:

+         data_cleaned.append({

+             'summary': schedule_event["summary"],

+             'date': schedule_event["date"],

+             'current': schedule_event["current"]

+         })

  

-     return data

+     return data_cleaned

Implements caching in the database instead of in memory, as described in https://pagure.io/fedora-qa/oraculum/issue/3.

Currently, it handles only the landing_page data (meetings, last_qa_meeting, schedule, blockerbugs, stable, devel).
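For reference, the two new config knobs from the diff above work together like this (a hedged illustration; the values are only examples):

    # app settings (illustrative values, not recommendations)
    MAX_DB_AGE = 1800             # serve a cached entry only while younger than 30 minutes
    SKIP_CACHE_AGE_CHECK = False  # set to True only when cron runs "run_cli.py sync" periodically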

TODO:

  • Handle the case where it might be okay to have, for example, zero meetings, and stop refreshing the db every time in that case. Maybe rely more on provider_timestamp and relax some checks in db_utils... - DONE

  • dtdate in schedule is displayed in a different date/time format when received from cache compared to a fresh fetch. At the moment, I don't have any idea why that is happening; might need to take a second look. It's stored and always handled as a string, so this shouldn't be happening :/

Some questions that came to my mind:

  • Should I add some verification that the data fetch succeeded, and keep the old db data if it didn't? And maybe allow configuring a longer cache lifetime for providers that failed to obtain data? (A rough sketch of what I mean follows.)
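Not part of this PR; just a rough sketch of what such verification could look like (refresh_data_safe and fetch_callable are made-up names):

    # Hypothetical sketch: keep the old cached row when a provider fetch
    # fails, so a transient outage does not wipe the cache.
    import json
    from datetime import datetime

    from oraculum import db
    from oraculum.models.db_cache import CachedData

    def refresh_data_safe(provider, fetch_callable):
        """Fetch fresh data; on failure, leave the existing cached row untouched."""
        try:
            data = fetch_callable()
        except Exception:  # any fetch error -> keep serving the stale cache
            return False
        row = CachedData.query.filter_by(provider=provider).first()
        if row:
            row.time_created = datetime.utcnow()
            row.raw_text = json.dumps(data)
        else:
            db.session.add(CachedData(provider, json.dumps(data)))
        db.session.commit()
        return True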

Notes to myself:

  • Don't save time_created for each stored value (they should never differ too much); instead, create a special table that stores time_created on a per-provider basis (see the sketch after this list). This would make checking the cache age cheaper. - DONE

  • Create a sync function in cli.py that preloads all data into the database. That way, it could be called every n minutes by cron, a systemd timer, ... - DONE

  • Make it possible to configure the app to completely skip the time_created verification (in deployments where the db cache creation is handled elsewhere). - DONE (might be optimized a little more, by skipping "load_all_providers_timestamp" completely)
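The "Separate table for timestamps" commit mentioned below is not part of the diff shown above; a minimal sketch of the per-provider timestamp idea, with an assumed model name:

    # Assumed sketch (the real model from the "Separate table for timestamps"
    # commit is not shown in this diff): one tiny row per provider makes the
    # cache age check cheap.
    from datetime import datetime

    from oraculum import db

    class ProviderTimestamp(db.Model):
        id = db.Column(db.Integer, primary_key=True)
        provider = db.Column(db.Text, unique=True)
        time_created = db.Column(db.DateTime, unique=False)

        def __init__(self, provider):
            self.provider = provider
            self.time_created = datetime.utcnow()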

1 new commit added

  • Separate table for timestamps
4 years ago

I've added (in a separate commit, yay!) changes to use an extra table with timestamps on a per-provider basis instead of checking the time for each entry. It (at least in theory) should be faster in the hot path (loading cached data, which is expected to be hit by users in production).

I am planning to measure how much of an impact calling a query per provider makes (load_providers_timestamp(provider_name)); if it is measurable, I'll try to change it to load all the providers at once and reuse that result for each provider.

Also, part of that commit is an alembic reinit, which I did when desperately trying to figure out why alembic autogenerate was not working. I can revert that if you want, but it is a newer version of env.py and seems to work just fine.

2 new commits added

  • Separate table for timestamps
  • Cache: Use database to cache fetched data
4 years ago

I am planning to measure how much of an impact calling a query per provider makes (load_providers_timestamp(provider_name)); if it is measurable, I'll try to change it to load all the providers at once and reuse that result for each provider.

Done; without that, the "optimized" version was actually slower.
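A sketch of what loading all the providers at once could look like (the function name is assumed; it builds on the CachedData model from the diff):

    # Assumed sketch: one query fetches every provider's timestamp, and the
    # resulting dict is reused for each provider in the hot path.
    from oraculum import db
    from oraculum.models.db_cache import CachedData

    def load_all_providers_timestamps():
        """Return {provider: time_created} for all cached rows in one query."""
        rows = db.session.query(CachedData.provider, CachedData.time_created).all()
        return dict(rows)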

Without the "Separate table for timestamps" commit:

$ ab -n 5000 -c 500 http://0.0.0.0:5000/api/v1/landing_page
This is ApacheBench, Version 2.3 <$Revision: 1843412 $>
Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
Licensed to The Apache Software Foundation, http://www.apache.org/

Benchmarking 0.0.0.0 (be patient)
Completed 500 requests
Completed 1000 requests
Completed 1500 requests
Completed 2000 requests
Completed 2500 requests
Completed 3000 requests
Completed 3500 requests
Completed 4000 requests
Completed 4500 requests
Completed 5000 requests
Finished 5000 requests


Server Software:        Werkzeug/0.14.1
Server Hostname:        0.0.0.0
Server Port:            5000

Document Path:          /api/v1/landing_page
Document Length:        1869 bytes

Concurrency Level:      500
Time taken for tests:   53.233 seconds
Complete requests:      5000
Failed requests:        0
Total transferred:      10245000 bytes
HTML transferred:       9345000 bytes
Requests per second:    93.93 [#/sec] (mean)
Time per request:       5323.339 [ms] (mean)
Time per request:       10.647 [ms] (mean, across all concurrent requests)
Transfer rate:          187.94 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:        0   64 244.9      0    1018
Processing:    56 5194 820.9   5265    7708
Waiting:       41 5193 820.8   5263    7705
Total:         56 5258 873.2   5312    8161

Percentage of the requests served within a certain time (ms)
  50%   5312
  66%   5558
  75%   5811
  80%   5884
  90%   6252
  95%   6570
  98%   6979
  99%   7221
 100%   8161 (longest request)

With the "Separate table for timestamps" commit:

$ ab -n 5000 -c 500 http://0.0.0.0:5000/api/v1/landing_page
This is ApacheBench, Version 2.3 <$Revision: 1843412 $>
Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
Licensed to The Apache Software Foundation, http://www.apache.org/

Benchmarking 0.0.0.0 (be patient)
Completed 500 requests
Completed 1000 requests
Completed 1500 requests
Completed 2000 requests
Completed 2500 requests
Completed 3000 requests
Completed 3500 requests
Completed 4000 requests
Completed 4500 requests
Completed 5000 requests
Finished 5000 requests


Server Software:        Werkzeug/0.14.1
Server Hostname:        0.0.0.0
Server Port:            5000

Document Path:          /api/v1/landing_page
Document Length:        1869 bytes

Concurrency Level:      500
Time taken for tests:   45.347 seconds
Complete requests:      5000
Failed requests:        0
Total transferred:      10245000 bytes
HTML transferred:       9345000 bytes
Requests per second:    110.26 [#/sec] (mean)
Time per request:       4534.721 [ms] (mean)
Time per request:       9.069 [ms] (mean, across all concurrent requests)
Transfer rate:          220.63 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:        0   67 254.1      0    1049
Processing:    47 4412 698.2   4540    7289
Waiting:       27 4411 698.1   4538    7288
Total:         47 4478 778.2   4553    7695

Percentage of the requests served within a certain time (ms)
  50%   4553
  66%   4631
  75%   4910
  80%   4953
  90%   5319
  95%   5684
  98%   6214
  99%   6592
 100%   7695 (longest request)

2 new commits added

  • Separate table for timestamps
  • Cache: Use database to cache fetched data
4 years ago

2 new commits added

  • Separate table for timestamps
  • Cache: Use database to cache fetched data
4 years ago

1 new commit added

  • Relax some checks around db cache
4 years ago

1 new commit added

  • Allow to set SKIP_CACHE_AGE_CHECK
4 years ago

1 new commit added

  • Add sync option to the cli
4 years ago

So, I've added a sync command to the cli (and an option to skip cache age verification at runtime).

However, the sync operation seems not to be atomic (though I am really not sure about this; loading the page during a sync seemed to work just fine). Do you have any ideas how to improve it, @jskladan? Or does sqlalchemy have any way to lock the db, so that anybody connecting during a sync would just wait for it to finish?

According to off-PR communication with @jskladan, it is atomic.

1 new commit added

  • schedule.get_schedule returns dtdate as datetime, change db format to db.DateTime
4 years ago

1 new commit added

  • docs: Update and add some docstrings
4 years ago

rebased onto 7d6e458

4 years ago

1 new commit added

  • Support caching manual testing
4 years ago

2 new commits added

  • Support caching manual testing
  • Cache: Use database to cache fetched data
4 years ago

1 new commit added

  • Refactor caching code
4 years ago

@jskladan Refactored the code for lower complexity; it seems to work just fine.

Just a note: I've changed the API (removed dtdate (datetime type) from schedule.py; nothing seems to need it, but it would be nice to have an ack either from you or @lbrabec, as I didn't try to understand the code there, so :) ...)

The key dtdate (of type datetime) is needed for sorting.

In your code, you are trying to sort using the key date, which is a str in the human-readable
format "%d %b %Y", e.g. "09 Mar 2020".
This format is not suitable for sorting, as "09 Apr 2020" would be sorted before "09 Mar 2020".

After sorting, the key dtdate is not needed anymore, so you can pop it. Or don't use the dtdate key at all: put the datetime directly in the date key, sort the list using this key, and finally replace the values in date with date.strftime("%d %b %Y").
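A minimal illustration of the second option, with made-up events:

    from datetime import datetime

    events = [
        {'summary': 'Final Release', 'date': datetime(2020, 4, 9)},
        {'summary': 'Branch Fedora', 'date': datetime(2020, 3, 9)},
    ]

    # Sort on the datetime (chronological), then format for display.
    events.sort(key=lambda e: e['date'])
    for e in events:
        e['date'] = e['date'].strftime("%d %b %Y")
    # "Branch Fedora" (09 Mar 2020) now correctly precedes "Final Release" (09 Apr 2020)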

3 new commits added

  • Refactor caching code
  • Support caching manual testing
  • Cache: Use database to cache fetched data
4 years ago

@lbrabec Should be fixed now, thanks!

3 new commits added

  • Refactor caching code
  • Support caching manual testing
  • Cache: Use database to cache fetched data
4 years ago

Are you sure you want to have the commit() here? I see no reason for the db to "not contain any data" for a small amount of time.

Any specific reason to have the commit() here? I'd rather have the delete/write in one transaction, so we don't have an unnecessary "the db is empty" moment.
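In code, that suggestion amounts to something like this sketch (not the final PR code):

    # Sketch: do the delete and the insert in one transaction with a single
    # commit, so readers never observe a "the db is empty" moment.
    import json

    from oraculum import db
    from oraculum.models.db_cache import CachedData

    def replace_cached_data(provider, data):
        CachedData.query.filter_by(provider=provider).delete()
        db.session.add(CachedData(provider, json.dumps(data)))
        db.session.commit()  # one commit covers both the delete and the insert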

In the long run, I'd like to have the "refresh data" code running asynchronously from the "here, client, have a response" code.

ATM, I think it would make sense to have refresh_data() called on a "cache miss" in the get_db_data() call. That would make more sense to me, and would also probably make the code a bit more streamlined (this is relevant to all get_db_data and refresh_data calls in general, not only this specific one).
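A sketch of that shape (the fetch_fresh callable is an assumption; the merged code keeps refresh_data at the call sites):

    # Sketch: fold the refresh into get_db_data on a cache miss.
    import json

    from oraculum.models.db_cache import CachedData
    from oraculum.utils.db_utils import is_new_enough, refresh_data

    def get_db_data_streamlined(provider, fetch_fresh):
        """Return fresh-enough cached data, refreshing inline on a miss."""
        row = CachedData.query.filter_by(provider=provider).first()
        if row and is_new_enough(row.time_created):
            return json.loads(row.raw_text)
        data = fetch_fresh()          # cache miss: fetch and store right away
        refresh_data(provider, data)
        return data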

This is not needed, since "not cached" is always True here.

IMO you still need to call db.session.add(row)

rebased onto b34e0df

4 years ago

LGTM please squash before merging. THX!

rebased onto 2ae97ab

4 years ago

Pull-Request has been merged by frantisekz

4 years ago