Commit de7dec40 authored by Sascha Herzinger

Simplified the data controller even further.

parent 61cd4347
Pipeline #2088 failed in 14 minutes and 21 seconds
@@ -40,13 +40,9 @@ log.info("Creating Redis connection.")
 redis = StrictRedis(host=app.config['REDIS_HOST'],
                     port=app.config['REDIS_PORT'])
-# Configure app with composed configurations to save admin some work
-app.config['SESSION_REDIS'] = redis
-app.config['CELERY_RESULT_BACKEND'] = 'redis://{}:{}'.format(
-    app.config['REDIS_HOST'], app.config['REDIS_PORT'])
 # Set new session interface for app
 log.info("Replacing default session interface.")
+app.config['SESSION_REDIS'] = redis
 Session(app)
 # allow everyone to submit requests
...
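The retained Session(app) call swaps Flask's cookie-based sessions for server-side sessions backed by the redis connection handed over via SESSION_REDIS; only the CELERY_RESULT_BACKEND composition moves out of app startup and into the config module (see the config hunk below). A minimal sketch of that Flask-Session wiring, assuming the flask-session and redis packages; host and port are illustrative:

# Minimal sketch of the Flask-Session wiring, assuming the flask-session
# and redis packages; host and port are illustrative.
from flask import Flask
from flask_session import Session
from redis import StrictRedis

app = Flask(__name__)
app.config['SESSION_TYPE'] = 'redis'  # server-side sessions in Redis
app.config['SESSION_REDIS'] = StrictRedis(host='127.0.0.1', port=6379)
Session(app)  # replaces the default cookie-only session interface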
 from fractalis.utils import list_classes_with_base_class
-from .job import AnalyticsJob
+from fractalis.analytics.task import AnalyticTask

-JOB_REGISTRY = list_classes_with_base_class('fractalis.analytics.jobs',
-                                            AnalyticsJob)
+TASK_REGISTRY = list_classes_with_base_class('fractalis.analytics.tasks',
+                                             AnalyticTask)
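Neither list_classes_with_base_class nor its module is part of this diff; the registry line above only makes sense if the helper scans a package for subclasses of a given base. A hedged sketch of what such a helper might look like (the real implementation in fractalis.utils may differ):

# Hedged sketch of what a helper like list_classes_with_base_class may do;
# the real implementation in fractalis.utils is not shown in this diff.
import importlib
import inspect
import pkgutil


def list_classes_with_base_class(package_name, base_class):
    """Collect every class in a package's submodules that subclasses
    base_class (excluding base_class itself)."""
    package = importlib.import_module(package_name)
    classes = []
    for _, module_name, _ in pkgutil.iter_modules(package.__path__):
        module = importlib.import_module(
            '{}.{}'.format(package_name, module_name))
        for _, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, base_class) and obj is not base_class:
                classes.append(obj)
    return classes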
@@ -9,8 +9,8 @@ from flask.wrappers import Response
 from fractalis import celery
 from fractalis.validator import validate_json, validate_schema
-from fractalis.analytics.schema import create_job_schema
-from fractalis.analytics.job import AnalyticsJob
+from fractalis.analytics.schema import create_task_schema
+from fractalis.analytics.task import AnalyticTask

 analytics_blueprint = Blueprint('analytics_blueprint', __name__)

@@ -21,76 +21,76 @@ logger = logging.getLogger(__name__)
 def prepare_session() -> None:
     """Make sure the session is properly initialized before each request."""
     session.permanent = True
-    if 'jobs' not in session:
-        logger.debug("Initializing jobs field in session dict.")
-        session['jobs'] = []
-    if 'data_ids' not in session:
-        logger.debug("Initializing data_ids field in session dict.")
-        session['data_ids'] = []
+    if 'analytic_tasks' not in session:
+        logger.debug("Initializing analytic_tasks field in session dict.")
+        session['analytic_tasks'] = []
+    if 'data_tasks' not in session:
+        logger.debug("Initializing data_tasks field in session dict.")
+        session['data_tasks'] = []

 @analytics_blueprint.route('', methods=['POST'])
 @validate_json
-@validate_schema(create_job_schema)
-def create_job() -> Tuple[Response, int]:
-    """Create a new analytics job based on the parameters in the POST body.
+@validate_schema(create_task_schema)
+def create_task() -> Tuple[Response, int]:
+    """Create a new analytics task based on the parameters in the POST body.
     See doc/api/ for more information.
     :return: Flask Response
     """
     logger.debug("Received POST request on /analytics.")
     json = request.get_json(force=True)  # pattern enforced by decorators
-    analytics_job = AnalyticsJob.factory(json['job_name'])
-    if analytics_job is None:
-        logger.error("Could not submit job for unknown job name: "
-                     "'{}'".format(json['job_name']))
-        return jsonify({'error_msg': "Job with name '{}' not found.".format(
-            json['job_name'])}), 400
-    async_result = analytics_job.delay(accessible_data_ids=session['data_ids'],
-                                       args=json['args'])
-    session['jobs'].append(async_result.id)
-    logger.debug("Job successfully submitted. Sending response.")
-    return jsonify({'job_id': async_result.id}), 201
+    analytic_task = AnalyticTask.factory(json['task_name'])
+    if analytic_task is None:
+        logger.error("Could not submit task for unknown task name: "
+                     "'{}'".format(json['task_name']))
+        return jsonify({'error_msg': "Task with name '{}' not found."
+                       .format(json['task_name'])}), 400
+    async_result = analytic_task.delay(data_tasks=session['data_tasks'],
+                                       args=json['args'])
+    session['analytic_tasks'].append(async_result.id)
+    logger.debug("Task successfully submitted. Sending response.")
+    return jsonify({'task_id': async_result.id}), 201

-@analytics_blueprint.route('/<uuid:job_id>', methods=['GET'])
-def get_job_details(job_id: UUID) -> Tuple[Response, int]:
-    """Get job details for the given job_id.
+@analytics_blueprint.route('/<uuid:task_id>', methods=['GET'])
+def get_task_details(task_id: UUID) -> Tuple[Response, int]:
+    """Get task details for the given task_id.
     See doc/api/ for more information.
-    :param job_id: ID returned on job creation.
+    :param task_id: ID returned on task creation.
     :return: Flask Response
     """
-    logger.debug("Received GET request on /analytics/job_id.")
+    logger.debug("Received GET request on /analytics/task_id.")
     wait = request.args.get('wait') == '1'
-    job_id = str(job_id)
-    if job_id not in session['jobs']:
-        error = "Job ID '{}' not found in session. " \
-                "Refusing access.".format(job_id)
+    task_id = str(task_id)
+    if task_id not in session['analytic_tasks']:
+        error = "Task ID '{}' not found in session. " \
+                "Refusing access.".format(task_id)
         logger.warning(error)
         return jsonify({'error': error}), 403
-    async_result = celery.AsyncResult(job_id)
+    async_result = celery.AsyncResult(task_id)
     if wait:
-        async_result.get(propagate=False)  # make job synchronous
-    logger.debug("Job found and has access. Sending response.")
+        async_result.get(propagate=False)  # make task synchronous
+    logger.debug("Task found and has access. Sending response.")
     return jsonify({'state': async_result.state,
                     'result': async_result.result}), 200

-@analytics_blueprint.route('/<uuid:job_id>', methods=['DELETE'])
-def cancel_job(job_id: UUID) -> Tuple[Response, int]:
-    """Cancel a job for a given job_id.
+@analytics_blueprint.route('/<uuid:task_id>', methods=['DELETE'])
+def cancel_task(task_id: UUID) -> Tuple[Response, int]:
+    """Cancel a task for a given task_id.
     See doc/api/ for more information.
-    :param job_id: ID returned on job creation.
+    :param task_id: ID returned on task creation.
     :return: Flask Response
     """
-    logger.debug("Received DELETE request on /analytics/job_id.")
-    job_id = str(job_id)
-    if job_id not in session['jobs']:
-        error = "Job ID '{}' not found in session. " \
-                "Refusing access.".format(job_id)
+    logger.debug("Received DELETE request on /analytics/task_id.")
+    task_id = str(task_id)
+    if task_id not in session['analytic_tasks']:
+        error = "Task ID '{}' not found in session. " \
+                "Refusing access.".format(task_id)
         logger.warning(error)
         return jsonify({'error': error}), 403
     wait = request.args.get('wait') == '1'
     # possibly dangerous: http://stackoverflow.com/a/29627549
-    celery.control.revoke(job_id, terminate=True, signal='SIGUSR1', wait=wait)
+    celery.control.revoke(task_id, terminate=True, signal='SIGUSR1', wait=wait)
     logger.debug("Successfully send term signal to task. Sending response.")
     return jsonify(''), 200
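Seen from a client, the rename means POSTing a task_name instead of a job_name and reading back a task_id. A hypothetical session against these endpoints, with an illustrative host and data task id; the '$...$' argument convention is resolved by AnalyticTask.prepare_args further down:

# Hypothetical client session against the renamed endpoints; the host and
# the data task id are illustrative.
import requests

s = requests.Session()
r = s.post('http://localhost:5000/analytics', json={
    'task_name': 'compute-correlation',
    # '$<data_task_id>$' values are turned into DataFrames by prepare_args()
    'args': {'x': '$123e4567-e89b-12d3-a456-426655440000$'},
})
task_id = r.json()['task_id']  # 201 on success, 400 for unknown task names

# poll, or block server-side until the task finishes with ?wait=1
r = s.get('http://localhost:5000/analytics/{}?wait=1'.format(task_id))
print(r.json()['state'], r.json()['result'])

# revoke a running task
s.delete('http://localhost:5000/analytics/{}'.format(task_id))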
-create_job_schema = {
+create_task_schema = {
     "type": "object",
     "properties": {
-        "job_name": {"type": "string", "minLength": 5},
+        "task_name": {"type": "string", "minLength": 5},
         "args": {"type": "object", "minProperties": 1},
     },
-    "required": ["job_name", "args"]
+    "required": ["task_name", "args"]
 }
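The @validate_schema(create_task_schema) decorator above rejects request bodies that do not match this schema. A quick standalone check of what it accepts, assuming the jsonschema package (the usual backend for such validators):

# Standalone check of the schema above; assumes the jsonschema package,
# which decorators like validate_schema typically wrap.
from jsonschema import ValidationError, validate

from fractalis.analytics.schema import create_task_schema

validate({'task_name': 'compute-correlation', 'args': {'x': 1}},
         create_task_schema)  # passes silently

try:
    validate({'job_name': 'compute-correlation', 'args': {'x': 1}},
             create_task_schema)  # old field name no longer validates
except ValidationError as e:
    print(e.message)  # "'task_name' is a required property"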
@@ -13,7 +13,7 @@ from fractalis import redis
 logger = logging.getLogger(__name__)

-class AnalyticsJob(Task, metaclass=abc.ABCMeta):
+class AnalyticTask(Task, metaclass=abc.ABCMeta):

     @property
     @abc.abstractmethod
@@ -21,51 +21,54 @@ class AnalyticsJob(Task, metaclass=abc.ABCMeta):
         pass

     @staticmethod
-    def factory(job_name):
-        from . import JOB_REGISTRY
-        for job in JOB_REGISTRY:
-            if job.name == job_name:
-                return job()
+    def factory(task_name):
+        from . import TASK_REGISTRY
+        for task in TASK_REGISTRY:
+            if task.name == task_name:
+                return task()

     @abc.abstractmethod
     def main(self):
         pass

     @staticmethod
-    def prepare_args(accessible_data_ids, args):
+    def prepare_args(data_tasks, args):
         arguments = {}
         for arg in args:
             value = args[arg]
             if (isinstance(value, str) and
                     value.startswith('$') and value.endswith('$')):
-                data_id = value[1:-1]
-                if data_id not in accessible_data_ids:
-                    error = "No permission to use data_id '{}'" \
-                            "for analysis".format(data_id)
+                data_task_id = value[1:-1]
+                if data_task_id not in data_tasks:
+                    error = "No permission to use data_task_id '{}'" \
+                            "for analysis".format(data_task_id)
                     logger.error(error)
-                    raise KeyError(error)
-                entry = redis.get('data:{}'.format(data_id))
+                    raise PermissionError(error)
+                entry = redis.get('data:{}'.format(data_task_id))
                 if not entry:
                     error = "The key '{}' does not match any entry in Redis. " \
-                            "Value probably expired.".format(data_id)
+                            "Value probably expired.".format(data_task_id)
                     logger.error(error)
                     raise LookupError(error)
                 data_obj = json.loads(entry.decode('utf-8'))
-                # update 'last_access' internal
-                data_obj['last_access'] = time.time()
-                redis.set(name='data:{}'.format(data_id), value=data_obj)
+                if not data_obj['loaded']:
+                    error = "The data task '{}' has not been loaded, yet." \
+                            "Wait for it to complete before using it in an " \
+                            "analysis task.".format(data_task_id)
+                    logger.error(error)
+                    raise ValueError(error)
                 file_path = data_obj['file_path']
                 value = pd.read_csv(file_path)
             arguments[arg] = value
         return arguments

-    def run(self, accessible_data_ids, args):
-        arguments = self.prepare_args(accessible_data_ids, args)
+    def run(self, data_tasks, args):
+        arguments = self.prepare_args(data_tasks, args)
         result = self.main(**arguments)
         try:
             if type(result) != dict:
-                error = "The job '{}' returned an object with type '{}', " \
+                error = "The task '{}' returned an object with type '{}', " \
                         "instead of expected type 'dict'."
                 logger.error(error)
                 raise ValueError(error)
...
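prepare_args now fails fast on entries that are missing or not yet loaded, instead of bumping last_access. From the fields this diff reads ('loaded', 'file_path', 'last_access'), a data entry under data:<task_id> looks roughly like the sketch below; the key and values are made up, and the entry is actually written by the ETL code, which is not shown here:

# Rough shape of the Redis entry prepare_args() expects under
# 'data:<task_id>', inferred from the fields read above; key and values
# are made up, and the entry is really written by the ETL tasks.
import json

from redis import StrictRedis

redis = StrictRedis(host='127.0.0.1', port=6379)
entry = {
    'file_path': '/tmp/fractalis/abc123.csv',  # CSV loaded via pd.read_csv
    'last_access': 1490000000.0,               # stripped from API responses
    'loaded': True,                            # ETL has finished writing
}
redis.set('data:abc123', json.dumps(entry))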
@@ -2,10 +2,10 @@ import pandas as pd
 import numpy as np
 from scipy import stats

-from fractalis.analytics.job import AnalyticsJob
+from fractalis.analytics.task import AnalyticTask

-class CorrelationJob(AnalyticsJob):
+class CorrelationTask(AnalyticTask):

     name = 'compute-correlation'
...
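The correlation module itself only changes its imports and base class here. For orientation, a minimal sketch of what a concrete subclass looks like under the new names; the body is illustrative, not the real compute-correlation implementation:

# Minimal sketch of a concrete subclass under the new names; the body is
# illustrative, not the real compute-correlation implementation.
import pandas as pd
from scipy import stats

from fractalis.analytics.task import AnalyticTask


class ExampleCorrelationTask(AnalyticTask):

    name = 'example-compute-correlation'

    def main(self, x: pd.DataFrame, y: pd.DataFrame) -> dict:
        # main() receives DataFrames already resolved by prepare_args()
        # and must return a dict, which run() enforces.
        coef, p_value = stats.pearsonr(x.iloc[:, 0], y.iloc[:, 0])
        return {'coef': coef, 'p_value': p_value}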
@@ -7,7 +7,7 @@ from datetime import timedelta
 # Flask
-SECRET_KEY = str(uuid4())  # set me manually in production
+SECRET_KEY = 'OVERWRITE ME IN PRODUCTION!!!'
 DEBUG = False
 TESTING = False
 REDIS_HOST = '127.0.0.1'
@@ -24,7 +24,9 @@ SESSION_USE_SIGNER = False
 # Celery
 BROKER_URL = 'amqp://'
+CELERY_RESULT_BACKEND = 'redis://{}:{}'.format(REDIS_HOST, REDIS_PORT)
 CELERYD_TASK_SOFT_TIME_LIMIT = 60 * 10
+CELERY_TASK_RESULT_EXPIRES = timedelta(hours=1)
 CELERYD_HIJACK_ROOT_LOGGER = False

 # Fractalis
...
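With CELERY_RESULT_BACKEND composed here from REDIS_HOST/REDIS_PORT, the startup code no longer assembles it (see the first hunk). Production deployments are expected to override values like SECRET_KEY; a hedged sketch of the usual Flask layering, where the FRACTALIS_CONFIG variable name is an assumption, not something this diff shows:

# Hedged sketch of layering a production config over these defaults; the
# FRACTALIS_CONFIG variable name is an assumption, not part of this diff.
from flask import Flask

app = Flask(__name__)
app.config.from_object('fractalis.config')  # the defaults above
# e.g. FRACTALIS_CONFIG=/etc/fractalis/config.py overriding SECRET_KEY
app.config.from_envvar('FRACTALIS_CONFIG', silent=True)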
"""The /data controller. Please refer to doc/api for more information.""" """The /data controller. Please refer to doc/api for more information."""
import json import json
import time
import logging import logging
from uuid import UUID
from typing import Tuple from typing import Tuple
from flask import Blueprint, session, request, jsonify from flask import Blueprint, session, request, jsonify
...@@ -13,6 +11,7 @@ from fractalis.data.etlhandler import ETLHandler ...@@ -13,6 +11,7 @@ from fractalis.data.etlhandler import ETLHandler
from fractalis.data.schema import create_data_schema from fractalis.data.schema import create_data_schema
from fractalis.validator import validate_json, validate_schema from fractalis.validator import validate_json, validate_schema
from fractalis import celery, redis from fractalis import celery, redis
from fractalis.sync import remove_data
data_blueprint = Blueprint('data_blueprint', __name__) data_blueprint = Blueprint('data_blueprint', __name__)
...@@ -23,21 +22,18 @@ logger = logging.getLogger(__name__) ...@@ -23,21 +22,18 @@ logger = logging.getLogger(__name__)
def prepare_session() -> None: def prepare_session() -> None:
"""Make sure the session is properly initialized before each request.""" """Make sure the session is properly initialized before each request."""
session.permanent = True session.permanent = True
if 'jobs' not in session: if 'data_tasks' not in session:
logger.debug("Initializing jobs field in session dict.") logger.debug("Initializing data_tasks field in session dict.")
session['jobs'] = [] session['data_tasks'] = []
if 'data_ids' not in session:
logger.debug("Initializing data_ids field in session dict.")
session['data_ids'] = []
@data_blueprint.route('', methods=['POST']) @data_blueprint.route('', methods=['POST'])
@validate_json @validate_json
@validate_schema(create_data_schema) @validate_schema(create_data_schema)
def create_data_job() -> Tuple[Response, int]: def create_data_task() -> Tuple[Response, int]:
"""Submit a new ETL task based on the payload of the request body. """Submit new ETL tasks based on the payload of the request body.
See doc/api/ for more information. See doc/api/ for more information.
:return: Flask Response :return: Empty response. Everything important is stored in the session.
""" """
logger.debug("Received POST request on /data.") logger.debug("Received POST request on /data.")
wait = request.args.get('wait') == '1' wait = request.args.get('wait') == '1'
...@@ -45,126 +41,70 @@ def create_data_job() -> Tuple[Response, int]: ...@@ -45,126 +41,70 @@ def create_data_job() -> Tuple[Response, int]:
etl_handler = ETLHandler.factory(handler=payload['handler'], etl_handler = ETLHandler.factory(handler=payload['handler'],
server=payload['server'], server=payload['server'],
auth=payload['auth']) auth=payload['auth'])
job_ids = etl_handler.handle(descriptors=payload['descriptors'], wait=wait) task_ids = etl_handler.handle(descriptors=payload['descriptors'], wait=wait)
session['jobs'] += job_ids session['data_tasks'] += task_ids
session['jobs'] = list(set(session['data_jobs'])) # make unique session['data_tasks'] = list(set(session['data_tasks']))
logger.debug("Jobs successfully submitted. Sending response.") logger.debug("Tasks successfully submitted. Sending response.")
return jsonify({'job_ids': job_ids}), 201 return jsonify(''), 201
@data_blueprint.route('/<uuid:job_id>', methods=['GET'])
def get_data_job_state(job_id: UUID) -> Tuple[Response, int]:
"""Get information for data that matches given job_id. If the job was
successful add the data_id associated with the successful job to the session
for access control and return it.
:param job_id: The id associated with the previously submitted job.
See doc/api/ for more information.
:return: Flask Response
"""
logger.debug("Received GET request on /data/job_id.")
job_id = str(job_id)
wait = request.args.get('wait') == '1'
if job_id not in session['jobs']:
error = "Job ID '{}' not found in session. " \
"Refusing access.".format(job_id)
logger.warning(error)
return jsonify({'error': error}), 403
async_result = celery.AsyncResult(job_id)
if wait:
async_result.get(propagate=False)
if async_result.state == 'SUCCESS':
logger.debug("Job '{}' successful. Adding data_id '{}' "
"to session.".format(job_id, async_result.result))
session['data_ids'] = async_result.result
logger.debug("Job found and has access. Sending response.")
return jsonify({'state': async_result.state,
'result': async_result.result}), 200
@data_blueprint.route('/<string:data_id>', methods=['GET'])
def get_data_by_id(data_id: str) -> Tuple[Response, int]:
"""Given a data id return the related Redis DB entry.
:param data_id: The id returned by the data job submitted by create_data_job
:return: Parsed and modified data entry from Redis.
"""
logger.debug("Received GET request on /data/data_id.")
if data_id not in session['data_ids']:
error = "Data ID '{}' not found in session. " \
"Refusing access.".format(data_id)
logger.warning(error)
return jsonify({'error': error}), 403
value = redis.get('data:{}'.format(data_id))
if not value:
error = "Could not find data entry in Redis for data_id: " \
"'{}'. The entry probably expired.".format(data_id)
logger.warning(error)
return jsonify({'error': error}), 404
data_obj = json.loads(value.decode('utf-8'))
# update 'last_access' internal
data_obj['last_access'] = time.time()
redis.set(name='data:{}'.format(data_id), value=data_obj)
# remove internal information from response
del data_obj['file_path']
del data_obj['last_access']
logger.debug("Data found and has access. Sending response.")
return jsonify({'data_state': data_obj}), 200
@data_blueprint.route('', methods=['GET']) @data_blueprint.route('', methods=['GET'])
def get_all_data_state() -> Tuple[Response, int]: def get_all_data() -> Tuple[Response, int]:
"""Get information for all data that the current session can access. """Get information for all tasks that have been submitted in the lifetime
of the current session.
See doc/api/ for more information. See doc/api/ for more information.
:return: Flask Response :return: Information associated with each submitted task
""" """
logger.debug("Received GET request on /data.") logger.debug("Received GET request on /data.")
wait = request.args.get('wait') == '1'
data_states = [] data_states = []
for data_id in session['data_ids']: for task_id in session['data_tasks']:
value = redis.get('data:{}'.format(data_id)) async_result = celery.AsyncResult(task_id)
if wait:
logger.debug("'wait' was set. Waiting for tasks to finish ...")
async_result.get(propagate=False)
value = redis.get('data:{}'.format(task_id))
if not value: if not value:
error = "Could not find data entry in Redis for data_id: " \ error = "Could not find data entry in Redis for task_id: " \
"'{}'. The entry probably expired.".format(data_id) "'{}'. The entry probably expired.".format(task_id)
logger.warning(error) logger.warning(error)
continue continue
data_obj = json.loads(value.decode('utf-8')) data_state = json.loads(value.decode('utf-8'))
# update 'last_access' internal
data_obj['last_access'] = time.time()
redis.set(name='data:{}'.format(data_id), value=data_obj)
# remove internal information from response # remove internal information from response
del data_obj['file_path'] del data_state['file_path']
del data_obj['last_access'] del data_state['last_access']
data_states.append(data_obj) # add additional information to response
data_state['etl_state'] = async_result.state
data_state['etl_message'] = async_result.result
data_states.append(data_state)
logger.debug("Data states collected. Sending response.") logger.debug("Data states collected. Sending response.")
return jsonify({'data_states': data_states}), 200 return jsonify({'data_states': data_states}), 200
@data_blueprint.route('/<string:data_id>', methods=['DELETE']) @data_blueprint.route('/<string:task_id>', methods=['DELETE'])
def delete_data(data_id: str) -> Tuple[Response, int]: def delete_data(task_id: str) -> Tuple[Response, int]:
"""This only deletes data from the session, not Redis or the file system. """Remove all traces of the data associated with the given task id.
This is enough to disable data visibility for the current user, but does not :param task_id: The id associated with the data
influence other users of the same data. Fractalis automatically removes
entries that are no longer accessed after a certain period of time.
:param data_id: The id returned by the data job submitted by create_data_job
See doc/api/ for more information. See doc/api/ for more information.
:return: Flask Response :return: Empty response.
""" """
logger.debug("Received DELETE request on /data/data_id.") logger.debug("Received DELETE request on /data/task_id.")
if data_id in session['data_ids']: if task_id in session['data_tasks']:
session['data_ids'].remove(data_id) session['data_tasks'].remove(task_id)
remove_data.delay(task_id)
logger.debug("Successfully removed data from session. Sending response.") logger.debug("Successfully removed data from session. Sending response.")
return jsonify(''), 200 return jsonify(''), 200
@data_blueprint.
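After this commit the data controller exposes exactly three routes: POST /data to submit ETL tasks, GET /data for the state of everything in the session, and DELETE /data/<task_id> to remove one task. A hypothetical client walkthrough; the handler, server, auth, and descriptor values are illustrative, and their exact shape is defined by create_data_schema, which is not shown here:

# Hypothetical walkthrough of the simplified /data API; handler, server,
# auth, and descriptors are illustrative and governed by create_data_schema.
import requests

s = requests.Session()
s.post('http://localhost:5000/data?wait=1', json={
    'handler': 'transmart',                       # illustrative handler name
    'server': 'https://some.server',              # illustrative
    'auth': {'token': '...'},                     # illustrative
    'descriptors': [{'data_type': 'numerical'}],  # illustrative
})  # 201 with an empty body; task ids live in the server-side session

r = s.get('http://localhost:5000/data')
for state in r.json()['data_states']:
    print(state['etl_state'], state['etl_message'])

# remove one data task everywhere (session now, Redis/file via remove_data)
s.delete('http://localhost:5000/data/{}'.format('some-task-id'))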