diff --git a/snappea/foreman.py b/snappea/foreman.py index 4dd0d82..5c88af7 100644 --- a/snappea/foreman.py +++ b/snappea/foreman.py @@ -261,6 +261,13 @@ class Foreman: while self.create_workers() == self.settings.TASK_QS_LIMIT: pass # `== TASK_QS_LIMIT`: as documented above + # in workaholic mode, the checks-after-unblock may have led to the conclusion "I need to stop but I can't + # because there is still work to do". For that case, we need to do another check at the moment the work + # might have actually been done and before blocking again, i.e. here. Note that the other place where we + # block is the .acquire() in create_workers(), but there we _know_ we're not out of Tasks (by definition of + # acquiring the worker semaphore), so this single check is enough. + self.check_for_stopping() + def create_workers(self): """returns the number of workers created (AKA tasks done)""" @@ -336,6 +343,13 @@ class Foreman: def check_for_stopping(self): if not self.stopping: return + + if self.settings.WORKAHOLIC: + with durable_atomic(using="snappea"): + if Task.objects.exists(): + logger.info("Not stopping yet: Workaholic mode, waiting for all tasks to finish") + return + logger.info("Stopping") # Loop over all tasks, waiting for them to finish. If they don't finish in time (GRACEFUL_TIMEOUT), we'll kill diff --git a/snappea/settings.py b/snappea/settings.py index 29585a4..0842b96 100644 --- a/snappea/settings.py +++ b/snappea/settings.py @@ -8,6 +8,26 @@ DEFAULTS = { "WAKEUP_CALLS_DIR": "/tmp/snappea.wakeup", "NUM_WORKERS": 4, + + # Workaholic mode: I will not stop, even when I'm told to, until _all_ of my tasks are done. This was built for the + # case of Docker but might just be useful outside it. Consider: + # + # * snappea and the server are in the same container, and communicate via an sqlite queue (file) in the container. 
+ # * containers are supposed to be disposable; the message queue will be disposed of when the container is; the + # ingested (but not yet digested) events in the /tmp/ dir will be too, by the way. + # * snappea may get a TERM signal because either the container is being stopped, or the server exits (via + # bugsink-server-unified). + # + # Given the above, it's better for snappea to do all the work it can before it gets killed the drastic way when + # Docker gets impatient, than to quickly shut down and leave the server with a bunch of unprocessed events. This is + # what the "workaholic" mode is for. + # + # Note about a scenario that we don't deal with 100%: on docker-stop, SIGTERM is sent to both processes at the + # same time. Gunicorn may then take some time to fully shut down while still serving requests, and in that + # time enqueue new tasks; such tasks would not be picked up, even in workaholic mode. An improvement could be + # to shut down in-order, but for now this is in "perfectionism" territory for us. + "WORKAHOLIC": False, + "GRACEFUL_TIMEOUT": 10, "TASK_QS_LIMIT": 100,