fix(api): clear job cancelled flag when worker starts a new job (#269)

2023-03-19 17:57:14 -05:00 · 2023-03-19 17:57:14 -05:00 · aefa5b4613
parent ba0767179c
commit aefa5b4613
2 changed files with 20 additions and 10 deletions
--- a/api/onnx_web/worker/context.py
+++ b/api/onnx_web/worker/context.py
@@ -40,6 +40,10 @@ class WorkerContext:
        self.active_pid = active_pid
        self.last_progress = None
    def start(self, job: str) -> None:
        """
        Begin tracking a new job on this worker context.

        Stores ``job`` as the current job name, then calls
        ``set_cancel(cancel=False)`` so that a cancellation requested for a
        previous job is not carried over into this one.
        """
        self.job = job
        self.set_cancel(cancel=False)
    def is_cancelled(self) -> bool:
        """Return the current value of this context's cancel flag."""
        return self.cancel.value
@@ -92,7 +96,7 @@ class WorkerContext:
                block=False,
            )
-    def set_finished(self) -> None:
+    def finish(self) -> None:
        logger.debug("setting finished for job %s", self.job)
        self.last_progress = ProgressCommand(
            self.job,
@@ -107,7 +111,7 @@ class WorkerContext:
            block=False,
        )
-    def set_failed(self) -> None:
+    def fail(self) -> None:
        logger.warning("setting failure for job %s", self.job)
        try:
            self.last_progress = ProgressCommand(
--- a/api/onnx_web/worker/worker.py
+++ b/api/onnx_web/worker/worker.py
@@ -22,7 +22,7 @@ def worker_main(context: WorkerContext, server: ServerContext):
    apply_patches(server)
    setproctitle("onnx-web worker: %s" % (context.device.device))
-    logger.trace("checking in from worker, %s", get_available_providers())
+    logger.trace("checking in from worker with providers: %s", get_available_providers())
    # make leaking workers easier to recycle
    context.progress.cancel_join_thread()
@@ -37,34 +37,40 @@ def worker_main(context: WorkerContext, server: ServerContext):
                )
                exit(EXIT_REPLACED)
            # wait briefly for the next job
            job = context.pending.get(timeout=1.0)
-            logger.info("worker for %s got job: %s", context.device.device, job.name)
+            logger.info("worker %s got job: %s", context.device.device, job.name)
-            context.job = job.name  # TODO: hax
+            # clear flags and save the job name
            context.start(job.name)
            logger.info("starting job: %s", job.name)
            # reset progress, which does a final check for cancellation
            context.set_progress(0)
            job.fn(context, *job.args, **job.kwargs)
            # confirm completion of the job
            logger.info("job succeeded: %s", job.name)
-            context.set_finished()
+            context.finish()
        except Empty:
            pass
        except KeyboardInterrupt:
            logger.info("worker got keyboard interrupt")
-            context.set_failed()
+            context.fail()
            exit(EXIT_INTERRUPT)
        except ValueError:
            logger.exception("value error in worker, exiting: %s")
-            context.set_failed()
+            context.fail()
            exit(EXIT_ERROR)
        except Exception as e:
            e_str = str(e)
            if "Failed to allocate memory" in e_str or "out of memory" in e_str:
                logger.error("detected out-of-memory error, exiting: %s", e)
-                context.set_failed()
+                context.fail()
                exit(EXIT_MEMORY)
            else:
                logger.exception(
                    "error while running job",
                )
-                context.set_failed()
+                context.fail()
                # carry on