2023-02-26 05:49:39 +00:00
|
|
|
from logging import getLogger
|
2023-03-06 03:37:39 +00:00
|
|
|
from os import getpid
|
2023-03-07 14:02:53 +00:00
|
|
|
from queue import Empty
|
2023-02-28 04:52:43 +00:00
|
|
|
from sys import exit
|
2023-02-26 20:15:30 +00:00
|
|
|
|
2023-02-26 19:09:24 +00:00
|
|
|
from setproctitle import setproctitle
|
2023-02-26 05:49:39 +00:00
|
|
|
|
2023-07-15 22:05:27 +00:00
|
|
|
from ..errors import RetryException
|
2023-02-26 18:32:48 +00:00
|
|
|
from ..server import ServerContext, apply_patches
|
2023-02-26 21:21:58 +00:00
|
|
|
from ..torch_before_ort import get_available_providers
|
2023-02-26 20:15:30 +00:00
|
|
|
from .context import WorkerContext
|
2023-02-26 05:49:39 +00:00
|
|
|
|
|
|
|
# module-level logger, named after this module's import path
logger = getLogger(__name__)
|
|
|
|
|
2023-03-06 03:37:39 +00:00
|
|
|
# Process exit codes handed to the `exit` callable by worker_main.
# EXIT_INTERRUPT intentionally shares the value 0 with EXIT_SUCCESS.
EXIT_ERROR = 1  # RetryException or ValueError raised while running a job
EXIT_INTERRUPT = 0  # KeyboardInterrupt received by the worker
EXIT_MEMORY = 2  # exception message matched one of MEMORY_ERRORS
EXIT_REPLACED = 3  # worker.is_active() reported this worker is no longer active
EXIT_SUCCESS = 0
|
|
|
|
|
2023-04-01 16:59:03 +00:00
|
|
|
# Substrings that mark an exception message as an out-of-memory failure.
# Matching is a plain, case-sensitive `in` test against str(exception).
MEMORY_ERRORS = [
    "Failed to allocate memory",
    "hipErrorOutOfMemory",
    "MIOPEN failure",
    "out of memory",
    "rocblas_status_memory_error",
]
|
|
|
|
|
2023-02-26 18:32:48 +00:00
|
|
|
|
2023-10-07 00:04:48 +00:00
|
|
|
def _is_memory_error(message: str) -> bool:
    """Return True when *message* contains any of the MEMORY_ERRORS markers."""
    return any(marker in message for marker in MEMORY_ERRORS)


def worker_main(
    worker: WorkerContext, server: ServerContext, *args, exit=exit, patch=True
):
    """Run the job loop for a single worker process.

    Sets the process title from the worker's device, optionally applies the
    server patches, then repeatedly pulls jobs from ``worker.pending`` and
    runs them until one of the exit conditions below is hit.

    Args:
        worker: context providing the pending queue, the progress queue and
            the start/finish/fail/idle state hooks used by the loop.
        server: server context, passed to ``apply_patches`` when ``patch``
            is true.
        *args: accepted for call compatibility; not used by this function.
        exit: callable invoked with the exit code; defaults to ``sys.exit``.
            Its return value is returned to the caller, so a non-exiting
            replacement can be injected.
        patch: when true, call ``apply_patches(server)`` before the loop.

    Returns:
        Whatever ``exit(code)`` returns, with ``code`` one of:
        ``EXIT_REPLACED`` when ``worker.is_active()`` is false,
        ``EXIT_INTERRUPT`` on ``KeyboardInterrupt``,
        ``EXIT_ERROR`` on ``RetryException`` or ``ValueError``,
        ``EXIT_MEMORY`` when the exception text matches ``MEMORY_ERRORS``.
        Any other exception is logged, the job is marked failed, and the
        loop continues with the next job.
    """
    setproctitle("onnx-web worker: %s" % (worker.device.device))

    if patch:
        apply_patches(server)

    # NOTE(review): `trace` is not a stdlib Logger method; presumably it is
    # registered elsewhere in the project before this runs - confirm.
    logger.trace(
        "checking in from worker with providers: %s", get_available_providers()
    )

    # make leaking workers easier to recycle
    worker.progress.cancel_join_thread()

    while True:
        try:
            if not worker.is_active():
                logger.warning(
                    "worker %s has been replaced by %s, exiting",
                    getpid(),
                    worker.get_active(),
                )
                return exit(EXIT_REPLACED)

            # wait briefly for the next job; raises Empty after the timeout
            job = worker.pending.get(timeout=worker.timeout)
            logger.info("worker %s got job: %s", worker.device.device, job.name)

            # clear flags and save the job name
            worker.start(job.name)
            logger.info("starting job: %s", job.name)

            # reset progress, which does a final check for cancellation
            worker.set_progress(0)
            job.fn(worker, *job.args, **job.kwargs)

            # confirm completion of the job
            logger.info("job succeeded: %s", job.name)
            worker.finish()
        except Empty:
            logger.trace("worker reached end of queue, setting idle flag")
            worker.set_idle()
        except KeyboardInterrupt:
            logger.debug("worker got keyboard interrupt")
            worker.fail()
            return exit(EXIT_INTERRUPT)
        except RetryException:
            logger.exception("retry error in worker, exiting")
            worker.fail()
            return exit(EXIT_ERROR)
        except ValueError:
            logger.exception("value error in worker, exiting")
            worker.fail()
            return exit(EXIT_ERROR)
        except Exception as e:
            # restart the worker on memory errors
            if _is_memory_error(str(e)):
                logger.error("detected out-of-memory error, exiting: %s", e)
                worker.fail()
                return exit(EXIT_MEMORY)

            # carry on for other errors
            logger.exception(
                "unrecognized error while running job",
            )
            worker.fail()
|