2023-02-26 05:49:39 +00:00
|
|
|
from logging import getLogger
|
2023-03-06 03:28:21 +00:00
|
|
|
from os import getpid
|
2023-03-18 20:32:49 +00:00
|
|
|
from typing import Any, Callable, Optional
|
2023-02-26 05:49:39 +00:00
|
|
|
|
2023-02-26 20:15:30 +00:00
|
|
|
from torch.multiprocessing import Queue, Value
|
|
|
|
|
2023-08-21 03:28:40 +00:00
|
|
|
from ..errors import CancelledException
|
2023-02-26 05:49:39 +00:00
|
|
|
from ..params import DeviceParams
|
2023-03-18 20:16:41 +00:00
|
|
|
from .command import JobCommand, ProgressCommand
|
2023-02-26 05:49:39 +00:00
|
|
|
|
|
|
|
# module-level logger, named after this module
logger = getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
# type of a progress callback: three positional arguments (int, int, Any) and
# no return value; the callback built in this module names them
# (step, timestep, latents)
ProgressCallback = Callable[[int, int, Any], None]
|
|
|
|
|
2023-02-26 20:15:30 +00:00
|
|
|
|
2023-02-26 05:49:39 +00:00
|
|
|
class WorkerContext:
    """
    State handed to the code running a job: the assigned device, the name of
    the current job, the cancel/idle flags, and the queue on which progress
    is reported.

    Each report is a ProgressCommand put on the `progress` queue. The most
    recent report is also kept in `last_progress`, which is cleared whenever
    a new job is started.
    """

    # flag read by is_cancelled() and written by set_cancel()
    cancel: "Value[bool]"
    # device returned by get_device(); its `.device` field is sent with every report
    device: DeviceParams
    # name of the current job, or None before the first start()
    job: Optional[str]
    # stored from the constructor; not read within this class
    logs: "Queue[str]"
    name: str
    # stored from the constructor; not read within this class
    pending: "Queue[JobCommand]"
    # process id compared against os.getpid() by is_active()
    active_pid: "Value[int]"
    # queue receiving every ProgressCommand built by this context
    progress: "Queue[ProgressCommand]"
    # most recent ProgressCommand sent for the current job, if any
    last_progress: Optional[ProgressCommand]
    # flag read by is_idle() and written by set_idle()
    idle: "Value[bool]"
    # stored from the constructor; not read within this class
    timeout: float
    # reset to initial_retries by start(); presumably decremented by the caller - TODO confirm
    retries: int
    # retry count given to the constructor, restored at the start of each job
    initial_retries: int

    def __init__(
        self,
        name: str,
        device: DeviceParams,
        cancel: "Value[bool]",
        logs: "Queue[str]",
        pending: "Queue[JobCommand]",
        progress: "Queue[ProgressCommand]",
        active_pid: "Value[int]",
        idle: "Value[bool]",
        retries: int,
        timeout: float,
    ):
        """
        Store the queues, flags, and limits for this worker. No job is active
        and no progress has been recorded until start() is called.
        """
        self.job = None
        self.name = name
        self.device = device
        self.cancel = cancel
        self.progress = progress
        self.logs = logs
        self.pending = pending
        self.active_pid = active_pid
        self.last_progress = None
        self.idle = idle
        self.initial_retries = retries
        self.retries = retries
        self.timeout = timeout

    def start(self, job: str) -> None:
        """
        Begin tracking a new job: record its name, restore the retry count,
        forget the previous job's progress, and clear the cancel and idle
        flags.
        """
        self.job = job
        self.retries = self.initial_retries
        # without this reset, get_progress() and the finish()/fail() reports
        # would carry over the step count of the previous job
        self.last_progress = None
        self.set_cancel(cancel=False)
        self.set_idle(idle=False)

    def is_active(self) -> bool:
        """
        Check whether the recorded active pid is the pid of this process.
        """
        return self.get_active() == getpid()

    def is_cancelled(self) -> bool:
        """
        Read the cancel flag.
        """
        return self.cancel.value

    def is_idle(self) -> bool:
        """
        Read the idle flag.
        """
        return self.idle.value

    def get_active(self) -> int:
        """
        Read the active pid while holding its lock.
        """
        with self.active_pid.get_lock():
            return self.active_pid.value

    def get_device(self) -> DeviceParams:
        """
        Get the device assigned to this job.
        """
        return self.device

    def get_progress(self) -> int:
        """
        Get the progress from the most recent report, or 0 when nothing has
        been reported since the job started.
        """
        if self.last_progress is not None:
            return self.last_progress.progress

        return 0

    def get_progress_callback(self) -> ProgressCallback:
        """
        Build a callback that forwards each step to set_progress(), wrapped
        by ChainProgress.from_progress.
        """
        # imported here rather than at module level; presumably to avoid a
        # circular import - TODO confirm
        from ..chain.pipeline import ChainProgress

        def on_progress(step: int, timestep: int, latents: Any) -> None:
            # the latest step is also exposed as an attribute on the callback
            on_progress.step = step
            self.set_progress(step)

        return ChainProgress.from_progress(on_progress)

    def set_cancel(self, cancel: bool = True) -> None:
        """
        Write the cancel flag while holding its lock.
        """
        with self.cancel.get_lock():
            self.cancel.value = cancel

    def set_idle(self, idle: bool = True) -> None:
        """
        Write the idle flag while holding its lock.
        """
        with self.idle.get_lock():
            self.idle.value = idle

    def set_progress(self, progress: int) -> None:
        """
        Report progress for the current job.

        Raises RuntimeError when no job has been started, and
        CancelledException when the cancel flag is set; the latter is what
        interrupts a running job from inside its progress callback.
        """
        if self.job is None:
            raise RuntimeError("no job on which to set progress")

        if self.is_cancelled():
            raise CancelledException("job has been cancelled")

        logger.debug("setting progress for job %s to %s", self.job, progress)
        self._send_progress(progress, finished=False, failed=False)

    def finish(self) -> None:
        """
        Report the current job as finished, repeating its last known
        progress. Only logs a warning when there is no job.
        """
        if self.job is None:
            logger.warning("setting finished without an active job")
            return

        logger.debug("setting finished for job %s", self.job)
        self._send_progress(self.get_progress(), finished=True, failed=False)

    def fail(self) -> None:
        """
        Report the current job as finished and failed, repeating its last
        known progress. Only logs a warning when there is no job.

        Errors while building or sending the report are logged rather than
        raised, so reporting a failure never raises from here.
        """
        if self.job is None:
            logger.warning("setting failure without an active job")
            return

        logger.warning("setting failure for job %s", self.job)
        try:
            self._send_progress(self.get_progress(), finished=True, failed=True)
        except Exception:
            logger.exception("error setting failure on job %s", self.job)

    def _send_progress(self, progress: int, finished: bool, failed: bool) -> None:
        """
        Build a ProgressCommand for the current job, remember it as the last
        progress, and put it on the progress queue without blocking.
        """
        # positional order is (job, device, finished, progress, cancelled,
        # failed); the field names are inferred from how the flags are used
        # here - TODO confirm against ProgressCommand
        self.last_progress = ProgressCommand(
            self.job,
            self.device.device,
            finished,
            progress,
            self.is_cancelled(),
            failed,
        )
        self.progress.put(
            self.last_progress,
            block=False,
        )
|
2023-02-27 23:14:53 +00:00
|
|
|
|
|
|
|
|
|
|
|
class JobStatus:
    """
    Plain record of one job's state: its name, the device it is assigned to,
    the progress counter, and the cancelled/finished flags.
    """

    name: str
    device: str
    progress: int
    cancelled: bool
    finished: bool

    def __init__(
        self,
        name: str,
        device: DeviceParams,
        progress: int = 0,
        cancelled: bool = False,
        finished: bool = False,
    ) -> None:
        """
        Create a status record, defaulting to no progress and neither
        cancelled nor finished.
        """
        # only the `.device` field of the params is kept, not the whole object
        self.name, self.device = name, device.device
        self.progress, self.cancelled, self.finished = progress, cancelled, finished
|