fix(api): attempt to read progress updates from recycled workers
This commit is contained in:
parent
27500eccb5
commit
2d2283e1eb
|
@ -26,7 +26,7 @@ class DevicePoolExecutor:
|
||||||
progress_interval: float
|
progress_interval: float
|
||||||
recycle_interval: float
|
recycle_interval: float
|
||||||
|
|
||||||
leaking: List[Tuple[str, Process]]
|
leaking: List[Tuple[str, Process, WorkerContext]]
|
||||||
context: Dict[str, WorkerContext] # Device -> Context
|
context: Dict[str, WorkerContext] # Device -> Context
|
||||||
current: Dict[str, "Value[int]"] # Device -> pid
|
current: Dict[str, "Value[int]"] # Device -> pid
|
||||||
pending: Dict[str, "Queue[JobCommand]"]
|
pending: Dict[str, "Queue[JobCommand]"]
|
||||||
|
@ -256,7 +256,7 @@ class DevicePoolExecutor:
|
||||||
worker.pid,
|
worker.pid,
|
||||||
device,
|
device,
|
||||||
)
|
)
|
||||||
self.leaking.append((device, worker))
|
self.leak_worker(device)
|
||||||
else:
|
else:
|
||||||
logger.debug("worker for device %s has died", device)
|
logger.debug("worker for device %s has died", device)
|
||||||
|
|
||||||
|
@ -273,10 +273,9 @@ class DevicePoolExecutor:
|
||||||
|
|
||||||
def join_leaking(self):
|
def join_leaking(self):
|
||||||
if len(self.leaking) > 0:
|
if len(self.leaking) > 0:
|
||||||
logger.warning("cleaning up %s leaking workers", len(self.leaking))
|
for device, worker, _context in self.leaking:
|
||||||
for device, worker in self.leaking:
|
logger.warning(
|
||||||
logger.debug(
|
"shutting down leaking worker %s for device %s", worker.pid, device
|
||||||
"shutting down worker %s for device %s", worker.pid, device
|
|
||||||
)
|
)
|
||||||
worker.join(self.join_timeout)
|
worker.join(self.join_timeout)
|
||||||
if worker.is_alive():
|
if worker.is_alive():
|
||||||
|
@ -312,7 +311,7 @@ class DevicePoolExecutor:
|
||||||
worker.pid,
|
worker.pid,
|
||||||
device,
|
device,
|
||||||
)
|
)
|
||||||
self.leaking.append((device, worker))
|
self.leak_worker(device)
|
||||||
else:
|
else:
|
||||||
del worker
|
del worker
|
||||||
|
|
||||||
|
@ -463,6 +462,11 @@ class DevicePoolExecutor:
|
||||||
)
|
)
|
||||||
self.context[progress.device].set_cancel()
|
self.context[progress.device].set_cancel()
|
||||||
|
|
||||||
|
def leak_worker(self, device: str):
|
||||||
|
context = self.context[device]
|
||||||
|
worker = self.workers[device]
|
||||||
|
self.leaking.append((device, worker, context))
|
||||||
|
|
||||||
|
|
||||||
def health_main(pool: DevicePoolExecutor):
|
def health_main(pool: DevicePoolExecutor):
|
||||||
logger.trace("checking in from health worker thread")
|
logger.trace("checking in from health worker thread")
|
||||||
|
@ -494,10 +498,25 @@ def logger_main(pool: DevicePoolExecutor, logs: "Queue[str]"):
|
||||||
logger.exception("error in log worker")
|
logger.exception("error in log worker")
|
||||||
|
|
||||||
|
|
||||||
def progress_main(
|
def progress_main(pool: DevicePoolExecutor):
|
||||||
pool: DevicePoolExecutor
|
|
||||||
):
|
|
||||||
logger.trace("checking in from progress worker thread")
|
logger.trace("checking in from progress worker thread")
|
||||||
|
|
||||||
|
for device, _worker, context in pool.leaking:
|
||||||
|
# whether the worker is alive or not, try to clear its queues
|
||||||
|
try:
|
||||||
|
progress = context.progress.get_nowait()
|
||||||
|
while progress is not None:
|
||||||
|
pool.update_job(progress)
|
||||||
|
progress = context.progress.get_nowait()
|
||||||
|
except Empty:
|
||||||
|
logger.trace("empty queue in leaking worker for device %s", device)
|
||||||
|
pass
|
||||||
|
except ValueError as e:
|
||||||
|
logger.debug("value error in leaking worker for device %s: %s", device, e)
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
logger.exception("error in leaking worker for device %s", device)
|
||||||
|
|
||||||
for device, queue in pool.progress.items():
|
for device, queue in pool.progress.items():
|
||||||
try:
|
try:
|
||||||
progress = queue.get_nowait()
|
progress = queue.get_nowait()
|
||||||
|
|
Loading…
Reference in New Issue