Skip to content

Commit 2621026

Browse files
committed
Improved keeper logging: the keeper now maintains up-to-date statistics for the current day until day rollover, along with network capacities.
1 parent 8e70c8b commit 2621026

File tree

5 files changed

+497
-29
lines changed

5 files changed

+497
-29
lines changed

tensorlink/nodes/contract_manager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ class ContractManager:
1212
"""
1313
Manages blockchain contract interactions for validator proposals and job management.
1414
15-
This class handles the creation, submission, and execution of proposals for
16-
validator removal and job completion on the blockchain.
15+
This class handles the creation, submission, voting, and execution of proposals for
16+
validator removal, job completion, and reward distribution on the blockchain.
1717
"""
1818

1919
def __init__(self, node, multi_sig_contract, chain, public_key: str):

tensorlink/nodes/job_monitor.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ def monitor_job(self, job_id: str):
125125
f"Job monitor beginning for job: {job_id}",
126126
colour="blue",
127127
level=logging.INFO,
128+
tag="Job Monitor",
128129
)
129130

130131
job_data = self._get_job_data(job_id)
@@ -171,6 +172,7 @@ def monitor_job(self, job_id: str):
171172
self.node.debug_print(
172173
f"Validator -> Job inspection complete for job: {job_id}",
173174
colour="blue",
175+
tag="Job Monitor",
174176
)
175177
job_data["last_seen"] = time.time()
176178
self.node.dht.routing_table[job_id] = job_data
@@ -180,6 +182,7 @@ def monitor_job(self, job_id: str):
180182
f"Error in health check cycle: {str(e)}",
181183
colour="bright_red",
182184
level=logging.ERROR,
185+
tag="Job Monitor",
183186
)
184187
break
185188

@@ -240,6 +243,7 @@ def _check_single_worker(self, worker: str, module_id: str) -> bool:
240243
f"Worker health check failed for {worker}: {str(e)}",
241244
colour="yellow",
242245
level=logging.WARNING,
246+
tag="Job Monitor",
243247
)
244248
return False
245249

@@ -287,6 +291,7 @@ def _verify_worker_proof(self, worker: str, proof: Dict, metrics: Dict) -> bool:
287291
f"Error verifying proof of work: {str(e)}",
288292
colour="bright_red",
289293
level=logging.ERROR,
294+
tag="Job Monitor",
290295
)
291296
return False
292297

@@ -322,6 +327,7 @@ def _handle_invalid_proof(self, worker: str, module_id: str):
322327
f"Invalid proof of work from worker {worker} for module {module_id}",
323328
colour="bright_red",
324329
level=logging.WARNING,
330+
tag="Job Monitor",
325331
)
326332

327333
# Record violation
@@ -349,6 +355,7 @@ def _penalize_worker(self, worker: str, module_id: str):
349355
f"Error applying worker penalty: {str(e)}",
350356
colour="bright_red",
351357
level=logging.ERROR,
358+
tag="Job Monitor",
352359
)
353360

354361
def _get_job_data(self, job_id: str) -> Optional[Dict]:
@@ -360,6 +367,7 @@ def _get_job_data(self, job_id: str) -> Optional[Dict]:
360367
f"Failed to retrieve job data: {str(e)}",
361368
colour="bright_red",
362369
level=logging.ERROR,
370+
tag="Job Monitor",
363371
)
364372
return None
365373

@@ -388,6 +396,7 @@ def _check_user_status(self, job_data: Dict) -> bool:
388396
f"Error checking user status: {str(e)}",
389397
colour="yellow",
390398
level=logging.WARNING,
399+
tag="Job Monitor",
391400
)
392401
return False
393402

@@ -411,6 +420,7 @@ def _handle_worker_failure(self, worker: str, module_id: str):
411420
f"Worker {worker} failed for module {module_id}. Initiating recovery...",
412421
colour="yellow",
413422
level=logging.WARNING,
423+
tag="Job Monitor",
414424
)
415425
# Implement worker recovery strategy
416426
# TODO: Add worker replacement logic
@@ -437,6 +447,7 @@ def _cleanup_job(self, job_data: Dict, final_status: JobStatus):
437447
f"Job {job_data['id']} cleaned up successfully with status: {final_status.value}",
438448
colour="green",
439449
level=logging.INFO,
450+
tag="Job Monitor",
440451
)
441452

442453
# Pass over job to contract manager
@@ -448,6 +459,7 @@ def _cleanup_job(self, job_data: Dict, final_status: JobStatus):
448459
f"Error during job cleanup: {str(e)}",
449460
colour="bright_red",
450461
level=logging.ERROR,
462+
tag="Job Monitor",
451463
)
452464

453465
def _cleanup_workers(self, job_data: Dict):
@@ -467,6 +479,7 @@ def _cleanup_workers(self, job_data: Dict):
467479
f"Error shutting down worker {worker}: {str(e)}",
468480
colour="yellow",
469481
level=logging.WARNING,
482+
tag="Job Monitor",
470483
)
471484

472485
def _should_terminate_job(self, job_data: Dict, current_status: JobStatus) -> bool:
@@ -525,6 +538,7 @@ def _handle_job_failure(self, job_id: str, reason: str):
525538
f"Could not retrieve data for failed job {job_id}",
526539
colour="bright_red",
527540
level=logging.ERROR,
541+
tag="Job Monitor",
528542
)
529543
return
530544

@@ -562,11 +576,13 @@ def _handle_job_failure(self, job_id: str, reason: str):
562576
f"Job {job_id} failed: {reason}. Cleaning Up",
563577
colour="bright_red",
564578
level=logging.ERROR,
579+
tag="Job Monitor",
565580
)
566581

567582
except Exception as e:
568583
self.node.debug_print(
569584
f"Error handling job failure: {str(e)}",
570585
colour="bright_red",
571586
level=logging.ERROR,
587+
tag="Job Monitor",
572588
)

0 commit comments

Comments
 (0)