diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 74426fe..9704c51 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,3 +19,4 @@ repos: rev: "v1.8.0" hooks: - id: numpydoc-validation +exclude: tests/data/.* diff --git a/doc/changes/DM-48752.misc.rst b/doc/changes/DM-48752.misc.rst new file mode 100644 index 0000000..890c636 --- /dev/null +++ b/doc/changes/DM-48752.misc.rst @@ -0,0 +1 @@ +Improved reporting of the provisioning job in atypical situations. diff --git a/doc/lsst.ctrl.bps.htcondor/userguide.rst b/doc/lsst.ctrl.bps.htcondor/userguide.rst index 992391d..dd36ada 100644 --- a/doc/lsst.ctrl.bps.htcondor/userguide.rst +++ b/doc/lsst.ctrl.bps.htcondor/userguide.rst @@ -148,6 +148,17 @@ from files. So, the detailed report can distinguish between failed and deleted jobs, and thus will show ``D`` in the flag column for a running workflow if there is a deleted job. +Rarely, a detailed report may warn about job submission issues. For example: + +.. code-block:: bash + + Warn: Job submission issues (last: 01/30/25 10:36:57) + +A job submission issue may be transient or persistent. Either way, it can +cause incorrect statuses or counts in the reports. To get more information +about the submission issue, look in the ``*.dag.dagman.out`` file for +errors, in particular lines containing ``submit attempt failed``. + Occasionally, some jobs are put on hold by HTCondor. To see the reason why jobs are being held, use @@ -276,12 +287,27 @@ Look for the line starting with "Provisioning job status". For example calibrate 0 0 1 0 0 0 0 0 0 0 0 1 finalJob 0 0 1 0 0 0 0 0 0 0 0 1 +If the provisioning job status is UNREADY, check the end of the report to see +if there is a warning about submission issues; the problem may be temporary. +Check the ``*.dag.dagman.out`` file in the run submit directory for errors, in +particular for ``ERROR: submit attempt failed``. + +If the provisioning job status is HELD, the hold reason will appear in parentheses. + The service job managing the glideins will be automatically canceled once the workflow is completed. However, the existing glideins will be left for HTCondor to shut them down once they remain inactive for the period specified by ``provisioningMaxIdleTime`` (default value: 15 min., see below) or maximum wall time is reached. +The provisioning job is expected to run for as long as the workflow does. If the +job dies, the job status will be ``FAILED``. If the job completed successfully +before the workflow finished, the job status will be ``SUCCEEDED`` with a message +saying it ended early (which may or may not cause a problem since existing +glideins could remain running). To get more information about either of these +cases, check the job output and error files in the ``jobs/provisioningJob`` subdirectory. + + If the automatic provisioning of the resources is enabled, the script that the service job is supposed to run in order to provide the required resources *must be* defined by the ``provisioningScript`` setting in the ``provisioning`` diff --git a/pyproject.toml b/pyproject.toml index e118389..d132360 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,6 +113,15 @@ convention = "numpy" # not fit on one line.
add-ignore = ["D107", "D105", "D102", "D100", "D200", "D205", "D400", "D104"] +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "raise AssertionError", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + [tool.ruff] target-version = "py311" line-length = 110 diff --git a/python/lsst/ctrl/bps/htcondor/htcondor_service.py b/python/lsst/ctrl/bps/htcondor/htcondor_service.py index 348453d..671f8c7 100644 --- a/python/lsst/ctrl/bps/htcondor/htcondor_service.py +++ b/python/lsst/ctrl/bps/htcondor/htcondor_service.py @@ -79,7 +79,7 @@ read_dag_log, read_dag_status, read_node_status, - summary_from_dag, + summarize_dag, write_dag_info, ) from .provisioner import Provisioner @@ -154,7 +154,7 @@ def prepare(self, config, generic_workflow, out_prefix=None): if enable_provisioning: provisioner = Provisioner(config) provisioner.configure() - provisioner.prepare("provisioning_job.bash", prefix=out_prefix) + provisioner.prepare("provisioningJob.bash", prefix=out_prefix) provisioner.provision(workflow.dag) with time_this( @@ -1317,9 +1317,9 @@ def _create_detailed_report_from_jobs( job_state_counts=dag_ad.get("state_counts", state_counts), exit_code_summary=_get_exit_code_summary(jobs), ) - + specific_info = WmsSpecificInfo() for job_id, job_ad in jobs.items(): - if not is_service_job(job_id): + if not is_service_job(job_ad): try: job_report = WmsJobReport( wms_id=job_id, @@ -1334,33 +1334,85 @@ def _create_detailed_report_from_jobs( _LOG.error("Job missing key '%s': %s", str(ex), job_ad) raise else: - job_label = job_ad.get("bps_job_label") - if job_label is None: - _LOG.warning("Service job with id '%s': missing label, no action taken", job_id) - elif job_label == dag_ad.get("bps_provisioning_job", "MISS"): - report.specific_info = WmsSpecificInfo() - job_status = _htc_status_to_wms_state(job_ad) - if job_status == WmsStates.DELETED: - if "Reason" in job_ad and "Removed by DAGMan" in job_ad["Reason"]: - job_status = WmsStates.SUCCEEDED - report.specific_info.add_message( - template="Provisioning job status: {status}", - context={"status": job_status.name}, - ) - else: - _LOG.warning( - "Service job with id '%s' (label '%s'): no handler, no action taken", job_id, job_label - ) + _LOG.debug( + "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'", + job_id, + job_ad["DAGNodeName"], + job_ad.get("bps_job_label", "MISS"), + job_ad.get("NodeStatus", "MISS"), + job_ad.get("JobStatus", "MISS"), + ) + _add_service_job_specific_info(job_ad, specific_info) + + if specific_info: + report.specific_info = specific_info # Add the removed entry to restore the original content of the dictionary. # The ordering of keys will be change permanently though. jobs.update({wms_workflow_id: dag_ad}) + # The workflow will exit with a non-zero DAG_STATUS if there is a + # problem with any of the WMS jobs. So change FAILED to SUCCEEDED if + # all payload jobs SUCCEEDED. + if report.total_number_jobs == report.job_state_counts[WmsStates.SUCCEEDED]: + report.state = WmsStates.SUCCEEDED + run_reports = {report.wms_id: report} _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) return run_reports +def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpecificInfo) -> None: + """Generate report information for a service job. + + Parameters + ---------- + job_ad : `dict` [`str`, `Any`] + Service job information. + specific_info : `lsst.ctrl.bps.WmsSpecificInfo` + Where to add the message.
+ """ + status_details = "" + job_status = _htc_status_to_wms_state(job_ad) + + # Service jobs in queue are deleted when DAG is done. + # To get accurate status, need to check other info. + if ( + job_status == WmsStates.DELETED + and "Reason" in job_ad + and ( + "Removed by DAGMan" in job_ad["Reason"] + or "removed because bool: +def is_service_job(job_ad: dict[str, Any]) -> bool: """Determine if a job is a service one. Parameters ---------- - job_id : str - HTCondor job id. + job_ad : `dict` [`str`, Any] + Information about an HTCondor job. Returns ------- @@ -2159,10 +2211,7 @@ def is_service_job(job_id: str) -> bool: Notes ----- At the moment, HTCondor does not provide a native way to distinguish - between payload and service jobs in the workflow. As a result, the current - implementation depends entirely on the logic that is used in - :py:func:`read_node_status()` (service jobs are given ids with ClusterId=0 - and ProcId=some integer). If it changes, this function needs to be - updated too. + between payload and service jobs in the workflow. This code depends + on read_node_status adding bps_job_type. """ - return int(float(job_id)) == 0 + return job_ad.get("bps_job_type", "MISSING") == "service" diff --git a/python/lsst/ctrl/bps/htcondor/lssthtc.py b/python/lsst/ctrl/bps/htcondor/lssthtc.py index 6a69347..a4092c5 100644 --- a/python/lsst/ctrl/bps/htcondor/lssthtc.py +++ b/python/lsst/ctrl/bps/htcondor/lssthtc.py @@ -63,7 +63,8 @@ "read_dag_nodes_log", "read_dag_status", "read_node_status", - "summary_from_dag", + "summarize_dag", + "update_job_info", "update_job_info", "write_dag_info", ] @@ -1245,7 +1246,7 @@ def update_job_info(job_info, other_info): return job_info -def summary_from_dag(dir_name): +def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]: """Build bps_run_summary string from dag file. Parameters @@ -1256,51 +1257,64 @@ def summary_from_dag(dir_name): Returns ------- summary : `str` - Semi-colon separated list of job labels and counts. + Semi-colon separated list of job labels and counts (Same format as saved in dag classad). job_name_to_label : `dict` [`str`, `str`] Mapping of job names to job labels. + job_name_to_type : `dict` [`str`, `str`] + Mapping of job names to job types + (e.g., payload, final, service). """ # Later code depends upon insertion order - counts = defaultdict(int) + counts: defaultdict[str, int] = defaultdict(int) # counts of payload jobs per label job_name_to_label = {} + job_name_to_type = {} try: dag = next(Path(dir_name).glob("*.dag")) with open(dag) as fh: for line in fh: + job_name = "" if line.startswith("JOB"): - m = re.match(r'JOB (\S+) "jobs/([^/]+)/', line) + m = re.match(r'JOB (\S+) "?jobs/([^/]+)/', line) if m: + job_name = m.group(1) label = m.group(2) if label == "init": label = "pipetaskInit" - job_name_to_label[m.group(1)] = label counts[label] += 1 else: # Check if Pegasus submission m = re.match(r"JOB (\S+) (\S+)", line) if m: + job_name = m.group(1) label = pegasus_name_to_label(m.group(1)) - job_name_to_label[m.group(1)] = label counts[label] += 1 else: _LOG.warning("Parse DAG: unmatched job line: %s", line) + job_type = "payload" elif line.startswith("FINAL"): m = re.match(r"FINAL (\S+) jobs/([^/]+)/", line) if m: + job_name = m.group(1) label = m.group(2) - job_name_to_label[m.group(1)] = label - counts[label] += 1 + counts[label] += 1 # final counts a payload job. 
+ job_type = "final" elif line.startswith("SERVICE"): m = re.match(r"SERVICE (\S+) jobs/([^/]+)/", line) if m: + job_name = m.group(1) label = m.group(2) - job_name_to_label[m.group(1)] = label + job_type = "service" + + if job_name: + job_name_to_label[job_name] = label + job_name_to_type[job_name] = job_type + except (OSError, PermissionError, StopIteration): pass summary = ";".join([f"{name}:{counts[name]}" for name in counts]) - _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_label) - return summary, job_name_to_label + _LOG.debug("summarize_dag: %s %s %s", summary, job_name_to_label, job_name_to_type) + return summary, job_name_to_label, job_name_to_type def pegasus_name_to_label(name): @@ -1400,7 +1414,7 @@ def read_node_status(wms_path): file. """ # Get jobid info from other places to fill in gaps in info from node_status - _, job_name_to_label = summary_from_dag(wms_path) + _, job_name_to_label, job_name_to_type = summarize_dag(wms_path) wms_workflow_id, loginfo = read_dag_log(wms_path) loginfo = read_dag_nodes_log(wms_path) _LOG.debug("loginfo = %s", loginfo) @@ -1409,17 +1423,17 @@ def read_node_status(wms_path): if "LogNotes" in job_info: m = re.match(r"DAG Node: (\S+)", job_info["LogNotes"]) if m: - job_name_to_id[m.group(1)] = job_id - job_info["DAGNodeName"] = m.group(1) + job_name = m.group(1) + job_name_to_id[job_name] = job_id + job_info["DAGNodeName"] = job_name + job_info["bps_job_type"] = job_name_to_type[job_name] + job_info["bps_job_label"] = job_name_to_label[job_name] + jobs = loginfo + fake_id = -1.0 # For nodes that do not yet have a job id, give fake one try: node_status = next(Path(wms_path).glob("*.node_status")) - except StopIteration: - return loginfo - jobs = {} - fake_id = -1.0 # For nodes that do not yet have a job id, give fake one - try: with open(node_status) as fh: for ad in classad.parseAds(fh): match ad["Type"]: @@ -1438,22 +1452,19 @@ def read_node_status(wms_path): # Make job info as if came from condor_q. if job_name in job_name_to_id: job_id = str(job_name_to_id[job_name]) + job = jobs[job_id] else: job_id = str(fake_id) + job_name_to_id[job_name] = job_id + job = dict(ad) + jobs[job_id] = job fake_id -= 1 - job = dict(ad) job["ClusterId"] = int(float(job_id)) job["DAGManJobID"] = wms_workflow_id job["DAGNodeName"] = job_name job["bps_job_label"] = job_label + job["bps_job_type"] = job_name_to_type[job_name] - # Include information retrieved from the event log - # if available. - jobs[job_id] = job - try: - jobs[job_id] |= loginfo[job_id] - except KeyError: - pass case "StatusEnd": # Skip node status file "epilog". pass @@ -1463,24 +1474,22 @@ def read_node_status(wms_path): ad["Type"], wms_path, ) - except (OSError, PermissionError): + except (StopIteration, OSError, PermissionError): pass - else: - # Assume that the jobs found in the event log, but *not* in the node - # status file are the service jobs as HTCondor does not include - # information about these jobs in the node status file at the moment. - # - # Note: To be able to easily identify the service jobs downstream, - # we reverse the ClusterId and ProcId in their HTCondor ids in internal - # use. For example, if HTCondor id of a service job is '1.0', we will - # use '0.1' instead. 
- service_jobs = {job_id: loginfo[job_id] for job_id in set(loginfo) - set(jobs)} - job_id_to_name = { - job_id: job_name for job_name, job_id in job_name_to_id.items() if job_id in service_jobs - } - for job_id, job_info in service_jobs.items(): - job_info["bps_job_label"] = job_name_to_label[job_id_to_name[job_id]] - jobs[f"{job_info['ProcId']}.{job_info['ClusterId']}"] = job_info + + # Check for missing jobs (e.g., submission failure or not submitted yet) + # Use dag info to create job placeholders + for name in set(job_name_to_label) - set(job_name_to_id): + job = {} + job["ClusterId"] = int(float(fake_id)) + job["ProcId"] = 0 + job["DAGManJobID"] = wms_workflow_id + job["DAGNodeName"] = name + job["bps_job_label"] = job_name_to_label[name] + job["bps_job_type"] = job_name_to_type[name] + job["NodeStatus"] = NodeStatus.NOT_READY + jobs[f"{job['ClusterId']}.{job['ProcId']}"] = job + fake_id -= 1 return jobs diff --git a/tests/data/tiny_problems/tiny_problems.dag b/tests/data/tiny_problems/tiny_problems.dag new file mode 100644 index 0000000..05b0c9b --- /dev/null +++ b/tests/data/tiny_problems/tiny_problems.dag @@ -0,0 +1,26 @@ +JOB pipetaskInit "jobs/pipetaskInit/pipetaskInit.sub" +JOB 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a "jobs/label1/val1/4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a.sub" +JOB 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b "jobs/label1/val1/057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b.sub" +JOB 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a "jobs/label2/val1/696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a.sub" +JOB 40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b "jobs/label2/val1/40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b.sub" +PARENT pipetaskInit CHILD 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a +PARENT pipetaskInit CHILD 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b +PARENT 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a CHILD 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a +PARENT 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b CHILD 40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b +DOT u_testuser_tiny_20250213T175935Z.dot +NODE_STATUS_FILE u_testuser_tiny_20250213T175935Z.node_status +SET_JOB_ATTR bps_isjob= "True" +SET_JOB_ATTR bps_project= "dev" +SET_JOB_ATTR bps_campaign= "tiny" +SET_JOB_ATTR bps_run= "u_testuser_tiny_20250213T175935Z" +SET_JOB_ATTR bps_operator= "testuser" +SET_JOB_ATTR bps_payload= "tiny" +SET_JOB_ATTR bps_runsite= "testpool" +SET_JOB_ATTR bps_wms_service= "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorService" +SET_JOB_ATTR bps_wms_workflow= "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorWorkflow" +SET_JOB_ATTR bps_run_quanta= "label1:2;label2:2" +SET_JOB_ATTR bps_job_summary= "pipetaskInit:1;label1:2;label2:2;finalJob:1" +SET_JOB_ATTR bps_provisioning_job= "provisioningJob" +FINAL finalJob jobs/finalJob/finalJob.sub +SCRIPT POST finalJob /work/testuser/ctrl_bps_htcondor/python/lsst/ctrl/bps/htcondor/final_post.sh finalJob $DAG_STATUS $RETURN +SERVICE provisioningJob jobs/provisioningJob/provisioningJob.sub diff --git a/tests/data/tiny_problems/tiny_problems.dag.dagman.log b/tests/data/tiny_problems/tiny_problems.dag.dagman.log new file mode 100644 index 0000000..8709efb --- /dev/null +++ b/tests/data/tiny_problems/tiny_problems.dag.dagman.log @@ -0,0 +1,15 @@ +000 (9228.000.000) 2025-02-13 11:59:46 Job submitted from host: 
<10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> +... +001 (9228.000.000) 2025-02-13 11:59:46 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> +... +005 (9228.000.000) 2025-02-13 12:00:49 Job terminated. + (1) Normal termination (return value 1) + Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 0 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job +... diff --git a/tests/data/tiny_problems/tiny_problems.dag.dagman.out b/tests/data/tiny_problems/tiny_problems.dag.dagman.out new file mode 100644 index 0000000..d766490 --- /dev/null +++ b/tests/data/tiny_problems/tiny_problems.dag.dagman.out @@ -0,0 +1,573 @@ +02/13/25 11:59:46 Result of reading /etc/issue: \S + +02/13/25 11:59:46 Result of reading /etc/redhat-release: AlmaLinux release 9.5 (Teal Serval) + +02/13/25 11:59:46 Using IDs: 20 processors, 10 CPUs, 10 HTs +02/13/25 11:59:46 Enumerating interfaces: lo 127.0.0.1 up +02/13/25 11:59:46 Enumerating interfaces: enp11s0 10.0.0.33 up +02/13/25 11:59:46 Enumerating interfaces: lo ::1 up +02/13/25 11:59:46 Enumerating interfaces: enp11s0 2601:248:8500:b50::d234 up +02/13/25 11:59:46 Enumerating interfaces: enp11s0 2601:248:8500:b50:b696:91ff:fe06:4d05 up +02/13/25 11:59:46 Enumerating interfaces: enp11s0 fe80::b696:91ff:fe06:4d05 up +02/13/25 11:59:46 Directory::Rewind(): path "/work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/etc/condor/config.d" does not exist (yet) +02/13/25 11:59:46 Cannot open /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/etc/condor/config.d: No such file or directory +02/13/25 11:59:46 ****************************************************** +02/13/25 11:59:46 ** condor_scheduniv_exec.9228.0 (CONDOR_DAGMAN) STARTING UP +02/13/25 11:59:46 ** /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_dagman +02/13/25 11:59:46 ** SubsystemInfo: name=DAGMAN type=DAGMAN(9) class=CLIENT(2) +02/13/25 11:59:46 ** Configuration: subsystem:DAGMAN local: class:CLIENT +02/13/25 11:59:46 ** $CondorVersion: 23.0.3 2024-04-04 $ +02/13/25 11:59:46 ** $CondorPlatform: X86_64-CentOS_7.9 $ +02/13/25 11:59:46 ** PID = 61169 +02/13/25 11:59:46 ** Log last touched time unavailable (No such file or directory) +02/13/25 11:59:46 ****************************************************** +02/13/25 11:59:46 Using config source: /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/etc/condor/condor_config +02/13/25 11:59:46 Using local config sources: +02/13/25 11:59:46 /etc/condor/condor_config +02/13/25 11:59:46 /etc/condor/config.d/00-minicondor +02/13/25 11:59:46 /etc/condor/config.d/00-security +02/13/25 11:59:46 /etc/condor/config.d/10-stash-plugin.conf +02/13/25 11:59:46 /etc/condor/config.d/99-lsst +02/13/25 11:59:46 /etc/condor/condor_config.local +02/13/25 11:59:46 /home/testuser/.condor/user_config +02/13/25 11:59:46 config Macros = 101, Sorted = 101, StringBytes = 3173, TablesBytes = 3724 +02/13/25 11:59:46 CLASSAD_CACHING is ENABLED +02/13/25 11:59:46 Daemon Log is logging: D_ALWAYS:2 D_ERROR D_STATUS +02/13/25 11:59:46 Internal pipe for signals resized to 4096 from 65536 +02/13/25 11:59:46 DaemonCore: No command port requested. +02/13/25 11:59:46 Setting maximum accepts per cycle 8. 
+02/13/25 11:59:46 Setting maximum UDP messages per cycle 100. +02/13/25 11:59:46 Will use TCP to update collector <10.0.0.33:9618> +02/13/25 11:59:46 Not using shared port because no command port requested +02/13/25 11:59:46 DAGMAN_USE_STRICT setting: 1 +02/13/25 11:59:46 DAGMAN_VERBOSITY setting: 3 +02/13/25 11:59:46 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 +02/13/25 11:59:46 DAGMAN_DEBUG_CACHE_ENABLE setting: False +02/13/25 11:59:46 DAGMAN_SUBMIT_DELAY setting: 0 +02/13/25 11:59:46 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 +02/13/25 11:59:46 DAGMAN_STARTUP_CYCLE_DETECT setting: False +02/13/25 11:59:46 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 100 +02/13/25 11:59:46 DAGMAN_AGGRESSIVE_SUBMIT setting: False +02/13/25 11:59:46 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 1 +02/13/25 11:59:46 DAGMAN_QUEUE_UPDATE_INTERVAL setting: 30 +02/13/25 11:59:46 DAGMAN_DEFAULT_PRIORITY setting: 0 +02/13/25 11:59:46 DAGMAN_SUPPRESS_NOTIFICATION setting: True +02/13/25 11:59:46 allow_events (DAGMAN_ALLOW_EVENTS) setting: 114 +02/13/25 11:59:46 DAGMAN_RETRY_SUBMIT_FIRST setting: True +02/13/25 11:59:46 DAGMAN_RETRY_NODE_FIRST setting: False +02/13/25 11:59:46 DAGMAN_MAX_JOBS_IDLE setting: 1000 +02/13/25 11:59:46 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 +02/13/25 11:59:46 DAGMAN_MAX_PRE_SCRIPTS setting: 20 +02/13/25 11:59:46 DAGMAN_MAX_POST_SCRIPTS setting: 20 +02/13/25 11:59:46 DAGMAN_MAX_HOLD_SCRIPTS setting: 20 +02/13/25 11:59:46 DAGMAN_MUNGE_NODE_NAMES setting: True +02/13/25 11:59:46 DAGMAN_PROHIBIT_MULTI_JOBS setting: False +02/13/25 11:59:46 DAGMAN_SUBMIT_DEPTH_FIRST setting: False +02/13/25 11:59:46 DAGMAN_ALWAYS_RUN_POST setting: False +02/13/25 11:59:46 DAGMAN_CONDOR_SUBMIT_EXE setting: /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_submit +02/13/25 11:59:46 DAGMAN_USE_DIRECT_SUBMIT setting: True +02/13/25 11:59:46 DAGMAN_DEFAULT_APPEND_VARS setting: False +02/13/25 11:59:46 DAGMAN_ABORT_DUPLICATES setting: True +02/13/25 11:59:46 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True +02/13/25 11:59:46 DAGMAN_PENDING_REPORT_INTERVAL setting: 60 +02/13/25 11:59:46 DAGMAN_AUTO_RESCUE setting: True +02/13/25 11:59:46 DAGMAN_MAX_RESCUE_NUM setting: 100 +02/13/25 11:59:46 DAGMAN_WRITE_PARTIAL_RESCUE setting: True +02/13/25 11:59:46 DAGMAN_DEFAULT_NODE_LOG setting: @(DAG_DIR)/@(DAG_FILE).nodes.log +02/13/25 11:59:46 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True +02/13/25 11:59:46 DAGMAN_MAX_JOB_HOLDS setting: 100 +02/13/25 11:59:46 DAGMAN_HOLD_CLAIM_TIME setting: 20 +02/13/25 11:59:46 ALL_DEBUG setting: D_FULLDEBUG +02/13/25 11:59:46 DAGMAN_DEBUG setting: +02/13/25 11:59:46 DAGMAN_SUPPRESS_JOB_LOGS setting: False +02/13/25 11:59:46 DAGMAN_REMOVE_NODE_JOBS setting: True +02/13/25 11:59:46 DAGMAN will adjust edges after parsing +02/13/25 11:59:46 argv[0] == "condor_scheduniv_exec.9228.0" +02/13/25 11:59:46 argv[1] == "-Lockfile" +02/13/25 11:59:46 argv[2] == "/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.lock" +02/13/25 11:59:46 argv[3] == "-AutoRescue" +02/13/25 11:59:46 argv[4] == "1" +02/13/25 11:59:46 argv[5] == "-DoRescueFrom" +02/13/25 11:59:46 argv[6] == "0" +02/13/25 11:59:46 argv[7] == "-Dag" +02/13/25 11:59:46 argv[8] == "/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag" +02/13/25 11:59:46 argv[9] == "-Suppress_notification" +02/13/25 11:59:46 argv[10] == "-CsdVersion" +02/13/25 11:59:46 argv[11] == "$CondorVersion: 23.0.3 2024-04-04 $" +02/13/25 11:59:46 argv[12] == "-Dagman" +02/13/25 11:59:46 argv[13] == 
"/work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_dagman" +02/13/25 11:59:46 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:46 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:46 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:46 Workflow batch-id: <9228.0> +02/13/25 11:59:46 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:46 Workflow batch-name: +02/13/25 11:59:46 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:46 Workflow accounting_group: <> +02/13/25 11:59:46 Workflow accounting_group_user: <> +02/13/25 11:59:46 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:46 Warning: failed to get attribute DAGNodeName +02/13/25 11:59:46 DAGMAN_LOG_ON_NFS_IS_ERROR setting: False +02/13/25 11:59:46 Default node log file is: +02/13/25 11:59:46 DAG Lockfile will be written to /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.lock +02/13/25 11:59:46 DAG Input file is /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag +02/13/25 11:59:46 Parsing 1 dagfiles +02/13/25 11:59:46 Parsing /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag ... +02/13/25 11:59:46 TmpDir(0)::TmpDir() +02/13/25 11:59:46 TmpDir(1)::TmpDir() +02/13/25 11:59:46 TmpDir(1)::Cd2MainDir() +02/13/25 11:59:46 TmpDir(1)::~TmpDir() +02/13/25 11:59:46 TmpDir(2)::TmpDir() +02/13/25 11:59:46 TmpDir(2)::Cd2MainDir() +02/13/25 11:59:46 TmpDir(2)::~TmpDir() +02/13/25 11:59:46 TmpDir(3)::TmpDir() +02/13/25 11:59:46 TmpDir(3)::Cd2MainDir() +02/13/25 11:59:46 TmpDir(3)::~TmpDir() +02/13/25 11:59:46 TmpDir(4)::TmpDir() +02/13/25 11:59:46 TmpDir(4)::Cd2MainDir() +02/13/25 11:59:46 TmpDir(4)::~TmpDir() +02/13/25 11:59:46 TmpDir(5)::TmpDir() +02/13/25 11:59:46 TmpDir(5)::Cd2MainDir() +02/13/25 11:59:46 TmpDir(5)::~TmpDir() +02/13/25 11:59:46 TmpDir(6)::TmpDir() +02/13/25 11:59:46 TmpDir(6)::Cd2MainDir() +02/13/25 11:59:46 TmpDir(6)::~TmpDir() +02/13/25 11:59:46 TmpDir(7)::TmpDir() +02/13/25 11:59:46 TmpDir(7)::Cd2MainDir() +02/13/25 11:59:46 TmpDir(7)::~TmpDir() +02/13/25 11:59:46 TmpDir(0)::~TmpDir() +02/13/25 11:59:46 Adjusting edges +02/13/25 11:59:46 Dag contains 6 total jobs +02/13/25 11:59:46 Bootstrapping... 
+02/13/25 11:59:46 Number of pre-completed nodes: 0 +02/13/25 11:59:46 ReadMultipleUserLogs::monitorLogFile(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log, 1) +02/13/25 11:59:46 MultiLogFiles::InitializeFile(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log, 0) +02/13/25 11:59:46 ReadMultipleUserLogs: didn't find LogFileMonitor object for /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log (64770:484441158) +02/13/25 11:59:46 MultiLogFiles::InitializeFile(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log, 1) +02/13/25 11:59:46 MultiLogFiles: truncating log file /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log +02/13/25 11:59:46 ReadMultipleUserLogs: created LogFileMonitor object for log file /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log +02/13/25 11:59:46 init: Opening file /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log +02/13/25 11:59:46 Opening log file #0 '/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log' (is_lock_cur=false,seek=false,read_header=true) +02/13/25 11:59:46 Error, apparently invalid user log file +02/13/25 11:59:46 ReadMultipleUserLogs: added log file /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log (64770:484441158) to active list +02/13/25 11:59:46 Starting service node provisioningJob... +02/13/25 11:59:46 DAG status: 0 (DAG_STATUS_OK) +02/13/25 11:59:46 Of 6 nodes total: +02/13/25 11:59:46 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 11:59:46 === === === === === === === === +02/13/25 11:59:46 0 0 0 0 1 5 0 0 +02/13/25 11:59:46 0 job proc(s) currently held +02/13/25 11:59:46 DAGMan Runtime Statistics: [ EventCycleTimeCount = 0.0; EventCycleTimeSum = 0.0; LogProcessCycleTimeCount = 0.0; LogProcessCycleTimeSum = 0.0; SleepCycleTimeCount = 0.0; SleepCycleTimeSum = 0.0; SubmitCycleTimeCount = 0.0; SubmitCycleTimeSum = 0.0; ] +02/13/25 11:59:46 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:46 Registering condor_event_timer... +02/13/25 11:59:47 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:47 Submitting HTCondor Node provisioningJob job(s)... +02/13/25 11:59:47 TmpDir(8)::TmpDir() +02/13/25 11:59:47 TmpDir(8)::Cd2TmpDir() +02/13/25 11:59:47 Submitting node provisioningJob from file jobs/provisioningJob/provisioningJob.sub using direct job submission +02/13/25 11:59:47 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:47 TmpDir(8)::Cd2MainDir() +02/13/25 11:59:47 TmpDir(8)::~TmpDir() +02/13/25 11:59:47 assigned HTCondor ID (9229.0.0) +02/13/25 11:59:47 Submitting HTCondor Node pipetaskInit job(s)... 
+02/13/25 11:59:47 TmpDir(9)::TmpDir() +02/13/25 11:59:47 TmpDir(9)::Cd2TmpDir() +02/13/25 11:59:47 Submitting node pipetaskInit from file jobs/pipetaskInit/pipetaskInit.sub using direct job submission +02/13/25 11:59:47 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:47 TmpDir(9)::Cd2MainDir() +02/13/25 11:59:47 TmpDir(9)::~TmpDir() +02/13/25 11:59:47 assigned HTCondor ID (9230.0.0) +02/13/25 11:59:47 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 11:59:47 Just submitted 2 jobs this cycle... +02/13/25 11:59:47 DAG status: 0 (DAG_STATUS_OK) +02/13/25 11:59:47 Of 6 nodes total: +02/13/25 11:59:47 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 11:59:47 === === === === === === === === +02/13/25 11:59:47 0 0 1 0 0 5 0 0 +02/13/25 11:59:47 0 job proc(s) currently held +02/13/25 11:59:47 DAGMan Runtime Statistics: [ EventCycleTimeCount = 0.0; EventCycleTimeSum = 0.0; LogProcessCycleTimeCount = 0.0; LogProcessCycleTimeSum = 0.0; SleepCycleTimeCount = 0.0; SleepCycleTimeSum = 0.0; SubmitCycleTimeAvg = 0.1274127960205078; SubmitCycleTimeCount = 1.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 0.1274127960205078; SubmitCycleTimeStd = 0.1274127960205078; SubmitCycleTimeSum = 0.1274127960205078; ] +02/13/25 11:59:48 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:48 Currently monitoring 1 HTCondor log file(s) +02/13/25 11:59:48 ReadMultipleUserLogs::readEvent() +02/13/25 11:59:48 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 11:59:48 Reassigning the id of job provisioningJob from (9229.0.0) to (9229.0.0) +02/13/25 11:59:48 Event: ULOG_SUBMIT for HTCondor Node provisioningJob (9229.0.0) {02/13/25 11:59:47} +02/13/25 11:59:48 Number of idle job procs: 1 +02/13/25 11:59:48 ReadMultipleUserLogs::readEvent() +02/13/25 11:59:48 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 11:59:48 Reassigning the id of job pipetaskInit from (9230.0.0) to (9230.0.0) +02/13/25 11:59:48 Event: ULOG_SUBMIT for HTCondor Node pipetaskInit (9230.0.0) {02/13/25 11:59:47} +02/13/25 11:59:48 Number of idle job procs: 2 +02/13/25 11:59:48 ReadMultipleUserLogs::readEvent() +02/13/25 11:59:48 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 11:59:49 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:50 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:51 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:52 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:52 Currently monitoring 1 HTCondor log file(s) +02/13/25 11:59:52 ReadMultipleUserLogs::readEvent() +02/13/25 11:59:52 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 11:59:52 Event: ULOG_EXECUTE for HTCondor Node provisioningJob (9229.0.0) {02/13/25 11:59:51} +02/13/25 11:59:52 Number of idle job procs: 1 +02/13/25 11:59:52 ReadMultipleUserLogs::readEvent() +02/13/25 11:59:52 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 11:59:53 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:54 
ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:55 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:56 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:56 Currently monitoring 1 HTCondor log file(s) +02/13/25 11:59:56 ReadMultipleUserLogs::readEvent() +02/13/25 11:59:56 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 11:59:56 Event: ULOG_EXECUTE for HTCondor Node pipetaskInit (9230.0.0) {02/13/25 11:59:55} +02/13/25 11:59:56 Number of idle job procs: 0 +02/13/25 11:59:56 ReadMultipleUserLogs::readEvent() +02/13/25 11:59:56 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 11:59:57 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:58 ReadMultipleUserLogs::GetLogStatus() +02/13/25 11:59:59 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:00 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:01 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:02 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:03 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:04 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:05 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:06 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:07 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:08 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:08 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:08 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:08 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:08 Event: ULOG_JOB_TERMINATED for HTCondor Node pipetaskInit (9230.0.0) {02/13/25 12:00:07} +02/13/25 12:00:08 Number of idle job procs: 0 +02/13/25 12:00:08 Node pipetaskInit job proc (9230.0.0) completed successfully. 
+02/13/25 12:00:08 Node pipetaskInit job completed +02/13/25 12:00:08 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:08 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:08 DAG status: 0 (DAG_STATUS_OK) +02/13/25 12:00:08 Of 6 nodes total: +02/13/25 12:00:08 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:08 === === === === === === === === +02/13/25 12:00:08 1 0 0 0 2 3 0 0 +02/13/25 12:00:08 0 job proc(s) currently held +02/13/25 12:00:08 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.006201301302228656; EventCycleTimeCount = 21.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02785429546285483; EventCycleTimeSum = 0.1302273273468018; LogProcessCycleTimeAvg = 0.0001814961433410645; LogProcessCycleTimeCount = 4.0; LogProcessCycleTimeMax = 0.0002131462097167969; LogProcessCycleTimeMin = 0.0001490116119384766; LogProcessCycleTimeStd = 2.639554977691035E-05; LogProcessCycleTimeSum = 0.0007259845733642578; SleepCycleTimeAvg = 1.001077697390602; SleepCycleTimeCount = 21.0; SleepCycleTimeMax = 1.001123905181885; SleepCycleTimeMin = 1.001029014587402; SleepCycleTimeStd = 2.576064202461596E-05; SleepCycleTimeSum = 21.02263164520264; SubmitCycleTimeAvg = 0.005813663656061346; SubmitCycleTimeCount = 22.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.02715956000814675; SubmitCycleTimeSum = 0.1279006004333496; ] +02/13/25 12:00:09 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:09 Submitting HTCondor Node 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b job(s)... +02/13/25 12:00:09 TmpDir(10)::TmpDir() +02/13/25 12:00:09 TmpDir(10)::Cd2TmpDir() +02/13/25 12:00:09 Submitting node 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b from file jobs/label1/val1/057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b.sub using direct job submission +02/13/25 12:00:09 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:09 TmpDir(10)::Cd2MainDir() +02/13/25 12:00:09 TmpDir(10)::~TmpDir() +02/13/25 12:00:09 assigned HTCondor ID (9231.0.0) +02/13/25 12:00:09 Submitting HTCondor Node 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a job(s)... +02/13/25 12:00:09 TmpDir(11)::TmpDir() +02/13/25 12:00:09 TmpDir(11)::Cd2TmpDir() +02/13/25 12:00:09 Submitting node 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a from file jobs/label1/val1/4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a.sub using direct job submission +02/13/25 12:00:09 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:09 TmpDir(11)::Cd2MainDir() +02/13/25 12:00:09 TmpDir(11)::~TmpDir() +02/13/25 12:00:09 assigned HTCondor ID (9232.0.0) +02/13/25 12:00:09 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:09 Just submitted 2 jobs this cycle... 
+02/13/25 12:00:09 DAG status: 0 (DAG_STATUS_OK) +02/13/25 12:00:09 Of 6 nodes total: +02/13/25 12:00:09 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:09 === === === === === === === === +02/13/25 12:00:09 1 0 2 0 0 3 0 0 +02/13/25 12:00:09 0 job proc(s) currently held +02/13/25 12:00:09 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.005936655131253329; EventCycleTimeCount = 22.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02721133594654065; EventCycleTimeSum = 0.1306064128875732; LogProcessCycleTimeAvg = 0.0001814961433410645; LogProcessCycleTimeCount = 4.0; LogProcessCycleTimeMax = 0.0002131462097167969; LogProcessCycleTimeMin = 0.0001490116119384766; LogProcessCycleTimeStd = 2.639554977691035E-05; LogProcessCycleTimeSum = 0.0007259845733642578; SleepCycleTimeAvg = 1.001076936721802; SleepCycleTimeCount = 22.0; SleepCycleTimeMax = 1.001123905181885; SleepCycleTimeMin = 1.001029014587402; SleepCycleTimeStd = 2.539172525361444E-05; SleepCycleTimeSum = 22.02369260787964; SubmitCycleTimeAvg = 0.01071233334748641; SubmitCycleTimeCount = 23.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.03544069300694536; SubmitCycleTimeSum = 0.2463836669921875; ] +02/13/25 12:00:10 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:10 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:10 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:10 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:10 Reassigning the id of job 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b from (9231.0.0) to (9231.0.0) +02/13/25 12:00:10 Event: ULOG_SUBMIT for HTCondor Node 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b (9231.0.0) {02/13/25 12:00:09} +02/13/25 12:00:10 Number of idle job procs: 1 +02/13/25 12:00:10 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:10 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:10 Reassigning the id of job 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a from (9232.0.0) to (9232.0.0) +02/13/25 12:00:10 Event: ULOG_SUBMIT for HTCondor Node 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a (9232.0.0) {02/13/25 12:00:09} +02/13/25 12:00:10 Number of idle job procs: 2 +02/13/25 12:00:10 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:10 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:10 Event: ULOG_EXECUTE for HTCondor Node 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b (9231.0.0) {02/13/25 12:00:09} +02/13/25 12:00:10 Number of idle job procs: 1 +02/13/25 12:00:10 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:10 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:11 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:11 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:11 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:11 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:11 Event: ULOG_EXECUTE for HTCondor Node 
4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a (9232.0.0) {02/13/25 12:00:11} +02/13/25 12:00:11 Number of idle job procs: 0 +02/13/25 12:00:11 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:11 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:12 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:13 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:14 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:15 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:16 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:17 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:17 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:18 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:19 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:20 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:20 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:20 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:20 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:20 Event: ULOG_JOB_TERMINATED for HTCondor Node 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b (9231.0.0) {02/13/25 12:00:19} +02/13/25 12:00:20 Number of idle job procs: 0 +02/13/25 12:00:20 Node 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b job proc (9231.0.0) failed with status 1. +02/13/25 12:00:20 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:20 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:20 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/13/25 12:00:20 Of 6 nodes total: +02/13/25 12:00:20 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:20 === === === === === === === === +02/13/25 12:00:20 1 0 1 0 0 2 1 1 +02/13/25 12:00:20 0 job proc(s) currently held +02/13/25 12:00:20 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.007817853580821644; EventCycleTimeCount = 33.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02982489759928784; EventCycleTimeSum = 0.2579891681671143; LogProcessCycleTimeAvg = 0.0002171993255615234; LogProcessCycleTimeCount = 7.0; LogProcessCycleTimeMax = 0.00032806396484375; LogProcessCycleTimeMin = 0.0001490116119384766; LogProcessCycleTimeStd = 7.319696300074032E-05; LogProcessCycleTimeSum = 0.001520395278930664; SleepCycleTimeAvg = 1.001071966055668; SleepCycleTimeCount = 33.0; SleepCycleTimeMax = 1.001123905181885; SleepCycleTimeMin = 1.001029014587402; SleepCycleTimeStd = 2.3962188467318E-05; SleepCycleTimeSum = 33.03537487983704; SubmitCycleTimeAvg = 0.007253415444317986; SubmitCycleTimeCount = 34.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.02937917256241071; SubmitCycleTimeSum = 0.2466161251068115; ] +02/13/25 12:00:21 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:21 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:21 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:21 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:21 Event: ULOG_JOB_TERMINATED for HTCondor Node 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a (9232.0.0) 
{02/13/25 12:00:21} +02/13/25 12:00:21 Number of idle job procs: 0 +02/13/25 12:00:21 Node 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a job proc (9232.0.0) completed successfully. +02/13/25 12:00:21 Node 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a job completed +02/13/25 12:00:21 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:21 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:21 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/13/25 12:00:21 Of 6 nodes total: +02/13/25 12:00:21 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:21 === === === === === === === === +02/13/25 12:00:21 2 0 0 0 1 1 1 1 +02/13/25 12:00:21 0 job proc(s) currently held +02/13/25 12:00:21 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.007607382886550005; EventCycleTimeCount = 34.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02939515874199166; EventCycleTimeSum = 0.2586510181427002; LogProcessCycleTimeAvg = 0.0002269148826599121; LogProcessCycleTimeCount = 8.0; LogProcessCycleTimeMax = 0.00032806396484375; LogProcessCycleTimeMin = 0.0001490116119384766; LogProcessCycleTimeStd = 7.312682441209861E-05; LogProcessCycleTimeSum = 0.001815319061279297; SleepCycleTimeAvg = 1.00107302385218; SleepCycleTimeCount = 34.0; SleepCycleTimeMax = 1.001123905181885; SleepCycleTimeMin = 1.001029014587402; SleepCycleTimeStd = 2.438914996316164E-05; SleepCycleTimeSum = 34.03648281097412; SubmitCycleTimeAvg = 0.00704646110534668; SubmitCycleTimeCount = 35.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.02896978576746286; SubmitCycleTimeSum = 0.2466261386871338; ] +02/13/25 12:00:22 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:22 Submitting HTCondor Node 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a job(s)... +02/13/25 12:00:22 TmpDir(12)::TmpDir() +02/13/25 12:00:22 TmpDir(12)::Cd2TmpDir() +02/13/25 12:00:22 Submitting node 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a from file jobs/label2/val1/696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a.sub using direct job submission +02/13/25 12:00:22 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:22 TmpDir(12)::Cd2MainDir() +02/13/25 12:00:22 TmpDir(12)::~TmpDir() +02/13/25 12:00:22 assigned HTCondor ID (9233.0.0) +02/13/25 12:00:22 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:22 Just submitted 1 job this cycle... 
+02/13/25 12:00:22 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/13/25 12:00:22 Of 6 nodes total: +02/13/25 12:00:22 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:22 === === === === === === === === +02/13/25 12:00:22 2 0 1 0 0 1 1 1 +02/13/25 12:00:22 0 job proc(s) currently held +02/13/25 12:00:22 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.007405601228986467; EventCycleTimeCount = 35.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.0289842445247662; EventCycleTimeSum = 0.2591960430145264; LogProcessCycleTimeAvg = 0.0002269148826599121; LogProcessCycleTimeCount = 8.0; LogProcessCycleTimeMax = 0.00032806396484375; LogProcessCycleTimeMin = 0.0001490116119384766; LogProcessCycleTimeStd = 7.312682441209861E-05; LogProcessCycleTimeSum = 0.001815319061279297; SleepCycleTimeAvg = 1.001072822298323; SleepCycleTimeCount = 35.0; SleepCycleTimeMax = 1.001123905181885; SleepCycleTimeMin = 1.001029014587402; SleepCycleTimeStd = 2.405737578462968E-05; SleepCycleTimeSum = 35.03754878044128; SubmitCycleTimeAvg = 0.0088220304912991; SubmitCycleTimeCount = 36.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.03047564992574948; SubmitCycleTimeSum = 0.3175930976867676; ] +02/13/25 12:00:23 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:23 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:23 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:23 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:23 Reassigning the id of job 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a from (9233.0.0) to (9233.0.0) +02/13/25 12:00:23 Event: ULOG_SUBMIT for HTCondor Node 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a (9233.0.0) {02/13/25 12:00:22} +02/13/25 12:00:23 Number of idle job procs: 1 +02/13/25 12:00:23 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:23 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:23 Event: ULOG_EXECUTE for HTCondor Node 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a (9233.0.0) {02/13/25 12:00:22} +02/13/25 12:00:23 Number of idle job procs: 0 +02/13/25 12:00:23 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:23 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:24 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:25 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:26 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:27 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:28 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:29 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:30 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:31 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:32 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:33 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:34 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:34 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:34 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:34 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:34 Event: ULOG_JOB_TERMINATED for HTCondor Node 
696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a (9233.0.0) {02/13/25 12:00:33} +02/13/25 12:00:34 Number of idle job procs: 0 +02/13/25 12:00:34 Node 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a job proc (9233.0.0) completed successfully. +02/13/25 12:00:34 Node 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a job completed +02/13/25 12:00:34 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:34 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:34 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/13/25 12:00:34 Of 6 nodes total: +02/13/25 12:00:34 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:34 === === === === === === === === +02/13/25 12:00:34 3 0 0 0 0 1 1 1 +02/13/25 12:00:34 0 job proc(s) currently held +02/13/25 12:00:34 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.007057626196678648; EventCycleTimeCount = 47.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02686973707127884; EventCycleTimeSum = 0.3317084312438965; LogProcessCycleTimeAvg = 0.0002347230911254883; LogProcessCycleTimeCount = 10.0; LogProcessCycleTimeMax = 0.00032806396484375; LogProcessCycleTimeMin = 0.0001490116119384766; LogProcessCycleTimeStd = 6.666586413808144E-05; LogProcessCycleTimeSum = 0.002347230911254883; SleepCycleTimeAvg = 1.001023754160455; SleepCycleTimeCount = 47.0; SleepCycleTimeMax = 1.001123905181885; SleepCycleTimeMin = 1.000128984451294; SleepCycleTimeStd = 0.0001947942048269502; SleepCycleTimeSum = 47.04811644554138; SubmitCycleTimeAvg = 0.006620511412620544; SubmitCycleTimeCount = 48.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.02657975089009677; SubmitCycleTimeSum = 0.3177845478057861; ] +02/13/25 12:00:34 ERROR: the following job(s) failed: +02/13/25 12:00:34 ---------------------- Job ---------------------- +02/13/25 12:00:34 Node Name: 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b +02/13/25 12:00:34 Noop: false +02/13/25 12:00:34 NodeID: 1 +02/13/25 12:00:34 Node Status: STATUS_ERROR +02/13/25 12:00:34 Node return val: 1 +02/13/25 12:00:34 Error: Job proc (9231.0.0) failed with status 1 +02/13/25 12:00:34 Job Submit File: jobs/label1/val1/057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b.sub +02/13/25 12:00:34 HTCondor Job ID: (9231.0.0) +02/13/25 12:00:34 PARENTS: pipetaskInit WAITING: 0 CHILDREN: 40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b +02/13/25 12:00:34 --------------------------------------- +02/13/25 12:00:34 Aborting DAG... +02/13/25 12:00:34 Writing Rescue DAG to /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.rescue001... +02/13/25 12:00:34 Removing submitted jobs... +02/13/25 12:00:34 Removing any/all submitted HTCondor jobs... +02/13/25 12:00:34 Running: /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_rm -const DAGManJobId==9228 -reason DAG' 'Abort:' 'DAG' 'is' 'exiting' 'and' 'writing' 'rescue' 'file. 
+02/13/25 12:00:34 Note: 0 total job deferrals because of -MaxJobs limit (0) +02/13/25 12:00:34 Note: 0 total job deferrals because of -MaxIdle limit (1000) +02/13/25 12:00:34 Note: 0 total job deferrals because of node category throttles +02/13/25 12:00:34 Note: 0 total PRE script deferrals because of -MaxPre limit (20) or DEFER +02/13/25 12:00:34 Note: 0 total POST script deferrals because of -MaxPost limit (20) or DEFER +02/13/25 12:00:34 Note: 0 total HOLD script deferrals because of -MaxHold limit (20) or DEFER +02/13/25 12:00:34 Starting final node... +02/13/25 12:00:35 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:35 Submitting HTCondor Node finalJob job(s)... +02/13/25 12:00:35 TmpDir(13)::TmpDir() +02/13/25 12:00:35 TmpDir(13)::Cd2TmpDir() +02/13/25 12:00:35 Submitting node finalJob from file jobs/finalJob/finalJob.sub using direct job submission +02/13/25 12:00:35 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:35 Submit warning: Submit:0:the line 'concurrency_limit = db_limit' was unused by DAGMAN. Is it a typo? +02/13/25 12:00:35 TmpDir(13)::Cd2MainDir() +02/13/25 12:00:35 TmpDir(13)::~TmpDir() +02/13/25 12:00:35 assigned HTCondor ID (9234.0.0) +02/13/25 12:00:35 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:35 Just submitted 1 job this cycle... +02/13/25 12:00:35 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:35 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:35 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:35 Event: ULOG_JOB_EVICTED for HTCondor Node provisioningJob (9229.0.0) {02/13/25 12:00:34} +02/13/25 12:00:35 Number of idle job procs: 1 +02/13/25 12:00:35 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:35 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:35 Event: ULOG_JOB_ABORTED for HTCondor Node provisioningJob (9229.0.0) {02/13/25 12:00:34} +02/13/25 12:00:35 Number of idle job procs: 0 +02/13/25 12:00:35 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:35 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:35 Reassigning the id of job finalJob from (9234.0.0) to (9234.0.0) +02/13/25 12:00:35 Event: ULOG_SUBMIT for HTCondor Node finalJob (9234.0.0) {02/13/25 12:00:35} +02/13/25 12:00:35 Number of idle job procs: 1 +02/13/25 12:00:35 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:35 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:35 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/13/25 12:00:35 Of 6 nodes total: +02/13/25 12:00:35 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:35 === === === === === === === === +02/13/25 12:00:35 3 0 1 0 0 0 1 1 +02/13/25 12:00:35 0 job proc(s) currently held +02/13/25 12:00:35 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.007057626196678648; EventCycleTimeCount = 47.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02686973707127884; EventCycleTimeSum = 0.3317084312438965; LogProcessCycleTimeAvg = 0.0002348423004150391; LogProcessCycleTimeCount = 11.0; LogProcessCycleTimeMax = 
0.00032806396484375; LogProcessCycleTimeMin = 0.0001490116119384766; LogProcessCycleTimeStd = 6.324602767408629E-05; LogProcessCycleTimeSum = 0.00258326530456543; SleepCycleTimeAvg = 1.02497402826945; SleepCycleTimeCount = 48.0; SleepCycleTimeMax = 2.150636911392212; SleepCycleTimeMin = 1.000128984451294; SleepCycleTimeStd = 0.1659324783532571; SleepCycleTimeSum = 49.19875335693359; SubmitCycleTimeAvg = 0.007984219765176579; SubmitCycleTimeCount = 49.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.02798017300408003; SubmitCycleTimeSum = 0.3912267684936523; ] +02/13/25 12:00:36 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:36 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:36 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:36 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:36 Event: ULOG_EXECUTE for HTCondor Node finalJob (9234.0.0) {02/13/25 12:00:36} +02/13/25 12:00:36 Number of idle job procs: 0 +02/13/25 12:00:36 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:36 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:37 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:38 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:39 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:40 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:41 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:42 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:43 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:44 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:45 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:46 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:47 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:47 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:47 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:47 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:47 Event: ULOG_JOB_TERMINATED for HTCondor Node finalJob (9234.0.0) {02/13/25 12:00:47} +02/13/25 12:00:47 Number of idle job procs: 0 +02/13/25 12:00:47 Node finalJob job proc (9234.0.0) completed successfully. +02/13/25 12:00:47 Node finalJob job completed +02/13/25 12:00:47 Running POST script of Node finalJob... 
+02/13/25 12:00:47 TmpDir(14)::TmpDir() +02/13/25 12:00:47 TmpDir(14)::Cd2TmpDir() +02/13/25 12:00:47 Warning: mysin has length 0 (ignore if produced by DAGMan; see gittrac #4987, #5031) +02/13/25 12:00:47 TmpDir(14)::Cd2MainDir() +02/13/25 12:00:47 TmpDir(14)::~TmpDir() +02/13/25 12:00:47 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:47 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:47 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/13/25 12:00:47 Of 6 nodes total: +02/13/25 12:00:47 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:47 === === === === === === === === +02/13/25 12:00:47 3 0 0 1 0 0 1 1 +02/13/25 12:00:47 0 job proc(s) currently held +02/13/25 12:00:47 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.006893731779971365; EventCycleTimeCount = 59.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02566633038303247; EventCycleTimeSum = 0.4067301750183105; LogProcessCycleTimeAvg = 0.0007870013897235577; LogProcessCycleTimeCount = 13.0; LogProcessCycleTimeMax = 0.007516860961914062; LogProcessCycleTimeMin = 0.0001308917999267578; LogProcessCycleTimeStd = 0.002023099241970908; LogProcessCycleTimeSum = 0.01023101806640625; SleepCycleTimeAvg = 1.020191744963328; SleepCycleTimeCount = 60.0; SleepCycleTimeMax = 2.150636911392212; SleepCycleTimeMin = 1.000128984451294; SleepCycleTimeStd = 0.1484135045635764; SleepCycleTimeSum = 61.21150469779968; SubmitCycleTimeAvg = 0.006416653023391473; SubmitCycleTimeCount = 61.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 4.053115844726562E-06; SubmitCycleTimeStd = 0.02522921207283313; SubmitCycleTimeSum = 0.3914158344268799; ] +02/13/25 12:00:47 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:47 Initializing user log writer for /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log, (9234.0.0) +02/13/25 12:00:47 WriteUserLog::initialize: opened /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log successfully +02/13/25 12:00:47 WriteUserLog::user_priv_flag (~) is 0 +02/13/25 12:00:48 ReadMultipleUserLogs::GetLogStatus() +02/13/25 12:00:48 Currently monitoring 1 HTCondor log file(s) +02/13/25 12:00:48 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:48 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:48 Event: ULOG_POST_SCRIPT_TERMINATED for HTCondor Node finalJob (9234.0.0) {02/13/25 12:00:47} +02/13/25 12:00:48 POST Script of node finalJob failed with status 2 +02/13/25 12:00:48 POST for Node finalJob returned 2 +02/13/25 12:00:48 ReadMultipleUserLogs::readEvent() +02/13/25 12:00:48 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:48 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/13/25 12:00:48 Of 6 nodes total: +02/13/25 12:00:48 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:48 === === === === === === === === +02/13/25 12:00:48 3 0 0 0 0 0 2 1 +02/13/25 12:00:48 0 job proc(s) currently held +02/13/25 12:00:48 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.007063102722167969; EventCycleTimeCount = 60.0; EventCycleTimeMax = 0.1277668476104736; 
EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02548168506656051; EventCycleTimeSum = 0.4237861633300781; LogProcessCycleTimeAvg = 0.0007470675877162389; LogProcessCycleTimeCount = 14.0; LogProcessCycleTimeMax = 0.007516860961914062; LogProcessCycleTimeMin = 0.0001308917999267578; LogProcessCycleTimeStd = 0.001949465478399763; LogProcessCycleTimeSum = 0.01045894622802734; SleepCycleTimeAvg = 1.019881682317765; SleepCycleTimeCount = 61.0; SleepCycleTimeMax = 2.150636911392212; SleepCycleTimeMin = 1.000128984451294; SleepCycleTimeStd = 0.1471914512940767; SleepCycleTimeSum = 62.21278262138367; SubmitCycleTimeAvg = 0.006313339356453188; SubmitCycleTimeCount = 62.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 4.053115844726562E-06; SubmitCycleTimeStd = 0.02503478121336396; SubmitCycleTimeSum = 0.3914270401000977; ] +02/13/25 12:00:48 ERROR: the following job(s) failed: +02/13/25 12:00:48 ---------------------- Job ---------------------- +02/13/25 12:00:48 Node Name: 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b +02/13/25 12:00:48 Noop: false +02/13/25 12:00:48 NodeID: 1 +02/13/25 12:00:48 Node Status: STATUS_ERROR +02/13/25 12:00:48 Node return val: 1 +02/13/25 12:00:48 Error: Job proc (9231.0.0) failed with status 1 +02/13/25 12:00:48 Job Submit File: jobs/label1/val1/057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b.sub +02/13/25 12:00:48 HTCondor Job ID: (9231.0.0) +02/13/25 12:00:48 PARENTS: pipetaskInit WAITING: 0 CHILDREN: 40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b +02/13/25 12:00:48 ---------------------- Job ---------------------- +02/13/25 12:00:48 Node Name: finalJob +02/13/25 12:00:48 Noop: false +02/13/25 12:00:48 NodeID: 5 +02/13/25 12:00:48 Node Status: STATUS_ERROR +02/13/25 12:00:48 Node return val: 2 +02/13/25 12:00:48 Error: Job failed due to DAGMAN error 0 and POST Script failed with status 2 +02/13/25 12:00:48 Job Submit File: jobs/finalJob/finalJob.sub +02/13/25 12:00:48 POST Script: /work/testuser/ctrl_bps_htcondor/python/lsst/ctrl/bps/htcondor/final_post.sh finalJob $DAG_STATUS $RETURN +02/13/25 12:00:48 HTCondor Job ID: (9234.0.0) +02/13/25 12:00:48 PARENTS: WAITING: 0 CHILDREN: +02/13/25 12:00:48 --------------------------------------- +02/13/25 12:00:48 Aborting DAG... +02/13/25 12:00:48 Writing Rescue DAG to /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.rescue001... +02/13/25 12:00:48 Removing submitted jobs... +02/13/25 12:00:48 Removing any/all submitted HTCondor jobs... +02/13/25 12:00:48 Running: /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_rm -const DAGManJobId==9228 -reason DAG' 'Abort:' 'DAG' 'is' 'exiting' 'and' 'writing' 'rescue' 'file. 
+02/13/25 12:00:49 Note: 0 total job deferrals because of -MaxJobs limit (0) +02/13/25 12:00:49 Note: 0 total job deferrals because of -MaxIdle limit (1000) +02/13/25 12:00:49 Note: 0 total job deferrals because of node category throttles +02/13/25 12:00:49 Note: 0 total PRE script deferrals because of -MaxPre limit (20) or DEFER +02/13/25 12:00:49 Note: 0 total POST script deferrals because of -MaxPost limit (20) or DEFER +02/13/25 12:00:49 Note: 0 total HOLD script deferrals because of -MaxHold limit (20) or DEFER +02/13/25 12:00:49 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/13/25 12:00:49 Of 6 nodes total: +02/13/25 12:00:49 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 12:00:49 === === === === === === === === +02/13/25 12:00:49 3 0 0 0 0 0 2 1 +02/13/25 12:00:49 0 job proc(s) currently held +02/13/25 12:00:49 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.007063102722167969; EventCycleTimeCount = 60.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02548168506656051; EventCycleTimeSum = 0.4237861633300781; LogProcessCycleTimeAvg = 0.0007470675877162389; LogProcessCycleTimeCount = 14.0; LogProcessCycleTimeMax = 0.007516860961914062; LogProcessCycleTimeMin = 0.0001308917999267578; LogProcessCycleTimeStd = 0.001949465478399763; LogProcessCycleTimeSum = 0.01045894622802734; SleepCycleTimeAvg = 1.019881682317765; SleepCycleTimeCount = 61.0; SleepCycleTimeMax = 2.150636911392212; SleepCycleTimeMin = 1.000128984451294; SleepCycleTimeStd = 0.1471914512940767; SleepCycleTimeSum = 62.21278262138367; SubmitCycleTimeAvg = 0.006313339356453188; SubmitCycleTimeCount = 62.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 4.053115844726562E-06; SubmitCycleTimeStd = 0.02503478121336396; SubmitCycleTimeSum = 0.3914270401000977; ] +02/13/25 12:00:49 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 12:00:49 Wrote metrics file /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.metrics. 
+02/13/25 12:00:49 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.007063102722167969; EventCycleTimeCount = 60.0; EventCycleTimeMax = 0.1277668476104736; EventCycleTimeMin = 3.981590270996094E-05; EventCycleTimeStd = 0.02548168506656051; EventCycleTimeSum = 0.4237861633300781; LogProcessCycleTimeAvg = 0.0007470675877162389; LogProcessCycleTimeCount = 14.0; LogProcessCycleTimeMax = 0.007516860961914062; LogProcessCycleTimeMin = 0.0001308917999267578; LogProcessCycleTimeStd = 0.001949465478399763; LogProcessCycleTimeSum = 0.01045894622802734; SleepCycleTimeAvg = 1.019881682317765; SleepCycleTimeCount = 61.0; SleepCycleTimeMax = 2.150636911392212; SleepCycleTimeMin = 1.000128984451294; SleepCycleTimeStd = 0.1471914512940767; SleepCycleTimeSum = 62.21278262138367; SubmitCycleTimeAvg = 0.006313339356453188; SubmitCycleTimeCount = 62.0; SubmitCycleTimeMax = 0.1274127960205078; SubmitCycleTimeMin = 4.053115844726562E-06; SubmitCycleTimeStd = 0.02503478121336396; SubmitCycleTimeSum = 0.3914270401000977; ] +02/13/25 12:00:49 ReadMultipleUserLogs::unmonitorLogFile(/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log) +02/13/25 12:00:49 ReadMultipleUserLogs: found LogFileMonitor object for /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log (64770:484441158) +02/13/25 12:00:49 Closing file +02/13/25 12:00:49 ReadMultipleUserLogs: removed log file /work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag.nodes.log (64770:484441158) from active list +02/13/25 12:00:49 **** condor_scheduniv_exec.9228.0 (condor_DAGMAN) pid 61169 EXITING WITH STATUS 1 diff --git a/tests/data/tiny_problems/tiny_problems.dag.metrics b/tests/data/tiny_problems/tiny_problems.dag.metrics new file mode 100644 index 0000000..d30c6a1 --- /dev/null +++ b/tests/data/tiny_problems/tiny_problems.dag.metrics @@ -0,0 +1,25 @@ +{ + "client":"condor_dagman", + "version":"23.0.3", + "planner":"", + "planner_version":"", + "type":"metrics", + "wf_uuid":"", + "root_wf_uuid":"", + "start_time":1739469586.220, + "end_time":1739469649.054, + "duration":62.834, + "exitcode":1, + "dagman_id":"9228", + "parent_dagman_id":"", + "rescue_dag_number":0, + "jobs":6, + "jobs_failed":3, + "jobs_succeeded":3, + "dag_jobs":0, + "dag_jobs_failed":0, + "dag_jobs_succeeded":0, + "total_jobs":6, + "total_jobs_run":6, + "DagStatus":2 +} diff --git a/tests/data/tiny_problems/tiny_problems.dag.nodes.log b/tests/data/tiny_problems/tiny_problems.dag.nodes.log new file mode 100644 index 0000000..38d8d52 --- /dev/null +++ b/tests/data/tiny_problems/tiny_problems.dag.nodes.log @@ -0,0 +1,174 @@ +000 (9229.000.000) 2025-02-13 11:59:47 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: provisioningJob +... +000 (9230.000.000) 2025-02-13 11:59:47 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: pipetaskInit +... +001 (9229.000.000) 2025-02-13 11:59:51 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=starter_2644_afe5_30> +... 
+001 (9230.000.000) 2025-02-13 11:59:55 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1499_c48e> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_61182" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9230.000.000) 2025-02-13 12:00:07 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:05, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:05, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 0 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 83 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 12 + TimeSlotBusy (s) : 12 + + Job terminated of its own accord at 2025-02-13T18:00:07Z with exit-code 0. +... +000 (9231.000.000) 2025-02-13 12:00:09 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: 057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b +... +000 (9232.000.000) 2025-02-13 12:00:09 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: 4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a +... +001 (9231.000.000) 2025-02-13 12:00:09 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1499_c48e> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_61212" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +001 (9232.000.000) 2025-02-13 12:00:11 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1499_c48e> + SlotName: slot1_2@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_61228" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9231.000.000) 2025-02-13 12:00:19 Job terminated. + (1) Normal termination (return value 1) + Usr 0 00:00:04, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:04, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 6249 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 6249 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 90 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 10 + TimeSlotBusy (s) : 10 + + Job terminated of its own accord at 2025-02-13T18:00:19Z with exit-code 1. +... +005 (9232.000.000) 2025-02-13 12:00:21 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:04, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:04, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 2909 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 2909 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 86 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 10 + TimeSlotBusy (s) : 10 + + Job terminated of its own accord at 2025-02-13T18:00:21Z with exit-code 0. +... 
+000 (9233.000.000) 2025-02-13 12:00:22 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: 696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a +... +001 (9233.000.000) 2025-02-13 12:00:22 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1499_c48e> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_61249" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9233.000.000) 2025-02-13 12:00:33 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:05, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:05, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 2846 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 2846 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 87 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 11 + TimeSlotBusy (s) : 11 + + Job terminated of its own accord at 2025-02-13T18:00:33Z with exit-code 0. +... +004 (9229.000.000) 2025-02-13 12:00:34 Job was evicted. + (0) CPU times + Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job +... +009 (9229.000.000) 2025-02-13 12:00:34 Job was aborted. + DAG Abort: DAG is exiting and writing rescue file. (by user testuser) +... +000 (9234.000.000) 2025-02-13 12:00:35 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: finalJob +... +001 (9234.000.000) 2025-02-13 12:00:36 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1499_c48e> + SlotName: slot1_2@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_61270" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9234.000.000) 2025-02-13 12:00:47 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:05, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:05, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 6204 - Run Bytes Sent By Job + 214 - Run Bytes Received By Job + 6204 - Total Bytes Sent By Job + 214 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 89 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 11 + TimeSlotBusy (s) : 12 + + Job terminated of its own accord at 2025-02-13T18:00:47Z with exit-code 0. +... +016 (9234.000.000) 2025-02-13 12:00:47 POST Script terminated. + (1) Normal termination (return value 2) + DAG Node: finalJob +... 
diff --git a/tests/data/tiny_problems/tiny_problems.info.json b/tests/data/tiny_problems/tiny_problems.info.json new file mode 100644 index 0000000..c62ce74 --- /dev/null +++ b/tests/data/tiny_problems/tiny_problems.info.json @@ -0,0 +1 @@ +{"test02": {"9228.0": {"ClusterId": 9228, "GlobalJobId": "test02#9228.0#1739469586", "bps_wms_service": "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorService", "bps_project": "dev", "bps_payload": "tiny", "bps_operator": "testuser", "bps_wms_workflow": "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorWorkflow", "bps_provisioning_job": "provisioningJob", "bps_run_quanta": "label1:2;label2:2", "bps_campaign": "tiny", "bps_runsite": "testpool", "bps_job_summary": "pipetaskInit:1;label1:2;label2:2;finalJob:1", "bps_run": "u_testuser_tiny_20250213T175935Z", "bps_isjob": "True"}}} diff --git a/tests/data/tiny_problems/tiny_problems.node_status b/tests/data/tiny_problems/tiny_problems.node_status new file mode 100644 index 0000000..5dde19c --- /dev/null +++ b/tests/data/tiny_problems/tiny_problems.node_status @@ -0,0 +1,78 @@ +[ + Type = "DagStatus"; + DagFiles = { + "/work/testuser/submit/u/testuser/tiny/20250213T175935Z/u_testuser_tiny_20250213T175935Z.dag" + }; + Timestamp = 1739469648; /* "Thu Feb 13 12:00:48 2025" */ + DagStatus = 6; /* "STATUS_ERROR (failed)" */ + NodesTotal = 6; + NodesDone = 3; + NodesPre = 0; + NodesQueued = 0; + NodesPost = 0; + NodesReady = 0; + NodesUnready = 0; + NodesFutile = 1; + NodesFailed = 2; + JobProcsHeld = 0; + JobProcsIdle = 0; /* includes held */ +] +[ + Type = "NodeStatus"; + Node = "pipetaskInit"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b"; + NodeStatus = 6; /* "STATUS_ERROR" */ + StatusDetails = "Job proc (9231.0.0) failed with status 1"; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b"; + NodeStatus = 7; /* "STATUS_FUTILE" */ + StatusDetails = "Had an ancestor node fail"; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "finalJob"; + NodeStatus = 6; /* "STATUS_ERROR" */ + StatusDetails = "Job failed due to DAGMAN error 0 and POST Script failed with status 2"; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "StatusEnd"; + EndTime = 1739469648; /* "Thu Feb 13 12:00:48 2025" */ + NextUpdate = 0; /* "none" */ +] diff --git a/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag new file mode 100644 index 0000000..a6cc1d3 --- /dev/null +++ b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag @@ -0,0 +1,22 @@ +JOB pipetaskInit "jobs/pipetaskInit/pipetaskInit.sub" +JOB 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 "jobs/label1/val1/78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2.sub" +JOB 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 
"jobs/label2/val1/98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2.sub" +PARENT pipetaskInit CHILD 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 +PARENT 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 CHILD 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 +DOT u_testuser_tiny_20250212T200412Z.dot +NODE_STATUS_FILE u_testuser_tiny_20250212T200412Z.node_status +SET_JOB_ATTR bps_isjob= "True" +SET_JOB_ATTR bps_project= "dev" +SET_JOB_ATTR bps_campaign= "quick" +SET_JOB_ATTR bps_run= "u_testuser_tiny_20250212T200412Z" +SET_JOB_ATTR bps_operator= "testuser" +SET_JOB_ATTR bps_payload= "tiny" +SET_JOB_ATTR bps_runsite= "testpool" +SET_JOB_ATTR bps_wms_service= "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorService" +SET_JOB_ATTR bps_wms_workflow= "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorWorkflow" +SET_JOB_ATTR bps_run_quanta= "label1:1;label2:1" +SET_JOB_ATTR bps_job_summary= "pipetaskInit:1;label1:1;label2:1;finalJob:1" +SET_JOB_ATTR bps_provisioning_job= "provisioningJob" +FINAL finalJob jobs/finalJob/finalJob.sub +SCRIPT POST finalJob /work/testuser/ctrl_bps_htcondor/python/lsst/ctrl/bps/htcondor/final_post.sh finalJob $DAG_STATUS $RETURN +SERVICE provisioningJob jobs/provisioningJob/provisioningJob.sub diff --git a/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.dagman.log b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.dagman.log new file mode 100644 index 0000000..6ee9dcc --- /dev/null +++ b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.dagman.log @@ -0,0 +1,15 @@ +000 (9198.000.000) 2025-02-12 14:04:26 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1514_0e79> +... +001 (9198.000.000) 2025-02-12 14:04:26 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1514_0e79> +... +005 (9198.000.000) 2025-02-12 14:06:58 Job terminated. + (1) Normal termination (return value 1) + Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 0 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job +... 
diff --git a/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.dagman.out b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.dagman.out new file mode 100644 index 0000000..2fec1fe --- /dev/null +++ b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.dagman.out @@ -0,0 +1,622 @@ +02/12/25 14:04:27 Result of reading /etc/issue: \S + +02/12/25 14:04:27 Result of reading /etc/redhat-release: AlmaLinux release 9.5 (Teal Serval) + +02/12/25 14:04:27 Using IDs: 20 processors, 10 CPUs, 10 HTs +02/12/25 14:04:27 Enumerating interfaces: lo 127.0.0.1 up +02/12/25 14:04:27 Enumerating interfaces: enp11s0 10.0.0.33 up +02/12/25 14:04:27 Enumerating interfaces: lo ::1 up +02/12/25 14:04:27 Enumerating interfaces: enp11s0 2601:248:8500:b50::d234 up +02/12/25 14:04:27 Enumerating interfaces: enp11s0 2601:248:8500:b50:b696:91ff:fe06:4d05 up +02/12/25 14:04:27 Enumerating interfaces: enp11s0 fe80::b696:91ff:fe06:4d05 up +02/12/25 14:04:27 Directory::Rewind(): path "/work/lsst_stack/w_2025_06/conda/envs/lsst-scipipe-9.0.0/etc/condor/config.d" does not exist (yet) +02/12/25 14:04:27 Cannot open /work/lsst_stack/w_2025_06/conda/envs/lsst-scipipe-9.0.0/etc/condor/config.d: No such file or directory +02/12/25 14:04:27 ****************************************************** +02/12/25 14:04:27 ** condor_scheduniv_exec.9198.0 (CONDOR_DAGMAN) STARTING UP +02/12/25 14:04:27 ** /work/lsst_stack/w_2025_06/conda/envs/lsst-scipipe-9.0.0/bin/condor_dagman +02/12/25 14:04:27 ** SubsystemInfo: name=DAGMAN type=DAGMAN(9) class=CLIENT(2) +02/12/25 14:04:27 ** Configuration: subsystem:DAGMAN local: class:CLIENT +02/12/25 14:04:27 ** $CondorVersion: 23.0.3 2024-04-04 $ +02/12/25 14:04:27 ** $CondorPlatform: X86_64-CentOS_7.9 $ +02/12/25 14:04:27 ** PID = 35494 +02/12/25 14:04:27 ** Log last touched time unavailable (No such file or directory) +02/12/25 14:04:27 ****************************************************** +02/12/25 14:04:27 Using config source: /work/lsst_stack/w_2025_06/conda/envs/lsst-scipipe-9.0.0/etc/condor/condor_config +02/12/25 14:04:27 Using local config sources: +02/12/25 14:04:27 /etc/condor/condor_config +02/12/25 14:04:27 /etc/condor/config.d/00-minicondor +02/12/25 14:04:27 /etc/condor/config.d/00-security +02/12/25 14:04:27 /etc/condor/config.d/10-stash-plugin.conf +02/12/25 14:04:27 /etc/condor/config.d/99-lsst +02/12/25 14:04:27 /etc/condor/condor_config.local +02/12/25 14:04:27 /home/testuser/.condor/user_config +02/12/25 14:04:27 config Macros = 101, Sorted = 101, StringBytes = 3188, TablesBytes = 3724 +02/12/25 14:04:27 CLASSAD_CACHING is ENABLED +02/12/25 14:04:27 Daemon Log is logging: D_ALWAYS:2 D_ERROR D_STATUS +02/12/25 14:04:27 Internal pipe for signals resized to 4096 from 65536 +02/12/25 14:04:27 DaemonCore: No command port requested. +02/12/25 14:04:27 Setting maximum accepts per cycle 8. +02/12/25 14:04:27 Setting maximum UDP messages per cycle 100. 
+02/12/25 14:04:27 Will use TCP to update collector <10.0.0.33:9618> +02/12/25 14:04:27 Not using shared port because no command port requested +02/12/25 14:04:27 DAGMAN_USE_STRICT setting: 1 +02/12/25 14:04:27 DAGMAN_VERBOSITY setting: 3 +02/12/25 14:04:27 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 +02/12/25 14:04:27 DAGMAN_DEBUG_CACHE_ENABLE setting: False +02/12/25 14:04:27 DAGMAN_SUBMIT_DELAY setting: 0 +02/12/25 14:04:27 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 +02/12/25 14:04:27 DAGMAN_STARTUP_CYCLE_DETECT setting: False +02/12/25 14:04:27 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 100 +02/12/25 14:04:27 DAGMAN_AGGRESSIVE_SUBMIT setting: False +02/12/25 14:04:27 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 1 +02/12/25 14:04:27 DAGMAN_QUEUE_UPDATE_INTERVAL setting: 30 +02/12/25 14:04:27 DAGMAN_DEFAULT_PRIORITY setting: 0 +02/12/25 14:04:27 DAGMAN_SUPPRESS_NOTIFICATION setting: True +02/12/25 14:04:27 allow_events (DAGMAN_ALLOW_EVENTS) setting: 114 +02/12/25 14:04:27 DAGMAN_RETRY_SUBMIT_FIRST setting: True +02/12/25 14:04:27 DAGMAN_RETRY_NODE_FIRST setting: False +02/12/25 14:04:27 DAGMAN_MAX_JOBS_IDLE setting: 1000 +02/12/25 14:04:27 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 +02/12/25 14:04:27 DAGMAN_MAX_PRE_SCRIPTS setting: 20 +02/12/25 14:04:27 DAGMAN_MAX_POST_SCRIPTS setting: 20 +02/12/25 14:04:27 DAGMAN_MAX_HOLD_SCRIPTS setting: 20 +02/12/25 14:04:27 DAGMAN_MUNGE_NODE_NAMES setting: True +02/12/25 14:04:27 DAGMAN_PROHIBIT_MULTI_JOBS setting: False +02/12/25 14:04:27 DAGMAN_SUBMIT_DEPTH_FIRST setting: False +02/12/25 14:04:27 DAGMAN_ALWAYS_RUN_POST setting: False +02/12/25 14:04:27 DAGMAN_CONDOR_SUBMIT_EXE setting: /work/lsst_stack/w_2025_06/conda/envs/lsst-scipipe-9.0.0/bin/condor_submit +02/12/25 14:04:27 DAGMAN_USE_DIRECT_SUBMIT setting: True +02/12/25 14:04:27 DAGMAN_DEFAULT_APPEND_VARS setting: False +02/12/25 14:04:27 DAGMAN_ABORT_DUPLICATES setting: True +02/12/25 14:04:27 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True +02/12/25 14:04:27 DAGMAN_PENDING_REPORT_INTERVAL setting: 60 +02/12/25 14:04:27 DAGMAN_AUTO_RESCUE setting: True +02/12/25 14:04:27 DAGMAN_MAX_RESCUE_NUM setting: 100 +02/12/25 14:04:27 DAGMAN_WRITE_PARTIAL_RESCUE setting: True +02/12/25 14:04:27 DAGMAN_DEFAULT_NODE_LOG setting: @(DAG_DIR)/@(DAG_FILE).nodes.log +02/12/25 14:04:27 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True +02/12/25 14:04:27 DAGMAN_MAX_JOB_HOLDS setting: 100 +02/12/25 14:04:27 DAGMAN_HOLD_CLAIM_TIME setting: 20 +02/12/25 14:04:27 ALL_DEBUG setting: D_FULLDEBUG +02/12/25 14:04:27 DAGMAN_DEBUG setting: +02/12/25 14:04:27 DAGMAN_SUPPRESS_JOB_LOGS setting: False +02/12/25 14:04:27 DAGMAN_REMOVE_NODE_JOBS setting: True +02/12/25 14:04:27 DAGMAN will adjust edges after parsing +02/12/25 14:04:27 argv[0] == "condor_scheduniv_exec.9198.0" +02/12/25 14:04:27 argv[1] == "-Lockfile" +02/12/25 14:04:27 argv[2] == "/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.lock" +02/12/25 14:04:27 argv[3] == "-AutoRescue" +02/12/25 14:04:27 argv[4] == "1" +02/12/25 14:04:27 argv[5] == "-DoRescueFrom" +02/12/25 14:04:27 argv[6] == "0" +02/12/25 14:04:27 argv[7] == "-Dag" +02/12/25 14:04:27 argv[8] == "/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag" +02/12/25 14:04:27 argv[9] == "-Suppress_notification" +02/12/25 14:04:27 argv[10] == "-CsdVersion" +02/12/25 14:04:27 argv[11] == "$CondorVersion: 23.0.3 2024-04-04 $" +02/12/25 14:04:27 argv[12] == "-Dagman" +02/12/25 14:04:27 argv[13] == 
"/work/lsst_stack/w_2025_06/conda/envs/lsst-scipipe-9.0.0/bin/condor_dagman" +02/12/25 14:04:27 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:04:27 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:04:27 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:04:27 Workflow batch-id: <9198.0> +02/12/25 14:04:27 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:04:27 Workflow batch-name: +02/12/25 14:04:27 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:04:27 Workflow accounting_group: <> +02/12/25 14:04:27 Workflow accounting_group_user: <> +02/12/25 14:04:27 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:04:27 Warning: failed to get attribute DAGNodeName +02/12/25 14:04:27 DAGMAN_LOG_ON_NFS_IS_ERROR setting: False +02/12/25 14:04:27 Default node log file is: +02/12/25 14:04:27 DAG Lockfile will be written to /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.lock +02/12/25 14:04:27 DAG Input file is /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag +02/12/25 14:04:27 Parsing 1 dagfiles +02/12/25 14:04:27 Parsing /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag ... +02/12/25 14:04:27 TmpDir(0)::TmpDir() +02/12/25 14:04:27 TmpDir(1)::TmpDir() +02/12/25 14:04:27 TmpDir(1)::Cd2MainDir() +02/12/25 14:04:27 TmpDir(1)::~TmpDir() +02/12/25 14:04:27 TmpDir(2)::TmpDir() +02/12/25 14:04:27 TmpDir(2)::Cd2MainDir() +02/12/25 14:04:27 TmpDir(2)::~TmpDir() +02/12/25 14:04:27 TmpDir(3)::TmpDir() +02/12/25 14:04:27 TmpDir(3)::Cd2MainDir() +02/12/25 14:04:27 TmpDir(3)::~TmpDir() +02/12/25 14:04:27 TmpDir(4)::TmpDir() +02/12/25 14:04:27 TmpDir(4)::Cd2MainDir() +02/12/25 14:04:27 TmpDir(4)::~TmpDir() +02/12/25 14:04:27 TmpDir(5)::TmpDir() +02/12/25 14:04:27 TmpDir(5)::Cd2MainDir() +02/12/25 14:04:27 TmpDir(5)::~TmpDir() +02/12/25 14:04:27 TmpDir(0)::~TmpDir() +02/12/25 14:04:27 Adjusting edges +02/12/25 14:04:27 Dag contains 4 total jobs +02/12/25 14:04:27 Bootstrapping... 
+02/12/25 14:04:27 Number of pre-completed nodes: 0 +02/12/25 14:04:27 ReadMultipleUserLogs::monitorLogFile(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log, 1) +02/12/25 14:04:27 MultiLogFiles::InitializeFile(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log, 0) +02/12/25 14:04:27 ReadMultipleUserLogs: didn't find LogFileMonitor object for /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log (64770:22095619392) +02/12/25 14:04:27 MultiLogFiles::InitializeFile(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log, 1) +02/12/25 14:04:27 MultiLogFiles: truncating log file /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log +02/12/25 14:04:27 ReadMultipleUserLogs: created LogFileMonitor object for log file /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log +02/12/25 14:04:27 init: Opening file /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log +02/12/25 14:04:27 Opening log file #0 '/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log' (is_lock_cur=false,seek=false,read_header=true) +02/12/25 14:04:27 Error, apparently invalid user log file +02/12/25 14:04:27 ReadMultipleUserLogs: added log file /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log (64770:22095619392) to active list +02/12/25 14:04:27 Starting service node provisioningJob... +02/12/25 14:04:27 DAG status: 0 (DAG_STATUS_OK) +02/12/25 14:04:27 Of 4 nodes total: +02/12/25 14:04:27 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:04:27 === === === === === === === === +02/12/25 14:04:27 0 0 0 0 1 3 0 0 +02/12/25 14:04:27 0 job proc(s) currently held +02/12/25 14:04:27 DAGMan Runtime Statistics: [ EventCycleTimeCount = 0.0; EventCycleTimeSum = 0.0; LogProcessCycleTimeCount = 0.0; LogProcessCycleTimeSum = 0.0; SleepCycleTimeCount = 0.0; SleepCycleTimeSum = 0.0; SubmitCycleTimeCount = 0.0; SubmitCycleTimeSum = 0.0; ] +02/12/25 14:04:27 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:04:27 Registering condor_event_timer... +02/12/25 14:04:28 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:28 Submitting HTCondor Node provisioningJob job(s)... +02/12/25 14:04:28 TmpDir(6)::TmpDir() +02/12/25 14:04:28 TmpDir(6)::Cd2TmpDir() +02/12/25 14:04:28 Submitting node provisioningJob from file jobs/provisioningJob/provisioningJob.sub using direct job submission +02/12/25 14:04:28 ERROR: submit attempt failed, errno=2 No such file or directory +02/12/25 14:04:28 could not open submit file : jobs/provisioningJob/provisioningJob.sub - can't open file +02/12/25 14:04:28 Submit warning: Submit:0:the line 'DAG_PARENT_NAMES = ' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_id = 9198.0' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_name = u_testuser_tiny_20250212T200412Z.dag+9198' was unused by DAGMAN. Is it a typo? +02/12/25 14:04:28 TmpDir(6)::Cd2MainDir() +02/12/25 14:04:28 TmpDir(6)::~TmpDir() +02/12/25 14:04:28 Job submit try 1/6 failed, will try again in >= 1 second. 
+02/12/25 14:04:28 DAG status: 0 (DAG_STATUS_OK) +02/12/25 14:04:28 Of 4 nodes total: +02/12/25 14:04:28 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:04:28 === === === === === === === === +02/12/25 14:04:28 0 0 0 0 1 3 0 0 +02/12/25 14:04:28 0 job proc(s) currently held +02/12/25 14:04:28 DAGMan Runtime Statistics: [ EventCycleTimeCount = 0.0; EventCycleTimeSum = 0.0; LogProcessCycleTimeCount = 0.0; LogProcessCycleTimeSum = 0.0; SleepCycleTimeCount = 0.0; SleepCycleTimeSum = 0.0; SubmitCycleTimeAvg = 0.0004110336303710938; SubmitCycleTimeCount = 1.0; SubmitCycleTimeMax = 0.0004110336303710938; SubmitCycleTimeMin = 0.0004110336303710938; SubmitCycleTimeStd = 0.0004110336303710938; SubmitCycleTimeSum = 0.0004110336303710938; ] +02/12/25 14:04:29 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:29 Submitting HTCondor Node provisioningJob job(s)... +02/12/25 14:04:29 TmpDir(7)::TmpDir() +02/12/25 14:04:29 TmpDir(7)::Cd2TmpDir() +02/12/25 14:04:29 Submitting node provisioningJob from file jobs/provisioningJob/provisioningJob.sub using direct job submission +02/12/25 14:04:29 ERROR: submit attempt failed, errno=2 No such file or directory +02/12/25 14:04:29 could not open submit file : jobs/provisioningJob/provisioningJob.sub - can't open file +02/12/25 14:04:29 Submit warning: Submit:0:the line 'DAG_PARENT_NAMES = ' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_id = 9198.0' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_name = u_testuser_tiny_20250212T200412Z.dag+9198' was unused by DAGMAN. Is it a typo? +02/12/25 14:04:29 TmpDir(7)::Cd2MainDir() +02/12/25 14:04:29 TmpDir(7)::~TmpDir() +02/12/25 14:04:29 Job submit try 2/6 failed, will try again in >= 2 seconds. +02/12/25 14:04:30 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:32 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:32 Submitting HTCondor Node provisioningJob job(s)... +02/12/25 14:04:32 TmpDir(8)::TmpDir() +02/12/25 14:04:32 TmpDir(8)::Cd2TmpDir() +02/12/25 14:04:32 Submitting node provisioningJob from file jobs/provisioningJob/provisioningJob.sub using direct job submission +02/12/25 14:04:32 ERROR: submit attempt failed, errno=2 No such file or directory +02/12/25 14:04:32 could not open submit file : jobs/provisioningJob/provisioningJob.sub - can't open file +02/12/25 14:04:32 Submit warning: Submit:0:the line 'DAG_PARENT_NAMES = ' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_id = 9198.0' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_name = u_testuser_tiny_20250212T200412Z.dag+9198' was unused by DAGMAN. Is it a typo? +02/12/25 14:04:32 TmpDir(8)::Cd2MainDir() +02/12/25 14:04:32 TmpDir(8)::~TmpDir() +02/12/25 14:04:32 Job submit try 3/6 failed, will try again in >= 4 seconds. +02/12/25 14:04:33 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:35 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:37 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:37 Submitting HTCondor Node provisioningJob job(s)... +02/12/25 14:04:37 TmpDir(9)::TmpDir() +02/12/25 14:04:37 TmpDir(9)::Cd2TmpDir() +02/12/25 14:04:37 Submitting node provisioningJob from file jobs/provisioningJob/provisioningJob.sub using direct job submission +02/12/25 14:04:37 ERROR: submit attempt failed, errno=2 No such file or directory +02/12/25 14:04:37 could not open submit file : jobs/provisioningJob/provisioningJob.sub - can't open file +02/12/25 14:04:37 Submit warning: Submit:0:the line 'DAG_PARENT_NAMES = ' was unused by DAGMAN. Is it a typo? 
+|Submit:0:the line 'batch_id = 9198.0' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_name = u_testuser_tiny_20250212T200412Z.dag+9198' was unused by DAGMAN. Is it a typo? +02/12/25 14:04:37 TmpDir(9)::Cd2MainDir() +02/12/25 14:04:37 TmpDir(9)::~TmpDir() +02/12/25 14:04:37 Job submit try 4/6 failed, will try again in >= 8 seconds. +02/12/25 14:04:38 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:40 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:42 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:44 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:46 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:46 Submitting HTCondor Node provisioningJob job(s)... +02/12/25 14:04:46 TmpDir(10)::TmpDir() +02/12/25 14:04:46 TmpDir(10)::Cd2TmpDir() +02/12/25 14:04:46 Submitting node provisioningJob from file jobs/provisioningJob/provisioningJob.sub using direct job submission +02/12/25 14:04:46 ERROR: submit attempt failed, errno=2 No such file or directory +02/12/25 14:04:46 could not open submit file : jobs/provisioningJob/provisioningJob.sub - can't open file +02/12/25 14:04:46 Submit warning: Submit:0:the line 'DAG_PARENT_NAMES = ' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_id = 9198.0' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_name = u_testuser_tiny_20250212T200412Z.dag+9198' was unused by DAGMAN. Is it a typo? +02/12/25 14:04:46 TmpDir(10)::Cd2MainDir() +02/12/25 14:04:46 TmpDir(10)::~TmpDir() +02/12/25 14:04:46 Job submit try 5/6 failed, will try again in >= 16 seconds. +02/12/25 14:04:47 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:49 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:51 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:53 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:55 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:57 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:04:58 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:04:59 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:01 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:03 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:03 Submitting HTCondor Node provisioningJob job(s)... +02/12/25 14:05:03 TmpDir(11)::TmpDir() +02/12/25 14:05:03 TmpDir(11)::Cd2TmpDir() +02/12/25 14:05:03 Submitting node provisioningJob from file jobs/provisioningJob/provisioningJob.sub using direct job submission +02/12/25 14:05:03 ERROR: submit attempt failed, errno=2 No such file or directory +02/12/25 14:05:03 could not open submit file : jobs/provisioningJob/provisioningJob.sub - can't open file +02/12/25 14:05:03 Submit warning: Submit:0:the line 'DAG_PARENT_NAMES = ' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_id = 9198.0' was unused by DAGMAN. Is it a typo? +|Submit:0:the line 'batch_name = u_testuser_tiny_20250212T200412Z.dag+9198' was unused by DAGMAN. Is it a typo? +02/12/25 14:05:03 TmpDir(11)::Cd2MainDir() +02/12/25 14:05:03 TmpDir(11)::~TmpDir() +02/12/25 14:05:03 Job submit failed after 6 tries. +02/12/25 14:05:03 Shortcutting node provisioningJob retries because of submit failure(s) +02/12/25 14:05:04 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:04 Submitting HTCondor Node pipetaskInit job(s)... 
+02/12/25 14:05:04 TmpDir(12)::TmpDir() +02/12/25 14:05:04 TmpDir(12)::Cd2TmpDir() +02/12/25 14:05:04 Submitting node pipetaskInit from file jobs/pipetaskInit/pipetaskInit.sub using direct job submission +02/12/25 14:05:04 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:05:04 TmpDir(12)::Cd2MainDir() +02/12/25 14:05:04 TmpDir(12)::~TmpDir() +02/12/25 14:05:04 assigned HTCondor ID (9199.0.0) +02/12/25 14:05:04 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:05:04 Just submitted 1 job this cycle... +02/12/25 14:05:04 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:05:04 Of 4 nodes total: +02/12/25 14:05:04 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:05:04 === === === === === === === === +02/12/25 14:05:04 0 0 1 0 0 3 0 0 +02/12/25 14:05:04 0 job proc(s) currently held +02/12/25 14:05:04 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.7155343350910005; EventCycleTimeCount = 21.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 0.0002870559692382812; EventCycleTimeStd = 0.4634626065431778; EventCycleTimeSum = 15.02622103691101; LogProcessCycleTimeCount = 0.0; LogProcessCycleTimeSum = 0.0; SleepCycleTimeAvg = 1.001063187917073; SleepCycleTimeCount = 21.0; SleepCycleTimeMax = 1.001128911972046; SleepCycleTimeMin = 1.000479936599731; SleepCycleTimeStd = 0.0001354056407465115; SleepCycleTimeSum = 21.02232694625854; SubmitCycleTimeAvg = 0.6845031868327748; SubmitCycleTimeCount = 22.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 0.0002090930938720703; SubmitCycleTimeStd = 0.4742877991758767; SubmitCycleTimeSum = 15.05907011032104; ] +02/12/25 14:05:05 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:05 Currently monitoring 1 HTCondor log file(s) +02/12/25 14:05:05 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:05 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:05 Reassigning the id of job pipetaskInit from (9199.0.0) to (9199.0.0) +02/12/25 14:05:05 Event: ULOG_SUBMIT for HTCondor Node pipetaskInit (9199.0.0) {02/12/25 14:05:04} +02/12/25 14:05:05 Number of idle job procs: 1 +02/12/25 14:05:05 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:05 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:05 Event: ULOG_EXECUTE for HTCondor Node pipetaskInit (9199.0.0) {02/12/25 14:05:04} +02/12/25 14:05:05 Number of idle job procs: 0 +02/12/25 14:05:05 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:05 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:06 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:07 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:08 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:09 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:10 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:11 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:12 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:13 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:14 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:15 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:16 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:17 ReadMultipleUserLogs::GetLogStatus() +02/12/25 
14:05:17 Currently monitoring 1 HTCondor log file(s) +02/12/25 14:05:17 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:17 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:17 Event: ULOG_JOB_TERMINATED for HTCondor Node pipetaskInit (9199.0.0) {02/12/25 14:05:16} +02/12/25 14:05:17 Number of idle job procs: 0 +02/12/25 14:05:17 Node pipetaskInit job proc (9199.0.0) completed successfully. +02/12/25 14:05:17 Node pipetaskInit job completed +02/12/25 14:05:17 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:17 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:17 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:05:17 Of 4 nodes total: +02/12/25 14:05:17 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:05:17 === === === === === === === === +02/12/25 14:05:17 1 0 0 0 1 2 0 0 +02/12/25 14:05:17 0 job proc(s) currently held +02/12/25 14:05:17 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.4432389034944422; EventCycleTimeCount = 34.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.503619635541643; EventCycleTimeSum = 15.07012271881104; LogProcessCycleTimeAvg = 0.0002884864807128906; LogProcessCycleTimeCount = 2.0; LogProcessCycleTimeMax = 0.0003111362457275391; LogProcessCycleTimeMin = 0.0002658367156982422; LogProcessCycleTimeStd = 3.203160486827947E-05; LogProcessCycleTimeSum = 0.0005769729614257812; SleepCycleTimeAvg = 1.00100088820738; SleepCycleTimeCount = 34.0; SleepCycleTimeMax = 1.00113582611084; SleepCycleTimeMin = 1.000178813934326; SleepCycleTimeStd = 0.0002347831889328732; SleepCycleTimeSum = 34.0340301990509; SubmitCycleTimeAvg = 0.4302666596003941; SubmitCycleTimeCount = 35.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.5015382160374527; SubmitCycleTimeSum = 15.05933308601379; ] +02/12/25 14:05:18 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:18 Submitting HTCondor Node 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 job(s)... +02/12/25 14:05:18 TmpDir(13)::TmpDir() +02/12/25 14:05:18 TmpDir(13)::Cd2TmpDir() +02/12/25 14:05:18 Submitting node 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 from file jobs/label1/val1/78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2.sub using direct job submission +02/12/25 14:05:18 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:05:18 TmpDir(13)::Cd2MainDir() +02/12/25 14:05:18 TmpDir(13)::~TmpDir() +02/12/25 14:05:18 assigned HTCondor ID (9200.0.0) +02/12/25 14:05:18 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:05:18 Just submitted 1 job this cycle... 
+02/12/25 14:05:18 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:05:18 Of 4 nodes total: +02/12/25 14:05:18 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:05:18 === === === === === === === === +02/12/25 14:05:18 1 0 1 0 0 2 0 0 +02/12/25 14:05:18 0 job proc(s) currently held +02/12/25 14:05:18 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.4305910791669573; EventCycleTimeCount = 35.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.5017686837771882; EventCycleTimeSum = 15.07068777084351; LogProcessCycleTimeAvg = 0.0002884864807128906; LogProcessCycleTimeCount = 2.0; LogProcessCycleTimeMax = 0.0003111362457275391; LogProcessCycleTimeMin = 0.0002658367156982422; LogProcessCycleTimeStd = 3.203160486827947E-05; LogProcessCycleTimeSum = 0.0005769729614257812; SleepCycleTimeAvg = 1.001004607336862; SleepCycleTimeCount = 35.0; SleepCycleTimeMax = 1.00113582611084; SleepCycleTimeMin = 1.000178813934326; SleepCycleTimeStd = 0.0002323488632145553; SleepCycleTimeSum = 35.03516125679016; SubmitCycleTimeAvg = 0.4194978343115913; SubmitCycleTimeCount = 36.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.4985263703176162; SubmitCycleTimeSum = 15.10192203521729; ] +02/12/25 14:05:19 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:19 Currently monitoring 1 HTCondor log file(s) +02/12/25 14:05:19 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:19 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:19 Reassigning the id of job 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 from (9200.0.0) to (9200.0.0) +02/12/25 14:05:19 Event: ULOG_SUBMIT for HTCondor Node 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 (9200.0.0) {02/12/25 14:05:18} +02/12/25 14:05:19 Number of idle job procs: 1 +02/12/25 14:05:19 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:19 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:19 Event: ULOG_EXECUTE for HTCondor Node 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 (9200.0.0) {02/12/25 14:05:18} +02/12/25 14:05:19 Number of idle job procs: 0 +02/12/25 14:05:19 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:19 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:20 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:21 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:22 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:23 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:24 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:25 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:26 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:27 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:28 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:28 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:05:29 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:30 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:31 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:32 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:33 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:34 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:35 
ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:36 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:37 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:38 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:39 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:40 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:41 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:42 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:43 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:44 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:45 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:46 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:47 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:48 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:49 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:50 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:51 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:52 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:53 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:54 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:55 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:55 Currently monitoring 1 HTCondor log file(s) +02/12/25 14:05:55 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:55 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:55 Event: ULOG_JOB_TERMINATED for HTCondor Node 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 (9200.0.0) {02/12/25 14:05:54} +02/12/25 14:05:55 Number of idle job procs: 0 +02/12/25 14:05:55 Node 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 job proc (9200.0.0) completed successfully. +02/12/25 14:05:55 Node 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 job completed +02/12/25 14:05:55 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:55 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:55 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:05:55 Of 4 nodes total: +02/12/25 14:05:55 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:05:55 === === === === === === === === +02/12/25 14:05:55 2 0 0 0 1 1 0 0 +02/12/25 14:05:55 0 job proc(s) currently held +02/12/25 14:05:55 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.2100589573383331; EventCycleTimeCount = 72.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.4089569966115902; EventCycleTimeSum = 15.12424492835999; LogProcessCycleTimeAvg = 0.0003111958503723145; LogProcessCycleTimeCount = 4.0; LogProcessCycleTimeMax = 0.0003378391265869141; LogProcessCycleTimeMin = 0.0002658367156982422; LogProcessCycleTimeStd = 3.2248187718179E-05; LogProcessCycleTimeSum = 0.001244783401489258; SleepCycleTimeAvg = 1.00102616680993; SleepCycleTimeCount = 72.0; SleepCycleTimeMax = 1.00113582611084; SleepCycleTimeMin = 1.000178813934326; SleepCycleTimeStd = 0.0001697360906804993; SleepCycleTimeSum = 72.07388401031494; SubmitCycleTimeAvg = 0.2068866768928423; SubmitCycleTimeCount = 73.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.4067002972245296; SubmitCycleTimeSum = 15.10272741317749; ] +02/12/25 14:05:56 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:56 Submitting HTCondor Node 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 job(s)... 
+02/12/25 14:05:56 TmpDir(14)::TmpDir() +02/12/25 14:05:56 TmpDir(14)::Cd2TmpDir() +02/12/25 14:05:56 Submitting node 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 from file jobs/label2/val1/98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2.sub using direct job submission +02/12/25 14:05:56 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:05:56 TmpDir(14)::Cd2MainDir() +02/12/25 14:05:56 TmpDir(14)::~TmpDir() +02/12/25 14:05:56 assigned HTCondor ID (9201.0.0) +02/12/25 14:05:56 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:05:56 Just submitted 1 job this cycle... +02/12/25 14:05:56 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:05:56 Of 4 nodes total: +02/12/25 14:05:56 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:05:56 === === === === === === === === +02/12/25 14:05:56 2 0 1 0 0 1 0 0 +02/12/25 14:05:56 0 job proc(s) currently held +02/12/25 14:05:56 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.20718953380846; EventCycleTimeCount = 73.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.4068464311846497; EventCycleTimeSum = 15.12483596801758; LogProcessCycleTimeAvg = 0.0003111958503723145; LogProcessCycleTimeCount = 4.0; LogProcessCycleTimeMax = 0.0003378391265869141; LogProcessCycleTimeMin = 0.0002658367156982422; LogProcessCycleTimeStd = 3.2248187718179E-05; LogProcessCycleTimeSum = 0.001244783401489258; SleepCycleTimeAvg = 1.001026767573945; SleepCycleTimeCount = 73.0; SleepCycleTimeMax = 1.00113582611084; SleepCycleTimeMin = 1.000178813934326; SleepCycleTimeStd = 0.0001686313843457671; SleepCycleTimeSum = 73.07495403289795; SubmitCycleTimeAvg = 0.2047529671643231; SubmitCycleTimeCount = 74.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.404321912729151; SubmitCycleTimeSum = 15.15171957015991; ] +02/12/25 14:05:57 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:57 Currently monitoring 1 HTCondor log file(s) +02/12/25 14:05:57 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:57 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:57 Reassigning the id of job 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 from (9201.0.0) to (9201.0.0) +02/12/25 14:05:57 Event: ULOG_SUBMIT for HTCondor Node 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 (9201.0.0) {02/12/25 14:05:56} +02/12/25 14:05:57 Number of idle job procs: 1 +02/12/25 14:05:57 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:57 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:57 Event: ULOG_EXECUTE for HTCondor Node 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 (9201.0.0) {02/12/25 14:05:56} +02/12/25 14:05:57 Number of idle job procs: 0 +02/12/25 14:05:57 ReadMultipleUserLogs::readEvent() +02/12/25 14:05:57 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:05:58 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:05:58 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:05:59 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:00 ReadMultipleUserLogs::GetLogStatus() 
+02/12/25 14:06:01 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:02 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:03 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:04 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:05 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:06 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:07 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:08 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:09 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:10 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:11 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:12 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:13 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:14 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:15 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:16 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:17 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:18 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:19 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:20 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:21 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:22 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:23 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:24 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:25 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:26 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:27 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:28 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:28 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:06:29 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:30 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:31 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:32 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:33 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:34 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:35 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:36 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:37 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:38 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:39 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:40 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:41 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:42 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:43 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:43 Currently monitoring 1 HTCondor log file(s) +02/12/25 14:06:43 ReadMultipleUserLogs::readEvent() +02/12/25 14:06:43 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:43 Event: ULOG_JOB_TERMINATED for HTCondor Node 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 (9201.0.0) {02/12/25 14:06:43} +02/12/25 14:06:43 Number of idle job procs: 0 +02/12/25 14:06:43 Node 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 job proc (9201.0.0) completed successfully. 
+02/12/25 14:06:43 Node 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 job completed +02/12/25 14:06:43 ReadMultipleUserLogs::readEvent() +02/12/25 14:06:43 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:43 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:06:43 Of 4 nodes total: +02/12/25 14:06:43 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:06:43 === === === === === === === === +02/12/25 14:06:43 3 0 0 0 0 1 0 0 +02/12/25 14:06:43 0 job proc(s) currently held +02/12/25 14:06:43 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.1266128758589427; EventCycleTimeCount = 120.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.332172346184732; EventCycleTimeSum = 15.19354510307312; LogProcessCycleTimeAvg = 0.0003099441528320312; LogProcessCycleTimeCount = 6.0; LogProcessCycleTimeMax = 0.0003499984741210938; LogProcessCycleTimeMin = 0.0002648830413818359; LogProcessCycleTimeStd = 3.677215315780272E-05; LogProcessCycleTimeSum = 0.001859664916992188; SleepCycleTimeAvg = 1.001030806700389; SleepCycleTimeCount = 120.0; SleepCycleTimeMax = 1.00113582611084; SleepCycleTimeMin = 1.000084161758423; SleepCycleTimeStd = 0.0001626073315709091; SleepCycleTimeSum = 120.1236968040466; SubmitCycleTimeAvg = 0.1252294966011993; SubmitCycleTimeCount = 121.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.3308895735957696; SubmitCycleTimeSum = 15.15276908874512; ] +02/12/25 14:06:43 Starting final node... +02/12/25 14:06:44 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:44 Submitting HTCondor Node finalJob job(s)... +02/12/25 14:06:44 TmpDir(15)::TmpDir() +02/12/25 14:06:44 TmpDir(15)::Cd2TmpDir() +02/12/25 14:06:44 Submitting node finalJob from file jobs/finalJob/finalJob.sub using direct job submission +02/12/25 14:06:44 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:06:44 Submit warning: Submit:0:the line 'concurrency_limit = db_limit' was unused by DAGMAN. Is it a typo? +02/12/25 14:06:44 TmpDir(15)::Cd2MainDir() +02/12/25 14:06:44 TmpDir(15)::~TmpDir() +02/12/25 14:06:44 assigned HTCondor ID (9202.0.0) +02/12/25 14:06:44 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:06:44 Just submitted 1 job this cycle... 
+02/12/25 14:06:44 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:06:44 Of 4 nodes total: +02/12/25 14:06:44 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:06:44 === === === === === === === === +02/12/25 14:06:44 3 0 1 0 0 0 0 0 +02/12/25 14:06:44 0 job proc(s) currently held +02/12/25 14:06:44 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.1266128758589427; EventCycleTimeCount = 120.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.332172346184732; EventCycleTimeSum = 15.19354510307312; LogProcessCycleTimeAvg = 0.0003099441528320312; LogProcessCycleTimeCount = 6.0; LogProcessCycleTimeMax = 0.0003499984741210938; LogProcessCycleTimeMin = 0.0002648830413818359; LogProcessCycleTimeStd = 3.677215315780272E-05; LogProcessCycleTimeSum = 0.001859664916992188; SleepCycleTimeAvg = 1.009309873108036; SleepCycleTimeCount = 121.0; SleepCycleTimeMax = 2.002797842025757; SleepCycleTimeMin = 1.000084161758423; SleepCycleTimeStd = 0.09106987444403573; SleepCycleTimeSum = 122.1264946460724; SubmitCycleTimeAvg = 0.1245629103457341; SubmitCycleTimeCount = 122.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.3296016691364838; SubmitCycleTimeSum = 15.19667506217957; ] +02/12/25 14:06:45 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:45 Currently monitoring 1 HTCondor log file(s) +02/12/25 14:06:45 ReadMultipleUserLogs::readEvent() +02/12/25 14:06:45 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:45 Reassigning the id of job finalJob from (9202.0.0) to (9202.0.0) +02/12/25 14:06:45 Event: ULOG_SUBMIT for HTCondor Node finalJob (9202.0.0) {02/12/25 14:06:44} +02/12/25 14:06:45 Number of idle job procs: 1 +02/12/25 14:06:45 ReadMultipleUserLogs::readEvent() +02/12/25 14:06:45 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:45 Event: ULOG_EXECUTE for HTCondor Node finalJob (9202.0.0) {02/12/25 14:06:45} +02/12/25 14:06:45 Number of idle job procs: 0 +02/12/25 14:06:45 ReadMultipleUserLogs::readEvent() +02/12/25 14:06:45 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:46 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:47 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:48 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:49 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:50 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:51 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:52 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:53 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:54 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:55 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:56 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:57 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:57 Currently monitoring 1 HTCondor log file(s) +02/12/25 14:06:57 ReadMultipleUserLogs::readEvent() +02/12/25 14:06:57 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:57 Event: ULOG_JOB_TERMINATED for HTCondor Node finalJob (9202.0.0) {02/12/25 14:06:57} +02/12/25 14:06:57 Number of idle job procs: 0 +02/12/25 
14:06:57 Node finalJob job proc (9202.0.0) completed successfully. +02/12/25 14:06:57 Node finalJob job completed +02/12/25 14:06:57 Running POST script of Node finalJob... +02/12/25 14:06:57 TmpDir(16)::TmpDir() +02/12/25 14:06:57 TmpDir(16)::Cd2TmpDir() +02/12/25 14:06:57 Warning: mysin has length 0 (ignore if produced by DAGMan; see gittrac #4987, #5031) +02/12/25 14:06:57 TmpDir(16)::Cd2MainDir() +02/12/25 14:06:57 TmpDir(16)::~TmpDir() +02/12/25 14:06:57 ReadMultipleUserLogs::readEvent() +02/12/25 14:06:57 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:57 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:06:57 Of 4 nodes total: +02/12/25 14:06:57 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:06:57 === === === === === === === === +02/12/25 14:06:57 3 0 0 1 0 0 0 0 +02/12/25 14:06:57 0 job proc(s) currently held +02/12/25 14:06:57 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.1145775533260259; EventCycleTimeCount = 133.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.3175414266360285; EventCycleTimeSum = 15.23881459236145; LogProcessCycleTimeAvg = 0.001238316297531128; LogProcessCycleTimeCount = 8.0; LogProcessCycleTimeMax = 0.007828950881958008; LogProcessCycleTimeMin = 0.0002179145812988281; LogProcessCycleTimeStd = 0.002663394554712142; LogProcessCycleTimeSum = 0.009906530380249023; SleepCycleTimeAvg = 1.008503136350148; SleepCycleTimeCount = 134.0; SleepCycleTimeMax = 2.002797842025757; SleepCycleTimeMin = 1.000084161758423; SleepCycleTimeStd = 0.0865399506651973; SleepCycleTimeSum = 135.1394202709198; SubmitCycleTimeAvg = 0.1125696058626528; SubmitCycleTimeCount = 135.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.3153692681784284; SubmitCycleTimeSum = 15.19689679145813; ] +02/12/25 14:06:57 Initializing user log writer for /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log, (9202.0.0) +02/12/25 14:06:57 WriteUserLog::initialize: opened /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log successfully +02/12/25 14:06:57 WriteUserLog::user_priv_flag (~) is 0 +02/12/25 14:06:58 ReadMultipleUserLogs::GetLogStatus() +02/12/25 14:06:58 Currently monitoring 1 HTCondor log file(s) +02/12/25 14:06:58 ReadMultipleUserLogs::readEvent() +02/12/25 14:06:58 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:58 Event: ULOG_POST_SCRIPT_TERMINATED for HTCondor Node finalJob (9202.0.0) {02/12/25 14:06:57} +02/12/25 14:06:58 POST Script of node finalJob failed with status 2 +02/12/25 14:06:58 POST for Node finalJob returned 2 +02/12/25 14:06:58 ReadMultipleUserLogs::readEvent() +02/12/25 14:06:58 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:58 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:06:58 Of 4 nodes total: +02/12/25 14:06:58 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:06:58 === === === === === === === === +02/12/25 14:06:58 3 0 0 0 0 0 1 0 +02/12/25 14:06:58 0 job proc(s) currently held +02/12/25 14:06:58 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.1137832652277021; EventCycleTimeCount = 134.0; 
EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.3164790006290021; EventCycleTimeSum = 15.24695754051208; LogProcessCycleTimeAvg = 0.001131296157836914; LogProcessCycleTimeCount = 9.0; LogProcessCycleTimeMax = 0.007828950881958008; LogProcessCycleTimeMin = 0.0002179145812988281; LogProcessCycleTimeStd = 0.002511979603323447; LogProcessCycleTimeSum = 0.01018166542053223; SleepCycleTimeAvg = 1.008472638660007; SleepCycleTimeCount = 135.0; SleepCycleTimeMax = 2.002797842025757; SleepCycleTimeMin = 1.000084161758423; SleepCycleTimeStd = 0.08621716389030679; SleepCycleTimeSum = 136.143806219101; SubmitCycleTimeAvg = 0.1117422931334552; SubmitCycleTimeCount = 136.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.3143471580175156; SubmitCycleTimeSum = 15.1969518661499; ] +02/12/25 14:06:58 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:06:58 ERROR: the following job(s) failed: +02/12/25 14:06:58 ---------------------- Job ---------------------- +02/12/25 14:06:58 Node Name: finalJob +02/12/25 14:06:58 Noop: false +02/12/25 14:06:58 NodeID: 3 +02/12/25 14:06:58 Node Status: STATUS_ERROR +02/12/25 14:06:58 Node return val: 2 +02/12/25 14:06:58 Error: Job failed due to DAGMAN error 0 and POST Script failed with status 2 +02/12/25 14:06:58 Job Submit File: jobs/finalJob/finalJob.sub +02/12/25 14:06:58 POST Script: /work/testuser/ctrl_bps_htcondor/python/lsst/ctrl/bps/htcondor/final_post.sh finalJob $DAG_STATUS $RETURN +02/12/25 14:06:58 HTCondor Job ID: (9202.0.0) +02/12/25 14:06:58 PARENTS: WAITING: 0 CHILDREN: +02/12/25 14:06:58 --------------------------------------- +02/12/25 14:06:58 Aborting DAG... +02/12/25 14:06:58 Writing Rescue DAG to /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.rescue001... +02/12/25 14:06:58 Removing submitted jobs... +02/12/25 14:06:58 Removing any/all submitted HTCondor jobs... +02/12/25 14:06:58 Running: /work/lsst_stack/w_2025_06/conda/envs/lsst-scipipe-9.0.0/bin/condor_rm -const DAGManJobId==9198 -reason DAG' 'Abort:' 'DAG' 'is' 'exiting' 'and' 'writing' 'rescue' 'file. 
+02/12/25 14:06:58 Note: 0 total job deferrals because of -MaxJobs limit (0) +02/12/25 14:06:58 Note: 0 total job deferrals because of -MaxIdle limit (1000) +02/12/25 14:06:58 Note: 0 total job deferrals because of node category throttles +02/12/25 14:06:58 Note: 0 total PRE script deferrals because of -MaxPre limit (20) or DEFER +02/12/25 14:06:58 Note: 0 total POST script deferrals because of -MaxPost limit (20) or DEFER +02/12/25 14:06:58 Note: 0 total HOLD script deferrals because of -MaxHold limit (20) or DEFER +02/12/25 14:06:58 DAG status: 2 (DAG_STATUS_NODE_FAILED) +02/12/25 14:06:58 Of 4 nodes total: +02/12/25 14:06:58 Done Pre Queued Post Ready Un-Ready Failed Futile +02/12/25 14:06:58 === === === === === === === === +02/12/25 14:06:58 3 0 0 0 0 0 1 0 +02/12/25 14:06:58 0 job proc(s) currently held +02/12/25 14:06:58 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.1137832652277021; EventCycleTimeCount = 134.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.3164790006290021; EventCycleTimeSum = 15.24695754051208; LogProcessCycleTimeAvg = 0.001131296157836914; LogProcessCycleTimeCount = 9.0; LogProcessCycleTimeMax = 0.007828950881958008; LogProcessCycleTimeMin = 0.0002179145812988281; LogProcessCycleTimeStd = 0.002511979603323447; LogProcessCycleTimeSum = 0.01018166542053223; SleepCycleTimeAvg = 1.008472638660007; SleepCycleTimeCount = 135.0; SleepCycleTimeMax = 2.002797842025757; SleepCycleTimeMin = 1.000084161758423; SleepCycleTimeStd = 0.08621716389030679; SleepCycleTimeSum = 136.143806219101; SubmitCycleTimeAvg = 0.1117422931334552; SubmitCycleTimeCount = 136.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.3143471580175156; SubmitCycleTimeSum = 15.1969518661499; ] +02/12/25 14:06:58 SharedPortClient: sent connection request to local schedd for shared port id schedd_1514_0e79 +02/12/25 14:06:58 Wrote metrics file /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.metrics. 
+02/12/25 14:06:58 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.1137832652277021; EventCycleTimeCount = 134.0; EventCycleTimeMax = 1.008378982543945; EventCycleTimeMin = 4.410743713378906E-05; EventCycleTimeStd = 0.3164790006290021; EventCycleTimeSum = 15.24695754051208; LogProcessCycleTimeAvg = 0.001131296157836914; LogProcessCycleTimeCount = 9.0; LogProcessCycleTimeMax = 0.007828950881958008; LogProcessCycleTimeMin = 0.0002179145812988281; LogProcessCycleTimeStd = 0.002511979603323447; LogProcessCycleTimeSum = 0.01018166542053223; SleepCycleTimeAvg = 1.008472638660007; SleepCycleTimeCount = 135.0; SleepCycleTimeMax = 2.002797842025757; SleepCycleTimeMin = 1.000084161758423; SleepCycleTimeStd = 0.08621716389030679; SleepCycleTimeSum = 136.143806219101; SubmitCycleTimeAvg = 0.1117422931334552; SubmitCycleTimeCount = 136.0; SubmitCycleTimeMax = 1.001091957092285; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.3143471580175156; SubmitCycleTimeSum = 15.1969518661499; ] +02/12/25 14:06:58 ReadMultipleUserLogs::unmonitorLogFile(/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log) +02/12/25 14:06:58 ReadMultipleUserLogs: found LogFileMonitor object for /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log (64770:22095619392) +02/12/25 14:06:58 Closing file +02/12/25 14:06:58 ReadMultipleUserLogs: removed log file /work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag.nodes.log (64770:22095619392) from active list +02/12/25 14:06:58 **** condor_scheduniv_exec.9198.0 (condor_DAGMAN) pid 35494 EXITING WITH STATUS 1 diff --git a/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.metrics b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.metrics new file mode 100644 index 0000000..183f284 --- /dev/null +++ b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.metrics @@ -0,0 +1,25 @@ +{ + "client":"condor_dagman", + "version":"23.0.3", + "planner":"", + "planner_version":"", + "type":"metrics", + "wf_uuid":"", + "root_wf_uuid":"", + "start_time":1739390667.067, + "end_time":1739390818.662, + "duration":151.596, + "exitcode":1, + "dagman_id":"9198", + "parent_dagman_id":"", + "rescue_dag_number":0, + "jobs":4, + "jobs_failed":2, + "jobs_succeeded":3, + "dag_jobs":0, + "dag_jobs_failed":0, + "dag_jobs_succeeded":0, + "total_jobs":4, + "total_jobs_run":5, + "DagStatus":2 +} diff --git a/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.nodes.log b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.nodes.log new file mode 100644 index 0000000..21b0093 --- /dev/null +++ b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.nodes.log @@ -0,0 +1,128 @@ +000 (9199.000.000) 2025-02-12 14:05:04 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1514_0e79> + DAG Node: pipetaskInit +... +001 (9199.000.000) 2025-02-12 14:05:04 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1514_0e79> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_35582" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9199.000.000) 2025-02-12 14:05:16 Job terminated. 
+ (1) Normal termination (return value 0) + Usr 0 00:00:05, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:05, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 0 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 83 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 12 + TimeSlotBusy (s) : 12 + + Job terminated of its own accord at 2025-02-12T20:05:16Z with exit-code 0. +... +000 (9200.000.000) 2025-02-12 14:05:18 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1514_0e79> + DAG Node: 78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2 +... +001 (9200.000.000) 2025-02-12 14:05:18 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1514_0e79> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_35627" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9200.000.000) 2025-02-12 14:05:54 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:28, Sys 0 00:00:01 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:28, Sys 0 00:00:01 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 11666 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 11666 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 95 1 2048 + GPUs : 0 + Memory (MB) : 1299 2048 2048 + TimeExecute (s) : 36 + TimeSlotBusy (s) : 36 + + Job terminated of its own accord at 2025-02-12T20:05:54Z with exit-code 0. +... +000 (9201.000.000) 2025-02-12 14:05:56 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1514_0e79> + DAG Node: 98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2 +... +001 (9201.000.000) 2025-02-12 14:05:56 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1514_0e79> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_35713" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9201.000.000) 2025-02-12 14:06:43 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:38, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:38, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 22802 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 22802 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 106 1 2048 + GPUs : 0 + Memory (MB) : 386 2048 2048 + TimeExecute (s) : 47 + TimeSlotBusy (s) : 47 + + Job terminated of its own accord at 2025-02-12T20:06:43Z with exit-code 0. +... +000 (9202.000.000) 2025-02-12 14:06:44 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1514_0e79> + DAG Node: finalJob +... 
+001 (9202.000.000) 2025-02-12 14:06:45 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1514_0e79> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_35831" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9202.000.000) 2025-02-12 14:06:57 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:05, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:05, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 6256 - Run Bytes Sent By Job + 214 - Run Bytes Received By Job + 6256 - Total Bytes Sent By Job + 214 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 89 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 12 + TimeSlotBusy (s) : 13 + + Job terminated of its own accord at 2025-02-12T20:06:57Z with exit-code 0. +... +016 (9202.000.000) 2025-02-12 14:06:57 POST Script terminated. + (1) Normal termination (return value 2) + DAG Node: finalJob +... diff --git a/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.info.json b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.info.json new file mode 100644 index 0000000..d9ef407 --- /dev/null +++ b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.info.json @@ -0,0 +1 @@ +{"test02": {"9198.0": {"ClusterId": 9198, "GlobalJobId": "test02#9198.0#1739390666", "bps_wms_service": "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorService", "bps_project": "dev", "bps_payload": "tiny", "bps_operator": "testuser", "bps_wms_workflow": "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorWorkflow", "bps_provisioning_job": "provisioningJob", "bps_run_quanta": "label1:1;label2:1", "bps_campaign": "quick", "bps_runsite": "testpool", "bps_job_summary": "pipetaskInit:1;label1:1;label2:1;finalJob:1", "bps_run": "u_testuser_tiny_20250212T200412Z", "bps_isjob": "True"}}} \ No newline at end of file diff --git a/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.node_status b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.node_status new file mode 100644 index 0000000..9dab662 --- /dev/null +++ b/tests/data/tiny_prov_no_submit/tiny_prov_no_submit.node_status @@ -0,0 +1,60 @@ +[ + Type = "DagStatus"; + DagFiles = { + "/work/testuser/submit/u/testuser/tiny/20250212T200412Z/u_testuser_tiny_20250212T200412Z.dag" + }; + Timestamp = 1739390818; /* "Wed Feb 12 14:06:58 2025" */ + DagStatus = 6; /* "STATUS_ERROR (failed)" */ + NodesTotal = 4; + NodesDone = 3; + NodesPre = 0; + NodesQueued = 0; + NodesPost = 0; + NodesReady = 0; + NodesUnready = 0; + NodesFutile = 0; + NodesFailed = 1; + JobProcsHeld = 0; + JobProcsIdle = 0; /* includes held */ +] +[ + Type = "NodeStatus"; + Node = "pipetaskInit"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "78475da3-6922-4c9c-8395-ff355196c10f_label1_val1_val2"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "98070426-9528-48d7-a995-444a72bdb6d0_label2_val1_val2"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "finalJob"; + NodeStatus = 6; /* "STATUS_ERROR" */ + StatusDetails = "Job failed due to DAGMAN error 0 and POST 
Script failed with status 2"; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "StatusEnd"; + EndTime = 1739390818; /* "Wed Feb 12 14:06:58 2025" */ + NextUpdate = 0; /* "none" */ +] diff --git a/tests/data/tiny_running/tiny_running.dag b/tests/data/tiny_running/tiny_running.dag new file mode 100644 index 0000000..bbe452e --- /dev/null +++ b/tests/data/tiny_running/tiny_running.dag @@ -0,0 +1,22 @@ +JOB pipetaskInit "jobs/pipetaskInit/pipetaskInit.sub" +JOB ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2 "jobs/label1/val1/ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2.sub" +JOB dbf919fa-5453-4b05-8806-ad6390fda0a3_label2_val1_val2 "jobs/label2/val1/dbf919fa-5453-4b05-8806-ad6390fda0a3_label2_val1_val2.sub" +PARENT pipetaskInit CHILD ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2 +PARENT ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2 CHILD dbf919fa-5453-4b05-8806-ad6390fda0a3_label2_val1_val2 +DOT u_testuser_tiny_20250225T160151Z.dot +NODE_STATUS_FILE u_testuser_tiny_20250225T160151Z.node_status +SET_JOB_ATTR bps_isjob= "True" +SET_JOB_ATTR bps_project= "dev" +SET_JOB_ATTR bps_campaign= "quick" +SET_JOB_ATTR bps_run= "u_testuser_tiny_20250225T160151Z" +SET_JOB_ATTR bps_operator= "testuser" +SET_JOB_ATTR bps_payload= "tiny" +SET_JOB_ATTR bps_runsite= "testpool" +SET_JOB_ATTR bps_wms_service= "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorService" +SET_JOB_ATTR bps_wms_workflow= "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorWorkflow" +SET_JOB_ATTR bps_run_quanta= "label1:1;label2:1" +SET_JOB_ATTR bps_job_summary= "pipetaskInit:1;label1:1;label2:1;finalJob:1" +SET_JOB_ATTR bps_provisioning_job= "provisioningJob" +FINAL finalJob jobs/finalJob/finalJob.sub +SCRIPT POST finalJob /work/testuser/ctrl_bps_htcondor/python/lsst/ctrl/bps/htcondor/final_post.sh finalJob $DAG_STATUS $RETURN +SERVICE provisioningJob jobs/provisioningJob/provisioningJob.sub diff --git a/tests/data/tiny_running/tiny_running.dag.dagman.log b/tests/data/tiny_running/tiny_running.dag.dagman.log new file mode 100644 index 0000000..f0d305b --- /dev/null +++ b/tests/data/tiny_running/tiny_running.dag.dagman.log @@ -0,0 +1,4 @@ +000 (9248.000.000) 2025-02-25 10:03:01 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1764_8927> +... +001 (9248.000.000) 2025-02-25 10:03:01 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1764_8927> +... 
diff --git a/tests/data/tiny_running/tiny_running.dag.dagman.out b/tests/data/tiny_running/tiny_running.dag.dagman.out new file mode 100644 index 0000000..0da052e --- /dev/null +++ b/tests/data/tiny_running/tiny_running.dag.dagman.out @@ -0,0 +1,168 @@ +02/25/25 10:03:01 ****************************************************** +02/25/25 10:03:01 ** condor_scheduniv_exec.9248.0 (CONDOR_DAGMAN) STARTING UP +02/25/25 10:03:01 ** /work/lsst_stack/w_2025_08/conda/envs/lsst-scipipe-10.0.0/bin/condor_dagman +02/25/25 10:03:01 ** SubsystemInfo: name=DAGMAN type=DAGMAN(9) class=CLIENT(2) +02/25/25 10:03:01 ** Configuration: subsystem:DAGMAN local: class:CLIENT +02/25/25 10:03:01 ** $CondorVersion: 24.0.4 2025-02-10 $ +02/25/25 10:03:01 ** $CondorPlatform: X86_64-AlmaLinux_9.5 $ +02/25/25 10:03:01 ** PID = 15068 RealUID = 27031 +02/25/25 10:03:01 ** Log last touched time unavailable (No such file or directory) +02/25/25 10:03:01 ****************************************************** +02/25/25 10:03:01 Using config source: /work/lsst_stack/w_2025_08/conda/envs/lsst-scipipe-10.0.0/etc/condor/condor_config +02/25/25 10:03:01 Using local config sources: +02/25/25 10:03:01 /etc/condor/condor_config +02/25/25 10:03:01 /etc/condor/config.d/00-minicondor +02/25/25 10:03:01 /etc/condor/config.d/00-security +02/25/25 10:03:01 /etc/condor/config.d/10-stash-plugin.conf +02/25/25 10:03:01 /etc/condor/config.d/99-lsst +02/25/25 10:03:01 /etc/condor/condor_config.local +02/25/25 10:03:01 /home/testuser/.condor/user_config +02/25/25 10:03:01 config Macros = 98, Sorted = 98, StringBytes = 3185, TablesBytes = 3616 +02/25/25 10:03:01 CLASSAD_CACHING is ENABLED +02/25/25 10:03:01 Daemon Log is logging: D_ALWAYS D_ERROR D_STATUS +02/25/25 10:03:01 DaemonCore: No command port requested. 
+02/25/25 10:03:01 DAGMAN_USE_STRICT setting: 1 +02/25/25 10:03:01 DAGMAN_VERBOSITY setting: 3 +02/25/25 10:03:01 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 +02/25/25 10:03:01 DAGMAN_DEBUG_CACHE_ENABLE setting: False +02/25/25 10:03:01 DAGMAN_SUBMIT_DELAY setting: 0 +02/25/25 10:03:01 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 +02/25/25 10:03:01 DAGMAN_STARTUP_CYCLE_DETECT setting: False +02/25/25 10:03:01 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 100 +02/25/25 10:03:01 DAGMAN_AGGRESSIVE_SUBMIT setting: False +02/25/25 10:03:01 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 1 +02/25/25 10:03:01 DAGMAN_QUEUE_UPDATE_INTERVAL setting: 30 +02/25/25 10:03:01 DAGMAN_DEFAULT_PRIORITY setting: 0 +02/25/25 10:03:01 DAGMAN_SUPPRESS_NOTIFICATION setting: False +02/25/25 10:03:01 allow_events (DAGMAN_ALLOW_EVENTS) setting: 114 +02/25/25 10:03:01 DAGMAN_RETRY_SUBMIT_FIRST setting: True +02/25/25 10:03:01 DAGMAN_RETRY_NODE_FIRST setting: False +02/25/25 10:03:01 DAGMAN_MAX_JOBS_IDLE setting: 1000 +02/25/25 10:03:01 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 +02/25/25 10:03:01 DAGMAN_MAX_PRE_SCRIPTS setting: 20 +02/25/25 10:03:01 DAGMAN_MAX_POST_SCRIPTS setting: 20 +02/25/25 10:03:01 DAGMAN_MAX_HOLD_SCRIPTS setting: 20 +02/25/25 10:03:01 DAGMAN_MUNGE_NODE_NAMES setting: True +02/25/25 10:03:01 DAGMAN_PROHIBIT_MULTI_JOBS setting: False +02/25/25 10:03:01 DAGMAN_SUBMIT_DEPTH_FIRST setting: False +02/25/25 10:03:01 DAGMAN_ALWAYS_RUN_POST setting: False +02/25/25 10:03:01 DAGMAN_CONDOR_SUBMIT_EXE setting: /work/lsst_stack/w_2025_08/conda/envs/lsst-scipipe-10.0.0/bin/condor_submit +02/25/25 10:03:01 DAGMAN_CONDOR_RM_EXE setting: /work/lsst_stack/w_2025_08/conda/envs/lsst-scipipe-10.0.0/bin/condor_rm +02/25/25 10:03:01 DAGMAN_USE_DIRECT_SUBMIT setting: True +02/25/25 10:03:01 DAGMAN_PRODUCE_JOB_CREDENTIALS setting: True +02/25/25 10:03:01 DAGMAN_DEFAULT_APPEND_VARS setting: False +02/25/25 10:03:01 DAGMAN_INHERIT_ATTRS_PREFIX setting: +02/25/25 10:03:01 DAGMAN_INHERIT_ATTRS setting: +02/25/25 10:03:01 DAGMAN_ABORT_DUPLICATES setting: True +02/25/25 10:03:01 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True +02/25/25 10:03:01 DAGMAN_PENDING_REPORT_INTERVAL setting: 60 +02/25/25 10:03:01 DAGMAN_CHECK_QUEUE_INTERVAL setting: 28800 +02/25/25 10:03:01 DAGMAN_AUTO_RESCUE setting: True +02/25/25 10:03:01 DAGMAN_MAX_RESCUE_NUM setting: 100 +02/25/25 10:03:01 DAGMAN_WRITE_PARTIAL_RESCUE setting: True +02/25/25 10:03:01 DAGMAN_DEFAULT_NODE_LOG setting: @(DAG_DIR)/@(DAG_FILE).nodes.log +02/25/25 10:03:01 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True +02/25/25 10:03:01 DAGMAN_MAX_JOB_HOLDS setting: 100 +02/25/25 10:03:01 DAGMAN_HOLD_CLAIM_TIME setting: 20 +02/25/25 10:03:01 ALL_DEBUG setting: D_FULLDEBUG +02/25/25 10:03:01 DAGMAN_SUPPRESS_JOB_LOGS setting: False +02/25/25 10:03:01 DAGMAN_REMOVE_NODE_JOBS setting: True +02/25/25 10:03:01 DAGMAN_REMOVE_JOBS_AFTER_LIMIT_CHANGE setting: False +02/25/25 10:03:01 DAGMAN will adjust edges after parsing +02/25/25 10:03:01 argv[0] == "condor_scheduniv_exec.9248.0" +02/25/25 10:03:01 argv[1] == "-Lockfile" +02/25/25 10:03:01 argv[2] == "/work/testuser/submit/u/testuser/tiny/20250225T160151Z/u_testuser_tiny_20250225T160151Z.dag.lock" +02/25/25 10:03:01 argv[3] == "-Dag" +02/25/25 10:03:01 argv[4] == "/work/testuser/submit/u/testuser/tiny/20250225T160151Z/u_testuser_tiny_20250225T160151Z.dag" +02/25/25 10:03:01 argv[5] == "-MaxIdle" +02/25/25 10:03:01 argv[6] == "1000" +02/25/25 10:03:01 argv[7] == "-CsdVersion" +02/25/25 10:03:01 argv[8] == "$CondorVersion: 24.0.4 2025-02-10 $" +02/25/25 10:03:01 argv[9] == 
"-dagman" +02/25/25 10:03:01 argv[10] == "/work/lsst_stack/w_2025_08/conda/envs/lsst-scipipe-10.0.0/bin/condor_dagman" +02/25/25 10:03:01 argv[11] == "-AutoRescue" +02/25/25 10:03:01 argv[12] == "1" +02/25/25 10:03:01 argv[13] == "-DoRescueFrom" +02/25/25 10:03:01 argv[14] == "0" +02/25/25 10:03:01 Workflow batch-id: <9248.0> +02/25/25 10:03:01 Workflow batch-name: +02/25/25 10:03:01 Workflow accounting_group: <> +02/25/25 10:03:01 Workflow accounting_group_user: <> +02/25/25 10:03:01 Warning: failed to get attribute DAGNodeName +02/25/25 10:03:01 DAGMAN_LOG_ON_NFS_IS_ERROR setting: False +02/25/25 10:03:01 Default node log file is: +02/25/25 10:03:01 DAG Lockfile will be written to /work/testuser/submit/u/testuser/tiny/20250225T160151Z/u_testuser_tiny_20250225T160151Z.dag.lock +02/25/25 10:03:01 DAG Input file is /work/testuser/submit/u/testuser/tiny/20250225T160151Z/u_testuser_tiny_20250225T160151Z.dag +02/25/25 10:03:01 Parsing 1 dagfiles +02/25/25 10:03:01 Parsing /work/testuser/submit/u/testuser/tiny/20250225T160151Z/u_testuser_tiny_20250225T160151Z.dag ... +02/25/25 10:03:01 Adjusting edges +02/25/25 10:03:01 Dag contains 4 total nodes +02/25/25 10:03:01 Bootstrapping... +02/25/25 10:03:01 Number of pre-completed nodes: 0 +02/25/25 10:03:01 MultiLogFiles: truncating log file /work/testuser/submit/u/testuser/tiny/20250225T160151Z/u_testuser_tiny_20250225T160151Z.dag.nodes.log +02/25/25 10:03:01 Starting service node provisioningJob... +02/25/25 10:03:01 DAG status: 0 (DAG_STATUS_OK) +02/25/25 10:03:01 Of 4 nodes total: +02/25/25 10:03:01 Done Pre Queued Post Ready Un-Ready Failed Futile +02/25/25 10:03:01 === === === === === === === === +02/25/25 10:03:01 0 0 0 0 1 3 0 0 +02/25/25 10:03:01 0 job proc(s) currently held +02/25/25 10:03:01 DAGMan Runtime Statistics: [EventCycleTimeCount = 0.0; EventCycleTimeSum = 0.0; LogProcessCycleTimeCount = 0.0; LogProcessCycleTimeSum = 0.0; SleepCycleTimeCount = 0.0; SleepCycleTimeSum = 0.0; SubmitCycleTimeCount = 0.0; SubmitCycleTimeSum = 0.0; ] +02/25/25 10:03:01 Registering condor_event_timer... +02/25/25 10:03:02 Submitting HTCondor Node provisioningJob job(s)... +02/25/25 10:03:02 Submitting node provisioningJob from file jobs/provisioningJob/provisioningJob.sub using direct job submission +02/25/25 10:03:02 assigned HTCondor ID (9249.0.0) +02/25/25 10:03:02 Submitting HTCondor Node pipetaskInit job(s)... +02/25/25 10:03:02 Submitting node pipetaskInit from file jobs/pipetaskInit/pipetaskInit.sub using direct job submission +02/25/25 10:03:02 assigned HTCondor ID (9250.0.0) +02/25/25 10:03:02 Just submitted 2 jobs this cycle... 
+02/25/25 10:03:02 DAG status: 0 (DAG_STATUS_OK) +02/25/25 10:03:02 Of 4 nodes total: +02/25/25 10:03:02 Done Pre Queued Post Ready Un-Ready Failed Futile +02/25/25 10:03:02 === === === === === === === === +02/25/25 10:03:02 0 0 1 0 0 3 0 0 +02/25/25 10:03:02 0 job proc(s) currently held +02/25/25 10:03:02 DAGMan Runtime Statistics: [EventCycleTimeCount = 0.0; EventCycleTimeSum = 0.0; LogProcessCycleTimeCount = 0.0; LogProcessCycleTimeSum = 0.0; SleepCycleTimeCount = 0.0; SleepCycleTimeSum = 0.0; SubmitCycleTimeAvg = 0.1724209785461426; SubmitCycleTimeCount = 1.0; SubmitCycleTimeMax = 0.1724209785461426; SubmitCycleTimeMin = 0.1724209785461426; SubmitCycleTimeStd = 0.1724209785461426; SubmitCycleTimeSum = 0.1724209785461426; ] +02/25/25 10:03:03 Currently monitoring 1 HTCondor log file(s) +02/25/25 10:03:03 Reassigning the id of job provisioningJob from (9249.0.0) to (9249.0.0) +02/25/25 10:03:03 Event: ULOG_SUBMIT for HTCondor Node provisioningJob (9249.0.0) {02/25/25 10:03:02} +02/25/25 10:03:03 Number of idle job procs: 1 +02/25/25 10:03:03 Reassigning the id of job pipetaskInit from (9250.0.0) to (9250.0.0) +02/25/25 10:03:03 Event: ULOG_SUBMIT for HTCondor Node pipetaskInit (9250.0.0) {02/25/25 10:03:02} +02/25/25 10:03:03 Number of idle job procs: 2 +02/25/25 10:03:08 Currently monitoring 1 HTCondor log file(s) +02/25/25 10:03:08 Event: ULOG_EXECUTE for HTCondor Node provisioningJob (9249.0.0) {02/25/25 10:03:08} +02/25/25 10:03:08 Number of idle job procs: 1 +02/25/25 10:03:12 Currently monitoring 1 HTCondor log file(s) +02/25/25 10:03:12 Event: ULOG_EXECUTE for HTCondor Node pipetaskInit (9250.0.0) {02/25/25 10:03:12} +02/25/25 10:03:12 Number of idle job procs: 0 +02/25/25 10:03:26 Currently monitoring 1 HTCondor log file(s) +02/25/25 10:03:26 Event: ULOG_JOB_TERMINATED for HTCondor Node pipetaskInit (9250.0.0) {02/25/25 10:03:26} +02/25/25 10:03:26 Number of idle job procs: 0 +02/25/25 10:03:26 Node pipetaskInit job proc (9250.0.0) completed successfully. +02/25/25 10:03:26 Node pipetaskInit job completed +02/25/25 10:03:26 DAG status: 0 (DAG_STATUS_OK) +02/25/25 10:03:26 Of 4 nodes total: +02/25/25 10:03:26 Done Pre Queued Post Ready Un-Ready Failed Futile +02/25/25 10:03:26 === === === === === === === === +02/25/25 10:03:26 1 0 0 0 1 2 0 0 +02/25/25 10:03:26 0 job proc(s) currently held +02/25/25 10:03:26 DAGMan Runtime Statistics: [EventCycleTimeAvg = 0.007261514663696289; EventCycleTimeCount = 24.0; EventCycleTimeMax = 0.1727619171142578; EventCycleTimeMin = 1.9073486328125E-05; EventCycleTimeStd = 0.03525156490816845; EventCycleTimeSum = 0.1742763519287109; LogProcessCycleTimeAvg = 0.0002812743186950684; LogProcessCycleTimeCount = 4.0; LogProcessCycleTimeMax = 0.0003252029418945312; LogProcessCycleTimeMin = 0.0002429485321044922; LogProcessCycleTimeStd = 4.006253973721482E-05; LogProcessCycleTimeSum = 0.001125097274780273; SleepCycleTimeAvg = 1.000992685556412; SleepCycleTimeCount = 24.0; SleepCycleTimeMax = 1.001121997833252; SleepCycleTimeMin = 1.000223875045776; SleepCycleTimeStd = 0.0002134073896281548; SleepCycleTimeSum = 24.02382445335388; SubmitCycleTimeAvg = 0.006900444030761719; SubmitCycleTimeCount = 25.0; SubmitCycleTimeMax = 0.1724209785461426; SubmitCycleTimeMin = 2.86102294921875E-06; SubmitCycleTimeStd = 0.03448344471009052; SubmitCycleTimeSum = 0.172511100769043; ] +02/25/25 10:03:27 Submitting HTCondor Node ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2 job(s)... 
+02/25/25 10:03:27 Submitting node ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2 from file jobs/label1/val1/ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2.sub using direct job submission +02/25/25 10:03:27 assigned HTCondor ID (9251.0.0) +02/25/25 10:03:27 Just submitted 1 job this cycle... +02/25/25 10:03:27 DAG status: 0 (DAG_STATUS_OK) +02/25/25 10:03:27 Of 4 nodes total: +02/25/25 10:03:27 Done Pre Queued Post Ready Un-Ready Failed Futile +02/25/25 10:03:27 === === === === === === === === +02/25/25 10:03:27 1 0 1 0 0 2 0 0 +02/25/25 10:03:27 0 job proc(s) currently held +02/25/25 10:03:27 DAGMan Runtime Statistics: [EventCycleTimeAvg = 0.006990776062011718; EventCycleTimeCount = 25.0; EventCycleTimeMax = 0.1727619171142578; EventCycleTimeMin = 1.9073486328125E-05; EventCycleTimeStd = 0.03453588392840239; EventCycleTimeSum = 0.174769401550293; LogProcessCycleTimeAvg = 0.0002812743186950684; LogProcessCycleTimeCount = 4.0; LogProcessCycleTimeMax = 0.0003252029418945312; LogProcessCycleTimeMin = 0.0002429485321044922; LogProcessCycleTimeStd = 4.006253973721482E-05; LogProcessCycleTimeSum = 0.001125097274780273; SleepCycleTimeAvg = 1.000996417999268; SleepCycleTimeCount = 25.0; SleepCycleTimeMax = 1.001121997833252; SleepCycleTimeMin = 1.000223875045776; SleepCycleTimeStd = 0.0002097459868836666; SleepCycleTimeSum = 25.02491044998169; SubmitCycleTimeAvg = 0.008469077257009653; SubmitCycleTimeCount = 26.0; SubmitCycleTimeMax = 0.1724209785461426; SubmitCycleTimeMin = 2.86102294921875E-06; SubmitCycleTimeStd = 0.03472059195739079; SubmitCycleTimeSum = 0.220196008682251; ] +02/25/25 10:03:28 Currently monitoring 1 HTCondor log file(s) +02/25/25 10:03:28 Reassigning the id of job ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2 from (9251.0.0) to (9251.0.0) +02/25/25 10:03:28 Event: ULOG_SUBMIT for HTCondor Node ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2 (9251.0.0) {02/25/25 10:03:27} +02/25/25 10:03:28 Number of idle job procs: 1 +02/25/25 10:03:29 Currently monitoring 1 HTCondor log file(s) +02/25/25 10:03:29 Event: ULOG_EXECUTE for HTCondor Node ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2 (9251.0.0) {02/25/25 10:03:29} +02/25/25 10:03:29 Number of idle job procs: 0 diff --git a/tests/data/tiny_running/tiny_running.dag.nodes.log b/tests/data/tiny_running/tiny_running.dag.nodes.log new file mode 100644 index 0000000..380eb00 --- /dev/null +++ b/tests/data/tiny_running/tiny_running.dag.nodes.log @@ -0,0 +1,47 @@ +000 (9249.000.000) 2025-02-25 10:03:02 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1764_8927> + DAG Node: provisioningJob +... +000 (9250.000.000) 2025-02-25 10:03:02 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1764_8927> + DAG Node: pipetaskInit +... +001 (9249.000.000) 2025-02-25 10:03:08 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=starter_2454_d09d_2> +... +001 (9250.000.000) 2025-02-25 10:03:12 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1764_8927> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_15083" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9250.000.000) 2025-02-25 10:03:26 Job terminated. 
+ (1) Normal termination (return value 0) + Usr 0 00:00:07, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:07, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 0 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 84 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 14 + TimeSlotBusy (s) : 14 + + Job terminated of its own accord at 2025-02-25T16:03:26Z with exit-code 0. +... +000 (9251.000.000) 2025-02-25 10:03:27 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1764_8927> + DAG Node: ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2 +... +001 (9251.000.000) 2025-02-25 10:03:29 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1764_8927> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_15125" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... diff --git a/tests/data/tiny_running/tiny_running.info.json b/tests/data/tiny_running/tiny_running.info.json new file mode 100644 index 0000000..83fc365 --- /dev/null +++ b/tests/data/tiny_running/tiny_running.info.json @@ -0,0 +1 @@ +{"test02": {"9248.0": {"ClusterId": 9248, "GlobalJobId": "test02#9248.0#1740499381", "bps_run": "u_testuser_tiny_20250225T160151Z", "bps_isjob": "True", "bps_payload": "tiny", "bps_project": "dev", "bps_runsite": "testpool", "bps_campaign": "quick", "bps_operator": "testuser", "bps_run_quanta": "label1:1;label2:1", "bps_job_summary": "pipetaskInit:1;label1:1;label2:1;finalJob:1", "bps_wms_service": "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorService", "bps_wms_workflow": "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorWorkflow", "bps_provisioning_job": "provisioningJob"}}} \ No newline at end of file diff --git a/tests/data/tiny_running/tiny_running.node_status b/tests/data/tiny_running/tiny_running.node_status new file mode 100644 index 0000000..8da0e58 --- /dev/null +++ b/tests/data/tiny_running/tiny_running.node_status @@ -0,0 +1,60 @@ +[ + Type = "DagStatus"; + DagFiles = { + "/work/testuser/submit/u/testuser/tiny/20250225T160151Z/u_testuser_tiny_20250225T160151Z.dag" + }; + Timestamp = 1740499442; /* "Tue Feb 25 10:04:02 2025" */ + DagStatus = 3; /* "STATUS_SUBMITTED ()" */ + NodesTotal = 4; + NodesDone = 1; + NodesPre = 0; + NodesQueued = 1; + NodesPost = 0; + NodesReady = 0; + NodesUnready = 2; + NodesFutile = 0; + NodesFailed = 0; + JobProcsHeld = 0; + JobProcsIdle = 0; /* includes held */ +] +[ + Type = "NodeStatus"; + Node = "pipetaskInit"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "ca27ea57-c014-44c1-838a-78c06bc3ec1b_label1_val1_val2"; + NodeStatus = 3; /* "STATUS_SUBMITTED" */ + StatusDetails = "not_idle"; + RetryCount = 0; + JobProcsQueued = 1; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "dbf919fa-5453-4b05-8806-ad6390fda0a3_label2_val1_val2"; + NodeStatus = 0; /* "STATUS_NOT_READY" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "finalJob"; + NodeStatus = 0; /* "STATUS_NOT_READY" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; 
+] +[ + Type = "StatusEnd"; + EndTime = 1740499442; /* "Tue Feb 25 10:04:02 2025" */ + NextUpdate = 1740499502; /* "Tue Feb 25 10:05:02 2025" */ +] diff --git a/tests/data/tiny_success/tiny_success.dag b/tests/data/tiny_success/tiny_success.dag new file mode 100644 index 0000000..6f6494c --- /dev/null +++ b/tests/data/tiny_success/tiny_success.dag @@ -0,0 +1,22 @@ +JOB pipetaskInit "jobs/pipetaskInit/pipetaskInit.sub" +JOB 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 "jobs/label1/val1/5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2.sub" +JOB 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 "jobs/label2/val1/0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2.sub" +PARENT pipetaskInit CHILD 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 +PARENT 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 CHILD 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 +DOT u_testuser_tiny_20250213T164427Z.dot +NODE_STATUS_FILE u_testuser_tiny_20250213T164427Z.node_status +SET_JOB_ATTR bps_isjob= "True" +SET_JOB_ATTR bps_project= "dev" +SET_JOB_ATTR bps_campaign= "quick" +SET_JOB_ATTR bps_run= "u_testuser_tiny_20250213T164427Z" +SET_JOB_ATTR bps_operator= "testuser" +SET_JOB_ATTR bps_payload= "tiny" +SET_JOB_ATTR bps_runsite= "testpool" +SET_JOB_ATTR bps_wms_service= "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorService" +SET_JOB_ATTR bps_wms_workflow= "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorWorkflow" +SET_JOB_ATTR bps_run_quanta= "label1:1;label2:1" +SET_JOB_ATTR bps_job_summary= "pipetaskInit:1;label1:1;label2:1;finalJob:1" +SET_JOB_ATTR bps_provisioning_job= "provisioningJob" +FINAL finalJob jobs/finalJob/finalJob.sub +SCRIPT POST finalJob /work/testuser/ctrl_bps_htcondor/python/lsst/ctrl/bps/htcondor/final_post.sh finalJob $DAG_STATUS $RETURN +SERVICE provisioningJob jobs/provisioningJob/provisioningJob.sub diff --git a/tests/data/tiny_success/tiny_success.dag.dagman.log b/tests/data/tiny_success/tiny_success.dag.dagman.log new file mode 100644 index 0000000..b2ba2c2 --- /dev/null +++ b/tests/data/tiny_success/tiny_success.dag.dagman.log @@ -0,0 +1,15 @@ +000 (9208.000.000) 2025-02-13 10:44:38 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> +... +001 (9208.000.000) 2025-02-13 10:44:38 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> +... +005 (9208.000.000) 2025-02-13 10:46:46 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 0 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job +... 
diff --git a/tests/data/tiny_success/tiny_success.dag.dagman.out b/tests/data/tiny_success/tiny_success.dag.dagman.out new file mode 100644 index 0000000..cacd800 --- /dev/null +++ b/tests/data/tiny_success/tiny_success.dag.dagman.out @@ -0,0 +1,543 @@ +02/13/25 10:44:38 Result of reading /etc/issue: \S + +02/13/25 10:44:38 Result of reading /etc/redhat-release: AlmaLinux release 9.5 (Teal Serval) + +02/13/25 10:44:38 Using IDs: 20 processors, 10 CPUs, 10 HTs +02/13/25 10:44:38 Enumerating interfaces: lo 127.0.0.1 up +02/13/25 10:44:38 Enumerating interfaces: enp11s0 10.0.0.33 up +02/13/25 10:44:38 Enumerating interfaces: lo ::1 up +02/13/25 10:44:38 Enumerating interfaces: enp11s0 2601:248:8500:b50::d234 up +02/13/25 10:44:38 Enumerating interfaces: enp11s0 2601:248:8500:b50:b696:91ff:fe06:4d05 up +02/13/25 10:44:38 Enumerating interfaces: enp11s0 fe80::b696:91ff:fe06:4d05 up +02/13/25 10:44:38 Directory::Rewind(): path "/work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/etc/condor/config.d" does not exist (yet) +02/13/25 10:44:38 Cannot open /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/etc/condor/config.d: No such file or directory +02/13/25 10:44:38 ****************************************************** +02/13/25 10:44:38 ** condor_scheduniv_exec.9208.0 (CONDOR_DAGMAN) STARTING UP +02/13/25 10:44:38 ** /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_dagman +02/13/25 10:44:38 ** SubsystemInfo: name=DAGMAN type=DAGMAN(9) class=CLIENT(2) +02/13/25 10:44:38 ** Configuration: subsystem:DAGMAN local: class:CLIENT +02/13/25 10:44:38 ** $CondorVersion: 23.0.3 2024-04-04 $ +02/13/25 10:44:38 ** $CondorPlatform: X86_64-CentOS_7.9 $ +02/13/25 10:44:38 ** PID = 55288 +02/13/25 10:44:38 ** Log last touched time unavailable (No such file or directory) +02/13/25 10:44:38 ****************************************************** +02/13/25 10:44:38 Using config source: /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/etc/condor/condor_config +02/13/25 10:44:38 Using local config sources: +02/13/25 10:44:38 /etc/condor/condor_config +02/13/25 10:44:38 /etc/condor/config.d/00-minicondor +02/13/25 10:44:38 /etc/condor/config.d/00-security +02/13/25 10:44:38 /etc/condor/config.d/10-stash-plugin.conf +02/13/25 10:44:38 /etc/condor/config.d/99-lsst +02/13/25 10:44:38 /etc/condor/condor_config.local +02/13/25 10:44:38 /home/testuser/.condor/user_config +02/13/25 10:44:38 config Macros = 101, Sorted = 101, StringBytes = 3187, TablesBytes = 3724 +02/13/25 10:44:38 CLASSAD_CACHING is ENABLED +02/13/25 10:44:38 Daemon Log is logging: D_ALWAYS:2 D_ERROR D_STATUS +02/13/25 10:44:38 Internal pipe for signals resized to 4096 from 65536 +02/13/25 10:44:38 DaemonCore: No command port requested. +02/13/25 10:44:38 Setting maximum accepts per cycle 8. +02/13/25 10:44:38 Setting maximum UDP messages per cycle 100. 
+02/13/25 10:44:38 Will use TCP to update collector <10.0.0.33:9618> +02/13/25 10:44:38 Not using shared port because no command port requested +02/13/25 10:44:38 DAGMAN_USE_STRICT setting: 1 +02/13/25 10:44:38 DAGMAN_VERBOSITY setting: 3 +02/13/25 10:44:38 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 +02/13/25 10:44:38 DAGMAN_DEBUG_CACHE_ENABLE setting: False +02/13/25 10:44:38 DAGMAN_SUBMIT_DELAY setting: 0 +02/13/25 10:44:38 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 +02/13/25 10:44:38 DAGMAN_STARTUP_CYCLE_DETECT setting: False +02/13/25 10:44:38 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 100 +02/13/25 10:44:38 DAGMAN_AGGRESSIVE_SUBMIT setting: False +02/13/25 10:44:38 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 1 +02/13/25 10:44:38 DAGMAN_QUEUE_UPDATE_INTERVAL setting: 30 +02/13/25 10:44:38 DAGMAN_DEFAULT_PRIORITY setting: 0 +02/13/25 10:44:38 DAGMAN_SUPPRESS_NOTIFICATION setting: True +02/13/25 10:44:38 allow_events (DAGMAN_ALLOW_EVENTS) setting: 114 +02/13/25 10:44:38 DAGMAN_RETRY_SUBMIT_FIRST setting: True +02/13/25 10:44:38 DAGMAN_RETRY_NODE_FIRST setting: False +02/13/25 10:44:38 DAGMAN_MAX_JOBS_IDLE setting: 1000 +02/13/25 10:44:38 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 +02/13/25 10:44:38 DAGMAN_MAX_PRE_SCRIPTS setting: 20 +02/13/25 10:44:38 DAGMAN_MAX_POST_SCRIPTS setting: 20 +02/13/25 10:44:38 DAGMAN_MAX_HOLD_SCRIPTS setting: 20 +02/13/25 10:44:38 DAGMAN_MUNGE_NODE_NAMES setting: True +02/13/25 10:44:38 DAGMAN_PROHIBIT_MULTI_JOBS setting: False +02/13/25 10:44:38 DAGMAN_SUBMIT_DEPTH_FIRST setting: False +02/13/25 10:44:38 DAGMAN_ALWAYS_RUN_POST setting: False +02/13/25 10:44:38 DAGMAN_CONDOR_SUBMIT_EXE setting: /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_submit +02/13/25 10:44:38 DAGMAN_USE_DIRECT_SUBMIT setting: True +02/13/25 10:44:38 DAGMAN_DEFAULT_APPEND_VARS setting: False +02/13/25 10:44:38 DAGMAN_ABORT_DUPLICATES setting: True +02/13/25 10:44:38 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True +02/13/25 10:44:38 DAGMAN_PENDING_REPORT_INTERVAL setting: 60 +02/13/25 10:44:38 DAGMAN_AUTO_RESCUE setting: True +02/13/25 10:44:38 DAGMAN_MAX_RESCUE_NUM setting: 100 +02/13/25 10:44:38 DAGMAN_WRITE_PARTIAL_RESCUE setting: True +02/13/25 10:44:38 DAGMAN_DEFAULT_NODE_LOG setting: @(DAG_DIR)/@(DAG_FILE).nodes.log +02/13/25 10:44:38 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True +02/13/25 10:44:38 DAGMAN_MAX_JOB_HOLDS setting: 100 +02/13/25 10:44:38 DAGMAN_HOLD_CLAIM_TIME setting: 20 +02/13/25 10:44:38 ALL_DEBUG setting: D_FULLDEBUG +02/13/25 10:44:38 DAGMAN_DEBUG setting: +02/13/25 10:44:38 DAGMAN_SUPPRESS_JOB_LOGS setting: False +02/13/25 10:44:38 DAGMAN_REMOVE_NODE_JOBS setting: True +02/13/25 10:44:38 DAGMAN will adjust edges after parsing +02/13/25 10:44:38 argv[0] == "condor_scheduniv_exec.9208.0" +02/13/25 10:44:38 argv[1] == "-Lockfile" +02/13/25 10:44:38 argv[2] == "/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.lock" +02/13/25 10:44:38 argv[3] == "-AutoRescue" +02/13/25 10:44:38 argv[4] == "1" +02/13/25 10:44:38 argv[5] == "-DoRescueFrom" +02/13/25 10:44:38 argv[6] == "0" +02/13/25 10:44:38 argv[7] == "-Dag" +02/13/25 10:44:38 argv[8] == "/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag" +02/13/25 10:44:38 argv[9] == "-Suppress_notification" +02/13/25 10:44:38 argv[10] == "-CsdVersion" +02/13/25 10:44:38 argv[11] == "$CondorVersion: 23.0.3 2024-04-04 $" +02/13/25 10:44:38 argv[12] == "-Dagman" +02/13/25 10:44:38 argv[13] == 
"/work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_dagman" +02/13/25 10:44:38 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:38 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:38 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:38 Workflow batch-id: <9208.0> +02/13/25 10:44:38 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:38 Workflow batch-name: +02/13/25 10:44:38 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:38 Workflow accounting_group: <> +02/13/25 10:44:38 Workflow accounting_group_user: <> +02/13/25 10:44:38 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:38 Warning: failed to get attribute DAGNodeName +02/13/25 10:44:38 DAGMAN_LOG_ON_NFS_IS_ERROR setting: False +02/13/25 10:44:38 Default node log file is: +02/13/25 10:44:38 DAG Lockfile will be written to /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.lock +02/13/25 10:44:38 DAG Input file is /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag +02/13/25 10:44:38 Parsing 1 dagfiles +02/13/25 10:44:38 Parsing /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag ... +02/13/25 10:44:38 TmpDir(0)::TmpDir() +02/13/25 10:44:38 TmpDir(1)::TmpDir() +02/13/25 10:44:38 TmpDir(1)::Cd2MainDir() +02/13/25 10:44:38 TmpDir(1)::~TmpDir() +02/13/25 10:44:38 TmpDir(2)::TmpDir() +02/13/25 10:44:38 TmpDir(2)::Cd2MainDir() +02/13/25 10:44:38 TmpDir(2)::~TmpDir() +02/13/25 10:44:38 TmpDir(3)::TmpDir() +02/13/25 10:44:38 TmpDir(3)::Cd2MainDir() +02/13/25 10:44:38 TmpDir(3)::~TmpDir() +02/13/25 10:44:38 TmpDir(4)::TmpDir() +02/13/25 10:44:38 TmpDir(4)::Cd2MainDir() +02/13/25 10:44:38 TmpDir(4)::~TmpDir() +02/13/25 10:44:38 TmpDir(5)::TmpDir() +02/13/25 10:44:38 TmpDir(5)::Cd2MainDir() +02/13/25 10:44:38 TmpDir(5)::~TmpDir() +02/13/25 10:44:38 TmpDir(0)::~TmpDir() +02/13/25 10:44:38 Adjusting edges +02/13/25 10:44:38 Dag contains 4 total jobs +02/13/25 10:44:38 Bootstrapping... 
+02/13/25 10:44:38 Number of pre-completed nodes: 0 +02/13/25 10:44:38 ReadMultipleUserLogs::monitorLogFile(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log, 1) +02/13/25 10:44:38 MultiLogFiles::InitializeFile(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log, 0) +02/13/25 10:44:38 ReadMultipleUserLogs: didn't find LogFileMonitor object for /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log (64770:484438042) +02/13/25 10:44:38 MultiLogFiles::InitializeFile(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log, 1) +02/13/25 10:44:38 MultiLogFiles: truncating log file /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log +02/13/25 10:44:38 ReadMultipleUserLogs: created LogFileMonitor object for log file /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log +02/13/25 10:44:38 init: Opening file /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log +02/13/25 10:44:38 Opening log file #0 '/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log' (is_lock_cur=false,seek=false,read_header=true) +02/13/25 10:44:38 Error, apparently invalid user log file +02/13/25 10:44:38 ReadMultipleUserLogs: added log file /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log (64770:484438042) to active list +02/13/25 10:44:38 Starting service node provisioningJob... +02/13/25 10:44:38 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:44:38 Of 4 nodes total: +02/13/25 10:44:38 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:44:38 === === === === === === === === +02/13/25 10:44:38 0 0 0 0 1 3 0 0 +02/13/25 10:44:38 0 job proc(s) currently held +02/13/25 10:44:38 DAGMan Runtime Statistics: [ EventCycleTimeCount = 0.0; EventCycleTimeSum = 0.0; LogProcessCycleTimeCount = 0.0; LogProcessCycleTimeSum = 0.0; SleepCycleTimeCount = 0.0; SleepCycleTimeSum = 0.0; SubmitCycleTimeCount = 0.0; SubmitCycleTimeSum = 0.0; ] +02/13/25 10:44:38 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:38 Registering condor_event_timer... +02/13/25 10:44:39 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:39 Submitting HTCondor Node provisioningJob job(s)... +02/13/25 10:44:39 TmpDir(6)::TmpDir() +02/13/25 10:44:39 TmpDir(6)::Cd2TmpDir() +02/13/25 10:44:39 Submitting node provisioningJob from file jobs/provisioningJob/provisioningJob.sub using direct job submission +02/13/25 10:44:39 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:39 TmpDir(6)::Cd2MainDir() +02/13/25 10:44:39 TmpDir(6)::~TmpDir() +02/13/25 10:44:39 assigned HTCondor ID (9209.0.0) +02/13/25 10:44:39 Submitting HTCondor Node pipetaskInit job(s)... 
+02/13/25 10:44:39 TmpDir(7)::TmpDir() +02/13/25 10:44:39 TmpDir(7)::Cd2TmpDir() +02/13/25 10:44:39 Submitting node pipetaskInit from file jobs/pipetaskInit/pipetaskInit.sub using direct job submission +02/13/25 10:44:39 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:39 TmpDir(7)::Cd2MainDir() +02/13/25 10:44:39 TmpDir(7)::~TmpDir() +02/13/25 10:44:39 assigned HTCondor ID (9210.0.0) +02/13/25 10:44:39 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:39 Just submitted 2 jobs this cycle... +02/13/25 10:44:39 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:44:39 Of 4 nodes total: +02/13/25 10:44:39 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:44:39 === === === === === === === === +02/13/25 10:44:39 0 0 1 0 0 3 0 0 +02/13/25 10:44:39 0 job proc(s) currently held +02/13/25 10:44:39 DAGMan Runtime Statistics: [ EventCycleTimeCount = 0.0; EventCycleTimeSum = 0.0; LogProcessCycleTimeCount = 0.0; LogProcessCycleTimeSum = 0.0; SleepCycleTimeCount = 0.0; SleepCycleTimeSum = 0.0; SubmitCycleTimeAvg = 0.2648820877075195; SubmitCycleTimeCount = 1.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 0.2648820877075195; SubmitCycleTimeStd = 0.2648820877075195; SubmitCycleTimeSum = 0.2648820877075195; ] +02/13/25 10:44:40 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:40 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:44:40 ReadMultipleUserLogs::readEvent() +02/13/25 10:44:40 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:44:40 Reassigning the id of job provisioningJob from (9209.0.0) to (9209.0.0) +02/13/25 10:44:40 Event: ULOG_SUBMIT for HTCondor Node provisioningJob (9209.0.0) {02/13/25 10:44:39} +02/13/25 10:44:40 Number of idle job procs: 1 +02/13/25 10:44:40 ReadMultipleUserLogs::readEvent() +02/13/25 10:44:40 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:44:40 Reassigning the id of job pipetaskInit from (9210.0.0) to (9210.0.0) +02/13/25 10:44:40 Event: ULOG_SUBMIT for HTCondor Node pipetaskInit (9210.0.0) {02/13/25 10:44:39} +02/13/25 10:44:40 Number of idle job procs: 2 +02/13/25 10:44:40 ReadMultipleUserLogs::readEvent() +02/13/25 10:44:40 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:44:41 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:42 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:43 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:44 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:44 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:44:44 ReadMultipleUserLogs::readEvent() +02/13/25 10:44:44 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:44:44 Event: ULOG_EXECUTE for HTCondor Node provisioningJob (9209.0.0) {02/13/25 10:44:43} +02/13/25 10:44:44 Number of idle job procs: 1 +02/13/25 10:44:44 ReadMultipleUserLogs::readEvent() +02/13/25 10:44:44 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:44:45 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:46 
ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:46 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:44:46 ReadMultipleUserLogs::readEvent() +02/13/25 10:44:46 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:44:46 Event: ULOG_EXECUTE for HTCondor Node pipetaskInit (9210.0.0) {02/13/25 10:44:46} +02/13/25 10:44:46 Number of idle job procs: 0 +02/13/25 10:44:46 ReadMultipleUserLogs::readEvent() +02/13/25 10:44:46 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:44:47 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:48 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:49 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:50 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:51 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:52 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:53 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:54 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:55 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:56 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:57 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:58 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:58 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:44:58 ReadMultipleUserLogs::readEvent() +02/13/25 10:44:58 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:44:58 Event: ULOG_JOB_TERMINATED for HTCondor Node pipetaskInit (9210.0.0) {02/13/25 10:44:58} +02/13/25 10:44:58 Number of idle job procs: 0 +02/13/25 10:44:58 Node pipetaskInit job proc (9210.0.0) completed successfully. 
+02/13/25 10:44:58 Node pipetaskInit job completed +02/13/25 10:44:58 ReadMultipleUserLogs::readEvent() +02/13/25 10:44:58 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:44:58 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:44:58 Of 4 nodes total: +02/13/25 10:44:58 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:44:58 === === === === === === === === +02/13/25 10:44:58 1 0 0 0 1 2 0 0 +02/13/25 10:44:58 0 job proc(s) currently held +02/13/25 10:44:58 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.0141294002532959; EventCycleTimeCount = 19.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.911422729492188E-05; EventCycleTimeStd = 0.06104268809134626; EventCycleTimeSum = 0.2684586048126221; LogProcessCycleTimeAvg = 0.0002171993255615234; LogProcessCycleTimeCount = 4.0; LogProcessCycleTimeMax = 0.0003049373626708984; LogProcessCycleTimeMin = 0.0001349449157714844; LogProcessCycleTimeStd = 6.95013672380173E-05; LogProcessCycleTimeSum = 0.0008687973022460938; SleepCycleTimeAvg = 1.001078329588238; SleepCycleTimeCount = 19.0; SleepCycleTimeMax = 1.001111030578613; SleepCycleTimeMin = 1.001055002212524; SleepCycleTimeStd = 1.665696562452007E-05; SleepCycleTimeSum = 19.02048826217651; SubmitCycleTimeAvg = 0.01326546669006348; SubmitCycleTimeCount = 20.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.05922440914104214; SubmitCycleTimeSum = 0.2653093338012695; ] +02/13/25 10:44:59 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:44:59 Submitting HTCondor Node 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 job(s)... +02/13/25 10:44:59 TmpDir(8)::TmpDir() +02/13/25 10:44:59 TmpDir(8)::Cd2TmpDir() +02/13/25 10:44:59 Submitting node 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 from file jobs/label1/val1/5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2.sub using direct job submission +02/13/25 10:44:59 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:59 TmpDir(8)::Cd2MainDir() +02/13/25 10:44:59 TmpDir(8)::~TmpDir() +02/13/25 10:44:59 assigned HTCondor ID (9211.0.0) +02/13/25 10:44:59 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:44:59 Just submitted 1 job this cycle... 
+02/13/25 10:44:59 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:44:59 Of 4 nodes total: +02/13/25 10:44:59 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:44:59 === === === === === === === === +02/13/25 10:44:59 1 0 1 0 0 2 0 0 +02/13/25 10:44:59 0 job proc(s) currently held +02/13/25 10:44:59 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.01344383955001831; EventCycleTimeCount = 20.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.911422729492188E-05; EventCycleTimeStd = 0.0594936411334977; EventCycleTimeSum = 0.2688767910003662; LogProcessCycleTimeAvg = 0.0002171993255615234; LogProcessCycleTimeCount = 4.0; LogProcessCycleTimeMax = 0.0003049373626708984; LogProcessCycleTimeMin = 0.0001349449157714844; LogProcessCycleTimeStd = 6.95013672380173E-05; LogProcessCycleTimeSum = 0.0008687973022460938; SleepCycleTimeAvg = 1.001078510284424; SleepCycleTimeCount = 20.0; SleepCycleTimeMax = 1.001111030578613; SleepCycleTimeMin = 1.001055002212524; SleepCycleTimeStd = 1.623282435596784E-05; SleepCycleTimeSum = 20.02157020568848; SubmitCycleTimeAvg = 0.01515463420322963; SubmitCycleTimeCount = 21.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.05837038750898548; SubmitCycleTimeSum = 0.3182473182678223; ] +02/13/25 10:45:00 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:00 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:45:00 ReadMultipleUserLogs::readEvent() +02/13/25 10:45:00 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:45:00 Reassigning the id of job 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 from (9211.0.0) to (9211.0.0) +02/13/25 10:45:00 Event: ULOG_SUBMIT for HTCondor Node 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 (9211.0.0) {02/13/25 10:44:59} +02/13/25 10:45:00 Number of idle job procs: 1 +02/13/25 10:45:00 ReadMultipleUserLogs::readEvent() +02/13/25 10:45:00 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:45:00 Event: ULOG_EXECUTE for HTCondor Node 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 (9211.0.0) {02/13/25 10:45:00} +02/13/25 10:45:00 Number of idle job procs: 0 +02/13/25 10:45:00 ReadMultipleUserLogs::readEvent() +02/13/25 10:45:00 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:45:01 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:02 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:03 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:04 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:05 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:06 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:07 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:08 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:09 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:09 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:45:10 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:11 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:12 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:13 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:14 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:15 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:16 
ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:17 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:18 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:19 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:20 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:21 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:22 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:23 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:24 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:25 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:26 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:27 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:28 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:29 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:30 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:31 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:32 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:33 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:34 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:35 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:36 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:37 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:37 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:45:37 ReadMultipleUserLogs::readEvent() +02/13/25 10:45:37 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:45:37 Event: ULOG_JOB_TERMINATED for HTCondor Node 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 (9211.0.0) {02/13/25 10:45:37} +02/13/25 10:45:37 Number of idle job procs: 0 +02/13/25 10:45:37 Node 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 job proc (9211.0.0) completed successfully. 
+02/13/25 10:45:37 Node 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 job completed +02/13/25 10:45:37 ReadMultipleUserLogs::readEvent() +02/13/25 10:45:37 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:45:37 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:45:37 Of 4 nodes total: +02/13/25 10:45:37 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:45:37 === === === === === === === === +02/13/25 10:45:37 2 0 0 0 1 1 0 0 +02/13/25 10:45:37 0 job proc(s) currently held +02/13/25 10:45:37 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.005758433506406587; EventCycleTimeCount = 58.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.506111145019531E-05; EventCycleTimeStd = 0.03550222505586587; EventCycleTimeSum = 0.333989143371582; LogProcessCycleTimeAvg = 0.0002189874649047852; LogProcessCycleTimeCount = 6.0; LogProcessCycleTimeMax = 0.0003049373626708984; LogProcessCycleTimeMin = 0.0001349449157714844; LogProcessCycleTimeStd = 5.406600618247893E-05; LogProcessCycleTimeSum = 0.001313924789428711; SleepCycleTimeAvg = 1.001046308155717; SleepCycleTimeCount = 58.0; SleepCycleTimeMax = 1.00111985206604; SleepCycleTimeMin = 1.000271081924438; SleepCycleTimeStd = 0.0001221654847513134; SleepCycleTimeSum = 58.06068587303162; SubmitCycleTimeAvg = 0.005406969684665486; SubmitCycleTimeCount = 59.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.03504679087086938; SubmitCycleTimeSum = 0.3190112113952637; ] +02/13/25 10:45:38 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:38 Submitting HTCondor Node 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 job(s)... +02/13/25 10:45:38 TmpDir(9)::TmpDir() +02/13/25 10:45:38 TmpDir(9)::Cd2TmpDir() +02/13/25 10:45:38 Submitting node 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 from file jobs/label2/val1/0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2.sub using direct job submission +02/13/25 10:45:38 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:45:38 TmpDir(9)::Cd2MainDir() +02/13/25 10:45:38 TmpDir(9)::~TmpDir() +02/13/25 10:45:38 assigned HTCondor ID (9212.0.0) +02/13/25 10:45:38 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:45:38 Just submitted 1 job this cycle... 
+02/13/25 10:45:38 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:45:38 Of 4 nodes total: +02/13/25 10:45:38 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:45:38 === === === === === === === === +02/13/25 10:45:38 2 0 1 0 0 1 0 0 +02/13/25 10:45:38 0 job proc(s) currently held +02/13/25 10:45:38 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.005667969331902973; EventCycleTimeCount = 59.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.506111145019531E-05; EventCycleTimeStd = 0.03520169959002112; EventCycleTimeSum = 0.3344101905822754; LogProcessCycleTimeAvg = 0.0002189874649047852; LogProcessCycleTimeCount = 6.0; LogProcessCycleTimeMax = 0.0003049373626708984; LogProcessCycleTimeMin = 0.0001349449157714844; LogProcessCycleTimeStd = 5.406600618247893E-05; LogProcessCycleTimeSum = 0.001313924789428711; SleepCycleTimeAvg = 1.001047607195579; SleepCycleTimeCount = 59.0; SleepCycleTimeMax = 1.001122951507568; SleepCycleTimeMin = 1.000271081924438; SleepCycleTimeStd = 0.0001215181099412882; SleepCycleTimeSum = 59.06180882453918; SubmitCycleTimeAvg = 0.00615383783976237; SubmitCycleTimeCount = 60.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.03522680810472513; SubmitCycleTimeSum = 0.3692302703857422; ] +02/13/25 10:45:39 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:39 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:45:39 ReadMultipleUserLogs::readEvent() +02/13/25 10:45:39 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:45:39 Reassigning the id of job 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 from (9212.0.0) to (9212.0.0) +02/13/25 10:45:39 Event: ULOG_SUBMIT for HTCondor Node 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 (9212.0.0) {02/13/25 10:45:38} +02/13/25 10:45:39 Number of idle job procs: 1 +02/13/25 10:45:39 ReadMultipleUserLogs::readEvent() +02/13/25 10:45:39 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:45:39 Event: ULOG_EXECUTE for HTCondor Node 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 (9212.0.0) {02/13/25 10:45:39} +02/13/25 10:45:39 Number of idle job procs: 0 +02/13/25 10:45:39 ReadMultipleUserLogs::readEvent() +02/13/25 10:45:39 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:45:39 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:45:40 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:41 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:42 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:43 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:44 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:45 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:46 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:47 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:48 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:49 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:50 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:51 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:52 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:53 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:54 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:55 
ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:56 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:57 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:58 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:45:59 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:00 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:01 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:02 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:03 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:04 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:05 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:06 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:07 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:08 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:09 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:09 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:46:10 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:11 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:12 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:13 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:14 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:15 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:16 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:17 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:18 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:19 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:20 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:21 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:22 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:23 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:24 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:25 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:26 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:26 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:46:26 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:26 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:26 Event: ULOG_JOB_TERMINATED for HTCondor Node 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 (9212.0.0) {02/13/25 10:46:26} +02/13/25 10:46:26 Number of idle job procs: 0 +02/13/25 10:46:26 Node 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 job proc (9212.0.0) completed successfully. 
+02/13/25 10:46:26 Node 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 job completed +02/13/25 10:46:26 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:26 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:26 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:46:26 Of 4 nodes total: +02/13/25 10:46:26 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:46:26 === === === === === === === === +02/13/25 10:46:26 3 0 0 0 0 1 0 0 +02/13/25 10:46:26 0 job proc(s) currently held +02/13/25 10:46:26 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.003787789389351818; EventCycleTimeCount = 107.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.506111145019531E-05; EventCycleTimeStd = 0.02658425213470169; EventCycleTimeSum = 0.4052934646606445; LogProcessCycleTimeAvg = 0.0002273917198181152; LogProcessCycleTimeCount = 8.0; LogProcessCycleTimeMax = 0.0003049373626708984; LogProcessCycleTimeMin = 0.0001349449157714844; LogProcessCycleTimeStd = 4.888316009475682E-05; LogProcessCycleTimeSum = 0.001819133758544922; SleepCycleTimeAvg = 1.00101508826853; SleepCycleTimeCount = 107.0; SleepCycleTimeMax = 1.001122951507568; SleepCycleTimeMin = 1.000068187713623; SleepCycleTimeStd = 0.0001817913634841936; SleepCycleTimeSum = 107.1086144447327; SubmitCycleTimeAvg = 0.003428200880686442; SubmitCycleTimeCount = 108.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.0263367285721693; SubmitCycleTimeSum = 0.3702456951141357; ] +02/13/25 10:46:26 Starting final node... +02/13/25 10:46:27 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:27 Submitting HTCondor Node finalJob job(s)... +02/13/25 10:46:27 TmpDir(10)::TmpDir() +02/13/25 10:46:27 TmpDir(10)::Cd2TmpDir() +02/13/25 10:46:27 Submitting node finalJob from file jobs/finalJob/finalJob.sub using direct job submission +02/13/25 10:46:27 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:46:28 Submit warning: Submit:0:the line 'concurrency_limit = db_limit' was unused by DAGMAN. Is it a typo? +02/13/25 10:46:28 TmpDir(10)::Cd2MainDir() +02/13/25 10:46:28 TmpDir(10)::~TmpDir() +02/13/25 10:46:28 assigned HTCondor ID (9213.0.0) +02/13/25 10:46:28 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:46:28 Just submitted 1 job this cycle... 
+02/13/25 10:46:28 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:46:28 Of 4 nodes total: +02/13/25 10:46:28 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:46:28 === === === === === === === === +02/13/25 10:46:28 3 0 1 0 0 0 0 0 +02/13/25 10:46:28 0 job proc(s) currently held +02/13/25 10:46:28 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.003787789389351818; EventCycleTimeCount = 107.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.506111145019531E-05; EventCycleTimeStd = 0.02658425213470169; EventCycleTimeSum = 0.4052934646606445; LogProcessCycleTimeAvg = 0.0002273917198181152; LogProcessCycleTimeCount = 8.0; LogProcessCycleTimeMax = 0.0003049373626708984; LogProcessCycleTimeMin = 0.0001349449157714844; LogProcessCycleTimeStd = 4.888316009475682E-05; LogProcessCycleTimeSum = 0.001819133758544922; SleepCycleTimeAvg = 1.010289309201417; SleepCycleTimeCount = 108.0; SleepCycleTimeMax = 2.002630949020386; SleepCycleTimeMin = 1.000068187713623; SleepCycleTimeStd = 0.09638070098176346; SleepCycleTimeSum = 109.1112453937531; SubmitCycleTimeAvg = 0.003930842110870082; SubmitCycleTimeCount = 109.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.02673461218469543; SubmitCycleTimeSum = 0.4284617900848389; ] +02/13/25 10:46:29 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:29 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:46:29 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:29 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:29 Reassigning the id of job finalJob from (9213.0.0) to (9213.0.0) +02/13/25 10:46:29 Event: ULOG_SUBMIT for HTCondor Node finalJob (9213.0.0) {02/13/25 10:46:27} +02/13/25 10:46:29 Number of idle job procs: 1 +02/13/25 10:46:29 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:29 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:30 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:31 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:32 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:32 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:46:32 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:32 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:32 Event: ULOG_EXECUTE for HTCondor Node finalJob (9213.0.0) {02/13/25 10:46:31} +02/13/25 10:46:32 Number of idle job procs: 0 +02/13/25 10:46:32 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:32 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:33 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:34 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:35 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:36 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:37 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:38 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:39 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:40 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:40 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:46:41 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:42 
ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:43 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:44 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:45 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:45 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:46:45 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:45 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:45 Event: ULOG_JOB_TERMINATED for HTCondor Node finalJob (9213.0.0) {02/13/25 10:46:44} +02/13/25 10:46:45 Number of idle job procs: 0 +02/13/25 10:46:45 Node finalJob job proc (9213.0.0) completed successfully. +02/13/25 10:46:45 Node finalJob job completed +02/13/25 10:46:45 Running POST script of Node finalJob... +02/13/25 10:46:45 TmpDir(11)::TmpDir() +02/13/25 10:46:45 TmpDir(11)::Cd2TmpDir() +02/13/25 10:46:45 Warning: mysin has length 0 (ignore if produced by DAGMan; see gittrac #4987, #5031) +02/13/25 10:46:45 TmpDir(11)::Cd2MainDir() +02/13/25 10:46:45 TmpDir(11)::~TmpDir() +02/13/25 10:46:45 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:45 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:45 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:46:45 Of 4 nodes total: +02/13/25 10:46:45 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:46:45 === === === === === === === === +02/13/25 10:46:45 3 0 0 1 0 0 0 0 +02/13/25 10:46:45 0 job proc(s) currently held +02/13/25 10:46:45 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.003824610863962481; EventCycleTimeCount = 124.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.506111145019531E-05; EventCycleTimeStd = 0.02519980999250643; EventCycleTimeSum = 0.4742517471313477; LogProcessCycleTimeAvg = 0.0009112791581587358; LogProcessCycleTimeCount = 11.0; LogProcessCycleTimeMax = 0.007851839065551758; LogProcessCycleTimeMin = 0.0001349449157714844; LogProcessCycleTimeStd = 0.002302391251811755; LogProcessCycleTimeSum = 0.01002407073974609; SleepCycleTimeAvg = 1.009027576446533; SleepCycleTimeCount = 125.0; SleepCycleTimeMax = 2.002630949020386; SleepCycleTimeMin = 1.000068187713623; SleepCycleTimeStd = 0.08958746263386831; SleepCycleTimeSum = 126.1284470558167; SubmitCycleTimeAvg = 0.003402849984547449; SubmitCycleTimeCount = 126.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.02488647568143831; SubmitCycleTimeSum = 0.4287590980529785; ] +02/13/25 10:46:45 Initializing user log writer for /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log, (9213.0.0) +02/13/25 10:46:45 WriteUserLog::initialize: opened /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log successfully +02/13/25 10:46:45 WriteUserLog::user_priv_flag (~) is 0 +02/13/25 10:46:46 ReadMultipleUserLogs::GetLogStatus() +02/13/25 10:46:46 Currently monitoring 1 HTCondor log file(s) +02/13/25 10:46:46 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:46 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:46 Event: ULOG_POST_SCRIPT_TERMINATED for HTCondor Node finalJob (9213.0.0) {02/13/25 10:46:45} +02/13/25 10:46:46 POST Script of node finalJob completed successfully. 
+02/13/25 10:46:46 ReadMultipleUserLogs::readEvent() +02/13/25 10:46:46 ReadMultipleUserLogs::readEventFromLog(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:46 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:46:46 Of 4 nodes total: +02/13/25 10:46:46 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:46:46 === === === === === === === === +02/13/25 10:46:46 4 0 0 0 0 0 0 0 +02/13/25 10:46:46 0 job proc(s) currently held +02/13/25 10:46:46 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.003858310699462891; EventCycleTimeCount = 125.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.506111145019531E-05; EventCycleTimeStd = 0.02510082011465964; EventCycleTimeSum = 0.4822888374328613; LogProcessCycleTimeAvg = 0.0008463462193806967; LogProcessCycleTimeCount = 12.0; LogProcessCycleTimeMax = 0.007851839065551758; LogProcessCycleTimeMin = 0.0001320838928222656; LogProcessCycleTimeStd = 0.002206737711218562; LogProcessCycleTimeSum = 0.01015615463256836; SleepCycleTimeAvg = 1.009002619319492; SleepCycleTimeCount = 126.0; SleepCycleTimeMax = 2.002630949020386; SleepCycleTimeMin = 1.000068187713623; SleepCycleTimeStd = 0.08922883297240598; SleepCycleTimeSum = 127.134330034256; SubmitCycleTimeAvg = 0.003376112209530327; SubmitCycleTimeCount = 127.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.0247893544656815; SubmitCycleTimeSum = 0.4287662506103516; ] +02/13/25 10:46:46 Executing: /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_rm 9209 -const DAGManJobId==9208 -reason Removed' 'by' 'DAGMan +02/13/25 10:46:46 Running: /work/lsst_stack/w_2025_07/conda/envs/lsst-scipipe-9.0.0/bin/condor_rm 9209 -const DAGManJobId==9208 -reason Removed' 'by' 'DAGMan +02/13/25 10:46:46 Warning checking HTCondor job events: BAD EVENT: job (9209.0.0) ended, total end count != 1 (0) +02/13/25 10:46:46 All jobs Completed! 
+02/13/25 10:46:46 Note: 0 total job deferrals because of -MaxJobs limit (0) +02/13/25 10:46:46 Note: 0 total job deferrals because of -MaxIdle limit (1000) +02/13/25 10:46:46 Note: 0 total job deferrals because of node category throttles +02/13/25 10:46:46 Note: 0 total PRE script deferrals because of -MaxPre limit (20) or DEFER +02/13/25 10:46:46 Note: 0 total POST script deferrals because of -MaxPost limit (20) or DEFER +02/13/25 10:46:46 Note: 0 total HOLD script deferrals because of -MaxHold limit (20) or DEFER +02/13/25 10:46:46 DAG status: 0 (DAG_STATUS_OK) +02/13/25 10:46:46 Of 4 nodes total: +02/13/25 10:46:46 Done Pre Queued Post Ready Un-Ready Failed Futile +02/13/25 10:46:46 === === === === === === === === +02/13/25 10:46:46 4 0 0 0 0 0 0 0 +02/13/25 10:46:46 0 job proc(s) currently held +02/13/25 10:46:46 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.003858310699462891; EventCycleTimeCount = 125.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.506111145019531E-05; EventCycleTimeStd = 0.02510082011465964; EventCycleTimeSum = 0.4822888374328613; LogProcessCycleTimeAvg = 0.0008463462193806967; LogProcessCycleTimeCount = 12.0; LogProcessCycleTimeMax = 0.007851839065551758; LogProcessCycleTimeMin = 0.0001320838928222656; LogProcessCycleTimeStd = 0.002206737711218562; LogProcessCycleTimeSum = 0.01015615463256836; SleepCycleTimeAvg = 1.009002619319492; SleepCycleTimeCount = 126.0; SleepCycleTimeMax = 2.002630949020386; SleepCycleTimeMin = 1.000068187713623; SleepCycleTimeStd = 0.08922883297240598; SleepCycleTimeSum = 127.134330034256; SubmitCycleTimeAvg = 0.003376112209530327; SubmitCycleTimeCount = 127.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.0247893544656815; SubmitCycleTimeSum = 0.4287662506103516; ] +02/13/25 10:46:46 SharedPortClient: sent connection request to local schedd for shared port id schedd_1499_c48e +02/13/25 10:46:46 Wrote metrics file /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.metrics. 
+02/13/25 10:46:46 DAGMan Runtime Statistics: [ EventCycleTimeAvg = 0.003858310699462891; EventCycleTimeCount = 125.0; EventCycleTimeMax = 0.2662038803100586; EventCycleTimeMin = 4.506111145019531E-05; EventCycleTimeStd = 0.02510082011465964; EventCycleTimeSum = 0.4822888374328613; LogProcessCycleTimeAvg = 0.0008463462193806967; LogProcessCycleTimeCount = 12.0; LogProcessCycleTimeMax = 0.007851839065551758; LogProcessCycleTimeMin = 0.0001320838928222656; LogProcessCycleTimeStd = 0.002206737711218562; LogProcessCycleTimeSum = 0.01015615463256836; SleepCycleTimeAvg = 1.009002619319492; SleepCycleTimeCount = 126.0; SleepCycleTimeMax = 2.002630949020386; SleepCycleTimeMin = 1.000068187713623; SleepCycleTimeStd = 0.08922883297240598; SleepCycleTimeSum = 127.134330034256; SubmitCycleTimeAvg = 0.003376112209530327; SubmitCycleTimeCount = 127.0; SubmitCycleTimeMax = 0.2648820877075195; SubmitCycleTimeMin = 5.006790161132812E-06; SubmitCycleTimeStd = 0.0247893544656815; SubmitCycleTimeSum = 0.4287662506103516; ] +02/13/25 10:46:46 ReadMultipleUserLogs::unmonitorLogFile(/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log) +02/13/25 10:46:46 ReadMultipleUserLogs: found LogFileMonitor object for /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log (64770:484438042) +02/13/25 10:46:46 Closing file +02/13/25 10:46:46 ReadMultipleUserLogs: removed log file /work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag.nodes.log (64770:484438042) from active list +02/13/25 10:46:46 **** condor_scheduniv_exec.9208.0 (condor_DAGMAN) pid 55288 EXITING WITH STATUS 0 diff --git a/tests/data/tiny_success/tiny_success.dag.metrics b/tests/data/tiny_success/tiny_success.dag.metrics new file mode 100644 index 0000000..48c0ab3 --- /dev/null +++ b/tests/data/tiny_success/tiny_success.dag.metrics @@ -0,0 +1,25 @@ +{ + "client":"condor_dagman", + "version":"23.0.3", + "planner":"", + "planner_version":"", + "type":"metrics", + "wf_uuid":"", + "root_wf_uuid":"", + "start_time":1739465078.420, + "end_time":1739465206.297, + "duration":127.877, + "exitcode":0, + "dagman_id":"9208", + "parent_dagman_id":"", + "rescue_dag_number":0, + "jobs":4, + "jobs_failed":0, + "jobs_succeeded":5, + "dag_jobs":0, + "dag_jobs_failed":0, + "dag_jobs_succeeded":0, + "total_jobs":4, + "total_jobs_run":5, + "DagStatus":0 +} diff --git a/tests/data/tiny_success/tiny_success.dag.nodes.log b/tests/data/tiny_success/tiny_success.dag.nodes.log new file mode 100644 index 0000000..6ea6bb7 --- /dev/null +++ b/tests/data/tiny_success/tiny_success.dag.nodes.log @@ -0,0 +1,143 @@ +000 (9209.000.000) 2025-02-13 10:44:39 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: provisioningJob +... +000 (9210.000.000) 2025-02-13 10:44:39 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: pipetaskInit +... +001 (9209.000.000) 2025-02-13 10:44:43 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=starter_2644_afe5_7> +... 
+001 (9210.000.000) 2025-02-13 10:44:46 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1499_c48e> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_55311" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9210.000.000) 2025-02-13 10:44:58 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:05, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:05, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 0 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 83 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 11 + TimeSlotBusy (s) : 11 + + Job terminated of its own accord at 2025-02-13T16:44:57Z with exit-code 0. +... +000 (9211.000.000) 2025-02-13 10:44:59 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: 5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2 +... +001 (9211.000.000) 2025-02-13 10:45:00 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1499_c48e> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_55353" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9211.000.000) 2025-02-13 10:45:37 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:27, Sys 0 00:00:03 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:27, Sys 0 00:00:03 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 11666 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 11666 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 95 1 2048 + GPUs : 0 + Memory (MB) : 949 2048 2048 + TimeExecute (s) : 37 + TimeSlotBusy (s) : 38 + + Job terminated of its own accord at 2025-02-13T16:45:37Z with exit-code 0. +... +000 (9212.000.000) 2025-02-13 10:45:38 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: 0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2 +... +001 (9212.000.000) 2025-02-13 10:45:39 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1499_c48e> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_55483" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9212.000.000) 2025-02-13 10:46:26 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:40, Sys 0 00:00:01 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:40, Sys 0 00:00:01 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 22802 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 22802 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 106 1 2048 + GPUs : 0 + Memory (MB) : 356 2048 2048 + TimeExecute (s) : 47 + TimeSlotBusy (s) : 47 + + Job terminated of its own accord at 2025-02-13T16:46:26Z with exit-code 0. +... 
+000 (9213.000.000) 2025-02-13 10:46:27 Job submitted from host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=schedd_1499_c48e> + DAG Node: finalJob +... +001 (9213.000.000) 2025-02-13 10:46:31 Job executing on host: <10.0.0.33:9618?addrs=10.0.0.33-9618+[2601-248-8500-b50--d234]-9618&alias=test02&noUDP&sock=startd_1499_c48e> + SlotName: slot1_1@test02 + CondorScratchDir = "/var/lib/condor/execute/dir_55755" + Cpus = 1 + Disk = 2048 + GPUs = 0 + Memory = 2048 +... +005 (9213.000.000) 2025-02-13 10:46:44 Job terminated. + (1) Normal termination (return value 0) + Usr 0 00:00:05, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:05, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 6256 - Run Bytes Sent By Job + 214 - Run Bytes Received By Job + 6256 - Total Bytes Sent By Job + 214 - Total Bytes Received By Job + Partitionable Resources : Usage Request Allocated + Cpus : 0 1 1 + Disk (KB) : 89 1 2048 + GPUs : 0 + Memory (MB) : 0 2048 2048 + TimeExecute (s) : 13 + TimeSlotBusy (s) : 13 + + Job terminated of its own accord at 2025-02-13T16:46:44Z with exit-code 0. +... +016 (9213.000.000) 2025-02-13 10:46:45 POST Script terminated. + (1) Normal termination (return value 0) + DAG Node: finalJob +... +004 (9209.000.000) 2025-02-13 10:46:46 Job was evicted. + (0) CPU times + Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job +... +009 (9209.000.000) 2025-02-13 10:46:46 Job was aborted. + removed because fired when job (9208.0) was removed +... diff --git a/tests/data/tiny_success/tiny_success.info.json b/tests/data/tiny_success/tiny_success.info.json new file mode 100644 index 0000000..27d5deb --- /dev/null +++ b/tests/data/tiny_success/tiny_success.info.json @@ -0,0 +1 @@ +{"test02": {"9208.0": {"ClusterId": 9208, "GlobalJobId": "test02#9208.0#1739465078", "bps_wms_service": "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorService", "bps_project": "dev", "bps_payload": "tiny", "bps_operator": "testuser", "bps_wms_workflow": "lsst.ctrl.bps.htcondor.htcondor_service.HTCondorWorkflow", "bps_provisioning_job": "provisioningJob", "bps_run_quanta": "label1:1;label2:1", "bps_campaign": "quick", "bps_runsite": "testpool", "bps_job_summary": "pipetaskInit:1;label1:1;label2:1;finalJob:1", "bps_run": "u_testuser_tiny_20250213T164427Z", "bps_isjob": "True"}}} diff --git a/tests/data/tiny_success/tiny_success.node_status b/tests/data/tiny_success/tiny_success.node_status new file mode 100644 index 0000000..a33918e --- /dev/null +++ b/tests/data/tiny_success/tiny_success.node_status @@ -0,0 +1,60 @@ +[ + Type = "DagStatus"; + DagFiles = { + "/work/testuser/submit/u/testuser/tiny/20250213T164427Z/u_testuser_tiny_20250213T164427Z.dag" + }; + Timestamp = 1739465206; /* "Thu Feb 13 10:46:46 2025" */ + DagStatus = 5; /* "STATUS_DONE (success)" */ + NodesTotal = 4; + NodesDone = 4; + NodesPre = 0; + NodesQueued = 0; + NodesPost = 0; + NodesReady = 0; + NodesUnready = 0; + NodesFutile = 0; + NodesFailed = 0; + JobProcsHeld = 0; + JobProcsIdle = 0; /* includes held */ +] +[ + Type = "NodeStatus"; + Node = "pipetaskInit"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "5bba27bd-8df7-4668-a9c5-e911192c5cdb_label1_val1_val2"; + NodeStatus = 5; /* "STATUS_DONE" */ + 
StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "0b225f1f-6edf-4380-b546-76c97947a88f_label2_val1_val2"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "NodeStatus"; + Node = "finalJob"; + NodeStatus = 5; /* "STATUS_DONE" */ + StatusDetails = ""; + RetryCount = 0; + JobProcsQueued = 0; + JobProcsHeld = 0; +] +[ + Type = "StatusEnd"; + EndTime = 1739465206; /* "Thu Feb 13 10:46:46 2025" */ + NextUpdate = 0; /* "none" */ +] diff --git a/tests/test_htcondor_service.py b/tests/test_htcondor_service.py index 8f2c6aa..52a3b0d 100644 --- a/tests/test_htcondor_service.py +++ b/tests/test_htcondor_service.py @@ -31,24 +31,28 @@ import os import unittest from pathlib import Path -from shutil import copy2 +from shutil import copy2, copytree import htcondor -from lsst.ctrl.bps import BpsConfig, GenericWorkflowExec, GenericWorkflowJob, WmsStates +from lsst.ctrl.bps import BpsConfig, GenericWorkflowExec, GenericWorkflowJob, WmsSpecificInfo, WmsStates from lsst.ctrl.bps.htcondor.htcondor_config import HTC_DEFAULTS_URI from lsst.ctrl.bps.htcondor.htcondor_service import ( HTCondorService, JobStatus, NodeStatus, WmsIdType, + _add_service_job_specific_info, + _create_detailed_report_from_jobs, _get_exit_code_summary, _get_info_from_path, + _get_run_summary, _get_state_counts_from_dag_job, _htc_node_status_to_wms_state, _htc_status_to_wms_state, _translate_job_cmds, _wms_id_to_dir, + is_service_job, ) from lsst.ctrl.bps.htcondor.lssthtc import MISSING_ID from lsst.utils.tests import temporaryDirectory @@ -532,3 +536,491 @@ def testRelPathId(self, _wms_id_type_mock): self.assertEqual(id_type, WmsIdType.PATH) self.assertEqual(abs_path.resolve(), wms_path) os.chdir(orig_dir) + + +class AddServiceJobSpecificInfoTestCase(unittest.TestCase): + """Test _add_service_job_specific_info function. + + Note: The job_ad's are hardcoded in these tests. The + values in the dictionaries come from plugin code as + well as HTCondor. Changes in either of those codes + that produce data for the job_ad can break this + function without breaking these unit tests. + + Also, since hold status/messages stick around, testing + various cases with and without job being held just to + ensure get right status in both cases. + """ + + def testNotSubmitted(self): + # Service job not submitted yet or can't be submitted. + # (Typically an plugin bug.) + # At this function level, can't tell if not submitted + # yet or problem so it never will. + job_ad = { + "ClusterId": -64, + "DAGManJobID": "8997.0", + "DAGNodeName": "provisioningJob", + "NodeStatus": NodeStatus.NOT_READY, + "ProcId": 0, + "bps_job_label": "service_provisioningJob", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "UNREADY", "status_details": ""} + ) + + def testRunning(self): + # DAG hasn't completed (Running or held), + # Service job is running. 
+ job_ad = { + "ClusterId": 8523, + "ProcId": 0, + "DAGNodeName": "provisioningJob", + "JobStatus": JobStatus.RUNNING, + } + + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "RUNNING", "status_details": ""} + ) + + def testDied(self): + # DAG hasn't completed (Running or held), + # Service job failed (completed non-zero exit code) + job_ad = { + "ClusterId": 8761, + "ProcId": 0, + "DAGNodeName": "provisioningJob", + "JobStatus": JobStatus.COMPLETED, + "ExitCode": 4, + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "FAILED", "status_details": ""} + ) + + def testDeleted(self): + # Deleted by user (never held) + job_ad = { + "ClusterId": 9086, + "DAGNodeName": "provisioningJob", + "JobStatus": JobStatus.REMOVED, + "ProcId": 0, + "Reason": "via condor_rm (by user mgower)", + "job_evicted_time": "2025-02-11T11:35:04", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "DELETED", "status_details": ""} + ) + + def testSucceedEarly(self): + # DAG hasn't completed (Running or held), + # Service job completed with exit code 0 + job_ad = { + "ClusterId": 8761, + "ProcId": 0, + "DAGNodeName": "provisioningJob", + "JobStatus": JobStatus.COMPLETED, + "ExitCode": 0, + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, + { + "job_name": "provisioningJob", + "status": "SUCCEEDED", + "status_details": "(Note: Finished before workflow.)", + }, + ) + + def testSucceedOldRemoveMessage(self): + # DAG completed, job was in running state when removed. + job_ad = { + "ClusterId": 8761, + "ProcId": 0, + "DAGNodeName": "provisioningJob", + "JobStatus": JobStatus.REMOVED, + "Reason": "Removed by DAGMan (by user mgower)", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""} + ) + + def testSucceed(self): + # DAG completed, job was in running state when removed. 
+ job_ad = { + "ClusterId": 8761, + "ProcId": 0, + "DAGNodeName": "provisioningJob", + "JobStatus": JobStatus.REMOVED, + "Reason": ( + "removed because " + " fired when job (8556.0) was removed" + ), + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""} + ) + + def testUserHeldWhileRunning(self): + # DAG hasn't completed (Running or held), + # user put at least service job on hold + job_ad = { + "ClusterId": 8523, + "ProcId": 0, + "DAGNodeName": "provisioningJob", + "JobStatus": JobStatus.HELD, + "HoldReason": "via condor_hold (by user mgower)", + "HoldReasonCode": 1, + "HoldReasonSubCode": 0, + } + + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, + { + "job_name": "provisioningJob", + "status": "HELD", + "status_details": "(via condor_hold (by user mgower))", + }, + ) + + def testHeldByHTC(self): + # Job put on hold by HTCondor, removed when DAG ends + job_ad = { + "ClusterId": 8693, + "DAGNodeName": "provisioningJob", + "HoldReason": "Failed to execute", + "HoldReasonCode": 6, + "HoldReasonSubCode": 2, + "JobStatus": JobStatus.REMOVED, + "ProcId": 0, + "Reason": "Removed by DAGMan (by user mgower)", + "job_held_time": "2025-02-07T12:50:07", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, + { + "job_name": "provisioningJob", + "status": "DELETED", + "status_details": "(Job was held for the following reason: Failed to execute)", + }, + ) + + def testHeldReleasedRunning(self): + # DAG hasn't completed (Running or held), + # Since held info will be in job_ad, make sure knows released. + job_ad = { + "ClusterId": 8625, + "DAGNodeName": "provisioningJob", + "HoldReason": "via condor_hold (by user mgower)", + "HoldReasonCode": 1, + "HoldReasonSubCode": 0, + "JobStatus": JobStatus.RUNNING, + "LogNotes": "DAG Node: provisioningJob", + "ProcId": 0, + "job_held_time": "2025-02-07T12:33:34", + "job_released_time": "2025-02-07T12:33:47", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "RUNNING", "status_details": ""} + ) + + def testHeldReleasedDied(self): + # Since held info will be in job_ad, + # make sure knows status after released. + job_ad = { + "ClusterId": 9120, + "DAGNodeName": "provisioningJob", + "ExitBySignal": False, + "ExitCode": 4, + "HoldReason": "via condor_hold (by user mgower)", + "HoldReasonCode": 1, + "HoldReasonSubCode": 0, + "JobStatus": JobStatus.COMPLETED, + "ProcId": 0, + "Reason": "via condor_release (by user mgower)", + "ReturnValue": 4, + "TerminatedNormally": True, + "job_held_time": "2025-02-11T11:46:40", + "job_released_time": "2025-02-11T11:46:47", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "FAILED", "status_details": ""} + ) + + def testHeldReleasedSuccessEarly(self): + # Since held info will be in job_ad, + # make sure knows status after released. 
+ job_ad = { + "ClusterId": 9154, + "DAGNodeName": "provisioningJob", + "ExitBySignal": False, + "ExitCode": 0, + "HoldReason": "via condor_hold (by user mgower)", + "HoldReasonCode": 1, + "HoldReasonSubCode": 0, + "JobStatus": JobStatus.COMPLETED, + "ProcId": 0, + "Reason": "via condor_release (by user mgower)", + "TerminatedNormally": True, + "job_held_time": "2025-02-11T11:55:20", + "job_released_time": "2025-02-11T11:55:25", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, + { + "job_name": "provisioningJob", + "status": "SUCCEEDED", + "status_details": "(Note: Finished before workflow.)", + }, + ) + + def testHeldReleasedSuccess(self): + # DAG has completed. + # Since held info will be in job_ad, + # make sure knows status after released. + job_ad = { + "ClusterId": 8625, + "DAGNodeName": "provisioningJob", + "HoldReason": "via condor_hold (by user mgower)", + "HoldReasonCode": 1, + "HoldReasonSubCode": 0, + "JobStatus": JobStatus.REMOVED, + "ProcId": 0, + "Reason": "removed because fired when job (8624.0) was removed", + "job_held_time": "2025-02-07T12:33:34", + "job_released_time": "2025-02-07T12:33:47", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""} + ) + + def testHeldReleasedDeleted(self): + # Since held info will be in job_ad, + # make sure knows status after released. + job_ad = { + "ClusterId": 9086, + "DAGNodeName": "provisioningJob", + "HoldReason": "via condor_hold (by user mgower)", + "HoldReasonCode": 1, + "HoldReasonSubCode": 0, + "JobStatus": JobStatus.REMOVED, + "ProcId": 0, + "Reason": "via condor_rm (by user mgower)", + "job_evicted_time": "2025-02-11T11:35:04", + "job_held_time": "2025-02-11T11:35:04", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, {"job_name": "provisioningJob", "status": "DELETED", "status_details": ""} + ) + + def testHeldReleasedHeld(self): + # Since release info will be in job_ad, + # make sure knows held after release. 
+ job_ad = { + "ClusterId": 8659, + "DAGNodeName": "provisioningJob", + "HoldReason": "via condor_hold (by user mgower)", + "HoldReasonCode": 1, + "HoldReasonSubCode": 0, + "JobStatus": JobStatus.REMOVED, + "ProcId": 0, + "Reason": "Removed by DAGMan (by user mgower)", + "TerminatedNormally": False, + "job_held_time": "2025-02-07T12:36:15", + "job_released_time": "2025-02-07T12:36:07", + } + results = WmsSpecificInfo() + _add_service_job_specific_info(job_ad, results) + self.assertEqual( + results.context, + { + "job_name": "provisioningJob", + "status": "DELETED", + "status_details": "(Job was held for the following reason: via condor_hold (by user mgower))", + }, + ) + + +class GetRunSummaryTestCase(unittest.TestCase): + """Test _get_run_summary function.""" + + def testJobSummaryInJobAd(self): + summary = "pipetaskInit:1;label1:2;label2:2;finalJob:1" + job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_summary": summary} + results = _get_run_summary(job_ad) + self.assertEqual(results, summary) + + def testRunSummaryInJobAd(self): + summary = "pipetaskInit:1;label1:2;label2:2;finalJob:1" + job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_run_summary": summary} + results = _get_run_summary(job_ad) + self.assertEqual(results, summary) + + def testSummaryFromDag(self): + with temporaryDirectory() as tmp_dir: + copy2(f"{TESTDIR}/data/good.dag", tmp_dir) + job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "Iwd": tmp_dir} + results = _get_run_summary(job_ad) + self.assertEqual(results, "pipetaskInit:1;label1:1;label2:1;label3:1;finalJob:1") + + def testSummaryNoDag(self): + with self.assertLogs(logger=logger, level="WARNING") as cm: + with temporaryDirectory() as tmp_dir: + job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "Iwd": tmp_dir} + results = _get_run_summary(job_ad) + self.assertEqual(results, "") + self.assertIn("lsst.ctrl.bps.htcondor", cm.records[0].name) + self.assertIn("Could not get run summary for htcondor job", cm.output[0]) + + +class IsServiceJobTestCase(unittest.TestCase): + """Test is_service_job function.""" + + def testNotServiceJob(self): + job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_type": "payload"} + self.assertFalse(is_service_job(job_ad)) + + def testIsServiceJob(self): + job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_type": "service"} + self.assertTrue(is_service_job(job_ad)) + + def testMissingBpsType(self): + job_ad = { + "ClusterId": 8659, + "DAGNodeName": "testJob", + } + self.assertFalse(is_service_job(job_ad)) + + +class CreateDetailedReportFromJobsTestCase(unittest.TestCase): + """Test _create_detailed_report_from_jobs function.""" + + def testTinySuccess(self): + with temporaryDirectory() as tmp_dir: + test_submit_dir = os.path.join(tmp_dir, "tiny_success") + copytree(f"{TESTDIR}/data/tiny_success", test_submit_dir) + wms_workflow_id, jobs, message = _get_info_from_path(test_submit_dir) + run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) + self.assertEqual(len(run_reports), 1) + report = run_reports[wms_workflow_id] + self.assertEqual(report.wms_id, wms_workflow_id) + self.assertEqual(report.state, WmsStates.SUCCEEDED) + self.assertTrue(os.path.samefile(report.path, test_submit_dir)) + self.assertEqual(report.run_summary, "pipetaskInit:1;label1:1;label2:1;finalJob:1") + self.assertEqual( + report.job_state_counts, + { + WmsStates.UNKNOWN: 0, + WmsStates.MISFIT: 0, + WmsStates.UNREADY: 0, + WmsStates.READY: 0, + WmsStates.PENDING: 0, + WmsStates.RUNNING: 0, + 
WmsStates.DELETED: 0, + WmsStates.HELD: 0, + WmsStates.SUCCEEDED: 4, + WmsStates.FAILED: 0, + WmsStates.PRUNED: 0, + }, + ) + self.assertEqual( + report.specific_info.context, + {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""}, + ) + + def testTinyProblems(self): + with temporaryDirectory() as tmp_dir: + test_submit_dir = os.path.join(tmp_dir, "tiny_problems") + copytree(f"{TESTDIR}/data/tiny_problems", test_submit_dir) + wms_workflow_id, jobs, message = _get_info_from_path(test_submit_dir) + run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) + self.assertEqual(len(run_reports), 1) + report = run_reports[wms_workflow_id] + self.assertEqual(report.wms_id, wms_workflow_id) + self.assertEqual(report.state, WmsStates.FAILED) + self.assertTrue(os.path.samefile(report.path, test_submit_dir)) + self.assertEqual(report.run_summary, "pipetaskInit:1;label1:2;label2:2;finalJob:1") + self.assertEqual( + report.job_state_counts, + { + WmsStates.UNKNOWN: 0, + WmsStates.MISFIT: 0, + WmsStates.UNREADY: 0, + WmsStates.READY: 0, + WmsStates.PENDING: 0, + WmsStates.RUNNING: 0, + WmsStates.DELETED: 0, + WmsStates.HELD: 0, + WmsStates.SUCCEEDED: 4, + WmsStates.FAILED: 1, + WmsStates.PRUNED: 1, + }, + ) + self.assertEqual( + run_reports[wms_workflow_id].specific_info.context, + {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""}, + ) + + def testTinyRunning(self): + with temporaryDirectory() as tmp_dir: + test_submit_dir = os.path.join(tmp_dir, "tiny_running") + copytree(f"{TESTDIR}/data/tiny_running", test_submit_dir) + wms_workflow_id, jobs, message = _get_info_from_path(test_submit_dir) + run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) + self.assertEqual(len(run_reports), 1) + report = run_reports[wms_workflow_id] + self.assertEqual(report.wms_id, wms_workflow_id) + self.assertEqual(report.state, WmsStates.RUNNING) + self.assertTrue(os.path.samefile(report.path, test_submit_dir)) + self.assertEqual(report.run_summary, "pipetaskInit:1;label1:1;label2:1;finalJob:1") + self.assertEqual( + report.job_state_counts, + { + WmsStates.UNKNOWN: 0, + WmsStates.MISFIT: 0, + WmsStates.UNREADY: 2, + WmsStates.READY: 0, + WmsStates.PENDING: 0, + WmsStates.RUNNING: 1, + WmsStates.DELETED: 0, + WmsStates.HELD: 0, + WmsStates.SUCCEEDED: 1, + WmsStates.FAILED: 0, + WmsStates.PRUNED: 0, + }, + ) + self.assertEqual( + report.specific_info.context, + {"job_name": "provisioningJob", "status": "RUNNING", "status_details": ""}, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_lssthtc.py b/tests/test_lssthtc.py index e055ecb..2d46aa2 100644 --- a/tests/test_lssthtc.py +++ b/tests/test_lssthtc.py @@ -31,7 +31,7 @@ import pathlib import tempfile import unittest -from shutil import copy2 +from shutil import copy2, rmtree import htcondor @@ -197,22 +197,23 @@ def test_no_messages(self): self.assertEqual("", results) -class SummaryFromDagTestCase(unittest.TestCase): - """Test summary_from_dag function.""" +class SummarizeDagTestCase(unittest.TestCase): + """Test summarize_dag function.""" def test_no_dag_file(self): with temporaryDirectory() as tmp_dir: - summary, job_name_to_pipetask = lssthtc.summary_from_dag(tmp_dir) + summary, job_name_to_pipetask, job_name_to_type = lssthtc.summarize_dag(tmp_dir) self.assertFalse(len(job_name_to_pipetask)) + self.assertFalse(len(job_name_to_type)) self.assertFalse(summary) def test_success(self): with temporaryDirectory() as tmp_dir: copy2(f"{TESTDIR}/data/good.dag", tmp_dir) - 
summary, job_name_to_pipetask = lssthtc.summary_from_dag(tmp_dir) + summary, job_name_to_label, job_name_to_type = lssthtc.summarize_dag(tmp_dir) self.assertEqual(summary, "pipetaskInit:1;label1:1;label2:1;label3:1;finalJob:1") self.assertEqual( - job_name_to_pipetask, + job_name_to_label, { "pipetaskInit": "pipetaskInit", "0682f8f9-12f0-40a5-971e-8b30c7231e5c_label1_val1_val2": "label1", @@ -221,6 +222,98 @@ def test_success(self): "finalJob": "finalJob", }, ) + self.assertEqual( + job_name_to_type, + { + "pipetaskInit": "payload", + "0682f8f9-12f0-40a5-971e-8b30c7231e5c_label1_val1_val2": "payload", + "d0305e2d-f164-4a85-bd24-06afe6c84ed9_label2_val1_val2": "payload", + "2806ecc9-1bba-4362-8fff-ab4e6abb9f83_label3_val1_val2": "payload", + "finalJob": "final", + }, + ) + + def test_service(self): + with temporaryDirectory() as tmp_dir: + copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag", tmp_dir) + summary, job_name_to_label, job_name_to_type = lssthtc.summarize_dag(tmp_dir) + self.assertEqual(summary, "pipetaskInit:1;label1:2;label2:2;finalJob:1") + self.assertEqual( + job_name_to_label, + { + "pipetaskInit": "pipetaskInit", + "4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a": "label1", + "057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b": "label1", + "696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a": "label2", + "40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b": "label2", + "finalJob": "finalJob", + "provisioningJob": "provisioningJob", + }, + ) + self.assertEqual( + job_name_to_type, + { + "pipetaskInit": "payload", + "4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a": "payload", + "057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b": "payload", + "696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a": "payload", + "40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b": "payload", + "finalJob": "final", + "provisioningJob": "service", + }, + ) + + +class ReadDagNodesLogTestCase(unittest.TestCase): + """Test read_dag_nodes_log function.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + + def tearDown(self): + rmtree(self.tmpdir, ignore_errors=True) + + def testFileMissing(self): + with self.assertRaisesRegex(FileNotFoundError, "DAGMan node log not found in"): + _, _ = lssthtc.read_dag_nodes_log(self.tmpdir) + + +class ReadNodeStatusTestCase(unittest.TestCase): + """Test read_node_status function.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + + def tearDown(self): + rmtree(self.tmpdir, ignore_errors=True) + + def testServiceJobNotSubmitted(self): + # tiny_prov_no_submit files have successful workflow + # but provisioningJob could not submit. 
+ copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.nodes.log", self.tmpdir) + copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.dagman.log", self.tmpdir) + copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.node_status", self.tmpdir) + copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.dag", self.tmpdir) + + jobs = lssthtc.read_node_status(self.tmpdir) + found = [id_ for id_ in jobs if jobs[id_].get("bps_job_type", "MISS") == "service"] + self.assertEqual(len(found), 1) + self.assertEqual(jobs[found[0]]["DAGNodeName"], "provisioningJob") + self.assertEqual(jobs[found[0]]["NodeStatus"], lssthtc.NodeStatus.NOT_READY) + + def testMissingStatusFile(self): + copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag.nodes.log", self.tmpdir) + copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag.dagman.log", self.tmpdir) + copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag", self.tmpdir) + + jobs = lssthtc.read_node_status(self.tmpdir) + self.assertEqual(len(jobs), 7) + self.assertEqual(jobs["9230.0"]["DAGNodeName"], "pipetaskInit") + self.assertEqual(jobs["9230.0"]["bps_job_type"], "payload") + self.assertEqual(jobs["9230.0"]["JobStatus"], lssthtc.JobStatus.COMPLETED) + found = [id_ for id_ in jobs if jobs[id_].get("bps_job_type", "MISS") == "service"] + self.assertEqual(len(found), 1) + self.assertEqual(jobs[found[0]]["DAGNodeName"], "provisioningJob") if __name__ == "__main__":