From 4f26a67f9d73739c2cabfdc0c7c4dc6727a12e9a Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 21 Nov 2024 16:06:51 -0700 Subject: [PATCH 1/5] Make v28 release notes --- doc/changes/DM-35145.feature.rst | 1 - doc/changes/DM-38538.doc.rst | 1 - doc/changes/DM-42579.feature.rst | 1 - doc/changes/DM-43932.misc.rst | 1 - doc/changes/DM-44107.bugfix.rst | 1 - doc/changes/DM-44110.misc.rst | 1 - doc/changes/DM-44457.misc.rst | 1 - doc/changes/DM-44668.bugfix.rst | 1 - doc/changes/DM-44668.feature.rst | 1 - doc/changes/DM-45654.misc.rst | 1 - doc/changes/DM-46046.misc.rst | 2 -- doc/lsst.ctrl.bps.htcondor/CHANGES.rst | 30 ++++++++++++++++++++++++++ 12 files changed, 30 insertions(+), 12 deletions(-) delete mode 100644 doc/changes/DM-35145.feature.rst delete mode 100644 doc/changes/DM-38538.doc.rst delete mode 100644 doc/changes/DM-42579.feature.rst delete mode 100644 doc/changes/DM-43932.misc.rst delete mode 100644 doc/changes/DM-44107.bugfix.rst delete mode 100644 doc/changes/DM-44110.misc.rst delete mode 100644 doc/changes/DM-44457.misc.rst delete mode 100644 doc/changes/DM-44668.bugfix.rst delete mode 100644 doc/changes/DM-44668.feature.rst delete mode 100644 doc/changes/DM-45654.misc.rst delete mode 100644 doc/changes/DM-46046.misc.rst diff --git a/doc/changes/DM-35145.feature.rst b/doc/changes/DM-35145.feature.rst deleted file mode 100644 index c20a097..0000000 --- a/doc/changes/DM-35145.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Implemented basic ping method for HTCondor plugin that checks Schedd and Collector are running and user can authenticate to them. It does not check that there are compute resources that can run the user's jobs. diff --git a/doc/changes/DM-38538.doc.rst b/doc/changes/DM-38538.doc.rst deleted file mode 100644 index 1251820..0000000 --- a/doc/changes/DM-38538.doc.rst +++ /dev/null @@ -1 +0,0 @@ -Added a section describing how to release held jobs to the package documentation. diff --git a/doc/changes/DM-42579.feature.rst b/doc/changes/DM-42579.feature.rst deleted file mode 100644 index 68683b4..0000000 --- a/doc/changes/DM-42579.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Added ability for the plugin to call ``allocateNodes.py`` during workflow execution in order to manage required computational resources automatically. diff --git a/doc/changes/DM-43932.misc.rst b/doc/changes/DM-43932.misc.rst deleted file mode 100644 index a2d9ae3..0000000 --- a/doc/changes/DM-43932.misc.rst +++ /dev/null @@ -1 +0,0 @@ -Report better error message when failed submission from /tmp. diff --git a/doc/changes/DM-44107.bugfix.rst b/doc/changes/DM-44107.bugfix.rst deleted file mode 100644 index fe0ec60..0000000 --- a/doc/changes/DM-44107.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fixed status when job held and released. diff --git a/doc/changes/DM-44110.misc.rst b/doc/changes/DM-44110.misc.rst deleted file mode 100644 index 38ece09..0000000 --- a/doc/changes/DM-44110.misc.rst +++ /dev/null @@ -1 +0,0 @@ -Provided a default value for the ``memoryLimit`` parameter so it will be set automatically for the users if this plugin is used. diff --git a/doc/changes/DM-44457.misc.rst b/doc/changes/DM-44457.misc.rst deleted file mode 100644 index d7e4a2d..0000000 --- a/doc/changes/DM-44457.misc.rst +++ /dev/null @@ -1 +0,0 @@ -Fixed held and deleted state_counts for reporting. diff --git a/doc/changes/DM-44668.bugfix.rst b/doc/changes/DM-44668.bugfix.rst deleted file mode 100644 index c222024..0000000 --- a/doc/changes/DM-44668.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fixed report listing auto-memory retry as failed when actually successful. diff --git a/doc/changes/DM-44668.feature.rst b/doc/changes/DM-44668.feature.rst deleted file mode 100644 index fb2f9a9..0000000 --- a/doc/changes/DM-44668.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Update plugin to use retryUnlessExit values so WMS won't rerun some failures that will just fail every time. diff --git a/doc/changes/DM-45654.misc.rst b/doc/changes/DM-45654.misc.rst deleted file mode 100644 index a7ddeb6..0000000 --- a/doc/changes/DM-45654.misc.rst +++ /dev/null @@ -1 +0,0 @@ -Updated plugin to allow spaces in job submit file path. diff --git a/doc/changes/DM-46046.misc.rst b/doc/changes/DM-46046.misc.rst deleted file mode 100644 index 053d54a..0000000 --- a/doc/changes/DM-46046.misc.rst +++ /dev/null @@ -1,2 +0,0 @@ -Updated ``bps restart`` to work with relative path as id. -Updated ``bps report --id `` to display absolute path. diff --git a/doc/lsst.ctrl.bps.htcondor/CHANGES.rst b/doc/lsst.ctrl.bps.htcondor/CHANGES.rst index 3b7bfee..e2a12b9 100644 --- a/doc/lsst.ctrl.bps.htcondor/CHANGES.rst +++ b/doc/lsst.ctrl.bps.htcondor/CHANGES.rst @@ -1,3 +1,33 @@ +lsst-ctrl-bps-htcondor v28.0.0 (2024-11-21) +=========================================== + +New Features +------------ + +- Implemented basic ping method for HTCondor plugin that checks Schedd and Collector are running and user can authenticate to them. + It does not check that there are compute resources that can run the user's jobs. (`DM-35145 `_) +- Added ability for the plugin to call ``allocateNodes.py`` during workflow execution in order to manage required computational resources automatically. (`DM-42579 `_) +- Updated plugin to use ``retryUnlessExit`` values so WMS won't rerun some failures that will just fail every time. (`DM-44668 `_) + + +Bug Fixes +--------- + +- Fixed status when job held and released. (`DM-44107 `_) +- Fixed report listing auto-memory retry as failed when actually successful. (`DM-44668 `_) + + +Other Changes and Additions +--------------------------- + +- Reported better error message when failed submission from ``/tmp``. (`DM-43932 `_) +- Provided a default value for the ``memoryLimit`` parameter so it will be set automatically for the users if this plugin is used. (`DM-44110 `_) +- Fixed held and deleted ``state_counts`` for reporting. (`DM-44457 `_) +- Updated plugin to allow spaces in job submit file path. (`DM-45654 `_) +- Updated ``bps restart`` to work with relative path as id. + Updated ``bps report --id `` to display absolute path. (`DM-46046 `_) +- Added a section describing how to release held jobs to the package documentation. (`DM-38538 `_) + lsst-ctrl-bps-htcondor v27.0.0 (2024-06-04) =========================================== From 5ede0314376bbcafad09e5fd28202025225280dc Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 21 Nov 2024 16:09:32 -0700 Subject: [PATCH 2/5] Refresh pre-commit --- .pre-commit-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5cfa04f..d2f3a91 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-yaml args: @@ -9,7 +9,7 @@ repos: - id: trailing-whitespace - id: check-toml - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.10.0 hooks: - id: black # It is recommended to specify the latest version of Python @@ -24,10 +24,10 @@ repos: name: isort (python) - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.4.7 + rev: v0.7.4 hooks: - id: ruff - repo: https://github.com/numpy/numpydoc - rev: "v1.7.0" + rev: "v1.8.0" hooks: - id: numpydoc-validation From 5f0efc79df6b2cb32f3dc3c7c02cbad6266ea71e Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 21 Nov 2024 16:10:44 -0700 Subject: [PATCH 3/5] Fix some whitespace issues --- tests/data/bad_submit.dag.dagman.out | 2 +- tests/data/test_no_messages.dag.dagman.out | 16 ++++---- ...ines_check_20240727T003507Z.dag.dagman.out | 8 ++-- ...lines_check_20240727T003507Z.dag.nodes.log | 40 +++++++++---------- ...pipelines_check_20240727T003507Z.info.json | 2 +- 5 files changed, 34 insertions(+), 34 deletions(-) diff --git a/tests/data/bad_submit.dag.dagman.out b/tests/data/bad_submit.dag.dagman.out index cdba888..c73e353 100644 --- a/tests/data/bad_submit.dag.dagman.out +++ b/tests/data/bad_submit.dag.dagman.out @@ -43,7 +43,7 @@ 07/25/24 20:05:11 Node Name: one 07/25/24 20:05:11 Noop: false 07/25/24 20:05:11 NodeID: 0 -07/25/24 20:05:11 Node Status: STATUS_ERROR +07/25/24 20:05:11 Node Status: STATUS_ERROR 07/25/24 20:05:11 Node return val: -1 07/25/24 20:05:11 Error: Job submit failed 07/25/24 20:05:11 Job Submit File: bad_submit2.sub diff --git a/tests/data/test_no_messages.dag.dagman.out b/tests/data/test_no_messages.dag.dagman.out index f62545d..0800cba 100644 --- a/tests/data/test_no_messages.dag.dagman.out +++ b/tests/data/test_no_messages.dag.dagman.out @@ -1,7 +1,7 @@ 07/23/24 17:24:38 Result of reading /etc/issue: \S - + 07/23/24 17:24:38 Result of reading /etc/redhat-release: AlmaLinux release 9.4 (Seafoam Ocelot) - + 07/23/24 17:24:38 Using IDs: 20 processors, 10 CPUs, 10 HTs 07/23/24 17:24:38 Enumerating interfaces: lo 127.0.0.1 up 07/23/24 17:24:38 Enumerating interfaces: enp11s0 10.0.0.33 up @@ -22,7 +22,7 @@ 07/23/24 17:24:38 ** Log last touched time unavailable (No such file or directory) 07/23/24 17:24:38 ****************************************************** 07/23/24 17:24:38 Using config source: /work/lsst_stack/w_2024_29/conda/envs/lsst-scipipe-8.0.0/etc/condor/condor_config -07/23/24 17:24:38 Using local config sources: +07/23/24 17:24:38 Using local config sources: 07/23/24 17:24:38 /etc/condor/condor_config 07/23/24 17:24:38 /etc/condor/config.d/00-htcondor-9.0.config 07/23/24 17:24:38 /etc/condor/config.d/00-minicondor @@ -77,7 +77,7 @@ 07/23/24 17:24:38 DAGMAN_MAX_JOB_HOLDS setting: 100 07/23/24 17:24:38 DAGMAN_HOLD_CLAIM_TIME setting: 20 07/23/24 17:24:38 ALL_DEBUG setting: D_FULLDEBUG -07/23/24 17:24:38 DAGMAN_DEBUG setting: +07/23/24 17:24:38 DAGMAN_DEBUG setting: 07/23/24 17:24:38 DAGMAN_SUPPRESS_JOB_LOGS setting: False 07/23/24 17:24:38 DAGMAN_REMOVE_NODE_JOBS setting: True 07/23/24 17:24:38 DAGMAN will adjust edges after parsing @@ -289,7 +289,7 @@ 07/23/24 17:25:45 Node Name: pipetaskInit 07/23/24 17:25:45 Noop: false 07/23/24 17:25:45 NodeID: 0 -07/23/24 17:25:45 Node Status: STATUS_ERROR +07/23/24 17:25:45 Node Status: STATUS_ERROR 07/23/24 17:25:45 Node return val: -1002 07/23/24 17:25:45 Error: HTCondor reported ULOG_JOB_ABORTED event for job proc (1101.0.0) 07/23/24 17:25:45 Job Submit File: jobs/pipetaskInit/pipetaskInit.sub @@ -399,7 +399,7 @@ 07/23/24 17:25:58 Node Name: pipetaskInit 07/23/24 17:25:58 Noop: false 07/23/24 17:25:58 NodeID: 0 -07/23/24 17:25:58 Node Status: STATUS_ERROR +07/23/24 17:25:58 Node Status: STATUS_ERROR 07/23/24 17:25:58 Node return val: -1002 07/23/24 17:25:58 Error: HTCondor reported ULOG_JOB_ABORTED event for job proc (1101.0.0) 07/23/24 17:25:58 Job Submit File: jobs/pipetaskInit/pipetaskInit.sub @@ -409,13 +409,13 @@ 07/23/24 17:25:58 Node Name: finalJob 07/23/24 17:25:58 Noop: false 07/23/24 17:25:58 NodeID: 4 -07/23/24 17:25:58 Node Status: STATUS_ERROR +07/23/24 17:25:58 Node Status: STATUS_ERROR 07/23/24 17:25:58 Node return val: 2 07/23/24 17:25:58 Error: Job failed due to DAGMAN error 0 and POST Script failed with status 2 07/23/24 17:25:58 Job Submit File: jobs/finalJob/finalJob.sub 07/23/24 17:25:58 POST Script: /work/mgower/gen3work/summary-report-held-44457/ctrl_bps_htcondor/python/lsst/ctrl/bps/htcondor/final_post.sh finalJob $DAG_STATUS $RETURN 07/23/24 17:25:58 HTCondor Job ID: (1102.0.0) -07/23/24 17:25:58 PARENTS: WAITING: 0 CHILDREN: +07/23/24 17:25:58 PARENTS: WAITING: 0 CHILDREN: 07/23/24 17:25:58 --------------------------------------- 07/23/24 17:25:58 Aborting DAG... 07/23/24 17:25:58 Writing Rescue DAG to /work/mgower/gen3work/summary-report-held-44457/submit/u/mgower/pipelines_check/20240723T222426Z/u_mgower_pipelines_check_20240723T222426Z.dag.rescue001... diff --git a/tests/data/test_pipelines_check_20240727T003507Z.dag.dagman.out b/tests/data/test_pipelines_check_20240727T003507Z.dag.dagman.out index 80cdf55..8c9bffe 100644 --- a/tests/data/test_pipelines_check_20240727T003507Z.dag.dagman.out +++ b/tests/data/test_pipelines_check_20240727T003507Z.dag.dagman.out @@ -1,7 +1,7 @@ 07/26/24 19:35:18 Result of reading /etc/issue: \S - + 07/26/24 19:35:18 Result of reading /etc/redhat-release: AlmaLinux release 9.4 (Seafoam Ocelot) - + 07/26/24 19:35:18 Using IDs: 20 processors, 10 CPUs, 10 HTs 07/26/24 19:35:18 Enumerating interfaces: lo 127.0.0.1 up 07/26/24 19:35:18 Enumerating interfaces: enp11s0 10.0.0.33 up @@ -22,7 +22,7 @@ 07/26/24 19:35:18 ** Log last touched time unavailable (No such file or directory) 07/26/24 19:35:18 ****************************************************** 07/26/24 19:35:18 Using config source: /work/lsst_stack/w_2024_30/conda/envs/lsst-scipipe-8.0.0/etc/condor/condor_config -07/26/24 19:35:18 Using local config sources: +07/26/24 19:35:18 Using local config sources: 07/26/24 19:35:18 /etc/condor/condor_config 07/26/24 19:35:18 /etc/condor/config.d/00-htcondor-9.0.config 07/26/24 19:35:18 /etc/condor/config.d/00-minicondor @@ -77,7 +77,7 @@ 07/26/24 19:35:18 DAGMAN_MAX_JOB_HOLDS setting: 100 07/26/24 19:35:18 DAGMAN_HOLD_CLAIM_TIME setting: 20 07/26/24 19:35:18 ALL_DEBUG setting: D_FULLDEBUG -07/26/24 19:35:18 DAGMAN_DEBUG setting: +07/26/24 19:35:18 DAGMAN_DEBUG setting: 07/26/24 19:35:18 DAGMAN_SUPPRESS_JOB_LOGS setting: False 07/26/24 19:35:18 DAGMAN_REMOVE_NODE_JOBS setting: True 07/26/24 19:35:18 DAGMAN will adjust edges after parsing diff --git a/tests/data/test_pipelines_check_20240727T003507Z.dag.nodes.log b/tests/data/test_pipelines_check_20240727T003507Z.dag.nodes.log index 36be58d..17fd749 100644 --- a/tests/data/test_pipelines_check_20240727T003507Z.dag.nodes.log +++ b/tests/data/test_pipelines_check_20240727T003507Z.dag.nodes.log @@ -18,10 +18,10 @@ 0 - Run Bytes Received By Job 0 - Total Bytes Sent By Job 0 - Total Bytes Received By Job - Partitionable Resources : Usage Request Allocated - Cpus : 1 1 - Disk (KB) : 76 2 471265 - Memory (MB) : 3 2048 2048 + Partitionable Resources : Usage Request Allocated + Cpus : 1 1 + Disk (KB) : 76 2 471265 + Memory (MB) : 3 2048 2048 Job terminated of its own accord at 2024-07-27T00:35:37Z with exit-code 0. ... @@ -45,10 +45,10 @@ 0 - Run Bytes Received By Job 11282 - Total Bytes Sent By Job 0 - Total Bytes Received By Job - Partitionable Resources : Usage Request Allocated - Cpus : 1 1 - Disk (KB) : 88 2 471265 - Memory (MB) : 1088 2048 2048 + Partitionable Resources : Usage Request Allocated + Cpus : 1 1 + Disk (KB) : 88 2 471265 + Memory (MB) : 1088 2048 2048 Job terminated of its own accord at 2024-07-27T00:36:09Z with exit-code 0. ... @@ -72,10 +72,10 @@ 0 - Run Bytes Received By Job 20819 - Total Bytes Sent By Job 0 - Total Bytes Received By Job - Partitionable Resources : Usage Request Allocated - Cpus : 1.00 1 1 - Disk (KB) : 103 2 471265 - Memory (MB) : 356 2048 2048 + Partitionable Resources : Usage Request Allocated + Cpus : 1.00 1 1 + Disk (KB) : 103 2 471265 + Memory (MB) : 356 2048 2048 Job terminated of its own accord at 2024-07-27T00:37:51Z with exit-code 0. ... @@ -99,10 +99,10 @@ 0 - Run Bytes Received By Job 11375 - Total Bytes Sent By Job 0 - Total Bytes Received By Job - Partitionable Resources : Usage Request Allocated - Cpus : 1 1 - Disk (KB) : 88 2 471265 - Memory (MB) : 382 2048 2048 + Partitionable Resources : Usage Request Allocated + Cpus : 1 1 + Disk (KB) : 88 2 471265 + Memory (MB) : 382 2048 2048 Job terminated of its own accord at 2024-07-27T00:38:27Z with exit-code 0. ... @@ -126,10 +126,10 @@ 214 - Run Bytes Received By Job 6733 - Total Bytes Sent By Job 214 - Total Bytes Received By Job - Partitionable Resources : Usage Request Allocated - Cpus : 1 1 - Disk (KB) : 83 1 471265 - Memory (MB) : 0 2048 2048 + Partitionable Resources : Usage Request Allocated + Cpus : 1 1 + Disk (KB) : 83 1 471265 + Memory (MB) : 0 2048 2048 Job terminated of its own accord at 2024-07-27T00:38:36Z with exit-code 0. ... diff --git a/tests/data/test_pipelines_check_20240727T003507Z.info.json b/tests/data/test_pipelines_check_20240727T003507Z.info.json index b659c2a..c693633 100644 --- a/tests/data/test_pipelines_check_20240727T003507Z.info.json +++ b/tests/data/test_pipelines_check_20240727T003507Z.info.json @@ -1 +1 @@ -{"acws02": {"1163.0": {"ClusterId": 1163, "GlobalJobId": "acws02#1163.0#1722040518"}}} \ No newline at end of file +{"acws02": {"1163.0": {"ClusterId": 1163, "GlobalJobId": "acws02#1163.0#1722040518"}}} From c5f3db3561dbacdb29d74defec3b930f075b02ea Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 21 Nov 2024 16:11:33 -0700 Subject: [PATCH 4/5] Fix numpydoc doc string lint failure --- tests/test_htcondor_service.py | 4 ++-- tests/test_lssthtc.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_htcondor_service.py b/tests/test_htcondor_service.py index 6649496..1277621 100644 --- a/tests/test_htcondor_service.py +++ b/tests/test_htcondor_service.py @@ -420,7 +420,7 @@ def testCounts(self): class GetInfoFromPathTestCase(unittest.TestCase): - """Test _get_info_from_path function""" + """Test _get_info_from_path function.""" def test_tmpdir_abort(self): with temporaryDirectory() as tmp_dir: @@ -472,7 +472,7 @@ def test_relative_path(self): class WmsIdToDirTestCase(unittest.TestCase): - """Test _wms_id_to_dir function""" + """Test _wms_id_to_dir function.""" @unittest.mock.patch("lsst.ctrl.bps.htcondor.htcondor_service._wms_id_type") def testInvalidIdType(self, _wms_id_type_mock): diff --git a/tests/test_lssthtc.py b/tests/test_lssthtc.py index 4817a23..f6cf184 100644 --- a/tests/test_lssthtc.py +++ b/tests/test_lssthtc.py @@ -24,7 +24,7 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . -"""Unit tests for classes and functions in lssthtc.py""" +"""Unit tests for classes and functions in lssthtc.py.""" import logging import os From 51609d2f23fbff451e620d0f0c5018f6b6696c23 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 21 Nov 2024 16:13:12 -0700 Subject: [PATCH 5/5] Upload test report to codecov --- .github/workflows/build.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fef0ce5..8676fac 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -51,12 +51,18 @@ jobs: - name: Run tests run: | - pytest -r a -v --cov=python --cov=tests --cov-report=xml --cov-report=term --cov-branch + pytest -r a -v --cov=python --cov=tests --cov-report=xml --cov-report=term --cov-branch \ + --junitxml=junit.xml -o junit_family=legacy - name: Upload coverage to codecov uses: codecov/codecov-action@v4 with: files: ./coverage.xml token: ${{ secrets.CODECOV_TOKEN }} + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} pypi: