From fb56adee9b2bb8e4fcbe9ea91413f48284ad6d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 9 Oct 2023 12:15:21 +0200 Subject: [PATCH 01/42] Added scaffold for `import` command. Added new `import` command, to import a Workflow-Run profiles compliant RO-Crate into a new staged working directory, in order to run it. --- wfexs_backend/__main__.py | 62 ++++++++++++++++++++++++++++------ wfexs_backend/wfexs_backend.py | 40 ++++++++++++++++++++++ wfexs_backend/workflow.py | 34 +++++++++++++++++++ 3 files changed, 126 insertions(+), 10 deletions(-) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index a4343165..e83376ad 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -115,6 +115,10 @@ class WfExS_Commands(StrDocEnum): "staged-workdir", "Staged working directories handling subcommands", ) + Import = ( + "import", + "Workflow Run RO-Crate import into a new staged working directory", + ) Export = ("export", "Staged working directories export subcommands") ExportStage = ("export-stage", "Export the staging directory as an RO-Crate") OfflineExecute = ( @@ -219,13 +223,22 @@ def genParserSub( ) if preStageParams: - ap_.add_argument( - "-W", - "--workflow-config", - dest="workflowConfigFilename", - required=True, - help="Configuration file, describing workflow and inputs", - ) + if command != WfExS_Commands.Import: + ap_.add_argument( + "-W", + "--workflow-config", + dest="workflowConfigFilename", + required=True, + help="Configuration file, describing workflow and inputs", + ) + else: + ap_.add_argument( + "-R", + "--workflow-rocrate", + dest="workflowROCrateFilenameOrURI", + required=True, + help="Workflow Run RO-Crate describing a previous workflow execution. It can be either a local path or an URI resolvable from WfExS with no authentication", + ) if preStageParams or exportParams or command == WfExS_Commands.ReStage: ap_.add_argument( @@ -269,7 +282,12 @@ def genParserSub( if ( command - in (WfExS_Commands.Stage, WfExS_Commands.ReStage, WfExS_Commands.Execute) + in ( + WfExS_Commands.Stage, + WfExS_Commands.ReStage, + WfExS_Commands.Import, + WfExS_Commands.Execute, + ) or exportParams ): ap_.add_argument( @@ -282,7 +300,12 @@ def genParserSub( if ( command - in (WfExS_Commands.Stage, WfExS_Commands.StagedWorkDir, WfExS_Commands.Execute) + in ( + WfExS_Commands.Stage, + WfExS_Commands.StagedWorkDir, + WfExS_Commands.Import, + WfExS_Commands.Execute, + ) or postStageParams or exportParams ): @@ -1034,6 +1057,8 @@ def main() -> None: ap_r_s = genParserSub(sp, WfExS_Commands.ReStage, postStageParams=True) + ap_imp = genParserSub(sp, WfExS_Commands.Import, preStageParams=True) + ap_m = genParserSub(sp, WfExS_Commands.MountWorkDir, postStageParams=True) ap_es = genParserSub( @@ -1153,6 +1178,7 @@ def main() -> None: WfExS_Commands.ListPushers, WfExS_Commands.Stage, WfExS_Commands.ReStage, + WfExS_Commands.Import, WfExS_Commands.Execute, ): updated_config, local_config = WfExSBackend.bootstrap( @@ -1259,7 +1285,7 @@ def main() -> None: args.workflowConfigFilename, args.securityContextsConfigFilename ) sys.exit(retval) - else: + elif command == WfExS_Commands.Stage: wfInstance = wfBackend.fromFiles( args.workflowConfigFilename, args.securityContextsConfigFilename, @@ -1269,6 +1295,22 @@ def main() -> None: private_key_passphrase=private_key_passphrase, orcids=op_orcids, ) + elif command == WfExS_Commands.Import: + wfInstance = wfBackend.fromPreviousROCrate( + args.workflowROCrateFilenameOrURI, + 
args.securityContextsConfigFilename, + nickname_prefix=args.nickname_prefix, + public_key_filenames=args.public_key_files, + private_key_filename=args.private_key_file, + private_key_passphrase=private_key_passphrase, + orcids=op_orcids, + ) + else: + print( + f"[ERROR] Unimplemented command {command.value}. Stopping.", + file=sys.stderr, + ) + sys.exit(1) # This is needed to be sure the encfs instance is unmounted if command != WfExS_Commands.MountWorkDir: diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index a2bd9037..4b48d9a2 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -1021,6 +1021,46 @@ def fromPreviousInstanceDeclaration( paranoidMode=paranoidMode, ) + def fromPreviousROCrate( + self, + workflowROCrateFilenameOrURI: "Union[AnyPath, URIType]", + securityContextsConfigFilename: "Optional[AnyPath]" = None, + nickname_prefix: "Optional[str]" = None, + orcids: "Sequence[str]" = [], + public_key_filenames: "Sequence[AnyPath]" = [], + private_key_filename: "Optional[AnyPath]" = None, + private_key_passphrase: "Optional[str]" = None, + paranoidMode: "bool" = False, + ) -> "WF": + # Let's check whether it is a local file + # or a remote RO-Crate + parsedROCrateURI = urllib.parse.urlparse(workflowROCrateFilenameOrURI) + if parsedROCrateURI.scheme == "": + workflowROCrateFilename = cast("AnyPath", workflowROCrateFilenameOrURI) + else: + self.logger.info(f"* Fetching RO-Crate {workflowROCrateFilenameOrURI}") + local_content = self.cacheFetch( + cast("URIType", workflowROCrateFilenameOrURI), + cacheType=CacheType.ROCrate, + offline=False, + ignoreCache=True, + registerInCache=False, + ) + + workflowROCrateFilename = local_content.path + + return WF.FromPreviousROCrate( + self, + workflowROCrateFilename, + securityContextsConfigFilename=securityContextsConfigFilename, + nickname_prefix=nickname_prefix, + orcids=orcids, + public_key_filenames=public_key_filenames, + private_key_filename=private_key_filename, + private_key_passphrase=private_key_passphrase, + paranoidMode=paranoidMode, + ) + def parseAndValidateSecurityContextFile( self, securityContextsConfigFilename: "AnyPath" ) -> "Tuple[ExitVal, SecurityContextConfigBlock]": diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 1af8b818..b04e70fd 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1241,6 +1241,40 @@ def FromPreviousInstanceDeclaration( paranoidMode=paranoidMode, ) + @classmethod + def FromPreviousROCrate( + cls, + wfexs: "WfExSBackend", + workflowROCrateFilename: "AnyPath", + securityContextsConfigFilename: "Optional[AnyPath]" = None, + nickname_prefix: "Optional[str]" = None, + orcids: "Sequence[str]" = [], + public_key_filenames: "Sequence[AnyPath]" = [], + private_key_filename: "Optional[AnyPath]" = None, + private_key_passphrase: "Optional[str]" = None, + paranoidMode: "bool" = False, + ) -> "WF": + """ + This class method creates a new staged working directory + based on the declaration of an existing one + """ + + # TODO + assert False, "The implementation of this method has to be finished" + workflow_meta = {} + + return cls.FromStagedRecipe( + wfexs, + workflow_meta, + securityContextsConfigFilename=securityContextsConfigFilename, + nickname_prefix=nickname_prefix, + orcids=orcids, + public_key_filenames=public_key_filenames, + private_key_filename=private_key_filename, + private_key_passphrase=private_key_passphrase, + paranoidMode=paranoidMode, + ) + @classmethod def FromDescription( cls, From 
54bad5f09fd7575029f3ddfa766a918b535bb482 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 9 Oct 2023 18:10:16 +0200 Subject: [PATCH 02/42] Fixed issue in `execute`, which arose with previous changes in command line handling --- wfexs_backend/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index e83376ad..e6e9be4f 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -1285,7 +1285,7 @@ def main() -> None: args.workflowConfigFilename, args.securityContextsConfigFilename ) sys.exit(retval) - elif command == WfExS_Commands.Stage: + elif command in (WfExS_Commands.Stage, WfExS_Commands.Execute): wfInstance = wfBackend.fromFiles( args.workflowConfigFilename, args.securityContextsConfigFilename, From 9387dcf18461c3898b10ba2d76e4ac45f3a0caec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 9 Oct 2023 18:11:29 +0200 Subject: [PATCH 03/42] Added additional code in preparation of the JSON-LD query part --- wfexs_backend/wfexs_backend.py | 1 + wfexs_backend/workflow.py | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 4b48d9a2..863512d1 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -1052,6 +1052,7 @@ def fromPreviousROCrate( return WF.FromPreviousROCrate( self, workflowROCrateFilename, + public_name=workflowROCrateFilenameOrURI, securityContextsConfigFilename=securityContextsConfigFilename, nickname_prefix=nickname_prefix, orcids=orcids, diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index b04e70fd..ce1083e6 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -34,6 +34,7 @@ import threading import time import warnings +import zipfile from typing import ( cast, @@ -183,6 +184,7 @@ SecurityContextVault, ) import bagit +import magic from . import common as common_defs_module @@ -307,6 +309,8 @@ class MaterializedExportAction(NamedTuple): NextflowWorkflowEngine, ] +ROCRATE_JSONLD_FILENAME: "Final[str]" = "ro-crate-metadata.json" + def _wakeupEncDir( cond: "threading.Condition", workDir: "AbsPath", logger: "logging.Logger" @@ -1246,6 +1250,7 @@ def FromPreviousROCrate( cls, wfexs: "WfExSBackend", workflowROCrateFilename: "AnyPath", + public_name: "str", # Mainly used for provenance and exceptions securityContextsConfigFilename: "Optional[AnyPath]" = None, nickname_prefix: "Optional[str]" = None, orcids: "Sequence[str]" = [], @@ -1259,6 +1264,52 @@ def FromPreviousROCrate( based on the declaration of an existing one """ + # Is it a bare file or an archive? 
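+        # Three layouts are accepted below: an unpacked RO-Crate
+        # directory, a bare ro-crate-metadata.json file, or a ZIP
+        # archive; libmagic MIME sniffing routes between the JSON
+        # and ZIP readers.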
+ jsonld_filename: "Optional[str]" = None + if os.path.isdir(workflowROCrateFilename): + jsonld_filename = os.path.join( + workflowROCrateFilename, ROCRATE_JSONLD_FILENAME + ) + if not os.path.exists(jsonld_filename): + raise WFException( + f"{public_name} does not contain a member {ROCRATE_JSONLD_FILENAME}" + ) + elif os.path.isfile(workflowROCrateFilename): + jsonld_filename = workflowROCrateFilename + else: + raise WFException(f"Input {public_name} is neither a file or a directory") + + jsonld_bin: "Optional[bytes]" = None + putative_mime = magic.from_file(jsonld_filename, mime=True) + if putative_mime == "application/json": + with open(jsonld_filename, mode="rb") as jdf: + jsonld_bin = jdf.read() + elif putative_mime == "application/zip": + with zipfile.ZipFile(workflowROCrateFilename, mode="r") as zf: + try: + jsonld_bin = zf.read(ROCRATE_JSONLD_FILENAME) + except Exception as e: + raise WFException( + f"Unable to locate {ROCRATE_JSONLD_FILENAME} within {public_name}" + ) from e + + putative_mime_ld = magic.from_buffer(jsonld_bin) + if putative_mime_ld != "application/json": + raise WFException( + f"{ROCRATE_JSONLD_FILENAME} from within {public_name} has unmanagable MIME {putative_mime_ld}" + ) + else: + raise WFException( + f"The RO-Crate parsing code does not know how to parse {public_name} with MIME {putative_mime}" + ) + + try: + jsonld_obj = json.loads(jsonld_bin) + except json.JSONDecodeError as jde: + raise WFException( + f"Content from {public_name} is not a valid JSON" + ) from jde + # TODO assert False, "The implementation of this method has to be finished" workflow_meta = {} From 015152f681d6489dd2a13f4e32d175987f35a211 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 25 Apr 2024 21:08:39 +0200 Subject: [PATCH 04/42] Translated `magic` import from `wfexs_backend.workflow` into a lazy one --- wfexs_backend/workflow.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 7819a29a..5abf82b3 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -214,12 +214,17 @@ AcceptableLicenceSchemes, LicenceMatcherSingleton, ) +from .utils.misc import ( + lazy_import, +) from .security_context import ( SecurityContextVault, ) import bagit -import magic + +magic = lazy_import("magic") +# import magic from . import __url__ as wfexs_backend_url from . 
import __official_name__ as wfexs_backend_name From 20dd03165d0dea6426ad0060bd5c5c5c9e876151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 30 Apr 2024 15:33:40 +0200 Subject: [PATCH 05/42] Added initial code to validate whether the input is claiming to be a WRROC --- requirements.txt | 3 +- wfexs_backend/workflow.py | 141 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8bed8583..8cbdcf7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,4 +26,5 @@ pyxdg groovy-parser == 0.1.1 data-url pgzip -defusedxml \ No newline at end of file +defusedxml +rdflib >= 7.0.0 diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 5abf82b3..2bca19f5 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -235,6 +235,10 @@ # We have preference for the C based loader and dumper, but the code # should fallback to default implementations when C ones are not present +import pyld # type: ignore[import, import-untyped] +import rdflib +import rdflib.plugins.sparql + import yaml YAMLLoader: "Type[Union[yaml.Loader, yaml.CLoader]]" @@ -1307,6 +1311,124 @@ def FromPreviousInstanceDeclaration( paranoidMode=paranoidMode, ) + # This is needed due limitations from rdflib mangling relative ids + WFEXS_TRICK_SPARQL_PRE_PREFIX: "Final[str]" = "shttp:" + WFEXS_TRICK_SPARQL_BASE: "Final[str]" = f"{WFEXS_TRICK_SPARQL_PRE_PREFIX}///" + WFEXS_TRICK_SPARQL_NS: "Final[str]" = "wfexs" + + SPARQL_NS = { + "dc": "http://purl.org/dc/elements/1.1/", + "dcterms": "http://purl.org/dc/terms/", + "s": "http://schema.org/", + "rocrate": "https://w3id.org/ro/crate/", + "wfcrate": "https://w3id.org/workflowhub/workflow-ro-crate/", + "wfhprofile": "https://about.workflowhub.eu/Workflow-RO-Crate/", + "wrprocess": "https://w3id.org/ro/wfrun/process/", + "wrwf": "https://w3id.org/ro/wfrun/workflow/", + WFEXS_TRICK_SPARQL_NS: WFEXS_TRICK_SPARQL_BASE, + } + + IS_ROCRATE_SPARQL: "Final[str]" = """\ +SELECT ?rocratejson ?rootdataset ?rocrateprofile ?wfcrateprofile ?wrprocessprofile ?wrwfprofile +WHERE { + ?rocratejson + a s:CreativeWork ; + dcterms:conformsTo ?rocrateprofile ; + s:about ?rootdataset . + ?rootdataset a s:Dataset . + FILTER ( + STRSTARTS(str(?rocrateprofile), str(rocrate:)) + ) . + OPTIONAL { + ?rocratejson dcterms:conformsTo ?wfcrateprofile . + FILTER ( + ?wfcrateprofile = wfhprofile: || STRSTARTS(str(?wfcrateprofile), str(wfcrate:)) + ) . + OPTIONAL { + ?rootdataset + dcterms:conformsTo ?wfcrateprofile ; + dcterms:conformsTo ?wrprocessprofile ; + dcterms:conformsTo ?wrwfprofile . + FILTER ( + STRSTARTS(str(?wrprocessprofile), str(wrprocess:)) && + STRSTARTS(str(?wrwfprofile), str(wrwf:)) + ) . + } + } +} +""" + + @classmethod + def IdentifyROCrate( + cls, jsonld: "Mapping[str, Any]", public_name: "str" + ) -> "Tuple[Optional[rdflib.query.ResultRow], rdflib.graph.Graph]": + """ + This method is used to identify where the input JSON is a + JSON-LD related to RO-Crate. + + The returned value is a tuple, where the first element is the + result row giving the QName of the root dataset, and the different + profiles being matched: RO-Crate, Workflow RO-Crate, WRROC process and WRROC workflow. + The second element of the returned tuple is the rdflib RDF + graph from the read JSON-LD, which should allow exploring it. 
+ """ + jsonld_obj = cast("MutableMapping[str, Any]", copy.deepcopy(jsonld)) + # Let's load it using RDFLib tricks + context: "MutableSequence[Union[str, Mapping[str, str]]]" + got_context = jsonld_obj.get("@context") + if got_context is None: + context = [] + elif isinstance(got_context, (str, dict)): + context = [got_context] + elif isinstance(got_context, list): + context = got_context + + # Setting the augmented context with the trick + context.append( + { + "@base": cls.WFEXS_TRICK_SPARQL_BASE, + } + ) + + if context != got_context: + jsonld_obj["@context"] = context + + # Now, let's load it in RDFLib, in order learn + g = rdflib.Graph() + parsed = g.parse( + data=json.dumps(jsonld_obj), + format="json-ld", + base=cls.WFEXS_TRICK_SPARQL_PRE_PREFIX, + ) + + # This query will tell us whether the JSON-LD is about an RO-Crate 1.1 + q = rdflib.plugins.sparql.prepareQuery( + cls.IS_ROCRATE_SPARQL, + initNs=cls.SPARQL_NS, + ) + + # TODO: cache resolution of contexts + # TODO: disallow network access for context resolution + # when not in right phase + try: + qres = g.query(q) + except Exception as e: + raise WFException( + f"Unable to perform JSON-LD check query over {public_name} (see cascading exceptions)" + ) from e + + resrow: "Optional[rdflib.query.ResultRow]" = None + # In the future, there could be more than one match, when + # nested RO-Crate scenarios happen + for row in qres: + assert isinstance( + row, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + resrow = row + break + + return (resrow, g) + @classmethod def FromPreviousROCrate( cls, @@ -1343,9 +1465,11 @@ def FromPreviousROCrate( jsonld_bin: "Optional[bytes]" = None putative_mime = magic.from_file(jsonld_filename, mime=True) + # Bare possible RO-Crate if putative_mime == "application/json": with open(jsonld_filename, mode="rb") as jdf: jsonld_bin = jdf.read() + # Archived possible RO-Crate elif putative_mime == "application/zip": with zipfile.ZipFile(workflowROCrateFilename, mode="r") as zf: try: @@ -1365,6 +1489,7 @@ def FromPreviousROCrate( f"The RO-Crate parsing code does not know how to parse {public_name} with MIME {putative_mime}" ) + # Let's parse the JSON (in order to check whether it is valid) try: jsonld_obj = json.loads(jsonld_bin) except json.JSONDecodeError as jde: @@ -1372,8 +1497,24 @@ def FromPreviousROCrate( f"Content from {public_name} is not a valid JSON" ) from jde + matched_crate, g = cls.IdentifyROCrate(jsonld_obj, public_name) + # Is it an RO-Crate? 
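+        # The checks below get progressively stricter: first a plain
+        # RO-Crate, then a Workflow RO-Crate, then conformance to the
+        # Workflow Run RO-Crate (WRROC) workflow profile.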
+ if matched_crate is None: + raise WFException(f"JSON-LD from {public_name} is not an RO-Crate") + + if matched_crate.wfcrateprofile is None: + raise WFException(f"JSON-LD from {public_name} is not a Workflow RO-Crate") + + if matched_crate.wrwfprofile is None: + raise WFException(f"JSON-LD from {public_name} is not a WRROC Workflow") + # TODO assert False, "The implementation of this method has to be finished" + + pyld.jsonld.set_document_loader(pyld.jsonld.aiohttp_document_loader(timeout=10)) + # expand a document, removing its context + # see: https://json-ld.org/spec/latest/json-ld/#expanded-document-form + expanded = pyld.jsonld.expand(jsonld_obj) workflow_meta = {} return cls.FromStagedRecipe( From a6dd82465a5c811feb68a3a765e5f387fde1577f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 30 Apr 2024 15:41:24 +0200 Subject: [PATCH 06/42] Added minor fix to command line handling, so it does not try accessing args.workflowConfigFilename for imports --- wfexs_backend/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index 60f7f407..db115950 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -1343,7 +1343,7 @@ def main() -> None: file=sys.stderr, ) sys.exit(1) - elif not args.workflowConfigFilename: + elif command != WfExS_Commands.Import and not args.workflowConfigFilename: print("[ERROR] Workflow config was not provided! Stopping.", file=sys.stderr) sys.exit(1) elif command == WfExS_Commands.ConfigValidate: From a4b517de0c69132c0b1894565da4cd1905621a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 30 Apr 2024 18:51:32 +0200 Subject: [PATCH 07/42] Fixed Zenodo DOI's resolution --- wfexs_backend/fetchers/doi.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/wfexs_backend/fetchers/doi.py b/wfexs_backend/fetchers/doi.py index 61fcda51..0f6d8821 100644 --- a/wfexs_backend/fetchers/doi.py +++ b/wfexs_backend/fetchers/doi.py @@ -58,6 +58,7 @@ DOI_HANDLE_REST = "https://doi.org/api/handles/" ZENODO_RECORD_PREFIX = "/record/" +ZENODO_NEW_RECORD_PREFIX = "/doi/10.5281/zenodo." 
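+# Zenodo now resolves DOIs to URLs of the form
+# https://zenodo.org/doi/10.5281/zenodo.<id> instead of
+# https://zenodo.org/record/<id>, hence this second prefix.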
B2SHARE_RECORD_PREFIX = "/records/" OSF_IO_RECORD_PREFIX = "/" WORKFLOWHUB_RECORD_PREFIX = "/workflows/" @@ -180,6 +181,7 @@ def fetchDOI( doi_resolved_parsed = parse.urlparse(doi_resolved_url) if doi_resolved_parsed.scheme in ("http", "https"): # If it is from zenodo, let's delegate on zenodo pseudo-CURIE + append_fragment = False if ( doi_resolved_parsed.netloc == "zenodo.org" and doi_resolved_parsed.path.startswith(ZENODO_RECORD_PREFIX) @@ -187,6 +189,15 @@ def fetchDOI( doi_resolved_url = ( "zenodo:" + doi_resolved_parsed.path[len(ZENODO_RECORD_PREFIX) :] ) + append_fragment = True + elif ( + doi_resolved_parsed.netloc == "zenodo.org" + and doi_resolved_parsed.path.startswith(ZENODO_NEW_RECORD_PREFIX) + ): + doi_resolved_url = ( + "zenodo:" + doi_resolved_parsed.path[len(ZENODO_NEW_RECORD_PREFIX) :] + ) + append_fragment = True elif ( doi_resolved_parsed.netloc == "b2share.eudat.eu" and doi_resolved_parsed.path.startswith(B2SHARE_RECORD_PREFIX) @@ -194,6 +205,7 @@ def fetchDOI( doi_resolved_url = ( "b2share:" + doi_resolved_parsed.path[len(B2SHARE_RECORD_PREFIX) :] ) + append_fragment = True elif ( doi_resolved_parsed.netloc == "osf.io" and doi_resolved_parsed.path.startswith(OSF_IO_RECORD_PREFIX) @@ -202,6 +214,7 @@ def fetchDOI( "osf.io:" + doi_resolved_parsed.path[len(OSF_IO_RECORD_PREFIX) :].split("/")[0] ) + append_fragment = True elif ( doi_resolved_parsed.netloc == "workflowhub.eu" and doi_resolved_parsed.path.startswith(WORKFLOWHUB_RECORD_PREFIX) @@ -218,6 +231,9 @@ def fetchDOI( if len(version_a) > 0: doi_resolved_url += "/" + parse.quote(version_a[0], safe="") + if append_fragment and len(parsedInputURL.fragment) > 0: + doi_resolved_url += "/" + parsedInputURL.fragment + return ProtocolFetcherReturn( kind_or_resolved=cast("URIType", doi_resolved_url), metadata_array=metadata_array, From 608dca6fb2f9b734f615fd5791c9618389b3c832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 30 Apr 2024 18:52:13 +0200 Subject: [PATCH 08/42] Zenodo metadata licence handling has changed since the implementation of this fetcher --- wfexs_backend/fetchers/zenodo.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/wfexs_backend/fetchers/zenodo.py b/wfexs_backend/fetchers/zenodo.py index c47c1f22..f4b78ce7 100644 --- a/wfexs_backend/fetchers/zenodo.py +++ b/wfexs_backend/fetchers/zenodo.py @@ -57,7 +57,8 @@ # See https://developers.zenodo.org/#retrieve37 ZENODO_SCHEME = "zenodo" ZENODO_RECORD_REST = "https://zenodo.org/api/records/" -ZENODO_LICENSE_REST = "https://zenodo.org/api/licenses/" +ZENODO_OLD_LICENSE_REST = "https://zenodo.org/api/licenses/" +ZENODO_LICENSE_REST = "https://zenodo.org/api/vocabularies/licenses/" def fetchZenodo( @@ -132,10 +133,14 @@ def fetchZenodo( metadata_array.extend(metametalicio) except FetcherException as fe: raise FetcherException( - f"Error fetching Zenodo licence metadata {zenodo_lic_id} for {zenodo_id} : {fe.code} {fe.reason}" + f"Error fetching Zenodo licence metadata {zenodo_lic_id} for {zenodo_id} using {licence_meta_url} : {fe.code} {fe.reason}" ) from fe - licence_url = l_metadata.get("metadata", {}).get("url") + # New style + licence_url = l_metadata.get("props", {}).get("url") + if licence_url is None: + # Old style + licence_url = l_metadata.get("metadata", {}).get("url") if licence_url is None: raise FetcherException( f"Zenodo licence metadata {zenodo_lic_id} needed to describe {zenodo_id} is inconsistent: {l_metadata}" @@ -143,7 +148,10 @@ def fetchZenodo( # When no URL, then 
the text should suffice if licence_url == "": - licence_url = l_metadata["metadata"].get("title", zenodo_lic_id) + licence_url = l_metadata.get("title", {}).get("en", "") + # Old style + if licence_url == "": + licence_url = l_metadata["metadata"].get("title", zenodo_lic_id) # Let's select the contents kind: "Optional[ContentKind]" = None From 2a3fe011ce0d664742348698f1049bf95a15f069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 30 Apr 2024 18:53:47 +0200 Subject: [PATCH 09/42] Check against MIME type instead of MIME description on execution import --- wfexs_backend/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 2bca19f5..f08f8ac8 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1479,7 +1479,7 @@ def FromPreviousROCrate( f"Unable to locate {ROCRATE_JSONLD_FILENAME} within {public_name}" ) from e - putative_mime_ld = magic.from_buffer(jsonld_bin) + putative_mime_ld = magic.from_buffer(jsonld_bin, mime=True) if putative_mime_ld != "application/json": raise WFException( f"{ROCRATE_JSONLD_FILENAME} from within {public_name} has unmanagable MIME {putative_mime_ld}" From 359be0d03b361b309ef5c661806f08d41bb2b679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 1 May 2024 01:23:51 +0200 Subject: [PATCH 10/42] Cache remote RO-Crate contents to be imported, unless paranoid mode is enabled --- wfexs_backend/wfexs_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index b1dc70f2..8820c1a7 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -1103,8 +1103,8 @@ def fromPreviousROCrate( cast("URIType", workflowROCrateFilenameOrURI), cacheType=CacheType.ROCrate, offline=False, - ignoreCache=True, - registerInCache=False, + ignoreCache=paranoidMode, + registerInCache=not paranoidMode, ) workflowROCrateFilename = local_content.path From 1200e79b91cf6180064295cc74c84fab9886e1d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 1 May 2024 01:25:34 +0200 Subject: [PATCH 11/42] pyld + rdflib are used to enable proper parsing and querying --- requirements.txt | 3 ++ wfexs_backend/workflow.py | 60 +++++++++++++++++++++++---------------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8cbdcf7f..ed286459 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,4 +27,7 @@ groovy-parser == 0.1.1 data-url pgzip defusedxml +# Needed for proper JSON-LD parsing + SPARQL query +aiohttp +pyld rdflib >= 7.0.0 diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index f08f8ac8..636bde81 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -235,6 +235,8 @@ # We have preference for the C based loader and dumper, but the code # should fallback to default implementations when C ones are not present +# Needed by pyld to detect it +import aiohttp import pyld # type: ignore[import, import-untyped] import rdflib import rdflib.plugins.sparql @@ -1373,30 +1375,40 @@ def IdentifyROCrate( graph from the read JSON-LD, which should allow exploring it. 
""" jsonld_obj = cast("MutableMapping[str, Any]", copy.deepcopy(jsonld)) - # Let's load it using RDFLib tricks - context: "MutableSequence[Union[str, Mapping[str, str]]]" - got_context = jsonld_obj.get("@context") - if got_context is None: - context = [] - elif isinstance(got_context, (str, dict)): - context = [got_context] - elif isinstance(got_context, list): - context = got_context - - # Setting the augmented context with the trick - context.append( - { - "@base": cls.WFEXS_TRICK_SPARQL_BASE, - } - ) - if context != got_context: - jsonld_obj["@context"] = context + # # Let's load it using RDFLib tricks + # context: "MutableSequence[Union[str, Mapping[str, str]]]" + # got_context = jsonld_obj.get("@context") + # if got_context is None: + # context = [] + # elif isinstance(got_context, (str, dict)): + # context = [got_context] + # elif isinstance(got_context, list): + # context = got_context + # + # # Setting the augmented context with the trick + # context.append( + # { + # "@base": cls.WFEXS_TRICK_SPARQL_BASE, + # } + # ) + # + # if context != got_context: + # jsonld_obj["@context"] = context # Now, let's load it in RDFLib, in order learn g = rdflib.Graph() + # expand a document, removing its context + # see: https://json-ld.org/spec/latest/json-ld/#expanded-document-form + # which is the issue RDFLib 7.0.0 has + + # jsonld_obj_ser = jsonld_obj + jsonld_obj_ser = { + "@graph": pyld.jsonld.expand(jsonld_obj, {"keepFreeFloatingNodes": True}) + } + jsonld_str = json.dumps(jsonld_obj_ser) parsed = g.parse( - data=json.dumps(jsonld_obj), + data=jsonld_str, format="json-ld", base=cls.WFEXS_TRICK_SPARQL_PRE_PREFIX, ) @@ -1508,13 +1520,11 @@ def FromPreviousROCrate( if matched_crate.wrwfprofile is None: raise WFException(f"JSON-LD from {public_name} is not a WRROC Workflow") - # TODO - assert False, "The implementation of this method has to be finished" + raise NotImplementedError( + "The implementation of this method has to be finished" + ) - pyld.jsonld.set_document_loader(pyld.jsonld.aiohttp_document_loader(timeout=10)) - # expand a document, removing its context - # see: https://json-ld.org/spec/latest/json-ld/#expanded-document-form - expanded = pyld.jsonld.expand(jsonld_obj) + # TODO workflow_meta = {} return cls.FromStagedRecipe( From 6f1cf28da81c2fbb26e3ae9529dabcffb2de1e23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 7 May 2024 16:23:41 +0200 Subject: [PATCH 12/42] Workflow type is now matched from the provided RO-Crate manifest. Also, detection code has been moved from WfExSBackend to WF class, so it is better reused. 
--- wfexs_backend/wfexs_backend.py | 71 +++++------- wfexs_backend/workflow.py | 199 ++++++++++++++++++++++++++++++--- 2 files changed, 211 insertions(+), 59 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 8820c1a7..130aa401 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -1143,17 +1143,23 @@ def parseAndValidateSecurityContextFile( def validateConfigFiles( self, - workflowMetaFilename: "AnyPath", + workflowMetaFilename: "Union[AnyPath, WorkflowMetaConfigBlock]", securityContextsConfigFilename: "Optional[AnyPath]" = None, ) -> "ExitVal": numErrors = 0 - self.logger.info(f"Validating {workflowMetaFilename}") + workflow_meta: "WorkflowMetaConfigBlock" + + if isinstance(workflowMetaFilename, str): + self.logger.info(f"Validating {workflowMetaFilename}") - with open(workflowMetaFilename, mode="r", encoding="utf-8") as wcf: - workflow_meta = unmarshall_namedtuple(yaml.safe_load(wcf)) + with open(workflowMetaFilename, mode="r", encoding="utf-8") as wcf: + workflow_meta = unmarshall_namedtuple(yaml.safe_load(wcf)) - if not isinstance(workflow_meta, dict): - workflow_meta = {} + if not isinstance(workflow_meta, dict): + workflow_meta = {} + else: + self.logger.info(f"Validating inline configuration") + workflow_meta = workflowMetaFilename valErrors = config_validate(workflow_meta, WF.STAGE_DEFINITION_SCHEMA) if len(valErrors) == 0: @@ -2291,8 +2297,8 @@ def getWorkflowRepoFromROCrateFile( # TODO: get roCrateObj mainEntity programming language # self.logger.debug(roCrateObj.root_dataset.as_jsonld()) - mainEntityProgrammingLanguageId = None - mainEntityProgrammingLanguageUrl = None + mainEntityProgrammingLanguageId: "Optional[str]" = None + mainEntityProgrammingLanguageUrl: "Optional[str]" = None mainEntityIdHolder: "Optional[str]" = None mainEntityId = None workflowPID = None @@ -2321,49 +2327,22 @@ def getWorkflowRepoFromROCrateFile( mainEntityProgrammingLanguageId = eAsLD.get("identifier", {}).get("@id") mainEntityProgrammingLanguageUrl = eAsLD.get("url", {}).get("@id") - # Now, it is time to match the language id - engineDescById: "Optional[WorkflowType]" = None - engineDescByUrl: "Optional[WorkflowType]" = None - for possibleEngineDesc in WF.WORKFLOW_ENGINES: - if (engineDescById is None) and ( - mainEntityProgrammingLanguageId is not None - ): - for pat in possibleEngineDesc.uriMatch: - if isinstance(pat, Pattern): - match = pat.search(mainEntityProgrammingLanguageId) - if match: - engineDescById = possibleEngineDesc - elif pat == mainEntityProgrammingLanguageId: - engineDescById = possibleEngineDesc - - if (engineDescByUrl is None) and ( - mainEntityProgrammingLanguageUrl == possibleEngineDesc.url - ): - engineDescByUrl = possibleEngineDesc - - engineDesc: "WorkflowType" - if engineDescById is not None: - engineDesc = engineDescById - elif engineDescByUrl is not None: - engineDesc = engineDescByUrl - else: + if mainEntityProgrammingLanguageUrl is None: raise WfExSBackendException( - "Found programming language {} (url {}) in RO-Crate manifest is not among the acknowledged ones".format( - mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl - ) + "Workflow RO-Crate manifest does not describe the workflow language" ) - if ( - (engineDescById is not None) - and (engineDescByUrl is not None) - and engineDescById != engineDescByUrl - ): - self.logger.warning( - "Found programming language {} (url {}) leads to different engines".format( - mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl 
- ) + with warnings.catch_warnings(record=True) as rec_w: + warnings.simplefilter("always") + + engineDesc = WF.MatchWorkflowType( + mainEntityProgrammingLanguageUrl, mainEntityProgrammingLanguageId ) + # Logging possibly emitted warnings + for w in rec_w: + self.logger.warning(w.message) + if (expectedEngineDesc is not None) and engineDesc != expectedEngineDesc: raise WfExSBackendException( "Expected programming language {} does not match identified one {} in RO-Crate manifest".format( diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 636bde81..c221eb51 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -404,6 +404,10 @@ class ExportActionException(AbstractWfExSException): pass +class WFWarning(UserWarning): + pass + + class WF: """ Workflow enaction class @@ -1322,6 +1326,8 @@ def FromPreviousInstanceDeclaration( "dc": "http://purl.org/dc/elements/1.1/", "dcterms": "http://purl.org/dc/terms/", "s": "http://schema.org/", + "bs": "https://bioschemas.org/", + "bsworkflow": "https://bioschemas.org/profiles/ComputationalWorkflow/", "rocrate": "https://w3id.org/ro/crate/", "wfcrate": "https://w3id.org/workflowhub/workflow-ro-crate/", "wfhprofile": "https://about.workflowhub.eu/Workflow-RO-Crate/", @@ -1331,7 +1337,7 @@ def FromPreviousInstanceDeclaration( } IS_ROCRATE_SPARQL: "Final[str]" = """\ -SELECT ?rocratejson ?rootdataset ?rocrateprofile ?wfcrateprofile ?wrprocessprofile ?wrwfprofile +SELECT ?rocratejson ?rootdataset ?rocrateprofile ?wfcrateprofile ?mainentity ?bsworkflowprofile ?wrprocessprofile ?wrwfprofile WHERE { ?rocratejson a s:CreativeWork ; @@ -1346,6 +1352,16 @@ def FromPreviousInstanceDeclaration( FILTER ( ?wfcrateprofile = wfhprofile: || STRSTARTS(str(?wfcrateprofile), str(wfcrate:)) ) . + OPTIONAL { + ?rootdataset + s:mainEntity ?mainentity . + ?mainentity + a bs:ComputationalWorkflow ; + dcterms:conformsTo ?bsworkflowprofile . + FILTER ( + STRSTARTS(str(?bsworkflowprofile), str(bsworkflow:)) + ) . + } OPTIONAL { ?rootdataset dcterms:conformsTo ?wfcrateprofile ; @@ -1441,6 +1457,118 @@ def IdentifyROCrate( return (resrow, g) + OBTAIN_WORKFLOW_PID_SPARQL: "Final[str]" = """\ +SELECT ?identifier ?programminglanguage_identifier ?programminglanguage_url ?programminglanguage_version +WHERE { + ?mainentity s:programmingLanguage ?programminglanguage . + ?programminglanguage + a s:ComputerLanguage ; + s:url ?programminglanguage_url . + OPTIONAL { + ?mainentity s:identifier ?identifier . + } + OPTIONAL { + ?programminglanguage + s:version ?programminglanguage_version . + } + OPTIONAL { + ?programminglanguage + s:identifier ?programminglanguage_identifier . + } +} +""" + + @classmethod + def GenerateWorkflowMetaFromJSONLD( + cls, + jsonld_obj: "Mapping[str, Any]", + public_name: "str", + retrospective_first: "bool" = True, + ) -> "WritableWorkflowMetaConfigBlock": + matched_crate, g = cls.IdentifyROCrate(jsonld_obj, public_name) + # Is it an RO-Crate? 
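+        # Beyond the profile cascade, this method also insists on a
+        # main entity typed as a Bioschemas ComputationalWorkflow
+        # (matched by the OPTIONAL block in IS_ROCRATE_SPARQL).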
+ if matched_crate is None: + raise WFException(f"JSON-LD from {public_name} is not an RO-Crate") + + if matched_crate.wfcrateprofile is None: + raise WFException(f"JSON-LD from {public_name} is not a Workflow RO-Crate") + + if matched_crate.mainentity is None: + raise WFException( + f"Unable to find the main entity workflow at {public_name} Workflow RO-Crate" + ) + + if matched_crate.wrwfprofile is None: + raise WFException(f"JSON-LD from {public_name} is not a WRROC Workflow") + + # This query will tell us where the original workflow was located, + # its language and version + qlang = rdflib.plugins.sparql.prepareQuery( + cls.OBTAIN_WORKFLOW_PID_SPARQL, + initNs=cls.SPARQL_NS, + ) + + # TODO: cache resolution of contexts + # TODO: disallow network access for context resolution + # when not in right phase + try: + qlangres = g.query( + qlang, + initBindings={ + "mainentity": matched_crate.mainentity, + }, + ) + except Exception as e: + raise WFException( + f"Unable to perform JSON-LD workflow details query over {public_name} (see cascading exceptions)" + ) from e + + langrow: "Optional[rdflib.query.ResultRow]" = None + # In the future, there could be more than one match, when + # nested RO-Crate scenarios happen + for row in qlangres: + assert isinstance( + row, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + langrow = row + break + + if langrow is None: + raise WFException( + f"Unable to get workflow engine details from {public_name}" + ) + + programminglanguage_url = ( + None + if langrow.programminglanguage_url is None + else str(langrow.programminglanguage_url) + ) + programminglanguage_identifier = ( + None + if langrow.programminglanguage_identifier is None + else str(langrow.programminglanguage_identifier) + ) + # Getting the workflow type + workflow_type = cls.MatchWorkflowType( + programminglanguage_url, programminglanguage_identifier + ) + + # At this point we know the workflow engine is supported + # but we need to be sure the container solution is also supported + + # TODO: finish + + workflow_meta: "WritableWorkflowMetaConfigBlock" = { + "workflow_id": {}, + "workflow_type": workflow_type.shortname, + "environment": {}, + "params": {}, + "outputs": {}, + "workflow_config": {}, + } + + return workflow_meta + @classmethod def FromPreviousROCrate( cls, @@ -1509,24 +1637,17 @@ def FromPreviousROCrate( f"Content from {public_name} is not a valid JSON" ) from jde - matched_crate, g = cls.IdentifyROCrate(jsonld_obj, public_name) - # Is it an RO-Crate? 
- if matched_crate is None: - raise WFException(f"JSON-LD from {public_name} is not an RO-Crate") - - if matched_crate.wfcrateprofile is None: - raise WFException(f"JSON-LD from {public_name} is not a Workflow RO-Crate") + workflow_meta = cls.GenerateWorkflowMetaFromJSONLD(jsonld_obj, public_name) - if matched_crate.wrwfprofile is None: - raise WFException(f"JSON-LD from {public_name} is not a WRROC Workflow") + if wfexs.validateConfigFiles(workflow_meta, securityContextsConfigFilename) > 0: + raise WFException( + f"Generated WfExS description from {public_name} fails (have a look at the log messages for details)" + ) raise NotImplementedError( "The implementation of this method has to be finished" ) - # TODO - workflow_meta = {} - return cls.FromStagedRecipe( wfexs, workflow_meta, @@ -1631,6 +1752,58 @@ def FromForm( paranoid_mode=paranoidMode, ) + @classmethod + def MatchWorkflowType( + cls, + mainEntityProgrammingLanguageUrl: "str", + mainEntityProgrammingLanguageId: "Optional[str]", + ) -> "WorkflowType": + # Now, it is time to match the language id + engineDescById: "Optional[WorkflowType]" = None + engineDescByUrl: "Optional[WorkflowType]" = None + for possibleEngineDesc in cls.WORKFLOW_ENGINES: + if (engineDescById is None) and ( + mainEntityProgrammingLanguageId is not None + ): + for pat in possibleEngineDesc.uriMatch: + if isinstance(pat, Pattern): + match = pat.search(mainEntityProgrammingLanguageId) + if match: + engineDescById = possibleEngineDesc + elif pat == mainEntityProgrammingLanguageId: + engineDescById = possibleEngineDesc + + if (engineDescByUrl is None) and ( + mainEntityProgrammingLanguageUrl == possibleEngineDesc.url + ): + engineDescByUrl = possibleEngineDesc + + engineDesc: "WorkflowType" + if engineDescById is not None: + engineDesc = engineDescById + elif engineDescByUrl is not None: + engineDesc = engineDescByUrl + else: + raise WFException( + "Found programming language {} (url {}) in RO-Crate manifest is not among the acknowledged ones".format( + mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl + ) + ) + + if ( + (engineDescById is not None) + and (engineDescByUrl is not None) + and engineDescById != engineDescByUrl + ): + warnings.warn( + "Queried programming language {} (url {}) leads to different engines".format( + mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl + ), + WFWarning, + ) + + return engineDesc + def fetchWorkflow( self, workflow_id: "WorkflowId", From d590021544a78d7ce55a7351d0255418f0fd4e28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 8 May 2024 01:35:11 +0200 Subject: [PATCH 13/42] Removed limitation about the possible values of descriptor_type, as it is the version of WfExS-backend the one defining the accepted values --- wfexs_backend/schemas/stage-definition.json | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/wfexs_backend/schemas/stage-definition.json b/wfexs_backend/schemas/stage-definition.json index 501c5600..8d727e8d 100644 --- a/wfexs_backend/schemas/stage-definition.json +++ b/wfexs_backend/schemas/stage-definition.json @@ -567,11 +567,7 @@ "workflow_type": { "title": "The optional, forced workflow type", "description": "When this key is set, it is forcing the workflow type. 
WfExS-backend is usually able to identify the workflow type, unless there are two intertwined workflows in the same location", - "type": "string", - "enum": [ - "nextflow", - "cwl" - ] + "type": "string" }, "workflow_config": { "type": "object", From 00b68b300ff0da566ee74be0b3e2c62df3b31053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 8 May 2024 01:38:43 +0200 Subject: [PATCH 14/42] Added more checks about the descriptor type (a.k.a. the forced workflow type) --- requirements.txt | 2 ++ wfexs_backend/wfexs_backend.py | 28 ++++++++++++++++++++ wfexs_backend/workflow.py | 47 ++++++++++++++++++++++++++++------ 3 files changed, 69 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index ed286459..e7362889 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,8 @@ groovy-parser == 0.1.1 data-url pgzip defusedxml +# This is needed for exception groups +exceptiongroup ; python_version < '3.11' # Needed for proper JSON-LD parsing + SPARQL query aiohttp pyld diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 130aa401..2c8f0acd 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -1696,6 +1696,24 @@ def cacheWorkflow( and the version will represent either the branch, tag or specific commit. So, the whole TRS fetching machinery is bypassed. """ + + requested_workflow_type: "Optional[WorkflowType]" = None + if descriptor_type is not None: + # First, try with the workflow type shortname + requested_workflow_type = WF.RECOGNIZED_SHORTNAME_DESCRIPTORS.get( + descriptor_type + ) + if requested_workflow_type is None: + # then, with the workflow type TRS name + requested_workflow_type = WF.RECOGNIZED_TRS_DESCRIPTORS.get( + descriptor_type + ) + + if requested_workflow_type is None: + self.logger.warning( + f"Workflow of type {descriptor_type} is not supported by this version of WfExS-backend" + ) + putative_repo_url = str(workflow_id) parsedRepoURL = urllib.parse.urlparse(putative_repo_url) @@ -1796,6 +1814,14 @@ def cacheWorkflow( # This can be incorrect, but let it be for now if i_workflow is not None: + if ( + requested_workflow_type is not None + and requested_workflow_type != i_workflow.workflow_type + ): + message = f"Fetched workflow is of type {i_workflow.workflow_type.shortname} , but it was explicitly requested to be of type {requested_workflow_type.shortname}" + self.logger.error(message) + raise WfExSBackendException(message) + guessedRepo = i_workflow.remote_repo engineDesc = i_workflow.workflow_type if cached_putative_path is not None: @@ -2294,6 +2320,8 @@ def getWorkflowRepoFromROCrateFile( :return: """ roCrateObj = FixedROCrate(roCrateFile) + # roCrateJSON = roCrateObj.metadata.generate() + # WF.IdentifyROCrate(roCrateJON, roCrateFile) # TODO: get roCrateObj mainEntity programming language # self.logger.debug(roCrateObj.root_dataset.as_jsonld()) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index c221eb51..0a54356e 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -316,6 +316,10 @@ if TYPE_CHECKING: from .wfexs_backend import WfExSBackend +# This code needs exception groups +if sys.version_info[:2] < (3, 11): + from exceptiongroup import ExceptionGroup + # Related export namedtuples class ExportItem(NamedTuple): @@ -378,6 +382,7 @@ def __missing__(self, key: KT) -> VT: ] ROCRATE_JSONLD_FILENAME: "Final[str]" = "ro-crate-metadata.json" +LEGACY_ROCRATE_JSONLD_FILENAME: "Final[str]" = 
"ro-crate-metadata.jsonld" def _wakeupEncDir( @@ -434,6 +439,10 @@ class WF: map(lambda t: (t.trs_descriptor, t), WORKFLOW_ENGINES) ) + RECOGNIZED_SHORTNAME_DESCRIPTORS: "Final[Mapping[TRS_Workflow_Descriptor, WorkflowType]]" = dict( + map(lambda t: (t.shortname, t), WORKFLOW_ENGINES) + ) + def __init__( self, wfexs: "WfExSBackend", @@ -470,6 +479,7 @@ def __init__( versioning, providing an UUID, etc. :param descriptor_type: The type of descriptor that represents this version of the workflow (e.g. CWL, WDL, NFL, or GALAXY). It is optional, so it is guessed from the calls to the API. + It can be either the short name of the workflow engine, or the name used by GA4GH TRS. :param trs_endpoint: The TRS endpoint used to find the workflow. :param params: Optional params for the workflow execution. :param outputs: @@ -547,9 +557,17 @@ def __init__( workflow_meta["nickname"] = nickname if descriptor_type is not None: descriptor = self.RECOGNIZED_TRS_DESCRIPTORS.get(descriptor_type) + if descriptor is None: + descriptor = self.RECOGNIZED_SHORTNAME_DESCRIPTORS.get( + descriptor_type + ) + if descriptor is not None: workflow_meta["workflow_type"] = descriptor.shortname else: + self.logger.warning( + f"This instance of WfExS backend does not recognize workflows of type {descriptor_type}" + ) workflow_meta["workflow_type"] = descriptor_type if trs_endpoint is not None: workflow_meta["trs_endpoint"] = trs_endpoint @@ -1591,12 +1609,19 @@ def FromPreviousROCrate( # Is it a bare file or an archive? jsonld_filename: "Optional[str]" = None if os.path.isdir(workflowROCrateFilename): - jsonld_filename = os.path.join( + possible_jsonld_filename = os.path.join( workflowROCrateFilename, ROCRATE_JSONLD_FILENAME ) - if not os.path.exists(jsonld_filename): + legacy_jsonld_filename = os.path.join( + workflowROCrateFilename, LEGACY_ROCRATE_JSONLD_FILENAME + ) + if os.path.exists(possible_jsonld_filename): + jsonld_filename = possible_jsonld_filename + elif os.path.exists(legacy_jsonld_filename): + jsonld_filename = legacy_jsonld_filename + else: raise WFException( - f"{public_name} does not contain a member {ROCRATE_JSONLD_FILENAME}" + f"{public_name} does not contain a member {ROCRATE_JSONLD_FILENAME} or {LEGACY_ROCRATE_JSONLD_FILENAME}" ) elif os.path.isfile(workflowROCrateFilename): jsonld_filename = workflowROCrateFilename @@ -1615,9 +1640,15 @@ def FromPreviousROCrate( try: jsonld_bin = zf.read(ROCRATE_JSONLD_FILENAME) except Exception as e: - raise WFException( - f"Unable to locate {ROCRATE_JSONLD_FILENAME} within {public_name}" - ) from e + try: + jsonld_bin = zf.read(LEGACY_ROCRATE_JSONLD_FILENAME) + except Exception as e2: + raise WFException( + f"Unable to locate RO-Crate metadata descriptor within {public_name}" + ) from ExceptionGroup( + f"Both {ROCRATE_JSONLD_FILENAME} and {LEGACY_ROCRATE_JSONLD_FILENAME} tried", + [e, e2], + ) putative_mime_ld = magic.from_buffer(jsonld_bin, mime=True) if putative_mime_ld != "application/json": @@ -1785,7 +1816,7 @@ def MatchWorkflowType( engineDesc = engineDescByUrl else: raise WFException( - "Found programming language {} (url {}) in RO-Crate manifest is not among the acknowledged ones".format( + "Found programming language {} (url {}) in RO-Crate manifest is not among the supported ones by WfExS-backend".format( mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl ) ) @@ -1796,7 +1827,7 @@ def MatchWorkflowType( and engineDescById != engineDescByUrl ): warnings.warn( - "Queried programming language {} (url {}) leads to different engines".format( + 
"Queried programming language {} and its url {} lead to different engines".format( mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl ), WFWarning, From e4161113107c190572a2201134237ad246906802 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 8 May 2024 19:15:45 +0200 Subject: [PATCH 15/42] Huge code reorganization focused on truly modularize both supported container factories and workflow engines. * Also, added a couple of new command line commands to list them. * Additionally, created wfexs_backend.utils.rocrate.ROCrateToolbox , to move there the methods to generate a WfExS definition from an input RO-Crate metadata. * Other code was also refactored. * Now the list of executions and containers from a WRROC is walked. --- wfexs_backend/__main__.py | 39 ++ .../__init__.py} | 63 +-- .../abstract_docker_container.py | 14 +- .../docker_container.py | 14 +- .../container_factories/no_container.py | 110 +++++ .../podman_container.py | 14 +- .../singularity_container.py | 16 +- wfexs_backend/ro_crate.py | 4 +- wfexs_backend/utils/rocrate.py | 448 ++++++++++++++++++ wfexs_backend/wfexs_backend.py | 236 ++++++++- wfexs_backend/workflow.py | 357 +------------- .../__init__.py} | 110 +++-- .../{ => workflow_engines}/cwl_engine.py | 23 +- .../{ => workflow_engines}/nextflow_engine.py | 30 +- 14 files changed, 972 insertions(+), 506 deletions(-) rename wfexs_backend/{container.py => container_factories/__init__.py} (90%) rename wfexs_backend/{ => container_factories}/abstract_docker_container.py (98%) rename wfexs_backend/{ => container_factories}/docker_container.py (98%) create mode 100644 wfexs_backend/container_factories/no_container.py rename wfexs_backend/{ => container_factories}/podman_container.py (98%) rename wfexs_backend/{ => container_factories}/singularity_container.py (99%) create mode 100644 wfexs_backend/utils/rocrate.py rename wfexs_backend/{engine.py => workflow_engines/__init__.py} (93%) rename wfexs_backend/{ => workflow_engines}/cwl_engine.py (99%) rename wfexs_backend/{ => workflow_engines}/nextflow_engine.py (99%) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index db115950..c34fdce7 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -105,6 +105,14 @@ class WfExS_Commands(StrDocEnum): "list-licences", f"List the documented licences, both embedded and fetched from SPDX release {LicenceMatcherSingleton.DEFAULT_SPDX_VERSION}", ) + ListContainerFactories = ( + "list-container-factories", + "List the supported container factories", + ) + ListWorkflowEngines = ( + "list-workflow-engines", + "List the supported workflow engines", + ) Stage = ( "stage", "Prepare the staging (working) directory for workflow execution, fetching dependencies and contents", @@ -402,6 +410,29 @@ def processListPushersCommand(wfBackend: "WfExSBackend", logLevel: "int") -> "in return 0 +def processListContainerFactoriesCommand( + wfBackend: "WfExSBackend", logLevel: "int" +) -> "int": + container_types = wfBackend.listImplementedContainerTypes() + print(f"{len(container_types)} supported container factories") + for container_type in container_types: + print(f"\t{container_type.value}") + + return 0 + + +def processListWorkflowEnginesCommand( + wfBackend: "WfExSBackend", logLevel: "int" +) -> "int": + print(f"{len(wfBackend.WORKFLOW_ENGINES)} supported workflow engines") + for workflow_type in wfBackend.WORKFLOW_ENGINES: + print( + f"\t{workflow_type.shortname} => {workflow_type.name} (priority 
{workflow_type.priority})" + ) + + return 0 + + def processListLicencesCommand(wfBackend: "WfExSBackend", logLevel: "int") -> "int": licence_matcher = LicenceMatcherSingleton() documented_licences = licence_matcher.describeDocumentedLicences() @@ -1098,6 +1129,8 @@ def _get_wfexs_argparse_internal( ap_lf = genParserSub(sp, WfExS_Commands.ListFetchers) ap_lp = genParserSub(sp, WfExS_Commands.ListPushers) + ap_lc = genParserSub(sp, WfExS_Commands.ListContainerFactories) + ap_lw = genParserSub(sp, WfExS_Commands.ListWorkflowEngines) ap_ll = genParserSub(sp, WfExS_Commands.ListLicences) ap_cv = genParserSub(sp, WfExS_Commands.ConfigValidate, preStageParams=True) @@ -1260,6 +1293,12 @@ def main() -> None: if command == WfExS_Commands.ListPushers: sys.exit(processListPushersCommand(wfBackend, logLevel)) + if command == WfExS_Commands.ListContainerFactories: + sys.exit(processListContainerFactoriesCommand(wfBackend, logLevel)) + + if command == WfExS_Commands.ListWorkflowEngines: + sys.exit(processListWorkflowEnginesCommand(wfBackend, logLevel)) + if command == WfExS_Commands.ListLicences: sys.exit(processListLicencesCommand(wfBackend, logLevel)) diff --git a/wfexs_backend/container.py b/wfexs_backend/container_factories/__init__.py similarity index 90% rename from wfexs_backend/container.py rename to wfexs_backend/container_factories/__init__.py index 5f55d3e8..fbf9af5b 100644 --- a/wfexs_backend/container.py +++ b/wfexs_backend/container_factories/__init__.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ TYPE_CHECKING, ) -from .common import ( +from ..common import ( AbstractWfExSException, ContainerTaggedName, ContainerType, @@ -44,6 +44,7 @@ if TYPE_CHECKING: from typing import ( Any, + ClassVar, Mapping, MutableMapping, MutableSequence, @@ -61,7 +62,7 @@ Final, ) - from .common import ( + from ..common import ( AbsPath, AnyPath, ContainerEngineVersionStr, @@ -88,7 +89,7 @@ class DockerManifestMetadata(TypedDict): YAMLLoader: TypeAlias = Union[yaml.Loader, yaml.CLoader] -from . import common +from .. import common # A couple of constants needed for several fixes DOCKER_SCHEME: "Final[str]" = "docker" @@ -225,6 +226,9 @@ class ContainerNotFoundException(ContainerFactoryException): class ContainerFactory(abc.ABC): + # Is this implementation enabled? + ENABLED: "ClassVar[bool]" = True + def __init__( self, cacheDir: "Optional[AnyPath]" = None, @@ -489,54 +493,3 @@ def supportsFeature(self, feat: "str") -> "bool": factory in this installation. 
Currently userns """ return feat in self._features - - -class NoContainerFactory(ContainerFactory): - """ - The 'no container approach', for development and local installed software - """ - - # def __init__(self, cacheDir=None, local_config=None, engine_name='unset'): - # super().__init__(cacheDir=cacheDir, local_config=local_config, engine_name=engine_name) - AcceptedContainerTypes = set([common.ContainerType.NoContainer]) - - @classmethod - def ContainerType(cls) -> "common.ContainerType": - return common.ContainerType.NoContainer - - @classmethod - def AcceptsContainerType( - cls, container_type: "Union[common.ContainerType, Set[common.ContainerType]]" - ) -> "bool": - return not cls.AcceptedContainerTypes.isdisjoint( - container_type if isinstance(container_type, set) else (container_type,) - ) - - def engine_version(self) -> "ContainerEngineVersionStr": - """No container engine, empty version""" - return cast("ContainerEngineVersionStr", "") - - def materializeSingleContainer( - self, - tag: "ContainerTaggedName", - simpleFileNameMethod: "ContainerFileNamingMethod", - containers_dir: "Optional[Union[RelPath, AbsPath]]" = None, - offline: "bool" = False, - force: "bool" = False, - ) -> "Optional[Container]": - """ - This is a no-op - """ - return None - - def deploySingleContainer( - self, - container: "Container", - simpleFileNameMethod: "ContainerFileNamingMethod", - containers_dir: "Optional[AnyPath]" = None, - force: "bool" = False, - ) -> "bool": - """ - This is a no-op - """ - return False diff --git a/wfexs_backend/abstract_docker_container.py b/wfexs_backend/container_factories/abstract_docker_container.py similarity index 98% rename from wfexs_backend/abstract_docker_container.py rename to wfexs_backend/container_factories/abstract_docker_container.py index ccf53c74..a4842eb6 100644 --- a/wfexs_backend/abstract_docker_container.py +++ b/wfexs_backend/container_factories/abstract_docker_container.py @@ -34,14 +34,14 @@ TYPE_CHECKING, ) -from .utils.misc import lazy_import +from ..utils.misc import lazy_import magic = lazy_import("magic") # import magic import pgzip -from .common import ( +from ..common import ( AbstractWfExSException, ) @@ -69,7 +69,7 @@ Final, ) - from .common import ( + from ..common import ( AbsPath, AnyPath, ContainerEngineVersionStr, @@ -83,7 +83,7 @@ RelPath, ) - from .container import ( + from . import ( Container, ) @@ -97,13 +97,13 @@ class DockerManifestMetadata(TypedDict): manifests: "Sequence[DockerLikeManifest]" -from . import common -from .container import ( +from .. import common +from . import ( ContainerFactory, ContainerFactoryException, DOCKER_URI_PREFIX, ) -from .utils.digests import ComputeDigestFromObject +from ..utils.digests import ComputeDigestFromObject DOCKER_PROTO = DOCKER_URI_PREFIX + "//" diff --git a/wfexs_backend/docker_container.py b/wfexs_backend/container_factories/docker_container.py similarity index 98% rename from wfexs_backend/docker_container.py rename to wfexs_backend/container_factories/docker_container.py index 76896674..29613aef 100644 --- a/wfexs_backend/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -38,7 +38,7 @@ Final, ) - from .common import ( + from ..common import ( AbsPath, AnyPath, ContainerFileNamingMethod, @@ -51,17 +51,17 @@ URIType, ) - from .container import ( + from . import ( DockerManifestMetadata, ) -from .common import ( +from ..common import ( ContainerType, DEFAULT_DOCKER_CMD, META_JSON_POSTFIX, ) -from .container import ( +from . import ( Container, ContainerEngineException, ContainerFactoryException, @@ -70,11 +70,11 @@ AbstractDockerContainerFactory, DOCKER_PROTO, ) -from .utils.contents import ( +from ..utils.contents import ( link_or_copy, real_unlink_if_exists, ) -from .utils.digests import ComputeDigestFromFile +from ..utils.digests import ComputeDigestFromFile class DockerContainerFactory(AbstractDockerContainerFactory): diff --git a/wfexs_backend/container_factories/no_container.py b/wfexs_backend/container_factories/no_container.py new file mode 100644 index 00000000..9d74f1be --- /dev/null +++ b/wfexs_backend/container_factories/no_container.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import + +from typing import ( + cast, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from typing import ( + Any, + Mapping, + MutableMapping, + MutableSequence, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, + ) + + from ..common import ( + AbsPath, + AnyPath, + ContainerEngineVersionStr, + ContainerFileNamingMethod, + ContainerLocalConfig, + ContainerOperatingSystem, + ContainerTaggedName, + Fingerprint, + ProcessorArchitecture, + RelPath, + URIType, + ) + + +from . import ( + Container, + ContainerFactory, +) + +from .. 
import common + + +class NoContainerFactory(ContainerFactory): + """ + The 'no container approach', for development and local installed software + """ + + # def __init__(self, cacheDir=None, local_config=None, engine_name='unset'): + # super().__init__(cacheDir=cacheDir, local_config=local_config, engine_name=engine_name) + AcceptedContainerTypes = set([common.ContainerType.NoContainer]) + + @classmethod + def ContainerType(cls) -> "common.ContainerType": + return common.ContainerType.NoContainer + + @classmethod + def AcceptsContainerType( + cls, container_type: "Union[common.ContainerType, Set[common.ContainerType]]" + ) -> "bool": + return not cls.AcceptedContainerTypes.isdisjoint( + container_type if isinstance(container_type, set) else (container_type,) + ) + + def engine_version(self) -> "ContainerEngineVersionStr": + """No container engine, empty version""" + return cast("ContainerEngineVersionStr", "") + + def materializeSingleContainer( + self, + tag: "ContainerTaggedName", + simpleFileNameMethod: "ContainerFileNamingMethod", + containers_dir: "Optional[Union[RelPath, AbsPath]]" = None, + offline: "bool" = False, + force: "bool" = False, + ) -> "Optional[Container]": + """ + This is a no-op + """ + return None + + def deploySingleContainer( + self, + container: "Container", + simpleFileNameMethod: "ContainerFileNamingMethod", + containers_dir: "Optional[AnyPath]" = None, + force: "bool" = False, + ) -> "bool": + """ + This is a no-op + """ + return False diff --git a/wfexs_backend/podman_container.py b/wfexs_backend/container_factories/podman_container.py similarity index 98% rename from wfexs_backend/podman_container.py rename to wfexs_backend/container_factories/podman_container.py index 959eefe5..ba352976 100644 --- a/wfexs_backend/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -38,7 +38,7 @@ Final, ) - from .common import ( + from ..common import ( AbsPath, AnyPath, ContainerFileNamingMethod, @@ -51,16 +51,16 @@ URIType, ) - from .container import ( + from . import ( DockerManifestMetadata, ) -from .common import ( +from ..common import ( ContainerType, DEFAULT_PODMAN_CMD, META_JSON_POSTFIX, ) -from .container import ( +from . 
import ( Container, ContainerEngineException, ContainerFactoryException, @@ -70,11 +70,11 @@ AbstractDockerContainerFactory, DOCKER_PROTO, ) -from .utils.contents import ( +from ..utils.contents import ( link_or_copy, real_unlink_if_exists, ) -from .utils.digests import ComputeDigestFromFile +from ..utils.digests import ComputeDigestFromFile class PodmanContainerFactory(AbstractDockerContainerFactory): diff --git a/wfexs_backend/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py similarity index 99% rename from wfexs_backend/singularity_container.py rename to wfexs_backend/container_factories/singularity_container.py index 6c9f14ed..9bf85289 100644 --- a/wfexs_backend/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,12 +32,12 @@ from urllib import parse import uuid -from .common import ( +from ..common import ( META_JSON_POSTFIX, DEFAULT_SINGULARITY_CMD, ) -from . import common +from .. import common if TYPE_CHECKING: from typing import ( @@ -56,7 +56,7 @@ TypedDict, ) - from .common import ( + from ..common import ( AbsPath, AnyPath, ContainerFileNamingMethod, @@ -78,7 +78,7 @@ class SingularityManifest(TypedDict): image_signature: NotRequired[Fingerprint] -from .container import ( +from . import ( Container, ContainerFactory, ContainerEngineException, @@ -87,9 +87,9 @@ class SingularityManifest(TypedDict): DOCKER_SCHEME, ) -from .utils.contents import link_or_copy -from .utils.digests import ComputeDigestFromFile, nihDigester -from .utils.docker import DockerHelper +from ..utils.contents import link_or_copy +from ..utils.digests import ComputeDigestFromFile, nihDigester +from ..utils.docker import DockerHelper class FailedContainerTag(NamedTuple): diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index d1f085ec..ab2d9dac 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -78,11 +78,11 @@ WorkflowEngineVersionStr, ) - from .container import ( + from .container_factories import ( Container, ) - from .engine import ( + from .workflow_engines import ( MaterializedWorkflowEngine, WorkflowType, ) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py new file mode 100644 index 00000000..95704add --- /dev/null +++ b/wfexs_backend/utils/rocrate.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
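+
+# Module overview (descriptive comment): the ROCrateToolbox defined
+# below concentrates the RO-Crate introspection logic in one place.
+# It expands the crate's JSON-LD with pyld, loads the expanded
+# document into an rdflib graph, and then runs SPARQL queries over
+# that graph to recognise the declared profiles, the main workflow
+# entity and the recorded executions.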
+from __future__ import absolute_import
+
+import abc
+import copy
+import inspect
+import json
+import logging
+
+from typing import (
+    cast,
+    TYPE_CHECKING,
+)
+
+if TYPE_CHECKING:
+    from typing import (
+        Any,
+        Mapping,
+        MutableMapping,
+        Optional,
+        Tuple,
+    )
+
+    from typing_extensions import (
+        Final,
+    )
+
+    from ..common import (
+        WritableWorkflowMetaConfigBlock,
+    )
+
+    from ..wfexs_backend import (
+        WfExSBackend,
+    )
+
+# Needed by pyld to detect it
+import aiohttp
+import pyld  # type: ignore[import, import-untyped]
+import rdflib
+import rdflib.plugins.sparql
+
+
+class ROCrateToolboxException(Exception):
+    pass
+
+
+class ROCrateToolbox(abc.ABC):
+    # This is needed due to limitations from rdflib mangling relative ids
+    WFEXS_TRICK_SPARQL_PRE_PREFIX: "Final[str]" = "shttp:"
+    WFEXS_TRICK_SPARQL_BASE: "Final[str]" = f"{WFEXS_TRICK_SPARQL_PRE_PREFIX}///"
+    WFEXS_TRICK_SPARQL_NS: "Final[str]" = "wfexs"
+
+    SPARQL_NS = {
+        "dc": "http://purl.org/dc/elements/1.1/",
+        "dcterms": "http://purl.org/dc/terms/",
+        "s": "http://schema.org/",
+        "bs": "https://bioschemas.org/",
+        "bsworkflow": "https://bioschemas.org/profiles/ComputationalWorkflow/",
+        "rocrate": "https://w3id.org/ro/crate/",
+        "wfcrate": "https://w3id.org/workflowhub/workflow-ro-crate/",
+        "wfhprofile": "https://about.workflowhub.eu/Workflow-RO-Crate/",
+        "wrprocess": "https://w3id.org/ro/wfrun/process/",
+        "wrwf": "https://w3id.org/ro/wfrun/workflow/",
+        "wrterm": "https://w3id.org/ro/terms/workflow-run#",
+        "wikidata": "https://www.wikidata.org/wiki/",
+        WFEXS_TRICK_SPARQL_NS: WFEXS_TRICK_SPARQL_BASE,
+    }
+
+    def __init__(self, wfexs: "WfExSBackend"):
+        if wfexs is None:
+            raise ROCrateToolboxException(
+                "Unable to initialize, no WfExSBackend instance provided"
+            )
+
+        # Getting a logger focused on specific classes
+        self.logger = logging.getLogger(
+            dict(inspect.getmembers(self))["__module__"]
+            + "::"
+            + self.__class__.__name__
+        )
+
+        self.wfexs = wfexs
+
+    IS_ROCRATE_SPARQL: "Final[str]" = """\
+SELECT ?rocratejson ?rootdataset ?rocrateprofile ?wfcrateprofile ?mainentity ?bsworkflowprofile ?wrprocessprofile ?wrwfprofile
+WHERE {
+    ?rocratejson
+        a s:CreativeWork ;
+        dcterms:conformsTo ?rocrateprofile ;
+        s:about ?rootdataset .
+    ?rootdataset a s:Dataset .
+    FILTER (
+        STRSTARTS(str(?rocrateprofile), str(rocrate:))
+    ) .
+    OPTIONAL {
+        ?rocratejson dcterms:conformsTo ?wfcrateprofile .
+        FILTER (
+            ?wfcrateprofile = wfhprofile: || STRSTARTS(str(?wfcrateprofile), str(wfcrate:))
+        ) .
+        OPTIONAL {
+            ?rootdataset
+                s:mainEntity ?mainentity .
+            ?mainentity
+                a bs:ComputationalWorkflow ;
+                dcterms:conformsTo ?bsworkflowprofile .
+            FILTER (
+                STRSTARTS(str(?bsworkflowprofile), str(bsworkflow:))
+            ) .
+        }
+        OPTIONAL {
+            ?rootdataset
+                dcterms:conformsTo ?wfcrateprofile ;
+                dcterms:conformsTo ?wrprocessprofile ;
+                dcterms:conformsTo ?wrwfprofile .
+            FILTER (
+                STRSTARTS(str(?wrprocessprofile), str(wrprocess:)) &&
+                STRSTARTS(str(?wrwfprofile), str(wrwf:))
+            ) .
+        }
+    }
+}
+"""
+
+    def identifyROCrate(
+        self, jsonld: "Mapping[str, Any]", public_name: "str"
+    ) -> "Tuple[Optional[rdflib.query.ResultRow], rdflib.graph.Graph]":
+        """
+        This method is used to identify whether the input JSON is a
+        JSON-LD related to RO-Crate.
+
+        The returned value is a tuple, where the first element is the
+        result row giving the QName of the root dataset, and the different
+        profiles being matched: RO-Crate, Workflow RO-Crate, WRROC process and WRROC workflow.
+        The second element of the returned tuple is the rdflib RDF
+        graph from the read JSON-LD, which should allow exploring it.
+        """
+        jsonld_obj = cast("MutableMapping[str, Any]", copy.deepcopy(jsonld))
+
+        # # Let's load it using RDFLib tricks
+        # context: "MutableSequence[Union[str, Mapping[str, str]]]"
+        # got_context = jsonld_obj.get("@context")
+        # if got_context is None:
+        #     context = []
+        # elif isinstance(got_context, (str, dict)):
+        #     context = [got_context]
+        # elif isinstance(got_context, list):
+        #     context = got_context
+        #
+        # # Setting the augmented context with the trick
+        # context.append(
+        #     {
+        #         "@base": self.WFEXS_TRICK_SPARQL_BASE,
+        #     }
+        # )
+        #
+        # if context != got_context:
+        #     jsonld_obj["@context"] = context
+
+        # Now, let's load it in RDFLib, in order to learn
+        g = rdflib.Graph()
+        # expand a document, removing its context
+        # see: https://json-ld.org/spec/latest/json-ld/#expanded-document-form
+        # which works around an issue RDFLib 7.0.0 has
+
+        # jsonld_obj_ser = jsonld_obj
+        jsonld_obj_ser = {
+            "@graph": pyld.jsonld.expand(jsonld_obj, {"keepFreeFloatingNodes": True})
+        }
+        jsonld_str = json.dumps(jsonld_obj_ser)
+        parsed = g.parse(
+            data=jsonld_str,
+            format="json-ld",
+            base=self.WFEXS_TRICK_SPARQL_PRE_PREFIX,
+        )
+
+        # This query will tell us whether the JSON-LD is about an RO-Crate 1.1
+        q = rdflib.plugins.sparql.prepareQuery(
+            self.IS_ROCRATE_SPARQL,
+            initNs=self.SPARQL_NS,
+        )
+
+        # TODO: cache resolution of contexts
+        # TODO: disallow network access for context resolution
+        # when not in right phase
+        try:
+            qres = g.query(q)
+        except Exception as e:
+            raise ROCrateToolboxException(
+                f"Unable to perform JSON-LD check query over {public_name} (see cascading exceptions)"
+            ) from e
+
+        resrow: "Optional[rdflib.query.ResultRow]" = None
+        # In the future, there could be more than one match, when
+        # nested RO-Crate scenarios happen
+        for row in qres:
+            assert isinstance(
+                row, rdflib.query.ResultRow
+            ), "Check the SPARQL code, as it should be a SELECT query"
+            resrow = row
+            break
+
+        return (resrow, g)
+
+    OBTAIN_WORKFLOW_PID_SPARQL: "Final[str]" = """\
+SELECT ?identifier ?programminglanguage_identifier ?programminglanguage_url ?programminglanguage_version
+WHERE {
+    ?mainentity s:programmingLanguage ?programminglanguage .
+    ?programminglanguage
+        a s:ComputerLanguage ;
+        s:url ?programminglanguage_url .
+    OPTIONAL {
+        ?mainentity s:identifier ?identifier .
+    }
+    OPTIONAL {
+        ?programminglanguage
+            s:version ?programminglanguage_version .
+    }
+    OPTIONAL {
+        ?programminglanguage
+            s:identifier ?programminglanguage_identifier .
+    }
+}
+"""
+
+    OBTAIN_RUNS_SPARQL: "Final[str]" = """\
+SELECT ?execution
+WHERE {
+    ?rootdataset s:mentions ?execution .
+    ?execution
+        a s:CreateAction ;
+        s:instrument ?mainentity .
+}
+"""
+
+    OBTAIN_RUN_CONTAINERS: "Final[str]" = """\
+SELECT ?container ?container_additional_type ?type_of_container ?type_of_container_type ?container_registry ?container_name ?container_tag ?container_sha256 ?container_platform ?container_arch
+WHERE {
+    ?execution wrterm:containerImage ?container .
+    ?container
+        a wrterm:ContainerImage ;
+        s:additionalType ?container_additional_type .
+    OPTIONAL {
+        ?container
+            s:softwareRequirements ?container_type ;
+            s:applicationCategory ?type_of_container .
+        ?container_type
+            a s:SoftwareApplication ;
+            s:applicationCategory ?type_of_container_type .
+        FILTER(
+            STRSTARTS(str(?type_of_container), str(wikidata:)) &&
+            STRSTARTS(str(?type_of_container_type), str(wikidata:))
+        ) .
+    }
+    OPTIONAL {
+        ?container wrterm:registry ?container_registry .
+    }
+    OPTIONAL {
+        ?container s:name ?container_name .
+    }
+    OPTIONAL {
+        ?container wrterm:tag ?container_tag .
+    }
+    OPTIONAL {
+        ?container wrterm:sha256 ?container_sha256 .
+    }
+    OPTIONAL {
+        ?container
+            a s:SoftwareApplication ;
+            s:operatingSystem ?container_platform .
+    }
+    OPTIONAL {
+        ?container
+            a s:SoftwareApplication ;
+            s:processorRequirements ?container_arch .
+    }
+}
+"""
+
+    def generateWorkflowMetaFromJSONLD(
+        self,
+        jsonld_obj: "Mapping[str, Any]",
+        public_name: "str",
+        retrospective_first: "bool" = True,
+    ) -> "WritableWorkflowMetaConfigBlock":
+        matched_crate, g = self.identifyROCrate(jsonld_obj, public_name)
+        # Is it an RO-Crate?
+        if matched_crate is None:
+            raise ROCrateToolboxException(
+                f"JSON-LD from {public_name} is not an RO-Crate"
+            )
+
+        if matched_crate.wfcrateprofile is None:
+            raise ROCrateToolboxException(
+                f"JSON-LD from {public_name} is not a Workflow RO-Crate"
+            )
+
+        if matched_crate.mainentity is None:
+            raise ROCrateToolboxException(
+                f"Unable to find the main entity workflow in {public_name} Workflow RO-Crate"
+            )
+
+        if matched_crate.wrwfprofile is None:
+            raise ROCrateToolboxException(
+                f"JSON-LD from {public_name} is not a WRROC Workflow"
+            )
+
+        # This query will tell us where the original workflow was located,
+        # its language and version
+        qlang = rdflib.plugins.sparql.prepareQuery(
+            self.OBTAIN_WORKFLOW_PID_SPARQL,
+            initNs=self.SPARQL_NS,
+        )
+
+        # TODO: cache resolution of contexts
+        # TODO: disallow network access for context resolution
+        # when not in right phase
+        try:
+            qlangres = g.query(
+                qlang,
+                initBindings={
+                    "mainentity": matched_crate.mainentity,
+                },
+            )
+        except Exception as e:
+            raise ROCrateToolboxException(
+                f"Unable to perform JSON-LD workflow details query over {public_name} (see cascading exceptions)"
+            ) from e
+
+        langrow: "Optional[rdflib.query.ResultRow]" = None
+        # In the future, there could be more than one match, when
+        # nested RO-Crate scenarios happen
+        for row in qlangres:
+            assert isinstance(
+                row, rdflib.query.ResultRow
+            ), "Check the SPARQL code, as it should be a SELECT query"
+            langrow = row
+            break
+
+        if langrow is None:
+            raise ROCrateToolboxException(
+                f"Unable to get workflow engine details from {public_name}"
+            )
+
+        programminglanguage_url = (
+            None
+            if langrow.programminglanguage_url is None
+            else str(langrow.programminglanguage_url)
+        )
+        programminglanguage_identifier = (
+            None
+            if langrow.programminglanguage_identifier is None
+            else str(langrow.programminglanguage_identifier)
+        )
+        # Getting the workflow type
+        workflow_type = self.wfexs.matchWorkflowType(
+            programminglanguage_url, programminglanguage_identifier
+        )
+
+        # At this point we know WfExS supports the workflow engine.
+        # Now it is the moment to choose whether to use one of the stored
+        # executions as a template (retrospective provenance)
+        # or to rely on the prospective one.
+        if retrospective_first:
+            # For the retrospective provenance at least one execution must
+            # be described in the RO-Crate. Once one is chosen,
+            # we need to be sure the container solution used then is
+            # also supported.
+            # So, we start with the retrospective provenance,
+            # gathering the list of containers, to learn
+            # which container technologies were used in that execution.
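+            # (Descriptive note: in the Workflow Run RO-Crate profile each
+            # container is modelled as a wrterm:ContainerImage whose
+            # s:additionalType distinguishes Docker-compatible images from
+            # Singularity SIF ones, and whose optional wikidata-backed
+            # application categories pin down the concrete engine; the
+            # OBTAIN_RUN_CONTAINERS query above gathers all of those
+            # properties for each execution.)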
+ try: + qexecs = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_RUNS_SPARQL, + initNs=self.SPARQL_NS, + ) + qexecsres = g.query( + qexecs, + initBindings={ + "rootdataset": matched_crate.rootdataset, + "mainentity": matched_crate.mainentity, + }, + ) + for execrow in qexecsres: + assert isinstance( + execrow, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + print(f"\tExecution {execrow.execution}") + qcontainers = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_RUN_CONTAINERS, + initNs=self.SPARQL_NS, + ) + qcontainersres = g.query( + qcontainers, + initBindings={ + "execution": execrow.execution, + }, + ) + for containerrow in qcontainersres: + assert isinstance( + containerrow, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + print( + f"""\ +Container {containerrow.container} +{containerrow.container_additional_type} +{containerrow.type_of_container} +{containerrow.type_of_container_type} +{containerrow.container_registry} +{containerrow.container_name} +{containerrow.container_tag} +{containerrow.container_sha256} +{containerrow.container_platform} +{containerrow.container_arch} +""" + ) + except Exception as e: + raise ROCrateToolboxException( + f"Unable to perform JSON-LD workflow execution details query over {public_name} (see cascading exceptions)" + ) from e + + # TODO: finish + + workflow_meta: "WritableWorkflowMetaConfigBlock" = { + "workflow_id": {}, + "workflow_type": workflow_type.shortname, + "environment": {}, + "params": {}, + "outputs": {}, + "workflow_config": {}, + } + + return workflow_meta diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 2c8f0acd..9135fd81 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -86,7 +86,11 @@ SchemeHandlerCacheHandler, ) -from .engine import ( +from .container_factories import ( + ContainerFactory, +) + +from .workflow_engines import ( WORKDIR_META_RELDIR, WORKDIR_PASSPHRASE_FILE, WORKDIR_WORKFLOW_META_FILE, @@ -111,6 +115,10 @@ WfExSPassGenSingleton, ) +from .utils.rocrate import ( + ROCrateToolbox, +) + from .fetchers import ( AbstractStatefulFetcher, DocumentedProtocolFetcher, @@ -134,6 +142,10 @@ WFException, ) +from .workflow_engines import ( + WorkflowEngine, +) + from .fetchers.trs_files import ( TRS_SCHEME_PREFIX, INTERNAL_TRS_SCHEME_PREFIX, @@ -166,6 +178,7 @@ from .common import ( AbsPath, AnyPath, + ContainerType, EnvironmentBlock, ExitVal, ExportActionBlock, @@ -189,7 +202,7 @@ WritableWfExSConfigBlock, ) - from .engine import ( + from .workflow_engines import ( AbstractWorkflowEngineType, WorkflowType, ) @@ -612,6 +625,36 @@ def __init__( # All the export plugins should be added here self.findAndAddExportPluginsFromModuleName() + # Registry of workflow engines is created here + self._workflow_engines: "MutableMapping[str, Type[WorkflowEngine]]" = dict() + + # All the workflow engines should be added here + self.findAndAddWorkflowEnginesFromModuleName() + + self.WORKFLOW_ENGINES: "Sequence[WorkflowType]" = sorted( + map(lambda clazz: clazz.MyWorkflowType(), self._workflow_engines.values()), + key=lambda clz: (-clz.priority, clz.shortname), + ) + + self.RECOGNIZED_TRS_DESCRIPTORS: "Mapping[TRS_Workflow_Descriptor, WorkflowType]" = dict( + map(lambda t: (t.trs_descriptor, t), self.WORKFLOW_ENGINES) + ) + + self.RECOGNIZED_SHORTNAME_DESCRIPTORS: "Mapping[TRS_Workflow_Descriptor, WorkflowType]" = dict( + map(lambda t: (t.shortname, t), self.WORKFLOW_ENGINES) + ) + + # Registry of 
container factories is created here
+        self._container_factories: "MutableMapping[ContainerType, Type[ContainerFactory]]" = (
+            dict()
+        )
+
+        # All the container factories should be added here
+        self.findAndAddContainerFactoriesFromModuleName()
+
+        # The toolbox to be shared with others
+        self.rocrate_toolbox = ROCrateToolbox(self)
+
     @property
     def cacheWorkflowDir(self) -> "AbsPath":
         return self.cachePathMap[CacheType.Workflow]
@@ -655,6 +698,165 @@ def instantiateStatefulFetcher(
 
         return cast("StatefulFetcher", instStatefulFetcher)
 
+    def findAndAddWorkflowEnginesFromModuleName(
+        self,
+        the_module_name: "str" = "wfexs_backend.workflow_engines",
+    ) -> None:
+        try:
+            the_module = importlib.import_module(the_module_name)
+            self.findAndAddWorkflowEnginesFromModule(the_module)
+        except Exception as e:
+            errmsg = f"Unable to import module {the_module_name} in order to gather workflow engines, due to errors:"
+            self.logger.exception(errmsg)
+            raise WfExSBackendException(errmsg) from e
+
+    def findAndAddWorkflowEnginesFromModule(
+        self,
+        the_module: "ModuleType",
+    ) -> None:
+        for finder, module_name, ispkg in iter_namespace(the_module):
+            try:
+                named_module = importlib.import_module(module_name)
+            except Exception:
+                self.logger.exception(
+                    f"Skipping module {module_name} in order to gather workflow engines, due to errors:"
+                )
+                continue
+
+            for name, obj in inspect.getmembers(named_module):
+                if (
+                    inspect.isclass(obj)
+                    and not inspect.isabstract(obj)
+                    and issubclass(obj, WorkflowEngine)
+                ):
+                    # Now, let's learn whether the class is enabled
+                    if obj.MyWorkflowType().enabled:
+                        self.addWorkflowEngine(obj)
+                    else:
+                        self.logger.debug(
+                            f"Workflow engine class {name} from module {named_module} was not eligible"
+                        )
+
+    def addWorkflowEngine(self, workflowEngineClazz: "Type[WorkflowEngine]") -> None:
+        self._workflow_engines[
+            workflowEngineClazz.MyWorkflowType().shortname
+        ] = workflowEngineClazz
+
+    def listWorkflowEngines(self) -> "Sequence[str]":
+        return list(self._workflow_engines.keys())
+
+    def getWorkflowEngineClass(
+        self, engine_shortname: "str"
+    ) -> "Optional[Type[WorkflowEngine]]":
+        return self._workflow_engines.get(engine_shortname)
+
+    def matchWorkflowType(
+        self,
+        mainEntityProgrammingLanguageUrl: "str",
+        mainEntityProgrammingLanguageId: "Optional[str]",
+    ) -> "WorkflowType":
+        # Now, it is time to match the language id
+        engineDescById: "Optional[WorkflowType]" = None
+        engineDescByUrl: "Optional[WorkflowType]" = None
+        for possibleEngineDesc in self.WORKFLOW_ENGINES:
+            if (engineDescById is None) and (
+                mainEntityProgrammingLanguageId is not None
+            ):
+                for pat in possibleEngineDesc.uriMatch:
+                    if isinstance(pat, Pattern):
+                        match = pat.search(mainEntityProgrammingLanguageId)
+                        if match:
+                            engineDescById = possibleEngineDesc
+                    elif pat == mainEntityProgrammingLanguageId:
+                        engineDescById = possibleEngineDesc
+
+            if (engineDescByUrl is None) and (
+                mainEntityProgrammingLanguageUrl == possibleEngineDesc.url
+            ):
+                engineDescByUrl = possibleEngineDesc
+
+        engineDesc: "WorkflowType"
+        if engineDescById is not None:
+            engineDesc = engineDescById
+        elif engineDescByUrl is not None:
+            engineDesc = engineDescByUrl
+        else:
+            raise WfExSBackendException(
+                "Programming language {} (url {}) found in the RO-Crate manifest is not among the ones supported by WfExS-backend".format(
                    mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl
+                )
+            )
+
+        if (
+            (engineDescById is not None)
+            and (engineDescByUrl is not None)
+            and engineDescById != engineDescByUrl
+        ):
+            self.logger.warning(
+                "Queried programming language {} and its url {} lead to different engines".format(
+                    mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl
+                )
+            )
+
+        return engineDesc
+
+    def findAndAddContainerFactoriesFromModuleName(
+        self,
+        the_module_name: "str" = "wfexs_backend.container_factories",
+    ) -> None:
+        try:
+            the_module = importlib.import_module(the_module_name)
+            self.findAndAddContainerFactoriesFromModule(the_module)
+        except Exception as e:
+            errmsg = f"Unable to import module {the_module_name} in order to gather container factories, due to errors:"
+            self.logger.exception(errmsg)
+            raise WfExSBackendException(errmsg) from e
+
+    def findAndAddContainerFactoriesFromModule(
+        self,
+        the_module: "ModuleType",
+    ) -> None:
+        for finder, module_name, ispkg in iter_namespace(the_module):
+            try:
+                named_module = importlib.import_module(module_name)
+            except Exception:
+                self.logger.exception(
+                    f"Skipping module {module_name} in order to gather container factories, due to errors:"
+                )
+                continue
+
+            for name, obj in inspect.getmembers(named_module):
+                if (
+                    inspect.isclass(obj)
+                    and not inspect.isabstract(obj)
+                    and issubclass(obj, ContainerFactory)
+                ):
+                    # Now, let's learn whether the class is enabled
+                    if getattr(obj, "ENABLED", False):
+                        self.addContainerFactory(obj)
+                    else:
+                        self.logger.debug(
+                            f"Container factory class {name} from module {named_module} was not eligible"
+                        )
+
+    def addContainerFactory(
+        self, containerFactoryClazz: "Type[ContainerFactory]"
+    ) -> None:
+        self._container_factories[
+            containerFactoryClazz.ContainerType()
+        ] = containerFactoryClazz
+
+    def listImplementedContainerTypes(self) -> "Sequence[ContainerType]":
+        return list(self._container_factories.keys())
+
+    def listContainerFactoryClasses(self) -> "Sequence[Type[ContainerFactory]]":
+        return list(self._container_factories.values())
+
+    def getContainerFactoryClass(
+        self, container_type: "ContainerType"
+    ) -> "Optional[Type[ContainerFactory]]":
+        return self._container_factories.get(container_type)
+
     def findAndAddExportPluginsFromModuleName(
         self,
         the_module_name: "str" = "wfexs_backend.pushers",
@@ -1649,6 +1851,7 @@ def instantiateEngine(
     ) -> "AbstractWorkflowEngineType":
         return engineDesc.clazz.FromStagedSetup(
             staged_setup=stagedSetup,
+            container_factory_classes=self.listContainerFactoryClasses(),
             cache_dir=self.cacheDir,
             cache_workflow_dir=self.cacheWorkflowDir,
             cache_workflow_inputs_dir=self.cacheWorkflowInputsDir,
@@ -1700,12 +1903,12 @@ def cacheWorkflow(
         requested_workflow_type: "Optional[WorkflowType]" = None
         if descriptor_type is not None:
             # First, try with the workflow type shortname
-            requested_workflow_type = WF.RECOGNIZED_SHORTNAME_DESCRIPTORS.get(
+            requested_workflow_type = self.RECOGNIZED_SHORTNAME_DESCRIPTORS.get(
                 descriptor_type
             )
             if requested_workflow_type is None:
                 # then, with the workflow type TRS name
-                requested_workflow_type = WF.RECOGNIZED_TRS_DESCRIPTORS.get(
+                requested_workflow_type = self.RECOGNIZED_TRS_DESCRIPTORS.get(
                     descriptor_type
                 )
@@ -2016,7 +2219,7 @@ def getWorkflowRepoFromTRS(
         # Now, realize whether it matches
         chosenDescriptorType = descriptor_type
         if chosenDescriptorType is None:
-            for candidateDescriptorType in WF.RECOGNIZED_TRS_DESCRIPTORS.keys():
+            for candidateDescriptorType in self.RECOGNIZED_TRS_DESCRIPTORS.keys():
                 if candidateDescriptorType in toolDescriptorTypes:
                     chosenDescriptorType = candidateDescriptorType
                     break
@@ -2036,7 +2239,7 @@ def getWorkflowRepoFromTRS(
                         rawToolDesc,
                     )
                 )
-        elif chosenDescriptorType not in 
WF.RECOGNIZED_TRS_DESCRIPTORS: + elif chosenDescriptorType not in self.RECOGNIZED_TRS_DESCRIPTORS: raise WFException( "Descriptor type {} is not among the acknowledged ones by this backend. Version {} of workflow {} from {} . Raw answer:\n{}".format( descriptor_type, @@ -2071,7 +2274,9 @@ def getWorkflowRepoFromTRS( metadata_array, ) = self.getWorkflowBundleFromURI( roCrateURL, - expectedEngineDesc=WF.RECOGNIZED_TRS_DESCRIPTORS[chosenDescriptorType], + expectedEngineDesc=self.RECOGNIZED_TRS_DESCRIPTORS[ + chosenDescriptorType + ], offline=offline, ignoreCache=ignoreCache, ) @@ -2088,7 +2293,7 @@ def getWorkflowRepoFromTRS( ignoreCache=ignoreCache, ) - expectedEngineDesc = WF.RECOGNIZED_TRS_DESCRIPTORS[chosenDescriptorType] + expectedEngineDesc = self.RECOGNIZED_TRS_DESCRIPTORS[chosenDescriptorType] trs_meta = cached_trs_files.metadata_array[0] remote_workflow_entrypoint = trs_meta.metadata.get( "remote_workflow_entrypoint" @@ -2321,7 +2526,7 @@ def getWorkflowRepoFromROCrateFile( """ roCrateObj = FixedROCrate(roCrateFile) # roCrateJSON = roCrateObj.metadata.generate() - # WF.IdentifyROCrate(roCrateJON, roCrateFile) + # self.rocrate_toolbox.IdentifyROCrate(roCrateJON, roCrateFile) # TODO: get roCrateObj mainEntity programming language # self.logger.debug(roCrateObj.root_dataset.as_jsonld()) @@ -2360,16 +2565,9 @@ def getWorkflowRepoFromROCrateFile( "Workflow RO-Crate manifest does not describe the workflow language" ) - with warnings.catch_warnings(record=True) as rec_w: - warnings.simplefilter("always") - - engineDesc = WF.MatchWorkflowType( - mainEntityProgrammingLanguageUrl, mainEntityProgrammingLanguageId - ) - - # Logging possibly emitted warnings - for w in rec_w: - self.logger.warning(w.message) + engineDesc = self.matchWorkflowType( + mainEntityProgrammingLanguageUrl, mainEntityProgrammingLanguageId + ) if (expectedEngineDesc is not None) and engineDesc != expectedEngineDesc: raise WfExSBackendException( diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 0a54356e..9288d504 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -129,7 +129,7 @@ EncryptedFSType, ) - from .engine import ( + from .workflow_engines import ( AbstractWorkflowEngineType, ) @@ -192,10 +192,10 @@ import urllib.parse # This is needed to assure yaml.safe_load unmarshalls gives no error -from .container import ( +from .container_factories import ( Container, ) -from .engine import ( +from .workflow_engines import ( WorkflowType, ) @@ -217,6 +217,9 @@ from .utils.misc import ( lazy_import, ) +from .utils.rocrate import ( + ROCrateToolbox, +) from .security_context import ( SecurityContextVault, @@ -234,13 +237,6 @@ # We have preference for the C based loader and dumper, but the code # should fallback to default implementations when C ones are not present - -# Needed by pyld to detect it -import aiohttp -import pyld # type: ignore[import, import-untyped] -import rdflib -import rdflib.plugins.sparql - import yaml YAMLLoader: "Type[Union[yaml.Loader, yaml.CLoader]]" @@ -275,7 +271,7 @@ from .encrypted_fs import ENCRYPTED_FS_MOUNT_IMPLEMENTATIONS -from .engine import ( +from .workflow_engines import ( MaterializedWorkflowEngine, STATS_DAG_DOT_FILE, WorkflowEngine, @@ -310,9 +306,6 @@ TRS_SCHEME_PREFIX, ) -from .nextflow_engine import NextflowWorkflowEngine -from .cwl_engine import CWLWorkflowEngine - if TYPE_CHECKING: from .wfexs_backend import WfExSBackend @@ -372,15 +365,6 @@ def __missing__(self, key: KT) -> VT: return cast(VT, key) -# The list of classes to be taken into 
account -# CWL detection is before, as Nextflow one is -# a bit lax (only detects a couple of too common -# keywords) -WORKFLOW_ENGINE_CLASSES: "Final[Sequence[Type[WorkflowEngine]]]" = [ - CWLWorkflowEngine, - NextflowWorkflowEngine, -] - ROCRATE_JSONLD_FILENAME: "Final[str]" = "ro-crate-metadata.json" LEGACY_ROCRATE_JSONLD_FILENAME: "Final[str]" = "ro-crate-metadata.jsonld" @@ -431,18 +415,6 @@ class WF: ) TRS_TOOLS_PATH: "Final[str]" = "tools/" - WORKFLOW_ENGINES: "Final[Sequence[WorkflowType]]" = list( - map(lambda clazz: clazz.MyWorkflowType(), WORKFLOW_ENGINE_CLASSES) - ) - - RECOGNIZED_TRS_DESCRIPTORS: "Final[Mapping[TRS_Workflow_Descriptor, WorkflowType]]" = dict( - map(lambda t: (t.trs_descriptor, t), WORKFLOW_ENGINES) - ) - - RECOGNIZED_SHORTNAME_DESCRIPTORS: "Final[Mapping[TRS_Workflow_Descriptor, WorkflowType]]" = dict( - map(lambda t: (t.shortname, t), WORKFLOW_ENGINES) - ) - def __init__( self, wfexs: "WfExSBackend", @@ -556,9 +528,9 @@ def __init__( if nickname is not None: workflow_meta["nickname"] = nickname if descriptor_type is not None: - descriptor = self.RECOGNIZED_TRS_DESCRIPTORS.get(descriptor_type) + descriptor = self.wfexs.RECOGNIZED_TRS_DESCRIPTORS.get(descriptor_type) if descriptor is None: - descriptor = self.RECOGNIZED_SHORTNAME_DESCRIPTORS.get( + descriptor = self.wfexs.RECOGNIZED_SHORTNAME_DESCRIPTORS.get( descriptor_type ) @@ -1335,258 +1307,6 @@ def FromPreviousInstanceDeclaration( paranoidMode=paranoidMode, ) - # This is needed due limitations from rdflib mangling relative ids - WFEXS_TRICK_SPARQL_PRE_PREFIX: "Final[str]" = "shttp:" - WFEXS_TRICK_SPARQL_BASE: "Final[str]" = f"{WFEXS_TRICK_SPARQL_PRE_PREFIX}///" - WFEXS_TRICK_SPARQL_NS: "Final[str]" = "wfexs" - - SPARQL_NS = { - "dc": "http://purl.org/dc/elements/1.1/", - "dcterms": "http://purl.org/dc/terms/", - "s": "http://schema.org/", - "bs": "https://bioschemas.org/", - "bsworkflow": "https://bioschemas.org/profiles/ComputationalWorkflow/", - "rocrate": "https://w3id.org/ro/crate/", - "wfcrate": "https://w3id.org/workflowhub/workflow-ro-crate/", - "wfhprofile": "https://about.workflowhub.eu/Workflow-RO-Crate/", - "wrprocess": "https://w3id.org/ro/wfrun/process/", - "wrwf": "https://w3id.org/ro/wfrun/workflow/", - WFEXS_TRICK_SPARQL_NS: WFEXS_TRICK_SPARQL_BASE, - } - - IS_ROCRATE_SPARQL: "Final[str]" = """\ -SELECT ?rocratejson ?rootdataset ?rocrateprofile ?wfcrateprofile ?mainentity ?bsworkflowprofile ?wrprocessprofile ?wrwfprofile -WHERE { - ?rocratejson - a s:CreativeWork ; - dcterms:conformsTo ?rocrateprofile ; - s:about ?rootdataset . - ?rootdataset a s:Dataset . - FILTER ( - STRSTARTS(str(?rocrateprofile), str(rocrate:)) - ) . - OPTIONAL { - ?rocratejson dcterms:conformsTo ?wfcrateprofile . - FILTER ( - ?wfcrateprofile = wfhprofile: || STRSTARTS(str(?wfcrateprofile), str(wfcrate:)) - ) . - OPTIONAL { - ?rootdataset - s:mainEntity ?mainentity . - ?mainentity - a bs:ComputationalWorkflow ; - dcterms:conformsTo ?bsworkflowprofile . - FILTER ( - STRSTARTS(str(?bsworkflowprofile), str(bsworkflow:)) - ) . - } - OPTIONAL { - ?rootdataset - dcterms:conformsTo ?wfcrateprofile ; - dcterms:conformsTo ?wrprocessprofile ; - dcterms:conformsTo ?wrwfprofile . - FILTER ( - STRSTARTS(str(?wrprocessprofile), str(wrprocess:)) && - STRSTARTS(str(?wrwfprofile), str(wrwf:)) - ) . 
- } - } -} -""" - - @classmethod - def IdentifyROCrate( - cls, jsonld: "Mapping[str, Any]", public_name: "str" - ) -> "Tuple[Optional[rdflib.query.ResultRow], rdflib.graph.Graph]": - """ - This method is used to identify where the input JSON is a - JSON-LD related to RO-Crate. - - The returned value is a tuple, where the first element is the - result row giving the QName of the root dataset, and the different - profiles being matched: RO-Crate, Workflow RO-Crate, WRROC process and WRROC workflow. - The second element of the returned tuple is the rdflib RDF - graph from the read JSON-LD, which should allow exploring it. - """ - jsonld_obj = cast("MutableMapping[str, Any]", copy.deepcopy(jsonld)) - - # # Let's load it using RDFLib tricks - # context: "MutableSequence[Union[str, Mapping[str, str]]]" - # got_context = jsonld_obj.get("@context") - # if got_context is None: - # context = [] - # elif isinstance(got_context, (str, dict)): - # context = [got_context] - # elif isinstance(got_context, list): - # context = got_context - # - # # Setting the augmented context with the trick - # context.append( - # { - # "@base": cls.WFEXS_TRICK_SPARQL_BASE, - # } - # ) - # - # if context != got_context: - # jsonld_obj["@context"] = context - - # Now, let's load it in RDFLib, in order learn - g = rdflib.Graph() - # expand a document, removing its context - # see: https://json-ld.org/spec/latest/json-ld/#expanded-document-form - # which is the issue RDFLib 7.0.0 has - - # jsonld_obj_ser = jsonld_obj - jsonld_obj_ser = { - "@graph": pyld.jsonld.expand(jsonld_obj, {"keepFreeFloatingNodes": True}) - } - jsonld_str = json.dumps(jsonld_obj_ser) - parsed = g.parse( - data=jsonld_str, - format="json-ld", - base=cls.WFEXS_TRICK_SPARQL_PRE_PREFIX, - ) - - # This query will tell us whether the JSON-LD is about an RO-Crate 1.1 - q = rdflib.plugins.sparql.prepareQuery( - cls.IS_ROCRATE_SPARQL, - initNs=cls.SPARQL_NS, - ) - - # TODO: cache resolution of contexts - # TODO: disallow network access for context resolution - # when not in right phase - try: - qres = g.query(q) - except Exception as e: - raise WFException( - f"Unable to perform JSON-LD check query over {public_name} (see cascading exceptions)" - ) from e - - resrow: "Optional[rdflib.query.ResultRow]" = None - # In the future, there could be more than one match, when - # nested RO-Crate scenarios happen - for row in qres: - assert isinstance( - row, rdflib.query.ResultRow - ), "Check the SPARQL code, as it should be a SELECT query" - resrow = row - break - - return (resrow, g) - - OBTAIN_WORKFLOW_PID_SPARQL: "Final[str]" = """\ -SELECT ?identifier ?programminglanguage_identifier ?programminglanguage_url ?programminglanguage_version -WHERE { - ?mainentity s:programmingLanguage ?programminglanguage . - ?programminglanguage - a s:ComputerLanguage ; - s:url ?programminglanguage_url . - OPTIONAL { - ?mainentity s:identifier ?identifier . - } - OPTIONAL { - ?programminglanguage - s:version ?programminglanguage_version . - } - OPTIONAL { - ?programminglanguage - s:identifier ?programminglanguage_identifier . - } -} -""" - - @classmethod - def GenerateWorkflowMetaFromJSONLD( - cls, - jsonld_obj: "Mapping[str, Any]", - public_name: "str", - retrospective_first: "bool" = True, - ) -> "WritableWorkflowMetaConfigBlock": - matched_crate, g = cls.IdentifyROCrate(jsonld_obj, public_name) - # Is it an RO-Crate? 
- if matched_crate is None: - raise WFException(f"JSON-LD from {public_name} is not an RO-Crate") - - if matched_crate.wfcrateprofile is None: - raise WFException(f"JSON-LD from {public_name} is not a Workflow RO-Crate") - - if matched_crate.mainentity is None: - raise WFException( - f"Unable to find the main entity workflow at {public_name} Workflow RO-Crate" - ) - - if matched_crate.wrwfprofile is None: - raise WFException(f"JSON-LD from {public_name} is not a WRROC Workflow") - - # This query will tell us where the original workflow was located, - # its language and version - qlang = rdflib.plugins.sparql.prepareQuery( - cls.OBTAIN_WORKFLOW_PID_SPARQL, - initNs=cls.SPARQL_NS, - ) - - # TODO: cache resolution of contexts - # TODO: disallow network access for context resolution - # when not in right phase - try: - qlangres = g.query( - qlang, - initBindings={ - "mainentity": matched_crate.mainentity, - }, - ) - except Exception as e: - raise WFException( - f"Unable to perform JSON-LD workflow details query over {public_name} (see cascading exceptions)" - ) from e - - langrow: "Optional[rdflib.query.ResultRow]" = None - # In the future, there could be more than one match, when - # nested RO-Crate scenarios happen - for row in qlangres: - assert isinstance( - row, rdflib.query.ResultRow - ), "Check the SPARQL code, as it should be a SELECT query" - langrow = row - break - - if langrow is None: - raise WFException( - f"Unable to get workflow engine details from {public_name}" - ) - - programminglanguage_url = ( - None - if langrow.programminglanguage_url is None - else str(langrow.programminglanguage_url) - ) - programminglanguage_identifier = ( - None - if langrow.programminglanguage_identifier is None - else str(langrow.programminglanguage_identifier) - ) - # Getting the workflow type - workflow_type = cls.MatchWorkflowType( - programminglanguage_url, programminglanguage_identifier - ) - - # At this point we know the workflow engine is supported - # but we need to be sure the container solution is also supported - - # TODO: finish - - workflow_meta: "WritableWorkflowMetaConfigBlock" = { - "workflow_id": {}, - "workflow_type": workflow_type.shortname, - "environment": {}, - "params": {}, - "outputs": {}, - "workflow_config": {}, - } - - return workflow_meta - @classmethod def FromPreviousROCrate( cls, @@ -1668,8 +1388,11 @@ def FromPreviousROCrate( f"Content from {public_name} is not a valid JSON" ) from jde - workflow_meta = cls.GenerateWorkflowMetaFromJSONLD(jsonld_obj, public_name) + workflow_meta = wfexs.rocrate_toolbox.generateWorkflowMetaFromJSONLD( + jsonld_obj, public_name + ) + # Last, be sure that what it has been generated is correct if wfexs.validateConfigFiles(workflow_meta, securityContextsConfigFilename) > 0: raise WFException( f"Generated WfExS description from {public_name} fails (have a look at the log messages for details)" @@ -1783,58 +1506,6 @@ def FromForm( paranoid_mode=paranoidMode, ) - @classmethod - def MatchWorkflowType( - cls, - mainEntityProgrammingLanguageUrl: "str", - mainEntityProgrammingLanguageId: "Optional[str]", - ) -> "WorkflowType": - # Now, it is time to match the language id - engineDescById: "Optional[WorkflowType]" = None - engineDescByUrl: "Optional[WorkflowType]" = None - for possibleEngineDesc in cls.WORKFLOW_ENGINES: - if (engineDescById is None) and ( - mainEntityProgrammingLanguageId is not None - ): - for pat in possibleEngineDesc.uriMatch: - if isinstance(pat, Pattern): - match = pat.search(mainEntityProgrammingLanguageId) - if match: - 
engineDescById = possibleEngineDesc - elif pat == mainEntityProgrammingLanguageId: - engineDescById = possibleEngineDesc - - if (engineDescByUrl is None) and ( - mainEntityProgrammingLanguageUrl == possibleEngineDesc.url - ): - engineDescByUrl = possibleEngineDesc - - engineDesc: "WorkflowType" - if engineDescById is not None: - engineDesc = engineDescById - elif engineDescByUrl is not None: - engineDesc = engineDescByUrl - else: - raise WFException( - "Found programming language {} (url {}) in RO-Crate manifest is not among the supported ones by WfExS-backend".format( - mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl - ) - ) - - if ( - (engineDescById is not None) - and (engineDescByUrl is not None) - and engineDescById != engineDescByUrl - ): - warnings.warn( - "Queried programming language {} and its url {} lead to different engines".format( - mainEntityProgrammingLanguageId, mainEntityProgrammingLanguageUrl - ), - WFWarning, - ) - - return engineDesc - def fetchWorkflow( self, workflow_id: "WorkflowId", @@ -1923,7 +1594,7 @@ def fetchWorkflow( # A valid engine must be identified from the fetched content # TODO: decide whether to force some specific version if self.engineDesc is None: - for engineDesc in self.WORKFLOW_ENGINES: + for engineDesc in self.wfexs.WORKFLOW_ENGINES: self.logger.debug("Testing engine " + engineDesc.trs_descriptor) engine = self.wfexs.instantiateEngine(engineDesc, self.staged_setup) diff --git a/wfexs_backend/engine.py b/wfexs_backend/workflow_engines/__init__.py similarity index 93% rename from wfexs_backend/engine.py rename to wfexs_backend/workflow_engines/__init__.py index 25568af2..7bc6a879 100644 --- a/wfexs_backend/engine.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -33,7 +33,7 @@ TYPE_CHECKING, ) -from .common import ( +from ..common import ( AbstractWfExSException, ContainerType, ContentKind, @@ -61,7 +61,11 @@ Union, ) - from .common import ( + from typing_extensions import ( + Final, + ) + + from ..common import ( AbstractGeneratedContent, AbsPath, AnyPath, @@ -89,17 +93,17 @@ WorkflowEngineVersionStr, ) - from .container import ( + from ..container_factories import ( Container, + ContainerFactory, ) -from .container import ContainerFactory, NoContainerFactory -from .singularity_container import SingularityContainerFactory -from .docker_container import DockerContainerFactory -from .podman_container import PodmanContainerFactory +from ..container_factories.no_container import ( + NoContainerFactory, +) -from .utils.contents import CWLDesc2Content, GetGeneratedDirectoryContent -from .utils.digests import ComputeDigestFromFile, nihDigester +from ..utils.contents import CWLDesc2Content, GetGeneratedDirectoryContent +from ..utils.digests import ComputeDigestFromFile, nihDigester # Constants WORKDIR_INPUTS_RELDIR = "inputs" @@ -129,6 +133,10 @@ STATS_DAG_DOT_FILE = "dag.dot" +# Default priority +DEFAULT_PRIORITY: "Final[int]" = 0 + + class WorkflowType(NamedTuple): """ engineName: symbolic name of the engine @@ -152,6 +160,8 @@ class WorkflowType(NamedTuple): url: "URIType" trs_descriptor: "TRS_Workflow_Descriptor" rocrate_programming_language: "str" + priority: "int" = DEFAULT_PRIORITY + enabled: "bool" = True @classmethod def _value_fixes(cls) -> "Mapping[str, Optional[str]]": @@ -288,6 +298,9 @@ def launchWorkflow( def FromStagedSetup( cls, staged_setup: "StagedSetup", + container_factory_classes: "Sequence[Type[ContainerFactory]]" = [ + NoContainerFactory + ], cache_dir: "Optional[AnyPath]" = None, cache_workflow_dir: 
"Optional[AnyPath]" = None, cache_workflow_inputs_dir: "Optional[AnyPath]" = None, @@ -305,18 +318,10 @@ class WorkflowEngineException(AbstractWfExSException): pass -CONTAINER_FACTORY_CLASSES: "Sequence[Type[ContainerFactory]]" = [ - SingularityContainerFactory, - DockerContainerFactory, - PodmanContainerFactory, - NoContainerFactory, -] - - class WorkflowEngine(AbstractWorkflowEngineType): def __init__( self, - container_type: "ContainerType" = ContainerType.NoContainer, + container_factory_clazz: "Type[ContainerFactory]" = NoContainerFactory, cacheDir: "Optional[AnyPath]" = None, workflow_config: "Optional[Mapping[str, Any]]" = None, local_config: "Optional[EngineLocalConfig]" = None, @@ -492,33 +497,25 @@ def __init__( engine_mode = EngineMode(engine_mode) self.engine_mode = engine_mode - if not self.supportsContainerType(container_type): + container_type = container_factory_clazz.ContainerType() + if not self.SupportsContainerType(container_type): raise WorkflowEngineException( f"Current implementation of {self.__class__.__name__} does not support {container_type}" ) - if secure_exec and not self.supportsSecureExecContainerType(container_type): + if secure_exec and not self.SupportsSecureExecContainerType(container_type): raise WorkflowEngineException( f"Due technical limitations, secure or paranoid executions are incompatible with {container_type}" ) - for containerFactory in CONTAINER_FACTORY_CLASSES: - if containerFactory.ContainerType() == container_type: - self.logger.debug(f"Container type {container_type}") - self.container_factory = containerFactory( - cacheDir=cacheDir, - stagedContainersDir=stagedContainersDir, - local_config=local_config, - engine_name=self.__class__.__name__, - tempDir=self.tempDir, - ) - break - else: - raise WorkflowEngineException( - "FATAL: No container factory implementation for {}".format( - container_type - ) - ) + self.logger.debug(f"Instantiating container type {container_type}") + self.container_factory = container_factory_clazz( + cacheDir=cacheDir, + stagedContainersDir=stagedContainersDir, + local_config=local_config, + engine_name=self.__class__.__name__, + tempDir=self.tempDir, + ) isUserNS = self.container_factory.supportsFeature("userns") self.logger.debug( @@ -559,6 +556,9 @@ def __init__( def FromStagedSetup( cls, staged_setup: "StagedSetup", + container_factory_classes: "Sequence[Type[ContainerFactory]]" = [ + NoContainerFactory + ], cache_dir: "Optional[AnyPath]" = None, cache_workflow_dir: "Optional[AnyPath]" = None, cache_workflow_inputs_dir: "Optional[AnyPath]" = None, @@ -576,8 +576,18 @@ def FromStagedSetup( :param config_directory: """ + the_container_factory_clazz: "Optional[Type[ContainerFactory]]" = None + for container_factory_clazz in container_factory_classes: + if container_factory_clazz.ContainerType() == staged_setup.container_type: + the_container_factory_clazz = container_factory_clazz + # self.logger.debug(f"Selected container type {staged_setup.container_type}") + break + else: + raise WorkflowEngineException( + f"FATAL: No container factory implementation for {staged_setup.container_type}" + ) return cls( - container_type=staged_setup.container_type, + container_factory_clazz=the_container_factory_clazz, workflow_config=staged_setup.workflow_config, engineTweaksDir=staged_setup.engine_tweaks_dir, workDir=staged_setup.work_dir, @@ -607,11 +617,27 @@ def SupportedContainerTypes(cls) -> "Set[ContainerType]": def SupportedSecureExecContainerTypes(cls) -> "Set[ContainerType]": pass - def supportsContainerType(self, 
containerType: "ContainerType") -> "bool": - return containerType in self.SupportedContainerTypes() + @classmethod + def SupportsContainerType(cls, container_type: "ContainerType") -> "bool": + return container_type in cls.SupportedContainerTypes() + + @classmethod + def SupportsContainerFactory( + cls, container_factory_clazz: "Type[ContainerFactory]" + ) -> "bool": + return cls.SupportsContainerType(container_factory_clazz.ContainerType()) - def supportsSecureExecContainerType(self, containerType: "ContainerType") -> "bool": - return containerType in self.SupportedSecureExecContainerTypes() + @classmethod + def SupportsSecureExecContainerType(cls, container_type: "ContainerType") -> "bool": + return container_type in cls.SupportedSecureExecContainerTypes() + + @classmethod + def SupportsSecureExecContainerFactory( + cls, container_factory_clazz: "Type[ContainerFactory]" + ) -> "bool": + return cls.SupportsSecureExecContainerType( + container_factory_clazz.ContainerType() + ) @abc.abstractmethod def identifyWorkflow( diff --git a/wfexs_backend/cwl_engine.py b/wfexs_backend/workflow_engines/cwl_engine.py similarity index 99% rename from wfexs_backend/cwl_engine.py rename to wfexs_backend/workflow_engines/cwl_engine.py index 0585b67a..2730f7ed 100644 --- a/wfexs_backend/cwl_engine.py +++ b/wfexs_backend/workflow_engines/cwl_engine.py @@ -35,7 +35,7 @@ TYPE_CHECKING, ) -from .common import ( +from ..common import ( ContainerTaggedName, ContainerType, ContentKind, @@ -56,6 +56,7 @@ Sequence, Set, Tuple, + Type, Union, ) @@ -63,7 +64,7 @@ TypeAlias, ) - from .common import ( + from ..common import ( AbsPath, AnyPath, EngineLocalConfig, @@ -79,6 +80,10 @@ WorkflowEngineVersionStr, ) + from ..container_factories import ( + ContainerFactory, + ) + ExecInputVal: TypeAlias = Union[ bool, int, @@ -99,7 +104,8 @@ import jsonpath_ng.ext import yaml -from .engine import ( +from . 
import ( + DEFAULT_PRIORITY, MaterializedWorkflowEngine, STATS_DAG_DOT_FILE, WORKDIR_STATS_RELDIR, @@ -110,7 +116,11 @@ WorkflowType, ) -from .utils.contents import ( +from ..container_factories.no_container import ( + NoContainerFactory, +) + +from ..utils.contents import ( CWLClass2WfExS, link_or_copy, ) @@ -184,7 +194,7 @@ class CWLWorkflowEngine(WorkflowEngine): def __init__( self, - container_type: "ContainerType" = ContainerType.NoContainer, + container_factory_clazz: "Type[ContainerFactory]" = NoContainerFactory, cacheDir: "Optional[AnyPath]" = None, workflow_config: "Optional[Mapping[str, Any]]" = None, local_config: "Optional[EngineLocalConfig]" = None, @@ -202,7 +212,7 @@ def __init__( config_directory: "Optional[AnyPath]" = None, ): super().__init__( - container_type=container_type, + container_factory_clazz=container_factory_clazz, cacheDir=cacheDir, workflow_config=workflow_config, local_config=local_config, @@ -285,6 +295,7 @@ def MyWorkflowType(cls) -> "WorkflowType": url=cast("URIType", "https://www.commonwl.org/"), trs_descriptor="CWL", rocrate_programming_language="https://w3id.org/workflowhub/workflow-ro-crate#cwl", + priority=DEFAULT_PRIORITY + 10, ) @classmethod diff --git a/wfexs_backend/nextflow_engine.py b/wfexs_backend/workflow_engines/nextflow_engine.py similarity index 99% rename from wfexs_backend/nextflow_engine.py rename to wfexs_backend/workflow_engines/nextflow_engine.py index babeb96c..737d53a1 100644 --- a/wfexs_backend/nextflow_engine.py +++ b/wfexs_backend/workflow_engines/nextflow_engine.py @@ -37,7 +37,7 @@ TYPE_CHECKING, ) -from .common import ( +from ..common import ( ContainerTaggedName, ContainerType, ContentKind, @@ -61,12 +61,13 @@ Sequence, Set, Tuple, + Type, Union, ) from typing_extensions import Final - from .common import ( + from ..common import ( AbsPath, AnyPath, EngineLocalConfig, @@ -83,7 +84,11 @@ WorkflowEngineVersionStr, ) - from .utils.groovy_parsing import ( + from ..container_factories import ( + ContainerFactory, + ) + + from ..utils.groovy_parsing import ( ContextAssignments, NfInclude, NfIncludeConfig, @@ -91,8 +96,8 @@ NfWorkflow, ) -from .engine import WorkflowEngine, WorkflowEngineException -from .engine import ( +from . import WorkflowEngine, WorkflowEngineException +from . 
import ( MaterializedWorkflowEngine, STATS_DAG_DOT_FILE, WORKDIR_STATS_RELDIR, @@ -100,11 +105,16 @@ WORKDIR_STDERR_FILE, WorkflowType, ) -from .fetchers.http import fetchClassicURL -from .utils.contents import ( + +from ..container_factories.no_container import ( + NoContainerFactory, +) + +from ..fetchers.http import fetchClassicURL +from ..utils.contents import ( copy2_nofollow, ) -from .utils.groovy_parsing import ( +from ..utils.groovy_parsing import ( analyze_nf_content, ERROR_PROCESS_NAME, ) @@ -167,7 +177,7 @@ class NextflowWorkflowEngine(WorkflowEngine): def __init__( self, - container_type: "ContainerType" = ContainerType.NoContainer, + container_factory_clazz: "Type[ContainerFactory]" = NoContainerFactory, cacheDir: "Optional[AnyPath]" = None, workflow_config: "Optional[Mapping[str, Any]]" = None, local_config: "Optional[EngineLocalConfig]" = None, @@ -185,7 +195,7 @@ def __init__( config_directory: "Optional[AnyPath]" = None, ): super().__init__( - container_type=container_type, + container_factory_clazz=container_factory_clazz, cacheDir=cacheDir, workflow_config=workflow_config, local_config=local_config, From 7e606716e588dd8cd10bfe76d393d87b87a286d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 8 May 2024 23:30:38 +0200 Subject: [PATCH 16/42] Several constants and maps have been moved from wfexs_backend.ro_crate module to `wfexs_backend.utils.rocrate` one --- wfexs_backend/ro_crate.py | 57 +++++----------------------------- wfexs_backend/utils/rocrate.py | 55 +++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 50 deletions(-) diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index ab2d9dac..d560bbed 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -19,7 +19,6 @@ import atexit import copy -import enum import inspect import logging import os @@ -95,6 +94,12 @@ import uuid from .utils.misc import lazy_import +from .utils.rocrate import ( + ContainerType2AdditionalType, + ContainerTypeMetadata, + ContainerTypeMetadataDetails, + WORKFLOW_RUN_CONTEXT, +) magic = lazy_import("magic") # import magic @@ -347,21 +352,6 @@ class FixedFile(FixedMixin, rocrate.model.file.File): # type: ignore[misc] pass -WORKFLOW_RUN_CONTEXT: "Final[str]" = "https://w3id.org/ro/terms/workflow-run" - - -class ContainerImageAdditionalType(enum.Enum): - Docker = WORKFLOW_RUN_CONTEXT + "#DockerImage" - Singularity = WORKFLOW_RUN_CONTEXT + "#SIFImage" - - -ContainerType2AdditionalType: "Mapping[ContainerType, ContainerImageAdditionalType]" = { - ContainerType.Docker: ContainerImageAdditionalType.Docker, - ContainerType.Singularity: ContainerImageAdditionalType.Singularity, - ContainerType.Podman: ContainerImageAdditionalType.Docker, -} - - class ContainerImage(rocrate.model.entity.Entity): # type: ignore[misc] TYPES = ["ContainerImage", "SoftwareApplication"] @@ -686,40 +676,11 @@ def add_workflow_ext( return cast("FixedWorkflow", workflow) -class ContainerTypeMetadata(NamedTuple): - sa_id: "str" - applicationCategory: "str" - ct_applicationCategory: "str" - - class WorkflowRunROCrate: """ This class rules the generation of an RO-Crate """ - ContainerTypeMetadataDetails: "Final[Mapping[ContainerType, ContainerTypeMetadata]]" = { - ContainerType.Singularity: ContainerTypeMetadata( - sa_id="https://apptainer.org/", - applicationCategory="https://www.wikidata.org/wiki/Q51294208", - ct_applicationCategory="https://www.wikidata.org/wiki/Q7935198", - ), - ContainerType.Docker: ContainerTypeMetadata( - 
sa_id="https://www.docker.com/", - applicationCategory="https://www.wikidata.org/wiki/Q15206305", - ct_applicationCategory="https://www.wikidata.org/wiki/Q7935198", - ), - ContainerType.Podman: ContainerTypeMetadata( - sa_id="https://podman.io/", - applicationCategory="https://www.wikidata.org/wiki/Q70876440", - ct_applicationCategory="https://www.wikidata.org/wiki/Q7935198", - ), - ContainerType.Conda: ContainerTypeMetadata( - sa_id="https://conda.io/", - applicationCategory="https://www.wikidata.org/wiki/Q22907431", - ct_applicationCategory="https://www.wikidata.org/wiki/Q98400282", - ), - } - def __init__( self, remote_repo: "RemoteRepo", @@ -1156,9 +1117,7 @@ def _add_containers( if container in self._added_containers: continue - container_type_metadata = self.ContainerTypeMetadataDetails[ - container.type - ] + container_type_metadata = ContainerTypeMetadataDetails[container.type] crate_cont_type = self.cached_cts.get(container.type) if crate_cont_type is None: container_type = ( @@ -1192,7 +1151,7 @@ def _add_containers( crate_source_cont_type = crate_cont_type container_source_type_metadata = container_type_metadata else: - container_source_type_metadata = self.ContainerTypeMetadataDetails[ + container_source_type_metadata = ContainerTypeMetadataDetails[ container.source_type ] crate_source_cont_type = self.cached_cts.get(container.source_type) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 95704add..350321f3 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -19,12 +19,14 @@ import abc import copy +import enum import inspect import json import logging from typing import ( cast, + NamedTuple, TYPE_CHECKING, ) @@ -55,6 +57,57 @@ import rdflib import rdflib.plugins.sparql +from ..common import ( + ContainerType, +) + + +class ContainerTypeMetadata(NamedTuple): + sa_id: "str" + applicationCategory: "str" + ct_applicationCategory: "str" + + +ContainerTypeMetadataDetails: "Final[Mapping[ContainerType, ContainerTypeMetadata]]" = { + ContainerType.Singularity: ContainerTypeMetadata( + sa_id="https://apptainer.org/", + applicationCategory="https://www.wikidata.org/wiki/Q51294208", + ct_applicationCategory="https://www.wikidata.org/wiki/Q7935198", + ), + ContainerType.Docker: ContainerTypeMetadata( + sa_id="https://www.docker.com/", + applicationCategory="https://www.wikidata.org/wiki/Q15206305", + ct_applicationCategory="https://www.wikidata.org/wiki/Q7935198", + ), + ContainerType.Podman: ContainerTypeMetadata( + sa_id="https://podman.io/", + applicationCategory="https://www.wikidata.org/wiki/Q70876440", + ct_applicationCategory="https://www.wikidata.org/wiki/Q7935198", + ), + ContainerType.Conda: ContainerTypeMetadata( + sa_id="https://conda.io/", + applicationCategory="https://www.wikidata.org/wiki/Q22907431", + ct_applicationCategory="https://www.wikidata.org/wiki/Q98400282", + ), +} + +WORKFLOW_RUN_CONTEXT: "Final[str]" = "https://w3id.org/ro/terms/workflow-run" +WORKFLOW_RUN_NAMESPACE: "Final[str]" = WORKFLOW_RUN_CONTEXT + "#" + + +class ContainerImageAdditionalType(enum.Enum): + Docker = WORKFLOW_RUN_NAMESPACE + "DockerImage" + Singularity = WORKFLOW_RUN_NAMESPACE + "SIFImage" + # No one is available for Conda yet + + +ContainerType2AdditionalType: "Mapping[ContainerType, ContainerImageAdditionalType]" = { + ContainerType.Docker: ContainerImageAdditionalType.Docker, + ContainerType.Singularity: ContainerImageAdditionalType.Singularity, + ContainerType.Podman: ContainerImageAdditionalType.Docker, + # No one is 
available for Conda yet
+}
+
 class ROCrateToolboxException(Exception):
     pass
@@ -77,7 +130,7 @@ class ROCrateToolbox(abc.ABC):
         "wfhprofile": "https://about.workflowhub.eu/Workflow-RO-Crate/",
         "wrprocess": "https://w3id.org/ro/wfrun/process/",
         "wrwf": "https://w3id.org/ro/wfrun/workflow/",
-        "wrterm": "https://w3id.org/ro/terms/workflow-run#",
+        "wrterm": WORKFLOW_RUN_NAMESPACE,
         "wikidata": "https://www.wikidata.org/wiki/",
         WFEXS_TRICK_SPARQL_NS: WFEXS_TRICK_SPARQL_BASE,
     }

From 6bc34180b0bdfaa65fd7858565cde59c28c2b719 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Fri, 10 May 2024 15:38:12 +0200
Subject: [PATCH 17/42] Although this constant is already defined in another
 file, repeat it here to avoid more complicated dependencies

---
 wfexs_backend/container_factories/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py
index fbf9af5b..6f16bcf8 100644
--- a/wfexs_backend/container_factories/__init__.py
+++ b/wfexs_backend/container_factories/__init__.py
@@ -94,6 +94,8 @@ class DockerManifestMetadata(TypedDict):
 # A couple of constants needed for several fixes
 DOCKER_SCHEME: "Final[str]" = "docker"
 DOCKER_URI_PREFIX: "Final[str]" = DOCKER_SCHEME + ":"
+# This string is a repetition of what is defined in the helper
+DEFAULT_DOCKER_REGISTRY: "Final[str]" = "docker.io"
 
 
 @dataclass
@@ -162,7 +164,7 @@ def decompose_docker_tagged_name(
         registry = tagged_name[0:left_slash_pos]
         tagged_name = tagged_name[left_slash_pos + 1 :]
     else:
-        registry = "docker.io"
+        registry = DEFAULT_DOCKER_REGISTRY
 
     # Now, the tag label
     right_colon_pos = tagged_name.rfind(":")

From fc3c04b335b5f2eee7f63eb775d06317a02ad121 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Fri, 10 May 2024 15:39:24 +0200
Subject: [PATCH 18/42] Now the list of containers from the parsed execution
 is built. As this list of containers does not have an explicit
 representation in the workflow staging definition, explicitly return it.
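
As an illustration, this is roughly how the new return value is meant to
be consumed, together with the Docker-flavoured identifiers derived for
each container row (a minimal sketch; the registry, image name, tag and
digest below are hypothetical values):

    # Hypothetical metadata, as it could come from one SPARQL result row
    the_registry = "docker.io"  # the fallback is DEFAULT_DOCKER_REGISTRY
    container_identifier = "biocontainers/samtools"
    container_tag = "1.9"
    container_sha256 = "0123abcd"  # truncated, illustrative only

    # The same derivations performed in the second pass over the results
    origTaggedName = f"{container_identifier}:{container_tag}"
    taggedName = f"docker://{the_registry}/{container_identifier}:{container_tag}"
    fingerprint = f"{the_registry}/{container_identifier}@sha256:{container_sha256}"

    # The toolbox now returns the containers next to the workflow metadata
    (
        workflow_meta,
        the_containers,
    ) = wfexs.rocrate_toolbox.generateWorkflowMetaFromJSONLD(jsonld_obj, public_name)
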
--- wfexs_backend/utils/rocrate.py | 207 +++++++++++++++++++++++++++++++-- wfexs_backend/workflow.py | 5 +- 2 files changed, 203 insertions(+), 9 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 350321f3..d4a31863 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -30,12 +30,16 @@ TYPE_CHECKING, ) +import warnings + if TYPE_CHECKING: from typing import ( Any, Mapping, MutableMapping, + MutableSequence, Optional, + Sequence, Tuple, ) @@ -44,6 +48,10 @@ ) from ..common import ( + ContainerOperatingSystem, + Fingerprint, + ProcessorArchitecture, + URIType, WritableWorkflowMetaConfigBlock, ) @@ -61,6 +69,15 @@ ContainerType, ) +from ..container_factories import ( + DEFAULT_DOCKER_REGISTRY, + Container, +) + +from .digests import ( + stringifyDigest, +) + class ContainerTypeMetadata(NamedTuple): sa_id: "str" @@ -91,6 +108,11 @@ class ContainerTypeMetadata(NamedTuple): ), } +ApplicationCategory2ContainerType: "Final[Mapping[str, ContainerType]]" = { + container_type_metadata.applicationCategory: container_type + for container_type, container_type_metadata in ContainerTypeMetadataDetails.items() +} + WORKFLOW_RUN_CONTEXT: "Final[str]" = "https://w3id.org/ro/terms/workflow-run" WORKFLOW_RUN_NAMESPACE: "Final[str]" = WORKFLOW_RUN_CONTEXT + "#" @@ -101,13 +123,18 @@ class ContainerImageAdditionalType(enum.Enum): # No one is available for Conda yet -ContainerType2AdditionalType: "Mapping[ContainerType, ContainerImageAdditionalType]" = { +ContainerType2AdditionalType: "Final[Mapping[ContainerType, ContainerImageAdditionalType]]" = { ContainerType.Docker: ContainerImageAdditionalType.Docker, ContainerType.Singularity: ContainerImageAdditionalType.Singularity, ContainerType.Podman: ContainerImageAdditionalType.Docker, # No one is available for Conda yet } +AdditionalType2ContainerType: "Final[Mapping[ContainerImageAdditionalType, ContainerType]]" = { + ContainerImageAdditionalType.Docker: ContainerType.Docker, + ContainerImageAdditionalType.Singularity: ContainerType.Singularity, +} + class ROCrateToolboxException(Exception): pass @@ -232,9 +259,18 @@ def identifyROCrate( # which is the issue RDFLib 7.0.0 has # jsonld_obj_ser = jsonld_obj - jsonld_obj_ser = { - "@graph": pyld.jsonld.expand(jsonld_obj, {"keepFreeFloatingNodes": True}) - } + with warnings.catch_warnings(): + # Disable possible warnings emitted by pyld library + # when it is not run in debug mode + if self.logger.getEffectiveLevel() > logging.DEBUG: + warnings.filterwarnings( + "ignore", category=SyntaxWarning, module="^pyld\.jsonld$" + ) + jsonld_obj_ser = { + "@graph": pyld.jsonld.expand( + jsonld_obj, {"keepFreeFloatingNodes": True} + ) + } jsonld_str = json.dumps(jsonld_obj_ser) parsed = g.parse( data=jsonld_str, @@ -350,7 +386,7 @@ def generateWorkflowMetaFromJSONLD( jsonld_obj: "Mapping[str, Any]", public_name: "str", retrospective_first: "bool" = True, - ) -> "WritableWorkflowMetaConfigBlock": + ) -> "Tuple[WritableWorkflowMetaConfigBlock, Sequence[Container]]": matched_crate, g = self.identifyROCrate(jsonld_obj, public_name) # Is it an RO-Crate? if matched_crate is None: @@ -420,7 +456,9 @@ def generateWorkflowMetaFromJSONLD( if langrow.programminglanguage_identifier is None else str(langrow.programminglanguage_identifier) ) - # Getting the workflow type + # Getting the workflow type. + # This call will raise an exception in case the workflow type + # is not supported by this implementation. 
workflow_type = self.wfexs.matchWorkflowType( programminglanguage_url, programminglanguage_identifier ) @@ -429,6 +467,9 @@ def generateWorkflowMetaFromJSONLD( # Now it is the moment to choose whether to use one of the stored # executions as template (retrospective provenance) # or delegate on the prospective one. + container_type: "Optional[ContainerType]" = None + additional_container_type: "Optional[ContainerType]" = None + the_containers: "MutableSequence[Container]" = [] if retrospective_first: # For the retrospective provenance at least an execution must # be described in the RO-Crate. Once one is chosen, @@ -464,11 +505,74 @@ def generateWorkflowMetaFromJSONLD( "execution": execrow.execution, }, ) + + # This is the first pass, to learn about the kind of + # container factory to use + for containerrow in qcontainersres: + assert isinstance( + containerrow, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + # These hints were left by WfExS, but they are not expected + # from other implementations. + if containerrow.type_of_container is not None: + putative_container_type = ( + ApplicationCategory2ContainerType.get( + str(containerrow.type_of_container) + ) + ) + if container_type is None: + container_type = putative_container_type + elif ( + putative_container_type is not None + and putative_container_type != container_type + ): + self.logger.warning( + f"Not all the containers of execution {str(matched_crate.mainentity)} were materialized with {container_type} factory (also found {putative_container_type})" + ) + + # These hints should be left by any compliant WRROC + # implementation + if containerrow.container_additional_type is not None: + try: + putative_additional_container_type = ( + AdditionalType2ContainerType.get( + ContainerImageAdditionalType( + str(containerrow.container_additional_type) + ) + ) + ) + if additional_container_type is None: + additional_container_type = ( + putative_additional_container_type + ) + elif ( + putative_additional_container_type is not None + and putative_additional_container_type + not in (container_type, additional_container_type) + ): + self.logger.warning( + f"Not all the containers of execution {str(matched_crate.mainentity)} were labelled with {additional_container_type} factory (also found {putative_additional_container_type})" + ) + except Exception as e: + self.logger.error( + f"Unable to map additional type {str(containerrow.container_additional_type)} for {str(containerrow.container)}" + ) + + # Assigning this, as it is going to be used later to + # build the list of containers + if container_type is None and additional_container_type is not None: + container_type = additional_container_type + + if container_type is None: + continue + + # This is the second pass, to generate the list of + # containers described in the RO-Crate for containerrow in qcontainersres: assert isinstance( containerrow, rdflib.query.ResultRow ), "Check the SPARQL code, as it should be a SELECT query" - print( + self.logger.debug( f"""\ Container {containerrow.container} {containerrow.container_additional_type} @@ -482,6 +586,86 @@ def generateWorkflowMetaFromJSONLD( {containerrow.container_arch} """ ) + + if ( + containerrow.container_additional_type is not None + and containerrow.container_name is not None + ): + try: + putative_additional_container_type = ( + AdditionalType2ContainerType.get( + ContainerImageAdditionalType( + str(containerrow.container_additional_type) + ) + ) + ) + registries: 
"Optional[Mapping[ContainerType, str]]" = ( + None + ) + fingerprint = None + origTaggedName = "" + taggedName = "" + image_signature = None + if ( + putative_additional_container_type + == ContainerType.Docker + ): + the_registry = ( + str(containerrow.container_registry) + if containerrow.container_registry is not None + else DEFAULT_DOCKER_REGISTRY + ) + registries = { + ContainerType.Docker: the_registry, + } + container_identifier = str( + containerrow.container_name + ) + assert containerrow.container_sha256 is not None + fingerprint = f"{the_registry}/{container_identifier}@sha256:{str(containerrow.container_sha256)}" + assert containerrow.container_tag is not None + origTaggedName = f"{container_identifier}:{str(containerrow.container_tag)}" + taggedName = f"docker://{the_registry}/{container_identifier}:{str(containerrow.container_tag)}" + # Disable for now + # image_signature = stringifyDigest("sha256", bytes.fromhex(str(containerrow.container_sha256))) + elif ( + putative_additional_container_type + == ContainerType.Singularity + ): + origTaggedName = str(containerrow.container_name) + taggedName = origTaggedName + fingerprint = origTaggedName + + the_containers.append( + Container( + origTaggedName=origTaggedName, + type=container_type, + registries=registries, + taggedName=cast("URIType", taggedName), + architecture=None + if containerrow.container_arch is None + else cast( + "ProcessorArchitecture", + str(containerrow.container_arch), + ), + operatingSystem=None + if containerrow.container_platform is None + else cast( + "ContainerOperatingSystem", + str(containerrow.container_platform), + ), + fingerprint=cast("Fingerprint", fingerprint), + source_type=putative_additional_container_type, + image_signature=image_signature, + ) + ) + except Exception as e: + self.logger.exception( + f"Unable to assign from additional type {str(containerrow.container_additional_type)} for {str(containerrow.container)}" + ) + + # TODO: deal with more than one execution + break except Exception as e: raise ROCrateToolboxException( f"Unable to perform JSON-LD workflow execution details query over {public_name} (see cascading exceptions)" @@ -489,6 +673,9 @@ def generateWorkflowMetaFromJSONLD( # TODO: finish + self.logger.info( + f"Workflow type {workflow_type} container factory {container_type} {additional_container_type}" + ) workflow_meta: "WritableWorkflowMetaConfigBlock" = { "workflow_id": {}, "workflow_type": workflow_type.shortname, @@ -497,5 +684,9 @@ def generateWorkflowMetaFromJSONLD( "outputs": {}, "workflow_config": {}, } + if container_type is not None: + workflow_meta["workflow_config"]["containerType"] = container_type.value + + self.logger.info(f"{json.dumps(workflow_meta, indent=4)}") - return workflow_meta + return workflow_meta, the_containers diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 9288d504..26a04b99 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1388,7 +1388,10 @@ def FromPreviousROCrate( f"Content from {public_name} is not a valid JSON" ) from jde - workflow_meta = wfexs.rocrate_toolbox.generateWorkflowMetaFromJSONLD( + ( + workflow_meta, + the_containers, + ) = wfexs.rocrate_toolbox.generateWorkflowMetaFromJSONLD( jsonld_obj, public_name ) From a3384a54eb3b355f14ec2453fdcc75ab0b1ce232 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 11 May 2024 02:11:10 +0200 Subject: [PATCH 19/42] Added query to get all the inputs involved in a previous execution. 
Next commits should implement rebuilding the input parameters properly.
---
 wfexs_backend/utils/rocrate.py | 439 ++++++++++++++++++++-------------
 1 file changed, 272 insertions(+), 167 deletions(-)

diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py
index d4a31863..26296a70 100644
--- a/wfexs_backend/utils/rocrate.py
+++ b/wfexs_backend/utils/rocrate.py
@@ -337,7 +337,7 @@ def identifyROCrate(
 }
 """
 
-    OBTAIN_RUN_CONTAINERS: "Final[str]" = """\
+    OBTAIN_RUN_CONTAINERS_SPARQL: "Final[str]" = """\
 SELECT ?container ?container_additional_type ?type_of_container ?type_of_container_type ?container_registry ?container_name ?container_tag ?container_sha256 ?container_platform ?container_arch
 WHERE {
     ?execution wrterm:containerImage ?container .
@@ -381,6 +381,264 @@ def identifyROCrate(
 }
 """
 
+    # This compound query is much faster when each of the UNION components
+    # is evaluated separately
+    OBTAIN_INPUTS_SPARQL: "Final[str]" = """\
+SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?value ?inputcol ?component ?leaf_type
+WHERE {
+    ?execution s:object ?input .
+    {
+        # A file, which is a schema.org MediaObject
+        VALUES (?additional_type) { ( "File" ) }
+        ?input
+            a s:MediaObject ;
+            s:contentUrl ?fileuri ;
+            s:exampleOfWork ?inputfp .
+        ?inputfp
+            a bs:FormalParameter ;
+            s:name ?name ;
+            s:additionalType ?additional_type .
+    } UNION {
+        # A directory, which is a schema.org Dataset
+        VALUES (?additional_type) { ( "Dataset" ) }
+        ?input
+            a s:Dataset ;
+            s:contentUrl ?fileuri ;
+            s:exampleOfWork ?inputfp ;
+            s:hasPart+ ?component .
+        ?inputfp
+            a bs:FormalParameter ;
+            s:name ?name ;
+            s:additionalType ?additional_type .
+        ?component
+            a s:MediaObject .
+    } UNION {
+        # A single property value, which can be either Integer, Text, Boolean or Float
+        VALUES (?additional_type) { ( "Integer" ) ( "Text" ) ( "Boolean" ) ( "Float" ) }
+        ?input
+            a s:PropertyValue ;
+            s:exampleOfWork ?inputfp ;
+            s:value ?value .
+        ?inputfp
+            a bs:FormalParameter ;
+            s:name ?name ;
+            s:additionalType ?additional_type .
+    } UNION {
+        # A combination of files or directories or property values
+        VALUES (?leaf_type ?additional_type) { ( s:Integer "Collection" ) ( s:Text "Collection" ) ( s:Boolean "Collection" ) ( s:Float "Collection" ) ( s:MediaObject "Collection" ) ( s:Dataset "Collection" ) }
+        ?input
+            a s:Collection ;
+            s:exampleOfWork ?inputfp ;
+            s:hasPart+ ?component .
+        ?inputfp
+            a bs:FormalParameter ;
+            s:name ?name ;
+            s:additionalType ?additional_type .
+        ?component
+            a ?leaf_type .
+        OPTIONAL {
+            ?component s:contentUrl ?fileuri .
+        }
+        OPTIONAL {
+            ?component s:value ?value .
+ } + } +} +""" + + def _parseContainersFromExecution( + self, + g: "rdflib.graph.Graph", + execution: "rdflib.term.Identifier", + main_entity: "rdflib.term.Identifier", + ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": + # Get the list of containers + qcontainers = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_RUN_CONTAINERS_SPARQL, + initNs=self.SPARQL_NS, + ) + qcontainersres = g.query( + qcontainers, + initBindings={ + "execution": execution, + }, + ) + + container_type: "Optional[ContainerType]" = None + additional_container_type: "Optional[ContainerType]" = None + the_containers: "MutableSequence[Container]" = [] + # This is the first pass, to learn about the kind of + # container factory to use + for containerrow in qcontainersres: + assert isinstance( + containerrow, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + # These hints were left by WfExS, but they are not expected + # from other implementations. + if containerrow.type_of_container is not None: + putative_container_type = ApplicationCategory2ContainerType.get( + str(containerrow.type_of_container) + ) + if container_type is None: + container_type = putative_container_type + elif ( + putative_container_type is not None + and putative_container_type != container_type + ): + self.logger.warning( + f"Not all the containers of execution {main_entity} were materialized with {container_type} factory (also found {putative_container_type})" + ) + + # These hints should be left by any compliant WRROC + # implementation + if containerrow.container_additional_type is not None: + try: + putative_additional_container_type = ( + AdditionalType2ContainerType.get( + ContainerImageAdditionalType( + str(containerrow.container_additional_type) + ) + ) + ) + if additional_container_type is None: + additional_container_type = putative_additional_container_type + elif ( + putative_additional_container_type is not None + and putative_additional_container_type + not in (container_type, additional_container_type) + ): + self.logger.warning( + f"Not all the containers of execution {main_entity} were labelled with {additional_container_type} factory (also found {putative_additional_container_type})" + ) + except Exception as e: + self.logger.error( + f"Unable to map additional type {str(containerrow.container_additional_type)} for {str(containerrow.container)}" + ) + + # Assigning this, as it is going to be used later to + # build the list of containers + if container_type is None and additional_container_type is not None: + container_type = additional_container_type + + if container_type is None: + return None + + # This is the second pass, to generate the list of + # containers described in the RO-Crate + for containerrow in qcontainersres: + assert isinstance( + containerrow, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + self.logger.debug( + f"""\ +Container {containerrow.container} +{containerrow.container_additional_type} +{containerrow.type_of_container} +{containerrow.type_of_container_type} +{containerrow.container_registry} +{containerrow.container_name} +{containerrow.container_tag} +{containerrow.container_sha256} +{containerrow.container_platform} +{containerrow.container_arch} +""" + ) + + if ( + containerrow.container_additional_type is not None + and containerrow.container_name is not None + ): + try: + putative_additional_container_type = ( + AdditionalType2ContainerType.get( + ContainerImageAdditionalType( + 
str(containerrow.container_additional_type) + ) + ) + ) + registries: "Optional[Mapping[ContainerType, str]]" = None + fingerprint = None + origTaggedName = "" + taggedName = "" + image_signature = None + if putative_additional_container_type == ContainerType.Docker: + the_registry = ( + str(containerrow.container_registry) + if containerrow.container_registry is not None + else DEFAULT_DOCKER_REGISTRY + ) + registries = { + ContainerType.Docker: the_registry, + } + container_identifier = str(containerrow.container_name) + assert containerrow.container_sha256 is not None + fingerprint = f"{the_registry}/{container_identifier}@sha256:{str(containerrow.container_sha256)}" + assert containerrow.container_tag is not None + origTaggedName = ( + f"{container_identifier}:{str(containerrow.container_tag)}" + ) + taggedName = f"docker://{the_registry}/{container_identifier}:{str(containerrow.container_tag)}" + # Disable for now + # image_signature = stringifyDigest("sha256", bytes.fromhex(str(containerrow.container_sha256))) + elif ( + putative_additional_container_type == ContainerType.Singularity + ): + origTaggedName = str(containerrow.container_name) + taggedName = origTaggedName + fingerprint = origTaggedName + + the_containers.append( + Container( + origTaggedName=origTaggedName, + type=container_type, + registries=registries, + taggedName=cast("URIType", taggedName), + architecture=None + if containerrow.container_arch is None + else cast( + "ProcessorArchitecture", + str(containerrow.container_arch), + ), + operatingSystem=None + if containerrow.container_platform is None + else cast( + "ContainerOperatingSystem", + str(containerrow.container_platform), + ), + fingerprint=cast("Fingerprint", fingerprint), + source_type=putative_additional_container_type, + image_signature=image_signature, + ) + ) + except Exception as e: + self.logger.exception( + f"Unable to assign from additional type {str(containerrow.container_additional_type)} for {str(containerrow.container)}" + ) + + return container_type, the_containers + + def _parseInputsFromExecution( + self, + g: "rdflib.graph.Graph", + execution: "rdflib.term.Identifier", + main_entity: "rdflib.term.Identifier", + ) -> "None": + # Get the list of inputs + qinputs = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_INPUTS_SPARQL, + initNs=self.SPARQL_NS, + ) + qinputsres = g.query( + qinputs, + initBindings={ + "execution": execution, + }, + ) + + # TODO: implement this + + return None + def generateWorkflowMetaFromJSONLD( self, jsonld_obj: "Mapping[str, Any]", @@ -469,7 +727,7 @@ def generateWorkflowMetaFromJSONLD( # or delegate on the prospective one. container_type: "Optional[ContainerType]" = None additional_container_type: "Optional[ContainerType]" = None - the_containers: "MutableSequence[Container]" = [] + the_containers: "Sequence[Container]" = [] if retrospective_first: # For the retrospective provenance at least an execution must # be described in the RO-Crate. 
Once one is chosen, @@ -495,176 +753,23 @@ def generateWorkflowMetaFromJSONLD( execrow, rdflib.query.ResultRow ), "Check the SPARQL code, as it should be a SELECT query" print(f"\tExecution {execrow.execution}") - qcontainers = rdflib.plugins.sparql.prepareQuery( - self.OBTAIN_RUN_CONTAINERS, - initNs=self.SPARQL_NS, - ) - qcontainersres = g.query( - qcontainers, - initBindings={ - "execution": execrow.execution, - }, - ) - # This is the first pass, to learn about the kind of - # container factory to use - for containerrow in qcontainersres: - assert isinstance( - containerrow, rdflib.query.ResultRow - ), "Check the SPARQL code, as it should be a SELECT query" - # These hints were left by WfExS, but they are not expected - # from other implementations. - if containerrow.type_of_container is not None: - putative_container_type = ( - ApplicationCategory2ContainerType.get( - str(containerrow.type_of_container) - ) - ) - if container_type is None: - container_type = putative_container_type - elif ( - putative_container_type is not None - and putative_container_type != container_type - ): - self.logger.warning( - f"Not all the containers of execution {str(matched_crate.mainentity)} were materialized with {container_type} factory (also found {putative_container_type})" - ) - - # These hints should be left by any compliant WRROC - # implementation - if containerrow.container_additional_type is not None: - try: - putative_additional_container_type = ( - AdditionalType2ContainerType.get( - ContainerImageAdditionalType( - str(containerrow.container_additional_type) - ) - ) - ) - if additional_container_type is None: - additional_container_type = ( - putative_additional_container_type - ) - elif ( - putative_additional_container_type is not None - and putative_additional_container_type - not in (container_type, additional_container_type) - ): - self.logger.warning( - f"Not all the containers of execution {str(matched_crate.mainentity)} were labelled with {additional_container_type} factory (also found {putative_additional_container_type})" - ) - except Exception as e: - self.logger.error( - f"Unable to map additional type {str(containerrow.container_additional_type)} for {str(containerrow.container)}" - ) - - # Assigning this, as it is going to be used later to - # build the list of containers - if container_type is None and additional_container_type is not None: - container_type = additional_container_type - - if container_type is None: + contresult = self._parseContainersFromExecution( + g, execrow.execution, main_entity=matched_crate.mainentity + ) + # TODO: deal with more than one execution + if contresult is None: continue - # This is the second pass, to generate the list of - # containers described in the RO-Crate - for containerrow in qcontainersres: - assert isinstance( - containerrow, rdflib.query.ResultRow - ), "Check the SPARQL code, as it should be a SELECT query" - self.logger.debug( - f"""\ -Container {containerrow.container} -{containerrow.container_additional_type} -{containerrow.type_of_container} -{containerrow.type_of_container_type} -{containerrow.container_registry} -{containerrow.container_name} -{containerrow.container_tag} -{containerrow.container_sha256} -{containerrow.container_platform} -{containerrow.container_arch} -""" - ) + container_type, the_containers = contresult - if ( - containerrow.container_additional_type is not None - and containerrow.container_name is not None - ): - try: - putative_additional_container_type = ( - AdditionalType2ContainerType.get( - 
ContainerImageAdditionalType( - str(containerrow.container_additional_type) - ) - ) - ) - registries: "Optional[Mapping[ContainerType, str]]" = ( - None - ) - fingerprint = None - origTaggedName = "" - taggedName = "" - image_signature = None - if ( - putative_additional_container_type - == ContainerType.Docker - ): - the_registry = ( - str(containerrow.container_registry) - if containerrow.container_registry is not None - else DEFAULT_DOCKER_REGISTRY - ) - registries = { - ContainerType.Docker: the_registry, - } - container_identifier = str( - containerrow.container_name - ) - assert containerrow.container_sha256 is not None - fingerprint = f"{the_registry}/{container_identifier}@sha256:{str(containerrow.container_sha256)}" - assert containerrow.container_tag is not None - origTaggedName = f"{container_identifier}:{str(containerrow.container_tag)}" - taggedName = f"docker://{the_registry}/{container_identifier}:{str(containerrow.container_tag)}" - # Disable for now - # image_signature = stringifyDigest("sha256", bytes.fromhex(str(containerrow.container_sha256))) - elif ( - putative_additional_container_type - == ContainerType.Singularity - ): - origTaggedName = str(containerrow.container_name) - taggedName = origTaggedName - fingerprint = origTaggedName - - the_containers.append( - Container( - origTaggedName=origTaggedName, - type=container_type, - registries=registries, - taggedName=cast("URIType", taggedName), - architecture=None - if containerrow.container_arch is None - else cast( - "ProcessorArchitecture", - str(containerrow.container_arch), - ), - operatingSystem=None - if containerrow.container_platform is None - else cast( - "ContainerOperatingSystem", - str(containerrow.container_platform), - ), - fingerprint=cast("Fingerprint", fingerprint), - source_type=putative_additional_container_type, - image_signature=image_signature, - ) - ) - except Exception as e: - self.logger.exception( - f"Unable to assign from additional type {str(containerrow.container_additional_type)} for {str(containerrow.container)}" - ) + # TODO: which are the needed inputs, to be integrated + # into the latter workflow_meta? 
+ self._parseInputsFromExecution( + g, execrow.execution, main_entity=matched_crate.mainentity + ) - # TODO: deal with more than one execution + # Now, let's get the list of input parameters break except Exception as e: raise ROCrateToolboxException( From 6f491d074eaf9da4df41cbcb6f97e0b29dc22ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 14 May 2024 21:08:55 +0200 Subject: [PATCH 20/42] Added fixes for stubs which arose while doing a test with official mypy pre-commit hook --- .pre-commit-config.yaml | 36 ++-- mypy-requirements.txt | 2 +- mypy-stubs/crypt4gh/cli.pyi | 2 +- mypy-stubs/crypt4gh/debug.pyi | 1 - mypy-stubs/crypt4gh/keys/debug.pyi | 1 - mypy-stubs/crypt4gh/keys/ssh.pyi | 3 +- mypy-stubs/defusedxml/expatbuilder.pyi | 32 +++- mypy-stubs/defusedxml/expatreader.pyi | 2 +- mypy-stubs/defusedxml/lxml.pyi | 45 +++-- mypy-stubs/defusedxml/minidom.pyi | 3 +- mypy-stubs/defusedxml/pulldom.pyi | 4 +- mypy-stubs/defusedxml/sax.pyi | 11 +- mypy-stubs/defusedxml/xmlrpc.pyi | 37 +++- mypy-stubs/jsonpath_ng/bin/jsonpath.pyi | 2 + mypy-stubs/jsonpath_ng/ext/arithmetic.pyi | 2 + mypy-stubs/jsonpath_ng/ext/filter.pyi | 2 + mypy-stubs/jsonpath_ng/ext/iterable.pyi | 1 + mypy-stubs/jsonpath_ng/ext/string.pyi | 2 + mypy-stubs/rocrate/cli.pyi | 11 +- mypy-stubs/xdg/BaseDirectory.pyi | 21 --- mypy-stubs/xdg/Config.pyi | 17 -- mypy-stubs/xdg/DesktopEntry.pyi | 71 -------- mypy-stubs/xdg/Exceptions.pyi | 41 ----- mypy-stubs/xdg/IconTheme.pyi | 70 -------- mypy-stubs/xdg/IniFile.pyi | 81 --------- mypy-stubs/xdg/Locale.pyi | 11 -- mypy-stubs/xdg/Menu.pyi | 199 ---------------------- mypy-stubs/xdg/MenuEditor.pyi | 130 -------------- mypy-stubs/xdg/Mime.pyi | 124 -------------- mypy-stubs/xdg/RecentFiles.pyi | 40 ----- mypy-stubs/xdg/util.pyi | 5 - 31 files changed, 143 insertions(+), 866 deletions(-) delete mode 100644 mypy-stubs/xdg/BaseDirectory.pyi delete mode 100644 mypy-stubs/xdg/Config.pyi delete mode 100644 mypy-stubs/xdg/DesktopEntry.pyi delete mode 100644 mypy-stubs/xdg/Exceptions.pyi delete mode 100644 mypy-stubs/xdg/IconTheme.pyi delete mode 100644 mypy-stubs/xdg/IniFile.pyi delete mode 100644 mypy-stubs/xdg/Locale.pyi delete mode 100644 mypy-stubs/xdg/Menu.pyi delete mode 100644 mypy-stubs/xdg/MenuEditor.pyi delete mode 100644 mypy-stubs/xdg/Mime.pyi delete mode 100644 mypy-stubs/xdg/RecentFiles.pyi delete mode 100644 mypy-stubs/xdg/util.pyi diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 10b0ac64..182fed68 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,11 @@ fail_fast: true repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-json + - id: check-yaml - repo: local hooks: - id: pylint @@ -16,20 +21,31 @@ repos: language: system types: - python -# files: \.py$ -# exclude: "^[^/]*env/|development-[^/]*/|docs/" exclude: "/(site-packages|development-[^/]*|docs|node_modules|__pycache__|\\..*)/$" require_serial: true -# entry: ./basic-installer.bash mypy --strict - entry: mypy --strict --show-error-codes --no-warn-unused-ignores -# Main problem: python executable path, used to find the environment is hard -# - repo: https://github.com/pre-commit/mirrors-mypy.git -# rev: v1.2.0 + entry: mypy + args: [--strict, --show-error-codes, --no-warn-unused-ignores] + +# Main problem: python executable path, used to find the environment, is hardcoded + - repo: https://github.com/pre-commit/mirrors-mypy.git + rev: v1.10.0 + hooks: + - id: mypy + alias: mirrormypy + 
exclude: "/(site-packages|development-[^/]*|docs|node_modules|__pycache__|\\..*)/$" + require_serial: true + entry: env PYTHONPATH=. mypy + args: [--strict, --show-error-codes, --no-warn-unused-ignores, --python-executable, .full/bin/python] + stages: [manual] +# - repo: https://github.com/pylint-dev/pylint.git +# rev: v3.2.0 # hooks: -# - id: mypy +# - id: pylint +# alias: mirrorpylint # exclude: "^[^/]*env/|dev-[^/]*/" -# require_serial: true -# args: [--strict, --show-error-codes, --no-warn-unused-ignores, --python-executable, .pyWEenv/bin/python] +# entry: env PYTHONPATH=. pylint +# args: [-j, "4", --source-roots, .] +# stages: [manual] # - repo: meta - repo: https://github.com/inab/python-extended-json-schema-validator.git rev: v0.15.1 diff --git a/mypy-requirements.txt b/mypy-requirements.txt index 7b27cce1..303f3fae 100644 --- a/mypy-requirements.txt +++ b/mypy-requirements.txt @@ -3,6 +3,7 @@ mypy-extensions # typing libraries, needed by mypy types-boto3 types-certifi +types-cryptography types-jsonschema types-mypy-extensions types-openpyxl @@ -13,4 +14,3 @@ types-PyYAML types-requests types-setuptools # We do not use this directly, but the crypt4gh stubs embedded in the repo -cryptography >= 41.0.0 diff --git a/mypy-stubs/crypt4gh/cli.pyi b/mypy-stubs/crypt4gh/cli.pyi index aa619efc..f2137d76 100644 --- a/mypy-stubs/crypt4gh/cli.pyi +++ b/mypy-stubs/crypt4gh/cli.pyi @@ -8,6 +8,7 @@ from typing import ( Optional, Pattern, Sequence, + Tuple, ) from typing_extensions import ( @@ -18,7 +19,6 @@ LOG: Logger C4GH_DEBUG: str | Literal[False] DEFAULT_SK: Optional[str] DEFAULT_LOG: Optional[str] -__doc__: str def parse_args(argv: Sequence[str] = ...) -> Mapping[str, Any]: ... diff --git a/mypy-stubs/crypt4gh/debug.pyi b/mypy-stubs/crypt4gh/debug.pyi index 41d4cc13..1bff62ab 100644 --- a/mypy-stubs/crypt4gh/debug.pyi +++ b/mypy-stubs/crypt4gh/debug.pyi @@ -13,7 +13,6 @@ from typing import ( LOG: Logger DEFAULT_SK: Optional[str] DEFAULT_LOG: Optional[str] -__doc__: str def parse_args(argv: Sequence[str] = ...) -> Mapping[str, Any]: ... def output(args: Mapping[str, Any]) -> None: ... diff --git a/mypy-stubs/crypt4gh/keys/debug.pyi b/mypy-stubs/crypt4gh/keys/debug.pyi index c76a93d5..d84abcac 100644 --- a/mypy-stubs/crypt4gh/keys/debug.pyi +++ b/mypy-stubs/crypt4gh/keys/debug.pyi @@ -7,6 +7,5 @@ from typing import ( ) LOG: Logger -__doc__: str def main(argv: Sequence[str] = ...) -> None: ... diff --git a/mypy-stubs/crypt4gh/keys/ssh.pyi b/mypy-stubs/crypt4gh/keys/ssh.pyi index 11def06a..7186fe68 100644 --- a/mypy-stubs/crypt4gh/keys/ssh.pyi +++ b/mypy-stubs/crypt4gh/keys/ssh.pyi @@ -12,13 +12,12 @@ from typing import ( ) from cryptography.hazmat.primitives.ciphers import Cipher -from cryptography.hazmat.primitives.ciphers.modes import Mode LOG: Logger MAGIC_WORD: bytes def get_derived_key_length(ciphername: bytes) -> int: ... -def get_cipher(ciphername: bytes, derived_key: bytes) -> Cipher[Mode]: ... +def get_cipher(ciphername: bytes, derived_key: bytes) -> Cipher: ... def decode_string(stream: IO[bytes]) -> bytes: ... def parse_private_key( stream: IO[bytes], callback: Callable[[], EncodeProto] diff --git a/mypy-stubs/defusedxml/expatbuilder.pyi b/mypy-stubs/defusedxml/expatbuilder.pyi index 1f53c12a..f887593a 100644 --- a/mypy-stubs/defusedxml/expatbuilder.pyi +++ b/mypy-stubs/defusedxml/expatbuilder.pyi @@ -20,21 +20,41 @@ class DefusedExpatBuilder(_ExpatBuilder): forbid_external: bool = True, ) -> None: ... 
def defused_start_doctype_decl( - self, name, sysid, pubid, has_internal_subset + self, + name: str | None, + sysid: str | None, + pubid: str | None, + has_internal_subset: bool, ) -> None: ... def defused_entity_decl( - self, name, is_parameter_entity, value, base, sysid, pubid, notation_name + self, + name: str | None, + is_parameter_entity: bool, + value: str | None, + base: str | None, + sysid: str | None, + pubid: str | None, + notation_name: str | None, ) -> None: ... def defused_unparsed_entity_decl( - self, name, base, sysid, pubid, notation_name + self, + name: str | None, + base: str | None, + sysid: str | None, + pubid: str | None, + notation_name: str | None, ) -> None: ... def defused_external_entity_ref_handler( - self, context, base, sysid, pubid + self, + context: str | None, + base: str | None, + sysid: str | None, + pubid: str | None, ) -> None: ... - def install(self, parser) -> None: ... + def install(self, parser: _ExpatBuilder) -> None: ... class DefusedExpatBuilderNS(_Namespaces, DefusedExpatBuilder): - def install(self, parser) -> None: ... + def install(self, parser: _ExpatBuilder) -> None: ... def reset(self) -> None: ... def parse( diff --git a/mypy-stubs/defusedxml/expatreader.pyi b/mypy-stubs/defusedxml/expatreader.pyi index 7453fee1..70d79e13 100644 --- a/mypy-stubs/defusedxml/expatreader.pyi +++ b/mypy-stubs/defusedxml/expatreader.pyi @@ -7,4 +7,4 @@ __origin__: str DefusedExpatParser = Incomplete -def create_parser(*args, **kwargs): ... +def create_parser(*args: str, **kwargs: str) -> DefusedExpatParser: ... diff --git a/mypy-stubs/defusedxml/lxml.pyi b/mypy-stubs/defusedxml/lxml.pyi index 1cab02e8..11825913 100644 --- a/mypy-stubs/defusedxml/lxml.pyi +++ b/mypy-stubs/defusedxml/lxml.pyi @@ -1,5 +1,10 @@ import threading from _typeshed import Incomplete +from _typeshed import SupportsRead +from typing import ( + Iterator, + Sequence, +) # Not bothering with types here as lxml support is supposed to be dropped in a future version # of defusedxml @@ -13,41 +18,49 @@ class _ElementBase: ... class RestrictedElement(_ElementBase): blacklist: Incomplete - def __iter__(self): ... - def iterchildren(self, tag: Incomplete | None = ..., reversed: bool = ...): ... - def iter(self, tag: Incomplete | None = ..., *tags): ... - def iterdescendants(self, tag: Incomplete | None = ..., *tags): ... - def itersiblings(self, tag: Incomplete | None = ..., preceding: bool = ...): ... - def getchildren(self): ... - def getiterator(self, tag: Incomplete | None = ...): ... + def __iter__(self) -> Iterator[_ElementBase]: ... + def iterchildren( + self, tag: Incomplete | None = ..., reversed: bool = ... + ) -> Iterator[_ElementBase]: ... + def iter( + self, tag: _ElementBase | None = ..., *tags: _ElementBase + ) -> Iterator[_ElementBase]: ... + def iterdescendants( + self, tag: _ElementBase | None = ..., *tags: _ElementBase + ) -> Iterator[_ElementBase]: ... + def itersiblings( + self, tag: Incomplete | None = ..., preceding: bool = ... + ) -> Iterator[_ElementBase]: ... + def getchildren(self) -> Sequence[_ElementBase]: ... + def getiterator(self, tag: Incomplete | None = ...) -> Iterator[_ElementBase]: ... class GlobalParserTLS(threading.local): parser_config: Incomplete element_class: Incomplete - def createDefaultParser(self): ... - def setDefaultParser(self, parser) -> None: ... - def getDefaultParser(self): ... + def createDefaultParser(self) -> Incomplete: ... + def setDefaultParser(self, parser: Incomplete) -> None: ... + def getDefaultParser(self) -> Incomplete: ... 
getDefaultParser: Incomplete def check_docinfo( - elementtree, forbid_dtd: bool = ..., forbid_entities: bool = ... + elementtree: Incomplete, forbid_dtd: bool = ..., forbid_entities: bool = ... ) -> None: ... def parse( - source, + source: str | SupportsRead[bytes | str], parser: Incomplete | None = ..., base_url: Incomplete | None = ..., forbid_dtd: bool = ..., forbid_entities: bool = ..., -): ... +) -> Incomplete: ... def fromstring( - text, + text: str, parser: Incomplete | None = ..., base_url: Incomplete | None = ..., forbid_dtd: bool = ..., forbid_entities: bool = ..., -): ... +) -> Incomplete: ... XML = fromstring -def iterparse(*args, **kwargs) -> None: ... +def iterparse(*args: str, **kwargs: str) -> None: ... diff --git a/mypy-stubs/defusedxml/minidom.pyi b/mypy-stubs/defusedxml/minidom.pyi index 974d1a93..6649c1f4 100644 --- a/mypy-stubs/defusedxml/minidom.pyi +++ b/mypy-stubs/defusedxml/minidom.pyi @@ -1,10 +1,11 @@ from _typeshed import Incomplete +from _typeshed import SupportsRead from xml.dom.minidom import Document __origin__: str def parse( - file, + file: str | SupportsRead[bytes | str], parser: Incomplete | None = None, bufsize: int | None = None, forbid_dtd: bool = False, diff --git a/mypy-stubs/defusedxml/pulldom.pyi b/mypy-stubs/defusedxml/pulldom.pyi index 173065ab..541e38cb 100644 --- a/mypy-stubs/defusedxml/pulldom.pyi +++ b/mypy-stubs/defusedxml/pulldom.pyi @@ -1,3 +1,5 @@ +from _typeshed import SupportsRead + from xml.dom.pulldom import DOMEventStream from .expatreader import DefusedExpatParser @@ -5,7 +7,7 @@ from .expatreader import DefusedExpatParser __origin__: str def parse( - stream_or_string, + stream_or_string: str | SupportsRead[bytes | str], parser: DefusedExpatParser | None = None, bufsize: int | None = None, forbid_dtd: bool = False, diff --git a/mypy-stubs/defusedxml/sax.pyi b/mypy-stubs/defusedxml/sax.pyi index a4c32eda..bd5d10c1 100644 --- a/mypy-stubs/defusedxml/sax.pyi +++ b/mypy-stubs/defusedxml/sax.pyi @@ -1,21 +1,22 @@ from _typeshed import Incomplete +from _typeshed import SupportsRead from xml.sax import ErrorHandler as _ErrorHandler - +from xml.sax.handler import ContentHandler as _ContentHandler from .expatreader import DefusedExpatParser __origin__: str def parse( - source, - handler, + source: str | SupportsRead[bytes | str], + handler: _ContentHandler, errorHandler: _ErrorHandler = ..., forbid_dtd: bool = False, forbid_entities: bool = True, forbid_external: bool = True, ) -> None: ... def parseString( - string, - handler, + string: str, + handler: _ContentHandler, errorHandler: _ErrorHandler = ..., forbid_dtd: bool = False, forbid_entities: bool = True, diff --git a/mypy-stubs/defusedxml/xmlrpc.pyi b/mypy-stubs/defusedxml/xmlrpc.pyi index b3501f34..bee386e7 100644 --- a/mypy-stubs/defusedxml/xmlrpc.pyi +++ b/mypy-stubs/defusedxml/xmlrpc.pyi @@ -1,10 +1,13 @@ from _typeshed import Incomplete -from xmlrpc.client import ExpatParser +from xmlrpc.client import ( + ExpatParser, + Unmarshaller, +) __origin__: str MAX_DATA: int = 31457280 -def defused_gzip_decode(data, limit: int | None = None): ... +def defused_gzip_decode(data: bytes | bytearray, limit: int | None = None) -> bytes: ... 
# Couldn't type this as a class deriving from gzip.GzipFile # since overwriting `read` method does not define an optional argument @@ -17,22 +20,42 @@ class DefusedExpatParser(ExpatParser): forbid_external: bool def __init__( self, - target, + target: Unmarshaller, forbid_dtd: bool = False, forbid_entities: bool = True, forbid_external: bool = True, ) -> None: ... def defused_start_doctype_decl( - self, name, sysid, pubid, has_internal_subset + self, + name: str | None, + sysid: str | None, + pubid: str | None, + has_internal_subset: bool, ) -> None: ... def defused_entity_decl( - self, name, is_parameter_entity, value, base, sysid, pubid, notation_name + self, + name: str | None, + is_parameter_entity: bool, + value: str | None, + base: str | None, + sysid: str | None, + pubid: str | None, + notation_name: str | None, ) -> None: ... def defused_unparsed_entity_decl( - self, name, base, sysid, pubid, notation_name + self, + name: str | None, + base: str | None, + sysid: str | None, + pubid: str | None, + notation_name: str | None, ) -> None: ... def defused_external_entity_ref_handler( - self, context, base, sysid, pubid + self, + context: str | None, + base: str | None, + sysid: str | None, + pubid: str | None, ) -> None: ... def monkey_patch() -> None: ... diff --git a/mypy-stubs/jsonpath_ng/bin/jsonpath.pyi b/mypy-stubs/jsonpath_ng/bin/jsonpath.pyi index 27d37e9a..49abe641 100644 --- a/mypy-stubs/jsonpath_ng/bin/jsonpath.pyi +++ b/mypy-stubs/jsonpath_ng/bin/jsonpath.pyi @@ -1,3 +1,5 @@ +from typing import Sequence + from jsonpath_ng import parse as parse from jsonpath_ng.jsonpath import ( DatumInContext, diff --git a/mypy-stubs/jsonpath_ng/ext/arithmetic.pyi b/mypy-stubs/jsonpath_ng/ext/arithmetic.pyi index 74431b0e..80d4ef7c 100644 --- a/mypy-stubs/jsonpath_ng/ext/arithmetic.pyi +++ b/mypy-stubs/jsonpath_ng/ext/arithmetic.pyi @@ -1,4 +1,6 @@ from typing import ( + Callable, + Mapping, Sequence, ) diff --git a/mypy-stubs/jsonpath_ng/ext/filter.pyi b/mypy-stubs/jsonpath_ng/ext/filter.pyi index 8b633ff1..24c8621b 100644 --- a/mypy-stubs/jsonpath_ng/ext/filter.pyi +++ b/mypy-stubs/jsonpath_ng/ext/filter.pyi @@ -1,5 +1,7 @@ from typing import ( Any, + Callable, + Mapping, Sequence, ) from .. import DatumInContext as DatumInContext, Index as Index, JSONPath as JSONPath diff --git a/mypy-stubs/jsonpath_ng/ext/iterable.pyi b/mypy-stubs/jsonpath_ng/ext/iterable.pyi index 41cfc94e..14ad58fc 100644 --- a/mypy-stubs/jsonpath_ng/ext/iterable.pyi +++ b/mypy-stubs/jsonpath_ng/ext/iterable.pyi @@ -1,5 +1,6 @@ from typing import ( Any, + Optional, Sequence, ) diff --git a/mypy-stubs/jsonpath_ng/ext/string.pyi b/mypy-stubs/jsonpath_ng/ext/string.pyi index 38144b02..35d6b0d2 100644 --- a/mypy-stubs/jsonpath_ng/ext/string.pyi +++ b/mypy-stubs/jsonpath_ng/ext/string.pyi @@ -1,9 +1,11 @@ from typing import ( + Any, Optional, Pattern, Sequence, ) from .. import DatumInContext as DatumInContext, This as This +from ..jsonpath import JSONVal SUB: Pattern[str] SPLIT: Pattern[str] diff --git a/mypy-stubs/rocrate/cli.pyi b/mypy-stubs/rocrate/cli.pyi index cfa51a55..0fc04127 100644 --- a/mypy-stubs/rocrate/cli.pyi +++ b/mypy-stubs/rocrate/cli.pyi @@ -3,6 +3,7 @@ from typing import ( Any, Optional, Sequence, + Set, ) from .model.computerlanguage import LANG_MAP as LANG_MAP @@ -36,14 +37,18 @@ def suite( ) -> None: ... def instance( crate_dir: str, - suite, - url, + suite: Optional[str], + url: str, resource: str, service: str, identifier: Optional[str], name: Optional[str], ) -> None: ... 
def definition( - crate_dir: str, suite, path: str, engine: str, engine_version: Optional[str] + crate_dir: str, + suite: Optional[str], + path: str, + engine: str, + engine_version: Optional[str], ) -> None: ... def write_zip(crate_dir: str, dst: str) -> None: ... diff --git a/mypy-stubs/xdg/BaseDirectory.pyi b/mypy-stubs/xdg/BaseDirectory.pyi deleted file mode 100644 index c105cfd8..00000000 --- a/mypy-stubs/xdg/BaseDirectory.pyi +++ /dev/null @@ -1,21 +0,0 @@ -from collections.abc import Generator -from typing import ( - Optional, - Sequence, -) - -xdg_data_home: str -xdg_data_dirs: Sequence[str] -xdg_config_home: str -xdg_config_dirs: Sequence[str] -xdg_cache_home: str -xdg_state_home: str - -def save_config_path(*resource: str) -> str: ... -def save_data_path(*resource: str) -> str: ... -def save_cache_path(*resource: str) -> str: ... -def save_state_path(*resource: str) -> str: ... -def load_config_paths(*resource: str) -> Generator[str, None, None]: ... -def load_first_config(*resource: str) -> Optional[str]: ... -def load_data_paths(*resource: str) -> Generator[str, None, None]: ... -def get_runtime_dir(strict: bool = ...) -> str: ... diff --git a/mypy-stubs/xdg/Config.pyi b/mypy-stubs/xdg/Config.pyi deleted file mode 100644 index 353f6830..00000000 --- a/mypy-stubs/xdg/Config.pyi +++ /dev/null @@ -1,17 +0,0 @@ -from typing import ( - Optional, -) - -language: str -windowmanager: Optional[str] -icon_theme: str -icon_size: int -cache_time: int -root_mode: bool - -def setWindowManager(wm: str) -> None: ... -def setIconTheme(theme: str) -> None: ... -def setIconSize(size: int) -> None: ... -def setCacheTime(time: int) -> None: ... -def setLocale(lang: str) -> None: ... -def setRootMode(boolean: bool) -> None: ... diff --git a/mypy-stubs/xdg/DesktopEntry.pyi b/mypy-stubs/xdg/DesktopEntry.pyi deleted file mode 100644 index 657e2b6a..00000000 --- a/mypy-stubs/xdg/DesktopEntry.pyi +++ /dev/null @@ -1,71 +0,0 @@ -from typing import ( - MutableMapping, - Optional, - Pattern, - Sequence, -) - -from xdg.Exceptions import ParsingError as ParsingError -from xdg.IniFile import IniFile as IniFile, is_ascii as is_ascii -from xdg.util import which as which - -class DesktopEntry(IniFile): - defaultGroup: str - content: MutableMapping[str, MutableMapping[str, str]] - def __init__(self, filename: Optional[str] = ...) -> None: ... - def parse(self, file: str) -> None: ... - def findTryExec(self) -> Optional[str]: ... - def getType(self) -> str: ... - def getVersion(self) -> float: ... - def getVersionString(self) -> str: ... - def getName(self) -> str: ... - def getGenericName(self) -> str: ... - def getNoDisplay(self) -> bool: ... - def getComment(self) -> str: ... - def getIcon(self) -> str: ... - def getHidden(self) -> bool: ... - def getOnlyShowIn(self) -> Sequence[str]: ... - def getNotShowIn(self) -> Sequence[str]: ... - def getTryExec(self) -> str: ... - def getExec(self) -> str: ... - def getPath(self) -> str: ... - def getTerminal(self) -> bool: ... - def getMimeType(self) -> Sequence[Pattern[str]]: ... - def getMimeTypes(self) -> Sequence[str]: ... - def getCategories(self) -> Sequence[str]: ... - def getStartupNotify(self) -> bool: ... - def getStartupWMClass(self) -> str: ... - def getURL(self) -> str: ... - def getServiceTypes(self) -> Sequence[str]: ... - def getDocPath(self) -> str: ... - def getKeywords(self) -> Sequence[str]: ... - def getInitialPreference(self) -> str: ... - def getDev(self) -> str: ... - def getFSType(self) -> str: ... - def getMountPoint(self) -> str: ... 
- def getReadonly(self) -> bool: ... - def getUnmountIcon(self) -> str: ... - def getMiniIcon(self) -> str: ... - def getTerminalOptions(self) -> str: ... - def getDefaultApp(self) -> str: ... - def getProtocols(self) -> Sequence[str]: ... - def getExtensions(self) -> Sequence[str]: ... - def getBinaryPattern(self) -> str: ... - def getMapNotify(self) -> str: ... - def getEncoding(self) -> str: ... - def getSwallowTitle(self) -> str: ... - def getSwallowExec(self) -> str: ... - def getSortOrder(self) -> Sequence[str]: ... - def getFilePattern(self) -> Pattern[str]: ... - def getActions(self) -> Sequence[str]: ... - filename: str - def new(self, filename: str) -> None: ... - type: str - name: str - def checkExtras(self) -> None: ... - def checkGroup(self, group: str) -> None: ... - def checkKey(self, key: str, value: str, group: str) -> None: ... - def checkType(self, key: str, type: str) -> None: ... - def checkOnlyShowIn(self, value: str) -> None: ... - def checkCategories(self, value: str) -> None: ... - def checkCategorie(self, value: str) -> None: ... diff --git a/mypy-stubs/xdg/Exceptions.pyi b/mypy-stubs/xdg/Exceptions.pyi deleted file mode 100644 index 764a1b11..00000000 --- a/mypy-stubs/xdg/Exceptions.pyi +++ /dev/null @@ -1,41 +0,0 @@ -debug: bool - -class Error(Exception): - msg: str - def __init__(self, msg: str) -> None: ... - -class ValidationError(Error): - msg: str - file: str - def __init__(self, msg: str, file: str) -> None: ... - -class ParsingError(Error): - msg: str - file: str - def __init__(self, msg: str, file: str) -> None: ... - -class NoKeyError(Error): - key: str - group: str - file: str - def __init__(self, key: str, group: str, file: str) -> None: ... - -class DuplicateKeyError(Error): - key: str - group: str - file: str - def __init__(self, key: str, group: str, file: str) -> None: ... - -class NoGroupError(Error): - group: str - file: str - def __init__(self, group: str, file: str) -> None: ... - -class DuplicateGroupError(Error): - group: str - file: str - def __init__(self, group: str, file: str) -> None: ... - -class NoThemeError(Error): - theme: str - def __init__(self, theme: str) -> None: ... diff --git a/mypy-stubs/xdg/IconTheme.pyi b/mypy-stubs/xdg/IconTheme.pyi deleted file mode 100644 index cdef2e28..00000000 --- a/mypy-stubs/xdg/IconTheme.pyi +++ /dev/null @@ -1,70 +0,0 @@ -from typing import ( - Any, - MutableMapping, - MutableSequence, - Optional, - Sequence, - Tuple, -) - -from xdg.BaseDirectory import xdg_data_dirs as xdg_data_dirs -from xdg.Exceptions import NoThemeError as NoThemeError, debug as debug -from xdg.IniFile import IniFile as IniFile, is_ascii as is_ascii - -class IconTheme(IniFile): - def __init__(self) -> None: ... - dir: str - def parse(self, file: str) -> None: ... - def getDir(self) -> str: ... - def getName(self) -> str: ... - def getComment(self) -> str: ... - def getInherits(self) -> Sequence[str]: ... - def getDirectories(self) -> Sequence[str]: ... - def getScaledDirectories(self) -> Sequence[str]: ... - def getHidden(self) -> bool: ... - def getExample(self) -> str: ... - def getSize(self, directory: str) -> int: ... - def getContext(self, directory: str) -> str: ... - def getType(self, directory: str) -> str: ... - def getMaxSize(self, directory: str) -> int: ... - def getMinSize(self, directory: str) -> int: ... - def getThreshold(self, directory: str) -> int: ... - def getScale(self, directory: str) -> int: ... - name: str - comment: str - directories: str - def checkExtras(self) -> None: ... 
- type: str - def checkGroup(self, group: str) -> None: ... - def checkKey(self, key: str, value: str, group: str) -> None: ... - -class IconData(IniFile): - def __init__(self) -> None: ... - def parse(self, file: str) -> None: ... - def getDisplayName(self) -> str: ... - def getEmbeddedTextRectangle(self) -> Sequence[int]: ... - def getAttachPoints(self) -> Sequence[Tuple[int, int]]: ... - def checkExtras(self) -> None: ... - def checkGroup(self, group: str) -> None: ... - def checkKey(self, key: str, value: str, group: str) -> None: ... - -icondirs: MutableSequence[str] -themes: MutableSequence[IconTheme] -theme_cache: MutableMapping[str, Sequence[Any]] -dir_cache: MutableMapping[str, Tuple[Sequence[str], float, float]] -icon_cache: MutableMapping[Tuple[str, int, str, Tuple[str, ...]], Tuple[float, str]] - -def getIconPath( - iconname: str, - size: Optional[int] = ..., - theme: Optional[str] = ..., - extensions: Sequence[str] = ..., -) -> str: ... -def getIconData(path: str) -> Optional[IconData]: ... -def LookupIcon( - iconname: str, size: int, theme: str, extensions: Sequence[str] -) -> str: ... -def DirectoryMatchesSize( - subdir: str, iconsize: int, theme: IconTheme -) -> Optional[bool]: ... -def DirectorySizeDistance(subdir: str, iconsize: int, theme: IconTheme) -> int: ... diff --git a/mypy-stubs/xdg/IniFile.pyi b/mypy-stubs/xdg/IniFile.pyi deleted file mode 100644 index e77deb32..00000000 --- a/mypy-stubs/xdg/IniFile.pyi +++ /dev/null @@ -1,81 +0,0 @@ -from typing import ( - Iterator, - MutableMapping, - MutableSequence, - Optional, - Pattern, - Sequence, - Tuple, -) - -from xdg.Exceptions import ( - DuplicateGroupError as DuplicateGroupError, - DuplicateKeyError as DuplicateKeyError, - NoGroupError as NoGroupError, - NoKeyError as NoKeyError, - ParsingError as ParsingError, - ValidationError as ValidationError, - debug as debug, -) -from xdg.util import u as u - -def is_ascii(s: str) -> bool: ... - -class IniFile: - defaultGroup: str - fileExtension: str - filename: str - tainted: bool - content: MutableMapping[str, MutableMapping[str, str]] - def __init__(self, filename: Optional[str] = ...) -> None: ... - def __cmp__(self, other): ... - def parse(self, filename: str, headers: Optional[Sequence[str]] = ...) -> None: ... - def get( - self, - key: str, - group: Optional[str] = ..., - locale: bool = ..., - type: str = ..., - list: bool = ..., - strict: bool = ..., - ) -> ( - bool - | int - | float - | Pattern[str] - | Tuple[int, int] - | Sequence[bool] - | Sequence[int] - | Sequence[float] - | Sequence[Pattern[str]] - | Sequence[Tuple[int, int]] - ): ... - def getList(self, string: str) -> Sequence[str]: ... - warnings: MutableSequence[str] - errors: MutableSequence[str] - def validate(self, report: str = ...) -> None: ... - def checkGroup(self, group: str) -> None: ... - def checkKey(self, key: str, value: str, group: str) -> None: ... - def checkValue( - self, key: str, value: str, type: str = ..., list: bool = ... - ) -> None: ... - def checkExtras(self) -> None: ... - def checkBoolean(self, value: str) -> int: ... - def checkNumber(self, value: str) -> Optional[int]: ... - def checkInteger(self, value: str) -> Optional[int]: ... - def checkPoint(self, value: str) -> Optional[int]: ... - def checkString(self, value: str) -> int: ... - def checkRegex(self, value: str) -> Optional[int]: ... - def write(self, filename: Optional[str] = ..., trusted: bool = ...) -> None: ... - def set( - self, key: str, value: str, group: Optional[str] = ..., locale: bool = ... - ) -> None: ... 
- def addGroup(self, group: str) -> None: ... - def removeGroup(self, group: str): ... - def removeKey( - self, key: str, group: Optional[str] = ..., locales: bool = ... - ) -> str: ... - def groups(self) -> Iterator[str]: ... - def hasGroup(self, group: str) -> bool: ... - def hasKey(self, key: str, group: Optional[str] = ...) -> bool: ... - def getFileName(self) -> str: ... diff --git a/mypy-stubs/xdg/Locale.pyi b/mypy-stubs/xdg/Locale.pyi deleted file mode 100644 index d6ce7946..00000000 --- a/mypy-stubs/xdg/Locale.pyi +++ /dev/null @@ -1,11 +0,0 @@ -from typing import ( - Optional, - Sequence, -) - -regex: str - -def expand_languages(languages: Optional[Sequence[str]] = ...) -> Sequence[str]: ... -def update(language: Optional[str] = ...) -> None: ... - -langs: Sequence[str] diff --git a/mypy-stubs/xdg/Menu.pyi b/mypy-stubs/xdg/Menu.pyi deleted file mode 100644 index aa3ae5af..00000000 --- a/mypy-stubs/xdg/Menu.pyi +++ /dev/null @@ -1,199 +0,0 @@ -import ast -from typing import ( - MutableMapping, - MutableSequence, - Optional, - Sequence, -) - -from xml.etree.ElementTree import Element - -from collections.abc import Generator -from xdg.BaseDirectory import ( - xdg_config_dirs as xdg_config_dirs, - xdg_data_dirs as xdg_data_dirs, -) -from xdg.DesktopEntry import DesktopEntry as DesktopEntry -from xdg.Exceptions import ParsingError as ParsingError -from xdg.util import PY3 as PY3 - -DELETED: str -NO_DISPLAY: str -HIDDEN: str -EMPTY: str -NOT_SHOW_IN: str -NO_EXEC: str - -MiscMenuEntry = Separator | MenuEntry | Menu - -class Menu: - Name: str - Directory: Optional[MenuEntry] - Entries: MutableSequence[MiscMenuEntry] - Doc: str - Filename: str - Depth: int - Parent: Optional[Menu] - NotInXml: bool - Show: bool - Visible: int - AppDirs: MutableSequence[str] - DefaultLayout: Optional[Layout] - Deleted: Optional[bool] - Directories: MutableSequence[str] - DirectoryDirs: MutableSequence[str] - Layout: Optional[Layout] - MenuEntries: MutableSequence[MenuEntry] - Moves: MutableSequence[Move] - OnlyUnallocated: Optional[bool] - Rules: MutableSequence[Rule] - Submenus: MutableSequence[Menu] - def __init__(self) -> None: ... - def __add__(self, other: Menu) -> Menu: ... - def __cmp__(self, other: Menu) -> int: ... - def __lt__(self, other: Menu) -> bool: ... - def __eq__(self, other: Menu) -> bool: ... - def getEntries( - self, show_hidden: bool = ... - ) -> Generator[MiscMenuEntry, None, None]: ... - def getMenuEntry(self, desktopfileid: str, deep: bool = ...) -> MenuEntry: ... - def getMenu(self, path: str) -> Menu: ... - def getPath(self, org: bool = ..., toplevel: bool = ...) -> str: ... - def getName(self) -> str: ... - def getGenericName(self) -> str: ... - def getComment(self) -> str: ... - def getIcon(self) -> str: ... - def sort(self) -> None: ... - def addSubmenu(self, newmenu: Menu) -> None: ... - def merge_inline(self, submenu: Menu) -> None: ... - -class Move: - Old: str - New: str - def __init__(self, old: str = ..., new: str = ...) -> None: ... - def __cmp__(self, other: Move): ... - -class Layout: - show_empty: bool - inline: bool - inline_limit: int - inline_header: bool - inline_alias: bool - def __init__( - self, - show_empty: bool = ..., - inline: bool = ..., - inline_limit: int = ..., - inline_header: bool = ..., - inline_alias: bool = ..., - ) -> None: ... - @property - def order(self) -> Sequence[Sequence[str]]: ... - @order.setter - def order(self, order: Sequence[Sequence[str]]) -> None: ... 
- -class Rule: - TYPE_INCLUDE: int - TYPE_EXCLUDE: int - @classmethod - def fromFilename(cls, type: int, filename: str) -> Rule: ... - Type: int - expression: ast.Expression - # Which is the type annotation from class "code"? - code: Any - def __init__(self, type: int, expression: ast.Expression) -> None: ... - def apply( - self, menuentries: Sequence[MenuEntry], run: int - ) -> Sequence[MenuEntry]: ... - -class MenuEntry: - TYPE_USER: str - TYPE_SYSTEM: str - TYPE_BOTH: str - DesktopEntry: DesktopEntry - Show: bool - Original: Optional[MenuEntry] - Parents: MutableSequence[Menu] - Allocated: bool - Add: bool - MatchedInclude: bool - Categories: Sequence[str] - def __init__(self, filename: str, dir: str = ..., prefix: str = ...) -> None: ... - def save(self) -> None: ... - def getDir(self) -> str: ... - def getType(self) -> str: ... - Filename: str - Prefix: str - DesktopFileID: str - def setAttributes( - self, filename: str, dir: str = ..., prefix: str = ... - ) -> None: ... - def updateAttributes(self) -> None: ... - def __cmp__(self, other: MenuEntry) -> int: ... - def __lt__(self, other: MenuEntry) -> bool: ... - def __eq__(self, other: MenuEntry) -> bool: ... - -class Separator: - Parent: Menu - Show: bool - def __init__(self, parent: Menu) -> None: ... - -class Header: - Name: str - GenericName: str - Comment: str - def __init__(self, name: str, generic_name: str, comment: str) -> None: ... - -TYPE_DIR: int -TYPE_FILE: int - -class XMLMenuBuilder: - debug: bool - def __init__(self, debug: bool = ...) -> None: ... - cache: MenuEntryCache - def parse(self, filename: Optional[str] = ...) -> Menu: ... - def parse_menu(self, node: Element, filename: str) -> Menu: ... - def parse_node( - self, node: Element, filename: str, parent: Optional[Menu] = ... - ) -> None: ... - def parse_layout(self, node: Element) -> Layout: ... - def parse_move(self, node: Element) -> Move: ... - def parse_rule(self, node: Element) -> Rule: ... - def parse_bool_op( - self, node: Element, operator: ast.And | ast.Or - ) -> Optional[ast.Expr]: ... - def parse_rule_node(self, node: Element) -> ast.Expr: ... - def parse_app_dir(self, value, filename: str, parent: Menu) -> None: ... - def parse_default_app_dir(self, filename: str, parent: Menu) -> None: ... - def parse_directory_dir(self, value, filename: str, parent: Menu) -> None: ... - def parse_default_directory_dir(self, filename: str, parent: Menu) -> None: ... - def parse_merge_file( - self, value, child, filename: str, parent: Optional[Menu] - ) -> None: ... - def parse_merge_dir( - self, value, child, filename: str, parent: Optional[Menu] - ) -> None: ... - def parse_default_merge_dirs( - self, child, filename: str, parent: Optional[Menu] - ) -> None: ... - def merge_file(self, filename: str, child, parent: Optional[Menu]) -> None: ... - def parse_legacy_dir(self, dir_, prefix, filename: str, parent: Menu) -> None: ... - def merge_legacy_dir(self, dir_, prefix, filename: str, parent: Menu): ... - def parse_kde_legacy_dirs(self, filename: str, parent: Menu) -> None: ... - def post_parse(self, menu: Menu) -> None: ... - def generate_not_only_allocated(self, menu: Menu) -> None: ... - def generate_only_allocated(self, menu: Menu) -> None: ... - def handle_moves(self, menu: Menu) -> None: ... - -class MenuEntryCache: - cacheEntries: MutableMapping[str, MutableSequence[MenuEntry]] - cache: MutableMapping[str, Sequence[MenuEntry]] - def __init__(self) -> None: ... - def add_menu_entries( - self, dirs: Sequence[str], prefix: str = ..., legacy: bool = ... 
- ) -> None: ... - def get_menu_entries( - self, dirs: Sequence[str], legacy: bool = ... - ) -> Sequence[MenuEntry]: ... - -def parse(filename: Optional[str] = ..., debug: bool = ...) -> Menu: ... diff --git a/mypy-stubs/xdg/MenuEditor.pyi b/mypy-stubs/xdg/MenuEditor.pyi deleted file mode 100644 index 7dc11da1..00000000 --- a/mypy-stubs/xdg/MenuEditor.pyi +++ /dev/null @@ -1,130 +0,0 @@ -from typing import ( - Optional, - MutableSequence, -) -from xml.etree.ElementTree import Element - -from xdg.BaseDirectory import ( - xdg_config_dirs as xdg_config_dirs, - xdg_data_dirs as xdg_data_dirs, -) -from xdg.Config import setRootMode as setRootMode -from xdg.Exceptions import ParsingError as ParsingError -from xdg.Menu import ( - Layout as Layout, - Menu as Menu, - MenuEntry as MenuEntry, - Separator as Separator, - XMLMenuBuilder as XMLMenuBuilder, -) - -class MenuEditor: - menu: Optional[Menu] - filename: Optional[str] - tree: Optional[Element] - parser: XMLMenuBuilder - filenames: MutableSequence[str] - def __init__( - self, - menu: Optional[Menu] = ..., - filename: Optional[str] = ..., - root: bool = ..., - ) -> None: ... - def parse( - self, - menu: Optional[Menu | str] = ..., - filename: Optional[str] = ..., - root: bool = ..., - ) -> None: ... - def save(self) -> None: ... - def createMenuEntry( - self, - parent: Menu, - name: str, - command: Optional[str] = ..., - genericname: Optional[str] = ..., - comment: Optional[str] = ..., - icon: Optional[str] = ..., - terminal: Optional[bool] = ..., - after: Optional[MenuEntry] = ..., - before: Optional[MenuEntry] = ..., - ) -> MenuEntry: ... - def createMenu( - self, - parent: Menu, - name, - genericname: Optional[str] = ..., - comment: Optional[str] = ..., - icon: Optional[str] = ..., - after: Optional[Menu] = ..., - before: Optional[Menu] = ..., - ) -> Menu: ... - def createSeparator( - self, - parent: Menu, - after: Optional[MenuEntry] = ..., - before: Optional[MenuEntry] = ..., - ) -> Separator: ... - def moveMenuEntry( - self, - menuentry: MenuEntry, - oldparent: Menu, - newparent: Menu, - after: Optional[MenuEntry] = ..., - before: Optional[MenuEntry] = ..., - ) -> MenuEntry: ... - def moveMenu( - self, - menu: Menu, - oldparent: Menu, - newparent: Menu, - after: Optional[Menu] = ..., - before: Optional[Menu] = ..., - ) -> Menu: ... - def moveSeparator( - self, - separator: Separator, - parent: Menu, - after: Optional[MenuEntry] = ..., - before: Optional[MenuEntry] = ..., - ) -> Separator: ... - def copyMenuEntry( - self, - menuentry: MenuEntry, - oldparent: Menu, - newparent: Menu, - after: Optional[MenuEntry] = ..., - before: Optional[MenuEntry] = ..., - ) -> MenuEntry: ... - def editMenuEntry( - self, - menuentry: MenuEntry, - name: Optional[str] = ..., - genericname: Optional[str] = ..., - comment: Optional[str] = ..., - command: Optional[str] = ..., - icon: Optional[str] = ..., - terminal: Optional[bool] = ..., - nodisplay: Optional[bool] = ..., - hidden: Optional[bool] = ..., - ) -> MenuEntry: ... - def editMenu( - self, - menu: Menu, - name: Optional[str] = ..., - genericname: Optional[str] = ..., - comment: Optional[str] = ..., - icon: Optional[str] = ..., - nodisplay: Optional[bool] = ..., - hidden: Optional[bool] = ..., - ) -> Menu: ... - def hideMenuEntry(self, menuentry: MenuEntry) -> None: ... - def unhideMenuEntry(self, menuentry: MenuEntry) -> None: ... - def hideMenu(self, menu: Menu) -> None: ... - def unhideMenu(self, menu: Menu) -> None: ... - def deleteMenuEntry(self, menuentry: MenuEntry) -> MenuEntry: ... 
- def revertMenuEntry(self, menuentry: MenuEntry) -> MenuEntry: ... - def deleteMenu(self, menu: Menu) -> Menu: ... - def revertMenu(self, menu: Menu) -> Menu: ... - def deleteSeparator(self, separator: Separator) -> Separator: ... - def getAction(self, entry: Menu | MenuEntry | Separator) -> str: ... diff --git a/mypy-stubs/xdg/Mime.pyi b/mypy-stubs/xdg/Mime.pyi deleted file mode 100644 index f3ebe062..00000000 --- a/mypy-stubs/xdg/Mime.pyi +++ /dev/null @@ -1,124 +0,0 @@ -from typing import ( - Any, - IO, - MutableMapping, - MutableSequence, - Optional, - Pattern, - Sequence, - Set, - Tuple, -) - -from xdg import BaseDirectory as BaseDirectory - -FREE_NS: str -types: MutableMapping[Tuple[str, str], MIMEType] -exts: Optional[Any] -globs: Optional[GlobDB] -literals: Optional[Any] -magic: Optional[MagicDB] -PY3: bool - -def lookup(media: str, subtype: Optional[str] = ...) -> MIMEtype: ... - -class MIMEtype: - def __new__(cls, media: str, subtype: Optional[str] = ...) -> MIMEtype: ... - def get_comment(self) -> str: ... - def canonical(self) -> MIMEtype: ... - def inherits_from(self) -> Set[MIMEtype]: ... - def __hash__(self) -> integer: ... - -class UnknownMagicRuleFormat(ValueError): ... -class DiscardMagicRules(Exception): ... - -class MagicRule: - also: Optional[Pattern[str]] - start: int - value: bytes - mask: bytes - word: bytes - range: int - def __init__( - self, start: int, value: bytes, mask: bytes, word: bytes, range: int - ) -> None: ... - rule_ending_re: Pattern[str] - @classmethod - def from_file(cls, f: IO[bytes]) -> Tuple[int, MagicRule]: ... - def maxlen(self) -> int: ... - def match(self, buffer: bytes) -> bool: ... - def match0(self, buffer: bytes) -> bool: ... - -RuleTree = Sequence[Tuple[MagicRule, "RuleTree"]] - -class MagicMatchAny: - rules: Sequence[MagicRule] - def __init__(self, rules: Sequence[MagicRule]) -> None: ... - def match(self, buffer: bytes) -> bool: ... - def maxlen(self) -> int: ... - @classmethod - def from_file(cls, f: str) -> MagicMatchAny: ... - @classmethod - def from_rule_tree(cls, tree: RuleTree) -> MagicMatchAny: ... - -class MagicDB: - bytype: MutableMapping[MIMEtype, Sequence[Tuple[int, MagicMatchAny]]] - def __init__(self) -> None: ... - def merge_file(self, fname: str) -> None: ... - alltypes: MutableSequence[Tuple[int, MIMEtype, MagicMatchAny]] - maxlen: int - def finalise(self) -> None: ... - def match_data( - self, - data: bytes, - max_pri: int = ..., - min_pri: int = ..., - possible: Optional[Sequence[MIMEtype]] = ..., - ) -> MIMEtype: ... - def match( - self, - path: str, - max_pri: int = ..., - min_pri: int = ..., - possible: Optional[Sequence[MIMEtype]] = ..., - ) -> MIMEtype: ... - -class GlobDB: - allglobs: MutableMapping[MIMEtype, Set[Tuple[int, str, Tuple[str, ...]]]] - def __init__(self) -> None: ... - def merge_file(self, path: str) -> None: ... - exts: MutableMapping[str, Sequence[Tuple[MIMEtype, int]]] - cased_exts: MutableMapping[str, Sequence[Tuple[MIMEtype, int]]] - globs: MutableSequence[Tuple[Pattern[str], MIMEtype, int]] - literals: MutableMapping[str, Sequence[Tuple[MIMEtype, int]]] - cased_literals: MutableMapping[str, Sequence[Tuple[MIMEtype, int]]] - def finalise(self) -> None: ... - def first_match(self, path: str) -> Optional[Tuple[MIMEtype, int]]: ... - def all_matches(self, path: str) -> Sequence[Tuple[MIMEtype, int]]: ... 
- -text: MIMEtype -octet_stream: MIMEtype -inode_block: MIMEtype -inode_char: MIMEtype -inode_dir: MIMEtype -inode_fifo: MIMEtype -inode_socket: MIMEtype -inode_symlink: MIMEtype -inode_door: MIMEtype -app_exe: MIMEtype - -def update_cache() -> None: ... -def get_type_by_name(path: str) -> Optional[Tuple[MIMEtype, int]]: ... -def get_type_by_contents( - path: str, max_pri: int = ..., min_pri: int = ... -) -> MIMEtype: ... -def get_type_by_data( - data: bytes, max_pri: int = ..., min_pri: int = ... -) -> MIMEtype: ... -def get_type( - path: str, follow: bool = ..., name_pri: int = ... -) -> Tuple[MIMEtype, int] | MIMEtype: ... -def get_type2(path: str, follow: bool = ...) -> Tuple[MIMEtype, int] | MIMEtype: ... -def is_text_file(path: str) -> bool: ... -def get_extensions(mimetype: MIMEtype) -> Set[str]: ... -def install_mime_info(application: str, package_file: str) -> None: ... diff --git a/mypy-stubs/xdg/RecentFiles.pyi b/mypy-stubs/xdg/RecentFiles.pyi deleted file mode 100644 index 1adc8ce4..00000000 --- a/mypy-stubs/xdg/RecentFiles.pyi +++ /dev/null @@ -1,40 +0,0 @@ -from typing import ( - MutableSequence, - Optional, - Sequence, -) - -from xdg.Exceptions import ParsingError as ParsingError - -class RecentFiles: - RecentFiles: MutableSequence[RecentFile] - filename: str - def __init__(self) -> None: ... - def parse(self, filename: Optional[str] = ...) -> None: ... - def write(self, filename: Optional[str] = ...) -> None: ... - def getFiles( - self, - mimetypes: Optional[Sequence[str]] = ..., - groups: Optional[Sequence[str]] = ..., - limit: int = ..., - ): ... - def addFile( - self, - item: RecentFile, - mimetype: str, - groups: Optional[Sequence[str]] = ..., - private: bool = ..., - ) -> None: ... - def deleteFile(self, item: RecentFile) -> None: ... - def sort(self) -> None: ... - -class RecentFile: - URI: str - MimeType: str - Timestamp: str - Private: bool - Groups: MutableSequence[str] - def __init__(self) -> None: ... - def __cmp__(self, other: RecentFile): ... - def __lt__(self, other: RecentFile): ... - def __eq__(self, other: RecentFile): ... diff --git a/mypy-stubs/xdg/util.pyi b/mypy-stubs/xdg/util.pyi deleted file mode 100644 index 71bac70a..00000000 --- a/mypy-stubs/xdg/util.pyi +++ /dev/null @@ -1,5 +0,0 @@ -from shutil import which as which - -PY3: bool - -def u(s: bytes) -> str: ... 
From 7c1f30c597a42d7b2c979d6fa1bb4caf7fe36a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 14 May 2024 21:34:46 +0200 Subject: [PATCH 21/42] Fixes for several issues uncovered while trying Python 3.12 and clean installation --- wfexs_backend/fetchers/git.py | 9 ++++++--- wfexs_backend/fetchers/trs_files.py | 2 ++ wfexs_backend/ro_crate.py | 2 +- wfexs_backend/utils/rocrate.py | 2 +- wfexs_backend/wfexs_backend.py | 5 +++++ wfexs_backend/workflow_engines/cwl_engine.py | 1 + 6 files changed, 16 insertions(+), 5 deletions(-) diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index 3e987a23..1b85bb3c 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -209,6 +209,9 @@ def doMaterializeRepo( gitcheckout_params.extend(["origin", repoTag]) else: doRepoUpdate = False + # These are needed to remove a pylint complaint + gitclone_params = None + gitcheckout_params = None if doRepoUpdate: with tempfile.NamedTemporaryFile() as git_stdout, tempfile.NamedTemporaryFile() as git_stderr: @@ -345,13 +348,12 @@ def fetch( repoTag = None # Getting the repoRelPath (if available) + repoRelPath: "Optional[str]" = None if len(parsedInputURL.fragment) > 0: frag_qs = parse.parse_qs(parsedInputURL.fragment) subDirArr = frag_qs.get("subdirectory", []) if len(subDirArr) > 0: repoRelPath = subDirArr[0] - else: - repoRelPath = None # Now, reassemble the repoURL, to be used by git client repoURL = cast( @@ -362,7 +364,8 @@ def fetch( repo_tag_destdir, repo_desc, metadata_array = self.doMaterializeRepo( repoURL, repoTag=repoTag ) - repo_desc["relpath"] = cast("RelPath", repoRelPath) + if repoRelPath is not None: + repo_desc["relpath"] = cast("RelPath", repoRelPath) preferredName: "Optional[RelPath]" if repoRelPath is not None: diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 0690022e..d0f3f379 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -168,6 +168,8 @@ def fetchTRSFiles( fragment="", ) ) + else: + raise FetcherException(f"FIXME: Unhandled scheme {parsedInputURL.scheme}") topMeta = { "fetched": metadata_url, diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index d560bbed..5ceb68b1 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -2202,7 +2202,7 @@ def writeWRROC(self, filename: "AnyPath") -> None: # when it is not run in debug mode if self.logger.getEffectiveLevel() > logging.DEBUG: warnings.filterwarnings( - "ignore", category=UserWarning, module="^rocrate\.model\.file$" + "ignore", category=UserWarning, module=r"^rocrate\.model\.file$" ) self.crate.write_zip(filename) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 26296a70..bd806e7a 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -264,7 +264,7 @@ def identifyROCrate( # when it is not run in debug mode if self.logger.getEffectiveLevel() > logging.DEBUG: warnings.filterwarnings( - "ignore", category=SyntaxWarning, module="^pyld\.jsonld$" + "ignore", category=SyntaxWarning, module=r"^pyld\.jsonld$" ) jsonld_obj_ser = { "@graph": pyld.jsonld.expand( diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 9135fd81..1b377a41 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -1683,6 +1683,8 @@ def statusStagedWorkflows( # state unmarshalling and validations if wfInstance is not None: mStatus = 
wfInstance.getMarshallingStatus(reread_stats=True) + else: + mStatus = None yield instance_id, nickname, creation, wfSetup, mStatus @@ -2705,8 +2707,11 @@ def downloadContent( assert firstParsedURI is not None # Assure workflow inputs directory exists before the next step + workflowInputs_destdir: "AbsPath" if isinstance(dest, CacheType): workflowInputs_destdir = self.cachePathMap[dest] + else: + workflowInputs_destdir = dest self.logger.info( "downloading workflow input: {}".format(" or ".join(remote_uris)) diff --git a/wfexs_backend/workflow_engines/cwl_engine.py b/wfexs_backend/workflow_engines/cwl_engine.py index 2730f7ed..e100141b 100644 --- a/wfexs_backend/workflow_engines/cwl_engine.py +++ b/wfexs_backend/workflow_engines/cwl_engine.py @@ -1224,6 +1224,7 @@ def launchWorkflow( else: retVal = -1 matOutputs = [] + started = ended = datetime.datetime.min # Create augmentedInputs properly augmentedInputs = self.augmentCWLInputs(matInputs, cwl_dict_inputs) From 782f7032a5c81bb709f7d76d764ceac7c5e72e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 15 May 2024 02:42:16 +0200 Subject: [PATCH 22/42] More refinements in pre-commit hooks --- .pre-commit-config.yaml | 39 +++++++++++++++++++++++++-------------- mypy-requirements.txt | 7 ++++++- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 182fed68..1f0c20db 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,6 +6,8 @@ repos: hooks: - id: check-json - id: check-yaml + types: [file] + files: \.(yaml|yml|wfex\.(stage|export|ctxt))$ - repo: local hooks: - id: pylint @@ -14,29 +16,38 @@ repos: require_serial: true types: - python - exclude: "^[^/]*env/|dev-[^/]*/" - entry: pylint -j 4 --source-roots . + exclude: "^[^/]*env/|dev-[^/]*/|docs/" + entry: pylint + args: [-j , "4", --source-roots, .] - id: mypy name: Local MyPy language: system - types: + types_or: - python + - pyi exclude: "/(site-packages|development-[^/]*|docs|node_modules|__pycache__|\\..*)/$" require_serial: true entry: mypy args: [--strict, --show-error-codes, --no-warn-unused-ignores] -# Main problem: python executable path, used to find the environment, is hardcoded - - repo: https://github.com/pre-commit/mirrors-mypy.git - rev: v1.10.0 - hooks: - - id: mypy - alias: mirrormypy - exclude: "/(site-packages|development-[^/]*|docs|node_modules|__pycache__|\\..*)/$" - require_serial: true - entry: env PYTHONPATH=. mypy - args: [--strict, --show-error-codes, --no-warn-unused-ignores, --python-executable, .full/bin/python] - stages: [manual] +## Main problem: python executable path, used to find the environment, is hardcoded +# - repo: https://github.com/pre-commit/mirrors-mypy.git +# rev: v1.10.0 +# hooks: +# - id: mypy +# alias: mirrormypy +# exclude: "/(site-packages|development-[^/]*|docs|node_modules|__pycache__|\\..*)/$" +# require_serial: true +# entry: env PYTHONPATH=. 
mypy
+#        args: [--strict, --show-error-codes, --no-warn-unused-ignores, --python-executable, .full/bin/python]
+#        stages: [manual]
+#  - repo: meta
+#    hooks:
+#      - id: identity
+#        types:
+#          - python
+#        exclude: "/(site-packages|development-[^/]*|docs|node_modules|__pycache__|\\..*)/$"
+#        stages: [manual]
 #  - repo: https://github.com/pylint-dev/pylint.git
 #    rev: v3.2.0
 #    hooks:
diff --git a/mypy-requirements.txt b/mypy-requirements.txt
index 303f3fae..3eef413a 100644
--- a/mypy-requirements.txt
+++ b/mypy-requirements.txt
@@ -3,7 +3,10 @@ mypy-extensions
 # typing libraries, needed by mypy
 types-boto3
 types-certifi
+# We do not use this directly, but the crypt4gh stubs embedded in the repo do
 types-cryptography
+# We do not use this directly, but the rdflib annotations do
+types-html5lib
 types-jsonschema
 types-mypy-extensions
 types-openpyxl
@@ -13,4 +16,6 @@ types-pyxdg
 types-PyYAML
 types-requests
 types-setuptools
-# We do not use this directly, but the crypt4gh stubs embedded in the repo
+# We do not use this directly, but it is needed indirectly by the type
+# annotations of extended_nc_client
+types-six

From ef7f98683b92c06e193a64208c8828c1f44a1147 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Wed, 15 May 2024 03:01:06 +0200
Subject: [PATCH 23/42] Added shellcheck manual hook, to check shell script
 correctness

---
 .pre-commit-config.yaml                    | 6 ++++++
 dev-requirements.txt                       | 1 +
 wfexs_backend/payloads/nodejs_wrapper.bash | 2 +-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1f0c20db..8f00b218 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,6 +8,12 @@ repos:
     - id: check-yaml
       types: [file]
       files: \.(yaml|yml|wfex\.(stage|export|ctxt))$
+  - repo: https://github.com/koalaman/shellcheck-precommit.git
+    rev: v0.10.0
+    hooks:
+    - id: shellcheck
+      #args: [--format,json]
+      stages: [manual]
   - repo: local
     hooks:
     - id: pylint
diff --git a/dev-requirements.txt b/dev-requirements.txt
index aaa53575..011443d1 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -15,4 +15,5 @@ mccabe >= 0.7.0
 mypy >= 1.1.1
 json-schema-for-humans >= 0.45.1
 pre-commit >= 2.17.0
+# This is the last version of black supporting Python 3.7
 black == 23.3.0
diff --git a/wfexs_backend/payloads/nodejs_wrapper.bash b/wfexs_backend/payloads/nodejs_wrapper.bash
index c6ca95de..11fae27a 100644
--- a/wfexs_backend/payloads/nodejs_wrapper.bash
+++ b/wfexs_backend/payloads/nodejs_wrapper.bash
@@ -18,7 +18,7 @@ done
 set -e
 
 singCommand="$(type -P singularity || true)"
-if [ -n "$singCommand"] ; then
+if [ -n "$singCommand" ] ; then
 	NODEWRAPPERSIFDIR="$(realpath "$(dirname "$0")" 2> /dev/null)"/../share
 	if [ ! 
-d "$NODEWRAPPERSIFDIR" ] ; then mkdir -p "$NODEWRAPPERSIFDIR" From afc2913ae2d41556e07c86681da0949a18196ba2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 15 May 2024 12:02:42 +0200 Subject: [PATCH 24/42] Added fast check to fail fast on wrong python code --- .pre-commit-config.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8f00b218..19693bad 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,13 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 hooks: + # This one has been added to fail fast on syntax errors + # before running expensive pylint + - id: check-ast + types_or: + - python + - pyi + exclude: "/(site-packages|development-[^/]*|docs|node_modules|__pycache__|\\..*)/$" - id: check-json - id: check-yaml types: [file] From e46d902445cee36ce6a51df19e06448925a709da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 15 May 2024 12:03:19 +0200 Subject: [PATCH 25/42] Refined the place where an assertion is performed --- wfexs_backend/workflow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 26a04b99..6bb73af4 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1723,6 +1723,7 @@ def injectInputs( "injectInputs is being deprecated", PendingDeprecationWarning, stacklevel=2 ) if workflowInputs_destdir is None: + assert self.inputsDir is not None workflowInputs_destdir = self.inputsDir if workflowInputs_cacheDir is None: workflowInputs_cacheDir = CacheType.Input @@ -1735,8 +1736,6 @@ def injectInputs( "Cannot inject inputs as the store directory is undefined" ) - assert workflowInputs_destdir is not None - for path in paths: # We are sending the context name thinking in the future, # as it could contain potential hints for authenticated access From c9f4c9dd99e6f496685dbc88ecaa0417d87c392f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 15 May 2024 19:07:00 +0200 Subject: [PATCH 26/42] `engineVer` parameter passed to both Nextflow and CWL engines is honored. 
--- wfexs_backend/workflow_engines/cwl_engine.py | 3 --- wfexs_backend/workflow_engines/nextflow_engine.py | 12 ++++++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/wfexs_backend/workflow_engines/cwl_engine.py b/wfexs_backend/workflow_engines/cwl_engine.py index e100141b..0e9f3e9d 100644 --- a/wfexs_backend/workflow_engines/cwl_engine.py +++ b/wfexs_backend/workflow_engines/cwl_engine.py @@ -341,9 +341,6 @@ def identifyWorkflow( # TODO: select the minimum cwltool version based on cwlVersion # TODO: Check best version of the engine - if localWf.relPath is not None: - engineVer = self.cwltool_version - if engineVer is None: engineVer = self.cwltool_version diff --git a/wfexs_backend/workflow_engines/nextflow_engine.py b/wfexs_backend/workflow_engines/nextflow_engine.py index 737d53a1..7b81deb5 100644 --- a/wfexs_backend/workflow_engines/nextflow_engine.py +++ b/wfexs_backend/workflow_engines/nextflow_engine.py @@ -452,7 +452,6 @@ def identifyWorkflow( # Let's record all the configuration files nxfScripts: "MutableSequence[RelPath]" = [] absolutePutativeCandidateNf: "Optional[AbsPath]" = None - engineVer = None minimalEngineVer = None kw_20_04_Pat: "Optional[Pattern[str]]" = re.compile( r"\$(?:(?:launchDir|moduleDir|projectDir)|\{(?:launchDir|moduleDir|projectDir)\})" @@ -533,7 +532,16 @@ def identifyWorkflow( putativeEngineVerVal[1] ) if matched: - engineVer = cast("EngineVersion", matched.group(1)) + if engineVer is None or engineVer < matched.group( + 1 + ): + engineVer = cast( + "EngineVersion", matched.group(1) + ) + else: + self.logger.info( + f"Manifest reports version {matched.group(1)}, but version {engineVer} was requested" + ) break else: self.logger.debug( From aef6fe0b14d29ec3190021000f291f0cffc48db1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 15 May 2024 19:08:18 +0200 Subject: [PATCH 27/42] `materializeEngine` now has an explicit parameter to skip or perform the identification phase. 
--- wfexs_backend/workflow_engines/__init__.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/wfexs_backend/workflow_engines/__init__.py b/wfexs_backend/workflow_engines/__init__.py index 7bc6a879..647858ac 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -253,7 +253,10 @@ def staged_containers_dir(self) -> "AnyPath": @abc.abstractmethod def materializeEngine( - self, localWf: "LocalWorkflow", engineVersion: "Optional[EngineVersion]" = None + self, + localWf: "LocalWorkflow", + engineVersion: "Optional[EngineVersion]" = None, + do_identify: "bool" = False, ) -> "Optional[MaterializedWorkflowEngine]": pass @@ -672,7 +675,10 @@ def GetEngineVersion( return matWfEng.instance._get_engine_version_str(matWfEng) def materializeEngine( - self, localWf: "LocalWorkflow", engineVersion: "Optional[EngineVersion]" = None + self, + localWf: "LocalWorkflow", + engineVersion: "Optional[EngineVersion]" = None, + do_identify: "bool" = False, ) -> "Optional[MaterializedWorkflowEngine]": """ Method to ensure the required engine version is materialized @@ -681,7 +687,7 @@ def materializeEngine( """ # This method can be forced to materialize an specific engine version - if engineVersion is None: + if do_identify or engineVersion is None: # The identification could return an augmented LocalWorkflow instance resLocalWf: "Optional[LocalWorkflow]" engineVersion, resLocalWf = self.identifyWorkflow(localWf, engineVersion) From 9e5097a6de880946c90a7aa8372e26ac2716e5d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 16 May 2024 16:03:16 +0200 Subject: [PATCH 28/42] Fixes on RO-Crate generation, as values for some predicates do not have to be "stringified" --- wfexs_backend/ro_crate.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index 5ceb68b1..0ea6d277 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -1377,7 +1377,8 @@ def addWorkflowInputs( # inputs and environment variables in an standardized way self.wf_file.append_to(fp_dest, formal_parameter, compact=True) value_required = not in_item.implicit - formal_parameter["valueRequired"] = str(value_required) + # This one must be a real boolean, as of schema.org + formal_parameter["valueRequired"] = value_required item_signature = cast( "bytes", @@ -1545,7 +1546,12 @@ def addWorkflowInputs( if some_not_null: if in_item.implicit and len(in_item.values) == 1: - formal_parameter["defaultValue"] = str(in_item.values[0]) + the_default_value: "Union[bool,str,float,int]" + if isinstance(in_item.values[0], (bool, int, float)): + the_default_value = in_item.values[0] + else: + the_default_value = str(in_item.values[0]) + formal_parameter["defaultValue"] = the_default_value for itemInAtomicValues in cast( "Sequence[Union[bool,str,float,int]]", in_item.values @@ -1563,8 +1569,13 @@ def addWorkflowInputs( ) else: fixedAtomicValue = itemInAtomicValues + the_value: "Union[bool,str,float,int]" + if isinstance(fixedAtomicValue, (bool, int, float)): + the_value = fixedAtomicValue + else: + the_value = str(fixedAtomicValue) parameter_value = PropertyValue( - self.crate, in_item.name, str(fixedAtomicValue) + self.crate, in_item.name, value=the_value ) crate_pv = self.crate.add(parameter_value) if isinstance(crate_coll, Collection): From f6fc13d17c07f465e9b23a377a124a7fd22708ae Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 16 May 2024 17:26:30 +0200 Subject: [PATCH 29/42] An additional fix related to FormalParameter conformation --- wfexs_backend/ro_crate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index 0ea6d277..09424139 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -185,7 +185,7 @@ def __init__( "name": name, # As of https://www.researchobject.org/ro-crate/1.1/workflows.html#describing-inputs-and-outputs "conformsTo": { - "@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE/", + "@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE", }, } From 6080943202f5fdcc7ed8a4c2108bbc21aad6172a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 16 May 2024 18:28:21 +0200 Subject: [PATCH 30/42] Workflow execution inputs for retrospective RO-Crates are properly extracted. --- wfexs_backend/utils/rocrate.py | 326 +++++++++++++++++++++++++++------ wfexs_backend/workflow.py | 44 ++++- 2 files changed, 313 insertions(+), 57 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index bd806e7a..d135f689 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -41,6 +41,7 @@ Optional, Sequence, Tuple, + Union, ) from typing_extensions import ( @@ -50,7 +51,12 @@ from ..common import ( ContainerOperatingSystem, Fingerprint, + MutableParamsBlock, + ParamsBlock, ProcessorArchitecture, + RelPath, + RepoURL, + RepoTag, URIType, WritableWorkflowMetaConfigBlock, ) @@ -59,6 +65,10 @@ WfExSBackend, ) + from ..workflow_engines import ( + WorkflowType, + ) + # Needed by pyld to detect it import aiohttp import pyld # type: ignore[import, import-untyped] @@ -67,6 +77,8 @@ from ..common import ( ContainerType, + ContentKind, + RemoteRepo, ) from ..container_factories import ( @@ -146,10 +158,12 @@ class ROCrateToolbox(abc.ABC): WFEXS_TRICK_SPARQL_BASE: "Final[str]" = f"{WFEXS_TRICK_SPARQL_PRE_PREFIX}///" WFEXS_TRICK_SPARQL_NS: "Final[str]" = "wfexs" + SCHEMA_ORG_PREFIX: "Final[str]" = "http://schema.org/" + SPARQL_NS = { "dc": "http://purl.org/dc/elements/1.1/", "dcterms": "http://purl.org/dc/terms/", - "s": "http://schema.org/", + "s": SCHEMA_ORG_PREFIX, "bs": "https://bioschemas.org/", "bsworkflow": "https://bioschemas.org/profiles/ComputationalWorkflow/", "rocrate": "https://w3id.org/ro/crate/", @@ -162,6 +176,15 @@ class ROCrateToolbox(abc.ABC): WFEXS_TRICK_SPARQL_NS: WFEXS_TRICK_SPARQL_BASE, } + LEAF_TYPE_2_ADDITIONAL_TYPE: "Final[Mapping[str, str]]" = { + SCHEMA_ORG_PREFIX + "Integer": "Integer", + SCHEMA_ORG_PREFIX + "Text": "Text", + SCHEMA_ORG_PREFIX + "Boolean": "Boolean", + SCHEMA_ORG_PREFIX + "Float": "Float", + SCHEMA_ORG_PREFIX + "MediaObject": "File", + SCHEMA_ORG_PREFIX + "Dataset": "Directory", + } + def __init__(self, wfexs: "WfExSBackend"): if wfexs is None: raise ROCrateToolboxException( @@ -215,6 +238,13 @@ def __init__(self, wfexs: "WfExSBackend"): } } } +""" + + GET_LICENCES_SPARQL: "Final[str]" = """\ +SELECT ?license +WHERE { + ?entity s:license ?license . 
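+    # ?entity is left unbound on purpose: _getLicences binds it per call
+    # through initBindings, so the query lists the licences declared for
+    # one specific entity.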
+} """ def identifyROCrate( @@ -307,7 +337,7 @@ def identifyROCrate( return (resrow, g) OBTAIN_WORKFLOW_PID_SPARQL: "Final[str]" = """\ -SELECT ?identifier ?programminglanguage_identifier ?programminglanguage_url ?programminglanguage_version +SELECT ?identifier ?workflow_repository ?workflow_version ?workflow_url ?workflow_alternate_name ?programminglanguage_identifier ?programminglanguage_url ?programminglanguage_version WHERE { ?mainentity s:programmingLanguage ?programminglanguage . ?programminglanguage @@ -316,6 +346,18 @@ def identifyROCrate( OPTIONAL { ?mainentity s:identifier ?identifier . } + OPTIONAL { + ?mainentity s:codeRepository ?workflow_repository . + } + OPTIONAL { + ?mainentity s:version ?workflow_version . + } + OPTIONAL { + ?mainentity s:url ?workflow_url . + } + OPTIONAL { + ?mainentity s:alternateName ?workflow_alternate_name . + } OPTIONAL { ?programminglanguage s:version ?programminglanguage_version . @@ -389,7 +431,7 @@ def identifyROCrate( ?execution s:object ?input . { # A file, which is a schema.org MediaObject - VALUES (?additional_type) { ( "File" ) } + BIND ( "File" AS ?additional_type ) ?input a s:MediaObject ; s:contentUrl ?fileuri ; @@ -400,18 +442,25 @@ def identifyROCrate( s:additionalType ?additional_type . } UNION { # A directory, which is a schema.org Dataset - VALUES (?additional_type) { ( "Dataset" ) } + BIND ( "Dataset" AS ?additional_type ) ?input a s:Dataset ; s:contentUrl ?fileuri ; - s:exampleOfWork ?inputfp ; - s:hasPart+ ?component . + s:exampleOfWork ?inputfp . ?inputfp a bs:FormalParameter ; s:name ?name ; s:additionalType ?additional_type . - ?component - a s:MediaObject . + FILTER EXISTS { + # subquery to determine it is not an empty Dataset + SELECT ?dircomp + WHERE { + ?input + s:hasPart+ ?dircomp . + ?dircomp + a s:MediaObject . + } + } } UNION { # A single property value, which can be either Integer, Text, Boolean or Float VALUES (?additional_type) { ( "Integer" ) ( "Text" ) ( "Boolean" ) ( "Float" ) } @@ -425,7 +474,8 @@ def identifyROCrate( s:additionalType ?additional_type . 
} UNION { # A combination of files or directories or property values - VALUES (?leaf_type ?additional_type) { ( s:Integer "Collection" ) ( s:Text "Collection" ) ( s:Boolean "Collection" ) ( s:Float "Collection" ) ( s:MediaObject "Collection" ) ( s:Dataset "Collection" ) } + BIND ( "Collection" AS ?additional_type ) + VALUES ( ?leaf_type ) { ( s:Integer ) ( s:Text ) ( s:Boolean ) ( s:Float ) ( s:MediaObject ) ( s:Dataset ) } ?input a s:Collection ; s:exampleOfWork ?inputfp ; @@ -622,7 +672,9 @@ def _parseInputsFromExecution( g: "rdflib.graph.Graph", execution: "rdflib.term.Identifier", main_entity: "rdflib.term.Identifier", - ) -> "None": + default_licences: "Sequence[str]", + public_name: "str", + ) -> "ParamsBlock": # Get the list of inputs qinputs = rdflib.plugins.sparql.prepareQuery( self.OBTAIN_INPUTS_SPARQL, @@ -636,37 +688,145 @@ def _parseInputsFromExecution( ) # TODO: implement this + params: "MutableParamsBlock" = {} + for inputrow in qinputsres: + assert isinstance( + inputrow, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" - return None + base = params + param_path = str(inputrow.name).split(".") + param_last = param_path[-1] + + # Reaching the relative position + if len(param_path) > 1: + for param_step in param_path[0:-1]: + base = base.setdefault(param_step, {}) + + # Now, fill in the values + additional_type = str(inputrow.additional_type) + valarr: "Optional[MutableSequence[Any]]" = None + valobj: "Optional[MutableMapping[str, Any]]" = None + # Is it a nested one? + if additional_type == "Collection": + leaf_type = str(inputrow.leaf_type) + leaf_additional_type = self.LEAF_TYPE_2_ADDITIONAL_TYPE.get(leaf_type) + if leaf_additional_type is None: + raise ROCrateToolboxException( + f"Unable to handle contents of type {leaf_type} in input Collection {str(inputrow.name)}" + ) + additional_type = leaf_additional_type + if leaf_additional_type not in ("File", "Dataset"): + valarr = base.setdefault(param_last, []) + + # Is it a file or a directory? 
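+            # Note: the "c-l-a-s-s" key used below mirrors the syntax of
+            # the params block in a staging definition, so the mapping
+            # built here can be fed back directly as workflow_meta["params"].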
+ if additional_type in ("File", "Dataset"): + valobj = base.setdefault( + param_last, + { + "c-l-a-s-s": ContentKind.Directory.name + if additional_type == "Dataset" + else ContentKind.File.name, + }, + ) - def generateWorkflowMetaFromJSONLD( + if isinstance(valobj, dict): + licences = self._getLicences(g, inputrow.input, public_name) + if len(licences) == 0: + licences = default_licences + the_url: "Union[str, Mapping[str, Any]]" + if len(licences) == 0: + the_url = str(inputrow.fileuri) + else: + the_url = { + "uri": str(inputrow.fileuri), + "licences": licences, + } + + valurl = valobj.get("url") + if isinstance(valurl, (str, dict)): + valurl = [valurl] + valobj["url"] = valurl + + if isinstance(valurl, list): + valurl.append(the_url) + else: + valobj["url"] = the_url + else: + the_value_node: "rdflib.term.Identifier" = inputrow.value + the_value: "Union[str, int, float, bool]" + if isinstance(the_value_node, rdflib.term.Literal): + the_value = the_value_node.value + else: + the_value = str(the_value_node) + + if additional_type == "Integer": + try: + the_value = int(the_value) + except: + self.logger.exception( + f"Expected type {additional_type} for value {the_value}" + ) + elif additional_type == "Boolean": + the_value = bool(the_value) + elif additional_type == "Float": + the_value = float(the_value) + elif additional_type == "Text": + the_value = str(the_value) + else: + raise ROCrateToolboxException( + f"Unable to handle additional type {additional_type} for input {str(inputrow.name)}" + ) + + if isinstance(valarr, list): + valarr.append(the_value) + else: + base[param_last] = the_value + + return params + + def _getLicences( self, - jsonld_obj: "Mapping[str, Any]", + g: "rdflib.graph.Graph", + entity: "rdflib.term.Identifier", public_name: "str", - retrospective_first: "bool" = True, - ) -> "Tuple[WritableWorkflowMetaConfigBlock, Sequence[Container]]": - matched_crate, g = self.identifyROCrate(jsonld_obj, public_name) - # Is it an RO-Crate? 
- if matched_crate is None: - raise ROCrateToolboxException( - f"JSON-LD from {public_name} is not an RO-Crate" + ) -> "Sequence[str]": + # This query will return the list of licences associated to the + # input entity + qlic = rdflib.plugins.sparql.prepareQuery( + self.GET_LICENCES_SPARQL, + initNs=self.SPARQL_NS, + ) + # TODO: cache resolution of contexts + # TODO: disallow network access for context resolution + # when not in right phase + try: + qlicres = g.query( + qlic, + initBindings={ + "entity": entity, + }, ) - - if matched_crate.wfcrateprofile is None: + except Exception as e: raise ROCrateToolboxException( - f"JSON-LD from {public_name} is not a Workflow RO-Crate" - ) + f"Unable to perform JSON-LD workflow details query over {public_name} (see cascading exceptions)" + ) from e - if matched_crate.mainentity is None: - raise ROCrateToolboxException( - f"Unable to find the main entity workflow at {public_name} Workflow RO-Crate" - ) + licences: "MutableSequence[str]" = [] + for licrow in qlicres: + assert isinstance( + licrow, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + licences.append(str(licrow.license)) - if matched_crate.wrwfprofile is None: - raise ROCrateToolboxException( - f"JSON-LD from {public_name} is not a WRROC Workflow" - ) + return licences + def _extractWorkflowMetadata( + self, + g: "rdflib.graph.Graph", + main_entity: "rdflib.term.Identifier", + public_name: "str", + ) -> "Tuple[RemoteRepo, WorkflowType]": # This query will tell us where the original workflow was located, # its language and version qlang = rdflib.plugins.sparql.prepareQuery( @@ -681,7 +841,7 @@ def generateWorkflowMetaFromJSONLD( qlangres = g.query( qlang, initBindings={ - "mainentity": matched_crate.mainentity, + "mainentity": main_entity, }, ) except Exception as e: @@ -701,9 +861,42 @@ def generateWorkflowMetaFromJSONLD( if langrow is None: raise ROCrateToolboxException( - f"Unable to get workflow engine details from {public_name}" + f"Unable to get workflow PID and engine details from {public_name}" ) + # Creating the workflow permanent identifier + repo_pid: "str" + if langrow.workflow_repository is not None: + repo_pid = str(langrow.workflow_repository) + + elif langrow.identifier is not None: + repo_pid = str(langrow.identifier) + elif langrow.workflow_url is not None: + repo_pid = str(langrow.workflow_url) + else: + raise ROCrateToolboxException( + f"Unable to infer the permanent identifier from the workflow at {public_name}" + ) + + repo_version: "Optional[str]" = None + if langrow.workflow_version: + repo_version = str(langrow.workflow_version) + + repo_relpath: "Optional[str]" = None + if langrow.workflow_alternate_name is not None: + repo_relpath = str(langrow.workflow_alternate_name) + + repo_web_url: "Optional[str]" = None + if langrow.workflow_url is not None: + repo_web_url = str(langrow.workflow_url) + + repo = RemoteRepo( + repo_url=cast("RepoURL", repo_pid), + tag=cast("Optional[RepoTag]", repo_version), + rel_path=cast("Optional[RelPath]", repo_relpath), + web_url=cast("Optional[URIType]", repo_web_url), + ) + programminglanguage_url = ( None if langrow.programminglanguage_url is None @@ -721,6 +914,43 @@ def generateWorkflowMetaFromJSONLD( programminglanguage_url, programminglanguage_identifier ) + return repo, workflow_type + + def generateWorkflowMetaFromJSONLD( + self, + jsonld_obj: "Mapping[str, Any]", + public_name: "str", + retrospective_first: "bool" = True, + ) -> "Tuple[RemoteRepo, WorkflowType, ContainerType, 
Sequence[Container], ParamsBlock]": + matched_crate, g = self.identifyROCrate(jsonld_obj, public_name) + # Is it an RO-Crate? + if matched_crate is None: + raise ROCrateToolboxException( + f"JSON-LD from {public_name} is not an RO-Crate" + ) + + if matched_crate.wfcrateprofile is None: + raise ROCrateToolboxException( + f"JSON-LD from {public_name} is not a Workflow RO-Crate" + ) + + if matched_crate.mainentity is None: + raise ROCrateToolboxException( + f"Unable to find the main entity workflow at {public_name} Workflow RO-Crate" + ) + + if matched_crate.wrwfprofile is None: + raise ROCrateToolboxException( + f"JSON-LD from {public_name} is not a WRROC Workflow" + ) + + # The default crate licences + crate_licences = self._getLicences(g, matched_crate.mainentity, public_name) + + repo, workflow_type = self._extractWorkflowMetadata( + g, matched_crate.mainentity, public_name + ) + # At this point we know WfExS supports the workflow engine. # Now it is the moment to choose whether to use one of the stored # executions as template (retrospective provenance) @@ -728,6 +958,7 @@ def generateWorkflowMetaFromJSONLD( container_type: "Optional[ContainerType]" = None additional_container_type: "Optional[ContainerType]" = None the_containers: "Sequence[Container]" = [] + params: "ParamsBlock" = {} if retrospective_first: # For the retrospective provenance at least an execution must # be described in the RO-Crate. Once one is chosen, @@ -765,8 +996,12 @@ def generateWorkflowMetaFromJSONLD( # TODO: which are the needed inputs, to be integrated # into the latter workflow_meta? - self._parseInputsFromExecution( - g, execrow.execution, main_entity=matched_crate.mainentity + params = self._parseInputsFromExecution( + g, + execrow.execution, + main_entity=matched_crate.mainentity, + default_licences=crate_licences, + public_name=public_name, ) # Now, let's get the list of input parameters @@ -777,21 +1012,6 @@ def generateWorkflowMetaFromJSONLD( ) from e # TODO: finish + assert container_type is not None - self.logger.info( - f"Workflow type {workflow_type} container factory {container_type} {additional_container_type}" - ) - workflow_meta: "WritableWorkflowMetaConfigBlock" = { - "workflow_id": {}, - "workflow_type": workflow_type.shortname, - "environment": {}, - "params": {}, - "outputs": {}, - "workflow_config": {}, - } - if container_type is not None: - workflow_meta["workflow_config"]["containerType"] = container_type.value - - self.logger.info(f"{json.dumps(workflow_meta, indent=4)}") - - return workflow_meta, the_containers + return repo, workflow_type, container_type, the_containers, params diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 6bb73af4..f4799436 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1389,11 +1389,29 @@ def FromPreviousROCrate( ) from jde ( - workflow_meta, + repo, + workflow_type, + container_type, the_containers, + params, ) = wfexs.rocrate_toolbox.generateWorkflowMetaFromJSONLD( jsonld_obj, public_name ) + logging.info( + f"Repo {repo} workflow type {workflow_type} container factory {container_type}" + ) + workflow_meta: "WritableWorkflowMetaConfigBlock" = { + "workflow_id": {}, + "workflow_type": workflow_type.shortname, + "environment": {}, + "params": params, + "outputs": {}, + "workflow_config": {}, + } + if container_type is not None: + workflow_meta["workflow_config"]["containerType"] = container_type.value + + logging.info(f"{json.dumps(workflow_meta, indent=4)}") # Last, be sure that what it has been generated is 
correct if wfexs.validateConfigFiles(workflow_meta, securityContextsConfigFilename) > 0: @@ -1637,7 +1655,12 @@ def fetchWorkflow( self.engineVer = engineVer self.localWorkflow = candidateLocalWorkflow - def setupEngine(self, offline: "bool" = False, ignoreCache: "bool" = False) -> None: + def setupEngine( + self, + offline: "bool" = False, + ignoreCache: "bool" = False, + initial_engine_version: "Optional[EngineVersion]" = None, + ) -> None: # The engine is populated by self.fetchWorkflow() if self.engine is None: assert self.id is not None @@ -1654,13 +1677,26 @@ def setupEngine(self, offline: "bool" = False, ignoreCache: "bool" = False) -> N self.engine is not None ), "Workflow engine not properly identified or set up" + engine_version: "Optional[EngineVersion]" if self.materializedEngine is None: assert self.localWorkflow is not None localWorkflow = self.localWorkflow + do_identify = True + if self.engineVer is not None: + engine_version = self.engineVer + else: + engine_version = initial_engine_version else: localWorkflow = self.materializedEngine.workflow - - matWfEngV2 = self.engine.materializeEngine(localWorkflow, self.engineVer) + engine_version = self.materializedEngine.version + do_identify = False + + # This is to avoid double initialization + matWfEngV2 = self.engine.materializeEngine( + localWorkflow, + engineVersion=engine_version, + do_identify=do_identify, + ) # At this point, there can be uninitialized elements if matWfEngV2 is not None: From 3be737db37d534c8f5310fc7c33b080652a54600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 17 May 2024 12:45:40 +0200 Subject: [PATCH 31/42] Added codepath to extract inputs and containers from WRROCs describing a prospective provenance scenario (i.e. all is gathered, but nothing was run) --- wfexs_backend/utils/rocrate.py | 287 ++++++++++++++++++++++++++++++--- 1 file changed, 262 insertions(+), 25 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index d135f689..2bf327c0 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -129,12 +129,25 @@ class ContainerTypeMetadata(NamedTuple): WORKFLOW_RUN_NAMESPACE: "Final[str]" = WORKFLOW_RUN_CONTEXT + "#" +CONTAINER_DOCKERIMAGE_SHORT: "Final[str]" = "DockerImage" +CONTAINER_SIFIMAGE_SHORT: "Final[str]" = "SIFImage" + + class ContainerImageAdditionalType(enum.Enum): - Docker = WORKFLOW_RUN_NAMESPACE + "DockerImage" - Singularity = WORKFLOW_RUN_NAMESPACE + "SIFImage" + Docker = WORKFLOW_RUN_NAMESPACE + CONTAINER_DOCKERIMAGE_SHORT + Singularity = WORKFLOW_RUN_NAMESPACE + CONTAINER_SIFIMAGE_SHORT # No one is available for Conda yet +# This is needed to match ill implementations +StrContainerAdditionalType2ContainerImageAdditionalType: "Final[Mapping[str, ContainerImageAdditionalType]]" = { + ContainerImageAdditionalType.Docker.value: ContainerImageAdditionalType.Docker, + CONTAINER_DOCKERIMAGE_SHORT: ContainerImageAdditionalType.Docker, + ContainerImageAdditionalType.Singularity.value: ContainerImageAdditionalType.Singularity, + CONTAINER_SIFIMAGE_SHORT: ContainerImageAdditionalType.Singularity, +} + + ContainerType2AdditionalType: "Final[Mapping[ContainerType, ContainerImageAdditionalType]]" = { ContainerType.Docker: ContainerImageAdditionalType.Docker, ContainerType.Singularity: ContainerImageAdditionalType.Singularity, @@ -165,7 +178,8 @@ class ROCrateToolbox(abc.ABC): "dcterms": "http://purl.org/dc/terms/", "s": SCHEMA_ORG_PREFIX, "bs": "https://bioschemas.org/", - "bsworkflow": 
"https://bioschemas.org/profiles/ComputationalWorkflow/", + "bswfprofile": "https://bioschemas.org/profiles/ComputationalWorkflow/", + "bsworkflow": "https://bioschemas.org/ComputationalWorkflow#", "rocrate": "https://w3id.org/ro/crate/", "wfcrate": "https://w3id.org/workflowhub/workflow-ro-crate/", "wfhprofile": "https://about.workflowhub.eu/Workflow-RO-Crate/", @@ -223,7 +237,7 @@ def __init__(self, wfexs: "WfExSBackend"): a bs:ComputationalWorkflow ; dcterms:conformsTo ?bsworkflowprofile . FILTER ( - STRSTARTS(str(?bsworkflowprofile), str(bsworkflow:)) + STRSTARTS(str(?bsworkflowprofile), str(bswfprofile:)) ) . } OPTIONAL { @@ -346,15 +360,6 @@ def identifyROCrate( OPTIONAL { ?mainentity s:identifier ?identifier . } - OPTIONAL { - ?mainentity s:codeRepository ?workflow_repository . - } - OPTIONAL { - ?mainentity s:version ?workflow_version . - } - OPTIONAL { - ?mainentity s:url ?workflow_url . - } OPTIONAL { ?mainentity s:alternateName ?workflow_alternate_name . } @@ -366,6 +371,45 @@ def identifyROCrate( ?programminglanguage s:identifier ?programminglanguage_identifier . } + { + { + FILTER NOT EXISTS { + ?mainentity s:isBasedOn ?origmainentity . + ?origmainentity + a bs:ComputationalWorkflow ; + dcterms:conformsTo ?bsworkflowprofile . + FILTER ( + STRSTARTS(str(?bsworkflowprofile), str(bswfprofile:)) + ) . + } + OPTIONAL { + ?mainentity s:codeRepository ?workflow_repository . + } + OPTIONAL { + ?mainentity s:version ?workflow_version . + } + OPTIONAL { + ?mainentity s:url ?workflow_url . + } + } UNION { + ?mainentity s:isBasedOn ?origmainentity . + ?origmainentity + a bs:ComputationalWorkflow ; + dcterms:conformsTo ?bsworkflowprofile . + OPTIONAL { + ?origmainentity s:codeRepository ?workflow_repository . + } + OPTIONAL { + ?origmainentity s:version ?workflow_version . + } + OPTIONAL { + ?origmainentity s:url ?workflow_url . + } + FILTER ( + STRSTARTS(str(?bsworkflowprofile), str(bswfprofile:)) + ) . + } + } } """ @@ -382,7 +426,55 @@ def identifyROCrate( OBTAIN_RUN_CONTAINERS_SPARQL: "Final[str]" = """\ SELECT ?container ?container_additional_type ?type_of_container ?type_of_container_type ?container_registry ?container_name ?container_tag ?container_sha256 ?container_platform ?container_arch WHERE { - ?execution wrterm:containerImage ?container . + { + ?execution wrterm:containerImage ?container . + } UNION { + ?entity s:softwareAddOn ?container. + } + ?container + a wrterm:ContainerImage ; + s:additionalType ?container_additional_type . + OPTIONAL { + ?container + s:softwareRequirements ?container_type ; + s:applicationCategory ?type_of_container . + ?container_type + a s:SoftwareApplication ; + s:applicationCategory ?type_of_container_type . + FILTER( + STRSTARTS(str(?type_of_container), str(wikidata:)) && + STRSTARTS(str(?type_of_container_type), str(wikidata:)) + ) . + } + OPTIONAL { + ?container wrterm:registry ?container_registry . + } + OPTIONAL { + ?container s:name ?container_name . + } + OPTIONAL { + ?container wrterm:tag ?container_tag . + } + OPTIONAL { + ?container wrterm:sha256 ?container_sha256 . + } + OPTIONAL { + ?container + a s:SoftwareApplication ; + s:operatingSystem ?container_platform . + } + OPTIONAL { + ?container + a s:SoftwareApplication ; + s:processorRequirements ?container_arch . 
+ } +} +""" + + OBTAIN_WF_CONTAINERS_SPARQL: "Final[str]" = """\ +SELECT ?container ?container_additional_type ?type_of_container ?type_of_container_type ?container_registry ?container_name ?container_tag ?container_sha256 ?container_platform ?container_arch +WHERE { + ?entity s:softwareAddOn ?container. ?container a wrterm:ContainerImage ; s:additionalType ?container_additional_type . @@ -425,8 +517,62 @@ def identifyROCrate( # This compound query is much faster when each of the UNION components # is evaluated separatedly - OBTAIN_INPUTS_SPARQL: "Final[str]" = """\ -SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?value ?inputcol ?component ?leaf_type + OBTAIN_WORKFLOW_INPUTS_SPARQL: "Final[str]" = """\ +SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?value ?component ?leaf_type +WHERE { + ?main_entity bsworkflow:input ?inputfp . + ?inputfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type ; + s:workExample ?input . + { + # A file, which is a schema.org MediaObject + ?input + a s:MediaObject ; + s:contentUrl ?fileuri . + } UNION { + # A directory, which is a schema.org Dataset + ?input + a s:Dataset ; + s:contentUrl ?fileuri . + FILTER EXISTS { + # subquery to determine it is not an empty Dataset + SELECT ?dircomp + WHERE { + ?input + s:hasPart+ ?dircomp . + ?dircomp + a s:MediaObject . + } + } + } UNION { + # A single property value, which can be either Integer, Text, Boolean or Float + ?input + a s:PropertyValue ; + s:value ?value . + } UNION { + # A combination of files or directories or property values + VALUES ( ?leaf_type ) { ( s:Integer ) ( s:Text ) ( s:Boolean ) ( s:Float ) ( s:MediaObject ) ( s:Dataset ) } + ?input + a s:Collection ; + s:hasPart+ ?component . + ?component + a ?leaf_type . + OPTIONAL { + ?component s:contentUrl ?fileuri . + } + OPTIONAL { + ?component s:value ?value . + } + } +} +""" + + # This compound query is much faster when each of the UNION components + # is evaluated separatedly + OBTAIN_EXECUTION_INPUTS_SPARQL: "Final[str]" = """\ +SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?value ?component ?leaf_type WHERE { ?execution s:object ?input . 
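+    # The UNION branches below tell apart files (MediaObject), directories
+    # (Dataset), atomic values (PropertyValue) and nested Collections.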
{ @@ -496,6 +642,26 @@ def identifyROCrate( } """ + def _parseContainersFromWorkflow( + self, + g: "rdflib.graph.Graph", + main_entity: "rdflib.term.Identifier", + ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": + # Get the list of containers + qcontainers = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_RUN_CONTAINERS_SPARQL, + initNs=self.SPARQL_NS, + ) + qcontainersres = g.query( + qcontainers, + initBindings={ + "execution": rdflib.term.Literal(None), + "entity": main_entity, + }, + ) + + return self.__parseContainersResults(qcontainersres, main_entity) + def _parseContainersFromExecution( self, g: "rdflib.graph.Graph", @@ -511,9 +677,17 @@ def _parseContainersFromExecution( qcontainers, initBindings={ "execution": execution, + "entity": rdflib.term.Literal(None), }, ) + return self.__parseContainersResults(qcontainersres, main_entity) + + def __parseContainersResults( + self, + qcontainersres: "rdflib.query.Result", + main_entity: "rdflib.term.Identifier", + ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": container_type: "Optional[ContainerType]" = None additional_container_type: "Optional[ContainerType]" = None the_containers: "MutableSequence[Container]" = [] @@ -543,10 +717,17 @@ def _parseContainersFromExecution( # implementation if containerrow.container_additional_type is not None: try: + putative_additional_container_image_additional_type = ( + StrContainerAdditionalType2ContainerImageAdditionalType.get( + str(containerrow.container_additional_type) + ) + ) putative_additional_container_type = ( - AdditionalType2ContainerType.get( - ContainerImageAdditionalType( - str(containerrow.container_additional_type) + None + if putative_additional_container_image_additional_type is None + else ( + AdditionalType2ContainerType.get( + putative_additional_container_image_additional_type ) ) ) @@ -599,10 +780,17 @@ def _parseContainersFromExecution( and containerrow.container_name is not None ): try: + putative_additional_container_image_additional_type = ( + StrContainerAdditionalType2ContainerImageAdditionalType.get( + str(containerrow.container_additional_type) + ) + ) putative_additional_container_type = ( - AdditionalType2ContainerType.get( - ContainerImageAdditionalType( - str(containerrow.container_additional_type) + None + if putative_additional_container_image_additional_type is None + else ( + AdditionalType2ContainerType.get( + putative_additional_container_image_additional_type ) ) ) @@ -621,9 +809,11 @@ def _parseContainersFromExecution( ContainerType.Docker: the_registry, } container_identifier = str(containerrow.container_name) - assert containerrow.container_sha256 is not None - fingerprint = f"{the_registry}/{container_identifier}@sha256:{str(containerrow.container_sha256)}" assert containerrow.container_tag is not None + if containerrow.container_sha256 is not None: + fingerprint = f"{the_registry}/{container_identifier}@sha256:{str(containerrow.container_sha256)}" + else: + fingerprint = f"{the_registry}/{container_identifier}:{str(containerrow.container_tag)}" origTaggedName = ( f"{container_identifier}:{str(containerrow.container_tag)}" ) @@ -677,7 +867,7 @@ def _parseInputsFromExecution( ) -> "ParamsBlock": # Get the list of inputs qinputs = rdflib.plugins.sparql.prepareQuery( - self.OBTAIN_INPUTS_SPARQL, + self.OBTAIN_EXECUTION_INPUTS_SPARQL, initNs=self.SPARQL_NS, ) qinputsres = g.query( @@ -687,6 +877,36 @@ def _parseInputsFromExecution( }, ) + return self.__parseInputsResults(qinputsres, g, default_licences, public_name) + + def 
_parseInputsFromMainEntity( + self, + g: "rdflib.graph.Graph", + main_entity: "rdflib.term.Identifier", + default_licences: "Sequence[str]", + public_name: "str", + ) -> "ParamsBlock": + # Get the list of inputs + qwinputs = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_WORKFLOW_INPUTS_SPARQL, + initNs=self.SPARQL_NS, + ) + qwinputsres = g.query( + qwinputs, + initBindings={ + "main_entity": main_entity, + }, + ) + + return self.__parseInputsResults(qwinputsres, g, default_licences, public_name) + + def __parseInputsResults( + self, + qinputsres: "rdflib.query.Result", + g: "rdflib.graph.Graph", + default_licences: "Sequence[str]", + public_name: "str", + ) -> "ParamsBlock": # TODO: implement this params: "MutableParamsBlock" = {} for inputrow in qinputsres: @@ -1011,6 +1231,23 @@ def generateWorkflowMetaFromJSONLD( f"Unable to perform JSON-LD workflow execution details query over {public_name} (see cascading exceptions)" ) from e + # Following the prospective path + if len(params) == 0: + contresult = self._parseContainersFromWorkflow( + g, + main_entity=matched_crate.mainentity, + ) + # TODO: deal with more than one execution + if contresult is not None: + container_type, the_containers = contresult + + params = self._parseInputsFromMainEntity( + g, + main_entity=matched_crate.mainentity, + default_licences=crate_licences, + public_name=public_name, + ) + # TODO: finish assert container_type is not None From 098477e18b570a68a589cdaa3e6f85e57f4d3ffb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 21 May 2024 20:32:17 +0200 Subject: [PATCH 32/42] Output block is now properly built, based on SPARQL query results. Also, several type definitions have been moved from common to their most natural place. --- wfexs_backend/cache_handler.py | 1 - wfexs_backend/common.py | 24 -- wfexs_backend/container_factories/__init__.py | 9 +- .../abstract_docker_container.py | 2 - .../container_factories/docker_container.py | 4 +- .../container_factories/no_container.py | 5 +- .../container_factories/podman_container.py | 4 +- .../singularity_container.py | 7 +- wfexs_backend/ro_crate.py | 2 +- wfexs_backend/security_context.py | 3 + wfexs_backend/utils/rocrate.py | 253 +++++++++++++++++- wfexs_backend/wfexs_backend.py | 25 +- wfexs_backend/workflow.py | 38 ++- wfexs_backend/workflow_engines/__init__.py | 12 +- wfexs_backend/workflow_engines/cwl_engine.py | 9 +- .../workflow_engines/nextflow_engine.py | 9 +- 16 files changed, 331 insertions(+), 76 deletions(-) diff --git a/wfexs_backend/cache_handler.py b/wfexs_backend/cache_handler.py index e60b9b33..ad095271 100644 --- a/wfexs_backend/cache_handler.py +++ b/wfexs_backend/cache_handler.py @@ -64,7 +64,6 @@ Fingerprint, ProgsMapping, RelPath, - SecurityContextConfig, WritableSecurityContextConfig, URIType, ) diff --git a/wfexs_backend/common.py b/wfexs_backend/common.py index d0d8f751..1a7a875a 100644 --- a/wfexs_backend/common.py +++ b/wfexs_backend/common.py @@ -133,12 +133,9 @@ class EngineMode(enum.Enum): RepoURL = NewType("RepoURL", URIType) # The tag, branch or hash of a workflow in a git repository RepoTag = NewType("RepoTag", str) - # This is also an absolute path - EnginePath = NewType("EnginePath", AbsPath) # This is a container engine version ContainerEngineVersionStr = NewType("ContainerEngineVersionStr", str) - WorkflowEngineVersionStr = NewType("WorkflowEngineVersionStr", str) ContainerOperatingSystem = NewType("ContainerOperatingSystem", str) ProcessorArchitecture = NewType("ProcessorArchitecture", 
str) @@ -160,27 +157,6 @@ class EngineMode(enum.Enum): SecurityContextConfig: TypeAlias = Mapping[str, Any] WritableSecurityContextConfig: TypeAlias = MutableMapping[str, Any] - SecurityContextConfigBlock: TypeAlias = Mapping[str, SecurityContextConfig] - - # TODO: study using TypedDict - LocalConfig: TypeAlias = Mapping[str, Any] - ContainerLocalConfig: TypeAlias = Mapping[str, Any] - EngineLocalConfig: TypeAlias = Mapping[str, Any] - WorkflowConfigBlock: TypeAlias = Mapping[str, Any] - WorkflowMetaConfigBlock: TypeAlias = Mapping[str, Any] - WritableWorkflowMetaConfigBlock: TypeAlias = MutableMapping[str, Any] - WfExSConfigBlock: TypeAlias = Mapping[str, Any] - WritableWfExSConfigBlock: TypeAlias = MutableMapping[str, Any] - ExportActionBlock: TypeAlias = Mapping[str, Any] - ParamsBlock: TypeAlias = Mapping[str, Any] - EnvironmentBlock: TypeAlias = Mapping[str, Any] - MutableParamsBlock: TypeAlias = MutableMapping[str, Any] - OutputsBlock: TypeAlias = Mapping[str, Any] - PlaceHoldersBlock: TypeAlias = Mapping[str, Union[int, float, str]] - - # As each workflow engine can have its own naming convention, leave them to - # provide it - ContainerFileNamingMethod: TypeAlias = Callable[[URIType], RelPath] ## BEWARE!!!! The names of these keys MUST NOT CHANGE diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 6f16bcf8..c10118ba 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -44,6 +44,7 @@ if TYPE_CHECKING: from typing import ( Any, + Callable, ClassVar, Mapping, MutableMapping, @@ -66,8 +67,6 @@ AbsPath, AnyPath, ContainerEngineVersionStr, - ContainerFileNamingMethod, - ContainerLocalConfig, ContainerOperatingSystem, ContainerTaggedName, Fingerprint, @@ -76,6 +75,12 @@ URIType, ) + # As each workflow engine can have its own naming convention, leave them to + # provide it + ContainerFileNamingMethod: TypeAlias = Callable[[URIType], RelPath] + + ContainerLocalConfig: TypeAlias = Mapping[str, Any] + DockerLikeManifest: TypeAlias = Mapping[str, Any] MutableDockerLikeManifest: TypeAlias = MutableMapping[str, Any] diff --git a/wfexs_backend/container_factories/abstract_docker_container.py b/wfexs_backend/container_factories/abstract_docker_container.py index a4842eb6..64e18bf7 100644 --- a/wfexs_backend/container_factories/abstract_docker_container.py +++ b/wfexs_backend/container_factories/abstract_docker_container.py @@ -73,8 +73,6 @@ AbsPath, AnyPath, ContainerEngineVersionStr, - ContainerFileNamingMethod, - ContainerLocalConfig, ContainerOperatingSystem, ContainerTaggedName, ExitVal, diff --git a/wfexs_backend/container_factories/docker_container.py b/wfexs_backend/container_factories/docker_container.py index 29613aef..4101d771 100644 --- a/wfexs_backend/container_factories/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -41,8 +41,6 @@ from ..common import ( AbsPath, AnyPath, - ContainerFileNamingMethod, - ContainerLocalConfig, ContainerOperatingSystem, ContainerTaggedName, Fingerprint, @@ -52,6 +50,8 @@ ) from . 
import ( + ContainerFileNamingMethod, + ContainerLocalConfig, DockerManifestMetadata, ) diff --git a/wfexs_backend/container_factories/no_container.py b/wfexs_backend/container_factories/no_container.py index 9d74f1be..1ef7f34b 100644 --- a/wfexs_backend/container_factories/no_container.py +++ b/wfexs_backend/container_factories/no_container.py @@ -40,8 +40,6 @@ AbsPath, AnyPath, ContainerEngineVersionStr, - ContainerFileNamingMethod, - ContainerLocalConfig, ContainerOperatingSystem, ContainerTaggedName, Fingerprint, @@ -50,6 +48,9 @@ URIType, ) + from . import ( + ContainerFileNamingMethod, + ) from . import ( Container, diff --git a/wfexs_backend/container_factories/podman_container.py b/wfexs_backend/container_factories/podman_container.py index ba352976..ddf00a25 100644 --- a/wfexs_backend/container_factories/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -41,8 +41,6 @@ from ..common import ( AbsPath, AnyPath, - ContainerFileNamingMethod, - ContainerLocalConfig, ContainerOperatingSystem, ContainerTaggedName, Fingerprint, @@ -52,6 +50,8 @@ ) from . import ( + ContainerFileNamingMethod, + ContainerLocalConfig, DockerManifestMetadata, ) diff --git a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index 9bf85289..6f3a6131 100644 --- a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -59,8 +59,6 @@ from ..common import ( AbsPath, AnyPath, - ContainerFileNamingMethod, - ContainerLocalConfig, ContainerTaggedName, Fingerprint, ProcessorArchitecture, @@ -68,6 +66,11 @@ URIType, ) + from . import ( + ContainerFileNamingMethod, + ContainerLocalConfig, + ) + class SingularityManifest(TypedDict): registryServer: Required[str] registryType: Required[str] diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index 09424139..23c0e4c1 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -74,7 +74,6 @@ SymbolicOutputName, URIType, WFLangVersion, - WorkflowEngineVersionStr, ) from .container_factories import ( @@ -84,6 +83,7 @@ from .workflow_engines import ( MaterializedWorkflowEngine, WorkflowType, + WorkflowEngineVersionStr, ) from .utils.licences import ( diff --git a/wfexs_backend/security_context.py b/wfexs_backend/security_context.py index cbb95e94..fbe085ad 100644 --- a/wfexs_backend/security_context.py +++ b/wfexs_backend/security_context.py @@ -48,6 +48,9 @@ AnyPath, RelPath, SecurityContextConfig, + ) + + from .wfexs_backend import ( SecurityContextConfigBlock, ) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 2bf327c0..49bca96f 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -51,20 +51,24 @@ from ..common import ( ContainerOperatingSystem, Fingerprint, - MutableParamsBlock, - ParamsBlock, ProcessorArchitecture, RelPath, RepoURL, RepoTag, URIType, - WritableWorkflowMetaConfigBlock, ) from ..wfexs_backend import ( WfExSBackend, ) + from ..workflow import ( + MutableParamsBlock, + ParamsBlock, + MutableOutputsBlock, + OutputsBlock, + ) + from ..workflow_engines import ( WorkflowType, ) @@ -199,6 +203,13 @@ class ROCrateToolbox(abc.ABC): SCHEMA_ORG_PREFIX + "Dataset": "Directory", } + # WfExS-backend is not able to deal with collections of atomic values + # (yet) + LEAF_TYPE_2_OUTPUT_ADDITIONAL_TYPE: "Final[Mapping[str, str]]" = { + SCHEMA_ORG_PREFIX + "MediaObject": "File", + SCHEMA_ORG_PREFIX + "Dataset": 
"Directory", + } + def __init__(self, wfexs: "WfExSBackend"): if wfexs is None: raise ROCrateToolboxException( @@ -567,6 +578,23 @@ def identifyROCrate( } } } +""" + + # This compound query is much faster when each of the UNION components + # is evaluated separatedly + OBTAIN_WORKFLOW_OUTPUTS_SPARQL: "Final[str]" = """\ +SELECT ?name ?outputfp ?additional_type ?default_value +WHERE { + ?main_entity bsworkflow:output ?outputfp . + ?outputfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type . + OPTIONAL { + ?ouputfp + s:defaultValue ?default_value . + } +} """ # This compound query is much faster when each of the UNION components @@ -640,6 +668,93 @@ def identifyROCrate( } } } +""" + + # This compound query is much faster when each of the UNION components + # is evaluated separatedly + OBTAIN_EXECUTION_OUTPUTS_SPARQL: "Final[str]" = """\ +SELECT ?output ?name ?alternate_name ?outputfp ?default_value ?additional_type ?fileuri ?value ?component ?leaf_type +WHERE { + ?execution s:result ?output . + { + # A file, which is a schema.org MediaObject + BIND ( "File" AS ?additional_type ) + ?output + a s:MediaObject ; + s:exampleOfWork ?outputfp . + ?outputfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type . + OPTIONAL { + ?output + s:contentUrl ?fileuri . + } + } UNION { + # A directory, which is a schema.org Dataset + BIND ( "Dataset" AS ?additional_type ) + ?output + a s:Dataset ; + s:exampleOfWork ?outputfp . + ?outputfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type . + FILTER EXISTS { + # subquery to determine it is not an empty Dataset + SELECT ?dircomp + WHERE { + ?output + s:hasPart+ ?dircomp . + ?dircomp + a s:MediaObject . + } + } + OPTIONAL { + ?output + s:contentUrl ?fileuri . + } + } UNION { + # A single property value, which can be either Integer, Text, Boolean or Float + VALUES (?additional_type) { ( "Integer" ) ( "Text" ) ( "Boolean" ) ( "Float" ) } + ?output + a s:PropertyValue ; + s:exampleOfWork ?outputfp ; + s:value ?value . + ?outputfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type . + } UNION { + # A combination of files or directories or property values + BIND ( "Collection" AS ?additional_type ) + VALUES ( ?leaf_type ) { ( s:Integer ) ( s:Text ) ( s:Boolean ) ( s:Float ) ( s:MediaObject ) ( s:Dataset ) } + ?output + a s:Collection ; + s:exampleOfWork ?outputfp ; + s:hasPart+ ?component . + ?outputfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type . + ?component + a ?leaf_type . + OPTIONAL { + ?component s:contentUrl ?fileuri . + } + OPTIONAL { + ?component s:value ?value . + } + } + OPTIONAL { + ?ouputfp + s:defaultValue ?default_value . + } + OPTIONAL { + ?output + s:alternateName ?alternate_name . 
+ } +} """ def _parseContainersFromWorkflow( @@ -857,6 +972,119 @@ def __parseContainersResults( return container_type, the_containers + def _parseOutputsFromExecution( + self, + g: "rdflib.graph.Graph", + execution: "rdflib.term.Identifier", + main_entity: "rdflib.term.Identifier", + public_name: "str", + ) -> "OutputsBlock": + # Get the list of outputs + qoutputs = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_EXECUTION_OUTPUTS_SPARQL, + initNs=self.SPARQL_NS, + ) + qoutputsres = g.query( + qoutputs, + initBindings={ + "execution": execution, + }, + ) + + return self.__parseOutputsResults(qoutputsres, g, public_name) + + def _parseOutputsFromMainEntity( + self, + g: "rdflib.graph.Graph", + main_entity: "rdflib.term.Identifier", + public_name: "str", + ) -> "OutputsBlock": + # Get the list of outputs + qwoutputs = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_WORKFLOW_OUTPUTS_SPARQL, + initNs=self.SPARQL_NS, + ) + qwoutputsres = g.query( + qwoutputs, + initBindings={ + "main_entity": main_entity, + }, + ) + + return self.__parseOutputsResults(qwoutputsres, g, public_name) + + def __parseOutputsResults( + self, + qoutputsres: "rdflib.query.Result", + g: "rdflib.graph.Graph", + public_name: "str", + ) -> "OutputsBlock": + # TODO: implement this + outputs: "MutableOutputsBlock" = {} + for outputrow in qoutputsres: + assert isinstance( + outputrow, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + + base = outputs + output_path = str(outputrow.name).split(".") + output_last = output_path[-1] + + # Reaching the relative position + if len(output_path) > 1: + for output_step in output_path[0:-1]: + base = base.setdefault(output_step, {}) + + # Now, fill in the values + additional_type = str(outputrow.additional_type) + # Is it a nested one? + cardinality = "1" + if additional_type == "Collection": + if not hasattr(outputrow, "leaf_type"): + raise ROCrateToolboxException( + f"Unable to handle Collections of unknown type in output {str(outputrow.name)}" + ) + + cardinality = "+" + leaf_output_type = str(outputrow.leaf_type) + leaf_output_additional_type = ( + self.LEAF_TYPE_2_OUTPUT_ADDITIONAL_TYPE.get(leaf_output_type) + ) + if leaf_output_additional_type is None: + raise ROCrateToolboxException( + f"Unable to handle contents of type {leaf_output_type} in output Collection {str(outputrow.name)}" + ) + additional_type = leaf_output_additional_type + + # Is it a file or a directory? 
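+                # Only File (MediaObject) and Dataset leaves are accepted
+                # here: WfExS cannot (yet) declare collections of atomic
+                # values in an outputs block, so other leaf types are
+                # rejected just below.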
+ if additional_type not in ("File", "Dataset"): + raise ROCrateToolboxException( + f"Unable to handle contents of additional type {additional_type} in output Collection {str(outputrow.name)}" + ) + + preferred_name: "Optional[str]" = ( + None + if outputrow.default_value is None + else str(outputrow.default_value) + ) + if hasattr(outputrow, "alternate_name"): + preferred_name = str(outputrow.alternate_name) + + valobj: "MutableMapping[str, Any]" = base.setdefault( + output_last, + { + "c-l-a-s-s": ContentKind.Directory.name + if additional_type == "Dataset" + else ContentKind.File.name, + "cardinality": cardinality, + }, + ) + + if preferred_name is not None: + valobj["preferredName"] = preferred_name + + return outputs + def _parseInputsFromExecution( self, g: "rdflib.graph.Graph", @@ -1141,7 +1369,7 @@ def generateWorkflowMetaFromJSONLD( jsonld_obj: "Mapping[str, Any]", public_name: "str", retrospective_first: "bool" = True, - ) -> "Tuple[RemoteRepo, WorkflowType, ContainerType, Sequence[Container], ParamsBlock]": + ) -> "Tuple[RemoteRepo, WorkflowType, ContainerType, Sequence[Container], ParamsBlock, OutputsBlock]": matched_crate, g = self.identifyROCrate(jsonld_obj, public_name) # Is it an RO-Crate? if matched_crate is None: @@ -1179,6 +1407,7 @@ def generateWorkflowMetaFromJSONLD( additional_container_type: "Optional[ContainerType]" = None the_containers: "Sequence[Container]" = [] params: "ParamsBlock" = {} + outputs: "OutputsBlock" = {} if retrospective_first: # For the retrospective provenance at least an execution must # be described in the RO-Crate. Once one is chosen, @@ -1224,6 +1453,13 @@ def generateWorkflowMetaFromJSONLD( public_name=public_name, ) + outputs = self._parseOutputsFromExecution( + g, + execrow.execution, + main_entity=matched_crate.mainentity, + public_name=public_name, + ) + # Now, let's get the list of input parameters break except Exception as e: @@ -1248,7 +1484,14 @@ def generateWorkflowMetaFromJSONLD( public_name=public_name, ) + if len(outputs) == 0: + outputs = self._parseOutputsFromMainEntity( + g, + main_entity=matched_crate.mainentity, + public_name=public_name, + ) + # TODO: finish assert container_type is not None - return repo, workflow_type, container_type, the_containers, params + return repo, workflow_type, container_type, the_containers, params, outputs diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 1b377a41..ecd0843d 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -171,7 +171,10 @@ Union, ) - from typing_extensions import Final + from typing_extensions import ( + Final, + TypeAlias, + ) from crypt4gh.header import CompoundKey @@ -179,27 +182,18 @@ AbsPath, AnyPath, ContainerType, - EnvironmentBlock, ExitVal, - ExportActionBlock, MarshallingStatus, - OutputsBlock, - ParamsBlock, ProgsMapping, RelPath, RepoTag, RepoURL, SecurityContextConfig, - SecurityContextConfigBlock, StagedSetup, SymbolicName, TRS_Workflow_Descriptor, URIType, - WfExSConfigBlock, WfExSInstanceId, - WorkflowConfigBlock, - WorkflowMetaConfigBlock, - WritableWfExSConfigBlock, ) from .workflow_engines import ( @@ -216,10 +210,21 @@ ) from .workflow import ( + EnvironmentBlock, + ExportActionBlock, + OutputsBlock, + ParamsBlock, WFVersionId, + WorkflowConfigBlock, WorkflowId, + WorkflowMetaConfigBlock, ) + SecurityContextConfigBlock: TypeAlias = Mapping[str, SecurityContextConfig] + + WfExSConfigBlock: TypeAlias = Mapping[str, Any] + WritableWfExSConfigBlock: TypeAlias = MutableMapping[str, Any] + class 
IdentifiedWorkflow(NamedTuple): """ diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index f4799436..8ce0cff7 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -84,6 +84,7 @@ from typing_extensions import ( Final, Literal, + TypeAlias, TypedDict, Required, NotRequired, @@ -96,31 +97,20 @@ ContainerEngineVersionStr, ContainerOperatingSystem, EngineVersion, - EnvironmentBlock, ExitVal, - ExportActionBlock, LicenceDescription, MaterializedOutput, - MutableParamsBlock, - OutputsBlock, - ParamsBlock, - PlaceHoldersBlock, ProcessorArchitecture, RelPath, RepoTag, RepoURL, SecurityContextConfig, - SecurityContextConfigBlock, SymbolicName, SymbolicParamName, SymbolicOutputName, TRS_Workflow_Descriptor, WfExSInstanceId, - WorkflowConfigBlock, - WorkflowEngineVersionStr, - WorkflowMetaConfigBlock, WritableSecurityContextConfig, - WritableWorkflowMetaConfigBlock, URIType, URIWithMetadata, ) @@ -131,6 +121,7 @@ from .workflow_engines import ( AbstractWorkflowEngineType, + WorkflowEngineVersionStr, ) from .pushers import ( @@ -186,8 +177,25 @@ total=False, ) - WFVersionId = Union[str, int] - WorkflowId = Union[str, int] + WFVersionId: TypeAlias = Union[str, int] + WorkflowId: TypeAlias = Union[str, int] + + ExportActionBlock: TypeAlias = Mapping[str, Any] + + MutableParamsBlock: TypeAlias = MutableMapping[str, Any] + ParamsBlock: TypeAlias = Mapping[str, Any] + + PlaceHoldersBlock: TypeAlias = Mapping[str, Union[int, float, str]] + + EnvironmentBlock: TypeAlias = Mapping[str, Any] + + MutableOutputsBlock: TypeAlias = MutableMapping[str, Any] + OutputsBlock: TypeAlias = Mapping[str, Any] + + WorkflowConfigBlock: TypeAlias = Mapping[str, Any] + + WorkflowMetaConfigBlock: TypeAlias = Mapping[str, Any] + WritableWorkflowMetaConfigBlock: TypeAlias = MutableMapping[str, Any] import urllib.parse @@ -1394,18 +1402,20 @@ def FromPreviousROCrate( container_type, the_containers, params, + outputs, ) = wfexs.rocrate_toolbox.generateWorkflowMetaFromJSONLD( jsonld_obj, public_name ) logging.info( f"Repo {repo} workflow type {workflow_type} container factory {container_type}" ) + logging.info(f"Containers {the_containers}") workflow_meta: "WritableWorkflowMetaConfigBlock" = { "workflow_id": {}, "workflow_type": workflow_type.shortname, "environment": {}, "params": params, - "outputs": {}, + "outputs": outputs, "workflow_config": {}, } if container_type is not None: diff --git a/wfexs_backend/workflow_engines/__init__.py b/wfexs_backend/workflow_engines/__init__.py index 647858ac..4beb4bb9 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -52,6 +52,7 @@ Callable, Mapping, MutableSequence, + NewType, Optional, Pattern, Sequence, @@ -63,6 +64,7 @@ from typing_extensions import ( Final, + TypeAlias, ) from ..common import ( @@ -72,8 +74,6 @@ ContainerEngineVersionStr, ContainerOperatingSystem, ContainerTaggedName, - EngineLocalConfig, - EnginePath, EngineVersion, ExitVal, ExpectedOutput, @@ -90,7 +90,6 @@ TRS_Workflow_Descriptor, URIType, WFLangVersion, - WorkflowEngineVersionStr, ) from ..container_factories import ( @@ -98,6 +97,13 @@ ContainerFactory, ) + EngineLocalConfig: TypeAlias = Mapping[str, Any] + + # This is also an absolute path + EnginePath = NewType("EnginePath", AbsPath) + + WorkflowEngineVersionStr = NewType("WorkflowEngineVersionStr", str) + from ..container_factories.no_container import ( NoContainerFactory, ) diff --git a/wfexs_backend/workflow_engines/cwl_engine.py 
b/wfexs_backend/workflow_engines/cwl_engine.py index 0e9f3e9d..8a7308fa 100644 --- a/wfexs_backend/workflow_engines/cwl_engine.py +++ b/wfexs_backend/workflow_engines/cwl_engine.py @@ -67,8 +67,6 @@ from ..common import ( AbsPath, AnyPath, - EngineLocalConfig, - EnginePath, EngineVersion, ExitVal, ExpectedOutput, @@ -77,7 +75,6 @@ RelPath, SymbolicParamName, URIType, - WorkflowEngineVersionStr, ) from ..container_factories import ( @@ -99,6 +96,12 @@ from jsonpath_ng.jsonpath import JSONVal + from . import ( + EngineLocalConfig, + EnginePath, + WorkflowEngineVersionStr, + ) + import jsonpath_ng import jsonpath_ng.ext diff --git a/wfexs_backend/workflow_engines/nextflow_engine.py b/wfexs_backend/workflow_engines/nextflow_engine.py index 7b81deb5..52d1eba5 100644 --- a/wfexs_backend/workflow_engines/nextflow_engine.py +++ b/wfexs_backend/workflow_engines/nextflow_engine.py @@ -70,9 +70,7 @@ from ..common import ( AbsPath, AnyPath, - EngineLocalConfig, EngineMode, - EnginePath, EngineVersion, ExitVal, ExpectedOutput, @@ -81,7 +79,6 @@ RelPath, SymbolicParamName, URIType, - WorkflowEngineVersionStr, ) from ..container_factories import ( @@ -96,6 +93,12 @@ NfWorkflow, ) + from . import ( + EngineLocalConfig, + EnginePath, + WorkflowEngineVersionStr, + ) + from . import WorkflowEngine, WorkflowEngineException from . import ( MaterializedWorkflowEngine, From dd0d829d552cd071d0ec87571532eb83ec86695b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 22 May 2024 00:57:36 +0200 Subject: [PATCH 33/42] Environment block is now properly built, based on SPARQL query results. Also, a few type definitions have been moved from common to their most natural places. --- wfexs_backend/common.py | 5 - wfexs_backend/container_factories/__init__.py | 9 +- .../abstract_docker_container.py | 3 - .../container_factories/docker_container.py | 4 +- .../container_factories/no_container.py | 4 +- .../container_factories/podman_container.py | 4 +- .../singularity_container.py | 2 +- wfexs_backend/ro_crate.py | 37 +- wfexs_backend/utils/rocrate.py | 323 +++++++++++++++++- wfexs_backend/workflow.py | 21 +- wfexs_backend/workflow_engines/__init__.py | 6 +- 11 files changed, 369 insertions(+), 49 deletions(-) diff --git a/wfexs_backend/common.py b/wfexs_backend/common.py index 1a7a875a..584a381f 100644 --- a/wfexs_backend/common.py +++ b/wfexs_backend/common.py @@ -134,11 +134,6 @@ class EngineMode(enum.Enum): # The tag, branch or hash of a workflow in a git repository RepoTag = NewType("RepoTag", str) - # This is a container engine version - ContainerEngineVersionStr = NewType("ContainerEngineVersionStr", str) - ContainerOperatingSystem = NewType("ContainerOperatingSystem", str) - ProcessorArchitecture = NewType("ProcessorArchitecture", str) - # This is a workflow engine version EngineVersion = NewType("EngineVersion", str) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index c10118ba..090d659d 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -49,6 +49,7 @@ Mapping, MutableMapping, MutableSequence, + NewType, Optional, Sequence, Set, @@ -66,11 +67,8 @@ from ..common import ( AbsPath, AnyPath, - ContainerEngineVersionStr, - ContainerOperatingSystem, ContainerTaggedName, Fingerprint, - ProcessorArchitecture, RelPath, URIType, ) @@ -81,6 +79,11 @@ ContainerLocalConfig: TypeAlias = Mapping[str, Any] + # This is a container engine version + ContainerEngineVersionStr 
= NewType("ContainerEngineVersionStr", str) + ContainerOperatingSystem = NewType("ContainerOperatingSystem", str) + ProcessorArchitecture = NewType("ProcessorArchitecture", str) + DockerLikeManifest: TypeAlias = Mapping[str, Any] MutableDockerLikeManifest: TypeAlias = MutableMapping[str, Any] diff --git a/wfexs_backend/container_factories/abstract_docker_container.py b/wfexs_backend/container_factories/abstract_docker_container.py index 64e18bf7..d03a0e7b 100644 --- a/wfexs_backend/container_factories/abstract_docker_container.py +++ b/wfexs_backend/container_factories/abstract_docker_container.py @@ -72,12 +72,9 @@ from ..common import ( AbsPath, AnyPath, - ContainerEngineVersionStr, - ContainerOperatingSystem, ContainerTaggedName, ExitVal, Fingerprint, - ProcessorArchitecture, RelPath, ) diff --git a/wfexs_backend/container_factories/docker_container.py b/wfexs_backend/container_factories/docker_container.py index 4101d771..53ddb730 100644 --- a/wfexs_backend/container_factories/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -41,10 +41,8 @@ from ..common import ( AbsPath, AnyPath, - ContainerOperatingSystem, ContainerTaggedName, Fingerprint, - ProcessorArchitecture, RelPath, URIType, ) @@ -52,7 +50,9 @@ from . import ( ContainerFileNamingMethod, ContainerLocalConfig, + ContainerOperatingSystem, DockerManifestMetadata, + ProcessorArchitecture, ) from ..common import ( diff --git a/wfexs_backend/container_factories/no_container.py b/wfexs_backend/container_factories/no_container.py index 1ef7f34b..710006dd 100644 --- a/wfexs_backend/container_factories/no_container.py +++ b/wfexs_backend/container_factories/no_container.py @@ -39,16 +39,14 @@ from ..common import ( AbsPath, AnyPath, - ContainerEngineVersionStr, - ContainerOperatingSystem, ContainerTaggedName, Fingerprint, - ProcessorArchitecture, RelPath, URIType, ) from . import ( + ContainerEngineVersionStr, ContainerFileNamingMethod, ) diff --git a/wfexs_backend/container_factories/podman_container.py b/wfexs_backend/container_factories/podman_container.py index ddf00a25..ab2d8a82 100644 --- a/wfexs_backend/container_factories/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -41,10 +41,8 @@ from ..common import ( AbsPath, AnyPath, - ContainerOperatingSystem, ContainerTaggedName, Fingerprint, - ProcessorArchitecture, RelPath, URIType, ) @@ -52,7 +50,9 @@ from . import ( ContainerFileNamingMethod, ContainerLocalConfig, + ContainerOperatingSystem, DockerManifestMetadata, + ProcessorArchitecture, ) from ..common import ( diff --git a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index 6f3a6131..efc749b5 100644 --- a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -61,7 +61,6 @@ AnyPath, ContainerTaggedName, Fingerprint, - ProcessorArchitecture, RelPath, URIType, ) @@ -69,6 +68,7 @@ from . 
import ( ContainerFileNamingMethod, ContainerLocalConfig, + ProcessorArchitecture, ) class SingularityManifest(TypedDict): diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index 23c0e4c1..35ffbfb6 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -55,15 +55,12 @@ AbsPath, AbstractGeneratedContent, AnyPath, - ContainerEngineVersionStr, - ContainerOperatingSystem, EngineVersion, ExpectedOutput, Fingerprint, LocalWorkflow, MaterializedInput, MaterializedOutput, - ProcessorArchitecture, ProgsMapping, RelPath, RemoteRepo, @@ -78,6 +75,9 @@ from .container_factories import ( Container, + ContainerEngineVersionStr, + ContainerOperatingSystem, + ProcessorArchitecture, ) from .workflow_engines import ( @@ -2217,6 +2217,23 @@ def writeWRROC(self, filename: "AnyPath") -> None: ) self.crate.write_zip(filename) + def addStagedWorkflowDetails( + self, + inputs: "Sequence[MaterializedInput]", + environment: "Sequence[MaterializedInput]", + outputs: "Optional[Sequence[ExpectedOutput]]", + ) -> None: + """ + This method is used for WRROCs with only prospective provenance + """ + self.addWorkflowInputs(inputs, are_envvars=False) + + if len(environment) > 0: + self.addWorkflowInputs(environment, are_envvars=True) + + if outputs is not None: + self.addWorkflowExpectedOutputs(outputs) + def addWorkflowExecution( self, stagedExec: "StagedExecution", @@ -2263,11 +2280,15 @@ def addWorkflowExecution( stagedExec.augmentedInputs, are_envvars=False, ) - crate_envvars = self.addWorkflowInputs( - stagedExec.environment, - are_envvars=True, - ) - crate_action["object"] = [*crate_inputs, *crate_envvars] + crate_action["object"] = crate_inputs + + # Add environment, according to WRROC 0.4 + if len(stagedExec.environment) > 0: + crate_envvars = self.addWorkflowInputs( + stagedExec.environment, + are_envvars=True, + ) + crate_action["environment"] = crate_envvars # TODO: Add engine specific traces # see https://www.researchobject.org/workflow-run-crate/profiles/workflow_run_crate#adding-engine-specific-traces diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 49bca96f..1e1b6f12 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -49,20 +49,24 @@ ) from ..common import ( - ContainerOperatingSystem, Fingerprint, - ProcessorArchitecture, RelPath, RepoURL, RepoTag, URIType, ) + from ..container_factories import ( + ContainerOperatingSystem, + ProcessorArchitecture, + ) + from ..wfexs_backend import ( WfExSBackend, ) from ..workflow import ( + EnvironmentBlock, MutableParamsBlock, ParamsBlock, MutableOutputsBlock, @@ -527,7 +531,7 @@ def identifyROCrate( """ # This compound query is much faster when each of the UNION components - # is evaluated separatedly + # is evaluated separately OBTAIN_WORKFLOW_INPUTS_SPARQL: "Final[str]" = """\ SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?value ?component ?leaf_type WHERE { @@ -581,7 +585,65 @@ def identifyROCrate( """ # This compound query is much faster when each of the UNION components - # is evaluated separatedly + # is evaluated separately + OBTAIN_WORKFLOW_ENV_SPARQL: "Final[str]" = """\ +SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?value ?component ?leaf_type +WHERE { + ?main_entity wrterm:environment ?envfp . + ?envfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type ; + s:workExample ?env . + { + # A file, which is a schema.org MediaObject + ?env + a s:MediaObject ; + s:name ?name_env ; + s:contentUrl ?fileuri . 
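+    # (applies to all branches) ?env is the s:workExample of ?envfp,
+    # which hangs from wrterm:environment on the main workflow entity,
+    # mirroring the shapes used by the workflow inputs query above.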
+ } UNION { + # A directory, which is a schema.org Dataset + ?env + a s:Dataset ; + s:name ?name_env ; + s:contentUrl ?fileuri . + FILTER EXISTS { + # subquery to determine it is not an empty Dataset + SELECT ?dircomp + WHERE { + ?env + s:hasPart+ ?dircomp . + ?dircomp + a s:MediaObject . + } + } + } UNION { + # A single property value, which can be either Integer, Text, Boolean or Float + ?env + a s:PropertyValue ; + s:name ?name_env ; + s:value ?value . + } UNION { + # A combination of files or directories or property values + VALUES ( ?leaf_type ) { ( s:Integer ) ( s:Text ) ( s:Boolean ) ( s:Float ) ( s:MediaObject ) ( s:Dataset ) } + ?env + a s:Collection ; + s:name ?name_env ; + s:hasPart+ ?component . + ?component + a ?leaf_type . + OPTIONAL { + ?component s:contentUrl ?fileuri . + } + OPTIONAL { + ?component s:value ?value . + } + } +} +""" + + # This compound query is much faster when each of the UNION components + # is evaluated separately OBTAIN_WORKFLOW_OUTPUTS_SPARQL: "Final[str]" = """\ SELECT ?name ?outputfp ?additional_type ?default_value WHERE { @@ -598,7 +660,7 @@ def identifyROCrate( """ # This compound query is much faster when each of the UNION components - # is evaluated separatedly + # is evaluated separately OBTAIN_EXECUTION_INPUTS_SPARQL: "Final[str]" = """\ SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?value ?component ?leaf_type WHERE { @@ -671,7 +733,84 @@ def identifyROCrate( """ # This compound query is much faster when each of the UNION components - # is evaluated separatedly + # is evaluated separately + OBTAIN_EXECUTION_ENV_SPARQL: "Final[str]" = """\ +SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?value ?component ?leaf_type +WHERE { + ?execution wrterm:environment ?env . + { + # A file, which is a schema.org MediaObject + BIND ( "File" AS ?additional_type ) + ?env + a s:MediaObject ; + s:name ?name_env ; + s:contentUrl ?fileuri ; + s:exampleOfWork ?envfp . + ?envfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type . + } UNION { + # A directory, which is a schema.org Dataset + BIND ( "Dataset" AS ?additional_type ) + ?env + a s:Dataset ; + s:name ?name_env ; + s:contentUrl ?fileuri ; + s:exampleOfWork ?envfp . + ?envfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type . + FILTER EXISTS { + # subquery to determine it is not an empty Dataset + SELECT ?dircomp + WHERE { + ?input + s:hasPart+ ?dircomp . + ?dircomp + a s:MediaObject . + } + } + } UNION { + # A single property value, which can be either Integer, Text, Boolean or Float + VALUES (?additional_type) { ( "Integer" ) ( "Text" ) ( "Boolean" ) ( "Float" ) } + ?env + a s:PropertyValue ; + s:name ?name_env ; + s:exampleOfWork ?envfp ; + s:value ?value . + ?envfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type . + } UNION { + # A combination of files or directories or property values + BIND ( "Collection" AS ?additional_type ) + VALUES ( ?leaf_type ) { ( s:Integer ) ( s:Text ) ( s:Boolean ) ( s:Float ) ( s:MediaObject ) ( s:Dataset ) } + ?env + a s:Collection ; + s:name ?name_env ; + s:exampleOfWork ?envfp ; + s:hasPart+ ?component . + ?envfp + a bs:FormalParameter ; + s:name ?name ; + s:additionalType ?additional_type . + ?component + a ?leaf_type . + OPTIONAL { + ?component s:contentUrl ?fileuri . + } + OPTIONAL { + ?component s:value ?value . 
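+            # Mixed collections are tolerated: each member binds either
+            # ?fileuri or ?value, hence the two OPTIONAL patterns.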
+ } + } +} +""" + + # This compound query is much faster when each of the UNION components + # is evaluated separately OBTAIN_EXECUTION_OUTPUTS_SPARQL: "Final[str]" = """\ SELECT ?output ?name ?alternate_name ?outputfp ?default_value ?additional_type ?fileuri ?value ?component ?leaf_type WHERE { @@ -1233,6 +1372,150 @@ def __parseInputsResults( return params + def _parseEnvFromExecution( + self, + g: "rdflib.graph.Graph", + execution: "rdflib.term.Identifier", + main_entity: "rdflib.term.Identifier", + default_licences: "Sequence[str]", + public_name: "str", + ) -> "EnvironmentBlock": + # Get the list of inputs + qenv = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_EXECUTION_ENV_SPARQL, + initNs=self.SPARQL_NS, + ) + qenvres = g.query( + qenv, + initBindings={ + "execution": execution, + }, + ) + + return self.__parseEnvResults(qenvres, g, default_licences, public_name) + + def _parseEnvFromMainEntity( + self, + g: "rdflib.graph.Graph", + main_entity: "rdflib.term.Identifier", + default_licences: "Sequence[str]", + public_name: "str", + ) -> "EnvironmentBlock": + # Get the list of inputs + qwenv = rdflib.plugins.sparql.prepareQuery( + self.OBTAIN_WORKFLOW_ENV_SPARQL, + initNs=self.SPARQL_NS, + ) + qwenvres = g.query( + qwenv, + initBindings={ + "main_entity": main_entity, + }, + ) + + return self.__parseEnvResults(qwenvres, g, default_licences, public_name) + + def __parseEnvResults( + self, + qenvres: "rdflib.query.Result", + g: "rdflib.graph.Graph", + default_licences: "Sequence[str]", + public_name: "str", + ) -> "EnvironmentBlock": + """ + This method is (almost) identical to __parseInputsResults + """ + # TODO: implement this + environment: "MutableMapping[str, Any]" = {} + for envrow in qenvres: + assert isinstance( + envrow, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + + env_name = str(envrow.name) + + # Now, fill in the values + additional_type = str(envrow.additional_type) + valarr: "Optional[MutableSequence[Any]]" = None + valobj: "Optional[MutableMapping[str, Any]]" = None + # Is it a nested one? + if additional_type == "Collection": + leaf_type = str(envrow.leaf_type) + leaf_additional_type = self.LEAF_TYPE_2_ADDITIONAL_TYPE.get(leaf_type) + if leaf_additional_type is None: + raise ROCrateToolboxException( + f"Unable to handle contents of type {leaf_type} in Collection reflecting contents pointed by environment variable {env_name}" + ) + additional_type = leaf_additional_type + if leaf_additional_type not in ("File", "Dataset"): + valarr = environment.setdefault(env_name, []) + + # Is it a file or a directory? 
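+            # File/Dataset variables are rendered as c-l-a-s-s/url blocks;
+            # atomic values fall through to the type coercion further down,
+            # which honours the declared additional type.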
+ if additional_type in ("File", "Dataset"): + valobj = environment.setdefault( + env_name, + { + "c-l-a-s-s": ContentKind.Directory.name + if additional_type == "Dataset" + else ContentKind.File.name, + }, + ) + + if isinstance(valobj, dict): + licences = self._getLicences(g, envrow.env, public_name) + if len(licences) == 0: + licences = default_licences + the_url: "Union[str, Mapping[str, Any]]" + if len(licences) == 0: + the_url = str(envrow.fileuri) + else: + the_url = { + "uri": str(envrow.fileuri), + "licences": licences, + } + + valurl = valobj.get("url") + if isinstance(valurl, (str, dict)): + valurl = [valurl] + valobj["url"] = valurl + + if isinstance(valurl, list): + valurl.append(the_url) + else: + valobj["url"] = the_url + else: + the_value_node: "rdflib.term.Identifier" = envrow.value + the_value: "Union[str, int, float, bool]" + if isinstance(the_value_node, rdflib.term.Literal): + the_value = the_value_node.value + else: + the_value = str(the_value_node) + + if additional_type == "Integer": + try: + the_value = int(the_value) + except: + self.logger.exception( + f"Expected type {additional_type} for value {the_value} in environment variable {env_name}" + ) + elif additional_type == "Boolean": + the_value = bool(the_value) + elif additional_type == "Float": + the_value = float(the_value) + elif additional_type == "Text": + the_value = str(the_value) + else: + raise ROCrateToolboxException( + f"Unable to handle additional type {additional_type} for environment variable {env_name}" + ) + + if isinstance(valarr, list): + valarr.append(the_value) + else: + environment[env_name] = the_value + + return environment + def _getLicences( self, g: "rdflib.graph.Graph", @@ -1369,7 +1652,7 @@ def generateWorkflowMetaFromJSONLD( jsonld_obj: "Mapping[str, Any]", public_name: "str", retrospective_first: "bool" = True, - ) -> "Tuple[RemoteRepo, WorkflowType, ContainerType, Sequence[Container], ParamsBlock, OutputsBlock]": + ) -> "Tuple[RemoteRepo, WorkflowType, ContainerType, Sequence[Container], ParamsBlock, EnvironmentBlock, OutputsBlock]": matched_crate, g = self.identifyROCrate(jsonld_obj, public_name) # Is it an RO-Crate? 
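+        # Once the crate is recognised, it is mined retrospectively first
+        # (from a recorded execution) and, as a fallback, prospectively
+        # (from the main entity), filling params, environment and outputs.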
if matched_crate is None: @@ -1407,6 +1690,7 @@ def generateWorkflowMetaFromJSONLD( additional_container_type: "Optional[ContainerType]" = None the_containers: "Sequence[Container]" = [] params: "ParamsBlock" = {} + environment: "EnvironmentBlock" = {} outputs: "OutputsBlock" = {} if retrospective_first: # For the retrospective provenance at least an execution must @@ -1453,6 +1737,14 @@ def generateWorkflowMetaFromJSONLD( public_name=public_name, ) + environment = self._parseEnvFromExecution( + g, + execrow.execution, + main_entity=matched_crate.mainentity, + default_licences=crate_licences, + public_name=public_name, + ) + outputs = self._parseOutputsFromExecution( g, execrow.execution, @@ -1484,6 +1776,13 @@ def generateWorkflowMetaFromJSONLD( public_name=public_name, ) + environment = self._parseEnvFromMainEntity( + g, + main_entity=matched_crate.mainentity, + default_licences=crate_licences, + public_name=public_name, + ) + if len(outputs) == 0: outputs = self._parseOutputsFromMainEntity( g, @@ -1494,4 +1793,12 @@ def generateWorkflowMetaFromJSONLD( # TODO: finish assert container_type is not None - return repo, workflow_type, container_type, the_containers, params, outputs + return ( + repo, + workflow_type, + container_type, + the_containers, + params, + environment, + outputs, + ) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 8ce0cff7..2c6b666b 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -94,13 +94,10 @@ AbsPath, AnyContent, AnyPath, - ContainerEngineVersionStr, - ContainerOperatingSystem, EngineVersion, ExitVal, LicenceDescription, MaterializedOutput, - ProcessorArchitecture, RelPath, RepoTag, RepoURL, @@ -115,6 +112,12 @@ URIWithMetadata, ) + from .container_factories import ( + ContainerEngineVersionStr, + ContainerOperatingSystem, + ProcessorArchitecture, + ) + from .encrypted_fs import ( EncryptedFSType, ) @@ -1402,6 +1405,7 @@ def FromPreviousROCrate( container_type, the_containers, params, + environment, outputs, ) = wfexs.rocrate_toolbox.generateWorkflowMetaFromJSONLD( jsonld_obj, public_name @@ -1413,7 +1417,7 @@ def FromPreviousROCrate( workflow_meta: "WritableWorkflowMetaConfigBlock" = { "workflow_id": {}, "workflow_type": workflow_type.shortname, - "environment": {}, + "environment": environment, "params": params, "outputs": outputs, "workflow_config": {}, @@ -4487,16 +4491,11 @@ def createStageResearchObject( crate_pid=crate_pid, ) - wrroc.addWorkflowInputs( + wrroc.addStagedWorkflowDetails( self.materializedParams, - are_envvars=False, - ) - wrroc.addWorkflowInputs( self.materializedEnvironment, - are_envvars=True, + self.outputs, ) - if self.outputs is not None: - wrroc.addWorkflowExpectedOutputs(self.outputs) # Save RO-crate as execution.crate.zip if filename is None: diff --git a/wfexs_backend/workflow_engines/__init__.py b/wfexs_backend/workflow_engines/__init__.py index 4beb4bb9..62978379 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -71,8 +71,6 @@ AbstractGeneratedContent, AbsPath, AnyPath, - ContainerEngineVersionStr, - ContainerOperatingSystem, ContainerTaggedName, EngineVersion, ExitVal, @@ -81,7 +79,6 @@ LocalWorkflow, MaterializedInput, MaterializedContent, - ProcessorArchitecture, RelPath, StagedExecution, StagedSetup, @@ -94,7 +91,10 @@ from ..container_factories import ( Container, + ContainerEngineVersionStr, ContainerFactory, + ContainerOperatingSystem, + ProcessorArchitecture, ) EngineLocalConfig: TypeAlias = Mapping[str, Any] From 
6f7e4d0e10abec10ee67080e28ece0e385f0c8d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 22 May 2024 01:04:22 +0200 Subject: [PATCH 34/42] Updated copyright year --- WfExS-backend.py | 2 +- apptainer-local-installer.bash | 2 +- apptainer12-local-installer.bash | 2 +- basic-installer.bash | 2 +- full-installer.bash | 2 +- fuse2fs-local-installer.bash | 2 +- setup.py | 2 +- singularity-local-installer.bash | 2 +- squashfuse-local-installer.bash | 2 +- wfexs_backend/common.py | 2 +- wfexs_backend/encrypted_fs.py | 2 +- wfexs_backend/fetchers/__init__.py | 2 +- wfexs_backend/fetchers/b2share.py | 2 +- wfexs_backend/fetchers/data.py | 2 +- wfexs_backend/fetchers/doi.py | 2 +- wfexs_backend/fetchers/drs.py | 2 +- wfexs_backend/fetchers/fasp.py | 2 +- wfexs_backend/fetchers/file.py | 2 +- wfexs_backend/fetchers/ftp.py | 2 +- wfexs_backend/fetchers/git.py | 2 +- wfexs_backend/fetchers/http.py | 2 +- wfexs_backend/fetchers/internal/ftp_downloader.py | 2 +- wfexs_backend/fetchers/osf_io.py | 2 +- wfexs_backend/fetchers/pride.py | 2 +- wfexs_backend/fetchers/sftp.py | 2 +- wfexs_backend/fetchers/swh.py | 2 +- wfexs_backend/fetchers/trs_files.py | 2 +- wfexs_backend/fetchers/wiktionary.py | 2 +- wfexs_backend/fetchers/zenodo.py | 2 +- wfexs_backend/pushers/abstract_contexted_export.py | 2 +- wfexs_backend/security_context.py | 2 +- wfexs_backend/utils/digests.py | 2 +- wfexs_backend/utils/docker.py | 2 +- wfexs_backend/utils/groovy_parsing.py | 2 +- wfexs_backend/utils/licences.py | 2 +- wfexs_backend/utils/marshalling_handling.py | 2 +- wfexs_backend/utils/passphrase_wrapper.py | 2 +- wfexs_backend/wfexs_backend.py | 2 +- wfexs_backend/workflow_engines/__init__.py | 2 +- wfexs_backend/workflow_engines/cwl_engine.py | 2 +- wfexs_backend/workflow_engines/nextflow_engine.py | 2 +- 41 files changed, 41 insertions(+), 41 deletions(-) diff --git a/WfExS-backend.py b/WfExS-backend.py index d7470066..ead64497 100755 --- a/WfExS-backend.py +++ b/WfExS-backend.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/apptainer-local-installer.bash b/apptainer-local-installer.bash index 47bd6fd0..51fa01c9 100755 --- a/apptainer-local-installer.bash +++ b/apptainer-local-installer.bash @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/apptainer12-local-installer.bash b/apptainer12-local-installer.bash index d8dc6088..53eb5da8 100755 --- a/apptainer12-local-installer.bash +++ b/apptainer12-local-installer.bash @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/basic-installer.bash b/basic-installer.bash index cddb7d72..79553e5e 100755 --- a/basic-installer.bash +++ b/basic-installer.bash @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/full-installer.bash b/full-installer.bash index 943baf51..ff2af12c 100755 --- a/full-installer.bash +++ b/full-installer.bash @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/fuse2fs-local-installer.bash b/fuse2fs-local-installer.bash index 84946a85..8c4884a0 100755 --- a/fuse2fs-local-installer.bash +++ b/fuse2fs-local-installer.bash @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/setup.py b/setup.py index 68b8a7b9..e7928926 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/singularity-local-installer.bash b/singularity-local-installer.bash index b04a24df..f2680d56 100755 --- a/singularity-local-installer.bash +++ b/singularity-local-installer.bash @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/squashfuse-local-installer.bash b/squashfuse-local-installer.bash index 6b3009dd..d8d66649 100755 --- a/squashfuse-local-installer.bash +++ b/squashfuse-local-installer.bash @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/common.py b/wfexs_backend/common.py index 584a381f..902e1826 100644 --- a/wfexs_backend/common.py +++ b/wfexs_backend/common.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/wfexs_backend/encrypted_fs.py b/wfexs_backend/encrypted_fs.py index cc416b89..b5c131cf 100644 --- a/wfexs_backend/encrypted_fs.py +++ b/wfexs_backend/encrypted_fs.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index eb04768a..7f54a116 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/b2share.py b/wfexs_backend/fetchers/b2share.py index d3d8f8d7..e3043136 100644 --- a/wfexs_backend/fetchers/b2share.py +++ b/wfexs_backend/fetchers/b2share.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/data.py b/wfexs_backend/fetchers/data.py index 1942d311..fe6861a7 100644 --- a/wfexs_backend/fetchers/data.py +++ b/wfexs_backend/fetchers/data.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/doi.py b/wfexs_backend/fetchers/doi.py index 0f6d8821..1467d60e 100644 --- a/wfexs_backend/fetchers/doi.py +++ b/wfexs_backend/fetchers/doi.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/drs.py b/wfexs_backend/fetchers/drs.py index 95113a16..b4d09fa4 100644 --- a/wfexs_backend/fetchers/drs.py +++ b/wfexs_backend/fetchers/drs.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/wfexs_backend/fetchers/fasp.py b/wfexs_backend/fetchers/fasp.py index a9ebd6ca..f183369d 100644 --- a/wfexs_backend/fetchers/fasp.py +++ b/wfexs_backend/fetchers/fasp.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/file.py b/wfexs_backend/fetchers/file.py index fd8d278f..1d6a813c 100644 --- a/wfexs_backend/fetchers/file.py +++ b/wfexs_backend/fetchers/file.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/ftp.py b/wfexs_backend/fetchers/ftp.py index 313502c8..2b6bdcb5 100644 --- a/wfexs_backend/fetchers/ftp.py +++ b/wfexs_backend/fetchers/ftp.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index 1b85bb3c..b783aaa4 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/http.py b/wfexs_backend/fetchers/http.py index d1b0a274..4ae3eabc 100644 --- a/wfexs_backend/fetchers/http.py +++ b/wfexs_backend/fetchers/http.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/internal/ftp_downloader.py b/wfexs_backend/fetchers/internal/ftp_downloader.py index 9672e855..0466ba02 100644 --- a/wfexs_backend/fetchers/internal/ftp_downloader.py +++ b/wfexs_backend/fetchers/internal/ftp_downloader.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/wfexs_backend/fetchers/osf_io.py b/wfexs_backend/fetchers/osf_io.py index c39e4949..68eaed3d 100644 --- a/wfexs_backend/fetchers/osf_io.py +++ b/wfexs_backend/fetchers/osf_io.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/pride.py b/wfexs_backend/fetchers/pride.py index 20ddf2cc..cbe8c28b 100644 --- a/wfexs_backend/fetchers/pride.py +++ b/wfexs_backend/fetchers/pride.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/sftp.py b/wfexs_backend/fetchers/sftp.py index 461dac62..f46f8bd8 100644 --- a/wfexs_backend/fetchers/sftp.py +++ b/wfexs_backend/fetchers/sftp.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index bf1838dc..7c180e80 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index d0f3f379..039eda51 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/fetchers/wiktionary.py b/wfexs_backend/fetchers/wiktionary.py index 6aaec9ae..9d92c9f7 100644 --- a/wfexs_backend/fetchers/wiktionary.py +++ b/wfexs_backend/fetchers/wiktionary.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/wfexs_backend/fetchers/zenodo.py b/wfexs_backend/fetchers/zenodo.py index f4b78ce7..ab2b559f 100644 --- a/wfexs_backend/fetchers/zenodo.py +++ b/wfexs_backend/fetchers/zenodo.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/pushers/abstract_contexted_export.py b/wfexs_backend/pushers/abstract_contexted_export.py index 0a55b915..651130f8 100644 --- a/wfexs_backend/pushers/abstract_contexted_export.py +++ b/wfexs_backend/pushers/abstract_contexted_export.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/security_context.py b/wfexs_backend/security_context.py index fbe085ad..866f04c4 100644 --- a/wfexs_backend/security_context.py +++ b/wfexs_backend/security_context.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/utils/digests.py b/wfexs_backend/utils/digests.py index b064ce75..6626693e 100644 --- a/wfexs_backend/utils/digests.py +++ b/wfexs_backend/utils/digests.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/utils/docker.py b/wfexs_backend/utils/docker.py index 11aedf7e..6903dc82 100644 --- a/wfexs_backend/utils/docker.py +++ b/wfexs_backend/utils/docker.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/utils/groovy_parsing.py b/wfexs_backend/utils/groovy_parsing.py index 5026f007..a6dd16b5 100644 --- a/wfexs_backend/utils/groovy_parsing.py +++ b/wfexs_backend/utils/groovy_parsing.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 # Parts of this module are inspired on translated-groovy3-parser.py # from groovy-parser module -# Copyright (C) 2023 Barcelona Supercomputing Center, José M. Fernández +# Copyright (C) 2024 Barcelona Supercomputing Center, José M. Fernández # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/wfexs_backend/utils/licences.py b/wfexs_backend/utils/licences.py index 684b55e2..85f6fa45 100644 --- a/wfexs_backend/utils/licences.py +++ b/wfexs_backend/utils/licences.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/utils/marshalling_handling.py b/wfexs_backend/utils/marshalling_handling.py index 6b97c15a..1fde7ec9 100644 --- a/wfexs_backend/utils/marshalling_handling.py +++ b/wfexs_backend/utils/marshalling_handling.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/utils/passphrase_wrapper.py b/wfexs_backend/utils/passphrase_wrapper.py index 23094338..c5c11210 100644 --- a/wfexs_backend/utils/passphrase_wrapper.py +++ b/wfexs_backend/utils/passphrase_wrapper.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index ecd0843d..6e4fa770 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/workflow_engines/__init__.py b/wfexs_backend/workflow_engines/__init__.py index 62978379..9a993733 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/wfexs_backend/workflow_engines/cwl_engine.py b/wfexs_backend/workflow_engines/cwl_engine.py index 8a7308fa..bf244add 100644 --- a/wfexs_backend/workflow_engines/cwl_engine.py +++ b/wfexs_backend/workflow_engines/cwl_engine.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/wfexs_backend/workflow_engines/nextflow_engine.py b/wfexs_backend/workflow_engines/nextflow_engine.py index 52d1eba5..a9c7f33d 100644 --- a/wfexs_backend/workflow_engines/nextflow_engine.py +++ b/wfexs_backend/workflow_engines/nextflow_engine.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2023 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 13f646e2d39a73b031e05e7ad65736c2698ae09e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 22 May 2024 17:14:20 +0200 Subject: [PATCH 35/42] Several declarations have been moved from `wfexs_backend.common` to `wfexs_backend.fetchers` --- tests/fetchers/test_git.py | 4 +- wfexs_backend/common.py | 37 ----------------- wfexs_backend/fetchers/__init__.py | 65 +++++++++++++++++++++++++++++- wfexs_backend/fetchers/git.py | 32 ++++++++------- wfexs_backend/fetchers/swh.py | 32 +++++++-------- wfexs_backend/ro_crate.py | 5 ++- wfexs_backend/utils/rocrate.py | 5 ++- wfexs_backend/wfexs_backend.py | 18 ++++++--- wfexs_backend/workflow.py | 4 +- 9 files changed, 122 insertions(+), 80 deletions(-) diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py index d9b5f364..daa3bc0d 100644 --- a/tests/fetchers/test_git.py +++ b/tests/fetchers/test_git.py @@ -34,9 +34,9 @@ URIType, ) -from wfexs_backend.common import ( - RepoGuessFlavor, +from wfexs_backend.fetchers import ( RemoteRepo, + RepoGuessFlavor, RepoType, ) from wfexs_backend.fetchers.git import guess_git_repo_params diff --git a/wfexs_backend/common.py b/wfexs_backend/common.py index 902e1826..b3eadded 100644 --- a/wfexs_backend/common.py +++ b/wfexs_backend/common.py @@ -548,43 +548,6 @@ class LocalWorkflow(NamedTuple): TRS_Workflow_Descriptor: TypeAlias = str -class RepoType(enum.Enum): - Git = "git" - Raw = "raw" - Other = "other" - SoftwareHeritage = "swh" - TRS = "trs" - - @classmethod - def _undeprecate_table(cls) -> "Mapping[str, str]": - # These fixes are needed to map deprecated values - # to the most approximate ones - return { - "github": "git", - "gitlab": "git", - "bitbucket": "git", - } - - -class RepoGuessFlavor(enum.Enum): - GitHub = "github" - GitLab = "gitlab" - BitBucket = "bitbucket" - - -class RemoteRepo(NamedTuple): - """ - Remote repository description - """ - - repo_url: "RepoURL" - tag: "Optional[RepoTag]" = None - rel_path: "Optional[RelPath]" = None - repo_type: "Optional[RepoType]" = None - web_url: "Optional[URIType]" = None - guess_flavor: "Optional[RepoGuessFlavor]" = None - - class StagedSetup(NamedTuple): instance_id: "WfExSInstanceId" container_type: "ContainerType" diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index 7f54a116..a43e52c9 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -19,6 +19,7 @@ from __future__ import absolute_import import abc +import enum import logging from typing import ( @@ -72,6 +73,7 @@ class RepoDesc(TypedDict): tag: Required[Optional[RepoTag]] checkout: Required[RepoTag] relpath: NotRequired[RelPath] + repotype: NotRequired[str] class ProtocolFetcherReturn(NamedTuple): @@ -231,18 +233,77 @@ class RepoGuessException(FetcherException): pass +class RepoType(enum.Enum): + Git = "git" + Raw = "raw" + Other = "other" + SoftwareHeritage = "swh" + TRS = 
"trs" + + @classmethod + def _undeprecate_table(cls) -> "Mapping[str, str]": + # These fixes are needed to map deprecated values + # to the most approximate ones + return { + "github": "git", + "gitlab": "git", + "bitbucket": "git", + } + + +class RepoGuessFlavor(enum.Enum): + GitHub = "github" + GitLab = "gitlab" + BitBucket = "bitbucket" + + +class RemoteRepo(NamedTuple): + """ + Remote repository description + """ + + repo_url: "RepoURL" + tag: "Optional[RepoTag]" = None + rel_path: "Optional[RelPath]" = None + repo_type: "Optional[RepoType]" = None + web_url: "Optional[URIType]" = None + guess_flavor: "Optional[RepoGuessFlavor]" = None + checkout: "Optional[RepoTag]" = None + + def gen_repo_desc(self) -> "Optional[RepoDesc]": + retval: "RepoDesc" = { + "repo": self.repo_url, + "tag": self.tag, + "checkout": self.get_checkout(), + } + + if self.repo_type is not None: + retval["repotype"] = self.repo_type.value + + return retval + + def get_checkout(self) -> "RepoTag": + return ( + self.checkout + if self.checkout is not None + else self.tag + if self.tag is not None + else cast("RepoTag", "") + ) + + class AbstractRepoFetcher(AbstractStatefulFetcher): PRIORITY: "ClassVar[int]" = DEFAULT_PRIORITY + 10 @abc.abstractmethod - def doMaterializeRepo( + def materialize_repo( self, repoURL: "RepoURL", repoTag: "Optional[RepoTag]" = None, repo_tag_destdir: "Optional[AbsPath]" = None, base_repo_destdir: "Optional[AbsPath]" = None, doUpdate: "Optional[bool]" = True, - ) -> "Tuple[AbsPath, RepoDesc, Sequence[URIWithMetadata]]": + ) -> "Tuple[AbsPath, RemoteRepo, Sequence[URIWithMetadata]]": pass diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index b783aaa4..dbc7810a 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -60,7 +60,6 @@ from . import ( AbstractStatefulFetcher, - RepoDesc, ) @@ -73,14 +72,14 @@ DocumentedStatefulProtocolFetcher, FetcherException, ProtocolFetcherReturn, + RemoteRepo, RepoGuessException, + RepoGuessFlavor, + RepoType, ) from ..common import ( ContentKind, - RemoteRepo, - RepoGuessFlavor, - RepoType, URIWithMetadata, ) @@ -128,14 +127,14 @@ def description(self) -> "str": def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": return (cls.DEFAULT_GIT_CMD,) - def doMaterializeRepo( + def materialize_repo( self, repoURL: "RepoURL", repoTag: "Optional[RepoTag]" = None, repo_tag_destdir: "Optional[AbsPath]" = None, base_repo_destdir: "Optional[AbsPath]" = None, doUpdate: "Optional[bool]" = True, - ) -> "Tuple[AbsPath, RepoDesc, Sequence[URIWithMetadata]]": + ) -> "Tuple[AbsPath, RemoteRepo, Sequence[URIWithMetadata]]": """ :param repoURL: The URL to the repository. 
@@ -267,6 +266,7 @@ def doMaterializeRepo( gitrevparse_params = [self.git_cmd, "rev-parse", "--verify", "HEAD"] self.logger.debug(f'Running "{" ".join(gitrevparse_params)}"') + repo_effective_checkout: "Optional[RepoTag]" = None with subprocess.Popen( gitrevparse_params, stdout=subprocess.PIPE, @@ -278,15 +278,16 @@ def doMaterializeRepo( "RepoTag", revproc.stdout.read().rstrip() ) - repo_desc: "RepoDesc" = { - "repo": repoURL, - "tag": repoTag, - "checkout": repo_effective_checkout, - } + remote_repo = RemoteRepo( + repo_url=repoURL, + tag=repoTag, + repo_type=RepoType.Git, + checkout=repo_effective_checkout, + ) return ( repo_tag_destdir, - repo_desc, + remote_repo, [], ) @@ -361,11 +362,11 @@ def fetch( parse.urlunparse((gitScheme, parsedInputURL.netloc, gitPath, "", "", "")), ) - repo_tag_destdir, repo_desc, metadata_array = self.doMaterializeRepo( + repo_tag_destdir, remote_repo, metadata_array = self.materialize_repo( repoURL, repoTag=repoTag ) if repoRelPath is not None: - repo_desc["relpath"] = cast("RelPath", repoRelPath) + remote_repo = remote_repo._replace(rel_path=cast("RelPath", repoRelPath)) preferredName: "Optional[RelPath]" if repoRelPath is not None: @@ -387,6 +388,9 @@ def fetch( # shutil.move(cachedContentPath, cachedFilename) link_or_copy(cast("AnyPath", cachedContentPath), cachedFilename) + repo_desc: "Optional[Mapping[str, Any]]" = remote_repo.gen_repo_desc() + if repo_desc is None: + repo_desc = {} augmented_metadata_array = [ URIWithMetadata( uri=remote_file, metadata=repo_desc, preferredName=preferredName diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index 7c180e80..e12d6976 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -50,10 +50,6 @@ Final, ) - from . import ( - RepoDesc, - ) - from ..common import ( AbsPath, AnyPath, @@ -71,15 +67,15 @@ DocumentedStatefulProtocolFetcher, FetcherException, ProtocolFetcherReturn, + RemoteRepo, RepoGuessException, + RepoType, ) from .http import fetchClassicURL from ..common import ( ContentKind, - RemoteRepo, - RepoType, URIWithMetadata, ) @@ -120,14 +116,14 @@ def description(self) -> "str": def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": return tuple() - def doMaterializeRepo( + def materialize_repo( self, repoURL: "RepoURL", repoTag: "Optional[RepoTag]" = None, repo_tag_destdir: "Optional[AbsPath]" = None, base_repo_destdir: "Optional[AbsPath]" = None, doUpdate: "Optional[bool]" = True, - ) -> "Tuple[AbsPath, RepoDesc, Sequence[URIWithMetadata]]": + ) -> "Tuple[AbsPath, RemoteRepo, Sequence[URIWithMetadata]]": # If we are here is because the repo is valid # as it should have been checked by guess_swh_repo_params @@ -480,15 +476,16 @@ def doMaterializeRepo( f"Unexpected Software Heritage object type {object_type} for {repoURL}" ) - repo_desc: "RepoDesc" = { - "repo": repoURL, - "tag": repoTag, - "checkout": cast("RepoTag", repo_effective_checkout), - } + remote_repo = RemoteRepo( + repo_url=repoURL, + tag=repoTag, + repo_type=RepoType.SoftwareHeritage, + checkout=cast("RepoTag", repo_effective_checkout), + ) return ( repo_tag_destdir, - repo_desc, + remote_repo, metadata_array, ) @@ -517,7 +514,7 @@ def fetch( repoRelPath = None # It is materialized in a temporary location - repo_tag_destdir, repo_desc, metadata_array = self.doMaterializeRepo( + repo_tag_destdir, remote_repo, metadata_array = self.materialize_repo( cast("RepoURL", remote_file) ) @@ -533,7 +530,7 @@ def fetch( # This is to remove spurious detections repoRelPath = None - repo_desc["relpath"] 
= cast("RelPath", repoRelPath) + remote_repo = remote_repo._replace(rel_path=cast("RelPath", repoRelPath)) if os.path.isdir(cachedContentPath): kind = ContentKind.Directory @@ -547,6 +544,9 @@ def fetch( # shutil.move(cachedContentPath, cachedFilename) link_or_copy(cast("AnyPath", cachedContentPath), cachedFilename) + repo_desc: "Optional[Mapping[str, Any]]" = remote_repo.gen_repo_desc() + if repo_desc is None: + repo_desc = {} augmented_metadata_array = [ URIWithMetadata( uri=remote_file, metadata=repo_desc, preferredName=preferredName diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index 35ffbfb6..d3d3fef3 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -63,7 +63,6 @@ MaterializedOutput, ProgsMapping, RelPath, - RemoteRepo, RepoTag, RepoURL, StagedExecution, @@ -80,6 +79,10 @@ ProcessorArchitecture, ) + from .fetchers import ( + RemoteRepo, + ) + from .workflow_engines import ( MaterializedWorkflowEngine, WorkflowType, diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 1e1b6f12..48c249b0 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -86,7 +86,6 @@ from ..common import ( ContainerType, ContentKind, - RemoteRepo, ) from ..container_factories import ( @@ -98,6 +97,10 @@ stringifyDigest, ) +from ..fetchers import ( + RemoteRepo, +) + class ContainerTypeMetadata(NamedTuple): sa_id: "str" diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 6e4fa770..10d68ff1 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -67,8 +67,6 @@ DEFAULT_PROGS, LicensedURI, MaterializedContent, - RemoteRepo, - RepoType, URIWithMetadata, ) @@ -123,6 +121,8 @@ AbstractStatefulFetcher, DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher, + RemoteRepo, + RepoType, ) from .fetchers.git import ( @@ -2393,7 +2393,7 @@ def _doMaterializeGitRepo( :return: """ gitFetcherInst = self.instantiateStatefulFetcher(GitFetcher) - repoDir, repo_desc, metadata_array = gitFetcherInst.doMaterializeRepo( + repoDir, materialized_repo, metadata_array = gitFetcherInst.materialize_repo( repo.repo_url, repoTag=repo.tag, doUpdate=doUpdate, @@ -2410,6 +2410,9 @@ def _doMaterializeGitRepo( if repo.tag is not None: remote_url += "@" + repo.tag + repo_desc: "Optional[Mapping[str, Any]]" = materialized_repo.gen_repo_desc() + if repo_desc is None: + repo_desc = {} augmented_metadata_array = [ URIWithMetadata( uri=cast("URIType", remote_url), @@ -2419,7 +2422,7 @@ def _doMaterializeGitRepo( ] return ( cast("URIType", remote_url), - repo_desc["checkout"], + materialized_repo.get_checkout(), repoDir, augmented_metadata_array, ) @@ -2437,12 +2440,15 @@ def _doMaterializeSoftwareHeritageDirOrContent( :return: """ swhFetcherInst = self.instantiateStatefulFetcher(SoftwareHeritageFetcher) - repoDir, repo_desc, metadata_array = swhFetcherInst.doMaterializeRepo( + repoDir, materialized_repo, metadata_array = swhFetcherInst.materialize_repo( cast("RepoURL", repo.tag) if repo.tag is not None else repo.repo_url, doUpdate=doUpdate, base_repo_destdir=self.cacheWorkflowDir, ) + repo_desc: "Optional[Mapping[str, Any]]" = materialized_repo.gen_repo_desc() + if repo_desc is None: + repo_desc = {} augmented_metadata_array = [ URIWithMetadata( uri=cast("URIType", repo.repo_url), @@ -2452,7 +2458,7 @@ def _doMaterializeSoftwareHeritageDirOrContent( ] return ( repo.repo_url, - repo_desc["checkout"], + materialized_repo.get_checkout(), repoDir, augmented_metadata_array, ) diff --git 
a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 2c6b666b..8324c278 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -57,6 +57,9 @@ from .fetchers import ( FetcherException, + # Next ones are needed for correct unmarshalling + RemoteRepo, + RepoType, ) from .utils.orcid import ( @@ -276,7 +279,6 @@ MarshallingStatus, MaterializedContent, MaterializedInput, - RemoteRepo, StagedSetup, ) From c1f8fdbf436912a3faeee90b5e54369836e2e919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 22 May 2024 18:04:23 +0200 Subject: [PATCH 36/42] Added `GuessRepoParams` class method to `wfexs_backend.fetchers.AbstractRepoFetcher` Also, refactored previous implementations in wfexs_backend.fetchers.git and wfexs_backend.fetchers.swh under GitFetcher and SoftwareHeritageFetcher. This work is needed to allow future modular repository providers. --- tests/fetchers/test_git.py | 4 +- wfexs_backend/fetchers/__init__.py | 10 + wfexs_backend/fetchers/git.py | 721 +++++++++++++++-------------- wfexs_backend/fetchers/swh.py | 214 ++++----- wfexs_backend/wfexs_backend.py | 6 +- 5 files changed, 489 insertions(+), 466 deletions(-) diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py index daa3bc0d..37d80bc8 100644 --- a/tests/fetchers/test_git.py +++ b/tests/fetchers/test_git.py @@ -39,7 +39,7 @@ RepoGuessFlavor, RepoType, ) -from wfexs_backend.fetchers.git import guess_git_repo_params +from wfexs_backend.fetchers.git import GitFetcher WfExS_basedir = Path(__file__).parent.parent WfExS_basedir_file_uri = WfExS_basedir.as_uri() @@ -194,7 +194,7 @@ ) def test_guess_git_repo_params(url: "str", expected: "RemoteRepo") -> "None": logger = logging.Logger("name") - output = guess_git_repo_params(cast("URIType", url), logger=logger) + output = GitFetcher.GuessRepoParams(cast("URIType", url), logger=logger) # When no tag is given, ignore what it was discovered if output is not None and expected is not None and expected.tag is None: diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index a43e52c9..a71ee483 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -306,6 +306,16 @@ def materialize_repo( ) -> "Tuple[AbsPath, RemoteRepo, Sequence[URIWithMetadata]]": pass + @classmethod + @abc.abstractmethod + def GuessRepoParams( + cls, + orig_wf_url: "Union[URIType, parse.ParseResult]", + logger: "Optional[logging.Logger]" = None, + fail_ok: "bool" = False, + ) -> "Optional[RemoteRepo]": + pass + class AbstractStatefulStreamingFetcher(AbstractStatefulFetcher): def fetch( diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index dbc7810a..aed589bc 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -94,6 +94,11 @@ class GitFetcher(AbstractRepoFetcher): GITHUB_SCHEME: "Final[str]" = "github" DEFAULT_GIT_CMD: "Final[SymbolicName]" = cast("SymbolicName", "git") + HEAD_LABEL: "Final[bytes]" = b"HEAD" + REFS_HEADS_PREFIX: "Final[bytes]" = b"refs/heads/" + REFS_TAGS_PREFIX: "Final[bytes]" = b"refs/tags/" + GIT_SCHEMES: "Final[Sequence[str]]" = ["https", "git", "ssh", "file"] + def __init__( self, progs: "ProgsMapping", setup_block: "Optional[Mapping[str, Any]]" = None ): @@ -127,6 +132,369 @@ def description(self) -> "str": def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": return (cls.DEFAULT_GIT_CMD,) + @classmethod + def _find_git_repo_in_uri( + cls, + remote_file: "Union[URIType, parse.ParseResult]", + ) -> 
"Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]": + if isinstance(remote_file, parse.ParseResult): + parsedInputURL = remote_file + else: + parsedInputURL = parse.urlparse(remote_file) + sp_path = parsedInputURL.path.split("/") + + shortest_pre_path: "Optional[URIType]" = None + longest_post_path: "Optional[Sequence[str]]" = None + repo_type: "Optional[RepoType]" = None + guessed_repo_flavor: "Optional[RepoGuessFlavor]" = None + the_remote_uri: "Optional[str]" = None + b_default_repo_tag: "Optional[str]" = None + repo_branches: "Optional[MutableSequence[RepoTag]]" = None + for pos in range(len(sp_path), 0, -1): + pre_path = "/".join(sp_path[:pos]) + if pre_path == "": + pre_path = "/" + remote_uri_anc = parse.urlunparse(parsedInputURL._replace(path=pre_path)) + + remote_refs_dict: "Mapping[bytes, bytes]" + try: + # Dulwich works both with file, ssh, git and http(s) protocols + remote_refs_dict = dulwich.porcelain.ls_remote(remote_uri_anc) + repo_type = RepoType.Git + except ( + dulwich.errors.NotGitRepository, + dulwich.errors.GitProtocolError, + ) as ngr: + # Skip and continue + continue + + the_remote_uri = remote_uri_anc + + head_remote_ref = remote_refs_dict[cls.HEAD_LABEL] + repo_branches = [] + b_default_repo_tag = None + for remote_label, remote_ref in remote_refs_dict.items(): + if remote_label.startswith(cls.REFS_HEADS_PREFIX): + b_repo_tag = remote_label[len(cls.REFS_HEADS_PREFIX) :].decode( + "utf-8", errors="continue" + ) + repo_branches.append(cast("RepoTag", b_repo_tag)) + if b_default_repo_tag is None and remote_ref == head_remote_ref: + b_default_repo_tag = b_repo_tag + + # It is considered a git repo! + shortest_pre_path = cast("URIType", pre_path) + longest_post_path = sp_path[pos:] + if repo_type is None: + # Metadata is all we really need + repo_type = RepoType.Raw + req = request.Request(remote_uri_anc, method="HEAD") + try: + with request.urlopen(req) as resp: + # Is it gitlab? 
+ if list( + filter( + lambda c: "gitlab" in c, + resp.headers.get_all("Set-Cookie"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.GitLab + elif list( + filter( + lambda c: GITHUB_NETLOC in c, + resp.headers.get_all("Set-Cookie"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.GitHub + elif list( + filter( + lambda c: "bitbucket" in c, + resp.headers.get_all("X-View-Name"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.BitBucket + except Exception as e: + pass + + if repo_type is None: + raise RepoGuessException(f"Unable to identify {remote_file} as a git repo") + + if b_default_repo_tag is None: + raise RepoGuessException( + f"No tag was obtained while getting default branch name from {remote_file}" + ) + + assert longest_post_path is not None + assert repo_branches is not None + + repo = RemoteRepo( + repo_url=cast("RepoURL", the_remote_uri), + tag=cast("RepoTag", b_default_repo_tag), + repo_type=repo_type, + guess_flavor=guessed_repo_flavor, + ) + return repo, longest_post_path, repo_branches + + @classmethod + def GuessRepoParams( + cls, + wf_url: "Union[URIType, parse.ParseResult]", + logger: "Optional[logging.Logger]" = None, + fail_ok: "bool" = False, + ) -> "Optional[RemoteRepo]": + repoURL = None + repoTag = None + repoRelPath = None + repoType: "Optional[RepoType]" = None + guessedRepoFlavor: "Optional[RepoGuessFlavor]" = None + web_url: "Optional[URIType]" = None + + # Deciding which is the input + if isinstance(wf_url, parse.ParseResult): + parsed_wf_url = wf_url + else: + parsed_wf_url = parse.urlparse(wf_url) + + # These are the usual URIs which can be understood by pip + # See https://pip.pypa.io/en/stable/cli/pip_install/#git + found_params: "Optional[Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]]" = ( + None + ) + try: + if parsed_wf_url.scheme == cls.GITHUB_SCHEME: + repoType = RepoType.Git + guessedRepoFlavor = RepoGuessFlavor.GitHub + + gh_path_split = parsed_wf_url.path.split("/") + gh_path = "/".join(gh_path_split[:2]) + gh_post_path = list(map(parse.unquote_plus, gh_path_split[2:])) + if len(gh_post_path) > 0: + repoTag = gh_post_path[0] + if len(gh_post_path) > 1: + repoRelPath = "/".join(gh_post_path[1:]) + + repoURL = parse.urlunparse( + parse.ParseResult( + scheme="https", + netloc=GITHUB_NETLOC, + path=gh_path, + params="", + query="", + fragment="", + ) + ) + found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) + + elif ( + parsed_wf_url.scheme in ("http", "https") + and parsed_wf_url.netloc == GITHUB_NETLOC + and "@" not in parsed_wf_url.path + and parsed_wf_url.fragment == "" + ): + found_params = cls._find_git_repo_in_uri(parsed_wf_url) + repoURL = found_params[0].repo_url + repoType = RepoType.Git + guessedRepoFlavor = RepoGuessFlavor.GitHub + + # And now, guessing the tag and the relative path + # WARNING! 
This code can have problems with tags which contain slashes + wf_path = found_params[1] + repo_branches_tags = found_params[2] + if len(wf_path) > 1 and (wf_path[0] in ("blob", "tree")): + wf_path_tag = list(map(parse.unquote_plus, wf_path[1:])) + + tag_relpath = "/".join(wf_path_tag) + for repo_branch_tag in repo_branches_tags: + if repo_branch_tag == tag_relpath or tag_relpath.startswith( + repo_branch_tag + "/" + ): + repoTag = repo_branch_tag + if len(tag_relpath) > len(repo_branch_tag): + tag_relpath = tag_relpath[len(repo_branch_tag) + 1 :] + if len(tag_relpath) > 0: + repoRelPath = tag_relpath + break + else: + # Fallback + repoTag = wf_path_tag[0] + if len(wf_path_tag) > 0: + repoRelPath = "/".join(wf_path_tag[1:]) + elif ( + parsed_wf_url.scheme in ("http", "https") + and parsed_wf_url.netloc == "raw.githubusercontent.com" + ): + repoType = RepoType.Git + guessedRepoFlavor = RepoGuessFlavor.GitHub + wf_path = list(map(parse.unquote_plus, parsed_wf_url.path.split("/"))) + if len(wf_path) >= 3: + # Rebuilding it + repoGitPath = wf_path[:3] + repoGitPath[-1] += ".git" + + # Rebuilding repo git path + repoURL = parse.urlunparse( + ("https", GITHUB_NETLOC, "/".join(repoGitPath), "", "", "") + ) + + # And now, guessing the tag/checkout and the relative path + # WARNING! This code can have problems with tags which contain slashes + found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) + if len(wf_path) >= 4: + repo_branches_tags = found_params[2] + # Validate against existing branch and tag names + tag_relpath = "/".join(wf_path[3:]) + for repo_branch_tag in repo_branches_tags: + if ( + repo_branch_tag == tag_relpath + or tag_relpath.startswith(repo_branch_tag + "/") + ): + repoTag = repo_branch_tag + if len(tag_relpath) > len(repo_branch_tag): + tag_relpath = tag_relpath[ + len(repo_branch_tag) + 1 : + ] + if len(tag_relpath) > 0: + repoRelPath = tag_relpath + break + else: + # Fallback + repoTag = wf_path[3] + if len(wf_path) > 4: + repoRelPath = "/".join(wf_path[4:]) + elif ( + parsed_wf_url.scheme == "" + or (parsed_wf_url.scheme in cls.GetSchemeHandlers()) + or (parsed_wf_url.scheme in cls.GIT_SCHEMES) + ): + if parsed_wf_url.scheme == "": + # It could be a checkout uri in the form of 'git@github.com:inab/WfExS-backend.git' + if ( + parsed_wf_url.netloc == "" + and ("@" in parsed_wf_url.path) + and (":" in parsed_wf_url.path) + ): + gitScheme = "ssh" + parsed_wf_url = parse.urlparse( + f"{gitScheme}://" + + parse.urlunparse(parsed_wf_url).replace(":", "/") + ) + else: + if logger is not None: + logger.debug( + f"No scheme in repo URL. Choices are: {', '.join(cls.GIT_SCHEMES)}" + ) + return None + # Getting the scheme git is going to understand + elif parsed_wf_url.scheme.startswith(cls.GIT_PROTO_PREFIX): + gitScheme = parsed_wf_url.scheme[len(cls.GIT_PROTO_PREFIX) :] + denorm_parsed_wf_url = parsed_wf_url._replace(scheme=gitScheme) + parsed_wf_url = parse.urlparse( + parse.urlunparse(denorm_parsed_wf_url) + ) + else: + gitScheme = parsed_wf_url.scheme + + if gitScheme not in cls.GIT_SCHEMES: + if logger is not None: + logger.debug( + f"Unknown scheme {gitScheme} in repo URL. Choices are: {', '.join(cls.GIT_SCHEMES)}" + ) + return None + + # Beware ssh protocol!!!! 
It has a corner case with URLs like + # ssh://git@github.com:inab/WfExS-backend.git' + if parsed_wf_url.scheme == "ssh" and ":" in parsed_wf_url.netloc: + new_netloc = parsed_wf_url.netloc + # Translating it to something better + colon_pos = new_netloc.rfind(":") + new_netloc = ( + new_netloc[:colon_pos] + "/" + new_netloc[colon_pos + 1 :] + ) + denorm_parsed_wf_url = parsed_wf_url._replace(netloc=new_netloc) + parsed_wf_url = parse.urlparse( + parse.urlunparse(denorm_parsed_wf_url) + ) + + # Getting the tag or branch + if "@" in parsed_wf_url.path: + gitPath, repoTag = parsed_wf_url.path.split("@", 1) + else: + gitPath = parsed_wf_url.path + + # Getting the repoRelPath (if available) + if len(parsed_wf_url.fragment) > 0: + frag_qs = parse.parse_qs(parsed_wf_url.fragment) + subDirArr = frag_qs.get("subdirectory", []) + if len(subDirArr) > 0: + repoRelPath = subDirArr[0] + + # Now, reassemble the repoURL + repoURL = parse.urlunparse( + (gitScheme, parsed_wf_url.netloc, gitPath, "", "", "") + ) + found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) + guessedRepoFlavor = found_params[0].guess_flavor + # TODO: handle other popular cases, like bitbucket + else: + found_params = cls._find_git_repo_in_uri(parsed_wf_url) + + except RepoGuessException as gge: + if not fail_ok: + import traceback + + traceback.print_exc() + raise FetcherException( + f"FIXME: Unsupported http(s) git repository {wf_url} (see cascade exception)" + ) from gge + + if found_params is not None: + if repoTag is None: + repoTag = found_params[0].tag + repoType = found_params[0].repo_type + if guessedRepoFlavor is None: + guessedRepoFlavor = found_params[0].guess_flavor + elif not fail_ok: + raise FetcherException( + f"FIXME: Unsupported git repository {wf_url}. (Is it really a git repo???)" + ) + + if logger is not None: + logger.debug( + "From {} was derived (type {}, flavor {}) {} {} {}".format( + wf_url, repoType, guessedRepoFlavor, repoURL, repoTag, repoRelPath + ) + ) + + if repoURL is None: + return None + + # if repoType == RepoType.GitHub: + # wf_entrypoint_path = [ + # + # ] + # web_url = urllib.parse.urlunparse( + # ( + # "https", + # "raw.githubusercontent.com", + # "/".join(wf_entrypoint_path), + # "", + # "", + # "", + # ) + # ) + + return RemoteRepo( + repo_url=cast("RepoURL", repoURL), + tag=cast("Optional[RepoTag]", repoTag), + rel_path=cast("Optional[RelPath]", repoRelPath), + repo_type=repoType, + guess_flavor=guessedRepoFlavor, + web_url=web_url, + ) + def materialize_repo( self, repoURL: "RepoURL", @@ -403,356 +771,3 @@ def fetch( # TODO: Identify licences in git repositories?? 
licences=None, ) - - -HEAD_LABEL = b"HEAD" -REFS_HEADS_PREFIX = b"refs/heads/" -REFS_TAGS_PREFIX = b"refs/tags/" -GIT_SCHEMES = ["https", "git", "ssh", "file"] - - -def guess_git_repo_params( - wf_url: "Union[URIType, parse.ParseResult]", - logger: "logging.Logger", - fail_ok: "bool" = False, -) -> "Optional[RemoteRepo]": - repoURL = None - repoTag = None - repoRelPath = None - repoType: "Optional[RepoType]" = None - guessedRepoFlavor: "Optional[RepoGuessFlavor]" = None - web_url: "Optional[URIType]" = None - - # Deciding which is the input - if isinstance(wf_url, parse.ParseResult): - parsed_wf_url = wf_url - else: - parsed_wf_url = parse.urlparse(wf_url) - - # These are the usual URIs which can be understood by pip - # See https://pip.pypa.io/en/stable/cli/pip_install/#git - found_params: "Optional[Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]]" = None - try: - if parsed_wf_url.scheme == GitFetcher.GITHUB_SCHEME: - repoType = RepoType.Git - guessedRepoFlavor = RepoGuessFlavor.GitHub - - gh_path_split = parsed_wf_url.path.split("/") - gh_path = "/".join(gh_path_split[:2]) - gh_post_path = list(map(parse.unquote_plus, gh_path_split[2:])) - if len(gh_post_path) > 0: - repoTag = gh_post_path[0] - if len(gh_post_path) > 1: - repoRelPath = "/".join(gh_post_path[1:]) - - repoURL = parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=GITHUB_NETLOC, - path=gh_path, - params="", - query="", - fragment="", - ) - ) - found_params = find_git_repo_in_uri(cast("URIType", repoURL)) - - elif ( - parsed_wf_url.scheme in ("http", "https") - and parsed_wf_url.netloc == GITHUB_NETLOC - and "@" not in parsed_wf_url.path - and parsed_wf_url.fragment == "" - ): - found_params = find_git_repo_in_uri(parsed_wf_url) - repoURL = found_params[0].repo_url - repoType = RepoType.Git - guessedRepoFlavor = RepoGuessFlavor.GitHub - - # And now, guessing the tag and the relative path - # WARNING! This code can have problems with tags which contain slashes - wf_path = found_params[1] - repo_branches_tags = found_params[2] - if len(wf_path) > 1 and (wf_path[0] in ("blob", "tree")): - wf_path_tag = list(map(parse.unquote_plus, wf_path[1:])) - - tag_relpath = "/".join(wf_path_tag) - for repo_branch_tag in repo_branches_tags: - if repo_branch_tag == tag_relpath or tag_relpath.startswith( - repo_branch_tag + "/" - ): - repoTag = repo_branch_tag - if len(tag_relpath) > len(repo_branch_tag): - tag_relpath = tag_relpath[len(repo_branch_tag) + 1 :] - if len(tag_relpath) > 0: - repoRelPath = tag_relpath - break - else: - # Fallback - repoTag = wf_path_tag[0] - if len(wf_path_tag) > 0: - repoRelPath = "/".join(wf_path_tag[1:]) - elif ( - parsed_wf_url.scheme in ("http", "https") - and parsed_wf_url.netloc == "raw.githubusercontent.com" - ): - repoType = RepoType.Git - guessedRepoFlavor = RepoGuessFlavor.GitHub - wf_path = list(map(parse.unquote_plus, parsed_wf_url.path.split("/"))) - if len(wf_path) >= 3: - # Rebuilding it - repoGitPath = wf_path[:3] - repoGitPath[-1] += ".git" - - # Rebuilding repo git path - repoURL = parse.urlunparse( - ("https", GITHUB_NETLOC, "/".join(repoGitPath), "", "", "") - ) - - # And now, guessing the tag/checkout and the relative path - # WARNING! 
This code can have problems with tags which contain slashes - found_params = find_git_repo_in_uri(cast("URIType", repoURL)) - if len(wf_path) >= 4: - repo_branches_tags = found_params[2] - # Validate against existing branch and tag names - tag_relpath = "/".join(wf_path[3:]) - for repo_branch_tag in repo_branches_tags: - if repo_branch_tag == tag_relpath or tag_relpath.startswith( - repo_branch_tag + "/" - ): - repoTag = repo_branch_tag - if len(tag_relpath) > len(repo_branch_tag): - tag_relpath = tag_relpath[len(repo_branch_tag) + 1 :] - if len(tag_relpath) > 0: - repoRelPath = tag_relpath - break - else: - # Fallback - repoTag = wf_path[3] - if len(wf_path) > 4: - repoRelPath = "/".join(wf_path[4:]) - elif ( - parsed_wf_url.scheme == "" - or (parsed_wf_url.scheme in GitFetcher.GetSchemeHandlers()) - or (parsed_wf_url.scheme in GIT_SCHEMES) - ): - if parsed_wf_url.scheme == "": - # It could be a checkout uri in the form of 'git@github.com:inab/WfExS-backend.git' - if ( - parsed_wf_url.netloc == "" - and ("@" in parsed_wf_url.path) - and (":" in parsed_wf_url.path) - ): - gitScheme = "ssh" - parsed_wf_url = parse.urlparse( - f"{gitScheme}://" - + parse.urlunparse(parsed_wf_url).replace(":", "/") - ) - else: - logger.debug( - f"No scheme in repo URL. Choices are: {', '.join(GIT_SCHEMES)}" - ) - return None - # Getting the scheme git is going to understand - elif parsed_wf_url.scheme.startswith(GitFetcher.GIT_PROTO_PREFIX): - gitScheme = parsed_wf_url.scheme[len(GitFetcher.GIT_PROTO_PREFIX) :] - denorm_parsed_wf_url = parsed_wf_url._replace(scheme=gitScheme) - parsed_wf_url = parse.urlparse(parse.urlunparse(denorm_parsed_wf_url)) - else: - gitScheme = parsed_wf_url.scheme - - if gitScheme not in GIT_SCHEMES: - logger.debug( - f"Unknown scheme {gitScheme} in repo URL. Choices are: {', '.join(GIT_SCHEMES)}" - ) - return None - - # Beware ssh protocol!!!! 
I has a corner case with URLs like - # ssh://git@github.com:inab/WfExS-backend.git' - if parsed_wf_url.scheme == "ssh" and ":" in parsed_wf_url.netloc: - new_netloc = parsed_wf_url.netloc - # Translating it to something better - colon_pos = new_netloc.rfind(":") - new_netloc = new_netloc[:colon_pos] + "/" + new_netloc[colon_pos + 1 :] - denorm_parsed_wf_url = parsed_wf_url._replace(netloc=new_netloc) - parsed_wf_url = parse.urlparse(parse.urlunparse(denorm_parsed_wf_url)) - - # Getting the tag or branch - if "@" in parsed_wf_url.path: - gitPath, repoTag = parsed_wf_url.path.split("@", 1) - else: - gitPath = parsed_wf_url.path - - # Getting the repoRelPath (if available) - if len(parsed_wf_url.fragment) > 0: - frag_qs = parse.parse_qs(parsed_wf_url.fragment) - subDirArr = frag_qs.get("subdirectory", []) - if len(subDirArr) > 0: - repoRelPath = subDirArr[0] - - # Now, reassemble the repoURL - repoURL = parse.urlunparse( - (gitScheme, parsed_wf_url.netloc, gitPath, "", "", "") - ) - found_params = find_git_repo_in_uri(cast("URIType", repoURL)) - guessedRepoFlavor = found_params[0].guess_flavor - # TODO handling other popular cases, like bitbucket - else: - found_params = find_git_repo_in_uri(parsed_wf_url) - - except RepoGuessException as gge: - if not fail_ok: - import traceback - - traceback.print_exc() - raise FetcherException( - f"FIXME: Unsupported http(s) git repository {wf_url} (see cascade exception)" - ) from gge - - if found_params is not None: - if repoTag is None: - repoTag = found_params[0].tag - repoType = found_params[0].repo_type - if guessedRepoFlavor is None: - guessedRepoFlavor = found_params[0].guess_flavor - elif not fail_ok: - raise FetcherException( - f"FIXME: Unsupported git repository {wf_url}. (Is it really a git repo???)" - ) - - logger.debug( - "From {} was derived (type {}, flavor {}) {} {} {}".format( - wf_url, repoType, guessedRepoFlavor, repoURL, repoTag, repoRelPath - ) - ) - - if repoURL is None: - return None - - # if repoType == RepoType.GitHub: - # wf_entrypoint_path = [ - # - # ] - # web_url = urllib.parse.urlunparse( - # ( - # "https", - # "raw.githubusercontent.com", - # "/".join(wf_entrypoint_path), - # "", - # "", - # "", - # ) - # ) - - return RemoteRepo( - repo_url=cast("RepoURL", repoURL), - tag=cast("Optional[RepoTag]", repoTag), - rel_path=cast("Optional[RelPath]", repoRelPath), - repo_type=repoType, - guess_flavor=guessedRepoFlavor, - web_url=web_url, - ) - - -def find_git_repo_in_uri( - remote_file: "Union[URIType, parse.ParseResult]", -) -> "Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]": - if isinstance(remote_file, parse.ParseResult): - parsedInputURL = remote_file - else: - parsedInputURL = parse.urlparse(remote_file) - sp_path = parsedInputURL.path.split("/") - - shortest_pre_path: "Optional[URIType]" = None - longest_post_path: "Optional[Sequence[str]]" = None - repo_type: "Optional[RepoType]" = None - guessed_repo_flavor: "Optional[RepoGuessFlavor]" = None - the_remote_uri: "Optional[str]" = None - b_default_repo_tag: "Optional[str]" = None - repo_branches: "Optional[MutableSequence[RepoTag]]" = None - for pos in range(len(sp_path), 0, -1): - pre_path = "/".join(sp_path[:pos]) - if pre_path == "": - pre_path = "/" - remote_uri_anc = parse.urlunparse(parsedInputURL._replace(path=pre_path)) - - remote_refs_dict: "Mapping[bytes, bytes]" - try: - # Dulwich works both with file, ssh, git and http(s) protocols - remote_refs_dict = dulwich.porcelain.ls_remote(remote_uri_anc) - repo_type = RepoType.Git - except ( - 
dulwich.errors.NotGitRepository, - dulwich.errors.GitProtocolError, - ) as ngr: - # Skip and continue - continue - - the_remote_uri = remote_uri_anc - - head_remote_ref = remote_refs_dict[HEAD_LABEL] - repo_branches = [] - b_default_repo_tag = None - for remote_label, remote_ref in remote_refs_dict.items(): - if remote_label.startswith(REFS_HEADS_PREFIX): - b_repo_tag = remote_label[len(REFS_HEADS_PREFIX) :].decode( - "utf-8", errors="continue" - ) - repo_branches.append(cast("RepoTag", b_repo_tag)) - if b_default_repo_tag is None and remote_ref == head_remote_ref: - b_default_repo_tag = b_repo_tag - - # It is considered a git repo! - shortest_pre_path = cast("URIType", pre_path) - longest_post_path = sp_path[pos:] - if repo_type is None: - # Metadata is all we really need - repo_type = RepoType.Raw - req = request.Request(remote_uri_anc, method="HEAD") - try: - with request.urlopen(req) as resp: - # Is it gitlab? - if list( - filter( - lambda c: "gitlab" in c, - resp.headers.get_all("Set-Cookie"), - ) - ): - repo_type = RepoType.Git - guessed_repo_flavor = RepoGuessFlavor.GitLab - elif list( - filter( - lambda c: GITHUB_NETLOC in c, - resp.headers.get_all("Set-Cookie"), - ) - ): - repo_type = RepoType.Git - guessed_repo_flavor = RepoGuessFlavor.GitHub - elif list( - filter( - lambda c: "bitbucket" in c, - resp.headers.get_all("X-View-Name"), - ) - ): - repo_type = RepoType.Git - guessed_repo_flavor = RepoGuessFlavor.BitBucket - except Exception as e: - pass - - if repo_type is None: - raise RepoGuessException(f"Unable to identify {remote_file} as a git repo") - - if b_default_repo_tag is None: - raise RepoGuessException( - f"No tag was obtained while getting default branch name from {remote_file}" - ) - - assert longest_post_path is not None - assert repo_branches is not None - - repo = RemoteRepo( - repo_url=cast("RepoURL", the_remote_uri), - tag=cast("RepoTag", b_default_repo_tag), - repo_type=repo_type, - guess_flavor=guessed_repo_flavor, - ) - return repo, longest_post_path, repo_branches diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index e12d6976..d80bf904 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -116,6 +116,110 @@ def description(self) -> "str": def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": return tuple() + @classmethod + def _resolve_swh_id( + cls, + the_id: "URIType", + ) -> "Tuple[Mapping[str, Any], MutableSequence[URIWithMetadata]]": + # ## Use the resolver, see https://archive.softwareheritage.org/api/1/resolve/doc/ + # curl -H "Accept: application/json" https://archive.softwareheritage.org/api/1/resolve/swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c/ + # The service does not work with quoted identifiers, neither with + # fully unquoted identifiers. Only the semicolons have to be + # substituted + swh_quoted_id = the_id.replace(";", parse.quote(";")) + resio = io.BytesIO() + # urljoin cannot be used due working with URIs + resolve_uri = cast("URIType", cls.SWH_API_REST_RESOLVE + swh_quoted_id + "/") + try: + _, metaresio, _ = fetchClassicURL( + resolve_uri, + resio, + secContext={ + "headers": { + "Accept": "application/json", + }, + }, + ) + res_doc = json.loads(resio.getvalue().decode("utf-8")) + except Exception as e: + raise FetcherException(f"HTTP REST call {resolve_uri} failed") from e + + if not isinstance(res_doc, dict): + raise FetcherException(f"{the_id} is not valid. 
Message: {res_doc}") + + gathered_meta = { + "fetched": resolve_uri, + "payload": res_doc, + } + metadata_array = [ + URIWithMetadata( + uri=the_id, + metadata=gathered_meta, + ) + ] + metadata_array.extend(metaresio) + + return res_doc, metadata_array + + @classmethod + def GuessRepoParams( + cls, + orig_wf_url: "Union[URIType, parse.ParseResult]", + logger: "Optional[logging.Logger]" = None, + fail_ok: "bool" = False, + ) -> "Optional[RemoteRepo]": + # Deciding which is the input + wf_url: "RepoURL" + parsed_wf_url: "parse.ParseResult" + if isinstance(orig_wf_url, parse.ParseResult): + parsed_wf_url = orig_wf_url + wf_url = cast("RepoURL", parse.urlunparse(orig_wf_url)) + else: + wf_url = cast("RepoURL", orig_wf_url) + parsed_wf_url = parse.urlparse(orig_wf_url) + + if parsed_wf_url.scheme not in cls.GetSchemeHandlers(): + return None + + # ## Check against Software Heritage the validity of the id + # echo '["swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c"]' | curl -H "Content-Type: application/json" -T - -X POST https://archive.softwareheritage.org/api/1/known/ + putative_core_swhid = wf_url.split(";", 1)[0] + try: + valio = io.BytesIO() + _, metavalio, _ = fetchClassicURL( + cls.SWH_API_REST_KNOWN, + valio, + secContext={ + "headers": { + "Content-Type": "application/json", + }, + "method": "POST", + # Only core SWHids are accepted + "data": json.dumps([putative_core_swhid]).encode("utf-8"), + }, + ) + val_doc = json.loads(valio.getvalue().decode("utf-8")) + except Exception as e: + if fail_ok: + return None + raise + + # It could be a valid swh identifier, but it is not registered + if not isinstance(val_doc, dict) or not val_doc.get( + putative_core_swhid, {} + ).get("known", False): + return None + + # Now we are sure it is known, let's learn the web url to browse it + resolved_payload, _ = cls._resolve_swh_id(wf_url) + web_url = resolved_payload["browse_url"] + return RemoteRepo( + repo_url=wf_url, + tag=cast("RepoTag", putative_core_swhid), + repo_type=RepoType.SoftwareHeritage, + web_url=web_url, + ) + def materialize_repo( self, repoURL: "RepoURL", @@ -125,14 +229,14 @@ def materialize_repo( doUpdate: "Optional[bool]" = True, ) -> "Tuple[AbsPath, RemoteRepo, Sequence[URIWithMetadata]]": # If we are here is because the repo is valid - # as it should have been checked by guess_swh_repo_params + # as it should have been checked by GuessRepoParams # ## Use the resolver, see https://archive.softwareheritage.org/api/1/resolve/doc/ # curl -H "Accept: application/json" https://archive.softwareheritage.org/api/1/resolve/swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c/ # The service does not work with quoted identifiers, neither with # fully unquoted identifiers. Only the semicolons have to be # substituted - res_doc, metadata_array = resolve_swh_id(repoURL) + res_doc, metadata_array = self._resolve_swh_id(repoURL) # Error handling if "exception" in res_doc: @@ -146,7 +250,7 @@ def materialize_repo( if object_type == "content": anchor = res_doc.get("metadata", {}).get("anchor") if anchor is not None: - anc_res_doc, anchor_metadata_array = resolve_swh_id(anchor) + anc_res_doc, anchor_metadata_array = self._resolve_swh_id(anchor) metadata_array.extend(anchor_metadata_array) # Now, truly yes the context @@ -559,107 +663,3 @@ def fetch( # TODO: Integrate licences from swh report?? 
licences=None, ) - - -def resolve_swh_id( - the_id: "URIType", -) -> "Tuple[Mapping[str, Any], MutableSequence[URIWithMetadata]]": - # ## Use the resolver, see https://archive.softwareheritage.org/api/1/resolve/doc/ - # curl -H "Accept: application/json" https://archive.softwareheritage.org/api/1/resolve/swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c/ - # The service does not work with quoted identifiers, neither with - # fully unquoted identifiers. Only the semicolons have to be - # substituted - swh_quoted_id = the_id.replace(";", parse.quote(";")) - resio = io.BytesIO() - # urljoin cannot be used due working with URIs - resolve_uri = cast( - "URIType", SoftwareHeritageFetcher.SWH_API_REST_RESOLVE + swh_quoted_id + "/" - ) - try: - _, metaresio, _ = fetchClassicURL( - resolve_uri, - resio, - secContext={ - "headers": { - "Accept": "application/json", - }, - }, - ) - res_doc = json.loads(resio.getvalue().decode("utf-8")) - except Exception as e: - raise FetcherException(f"HTTP REST call {resolve_uri} failed") from e - - if not isinstance(res_doc, dict): - raise FetcherException(f"{the_id} is not valid. Message: {res_doc}") - - gathered_meta = { - "fetched": resolve_uri, - "payload": res_doc, - } - metadata_array = [ - URIWithMetadata( - uri=the_id, - metadata=gathered_meta, - ) - ] - metadata_array.extend(metaresio) - - return res_doc, metadata_array - - -def guess_swh_repo_params( - orig_wf_url: "Union[URIType, parse.ParseResult]", - logger: "logging.Logger", - fail_ok: "bool" = False, -) -> "Optional[RemoteRepo]": - # Deciding which is the input - wf_url: "RepoURL" - parsed_wf_url: "parse.ParseResult" - if isinstance(orig_wf_url, parse.ParseResult): - parsed_wf_url = orig_wf_url - wf_url = cast("RepoURL", parse.urlunparse(orig_wf_url)) - else: - wf_url = cast("RepoURL", orig_wf_url) - parsed_wf_url = parse.urlparse(orig_wf_url) - - if parsed_wf_url.scheme not in SoftwareHeritageFetcher.GetSchemeHandlers(): - return None - - # ## Check against Software Heritage the validity of the id - # echo '["swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c"]' | curl -H "Content-Type: application/json" -T - -X POST https://archive.softwareheritage.org/api/1/known/ - putative_core_swhid = wf_url.split(";", 1)[0] - try: - valio = io.BytesIO() - _, metavalio, _ = fetchClassicURL( - SoftwareHeritageFetcher.SWH_API_REST_KNOWN, - valio, - secContext={ - "headers": { - "Content-Type": "application/json", - }, - "method": "POST", - # Only core SWHids are accepted - "data": json.dumps([putative_core_swhid]).encode("utf-8"), - }, - ) - val_doc = json.loads(valio.getvalue().decode("utf-8")) - except Exception as e: - if fail_ok: - return None - raise - - # It could be a valid swh identifier, but it is not registered - if not isinstance(val_doc, dict) or not val_doc.get(putative_core_swhid, {}).get( - "known", False - ): - return None - - # Now we are sure it is known, let's learn the web url to browse it - resolved_payload, _ = resolve_swh_id(wf_url) - web_url = resolved_payload["browse_url"] - return RemoteRepo( - repo_url=wf_url, - tag=cast("RepoTag", putative_core_swhid), - repo_type=RepoType.SoftwareHeritage, - web_url=web_url, - ) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 10d68ff1..91ea158d 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -127,11 +127,9 @@ from .fetchers.git import ( GitFetcher, - guess_git_repo_params, ) from .fetchers.swh import ( - guess_swh_repo_params, SoftwareHeritageFetcher, ) @@ -1876,12 +1874,12 @@ 
def guess_repo_params( else: parsedRepoURL = urllib.parse.urlparse(wf_url) - remote_repo = guess_swh_repo_params( + remote_repo = SoftwareHeritageFetcher.GuessRepoParams( parsedRepoURL, logger=self.logger, fail_ok=fail_ok ) if remote_repo is None: # Assume it might be a git repo or a link to a git repo - remote_repo = guess_git_repo_params( + remote_repo = GitFetcher.GuessRepoParams( parsedRepoURL, logger=self.logger, fail_ok=fail_ok ) From b8ca7aac266bc554ac462437452be2cc7d4b7b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 May 2024 20:48:47 +0200 Subject: [PATCH 37/42] Implement build_pid_from_repo, to generate a workflow PID from the shredded details --- tests/fetchers/test_git.py | 76 ++++++++++++-- tests/fetchers/test_swh.py | 145 +++++++++++++++++++++++++++ wfexs_backend/fetchers/__init__.py | 14 +++ wfexs_backend/fetchers/git.py | 154 +++++++++++++++++++++++++++++ wfexs_backend/fetchers/swh.py | 15 +++ 5 files changed, 395 insertions(+), 9 deletions(-) create mode 100644 tests/fetchers/test_swh.py diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py index 37d80bc8..3702cc5c 100644 --- a/tests/fetchers/test_git.py +++ b/tests/fetchers/test_git.py @@ -27,6 +27,10 @@ ) if TYPE_CHECKING: + from typing import ( + Optional, + ) + from wfexs_backend.common import ( RelPath, RepoTag, @@ -46,9 +50,11 @@ WfExS_git_basedir = WfExS_basedir / ".git" WfExS_git_basedir_file_uri = WfExS_git_basedir.as_uri() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) -@pytest.mark.parametrize( - ["url", "expected"], +GIT_TESTBED = pytest.mark.parametrize( + ["url", "remote_repo", "repo_pid"], [ ( "https://github.com/inab/WfExS-backend.git", @@ -58,6 +64,7 @@ guess_flavor=RepoGuessFlavor.GitHub, repo_type=RepoType.Git, ), + "git+https://github.com/inab/WfExS-backend.git@main", ), ( "git+https://github.com/inab/WfExS-backend.git", @@ -66,14 +73,17 @@ tag=cast("RepoTag", "main"), repo_type=RepoType.Git, ), + "git+https://github.com/inab/WfExS-backend.git@main", ), ( - "https://github.com/inab/WfExS-backend.git@0.1.2", + "https://github.com/inab/WfExS-backend.git@0.2.0", RemoteRepo( repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend.git"), repo_type=RepoType.Git, - tag=cast("RepoTag", "0.1.2"), + tag=cast("RepoTag", "0.2.0"), + checkout=cast("RepoTag", "906f48308c62e78bff2057fd60862d08707df6b7"), ), + "git+https://github.com/inab/WfExS-backend.git@906f48308c62e78bff2057fd60862d08707df6b7", ), ( "https://github.com/inab/WfExS-backend.git#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", @@ -85,6 +95,7 @@ "RelPath", "workflow_examples/ipc/cosifer_test1_cwl.wfex.stage" ), ), + "git+https://github.com/inab/WfExS-backend.git@main#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", ), ( "ssh://git@github.com:inab/WfExS-backend.git", @@ -93,6 +104,7 @@ tag=cast("RepoTag", "main"), repo_type=RepoType.Git, ), + "git+ssh://git@github.com:inab/WfExS-backend.git@main", ), ( "git+ssh://git@github.com:inab/WfExS-backend.git", @@ -101,6 +113,7 @@ tag=cast("RepoTag", "main"), repo_type=RepoType.Git, ), + "git+ssh://git@github.com:inab/WfExS-backend.git@main", ), ( "ssh://git@github.com:inab/WfExS-backend.git@0.1.2", @@ -109,6 +122,7 @@ repo_type=RepoType.Git, tag=cast("RepoTag", "0.1.2"), ), + "git+ssh://git@github.com:inab/WfExS-backend.git@0.1.2", ), ( "ssh://git@github.com:inab/WfExS-backend.git#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", @@ -120,6 +134,7 @@ "RelPath", 
"workflow_examples/ipc/cosifer_test1_cwl.wfex.stage" ), ), + "git+ssh://git@github.com:inab/WfExS-backend.git@main#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", ), ( WfExS_git_basedir_file_uri, @@ -127,6 +142,7 @@ repo_url=cast("RepoURL", WfExS_git_basedir_file_uri), repo_type=RepoType.Git, ), + "git+" + WfExS_git_basedir_file_uri, ), ( "git+" + WfExS_git_basedir_file_uri, @@ -134,6 +150,7 @@ repo_url=cast("RepoURL", WfExS_git_basedir_file_uri), repo_type=RepoType.Git, ), + "git+" + WfExS_git_basedir_file_uri, ), ( WfExS_git_basedir_file_uri + "@0.1.2", @@ -142,6 +159,7 @@ repo_type=RepoType.Git, tag=cast("RepoTag", "0.1.2"), ), + "git+" + WfExS_git_basedir_file_uri + "@0.1.2", ), ( WfExS_git_basedir_file_uri @@ -153,10 +171,14 @@ "RelPath", "workflow_examples/ipc/cosifer_test1_cwl.wfex.stage" ), ), + "git+" + + WfExS_git_basedir_file_uri + + "#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", ), ( "github.com/inab/WfExS-backend.git", None, + None, ), ( "git@github.com:inab/WfExS-backend.git", @@ -165,6 +187,7 @@ tag=cast("RepoTag", "main"), repo_type=RepoType.Git, ), + "git+ssh://git@github.com:inab/WfExS-backend.git@main", ), ( "ssh://git@github.com:inab/WfExS-backend", @@ -173,6 +196,7 @@ tag=cast("RepoTag", "main"), repo_type=RepoType.Git, ), + "git+ssh://git@github.com:inab/WfExS-backend.git@main", ), ( "https://github.com/inab/WfExS-backend", @@ -182,6 +206,7 @@ repo_type=RepoType.Git, tag=cast("RepoTag", "main"), ), + "git+https://github.com/inab/WfExS-backend.git@main", ), ( WfExS_basedir_file_uri, @@ -189,14 +214,47 @@ repo_url=cast("RepoURL", WfExS_basedir_file_uri), repo_type=RepoType.Git, ), + "git+" + WfExS_basedir_file_uri, + ), + ( + "github:inab/ipc_workflows/cosifer-20210322/cosifer/cwl/cosifer-workflow.cwl", + RemoteRepo( + repo_url=cast("RepoURL", "https://github.com/inab/ipc_workflows"), + guess_flavor=RepoGuessFlavor.GitHub, + repo_type=RepoType.Git, + tag=cast("RepoTag", "cosifer-20210322"), + rel_path=cast("RelPath", "cosifer/cwl/cosifer-workflow.cwl"), + ), + "git+https://github.com/inab/ipc_workflows.git@cosifer-20210322#subdirectory=cosifer/cwl/cosifer-workflow.cwl", ), ], ) -def test_guess_git_repo_params(url: "str", expected: "RemoteRepo") -> "None": - logger = logging.Logger("name") + + +@GIT_TESTBED +def test_guess_git_repo_params( + url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]" +) -> "None": output = GitFetcher.GuessRepoParams(cast("URIType", url), logger=logger) # When no tag is given, ignore what it was discovered - if output is not None and expected is not None and expected.tag is None: - output = output._replace(tag=None) - assert output == expected + if output is not None and remote_repo is not None: + if remote_repo.tag is None: + output = output._replace(tag=None) + # For now, patch this + if remote_repo.checkout is not None: + output = output._replace(checkout=remote_repo.checkout) + assert output == remote_repo + + +@GIT_TESTBED +def test_build_git_pid_from_repo( + url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]" +) -> "None": + if remote_repo is None: + pytest.skip("Skipped test because no remote repo was provided") + else: + fetcher = GitFetcher({}) + output = fetcher.build_pid_from_repo(remote_repo) + + assert output == repo_pid diff --git a/tests/fetchers/test_swh.py b/tests/fetchers/test_swh.py new file mode 100644 index 00000000..0bc34a90 --- /dev/null +++ b/tests/fetchers/test_swh.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# 
SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import logging + +from pathlib import Path + +from typing import ( + cast, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from typing import ( + Optional, + ) + + from wfexs_backend.common import ( + RelPath, + RepoTag, + RepoURL, + URIType, + ) + +from wfexs_backend.fetchers import ( + RemoteRepo, + RepoGuessFlavor, + RepoType, +) +from wfexs_backend.fetchers.swh import SoftwareHeritageFetcher + +WfExS_basedir = Path(__file__).parent.parent +WfExS_basedir_file_uri = WfExS_basedir.as_uri() +WfExS_git_basedir = WfExS_basedir / ".git" +WfExS_git_basedir_file_uri = WfExS_git_basedir.as_uri() + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +SWH_TESTBED = pytest.mark.parametrize( + ["url", "remote_repo", "repo_pid"], + [ + ( + "swh:1:dir:6b1abfafa9baf6ffbe2ab1da2b036ed3ae8879a9;origin=https://github.com/inab/Wetlab2Variations;visit=swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325;anchor=swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c;path=/nextflow/", + RemoteRepo( + repo_url=cast( + "RepoURL", + "swh:1:dir:6b1abfafa9baf6ffbe2ab1da2b036ed3ae8879a9;origin=https://github.com/inab/Wetlab2Variations;visit=swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325;anchor=swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c;path=/nextflow/", + ), + tag=cast( + "RepoTag", "swh:1:dir:6b1abfafa9baf6ffbe2ab1da2b036ed3ae8879a9" + ), + repo_type=RepoType.SoftwareHeritage, + ), + "swh:1:dir:6b1abfafa9baf6ffbe2ab1da2b036ed3ae8879a9;origin=https://github.com/inab/Wetlab2Variations;visit=swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325;anchor=swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c;path=/nextflow/", + ), + ( + "swh:1:cnt:deb7365914c0fdf51fd0a4e9a75b4afe7f8d93f7;origin=https://github.com/inab/Wetlab2Variations;visit=swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325;anchor=swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c;path=/nextflow/nextflow.nf", + RemoteRepo( + repo_url=cast( + "RepoURL", + "swh:1:cnt:deb7365914c0fdf51fd0a4e9a75b4afe7f8d93f7;origin=https://github.com/inab/Wetlab2Variations;visit=swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325;anchor=swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c;path=/nextflow/nextflow.nf", + ), + tag=cast( + "RepoTag", "swh:1:cnt:deb7365914c0fdf51fd0a4e9a75b4afe7f8d93f7" + ), + repo_type=RepoType.SoftwareHeritage, + ), + "swh:1:cnt:deb7365914c0fdf51fd0a4e9a75b4afe7f8d93f7;origin=https://github.com/inab/Wetlab2Variations;visit=swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325;anchor=swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c;path=/nextflow/nextflow.nf", + ), + ( + "swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c", + RemoteRepo( + repo_url=cast( + "RepoURL", "swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c" + ), + tag=cast( + "RepoTag", "swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c" + ), + repo_type=RepoType.SoftwareHeritage, + ), + 
"swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c", + ), + ( + "swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325", + RemoteRepo( + repo_url=cast( + "RepoURL", "swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325" + ), + tag=cast( + "RepoTag", "swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325" + ), + repo_type=RepoType.SoftwareHeritage, + ), + "swh:1:snp:4f8cb5f83b5a0b8d9d629e8cfcb979bba0b6b325", + ), + ], +) + + +@SWH_TESTBED +def test_guess_swh_repo_params( + url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]" +) -> "None": + output = SoftwareHeritageFetcher.GuessRepoParams( + cast("URIType", url), logger=logger + ) + + # When no web url is given, ignore what it was discovered + if output is not None and remote_repo is not None: + if remote_repo.web_url is None: + output = output._replace(web_url=None) + # For now, patch this + if remote_repo.checkout is None: + output = output._replace(checkout=None) + assert output == remote_repo + + +@SWH_TESTBED +def test_build_swh_pid_from_repo( + url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]" +) -> "None": + if remote_repo is None: + pytest.skip("Skipped test because no remote repo was provided") + else: + fetcher = SoftwareHeritageFetcher({}) + output = fetcher.build_pid_from_repo(remote_repo) + + assert output == repo_pid diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index a71ee483..1ad6b716 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -306,6 +306,16 @@ def materialize_repo( ) -> "Tuple[AbsPath, RemoteRepo, Sequence[URIWithMetadata]]": pass + @abc.abstractmethod + def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": + """ + This method is required to generate a PID which usually + represents an element (usually a workflow) in a repository. + If the fetcher does not recognize the type of repo, it should + return None + """ + pass + @classmethod @abc.abstractmethod def GuessRepoParams( @@ -317,6 +327,10 @@ def GuessRepoParams( pass +if TYPE_CHECKING: + RepoFetcher = TypeVar("RepoFetcher", bound=AbstractRepoFetcher) + + class AbstractStatefulStreamingFetcher(AbstractStatefulFetcher): def fetch( self, diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index aed589bc..061a36ce 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -495,6 +495,160 @@ def GuessRepoParams( web_url=web_url, ) + def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": + """ + This method is required to generate a PID which usually + represents an element (usually a workflow) in a repository. 
+ If the fetcher does not recognize the type of repo, it should + return None + """ + parsed_wf_url = parse.urlparse(remote_repo.repo_url) + + retval: "Optional[str]" = None + if parsed_wf_url.scheme == "": + # It could be a checkout uri in the form of 'git@github.com:inab/WfExS-backend.git' + if ( + parsed_wf_url.netloc == "" + and ("@" in parsed_wf_url.path) + and (":" in parsed_wf_url.path) + ): + parsed_wf_url = parse.urlparse("ssh://" + remote_repo.repo_url) + else: + return None + + if parsed_wf_url.scheme == self.GITHUB_SCHEME: + gh_path_split = parsed_wf_url.path.split("/") + gh_path = "/".join(gh_path_split[:2]) + + if not gh_path.endswith(".git"): + gh_path += ".git" + checkout = remote_repo.get_checkout() + if len(checkout) > 0: + gh_path += "@" + checkout + if remote_repo.rel_path is not None and len(remote_repo.rel_path) > 0: + fragment = parse.urlencode( + [("subdirectory", remote_repo.rel_path)], + safe="/", + quote_via=parse.quote, + ) + else: + fragment = "" + + retval = parse.urlunparse( + parse.ParseResult( + scheme="git+https", + netloc=GITHUB_NETLOC, + path=gh_path, + params="", + query="", + fragment=fragment, + ) + ) + elif parsed_wf_url.scheme in ("http", "https"): + if ( + parsed_wf_url.netloc == GITHUB_NETLOC + and "@" not in parsed_wf_url.path + and parsed_wf_url.fragment == "" + ): + gh_path_split = parsed_wf_url.path.split("/") + gh_path = "/".join(gh_path_split[:3]) + + if not gh_path.endswith(".git"): + gh_path += ".git" + checkout = remote_repo.get_checkout() + if len(checkout) > 0: + gh_path += "@" + checkout + if remote_repo.rel_path is not None and len(remote_repo.rel_path) > 0: + fragment = parse.urlencode( + [("subdirectory", remote_repo.rel_path)], + safe="/", + quote_via=parse.quote, + ) + else: + fragment = "" + + retval = parse.urlunparse( + parse.ParseResult( + scheme="git+" + parsed_wf_url.scheme, + netloc=GITHUB_NETLOC, + path=gh_path, + params="", + query="", + fragment=fragment, + ) + ) + else: + # Default + retval = "git+" + remote_repo.repo_url + checkout = remote_repo.get_checkout() + if len(checkout) > 0: + retval += "@" + checkout + if remote_repo.rel_path is not None and len(remote_repo.rel_path) > 0: + fragment = parse.urlencode( + [("subdirectory", remote_repo.rel_path)], + safe="/", + quote_via=parse.quote, + ) + retval += "#" + fragment + + elif (parsed_wf_url.scheme in self.GetSchemeHandlers()) or ( + parsed_wf_url.scheme in self.GIT_SCHEMES + ): + # Getting the scheme git is going to understand + if parsed_wf_url.scheme.startswith(self.GIT_PROTO_PREFIX): + gitScheme = parsed_wf_url.scheme[len(self.GIT_PROTO_PREFIX) :] + denorm_parsed_wf_url = parsed_wf_url._replace(scheme=gitScheme) + parsed_wf_url = parse.urlparse(parse.urlunparse(denorm_parsed_wf_url)) + else: + gitScheme = parsed_wf_url.scheme + + if gitScheme not in self.GIT_SCHEMES: + self.logger.debug( + f"Unknown scheme {gitScheme} in repo URL. Choices are: {', '.join(self.GIT_SCHEMES)}" + ) + return None + + # Beware ssh protocol!!!! 
It has a corner case with URLs like
+            # ssh://git@github.com:inab/WfExS-backend.git'
+            if parsed_wf_url.scheme == "ssh":
+                if ":" in parsed_wf_url.netloc:
+                    new_netloc = parsed_wf_url.netloc
+                    # Translating it to something better
+                    colon_pos = new_netloc.rfind(":")
+                    new_netloc = (
+                        new_netloc[:colon_pos] + "/" + new_netloc[colon_pos + 1 :]
+                    )
+                    denorm_parsed_wf_url = parsed_wf_url._replace(netloc=new_netloc)
+                    parsed_wf_url = parse.urlparse(
+                        parse.urlunparse(denorm_parsed_wf_url)
+                    )
+
+                newpath = parsed_wf_url.path
+                if newpath[0] == "/":
+                    newpath = ":" + newpath[1:]
+            else:
+                newpath = parsed_wf_url.path
+
+            if parsed_wf_url.netloc.endswith(GITHUB_NETLOC) and not newpath.endswith(
+                ".git"
+            ):
+                newpath += ".git"
+            retval = (
+                "git+" + parsed_wf_url.scheme + "://" + parsed_wf_url.netloc + newpath
+            )
+            checkout = remote_repo.get_checkout()
+            if len(checkout) > 0:
+                retval += "@" + checkout
+            if remote_repo.rel_path is not None and len(remote_repo.rel_path) > 0:
+                fragment = parse.urlencode(
+                    [("subdirectory", remote_repo.rel_path)],
+                    safe="/",
+                    quote_via=parse.quote,
+                )
+                retval += "#" + fragment
+
+        return retval
+
     def materialize_repo(
         self,
         repoURL: "RepoURL",
diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py
index d80bf904..647cdf66 100644
--- a/wfexs_backend/fetchers/swh.py
+++ b/wfexs_backend/fetchers/swh.py
@@ -216,10 +216,25 @@ def GuessRepoParams(
         return RemoteRepo(
             repo_url=wf_url,
             tag=cast("RepoTag", putative_core_swhid),
+            checkout=cast("RepoTag", putative_core_swhid),
             repo_type=RepoType.SoftwareHeritage,
             web_url=web_url,
         )
 
+    def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]":
+        """
+        This method is required to generate a PID which usually
+        represents an element (usually a workflow) in a repository.
+        If the fetcher does not recognize the type of repo, it should
+        return None
+        """
+        parsed_wf_url = parse.urlparse(remote_repo.repo_url)
+        if parsed_wf_url.scheme not in self.GetSchemeHandlers():
+            return None
+
+        # FIXME: improve this
+        return remote_repo.repo_url
+
     def materialize_repo(
         self,
         repoURL: "RepoURL",
From 53143959b631a8d337b022ba121633273be0a872 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Thu, 23 May 2024 20:49:49 +0200
Subject: [PATCH 38/42] Fixed SPARQL query which returns details about the
 workflow, so the original one is used instead of a possibly consolidated
 one.

---
 wfexs_backend/utils/rocrate.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py
index 48c249b0..49c870aa 100644
--- a/wfexs_backend/utils/rocrate.py
+++ b/wfexs_backend/utils/rocrate.py
@@ -375,12 +375,6 @@ def identifyROCrate(
             ?programminglanguage
                 a s:ComputerLanguage ;
                 s:url ?programminglanguage_url .
-            OPTIONAL {
-                ?mainentity s:identifier ?identifier .
-            }
-            OPTIONAL {
-                ?mainentity s:alternateName ?workflow_alternate_name .
-            }
             OPTIONAL {
                 ?programminglanguage
                     s:version ?programminglanguage_version .
@@ -409,6 +403,12 @@ def identifyROCrate(
                 OPTIONAL {
                     ?mainentity s:url ?workflow_url .
                 }
+                OPTIONAL {
+                    ?mainentity s:identifier ?identifier .
+                }
+                OPTIONAL {
+                    ?mainentity s:alternateName ?workflow_alternate_name .
+                }
             } UNION {
                 ?mainentity s:isBasedOn ?origmainentity .
                 ?origmainentity
@@ -426,6 +426,12 @@ def identifyROCrate(
                 FILTER (
                     STRSTARTS(str(?bsworkflowprofile), str(bswfprofile:))
                 ) .
+                OPTIONAL {
+                    ?origmainentity s:identifier ?identifier .
+                }
+                OPTIONAL {
+                    ?origmainentity s:alternateName ?workflow_alternate_name .
+                }
             }
         }
     }
@@ -1719,7 +1725,7 @@ def generateWorkflowMetaFromJSONLD(
             assert isinstance(
                 execrow, rdflib.query.ResultRow
             ), "Check the SPARQL code, as it should be a SELECT query"
-            print(f"\tExecution {execrow.execution}")
+            self.logger.debug(f"\tExecution {execrow.execution}")
 
             contresult = self._parseContainersFromExecution(
                 g, execrow.execution, main_entity=matched_crate.mainentity
From 5058e32bba74aecc3fef81c9b954b80afbbbb146 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Thu, 23 May 2024 20:52:01 +0200
Subject: [PATCH 39/42] Now it is possible to re-create a previous workflow
 execution!!!! It is currently working with WRROCs from CWL workflows.
 (Nextflow WRROC generation has to be fixed)

---
 wfexs_backend/__main__.py      | 20 +++++++++++++
 wfexs_backend/wfexs_backend.py | 46 +++++++++++++++++++++++++++++---
 wfexs_backend/workflow.py      | 22 +++++++++-------
 3 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py
index c34fdce7..c3384aab 100644
--- a/wfexs_backend/__main__.py
+++ b/wfexs_backend/__main__.py
@@ -254,6 +254,23 @@ def genParserSub(
             help="Workflow Run RO-Crate describing a previous workflow execution. It can be either a local path or an URI resolvable from WfExS with no authentication",
         )
 
+    if command in (WfExS_Commands.Import, WfExS_Commands.ReStage):
+        ap_.add_argument(
+            "-s",
+            "--no-secure",
+            dest="secure",
+            action="store_false",
+            default=True,
+            help="Make unsecured working directory",
+        )
+        ap_.add_argument(
+            "-S",
+            "--secure",
+            dest="secure",
+            action="store_true",
+            help="Make secured working directory (default)",
+        )
+
     if preStageParams or exportParams or command == WfExS_Commands.ReStage:
         ap_.add_argument(
             "-Z",
@@ -1409,6 +1426,7 @@ def main() -> None:
             private_key_filename=args.private_key_file,
             private_key_passphrase=private_key_passphrase,
             orcids=op_orcids,
+            secure=args.secure,
         )
     else:
         print(
@@ -1443,6 +1461,7 @@ def main() -> None:
             private_key_filename=args.private_key_file,
             private_key_passphrase=private_key_passphrase,
             orcids=op_orcids,
+            secure=args.secure,
         )
 
         wfSetup = wfInstance.getStagedSetup()
@@ -1455,6 +1474,7 @@ def main() -> None:
 
     if command in (
         WfExS_Commands.Stage,
+        WfExS_Commands.Import,
         WfExS_Commands.ReStage,
         WfExS_Commands.Execute,
     ):
diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py
index 91ea158d..fc172388 100644
--- a/wfexs_backend/wfexs_backend.py
+++ b/wfexs_backend/wfexs_backend.py
@@ -118,6 +118,7 @@
 )
 
 from .fetchers import (
+    AbstractRepoFetcher,
     AbstractStatefulFetcher,
     DocumentedProtocolFetcher,
     DocumentedStatefulProtocolFetcher,
@@ -200,6 +201,7 @@
     )
 
     from .fetchers import (
+        RepoFetcher,
         StatefulFetcher,
     )
 
@@ -607,10 +609,11 @@ def __init__(
         self.baseWorkDir = baseWorkDir
         self.defaultParanoidMode = False
 
-        # cacheHandler is created on first use
         self._sngltn: "MutableMapping[Type[AbstractStatefulFetcher], AbstractStatefulFetcher]" = (
             dict()
         )
+        self.repo_fetchers: "MutableSequence[AbstractRepoFetcher]" = list()
+        # cacheHandler is created on first use
         self.cacheHandler = SchemeHandlerCacheHandler(self.cacheDir)
 
         fetchers_setup_block = local_config.get("fetchers-setup")
@@ -701,6 +704,16 @@ def instantiateStatefulFetcher(
 
         return cast("StatefulFetcher", instStatefulFetcher)
 
+    def instantiateRepoFetcher(
+        self,
+        repoFetcher: "Type[RepoFetcher]",
+        setup_block: "Optional[Mapping[str, Any]]" = None,
+    ) -> "RepoFetcher":
+        """
+ Method to instantiate repo fetchers once + """ + return self.instantiateStatefulFetcher(repoFetcher, setup_block=setup_block) + def findAndAddWorkflowEnginesFromModuleName( self, the_module_name: "str" = "wfexs_backend.workflow_engines", @@ -748,6 +761,9 @@ def addWorkflowEngine(self, workflowEngineClazz: "Type[WorkflowEngine]") -> None def listWorkflowEngines(self) -> "Sequence[str]": return list(self._workflow_engines.keys()) + def listWorkflowEngineClasses(self) -> "Sequence[Type[WorkflowEngine]]": + return list(self._workflow_engines.values()) + def getWorkflowEngineClass( self, engine_shortname: "str" ) -> "Optional[Type[WorkflowEngine]]": @@ -1028,6 +1044,10 @@ def addSchemeHandlers( else schemeHandler.description, priority=schemeHandler.priority, ) + + # Also, if it is a repository fetcher, record it separately + if isinstance(instSchemeInstance, AbstractRepoFetcher): + self.repo_fetchers.append(instSchemeInstance) elif isinstance(schemeHandler, DocumentedProtocolFetcher) and callable( schemeHandler.fetcher ): @@ -1041,6 +1061,22 @@ def addSchemeHandlers( self.cacheHandler.addRawSchemeHandlers(instSchemeHandlers) + def gen_workflow_pid(self, remote_repo: "RemoteRepo") -> "str": + """ + This method tries generating the workflow pid passing the remote + repo to each one of the registered repo fetchers. The contract + is that BuildPIDFromRepo should return None if it does not + recognize the repo_url as usable. + """ + retval: "Optional[str]" = None + + for fetcher in self.repo_fetchers: + retval = fetcher.build_pid_from_repo(remote_repo) + if retval is not None: + break + + return remote_repo.repo_url if retval is None else retval + def describeFetchableSchemes(self) -> "Sequence[Tuple[str, str, int]]": return self.cacheHandler.describeRegisteredSchemes() @@ -1272,6 +1308,7 @@ def fromPreviousInstanceDeclaration( public_key_filenames: "Sequence[AnyPath]" = [], private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, + secure: "bool" = True, paranoidMode: "bool" = False, ) -> "WF": return WF.FromPreviousInstanceDeclaration( @@ -1283,6 +1320,7 @@ def fromPreviousInstanceDeclaration( public_key_filenames=public_key_filenames, private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, + secure=secure, paranoidMode=paranoidMode, ) @@ -1295,6 +1333,7 @@ def fromPreviousROCrate( public_key_filenames: "Sequence[AnyPath]" = [], private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, + secure: "bool" = True, paranoidMode: "bool" = False, ) -> "WF": # Let's check whether it is a local file @@ -1324,6 +1363,7 @@ def fromPreviousROCrate( public_key_filenames=public_key_filenames, private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, + secure=secure, paranoidMode=paranoidMode, ) @@ -2390,7 +2430,7 @@ def _doMaterializeGitRepo( :param doUpdate: :return: """ - gitFetcherInst = self.instantiateStatefulFetcher(GitFetcher) + gitFetcherInst = self.instantiateRepoFetcher(GitFetcher) repoDir, materialized_repo, metadata_array = gitFetcherInst.materialize_repo( repo.repo_url, repoTag=repo.tag, @@ -2437,7 +2477,7 @@ def _doMaterializeSoftwareHeritageDirOrContent( :param doUpdate: :return: """ - swhFetcherInst = self.instantiateStatefulFetcher(SoftwareHeritageFetcher) + swhFetcherInst = self.instantiateRepoFetcher(SoftwareHeritageFetcher) repoDir, materialized_repo, metadata_array = swhFetcherInst.materialize_repo( cast("RepoURL", repo.tag) if repo.tag is not 
None else repo.repo_url, doUpdate=doUpdate, diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 8324c278..1bec89d6 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1289,6 +1289,7 @@ def FromPreviousInstanceDeclaration( public_key_filenames: "Sequence[AnyPath]" = [], private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, + secure: "bool" = True, paranoidMode: "bool" = False, ) -> "WF": """ @@ -1308,6 +1309,9 @@ def FromPreviousInstanceDeclaration( if k_name in workflow_meta: del workflow_meta[k_name] + # We also have to reset the secure mode + workflow_meta.setdefault("workflow_config", {})["secure"] = secure + return cls.FromStagedRecipe( wfexs, workflow_meta, @@ -1332,6 +1336,7 @@ def FromPreviousROCrate( public_key_filenames: "Sequence[AnyPath]" = [], private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, + secure: "bool" = True, paranoidMode: "bool" = False, ) -> "WF": """ @@ -1412,22 +1417,25 @@ def FromPreviousROCrate( ) = wfexs.rocrate_toolbox.generateWorkflowMetaFromJSONLD( jsonld_obj, public_name ) - logging.info( + workflow_pid = wfexs.gen_workflow_pid(repo) + logging.debug( f"Repo {repo} workflow type {workflow_type} container factory {container_type}" ) - logging.info(f"Containers {the_containers}") + logging.debug(f"Containers {the_containers}") workflow_meta: "WritableWorkflowMetaConfigBlock" = { - "workflow_id": {}, + "workflow_id": workflow_pid, "workflow_type": workflow_type.shortname, "environment": environment, "params": params, "outputs": outputs, - "workflow_config": {}, + "workflow_config": { + "secure": secure, + }, } if container_type is not None: workflow_meta["workflow_config"]["containerType"] = container_type.value - logging.info(f"{json.dumps(workflow_meta, indent=4)}") + logging.debug(f"{json.dumps(workflow_meta, indent=4)}") # Last, be sure that what it has been generated is correct if wfexs.validateConfigFiles(workflow_meta, securityContextsConfigFilename) > 0: @@ -1435,10 +1443,6 @@ def FromPreviousROCrate( f"Generated WfExS description from {public_name} fails (have a look at the log messages for details)" ) - raise NotImplementedError( - "The implementation of this method has to be finished" - ) - return cls.FromStagedRecipe( wfexs, workflow_meta, From 708e413ee1145482dfb5c1defe8bf34d8e5b34a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 May 2024 22:48:18 +0200 Subject: [PATCH 40/42] Version bump to 0.99.1. Added support for RO-Crate import (only metadata), and input parameters replacement. Still pending to fix RO-Crate generation for Nextflow, so the imported RO-Crates can be consumed without issues. --- CITATION.cff | 2 +- CODE_OF_CONDUCT.md | 2 +- README.md | 112 ++++++++++++-- TODO.md | 8 +- development-docs/wfexs-commands.graphml | 56 +++++-- development-docs/wfexs-commands.svg | 185 +++++++++++++----------- wfexs_backend/__init__.py | 2 +- wfexs_backend/__main__.py | 33 +++-- wfexs_backend/wfexs_backend.py | 4 + wfexs_backend/workflow.py | 90 +++++++++++- 10 files changed, 363 insertions(+), 131 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 1f20e30f..ffb6f065 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -28,4 +28,4 @@ message: "If you use this software, please cite it using these metadata." 
repository-code: "https://github.com/inab/WfExS-backend" type: software title: "WfExS-backend" -version: 0.99.0 +version: 0.99.1 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 2fc97f5e..79031fc1 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -56,7 +56,7 @@ representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported to the community leaders responsible for enforcement at . +reported to the community leaders responsible for enforcement at . All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the diff --git a/README.md b/README.md index f826b337..0af060e4 100644 --- a/README.md +++ b/README.md @@ -68,13 +68,13 @@ python WfExS-backend.py --full-help usage: WfExS-backend.py [-h] [--log-file LOGFILENAME] [-q] [-v] [-d] [-L LOCALCONFIGFILENAME] [--cache-dir CACHEDIR] [-V] [--full-help] - {init,cache,staged-workdir,export,list-fetchers,list-exporters,list-licences,config-validate,stage,re-stage,mount-workdir,export-stage,offline-execute,execute,export-results,export-crate} + {init,cache,staged-workdir,export,list-fetchers,list-exporters,list-container-factories,list-workflow-engines,list-licences,config-validate,stage,re-stage,import,mount-workdir,export-stage,offline-execute,execute,export-results,export-crate} ... -WfExS (workflow execution service) backend 0.10.1-12-g24f2de3 -(24f2de3bb0b0f3f8a59c90ec16fdf07b66ebe641, branch jmfernandez) +WfExS (workflow execution service) backend 0.99.0-43-g5058e32 +(5058e32bba74aecc3fef81c9b954b80afbbbb146, branch full_circle) -optional arguments: +options: -h, --help show this help message and exit --log-file LOGFILENAME Store messages in a file instead of using standard @@ -86,8 +86,8 @@ optional arguments: -L LOCALCONFIGFILENAME, --local-config LOCALCONFIGFILENAME Local installation configuration file (can also be set up through WFEXS_CONFIG_FILE environment variable) - (default: /home/jmfernandez/projects/WfExS- - backend/wfexs_config.yml) + (default: /home/jmfernandez/projects/WfExS/WfExS- + backend_full_circle/wfexs_config.yml) --cache-dir CACHEDIR Caching directory (default: None) -V, --version show program's version number and exit --full-help It returns full help (default: False) @@ -95,15 +95,19 @@ optional arguments: commands: Command to run. 
It must be one of these - {init,cache,staged-workdir,export,list-fetchers,list-exporters,list-licences,config-validate,stage,re-stage,mount-workdir,export-stage,offline-execute,execute,export-results,export-crate} + {init,cache,staged-workdir,export,list-fetchers,list-exporters,list-container-factories,list-workflow-engines,list-licences,config-validate,stage,re-stage,import,mount-workdir,export-stage,offline-execute,execute,export-results,export-crate} init Init local setup cache Cache handling subcommands staged-workdir Staged working directories handling subcommands export Staged working directories export subcommands list-fetchers List the supported fetchers / schemes list-exporters List the supported export plugins + list-container-factories + List the supported container factories + list-workflow-engines + List the supported workflow engines list-licences List the documented licences, both embedded and - fetched from SPDX release 3.22 + fetched from SPDX release 3.23 config-validate Validate the configuration files to be used for staging and execution stage Prepare the staging (working) directory for workflow @@ -111,6 +115,8 @@ commands: re-stage Prepare a new staging (working) directory for workflow execution, repeating the fetch of dependencies and contents + import Workflow Run RO-Crate import into a new staged working + directory mount-workdir Mount the encrypted staging directory on secure staging scenarios export-stage Export the staging directory as an RO-Crate @@ -321,6 +327,28 @@ optional arguments: ``` usage: WfExS-backend.py list-exporters [-h] +optional arguments: + -h, --help show this help message and exit + +``` + +
+Subparser list-container-factories + +``` +usage: WfExS-backend.py list-container-factories [-h] + +optional arguments: + -h, --help show this help message and exit + +``` +
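The two new listing subcommands are read-only probes, handy for checking what a given installation can actually run before staging anything. A minimal sketch of invoking them (the `-L` configuration path is hypothetical, and the names printed depend on the plugins available in the installation):

```
# Enumerate the container factories known to this installation
# (typically things like Singularity, Docker or Podman)
python WfExS-backend.py -L wfexs_config.yml list-container-factories

# Its sibling, documented next, enumerates the registered workflow engines
python WfExS-backend.py -L wfexs_config.yml list-workflow-engines
```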
+
+Subparser list-workflow-engines + +``` +usage: WfExS-backend.py list-workflow-engines [-h] + optional arguments: -h, --help show this help message and exit @@ -403,15 +431,24 @@ secure workdir arguments: Subparser re-stage ``` -usage: WfExS-backend.py re-stage [-h] [-Z SECURITYCONTEXTSCONFIGFILENAME] +Subparser 're-stage' +usage: WfExS-backend.py re-stage [-h] [-W WORKFLOWCONFIGFILENAME] [-s] [-S] + [-Z SECURITYCONTEXTSCONFIGFILENAME] [-n NICKNAME_PREFIX] [--orcid ORCIDS] [--public-key-file PUBLIC_KEY_FILES] [--private-key-file PRIVATE_KEY_FILE] [--private-key-passphrase-envvar PRIVATE_KEY_PASSPHRASE_ENVVAR] -J WORKFLOWWORKINGDIRECTORY -optional arguments: +options: -h, --help show this help message and exit + -W WORKFLOWCONFIGFILENAME, --workflow-config WORKFLOWCONFIGFILENAME + Optional configuration file, describing some inputs + which will replace the base, original ones (default: + None) + -s, --no-secure Make unsecured working directory (default: True) + -S, --secure Make secured working directory (default) (default: + False) -Z SECURITYCONTEXTSCONFIGFILENAME, --creds-config SECURITYCONTEXTSCONFIGFILENAME Configuration file, describing security contexts, which hold credentials and similar (default: None) @@ -440,6 +477,61 @@ secure workdir arguments: the private key needed to unlock an encrypted working directory. (default: ) +``` +
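With the new `-W` and `-s`/`-S` options, `re-stage` can now derive a fresh working directory from an existing one while replacing some of its inputs and toggling working-directory encryption. A hedged sketch, where the instance identifier and file names are made up for illustration:

```
# Re-stage a previous instance (hypothetical ID), replacing only the
# parameters listed in replaced_inputs.wfex.stage, into an unsecured workdir
python WfExS-backend.py re-stage \
    -J 01234567-89ab-cdef-0123-456789abcdef \
    -W replaced_inputs.wfex.stage \
    -Z creds.wfex.ctxt \
    -s
```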
+
+Subparser import + +``` +Subparser 'import' +usage: WfExS-backend.py import [-h] -R WORKFLOWROCRATEFILENAMEORURI + [-W WORKFLOWCONFIGFILENAME] [-s] [-S] + [-Z SECURITYCONTEXTSCONFIGFILENAME] + [-n NICKNAME_PREFIX] [--orcid ORCIDS] + [--public-key-file PUBLIC_KEY_FILES] + [--private-key-file PRIVATE_KEY_FILE] + [--private-key-passphrase-envvar PRIVATE_KEY_PASSPHRASE_ENVVAR] + +options: + -h, --help show this help message and exit + -R WORKFLOWROCRATEFILENAMEORURI, --workflow-rocrate WORKFLOWROCRATEFILENAMEORURI + Workflow Run RO-Crate describing a previous workflow + execution. It can be either a local path or an URI + resolvable from WfExS with no authentication (default: + None) + -W WORKFLOWCONFIGFILENAME, --workflow-config WORKFLOWCONFIGFILENAME + Optional configuration file, describing some inputs + which will replace the base, original ones (default: + None) + -s, --no-secure Make unsecured working directory (default: True) + -S, --secure Make secured working directory (default) (default: + False) + -Z SECURITYCONTEXTSCONFIGFILENAME, --creds-config SECURITYCONTEXTSCONFIGFILENAME + Configuration file, describing security contexts, + which hold credentials and similar (default: None) + -n NICKNAME_PREFIX, --nickname-prefix NICKNAME_PREFIX + Nickname prefix to be used on staged workdir creation + (default: None) + --orcid ORCIDS ORCID(s) of the person(s) staging, running or + exporting the workflow scenario (default: []) + --public-key-file PUBLIC_KEY_FILES + This parameter switches on secure processing. Path to + the public key(s) to be used to encrypt the working + directory (default: []) + +secure workdir arguments: + Private key and passphrase to access secured working directories + + --private-key-file PRIVATE_KEY_FILE + This parameter passes the name of the file containing + the private key needed to unlock an encrypted working + directory. (default: None) + --private-key-passphrase-envvar PRIVATE_KEY_PASSPHRASE_ENVVAR + This parameter passes the name of the environment + variable containing the passphrase needed to decrypt + the private key needed to unlock an encrypted working + directory. (default: ) + ```
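An equivalent sketch for the new `import` subcommand; the RO-Crate URI and file names below are hypothetical:

```
# Re-create a staged working directory from a Workflow Run RO-Crate,
# optionally replacing some of the recorded input parameters
python WfExS-backend.py import \
    -R https://example.org/runs/cosifer_run.wrroc.zip \
    -W replaced_inputs.wfex.stage \
    -Z creds.wfex.ctxt \
    -n reimported \
    -S
```

The file passed through `-W` only needs a top-level `params` block: as `__merge_params_from_file` later in this series shows, any other top-level key is trimmed away before the merge, and a dummy `workflow_id` is injected just to pass validation. A minimal, assumed example (the parameter name is invented):

```
params:
  min_quality: 30
```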
diff --git a/TODO.md b/TODO.md index 6b99c123..c2dd40f5 100644 --- a/TODO.md +++ b/TODO.md @@ -55,15 +55,17 @@ references to the inputs, that we are going to use to instantiate the workflows. - [x] **Step 7.k**: Upload to Dataverse. -- [ ] **Step 8**: (partially implemented) Create execution provenance, which includes uploading URLs of results and / or DOIs / URIs. +- [x] **Step 8**: (partially implemented) Create execution provenance, which includes uploading URLs of results and / or DOIs / URIs. - [x] **Step 9**: Generate RO-Crate from execution provenance and exported results. - [x] **Step 9.a**: Generated RO-Crate should be consumable by WorkflowHub. - - [ ] **Step 9.c**: Generated RO-Crate should be consumable by WfExS-backend. + - [x] **Step 9.c**: Generated RO-Crate should be consumable by WfExS-backend. - - [ ] **Step 9.d**: Add full circle capabilities. Re-execute workflow with the very same parameters from previously generated RO-Crate. + - [x] **Step 9.d**: Add full circle capabilities. Re-execute workflow with the very same parameters from previously generated RO-Crate (only metadata). + + - [ ] **Step 9.e**: Add full circle capabilities. Re-execute workflow with the very same parameters from previously generated RO-Crate (reusing payloads). ## Other features diff --git a/development-docs/wfexs-commands.graphml b/development-docs/wfexs-commands.graphml index 200cf7ba..ade39ce7 100644 --- a/development-docs/wfexs-commands.graphml +++ b/development-docs/wfexs-commands.graphml @@ -235,7 +235,7 @@ RO-Crate - + cache (subcommands) @@ -259,7 +259,7 @@ RO-Crate - + ls @@ -269,7 +269,7 @@ RO-Crate - + rm @@ -279,7 +279,7 @@ RO-Crate - + inject @@ -289,7 +289,7 @@ RO-Crate - + input @@ -299,7 +299,7 @@ RO-Crate - + validate @@ -309,7 +309,7 @@ RO-Crate - + ro-crate @@ -319,7 +319,7 @@ RO-Crate - + ga4gh-trs @@ -329,7 +329,7 @@ RO-Crate - + workflow @@ -339,7 +339,7 @@ RO-Crate - + fetch @@ -470,6 +470,16 @@ crate + + + + + + + import + + + @@ -945,6 +955,32 @@ crate + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/development-docs/wfexs-commands.svg b/development-docs/wfexs-commands.svg index 0cf2ac5c..35511f8e 100644 --- a/development-docs/wfexs-commands.svg +++ b/development-docs/wfexs-commands.svg @@ -1,4 +1,4 @@ - + @@ -63,39 +63,39 @@ - + - + - + - + - + - + - + - + - + @@ -135,11 +135,15 @@ + + + + - + - + @@ -324,158 +328,158 @@ - + - cache (subcommands) + cache (subcommands) - + - - ls + + ls - + - - rm + + rm - + - - inject + + inject - + - - input + + input - + - - validate + + validate - + - - ro-crate + + ro-crate - + - - ga4gh-trs + + ga4gh-trs - + - - workflow + + workflow - + - - fetch + + fetch - - - + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + @@ -550,6 +554,13 @@ re-stage + + + + + + import + @@ -567,6 +578,12 @@ + + + + + + diff --git a/wfexs_backend/__init__.py b/wfexs_backend/__init__.py index 5f35e2ca..2b21ec36 100644 --- a/wfexs_backend/__init__.py +++ b/wfexs_backend/__init__.py @@ -21,7 +21,7 @@ __license__ = "Apache 2.0" # https://www.python.org/dev/peps/pep-0396/ -__version__ = "0.99.0" +__version__ = "0.99.1" __url__ = "https://github.com/inab/WfExS-backend" __official_name__ = "WfExS-backend" diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index c3384aab..b46e4fe0 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -237,15 +237,7 @@ def genParserSub( ) if preStageParams: - if command 
!= WfExS_Commands.Import: - ap_.add_argument( - "-W", - "--workflow-config", - dest="workflowConfigFilename", - required=True, - help="Configuration file, describing workflow and inputs", - ) - else: + if command == WfExS_Commands.Import: ap_.add_argument( "-R", "--workflow-rocrate", @@ -254,7 +246,18 @@ def genParserSub( help="Workflow Run RO-Crate describing a previous workflow execution. It can be either a local path or an URI resolvable from WfExS with no authentication", ) - if command in (WfExS_Commands.Import, WfExS_Commands.ReStage): + not_restage = command not in (WfExS_Commands.Import, WfExS_Commands.ReStage) + ap_.add_argument( + "-W", + "--workflow-config", + dest="workflowConfigFilename", + required=not_restage, + help="Configuration file, describing workflow and inputs" + if not_restage + else "Optional configuration file, describing some inputs which will replace the base, original ones", + ) + + if not not_restage: ap_.add_argument( "-s", "--no-secure", @@ -1153,7 +1156,9 @@ def _get_wfexs_argparse_internal( ap_s = genParserSub(sp, WfExS_Commands.Stage, preStageParams=True) - ap_r_s = genParserSub(sp, WfExS_Commands.ReStage, postStageParams=True) + ap_r_s = genParserSub( + sp, WfExS_Commands.ReStage, preStageParams=True, postStageParams=True + ) ap_imp = genParserSub(sp, WfExS_Commands.Import, preStageParams=True) @@ -1420,7 +1425,8 @@ def main() -> None: elif command == WfExS_Commands.Import: wfInstance = wfBackend.fromPreviousROCrate( args.workflowROCrateFilenameOrURI, - args.securityContextsConfigFilename, + securityContextsConfigFilename=args.securityContextsConfigFilename, + replaced_parameters_filename=args.workflowConfigFilename, nickname_prefix=args.nickname_prefix, public_key_filenames=args.public_key_files, private_key_filename=args.private_key_file, @@ -1455,7 +1461,8 @@ def main() -> None: sys.stderr.flush() wfInstance = wfBackend.fromPreviousInstanceDeclaration( source_wfInstance, - args.securityContextsConfigFilename, + securityContextsConfigFilename=args.securityContextsConfigFilename, + replaced_parameters_filename=args.workflowConfigFilename, nickname_prefix=args.nickname_prefix, public_key_filenames=args.public_key_files, private_key_filename=args.private_key_file, diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index fc172388..5b25c01b 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -1303,6 +1303,7 @@ def fromPreviousInstanceDeclaration( self, wfInstance: "WF", securityContextsConfigFilename: "Optional[AnyPath]" = None, + replaced_parameters_filename: "Optional[AnyPath]" = None, nickname_prefix: "Optional[str]" = None, orcids: "Sequence[str]" = [], public_key_filenames: "Sequence[AnyPath]" = [], @@ -1315,6 +1316,7 @@ def fromPreviousInstanceDeclaration( self, wfInstance, securityContextsConfigFilename=securityContextsConfigFilename, + replaced_parameters_filename=replaced_parameters_filename, nickname_prefix=nickname_prefix, orcids=orcids, public_key_filenames=public_key_filenames, @@ -1328,6 +1330,7 @@ def fromPreviousROCrate( self, workflowROCrateFilenameOrURI: "Union[AnyPath, URIType]", securityContextsConfigFilename: "Optional[AnyPath]" = None, + replaced_parameters_filename: "Optional[AnyPath]" = None, nickname_prefix: "Optional[str]" = None, orcids: "Sequence[str]" = [], public_key_filenames: "Sequence[AnyPath]" = [], @@ -1358,6 +1361,7 @@ def fromPreviousROCrate( workflowROCrateFilename, public_name=workflowROCrateFilenameOrURI, 
securityContextsConfigFilename=securityContextsConfigFilename, + replaced_parameters_filename=replaced_parameters_filename, nickname_prefix=nickname_prefix, orcids=orcids, public_key_filenames=public_key_filenames, diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 1bec89d6..03053784 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1165,6 +1165,56 @@ def getMarshallingStatus(self, reread_stats: "bool" = False) -> "MarshallingStat def enableParanoidMode(self) -> None: self.paranoidMode = True + @staticmethod + def __read_yaml_config(filename: "AnyPath") -> "WritableWorkflowMetaConfigBlock": + with open(filename, mode="r", encoding="utf-8") as wcf: + workflow_meta = unmarshall_namedtuple(yaml.safe_load(wcf)) + + return cast("WritableWorkflowMetaConfigBlock", workflow_meta) + + @classmethod + def __merge_params_from_file( + cls, + wfexs: "WfExSBackend", + base_workflow_meta: "WorkflowMetaConfigBlock", + replaced_parameters_filename: "AnyPath", + ) -> "WritableWorkflowMetaConfigBlock": + new_params_meta = cls.__read_yaml_config(replaced_parameters_filename) + + if ( + not isinstance(base_workflow_meta, dict) + or "params" not in base_workflow_meta + ): + raise WFException( + "Base workflow metadata does not have the proper WfExS parameters structure" + ) + + if not isinstance(new_params_meta, dict) or "params" not in new_params_meta: + raise WFException( + f"Loaded {replaced_parameters_filename} does not have the proper WfExS parameters structure" + ) + + # Now, trim everything but what it is allowed + existing_keys = set(new_params_meta.keys()) + existing_keys.remove("params") + if len(existing_keys) > 0: + for key in existing_keys: + del new_params_meta[key] + + # This key is needed to pass the validation + new_params_meta["workflow_id"] = "dummy" + # Let's check! 
+ if wfexs.validateConfigFiles(new_params_meta) > 0: + raise WFException( + f"Loaded WfExS parameters from {replaced_parameters_filename} fails (have a look at the log messages for details)" + ) + + # Last, merge + workflow_meta = copy.deepcopy(base_workflow_meta) + workflow_meta["params"].update(new_params_meta["params"]) + + return workflow_meta + @classmethod def FromWorkDir( cls, @@ -1218,8 +1268,7 @@ def FromFiles( This class method creates a new staged working directory """ - with open(workflowMetaFilename, mode="r", encoding="utf-8") as wcf: - workflow_meta = unmarshall_namedtuple(yaml.safe_load(wcf)) + workflow_meta = cls.__read_yaml_config(workflowMetaFilename) return cls.FromStagedRecipe( wfexs, @@ -1284,6 +1333,7 @@ def FromPreviousInstanceDeclaration( wfexs: "WfExSBackend", wfInstance: "WF", securityContextsConfigFilename: "Optional[AnyPath]" = None, + replaced_parameters_filename: "Optional[AnyPath]" = None, nickname_prefix: "Optional[str]" = None, orcids: "Sequence[str]" = [], public_key_filenames: "Sequence[AnyPath]" = [], @@ -1304,6 +1354,11 @@ def FromPreviousInstanceDeclaration( # Now we should be able to get the configuration file workflow_meta = copy.deepcopy(wfInstance.staging_recipe) + if replaced_parameters_filename is not None: + workflow_meta = cls.__merge_params_from_file( + wfexs, workflow_meta, replaced_parameters_filename + ) + # We have to reset the inherited paranoid mode and nickname for k_name in ("nickname", "paranoid_mode"): if k_name in workflow_meta: @@ -1331,6 +1386,7 @@ def FromPreviousROCrate( workflowROCrateFilename: "AnyPath", public_name: "str", # Mainly used for provenance and exceptions securityContextsConfigFilename: "Optional[AnyPath]" = None, + replaced_parameters_filename: "Optional[AnyPath]" = None, nickname_prefix: "Optional[str]" = None, orcids: "Sequence[str]" = [], public_key_filenames: "Sequence[AnyPath]" = [], @@ -1437,6 +1493,11 @@ def FromPreviousROCrate( logging.debug(f"{json.dumps(workflow_meta, indent=4)}") + if replaced_parameters_filename is not None: + workflow_meta = cls.__merge_params_from_file( + wfexs, workflow_meta, replaced_parameters_filename + ) + # Last, be sure that what it has been generated is correct if wfexs.validateConfigFiles(workflow_meta, securityContextsConfigFilename) > 0: raise WFException( @@ -3513,6 +3574,22 @@ def marshallConfig( return self.configMarshalled + def __get_combined_globals(self) -> "Mapping[str, Any]": + """ + This method is needed since workflow engines and container factories + are dynamically loaded. 
+ """ + combined_globals = copy.copy(common_defs_module.__dict__) + combined_globals.update(globals()) + combined_globals.update( + [ + (workflow_engine.__name__, workflow_engine) + for workflow_engine in self.wfexs.listWorkflowEngineClasses() + ] + ) + + return combined_globals + def unmarshallConfig( self, fail_ok: "bool" = False ) -> "Optional[Union[bool, datetime.datetime]]": @@ -3741,8 +3818,7 @@ def unmarshallStage( with open(marshalled_stage_file, mode="r", encoding="utf-8") as msF: marshalled_stage = yaml.load(msF, Loader=YAMLLoader) - combined_globals = copy.copy(common_defs_module.__dict__) - combined_globals.update(globals()) + combined_globals = self.__get_combined_globals() stage = unmarshall_namedtuple(marshalled_stage, combined_globals) self.remote_repo = stage.get("remote_repo") # This one takes precedence @@ -3944,8 +4020,7 @@ def unmarshallExecute( try: with open(marshalled_execution_file, mode="r", encoding="utf-8") as meF: marshalled_execution = yaml.load(meF, Loader=YAMLLoader) - combined_globals = copy.copy(common_defs_module.__dict__) - combined_globals.update(globals()) + combined_globals = self.__get_combined_globals() execution_read = unmarshall_namedtuple( marshalled_execution, combined_globals ) @@ -4144,8 +4219,7 @@ def unmarshallExport( try: with open(marshalled_export_file, mode="r", encoding="utf-8") as meF: marshalled_export = yaml.load(meF, Loader=YAMLLoader) - combined_globals = copy.copy(common_defs_module.__dict__) - combined_globals.update(globals()) + combined_globals = self.__get_combined_globals() self.runExportActions = unmarshall_namedtuple( marshalled_export, combined_globals ) From e7f6cb0892e0349e22c7f11574b695724c7074d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 May 2024 22:55:36 +0200 Subject: [PATCH 41/42] Trying to add support back for Python 3.7 --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e7362889..9110abea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,4 +32,5 @@ exceptiongroup ; python_version < '3.11' # Needed for proper JSON-LD parsing + SPARQL query aiohttp pyld -rdflib >= 7.0.0 +rdflib >= 7.0.0 ; python_version >= '3.8' +rdflib < 7.0.0 ; python_version < '3.8' From 92a6e219bd66373630662b9e7d56e3a6a34c8be4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 May 2024 23:00:52 +0200 Subject: [PATCH 42/42] Possible fix for a pylint zealous error --- wfexs_backend/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 03053784..1302d8af 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1439,7 +1439,7 @@ def FromPreviousROCrate( except Exception as e2: raise WFException( f"Unable to locate RO-Crate metadata descriptor within {public_name}" - ) from ExceptionGroup( + ) from ExceptionGroup( # pylint: disable=possibly-used-before-assignment f"Both {ROCRATE_JSONLD_FILENAME} and {LEGACY_ROCRATE_JSONLD_FILENAME} tried", [e, e2], )
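Taken together, PATCH 37's `build_pid_from_repo` and `WfExSBackend.gen_workflow_pid` are what let the RO-Crate importer reconstruct a canonical `workflow_id` from the shredded repository details. A minimal sketch of that contract, mirroring one of the fixtures in `tests/fetchers/test_git.py` (resolving a GitHub URL may contact the remote, and the expected PID is the one asserted by those tests):

```
import logging
from typing import cast

from wfexs_backend.fetchers.git import GitFetcher

# Shred the URL into a RemoteRepo named tuple (returns None when the
# URL shape is not recognized as a git repository)
remote_repo = GitFetcher.GuessRepoParams(
    cast("URIType", "git+https://github.com/inab/WfExS-backend.git"),
    logger=logging.getLogger(__name__),
)

if remote_repo is not None:
    # Re-assemble a canonical workflow PID from the shredded details;
    # per the test fixtures this yields
    # "git+https://github.com/inab/WfExS-backend.git@main"
    print(GitFetcher({}).build_pid_from_repo(remote_repo))
```

`gen_workflow_pid` simply walks every registered repo fetcher, keeping the first non-`None` answer and falling back to the raw `repo_url`, which is what ends up as `workflow_id` in the staging recipe generated by `FromPreviousROCrate`.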