Merge branch 'main' into fix-export-readers
dalonsoa authored Feb 7, 2025
2 parents 2490ca8 + 7428f83 commit 5edb097
Showing 11 changed files with 1,455 additions and 616 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci_template.yml
@@ -9,7 +9,7 @@ on:
type: string
python-version:
description: 'Python version'
default: '["3.9", "3.12"]'
default: '["3.10", "3.13"]'
type: string

jobs:
@@ -27,7 +27,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Install Poetry
uses: abatilo/actions-poetry@v3.0.1
uses: abatilo/actions-poetry@v4.0.0
with:
poetry-version: 1.8.3

@@ -38,6 +38,6 @@ jobs:
run: poetry run pytest

- name: Upload coverage to Codecov
if: success() && (matrix.os == 'ubuntu-latest' && matrix.python-version == 3.9)
if: success() && (matrix.os == 'ubuntu-latest' && matrix.python-version == 3.10)
uses: codecov/codecov-action@v5

6 changes: 3 additions & 3 deletions .github/workflows/publish.yml
@@ -12,7 +12,7 @@ jobs:
uses: ./.github/workflows/ci_template.yml
with:
os: '["ubuntu-latest", "windows-latest", "macos-latest"]'
python-version: '["3.9", "3.10", "3.11", "3.12"]'
python-version: '["3.10", "3.11", "3.12", "3.13"]'

build-wheel:
needs: test
@@ -37,10 +37,10 @@ jobs:

- uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: 3.10

- name: Install Poetry
uses: abatilo/actions-poetry@v3.0.1
uses: abatilo/actions-poetry@v4.0.0
with:
poetry-version: 1.8.3

8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -5,20 +5,20 @@ repos:
- id: check-merge-conflict
- id: debug-statements
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.3
rev: v0.9.4
hooks:
- id: ruff
types_or: [python]
args: [--fix]
- id: ruff-format
types_or: [python]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: "v1.13.0"
rev: "v1.14.1"
hooks:
- id: mypy
additional_dependencies: [types-PyYAML]
additional_dependencies: [types-PyYAML, pydantic]
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.43.0
rev: v0.44.0
hooks:
- id: markdownlint
args: ["--disable", "MD013", "MD041", "--"]
65 changes: 65 additions & 0 deletions csvy/validators/__init__.py
@@ -0,0 +1,65 @@
"""Validators for the CSVY format."""

from collections.abc import Mapping
from typing import Any

from pydantic import BaseModel

from .csv_dialect import CSVDialectValidator # noqa: F401
from .registry import VALIDATORS_REGISTRY, register_validator # noqa: F401
from .table_schema import SchemaValidator # noqa: F401


def validate_header(header: dict[str, Any]) -> dict[str, Any]:
"""Run the validators on the header.
This function runs the validators on the header. It uses the keys of the header to
find the validators in the registry and runs them on the corresponding values. As
a result, some values in the header may be replaced by the validated values in the
form of Pydantic models.
If the header is an already validated header, the Pydantic models within, if any,
are dumped to dictionaries and re-validated. This accounts for the case where
attributes of the Pydantic models are changed to invalid values.
Args:
header: The header of the CSVY file.
Returns:
The validated header.
"""
validated_header: dict[str, Any] = {}
for key, value in header.items():
value_ = value.model_dump() if isinstance(value, BaseModel) else value
if key in VALIDATORS_REGISTRY:
if not isinstance(value_, Mapping):
raise TypeError(
f"Value for '{key}' must be a mapping, not a '{type(value_)}'."
)
validator = VALIDATORS_REGISTRY[key]
validated_header[key] = validator(**value_)
else:
validated_header[key] = value_
return validated_header


def header_to_dict(header: dict[str, Any]) -> dict[str, Any]:
"""Transform the header into a serializable dictionary.
Transforms the header with validators to a header with dictionaries that can be
saved as yaml.
Args:
header: Dictionary to be saved as the header of the CSVY file.
Returns:
The validated header, as a serializable dictionary.
"""
validated_header = {}
for key, value in header.items():
validated_header[key] = (
value.model_dump() if isinstance(value, BaseModel) else value
)
return validated_header
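
A minimal usage sketch of the new validators package (editor-added, not part of the commit): any pydantic.BaseModel subclass can be registered against a header key and will then be applied by validate_header. The "units" key and UnitsValidator model below are hypothetical names used only for illustration.

from pydantic import BaseModel

from csvy.validators import header_to_dict, register_validator, validate_header


# Hypothetical validator: the key "units" is made up for this sketch.
@register_validator("units")
class UnitsValidator(BaseModel):
    temperature: str
    pressure: str


header = {
    "title": "My experiment",  # no validator registered: passed through unchanged
    "units": {"temperature": "K", "pressure": "Pa"},  # validated into UnitsValidator
}

validated = validate_header(header)
assert isinstance(validated["units"], UnitsValidator)

# header_to_dict dumps any Pydantic models back to plain dictionaries,
# ready to be serialised as the YAML header of a CSVY file.
assert header_to_dict(validated)["units"] == {"temperature": "K", "pressure": "Pa"}
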
209 changes: 118 additions & 91 deletions csvy/validators.py → csvy/validators/csv_dialect.py
@@ -1,99 +1,16 @@
"""Module that contains validators for the CSVY file format."""
"""CSV Dialect-related validation."""

from __future__ import annotations

import csv
from collections.abc import Mapping
from typing import Any, Callable, Optional, TypeVar
from enum import Enum
from typing import Any, TypeVar

from pydantic import BaseModel, Field

VALIDATORS_REGISTRY: dict[str, type[BaseModel]] = {}
"""Registry of validators to run on the header."""


def register_validator(
name: str, overwrite: bool = False
) -> Callable[[type[BaseModel]], type[BaseModel]]:
"""Register a validator in the registry.
This function is a decorator that registers a validator in the registry. The name
of the validator is used as the key in the registry.
Args:
name: The name of the validator.
overwrite: Whether to overwrite the validator if it already exists.
Returns:
The decorator function that registers the validator.
"""

def decorator(cls: type[BaseModel]) -> type[BaseModel]:
if not issubclass(cls, BaseModel):
raise TypeError("Validators must be subclasses of pydantic.BaseModel.")

if name in VALIDATORS_REGISTRY and not overwrite:
raise ValueError(f"Validator with name '{name}' already exists.")

VALIDATORS_REGISTRY[name] = cls
return cls

return decorator


def validate_header(header: dict[str, Any]) -> dict[str, Any]:
"""Run the validators on the header.
This function runs the validators on the header. It uses the keys of the header to
find the validators in the registry and runs them on the corresponding values. As
a result, some values in the header may be replaced by the validated values in the
form of Pydantic models.
If the header is an already validated header, the Pydantic models within, if any,
are dumped to dictionaries and re-validated. This accounts for the case where
attributes of the Pydantic models are changed to invalid values.
Args:
header: The header of the CSVY file.
Returns:
The validated header.
"""
validated_header: dict[str, Any] = {}
for key, value in header.items():
value_ = value.model_dump() if isinstance(value, BaseModel) else value
if key in VALIDATORS_REGISTRY:
if not isinstance(value_, Mapping):
raise TypeError(
f"Value for '{key}' must be a mapping, not a '{type(value_)}'."
)
validator = VALIDATORS_REGISTRY[key]
validated_header[key] = validator(**value_)
else:
validated_header[key] = value_
return validated_header


def header_to_dict(header: dict[str, Any]) -> dict[str, Any]:
"""Transform the header into a serializable dictionary.
Transforms the header with validators to a header with dictionaries that can be
saved as yaml.
Args:
header: Dictionary to be saved as the header of the CSVY file.
Returns:
The validated header, as a serializable dictionary.
"""
validated_header = {}
for key, value in header.items():
validated_header[key] = (
value.model_dump() if isinstance(value, BaseModel) else value
)
return validated_header
from .registry import register_validator

# CSV Dialect-related validation

# Generic type variable that can be CSVDialectValidator or any of its subclasses.
T = TypeVar("T", bound="CSVDialectValidator")
@@ -127,7 +44,7 @@ class CSVDialectValidator(BaseModel):

delimiter: str = Field(default=",")
doublequote: bool = Field(default=True)
escapechar: Optional[str] = Field(default=None)
escapechar: str | None = Field(default=None)
lineterminator: str = Field(default="\r\n")
quotechar: str = Field(default='"')
skipinitialspace: bool = Field(default=False)
@@ -223,3 +140,113 @@ def unix_dialect(cls: type[T]) -> T:
quotechar=unix.quotechar or '"',
skipinitialspace=unix.skipinitialspace,
)


# Table Schema-related validation


class TypeEnum(str, Enum):
"""Enumeration of the possible types for the Table Schema."""

STRING = "string"
INTEGER = "integer"
BOOLEAN = "boolean"
OBJECT = "object"
ARRAY = "array"
DATE = "date"
TIME = "time"
DATETIME = "datetime"
YEAR = "year"
YEARMONTH = "yearmonth"
DURATION = "duration"
GEOPOINT = "geopoint"
GEOJSON = "geojson"
ANY = "any"


class ConstraintsValidator(BaseModel):
"""Validator for the constraints in the Table Schema.
This class is used to validate the constraints in the Table Schema. It is based on
the constraints defined in the Table Schema specification.
Attributes:
required: A boolean indicating if the value is required.
unique: A boolean indicating if the value is unique.
minimum: The minimum value. Applies to types: integer, number, date,
time, datetime, year, yearmonth.
maximum: The maximum value. Applies to types: integer, number, date,
time, datetime, year, yearmonth.
minLength: The minimum length of the field. Applies to collections (string,
array, object).
maxLength: The maximum length of the field. Applies to collections (string,
array, object).
pattern: A regular expression pattern that the value must match. Applies to
types: string.
enum: A list of possible values for the field.
"""

required: bool | None = Field(None)
unique: bool | None = Field(None)
minimum: int | float | None = Field(None)
maximum: int | float | None = Field(None)
minLength: int | None = Field(None)
maxLength: int | None = Field(None)
pattern: str | None = Field(None)
enum: list[Any] | None = Field(None)


class ColumnValidator(BaseModel):
"""Validator for the columns in the Table Schema.
This class is used to validate the columns in the Table Schema. It is based on the
columns defined in the Table Schema specification.
Attributes:
name: The name of the column.
title: A nicer human readable label or title for the field.
type_: A string specifying the type.
format_: A string specifying a format.
example: An example value for the field.
description: A description for the field.
constraints: A dictionary of constraints for the field.
"""

name: str = Field(..., description="Column name.")
title: str | None = Field(
None, description="A nicer human readable label or title for the field."
)
type_: TypeEnum | None = Field(
None, alias="type", description="A string specifying the type."
)
format_: str | None = Field(
None, alias="format", description="A string specifying a format."
)
example: str | None = Field(None, description="An example value for the field.")
description: str | None = Field(None, description="A description for the field.")
constraints: ConstraintsValidator | None = Field(
None, description="A dictionary of constraints for the field."
)

def model_dump(self, *args, **kwargs) -> dict[str, Any]:
"""Dump the model to a dictionary.
This method dumps the model to a dictionary. It sets exclude_unset to True and
by_alias to True, so that only the attributes that were set are included in the
dictionary and their aliases are always used.
Finally, it converts the attributes that are Enum instances to their values.
Returns:
A dictionary with the model attributes.
"""
kwargs["exclude_unset"] = True
kwargs["by_alias"] = True
output = super().model_dump(*args, **kwargs)
for key, value in output.items():
if isinstance(value, Enum):
output[key] = value.value
return output
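
A brief, editor-added sketch of how the dialect and Table Schema validators might be used, assuming the classes live where this diff places them (csvy/validators/csv_dialect.py) and relying only on the fields shown above.

from csvy.validators.csv_dialect import (
    ColumnValidator,
    ConstraintsValidator,
    CSVDialectValidator,
)

# CSV dialect settings mirror Python's csv module; unspecified fields keep
# the defaults declared on the model.
dialect = CSVDialectValidator(delimiter=";", skipinitialspace=True)
assert dialect.quotechar == '"'

# Table Schema column: "type" is the alias of the type_ attribute, and the
# overridden model_dump() restores aliases, drops unset fields and unwraps
# enum members into their string values.
column = ColumnValidator(
    name="temperature",
    type="integer",
    constraints=ConstraintsValidator(minimum=0, maximum=400),
)
assert column.model_dump() == {
    "name": "temperature",
    "type": "integer",
    "constraints": {"minimum": 0, "maximum": 400},
}
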
