Merge branch 'release/1.4.0'

ecmwf · May 23, 2024 · 7588885 · 7588885
2 parents 5d37b8c + 3213232
commit 7588885
Show file tree

Hide file tree

Showing 25 changed files with 502 additions and 92 deletions.
diff --git a/.flake8 b/.flake8
@@ -3,4 +3,4 @@ max-line-length = 120
 extend-ignore =
     E203
 per-file-ignores =
-    __init__.py:F401
+    __init__.py:F401
diff --git a/.github/ci-config.yml b/.github/ci-config.yml
@@ -0,0 +1,7 @@
+dependencies: |
+  ecmwf/ecbuild
+  ecmwf/eckit
+  ecmwf/odc
+dependency_branch: develop
+parallelism_factor: 8
+self_build: false
diff --git a/.github/ci-hpc-config.yml b/.github/ci-hpc-config.yml
@@ -0,0 +1,9 @@
+build:
+  python: '3.10'
+  modules:
+    - ninja
+  dependencies:
+    - ecmwf/ecbuild@develop
+    - ecmwf/eckit@develop
+    - ecmwf/odc@develop
+  parallel: 64
diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
@@ -0,0 +1,11 @@
+name: cd
+
+on:
+  push:
+    tags:
+      - '**'
+
+jobs:
+  pypi:
+    uses: ecmwf-actions/reusable-workflows/.github/workflows/cd-pypi.yml@v2
+    secrets: inherit
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,41 +1,39 @@
 name: ci
 
-# Controls when the workflow will run
 on:
-
-  # Trigger the workflow on all pushes
+  # Trigger the workflow on push to master or develop, except tag creation
   push:
     branches:
-    - '**'
-    tags:
-    - '**'
+      - 'master'
+      - 'develop'
+    tags-ignore:
+      - '**'
 
-  # Trigger the workflow on all pull requests
+  # Trigger the workflow on pull request
   pull_request: ~
 
-  # Allow workflow to be dispatched on demand
+  # Trigger the workflow manually
   workflow_dispatch: ~
 
+  # Trigger after public PR approved for CI
+  pull_request_target:
+    types: [labeled]
+
 jobs:
+  # Run CI including downstream packages on self-hosted runners
+  downstream-ci:
+    name: downstream-ci
+    if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }}
+    uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci.yml@main
+    with:
+      pyodc: ecmwf/pyodc@${{ github.event.pull_request.head.sha || github.sha }}
+    secrets: inherit
 
-  # Calls a reusable CI workflow to qa, test & deploy the current repository.
-  #   It will pull in all needed dependencies and produce a code coverage report on success.
-  #   If all checks were successful and a new release tag pushed, the package will be published on PyPI.
-  #   In case the job fails, a message will be posted to a Microsoft Teams channel.
-  ci:
-    name: ci
-    uses: ecmwf-actions/reusable-workflows/.github/workflows/ci-python.yml@v1
+  # Build downstream packages on HPC
+  downstream-ci-hpc:
+    name: downstream-ci-hpc
+    if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }}
+    uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci-hpc.yml@main
     with:
-      codecov_upload: true
-      notify_teams: true
-      build_package_inputs: |
-        self_build: false
-        dependencies: |
-          ecmwf/ecbuild
-          ecmwf/eckit
-          ecmwf/odc
-        dependency_branch: develop
-    secrets:
-      pypi_username: ${{ secrets.PYPI_USERNAME }}
-      pypi_password: ${{ secrets.PYPI_PASSWORD }}
-      incoming_webhook: ${{ secrets.MS_TEAMS_INCOMING_WEBHOOK }}
+      pyodc: ecmwf/pyodc@${{ github.event.pull_request.head.sha || github.sha }}
+    secrets: inherit
diff --git a/.github/workflows/label-public-pr.yml b/.github/workflows/label-public-pr.yml
@@ -0,0 +1,10 @@
+# Manage labels of pull requests that originate from forks
+name: label-public-pr
+
+on:
+  pull_request_target:
+    types: [opened, synchronize]
+
+jobs:
+  label:
+    uses: ecmwf-actions/reusable-workflows/.github/workflows/label-pr.yml@v2
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,6 @@ dist/
 *egg-info*
 .eggs
 .ipynb_checkpoints
+
+# editors
+.vscode
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,20 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.4.0
+  hooks:
+  -   id: trailing-whitespace
+  -   id: end-of-file-fixer
+  -   id: check-yaml
+  -   id: check-added-large-files
+- repo: https://github.com/PyCQA/isort
+  rev: 5.12.0
+  hooks:
+    - id: isort
+- repo: https://github.com/psf/black
+  rev: 23.7.0
+  hooks:
+    - id: black
+- repo: https://github.com/PyCQA/flake8
+  rev: 6.1.0
+  hooks:
+    - id: flake8
diff --git a/README.md b/README.md
@@ -28,7 +28,7 @@ The package contains two different implementations of the same library:
 * [pandoc]
 * [Jupyter Notebook]
 
-For `codc` to work, `odc` library must be compiled and installed on the system and made available to Python (through the CFFI mechanism) as a shared library. There are multiple ways to make the library visible to CFFI: it can be installed as a system library, the installation prefix can be passed in `odc_DIR` environment variable, or the library directory can be included in `LD_LIBRARY_PATH`.
+For `codc` to work, the `odc` library must be compiled and installed on the system and made available to Python (through the CFFI mechanism) as a shared library. There are multiple ways to make the library visible to CFFI: it can be installed as a system library, the installation prefix can be passed in the `odc_DIR` or `ODC_DIR` environment variables, or the library directory can be included in `LD_LIBRARY_PATH`.
 
 ## Installation
 

diff --git a/codc/__init__.py b/codc/__init__.py
@@ -4,4 +4,4 @@
 from .lib import ODCException
 from .reader import Reader, read_odb
 
-__version__ = "1.3.0"
+__version__ = "1.4.0"
diff --git a/codc/constants.py b/codc/constants.py
@@ -1,6 +1,6 @@
 from enum import IntEnum, unique
 
-from .lib import lib
+from .lib import ffi, lib, ODCException
 
 
 @unique
@@ -19,3 +19,19 @@ class DataType(IntEnum):
 STRING = DataType.STRING
 BITFIELD = DataType.BITFIELD
 DOUBLE = DataType.DOUBLE
+
+
+_type_names = {}
+
+def type_name(typ):
+    try:
+        return _type_names[typ]
+    except KeyError:
+        try:
+            pname = ffi.new("const char**")
+            lib.odc_column_type_name(typ, pname)
+            name = ffi.string(pname[0]).decode("utf-8")
+        except ODCException:
+            name = "<unknown>"
+        _type_names[typ] = name
+        return name
diff --git a/codc/encoder.py b/codc/encoder.py
@@ -57,15 +57,15 @@ def infer_column_type(arr, override_type):
                     dtype = INTEGER
                 else:
                     dtype = DOUBLE
-            elif arr.dtype == "object":
+            if arr.dtype == "object" or pandas.api.types.is_string_dtype(arr):
                 if not arr.isnull().all() and all(s is None or isinstance(s, str) for s in arr):
                     dtype = STRING
                 elif arr.isnull().all():
                     dtype = INTEGER
 
         # With an inferred, or supplied column type, massage the data into a form that can be encoded
 
-        if arr.dtype == "object":
+        if arr.dtype == "object" or pandas.api.types.is_string_dtype(arr):
             # Map strings into an array that can be read in C
             if dtype == STRING:
                 return_arr = return_arr.astype("|S{}".format(max(8, 8 * (1 + ((max(len(s) for s in arr) - 1) // 8)))))
@@ -106,7 +106,7 @@ def infer_column_type(arr, override_type):
         data, dtype = infer_column_type(data, types.get(name, None))
         data_cache.append(data)
 
-        lib.odc_encoder_add_column(encoder, name.encode("utf-8"), dtype)
+        lib.odc_encoder_add_column(encoder, str(name).encode("utf-8"), dtype)
         lib.odc_encoder_column_set_data_array(
             encoder,
             i,

diff --git a/codc/frame.py b/codc/frame.py
@@ -1,4 +1,4 @@
-from .constants import BITFIELD, DOUBLE, INTEGER, REAL, STRING, DataType
+from .constants import BITFIELD, DOUBLE, INTEGER, REAL, STRING, DataType, type_name
 from .lib import ffi, lib, memoize_constant
 
 try:
@@ -64,7 +64,7 @@ def __str__(self):
             bitfield_str = "(" + ",".join("{}:{}".format(b.name, b.size) for b in self.bitfields) + ")"
         else:
             bitfield_str = ""
-        return "{}:{}{}".format(self.name, self.dtype, bitfield_str)
+        return "{}:{}{}".format(self.name, type_name(self.dtype), bitfield_str)
 
     def __repr__(self):
         return str(self)
@@ -177,18 +177,28 @@ def dataframe(self, columns=None):
 
         if columns is not None:
             final_columns = set()
+            _original_columns = self.column_dict.keys()
+            _original_simple_columns = self.simple_column_dict.keys()
+
             for colname in columns:
-                dotpos = colname.find(".")
-                if dotpos == -1:
+
+                # If the column is already present, then use that one directly.
+                # This ensures that we can handle exploded bitfield columns, and extract bitfields from
+                # existing columns below
+                if colname in _original_columns or colname in _original_simple_columns:
                     final_columns.add(colname)
                 else:
-                    column_name = colname[:dotpos]
-                    sp = colname[dotpos + 1 :].split("@")
-                    bitfield_name = sp[0]
-                    if len(sp) > 1:
-                        column_name += "@" + sp[1]
-                    final_columns.add(column_name)
-                    bitfields.append((bitfield_name, column_name, colname))
+                    dotpos = colname.find(".")
+                    if dotpos == -1:
+                        final_columns.add(colname)
+                    else:
+                        column_name = colname[:dotpos]
+                        sp = colname[dotpos + 1 :].split("@")
+                        bitfield_name = sp[0]
+                        if len(sp) > 1:
+                            column_name += "@" + sp[1]
+                        final_columns.add(column_name)
+                        bitfields.append((bitfield_name, column_name, colname))
             columns = list(final_columns)
 
         df = self._dataframe_internal(columns)

diff --git a/codc/lib.py b/codc/lib.py
@@ -16,6 +16,7 @@
 import os
 
 import cffi
+import findlibs
 from pkg_resources import parse_version
 
 __odc_version__ = "1.4.0"
@@ -44,24 +45,11 @@ class PatchedLib:
     def __init__(self):
         ffi.cdef(self.__read_header())
 
-        libnames = [
-            "odccore",
-        ]
-        for env_var in ("ODC_DIR", "odc_DIR"):
-            if os.environ.get(env_var):
-                libnames.insert(0, os.path.join(os.environ[env_var], "lib/libodccore"))
-                libnames.insert(0, os.path.join(os.environ[env_var], "lib64/libodccore"))
-                libnames.insert(0, os.path.join(os.environ[env_var], "lib/libodccore.so"))
-                libnames.insert(0, os.path.join(os.environ[env_var], "lib64/libodccore.so"))
-
-        for libname in libnames:
-            try:
-                self.__lib = ffi.dlopen(libname)
-                break
-            except Exception as e:
-                last_exception = e
-        else:
-            raise CFFIModuleLoadFailed() from last_exception
+        library_path = findlibs.find("odccore", pkg_name="odc")
+        if library_path is None:
+            raise RuntimeError("Cannot find the odccore library")
+
+        self.__lib = ffi.dlopen(library_path)
 
         # Todo: Version check against __version__
 

diff --git a/docs/content/installation.rst b/docs/content/installation.rst
@@ -25,7 +25,7 @@ Optional
 
 .. note::
 
-   For **codc** to work, the **odc** library must be compiled and installed on the system and made available to Python (through the CFFI mechanism) as a shared library. There are multiple ways to make the library visible to CFFI: it can be installed as a system library, the installation prefix can be passed in ``odc_DIR`` environment variable, or the library directory can be included in ``LD_LIBRARY_PATH``.
+   For **codc** to work, the **odc** library must be compiled and installed on the system and made available to Python (through the CFFI mechanism) as a shared library. There are multiple ways to make the library visible to CFFI: it can be installed as a system library, the installation prefix can be passed in the ``odc_DIR`` or ``ODC_DIR`` environment variables, or the library directory can be included in ``LD_LIBRARY_PATH``. For example, if you cloned the odc source code to ``$HOME/odc`` and used ``$HOME/odc/build`` as the build directory, you could use ``export odc_DIR=$HOME/odc/build``.
 
 
 .. _`odc`: https://github.com/ecmwf/odc
@@ -81,4 +81,4 @@ To check if the modules were installed correctly:
 
 .. _`PyPI`: https://pypi.org
 .. _`Conda`: https://docs.conda.io
-.. _`conda-forge`: https://conda-forge.org
+.. _`conda-forge`: https://conda-forge.org
diff --git a/pyodc/__init__.py b/pyodc/__init__.py
@@ -3,4 +3,4 @@
 from .frame import ColumnInfo, Frame
 from .reader import Reader, read_odb
 
-__version__ = "1.3.0"
+__version__ = "1.4.0"
diff --git a/pyodc/codec.py b/pyodc/codec.py
@@ -57,7 +57,7 @@ def data_size(self):
         return 8
 
     def encode_header(self, stream):
-        stream.encodeString(self.column_name)
+        stream.encodeString(str(self.column_name))
         stream.encodeInt32(self.type)
 
         if self.type == DataType.BITFIELD:
@@ -116,11 +116,27 @@ def numChanges(self):
 
 class Constant(Codec):
     @classmethod
-    def from_dataframe(cls, column_name: str, data: pd.Series, data_type: DataType, bitfields):
+    def from_dataframe(cls, column_name: str, data: pd.Series, data_type: DataType, bitfields: list):
         assert data.nunique() == 1 and not data.hasnans
-        assert not bitfields
         value = next(iter(data))
-        return cls(column_name, value, value, data_type)
+
+        if bitfields:
+            assert data_type == DataType.BITFIELD
+            bitfield_names = [bf if isinstance(bf, str) else bf[0] for bf in bitfields]
+            bitfield_sizes = [1 if isinstance(bf, str) else bf[1] for bf in bitfields]
+        else:
+            bitfield_names = []
+            bitfield_sizes = []
+
+        return cls(
+            column_name,
+            minval=value,
+            maxval=value,
+            data_type=data_type,
+            has_missing=False,
+            bitfield_names=bitfield_names,
+            bitfield_sizes=bitfield_sizes,
+        )
 
     def encode(self, stream, value):
         pass
@@ -432,7 +448,7 @@ def select_codec(column_name: str, data: pd.Series, data_type, bitfields):
                 data_type = DataType.DOUBLE
         elif data.dtype == "float32":
             data_type = DataType.REAL
-        elif data.dtype == "object":
+        elif data.dtype == "object" or pd.api.types.is_string_dtype(data):
             if not data.isnull().all() and all(s is None or isinstance(s, str) for s in data):
                 data_type = DataType.STRING
 
@@ -481,7 +497,7 @@ def select_codec(column_name: str, data: pd.Series, data_type, bitfields):
             codec_class = ShortReal2
 
     elif data_type == DataType.STRING:
-        if data.nunique() == 1 and not data.hasnans:
+        if data.nunique() == 1 and len(data.iloc[0]) <= 8 and not data.hasnans:
             codec_class = ConstantString
         elif data.nunique() <= 256:
             codec_class = Int8String