Skip to content

Commit

Permalink
Merge branch 'release/1.4.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
simondsmart committed May 23, 2024
2 parents 5d37b8c + 3213232 commit 7588885
Show file tree
Hide file tree
Showing 25 changed files with 502 additions and 92 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ max-line-length = 120
extend-ignore =
E203
per-file-ignores =
__init__.py:F401
__init__.py:F401
7 changes: 7 additions & 0 deletions .github/ci-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Build configuration for CI.
# Packages to build before this one, one "owner/repo" per line (literal block scalar).
dependencies: |
  ecmwf/ecbuild
  ecmwf/eckit
  ecmwf/odc
# Branch of each dependency to check out.
dependency_branch: develop
# NOTE(review): presumably the number of parallel build jobs - confirm against the CI tooling.
parallelism_factor: 8
# NOTE(review): presumably disables the compiled build step for this repository itself - confirm.
self_build: false
9 changes: 9 additions & 0 deletions .github/ci-hpc-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Build configuration for the HPC CI.
build:
  # Quoted so that the version is read as the string "3.10" and not the float 3.1.
  python: '3.10'
  modules:
    - ninja
  # Dependencies are given as "owner/repo@branch".
  dependencies:
    - ecmwf/ecbuild@develop
    - ecmwf/eckit@develop
    - ecmwf/odc@develop
  # NOTE(review): presumably the number of parallel build jobs - confirm against the HPC CI tooling.
  parallel: 64
11 changes: 11 additions & 0 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: cd

# Run on every pushed tag, whatever its name
on:
  push:
    tags:
      - '**'

jobs:
  # Delegates to the shared cd-pypi reusable workflow, passing on this repository's secrets
  pypi:
    uses: ecmwf-actions/reusable-workflows/.github/workflows/cd-pypi.yml@v2
    secrets: inherit
54 changes: 26 additions & 28 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,41 +1,39 @@
name: ci

# Controls when the workflow will run
on:

# Trigger the workflow on all pushes
# Trigger the workflow on push to master or develop, except tag creation
push:
branches:
- '**'
tags:
- '**'
- 'master'
- 'develop'
tags-ignore:
- '**'

# Trigger the workflow on all pull requests
# Trigger the workflow on pull request
pull_request: ~

# Allow workflow to be dispatched on demand
# Trigger the workflow manually
workflow_dispatch: ~

# Trigger after public PR approved for CI
pull_request_target:
types: [labeled]

jobs:
# Run CI including downstream packages on self-hosted runners
downstream-ci:
name: downstream-ci
if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }}
uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci.yml@main
with:
pyodc: ecmwf/pyodc@${{ github.event.pull_request.head.sha || github.sha }}
secrets: inherit

# Calls a reusable CI workflow to qa, test & deploy the current repository.
# It will pull in all needed dependencies and produce a code coverage report on success.
# If all checks were successful and a new release tag pushed, the package will be published on PyPI.
# In case the job fails, a message will be posted to a Microsoft Teams channel.
ci:
name: ci
uses: ecmwf-actions/reusable-workflows/.github/workflows/ci-python.yml@v1
# Build downstream packages on HPC
downstream-ci-hpc:
name: downstream-ci-hpc
if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }}
uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci-hpc.yml@main
with:
codecov_upload: true
notify_teams: true
build_package_inputs: |
self_build: false
dependencies: |
ecmwf/ecbuild
ecmwf/eckit
ecmwf/odc
dependency_branch: develop
secrets:
pypi_username: ${{ secrets.PYPI_USERNAME }}
pypi_password: ${{ secrets.PYPI_PASSWORD }}
incoming_webhook: ${{ secrets.MS_TEAMS_INCOMING_WEBHOOK }}
pyodc: ecmwf/pyodc@${{ github.event.pull_request.head.sha || github.sha }}
secrets: inherit
10 changes: 10 additions & 0 deletions .github/workflows/label-public-pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Manage labels of pull requests that originate from forks
name: label-public-pr

# Runs when a pull request is opened or receives new commits
on:
  pull_request_target:
    types: [opened, synchronize]

jobs:
  # Delegates to the shared label-pr reusable workflow
  label:
    uses: ecmwf-actions/reusable-workflows/.github/workflows/label-pr.yml@v2
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ dist/
*egg-info*
.eggs
.ipynb_checkpoints

# editors
.vscode
20 changes: 20 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# pre-commit hooks: generic file hygiene, then import sorting, formatting and linting.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/PyCQA/isort
    rev: 5.12.0
    hooks:
      - id: isort
  - repo: https://github.com/psf/black
    rev: 23.7.0
    hooks:
      - id: black
  - repo: https://github.com/PyCQA/flake8
    rev: 6.1.0
    hooks:
      - id: flake8
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ The package contains two different implementations of the same library:
* [pandoc]
* [Jupyter Notebook]

For `codc` to work, `odc` library must be compiled and installed on the system and made available to Python (through the CFFI mechanism) as a shared library. There are multiple ways to make the library visible to CFFI: it can be installed as a system library, the installation prefix can be passed in `odc_DIR` environment variable, or the library directory can be included in `LD_LIBRARY_PATH`.
For `codc` to work, the `odc` library must be compiled and installed on the system and made available to Python (through the CFFI mechanism) as a shared library. There are multiple ways to make the library visible to CFFI: it can be installed as a system library, the installation prefix can be passed in the `odc_DIR` or `ODC_DIR` environment variables, or the library directory can be included in `LD_LIBRARY_PATH`.

## Installation

Expand Down
2 changes: 1 addition & 1 deletion codc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from .lib import ODCException
from .reader import Reader, read_odb

__version__ = "1.3.0"
__version__ = "1.4.0"
18 changes: 17 additions & 1 deletion codc/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from enum import IntEnum, unique

from .lib import lib
from .lib import ffi, lib, ODCException


@unique
Expand All @@ -19,3 +19,19 @@ class DataType(IntEnum):
STRING = DataType.STRING
BITFIELD = DataType.BITFIELD
DOUBLE = DataType.DOUBLE


# Cache of type names already resolved by type_name(), keyed by the type value passed in.
_type_names = {}


def type_name(typ):
    """
    Return the name of a column type as reported by the odc library.

    The name is obtained from ``odc_column_type_name`` the first time a given
    type is requested and is served from the module-level cache afterwards.

    Args:
        typ: The column type value, passed unchanged to ``odc_column_type_name``.

    Returns:
        str: The UTF-8 decoded type name, or ``"<unknown>"`` if the library call
        raised ``ODCException``. Either result is cached.
    """
    if typ in _type_names:
        return _type_names[typ]

    try:
        # Out-parameter that receives a pointer to the C string holding the name
        name_ptr = ffi.new("const char**")
        lib.odc_column_type_name(typ, name_ptr)
        resolved = ffi.string(name_ptr[0]).decode("utf-8")
    except ODCException:
        resolved = "<unknown>"

    _type_names[typ] = resolved
    return resolved
6 changes: 3 additions & 3 deletions codc/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,15 @@ def infer_column_type(arr, override_type):
dtype = INTEGER
else:
dtype = DOUBLE
elif arr.dtype == "object":
if arr.dtype == "object" or pandas.api.types.is_string_dtype(arr):
if not arr.isnull().all() and all(s is None or isinstance(s, str) for s in arr):
dtype = STRING
elif arr.isnull().all():
dtype = INTEGER

# With an inferred, or supplied column type, massage the data into a form that can be encoded

if arr.dtype == "object":
if arr.dtype == "object" or pandas.api.types.is_string_dtype(arr):
# Map strings into an array that can be read in C
if dtype == STRING:
return_arr = return_arr.astype("|S{}".format(max(8, 8 * (1 + ((max(len(s) for s in arr) - 1) // 8)))))
Expand Down Expand Up @@ -106,7 +106,7 @@ def infer_column_type(arr, override_type):
data, dtype = infer_column_type(data, types.get(name, None))
data_cache.append(data)

lib.odc_encoder_add_column(encoder, name.encode("utf-8"), dtype)
lib.odc_encoder_add_column(encoder, str(name).encode("utf-8"), dtype)
lib.odc_encoder_column_set_data_array(
encoder,
i,
Expand Down
32 changes: 21 additions & 11 deletions codc/frame.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .constants import BITFIELD, DOUBLE, INTEGER, REAL, STRING, DataType
from .constants import BITFIELD, DOUBLE, INTEGER, REAL, STRING, DataType, type_name
from .lib import ffi, lib, memoize_constant

try:
Expand Down Expand Up @@ -64,7 +64,7 @@ def __str__(self):
bitfield_str = "(" + ",".join("{}:{}".format(b.name, b.size) for b in self.bitfields) + ")"
else:
bitfield_str = ""
return "{}:{}{}".format(self.name, self.dtype, bitfield_str)
return "{}:{}{}".format(self.name, type_name(self.dtype), bitfield_str)

def __repr__(self):
return str(self)
Expand Down Expand Up @@ -177,18 +177,28 @@ def dataframe(self, columns=None):

if columns is not None:
final_columns = set()
_original_columns = self.column_dict.keys()
_original_simple_columns = self.simple_column_dict.keys()

for colname in columns:
dotpos = colname.find(".")
if dotpos == -1:

# If the column is already present, then use that one directly.
# This ensures that we can handle exploded bitfield columns, and extract bitfields from
# existing columns below
if colname in _original_columns or colname in _original_simple_columns:
final_columns.add(colname)
else:
column_name = colname[:dotpos]
sp = colname[dotpos + 1 :].split("@")
bitfield_name = sp[0]
if len(sp) > 1:
column_name += "@" + sp[1]
final_columns.add(column_name)
bitfields.append((bitfield_name, column_name, colname))
dotpos = colname.find(".")
if dotpos == -1:
final_columns.add(colname)
else:
column_name = colname[:dotpos]
sp = colname[dotpos + 1 :].split("@")
bitfield_name = sp[0]
if len(sp) > 1:
column_name += "@" + sp[1]
final_columns.add(column_name)
bitfields.append((bitfield_name, column_name, colname))
columns = list(final_columns)

df = self._dataframe_internal(columns)
Expand Down
24 changes: 6 additions & 18 deletions codc/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import os

import cffi
import findlibs
from pkg_resources import parse_version

__odc_version__ = "1.4.0"
Expand Down Expand Up @@ -44,24 +45,11 @@ class PatchedLib:
def __init__(self):
ffi.cdef(self.__read_header())

libnames = [
"odccore",
]
for env_var in ("ODC_DIR", "odc_DIR"):
if os.environ.get(env_var):
libnames.insert(0, os.path.join(os.environ[env_var], "lib/libodccore"))
libnames.insert(0, os.path.join(os.environ[env_var], "lib64/libodccore"))
libnames.insert(0, os.path.join(os.environ[env_var], "lib/libodccore.so"))
libnames.insert(0, os.path.join(os.environ[env_var], "lib64/libodccore.so"))

for libname in libnames:
try:
self.__lib = ffi.dlopen(libname)
break
except Exception as e:
last_exception = e
else:
raise CFFIModuleLoadFailed() from last_exception
library_path = findlibs.find("odccore", pkg_name="odc")
if library_path is None:
raise RuntimeError("Cannot find the odccore library")

self.__lib = ffi.dlopen(library_path)

# Todo: Version check against __version__

Expand Down
4 changes: 2 additions & 2 deletions docs/content/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Optional

.. note::

For **codc** to work, the **odc** library must be compiled and installed on the system and made available to Python (through the CFFI mechanism) as a shared library. There are multiple ways to make the library visible to CFFI: it can be installed as a system library, the installation prefix can be passed in ``odc_DIR`` environment variable, or the library directory can be included in ``LD_LIBRARY_PATH``.
For **codc** to work, the **odc** library must be compiled and installed on the system and made available to Python (through the CFFI mechanism) as a shared library. There are multiple ways to make the library visible to CFFI: it can be installed as a system library, the installation prefix can be passed in the ``odc_DIR`` or ``ODC_DIR`` environment variables, or the library directory can be included in ``LD_LIBRARY_PATH``. For example, if you cloned the odc source code to ``$HOME/odc`` and used ``$HOME/odc/build`` as the build directory, you could use ``export odc_DIR=$HOME/odc/build``.


.. _`odc`: https://github.com/ecmwf/odc
Expand Down Expand Up @@ -81,4 +81,4 @@ To check if the modules were installed correctly:
.. _`PyPI`: https://pypi.org
.. _`Conda`: https://docs.conda.io
.. _`conda-forge`: https://conda-forge.org
.. _`conda-forge`: https://conda-forge.org
2 changes: 1 addition & 1 deletion pyodc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from .frame import ColumnInfo, Frame
from .reader import Reader, read_odb

__version__ = "1.3.0"
__version__ = "1.4.0"
28 changes: 22 additions & 6 deletions pyodc/codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def data_size(self):
return 8

def encode_header(self, stream):
stream.encodeString(self.column_name)
stream.encodeString(str(self.column_name))
stream.encodeInt32(self.type)

if self.type == DataType.BITFIELD:
Expand Down Expand Up @@ -116,11 +116,27 @@ def numChanges(self):

class Constant(Codec):
@classmethod
def from_dataframe(cls, column_name: str, data: pd.Series, data_type: DataType, bitfields):
def from_dataframe(cls, column_name: str, data: pd.Series, data_type: DataType, bitfields: list):
assert data.nunique() == 1 and not data.hasnans
assert not bitfields
value = next(iter(data))
return cls(column_name, value, value, data_type)

if bitfields:
assert data_type == DataType.BITFIELD
bitfield_names = [bf if isinstance(bf, str) else bf[0] for bf in bitfields]
bitfield_sizes = [1 if isinstance(bf, str) else bf[1] for bf in bitfields]
else:
bitfield_names = []
bitfield_sizes = []

return cls(
column_name,
minval=value,
maxval=value,
data_type=data_type,
has_missing=False,
bitfield_names=bitfield_names,
bitfield_sizes=bitfield_sizes,
)

def encode(self, stream, value):
pass
Expand Down Expand Up @@ -432,7 +448,7 @@ def select_codec(column_name: str, data: pd.Series, data_type, bitfields):
data_type = DataType.DOUBLE
elif data.dtype == "float32":
data_type = DataType.REAL
elif data.dtype == "object":
elif data.dtype == "object" or pd.api.types.is_string_dtype(data):
if not data.isnull().all() and all(s is None or isinstance(s, str) for s in data):
data_type = DataType.STRING

Expand Down Expand Up @@ -481,7 +497,7 @@ def select_codec(column_name: str, data: pd.Series, data_type, bitfields):
codec_class = ShortReal2

elif data_type == DataType.STRING:
if data.nunique() == 1 and not data.hasnans:
if data.nunique() == 1 and len(data.iloc[0]) <= 8 and not data.hasnans:
codec_class = ConstantString
elif data.nunique() <= 256:
codec_class = Int8String
Expand Down
Loading

0 comments on commit 7588885

Please sign in to comment.