From 6e4611fefbca524ed42945810711d245d5be235c Mon Sep 17 00:00:00 2001 From: Koeng101 Date: Sun, 11 Aug 2024 12:09:08 -0700 Subject: [PATCH] py (#81) * add shared library for fasta parsing that can be used with python * add tests and classes for use in python * add automatic testing + building + releasing of resulting python package --- .github/workflows/build.yml | 132 ++++++++++++++++++++++++++++++++++ README.md | 7 +- go.work | 3 +- py/.gitignore | 117 ++++++++++++++++++++++++++++++ py/README.md | 7 ++ py/dnadesign/__init__.py | 0 py/dnadesign/cffi_bindings.py | 33 +++++++++ py/dnadesign/definitions.h | 17 +++++ py/dnadesign/fasta_parser.py | 26 +++++++ py/go.mod | 7 ++ py/go.sum | 6 ++ py/lib.go | 95 ++++++++++++++++++++++++ py/setup.py | 31 ++++++++ py/tests/__init__.py | 0 py/tests/data/example.fasta | 11 +++ py/tests/test_fasta_parser.py | 17 +++++ 16 files changed, 506 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/build.yml create mode 100644 py/.gitignore create mode 100644 py/README.md create mode 100644 py/dnadesign/__init__.py create mode 100644 py/dnadesign/cffi_bindings.py create mode 100644 py/dnadesign/definitions.h create mode 100644 py/dnadesign/fasta_parser.py create mode 100644 py/go.mod create mode 100644 py/go.sum create mode 100644 py/lib.go create mode 100644 py/setup.py create mode 100644 py/tests/__init__.py create mode 100644 py/tests/data/example.fasta create mode 100644 py/tests/test_fasta_parser.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..60f8d1fc --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,132 @@ +name: Build and Package + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + strategy: + matrix: + include: + - os: ubuntu-latest + arch: amd64 + - os: ubuntu-latest + arch: arm64 + - os: macos-latest # This will be ARM64 + arch: arm64 + - os: macos-13 # This will be AMD64 + arch: amd64 + + runs-on: 
${{ matrix.os }} + + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: '1.22' + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel cffi + + - name: Install Zig + uses: goto-bus-stop/setup-zig@v2 + with: + version: 0.11.0 + + - name: Set up Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: latest-stable + + - name: Build Go shared library + working-directory: ./py + run: | + if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "arm64" ]; then + CC="zig cc -target aarch64-linux-gnu" GOOS=linux GOARCH=arm64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go + elif [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "amd64" ]; then + CC="zig cc -target x86_64-linux-gnu" GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go + elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "arm64" ]; then + CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 go build -o dnadesign/libdnadesign.dylib -buildmode=c-shared lib.go + elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "amd64" ]; then + CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 go build -o dnadesign/libdnadesign.dylib -buildmode=c-shared lib.go + fi + env: + CGO_ENABLED: 1 + + - name: List directory contents + working-directory: ./py/dnadesign + run: ls -l + + - name: Build Python package + working-directory: ./py + run: python setup.py sdist bdist_wheel + + - name: Test wheel in fresh environment + run: | + python -m venv test_env + source test_env/bin/activate + pip install ./py/dist/*.whl + python -c "from dnadesign import fasta_parser; print('Library loaded successfully')" + pip install pytest + pytest ./py/tests -v --capture=no + continue-on-error: true + + - name: Debug segmentation fault 
(macOS) + if: failure() && runner.os == 'macOS' + run: | + lldb -o "run" -o "bt all" -o "quit" -- python -m pytest ./py/tests -v + + - name: Debug segmentation fault (Linux) + if: failure() && runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y gdb + gdb -ex "run" -ex "bt full" -ex "quit" --args python -m pytest ./py/tests -v + + - name: Upload artifacts + uses: actions/upload-artifact@v2 + with: + name: dist-${{ runner.os }}-${{ matrix.arch }} + path: py/dist/ + + publish: + needs: build + runs-on: ubuntu-latest + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install twine + + - name: Download artifacts + uses: actions/download-artifact@v2 + with: + path: dist + + - name: Publish to PyPI + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: twine upload dist/**/*/dist/* diff --git a/README.md b/README.md index 7267791b..4384cc27 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,7 @@ DnaDesign is a Go project creating tools for automated genetic design, spanning On the highest level: * `lib` contains core functionality as a go library. * `external` contains functions to work with external bioinformatics command-line interfaces. -* `api` contains an OpenAPI exposing all the major functions of lib. -* `deployment` contains full integration tests and yaml for deploying the DnaDesign API to a k3s cluster. +* `py` contains code to use the dnadesign library in python using a C shared library. ### Detailed repo organization @@ -43,6 +42,9 @@ On the highest level: * [external/minimap2](https://pkg.go.dev/github.com/koeng101/dnadesign/external/minimap2) contains a function for working with [minimap2](https://github.com/lh3/minimap2) with Go. 
* [external/samtools](https://pkg.go.dev/github.com/koeng101/dnadesign/external/samtools) contains a function for generating pileup files using [samtools](https://github.com/samtools/samtools) with Go. +## Python + +We have a Python package, `dnadesign`, which allows Python users to use dnadesign. This is a work-in-progress: more documentation coming soon! ## Contributing @@ -71,6 +73,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Added minimal python packaging [#81](https://github.com/Koeng101/dnadesign/pull/81) - Greatly simplified the Ligate function [#77](https://github.com/Koeng101/dnadesign/pull/77) - Updated barcoding functions to handle edge case of hanging-edge barcodes [#74](https://github.com/Koeng101/dnadesign/pull/74) - Updated megamash to use int instead of uint for minimal Kmer counts (so you can use -1) [#73](https://github.com/Koeng101/dnadesign/pull/73) diff --git a/go.work b/go.work index b7479224..1c851ee7 100644 --- a/go.work +++ b/go.work @@ -1,6 +1,7 @@ -go 1.22.0 +go 1.22.5 use ( ./external ./lib + ./py ) diff --git a/py/.gitignore b/py/.gitignore new file mode 100644 index 00000000..0489c125 --- /dev/null +++ b/py/.gitignore @@ -0,0 +1,117 @@ +dnadesign/lib +dnadesign/libdnadesign.h +venv + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypi.org project descriptions, pipenv is a dependable Python packaging tool. +# it generates the following which should not be included in your version control system. +Pipfile.lock + +# poetry +poetry.lock + +# MyPy +.mypy_cache/ +dmypy.json +dmypy.json.# + +# Pyre type checker +.pyre/ + +# PyCharm + all JetBrains IDEs +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, GoLand +.idea/ +*.iml + +# VSCode +.vscode/ + +# Environment variables +.env +.envrc + diff --git a/py/README.md b/py/README.md new file mode 100644 index 00000000..63b4ad96 --- /dev/null +++ b/py/README.md @@ -0,0 +1,7 @@ +# DnaDesign (Python) +This directory contains code for allowing python users to use dnadesign through a shared C library. + +This is a work-in-progress. Right now, we have only ported the fasta parser. + +### Other platforms +If you have interest in other platforms, like openbsd or freebsd, please add an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them. 
diff --git a/py/dnadesign/__init__.py b/py/dnadesign/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/py/dnadesign/cffi_bindings.py b/py/dnadesign/cffi_bindings.py new file mode 100644 index 00000000..6415a2c0 --- /dev/null +++ b/py/dnadesign/cffi_bindings.py @@ -0,0 +1,33 @@ +from cffi import FFI +import platform +import os +import sys + +ffi = FFI() + +# Define common types based on the platform architecture +is_64b = sys.maxsize > 2**32 +if is_64b: + ffi.cdef("typedef long GoInt;\n") +else: + ffi.cdef("typedef int GoInt;\n") + +current_dir = os.path.dirname(__file__) + +# Build the path to definitions.h and libdnadesign relative to the current script +definitions_path = os.path.join(current_dir, 'definitions.h') + +# Determine the correct library name based on the operating system and architecture +if sys.platform.startswith('darwin'): + lib_name = 'libdnadesign.dylib' +else: + lib_name = 'libdnadesign.so' + +lib_path = os.path.join(current_dir, lib_name) + +# Read the C declarations from an external file +with open(definitions_path, 'r') as f: + ffi.cdef(f.read()) + +# Load the shared library +lib = ffi.dlopen(lib_path) diff --git a/py/dnadesign/definitions.h b/py/dnadesign/definitions.h new file mode 100644 index 00000000..f66ca240 --- /dev/null +++ b/py/dnadesign/definitions.h @@ -0,0 +1,17 @@ +typedef struct FILE FILE; +FILE* fopen(const char* path, const char* mode); +int fclose(FILE* fp); + +typedef struct { + char* identifier; + char* sequence; +} FastaRecord; + +typedef struct { + FastaRecord* records; + GoInt numRecords; + char* error; +} FastaResult; + +FastaResult ParseFastaFromCFile(void* cfile); +FastaResult ParseFastaFromCString(char* cstring); diff --git a/py/dnadesign/fasta_parser.py b/py/dnadesign/fasta_parser.py new file mode 100644 index 00000000..3b7cf1f1 --- /dev/null +++ b/py/dnadesign/fasta_parser.py @@ -0,0 +1,26 @@ +from typing import List, Optional +from .cffi_bindings import ffi, lib + +class FastaRecord: + def 
__init__(self, identifier: str, sequence: str): + self.identifier = identifier + self.sequence = sequence + +def parse_fasta_from_c_file(file_path: str) -> List[FastaRecord]: + cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8')) + result = lib.ParseFastaFromCFile(cfile) + return _process_result(result) + +def parse_fasta_from_c_string(cstring: str) -> List[FastaRecord]: + result = lib.ParseFastaFromCString(cstring.encode('utf-8')) + return _process_result(result) + +def _process_result(result) -> List[FastaRecord]: + if result.error != ffi.NULL: + error_str = ffi.string(result.error).decode('utf-8') + raise Exception("Error parsing FASTA: " + error_str) + num_records = result.numRecords + records = ffi.cast("FastaRecord*", result.records) + return [FastaRecord(ffi.string(records[i].identifier).decode('utf-8'), + ffi.string(records[i].sequence).decode('utf-8')) + for i in range(num_records)] diff --git a/py/go.mod b/py/go.mod new file mode 100644 index 00000000..40ffe8ed --- /dev/null +++ b/py/go.mod @@ -0,0 +1,7 @@ +module github.com/koeng101/dnadesign/py + +go 1.22.5 + +require github.com/koeng101/dnadesign/lib v0.0.0-20240531162423-45295e318ef3 + +require golang.org/x/sync v0.5.0 // indirect diff --git a/py/go.sum b/py/go.sum new file mode 100644 index 00000000..59ee4141 --- /dev/null +++ b/py/go.sum @@ -0,0 +1,6 @@ +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/koeng101/dnadesign/lib v0.0.0-20240531162423-45295e318ef3 h1:sFmsnmeffIPhLUBSdhy+9pIaCBHCTddOApZpE3Wvd2I= +github.com/koeng101/dnadesign/lib v0.0.0-20240531162423-45295e318ef3/go.mod h1:sGDJMyNYf4fMqEwwMj2icJ5PpNNc7RjxvctJTM25pLY= +golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= +golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= diff --git a/py/lib.go b/py/lib.go new file mode 100644 index 
00000000..1081a2bb --- /dev/null +++ b/py/lib.go @@ -0,0 +1,95 @@ +package main + +/* +#include <stdio.h> +#include <stdlib.h> + +// FastaRecord +typedef struct { + char* identifier; + char* sequence; +} FastaRecord; +*/ +import "C" +import ( + "io" + "strings" + "unsafe" + + "github.com/koeng101/dnadesign/lib/bio" +) + +/****************************************************************************** +Aug 10, 2024 + +Interoperation with CFile + +******************************************************************************/ + +// Function to create an io.Reader from a C FILE*. +func readerFromCFile(cfile *C.FILE) io.Reader { + return &fileReader{file: cfile} +} + +type fileReader struct { + file *C.FILE +} + +func (f *fileReader) Read(p []byte) (n int, err error) { + buffer := (*C.char)(unsafe.Pointer(&p[0])) + count := C.size_t(len(p)) + result := C.fread(unsafe.Pointer(buffer), 1, count, f.file) + if result == 0 { + if C.feof(f.file) != 0 { + return 0, io.EOF + } + return 0, io.ErrUnexpectedEOF + } + return int(result), nil +} + +/****************************************************************************** +Aug 10, 2024 + +Fasta + +******************************************************************************/ + +// goFastaToCFasta converts an io.Reader to a C.FastaResult +func goFastaToCFasta(reader io.Reader) (*C.FastaRecord, int, *C.char) { + parser := bio.NewFastaParser(reader) + records, err := parser.Parse() + if err != nil { + return nil, 0, C.CString(err.Error()) + } + + cRecords := (*C.FastaRecord)(C.malloc(C.size_t(len(records)) * C.size_t(unsafe.Sizeof(C.FastaRecord{})))) + slice := (*[1<<30 - 1]C.FastaRecord)(unsafe.Pointer(cRecords))[:len(records):len(records)] + + for i, read := range records { + slice[i].identifier = C.CString(read.Identifier) + slice[i].sequence = C.CString(read.Sequence) + } + + return cRecords, len(records), nil +} + +//export ParseFastaFromCFile +func ParseFastaFromCFile(cfile *C.FILE) (*C.FastaRecord, int, *C.char) { + reader := 
readerFromCFile(cfile) + return goFastaToCFasta(reader) +} + +//export ParseFastaFromCString +func ParseFastaFromCString(cstring *C.char) (*C.FastaRecord, int, *C.char) { + reader := strings.NewReader(C.GoString(cstring)) + return goFastaToCFasta(reader) +} + +/****************************************************************************** + +main.go + +******************************************************************************/ + +func main() {} diff --git a/py/setup.py b/py/setup.py new file mode 100644 index 00000000..9c837dd0 --- /dev/null +++ b/py/setup.py @@ -0,0 +1,31 @@ +import os +import platform +from setuptools import setup, find_packages + +def get_shared_lib_ext(): + if platform.system() == "Darwin": + return ".dylib" + elif platform.system() == "Windows": + return ".dll" + else: + return ".so" + +setup( + name='dnadesign', + version='0.1.1', + packages=find_packages(), + package_data={'dnadesign': ['definitions.h', 'libdnadesign.h', "libdnadesign" + get_shared_lib_ext()]}, + install_requires=[ + "cffi>=1.0.0", + ], + setup_requires=[ + "cffi>=1.0.0", + ], + + include_package_data=True, + zip_safe=False, + author='Keoni Gandall', + author_email='koeng101@gmail.com', + description='Python bindings for dnadesign', + url='https://github.com/koeng101/dnadesign' +) diff --git a/py/tests/__init__.py b/py/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/py/tests/data/example.fasta b/py/tests/data/example.fasta new file mode 100644 index 00000000..94b1ea48 --- /dev/null +++ b/py/tests/data/example.fasta @@ -0,0 +1,11 @@ +>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] +LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV +EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG +LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL +GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX +IENY + +>MCHU - Calmodulin - Human, rabbit, bovine, rat, and 
chicken +ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID +FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA +DIDGDGQVNYEEFVQMMTAK* diff --git a/py/tests/test_fasta_parser.py b/py/tests/test_fasta_parser.py new file mode 100644 index 00000000..12ec3c94 --- /dev/null +++ b/py/tests/test_fasta_parser.py @@ -0,0 +1,17 @@ +import pytest +import os +from dnadesign.fasta_parser import parse_fasta_from_c_file, parse_fasta_from_c_string, FastaRecord + +def test_parse_fasta_from_c_file(): + current_dir = os.path.dirname(__file__) + example_path = os.path.join(current_dir, 'data/example.fasta') + records = parse_fasta_from_c_file(example_path) + assert len(records) > 0 + assert all(isinstance(r, FastaRecord) for r in records) + +def test_parse_fasta_from_c_string(): + fasta_data = ">test\nATCG\n" + records = parse_fasta_from_c_string(fasta_data) + assert len(records) == 1 + assert records[0].identifier == "test" + assert records[0].sequence == "ATCG"