From 6e4611fefbca524ed42945810711d245d5be235c Mon Sep 17 00:00:00 2001
From: Koeng101 <Koeng101@gmail.com>
Date: Sun, 11 Aug 2024 12:09:08 -0700
Subject: [PATCH] py (#81)

* add shared library for fasta parsing that can be used with python
* add tests and classes for use in python
* add automatic testing + building + releasing of resulting python package
---
 .github/workflows/build.yml   | 132 ++++++++++++++++++++++++++++++++++
 README.md                     |   7 +-
 go.work                       |   3 +-
 py/.gitignore                 | 117 ++++++++++++++++++++++++++++++
 py/README.md                  |   7 ++
 py/dnadesign/__init__.py      |   0
 py/dnadesign/cffi_bindings.py |  33 +++++++++
 py/dnadesign/definitions.h    |  17 +++++
 py/dnadesign/fasta_parser.py  |  26 +++++++
 py/go.mod                     |   7 ++
 py/go.sum                     |   6 ++
 py/lib.go                     |  95 ++++++++++++++++++++++++
 py/setup.py                   |  31 ++++++++
 py/tests/__init__.py          |   0
 py/tests/data/example.fasta   |  11 +++
 py/tests/test_fasta_parser.py |  17 +++++
 16 files changed, 506 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/build.yml
 create mode 100644 py/.gitignore
 create mode 100644 py/README.md
 create mode 100644 py/dnadesign/__init__.py
 create mode 100644 py/dnadesign/cffi_bindings.py
 create mode 100644 py/dnadesign/definitions.h
 create mode 100644 py/dnadesign/fasta_parser.py
 create mode 100644 py/go.mod
 create mode 100644 py/go.sum
 create mode 100644 py/lib.go
 create mode 100644 py/setup.py
 create mode 100644 py/tests/__init__.py
 create mode 100644 py/tests/data/example.fasta
 create mode 100644 py/tests/test_fasta_parser.py

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 00000000..60f8d1fc
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,132 @@
+name: Build and Package
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  build:
+    strategy:
+      matrix:
+        include:
+          - os: ubuntu-latest
+            arch: amd64
+          - os: ubuntu-latest
+            arch: arm64
+          - os: macos-latest  # This will be ARM64
+            arch: arm64
+          - os: macos-13  # This will be AMD64
+            arch: amd64
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Set up Go
+      uses: actions/setup-go@v2
+      with:
+        go-version: '1.22'
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.10'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel cffi
+
+    - name: Install Zig
+      uses: goto-bus-stop/setup-zig@v2
+      with:
+        version: 0.11.0
+
+    - name: Set up Xcode
+      if: runner.os == 'macOS'
+      uses: maxim-lobanov/setup-xcode@v1
+      with:
+        xcode-version: latest-stable
+
+    - name: Build Go shared library
+      working-directory: ./py
+      run: |
+        if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "arm64" ]; then
+          CC="zig cc -target aarch64-linux-gnu" GOOS=linux GOARCH=arm64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
+        elif [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "amd64" ]; then
+          CC="zig cc -target x86_64-linux-gnu" GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
+        elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "arm64" ]; then
+          CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 go build -o dnadesign/libdnadesign.dylib -buildmode=c-shared lib.go
+        elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "amd64" ]; then
+          CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 go build -o dnadesign/libdnadesign.dylib -buildmode=c-shared lib.go
+        fi
+      env:
+        CGO_ENABLED: 1
+
+    - name: List directory contents
+      working-directory: ./py/dnadesign
+      run: ls -l
+
+    - name: Build Python package
+      working-directory: ./py
+      run: python setup.py sdist bdist_wheel
+
+    - name: Test wheel in fresh environment  # installs the built wheel into a new venv, smoke-tests the import, then runs pytest
+      run: |
+        python -m venv test_env
+        source test_env/bin/activate
+        pip install ./py/dist/*.whl
+        python -c "from dnadesign import fasta_parser; print('Library loaded successfully')"
+        pip install pytest
+        pytest ./py/tests -v --capture=no
+      continue-on-error: true  # NOTE(review): a failing test does not fail the job here, so the failure()-gated debug steps below presumably never run and publish is not blocked - confirm this is intended
+
+    - name: Debug segmentation fault (macOS)
+      if: failure() && runner.os == 'macOS'
+      run: |
+        lldb -o "run" -o "bt all" -o "quit" -- python -m pytest ./py/tests -v
+
+    - name: Debug segmentation fault (Linux)
+      if: failure() && runner.os == 'Linux'
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y gdb
+        gdb -ex "run" -ex "bt full" -ex "quit" --args python -m pytest ./py/tests -v
+
+    - name: Upload artifacts
+      uses: actions/upload-artifact@v2
+      with:
+        name: dist-${{ runner.os }}-${{ matrix.arch }}
+        path: py/dist/
+
+  publish:
+    needs: build
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.10'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install twine
+
+    - name: Download artifacts
+      uses: actions/download-artifact@v2
+      with:
+        path: dist
+
+    - name: Publish to PyPI  # NOTE(review): the download step above names no artifact; verify the upload glob below matches the directory layout it produces under dist/
+      env:
+        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+      run: twine upload dist/**/*/dist/*
diff --git a/README.md b/README.md
index 7267791b..4384cc27 100644
--- a/README.md
+++ b/README.md
@@ -21,8 +21,7 @@ DnaDesign is a Go project creating tools for automated genetic design, spanning
 On the highest level:
 * `lib` contains core functionality as a go library.
 * `external` contains functions to work with external bioinformatics command-line interfaces.
-* `api` contains an OpenAPI exposing all the major functions of lib.
-* `deployment` contains full integration tests and yaml for deploying the DnaDesign API to a k3s cluster.
+* `py` contains code to use the dnadesign library in python using a C shared library.
 
 ### Detailed repo organization
 
@@ -43,6 +42,9 @@ On the highest level:
     * [external/minimap2](https://pkg.go.dev/github.com/koeng101/dnadesign/external/minimap2) contains a function for working with [minimap2](https://github.com/lh3/minimap2) with Go.
     * [external/samtools](https://pkg.go.dev/github.com/koeng101/dnadesign/external/samtools) contains a function for generating pileup files using [samtools](https://github.com/samtools/samtools) with Go.
 
+## Python
+
+We have a Python package, `dnadesign`, which allows Python users to use dnadesign. This is a work-in-progress: more documentation coming soon!
 
 ## Contributing
 
@@ -71,6 +73,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+- Added minimal python packaging [#81](https://github.com/Koeng101/dnadesign/pull/81)
 - Greatly simplified the Ligate function [#77](https://github.com/Koeng101/dnadesign/pull/77)
 - Updated barcoding functions to handle edge case of hanging-edge barcodes [#74](https://github.com/Koeng101/dnadesign/pull/74)
 - Updated megamash to use int instead of uint for minimal Kmer counts (so you can use -1) [#73](https://github.com/Koeng101/dnadesign/pull/73)
diff --git a/go.work b/go.work
index b7479224..1c851ee7 100644
--- a/go.work
+++ b/go.work
@@ -1,6 +1,7 @@
-go 1.22.0
+go 1.22.5
 
 use (
 	./external
 	./lib
+	./py
 )
diff --git a/py/.gitignore b/py/.gitignore
new file mode 100644
index 00000000..0489c125
--- /dev/null
+++ b/py/.gitignore
@@ -0,0 +1,117 @@
+dnadesign/lib
+dnadesign/libdnadesign.h
+venv
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypi.org project descriptions, pipenv is a dependable Python packaging tool.
+#   it generates the following which should not be included in your version control system.
+Pipfile.lock
+
+# poetry
+poetry.lock
+
+# MyPy
+.mypy_cache/
+dmypy.json
+dmypy.json.#
+
+# Pyre type checker
+.pyre/
+
+# PyCharm + all JetBrains IDEs
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, GoLand
+.idea/
+*.iml
+
+# VSCode
+.vscode/
+
+# Environment variables
+.env
+.envrc
+
diff --git a/py/README.md b/py/README.md
new file mode 100644
index 00000000..63b4ad96
--- /dev/null
+++ b/py/README.md
@@ -0,0 +1,7 @@
+# DnaDesign (Python)
+This directory contains code for allowing python users to use dnadesign through a shared C library.
+
+This is a work-in-progress. Right now, we have only ported the fasta parser. 
+
+### Other platforms
+If you have interest in other platforms, like openbsd or freebsd, please add an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them.
diff --git a/py/dnadesign/__init__.py b/py/dnadesign/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/py/dnadesign/cffi_bindings.py b/py/dnadesign/cffi_bindings.py
new file mode 100644
index 00000000..6415a2c0
--- /dev/null
+++ b/py/dnadesign/cffi_bindings.py
@@ -0,0 +1,33 @@
+from cffi import FFI
+import platform
+import os
+import sys
+
+ffi = FFI()
+
+# GoInt must be declared before definitions.h is parsed; cgo's generated header makes it a 64-bit `long long` on 64-bit builds
+is_64b = sys.maxsize > 2**32
+if is_64b:
+    ffi.cdef("typedef long long GoInt;\n")
+else:
+    ffi.cdef("typedef int GoInt;\n")
+
+current_dir = os.path.dirname(__file__)
+
+# Build the path to definitions.h relative to this module
+definitions_path = os.path.join(current_dir, 'definitions.h')
+
+# Pick the shared library file name for the operating system (.dylib on macOS, .so otherwise)
+if sys.platform.startswith('darwin'):
+    lib_name = 'libdnadesign.dylib'
+else:
+    lib_name = 'libdnadesign.so'
+
+lib_path = os.path.join(current_dir, lib_name)
+
+# Read the C declarations from definitions.h; the encoding is explicit so the locale default cannot change the result
+with open(definitions_path, 'r', encoding='utf-8') as f:
+    ffi.cdef(f.read())
+
+# Load the shared library that sits next to this module
+lib = ffi.dlopen(lib_path)
diff --git a/py/dnadesign/definitions.h b/py/dnadesign/definitions.h
new file mode 100644
index 00000000..f66ca240
--- /dev/null
+++ b/py/dnadesign/definitions.h
@@ -0,0 +1,17 @@
+typedef struct FILE FILE;  /* opaque stdio stream, only used through pointers */
+FILE* fopen(const char* path, const char* mode);
+int fclose(FILE* fp);
+/* One FASTA record; same two fields, in the same order, as FastaRecord in lib.go's cgo preamble. */
+typedef struct {
+    char* identifier;
+    char* sequence;
+} FastaRecord;
+/* Fields line up, in order, with the three values the exported Go functions return: records, count, error. */
+typedef struct {
+    FastaRecord* records;
+    GoInt numRecords;  /* GoInt is typedef'd by cffi_bindings.py before this file is parsed */
+    char* error;       /* compared against NULL by fasta_parser.py to detect a failed parse */
+} FastaResult;
+/* Parse entry points implemented and exported by lib.go. */
+FastaResult ParseFastaFromCFile(void* cfile);
+FastaResult ParseFastaFromCString(char* cstring);
diff --git a/py/dnadesign/fasta_parser.py b/py/dnadesign/fasta_parser.py
new file mode 100644
index 00000000..3b7cf1f1
--- /dev/null
+++ b/py/dnadesign/fasta_parser.py
@@ -0,0 +1,48 @@
+from typing import List, Optional
+from .cffi_bindings import ffi, lib
+
+class FastaRecord:
+    """A single FASTA entry: its identifier and its sequence."""
+
+    def __init__(self, identifier: str, sequence: str):
+        self.identifier = identifier
+        self.sequence = sequence
+
+def parse_fasta_from_c_file(file_path: str) -> List[FastaRecord]:
+    """Parse the FASTA file at file_path through a C FILE* handed to the Go parser.
+
+    Raises OSError if the file cannot be opened and Exception if the parser
+    reports an error.
+    """
+    cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8'))
+    # fopen returns NULL on failure; passing NULL on would make the Go side
+    # call fread on a null stream, so raise a Python exception here instead.
+    if cfile == ffi.NULL:
+        raise OSError(ffi.errno, "Could not open FASTA file", file_path)
+    try:
+        result = lib.ParseFastaFromCFile(cfile)
+        return _process_result(result)
+    finally:
+        # Close the stream on every path, including when parsing raises.
+        lib.fclose(cfile)
+
+def parse_fasta_from_c_string(cstring: str) -> List[FastaRecord]:
+    """Parse FASTA-formatted text held in a Python string.
+
+    Raises Exception if the parser reports an error.
+    """
+    result = lib.ParseFastaFromCString(cstring.encode('utf-8'))
+    return _process_result(result)
+
+def _process_result(result) -> List[FastaRecord]:
+    """Turn a FastaResult struct into FastaRecord objects, raising on error."""
+    if result.error != ffi.NULL:
+        error_str = ffi.string(result.error).decode('utf-8')
+        raise Exception("Error parsing FASTA: " + error_str)
+    # NOTE(review): the records array and its strings are allocated by lib.go
+    # (C.malloc / C.CString) and are not freed here - confirm who owns them.
+    num_records = result.numRecords
+    records = ffi.cast("FastaRecord*", result.records)
+    return [FastaRecord(ffi.string(records[i].identifier).decode('utf-8'),
+                        ffi.string(records[i].sequence).decode('utf-8'))
+            for i in range(num_records)]
diff --git a/py/go.mod b/py/go.mod
new file mode 100644
index 00000000..40ffe8ed
--- /dev/null
+++ b/py/go.mod
@@ -0,0 +1,7 @@
+module github.com/koeng101/dnadesign/py
+
+go 1.22.5
+
+require github.com/koeng101/dnadesign/lib v0.0.0-20240531162423-45295e318ef3
+
+require golang.org/x/sync v0.5.0 // indirect
diff --git a/py/go.sum b/py/go.sum
new file mode 100644
index 00000000..59ee4141
--- /dev/null
+++ b/py/go.sum
@@ -0,0 +1,6 @@
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/koeng101/dnadesign/lib v0.0.0-20240531162423-45295e318ef3 h1:sFmsnmeffIPhLUBSdhy+9pIaCBHCTddOApZpE3Wvd2I=
+github.com/koeng101/dnadesign/lib v0.0.0-20240531162423-45295e318ef3/go.mod h1:sGDJMyNYf4fMqEwwMj2icJ5PpNNc7RjxvctJTM25pLY=
+golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
+golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
diff --git a/py/lib.go b/py/lib.go
new file mode 100644
index 00000000..1081a2bb
--- /dev/null
+++ b/py/lib.go
@@ -0,0 +1,123 @@
+package main
+
+/*
+#include <stdio.h>
+#include <stdlib.h>
+
+// FastaRecord
+typedef struct {
+    char* identifier;
+    char* sequence;
+} FastaRecord;
+*/
+import "C"
+import (
+	"errors"
+	"io"
+	"strings"
+	"unsafe"
+
+	"github.com/koeng101/dnadesign/lib/bio"
+)
+
+/******************************************************************************
+Aug 10, 2024
+
+Interoperation with CFile
+
+******************************************************************************/
+
+// readerFromCFile wraps a C FILE* in an io.Reader.
+func readerFromCFile(cfile *C.FILE) io.Reader {
+	return &fileReader{file: cfile}
+}
+
+// fileReader adapts a C FILE* to io.Reader by calling fread.
+type fileReader struct {
+	file *C.FILE
+}
+
+// Read fills p with up to len(p) bytes read from the underlying FILE*.
+// It returns io.EOF once the stream reports end-of-file and
+// io.ErrUnexpectedEOF when fread returns nothing for any other reason.
+func (f *fileReader) Read(p []byte) (n int, err error) {
+	// A nil stream (for example a failed fopen) must not reach fread.
+	if f.file == nil {
+		return 0, errors.New("nil C FILE pointer")
+	}
+	// io.Reader allows zero-length reads; &p[0] would panic on an empty slice.
+	if len(p) == 0 {
+		return 0, nil
+	}
+	buffer := (*C.char)(unsafe.Pointer(&p[0]))
+	count := C.size_t(len(p))
+	result := C.fread(unsafe.Pointer(buffer), 1, count, f.file)
+	if result == 0 {
+		if C.feof(f.file) != 0 {
+			return 0, io.EOF
+		}
+		return 0, io.ErrUnexpectedEOF
+	}
+	return int(result), nil
+}
+
+/******************************************************************************
+Aug 10, 2024
+
+Fasta
+
+******************************************************************************/
+
+// goFastaToCFasta parses every FASTA record from reader and copies the records
+// into C-allocated memory. It returns the record array, the record count, and
+// an error string that is nil on success. Nothing in this file frees the
+// array or its strings.
+func goFastaToCFasta(reader io.Reader) (*C.FastaRecord, int, *C.char) {
+	parser := bio.NewFastaParser(reader)
+	records, err := parser.Parse()
+	if err != nil {
+		return nil, 0, C.CString(err.Error())
+	}
+	// Nothing to copy; skip the zero-byte malloc, whose result is implementation-defined.
+	if len(records) == 0 {
+		return nil, 0, nil
+	}
+
+	cRecords := (*C.FastaRecord)(C.malloc(C.size_t(len(records)) * C.size_t(unsafe.Sizeof(C.FastaRecord{}))))
+	if cRecords == nil {
+		return nil, 0, C.CString("could not allocate memory for FASTA records")
+	}
+	// unsafe.Slice views the C array as a Go slice without the fixed-size array cast.
+	slice := unsafe.Slice(cRecords, len(records))
+
+	for i, read := range records {
+		slice[i].identifier = C.CString(read.Identifier)
+		slice[i].sequence = C.CString(read.Sequence)
+	}
+
+	return cRecords, len(records), nil
+}
+
+// ParseFastaFromCFile parses FASTA data read from an already-open C FILE*.
+//
+//export ParseFastaFromCFile
+func ParseFastaFromCFile(cfile *C.FILE) (*C.FastaRecord, int, *C.char) {
+	reader := readerFromCFile(cfile)
+	return goFastaToCFasta(reader)
+}
+
+// ParseFastaFromCString parses FASTA data held in a NUL-terminated C string.
+//
+//export ParseFastaFromCString
+func ParseFastaFromCString(cstring *C.char) (*C.FastaRecord, int, *C.char) {
+	reader := strings.NewReader(C.GoString(cstring))
+	return goFastaToCFasta(reader)
+}
+
+/******************************************************************************
+
+main.go
+
+******************************************************************************/
+
+func main() {}
diff --git a/py/setup.py b/py/setup.py
new file mode 100644
index 00000000..9c837dd0
--- /dev/null
+++ b/py/setup.py
@@ -0,0 +1,31 @@
+import os
+import platform
+from setuptools import setup, find_packages
+
+def get_shared_lib_ext():
+    """Return the shared-library file suffix for the build platform."""
+    system_name = platform.system()
+    # macOS and Windows use their own suffixes; every other platform
+    # falls back to ".so".
+    suffix_by_system = {"Darwin": ".dylib", "Windows": ".dll"}
+    return suffix_by_system.get(system_name, ".so")
+
+setup(  # package metadata; bundles the cffi declarations and the prebuilt shared library as package data
+    name='dnadesign',
+    version='0.1.1',
+    packages=find_packages(),
+    package_data={'dnadesign': ['definitions.h', 'libdnadesign.h', "libdnadesign" + get_shared_lib_ext()]},
+    install_requires=[
+        "cffi>=1.0.0",
+    ],
+    setup_requires=[
+        "cffi>=1.0.0",
+    ],
+    # NOTE(review): recent setuptools documents setup_requires as deprecated - confirm it is still needed here.
+    include_package_data=True,
+    zip_safe=False,  # cffi_bindings.py locates definitions.h and the library via paths relative to __file__
+    author='Keoni Gandall',
+    author_email='koeng101@gmail.com',
+    description='Python bindings for dnadesign',
+    url='https://github.com/koeng101/dnadesign'
+)
diff --git a/py/tests/__init__.py b/py/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/py/tests/data/example.fasta b/py/tests/data/example.fasta
new file mode 100644
index 00000000..94b1ea48
--- /dev/null
+++ b/py/tests/data/example.fasta
@@ -0,0 +1,11 @@
+>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
+LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
+EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
+LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
+GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
+IENY
+
+>MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken
+ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID
+FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA
+DIDGDGQVNYEEFVQMMTAK*
diff --git a/py/tests/test_fasta_parser.py b/py/tests/test_fasta_parser.py
new file mode 100644
index 00000000..12ec3c94
--- /dev/null
+++ b/py/tests/test_fasta_parser.py
@@ -0,0 +1,17 @@
+import pytest
+import os
+from dnadesign.fasta_parser import parse_fasta_from_c_file, parse_fasta_from_c_string, FastaRecord
+
+def test_parse_fasta_from_c_file():
+    """Parsing the bundled example file yields a non-empty list of FastaRecord objects."""
+    fasta_path = os.path.join(os.path.dirname(__file__), 'data/example.fasta')
+    parsed = parse_fasta_from_c_file(fasta_path)
+    assert len(parsed) > 0
+    assert all(isinstance(entry, FastaRecord) for entry in parsed)
+
+def test_parse_fasta_from_c_string():
+    """A single-record FASTA string parses to that record's identifier and sequence."""
+    parsed = parse_fasta_from_c_string(">test\nATCG\n")
+    assert len(parsed) == 1
+    only = parsed[0]
+    assert (only.identifier, only.sequence) == ("test", "ATCG")