/dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,90 @@
+# FetchContent_MakeAvailable() needs CMake 3.14 and Corrosion needs 3.15.
+cmake_minimum_required(VERSION 3.15)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Set extension name here
+set(TARGET_NAME exon)
+set(EXTENSION_NAME ${TARGET_NAME}_extension)
+set(CMAKE_CXX_STANDARD 11)
+
+project(${TARGET_NAME})
+
+# Third-party sources are downloaded at configure time; each one is pinned to
+# a release tag or a release archive.
+include(FetchContent)
+
+FetchContent_Declare(
+  arrow
+  GIT_REPOSITORY https://github.com/apache/arrow.git
+  GIT_TAG apache-arrow-11.0.0
+)
+
+FetchContent_Declare(
+  httplib
+  GIT_REPOSITORY https://github.com/yhirose/cpp-httplib.git
+  GIT_TAG v0.12.0
+)
+
+FetchContent_Declare(
+  json
+  URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz
+)
+
+FetchContent_Declare(
+  Corrosion
+  GIT_REPOSITORY https://github.com/corrosion-rs/corrosion.git
+  GIT_TAG v0.3.5
+)
+
+FetchContent_Declare(
+  spdlog
+  GIT_REPOSITORY https://github.com/gabime/spdlog.git
+  GIT_TAG v1.11.0
+)
+
+list(APPEND available_contents httplib json Corrosion spdlog arrow)
+
+FetchContent_MakeAvailable(${available_contents})
+
+# Corrosion runs cargo on the crate under rust/ with its release profile.
+corrosion_import_crate(MANIFEST_PATH rust/Cargo.toml
+  PROFILE release
+)
+
+include_directories(exon/include)
+add_subdirectory(exon/src)
+
+add_library(${EXTENSION_NAME} STATIC ${EXTENSION_SOURCES})
+
+# Build extensions
+set(PARAMETERS "-warnings")
+build_loadable_extension(${TARGET_NAME} ${PARAMETERS} ${EXTENSION_SOURCES})
+
+find_package(OpenSSL REQUIRED)
+message(STATUS "wtt: Found openssl ${OPENSSL_VERSION}")
+
+# NOTE(review): the Rust archive is still referenced by a hard-coded path that
+# looks specific to a Release build with a Windows multi-config generator;
+# confirm how other platforms and configurations are expected to locate it.
+target_link_libraries(${EXTENSION_NAME}
+  PUBLIC
+    "${CMAKE_CURRENT_BINARY_DIR}/Release/rust.lib"
+    OpenSSL::SSL
+    OpenSSL::Crypto)
+
+# These system libraries exist only on Windows, so they are linked only there.
+if(WIN32)
+  target_link_libraries(${EXTENSION_NAME}
+    PUBLIC
+      ntdll
+      Secur32
+      bcrypt
+      ncrypt
+      Userenv)
+endif()
+
+install(
+  TARGETS ${EXTENSION_NAME}
+  EXPORT "${DUCKDB_EXPORT_SET}"
+  LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
+  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
diff --git a/Makefile b/Makefile
index 6f1aa7d..8742ef2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,37 @@
 pull:
 	git submodule init
 	git submodule update --recursive --remote
+
+.PHONY: pull release
+
+MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+PROJ_DIR := $(dir $(MKFILE_PATH))
+
+OSX_BUILD_UNIVERSAL_FLAG=
+ifeq (${OSX_BUILD_UNIVERSAL}, 1)
+  OSX_BUILD_UNIVERSAL_FLAG=-DOSX_BUILD_UNIVERSAL=1
+endif
+ifeq (${STATIC_LIBCPP}, 1)
+  STATIC_LIBCPP=-DSTATIC_LIBCPP=TRUE
+endif
+
+ifeq ($(GEN),ninja)
+  GENERATOR=-G "Ninja"
+  FORCE_COLOR=-DFORCE_COLORED_OUTPUT=1
+endif
+
+BUILD_FLAGS=-DEXTENSION_STATIC_BUILD=1 ${OSX_BUILD_UNIVERSAL_FLAG} ${STATIC_LIBCPP}
+ifeq (${BUILD_SHELL}, 0)
+  BUILD_FLAGS += -DBUILD_SHELL=0
+endif
+
+CLIENT_FLAGS :=
+
+# These flags will make DuckDB build the extension
+EXTENSION_FLAGS=-DENABLE_SANITIZER=OFF -DDUCKDB_OOT_EXTENSION_NAMES="exon" -DDUCKDB_OOT_EXTENSION_EXON_PATH="$(PROJ_DIR)" -DDUCKDB_OOT_EXTENSION_EXON_SHOULD_LINK="TRUE" -DDUCKDB_OOT_EXTENSION_EXON_INCLUDE_PATH="$(PROJ_DIR)exon/include"
+
+# Configure the DuckDB checkout in ./duckdb with the flags above, then build it.
+release:
+	mkdir -p build/release && \
+	cmake $(GENERATOR) $(FORCE_COLOR) $(EXTENSION_FLAGS) ${CLIENT_FLAGS} -DCMAKE_BUILD_TYPE=Release ${BUILD_FLAGS} -S ./duckdb/ -B build/release && \
+	cmake --build build/release --config Release
diff --git a/duckdb b/duckdb
new file mode 160000
index 0000000..9d5158c
--- /dev/null
+++ b/duckdb
@@ -0,0 +1 @@
+Subproject commit 9d5158ccd2741528b4d7e4fe330d500823968e70
diff --git a/exon/include/exon/sam_functions/module.hpp b/exon/include/exon/sam_functions/module.hpp
new file mode 100644
index 0000000..f526b30
--- /dev/null
+++ b/exon/include/exon/sam_functions/module.hpp
@@ -0,0 +1,34 @@
+// Copyright 2023 WHERE TRUE Technologies.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// NOTE(review): the targets of the five includes below, and the template
+// arguments of the return types in this header, are missing from this copy
+// of the patch; restore them from the repository before building.
+#include
+#include
+#include
+#include
+#include
+
+namespace exon
+{
+
+    // Factory for the DuckDB scalar functions that operate on SAM data.
+    class SamFunctions
+    {
+    public:
+        // Returns the "parse_cigar" function set: VARCHAR in, LIST of
+        // STRUCT(op VARCHAR, len INTEGER) out.
+        static duckdb::unique_ptr GetParseCIGARStringFunction();
+        // Returns the "extract_from_cigar" function set: (VARCHAR, VARCHAR) in,
+        // STRUCT(sequence_start INTEGER, sequence_end INTEGER, sequence VARCHAR) out.
+        static duckdb::unique_ptr GetExtractFromCIGARFunction();
+        // Returns one INTEGER -> BOOLEAN function set per SAM flag predicate
+        // declared in rust.hpp (is_segmented ... is_supplementary).
+        static std::vector> GetSamFunctions();
+    };
+
+} // namespace exon
\ No newline at end of file
diff --git a/exon/include/exon_extension.hpp b/exon/include/exon_extension.hpp
new file mode 100644
index 0000000..8fcde92
--- /dev/null
+++ b/exon/include/exon_extension.hpp
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "duckdb.hpp"
+
+namespace duckdb {
+
+// DuckDB extension entry class for exon; overrides Load and Name.
+class ExonExtension : public Extension {
+public:
+    // Called with the database the extension is being loaded into.
+    void Load(DuckDB &db) override;
+    // Returns the extension's name.
+    std::string Name() override;
+};
+
+} // namespace duckdb
\ No newline at end of file
diff --git a/exon/include/rust.hpp b/exon/include/rust.hpp
new file mode 100644
index 0000000..ef9cd33
--- /dev/null
+++ b/exon/include/rust.hpp
@@ -0,0 +1,65 @@
+// C declarations for the Rust crate. rust/build.rs writes a file of this
+// name into exon/include/ with cbindgen, so hand edits are presumably
+// overwritten on the next build - confirm before relying on them.
+// NOTE(review): this header has no include guard, and its five include
+// targets are missing from this copy of the patch.
+#include
+#include
+#include
+#include
+#include
+
+// Outcome of new_reader(): `error` is null on success, otherwise it points
+// to a NUL-terminated message allocated by the Rust side.
+struct ReaderResult {
+  const char *error;
+};
+
+// Outcome of replacement_scan(): `file_type` is null when the Rust side did
+// not recognise a file type, otherwise a string allocated by the Rust side.
+struct ReplacementScanResult {
+  const char *file_type;
+};
+
+// Outcome of parse_cigar(): the Rust side sets exactly one of the two
+// pointers and leaves the other null.
+struct CResult {
+  const char *value;
+  const char *error;
+};
+
+// Outcome of extract_from_cigar(). On failure `error` is set, the offsets
+// are 0 and `extracted_sequence` is null. Despite its name, `sequence_len`
+// carries the end offset the Rust side used when slicing the sequence.
+struct CExtractResponse {
+  uintptr_t sequence_start;
+  uintptr_t sequence_len;
+  const char *extracted_sequence;
+  const char *error;
+};
+
+extern "C" {
+
+// Opens `uri` as `file_format` and exports the record batches through
+// `stream_ptr`. A null `compression` makes the Rust side pick gzip/zstd
+// from a trailing ".gz"/".zst" in `uri`.
+ReaderResult new_reader(ArrowArrayStream *stream_ptr,
+                        const char *uri,
+                        uintptr_t batch_size,
+                        const char *compression,
+                        const char *file_format);
+
+// Derives a file type from the extension(s) of `uri`.
+ReplacementScanResult replacement_scan(const char *uri);
+
+// SAM flag predicates: each one reports whether the bit named by the
+// function is set in `flag`.
+bool is_segmented(uint16_t flag);
+
+bool is_unmapped(uint16_t flag);
+
+bool is_properly_aligned(uint16_t flag);
+
+bool is_mate_unmapped(uint16_t flag);
+
+bool is_reverse_complemented(uint16_t flag);
+
+bool is_mate_reverse_complemented(uint16_t flag);
+
+bool is_first_segment(uint16_t flag);
+
+bool is_last_segment(uint16_t flag);
+
+bool is_secondary(uint16_t flag);
+
+bool is_quality_control_failed(uint16_t flag);
+
+bool is_duplicate(uint16_t flag);
+
+bool is_supplementary(uint16_t flag);
+
+// Parses a CIGAR string; on success `value` holds "kind=len" entries
+// joined with ';'.
+CResult parse_cigar(const char *cigar);
+
+// Returns the part of `sequence_str` that remains after removing a leading
+// and/or trailing insertion operation described by `cigar_str`.
+CExtractResponse extract_from_cigar(const char *sequence_str, const char *cigar_str);
+
+} // extern "C"
diff --git a/exon/src/CMakeLists.txt b/exon/src/CMakeLists.txt
new file mode 100644
index 0000000..eff4d34
--- /dev/null
+++ b/exon/src/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Collect the sources of the nested directories first, then add the
+# extension entry point and hand the full list to the parent scope.
+add_subdirectory(exon)
+
+set(EXTENSION_SOURCES
+    ${EXTENSION_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/exon_extension.cpp
+    PARENT_SCOPE)
\ No newline at end of file
diff --git a/exon/src/exon/CMakeLists.txt b/exon/src/exon/CMakeLists.txt
new file mode 100644
index 0000000..73ab348
--- /dev/null
+++ b/exon/src/exon/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_subdirectory(sam_functions)
+
+# Adds nothing itself; re-exports what sam_functions appended so the list
+# keeps travelling up to the parent scope.
+set(EXTENSION_SOURCES
+    ${EXTENSION_SOURCES}
+    PARENT_SCOPE
+)
\ No newline at end of file
diff --git a/exon/src/exon/sam_functions/CMakeLists.txt b/exon/src/exon/sam_functions/CMakeLists.txt
new file mode 100644
index 0000000..a2ccba7
--- /dev/null
+++ b/exon/src/exon/sam_functions/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Append this directory's translation unit to the list owned by the parent.
+set(EXTENSION_SOURCES
+    ${EXTENSION_SOURCES}
+    ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp
+    PARENT_SCOPE
+)
diff --git a/exon/src/exon/sam_functions/module.cpp b/exon/src/exon/sam_functions/module.cpp
new file mode 100644
index 0000000..11135ba
--- /dev/null
+++ b/exon/src/exon/sam_functions/module.cpp
@@ -0,0 +1,184 @@
+// Copyright 2023 WHERE TRUE Technologies.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "exon/sam_functions/module.hpp"
+
+#include "rust.hpp"
+
+// NOTE(review): the targets of the includes below, and every template
+// argument list in this file (unique_ptr, vector, child_list_t,
+// std::function, make_uniq, ...), are missing from this copy of the patch;
+// restore them from the repository before building.
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace exon
+{
+
+    // Scalar implementation of parse_cigar: for every row, parses the CIGAR
+    // text in column 0 through the Rust parse_cigar() and writes a LIST of
+    // STRUCT(op, len) values into `result`.
+    // Throws std::runtime_error when the Rust side reports an error or when
+    // an entry of the serialized result does not split into two parts.
+    void ParseCIGARString(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result)
+    {
+        result.SetVectorType(duckdb::VectorType::FLAT_VECTOR);
+
+        for (duckdb::idx_t i = 0; i < args.size(); i++)
+        {
+            // NOTE(review): the value is stringified without a NULL check -
+            // confirm how NULL rows should behave.
+            auto string_value = args.data[0].GetValue(i);
+            auto ss = string_value.ToString();
+
+            // NOTE(review): cigar.value / cigar.error are produced with
+            // CString::into_raw on the Rust side and are not handed back
+            // anywhere in this function, so they look leaked per row.
+            CResult cigar = parse_cigar(ss.c_str());
+            if (cigar.error)
+            {
+                // The Rust error text is not included in the message.
+                throw std::runtime_error("Invalid CIGAR string: " + ss);
+            }
+
+            // The Rust side serializes the operations as "kind=len" entries
+            // joined with ';'.
+            auto ops = duckdb::StringUtil::Split(cigar.value, ';');
+
+            duckdb::vector op_values;
+
+            for (auto op : ops)
+            {
+                duckdb::child_list_t struct_values;
+                // NOTE(review): '=' is also the CIGAR "sequence match"
+                // operation; if the Rust side prints that kind as "=", such
+                // an entry would not split into two parts - confirm.
+                auto op_parts = duckdb::StringUtil::Split(op, '=');
+
+                if (op_parts.size() != 2)
+                {
+                    throw std::runtime_error("Invalid CIGAR string");
+                }
+
+                auto op_type = op_parts[0];
+                auto op_length = op_parts[1];
+
+                auto op_type_value = duckdb::Value(op_type);
+                auto op_length_value = duckdb::Value::INTEGER(std::atoi(op_length.c_str()));
+
+                struct_values.push_back(std::make_pair("op", op_type_value));
+                struct_values.push_back(std::make_pair("len", op_length_value));
+
+                op_values.push_back(duckdb::Value::STRUCT(struct_values));
+            }
+
+            result.SetValue(i, duckdb::Value::LIST(op_values));
+        }
+    }
+
+    // Scalar implementation of extract_from_cigar: for every row, passes the
+    // sequence (column 0) and CIGAR text (column 1) to the Rust
+    // extract_from_cigar() and writes a STRUCT(sequence_start, sequence_end,
+    // sequence) into `result`.
+    // Throws std::runtime_error when the Rust side reports an error.
+    void ExtractSequence(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result)
+    {
+        for (duckdb::idx_t i = 0; i < args.size(); i++)
+        {
+            auto sequence = args.data[0].GetValue(i).ToString();
+            auto cigar = args.data[1].GetValue(i).ToString();
+
+            // NOTE(review): the strings inside extract_result come from
+            // CString::into_raw on the Rust side and are not handed back
+            // anywhere in this function, so they look leaked per row.
+            auto extract_result = extract_from_cigar(sequence.c_str(), cigar.c_str());
+            if (extract_result.error)
+            {
+                // Every failure is reported as an invalid CIGAR string; the
+                // Rust error text is not included.
+                throw std::runtime_error("Invalid CIGAR string");
+            }
+
+            // NOTE(review): both offsets are uintptr_t and are passed to
+            // Value::INTEGER - presumably a narrowing conversion; confirm
+            // the expected range.
+            duckdb::child_list_t struct_values;
+            struct_values.push_back(std::make_pair("sequence_start", duckdb::Value::INTEGER(extract_result.sequence_start)));
+            // "sequence_end" is filled from the field named sequence_len.
+            struct_values.push_back(std::make_pair("sequence_end", duckdb::Value::INTEGER(extract_result.sequence_len)));
+            struct_values.push_back(std::make_pair("sequence", duckdb::Value(extract_result.extracted_sequence)));
+
+            auto struct_value = duckdb::Value::STRUCT(struct_values);
+
+            result.SetValue(i, struct_value);
+        }
+    }
+
+    // Builds the "extract_from_cigar" function set: (VARCHAR, VARCHAR) ->
+    // STRUCT(sequence_start INTEGER, sequence_end INTEGER, sequence VARCHAR),
+    // implemented by ExtractSequence.
+    duckdb::unique_ptr SamFunctions::GetExtractFromCIGARFunction()
+    {
+        duckdb::ScalarFunctionSet set("extract_from_cigar");
+
+        duckdb::child_list_t struct_children;
+        struct_children.push_back(std::make_pair("sequence_start", duckdb::LogicalType::INTEGER));
+        struct_children.push_back(std::make_pair("sequence_end", duckdb::LogicalType::INTEGER));
+        struct_children.push_back(std::make_pair("sequence", duckdb::LogicalType::VARCHAR));
+
+        auto record_type = duckdb::LogicalType::STRUCT(std::move(struct_children));
+
+        set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR, duckdb::LogicalType::VARCHAR}, record_type, ExtractSequence));
+
+        return duckdb::make_uniq(set);
+    }
+
+    // Builds the "parse_cigar" function set: VARCHAR ->
+    // LIST(STRUCT(op VARCHAR, len INTEGER)), implemented by ParseCIGARString.
+    duckdb::unique_ptr SamFunctions::GetParseCIGARStringFunction()
+    {
+        duckdb::ScalarFunctionSet set("parse_cigar");
+
+        duckdb::child_list_t struct_children;
+        struct_children.push_back(std::make_pair("op", duckdb::LogicalType::VARCHAR));
+        struct_children.push_back(std::make_pair("len", duckdb::LogicalType::INTEGER));
+
+        auto record_type = duckdb::LogicalType::STRUCT(std::move(struct_children));
+        auto row_type = duckdb::LogicalType::LIST(std::move(record_type));
+
+        set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, row_type, ParseCIGARString));
+
+        return duckdb::make_uniq(set);
+    }
+
+    // Builds one INTEGER -> BOOLEAN function set per SAM flag predicate
+    // exported by the Rust crate; each set is named after its predicate.
+    std::vector> SamFunctions::GetSamFunctions()
+    {
+
+        // Pairs a SQL function name with the Rust predicate behind it.
+        struct SamFunction
+        {
+            std::string name;
+            std::function func;
+        };
+
+        std::vector sam_functions = {
+            {"is_segmented", is_segmented},
+            {"is_unmapped", is_unmapped},
+            {"is_properly_aligned", is_properly_aligned},
+            {"is_mate_unmapped", is_mate_unmapped},
+            {"is_reverse_complemented", is_reverse_complemented},
+            {"is_mate_reverse_complemented", is_mate_reverse_complemented},
+            {"is_first_segment", is_first_segment},
+            {"is_last_segment", is_last_segment},
+            {"is_secondary", is_secondary},
+            {"is_quality_control_failed", is_quality_control_failed},
+            {"is_duplicate", is_duplicate},
+            {"is_supplementary", is_supplementary}};
+
+        std::vector> sam_scalar_functions;
+
+        for (auto &sam_function : sam_functions)
+        {
+            duckdb::ScalarFunctionSet set(sam_function.name);
+
+            // The lambda captures its own copy of the name/predicate pair.
+            auto duckdb_function = [sam_function](duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result)
+            {
+                result.SetVectorType(duckdb::VectorType::FLAT_VECTOR);
+                for (duckdb::idx_t i = 0; i < args.size(); i++)
+                {
+                    auto value = args.data[0].GetValue(i);
+                    // NOTE(review): the predicates in rust.hpp take a
+                    // uint16_t, so wider INTEGER values are presumably
+                    // truncated at the call below - confirm.
+                    auto int_value = duckdb::IntegerValue::Get(value);
+
+                    auto bool_value = sam_function.func(int_value);
+
+                    result.SetValue(i, duckdb::Value::BOOLEAN(bool_value));
+                }
+            };
+
+            set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::INTEGER}, duckdb::LogicalType::BOOLEAN, duckdb_function));
+
+            sam_scalar_functions.emplace_back(duckdb::make_uniq(set));
+        }
+
+        return sam_scalar_functions;
+    }
+
+}
\ No newline at end of file
diff --git a/exon/src/exon_extension.cpp b/exon/src/exon_extension.cpp
new file mode 100644
index 0000000..ff67c62
--- /dev/null
+++ b/exon/src/exon_extension.cpp
@@ -0,0 +1,50 @@
+#define DUCKDB_EXTENSION_MAIN
+
+#include "exon_extension.hpp"
+#include "exon/sam_functions/module.hpp"
+#include "duckdb.hpp"
+
+using namespace duckdb;
+
+namespace duckdb {
+
+// Opens a connection on `instance`, starts a transaction and fetches the
+// system catalog and the SAM flag functions.
+// NOTE(review): this copy of the patch is cut off inside the loop below and
+// continues in the middle of what looks like rust/Cargo.toml; the rest of
+// this file and the head of that manifest must be restored from the
+// repository.
+static void LoadInternal(DatabaseInstance &instance) {
+    Connection con(instance);
+    con.BeginTransaction();
+
+    auto &context = *con.context;
+    auto &catalog = Catalog::GetSystemCatalog(context);
+
+    auto get_sam_functions = exon::SamFunctions::GetSamFunctions();
+    for (auto ["derive"]}
+serde_json = "1.0"
+tokio = {version = "1.28.1", features = ["rt-multi-thread"]}
+url = "2.4.0"
+zstd = "0.12.3"
+
+[build-dependencies]
+cbindgen = "0.24.5"
diff --git a/rust/build.rs b/rust/build.rs
new file mode 100644
index 0000000..eab790e
--- /dev/null
+++ b/rust/build.rs
@@ -0,0 +1,31 @@
+// Copyright 2023 WHERE TRUE Technologies.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+extern crate cbindgen;
+
+use std::env;
+
+/// Build script: runs cbindgen over this crate and writes the generated
+/// declarations to `./../exon/include/rust.hpp`.
+///
+/// Panics when `CARGO_MANIFEST_DIR` is unset or when cbindgen fails to
+/// generate the bindings.
+fn main() {
+    let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+    // NOTE(review): this path is relative, so it resolves against the
+    // working directory of the build script, and it points outside the
+    // crate - confirm that is intended.
+    let out_dir = "./../exon/include/";
+
+    let dest_path = std::path::Path::new(&out_dir).join("rust.hpp");
+
+    // The value returned by `write_to_file` is discarded.
+    cbindgen::Builder::new()
+        .with_crate(crate_dir)
+        // .with_header("#include ")
+        .generate()
+        .expect("Unable to generate bindings")
+        .write_to_file(dest_path);
+}
diff --git a/rust/src/arrow_reader.rs b/rust/src/arrow_reader.rs
new file mode 100644
index 0000000..7bb5713
--- /dev/null
+++ b/rust/src/arrow_reader.rs
@@ -0,0 +1,172 @@
+// Copyright 2023 WHERE TRUE Technologies.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::{
+    ffi::{c_char, CStr, CString},
+    ptr::null,
+    str::FromStr,
+    sync::Arc,
+};
+
+use arrow::ffi_stream::FFI_ArrowArrayStream as ArrowArrayStream;
+use datafusion::{
+    datasource::file_format::file_type::FileCompressionType,
+    prelude::{SessionConfig, SessionContext},
+};
+use exon::{
+    context::ExonSessionExt, datasources::ExonFileType,
+    ffi::create_dataset_stream_from_table_provider,
+};
+use object_store::aws::AmazonS3Builder;
+use tokio::runtime::Runtime;
+use url::Url;
+
+/// Copies `text` into a heap-allocated, NUL-terminated C string and hands
+/// ownership of the allocation to the caller.
+///
+/// Interior NUL bytes are dropped first so that the conversion cannot fail.
+fn into_c_string(text: &str) -> *const c_char {
+    let bytes: Vec<u8> = text.bytes().filter(|byte| *byte != 0).collect();
+    CString::new(bytes).unwrap_or_default().into_raw()
+}
+
+/// Borrows the C string behind `ptr` as UTF-8 text.
+///
+/// Returns an error message (mentioning `name`) when `ptr` is null or the
+/// bytes are not valid UTF-8, instead of panicking.
+///
+/// # Safety
+/// A non-null `ptr` must point to a NUL-terminated string that stays alive
+/// and unmodified for as long as the returned reference is used.
+unsafe fn str_from_ptr<'a>(ptr: *const c_char, name: &str) -> Result<&'a str, String> {
+    if ptr.is_null() {
+        return Err(format!("{} must not be null", name));
+    }
+
+    CStr::from_ptr(ptr)
+        .to_str()
+        .map_err(|e| format!("{} is not valid UTF-8: {}", name, e))
+}
+
+/// Result of [`new_reader`]: `error` is null on success, otherwise it points
+/// to a NUL-terminated message owned by the caller.
+#[repr(C)]
+pub struct ReaderResult {
+    error: *const c_char,
+}
+
+impl ReaderResult {
+    /// Success value: no error message.
+    fn ok() -> Self {
+        Self { error: null() }
+    }
+
+    /// Failure value carrying a copy of `message`.
+    fn error(message: &str) -> Self {
+        Self {
+            error: into_c_string(message),
+        }
+    }
+}
+
+/// Opens `uri` as a table of type `file_format` and exports its record
+/// batches through `stream_ptr`.
+///
+/// * `batch_size` - number of rows per batch; must be greater than zero.
+/// * `compression` - compression name, or null to infer gzip/zstd from a
+///   trailing `.gz`/`.zst` in `uri`. Unknown names mean "uncompressed".
+/// * `file_format` - name understood by `ExonFileType::from_str`.
+///
+/// For `s3://` URIs an S3 object store configured from the environment is
+/// registered for the bucket named by the URI host.
+///
+/// Every failure is reported through `ReaderResult::error`. Invalid
+/// arguments no longer panic, because a panic must not unwind across the
+/// `extern "C"` boundary.
+///
+/// # Safety
+/// `stream_ptr` must point to writable storage for an Arrow C stream, and
+/// the string arguments must be null or valid NUL-terminated strings.
+#[no_mangle]
+pub unsafe extern "C" fn new_reader(
+    stream_ptr: *mut ArrowArrayStream,
+    uri: *const c_char,
+    batch_size: usize,
+    compression: *const c_char,
+    file_format: *const c_char,
+) -> ReaderResult {
+    if stream_ptr.is_null() {
+        return ReaderResult::error("stream_ptr must not be null");
+    }
+
+    // DataFusion rejects a batch size of zero, so refuse it up front.
+    if batch_size == 0 {
+        return ReaderResult::error("batch_size must be greater than zero");
+    }
+
+    let uri = match str_from_ptr(uri, "uri") {
+        Ok(uri) => uri,
+        Err(message) => return ReaderResult::error(&message),
+    };
+
+    // if compression is null, try to infer from file extension
+    let compression_type = if compression.is_null() {
+        match uri.rsplit('.').next() {
+            Some("gz") => FileCompressionType::GZIP,
+            Some("zst") => FileCompressionType::ZSTD,
+            _ => FileCompressionType::UNCOMPRESSED,
+        }
+    } else {
+        match str_from_ptr(compression, "compression") {
+            Ok(name) => {
+                FileCompressionType::from_str(name).unwrap_or(FileCompressionType::UNCOMPRESSED)
+            }
+            Err(message) => return ReaderResult::error(&message),
+        }
+    };
+
+    let file_type = match str_from_ptr(file_format, "file_format") {
+        Ok(file_type) => file_type,
+        Err(message) => return ReaderResult::error(&message),
+    };
+    let file_type = match ExonFileType::from_str(file_type) {
+        Ok(file_type) => file_type,
+        Err(_) => {
+            return ReaderResult::error(&format!("could not parse file_format {}", file_type));
+        }
+    };
+
+    let rt = match Runtime::new() {
+        Ok(rt) => Arc::new(rt),
+        Err(e) => return ReaderResult::error(&format!("could not create runtime: {}", e)),
+    };
+
+    let config = SessionConfig::new().with_batch_size(batch_size);
+    let ctx = SessionContext::with_config(config);
+
+    // handle s3
+    if uri.starts_with("s3://") {
+        let url_from_uri = match Url::parse(uri) {
+            Ok(url) => url,
+            Err(e) => return ReaderResult::error(&format!("could not parse uri: {}", e)),
+        };
+
+        let host_str = match url_from_uri.host_str() {
+            Some(host_str) => host_str,
+            None => return ReaderResult::error("could not parse host_str"),
+        };
+
+        let s3 = match AmazonS3Builder::from_env()
+            .with_bucket_name(host_str)
+            .build()
+        {
+            Ok(s3) => s3,
+            Err(e) => return ReaderResult::error(&format!("could not create s3 client: {}", e)),
+        };
+
+        // The store is registered under the bucket root, not the object URL.
+        let s3_url = match Url::parse(&format!("s3://{}", host_str)) {
+            Ok(url) => url,
+            Err(e) => return ReaderResult::error(&format!("could not parse uri: {}", e)),
+        };
+        ctx.runtime_env()
+            .register_object_store(&s3_url, Arc::new(s3));
+    }
+
+    rt.block_on(async {
+        let df = match ctx
+            .read_exon_table(uri, file_type, Some(compression_type))
+            .await
+        {
+            Ok(df) => df,
+            Err(e) => return ReaderResult::error(&format!("could not read table: {}", e)),
+        };
+
+        create_dataset_stream_from_table_provider(df, rt.clone(), stream_ptr).await;
+        ReaderResult::ok()
+    })
+}
+
+/// Result of [`replacement_scan`]: `file_type` is null when no supported
+/// file type was recognised, otherwise a NUL-terminated name owned by the
+/// caller.
+#[repr(C)]
+pub struct ReplacementScanResult {
+    file_type: *const c_char,
+}
+
+impl ReplacementScanResult {
+    /// Value meaning "no file type recognised".
+    fn none() -> Self {
+        Self { file_type: null() }
+    }
+}
+
+/// Derives the file type of `uri` from its extension.
+///
+/// When the last extension names a compression format, the extension before
+/// it is used instead (`reads.fastq.gz` is looked up as `fastq`). A null or
+/// non-UTF-8 `uri` yields "no file type" rather than a panic.
+///
+/// # Safety
+/// `uri` must be null or a valid NUL-terminated string.
+#[no_mangle]
+pub unsafe extern "C" fn replacement_scan(uri: *const c_char) -> ReplacementScanResult {
+    let uri = match str_from_ptr(uri, "uri") {
+        Ok(uri) => uri,
+        Err(_) => return ReplacementScanResult::none(),
+    };
+
+    let mut exts = uri.rsplit('.');
+    let mut candidate = exts.next().unwrap_or("");
+
+    let file_compression_type =
+        FileCompressionType::from_str(candidate).unwrap_or(FileCompressionType::UNCOMPRESSED);
+
+    if file_compression_type.is_compressed() {
+        candidate = exts.next().unwrap_or("");
+    }
+
+    match ExonFileType::from_str(candidate) {
+        Ok(file_type) => ReplacementScanResult {
+            file_type: into_c_string(&file_type.to_string()),
+        },
+        Err(_) => ReplacementScanResult::none(),
+    }
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
new file mode 100644
index 0000000..35e059a
--- /dev/null
+++ b/rust/src/lib.rs
@@ -0,0 +1,17 @@
+// Copyright 2023 WHERE TRUE Technologies.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub mod arrow_reader;
+
+pub mod sam_functions;
diff --git a/rust/src/sam_functions.rs b/rust/src/sam_functions.rs
new file mode 100644
index 0000000..fdde48c
--- /dev/null
+++ b/rust/src/sam_functions.rs
@@ -0,0 +1,200 @@
+// Copyright 2023 WHERE TRUE Technologies.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::ffi::{c_char, CStr, CString};
+
+use noodles::sam::record::{
+    cigar::{op::Kind, Cigar},
+    Flags,
+};
+
+/// Reports whether `flag` is set in the raw SAM flag word `bits`; bits that
+/// `Flags` does not define are ignored.
+fn has_flag(bits: u16, flag: Flags) -> bool {
+    Flags::from_bits_truncate(bits).contains(flag)
+}
+
+/// Returns true when `Flags::SEGMENTED` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_segmented(flag: u16) -> bool {
+    has_flag(flag, Flags::SEGMENTED)
+}
+
+/// Returns true when `Flags::UNMAPPED` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_unmapped(flag: u16) -> bool {
+    has_flag(flag, Flags::UNMAPPED)
+}
+
+/// Returns true when `Flags::PROPERLY_ALIGNED` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_properly_aligned(flag: u16) -> bool {
+    has_flag(flag, Flags::PROPERLY_ALIGNED)
+}
+
+/// Returns true when `Flags::MATE_UNMAPPED` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_mate_unmapped(flag: u16) -> bool {
+    has_flag(flag, Flags::MATE_UNMAPPED)
+}
+
+/// Returns true when `Flags::REVERSE_COMPLEMENTED` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_reverse_complemented(flag: u16) -> bool {
+    has_flag(flag, Flags::REVERSE_COMPLEMENTED)
+}
+
+/// Returns true when `Flags::MATE_REVERSE_COMPLEMENTED` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_mate_reverse_complemented(flag: u16) -> bool {
+    has_flag(flag, Flags::MATE_REVERSE_COMPLEMENTED)
+}
+
+/// Returns true when `Flags::FIRST_SEGMENT` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_first_segment(flag: u16) -> bool {
+    has_flag(flag, Flags::FIRST_SEGMENT)
+}
+
+/// Returns true when `Flags::LAST_SEGMENT` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_last_segment(flag: u16) -> bool {
+    has_flag(flag, Flags::LAST_SEGMENT)
+}
+
+/// Returns true when `Flags::SECONDARY` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_secondary(flag: u16) -> bool {
+    has_flag(flag, Flags::SECONDARY)
+}
+
+/// Returns true when `Flags::QC_FAIL` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_quality_control_failed(flag: u16) -> bool {
+    has_flag(flag, Flags::QC_FAIL)
+}
+
+/// Returns true when `Flags::DUPLICATE` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_duplicate(flag: u16) -> bool {
+    has_flag(flag, Flags::DUPLICATE)
+}
+
+/// Returns true when `Flags::SUPPLEMENTARY` is set in `flag`.
+#[no_mangle]
+pub extern "C" fn is_supplementary(flag: u16) -> bool {
+    has_flag(flag, Flags::SUPPLEMENTARY)
+}
+
+/// Copies `text` into a heap-allocated, NUL-terminated C string and hands
+/// ownership of the allocation to the caller (see [`free_c_string`]).
+///
+/// Interior NUL bytes are dropped first so that the conversion cannot fail.
+fn into_c_string(text: &str) -> *const c_char {
+    let bytes: Vec<u8> = text.bytes().filter(|byte| *byte != 0).collect();
+    CString::new(bytes).unwrap_or_default().into_raw()
+}
+
+/// Borrows the C string behind `ptr` as UTF-8 text.
+///
+/// Returns an error message (mentioning `name`) when `ptr` is null or the
+/// bytes are not valid UTF-8, instead of panicking.
+///
+/// # Safety
+/// A non-null `ptr` must point to a NUL-terminated string that stays alive
+/// and unmodified for as long as the returned reference is used.
+unsafe fn str_from_ptr<'a>(ptr: *const c_char, name: &str) -> Result<&'a str, String> {
+    if ptr.is_null() {
+        return Err(format!("{} must not be null", name));
+    }
+
+    CStr::from_ptr(ptr)
+        .to_str()
+        .map_err(|e| format!("{} is not valid UTF-8: {}", name, e))
+}
+
+/// Releases a string that this library returned inside one of its result
+/// structs. Passing null is allowed and does nothing.
+///
+/// # Safety
+/// `ptr` must be null or a pointer obtained from this library that has not
+/// been released before; it must not be used afterwards.
+#[no_mangle]
+pub unsafe extern "C" fn free_c_string(ptr: *const c_char) {
+    if !ptr.is_null() {
+        drop(CString::from_raw(ptr as *mut c_char));
+    }
+}
+
+/// Result of [`parse_cigar`]: exactly one of `value` and `error` is
+/// non-null. Both strings are owned by the caller.
+#[repr(C)]
+pub struct CResult {
+    value: *const c_char,
+    error: *const c_char,
+}
+
+impl CResult {
+    /// Success value carrying a copy of `value`.
+    fn new(value: &str) -> Self {
+        Self {
+            value: into_c_string(value),
+            error: std::ptr::null(),
+        }
+    }
+
+    /// Failure value carrying a copy of `error`.
+    fn error(error: &str) -> Self {
+        Self {
+            value: std::ptr::null(),
+            error: into_c_string(error),
+        }
+    }
+}
+
+/// Parses the CIGAR string `cigar` and serializes its operations as
+/// `kind=len` entries joined with `;`.
+///
+/// A null pointer, non-UTF-8 input, or an unparsable CIGAR string is
+/// reported through `CResult::error` rather than by panicking, because a
+/// panic must not unwind across the `extern "C"` boundary.
+#[no_mangle]
+pub extern "C" fn parse_cigar(cigar: *const c_char) -> CResult {
+    let cigar = match unsafe { str_from_ptr(cigar, "cigar") } {
+        Ok(cigar) => cigar,
+        Err(message) => return CResult::error(&message),
+    };
+
+    let cigar_obj: Cigar = match cigar.parse() {
+        Ok(cigar) => cigar,
+        Err(e) => return CResult::error(&e.to_string()),
+    };
+
+    // NOTE(review): the consumer splits each entry on '='; if the
+    // sequence-match kind is printed as "=", such an entry is ambiguous -
+    // confirm against the C++ side before changing the format.
+    let serialized_obj = cigar_obj
+        .iter()
+        .map(|op| format!("{}={}", op.kind(), op.len()))
+        .collect::<Vec<_>>()
+        .join(";");
+
+    CResult::new(serialized_obj.as_str())
+}
+
+/// Result of [`extract_from_cigar`].
+///
+/// On success `error` is null, `sequence_start` is the byte offset where
+/// the extracted part begins and `sequence_len` is the byte offset where it
+/// ends (the field holds an end offset, not a length). On failure `error`
+/// is set, both offsets are 0 and `extracted_sequence` is null.
+#[repr(C)]
+pub struct CExtractResponse {
+    sequence_start: usize,
+    sequence_len: usize,
+    extracted_sequence: *const c_char,
+    error: *const c_char,
+}
+
+impl CExtractResponse {
+    /// Success value carrying the two offsets and a copy of the extracted
+    /// text.
+    fn new(sequence_start: usize, sequence_len: usize, extracted_sequence: &str) -> Self {
+        Self {
+            sequence_start,
+            sequence_len,
+            extracted_sequence: into_c_string(extracted_sequence),
+            error: std::ptr::null(),
+        }
+    }
+
+    /// Failure value carrying a copy of `error`.
+    fn error(error: &str) -> Self {
+        Self {
+            sequence_start: 0,
+            sequence_len: 0,
+            extracted_sequence: std::ptr::null(),
+            error: into_c_string(error),
+        }
+    }
+}
+
+/// Returns the part of `sequence_str` that is left after removing a leading
+/// insertion (when the first CIGAR operation is an insertion) and a
+/// trailing insertion (when the last one is).
+///
+/// Errors are reported through `CExtractResponse::error` for: null or
+/// non-UTF-8 arguments, an unparsable or empty CIGAR string, and operations
+/// whose lengths do not fit inside the sequence. None of these cases panic.
+#[no_mangle]
+pub extern "C" fn extract_from_cigar(
+    sequence_str: *const c_char,
+    cigar_str: *const c_char,
+) -> CExtractResponse {
+    let cigar = match unsafe { str_from_ptr(cigar_str, "cigar_str") } {
+        Ok(cigar) => cigar,
+        Err(message) => return CExtractResponse::error(&message),
+    };
+
+    let cigar_obj: Cigar = match cigar.parse() {
+        Ok(cigar) => cigar,
+        Err(e) => return CExtractResponse::error(&e.to_string()),
+    };
+
+    // Indexing an empty operation list would panic, so reject it first.
+    let total_ops = cigar_obj.len();
+    if total_ops == 0 {
+        return CExtractResponse::error("CIGAR string has no operations");
+    }
+    let first_op = cigar_obj[0];
+    let last_op = cigar_obj[total_ops - 1];
+
+    let sequence = match unsafe { str_from_ptr(sequence_str, "sequence_str") } {
+        Ok(sequence) => sequence,
+        Err(message) => return CExtractResponse::error(&message),
+    };
+
+    let sequence_start = match first_op.kind() {
+        Kind::Insertion => first_op.len(),
+        _ => 0,
+    };
+
+    let sequence_end = match last_op.kind() {
+        // A trailing insertion longer than the sequence would underflow.
+        Kind::Insertion => match sequence.len().checked_sub(last_op.len()) {
+            Some(end) => end,
+            None => {
+                return CExtractResponse::error(
+                    "CIGAR operations do not fit within the sequence",
+                )
+            }
+        },
+        _ => sequence.len(),
+    };
+
+    // `get` returns None for an out-of-range or inverted range (for example
+    // a single insertion that counts as both first and last operation).
+    match sequence.get(sequence_start..sequence_end) {
+        Some(extracted) => CExtractResponse::new(sequence_start, sequence_end, extracted),
+        None => CExtractResponse::error("CIGAR operations do not fit within the sequence"),
+    }
+}