From 2b636620b7a175319e1b98019898534dd0f98d4b Mon Sep 17 00:00:00 2001 From: keelan Date: Fri, 10 Apr 2026 13:26:54 +0100 Subject: [PATCH 1/4] expo bioscript support --- expo-bioscript/.gitignore | 4 + expo-bioscript/README.md | 94 ++++++ expo-bioscript/android/.gitignore | 2 + expo-bioscript/android/.gitkeep | 1 + expo-bioscript/android/build.gradle | 36 +++ .../android/src/main/AndroidManifest.xml | 1 + .../modules/bioscript/ExpoBioscriptModule.kt | 48 +++ .../bioscript/ExpoBioscriptNativeBridge.kt | 26 ++ expo-bioscript/expo-module.config.json | 9 + expo-bioscript/ios/.gitkeep | 1 + expo-bioscript/ios/ExpoBioscript.podspec | 32 ++ expo-bioscript/ios/ExpoBioscriptModule.swift | 63 ++++ expo-bioscript/package.json | 32 ++ expo-bioscript/scripts/.gitkeep | 1 + expo-bioscript/scripts/build-rust-android.sh | 85 ++++++ expo-bioscript/scripts/build-rust-ios.sh | 53 ++++ expo-bioscript/src/.gitkeep | 1 + expo-bioscript/src/ExpoBioscript.types.ts | 25 ++ expo-bioscript/src/ExpoBioscriptModule.ts | 10 + expo-bioscript/src/index.ts | 13 + rust/Cargo.lock | 194 +++++++++++- rust/bioscript/Cargo.toml | 10 +- rust/bioscript/src/genotype.rs | 33 +- rust/bioscript/src/lib.rs | 289 ++++++++++++++++++ rust/bioscript/src/prepare.rs | 33 +- 25 files changed, 1082 insertions(+), 14 deletions(-) create mode 100644 expo-bioscript/.gitignore create mode 100644 expo-bioscript/README.md create mode 100644 expo-bioscript/android/.gitignore create mode 100644 expo-bioscript/android/.gitkeep create mode 100644 expo-bioscript/android/build.gradle create mode 100644 expo-bioscript/android/src/main/AndroidManifest.xml create mode 100644 expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptModule.kt create mode 100644 expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptNativeBridge.kt create mode 100644 expo-bioscript/expo-module.config.json create mode 100644 expo-bioscript/ios/.gitkeep create mode 100644 expo-bioscript/ios/ExpoBioscript.podspec create 
mode 100644 expo-bioscript/ios/ExpoBioscriptModule.swift create mode 100644 expo-bioscript/package.json create mode 100644 expo-bioscript/scripts/.gitkeep create mode 100644 expo-bioscript/scripts/build-rust-android.sh create mode 100644 expo-bioscript/scripts/build-rust-ios.sh create mode 100644 expo-bioscript/src/.gitkeep create mode 100644 expo-bioscript/src/ExpoBioscript.types.ts create mode 100644 expo-bioscript/src/ExpoBioscriptModule.ts create mode 100644 expo-bioscript/src/index.ts diff --git a/expo-bioscript/.gitignore b/expo-bioscript/.gitignore new file mode 100644 index 0000000..c7cd14f --- /dev/null +++ b/expo-bioscript/.gitignore @@ -0,0 +1,4 @@ +android/.gradle/ +android/build/ +android/src/main/jniLibs/ +ios/Artifacts/ diff --git a/expo-bioscript/README.md b/expo-bioscript/README.md new file mode 100644 index 0000000..5af7e46 --- /dev/null +++ b/expo-bioscript/README.md @@ -0,0 +1,94 @@ +# expo-bioscript + +Expo module wrapper for the BioScript runtime. + +## Current + +BioScript currently lives in this repo as a Rust-first runtime for secure genomic analysis with Pythonic syntax via Monty. + +Right now: +- `rust/bioscript` contains the runtime logic +- `monty/` is the interpreter/runtime dependency +- `expo-bioscript/` contains a working wrapper scaffold with Android and iOS packaging scripts +- BioScript is still evolving, so the mobile-facing API is not treated as stable + +The current goal is to expose a narrow Expo-native interface to BioScript without exposing generic Monty execution directly. 
+ +## Near-Term + +The expected implementation is: +- keep Monty internal +- add a thin Rust FFI layer for mobile-safe entrypoints +- expose a small Expo API such as: + - `runFile(...)` + - `runCode(...)` + - `prepareIndexes(...)` +- enforce resource limits, restricted I/O, and blocked OS/network access through the BioScript runtime + +This `expo-bioscript` folder is the home for the Expo wrapper layer: +- `ios/` +- `android/` +- `src/` +- `scripts/` + +Current wrapper status: +- `runFile(...)` is implemented end-to-end through the Rust FFI layer +- Android native packaging builds successfully +- iOS native packaging builds successfully +- Apple mobile targets currently disable HTS-backed CRAM/BAM indexing and lookup paths + +## First API + +The first Expo-facing API should be `runFile(...)`, not `runCode(...)`. + +Reason: +- it fits the current BioScript runtime better +- it keeps execution rooted in explicit files instead of arbitrary code strings +- it is the safer first bridge for mobile + +Proposed request shape: + +```ts +type RunFileRequest = { + scriptPath: string; + root?: string; + inputFile?: string; + outputFile?: string; + participantId?: string; + traceReportPath?: string; + timingReportPath?: string; + inputFormat?: 'auto' | 'text' | 'zip' | 'vcf' | 'cram'; + inputIndex?: string; + referenceFile?: string; + referenceIndex?: string; + autoIndex?: boolean; + cacheDir?: string; + maxDurationMs?: number; + maxMemoryBytes?: number; + maxAllocations?: number; + maxRecursionDepth?: number; +}; +``` + +Proposed response shape: + +```ts +type RunFileResult = { + ok: true; +}; +``` + +For the first implementation, errors should be surfaced as native/module exceptions rather than encoded into the success payload. 
+ +## Long-Term + +If the API stabilizes, this will likely evolve into a cleaner split: + +- `bioscript-core` + - runtime, domain logic, Monty integration +- `bioscript-ffi` + - stable native/mobile boundary +- `expo-bioscript` + - Expo module wrapper + +At that point, `expo-bioscript` may move into its own repo and depend on BioScript through a more stable versioned interface. diff --git a/expo-bioscript/android/.gitignore b/expo-bioscript/android/.gitignore new file mode 100644 index 0000000..9ff2a37 --- /dev/null +++ b/expo-bioscript/android/.gitignore @@ -0,0 +1,2 @@ +/build +/.cxx diff --git a/expo-bioscript/android/.gitkeep b/expo-bioscript/android/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/expo-bioscript/android/.gitkeep @@ -0,0 +1 @@ + diff --git a/expo-bioscript/android/build.gradle b/expo-bioscript/android/build.gradle new file mode 100644 index 0000000..3cac9f2 --- /dev/null +++ b/expo-bioscript/android/build.gradle @@ -0,0 +1,36 @@ +plugins { + id 'com.android.library' + id 'expo-module-gradle-plugin' +} + +group = 'expo.modules.bioscript' +version = '0.1.0' + +def reactNativeArchitectures() { + def value = project.getProperties().get('reactNativeArchitectures') + return value ? 
value.split(',') : ['armeabi-v7a', 'x86', 'x86_64', 'arm64-v8a'] +} + +tasks.register('buildRustAndroid', Exec) { + workingDir projectDir + commandLine 'sh', '../scripts/build-rust-android.sh', *reactNativeArchitectures() +} + +android { + namespace 'expo.modules.bioscript' + + defaultConfig { + versionCode 1 + versionName '0.1.0' + } + + sourceSets { + main { + jniLibs.srcDirs = ['src/main/jniLibs'] + } + } +} + +tasks.matching { it.name == 'preBuild' }.configureEach { + dependsOn tasks.named('buildRustAndroid') +} diff --git a/expo-bioscript/android/src/main/AndroidManifest.xml b/expo-bioscript/android/src/main/AndroidManifest.xml new file mode 100644 index 0000000..cc947c5 --- /dev/null +++ b/expo-bioscript/android/src/main/AndroidManifest.xml @@ -0,0 +1 @@ +<manifest /> diff --git a/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptModule.kt b/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptModule.kt new file mode 100644 index 0000000..e5c41cf --- /dev/null +++ b/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptModule.kt @@ -0,0 +1,48 @@ +package expo.modules.bioscript + +import expo.modules.kotlin.exception.CodedException +import expo.modules.kotlin.modules.Module +import expo.modules.kotlin.modules.ModuleDefinition +import org.json.JSONObject + +internal class BioscriptUnavailableException : + CodedException("bioscript native Android library is unavailable.") + +internal class BioscriptInvalidResponseException : + CodedException("bioscript returned an invalid JSON response.") + +internal class BioscriptRuntimeException(message: String) : + CodedException(message) + +class ExpoBioscriptModule : Module() { + override fun definition() = ModuleDefinition { + Name("ExpoBioscript") + + Function("isAvailable") { + ExpoBioscriptNativeBridge.isAvailable() + } + + AsyncFunction("runFile") { request: Map<String, Any?> -> + if (!ExpoBioscriptNativeBridge.isAvailable()) { + throw BioscriptUnavailableException() + } + + val
requestJson = JSONObject(request).toString() + val response = ExpoBioscriptNativeBridge.runFile(requestJson) + ?: throw BioscriptInvalidResponseException() + + val json = try { + JSONObject(response) + } catch (_: Exception) { + throw BioscriptInvalidResponseException() + } + + if (!json.optBoolean("ok")) { + throw BioscriptRuntimeException(json.optString("error", "bioscript runFile failed")) + } + + val value = json.optJSONObject("value") ?: throw BioscriptInvalidResponseException() + mapOf("ok" to value.optBoolean("ok")) + } + } +} diff --git a/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptNativeBridge.kt b/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptNativeBridge.kt new file mode 100644 index 0000000..7ffb09d --- /dev/null +++ b/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptNativeBridge.kt @@ -0,0 +1,26 @@ +package expo.modules.bioscript + +internal class ExpoBioscriptNativeBridge private constructor() { + companion object { + private val nativeLibraryLoaded: Boolean by lazy { + try { + System.loadLibrary("bioscript") + true + } catch (_: UnsatisfiedLinkError) { + false + } + } + + @JvmStatic + fun isAvailable(): Boolean = nativeLibraryLoaded + + @JvmStatic + fun runFile(requestJson: String): String? { + check(nativeLibraryLoaded) { "bioscript native library is unavailable on Android." } + return runFileNative(requestJson) + } + + @JvmStatic + private external fun runFileNative(requestJson: String): String? 
+ } +} diff --git a/expo-bioscript/expo-module.config.json b/expo-bioscript/expo-module.config.json new file mode 100644 index 0000000..d640414 --- /dev/null +++ b/expo-bioscript/expo-module.config.json @@ -0,0 +1,9 @@ +{ + "platforms": ["apple", "android"], + "apple": { + "modules": ["ExpoBioscriptModule"] + }, + "android": { + "modules": ["expo.modules.bioscript.ExpoBioscriptModule"] + } +} diff --git a/expo-bioscript/ios/.gitkeep b/expo-bioscript/ios/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/expo-bioscript/ios/.gitkeep @@ -0,0 +1 @@ + diff --git a/expo-bioscript/ios/ExpoBioscript.podspec b/expo-bioscript/ios/ExpoBioscript.podspec new file mode 100644 index 0000000..8fc45be --- /dev/null +++ b/expo-bioscript/ios/ExpoBioscript.podspec @@ -0,0 +1,32 @@ +require 'json' + +package = JSON.parse(File.read(File.join(__dir__, '..', 'package.json'))) + +Pod::Spec.new do |s| + s.name = 'ExpoBioscript' + s.version = package['version'] + s.summary = package['description'] + s.description = package['description'] + s.license = package['license'] + s.author = package['author'] + s.platforms = { + :ios => '15.1', + :tvos => '15.1' + } + s.swift_version = '5.9' + s.source = { :git => 'https://github.com/OpenMined/biovault-app.git' } + s.static_framework = true + + s.dependency 'ExpoModulesCore' + s.pod_target_xcconfig = { + 'DEFINES_MODULE' => 'YES' + } + s.prepare_command = <<-CMD + set -e + sh ../scripts/build-rust-ios.sh + CMD + s.vendored_libraries = 'Artifacts/*.a' + + s.source_files = 'ExpoBioscriptModule.swift' + s.preserve_paths = '../../rust/**/*', '../scripts/**/*', 'Artifacts/*.a' +end diff --git a/expo-bioscript/ios/ExpoBioscriptModule.swift b/expo-bioscript/ios/ExpoBioscriptModule.swift new file mode 100644 index 0000000..67fa9eb --- /dev/null +++ b/expo-bioscript/ios/ExpoBioscriptModule.swift @@ -0,0 +1,63 @@ +import ExpoModulesCore +import Foundation + +@_silgen_name("bioscript_run_file_json") +func bioscript_run_file_json(_ 
requestJson: UnsafePointer<CChar>) -> UnsafeMutablePointer<CChar>? + +@_silgen_name("bioscript_free_string") +func bioscript_free_string(_ ptr: UnsafeMutablePointer<CChar>) + +public final class ExpoBioscriptModule: Module { + public func definition() -> ModuleDefinition { + Name("ExpoBioscript") + + Function("isAvailable") { + true + } + + AsyncFunction("runFile") { (request: [String: Any]) -> [String: Any] in + let jsonData = try JSONSerialization.data(withJSONObject: request, options: []) + guard let requestJson = String(data: jsonData, encoding: .utf8) else { + throw BioscriptModuleError.invalidRequestEncoding + } + + guard let requestCString = strdup(requestJson) else { + throw BioscriptModuleError.allocationFailed + } + defer { free(requestCString) } + + guard let resultPointer = bioscript_run_file_json(requestCString) else { + throw BioscriptModuleError.nullResponse + } + defer { bioscript_free_string(resultPointer) } + + let resultString = String(cString: resultPointer) + guard let resultData = resultString.data(using: .utf8) else { + throw BioscriptModuleError.invalidResponseEncoding + } + + guard let resultObject = try JSONSerialization.jsonObject(with: resultData) as? [String: Any] else { + throw BioscriptModuleError.invalidResponseShape + } + + guard (resultObject["ok"] as? Bool) == true else { + let message = (resultObject["error"] as? String) ?? "bioscript runFile failed" + throw Exception(name: "BioscriptRunFileError", description: message) + } + + guard let value = resultObject["value"] as? 
[String: Any] else { + throw BioscriptModuleError.invalidResponseShape + } + + return value + } + } +} + +private enum BioscriptModuleError: Error { + case allocationFailed + case invalidRequestEncoding + case nullResponse + case invalidResponseEncoding + case invalidResponseShape +} diff --git a/expo-bioscript/package.json b/expo-bioscript/package.json new file mode 100644 index 0000000..118e8ef --- /dev/null +++ b/expo-bioscript/package.json @@ -0,0 +1,32 @@ +{ + "name": "expo-bioscript", + "version": "0.1.0", + "description": "Expo native module for integrating the BioScript runtime.", + "main": "src/index.ts", + "react-native": "src/index.ts", + "types": "src/index.ts", + "exports": { + ".": "./src/index.ts" + }, + "sideEffects": false, + "files": [ + "src", + "android", + "ios", + "scripts", + "expo-module.config.json", + "README.md" + ], + "keywords": [ + "expo", + "react-native", + "bioscript", + "rust" + ], + "author": "kj", + "license": "MIT", + "peerDependencies": { + "expo": "*", + "react-native": "*" + } +} diff --git a/expo-bioscript/scripts/.gitkeep b/expo-bioscript/scripts/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/expo-bioscript/scripts/.gitkeep @@ -0,0 +1 @@ + diff --git a/expo-bioscript/scripts/build-rust-android.sh b/expo-bioscript/scripts/build-rust-android.sh new file mode 100644 index 0000000..b3f4951 --- /dev/null +++ b/expo-bioscript/scripts/build-rust-android.sh @@ -0,0 +1,85 @@ +#!/bin/sh +set -eu + +ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)" +RUST_WORKSPACE_DIR="$ROOT_DIR/../rust" +RUST_MANIFEST="$RUST_WORKSPACE_DIR/bioscript/Cargo.toml" +ANDROID_DIR="$ROOT_DIR/android" +JNI_LIBS_DIR="$ANDROID_DIR/src/main/jniLibs" +CARGO_HOME_DIR="${CARGO_HOME:-/tmp/expo-bioscript-cargo}" +SDK_ROOT_DIR="${ANDROID_HOME:-${ANDROID_SDK_ROOT:-}}" +APP_ANDROID_LOCAL_PROPERTIES="$ROOT_DIR/../../android/local.properties" + +if [ "$#" -eq 0 ]; then + set -- arm64-v8a x86_64 +fi + +if [ -f "$HOME/.cargo/env" ]; then + . 
"$HOME/.cargo/env" +fi + +if [ -z "$SDK_ROOT_DIR" ] && [ -f "$APP_ANDROID_LOCAL_PROPERTIES" ]; then + SDK_ROOT_DIR="$(sed -n 's/^sdk\.dir=//p' "$APP_ANDROID_LOCAL_PROPERTIES" | tail -n 1)" +fi + +if [ -z "$SDK_ROOT_DIR" ] && [ -d "$HOME/Library/Android/sdk" ]; then + SDK_ROOT_DIR="$HOME/Library/Android/sdk" +fi + +if [ -z "$SDK_ROOT_DIR" ]; then + echo "ANDROID_HOME or ANDROID_SDK_ROOT must be set" + exit 1 +fi + +if ! command -v cargo >/dev/null 2>&1; then + echo "cargo is not available in PATH" + exit 1 +fi + +if ! command -v rustup >/dev/null 2>&1; then + echo "rustup is not available in PATH" + exit 1 +fi + +if ! command -v cargo-ndk >/dev/null 2>&1; then + echo "cargo-ndk is required for Android builds" + echo "Install it with: cargo install cargo-ndk" + exit 1 +fi + +NDK_ARGS="" + +for ABI in "$@"; do + case "$ABI" in + armeabi-v7a) + RUST_TARGET="armv7-linux-androideabi" + ;; + arm64-v8a) + RUST_TARGET="aarch64-linux-android" + ;; + x86) + RUST_TARGET="i686-linux-android" + ;; + x86_64) + RUST_TARGET="x86_64-linux-android" + ;; + *) + echo "Unsupported Android ABI: $ABI" + exit 1 + ;; + esac + + if ! rustup target list --installed | grep -q "^$RUST_TARGET$"; then + echo "Missing Rust target: $RUST_TARGET" + echo "Install it with: rustup target add $RUST_TARGET" + exit 1 + fi + + NDK_ARGS="$NDK_ARGS -t $ABI" +done + +mkdir -p "$CARGO_HOME_DIR" "$JNI_LIBS_DIR" + +cd "$RUST_WORKSPACE_DIR" +# shellcheck disable=SC2086 +CARGO_HOME="$CARGO_HOME_DIR" cargo ndk $NDK_ARGS -o "$JNI_LIBS_DIR" build --manifest-path "$RUST_MANIFEST" --release diff --git a/expo-bioscript/scripts/build-rust-ios.sh b/expo-bioscript/scripts/build-rust-ios.sh new file mode 100644 index 0000000..af40414 --- /dev/null +++ b/expo-bioscript/scripts/build-rust-ios.sh @@ -0,0 +1,53 @@ +#!/bin/sh +set -eu + +ROOT_DIR="$(cd "$(dirname "$0")/.." 
&& pwd)" +IOS_DIR="$ROOT_DIR/ios" +RUST_WORKSPACE_DIR="$ROOT_DIR/../rust" +RUST_MANIFEST="$RUST_WORKSPACE_DIR/bioscript/Cargo.toml" +CARGO_HOME_DIR="${CARGO_HOME:-/tmp/expo-bioscript-cargo}" +ARTIFACTS_DIR="$IOS_DIR/Artifacts" +DEVICE_TARGET="aarch64-apple-ios" +SIM_TARGET="aarch64-apple-ios-sim" + +if [ -f "$HOME/.cargo/env" ]; then + . "$HOME/.cargo/env" +fi + +if ! command -v cargo >/dev/null 2>&1; then + echo "cargo is not available in PATH" + exit 1 +fi + +if ! command -v rustup >/dev/null 2>&1; then + echo "rustup is not available in PATH" + exit 1 +fi + +for RUST_TARGET in "$DEVICE_TARGET" "$SIM_TARGET"; do + if rustup target list --installed | grep -q "^$RUST_TARGET$"; then + continue + fi + + echo "Missing Rust target: $RUST_TARGET" + echo "Install it with: rustup target add $RUST_TARGET" + exit 1 +done + +mkdir -p "$CARGO_HOME_DIR" + +cd "$RUST_WORKSPACE_DIR" +CARGO_HOME="$CARGO_HOME_DIR" cargo build --manifest-path "$RUST_MANIFEST" --target "$DEVICE_TARGET" --release +CARGO_HOME="$CARGO_HOME_DIR" cargo build --manifest-path "$RUST_MANIFEST" --target "$SIM_TARGET" --release + +DEVICE_LIB="$RUST_WORKSPACE_DIR/target/$DEVICE_TARGET/release/libbioscript.a" +SIM_LIB="$RUST_WORKSPACE_DIR/target/$SIM_TARGET/release/libbioscript.a" + +if [ ! -f "$DEVICE_LIB" ] || [ ! 
-f "$SIM_LIB" ]; then + echo "Missing Rust build artifacts for Bioscript iOS packaging" + exit 1 +fi + +mkdir -p "$ARTIFACTS_DIR" +cp "$DEVICE_LIB" "$ARTIFACTS_DIR/libbioscript_ios.a" +cp "$SIM_LIB" "$ARTIFACTS_DIR/libbioscript_sim.a" diff --git a/expo-bioscript/src/.gitkeep b/expo-bioscript/src/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/expo-bioscript/src/.gitkeep @@ -0,0 +1 @@ + diff --git a/expo-bioscript/src/ExpoBioscript.types.ts b/expo-bioscript/src/ExpoBioscript.types.ts new file mode 100644 index 0000000..d8d8b00 --- /dev/null +++ b/expo-bioscript/src/ExpoBioscript.types.ts @@ -0,0 +1,25 @@ +export type BioscriptInputFormat = 'auto' | 'text' | 'zip' | 'vcf' | 'cram'; + +export type RunFileRequest = { + scriptPath: string; + root?: string; + inputFile?: string; + outputFile?: string; + participantId?: string; + traceReportPath?: string; + timingReportPath?: string; + inputFormat?: BioscriptInputFormat; + inputIndex?: string; + referenceFile?: string; + referenceIndex?: string; + autoIndex?: boolean; + cacheDir?: string; + maxDurationMs?: number; + maxMemoryBytes?: number; + maxAllocations?: number; + maxRecursionDepth?: number; +}; + +export type RunFileResult = { + ok: true; +}; diff --git a/expo-bioscript/src/ExpoBioscriptModule.ts b/expo-bioscript/src/ExpoBioscriptModule.ts new file mode 100644 index 0000000..34729a0 --- /dev/null +++ b/expo-bioscript/src/ExpoBioscriptModule.ts @@ -0,0 +1,10 @@ +import { NativeModule, requireNativeModule } from 'expo'; + +import type { RunFileRequest, RunFileResult } from './ExpoBioscript.types'; + +declare class ExpoBioscriptModule extends NativeModule { + isAvailable(): boolean; + runFile(request: RunFileRequest): Promise; +} + +export default requireNativeModule('ExpoBioscript'); diff --git a/expo-bioscript/src/index.ts b/expo-bioscript/src/index.ts new file mode 100644 index 0000000..4288984 --- /dev/null +++ b/expo-bioscript/src/index.ts @@ -0,0 +1,13 @@ +import ExpoBioscriptModule from 
'./ExpoBioscriptModule'; + +import type { BioscriptInputFormat, RunFileRequest, RunFileResult } from './ExpoBioscript.types'; + +export type { BioscriptInputFormat, RunFileRequest, RunFileResult } from './ExpoBioscript.types'; + +export function isBioscriptAvailable(): boolean { + return ExpoBioscriptModule.isAvailable(); +} + +export function runFile(request: RunFileRequest): Promise<RunFileResult> { + return ExpoBioscriptModule.runFile(request); +} diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 336ffe6..c84ed0d 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -137,9 +137,12 @@ dependencies = [ name = "bioscript" version = "0.1.0" dependencies = [ + "jni", "monty", "noodles", "rust-htslib", + "serde", + "serde_json", "serde_yaml", "zip", ] @@ -257,6 +260,12 @@ dependencies = [ "shlex", ] +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + [[package]] name = "cexpr" version = "0.6.0" @@ -321,6 +330,16 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2550f75b8cfac212855f6b1885455df8eaee8fe8e246b647d69146142e016084" +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + [[package]] name = "compact_str" version = "0.9.0" @@ -383,7 +402,7 @@ dependencies = [ "openssl-sys", "pkg-config", "vcpkg", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -850,6 +869,50 @@ dependencies = [ "smallvec", ] +[[package]] +name = "jni" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" +dependencies = [ + "cesu8", + "cfg-if", + "combine", + "jni-sys 0.3.1", + "log", + "thiserror 1.0.69", + 
"walkdir", + "windows-sys 0.45.0", +] + +[[package]] +name = "jni-sys" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" +dependencies = [ + "jni-sys 0.4.1", +] + +[[package]] +name = "jni-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" +dependencies = [ + "jni-sys-macros", +] + +[[package]] +name = "jni-sys-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "jobserver" version = "0.1.34" @@ -1562,6 +1625,15 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -1610,6 +1682,19 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + [[package]] name = "serde_yaml" version = "0.9.34+deprecated" @@ -1894,6 +1979,16 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -1954,6 +2049,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -2013,13 +2117,37 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-sys" version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", ] [[package]] @@ -2028,28 +2156,46 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - 
"windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -2062,24 +2208,48 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -2227,6 +2397,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + [[package]] name = "zopfli" version = "0.8.3" diff --git a/rust/bioscript/Cargo.toml b/rust/bioscript/Cargo.toml index 743d6dc..1ae4a18 100644 --- a/rust/bioscript/Cargo.toml +++ b/rust/bioscript/Cargo.toml @@ -3,12 +3,20 @@ name = "bioscript" version = "0.1.0" edition = "2024" +[lib] +crate-type = ["rlib", "staticlib", "cdylib"] + [dependencies] +jni = "0.21" monty = { path = "../../monty/crates/monty" } noodles = { version = "0.104.0", features = ["bgzf"] } zip = { version = "2.2.0", default-features = false, features = ["deflate"] } -rust-htslib = "0.51.0" +serde = { version = 
"1.0", features = ["derive"] } +serde_json = "1.0" serde_yaml = "0.9.34" +[target.'cfg(not(any(target_os = "ios", target_os = "tvos")))'.dependencies] +rust-htslib = "0.51.0" + [lints.clippy] pedantic = { level = "warn", priority = -1 } diff --git a/rust/bioscript/src/genotype.rs b/rust/bioscript/src/genotype.rs index 5011586..25f3432 100644 --- a/rust/bioscript/src/genotype.rs +++ b/rust/bioscript/src/genotype.rs @@ -7,11 +7,14 @@ use std::{ }; use noodles::bgzf; +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] use rust_htslib::bam::{self, Read}; use zip::ZipArchive; use crate::runtime::RuntimeError; -use crate::variant::{Assembly, VariantKind, VariantObservation, VariantSpec}; +use crate::variant::{VariantObservation, VariantSpec}; +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +use crate::variant::{Assembly, VariantKind}; const COMMENT_PREFIXES: [&str; 2] = ["#", "//"]; @@ -65,6 +68,7 @@ struct DelimitedBackend { zip_entry_name: Option, } +#[cfg_attr(any(target_os = "ios", target_os = "tvos"), allow(dead_code))] #[derive(Debug, Clone)] struct CramBackend { path: PathBuf, @@ -249,7 +253,7 @@ impl GenotypeStore { }, QueryBackend::Cram(_) => BackendCapabilities { rsid_lookup: false, - locus_lookup: true, + locus_lookup: cfg!(not(any(target_os = "ios", target_os = "tvos"))), }, } } @@ -371,6 +375,7 @@ impl DelimitedBackend { } } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] impl CramBackend { fn backend_name(&self) -> &'static str { "cram" @@ -636,6 +641,20 @@ impl CramBackend { } } +#[cfg(any(target_os = "ios", target_os = "tvos"))] +impl CramBackend { + fn backend_name(&self) -> &'static str { + "cram" + } + + fn lookup_variant(&self, _variant: &VariantSpec) -> Result { + Err(RuntimeError::Unsupported( + "CRAM/BAM-backed lookup is not supported on Apple mobile targets".to_owned(), + )) + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn choose_variant_locus(variant: &VariantSpec, reference_file: &Path) -> 
Option<(Assembly, GenomicLocus)> { match detect_reference_assembly(reference_file) { Some(Assembly::Grch38) => variant @@ -656,6 +675,7 @@ fn choose_variant_locus(variant: &VariantSpec, reference_file: &Path) -> Option< } } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn detect_reference_assembly(reference_file: &Path) -> Option { let lower = reference_file.to_string_lossy().to_ascii_lowercase(); if lower.contains("grch38") || lower.contains("hg38") || lower.contains("assembly38") { @@ -667,6 +687,7 @@ fn detect_reference_assembly(reference_file: &Path) -> Option { } } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn fetch_locus(reader: &mut bam::IndexedReader, locus: &GenomicLocus) -> Result<(), RuntimeError> { let tid = header_tid(reader.header(), &locus.chrom).ok_or_else(|| { RuntimeError::Unsupported(format!( @@ -686,15 +707,18 @@ fn fetch_locus(reader: &mut bam::IndexedReader, locus: &GenomicLocus) -> Result< }) } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn header_tid(header: &bam::HeaderView, chrom: &str) -> Option { let candidates = [chrom.to_owned(), format!("chr{chrom}"), chrom.trim_start_matches("chr").to_owned()]; candidates.iter().find_map(|candidate| header.tid(candidate.as_bytes())) } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn describe_locus(locus: &GenomicLocus) -> String { format!("{}:{}-{}", locus.chrom, locus.start, locus.end) } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn anchor_window(locus: &GenomicLocus) -> GenomicLocus { let anchor = locus.start.saturating_sub(1); GenomicLocus { @@ -704,10 +728,12 @@ fn anchor_window(locus: &GenomicLocus) -> GenomicLocus { } } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn first_base(value: &str) -> Option { value.trim().chars().next().map(|ch| ch.to_ascii_uppercase()) } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn infer_snp_genotype( reference: char, alternate: char, @@ -728,6 +754,7 @@ fn infer_snp_genotype( } 
} +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn infer_copy_number_genotype( reference: &str, alternate: &str, @@ -823,6 +850,7 @@ fn detect_delimiter(lines: &[String]) -> Delimiter { Delimiter::Tab } +#[allow(dead_code)] struct RowParser { delimiter: Delimiter, header: Option>, @@ -830,6 +858,7 @@ struct RowParser { alias_map: HashMap<&'static str, BTreeSet<&'static str>>, } +#[allow(dead_code)] impl RowParser { fn new(delimiter: Delimiter) -> Self { let mut alias_map = HashMap::new(); diff --git a/rust/bioscript/src/lib.rs b/rust/bioscript/src/lib.rs index 7ada152..6124302 100644 --- a/rust/bioscript/src/lib.rs +++ b/rust/bioscript/src/lib.rs @@ -1,3 +1,10 @@ +use std::{ + env, ffi::{CStr, CString}, fs, os::raw::c_char, path::PathBuf, time::{Duration, Instant}, +}; + +use monty::{MontyObject, ResourceLimits}; +use serde::{Deserialize, Serialize}; + pub mod genotype; pub mod prepare; pub mod runtime; @@ -11,3 +18,285 @@ pub use prepare::{PrepareRequest, PreparedPaths, prepare_indexes, shell_flags}; pub use runtime::{BioscriptRuntime, RuntimeConfig, RuntimeError, StageTiming}; pub use validator::{FileReport, Issue, Severity, ValidationReport, validate_variants_path}; pub use variant::{Assembly, VariantKind, VariantObservation, VariantSpec}; + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RunFileRequest { + pub script_path: String, + pub root: Option, + pub input_file: Option, + pub output_file: Option, + pub participant_id: Option, + pub trace_report_path: Option, + pub timing_report_path: Option, + pub input_format: Option, + pub input_index: Option, + pub reference_file: Option, + pub reference_index: Option, + pub auto_index: Option, + pub cache_dir: Option, + pub max_duration_ms: Option, + pub max_memory_bytes: Option, + pub max_allocations: Option, + pub max_recursion_depth: Option, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct RunFileResult { + pub ok: bool, +} + 
+#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +struct FfiResult { + ok: bool, + #[serde(skip_serializing_if = "Option::is_none")] + value: Option, + #[serde(skip_serializing_if = "Option::is_none")] + error: Option, +} + +pub fn run_file_request(request: RunFileRequest) -> Result { + let script_path = PathBuf::from(&request.script_path); + let runtime_root = match request.root { + Some(dir) => PathBuf::from(dir), + None => env::current_dir().map_err(|err| format!("failed to get current directory: {err}"))?, + }; + + let mut loader = GenotypeLoadOptions::default(); + if let Some(value) = request.input_format.as_deref() { + if value.eq_ignore_ascii_case("auto") { + loader.format = None; + } else { + let parsed = value + .parse::() + .map_err(|err| format!("invalid inputFormat value {value}: {err}"))?; + loader.format = Some(parsed); + } + } + loader.input_index = request.input_index.map(PathBuf::from); + loader.reference_file = request.reference_file.map(PathBuf::from); + loader.reference_index = request.reference_index.map(PathBuf::from); + + let mut limits = ResourceLimits::new() + .max_duration(Duration::from_millis(100)) + .max_memory(8 * 1024 * 1024) + .max_allocations(200_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)); + + if let Some(value) = request.max_duration_ms { + limits = limits.max_duration(Duration::from_millis(value)); + } + if let Some(value) = request.max_memory_bytes { + limits = limits.max_memory(value); + } + if let Some(value) = request.max_allocations { + limits = limits.max_allocations(value); + } + if let Some(value) = request.max_recursion_depth { + limits = limits.max_recursion_depth(Some(value)); + } + + let mut ffi_timings: Vec = Vec::new(); + if request.auto_index.unwrap_or(false) { + let auto_index_started = Instant::now(); + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let effective_cache = request + .cache_dir + .as_ref() + .map(PathBuf::from) + .unwrap_or_else(|| 
cwd.join(".bioscript-cache")); + let prepare_request = PrepareRequest { + root: runtime_root.clone(), + cwd: cwd.clone(), + cache_dir: effective_cache, + input_file: request.input_file.clone(), + input_format: loader.format, + reference_file: loader + .reference_file + .as_ref() + .map(|p| p.to_string_lossy().to_string()), + }; + let prepared = prepare_indexes(&prepare_request)?; + if let Some(idx) = prepared.input_index + && loader.input_index.is_none() + { + loader.input_index = Some(idx); + } + if let Some(ref_file) = prepared.reference_file { + loader.reference_file = Some(ref_file); + } + if let Some(ref_idx) = prepared.reference_index + && loader.reference_index.is_none() + { + loader.reference_index = Some(ref_idx); + } + ffi_timings.push(StageTiming { + stage: "auto_index".to_owned(), + duration_ms: auto_index_started.elapsed().as_millis(), + detail: "prepare_indexes".to_owned(), + }); + } + + let runtime = BioscriptRuntime::with_config( + runtime_root, + RuntimeConfig { + limits, + loader, + }, + ) + .map_err(|err| err.to_string())?; + + let mut inputs = Vec::new(); + if let Some(input_file) = request.input_file { + inputs.push(("input_file", MontyObject::String(input_file))); + } + if let Some(output_file) = request.output_file { + inputs.push(("output_file", MontyObject::String(output_file))); + } + if let Some(participant_id) = request.participant_id { + inputs.push(("participant_id", MontyObject::String(participant_id))); + } + + runtime + .run_file( + &script_path, + request.trace_report_path.as_deref().map(std::path::Path::new), + inputs, + ) + .map_err(|err| err.to_string())?; + + if let Some(timing_path) = request.timing_report_path { + let mut all_timings = ffi_timings; + all_timings.extend(runtime.timing_snapshot()); + write_timing_report(&PathBuf::from(timing_path), &all_timings)?; + } + + Ok(RunFileResult { ok: true }) +} + +fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { + if let Some(parent) = 
path.parent() { + fs::create_dir_all(parent) + .map_err(|err| format!("failed to create timing report dir {}: {err}", parent.display()))?; + } + let mut output = String::from("stage\tduration_ms\tdetail\n"); + for timing in timings { + output.push_str(&format!( + "{}\t{}\t{}\n", + timing.stage, + timing.duration_ms, + timing.detail.replace('\t', " ") + )); + } + fs::write(path, output).map_err(|err| format!("failed to write timing report {}: {err}", path.display())) +} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn bioscript_run_file_json(request_json: *const c_char) -> *mut c_char { + let response = unsafe { + if request_json.is_null() { + FfiResult:: { + ok: false, + value: None, + error: Some("request_json was null".to_owned()), + } + } else { + match CStr::from_ptr(request_json).to_str() { + Ok(value) => match serde_json::from_str::(value) { + Ok(request) => match run_file_request(request) { + Ok(result) => FfiResult { + ok: true, + value: Some(result), + error: None, + }, + Err(error) => FfiResult:: { + ok: false, + value: None, + error: Some(error), + }, + }, + Err(error) => FfiResult:: { + ok: false, + value: None, + error: Some(format!("invalid request JSON: {error}")), + }, + }, + Err(error) => FfiResult:: { + ok: false, + value: None, + error: Some(format!("request_json was not valid UTF-8: {error}")), + }, + } + } + }; + + match serde_json::to_string(&response) { + Ok(json) => match CString::new(json) { + Ok(value) => value.into_raw(), + Err(_) => std::ptr::null_mut(), + }, + Err(_) => std::ptr::null_mut(), + } +} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn bioscript_free_string(ptr: *mut c_char) { + if !ptr.is_null() { + unsafe { + let _ = CString::from_raw(ptr); + } + } +} + +#[cfg(target_os = "android")] +pub mod android { + use crate::{RunFileRequest, run_file_request}; + use jni::JNIEnv; + use jni::objects::{JClass, JString}; + + /// JNI entrypoint used by the Android Expo module bridge. 
+ /// + /// # Safety + /// - Called by the JVM with valid JNI references and strings. + /// - Follows standard JNI safety rules. + #[unsafe(no_mangle)] + pub unsafe extern "system" fn Java_expo_modules_bioscript_ExpoBioscriptNativeBridge_runFileNative< + 'local, + >( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + request_json: JString<'local>, + ) -> JString<'local> { + let request_string: String = match env.get_string(&request_json) { + Ok(value) => value.into(), + Err(error) => { + return env + .new_string( + serde_json::json!({ + "ok": false, + "error": format!("failed to read request json from JVM: {error}") + }) + .to_string(), + ) + .expect("jni string allocation failed"); + } + }; + + let response = match serde_json::from_str::(&request_string) { + Ok(request) => match run_file_request(request) { + Ok(result) => serde_json::json!({ "ok": true, "value": result }).to_string(), + Err(error) => serde_json::json!({ "ok": false, "error": error }).to_string(), + }, + Err(error) => serde_json::json!({ + "ok": false, + "error": format!("invalid request JSON: {error}") + }) + .to_string(), + }; + + env.new_string(response).expect("jni string allocation failed") + } +} diff --git a/rust/bioscript/src/prepare.rs b/rust/bioscript/src/prepare.rs index 867d383..6eb380e 100644 --- a/rust/bioscript/src/prepare.rs +++ b/rust/bioscript/src/prepare.rs @@ -1,10 +1,15 @@ use std::{ - collections::hash_map::DefaultHasher, fs, - hash::{Hash, Hasher}, path::{Path, PathBuf}, }; +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +use std::{ + collections::hash_map::DefaultHasher, + hash::{Hash, Hasher}, +}; + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] use rust_htslib::{bam, faidx}; use crate::genotype::GenotypeSourceFormat; @@ -106,6 +111,14 @@ fn detect_alignment_input(path: &Path) -> bool { } fn ensure_alignment_index(path: &Path, cache_dir: &Path) -> Result { + #[cfg(any(target_os = "ios", target_os = "tvos"))] + { + let _ = (path, cache_dir); + return 
Err("alignment indexing is not supported on Apple mobile targets".to_owned()); + } + + #[cfg(not(any(target_os = "ios", target_os = "tvos")))] + { if let Some(existing) = adjacent_alignment_index(path) { return Ok(existing); } @@ -137,8 +150,10 @@ fn ensure_alignment_index(path: &Path, cache_dir: &Path) -> Result Option { let lower = path.to_string_lossy().to_ascii_lowercase(); let candidates = if lower.ends_with(".cram") { @@ -156,6 +171,14 @@ fn adjacent_alignment_index(path: &Path) -> Option { } fn ensure_reference_index(path: &Path, cache_dir: &Path) -> Result<(PathBuf, PathBuf), String> { + #[cfg(any(target_os = "ios", target_os = "tvos"))] + { + let _ = (path, cache_dir); + return Err("reference indexing is not supported on Apple mobile targets".to_owned()); + } + + #[cfg(not(any(target_os = "ios", target_os = "tvos")))] + { let adjacent = adjacent_reference_index(path); if let Some(index) = adjacent { return Ok((path.to_path_buf(), index)); @@ -179,13 +202,16 @@ fn ensure_reference_index(path: &Path, cache_dir: &Path) -> Result<(PathBuf, Pat } Ok((cached_reference, cached_index)) + } } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn adjacent_reference_index(path: &Path) -> Option { let candidate = cached_reference_index_path(path); candidate.exists().then_some(candidate) } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn cached_reference_index_path(path: &Path) -> PathBuf { if let Some(ext) = path.extension().and_then(|ext| ext.to_str()) { path.with_extension(format!("{ext}.fai")) @@ -194,6 +220,7 @@ fn cached_reference_index_path(path: &Path) -> PathBuf { } } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn create_reference_link(source: &Path, target: &Path) -> Result<(), String> { if let Some(parent) = target.parent() { fs::create_dir_all(parent) @@ -225,6 +252,7 @@ fn create_reference_link(source: &Path, target: &Path) -> Result<(), String> { } } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn 
stable_stem(path: &Path) -> String { let mut hasher = DefaultHasher::new(); path.to_string_lossy().hash(&mut hasher); @@ -237,6 +265,7 @@ fn stable_stem(path: &Path) -> String { format!("{file_name}-{hash:016x}") } +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] fn cache_reference_name(path: &Path) -> String { let file_name = path .file_name() From 54aff8cf33b157e918a4abcadf2d2db3380e51af Mon Sep 17 00:00:00 2001 From: keelan Date: Fri, 10 Apr 2026 13:37:28 +0100 Subject: [PATCH 2/4] Remove subfolder --- expo-bioscript/.gitignore | 4 - expo-bioscript/README.md | 94 ------------------- expo-bioscript/android/.gitignore | 2 - expo-bioscript/android/.gitkeep | 1 - expo-bioscript/android/build.gradle | 36 ------- .../android/src/main/AndroidManifest.xml | 1 - .../modules/bioscript/ExpoBioscriptModule.kt | 48 ---------- .../bioscript/ExpoBioscriptNativeBridge.kt | 26 ----- expo-bioscript/expo-module.config.json | 9 -- expo-bioscript/ios/.gitkeep | 1 - expo-bioscript/ios/ExpoBioscript.podspec | 32 ------- expo-bioscript/ios/ExpoBioscriptModule.swift | 63 ------------- expo-bioscript/package.json | 32 ------- expo-bioscript/scripts/.gitkeep | 1 - expo-bioscript/scripts/build-rust-android.sh | 85 ----------------- expo-bioscript/scripts/build-rust-ios.sh | 53 ----------- expo-bioscript/src/.gitkeep | 1 - expo-bioscript/src/ExpoBioscript.types.ts | 25 ----- expo-bioscript/src/ExpoBioscriptModule.ts | 10 -- expo-bioscript/src/index.ts | 13 --- 20 files changed, 537 deletions(-) delete mode 100644 expo-bioscript/.gitignore delete mode 100644 expo-bioscript/README.md delete mode 100644 expo-bioscript/android/.gitignore delete mode 100644 expo-bioscript/android/.gitkeep delete mode 100644 expo-bioscript/android/build.gradle delete mode 100644 expo-bioscript/android/src/main/AndroidManifest.xml delete mode 100644 expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptModule.kt delete mode 100644 
expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptNativeBridge.kt delete mode 100644 expo-bioscript/expo-module.config.json delete mode 100644 expo-bioscript/ios/.gitkeep delete mode 100644 expo-bioscript/ios/ExpoBioscript.podspec delete mode 100644 expo-bioscript/ios/ExpoBioscriptModule.swift delete mode 100644 expo-bioscript/package.json delete mode 100644 expo-bioscript/scripts/.gitkeep delete mode 100644 expo-bioscript/scripts/build-rust-android.sh delete mode 100644 expo-bioscript/scripts/build-rust-ios.sh delete mode 100644 expo-bioscript/src/.gitkeep delete mode 100644 expo-bioscript/src/ExpoBioscript.types.ts delete mode 100644 expo-bioscript/src/ExpoBioscriptModule.ts delete mode 100644 expo-bioscript/src/index.ts diff --git a/expo-bioscript/.gitignore b/expo-bioscript/.gitignore deleted file mode 100644 index c7cd14f..0000000 --- a/expo-bioscript/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -android/.gradle/ -android/build/ -android/src/main/jniLibs/ -ios/Artifacts/ diff --git a/expo-bioscript/README.md b/expo-bioscript/README.md deleted file mode 100644 index 5af7e46..0000000 --- a/expo-bioscript/README.md +++ /dev/null @@ -1,94 +0,0 @@ -# expo-bioscript - -Expo module wrapper for the BioScript runtime. - -## Current - -BioScript currently lives in this repo as a Rust-first runtime for secure genomic analysis with Pythonic syntax via Monty. - -Right now: -- `rust/bioscript` contains the runtime logic -- `monty/` is the interpreter/runtime dependency -- `expo-bioscript/` contains a working wrapper scaffold with Android and iOS packaging scripts -- BioScript is still evolving, so the mobile-facing API is not treated as stable - -The current goal is to expose a narrow Expo-native interface to BioScript without exposing generic Monty execution directly. 
- -## Near-Term - -The expected implementation is: -- keep Monty internal -- add a thin Rust FFI layer for mobile-safe entrypoints -- expose a small Expo API such as: - - `runFile(...)` - - `runCode(...)` - - `prepareIndexes(...)` -- enforce resource limits, restricted I/O, and blocked OS/network access through the BioScript runtime - -This `expo-bioscript` folder is the home for the Expo wrapper layer: -- `ios/` -- `android/` -- `src/` -- `scripts/` - -Current wrapper status: -- `runFile(...)` is implemented end-to-end through the Rust FFI layer -- Android native packaging builds successfully -- iOS native packaging builds successfully -- Apple mobile targets currently disable HTS-backed CRAM/BAM indexing and lookup paths - -## First API - -The first Expo-facing API should be `runFile(...)`, not `runCode(...)`. - -Reason: -- it fits the current BioScript runtime better -- it keeps execution rooted in explicit files instead of arbitrary code strings -- it is the safer first bridge for mobile - -Proposed request shape: - -```ts -type RunFileRequest = { - scriptPath: string; - root?: string; - inputFile?: string; - outputFile?: string; - participantId?: string; - traceReportPath?: string; - timingReportPath?: string; - inputFormat?: 'auto' | 'text' | 'zip' | 'vcf' | 'cram'; - inputIndex?: string; - referenceFile?: string; - referenceIndex?: string; - autoIndex?: boolean; - cacheDir?: string; - maxDurationMs?: number; - maxMemoryBytes?: number; - maxAllocations?: number; - maxRecursionDepth?: number; -}; -``` - -Proposed response shape: - -```ts -type RunFileResult = { - ok: true; -}; -``` - -For the first implementation, errors should be surfaced as native/module exceptions rather than encoded into the success payload. 
- -## Long-Term - -If the API stabilizes, this will likely evolve into a cleaner split: - -- `bioscript-core` - - runtime, domain logic, Monty integration -- `bioscript-ffi` - - stable native/mobile boundary -- `expo-bioscript` - - Expo module wrapper - -At that point, `expo-bioscript` may move into its own repo and depend on BioScript through a more stable versioned interface. diff --git a/expo-bioscript/android/.gitignore b/expo-bioscript/android/.gitignore deleted file mode 100644 index 9ff2a37..0000000 --- a/expo-bioscript/android/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/build -/.cxx diff --git a/expo-bioscript/android/.gitkeep b/expo-bioscript/android/.gitkeep deleted file mode 100644 index 8b13789..0000000 --- a/expo-bioscript/android/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - diff --git a/expo-bioscript/android/build.gradle b/expo-bioscript/android/build.gradle deleted file mode 100644 index 3cac9f2..0000000 --- a/expo-bioscript/android/build.gradle +++ /dev/null @@ -1,36 +0,0 @@ -plugins { - id 'com.android.library' - id 'expo-module-gradle-plugin' -} - -group = 'expo.modules.bioscript' -version = '0.1.0' - -def reactNativeArchitectures() { - def value = project.getProperties().get('reactNativeArchitectures') - return value ? 
value.split(',') : ['armeabi-v7a', 'x86', 'x86_64', 'arm64-v8a'] -} - -tasks.register('buildRustAndroid', Exec) { - workingDir projectDir - commandLine 'sh', '../scripts/build-rust-android.sh', *reactNativeArchitectures() -} - -android { - namespace 'expo.modules.bioscript' - - defaultConfig { - versionCode 1 - versionName '0.1.0' - } - - sourceSets { - main { - jniLibs.srcDirs = ['src/main/jniLibs'] - } - } -} - -tasks.matching { it.name == 'preBuild' }.configureEach { - dependsOn tasks.named('buildRustAndroid') -} diff --git a/expo-bioscript/android/src/main/AndroidManifest.xml b/expo-bioscript/android/src/main/AndroidManifest.xml deleted file mode 100644 index cc947c5..0000000 --- a/expo-bioscript/android/src/main/AndroidManifest.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptModule.kt b/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptModule.kt deleted file mode 100644 index e5c41cf..0000000 --- a/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptModule.kt +++ /dev/null @@ -1,48 +0,0 @@ -package expo.modules.bioscript - -import expo.modules.kotlin.exception.CodedException -import expo.modules.kotlin.modules.Module -import expo.modules.kotlin.modules.ModuleDefinition -import org.json.JSONObject - -internal class BioscriptUnavailableException : - CodedException("bioscript native Android library is unavailable.") - -internal class BioscriptInvalidResponseException : - CodedException("bioscript returned an invalid JSON response.") - -internal class BioscriptRuntimeException(message: String) : - CodedException(message) - -class ExpoBioscriptModule : Module() { - override fun definition() = ModuleDefinition { - Name("ExpoBioscript") - - Function("isAvailable") { - ExpoBioscriptNativeBridge.isAvailable() - } - - AsyncFunction("runFile") { request: Map -> - if (!ExpoBioscriptNativeBridge.isAvailable()) { - throw BioscriptUnavailableException() - } - - 
val requestJson = JSONObject(request).toString() - val response = ExpoBioscriptNativeBridge.runFile(requestJson) - ?: throw BioscriptInvalidResponseException() - - val json = try { - JSONObject(response) - } catch (_: Exception) { - throw BioscriptInvalidResponseException() - } - - if (!json.optBoolean("ok")) { - throw BioscriptRuntimeException(json.optString("error", "bioscript runFile failed")) - } - - val value = json.optJSONObject("value") ?: throw BioscriptInvalidResponseException() - mapOf("ok" to value.optBoolean("ok")) - } - } -} diff --git a/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptNativeBridge.kt b/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptNativeBridge.kt deleted file mode 100644 index 7ffb09d..0000000 --- a/expo-bioscript/android/src/main/java/expo/modules/bioscript/ExpoBioscriptNativeBridge.kt +++ /dev/null @@ -1,26 +0,0 @@ -package expo.modules.bioscript - -internal class ExpoBioscriptNativeBridge private constructor() { - companion object { - private val nativeLibraryLoaded: Boolean by lazy { - try { - System.loadLibrary("bioscript") - true - } catch (_: UnsatisfiedLinkError) { - false - } - } - - @JvmStatic - fun isAvailable(): Boolean = nativeLibraryLoaded - - @JvmStatic - fun runFile(requestJson: String): String? { - check(nativeLibraryLoaded) { "bioscript native library is unavailable on Android." } - return runFileNative(requestJson) - } - - @JvmStatic - private external fun runFileNative(requestJson: String): String? 
- } -} diff --git a/expo-bioscript/expo-module.config.json b/expo-bioscript/expo-module.config.json deleted file mode 100644 index d640414..0000000 --- a/expo-bioscript/expo-module.config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "platforms": ["apple", "android"], - "apple": { - "modules": ["ExpoBioscriptModule"] - }, - "android": { - "modules": ["expo.modules.bioscript.ExpoBioscriptModule"] - } -} diff --git a/expo-bioscript/ios/.gitkeep b/expo-bioscript/ios/.gitkeep deleted file mode 100644 index 8b13789..0000000 --- a/expo-bioscript/ios/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - diff --git a/expo-bioscript/ios/ExpoBioscript.podspec b/expo-bioscript/ios/ExpoBioscript.podspec deleted file mode 100644 index 8fc45be..0000000 --- a/expo-bioscript/ios/ExpoBioscript.podspec +++ /dev/null @@ -1,32 +0,0 @@ -require 'json' - -package = JSON.parse(File.read(File.join(__dir__, '..', 'package.json'))) - -Pod::Spec.new do |s| - s.name = 'ExpoBioscript' - s.version = package['version'] - s.summary = package['description'] - s.description = package['description'] - s.license = package['license'] - s.author = package['author'] - s.platforms = { - :ios => '15.1', - :tvos => '15.1' - } - s.swift_version = '5.9' - s.source = { :git => 'https://github.com/OpenMined/biovault-app.git' } - s.static_framework = true - - s.dependency 'ExpoModulesCore' - s.pod_target_xcconfig = { - 'DEFINES_MODULE' => 'YES' - } - s.prepare_command = <<-CMD - set -e - sh ../scripts/build-rust-ios.sh - CMD - s.vendored_libraries = 'Artifacts/*.a' - - s.source_files = 'ExpoBioscriptModule.swift' - s.preserve_paths = '../../rust/**/*', '../scripts/**/*', 'Artifacts/*.a' -end diff --git a/expo-bioscript/ios/ExpoBioscriptModule.swift b/expo-bioscript/ios/ExpoBioscriptModule.swift deleted file mode 100644 index 67fa9eb..0000000 --- a/expo-bioscript/ios/ExpoBioscriptModule.swift +++ /dev/null @@ -1,63 +0,0 @@ -import ExpoModulesCore -import Foundation - -@_silgen_name("bioscript_run_file_json") -func 
bioscript_run_file_json(_ requestJson: UnsafePointer) -> UnsafeMutablePointer? - -@_silgen_name("bioscript_free_string") -func bioscript_free_string(_ ptr: UnsafeMutablePointer) - -public final class ExpoBioscriptModule: Module { - public func definition() -> ModuleDefinition { - Name("ExpoBioscript") - - Function("isAvailable") { - true - } - - AsyncFunction("runFile") { (request: [String: Any]) -> [String: Any] in - let jsonData = try JSONSerialization.data(withJSONObject: request, options: []) - guard let requestJson = String(data: jsonData, encoding: .utf8) else { - throw BioscriptModuleError.invalidRequestEncoding - } - - guard let requestCString = strdup(requestJson) else { - throw BioscriptModuleError.allocationFailed - } - defer { free(requestCString) } - - guard let resultPointer = bioscript_run_file_json(requestCString) else { - throw BioscriptModuleError.nullResponse - } - defer { bioscript_free_string(resultPointer) } - - let resultString = String(cString: resultPointer) - guard let resultData = resultString.data(using: .utf8) else { - throw BioscriptModuleError.invalidResponseEncoding - } - - guard let resultObject = try JSONSerialization.jsonObject(with: resultData) as? [String: Any] else { - throw BioscriptModuleError.invalidResponseShape - } - - guard (resultObject["ok"] as? Bool) == true else { - let message = (resultObject["error"] as? String) ?? "bioscript runFile failed" - throw Exception(name: "BioscriptRunFileError", description: message) - } - - guard let value = resultObject["value"] as? 
[String: Any] else { - throw BioscriptModuleError.invalidResponseShape - } - - return value - } - } -} - -private enum BioscriptModuleError: Error { - case allocationFailed - case invalidRequestEncoding - case nullResponse - case invalidResponseEncoding - case invalidResponseShape -} diff --git a/expo-bioscript/package.json b/expo-bioscript/package.json deleted file mode 100644 index 118e8ef..0000000 --- a/expo-bioscript/package.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "name": "expo-bioscript", - "version": "0.1.0", - "description": "Expo native module for integrating the BioScript runtime.", - "main": "src/index.ts", - "react-native": "src/index.ts", - "types": "src/index.ts", - "exports": { - ".": "./src/index.ts" - }, - "sideEffects": false, - "files": [ - "src", - "android", - "ios", - "scripts", - "expo-module.config.json", - "README.md" - ], - "keywords": [ - "expo", - "react-native", - "bioscript", - "rust" - ], - "author": "kj", - "license": "MIT", - "peerDependencies": { - "expo": "*", - "react-native": "*" - } -} diff --git a/expo-bioscript/scripts/.gitkeep b/expo-bioscript/scripts/.gitkeep deleted file mode 100644 index 8b13789..0000000 --- a/expo-bioscript/scripts/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - diff --git a/expo-bioscript/scripts/build-rust-android.sh b/expo-bioscript/scripts/build-rust-android.sh deleted file mode 100644 index b3f4951..0000000 --- a/expo-bioscript/scripts/build-rust-android.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/sh -set -eu - -ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)" -RUST_WORKSPACE_DIR="$ROOT_DIR/../rust" -RUST_MANIFEST="$RUST_WORKSPACE_DIR/bioscript/Cargo.toml" -ANDROID_DIR="$ROOT_DIR/android" -JNI_LIBS_DIR="$ANDROID_DIR/src/main/jniLibs" -CARGO_HOME_DIR="${CARGO_HOME:-/tmp/expo-bioscript-cargo}" -SDK_ROOT_DIR="${ANDROID_HOME:-${ANDROID_SDK_ROOT:-}}" -APP_ANDROID_LOCAL_PROPERTIES="$ROOT_DIR/../../android/local.properties" - -if [ "$#" -eq 0 ]; then - set -- arm64-v8a x86_64 -fi - -if [ -f "$HOME/.cargo/env" ]; then - . 
"$HOME/.cargo/env" -fi - -if [ -z "$SDK_ROOT_DIR" ] && [ -f "$APP_ANDROID_LOCAL_PROPERTIES" ]; then - SDK_ROOT_DIR="$(sed -n 's/^sdk\.dir=//p' "$APP_ANDROID_LOCAL_PROPERTIES" | tail -n 1)" -fi - -if [ -z "$SDK_ROOT_DIR" ] && [ -d "$HOME/Library/Android/sdk" ]; then - SDK_ROOT_DIR="$HOME/Library/Android/sdk" -fi - -if [ -z "$SDK_ROOT_DIR" ]; then - echo "ANDROID_HOME or ANDROID_SDK_ROOT must be set" - exit 1 -fi - -if ! command -v cargo >/dev/null 2>&1; then - echo "cargo is not available in PATH" - exit 1 -fi - -if ! command -v rustup >/dev/null 2>&1; then - echo "rustup is not available in PATH" - exit 1 -fi - -if ! command -v cargo-ndk >/dev/null 2>&1; then - echo "cargo-ndk is required for Android builds" - echo "Install it with: cargo install cargo-ndk" - exit 1 -fi - -NDK_ARGS="" - -for ABI in "$@"; do - case "$ABI" in - armeabi-v7a) - RUST_TARGET="armv7-linux-androideabi" - ;; - arm64-v8a) - RUST_TARGET="aarch64-linux-android" - ;; - x86) - RUST_TARGET="i686-linux-android" - ;; - x86_64) - RUST_TARGET="x86_64-linux-android" - ;; - *) - echo "Unsupported Android ABI: $ABI" - exit 1 - ;; - esac - - if ! rustup target list --installed | grep -q "^$RUST_TARGET$"; then - echo "Missing Rust target: $RUST_TARGET" - echo "Install it with: rustup target add $RUST_TARGET" - exit 1 - fi - - NDK_ARGS="$NDK_ARGS -t $ABI" -done - -mkdir -p "$CARGO_HOME_DIR" "$JNI_LIBS_DIR" - -cd "$RUST_WORKSPACE_DIR" -# shellcheck disable=SC2086 -CARGO_HOME="$CARGO_HOME_DIR" cargo ndk $NDK_ARGS -o "$JNI_LIBS_DIR" build --manifest-path "$RUST_MANIFEST" --release diff --git a/expo-bioscript/scripts/build-rust-ios.sh b/expo-bioscript/scripts/build-rust-ios.sh deleted file mode 100644 index af40414..0000000 --- a/expo-bioscript/scripts/build-rust-ios.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/sh -set -eu - -ROOT_DIR="$(cd "$(dirname "$0")/.." 
&& pwd)" -IOS_DIR="$ROOT_DIR/ios" -RUST_WORKSPACE_DIR="$ROOT_DIR/../rust" -RUST_MANIFEST="$RUST_WORKSPACE_DIR/bioscript/Cargo.toml" -CARGO_HOME_DIR="${CARGO_HOME:-/tmp/expo-bioscript-cargo}" -ARTIFACTS_DIR="$IOS_DIR/Artifacts" -DEVICE_TARGET="aarch64-apple-ios" -SIM_TARGET="aarch64-apple-ios-sim" - -if [ -f "$HOME/.cargo/env" ]; then - . "$HOME/.cargo/env" -fi - -if ! command -v cargo >/dev/null 2>&1; then - echo "cargo is not available in PATH" - exit 1 -fi - -if ! command -v rustup >/dev/null 2>&1; then - echo "rustup is not available in PATH" - exit 1 -fi - -for RUST_TARGET in "$DEVICE_TARGET" "$SIM_TARGET"; do - if rustup target list --installed | grep -q "^$RUST_TARGET$"; then - continue - fi - - echo "Missing Rust target: $RUST_TARGET" - echo "Install it with: rustup target add $RUST_TARGET" - exit 1 -done - -mkdir -p "$CARGO_HOME_DIR" - -cd "$RUST_WORKSPACE_DIR" -CARGO_HOME="$CARGO_HOME_DIR" cargo build --manifest-path "$RUST_MANIFEST" --target "$DEVICE_TARGET" --release -CARGO_HOME="$CARGO_HOME_DIR" cargo build --manifest-path "$RUST_MANIFEST" --target "$SIM_TARGET" --release - -DEVICE_LIB="$RUST_WORKSPACE_DIR/target/$DEVICE_TARGET/release/libbioscript.a" -SIM_LIB="$RUST_WORKSPACE_DIR/target/$SIM_TARGET/release/libbioscript.a" - -if [ ! -f "$DEVICE_LIB" ] || [ ! 
-f "$SIM_LIB" ]; then - echo "Missing Rust build artifacts for Bioscript iOS packaging" - exit 1 -fi - -mkdir -p "$ARTIFACTS_DIR" -cp "$DEVICE_LIB" "$ARTIFACTS_DIR/libbioscript_ios.a" -cp "$SIM_LIB" "$ARTIFACTS_DIR/libbioscript_sim.a" diff --git a/expo-bioscript/src/.gitkeep b/expo-bioscript/src/.gitkeep deleted file mode 100644 index 8b13789..0000000 --- a/expo-bioscript/src/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - diff --git a/expo-bioscript/src/ExpoBioscript.types.ts b/expo-bioscript/src/ExpoBioscript.types.ts deleted file mode 100644 index d8d8b00..0000000 --- a/expo-bioscript/src/ExpoBioscript.types.ts +++ /dev/null @@ -1,25 +0,0 @@ -export type BioscriptInputFormat = 'auto' | 'text' | 'zip' | 'vcf' | 'cram'; - -export type RunFileRequest = { - scriptPath: string; - root?: string; - inputFile?: string; - outputFile?: string; - participantId?: string; - traceReportPath?: string; - timingReportPath?: string; - inputFormat?: BioscriptInputFormat; - inputIndex?: string; - referenceFile?: string; - referenceIndex?: string; - autoIndex?: boolean; - cacheDir?: string; - maxDurationMs?: number; - maxMemoryBytes?: number; - maxAllocations?: number; - maxRecursionDepth?: number; -}; - -export type RunFileResult = { - ok: true; -}; diff --git a/expo-bioscript/src/ExpoBioscriptModule.ts b/expo-bioscript/src/ExpoBioscriptModule.ts deleted file mode 100644 index 34729a0..0000000 --- a/expo-bioscript/src/ExpoBioscriptModule.ts +++ /dev/null @@ -1,10 +0,0 @@ -import { NativeModule, requireNativeModule } from 'expo'; - -import type { RunFileRequest, RunFileResult } from './ExpoBioscript.types'; - -declare class ExpoBioscriptModule extends NativeModule { - isAvailable(): boolean; - runFile(request: RunFileRequest): Promise; -} - -export default requireNativeModule('ExpoBioscript'); diff --git a/expo-bioscript/src/index.ts b/expo-bioscript/src/index.ts deleted file mode 100644 index 4288984..0000000 --- a/expo-bioscript/src/index.ts +++ /dev/null @@ -1,13 +0,0 @@ -import 
ExpoBioscriptModule from './ExpoBioscriptModule'; - -import type { BioscriptInputFormat, RunFileRequest, RunFileResult } from './ExpoBioscript.types'; - -export type { BioscriptInputFormat, RunFileRequest, RunFileResult } from './ExpoBioscript.types'; - -export function isBioscriptAvailable(): boolean { - return ExpoBioscriptModule.isAvailable(); -} - -export function runFile(request: RunFileRequest): Promise { - return ExpoBioscriptModule.runFile(request); -} From 9a95338ba101135881e304688075e932e159fad3 Mon Sep 17 00:00:00 2001 From: keelan Date: Fri, 10 Apr 2026 14:08:01 +0100 Subject: [PATCH 3/4] split core and ffi --- rust/Cargo.lock | 22 +- rust/Cargo.toml | 2 +- rust/bioscript-core/Cargo.toml | 17 + rust/bioscript-core/src/genotype.rs | 1501 ++++++++++++++++++++++++++ rust/bioscript-core/src/lib.rs | 13 + rust/bioscript-core/src/prepare.rs | 298 +++++ rust/bioscript-core/src/runtime.rs | 1160 ++++++++++++++++++++ rust/bioscript-core/src/validator.rs | 315 ++++++ rust/bioscript-core/src/variant.rs | 52 + rust/bioscript-ffi/Cargo.toml | 17 + rust/bioscript-ffi/src/lib.rs | 288 +++++ rust/bioscript/Cargo.toml | 13 +- rust/bioscript/src/lib.rs | 303 +----- 13 files changed, 3687 insertions(+), 314 deletions(-) create mode 100644 rust/bioscript-core/Cargo.toml create mode 100644 rust/bioscript-core/src/genotype.rs create mode 100644 rust/bioscript-core/src/lib.rs create mode 100644 rust/bioscript-core/src/prepare.rs create mode 100644 rust/bioscript-core/src/runtime.rs create mode 100644 rust/bioscript-core/src/validator.rs create mode 100644 rust/bioscript-core/src/variant.rs create mode 100644 rust/bioscript-ffi/Cargo.toml create mode 100644 rust/bioscript-ffi/src/lib.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index c84ed0d..a62ab8c 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -137,16 +137,34 @@ dependencies = [ name = "bioscript" version = "0.1.0" dependencies = [ - "jni", + "bioscript-core", + "monty", + "zip", +] + +[[package]] +name = 
"bioscript-core" +version = "0.1.0" +dependencies = [ "monty", "noodles", "rust-htslib", "serde", - "serde_json", "serde_yaml", "zip", ] +[[package]] +name = "bioscript-ffi" +version = "0.1.0" +dependencies = [ + "bioscript-core", + "jni", + "monty", + "serde", + "serde_json", +] + [[package]] name = "bit-set" version = "0.8.0" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 82e822f..7ff4459 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "2" -members = ["bioscript"] +members = ["bioscript", "bioscript-core", "bioscript-ffi"] [profile.dev] opt-level = 2 diff --git a/rust/bioscript-core/Cargo.toml b/rust/bioscript-core/Cargo.toml new file mode 100644 index 0000000..dbafc0a --- /dev/null +++ b/rust/bioscript-core/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "bioscript-core" +version = "0.1.0" +edition = "2024" + +[dependencies] +monty = { path = "../../monty/crates/monty" } +noodles = { version = "0.104.0", features = ["bgzf"] } +zip = { version = "2.2.0", default-features = false, features = ["deflate"] } +serde = { version = "1.0", features = ["derive"] } +serde_yaml = "0.9.34" + +[target.'cfg(not(any(target_os = "ios", target_os = "tvos")))'.dependencies] +rust-htslib = "0.51.0" + +[lints.clippy] +pedantic = { level = "warn", priority = -1 } diff --git a/rust/bioscript-core/src/genotype.rs b/rust/bioscript-core/src/genotype.rs new file mode 100644 index 0000000..25f3432 --- /dev/null +++ b/rust/bioscript-core/src/genotype.rs @@ -0,0 +1,1501 @@ +use std::{ + collections::{BTreeSet, HashMap}, + fs::File, + io::{BufRead, BufReader}, + path::{Path, PathBuf}, + str::FromStr, +}; + +use noodles::bgzf; +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +use rust_htslib::bam::{self, Read}; +use zip::ZipArchive; + +use crate::runtime::RuntimeError; +use crate::variant::{VariantObservation, VariantSpec}; +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +use crate::variant::{Assembly, VariantKind}; + +const 
COMMENT_PREFIXES: [&str; 2] = ["#", "//"]; + +const RSID_ALIASES: &[&str] = &["rsid", "name", "snp", "marker", "id", "snpid"]; +const CHROM_ALIASES: &[&str] = &["chromosome", "chr", "chrom"]; +const POSITION_ALIASES: &[&str] = &[ + "position", + "pos", + "coordinate", + "basepairposition", + "basepair", +]; +const GENOTYPE_ALIASES: &[&str] = &[ + "genotype", + "gt", + "result", + "results", + "result1", + "call", + "calls", + "yourcode", + "code", + "genotypevalue", + "variation", +]; +const ALLELE1_ALIASES: &[&str] = &["allele1", "allelea", "allele_a", "allele1top"]; +const ALLELE2_ALIASES: &[&str] = &["allele2", "alleleb", "allele_b", "allele2top"]; + +#[derive(Debug, Clone)] +pub struct GenotypeStore { + backend: QueryBackend, +} + +#[derive(Debug, Clone)] +enum QueryBackend { + RsidMap(RsidMapBackend), + Delimited(DelimitedBackend), + Cram(CramBackend), +} + +#[derive(Debug, Clone)] +struct RsidMapBackend { + format: GenotypeSourceFormat, + values: HashMap, +} + +#[derive(Debug, Clone)] +struct DelimitedBackend { + format: GenotypeSourceFormat, + path: PathBuf, + zip_entry_name: Option, +} + +#[cfg_attr(any(target_os = "ios", target_os = "tvos"), allow(dead_code))] +#[derive(Debug, Clone)] +struct CramBackend { + path: PathBuf, + options: GenotypeLoadOptions, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum QueryKind { + GenotypeByRsid, + GenotypeByLocus, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GenomicLocus { + pub chrom: String, + pub start: i64, + pub end: i64, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct BackendCapabilities { + pub rsid_lookup: bool, + pub locus_lookup: bool, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum GenotypeSourceFormat { + Text, + Zip, + Vcf, + Cram, +} + +impl FromStr for GenotypeSourceFormat { + type Err = String; + + fn from_str(value: &str) -> Result { + match value.trim().to_ascii_lowercase().as_str() { + "txt" | "text" | "genotype" => Ok(Self::Text), + "zip" => 
Ok(Self::Zip), + "vcf" => Ok(Self::Vcf), + "cram" => Ok(Self::Cram), + other => Err(format!("unsupported input format: {other}")), + } + } +} + +#[derive(Debug, Clone, Default)] +pub struct GenotypeLoadOptions { + pub format: Option, + pub input_index: Option, + pub reference_file: Option, + pub reference_index: Option, +} + +impl GenotypeStore { + pub fn from_file(path: &Path) -> Result { + Self::from_file_with_options(path, &GenotypeLoadOptions::default()) + } + + pub fn from_file_with_options(path: &Path, options: &GenotypeLoadOptions) -> Result { + match detect_source_format(path, options.format)? { + GenotypeSourceFormat::Text => Ok(Self::from_delimited_file(path, GenotypeSourceFormat::Text, None)), + GenotypeSourceFormat::Zip => Self::from_zip_file(path), + GenotypeSourceFormat::Vcf => Self::from_vcf_file(path), + GenotypeSourceFormat::Cram => Self::from_cram_file(path, options), + } + } + + fn from_vcf_file(path: &Path) -> Result { + let lines = if path + .extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| ext.eq_ignore_ascii_case("gz")) + { + read_bgzf_lines(path)? + } else { + read_plain_lines(path)? 
+ }; + + Self::from_vcf_lines(lines) + } + + fn from_zip_file(path: &Path) -> Result { + let selected = select_zip_entry(path)?; + let lower = selected.to_ascii_lowercase(); + if lower.ends_with(".vcf") || lower.ends_with(".vcf.gz") { + let file = File::open(path).map_err(|err| { + RuntimeError::Io(format!("failed to open genotype zip {}: {err}", path.display())) + })?; + let mut archive = ZipArchive::new(file).map_err(|err| { + RuntimeError::Io(format!("failed to read genotype zip {}: {err}", path.display())) + })?; + let entry = archive.by_name(&selected).map_err(|err| { + RuntimeError::Io(format!( + "failed to open genotype entry {selected} in {}: {err}", + path.display() + )) + })?; + let lines = read_lines_from_reader(BufReader::new(entry), path)?; + return Self::from_vcf_lines(lines); + } + Ok(Self::from_delimited_file(path, GenotypeSourceFormat::Zip, Some(selected))) + } + + fn from_cram_file(path: &Path, options: &GenotypeLoadOptions) -> Result { + Ok(Self { + backend: QueryBackend::Cram(CramBackend { + path: path.to_path_buf(), + options: options.clone(), + }), + }) + } + + fn from_vcf_lines(lines: Vec) -> Result { + let mut values = HashMap::new(); + + for line in lines { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with("##") || trimmed.starts_with("#CHROM") { + continue; + } + + let fields: Vec<&str> = trimmed.split('\t').collect(); + if fields.len() < 10 { + continue; + } + + let rsid = fields[2].trim(); + if rsid.is_empty() || rsid == "." 
{ + continue; + } + + let reference = fields[3].trim(); + let alternates: Vec<&str> = fields[4] + .split(',') + .map(str::trim) + .filter(|alt| !alt.is_empty() && *alt != ".") + .collect(); + if reference.is_empty() || alternates.is_empty() { + continue; + } + + let sample_gt = fields[9].split(':').next().unwrap_or("."); + if let Some(genotype) = genotype_from_vcf_gt(sample_gt, reference, &alternates) { + values.insert(rsid.to_owned(), genotype); + } + } + + Ok(Self::from_rsid_map(GenotypeSourceFormat::Vcf, values)) + } + + fn from_rsid_map(format: GenotypeSourceFormat, values: HashMap) -> Self { + Self { + backend: QueryBackend::RsidMap(RsidMapBackend { format, values }), + } + } + + fn from_delimited_file( + path: &Path, + format: GenotypeSourceFormat, + zip_entry_name: Option, + ) -> Self { + Self { + backend: QueryBackend::Delimited(DelimitedBackend { + format, + path: path.to_path_buf(), + zip_entry_name, + }), + } + } + + pub fn capabilities(&self) -> BackendCapabilities { + match &self.backend { + QueryBackend::RsidMap(_) => BackendCapabilities { + rsid_lookup: true, + locus_lookup: false, + }, + QueryBackend::Delimited(_) => BackendCapabilities { + rsid_lookup: true, + locus_lookup: true, + }, + QueryBackend::Cram(_) => BackendCapabilities { + rsid_lookup: false, + locus_lookup: cfg!(not(any(target_os = "ios", target_os = "tvos"))), + }, + } + } + + pub fn supports(&self, query: QueryKind) -> bool { + let caps = self.capabilities(); + match query { + QueryKind::GenotypeByRsid => caps.rsid_lookup, + QueryKind::GenotypeByLocus => caps.locus_lookup, + } + } + + pub fn backend_name(&self) -> &'static str { + match &self.backend { + QueryBackend::RsidMap(map) => map.backend_name(), + QueryBackend::Delimited(backend) => backend.backend_name(), + QueryBackend::Cram(backend) => backend.backend_name(), + } + } + + pub fn get(&self, rsid: &str) -> Result, RuntimeError> { + match &self.backend { + QueryBackend::RsidMap(map) => Ok(map.values.get(rsid).cloned()), + 
QueryBackend::Delimited(backend) => backend.get(rsid), + QueryBackend::Cram(backend) => backend.lookup_variant(&VariantSpec { + rsids: vec![rsid.to_owned()], + ..VariantSpec::default() + }).map(|obs| obs.genotype), + } + } + + pub fn lookup_variant(&self, variant: &VariantSpec) -> Result { + match &self.backend { + QueryBackend::RsidMap(map) => map.lookup_variant(variant), + QueryBackend::Delimited(backend) => backend.lookup_variant(variant), + QueryBackend::Cram(backend) => backend.lookup_variant(variant), + } + } + + pub fn lookup_variants(&self, variants: &[VariantSpec]) -> Result, RuntimeError> { + if let QueryBackend::Delimited(backend) = &self.backend { + return backend.lookup_variants(variants); + } + let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); + indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); + + let mut results = vec![VariantObservation::default(); variants.len()]; + for (original_idx, variant) in indexed { + results[original_idx] = self.lookup_variant(variant)?; + } + Ok(results) + } +} + +impl RsidMapBackend { + fn backend_name(&self) -> &'static str { + match self.format { + GenotypeSourceFormat::Text => "text", + GenotypeSourceFormat::Zip => "zip", + GenotypeSourceFormat::Vcf => "vcf", + GenotypeSourceFormat::Cram => "cram", + } + } + + fn lookup_variant(&self, variant: &VariantSpec) -> Result { + for rsid in &variant.rsids { + if let Some(value) = self.values.get(rsid) { + return Ok(VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: Some(rsid.clone()), + genotype: Some(value.clone()), + evidence: vec![format!("resolved by rsid {rsid}")], + ..VariantObservation::default() + }); + } + } + + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + evidence: vec!["no matching rsid found".to_owned()], + ..VariantObservation::default() + }) + } +} + +#[derive(Debug, Clone)] +struct ParsedDelimitedRow { + rsid: Option, + chrom: Option, + position: Option, 
+ genotype: String, +} + +impl DelimitedBackend { + fn backend_name(&self) -> &'static str { + match self.format { + GenotypeSourceFormat::Text => "text", + GenotypeSourceFormat::Zip => "zip", + GenotypeSourceFormat::Vcf => "vcf", + GenotypeSourceFormat::Cram => "cram", + } + } + + fn get(&self, rsid: &str) -> Result, RuntimeError> { + let results = self.lookup_variants(&[VariantSpec { + rsids: vec![rsid.to_owned()], + ..VariantSpec::default() + }])?; + Ok(results.into_iter().next().and_then(|obs| obs.genotype)) + } + + fn lookup_variant(&self, variant: &VariantSpec) -> Result { + let mut results = self.lookup_variants(std::slice::from_ref(variant))?; + Ok(results.pop().unwrap_or_default()) + } + + fn lookup_variants(&self, variants: &[VariantSpec]) -> Result, RuntimeError> { + scan_delimited_variants(self, variants) + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +impl CramBackend { + fn backend_name(&self) -> &'static str { + "cram" + } + + fn lookup_variant(&self, variant: &VariantSpec) -> Result { + let Some(reference_file) = self.options.reference_file.as_ref() else { + return Err(RuntimeError::Unsupported(format!( + "backend '{}' cannot satisfy query '{}' for {} without --reference-file", + self.backend_name(), + describe_query(variant), + self.path.display() + ))); + }; + + let Some((assembly, locus)) = choose_variant_locus(variant, reference_file) else { + let mut detail = format!( + "backend '{}' cannot satisfy query '{}' for {} using reference {}", + self.backend_name(), + describe_query(variant), + self.path.display(), + reference_file.display() + ); + detail.push_str(". 
This backend needs GRCh37/GRCh38 coordinates, not only rsIDs"); + if let Some(reference_index) = self.options.reference_index.as_ref() { + detail.push_str(&format!(" (reference index {})", reference_index.display())); + } + if let Some(input_index) = self.options.input_index.as_ref() { + detail.push_str(&format!(" (input index {})", input_index.display())); + } + return Err(RuntimeError::Unsupported(detail)); + }; + + let observation = match variant.kind.unwrap_or(VariantKind::Other) { + VariantKind::Snp => self.observe_snp(variant, assembly, &locus, reference_file)?, + VariantKind::Deletion => self.observe_deletion(variant, assembly, &locus, reference_file)?, + VariantKind::Insertion | VariantKind::Indel | VariantKind::Other => { + return Err(RuntimeError::Unsupported(format!( + "backend '{}' does not yet support {:?} observation for {}", + self.backend_name(), + variant.kind.unwrap_or(VariantKind::Other), + self.path.display() + ))); + } + }; + + Ok(observation) + } + + fn observe_snp( + &self, + variant: &VariantSpec, + assembly: Assembly, + locus: &GenomicLocus, + reference_file: &Path, + ) -> Result { + let reference = variant + .reference + .as_deref() + .and_then(first_base) + .ok_or_else(|| RuntimeError::InvalidArguments("SNP variant requires ref/reference".to_owned()))?; + let alternate = variant + .alternate + .as_deref() + .and_then(first_base) + .ok_or_else(|| RuntimeError::InvalidArguments("SNP variant requires alt/alternate".to_owned()))?; + + let target_pos = locus.start; + let mut alt_count = 0u32; + let mut ref_count = 0u32; + let mut depth = 0u32; + + self.with_pileups(reference_file, locus, |pileup| { + let pos1 = i64::from(pileup.pos()) + 1; + if pos1 != target_pos { + return; + } + + for alignment in pileup.alignments() { + if alignment.is_del() || alignment.is_refskip() { + continue; + } + let Some(qpos) = alignment.qpos() else { + continue; + }; + let record = alignment.record(); + let bases = record.seq().as_bytes(); + let Some(base) = 
bases.get(qpos).copied() else { + continue; + }; + let base = (base as char).to_ascii_uppercase(); + depth += 1; + if base == reference { + ref_count += 1; + } else if base == alternate { + alt_count += 1; + } + } + })?; + + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly: Some(assembly), + genotype: infer_snp_genotype(reference, alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + evidence: vec![format!( + "observed SNP at {}:{} depth={} ref_count={} alt_count={}", + locus.chrom, target_pos, depth, ref_count, alt_count + )], + }) + } + + fn observe_deletion( + &self, + variant: &VariantSpec, + assembly: Assembly, + locus: &GenomicLocus, + reference_file: &Path, + ) -> Result { + let deletion_length = variant.deletion_length.ok_or_else(|| { + RuntimeError::InvalidArguments("deletion variant requires deletion_length".to_owned()) + })?; + let reference = variant.reference.clone().unwrap_or_else(|| "I".to_owned()); + let alternate = variant.alternate.clone().unwrap_or_else(|| "D".to_owned()); + let anchor_pos = locus.start.saturating_sub(1); + + let mut alt_count = 0u32; + let mut ref_count = 0u32; + let mut depth = 0u32; + + self.with_pileups(reference_file, &anchor_window(locus), |pileup| { + let pos1 = i64::from(pileup.pos()) + 1; + if pos1 != anchor_pos { + return; + } + + for alignment in pileup.alignments() { + if alignment.is_refskip() { + continue; + } + depth += 1; + match alignment.indel() { + bam::pileup::Indel::Del(len) if usize::try_from(len).ok() == Some(deletion_length) => { + alt_count += 1; + } + _ => { + ref_count += 1; + } + } + } + })?; + + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly: Some(assembly), + genotype: infer_copy_number_genotype(&reference, &alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), 
+ alt_count: Some(alt_count), + depth: Some(depth), + evidence: vec![format!( + "observed deletion anchor {}:{} len={} depth={} ref_count={} alt_count={}", + locus.chrom, anchor_pos, deletion_length, depth, ref_count, alt_count + )], + }) + } + + fn with_pileups( + &self, + reference_file: &Path, + locus: &GenomicLocus, + mut on_pileup: F, + ) -> Result<(), RuntimeError> + where + F: FnMut(&bam::pileup::Pileup), + { + if let Some(index_path) = self.options.input_index.as_ref() { + let mut reader = bam::IndexedReader::from_path_and_index(&self.path, index_path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open indexed CRAM {} with index {}: {err}", + self.path.display(), + index_path.display() + )) + })?; + reader.set_reference(reference_file).map_err(|err| { + RuntimeError::Io(format!( + "failed to set CRAM reference {} for {}: {err}", + reference_file.display(), + self.path.display() + )) + })?; + fetch_locus(&mut reader, locus)?; + for pileup in reader.pileup() { + let pileup = pileup.map_err(|err| { + RuntimeError::Io(format!("failed while piling up {}: {err}", self.path.display())) + })?; + on_pileup(&pileup); + } + return Ok(()); + } + + if self.path.with_extension("cram.crai").exists() || self.path.with_extension("crai").exists() { + let mut reader = bam::IndexedReader::from_path(&self.path).map_err(|err| { + RuntimeError::Io(format!("failed to open indexed CRAM {}: {err}", self.path.display())) + })?; + reader.set_reference(reference_file).map_err(|err| { + RuntimeError::Io(format!( + "failed to set CRAM reference {} for {}: {err}", + reference_file.display(), + self.path.display() + )) + })?; + fetch_locus(&mut reader, locus)?; + for pileup in reader.pileup() { + let pileup = pileup.map_err(|err| { + RuntimeError::Io(format!("failed while piling up {}: {err}", self.path.display())) + })?; + on_pileup(&pileup); + } + return Ok(()); + } + + let mut reader = bam::Reader::from_path(&self.path) + .map_err(|err| RuntimeError::Io(format!("failed to 
open CRAM {}: {err}", self.path.display())))?; + reader.set_reference(reference_file).map_err(|err| { + RuntimeError::Io(format!( + "failed to set CRAM reference {} for {}: {err}", + reference_file.display(), + self.path.display() + )) + })?; + + let target_tid = header_tid(reader.header(), &locus.chrom).ok_or_else(|| { + RuntimeError::Unsupported(format!( + "reference {} does not contain contig {} for {}", + self.path.display(), + locus.chrom, + describe_locus(locus) + )) + })?; + + for pileup in reader.pileup() { + let pileup = pileup.map_err(|err| { + RuntimeError::Io(format!("failed while piling up {}: {err}", self.path.display())) + })?; + if pileup.tid() != target_tid { + continue; + } + let pos1 = i64::from(pileup.pos()) + 1; + if pos1 < locus.start { + continue; + } + if pos1 > locus.end { + break; + } + on_pileup(&pileup); + } + + Ok(()) + } +} + +#[cfg(any(target_os = "ios", target_os = "tvos"))] +impl CramBackend { + fn backend_name(&self) -> &'static str { + "cram" + } + + fn lookup_variant(&self, _variant: &VariantSpec) -> Result { + Err(RuntimeError::Unsupported( + "CRAM/BAM-backed lookup is not supported on Apple mobile targets".to_owned(), + )) + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn choose_variant_locus(variant: &VariantSpec, reference_file: &Path) -> Option<(Assembly, GenomicLocus)> { + match detect_reference_assembly(reference_file) { + Some(Assembly::Grch38) => variant + .grch38 + .clone() + .map(|locus| (Assembly::Grch38, locus)) + .or_else(|| variant.grch37.clone().map(|locus| (Assembly::Grch37, locus))), + Some(Assembly::Grch37) => variant + .grch37 + .clone() + .map(|locus| (Assembly::Grch37, locus)) + .or_else(|| variant.grch38.clone().map(|locus| (Assembly::Grch38, locus))), + None => variant + .grch38 + .clone() + .map(|locus| (Assembly::Grch38, locus)) + .or_else(|| variant.grch37.clone().map(|locus| (Assembly::Grch37, locus))), + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn 
detect_reference_assembly(reference_file: &Path) -> Option { + let lower = reference_file.to_string_lossy().to_ascii_lowercase(); + if lower.contains("grch38") || lower.contains("hg38") || lower.contains("assembly38") { + Some(Assembly::Grch38) + } else if lower.contains("grch37") || lower.contains("hg19") || lower.contains("assembly37") { + Some(Assembly::Grch37) + } else { + None + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn fetch_locus(reader: &mut bam::IndexedReader, locus: &GenomicLocus) -> Result<(), RuntimeError> { + let tid = header_tid(reader.header(), &locus.chrom).ok_or_else(|| { + RuntimeError::Unsupported(format!( + "indexed CRAM does not contain contig {} for {}", + locus.chrom, + describe_locus(locus) + )) + })?; + + let start = locus.start.saturating_sub(1); + let end = locus.end; + reader.fetch((tid as i32, start, end)).map_err(|err| { + RuntimeError::Io(format!( + "failed to fetch {}:{}-{}: {err}", + locus.chrom, locus.start, locus.end + )) + }) +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn header_tid(header: &bam::HeaderView, chrom: &str) -> Option { + let candidates = [chrom.to_owned(), format!("chr{chrom}"), chrom.trim_start_matches("chr").to_owned()]; + candidates.iter().find_map(|candidate| header.tid(candidate.as_bytes())) +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn describe_locus(locus: &GenomicLocus) -> String { + format!("{}:{}-{}", locus.chrom, locus.start, locus.end) +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn anchor_window(locus: &GenomicLocus) -> GenomicLocus { + let anchor = locus.start.saturating_sub(1); + GenomicLocus { + chrom: locus.chrom.clone(), + start: anchor, + end: anchor, + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn first_base(value: &str) -> Option { + value.trim().chars().next().map(|ch| ch.to_ascii_uppercase()) +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn infer_snp_genotype( + reference: 
char, + alternate: char, + _ref_count: u32, + alt_count: u32, + depth: u32, +) -> Option { + if depth == 0 { + return None; + } + let alt_fraction = f64::from(alt_count) / f64::from(depth); + if alt_fraction >= 0.8 { + Some(format!("{alternate}{alternate}")) + } else if alt_fraction <= 0.2 { + Some(format!("{reference}{reference}")) + } else { + Some(format!("{reference}{alternate}")) + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn infer_copy_number_genotype( + reference: &str, + alternate: &str, + _ref_count: u32, + alt_count: u32, + depth: u32, +) -> Option { + if depth == 0 { + return None; + } + let alt_fraction = f64::from(alt_count) / f64::from(depth); + if alt_fraction >= 0.8 { + Some(format!("{alternate}{alternate}")) + } else if alt_fraction <= 0.2 { + Some(format!("{reference}{reference}")) + } else { + Some(format!("{reference}{alternate}")) + } +} + +fn describe_query(variant: &VariantSpec) -> &'static str { + if variant.has_coordinates() { + "variant_by_locus" + } else { + "variant_by_rsid" + } +} + +fn variant_sort_key(variant: &VariantSpec) -> (u8, String, i64, i64, String) { + if let Some(locus) = &variant.grch38 { + return ( + 0, + chrom_sort_key(&locus.chrom), + locus.start, + locus.end, + variant.rsids.first().cloned().unwrap_or_default(), + ); + } + if let Some(locus) = &variant.grch37 { + return ( + 1, + chrom_sort_key(&locus.chrom), + locus.start, + locus.end, + variant.rsids.first().cloned().unwrap_or_default(), + ); + } + ( + 2, + "~".to_owned(), + i64::MAX, + i64::MAX, + variant.rsids.first().cloned().unwrap_or_default(), + ) +} + +fn chrom_sort_key(raw: &str) -> String { + let chrom = raw.trim().strip_prefix("chr").unwrap_or(raw.trim()); + if let Ok(value) = chrom.parse::() { + return format!("{value:03}"); + } + match chrom.to_ascii_uppercase().as_str() { + "X" => "023".to_owned(), + "Y" => "024".to_owned(), + "M" | "MT" => "025".to_owned(), + other => format!("999-{other}"), + } +} + +#[derive(Debug, Clone, Copy)] 
+enum Delimiter { + Tab, + Comma, + Space, +} + +fn detect_delimiter(lines: &[String]) -> Delimiter { + for line in lines { + let trimmed = line.trim(); + if trimmed.is_empty() || COMMENT_PREFIXES.iter().any(|prefix| trimmed.starts_with(prefix)) { + continue; + } + if line.contains('\t') { + return Delimiter::Tab; + } + if line.contains(',') { + return Delimiter::Comma; + } + if trimmed.split_whitespace().count() > 1 { + return Delimiter::Space; + } + } + Delimiter::Tab +} + +#[allow(dead_code)] +struct RowParser { + delimiter: Delimiter, + header: Option>, + comment_header: Option>, + alias_map: HashMap<&'static str, BTreeSet<&'static str>>, +} + +#[allow(dead_code)] +impl RowParser { + fn new(delimiter: Delimiter) -> Self { + let mut alias_map = HashMap::new(); + alias_map.insert("rsid", RSID_ALIASES.iter().copied().collect()); + alias_map.insert("chromosome", CHROM_ALIASES.iter().copied().collect()); + alias_map.insert("position", POSITION_ALIASES.iter().copied().collect()); + alias_map.insert("genotype", GENOTYPE_ALIASES.iter().copied().collect()); + alias_map.insert("allele1", ALLELE1_ALIASES.iter().copied().collect()); + alias_map.insert("allele2", ALLELE2_ALIASES.iter().copied().collect()); + Self { + delimiter, + header: None, + comment_header: None, + alias_map, + } + } + + fn consume_line(&mut self, line: &str) -> Result, RuntimeError> { + Ok(self + .consume_record(line)? 
+ .and_then(|row| row.rsid.map(|rsid| (rsid, row.genotype)))) + } + + fn consume_record(&mut self, line: &str) -> Result, RuntimeError> { + let trimmed = line.trim(); + if trimmed.is_empty() { + return Ok(None); + } + + let trimmed = strip_bom(trimmed); + if let Some(prefix) = COMMENT_PREFIXES.iter().find(|prefix| trimmed.starts_with(**prefix)) { + let candidate = trimmed.trim_start_matches(prefix).trim(); + if !candidate.is_empty() { + let fields = self.parse_fields(candidate); + if self.looks_like_header(&fields) { + self.comment_header = Some(fields); + } + } + return Ok(None); + } + + let fields = self.parse_fields(strip_bom(line)); + if fields.is_empty() { + return Ok(None); + } + + if self.header.is_none() { + if self.looks_like_header(&fields) { + self.header = Some(fields); + return Ok(None); + } + if let Some(header) = self.comment_header.take() { + self.header = Some(header); + } else { + self.header = Some(self.default_header(fields.len())); + } + } + + let header = self.header.as_ref().expect("header initialized"); + let mut row_map = HashMap::new(); + for (idx, value) in fields.into_iter().enumerate() { + if idx >= header.len() { + continue; + } + row_map.insert(normalize_name(&header[idx]), strip_inline_comment(&value)); + } + + let rsid = self.lookup(&row_map, "rsid").filter(|value| !value.is_empty()); + let chrom = self.lookup(&row_map, "chromosome").filter(|value| !value.is_empty()); + let position = self.lookup(&row_map, "position").and_then(|value| value.parse::().ok()); + if rsid.is_none() && (chrom.is_none() || position.is_none()) { + return Ok(None); + } + + let genotype = if let Some(gt) = self.lookup(&row_map, "genotype") { + gt + } else { + let allele1 = self.lookup(&row_map, "allele1").unwrap_or_default(); + let allele2 = self.lookup(&row_map, "allele2").unwrap_or_default(); + format!("{allele1}{allele2}") + }; + + Ok(Some(ParsedDelimitedRow { + rsid, + chrom, + position, + genotype: normalize_genotype(&genotype), + })) + } + + fn 
fn parse_fields(&self, line: &str) -> Vec<String> {
        // Split one raw line into trimmed field strings using the detected delimiter.
        match self.delimiter {
            Delimiter::Tab => line.split('\t').map(|field| field.trim().to_owned()).collect(),
            Delimiter::Space => line.split_whitespace().map(str::to_owned).collect(),
            Delimiter::Comma => split_csv_line(line),
        }
    }

    /// Heuristic: a row is a header when its first cell is a known rsid alias.
    fn looks_like_header(&self, fields: &[String]) -> bool {
        fields.first().is_some_and(|first| {
            self.alias_map
                .get("rsid")
                .is_some_and(|aliases| aliases.contains(normalize_name(first).as_str()))
        })
    }

    /// Returns the first non-empty value found under any alias of `key`.
    fn lookup(&self, row_map: &HashMap<String, String>, key: &str) -> Option<String> {
        let aliases = self.alias_map.get(key)?;
        for alias in aliases {
            let normalized = normalize_name(alias);
            // `filter` keeps only non-empty hits.
            if let Some(value) = row_map.get(&normalized).filter(|value| !value.is_empty()) {
                return Some(value.clone());
            }
        }
        None
    }

    /// Synthesizes a positional header (`rsid chromosome position genotype`,
    /// then `extra_N`) for files that ship without one.
    fn default_header(&self, field_count: usize) -> Vec<String> {
        let base = ["rsid", "chromosome", "position", "genotype"];
        if field_count <= base.len() {
            base[..field_count].iter().map(|s| (*s).to_owned()).collect()
        } else {
            let mut header: Vec<String> = base.iter().map(|s| (*s).to_owned()).collect();
            for idx in 0..(field_count - header.len()) {
                header.push(format!("extra_{idx}"));
            }
            header
        }
    }
}

/// Removes a leading UTF-8 byte-order mark, if present.
fn strip_bom(value: &str) -> &str {
    value.strip_prefix('\u{feff}').unwrap_or(value)
}

/// Canonicalizes a column name: lowercase with spaces/underscores/dashes removed.
fn normalize_name(name: &str) -> String {
    name.trim()
        .to_ascii_lowercase()
        .chars()
        .filter(|ch| !matches!(ch, ' ' | '_' | '-'))
        .collect()
}

/// Truncates a field at the first `#` or `//` comment marker.
/// NOTE(review): a `#` inside legitimate field data would also be stripped —
/// acceptable for the genotype formats handled here, but worth confirming.
fn strip_inline_comment(value: &str) -> String {
    for marker in ["#", "//"] {
        if let Some(idx) = value.find(marker) {
            return value[..idx].trim().to_owned();
        }
    }
    value.trim().to_owned()
}

/// Normalizes a raw genotype cell: uppercase, no spaces, `--` for missing
/// values, `ID` for slash-separated calls containing an empty or `-` allele.
fn normalize_genotype(value: &str) -> String {
    let cleaned = value.trim().replace(' ', "").to_ascii_uppercase();
    if cleaned.is_empty() || matches!(cleaned.as_str(), "NA" | "N/A" | "#N/A" | "NONE") {
        return "--".to_owned();
    }
    if cleaned.contains('/') {
        let parts: Vec<&str> = cleaned.split('/').collect();
        if parts.iter().any(|part| part.is_empty() || *part == "-") {
            return "ID".to_owned();
        }
        return parts.concat();
    }
    cleaned
}

/// Minimal CSV splitter with double-quote awareness.
/// NOTE(review): doubled quotes (`""`) inside a quoted field are not
/// unescaped — they simply toggle quoting twice, dropping the characters.
fn split_csv_line(line: &str) -> Vec<String> {
    let mut fields = Vec::new();
    let mut current = String::new();
    let mut in_quotes = false;

    // The scan needs no lookahead, so the original `.peekable()` was dead
    // weight and has been removed.
    for ch in line.chars() {
        match ch {
            '"' => in_quotes = !in_quotes,
            ',' if !in_quotes => {
                fields.push(current.trim().to_owned());
                current.clear();
            }
            _ => current.push(ch),
        }
    }
    fields.push(current.trim().to_owned());
    fields
}

/// Reads every line of a plain-text genotype file into memory.
fn read_plain_lines(path: &Path) -> Result<Vec<String>, RuntimeError> {
    let file = File::open(path)
        .map_err(|err| RuntimeError::Io(format!("failed to open genotype file {}: {err}", path.display())))?;
    read_lines_from_reader(BufReader::new(file), path)
}

/// Picks the zip entry most likely to hold genotype data: the first entry
/// with a recognized extension, otherwise the first file entry of any kind.
fn select_zip_entry(path: &Path) -> Result<String, RuntimeError> {
    let file = File::open(path)
        .map_err(|err| RuntimeError::Io(format!("failed to open genotype zip {}: {err}", path.display())))?;
    let mut archive = ZipArchive::new(file)
        .map_err(|err| RuntimeError::Io(format!("failed to read genotype zip {}: {err}", path.display())))?;

    let mut selected_name: Option<String> = None;
    for idx in 0..archive.len() {
        let entry = archive.by_index(idx).map_err(|err| {
            RuntimeError::Io(format!("failed to inspect genotype zip {}: {err}", path.display()))
        })?;
        if entry.is_dir() {
            continue;
        }
        let name = entry.name().to_owned();
        let lower = name.to_ascii_lowercase();
        if lower.ends_with(".txt")
            || lower.ends_with(".csv")
            || lower.ends_with(".tsv")
            || lower.ends_with(".vcf")
            || lower.ends_with(".vcf.gz")
        {
            return Ok(name);
        }
        if selected_name.is_none() {
            selected_name = Some(name);
        }
    }

    selected_name.ok_or_else(|| {
        RuntimeError::Unsupported(format!(
            "zip archive {} does not contain a supported genotype file",
            path.display()
        ))
    })
}
&VariantSpec)> = variants.iter().enumerate().collect(); + indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); + + let mut rsid_targets: HashMap> = HashMap::new(); + let mut coord_targets: HashMap<(String, i64), Vec> = HashMap::new(); + let mut results = vec![VariantObservation::default(); variants.len()]; + let mut unresolved = variants.len(); + + for (idx, variant) in &indexed { + for rsid in &variant.rsids { + rsid_targets.entry(rsid.clone()).or_default().push(*idx); + } + if let Some(locus) = variant.grch38.as_ref().or(variant.grch37.as_ref()) { + coord_targets + .entry((locus.chrom.trim_start_matches("chr").to_ascii_lowercase(), locus.start)) + .or_default() + .push(*idx); + } + } + + let mut scan_reader = |reader: &mut dyn BufRead| -> Result<(), RuntimeError> { + let mut probe_lines = Vec::new(); + let mut buf = String::new(); + for _ in 0..8 { + buf.clear(); + let bytes = reader.read_line(&mut buf).map_err(|err| { + RuntimeError::Io(format!("failed to read genotype stream {}: {err}", backend.path.display())) + })?; + if bytes == 0 { + break; + } + probe_lines.push(buf.trim_end_matches(['\n', '\r']).to_owned()); + } + + let delimiter = detect_delimiter(&probe_lines); + let mut column_indexes: Option = None; + let mut comment_header: Option> = None; + + let mut process_line = |line: &str| -> Result { + let Some(row) = parse_streaming_row( + line, + delimiter, + &mut column_indexes, + &mut comment_header, + )? 
else { + return Ok(unresolved == 0); + }; + + if let Some(rsid) = row.rsid.as_ref() + && let Some(target_indexes) = rsid_targets.get(rsid) + { + for &target_idx in target_indexes { + if results[target_idx].genotype.is_none() { + results[target_idx] = VariantObservation { + backend: backend.backend_name().to_owned(), + matched_rsid: Some(rsid.clone()), + genotype: Some(row.genotype.clone()), + evidence: vec![format!("resolved by rsid {rsid}")], + ..VariantObservation::default() + }; + unresolved = unresolved.saturating_sub(1); + } + } + } + + if unresolved == 0 { + return Ok(true); + } + + if let (Some(chrom), Some(position)) = (row.chrom.as_ref(), row.position) { + let key = (chrom.trim_start_matches("chr").to_ascii_lowercase(), position); + if let Some(target_indexes) = coord_targets.get(&key) { + for &target_idx in target_indexes { + if results[target_idx].genotype.is_none() { + results[target_idx] = VariantObservation { + backend: backend.backend_name().to_owned(), + matched_rsid: row.rsid.clone(), + genotype: Some(row.genotype.clone()), + evidence: vec![format!("resolved by locus {}:{}", chrom, position)], + ..VariantObservation::default() + }; + unresolved = unresolved.saturating_sub(1); + } + } + } + } + Ok(unresolved == 0) + }; + + for line in &probe_lines { + if process_line(line)? { + return Ok(()); + } + } + + loop { + buf.clear(); + let bytes = reader.read_line(&mut buf).map_err(|err| { + RuntimeError::Io(format!("failed to read genotype stream {}: {err}", backend.path.display())) + })?; + if bytes == 0 { + break; + } + if process_line(buf.trim_end_matches(['\n', '\r']))? 
{ + break; + } + } + Ok(()) + }; + + match backend.format { + GenotypeSourceFormat::Text => { + let file = File::open(&backend.path).map_err(|err| { + RuntimeError::Io(format!("failed to open genotype file {}: {err}", backend.path.display())) + })?; + let mut reader = BufReader::new(file); + scan_reader(&mut reader)?; + } + GenotypeSourceFormat::Zip => { + let entry_name = backend.zip_entry_name.as_ref().ok_or_else(|| { + RuntimeError::Unsupported(format!( + "zip backend missing selected entry for {}", + backend.path.display() + )) + })?; + let file = File::open(&backend.path).map_err(|err| { + RuntimeError::Io(format!("failed to open genotype zip {}: {err}", backend.path.display())) + })?; + let mut archive = ZipArchive::new(file).map_err(|err| { + RuntimeError::Io(format!("failed to read genotype zip {}: {err}", backend.path.display())) + })?; + let entry = archive.by_name(entry_name).map_err(|err| { + RuntimeError::Io(format!( + "failed to open genotype entry {entry_name} in {}: {err}", + backend.path.display() + )) + })?; + let mut reader = BufReader::new(entry); + scan_reader(&mut reader)?; + } + _ => { + return Err(RuntimeError::Unsupported( + "streaming delimited backend only supports text and zip".to_owned(), + )) + } + } + + for (idx, variant) in indexed { + if results[idx].genotype.is_none() { + results[idx] = VariantObservation { + backend: backend.backend_name().to_owned(), + evidence: vec![format!("no matching rsid or locus found for {}", describe_query(variant))], + ..VariantObservation::default() + }; + } + } + + Ok(results) +} + +#[derive(Debug, Clone, Copy)] +struct DelimitedColumnIndexes { + rsid: Option, + chrom: Option, + position: Option, + genotype: Option, + allele1: Option, + allele2: Option, +} + +fn parse_streaming_row( + line: &str, + delimiter: Delimiter, + column_indexes: &mut Option, + comment_header: &mut Option>, +) -> Result, RuntimeError> { + let trimmed = line.trim(); + if trimmed.is_empty() { + return Ok(None); + } + + let 
trimmed = strip_bom(trimmed); + if let Some(prefix) = COMMENT_PREFIXES.iter().find(|prefix| trimmed.starts_with(**prefix)) { + let candidate = trimmed.trim_start_matches(prefix).trim(); + if !candidate.is_empty() { + let fields = parse_owned_fields(candidate, delimiter); + if looks_like_header_fields(&fields) { + *comment_header = Some(fields); + } + } + return Ok(None); + } + + let fields = parse_owned_fields(strip_bom(line), delimiter); + if fields.is_empty() { + return Ok(None); + } + + if column_indexes.is_none() { + if looks_like_header_fields(&fields) { + *column_indexes = Some(build_column_indexes(&fields)); + return Ok(None); + } + if let Some(header) = comment_header.take() { + *column_indexes = Some(build_column_indexes(&header)); + } else { + *column_indexes = Some(default_column_indexes(fields.len())); + } + } + + let indexes = column_indexes.expect("streaming column indexes initialized"); + let rsid = indexes + .rsid + .and_then(|idx| fields.get(idx)) + .map(|value| strip_inline_comment(value).trim().to_owned()) + .filter(|value| !value.is_empty()); + let chrom = indexes + .chrom + .and_then(|idx| fields.get(idx)) + .map(|value| strip_inline_comment(value).trim().to_owned()) + .filter(|value| !value.is_empty()); + let position = indexes + .position + .and_then(|idx| fields.get(idx)) + .and_then(|value| strip_inline_comment(value).trim().parse::().ok()); + if rsid.is_none() && (chrom.is_none() || position.is_none()) { + return Ok(None); + } + + let genotype = if let Some(idx) = indexes.genotype { + fields.get(idx).map(|value| strip_inline_comment(value)).unwrap_or_default().to_owned() + } else { + let allele1 = indexes + .allele1 + .and_then(|idx| fields.get(idx)) + .map(|value| strip_inline_comment(value)) + .unwrap_or_default(); + let allele2 = indexes + .allele2 + .and_then(|idx| fields.get(idx)) + .map(|value| strip_inline_comment(value)) + .unwrap_or_default(); + format!("{allele1}{allele2}") + }; + + Ok(Some(ParsedDelimitedRow { + rsid, + chrom, + 
position, + genotype: normalize_genotype(&genotype), + })) +} + +fn parse_owned_fields(line: &str, delimiter: Delimiter) -> Vec { + match delimiter { + Delimiter::Tab => line.split('\t').map(|field| field.trim().to_owned()).collect(), + Delimiter::Space => line.split_whitespace().map(str::to_owned).collect(), + Delimiter::Comma => split_csv_line(line), + } +} + +fn looks_like_header_fields(fields: &[String]) -> bool { + fields.first().is_some_and(|first| RSID_ALIASES.contains(&normalize_name(first).as_str())) +} + +fn build_column_indexes(header: &[String]) -> DelimitedColumnIndexes { + DelimitedColumnIndexes { + rsid: find_header_index(header, RSID_ALIASES), + chrom: find_header_index(header, CHROM_ALIASES), + position: find_header_index(header, POSITION_ALIASES), + genotype: find_header_index(header, GENOTYPE_ALIASES), + allele1: find_header_index(header, ALLELE1_ALIASES), + allele2: find_header_index(header, ALLELE2_ALIASES), + } +} + +fn default_column_indexes(field_count: usize) -> DelimitedColumnIndexes { + DelimitedColumnIndexes { + rsid: (field_count > 0).then_some(0), + chrom: (field_count > 1).then_some(1), + position: (field_count > 2).then_some(2), + genotype: (field_count > 3).then_some(3), + allele1: None, + allele2: None, + } +} + +fn find_header_index(header: &[String], aliases: &[&str]) -> Option { + header + .iter() + .position(|field| aliases.iter().any(|alias| normalize_name(field) == normalize_name(alias))) +} + +fn read_bgzf_lines(path: &Path) -> Result, RuntimeError> { + let file = File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to open genotype file {}: {err}", path.display())))?; + let reader = bgzf::io::Reader::new(file); + read_lines_from_reader(BufReader::new(reader), path) +} + +fn read_lines_from_reader(mut reader: R, path: &Path) -> Result, RuntimeError> { + let mut lines = Vec::new(); + let mut buf = String::new(); + loop { + buf.clear(); + let bytes = reader + .read_line(&mut buf) + .map_err(|err| 
RuntimeError::Io(format!("failed to read genotype file {}: {err}", path.display())))?; + if bytes == 0 { + break; + } + lines.push(buf.trim_end_matches(['\n', '\r']).to_owned()); + } + Ok(lines) +} + +fn detect_source_format( + path: &Path, + forced: Option, +) -> Result { + if let Some(format) = forced { + return Ok(format); + } + + let lower = path.to_string_lossy().to_ascii_lowercase(); + if lower.ends_with(".zip") { + return Ok(GenotypeSourceFormat::Zip); + } + if lower.ends_with(".cram") { + return Ok(GenotypeSourceFormat::Cram); + } + if lower.ends_with(".vcf") || lower.ends_with(".vcf.gz") { + return Ok(GenotypeSourceFormat::Vcf); + } + + let lines = read_plain_lines(path)?; + if looks_like_vcf_lines(&lines) { + Ok(GenotypeSourceFormat::Vcf) + } else { + Ok(GenotypeSourceFormat::Text) + } +} + +fn looks_like_vcf_lines(lines: &[String]) -> bool { + lines.iter().any(|line| { + let trimmed = line.trim_start(); + trimmed.starts_with("##fileformat=VCF") || trimmed.starts_with("#CHROM\t") + }) +} + +fn genotype_from_vcf_gt(gt: &str, reference: &str, alternates: &[&str]) -> Option { + if matches!(gt.trim(), "" | "." | "./." 
| ".|.") { + return Some("--".to_owned()); + } + + let cleaned = gt.trim().replace('|', "/"); + let parts: Vec<&str> = cleaned.split('/').collect(); + if parts.len() != 2 || parts.iter().any(|part| *part == ".") { + return Some("--".to_owned()); + } + + let ref_token = vcf_reference_token(reference, alternates); + let mut out = String::new(); + for part in parts { + let idx = part.parse::().ok()?; + if idx == 0 { + out.push_str(&ref_token); + } else { + let alt = alternates.get(idx - 1)?; + out.push_str(&vcf_alt_token(reference, alt)); + } + } + + Some(normalize_genotype(&out)) +} + +fn vcf_reference_token(reference: &str, alternates: &[&str]) -> String { + let mut saw_shorter = false; + let mut saw_longer = false; + + for alt in alternates { + match alt.len().cmp(&reference.len()) { + std::cmp::Ordering::Less => saw_shorter = true, + std::cmp::Ordering::Greater => saw_longer = true, + std::cmp::Ordering::Equal => {} + } + } + + match (saw_shorter, saw_longer) { + (true, false) => "I".to_owned(), + (false, true) => "D".to_owned(), + _ => normalize_sequence_token(reference), + } +} + +fn vcf_alt_token(reference: &str, alternate: &str) -> String { + match alternate.len().cmp(&reference.len()) { + std::cmp::Ordering::Less => "D".to_owned(), + std::cmp::Ordering::Greater => "I".to_owned(), + std::cmp::Ordering::Equal => normalize_sequence_token(alternate), + } +} + +fn normalize_sequence_token(value: &str) -> String { + value.trim().to_ascii_uppercase() +} diff --git a/rust/bioscript-core/src/lib.rs b/rust/bioscript-core/src/lib.rs new file mode 100644 index 0000000..7ada152 --- /dev/null +++ b/rust/bioscript-core/src/lib.rs @@ -0,0 +1,13 @@ +pub mod genotype; +pub mod prepare; +pub mod runtime; +pub mod validator; +pub mod variant; + +pub use genotype::{ + BackendCapabilities, GenomicLocus, GenotypeLoadOptions, GenotypeSourceFormat, QueryKind, +}; +pub use prepare::{PrepareRequest, PreparedPaths, prepare_indexes, shell_flags}; +pub use runtime::{BioscriptRuntime, 
RuntimeConfig, RuntimeError, StageTiming}; +pub use validator::{FileReport, Issue, Severity, ValidationReport, validate_variants_path}; +pub use variant::{Assembly, VariantKind, VariantObservation, VariantSpec}; diff --git a/rust/bioscript-core/src/prepare.rs b/rust/bioscript-core/src/prepare.rs new file mode 100644 index 0000000..6eb380e --- /dev/null +++ b/rust/bioscript-core/src/prepare.rs @@ -0,0 +1,298 @@ +use std::{ + fs, + path::{Path, PathBuf}, +}; + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +use std::{ + collections::hash_map::DefaultHasher, + hash::{Hash, Hasher}, +}; + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +use rust_htslib::{bam, faidx}; + +use crate::genotype::GenotypeSourceFormat; + +#[derive(Debug, Clone, Default)] +pub struct PrepareRequest { + pub root: PathBuf, + pub cwd: PathBuf, + pub cache_dir: PathBuf, + pub input_file: Option, + pub input_format: Option, + pub reference_file: Option, +} + +#[derive(Debug, Clone, Default)] +pub struct PreparedPaths { + pub input_file: Option, + pub input_index: Option, + pub reference_file: Option, + pub reference_index: Option, + pub cache_dir: PathBuf, +} + +pub fn prepare_indexes(request: &PrepareRequest) -> Result { + let root = canonical_dir(&request.root)?; + let cache_dir = resolve_cache_dir(&request.cwd, &request.cache_dir); + fs::create_dir_all(&cache_dir) + .map_err(|err| format!("failed to create cache dir {}: {err}", cache_dir.display()))?; + + let input_file = request + .input_file + .as_deref() + .map(|value| resolve_rooted_path(&root, value)) + .transpose()?; + let reference_file = request + .reference_file + .as_deref() + .map(|value| resolve_rooted_path(&root, value)) + .transpose()?; + + let input_index = match (&input_file, request.input_format) { + (Some(path), Some(GenotypeSourceFormat::Cram)) => Some(ensure_alignment_index(path, &cache_dir)?), + (Some(path), None) if detect_alignment_input(path) => Some(ensure_alignment_index(path, &cache_dir)?), + _ => 
None, + }; + + let (prepared_reference_file, reference_index) = match reference_file { + Some(path) => { + let (reference_file, reference_index) = ensure_reference_index(&path, &cache_dir)?; + (Some(reference_file), Some(reference_index)) + } + None => (None, None), + }; + + Ok(PreparedPaths { + input_file, + input_index, + reference_file: prepared_reference_file, + reference_index, + cache_dir, + }) +} + +fn canonical_dir(path: &Path) -> Result { + path.canonicalize() + .map_err(|err| format!("failed to canonicalize {}: {err}", path.display())) +} + +fn resolve_rooted_path(root: &Path, raw: &str) -> Result { + let raw_path = Path::new(raw); + let resolved = if raw_path.is_absolute() { + raw_path.to_path_buf() + } else { + root.join(raw_path) + }; + let canonical = resolved + .canonicalize() + .map_err(|err| format!("failed to resolve {}: {err}", resolved.display()))?; + if !canonical.starts_with(root) { + return Err(format!( + "path escapes bioscript root: {}", + canonical.display() + )); + } + Ok(canonical) +} + +fn resolve_cache_dir(cwd: &Path, cache_dir: &Path) -> PathBuf { + if cache_dir.is_absolute() { + cache_dir.to_path_buf() + } else { + cwd.join(cache_dir) + } +} + +fn detect_alignment_input(path: &Path) -> bool { + let lower = path.to_string_lossy().to_ascii_lowercase(); + lower.ends_with(".cram") || lower.ends_with(".bam") +} + +fn ensure_alignment_index(path: &Path, cache_dir: &Path) -> Result { + #[cfg(any(target_os = "ios", target_os = "tvos"))] + { + let _ = (path, cache_dir); + return Err("alignment indexing is not supported on Apple mobile targets".to_owned()); + } + + #[cfg(not(any(target_os = "ios", target_os = "tvos")))] + { + if let Some(existing) = adjacent_alignment_index(path) { + return Ok(existing); + } + + let ext = if path + .extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| ext.eq_ignore_ascii_case("cram")) + { + "csi" + } else { + "bai" + }; + let out = cache_dir.join(format!("{}.{ext}", stable_stem(path))); + if 
out.exists() { + return Ok(out); + } + + let idx_type = if ext == "bai" { + bam::index::Type::Bai + } else { + bam::index::Type::Csi(14) + }; + bam::index::build(path, Some(&out), idx_type, 1).map_err(|err| { + format!( + "failed to build alignment index {} for {}: {err}", + out.display(), + path.display() + ) + })?; + Ok(out) + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn adjacent_alignment_index(path: &Path) -> Option { + let lower = path.to_string_lossy().to_ascii_lowercase(); + let candidates = if lower.ends_with(".cram") { + vec![ + path.with_extension("cram.crai"), + path.with_extension("crai"), + path.with_extension("cram.csi"), + path.with_extension("csi"), + ] + } else { + vec![path.with_extension("bam.bai"), path.with_extension("bai"), path.with_extension("csi")] + }; + + candidates.into_iter().find(|candidate| candidate.exists()) +} + +fn ensure_reference_index(path: &Path, cache_dir: &Path) -> Result<(PathBuf, PathBuf), String> { + #[cfg(any(target_os = "ios", target_os = "tvos"))] + { + let _ = (path, cache_dir); + return Err("reference indexing is not supported on Apple mobile targets".to_owned()); + } + + #[cfg(not(any(target_os = "ios", target_os = "tvos")))] + { + let adjacent = adjacent_reference_index(path); + if let Some(index) = adjacent { + return Ok((path.to_path_buf(), index)); + } + + let cached_reference = cache_dir.join(cache_reference_name(path)); + if !cached_reference.exists() { + create_reference_link(path, &cached_reference)?; + } + + let cached_index = adjacent_reference_index(&cached_reference) + .unwrap_or_else(|| cached_reference_index_path(&cached_reference)); + if !cached_index.exists() { + faidx::build(&cached_reference).map_err(|err| { + format!( + "failed to build FASTA index {} for {}: {err}", + cached_index.display(), + cached_reference.display() + ) + })?; + } + + Ok((cached_reference, cached_index)) + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn adjacent_reference_index(path: 
&Path) -> Option { + let candidate = cached_reference_index_path(path); + candidate.exists().then_some(candidate) +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn cached_reference_index_path(path: &Path) -> PathBuf { + if let Some(ext) = path.extension().and_then(|ext| ext.to_str()) { + path.with_extension(format!("{ext}.fai")) + } else { + path.with_extension("fai") + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn create_reference_link(source: &Path, target: &Path) -> Result<(), String> { + if let Some(parent) = target.parent() { + fs::create_dir_all(parent) + .map_err(|err| format!("failed to create cache dir {}: {err}", parent.display()))?; + } + + #[cfg(unix)] + { + std::os::unix::fs::symlink(source, target).map_err(|err| { + format!( + "failed to create cached reference link {} -> {}: {err}", + target.display(), + source.display() + ) + })?; + Ok(()) + } + + #[cfg(not(unix))] + { + fs::copy(source, target).map_err(|err| { + format!( + "failed to copy cached reference {} -> {}: {err}", + source.display(), + target.display() + ) + })?; + Ok(()) + } +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn stable_stem(path: &Path) -> String { + let mut hasher = DefaultHasher::new(); + path.to_string_lossy().hash(&mut hasher); + let hash = hasher.finish(); + let file_name = path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or("input") + .replace(['/', ' ', ':'], "_"); + format!("{file_name}-{hash:016x}") +} + +#[cfg(not(any(target_os = "ios", target_os = "tvos")))] +fn cache_reference_name(path: &Path) -> String { + let file_name = path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or("reference.fa") + .replace(['/', ' ', ':'], "_"); + format!("{}-ref", stable_stem(Path::new(&file_name))) +} + +pub fn shell_flags(prepared: &PreparedPaths) -> String { + let mut parts = Vec::new(); + if let Some(path) = &prepared.input_file { + parts.push(format!("--input-file {}", shell_quote(path))); + } + if 
let Some(path) = &prepared.input_index { + parts.push(format!("--input-index {}", shell_quote(path))); + } + if let Some(path) = &prepared.reference_file { + parts.push(format!("--reference-file {}", shell_quote(path))); + } + if let Some(path) = &prepared.reference_index { + parts.push(format!("--reference-index {}", shell_quote(path))); + } + parts.join(" ") +} + +fn shell_quote(path: &Path) -> String { + let text = path.to_string_lossy(); + format!("'{}'", text.replace('\'', "'\"'\"'")) +} diff --git a/rust/bioscript-core/src/runtime.rs b/rust/bioscript-core/src/runtime.rs new file mode 100644 index 0000000..b5a1e03 --- /dev/null +++ b/rust/bioscript-core/src/runtime.rs @@ -0,0 +1,1160 @@ +use std::{ + collections::{BTreeMap, HashMap}, + error::Error, + fmt, + fs, + path::{Component, Path, PathBuf}, + sync::{ + Arc, Mutex, + atomic::{AtomicU64, Ordering}, + }, + time::{Duration, Instant}, +}; + +use monty::{ + LimitedTracker, MontyException, MontyObject, MontyRun, NameLookupResult, PrintWriter, ResourceLimits, + RunProgress, +}; + +use crate::genotype::{GenotypeLoadOptions, GenotypeStore, GenomicLocus}; +use crate::variant::{VariantKind, VariantSpec}; + +type HostFunction = + fn(&BioscriptRuntime, &[MontyObject], &[(MontyObject, MontyObject)]) -> Result; + +#[derive(Debug, Clone)] +pub struct RuntimeConfig { + pub limits: ResourceLimits, + pub loader: GenotypeLoadOptions, +} + +impl Default for RuntimeConfig { + fn default() -> Self { + let limits = ResourceLimits::new() + .max_duration(Duration::from_millis(100)) + .max_memory(8 * 1024 * 1024) + .max_allocations(200_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)); + Self { + limits, + loader: GenotypeLoadOptions::default(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RuntimeError { + Monty(String), + Unsupported(String), + InvalidArguments(String), + Io(String), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StageTiming { + pub stage: String, + pub duration_ms: u128, + 
pub detail: String, +} + +impl fmt::Display for RuntimeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Monty(msg) | Self::Unsupported(msg) | Self::InvalidArguments(msg) | Self::Io(msg) => { + f.write_str(msg) + } + } + } +} + +impl Error for RuntimeError {} + +impl From for RuntimeError { + fn from(value: MontyException) -> Self { + Self::Monty(value.to_string()) + } +} + +struct RuntimeState { + next_handle: AtomicU64, + genotype_files: Mutex>, + trace_lines: Mutex>, + timings: Mutex>, +} + +impl RuntimeState { + fn new() -> Self { + Self { + next_handle: AtomicU64::new(1), + genotype_files: Mutex::new(HashMap::new()), + trace_lines: Mutex::new(Vec::new()), + timings: Mutex::new(Vec::new()), + } + } + + fn next_handle(&self) -> u64 { + self.next_handle.fetch_add(1, Ordering::Relaxed) + } +} + +#[derive(Clone)] +pub struct BioscriptRuntime { + root: PathBuf, + config: RuntimeConfig, + functions: BTreeMap<&'static str, HostFunction>, + state: Arc, +} + +impl BioscriptRuntime { + pub fn new(root: impl Into) -> Result { + Self::with_config(root, RuntimeConfig::default()) + } + + pub fn with_config(root: impl Into, config: RuntimeConfig) -> Result { + let root = root.into(); + let canonical_root = root.canonicalize().map_err(|err| { + RuntimeError::Io(format!("failed to canonicalize bioscript root {}: {err}", root.display())) + })?; + + let mut functions = BTreeMap::new(); + functions.insert("read_text", host_read_text as HostFunction); + functions.insert("write_text", host_write_text as HostFunction); + functions.insert("__bioscript_trace__", host_trace as HostFunction); + + Ok(Self { + root: canonical_root, + config, + functions, + state: Arc::new(RuntimeState::new()), + }) + } + + #[must_use] + pub fn root(&self) -> &Path { + &self.root + } + + #[must_use] + pub fn config(&self) -> &RuntimeConfig { + &self.config + } + + pub fn run_file( + &self, + script_path: impl AsRef, + trace_report_path: Option<&Path>, + mut 
extra_inputs: Vec<(&str, MontyObject)>, + ) -> Result { + let run_started = Instant::now(); + let script_path = script_path.as_ref(); + let code = fs::read_to_string(script_path).map_err(|err| { + RuntimeError::Io(format!("failed to read script {}: {err}", script_path.display())) + })?; + let instrumented = instrument_source(&code); + self.state.trace_lines.lock().expect("trace mutex poisoned").clear(); + self.state.timings.lock().expect("timings mutex poisoned").clear(); + + extra_inputs.push(("__name__", MontyObject::String("__main__".to_owned()))); + extra_inputs.push(( + "__file__", + MontyObject::String(script_path.display().to_string()), + )); + extra_inputs.push(("bioscript", bioscript_object())); + + let result = self.run_script(&instrumented, &script_path.display().to_string(), extra_inputs)?; + + if let Some(report_path) = trace_report_path { + self.write_trace_report(report_path, &code)?; + } + self.record_timing( + "run_file_total", + run_started.elapsed(), + format!("script={}", script_path.display()), + ); + + Ok(result) + } + + #[must_use] + pub fn timing_snapshot(&self) -> Vec { + self.state.timings.lock().expect("timings mutex poisoned").clone() + } + + pub fn run_script( + &self, + code: &str, + script_name: &str, + inputs: Vec<(&str, MontyObject)>, + ) -> Result { + let input_names = inputs.iter().map(|(name, _)| (*name).to_owned()).collect(); + let input_values = inputs.into_iter().map(|(_, value)| value).collect(); + let runner = MontyRun::new(code.to_owned(), script_name, input_names)?; + let tracker = LimitedTracker::new(self.config.limits.clone()); + let mut progress = runner.start(input_values, tracker, PrintWriter::Stdout)?; + + loop { + progress = match progress { + RunProgress::Complete(value) => return Ok(value), + RunProgress::NameLookup(lookup) => { + let result = if self.functions.contains_key(lookup.name.as_str()) { + NameLookupResult::Value(MontyObject::Function { + name: lookup.name.clone(), + docstring: None, + }) + } else { + 
NameLookupResult::Undefined + }; + lookup.resume(result, PrintWriter::Stdout)? + } + RunProgress::FunctionCall(call) => { + if call.method_call { + let result = self.dispatch_method_call(&call.function_name, &call.args, &call.kwargs)?; + call.resume(result, PrintWriter::Stdout)? + } else { + let Some(handler) = self.functions.get(call.function_name.as_str()) else { + return Err(RuntimeError::Unsupported(format!( + "unknown bioscript host function: {}", + call.function_name + ))); + }; + let result = handler(self, &call.args, &call.kwargs)?; + call.resume(result, PrintWriter::Stdout)? + } + } + RunProgress::ResolveFutures(state) => { + return Err(RuntimeError::Unsupported(format!( + "async futures are not supported in bioscript runtime: {:?}", + state.pending_call_ids() + ))); + } + RunProgress::OsCall(call) => { + return Err(RuntimeError::Unsupported(format!("OS call {} is blocked", call.function))); + } + }; + } + } + + fn dispatch_method_call( + &self, + method_name: &str, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + let class_name = match args.first() { + Some(MontyObject::Dataclass { name, .. 
}) => name.as_str(), + _ => "", + }; + + match (class_name, method_name) { + ("Bioscript", "load_genotypes") => self.method_load_genotypes(args, kwargs), + ("Bioscript", "variant") => self.method_variant(args, kwargs), + ("Bioscript", "query_plan") => self.method_query_plan(args, kwargs), + ("Bioscript", "write_tsv") => self.method_write_tsv(args, kwargs), + ("Bioscript", "read_text") => self.method_read_text(args, kwargs), + ("Bioscript", "write_text") => self.method_write_text(args, kwargs), + ("GenotypeFile", "get") => self.method_genotype_get(args, kwargs), + ("GenotypeFile", "lookup_variant") => self.method_genotype_lookup_variant(args, kwargs), + ("GenotypeFile", "lookup_variants") => self.method_genotype_lookup_variants(args, kwargs), + _ => Err(RuntimeError::Unsupported(format!( + "'{class_name}' object has no attribute '{method_name}'" + ))), + } + } + + fn method_load_genotypes( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + let started = Instant::now(); + reject_kwargs(kwargs, "bioscript.load_genotypes")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "bioscript.load_genotypes expects self and path".to_owned(), + )); + } + let path = self.resolve_user_path(&expect_string_arg(args, 1, "bioscript.load_genotypes")?)?; + let loader = self.resolved_loader_options()?; + let store = GenotypeStore::from_file_with_options(&path, &loader)?; + let handle = self.state.next_handle(); + self.state + .genotype_files + .lock() + .expect("genotype mutex poisoned") + .insert(handle, store); + self.record_timing( + "load_genotypes", + started.elapsed(), + format!("path={}", path.display()), + ); + Ok(genotype_file_object(handle)) + } + + fn resolved_loader_options(&self) -> Result { + let mut loader = self.config.loader.clone(); + loader.input_index = resolve_optional_loader_path(self, loader.input_index)?; + loader.reference_file = resolve_optional_loader_path(self, loader.reference_file)?; + 
        // NOTE(review): tail of a loader-configuration method whose start is
        // outside this hunk view; kept verbatim.
        loader.reference_index = resolve_optional_loader_path(self, loader.reference_index)?;
        Ok(loader)
    }

    /// `GenotypeFile.get(rsid)`: raw genotype string lookup by rsid.
    ///
    /// Resolves the dataclass handle back to the store registered in
    /// `state.genotype_files`; missing rsids map to `None`. Errors on kwargs,
    /// wrong arity, or an unknown handle.
    fn method_genotype_get(
        &self,
        args: &[MontyObject],
        kwargs: &[(MontyObject, MontyObject)],
    ) -> Result<MontyObject, RuntimeError> {
        reject_kwargs(kwargs, "GenotypeFile.get")?;
        if args.len() != 2 {
            return Err(RuntimeError::InvalidArguments(
                "GenotypeFile.get expects self and rsid".to_owned(),
            ));
        }
        let handle = dataclass_handle_id(&args[0], "GenotypeFile")?;
        let rsid = expect_string_arg(args, 1, "GenotypeFile.get")?;
        let guard = self.state.genotype_files.lock().expect("genotype mutex poisoned");
        let Some(store) = guard.get(&handle) else {
            return Err(RuntimeError::InvalidArguments(format!(
                "unknown genotype handle: {handle}"
            )));
        };
        Ok(match store.get(&rsid)? {
            Some(value) => MontyObject::String(value),
            None => MontyObject::None,
        })
    }

    /// `bioscript.variant(...)`: builds a frozen `Variant` dataclass purely
    /// from keyword arguments (only `self` is accepted positionally).
    fn method_variant(
        &self,
        args: &[MontyObject],
        kwargs: &[(MontyObject, MontyObject)],
    ) -> Result<MontyObject, RuntimeError> {
        if args.len() != 1 {
            return Err(RuntimeError::InvalidArguments(
                "bioscript.variant expects only self as a positional argument".to_owned(),
            ));
        }
        let spec = variant_spec_from_kwargs(kwargs)?;
        Ok(variant_object(&spec))
    }

    /// `bioscript.query_plan(variants)`: normalizes a list of `Variant`
    /// objects (or an existing plan) into a frozen `VariantPlan` dataclass.
    fn method_query_plan(
        &self,
        args: &[MontyObject],
        kwargs: &[(MontyObject, MontyObject)],
    ) -> Result<MontyObject, RuntimeError> {
        reject_kwargs(kwargs, "bioscript.query_plan")?;
        if args.len() != 2 {
            return Err(RuntimeError::InvalidArguments(
                "bioscript.query_plan expects self and a list of variants".to_owned(),
            ));
        }
        let variants = variant_specs_from_plan(&args[1])?;
        Ok(variant_plan_object(&variants))
    }

    /// `GenotypeFile.lookup_variant(variant)`: single-variant genotype lookup.
    ///
    /// Records a `lookup_variant` stage timing tagged with the pipe-joined
    /// rsids of the spec.
    fn method_genotype_lookup_variant(
        &self,
        args: &[MontyObject],
        kwargs: &[(MontyObject, MontyObject)],
    ) -> Result<MontyObject, RuntimeError> {
        let started = Instant::now();
        reject_kwargs(kwargs, "GenotypeFile.lookup_variant")?;
        if args.len() != 2 {
            return Err(RuntimeError::InvalidArguments(
                "GenotypeFile.lookup_variant expects self and variant".to_owned(),
            ));
        }
        let handle = dataclass_handle_id(&args[0], "GenotypeFile")?;
        let spec = dataclass_to_variant_spec(&args[1])?;
        let guard = self.state.genotype_files.lock().expect("genotype mutex poisoned");
        let Some(store) = guard.get(&handle) else {
            return Err(RuntimeError::InvalidArguments(format!(
                "unknown genotype handle: {handle}"
            )));
        };
        let observation = store.lookup_variant(&spec)?;
        self.record_timing(
            "lookup_variant",
            started.elapsed(),
            format!("rsids={}", spec.rsids.join("|")),
        );
        Ok(match observation.genotype {
            Some(value) => MontyObject::String(value),
            None => MontyObject::None,
        })
    }

    /// `GenotypeFile.lookup_variants(plan)`: batched lookup; returns one list
    /// entry per input variant (genotype string or `None`), preserving order.
    fn method_genotype_lookup_variants(
        &self,
        args: &[MontyObject],
        kwargs: &[(MontyObject, MontyObject)],
    ) -> Result<MontyObject, RuntimeError> {
        let started = Instant::now();
        reject_kwargs(kwargs, "GenotypeFile.lookup_variants")?;
        if args.len() != 2 {
            return Err(RuntimeError::InvalidArguments(
                "GenotypeFile.lookup_variants expects self and a variant plan".to_owned(),
            ));
        }
        let handle = dataclass_handle_id(&args[0], "GenotypeFile")?;
        let specs = variant_specs_from_plan(&args[1])?;
        let guard = self.state.genotype_files.lock().expect("genotype mutex poisoned");
        let Some(store) = guard.get(&handle) else {
            return Err(RuntimeError::InvalidArguments(format!(
                "unknown genotype handle: {handle}"
            )));
        };
        let observations = store.lookup_variants(&specs)?;
        self.record_timing(
            "lookup_variants",
            started.elapsed(),
            format!("count={}", specs.len()),
        );
        Ok(MontyObject::List(
            observations
                .into_iter()
                .map(|observation| match observation.genotype {
                    Some(value) => MontyObject::String(value),
                    None => MontyObject::None,
                })
                .collect(),
        ))
    }

    /// `bioscript.write_tsv(path, rows)`: writes dict rows as TSV under the
    /// sandboxed runtime root, creating parent directories as needed.
    ///
    /// Headers are taken from the FIRST row's keys (BTreeMap order, so
    /// deterministic). NOTE(review): keys that appear only in later rows are
    /// silently dropped and missing keys render as empty cells — confirm that
    /// is the intended contract.
    fn method_write_tsv(
        &self,
        args: &[MontyObject],
        kwargs: &[(MontyObject, MontyObject)],
    ) -> Result<MontyObject, RuntimeError> {
        let started = Instant::now();
        reject_kwargs(kwargs, "bioscript.write_tsv")?;
        if args.len() != 3 {
            return Err(RuntimeError::InvalidArguments(
                "bioscript.write_tsv expects self, path, rows".to_owned(),
            ));
        }
        let path = self.resolve_user_path(&expect_string_arg(args, 1, "bioscript.write_tsv")?)?;
        let rows = expect_rows(&args[2])?;
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent).map_err(|err| {
                RuntimeError::Io(format!("failed to create parent dir {}: {err}", parent.display()))
            })?;
        }
        let mut output = String::new();
        if let Some(first) = rows.first() {
            let headers: Vec<String> = first.keys().cloned().collect();
            output.push_str(&headers.join("\t"));
            output.push('\n');
            for row in &rows {
                let values: Vec<String> = headers
                    .iter()
                    .map(|header| row.get(header).cloned().unwrap_or_default())
                    .collect();
                output.push_str(&values.join("\t"));
                output.push('\n');
            }
        }
        fs::write(&path, output)
            .map_err(|err| RuntimeError::Io(format!("failed to write {}: {err}", path.display())))?;
        self.record_timing(
            "write_tsv",
            started.elapsed(),
            format!("path={} rows={}", path.display(), rows.len()),
        );
        Ok(MontyObject::None)
    }

    /// Appends one `StageTiming` entry to the shared timing log.
    fn record_timing(&self, stage: &str, duration: Duration, detail: String) {
        self.state
            .timings
            .lock()
            .expect("timings mutex poisoned")
            .push(StageTiming {
                stage: stage.to_owned(),
                duration_ms: duration.as_millis(),
                detail,
            });
    }

    /// `bioscript.read_text(path)`: strips `self` and delegates to
    /// `host_read_text`.
    fn method_read_text(
        &self,
        args: &[MontyObject],
        kwargs: &[(MontyObject, MontyObject)],
    ) -> Result<MontyObject, RuntimeError> {
        if args.is_empty() {
            return Err(RuntimeError::InvalidArguments(
                "bioscript.read_text expects self and path".to_owned(),
            ));
        }
        host_read_text(self, &args[1..], kwargs)
    }

    /// `bioscript.write_text(path, text)`: strips `self` and delegates to
    /// `host_write_text`.
    fn method_write_text(
        &self,
        args: &[MontyObject],
        kwargs: &[(MontyObject, MontyObject)],
    ) -> Result<MontyObject, RuntimeError> {
        if args.is_empty() {
            return Err(RuntimeError::InvalidArguments(
                "bioscript.write_text expects self, path, text".to_owned(),
            ));
        }
        host_write_text(self, &args[1..], kwargs)
    }

    /// Resolves a script-supplied path against the runtime root.
    ///
    /// Sandboxing: absolute paths and any `..`/root/prefix component are
    /// rejected so scripts cannot escape `self.root`.
    fn resolve_user_path(&self, raw_path: &str) -> Result<PathBuf, RuntimeError> {
        let path = Path::new(raw_path);
        if path.is_absolute() {
            return Err(RuntimeError::InvalidArguments(format!(
                "absolute paths are not allowed: {raw_path}"
            )));
        }
        for component in path.components() {
            match component {
                Component::ParentDir | Component::RootDir | Component::Prefix(_) => {
                    return Err(RuntimeError::InvalidArguments(format!(
                        "path escapes bioscript root: {raw_path}"
                    )));
                }
                Component::CurDir | Component::Normal(_) => {}
            }
        }
        Ok(self.root.join(path))
    }

    /// Writes the recorded execution trace as a TSV report: one row per traced
    /// step with the 1-based line number, its source text, and — when a rsid
    /// or genomic coordinate is recognizable in the statement — a lookup key
    /// plus a browser URL for it.
    fn write_trace_report(&self, report_path: &Path, original_code: &str) -> Result<(), RuntimeError> {
        let trace_lines = self.state.trace_lines.lock().expect("trace mutex poisoned").clone();
        let lines: Vec<&str> = original_code.lines().collect();
        let mut output = String::from("step\tline\tcode\tlookup_key\tlookup_url\n");
        for (idx, line_no) in trace_lines.iter().enumerate() {
            let source = lines.get(line_no.saturating_sub(1)).copied().unwrap_or("");
            let trimmed = source.trim();
            // Reassemble the whole (possibly multi-line) statement so rsids or
            // coordinates split across continuation lines are still found.
            let statement = statement_context(&lines, *line_no);
            let (lookup_key, lookup_url) = trace_lookup_metadata(&statement);
            output.push_str(&format!(
                "{}\t{}\t{}\t{}\t{}\n",
                idx + 1,
                line_no,
                trimmed,
                lookup_key.unwrap_or_default(),
                lookup_url.unwrap_or_default()
            ));
        }
        if let Some(parent) = report_path.parent() {
            fs::create_dir_all(parent).map_err(|err| {
                RuntimeError::Io(format!("failed to create parent dir {}: {err}", parent.display()))
            })?;
        }
        fs::write(report_path, output)
            .map_err(|err| RuntimeError::Io(format!("failed to write {}: {err}", report_path.display())))?;
        Ok(())
    }
}

/// Derives a lookup key and reference URL from one traced statement: rsids
/// link to NCBI dbSNP; `chrom:pos` loci link to Ensembl, using the GRCh37
/// mirror host when the statement mentions grch37/hg19.
fn trace_lookup_metadata(source: &str) -> (Option<String>, Option<String>) {
    if let Some(rsid) = extract_rsid(source) {
        let url = format!("https://www.ncbi.nlm.nih.gov/snp/{rsid}");
        return (Some(rsid), Some(url));
    }

    if let Some(coord) = extract_coordinate(source) {
        let lower = source.to_ascii_lowercase();
        let host = if lower.contains("grch37") || lower.contains("hg19") {
            "https://grch37.ensembl.org"
        } else {
            "https://www.ensembl.org"
        };
+ let url = format!("{host}/Homo_sapiens/Location/View?r={coord}"); + return (Some(coord), Some(url)); + } + + (None, None) +} + +fn statement_context(lines: &[&str], line_no: usize) -> String { + let Some(start_idx) = line_no.checked_sub(1) else { + return String::new(); + }; + let Some(first_line) = lines.get(start_idx) else { + return String::new(); + }; + + let mut out = String::from(first_line.trim()); + let mut depth = update_nesting_depth(0, first_line); + let mut current = start_idx + 1; + + while depth > 0 { + let Some(line) = lines.get(current) else { + break; + }; + if !out.is_empty() { + out.push(' '); + } + out.push_str(line.trim()); + depth = update_nesting_depth(depth, line); + current += 1; + } + + out +} + +fn extract_rsid(source: &str) -> Option { + let chars: Vec = source.chars().collect(); + let len = chars.len(); + let mut idx = 0; + while idx + 2 <= len { + if chars[idx] == 'r' + && chars.get(idx + 1) == Some(&'s') + && (idx == 0 || !chars[idx - 1].is_ascii_alphanumeric()) + { + let mut end = idx + 2; + while end < len && chars[end].is_ascii_digit() { + end += 1; + } + if end > idx + 2 { + return Some(chars[idx..end].iter().collect()); + } + } + idx += 1; + } + None +} + +fn extract_coordinate(source: &str) -> Option { + for token in source + .split(|ch: char| ch.is_whitespace() || matches!(ch, '"' | '\'' | ',' | ')' | '(' | '[' | ']' | '{' | '}')) + { + let cleaned = token.trim_matches(|ch: char| matches!(ch, ';')); + let normalized = cleaned.strip_prefix("chr").unwrap_or(cleaned); + if let Some((chrom, rest)) = normalized.split_once(':') + && !chrom.is_empty() + && chrom.chars().all(|ch| ch.is_ascii_alphanumeric()) + { + if let Some((start, end)) = rest.split_once('-') { + if start.chars().all(|ch| ch.is_ascii_digit()) && end.chars().all(|ch| ch.is_ascii_digit()) { + return Some(format!("{chrom}:{start}-{end}")); + } + } else if rest.chars().all(|ch| ch.is_ascii_digit()) { + return Some(format!("{chrom}:{rest}-{rest}")); + } + } + } + None 
+} + +fn bioscript_object() -> MontyObject { + MontyObject::Dataclass { + name: "Bioscript".to_owned(), + type_id: 1, + field_names: vec![], + attrs: vec![].into(), + frozen: true, + } +} + +fn genotype_file_object(handle_id: u64) -> MontyObject { + MontyObject::Dataclass { + name: "GenotypeFile".to_owned(), + type_id: 2, + field_names: vec!["handle_id".to_owned()], + attrs: vec![( + MontyObject::String("handle_id".to_owned()), + MontyObject::Int(handle_id as i64), + )] + .into(), + frozen: true, + } +} + +fn variant_object(spec: &VariantSpec) -> MontyObject { + let mut attrs = Vec::new(); + attrs.push(( + MontyObject::String("rsids".to_owned()), + MontyObject::List(spec.rsids.iter().cloned().map(MontyObject::String).collect()), + )); + if let Some(locus) = &spec.grch37 { + attrs.push(( + MontyObject::String("grch37".to_owned()), + MontyObject::String(format!("{}:{}-{}", locus.chrom, locus.start, locus.end)), + )); + } + if let Some(locus) = &spec.grch38 { + attrs.push(( + MontyObject::String("grch38".to_owned()), + MontyObject::String(format!("{}:{}-{}", locus.chrom, locus.start, locus.end)), + )); + } + if let Some(reference) = &spec.reference { + attrs.push(( + MontyObject::String("reference".to_owned()), + MontyObject::String(reference.clone()), + )); + } + if let Some(alternate) = &spec.alternate { + attrs.push(( + MontyObject::String("alternate".to_owned()), + MontyObject::String(alternate.clone()), + )); + } + if let Some(kind) = spec.kind { + attrs.push(( + MontyObject::String("kind".to_owned()), + MontyObject::String(variant_kind_name(kind).to_owned()), + )); + } + if let Some(length) = spec.deletion_length { + attrs.push(( + MontyObject::String("deletion_length".to_owned()), + MontyObject::Int(length as i64), + )); + } + if !spec.motifs.is_empty() { + attrs.push(( + MontyObject::String("motifs".to_owned()), + MontyObject::List(spec.motifs.iter().cloned().map(MontyObject::String).collect()), + )); + } + + MontyObject::Dataclass { + name: 
            "Variant".to_owned(),
        type_id: 3,
        // field_names always lists the full schema even when some attrs are
        // omitted above.
        field_names: vec![
            "rsids".to_owned(),
            "grch37".to_owned(),
            "grch38".to_owned(),
            "reference".to_owned(),
            "alternate".to_owned(),
            "kind".to_owned(),
            "deletion_length".to_owned(),
            "motifs".to_owned(),
        ],
        attrs: attrs.into(),
        frozen: true,
    }
}

/// Wraps a slice of specs in a frozen `VariantPlan` dataclass whose single
/// `variants` attribute is a list of `Variant` objects.
fn variant_plan_object(variants: &[VariantSpec]) -> MontyObject {
    MontyObject::Dataclass {
        name: "VariantPlan".to_owned(),
        type_id: 4,
        field_names: vec!["variants".to_owned()],
        attrs: vec![(
            MontyObject::String("variants".to_owned()),
            MontyObject::List(variants.iter().map(variant_object).collect()),
        )]
        .into(),
        frozen: true,
    }
}

/// Extracts the `handle_id` attribute from a dataclass named `expected_name`;
/// errors when the object has the wrong shape or the attribute is missing.
fn dataclass_handle_id(obj: &MontyObject, expected_name: &str) -> Result<u64, RuntimeError> {
    match obj {
        MontyObject::Dataclass { name, attrs, .. } if name == expected_name => {
            for (key, value) in attrs {
                if matches!(key, MontyObject::String(text) if text == "handle_id")
                    && let MontyObject::Int(id) = value
                {
                    return Ok(*id as u64);
                }
            }
            Err(RuntimeError::InvalidArguments(format!(
                "{expected_name} missing handle_id"
            )))
        }
        _ => Err(RuntimeError::InvalidArguments(format!(
            "expected {expected_name} object"
        ))),
    }
}

/// Converts a frozen `Variant` dataclass back into a `VariantSpec`.
/// Unknown attribute keys (and non-string keys) are ignored so slightly
/// different object shapes still interoperate.
fn dataclass_to_variant_spec(obj: &MontyObject) -> Result<VariantSpec, RuntimeError> {
    let MontyObject::Dataclass { name, attrs, .. } = obj else {
        return Err(RuntimeError::InvalidArguments("expected Variant object".to_owned()));
    };
    if name != "Variant" {
        return Err(RuntimeError::InvalidArguments(format!("expected Variant object, got {name}")));
    }

    let mut spec = VariantSpec::default();
    for (key, value) in attrs {
        let MontyObject::String(key) = key else {
            continue;
        };
        match key.as_str() {
            "rsids" => spec.rsids = string_list_from_object(value)?,
            "grch37" => spec.grch37 = string_from_optional(value)?.map(|v| parse_locus_string(&v)).transpose()?,
            "grch38" => spec.grch38 = string_from_optional(value)?.map(|v| parse_locus_string(&v)).transpose()?,
            "reference" => spec.reference = string_from_optional(value)?,
            "alternate" => spec.alternate = string_from_optional(value)?,
            "kind" => spec.kind = string_from_optional(value)?.as_deref().map(parse_variant_kind).transpose()?,
            "deletion_length" => spec.deletion_length = int_from_optional(value)?.map(|v| v as usize),
            "motifs" => spec.motifs = string_list_from_object(value)?,
            _ => {}
        }
    }
    Ok(spec)
}

/// Accepts either a plain list of `Variant` dataclasses or a `VariantPlan`
/// wrapper and returns the flattened spec list.
fn variant_specs_from_plan(obj: &MontyObject) -> Result<Vec<VariantSpec>, RuntimeError> {
    match obj {
        MontyObject::List(items) => items.iter().map(dataclass_to_variant_spec).collect(),
        MontyObject::Dataclass { name, attrs, .. } if name == "VariantPlan" => {
            for (key, value) in attrs {
                if matches!(key, MontyObject::String(text) if text == "variants") {
                    // Recurse: the wrapped value is itself a list of Variants.
                    return variant_specs_from_plan(value);
                }
            }
            Err(RuntimeError::InvalidArguments(
                "VariantPlan missing variants".to_owned(),
            ))
        }
        _ => Err(RuntimeError::InvalidArguments(
            "expected a list of Variant objects or a VariantPlan".to_owned(),
        )),
    }
}

/// Builds a `VariantSpec` from `bioscript.variant` keyword arguments.
/// Aliases accepted: `rsid`/`rsids`, `ref`/`reference`, `alt`/`alternate`;
/// any other keyword is an error (unlike `dataclass_to_variant_spec`).
fn variant_spec_from_kwargs(kwargs: &[(MontyObject, MontyObject)]) -> Result<VariantSpec, RuntimeError> {
    let mut spec = VariantSpec::default();
    for (key, value) in kwargs {
        let MontyObject::String(key) = key else {
            return Err(RuntimeError::InvalidArguments(
                "bioscript.variant keyword names must be strings".to_owned(),
            ));
        };
        match key.as_str() {
            "rsid" | "rsids" => spec.rsids = string_or_list(value)?,
            "grch37" => spec.grch37 = string_from_optional(value)?.map(|v| parse_locus_string(&v)).transpose()?,
            "grch38" => spec.grch38 = string_from_optional(value)?.map(|v| parse_locus_string(&v)).transpose()?,
            "ref" | "reference" => spec.reference = string_from_optional(value)?,
            "alt" | "alternate" => spec.alternate = string_from_optional(value)?,
            "kind" => spec.kind = string_from_optional(value)?.as_deref().map(parse_variant_kind).transpose()?,
            "deletion_length" => spec.deletion_length = int_from_optional(value)?.map(|v| v as usize),
            "motifs" => spec.motifs = string_or_list(value)?,
            other => {
                return Err(RuntimeError::InvalidArguments(format!(
                    "bioscript.variant does not accept keyword '{other}'"
                )))
            }
        }
    }
    Ok(spec)
}

/// Parses `chrom:start[-end]` (optional leading `chr`) into a `GenomicLocus`;
/// a single position yields start == end.
fn parse_locus_string(value: &str) -> Result<GenomicLocus, RuntimeError> {
    let normalized = value.trim().strip_prefix("chr").unwrap_or(value.trim());
    let Some((chrom, rest)) = normalized.split_once(':') else {
        return Err(RuntimeError::InvalidArguments(format!("invalid locus string: {value}")));
    };
    let (start, end) = if let Some((start, end)) = rest.split_once('-') {
        (start, end)
    } else {
        (rest, rest)
    };
    // NOTE(review): the parse turbofish type was lost in the patch rendering;
    // inference from the GenomicLocus field types applies here.
    let start = start
        .parse()
        .map_err(|err|
RuntimeError::InvalidArguments(format!("invalid locus start {value}: {err}")))?; + let end = end + .parse::() + .map_err(|err| RuntimeError::InvalidArguments(format!("invalid locus end {value}: {err}")))?; + Ok(GenomicLocus { + chrom: chrom.to_owned(), + start, + end, + }) +} + +fn parse_variant_kind(value: &str) -> Result { + match value.trim().to_ascii_lowercase().as_str() { + "snp" => Ok(VariantKind::Snp), + "insertion" | "ins" => Ok(VariantKind::Insertion), + "deletion" | "del" => Ok(VariantKind::Deletion), + "indel" => Ok(VariantKind::Indel), + "other" => Ok(VariantKind::Other), + other => Err(RuntimeError::InvalidArguments(format!("invalid variant kind: {other}"))), + } +} + +fn variant_kind_name(kind: VariantKind) -> &'static str { + match kind { + VariantKind::Snp => "snp", + VariantKind::Insertion => "insertion", + VariantKind::Deletion => "deletion", + VariantKind::Indel => "indel", + VariantKind::Other => "other", + } +} + +fn string_or_list(value: &MontyObject) -> Result, RuntimeError> { + match value { + MontyObject::String(text) => Ok(vec![text.clone()]), + MontyObject::List(_) => string_list_from_object(value), + MontyObject::None => Ok(Vec::new()), + other => Err(RuntimeError::InvalidArguments(format!( + "expected string or list of strings, got {other:?}" + ))), + } +} + +fn string_list_from_object(value: &MontyObject) -> Result, RuntimeError> { + match value { + MontyObject::List(items) => { + let mut out = Vec::new(); + for item in items { + let MontyObject::String(text) = item else { + return Err(RuntimeError::InvalidArguments( + "expected list of strings".to_owned(), + )); + }; + out.push(text.clone()); + } + Ok(out) + } + MontyObject::None => Ok(Vec::new()), + other => Err(RuntimeError::InvalidArguments(format!( + "expected list of strings, got {other:?}" + ))), + } +} + +fn string_from_optional(value: &MontyObject) -> Result, RuntimeError> { + match value { + MontyObject::None => Ok(None), + MontyObject::String(text) => Ok(Some(text.clone())), 
+ other => Err(RuntimeError::InvalidArguments(format!( + "expected optional string, got {other:?}" + ))), + } +} + +fn int_from_optional(value: &MontyObject) -> Result, RuntimeError> { + match value { + MontyObject::None => Ok(None), + MontyObject::Int(v) => Ok(Some(*v)), + other => Err(RuntimeError::InvalidArguments(format!( + "expected optional int, got {other:?}" + ))), + } +} + +fn reject_kwargs(kwargs: &[(MontyObject, MontyObject)], function_name: &str) -> Result<(), RuntimeError> { + if kwargs.is_empty() { + Ok(()) + } else { + Err(RuntimeError::InvalidArguments(format!( + "{function_name} does not accept keyword arguments" + ))) + } +} + +fn resolve_optional_loader_path( + runtime: &BioscriptRuntime, + path: Option, +) -> Result, RuntimeError> { + path.map(|path| { + if path.is_absolute() { + Ok(path) + } else { + runtime.resolve_user_path(&path.to_string_lossy()) + } + }) + .transpose() +} + +fn expect_string_arg(args: &[MontyObject], index: usize, function_name: &str) -> Result { + let Some(value) = args.get(index) else { + return Err(RuntimeError::InvalidArguments(format!( + "{function_name} missing argument at position {index}" + ))); + }; + match value { + MontyObject::String(text) => Ok(text.clone()), + other => Err(RuntimeError::InvalidArguments(format!( + "{function_name} expected str at position {index}, got {other:?}" + ))), + } +} + +fn expect_rows(value: &MontyObject) -> Result>, RuntimeError> { + let MontyObject::List(rows) = value else { + return Err(RuntimeError::InvalidArguments( + "write_tsv expects a list of dict rows".to_owned(), + )); + }; + + let mut out = Vec::new(); + for row in rows { + let MontyObject::Dict(dict) = row else { + return Err(RuntimeError::InvalidArguments( + "write_tsv row must be a dict".to_owned(), + )); + }; + let mut mapped = BTreeMap::new(); + for (key, value) in dict { + let MontyObject::String(key) = key else { + return Err(RuntimeError::InvalidArguments( + "write_tsv dict keys must be strings".to_owned(), + )); 
            };
            mapped.insert(key.clone(), stringify_value(value));
        }
        out.push(mapped);
    }
    Ok(out)
}

/// TSV cell rendering: `None` -> empty string, strings verbatim, ints and
/// bools via `to_string`, everything else through its `Display` impl.
fn stringify_value(value: &MontyObject) -> String {
    match value {
        MontyObject::None => String::new(),
        MontyObject::String(text) => text.clone(),
        MontyObject::Int(v) => v.to_string(),
        MontyObject::Bool(v) => v.to_string(),
        other => format!("{other}"),
    }
}

/// Host hook behind `bioscript.read_text`: sandbox-resolves `args[0]` and
/// returns the file contents as a string.
fn host_read_text(
    runtime: &BioscriptRuntime,
    args: &[MontyObject],
    kwargs: &[(MontyObject, MontyObject)],
) -> Result<MontyObject, RuntimeError> {
    reject_kwargs(kwargs, "read_text")?;
    let path = runtime.resolve_user_path(&expect_string_arg(args, 0, "read_text")?)?;
    let content = fs::read_to_string(&path)
        .map_err(|err| RuntimeError::Io(format!("failed to read {}: {err}", path.display())))?;
    Ok(MontyObject::String(content))
}

/// Host hook behind `bioscript.write_text`: writes `args[1]` to the
/// sandbox-resolved `args[0]` path, creating parent directories first.
fn host_write_text(
    runtime: &BioscriptRuntime,
    args: &[MontyObject],
    kwargs: &[(MontyObject, MontyObject)],
) -> Result<MontyObject, RuntimeError> {
    reject_kwargs(kwargs, "write_text")?;
    let path = runtime.resolve_user_path(&expect_string_arg(args, 0, "write_text")?)?;
    let content = expect_string_arg(args, 1, "write_text")?;
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent).map_err(|err| {
            RuntimeError::Io(format!("failed to create parent dir {}: {err}", parent.display()))
        })?;
    }
    fs::write(&path, content)
        .map_err(|err| RuntimeError::Io(format!("failed to write {}: {err}", path.display())))?;
    Ok(MontyObject::None)
}

/// Host hook for the injected `__bioscript_trace__(line_no)` calls: records
/// the executed line number. Missing or non-int arguments are deliberately
/// ignored rather than erroring, so tracing can never abort a script.
fn host_trace(
    runtime: &BioscriptRuntime,
    args: &[MontyObject],
    kwargs: &[(MontyObject, MontyObject)],
) -> Result<MontyObject, RuntimeError> {
    reject_kwargs(kwargs, "__bioscript_trace__")?;
    if let Some(MontyObject::Int(v)) = args.first() {
        runtime
            .state
            .trace_lines
            .lock()
            .expect("trace mutex poisoned")
            .push(*v as usize);
    }
    Ok(MontyObject::None)
}

/// Rewrites script source so each traceable statement is preceded by a
/// `__bioscript_trace__(line_no)` call at the same indentation.
///
/// A line is NOT traced when it is blank, a comment/decorator, opens with a
/// string quote or closing bracket, looks like an operator/boolean
/// continuation, is a compound-statement header (`if`/`for`/`elif`/`else:`/
/// `except`/`finally:` or anything ending in `:`), or sits inside an open
/// bracket or backslash continuation from a previous line.
fn instrument_source(code: &str) -> String {
    let mut out = Vec::new();
    let mut nesting_depth = 0usize;
    let mut pending_backslash = false;
    for (idx, line) in code.lines().enumerate() {
        let line_no = idx + 1;
        let trimmed = line.trim_start();

        let in_continuation = nesting_depth > 0 || pending_backslash;
        let should_trace = !in_continuation
            && !trimmed.is_empty()
            && !trimmed.starts_with('#')
            && !trimmed.starts_with('@')
            && !trimmed.starts_with('"')
            && !trimmed.starts_with('\'')
            && !trimmed.starts_with(']')
            && !trimmed.starts_with(')')
            && !trimmed.starts_with('}')
            && !trimmed.starts_with(',')
            && !trimmed.starts_with('+')
            && !trimmed.starts_with('-')
            && !trimmed.starts_with('*')
            && !trimmed.starts_with('/')
            && !trimmed.starts_with('%')
            && !trimmed.starts_with("and ")
            && !trimmed.starts_with("or ")
            && !trimmed.starts_with("if ")
            && !trimmed.starts_with("for ")
            && !trimmed.starts_with("elif ")
            && !trimmed.starts_with("else:")
            && !trimmed.starts_with("except")
            && !trimmed.starts_with("finally:")
            && !trimmed.ends_with(':');

        if should_trace {
            // Reuse the line's own leading whitespace so the injected call
            // sits at the same block level as the statement.
            let indent_len = line.len() - trimmed.len();
            let indent = &line[..indent_len];
            out.push(format!("{indent}__bioscript_trace__({line_no})"));
        }
        out.push(line.to_owned());

        pending_backslash = ends_with_unescaped_backslash(line);
        nesting_depth = update_nesting_depth(nesting_depth, line);
    }
    // Preserve the presence/absence of the original trailing newline.
    if code.ends_with('\n') {
        out.join("\n") + "\n"
    } else {
        out.join("\n")
    }
}

/// True when the line ends in a Python line-continuation backslash, i.e. an
/// odd-length run of trailing backslashes after trimming trailing whitespace.
fn ends_with_unescaped_backslash(line: &str) -> bool {
    let trimmed = line.trim_end();
    if !trimmed.ends_with('\\') {
        return false;
    }

    let slash_count = trimmed.chars().rev().take_while(|ch| *ch == '\\').count();
    slash_count % 2 == 1
}

/// Folds one source line into a running bracket-nesting depth, ignoring
/// bracket characters inside single/double-quoted strings and after a `#`
/// comment marker.
/// NOTE(review): triple-quoted strings are not modelled — confirm scripts are
/// not expected to use brackets inside them.
fn update_nesting_depth(mut depth: usize, line: &str) -> usize {
    let mut chars = line.chars().peekable();
    let mut in_single = false;
    let mut in_double = false;

    while let Some(ch) = chars.next() {
        if in_single {
            if ch == '\\' {
                // Skip the escaped character so \' does not end the string.
                chars.next();
            } else if ch == '\'' {
                in_single = false;
            }
            continue;
        }

        if in_double {
            if ch == '\\' {
                chars.next();
            } else if ch == '"' {
                in_double = false;
            }
continue; + } + + match ch { + '#' => break, + '\'' => in_single = true, + '"' => in_double = true, + '(' | '[' | '{' => depth += 1, + ')' | ']' | '}' => depth = depth.saturating_sub(1), + _ => {} + } + } + + depth +} diff --git a/rust/bioscript-core/src/validator.rs b/rust/bioscript-core/src/validator.rs new file mode 100644 index 0000000..807ca53 --- /dev/null +++ b/rust/bioscript-core/src/validator.rs @@ -0,0 +1,315 @@ +use std::{ + fmt, + fs, + path::{Path, PathBuf}, +}; + +use serde_yaml::Value; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Severity { + Error, + Warning, +} + +impl fmt::Display for Severity { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Error => f.write_str("error"), + Self::Warning => f.write_str("warning"), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Issue { + pub severity: Severity, + pub path: String, + pub message: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FileReport { + pub file: PathBuf, + pub issues: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ValidationReport { + pub files_scanned: usize, + pub reports: Vec, +} + +impl ValidationReport { + #[must_use] + pub fn total_issues(&self) -> usize { + self.reports.iter().map(|report| report.issues.len()).sum() + } + + #[must_use] + pub fn total_errors(&self) -> usize { + self.reports + .iter() + .flat_map(|report| &report.issues) + .filter(|issue| issue.severity == Severity::Error) + .count() + } + + #[must_use] + pub fn total_warnings(&self) -> usize { + self.reports + .iter() + .flat_map(|report| &report.issues) + .filter(|issue| issue.severity == Severity::Warning) + .count() + } + + #[must_use] + pub fn has_errors(&self) -> bool { + self.total_errors() > 0 + } + + #[must_use] + pub fn render_text(&self) -> String { + let mut out = String::new(); + out.push_str(&format!( + "files_scanned: {}\nerrors: {}\nwarnings: {}\n", + self.files_scanned, + self.total_errors(), + 
self.total_warnings() + )); + for report in &self.reports { + out.push('\n'); + out.push_str(&format!("file: {}\n", report.file.display())); + for issue in &report.issues { + out.push_str(&format!( + " - [{}] {}: {}\n", + issue.severity, issue.path, issue.message + )); + } + } + out + } +} + +pub fn validate_variants_path(path: &Path) -> Result { + let files = collect_variant_files(path)?; + let mut reports = Vec::new(); + for file in &files { + let report = validate_variant_file(file)?; + if !report.issues.is_empty() { + reports.push(report); + } + } + Ok(ValidationReport { + files_scanned: files.len(), + reports, + }) +} + +fn collect_variant_files(path: &Path) -> Result, String> { + if path.is_file() { + return Ok(vec![path.to_path_buf()]); + } + + let mut files = Vec::new(); + collect_variant_files_recursive(path, &mut files)?; + files.sort(); + Ok(files) +} + +fn collect_variant_files_recursive(path: &Path, files: &mut Vec) -> Result<(), String> { + let entries = fs::read_dir(path) + .map_err(|err| format!("failed to read directory {}: {err}", path.display()))?; + for entry in entries { + let entry = entry.map_err(|err| format!("failed to read directory entry: {err}"))?; + let entry_path = entry.path(); + if entry_path.is_dir() { + collect_variant_files_recursive(&entry_path, files)?; + continue; + } + let Some(file_name) = entry_path.file_name().and_then(|name| name.to_str()) else { + continue; + }; + if matches!(file_name, "variant.yaml" | "variant.yml") { + files.push(entry_path); + } + } + Ok(()) +} + +fn validate_variant_file(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read {}: {err}", path.display()))?; + let value: Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; + + let mut issues = Vec::new(); + validate_required_shape(&value, &mut issues); + validate_kind_vs_tags(&value, &mut issues); + validate_pgx_shape(&value, &mut issues); + + 
    Ok(FileReport {
        file: path.to_path_buf(),
        issues,
    })
}

/// Core schema checks: constant `schema`/`version` headers, required id and
/// allele fields, the modern `alleles.alts` shape, at least one identifier or
/// coordinate block, and `canonical_alt` membership in `alts`.
fn validate_required_shape(root: &Value, issues: &mut Vec<Issue>) {
    require_const(root, &["schema"], "bioscript:variant", issues);
    require_const(root, &["version"], "1.0", issues);
    require_path(root, &["variant_id"], issues);
    require_path(root, &["alleles"], issues);
    require_path(root, &["alleles", "kind"], issues);
    require_path(root, &["alleles", "ref"], issues);
    if value_at(root, &["alleles", "alts"]).is_none() {
        issues.push(Issue {
            severity: Severity::Error,
            path: "alleles".to_owned(),
            message: "missing allele definition; expected alleles.alts".to_owned(),
        });
    }

    // Legacy single-alt shape still parses but is flagged for migration.
    if value_at(root, &["alleles", "alt"]).is_some() {
        issues.push(Issue {
            severity: Severity::Warning,
            path: "alleles.alt".to_owned(),
            message: "alleles.alt is legacy shape; prefer alleles.alts and optional alleles.canonical_alt".to_owned(),
        });
    }

    let has_identifiers = value_at(root, &["identifiers"])
        .and_then(Value::as_mapping)
        .is_some_and(|mapping| !mapping.is_empty());
    let has_coordinates = ["grch37", "grch38"]
        .iter()
        .any(|assembly| value_at(root, &["coordinates", assembly]).is_some());
    if !has_identifiers && !has_coordinates {
        issues.push(Issue {
            severity: Severity::Error,
            path: "identifiers/coordinates".to_owned(),
            message: "expected at least one identifier block or one coordinate block".to_owned(),
        });
    }
    if let Some(canonical_alt) = scalar_at(root, &["alleles", "canonical_alt"]) {
        let alts = seq_at(root, &["alleles", "alts"]).unwrap_or_default();
        if !alts.iter().any(|alt| alt == &canonical_alt) {
            issues.push(Issue {
                severity: Severity::Error,
                path: "alleles.canonical_alt".to_owned(),
                message: format!(
                    "canonical_alt '{}' is not present in alleles.alts {:?}",
                    canonical_alt, alts
                ),
            });
        }
    }
}

/// Warns when `alleles.kind` is "snv" but `research.tags` uses only "snp",
/// to keep one vocabulary across the corpus. Silently passes when either
/// field is absent.
fn validate_kind_vs_tags(root: &Value, issues: &mut Vec<Issue>) {
    let Some(kind) = scalar_at(root, &["alleles", "kind"]) else {
        return;
    };
    let Some(tags) = seq_at(root, &["research", "tags"]) else {
        return;
    };

    let has_snp = tags.iter().any(|tag| tag == "snp");
    let has_snv = tags.iter().any(|tag| tag == "snv");
    if kind == "snv" && has_snp && !has_snv {
        issues.push(Issue {
            severity: Severity::Warning,
            path: "research.tags".to_owned(),
            message: "alleles.kind is 'snv' but research.tags uses 'snp'; pick one vocabulary and use it consistently".to_owned(),
        });
    }
}

/// Shape checks for the `clinical.pgx.*` list sections: entries must be
/// mappings, and `pgx_level`, when present as a string, must not be empty.
fn validate_pgx_shape(root: &Value, issues: &mut Vec<Issue>) {
    let Some(pgx) = mapping_at(root, &["clinical", "pgx"]) else {
        return;
    };

    for key in ["drug_labels", "annotations", "clinical_annotations"] {
        let Some(items) = pgx.get(Value::String(key.to_owned())).and_then(Value::as_sequence) else {
            continue;
        };
        for (idx, item) in items.iter().enumerate() {
            let Some(mapping) = item.as_mapping() else {
                issues.push(Issue {
                    severity: Severity::Warning,
                    path: format!("clinical.pgx.{key}[{idx}]"),
                    message: "expected mapping".to_owned(),
                });
                continue;
            };

            if let Some(level) = mapping
                .get(Value::String("pgx_level".to_owned()))
                .and_then(Value::as_str)
                && level.trim().is_empty()
            {
                issues.push(Issue {
                    severity: Severity::Warning,
                    path: format!("clinical.pgx.{key}[{idx}].pgx_level"),
                    message: "empty pgx_level string; prefer null/omitted or a normalized controlled value".to_owned(),
                });
            }
        }
    }
}

/// Requires the scalar at `path` to equal `expected` exactly; missing and
/// mismatched values are distinct error messages.
fn require_const(root: &Value, path: &[&str], expected: &str, issues: &mut Vec<Issue>) {
    match scalar_at(root, path) {
        Some(actual) if actual == expected => {}
        Some(actual) => issues.push(Issue {
            severity: Severity::Error,
            path: path.join("."),
            message: format!("expected '{expected}', found '{actual}'"),
        }),
        None => issues.push(Issue {
            severity: Severity::Error,
            path: path.join("."),
            message: "missing required field".to_owned(),
        }),
    }
}

/// Requires any value (of any node kind) to exist at `path`.
fn require_path(root: &Value, path: &[&str], issues: &mut Vec<Issue>) {
    if value_at(root, path).is_none() {
        issues.push(Issue {
            severity:
                Severity::Error,
            path: path.join("."),
            message: "missing required field".to_owned(),
        });
    }
}

/// Walks nested YAML mappings by string keys; `None` on the first miss or
/// non-mapping intermediate node.
fn value_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Value> {
    let mut current = root;
    for key in path {
        let mapping = current.as_mapping()?;
        current = mapping.get(Value::String((*key).to_owned()))?;
    }
    Some(current)
}

/// `value_at` narrowed to a mapping node.
fn mapping_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a serde_yaml::Mapping> {
    value_at(root, path)?.as_mapping()
}

/// Scalar coercion: strings verbatim, numbers via `to_string` (so YAML `1.0`
/// compares equal to the string "1.0"); other node kinds yield `None`.
fn scalar_at(root: &Value, path: &[&str]) -> Option<String> {
    value_at(root, path).and_then(|value| match value {
        Value::String(text) => Some(text.clone()),
        Value::Number(number) => Some(number.to_string()),
        _ => None,
    })
}

/// String items of the sequence at `path`.
/// NOTE(review): non-string items are silently dropped rather than reported —
/// confirm a numeric tag should not be an issue.
fn seq_at(root: &Value, path: &[&str]) -> Option<Vec<String>> {
    value_at(root, path)?.as_sequence().map(|items| {
        items.iter()
            .filter_map(|item| item.as_str().map(ToOwned::to_owned))
            .collect()
    })
}
diff --git a/rust/bioscript-core/src/variant.rs b/rust/bioscript-core/src/variant.rs
new file mode 100644
index 0000000..ebf8c12
--- /dev/null
+++ b/rust/bioscript-core/src/variant.rs
@@ -0,0 +1,52 @@
use crate::genotype::GenomicLocus;

/// Reference genome assembly an observation was matched against.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Assembly {
    Grch37,
    Grch38,
}

/// Broad variant classes; parsed/serialized by `parse_variant_kind` and
/// `variant_kind_name` in lib.rs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VariantKind {
    Snp,
    Insertion,
    Deletion,
    Indel,
    Other,
}

/// Query description for a variant lookup: any combination of rsids,
/// per-assembly loci, alleles, kind, deletion length, and repeat motifs.
/// NOTE(review): generic parameters below were lost in the patch rendering
/// and reconstructed from usage in lib.rs — confirm against the real file.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct VariantSpec {
    pub rsids: Vec<String>,
    pub grch37: Option<GenomicLocus>,
    pub grch38: Option<GenomicLocus>,
    pub reference: Option<String>,
    pub alternate: Option<String>,
    pub kind: Option<VariantKind>,
    pub deletion_length: Option<usize>,
    pub motifs: Vec<String>,
}

/// Result of a lookup: which backend/rsid/assembly matched, the genotype
/// call, and optional read-count evidence.
/// NOTE(review): count field widths reconstructed — confirm.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct VariantObservation {
    pub backend: String,
    pub matched_rsid: Option<String>,
    pub assembly: Option<Assembly>,
    pub genotype: Option<String>,
    pub ref_count: Option<u32>,
    pub alt_count: Option<u32>,
    pub depth: Option<u32>,
    pub evidence: Vec<String>,
}

impl VariantSpec {
    /// True when at least one rsid is present.
    #[must_use]
    pub fn has_rsids(&self) -> bool {
        !self.rsids.is_empty()
    }

    /// True when a GRCh37 or GRCh38 locus is present.
    #[must_use]
    pub fn has_coordinates(&self) -> bool {
        self.grch37.is_some() || self.grch38.is_some()
    }
}
diff --git a/rust/bioscript-ffi/Cargo.toml b/rust/bioscript-ffi/Cargo.toml
new file mode 100644
index 0000000..1361fbc
--- /dev/null
+++ b/rust/bioscript-ffi/Cargo.toml
@@ -0,0 +1,17 @@
[package]
name = "bioscript-ffi"
version = "0.1.0"
edition = "2024"

[lib]
crate-type = ["rlib", "staticlib", "cdylib"]

[dependencies]
bioscript-core = { path = "../bioscript-core" }
jni = "0.21"
monty = { path = "../../monty/crates/monty" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

[lints.clippy]
pedantic = { level = "warn", priority = -1 }
diff --git a/rust/bioscript-ffi/src/lib.rs b/rust/bioscript-ffi/src/lib.rs
new file mode 100644
index 0000000..404bcbd
--- /dev/null
+++ b/rust/bioscript-ffi/src/lib.rs
@@ -0,0 +1,288 @@
use std::{
    env,
    ffi::{CStr, CString},
    fs,
    os::raw::c_char,
    path::PathBuf,
    time::{Duration, Instant},
};

use bioscript_core::{
    BioscriptRuntime, GenotypeLoadOptions, GenotypeSourceFormat, PrepareRequest, RuntimeConfig,
    StageTiming, prepare_indexes,
};
use monty::{MontyObject, ResourceLimits};
use serde::{Deserialize, Serialize};

/// JSON request shape shared by the mobile bridges (camelCase keys).
/// NOTE(review): `Option` type parameters were lost in the patch rendering;
/// string/bool fields are inferred from usage below and the integer widths
/// from the `ResourceLimits` calls — confirm against the real file.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RunFileRequest {
    pub script_path: String,
    pub root: Option<String>,
    pub input_file: Option<String>,
    pub output_file: Option<String>,
    pub participant_id: Option<String>,
    pub trace_report_path: Option<String>,
    pub timing_report_path: Option<String>,
    pub input_format: Option<String>,
    pub input_index: Option<String>,
    pub reference_file: Option<String>,
    pub reference_index: Option<String>,
    pub auto_index: Option<bool>,
    pub cache_dir: Option<String>,
    pub max_duration_ms: Option<u64>,
    pub max_memory_bytes: Option<usize>,
    pub max_allocations: Option<usize>,
    pub max_recursion_depth: Option<usize>,
}

/// Minimal success payload for a run-file call.
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct RunFileResult {
    pub ok: bool,
}

/// Uniform JSON envelope returned across the FFI boundary.
/// NOTE(review): the payload types of `value`/`error` were lost in the patch
/// rendering; reconstructed as JSON value + message string — confirm.
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
struct FfiResult {
    ok: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    value: Option<serde_json::Value>,
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
}

/// Executes a bioscript file per `request`: resolves the runtime root, builds
/// loader options and resource limits (with conservative defaults overridden
/// by request fields), and optionally auto-builds indexes first.
/// (The function continues beyond this hunk view.)
pub fn run_file_request(request: RunFileRequest) -> Result<RunFileResult, String> {
    let script_path = PathBuf::from(&request.script_path);
    let runtime_root = match request.root {
        Some(dir) => PathBuf::from(dir),
        None => env::current_dir().map_err(|err| format!("failed to get current directory: {err}"))?,
    };

    let mut loader = GenotypeLoadOptions::default();
    if let Some(value) = request.input_format.as_deref() {
        if value.eq_ignore_ascii_case("auto") {
            // "auto" means sniff the format at load time.
            loader.format = None;
        } else {
            let parsed = value
                .parse::<GenotypeSourceFormat>()
                .map_err(|err| format!("invalid inputFormat value {value}: {err}"))?;
            loader.format = Some(parsed);
        }
    }
    loader.input_index = request.input_index.map(PathBuf::from);
    loader.reference_file = request.reference_file.map(PathBuf::from);
    loader.reference_index = request.reference_index.map(PathBuf::from);

    // Deliberately tight defaults for untrusted scripts; request fields below
    // override them individually.
    let mut limits = ResourceLimits::new()
        .max_duration(Duration::from_millis(100))
        .max_memory(8 * 1024 * 1024)
        .max_allocations(200_000)
        .gc_interval(1000)
        .max_recursion_depth(Some(200));

    if let Some(value) = request.max_duration_ms {
        limits = limits.max_duration(Duration::from_millis(value));
    }
    if let Some(value) = request.max_memory_bytes {
        limits = limits.max_memory(value);
    }
    if let Some(value) = request.max_allocations {
        limits = limits.max_allocations(value);
    }
    if let Some(value) = request.max_recursion_depth {
        limits = limits.max_recursion_depth(Some(value));
    }

    let mut ffi_timings: Vec<StageTiming> = Vec::new();
    if request.auto_index.unwrap_or(false) {
        let auto_index_started = Instant::now();
        let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?;
        let effective_cache = request
            .cache_dir
            .as_ref()
            .map(PathBuf::from)
            .unwrap_or_else(|| cwd.join(".bioscript-cache"));
        let
prepare_request = PrepareRequest { + root: runtime_root.clone(), + cwd: cwd.clone(), + cache_dir: effective_cache, + input_file: request.input_file.clone(), + input_format: loader.format, + reference_file: loader + .reference_file + .as_ref() + .map(|p| p.to_string_lossy().to_string()), + }; + let prepared = prepare_indexes(&prepare_request)?; + if let Some(idx) = prepared.input_index + && loader.input_index.is_none() + { + loader.input_index = Some(idx); + } + if let Some(ref_file) = prepared.reference_file { + loader.reference_file = Some(ref_file); + } + if let Some(ref_idx) = prepared.reference_index + && loader.reference_index.is_none() + { + loader.reference_index = Some(ref_idx); + } + ffi_timings.push(StageTiming { + stage: "auto_index".to_owned(), + duration_ms: auto_index_started.elapsed().as_millis(), + detail: "prepare_indexes".to_owned(), + }); + } + + let runtime = BioscriptRuntime::with_config( + runtime_root, + RuntimeConfig { limits, loader }, + ) + .map_err(|err| err.to_string())?; + + let mut inputs = Vec::new(); + if let Some(input_file) = request.input_file { + inputs.push(("input_file", MontyObject::String(input_file))); + } + if let Some(output_file) = request.output_file { + inputs.push(("output_file", MontyObject::String(output_file))); + } + if let Some(participant_id) = request.participant_id { + inputs.push(("participant_id", MontyObject::String(participant_id))); + } + + runtime + .run_file( + &script_path, + request.trace_report_path.as_deref().map(std::path::Path::new), + inputs, + ) + .map_err(|err| err.to_string())?; + + if let Some(timing_path) = request.timing_report_path { + let mut all_timings = ffi_timings; + all_timings.extend(runtime.timing_snapshot()); + write_timing_report(&PathBuf::from(timing_path), &all_timings)?; + } + + Ok(RunFileResult { ok: true }) +} + +fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent) + 
.map_err(|err| format!("failed to create timing report dir {}: {err}", parent.display()))?; + } + let mut output = String::from("stage\tduration_ms\tdetail\n"); + for timing in timings { + output.push_str(&format!( + "{}\t{}\t{}\n", + timing.stage, + timing.duration_ms, + timing.detail.replace('\t', " ") + )); + } + fs::write(path, output).map_err(|err| format!("failed to write timing report {}: {err}", path.display())) +} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn bioscript_run_file_json(request_json: *const c_char) -> *mut c_char { + let response = unsafe { + if request_json.is_null() { + FfiResult:: { + ok: false, + value: None, + error: Some("request_json was null".to_owned()), + } + } else { + match CStr::from_ptr(request_json).to_str() { + Ok(value) => match serde_json::from_str::(value) { + Ok(request) => match run_file_request(request) { + Ok(result) => FfiResult { + ok: true, + value: Some(result), + error: None, + }, + Err(error) => FfiResult:: { + ok: false, + value: None, + error: Some(error), + }, + }, + Err(error) => FfiResult:: { + ok: false, + value: None, + error: Some(format!("invalid request JSON: {error}")), + }, + }, + Err(error) => FfiResult:: { + ok: false, + value: None, + error: Some(format!("request_json was not valid UTF-8: {error}")), + }, + } + } + }; + + match serde_json::to_string(&response) { + Ok(json) => match CString::new(json) { + Ok(value) => value.into_raw(), + Err(_) => std::ptr::null_mut(), + }, + Err(_) => std::ptr::null_mut(), + } +} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn bioscript_free_string(ptr: *mut c_char) { + if !ptr.is_null() { + unsafe { + let _ = CString::from_raw(ptr); + } + } +} + +#[cfg(target_os = "android")] +pub mod android { + use crate::{RunFileRequest, run_file_request}; + use jni::JNIEnv; + use jni::objects::{JClass, JString}; + + #[unsafe(no_mangle)] + pub unsafe extern "system" fn Java_expo_modules_bioscript_ExpoBioscriptNativeBridge_runFileNative< + 'local, + >( + mut env: 
JNIEnv<'local>, + _class: JClass<'local>, + request_json: JString<'local>, + ) -> JString<'local> { + let request_string: String = match env.get_string(&request_json) { + Ok(value) => value.into(), + Err(error) => { + return env + .new_string( + serde_json::json!({ + "ok": false, + "error": format!("failed to read request json from JVM: {error}") + }) + .to_string(), + ) + .expect("jni new_string should succeed"); + } + }; + + let response = match serde_json::from_str::(&request_string) { + Ok(request) => match run_file_request(request) { + Ok(value) => serde_json::json!({ "ok": true, "value": value }).to_string(), + Err(error) => serde_json::json!({ "ok": false, "error": error }).to_string(), + }, + Err(error) => { + serde_json::json!({ "ok": false, "error": format!("invalid request JSON: {error}") }) + .to_string() + } + }; + + env.new_string(response).expect("jni new_string should succeed") + } +} diff --git a/rust/bioscript/Cargo.toml b/rust/bioscript/Cargo.toml index 1ae4a18..711e158 100644 --- a/rust/bioscript/Cargo.toml +++ b/rust/bioscript/Cargo.toml @@ -4,19 +4,14 @@ version = "0.1.0" edition = "2024" [lib] -crate-type = ["rlib", "staticlib", "cdylib"] +crate-type = ["rlib"] [dependencies] -jni = "0.21" +bioscript-core = { path = "../bioscript-core" } monty = { path = "../../monty/crates/monty" } -noodles = { version = "0.104.0", features = ["bgzf"] } -zip = { version = "2.2.0", default-features = false, features = ["deflate"] } -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" -serde_yaml = "0.9.34" -[target.'cfg(not(any(target_os = "ios", target_os = "tvos")))'.dependencies] -rust-htslib = "0.51.0" +[dev-dependencies] +zip = { version = "2.2.0", default-features = false, features = ["deflate"] } [lints.clippy] pedantic = { level = "warn", priority = -1 } diff --git a/rust/bioscript/src/lib.rs b/rust/bioscript/src/lib.rs index 6124302..e5bd25a 100644 --- a/rust/bioscript/src/lib.rs +++ b/rust/bioscript/src/lib.rs @@ -1,302 +1 @@ -use 
std::{ - env, ffi::{CStr, CString}, fs, os::raw::c_char, path::PathBuf, time::{Duration, Instant}, -}; - -use monty::{MontyObject, ResourceLimits}; -use serde::{Deserialize, Serialize}; - -pub mod genotype; -pub mod prepare; -pub mod runtime; -pub mod validator; -pub mod variant; - -pub use genotype::{ - BackendCapabilities, GenomicLocus, GenotypeLoadOptions, GenotypeSourceFormat, QueryKind, -}; -pub use prepare::{PrepareRequest, PreparedPaths, prepare_indexes, shell_flags}; -pub use runtime::{BioscriptRuntime, RuntimeConfig, RuntimeError, StageTiming}; -pub use validator::{FileReport, Issue, Severity, ValidationReport, validate_variants_path}; -pub use variant::{Assembly, VariantKind, VariantObservation, VariantSpec}; - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct RunFileRequest { - pub script_path: String, - pub root: Option, - pub input_file: Option, - pub output_file: Option, - pub participant_id: Option, - pub trace_report_path: Option, - pub timing_report_path: Option, - pub input_format: Option, - pub input_index: Option, - pub reference_file: Option, - pub reference_index: Option, - pub auto_index: Option, - pub cache_dir: Option, - pub max_duration_ms: Option, - pub max_memory_bytes: Option, - pub max_allocations: Option, - pub max_recursion_depth: Option, -} - -#[derive(Debug, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct RunFileResult { - pub ok: bool, -} - -#[derive(Debug, Serialize)] -#[serde(rename_all = "camelCase")] -struct FfiResult { - ok: bool, - #[serde(skip_serializing_if = "Option::is_none")] - value: Option, - #[serde(skip_serializing_if = "Option::is_none")] - error: Option, -} - -pub fn run_file_request(request: RunFileRequest) -> Result { - let script_path = PathBuf::from(&request.script_path); - let runtime_root = match request.root { - Some(dir) => PathBuf::from(dir), - None => env::current_dir().map_err(|err| format!("failed to get current directory: {err}"))?, - }; - - let mut loader = 
GenotypeLoadOptions::default(); - if let Some(value) = request.input_format.as_deref() { - if value.eq_ignore_ascii_case("auto") { - loader.format = None; - } else { - let parsed = value - .parse::() - .map_err(|err| format!("invalid inputFormat value {value}: {err}"))?; - loader.format = Some(parsed); - } - } - loader.input_index = request.input_index.map(PathBuf::from); - loader.reference_file = request.reference_file.map(PathBuf::from); - loader.reference_index = request.reference_index.map(PathBuf::from); - - let mut limits = ResourceLimits::new() - .max_duration(Duration::from_millis(100)) - .max_memory(8 * 1024 * 1024) - .max_allocations(200_000) - .gc_interval(1000) - .max_recursion_depth(Some(200)); - - if let Some(value) = request.max_duration_ms { - limits = limits.max_duration(Duration::from_millis(value)); - } - if let Some(value) = request.max_memory_bytes { - limits = limits.max_memory(value); - } - if let Some(value) = request.max_allocations { - limits = limits.max_allocations(value); - } - if let Some(value) = request.max_recursion_depth { - limits = limits.max_recursion_depth(Some(value)); - } - - let mut ffi_timings: Vec = Vec::new(); - if request.auto_index.unwrap_or(false) { - let auto_index_started = Instant::now(); - let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; - let effective_cache = request - .cache_dir - .as_ref() - .map(PathBuf::from) - .unwrap_or_else(|| cwd.join(".bioscript-cache")); - let prepare_request = PrepareRequest { - root: runtime_root.clone(), - cwd: cwd.clone(), - cache_dir: effective_cache, - input_file: request.input_file.clone(), - input_format: loader.format, - reference_file: loader - .reference_file - .as_ref() - .map(|p| p.to_string_lossy().to_string()), - }; - let prepared = prepare_indexes(&prepare_request)?; - if let Some(idx) = prepared.input_index - && loader.input_index.is_none() - { - loader.input_index = Some(idx); - } - if let Some(ref_file) = prepared.reference_file { - 
loader.reference_file = Some(ref_file); - } - if let Some(ref_idx) = prepared.reference_index - && loader.reference_index.is_none() - { - loader.reference_index = Some(ref_idx); - } - ffi_timings.push(StageTiming { - stage: "auto_index".to_owned(), - duration_ms: auto_index_started.elapsed().as_millis(), - detail: "prepare_indexes".to_owned(), - }); - } - - let runtime = BioscriptRuntime::with_config( - runtime_root, - RuntimeConfig { - limits, - loader, - }, - ) - .map_err(|err| err.to_string())?; - - let mut inputs = Vec::new(); - if let Some(input_file) = request.input_file { - inputs.push(("input_file", MontyObject::String(input_file))); - } - if let Some(output_file) = request.output_file { - inputs.push(("output_file", MontyObject::String(output_file))); - } - if let Some(participant_id) = request.participant_id { - inputs.push(("participant_id", MontyObject::String(participant_id))); - } - - runtime - .run_file( - &script_path, - request.trace_report_path.as_deref().map(std::path::Path::new), - inputs, - ) - .map_err(|err| err.to_string())?; - - if let Some(timing_path) = request.timing_report_path { - let mut all_timings = ffi_timings; - all_timings.extend(runtime.timing_snapshot()); - write_timing_report(&PathBuf::from(timing_path), &all_timings)?; - } - - Ok(RunFileResult { ok: true }) -} - -fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { - if let Some(parent) = path.parent() { - fs::create_dir_all(parent) - .map_err(|err| format!("failed to create timing report dir {}: {err}", parent.display()))?; - } - let mut output = String::from("stage\tduration_ms\tdetail\n"); - for timing in timings { - output.push_str(&format!( - "{}\t{}\t{}\n", - timing.stage, - timing.duration_ms, - timing.detail.replace('\t', " ") - )); - } - fs::write(path, output).map_err(|err| format!("failed to write timing report {}: {err}", path.display())) -} - -#[unsafe(no_mangle)] -pub unsafe extern "C" fn bioscript_run_file_json(request_json: 
*const c_char) -> *mut c_char { - let response = unsafe { - if request_json.is_null() { - FfiResult:: { - ok: false, - value: None, - error: Some("request_json was null".to_owned()), - } - } else { - match CStr::from_ptr(request_json).to_str() { - Ok(value) => match serde_json::from_str::(value) { - Ok(request) => match run_file_request(request) { - Ok(result) => FfiResult { - ok: true, - value: Some(result), - error: None, - }, - Err(error) => FfiResult:: { - ok: false, - value: None, - error: Some(error), - }, - }, - Err(error) => FfiResult:: { - ok: false, - value: None, - error: Some(format!("invalid request JSON: {error}")), - }, - }, - Err(error) => FfiResult:: { - ok: false, - value: None, - error: Some(format!("request_json was not valid UTF-8: {error}")), - }, - } - } - }; - - match serde_json::to_string(&response) { - Ok(json) => match CString::new(json) { - Ok(value) => value.into_raw(), - Err(_) => std::ptr::null_mut(), - }, - Err(_) => std::ptr::null_mut(), - } -} - -#[unsafe(no_mangle)] -pub unsafe extern "C" fn bioscript_free_string(ptr: *mut c_char) { - if !ptr.is_null() { - unsafe { - let _ = CString::from_raw(ptr); - } - } -} - -#[cfg(target_os = "android")] -pub mod android { - use crate::{RunFileRequest, run_file_request}; - use jni::JNIEnv; - use jni::objects::{JClass, JString}; - - /// JNI entrypoint used by the Android Expo module bridge. - /// - /// # Safety - /// - Called by the JVM with valid JNI references and strings. - /// - Follows standard JNI safety rules. 
- #[unsafe(no_mangle)] - pub unsafe extern "system" fn Java_expo_modules_bioscript_ExpoBioscriptNativeBridge_runFileNative< - 'local, - >( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - request_json: JString<'local>, - ) -> JString<'local> { - let request_string: String = match env.get_string(&request_json) { - Ok(value) => value.into(), - Err(error) => { - return env - .new_string( - serde_json::json!({ - "ok": false, - "error": format!("failed to read request json from JVM: {error}") - }) - .to_string(), - ) - .expect("jni string allocation failed"); - } - }; - - let response = match serde_json::from_str::(&request_string) { - Ok(request) => match run_file_request(request) { - Ok(result) => serde_json::json!({ "ok": true, "value": result }).to_string(), - Err(error) => serde_json::json!({ "ok": false, "error": error }).to_string(), - }, - Err(error) => serde_json::json!({ - "ok": false, - "error": format!("invalid request JSON: {error}") - }) - .to_string(), - }; - - env.new_string(response).expect("jni string allocation failed") - } -} +pub use bioscript_core::*; From 7ff94303339c0134e2f652e33e742e3be2e2f160 Mon Sep 17 00:00:00 2001 From: keelan Date: Fri, 10 Apr 2026 15:07:32 +0100 Subject: [PATCH 4/4] fix filepath --- rust/bioscript-ffi/src/lib.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/rust/bioscript-ffi/src/lib.rs b/rust/bioscript-ffi/src/lib.rs index 404bcbd..bde23c2 100644 --- a/rust/bioscript-ffi/src/lib.rs +++ b/rust/bioscript-ffi/src/lib.rs @@ -102,7 +102,14 @@ pub fn run_file_request(request: RunFileRequest) -> Result