diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 21fd1ef..92bb055 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 diff --git a/cuvarbase/bls.py b/cuvarbase/bls.py index 8d0a3a6..ced49b8 100644 --- a/cuvarbase/bls.py +++ b/cuvarbase/bls.py @@ -1150,6 +1150,139 @@ def sparse_bls_cpu(t, y, dy, freqs, ignore_negative_delta_sols=False): return bls_powers, solutions +def compile_sparse_bls(block_size=_default_block_size, use_simple=True, **kwargs): + """ + Compile sparse BLS GPU kernel + + Parameters + ---------- + block_size: int, optional (default: _default_block_size) + CUDA threads per CUDA block. + use_simple: bool, optional (default: True) + Use simplified kernel (more reliable, slightly slower) + + Returns + ------- + kernel: PyCUDA function + The compiled sparse_bls_kernel function + """ + # Read kernel - use simple version by default (it works!) + kernel_name = 'sparse_bls_simple' if use_simple else 'sparse_bls' + cppd = dict(BLOCK_SIZE=block_size) + kernel_txt = _module_reader(find_kernel(kernel_name), + cpp_defs=cppd) + + # compile kernel + module = SourceModule(kernel_txt, options=['--use_fast_math']) + + func_name = 'sparse_bls_kernel_simple' if use_simple else 'sparse_bls_kernel' + kernel = module.get_function(func_name) + + # Don't use prepare() - it causes issues with large shared memory + return kernel + + +def sparse_bls_gpu(t, y, dy, freqs, ignore_negative_delta_sols=False, + block_size=64, max_ndata=None, + stream=None, kernel=None): + """ + GPU-accelerated sparse BLS implementation. + + Uses a CUDA kernel to test all pairs of observations as potential + transit boundaries. More efficient than CPU implementation for datasets + with ~100-1000 observations. 
+ + Based on https://arxiv.org/abs/2103.06193 + + Parameters + ---------- + t: array_like, float + Observation times + y: array_like, float + Observations + dy: array_like, float + Observation uncertainties + freqs: array_like, float + Frequencies to test + ignore_negative_delta_sols: bool, optional (default: False) + Whether or not to ignore solutions with negative delta (inverted dips) + block_size: int, optional (default: 64) + CUDA threads per CUDA block (use 32-128 for best performance) + max_ndata: int, optional (default: None) + Maximum number of data points (for shared memory allocation). + If None, uses len(t) + stream: pycuda.driver.Stream, optional (default: None) + CUDA stream for async execution + kernel: PyCUDA function, optional (default: None) + Pre-compiled kernel. If None, compiles kernel automatically. + + Returns + ------- + bls_powers: array_like, float + BLS power at each frequency + solutions: list of (q, phi0) tuples + Best (q, phi0) solution at each frequency + """ + # Convert to numpy arrays + t = np.asarray(t).astype(np.float32) + y = np.asarray(y).astype(np.float32) + dy = np.asarray(dy).astype(np.float32) + freqs = np.asarray(freqs).astype(np.float32) + + ndata = len(t) + nfreqs = len(freqs) + + if max_ndata is None: + max_ndata = ndata + + # Compile kernel if not provided + if kernel is None: + kernel = compile_sparse_bls(block_size=block_size) + + # Allocate GPU memory + t_g = gpuarray.to_gpu(t) + y_g = gpuarray.to_gpu(y) + dy_g = gpuarray.to_gpu(dy) + freqs_g = gpuarray.to_gpu(freqs) + + bls_powers_g = gpuarray.zeros(nfreqs, dtype=np.float32) + best_q_g = gpuarray.zeros(nfreqs, dtype=np.float32) + best_phi_g = gpuarray.zeros(nfreqs, dtype=np.float32) + + # Calculate shared memory size + # Simple kernel needs: 3 data arrays (phi, y, w) + 1 temp array for reductions + # Allocate for blockDim from function parameter (block_size) to be safe + shared_mem_size = (3 * max_ndata + block_size) * 4 + + # Launch kernel + # Grid: one block per 
frequency (or fewer if limited by hardware) + max_blocks = 65535 # CUDA maximum + grid = (min(nfreqs, max_blocks), 1) + block = (block_size, 1, 1) + + if stream is None: + stream = cuda.Stream() + + # Call kernel without prepare() to avoid resource issues + kernel( + t_g, y_g, dy_g, freqs_g, + np.uint32(ndata), np.uint32(nfreqs), + np.uint32(ignore_negative_delta_sols), + bls_powers_g, best_q_g, best_phi_g, + block=block, grid=grid, stream=stream, + shared=shared_mem_size + ) + + # Copy results back + stream.synchronize() + bls_powers = bls_powers_g.get() + best_q = best_q_g.get() + best_phi = best_phi_g.get() + + solutions = list(zip(best_q, best_phi)) + return bls_powers, solutions + + def eebls_transit(t, y, dy, fmax_frac=1.0, fmin_frac=1.0, qmin_fac=0.5, qmax_fac=2.0, fmin=None, fmax=None, freqs=None, qvals=None, use_fast=False, diff --git a/cuvarbase/kernels/sparse_bls.cu b/cuvarbase/kernels/sparse_bls.cu new file mode 100644 index 0000000..d5a290e --- /dev/null +++ b/cuvarbase/kernels/sparse_bls.cu @@ -0,0 +1,367 @@ +#include <stdio.h> +#define RESTRICT __restrict__ +#define CONSTANT const +#define MIN_W 1E-9 +#define MAX_W_COMPLEMENT 1E-9 +//{CPP_DEFS} + +/** + * Sparse BLS CUDA Kernel + * + * Implementation of sparse Box Least Squares algorithm based on + * https://arxiv.org/abs/2103.06193 + * + * Instead of binning, this algorithm tests all pairs of sorted observations + * as potential transit boundaries. This is more efficient for small datasets + * (ndata < ~500) where the O(N²) complexity per frequency is acceptable. 
+ */ + +__device__ unsigned int get_id(){ + return blockIdx.x * blockDim.x + threadIdx.x; +} + +__device__ float mod1(float a){ + return a - floorf(a); +} + +/** + * Compute BLS power for given parameters + * + * @param YW: Weighted sum of y values in transit + * @param W: Sum of weights in transit + * @param YY: Total variance normalization + * @param ignore_negative_delta_sols: If true, ignore inverted dips (YW > 0) + * @return: BLS power value + */ +__device__ float bls_power(float YW, float W, float YY, + unsigned int ignore_negative_delta_sols){ + // Check if we should ignore this solution + if (ignore_negative_delta_sols && YW > 0.f) + return 0.f; + + // Check weight bounds + if (W < MIN_W || W > 1.f - MAX_W_COMPLEMENT) + return 0.f; + + // Compute BLS: (YW)² / (W * (1-W) * YY) + float bls = (YW * YW) / (W * (1.f - W) * YY); + return bls; +} + +/** + * Bitonic sort for sorting observations by phase within shared memory + * Uses cooperative sorting across all threads in the block + * + * @param sh_phi: Shared memory array of phases + * @param sh_y: Shared memory array of y values + * @param sh_w: Shared memory array of weights + * @param sh_indices: Shared memory array of original indices + * @param n: Number of elements to sort + */ +__device__ void bitonic_sort_by_phase(float* sh_phi, float* sh_y, float* sh_w, + int* sh_indices, unsigned int n){ + unsigned int tid = threadIdx.x; + + // Bitonic sort: repeatedly merge sorted sequences + for (unsigned int k = 2; k <= n; k *= 2) { + for (unsigned int j = k / 2; j > 0; j /= 2) { + unsigned int ixj = tid ^ j; + + if (ixj > tid && tid < n && ixj < n) { + // Determine sort direction + bool ascending = ((tid & k) == 0); + bool swap = (sh_phi[tid] > sh_phi[ixj]) == ascending; + + if (swap) { + // Swap all arrays in lockstep + float tmp_phi = sh_phi[tid]; + float tmp_y = sh_y[tid]; + float tmp_w = sh_w[tid]; + int tmp_idx = sh_indices[tid]; + + sh_phi[tid] = sh_phi[ixj]; + sh_y[tid] = sh_y[ixj]; + sh_w[tid] = 
sh_w[ixj]; + sh_indices[tid] = sh_indices[ixj]; + + sh_phi[ixj] = tmp_phi; + sh_y[ixj] = tmp_y; + sh_w[ixj] = tmp_w; + sh_indices[ixj] = tmp_idx; + } + } + __syncthreads(); + } + } +} + +/** + * Main sparse BLS kernel + * + * Each thread block handles one frequency. Within each block: + * 1. Compute phases for all observations at this frequency + * 2. Sort observations by phase in shared memory + * 3. Test all pairs of observations as potential transit boundaries + * 4. Find maximum BLS power and corresponding (q, phi0) + * + * @param t: Observation times [ndata] + * @param y: Observation values [ndata] + * @param dy: Observation uncertainties [ndata] + * @param freqs: Frequencies to test [nfreqs] + * @param ndata: Number of observations + * @param nfreqs: Number of frequencies + * @param ignore_negative_delta_sols: Whether to ignore inverted dips + * @param bls_powers: Output BLS powers [nfreqs] + * @param best_q: Output best q values [nfreqs] + * @param best_phi: Output best phi0 values [nfreqs] + */ +__global__ void sparse_bls_kernel( + const float* __restrict__ t, + const float* __restrict__ y, + const float* __restrict__ dy, + const float* __restrict__ freqs, + unsigned int ndata, + unsigned int nfreqs, + unsigned int ignore_negative_delta_sols, + float* __restrict__ bls_powers, + float* __restrict__ best_q, + float* __restrict__ best_phi) +{ + // Shared memory layout: + // [phi, y, w, indices, cumsum_w, cumsum_yw, thread_max_bls, thread_best_q, thread_best_phi] + extern __shared__ float shared_mem[]; + + float* sh_phi = shared_mem; // ndata floats + float* sh_y = &shared_mem[ndata]; // ndata floats + float* sh_w = &shared_mem[2 * ndata]; // ndata floats + int* sh_indices = (int*)&shared_mem[3 * ndata]; // ndata ints + float* sh_cumsum_w = &shared_mem[3 * ndata + ndata]; // ndata floats + float* sh_cumsum_yw = &shared_mem[4 * ndata + ndata];// ndata floats + float* thread_results = &shared_mem[5 * ndata + ndata]; // blockDim.x * 3 floats + + unsigned int 
freq_idx = blockIdx.x; + unsigned int tid = threadIdx.x; + + // Loop over frequencies (in case we have more frequencies than blocks) + while (freq_idx < nfreqs) { + float freq = freqs[freq_idx]; + + // Step 1: Load data and compute phases + // Each thread loads multiple elements if ndata > blockDim.x + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + float phi = mod1(t[i] * freq); + float weight = 1.f / (dy[i] * dy[i]); + + sh_phi[i] = phi; + sh_y[i] = y[i]; + sh_w[i] = weight; + sh_indices[i] = i; + } + __syncthreads(); + + // Step 2: Normalize weights + float sum_w = 0.f; + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + sum_w += sh_w[i]; + } + + // Reduce sum_w across threads + __shared__ float block_sum_w; + if (tid == 0) block_sum_w = 0.f; + __syncthreads(); + + atomicAdd(&block_sum_w, sum_w); + __syncthreads(); + + // Normalize weights + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + sh_w[i] /= block_sum_w; + } + __syncthreads(); + + // Step 3: Compute ybar and YY (normalization) + float ybar = 0.f; + float YY = 0.f; + + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + ybar += sh_w[i] * sh_y[i]; + } + + __shared__ float block_ybar; + if (tid == 0) block_ybar = 0.f; + __syncthreads(); + + atomicAdd(&block_ybar, ybar); + __syncthreads(); + + ybar = block_ybar; + + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + float diff = sh_y[i] - ybar; + YY += sh_w[i] * diff * diff; + } + + __shared__ float block_YY; + if (tid == 0) block_YY = 0.f; + __syncthreads(); + + atomicAdd(&block_YY, YY); + __syncthreads(); + + YY = block_YY; + + // Step 4: Sort by phase using bitonic sort + // Pad to next power of 2 for bitonic sort + unsigned int n_padded = 1; + while (n_padded < ndata) n_padded *= 2; + + // Pad with large phase values + for (unsigned int i = ndata + tid; i < n_padded; i += blockDim.x) { + if (i < n_padded) { + sh_phi[i] = 2.f; // Larger than any valid phase + sh_y[i] = 0.f; + sh_w[i] = 0.f; + sh_indices[i] = 
-1; + } + } + __syncthreads(); + + bitonic_sort_by_phase(sh_phi, sh_y, sh_w, sh_indices, n_padded); + + // Step 5: Compute cumulative sums for fast range queries + // Using prefix sum + for (unsigned int stride = 1; stride < ndata; stride *= 2) { + __syncthreads(); + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + if (i >= stride) { + float temp_w = sh_cumsum_w[i - stride]; + float temp_yw = sh_cumsum_yw[i - stride]; + __syncthreads(); + sh_cumsum_w[i] = sh_w[i] + temp_w; + sh_cumsum_yw[i] = sh_w[i] * sh_y[i] + temp_yw; + } else { + sh_cumsum_w[i] = sh_w[i]; + sh_cumsum_yw[i] = sh_w[i] * sh_y[i]; + } + } + } + __syncthreads(); + + // Step 6: Each thread tests a subset of transit pairs + float thread_max_bls = 0.f; + float thread_q = 0.f; + float thread_phi0 = 0.f; + + // Total number of pairs to test: ndata * ndata + unsigned long long total_pairs = (unsigned long long)ndata * (unsigned long long)ndata; + unsigned long long pairs_per_thread = (total_pairs + blockDim.x - 1) / blockDim.x; + + unsigned long long start_pair = (unsigned long long)tid * pairs_per_thread; + unsigned long long end_pair = min(start_pair + pairs_per_thread, total_pairs); + + for (unsigned long long pair_idx = start_pair; pair_idx < end_pair; pair_idx++) { + unsigned int i = pair_idx / ndata; + unsigned int j = pair_idx % ndata; + + if (i >= ndata || j >= ndata) continue; + + float phi0, q, W, YW, bls; + + // Non-wrapped transits: from i to j + if (j > i) { + phi0 = sh_phi[i]; + + // Compute q as midpoint to next excluded observation + if (j < ndata - 1 && j > 0) { + q = 0.5f * (sh_phi[j] + sh_phi[j - 1]) - phi0; + } else { + q = sh_phi[j] - phi0; + } + + if (q > 0.5f) continue; + + // Compute W and YW for observations i to j-1 using cumulative sums + W = (i == 0) ? sh_cumsum_w[j - 1] : sh_cumsum_w[j - 1] - sh_cumsum_w[i - 1]; + YW = (i == 0) ? 
sh_cumsum_yw[j - 1] : sh_cumsum_yw[j - 1] - sh_cumsum_yw[i - 1]; + YW -= ybar * W; + + bls = bls_power(YW, W, YY, ignore_negative_delta_sols); + + if (bls > thread_max_bls) { + thread_max_bls = bls; + thread_q = q; + thread_phi0 = phi0; + } + } + + // Wrapped transits: from i to end, then 0 to k + if (j < i) { + unsigned int k = j; + phi0 = sh_phi[i]; + + if (k > 0) { + q = (1.f - phi0) + 0.5f * (sh_phi[k - 1] + sh_phi[k]); + } else { + q = 1.f - phi0; + } + + if (q > 0.5f) continue; + + // W and YW = sum from i to end, plus 0 to k-1 + if (i > 0) { + W = (sh_cumsum_w[ndata - 1] - sh_cumsum_w[i - 1]); + YW = (sh_cumsum_yw[ndata - 1] - sh_cumsum_yw[i - 1]); + } else { + W = sh_cumsum_w[ndata - 1]; + YW = sh_cumsum_yw[ndata - 1]; + } + + if (k > 0) { + W += sh_cumsum_w[k - 1]; + YW += sh_cumsum_yw[k - 1]; + } + + YW -= ybar * W; + + bls = bls_power(YW, W, YY, ignore_negative_delta_sols); + + if (bls > thread_max_bls) { + thread_max_bls = bls; + thread_q = q; + thread_phi0 = phi0; + } + } + } + + // Store thread results + thread_results[tid] = thread_max_bls; + thread_results[blockDim.x + tid] = thread_q; + thread_results[2 * blockDim.x + tid] = thread_phi0; + __syncthreads(); + + // Step 7: Reduce across threads to find maximum BLS + for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2) { + if (tid < stride) { + float bls1 = thread_results[tid]; + float bls2 = thread_results[tid + stride]; + + if (bls2 > bls1) { + thread_results[tid] = bls2; + thread_results[blockDim.x + tid] = thread_results[blockDim.x + tid + stride]; + thread_results[2 * blockDim.x + tid] = thread_results[2 * blockDim.x + tid + stride]; + } + } + __syncthreads(); + } + + // Step 8: Write results to global memory + if (tid == 0) { + bls_powers[freq_idx] = thread_results[0]; + best_q[freq_idx] = thread_results[blockDim.x]; + best_phi[freq_idx] = thread_results[2 * blockDim.x]; + } + + // Move to next frequency + freq_idx += gridDim.x; + } +} diff --git 
a/cuvarbase/kernels/sparse_bls_simple.cu b/cuvarbase/kernels/sparse_bls_simple.cu new file mode 100644 index 0000000..99a61f8 --- /dev/null +++ b/cuvarbase/kernels/sparse_bls_simple.cu @@ -0,0 +1,254 @@ +#include <stdio.h> +#define RESTRICT __restrict__ +#define MIN_W 1E-9 +#define MAX_W_COMPLEMENT 1E-9 +//{CPP_DEFS} + +/** + * Simplified Sparse BLS CUDA Kernel for debugging + * + * This version uses a simpler O(N³) algorithm without fancy optimizations + * to help identify the source of hangs in the full implementation. + */ + +__device__ unsigned int get_id(){ + return blockIdx.x * blockDim.x + threadIdx.x; +} + +__device__ float mod1(float a){ + return a - floorf(a); +} + +__device__ float bls_power(float YW, float W, float YY, + unsigned int ignore_negative_delta_sols){ + if (ignore_negative_delta_sols && YW > 0.f) + return 0.f; + + if (W < MIN_W || W > 1.f - MAX_W_COMPLEMENT) + return 0.f; + + float bls = (YW * YW) / (W * (1.f - W) * YY); + return bls; +} + +/** + * Simplified sparse BLS kernel - each block handles one frequency + * Uses simple bubble sort and O(N³) algorithm to avoid complex synchronization + */ +__global__ void sparse_bls_kernel_simple( + const float* __restrict__ t, + const float* __restrict__ y, + const float* __restrict__ dy, + const float* __restrict__ freqs, + unsigned int ndata, + unsigned int nfreqs, + unsigned int ignore_negative_delta_sols, + float* __restrict__ bls_powers, + float* __restrict__ best_q, + float* __restrict__ best_phi) +{ + // Shared memory for this block + extern __shared__ float shared_mem[]; + + float* sh_phi = shared_mem; + float* sh_y = &shared_mem[ndata]; + float* sh_w = &shared_mem[2 * ndata]; + float* sh_ybar_tmp = &shared_mem[3 * ndata]; // For reduction + + unsigned int freq_idx = blockIdx.x; + unsigned int tid = threadIdx.x; + + while (freq_idx < nfreqs) { + float freq = freqs[freq_idx]; + + // Step 1: Load data and compute phases + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + float phi = mod1(t[i] * 
freq); + float weight = 1.f / (dy[i] * dy[i]); + + sh_phi[i] = phi; + sh_y[i] = y[i]; + sh_w[i] = weight; + } + __syncthreads(); + + // Step 2a: Compute sum of weights - parallel + float local_sum_w = 0.f; + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + local_sum_w += sh_w[i]; + } + sh_ybar_tmp[tid] = local_sum_w; + __syncthreads(); + + // Reduce to get total + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && tid + s < blockDim.x) { + sh_ybar_tmp[tid] += sh_ybar_tmp[tid + s]; + } + __syncthreads(); + } + + float sum_w = sh_ybar_tmp[0]; + __syncthreads(); + + // Step 2b: Normalize weights - parallel + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + sh_w[i] /= sum_w; + } + __syncthreads(); + + // Step 3: Compute ybar - parallel reduction + float local_ybar = 0.f; + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + local_ybar += sh_w[i] * sh_y[i]; + } + sh_ybar_tmp[tid] = local_ybar; + __syncthreads(); + + // Reduce in shared memory + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && tid + s < blockDim.x) { + sh_ybar_tmp[tid] += sh_ybar_tmp[tid + s]; + } + __syncthreads(); + } + + float ybar = sh_ybar_tmp[0]; + __syncthreads(); + + // Step 4: Compute YY - parallel reduction + float local_YY = 0.f; + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + float diff = sh_y[i] - ybar; + local_YY += sh_w[i] * diff * diff; + } + sh_ybar_tmp[tid] = local_YY; + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && tid + s < blockDim.x) { + sh_ybar_tmp[tid] += sh_ybar_tmp[tid + s]; + } + __syncthreads(); + } + + float YY = sh_ybar_tmp[0]; + __syncthreads(); + + // Step 5: Simple bubble sort by phase (single thread) + if (tid == 0) { + for (unsigned int i = 0; i < ndata - 1; i++) { + for (unsigned int j = 0; j < ndata - i - 1; j++) { + if (sh_phi[j] > sh_phi[j + 1]) { + // Swap all arrays + float tmp_phi = sh_phi[j]; + sh_phi[j] = sh_phi[j + 1]; + 
sh_phi[j + 1] = tmp_phi; + + float tmp_y = sh_y[j]; + sh_y[j] = sh_y[j + 1]; + sh_y[j + 1] = tmp_y; + + float tmp_w = sh_w[j]; + sh_w[j] = sh_w[j + 1]; + sh_w[j + 1] = tmp_w; + } + } + } + } + __syncthreads(); + + // Step 6: Test all transit pairs (single thread for simplicity) + if (tid == 0) { + float max_bls = 0.f; + float best_q_val = 0.f; + float best_phi_val = 0.f; + + + // Non-wrapped transits + for (unsigned int i = 0; i < ndata; i++) { + for (unsigned int j = i + 1; j <= ndata; j++) { // Note: j == ndata is a special case for computing q, not for including observation j (which would be out of bounds) + float phi0 = sh_phi[i]; + // Compute q properly - match CPU implementation + float q; + if (j < ndata) { + // Transit ends before observation j + if (j < ndata) { + q = 0.5f * (sh_phi[j] + sh_phi[j-1]) - phi0; + } else { + q = sh_phi[j] - phi0; + } + } else { + // Transit includes all remaining observations + q = sh_phi[ndata - 1] - phi0; + } + + if (q <= 0.f || q > 0.5f) continue; + + // Compute W and YW for observations i to j-1 + float W = 0.f; + float YW = 0.f; + for (unsigned int k = i; k < j && k < ndata; k++) { + W += sh_w[k]; + YW += sh_w[k] * sh_y[k]; + } + YW -= ybar * W; + + float bls = bls_power(YW, W, YY, ignore_negative_delta_sols); + + + if (bls > max_bls) { + max_bls = bls; + best_q_val = q; + best_phi_val = phi0; + } + } + + // Wrapped transits: from i to end, then 0 to k + for (unsigned int k = 0; k < i; k++) { + float phi0 = sh_phi[i]; + float q; + if (k > 0) { + q = (1.f - sh_phi[i]) + 0.5f * (sh_phi[k-1] + sh_phi[k]); + } else { + q = 1.f - sh_phi[i]; + } + + if (q <= 0.f || q > 0.5f) continue; + + // Compute W and YW: from i to end, plus 0 to k + float W = 0.f; + float YW = 0.f; + for (unsigned int m = i; m < ndata; m++) { + W += sh_w[m]; + YW += sh_w[m] * sh_y[m]; + } + for (unsigned int m = 0; m < k; m++) { + W += sh_w[m]; + YW += sh_w[m] * sh_y[m]; + } + YW -= ybar * W; + + float bls = bls_power(YW, W, YY, 
ignore_negative_delta_sols); + + + if (bls > max_bls) { + max_bls = bls; + best_q_val = q; + best_phi_val = phi0; + } + } + } + + // Store results + bls_powers[freq_idx] = max_bls; + best_q[freq_idx] = best_q_val; + best_phi[freq_idx] = best_phi_val; + + } + __syncthreads(); + + // Move to next frequency + freq_idx += gridDim.x; + } +} diff --git a/cuvarbase/tests/test_bls.py b/cuvarbase/tests/test_bls.py index 66829d6..77811d4 100644 --- a/cuvarbase/tests/test_bls.py +++ b/cuvarbase/tests/test_bls.py @@ -6,7 +6,7 @@ from ..bls import eebls_gpu, eebls_transit_gpu, \ q_transit, compile_bls, hone_solution,\ single_bls, eebls_gpu_custom, eebls_gpu_fast, \ - sparse_bls_cpu, eebls_transit + sparse_bls_cpu, sparse_bls_gpu, eebls_transit def transit_model(phi0, q, delta, q1=0.): @@ -481,6 +481,78 @@ def test_sparse_bls(self, freq, q, phi0, ndata, ignore_negative_delta_sols): best_freq = freqs[np.argmax(power_sparse)] assert np.abs(best_freq - freq) < 10 * df # Allow more tolerance for sparse + @pytest.mark.parametrize("freq", [1.0, 2.0]) + @pytest.mark.parametrize("q", [0.02, 0.1]) + @pytest.mark.parametrize("phi0", [0.0, 0.5]) + @pytest.mark.parametrize("ndata", [50, 100, 200]) + @pytest.mark.parametrize("ignore_negative_delta_sols", [True, False]) + @mark_cuda_test + def test_sparse_bls_gpu(self, freq, q, phi0, ndata, ignore_negative_delta_sols): + """Test GPU sparse BLS implementation against CPU sparse BLS""" + t, y, dy = data(snr=10, q=q, phi0=phi0, freq=freq, + baseline=365., ndata=ndata) + + # Test a few frequencies around the true frequency + df = q / (10 * (max(t) - min(t))) + freqs = np.linspace(freq - 5 * df, freq + 5 * df, 11) + + # Run CPU sparse BLS + power_cpu, sols_cpu = sparse_bls_cpu(t, y, dy, freqs, + ignore_negative_delta_sols=ignore_negative_delta_sols) + + # Run GPU sparse BLS + power_gpu, sols_gpu = sparse_bls_gpu(t, y, dy, freqs, + ignore_negative_delta_sols=ignore_negative_delta_sols) + + # Compare CPU and GPU results + # Powers should match 
closely + assert_allclose(power_cpu, power_gpu, rtol=1e-4, atol=1e-6, + err_msg=f"Power mismatch for freq={freq}, q={q}, phi0={phi0}") + + # Solutions should match closely + for i, (f, (q_cpu, phi_cpu), (q_gpu, phi_gpu)) in enumerate( + zip(freqs, sols_cpu, sols_gpu)): + # q values should match + assert np.abs(q_cpu - q_gpu) < 1e-4, \ + f"q mismatch at freq={f}: cpu={q_cpu}, gpu={q_gpu}" + + # phi values should match (accounting for wrapping) + phi_diff = np.abs(phi_cpu - phi_gpu) + phi_diff = min(phi_diff, 1.0 - phi_diff) # Account for phase wrapping + assert phi_diff < 1e-4, \ + f"phi mismatch at freq={f}: cpu={phi_cpu}, gpu={phi_gpu}" + + # Both should find peak near true frequency + best_freq_cpu = freqs[np.argmax(power_cpu)] + best_freq_gpu = freqs[np.argmax(power_gpu)] + assert np.abs(best_freq_cpu - best_freq_gpu) < df, \ + f"Best freq mismatch: cpu={best_freq_cpu}, gpu={best_freq_gpu}" + + @pytest.mark.parametrize("freq", [1.0]) + @pytest.mark.parametrize("q", [0.05]) + @pytest.mark.parametrize("phi0", [0.0, 0.9]) # Test both non-wrapped and wrapped + @pytest.mark.parametrize("ndata", [100]) + @mark_cuda_test + def test_sparse_bls_gpu_vs_single(self, freq, q, phi0, ndata): + """Test that GPU sparse BLS solutions match single_bls""" + t, y, dy = data(snr=20, q=q, phi0=phi0, freq=freq, + baseline=365., ndata=ndata) + + # Test a few frequencies + df = q / (10 * (max(t) - min(t))) + freqs = np.linspace(freq - 3 * df, freq + 3 * df, 7) + + # Run GPU sparse BLS + power_gpu, sols_gpu = sparse_bls_gpu(t, y, dy, freqs) + + # Verify against single_bls + for i, (f, (q_gpu, phi_gpu)) in enumerate(zip(freqs, sols_gpu)): + p_single = single_bls(t, y, dy, f, q_gpu, phi_gpu) + + # The GPU BLS result should match single_bls with the parameters it found + assert np.abs(power_gpu[i] - p_single) < 1e-4, \ + f"Mismatch at freq={f}: gpu={power_gpu[i]}, single={p_single}" + @pytest.mark.parametrize("ndata", [50, 100]) @pytest.mark.parametrize("use_sparse_override", [None, True, 
False]) def test_eebls_transit_auto_select(self, ndata, use_sparse_override): diff --git a/pyproject.toml b/pyproject.toml index 69d43b7..8b18804 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "cuvarbase" dynamic = ["version"] description = "Period-finding and variability on the GPU" readme = "README.rst" -requires-python = ">=3.7" +requires-python = ">=3.8" license = {text = "GPL-3.0"} authors = [ {name = "John Hoffman", email = "johnh2o2@gmail.com"} @@ -20,7 +20,6 @@ classifiers = [ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Natural Language :: English", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10",