diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 21fd1ef..92bb055 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 diff --git a/cuvarbase/bls.py b/cuvarbase/bls.py index 8d0a3a6..ced49b8 100644 --- a/cuvarbase/bls.py +++ b/cuvarbase/bls.py @@ -1150,6 +1150,139 @@ def sparse_bls_cpu(t, y, dy, freqs, ignore_negative_delta_sols=False): return bls_powers, solutions +def compile_sparse_bls(block_size=_default_block_size, use_simple=True, **kwargs): + """ + Compile sparse BLS GPU kernel + + Parameters + ---------- + block_size: int, optional (default: _default_block_size) + CUDA threads per CUDA block. + use_simple: bool, optional (default: True) + Use simplified kernel (more reliable, slightly slower) + + Returns + ------- + kernel: PyCUDA function + The compiled sparse_bls_kernel function + """ + # Read kernel - use simple version by default (it works!) + kernel_name = 'sparse_bls_simple' if use_simple else 'sparse_bls' + cppd = dict(BLOCK_SIZE=block_size) + kernel_txt = _module_reader(find_kernel(kernel_name), + cpp_defs=cppd) + + # compile kernel + module = SourceModule(kernel_txt, options=['--use_fast_math']) + + func_name = 'sparse_bls_kernel_simple' if use_simple else 'sparse_bls_kernel' + kernel = module.get_function(func_name) + + # Don't use prepare() - it causes issues with large shared memory + return kernel + + +def sparse_bls_gpu(t, y, dy, freqs, ignore_negative_delta_sols=False, + block_size=64, max_ndata=None, + stream=None, kernel=None): + """ + GPU-accelerated sparse BLS implementation. + + Uses a CUDA kernel to test all pairs of observations as potential + transit boundaries. More efficient than CPU implementation for datasets + with ~100-1000 observations. 
+ + Based on https://arxiv.org/abs/2103.06193 + + Parameters + ---------- + t: array_like, float + Observation times + y: array_like, float + Observations + dy: array_like, float + Observation uncertainties + freqs: array_like, float + Frequencies to test + ignore_negative_delta_sols: bool, optional (default: False) + Whether or not to ignore solutions with negative delta (inverted dips) + block_size: int, optional (default: 64) + CUDA threads per CUDA block (use 32-128 for best performance) + max_ndata: int, optional (default: None) + Maximum number of data points (for shared memory allocation). + If None, uses len(t) + stream: pycuda.driver.Stream, optional (default: None) + CUDA stream for async execution + kernel: PyCUDA function, optional (default: None) + Pre-compiled kernel. If None, compiles kernel automatically. + + Returns + ------- + bls_powers: array_like, float + BLS power at each frequency + solutions: list of (q, phi0) tuples + Best (q, phi0) solution at each frequency + """ + # Convert to numpy arrays + t = np.asarray(t).astype(np.float32) + y = np.asarray(y).astype(np.float32) + dy = np.asarray(dy).astype(np.float32) + freqs = np.asarray(freqs).astype(np.float32) + + ndata = len(t) + nfreqs = len(freqs) + + if max_ndata is None: + max_ndata = ndata + + # Compile kernel if not provided + if kernel is None: + kernel = compile_sparse_bls(block_size=block_size) + + # Allocate GPU memory + t_g = gpuarray.to_gpu(t) + y_g = gpuarray.to_gpu(y) + dy_g = gpuarray.to_gpu(dy) + freqs_g = gpuarray.to_gpu(freqs) + + bls_powers_g = gpuarray.zeros(nfreqs, dtype=np.float32) + best_q_g = gpuarray.zeros(nfreqs, dtype=np.float32) + best_phi_g = gpuarray.zeros(nfreqs, dtype=np.float32) + + # Calculate shared memory size + # Simple kernel needs: 3 data arrays (phi, y, w) + 1 temp array for reductions + # Allocate for blockDim from function parameter (block_size) to be safe + shared_mem_size = (3 * max_ndata + block_size) * 4 + + # Launch kernel + # Grid: one block per 
frequency (or fewer if limited by hardware) + max_blocks = 65535 # CUDA maximum + grid = (min(nfreqs, max_blocks), 1) + block = (block_size, 1, 1) + + if stream is None: + stream = cuda.Stream() + + # Call kernel without prepare() to avoid resource issues + kernel( + t_g, y_g, dy_g, freqs_g, + np.uint32(ndata), np.uint32(nfreqs), + np.uint32(ignore_negative_delta_sols), + bls_powers_g, best_q_g, best_phi_g, + block=block, grid=grid, stream=stream, + shared=shared_mem_size + ) + + # Copy results back + stream.synchronize() + bls_powers = bls_powers_g.get() + best_q = best_q_g.get() + best_phi = best_phi_g.get() + + solutions = list(zip(best_q, best_phi)) + return bls_powers, solutions + + def eebls_transit(t, y, dy, fmax_frac=1.0, fmin_frac=1.0, qmin_fac=0.5, qmax_fac=2.0, fmin=None, fmax=None, freqs=None, qvals=None, use_fast=False, diff --git a/cuvarbase/kernels/sparse_bls.cu b/cuvarbase/kernels/sparse_bls.cu new file mode 100644 index 0000000..d5a290e --- /dev/null +++ b/cuvarbase/kernels/sparse_bls.cu @@ -0,0 +1,367 @@ +#include <stdio.h> +#define RESTRICT __restrict__ +#define CONSTANT const +#define MIN_W 1E-9 +#define MAX_W_COMPLEMENT 1E-9 +//{CPP_DEFS} + +/** + * Sparse BLS CUDA Kernel + * + * Implementation of sparse Box Least Squares algorithm based on + * https://arxiv.org/abs/2103.06193 + * + * Instead of binning, this algorithm tests all pairs of sorted observations + * as potential transit boundaries. This is more efficient for small datasets + * (ndata < ~500) where the O(N²) complexity per frequency is acceptable. 
+ */ + +__device__ unsigned int get_id(){ + return blockIdx.x * blockDim.x + threadIdx.x; +} + +__device__ float mod1(float a){ + return a - floorf(a); +} + +/** + * Compute BLS power for given parameters + * + * @param YW: Weighted sum of y values in transit + * @param W: Sum of weights in transit + * @param YY: Total variance normalization + * @param ignore_negative_delta_sols: If true, ignore inverted dips (YW > 0) + * @return: BLS power value + */ +__device__ float bls_power(float YW, float W, float YY, + unsigned int ignore_negative_delta_sols){ + // Check if we should ignore this solution + if (ignore_negative_delta_sols && YW > 0.f) + return 0.f; + + // Check weight bounds + if (W < MIN_W || W > 1.f - MAX_W_COMPLEMENT) + return 0.f; + + // Compute BLS: (YW)² / (W * (1-W) * YY) + float bls = (YW * YW) / (W * (1.f - W) * YY); + return bls; +} + +/** + * Bitonic sort for sorting observations by phase within shared memory + * Uses cooperative sorting across all threads in the block + * + * @param sh_phi: Shared memory array of phases + * @param sh_y: Shared memory array of y values + * @param sh_w: Shared memory array of weights + * @param sh_indices: Shared memory array of original indices + * @param n: Number of elements to sort + */ +__device__ void bitonic_sort_by_phase(float* sh_phi, float* sh_y, float* sh_w, + int* sh_indices, unsigned int n){ + unsigned int tid = threadIdx.x; + + // Bitonic sort: repeatedly merge sorted sequences + for (unsigned int k = 2; k <= n; k *= 2) { + for (unsigned int j = k / 2; j > 0; j /= 2) { + unsigned int ixj = tid ^ j; + + if (ixj > tid && tid < n && ixj < n) { + // Determine sort direction + bool ascending = ((tid & k) == 0); + bool swap = (sh_phi[tid] > sh_phi[ixj]) == ascending; + + if (swap) { + // Swap all arrays in lockstep + float tmp_phi = sh_phi[tid]; + float tmp_y = sh_y[tid]; + float tmp_w = sh_w[tid]; + int tmp_idx = sh_indices[tid]; + + sh_phi[tid] = sh_phi[ixj]; + sh_y[tid] = sh_y[ixj]; + sh_w[tid] = 
sh_w[ixj]; + sh_indices[tid] = sh_indices[ixj]; + + sh_phi[ixj] = tmp_phi; + sh_y[ixj] = tmp_y; + sh_w[ixj] = tmp_w; + sh_indices[ixj] = tmp_idx; + } + } + __syncthreads(); + } + } +} + +/** + * Main sparse BLS kernel + * + * Each thread block handles one frequency. Within each block: + * 1. Compute phases for all observations at this frequency + * 2. Sort observations by phase in shared memory + * 3. Test all pairs of observations as potential transit boundaries + * 4. Find maximum BLS power and corresponding (q, phi0) + * + * @param t: Observation times [ndata] + * @param y: Observation values [ndata] + * @param dy: Observation uncertainties [ndata] + * @param freqs: Frequencies to test [nfreqs] + * @param ndata: Number of observations + * @param nfreqs: Number of frequencies + * @param ignore_negative_delta_sols: Whether to ignore inverted dips + * @param bls_powers: Output BLS powers [nfreqs] + * @param best_q: Output best q values [nfreqs] + * @param best_phi: Output best phi0 values [nfreqs] + */ +__global__ void sparse_bls_kernel( + const float* __restrict__ t, + const float* __restrict__ y, + const float* __restrict__ dy, + const float* __restrict__ freqs, + unsigned int ndata, + unsigned int nfreqs, + unsigned int ignore_negative_delta_sols, + float* __restrict__ bls_powers, + float* __restrict__ best_q, + float* __restrict__ best_phi) +{ + // Shared memory layout: + // [phi, y, w, indices, cumsum_w, cumsum_yw, thread_max_bls, thread_best_q, thread_best_phi] + extern __shared__ float shared_mem[]; + + float* sh_phi = shared_mem; // ndata floats + float* sh_y = &shared_mem[ndata]; // ndata floats + float* sh_w = &shared_mem[2 * ndata]; // ndata floats + int* sh_indices = (int*)&shared_mem[3 * ndata]; // ndata ints + float* sh_cumsum_w = &shared_mem[3 * ndata + ndata]; // ndata floats + float* sh_cumsum_yw = &shared_mem[4 * ndata + ndata];// ndata floats + float* thread_results = &shared_mem[5 * ndata + ndata]; // blockDim.x * 3 floats + + unsigned int 
freq_idx = blockIdx.x; + unsigned int tid = threadIdx.x; + + // Loop over frequencies (in case we have more frequencies than blocks) + while (freq_idx < nfreqs) { + float freq = freqs[freq_idx]; + + // Step 1: Load data and compute phases + // Each thread loads multiple elements if ndata > blockDim.x + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + float phi = mod1(t[i] * freq); + float weight = 1.f / (dy[i] * dy[i]); + + sh_phi[i] = phi; + sh_y[i] = y[i]; + sh_w[i] = weight; + sh_indices[i] = i; + } + __syncthreads(); + + // Step 2: Normalize weights + float sum_w = 0.f; + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + sum_w += sh_w[i]; + } + + // Reduce sum_w across threads + __shared__ float block_sum_w; + if (tid == 0) block_sum_w = 0.f; + __syncthreads(); + + atomicAdd(&block_sum_w, sum_w); + __syncthreads(); + + // Normalize weights + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + sh_w[i] /= block_sum_w; + } + __syncthreads(); + + // Step 3: Compute ybar and YY (normalization) + float ybar = 0.f; + float YY = 0.f; + + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + ybar += sh_w[i] * sh_y[i]; + } + + __shared__ float block_ybar; + if (tid == 0) block_ybar = 0.f; + __syncthreads(); + + atomicAdd(&block_ybar, ybar); + __syncthreads(); + + ybar = block_ybar; + + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + float diff = sh_y[i] - ybar; + YY += sh_w[i] * diff * diff; + } + + __shared__ float block_YY; + if (tid == 0) block_YY = 0.f; + __syncthreads(); + + atomicAdd(&block_YY, YY); + __syncthreads(); + + YY = block_YY; + + // Step 4: Sort by phase using bitonic sort + // Pad to next power of 2 for bitonic sort + unsigned int n_padded = 1; + while (n_padded < ndata) n_padded *= 2; + + // Pad with large phase values + for (unsigned int i = ndata + tid; i < n_padded; i += blockDim.x) { + if (i < n_padded) { + sh_phi[i] = 2.f; // Larger than any valid phase + sh_y[i] = 0.f; + sh_w[i] = 0.f; + sh_indices[i] = 
-1; + } + } + __syncthreads(); + + bitonic_sort_by_phase(sh_phi, sh_y, sh_w, sh_indices, n_padded); + + // Step 5: Compute cumulative sums for fast range queries + // Using prefix sum + for (unsigned int stride = 1; stride < ndata; stride *= 2) { + __syncthreads(); + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + if (i >= stride) { + float temp_w = sh_cumsum_w[i - stride]; + float temp_yw = sh_cumsum_yw[i - stride]; + __syncthreads(); + sh_cumsum_w[i] = sh_w[i] + temp_w; + sh_cumsum_yw[i] = sh_w[i] * sh_y[i] + temp_yw; + } else { + sh_cumsum_w[i] = sh_w[i]; + sh_cumsum_yw[i] = sh_w[i] * sh_y[i]; + } + } + } + __syncthreads(); + + // Step 6: Each thread tests a subset of transit pairs + float thread_max_bls = 0.f; + float thread_q = 0.f; + float thread_phi0 = 0.f; + + // Total number of pairs to test: ndata * ndata + unsigned long long total_pairs = (unsigned long long)ndata * (unsigned long long)ndata; + unsigned long long pairs_per_thread = (total_pairs + blockDim.x - 1) / blockDim.x; + + unsigned long long start_pair = (unsigned long long)tid * pairs_per_thread; + unsigned long long end_pair = min(start_pair + pairs_per_thread, total_pairs); + + for (unsigned long long pair_idx = start_pair; pair_idx < end_pair; pair_idx++) { + unsigned int i = pair_idx / ndata; + unsigned int j = pair_idx % ndata; + + if (i >= ndata || j >= ndata) continue; + + float phi0, q, W, YW, bls; + + // Non-wrapped transits: from i to j + if (j > i) { + phi0 = sh_phi[i]; + + // Compute q as midpoint to next excluded observation + if (j < ndata - 1 && j > 0) { + q = 0.5f * (sh_phi[j] + sh_phi[j - 1]) - phi0; + } else { + q = sh_phi[j] - phi0; + } + + if (q > 0.5f) continue; + + // Compute W and YW for observations i to j-1 using cumulative sums + W = (i == 0) ? sh_cumsum_w[j - 1] : sh_cumsum_w[j - 1] - sh_cumsum_w[i - 1]; + YW = (i == 0) ? 
sh_cumsum_yw[j - 1] : sh_cumsum_yw[j - 1] - sh_cumsum_yw[i - 1]; + YW -= ybar * W; + + bls = bls_power(YW, W, YY, ignore_negative_delta_sols); + + if (bls > thread_max_bls) { + thread_max_bls = bls; + thread_q = q; + thread_phi0 = phi0; + } + } + + // Wrapped transits: from i to end, then 0 to k + if (j < i) { + unsigned int k = j; + phi0 = sh_phi[i]; + + if (k > 0) { + q = (1.f - phi0) + 0.5f * (sh_phi[k - 1] + sh_phi[k]); + } else { + q = 1.f - phi0; + } + + if (q > 0.5f) continue; + + // W and YW = sum from i to end, plus 0 to k-1 + if (i > 0) { + W = (sh_cumsum_w[ndata - 1] - sh_cumsum_w[i - 1]); + YW = (sh_cumsum_yw[ndata - 1] - sh_cumsum_yw[i - 1]); + } else { + W = sh_cumsum_w[ndata - 1]; + YW = sh_cumsum_yw[ndata - 1]; + } + + if (k > 0) { + W += sh_cumsum_w[k - 1]; + YW += sh_cumsum_yw[k - 1]; + } + + YW -= ybar * W; + + bls = bls_power(YW, W, YY, ignore_negative_delta_sols); + + if (bls > thread_max_bls) { + thread_max_bls = bls; + thread_q = q; + thread_phi0 = phi0; + } + } + } + + // Store thread results + thread_results[tid] = thread_max_bls; + thread_results[blockDim.x + tid] = thread_q; + thread_results[2 * blockDim.x + tid] = thread_phi0; + __syncthreads(); + + // Step 7: Reduce across threads to find maximum BLS + for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2) { + if (tid < stride) { + float bls1 = thread_results[tid]; + float bls2 = thread_results[tid + stride]; + + if (bls2 > bls1) { + thread_results[tid] = bls2; + thread_results[blockDim.x + tid] = thread_results[blockDim.x + tid + stride]; + thread_results[2 * blockDim.x + tid] = thread_results[2 * blockDim.x + tid + stride]; + } + } + __syncthreads(); + } + + // Step 8: Write results to global memory + if (tid == 0) { + bls_powers[freq_idx] = thread_results[0]; + best_q[freq_idx] = thread_results[blockDim.x]; + best_phi[freq_idx] = thread_results[2 * blockDim.x]; + } + + // Move to next frequency + freq_idx += gridDim.x; + } +} diff --git 
a/cuvarbase/kernels/sparse_bls_simple.cu b/cuvarbase/kernels/sparse_bls_simple.cu new file mode 100644 index 0000000..99a61f8 --- /dev/null +++ b/cuvarbase/kernels/sparse_bls_simple.cu @@ -0,0 +1,254 @@ +#include <stdio.h> +#define RESTRICT __restrict__ +#define MIN_W 1E-9 +#define MAX_W_COMPLEMENT 1E-9 +//{CPP_DEFS} + +/** + * Simplified Sparse BLS CUDA Kernel for debugging + * + * This version uses a simpler O(N³) algorithm without fancy optimizations + * to help identify the source of hangs in the full implementation. + */ + +__device__ unsigned int get_id(){ + return blockIdx.x * blockDim.x + threadIdx.x; +} + +__device__ float mod1(float a){ + return a - floorf(a); +} + +__device__ float bls_power(float YW, float W, float YY, + unsigned int ignore_negative_delta_sols){ + if (ignore_negative_delta_sols && YW > 0.f) + return 0.f; + + if (W < MIN_W || W > 1.f - MAX_W_COMPLEMENT) + return 0.f; + + float bls = (YW * YW) / (W * (1.f - W) * YY); + return bls; +} + +/** + * Simplified sparse BLS kernel - each block handles one frequency + * Uses simple bubble sort and O(N³) algorithm to avoid complex synchronization + */ +__global__ void sparse_bls_kernel_simple( + const float* __restrict__ t, + const float* __restrict__ y, + const float* __restrict__ dy, + const float* __restrict__ freqs, + unsigned int ndata, + unsigned int nfreqs, + unsigned int ignore_negative_delta_sols, + float* __restrict__ bls_powers, + float* __restrict__ best_q, + float* __restrict__ best_phi) +{ + // Shared memory for this block + extern __shared__ float shared_mem[]; + + float* sh_phi = shared_mem; + float* sh_y = &shared_mem[ndata]; + float* sh_w = &shared_mem[2 * ndata]; + float* sh_ybar_tmp = &shared_mem[3 * ndata]; // For reduction + + unsigned int freq_idx = blockIdx.x; + unsigned int tid = threadIdx.x; + + while (freq_idx < nfreqs) { + float freq = freqs[freq_idx]; + + // Step 1: Load data and compute phases + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + float phi = mod1(t[i] * 
freq); + float weight = 1.f / (dy[i] * dy[i]); + + sh_phi[i] = phi; + sh_y[i] = y[i]; + sh_w[i] = weight; + } + __syncthreads(); + + // Step 2a: Compute sum of weights - parallel + float local_sum_w = 0.f; + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + local_sum_w += sh_w[i]; + } + sh_ybar_tmp[tid] = local_sum_w; + __syncthreads(); + + // Reduce to get total + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && tid + s < blockDim.x) { + sh_ybar_tmp[tid] += sh_ybar_tmp[tid + s]; + } + __syncthreads(); + } + + float sum_w = sh_ybar_tmp[0]; + __syncthreads(); + + // Step 2b: Normalize weights - parallel + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + sh_w[i] /= sum_w; + } + __syncthreads(); + + // Step 3: Compute ybar - parallel reduction + float local_ybar = 0.f; + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + local_ybar += sh_w[i] * sh_y[i]; + } + sh_ybar_tmp[tid] = local_ybar; + __syncthreads(); + + // Reduce in shared memory + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && tid + s < blockDim.x) { + sh_ybar_tmp[tid] += sh_ybar_tmp[tid + s]; + } + __syncthreads(); + } + + float ybar = sh_ybar_tmp[0]; + __syncthreads(); + + // Step 4: Compute YY - parallel reduction + float local_YY = 0.f; + for (unsigned int i = tid; i < ndata; i += blockDim.x) { + float diff = sh_y[i] - ybar; + local_YY += sh_w[i] * diff * diff; + } + sh_ybar_tmp[tid] = local_YY; + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && tid + s < blockDim.x) { + sh_ybar_tmp[tid] += sh_ybar_tmp[tid + s]; + } + __syncthreads(); + } + + float YY = sh_ybar_tmp[0]; + __syncthreads(); + + // Step 5: Simple bubble sort by phase (single thread) + if (tid == 0) { + for (unsigned int i = 0; i < ndata - 1; i++) { + for (unsigned int j = 0; j < ndata - i - 1; j++) { + if (sh_phi[j] > sh_phi[j + 1]) { + // Swap all arrays + float tmp_phi = sh_phi[j]; + sh_phi[j] = sh_phi[j + 1]; + 
sh_phi[j + 1] = tmp_phi; + + float tmp_y = sh_y[j]; + sh_y[j] = sh_y[j + 1]; + sh_y[j + 1] = tmp_y; + + float tmp_w = sh_w[j]; + sh_w[j] = sh_w[j + 1]; + sh_w[j + 1] = tmp_w; + } + } + } + } + __syncthreads(); + + // Step 6: Test all transit pairs (single thread for simplicity) + if (tid == 0) { + float max_bls = 0.f; + float best_q_val = 0.f; + float best_phi_val = 0.f; + + + // Non-wrapped transits + for (unsigned int i = 0; i < ndata; i++) { + for (unsigned int j = i + 1; j <= ndata; j++) { // Note: j == ndata is a special case for computing q, not for including observation j (which would be out of bounds) + float phi0 = sh_phi[i]; + // Compute q properly - match CPU implementation + float q; + if (j < ndata) { + // Transit ends before observation j + if (j < ndata) { + q = 0.5f * (sh_phi[j] + sh_phi[j-1]) - phi0; + } else { + q = sh_phi[j] - phi0; + } + } else { + // Transit includes all remaining observations + q = sh_phi[ndata - 1] - phi0; + } + + if (q <= 0.f || q > 0.5f) continue; + + // Compute W and YW for observations i to j-1 + float W = 0.f; + float YW = 0.f; + for (unsigned int k = i; k < j && k < ndata; k++) { + W += sh_w[k]; + YW += sh_w[k] * sh_y[k]; + } + YW -= ybar * W; + + float bls = bls_power(YW, W, YY, ignore_negative_delta_sols); + + + if (bls > max_bls) { + max_bls = bls; + best_q_val = q; + best_phi_val = phi0; + } + } + + // Wrapped transits: from i to end, then 0 to k + for (unsigned int k = 0; k < i; k++) { + float phi0 = sh_phi[i]; + float q; + if (k > 0) { + q = (1.f - sh_phi[i]) + 0.5f * (sh_phi[k-1] + sh_phi[k]); + } else { + q = 1.f - sh_phi[i]; + } + + if (q <= 0.f || q > 0.5f) continue; + + // Compute W and YW: from i to end, plus 0 to k + float W = 0.f; + float YW = 0.f; + for (unsigned int m = i; m < ndata; m++) { + W += sh_w[m]; + YW += sh_w[m] * sh_y[m]; + } + for (unsigned int m = 0; m < k; m++) { + W += sh_w[m]; + YW += sh_w[m] * sh_y[m]; + } + YW -= ybar * W; + + float bls = bls_power(YW, W, YY, 
ignore_negative_delta_sols); + + + if (bls > max_bls) { + max_bls = bls; + best_q_val = q; + best_phi_val = phi0; + } + } + } + + // Store results + bls_powers[freq_idx] = max_bls; + best_q[freq_idx] = best_q_val; + best_phi[freq_idx] = best_phi_val; + + } + __syncthreads(); + + // Move to next frequency + freq_idx += gridDim.x; + } +} diff --git a/cuvarbase/tests/test_bls.py b/cuvarbase/tests/test_bls.py index 66829d6..77811d4 100644 --- a/cuvarbase/tests/test_bls.py +++ b/cuvarbase/tests/test_bls.py @@ -6,7 +6,7 @@ from ..bls import eebls_gpu, eebls_transit_gpu, \ q_transit, compile_bls, hone_solution,\ single_bls, eebls_gpu_custom, eebls_gpu_fast, \ - sparse_bls_cpu, eebls_transit + sparse_bls_cpu, sparse_bls_gpu, eebls_transit def transit_model(phi0, q, delta, q1=0.): @@ -481,6 +481,78 @@ def test_sparse_bls(self, freq, q, phi0, ndata, ignore_negative_delta_sols): best_freq = freqs[np.argmax(power_sparse)] assert np.abs(best_freq - freq) < 10 * df # Allow more tolerance for sparse + @pytest.mark.parametrize("freq", [1.0, 2.0]) + @pytest.mark.parametrize("q", [0.02, 0.1]) + @pytest.mark.parametrize("phi0", [0.0, 0.5]) + @pytest.mark.parametrize("ndata", [50, 100, 200]) + @pytest.mark.parametrize("ignore_negative_delta_sols", [True, False]) + @mark_cuda_test + def test_sparse_bls_gpu(self, freq, q, phi0, ndata, ignore_negative_delta_sols): + """Test GPU sparse BLS implementation against CPU sparse BLS""" + t, y, dy = data(snr=10, q=q, phi0=phi0, freq=freq, + baseline=365., ndata=ndata) + + # Test a few frequencies around the true frequency + df = q / (10 * (max(t) - min(t))) + freqs = np.linspace(freq - 5 * df, freq + 5 * df, 11) + + # Run CPU sparse BLS + power_cpu, sols_cpu = sparse_bls_cpu(t, y, dy, freqs, + ignore_negative_delta_sols=ignore_negative_delta_sols) + + # Run GPU sparse BLS + power_gpu, sols_gpu = sparse_bls_gpu(t, y, dy, freqs, + ignore_negative_delta_sols=ignore_negative_delta_sols) + + # Compare CPU and GPU results + # Powers should match 
closely + assert_allclose(power_cpu, power_gpu, rtol=1e-4, atol=1e-6, + err_msg=f"Power mismatch for freq={freq}, q={q}, phi0={phi0}") + + # Solutions should match closely + for i, (f, (q_cpu, phi_cpu), (q_gpu, phi_gpu)) in enumerate( + zip(freqs, sols_cpu, sols_gpu)): + # q values should match + assert np.abs(q_cpu - q_gpu) < 1e-4, \ + f"q mismatch at freq={f}: cpu={q_cpu}, gpu={q_gpu}" + + # phi values should match (accounting for wrapping) + phi_diff = np.abs(phi_cpu - phi_gpu) + phi_diff = min(phi_diff, 1.0 - phi_diff) # Account for phase wrapping + assert phi_diff < 1e-4, \ + f"phi mismatch at freq={f}: cpu={phi_cpu}, gpu={phi_gpu}" + + # Both should find peak near true frequency + best_freq_cpu = freqs[np.argmax(power_cpu)] + best_freq_gpu = freqs[np.argmax(power_gpu)] + assert np.abs(best_freq_cpu - best_freq_gpu) < df, \ + f"Best freq mismatch: cpu={best_freq_cpu}, gpu={best_freq_gpu}" + + @pytest.mark.parametrize("freq", [1.0]) + @pytest.mark.parametrize("q", [0.05]) + @pytest.mark.parametrize("phi0", [0.0, 0.9]) # Test both non-wrapped and wrapped + @pytest.mark.parametrize("ndata", [100]) + @mark_cuda_test + def test_sparse_bls_gpu_vs_single(self, freq, q, phi0, ndata): + """Test that GPU sparse BLS solutions match single_bls""" + t, y, dy = data(snr=20, q=q, phi0=phi0, freq=freq, + baseline=365., ndata=ndata) + + # Test a few frequencies + df = q / (10 * (max(t) - min(t))) + freqs = np.linspace(freq - 3 * df, freq + 3 * df, 7) + + # Run GPU sparse BLS + power_gpu, sols_gpu = sparse_bls_gpu(t, y, dy, freqs) + + # Verify against single_bls + for i, (f, (q_gpu, phi_gpu)) in enumerate(zip(freqs, sols_gpu)): + p_single = single_bls(t, y, dy, f, q_gpu, phi_gpu) + + # The GPU BLS result should match single_bls with the parameters it found + assert np.abs(power_gpu[i] - p_single) < 1e-4, \ + f"Mismatch at freq={f}: gpu={power_gpu[i]}, single={p_single}" + @pytest.mark.parametrize("ndata", [50, 100]) @pytest.mark.parametrize("use_sparse_override", [None, True, 
False]) def test_eebls_transit_auto_select(self, ndata, use_sparse_override): diff --git a/pyproject.toml b/pyproject.toml index 69d43b7..8b18804 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "cuvarbase" dynamic = ["version"] description = "Period-finding and variability on the GPU" readme = "README.rst" -requires-python = ">=3.7" +requires-python = ">=3.8" license = {text = "GPL-3.0"} authors = [ {name = "John Hoffman", email = "johnh2o2@gmail.com"} @@ -20,7 +20,6 @@ classifiers = [ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Natural Language :: English", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10",