Skip to content

Commit ff79a12

Browse files
authored
Merge pull request #177 from florisvb/robustify-tvr
Robustifying TVR
2 parents 21b0b5f + 4ae5d6c commit ff79a12

6 files changed

Lines changed: 65 additions & 57 deletions

File tree

pynumdiff/kalman_smooth/_kalman_smooth.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import numpy as np
22
from warnings import warn
33
from scipy.linalg import expm, sqrtm
4-
from scipy.stats import norm
54
from time import time
6-
75
try: import cvxpy
86
except ImportError: pass
97

8+
from pynumdiff.utils.utility import huber_const
9+
1010

1111
def kalman_filter(y, dt_or_t, xhat0, P0, A, Q, C, R, B=None, u=None, save_P=True):
1212
"""Run the forward pass of a Kalman filter, with regular or irregular step size.
@@ -278,7 +278,7 @@ def robustdiff(x, dt, order, log_q, log_r, proc_huberM=6, meas_huberM=0):
278278
proper :math:`\\ell_1` normalization.
279279
280280
Note that :code:`log_q` and :code:`proc_huberM` are coupled, as are :code:`log_r` and :code:`meas_huberM`, via the relation
281-
:math:`\\text{Huber}(q^{-1/2}v, M) = q^{-1}\\text{Huber}(v, Mq^{-1/2})`, but these are still independent enough that for
281+
:math:`\\text{Huber}(q^{-1/2}v, M) = q^{-1}\\text{Huber}(v, Mq^{1/2})`, but these are still independent enough that for
282282
the purposes of optimization we cannot collapse them. Nor can :code:`log_q` and :code:`log_r` be combined into
283283
:code:`log_qr_ratio` as in RTS smoothing without the addition of a new absolute scale parameter, because :math:`q` and
284284
:math:`r` interact with the distinct Huber :math:`M` parameters.
@@ -329,11 +329,6 @@ def convex_smooth(y, A, Q, C, R, B=None, u=None, proc_huberM=6, meas_huberM=0):
329329
x_states = cvxpy.Variable((A.shape[0], N)) # each column is [position, velocity, acceleration, ...] at step n
330330
control = isinstance(B, np.ndarray) and isinstance(u, np.ndarray) # whether there is a control input
331331

332-
def huber_const(M): # from https://jmlr.org/papers/volume14/aravkin13a/aravkin13a.pdf, with correction for missing sqrt
333-
a = 2*np.exp(-M**2 / 2)/M # huber_const smoothly transitions Huber between 1-norm and 2-norm squared cases
334-
b = np.sqrt(2*np.pi)*(2*norm.cdf(M) - 1)
335-
return np.sqrt((2*a*(1 + M**2)/M**2 + b)/(a + b))
336-
337332
# It is extremely important to run time that CVXPY expressions be in vectorized form
338333
proc_resids = np.linalg.inv(sqrtm(Q)) @ (x_states[:,1:] - A @ x_states[:,:-1] - (0 if not control else B @ u[1:].T)) # all Q^(-1/2)(x_n - (A x_{n-1} + B u_n))
339334
meas_resids = np.linalg.inv(sqrtm(R)) @ (y.reshape(C.shape[0],-1) - C @ x_states) # all R^(-1/2)(y_n - C x_n)

pynumdiff/optimize/_optimize.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,10 @@
6565
{'sigma': (1e-2, 1e3),
6666
'lmbd': (1e-3, 0.5)}),
6767
tvrdiff: ({'gamma': [1e-2, 1e-1, 1, 10, 100, 1000],
68-
'order': {1, 2, 3}}, # warning: order 1 hacks the loss function when tvgamma is used, tends to win but is usually suboptimal choice in terms of true RMSE
69-
{'gamma': (1e-4, 1e7)}),
68+
'order': {1, 2, 3}, # warning: order 1 hacks the loss function when tvgamma is used, tends to win but is usually suboptimal choice in terms of true RMSE
69+
'huberM': [0., 1, 2, 6]}, # comb lower values more finely, because the scale of sigma is mad(x), bigger than mad(y-x) residuals
70+
{'gamma': (1e-4, 1e7),
71+
'huberM': (0, 6)}),
7072
velocity: ({'gamma': [1e-2, 1e-1, 1, 10, 100, 1000]}, # Deprecated method
7173
{'gamma': (1e-4, 1e7)}),
7274
iterative_velocity: ({'scale': 'small', # Rare to optimize this one, because it's longer-running than convex version

pynumdiff/tests/test_diff_methods.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def spline_irreg_step(*args, **kwargs): return splinediff(*args, **kwargs)
5757
(jerk, {'gamma':10}), (jerk, [10]),
5858
(iterative_velocity, {'num_iterations':5, 'gamma':0.05}), (iterative_velocity, [5, 0.05]),
5959
(smooth_acceleration, {'gamma':2, 'window_size':5}), (smooth_acceleration, [2, 5]),
60-
(lineardiff, {'order':3, 'gamma':5, 'window_size':11, 'solver':'CLARABEL'}), (lineardiff, [3, 5, 11], {'solver':'CLARABEL'})
60+
(lineardiff, {'order':3, 'gamma':0.01, 'window_size':11, 'solver':'CLARABEL'}), (lineardiff, [3, 0.01, 11], {'solver':'CLARABEL'})
6161
]
6262

6363
# All the testing methodology follows the exact same pattern; the only thing that changes is the
@@ -108,8 +108,8 @@ def spline_irreg_step(*args, **kwargs): return splinediff(*args, **kwargs)
108108
[(-25, -25), (0, -1), (0, 0), (1, 1)],
109109
[(-25, -25), (1, 1), (0, 0), (1, 1)],
110110
[(-25, -25), (3, 3), (0, 0), (3, 3)]],
111-
iterated_second_order: [[(-7, -8), (-25, -25), (0, -1), (0, 0)],
112-
[(-7, -8), (-14, -14), (0, -1), (0, 0)],
111+
iterated_second_order: [[(-25, -25), (-25, -25), (0, -1), (0, 0)],
112+
[(-14, -14), (-14, -14), (0, -1), (0, 0)],
113113
[(-1, -1), (0, 0), (0, -1), (0, 0)],
114114
[(0, 0), (1, 0), (0, 0), (1, 0)],
115115
[(1, 1), (2, 2), (1, 1), (2, 2)],
@@ -120,8 +120,8 @@ def spline_irreg_step(*args, **kwargs): return splinediff(*args, **kwargs)
120120
[(-25, -25), (-2, -2), (0, 0), (1, 1)],
121121
[(-25, -25), (1, 0), (0, 0), (1, 1)],
122122
[(-25, -25), (2, 2), (0, 0), (2, 2)]],
123-
iterated_fourth_order: [[(-7, -8), (-25, -25), (0, -1), (0, 0)],
124-
[(-7, -8), (-13, -13), (0, -1), (0, 0)],
123+
iterated_fourth_order: [[(-25, -25), (-25, -25), (0, -1), (0, 0)],
124+
[(-14, -14), (-13, -13), (0, -1), (0, 0)],
125125
[(-1, -1), (0, 0), (-1, -1), (0, 0)],
126126
[(0, -1), (1, 1), (0, 0), (1, 1)],
127127
[(1, 1), (2, 2), (1, 1), (2, 2)],
@@ -132,8 +132,8 @@ def spline_irreg_step(*args, **kwargs): return splinediff(*args, **kwargs)
132132
[(-2, -2), (0, 0), (0, -1), (1, 1)],
133133
[(0, 0), (1, 1), (0, -1), (1, 1)],
134134
[(0, 0), (3, 3), (0, 0), (3, 3)]],
135-
savgoldiff: [[(-7, -7), (-13, -14), (0, -1), (0, 0)],
136-
[(-7, -7), (-13, -13), (0, -1), (0, 0)],
135+
savgoldiff: [[(-13, -14), (-13, -14), (0, -1), (0, 0)],
136+
[(-13, -13), (-13, -13), (0, -1), (0, 0)],
137137
[(-2, -2), (-1, -1), (0, -1), (0, 0)],
138138
[(0, -1), (0, 0), (0, 0), (1, 0)],
139139
[(1, 1), (2, 2), (1, 1), (2, 2)],
@@ -164,16 +164,16 @@ def spline_irreg_step(*args, **kwargs): return splinediff(*args, **kwargs)
164164
[(1, 1), (3, 3), (1, 1), (3, 3)]],
165165
velocity: [[(-25, -25), (-18, -19), (0, -1), (1, 0)],
166166
[(-12, -12), (-11, -12), (-1, -1), (-1, -2)],
167-
[(0, 0), (1, 0), (0, 0), (1, 0)],
167+
[(0, -1), (1, 0), (0, -1), (1, 0)],
168168
[(0, -1), (1, 1), (0, 0), (1, 0)],
169-
[(1, 1), (2, 2), (1, 1), (2, 2)],
170-
[(1, 0), (3, 3), (1, 0), (3, 3)]],
171-
acceleration: [[(-25, -25), (-18, -18), (0, -1), (0, 0)],
169+
[(1, 0), (2, 2), (1, 0), (2, 2)],
170+
[(0, 0), (3, 3), (0, 0), (3, 3)]],
171+
acceleration: [[(-25, -25), (-18, -18), (0, -1), (1, 0)],
172172
[(-10, -10), (-9, -9), (-1, -1), (0, -1)],
173173
[(-10, -10), (-9, -10), (-1, -1), (0, -1)],
174174
[(0, -1), (1, 0), (0, -1), (1, 0)],
175-
[(1, 1), (2, 2), (1, 1), (2, 2)],
176-
[(1, 1), (3, 3), (1, 1), (3, 3)]],
175+
[(1, 0), (2, 2), (1, 0), (2, 2)],
176+
[(0, 0), (3, 3), (0, 0), (3, 3)]],
177177
jerk: [[(-25, -25), (-18, -18), (-1, -1), (0, 0)],
178178
[(-9, -10), (-9, -9), (-1, -1), (0, 0)],
179179
[(-10, -10), (-9, -10), (-1, -1), (0, 0)],
@@ -186,8 +186,8 @@ def spline_irreg_step(*args, **kwargs): return splinediff(*args, **kwargs)
186186
[(1, 0), (1, 1), (1, 0), (1, 1)],
187187
[(2, 1), (2, 2), (2, 1), (2, 2)],
188188
[(1, 1), (3, 3), (1, 1), (3, 3)]],
189-
smooth_acceleration: [[(-7, -8), (-18, -18), (0, -1), (0, 0)],
190-
[(-7, -7), (-10, -10), (-1, -1), (-1, -1)],
189+
smooth_acceleration: [[(-25, -25), (-21, -21), (0, -1), (0, 0)],
190+
[(-10, -11), (-10, -10), (-1, -1), (-1, -1)],
191191
[(-2, -2), (-1, -1), (-1, -1), (0, -1)],
192192
[(0, 0), (1, 0), (0, -1), (1, 0)],
193193
[(1, 1), (2, 2), (1, 1), (2, 2)],
@@ -222,12 +222,12 @@ def spline_irreg_step(*args, **kwargs): return splinediff(*args, **kwargs)
222222
[(-7, -7), (-2, -2), (0, -1), (1, 1)],
223223
[(0, 0), (2, 2), (0, 0), (2, 2)],
224224
[(1, 1), (3, 3), (1, 1), (3, 3)]],
225-
lineardiff: [[(-7, -8), (-14, -14), (0, -1), (0, 0)],
225+
lineardiff: [[(-3, -4), (-3, -3), (0, -1), (1, 0)],
226+
[(-1, -2), (0, 0), (0, -1), (1, 0)],
227+
[(-1, -1), (0, 0), (0, -1), (1, 1)],
228+
[(-1, -2), (0, 0), (0, -1), (1, 1)],
226229
[(0, 0), (2, 1), (0, 0), (2, 1)],
227-
[(1, 0), (2, 2), (1, 0), (2, 2)],
228-
[(1, 0), (2, 1), (1, 0), (2, 1)],
229-
[(1, 1), (2, 2), (1, 1), (2, 2)],
230-
[(1, 1), (3, 3), (1, 1), (3, 3)]]
230+
[(0, -1), (3, 3), (0, 0), (3, 3)]]
231231
}
232232

233233
# Essentially run the cartesian product of [diff methods] x [test functions] through this one test

pynumdiff/total_variation_regularization/_total_variation_regularization.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import numpy as np
22
from warnings import warn
3+
from scipy.stats import median_abs_deviation
34

45
from pynumdiff.total_variation_regularization import _chartrand_tvregdiff
56
from pynumdiff.utils import utility
@@ -53,25 +54,28 @@ def iterative_velocity(x, dt, params=None, options=None, num_iterations=None, ga
5354
return x_hat, dxdt_hat
5455

5556

56-
def tvrdiff(x, dt, order, gamma, solver=None):
57+
def tvrdiff(x, dt, order, gamma, huberM=float('inf'), solver=None):
5758
"""Generalized total variation regularized derivatives. Use convex optimization (cvxpy) to solve for a
5859
total variation regularized derivative. Other convex-solver-based methods in this module call this function.
5960
6061
:param np.array[float] x: data to differentiate
6162
:param float dt: step size
6263
:param int order: 1, 2, or 3, the derivative to regularize
6364
:param float gamma: regularization parameter
65+
:param float huberM: Huber loss parameter, in units of scaled median absolute deviation of input data, :code:`x`.
66+
:math:`M = \\infty` reduces to squared :math:`\\ell_2` loss on the first, fidelity cost term, and
67+
:math:`M = 0` reduces to :math:`\\ell_1` loss.
6468
:param str solver: Solver to use. Solver options include: 'MOSEK', 'CVXOPT', 'CLARABEL', 'ECOS'.
65-
In testing, 'MOSEK' was the most robust. If not given, fall back to CVXPY's default.
69+
If not given, fall back to CVXPY's default.
6670
6771
:return: - **x_hat** (np.array) -- estimated (smoothed) x
6872
- **dxdt_hat** (np.array) -- estimated derivative of x
6973
"""
7074
# Normalize for numerical consistency with convex solver
71-
mean = np.mean(x)
72-
std = np.std(x)
73-
if std == 0: std = 1 # safety guard
74-
x = (x-mean)/std
75+
mu = np.mean(x)
76+
sigma = median_abs_deviation(x, scale='normal') # robust alternative to std()
77+
if sigma == 0: sigma = 1 # safety guard
78+
x = (x-mu)/sigma
7579

7680
# Define the variables for the highest order derivative and the integration constants
7781
deriv_values = cvxpy.Variable(len(x)) # values of the order^th derivative, in which we're penalizing variation
@@ -84,10 +88,13 @@ def tvrdiff(x, dt, order, gamma, solver=None):
8488
for i in range(order):
8589
y = cvxpy.cumsum(y) + integration_constants[i]
8690

91+
# Compare the recursively integrated position to the noisy position. \ell_2 doesn't get scaled by 1/2 here,
92+
# so cvxpy's doubled Huber is already the right scale, and \ell_1 should be scaled by 2\sqrt{2} to match.
93+
fidelity_cost = cvxpy.sum_squares(y - x) if huberM == float('inf') \
94+
else np.sqrt(8)*cvxpy.norm(y - x, 1) if huberM == 0 \
95+
else utility.huber_const(huberM)*cvxpy.sum(cvxpy.huber(y - x, huberM*sigma))
8796
# Set up and solve the optimization problem
88-
prob = cvxpy.Problem(cvxpy.Minimize(
89-
# Compare the recursively integrated position to the noisy position, and add TVR penalty
90-
cvxpy.sum_squares(y - x) + gamma*cvxpy.sum(cvxpy.tv(deriv_values)) ))
97+
prob = cvxpy.Problem(cvxpy.Minimize(fidelity_cost + gamma*cvxpy.sum(cvxpy.tv(deriv_values)) ))
9198
prob.solve(solver=solver)
9299

93100
# Recursively integrate the final derivative values to get back to the function and derivative values
@@ -102,7 +109,7 @@ def tvrdiff(x, dt, order, gamma, solver=None):
102109
dxdt_hat = (dxdt_hat[:-1] + dxdt_hat[1:])/2
103110
dxdt_hat = np.hstack((dxdt_hat, 2*dxdt_hat[-1] - dxdt_hat[-2])) # last value = penultimate value [-1] + diff between [-1] and [-2]
104111

105-
return x_hat*std+mean, dxdt_hat*std # derivative is linear, so scale derivative by std
112+
return x_hat*sigma+mu, dxdt_hat*sigma # derivative is linear, so scale derivative by scatter
106113

107114

108115
def velocity(x, dt, params=None, options=None, gamma=None, solver=None):

pynumdiff/utils/evaluate.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,18 +86,16 @@ def robust_rme(u, v, padding=0, M=6):
8686
:param np.array[float] v: e.g. estimated smoothed signal, reconstructed from derivative
8787
:param int padding: number of snapshots on either side of the array to ignore when calculating
8888
the metric. If :code:`'auto'`, defaults to 2.5% of the size of inputs
89-
:param float M: Huber loss parameter in units of ~1.4*mean absolute deviation, intended to approximate
90-
standard deviation robustly.
89+
:param float M: Huber loss parameter in units of ~1.4*mean absolute deviation of population of residual
90+
errors, intended to approximate standard deviation robustly.
9191
9292
:return: (float) -- Robust root mean error between u and v
9393
"""
9494
if padding == 'auto': padding = max(1, int(0.025*len(u)))
9595
s = slice(padding, len(u)-padding) # slice out data we want to measure
96-
N = s.stop - s.start
9796

9897
sigma = stats.median_abs_deviation(u[s] - v[s], scale='normal') # M is in units of this robust scatter metric
99-
if sigma < 1e-6: sigma = 1 # guard against divide by zero
100-
return np.sqrt(2*np.mean(utility.huber((u[s] - v[s])/sigma, M))) * sigma
98+
return np.sqrt(2*np.mean(utility.huber(u[s] - v[s], M*sigma)))
10199

102100

103101
def rmse(u, v, padding=0):

pynumdiff/utils/utility.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import numpy as np
33
from scipy.integrate import cumulative_trapezoid
44
from scipy.optimize import minimize
5-
from scipy.stats import median_abs_deviation
5+
from scipy.stats import median_abs_deviation, norm
66

77

88
def hankel_matrix(x, num_delays, pad=False): # fixed delay step of 1
@@ -102,6 +102,13 @@ def huber(x, M):
102102
absx = np.abs(x)
103103
return np.where(absx <= M, 0.5*x**2, M*(absx - 0.5*M))
104104

105+
def huber_const(M):
106+
"""Scale that makes :code:`sum(huber())` interpolate :math:`\\sqrt{2}\\|\\cdot\\|_1` and :math:`\\frac{1}{2}\\|\\cdot\\|_2^2`,
107+
from https://jmlr.org/papers/volume14/aravkin13a/aravkin13a.pdf, with correction for missing sqrt"""
108+
a = 2*np.exp(-M**2 / 2)/M
109+
b = np.sqrt(2*np.pi)*(2*norm.cdf(M) - 1)
110+
return np.sqrt((2*a*(1 + M**2)/M**2 + b)/(a + b))
111+
105112

106113
def integrate_dxdt_hat(dxdt_hat, dt_or_t):
107114
"""Wrapper for scipy.integrate.cumulative_trapezoid. Use 0 as first value so lengths match, see #88.
@@ -123,21 +130,20 @@ def estimate_integration_constant(x, x_hat, M=6):
123130
124131
:param np.array[float] x: timeseries of measurements
125132
:param np.array[float] x_hat: smoothed estimate of x
126-
:param float M: robustifies constant estimation using Huber loss. The default is intended to capture the idea
127-
of "six sigma": Assuming Gaussian inliers and M in units of standard deviation, the portion of inliers
128-
beyond the Huber loss' transition is only about 1.97e-9. M here is in units of scaled mean absolute deviation,
129-
so scatter can be calculated and used to normalize data without being thrown off by outliers.
133+
:param float M: constant estimation is robustified with the Huber loss. :code:`M` here is in units of scaled
134+
mean absolute deviation of residuals, so scatter can be calculated and used to normalize without being
135+
thrown off by outliers. The default is intended to capture the idea of "six sigma": Assuming Gaussian
136+
:code:`x - xhat` errors, the portion of inliers beyond the Huber loss' transition is only about 1.97e-9.
130137
131138
:return: **integration constant** (float) -- initial condition that best aligns x_hat with x
132139
"""
133-
if M == float('inf'): # calculates the constant to be mean(diff(x, x_hat)), equivalent to argmin_{x0} ||x_hat + x0 - x||_2^2
134-
return np.mean(x - x_hat) # Solves the L2 distance minimization
135-
elif M < 0.1: # small M looks like L1 loss, and Huber gets too flat to work well
136-
return np.median(x - x_hat) # Solves the L1 distance minimization
140+
sigma = median_abs_deviation(x - x_hat, scale='normal') # M is in units of this robust scatter metric
141+
if M == float('inf') or sigma < 1e-3: # If no scatter, then no outliers, so use L2
142+
return np.mean(x - x_hat) # Solves the l2 distance minimization, argmin_{x0} ||x_hat + x0 - x||_2^2
143+
elif M < 1e-3: # small M looks like l1 loss, and Huber gets too flat to work well
144+
return np.median(x - x_hat) # Solves the l1 distance minimization, argmin_{x0} ||x_hat + x0 - x||_1
137145
else:
138-
sigma = median_abs_deviation(x - x_hat, scale='normal') # M is in units of this robust scatter metric
139-
if sigma < 1e-6: sigma = 1 # guard against divide by zero
140-
return minimize(lambda x0: np.mean(huber((x - (x_hat+x0))/sigma, M)), # fn to minimize in 1st argument
146+
return minimize(lambda x0: np.sum(huber(x - (x_hat+x0), M*sigma)), # fn to minimize in 1st argument
141147
0, method='SLSQP').x[0] # result is a vector, even if initial guess is just a scalar
142148

143149

0 commit comments

Comments
 (0)