diff --git a/qstack/qml/b2r2.py b/qstack/qml/b2r2.py index f3db3d7..0c0f98f 100644 --- a/qstack/qml/b2r2.py +++ b/qstack/qml/b2r2.py @@ -13,6 +13,10 @@ defaults = SimpleNamespace(rcut=3.5, gridspace=0.03) +class Reaction: + def __init__(self, reactants, products): + self.reactants = reactants + self.products = products def get_bags(unique_ncharges): """Generate all unique element pair combinations including self-interactions. diff --git a/qstack/regression/hyperparameters.py b/qstack/regression/hyperparameters.py index c762b38..1cefc5b 100644 --- a/qstack/regression/hyperparameters.py +++ b/qstack/regression/hyperparameters.py @@ -1,19 +1,333 @@ """Hyperparameter optimization.""" -import sys +import sys, logging import numpy as np import scipy from sklearn.model_selection import KFold +from sklearn.utils.parallel import Parallel, delayed from qstack.mathutils.fps import do_fps from qstack.tools import correct_num_threads from .kernel_utils import get_kernel, defaults, train_test_split_idx, sparse_regression_kernel from .parser import RegressionParser +logger = logging.getLogger("qstack.regression.hyperparameters") + + +# ##################### +# parabola-based line search + +def fit_quadratic(x1,x2,x3, y1,y2,y3): + """ + Compute the three coefficients of a quadratic polynomial going through three given points. + Could probably be replaced by `np.polyfit` now that I know about it. Fluff it, we ball. + """ + # we need to change coordinates around for this + + # first, slopes at 0.5(x1+x2) and 0.5(x2+x3) + # the 'meta-slope' allows us to get 2*curvature + slope1 = (y2-y1)/(x2-x1) + slope2 = (y3-y2)/(x3-x2) + curv = (slope2-slope1)/(x3-x1) # the "0.5*" disappears + + # then, remove the curvature from points 1 and 2 to determine 1st-degree term + y1_b = y1 - curv*x1**2 + y2_b = y2 - curv*x2**2 + slope = (y2_b-y1_b)/(x2-x1) + + # finally, the intercept + intercept = y1_b - slope*x1 + return curv, slope, intercept + +def parabolic_search(x_left, x_right, get_err, n_iter=10, x_thres=0.1, y_thres=0.01): + """ + Gradient-less line search of the minimum of `get_err`, supposedly between `x_left` and `x_right`. + Fits quadratic polynomials to perform this search, meaning `get_err` is assumed to be convex. + + Args: + x_left (float): supposed left bound of the minimum of `get_err` + x_right (float): supposed right bound of the minimum of `get_err` + get_err (callable float->float): the function to minimise. 
+ n_iter (int): the number of function calls allowed + x_thres (float): the acceptable error threshold for the the argmin to find + y_thres (float): the acceptable error threshold for the min to find + + Returns: + the (argmin, min) tuple characterising the minimum of the function (2x float) + """ + + y_left = get_err(x_left) + y_right = get_err(x_right) + x_center = 0.5*(x_left+x_right) + y_center = get_err(x_center) + + all_errs = [(x_left,y_left),(x_center,y_center),(x_right,y_right)] + + while y_left < y_center or y_right < y_center: + # while it looks like we need to look elsewhere than in our original bracket + # (because the center isn't closer to the local minimum than the bounds) + if y_left < y_right: + logger.debug('no local minimum in sight, extending to the left...') + x_right, y_right = x_center, y_center + x_center, y_center = x_left, y_left + x_left = 2*x_center - x_right + y_left = get_err(x_left) + all_errs.insert(0, (x_left,y_left)) + else: + logger.debug('no local minimum in sight, extending to the right...') + x_left, y_left = x_center, y_center + x_center, y_center = x_right, y_right + x_right = 2*x_center - x_left + y_right = get_err(x_right) + all_errs.append((x_right,y_right)) + n_iter -= 1 + if n_iter <=0: + break + # after this point, either we are exiting early or we have found the right bounds + all_errs.sort() + logger.debug('local minimum in bounds, proceeding with parabolic search (bounds at: %r)', all_errs) + logger.debug('chosen: %f\\%f/%f', x_left, x_center, x_right) + while n_iter > 0: + a,b,c = fit_quadratic(x_left, x_center, x_right, y_left, y_center, y_right) + if a<=0: # lol no local minimum + logger.debug('no local minimum...') + if x_left < x_right: + x_new = 0.5*(x_left+x_center) + ypred_new = np.nan + else: + x_new = 0.5*(x_right+x_center) + ypred_new = np.nan + else: + x_new = -0.5*b/a + ypred_new = -0.25*b**2/a + c + y_new = get_err(x_new) + n_iter -=1 + logger.debug('from chosen points %f\\%f/%f', x_left, x_center, x_right) + logger.debug('predicted local minimum at %f->%f, true error %f', x_new, ypred_new, y_new) + all_errs.append((x_new, y_new)) ; all_errs.sort() + logger.debug('current data: %r', all_errs) + + if x_new < x_left or x_new > x_right: + logger.debug('predicted local minimum not in immediate bounds, regaining bearings...') + new_index = np.argmin(np.asarray(all_errs)[:,1]) + if new_index in (0, len(all_errs)-1): + raise AssertionError('edges of the search are somehow the minimum in second phase of function') + x_left, y_left = all_errs[new_index-1] + x_right, y_right = all_errs[new_index+1] + x_center, y_center = all_errs[new_index] + + elif max(y_right,y_left, y_new)-min(y_new, y_center) < y_thres: + break + elif y_new > y_center: + if x_new > x_center: + x_right, y_right = x_new, y_new + else: + x_left, y_left = x_new, y_new + else: # if y_new <= y_center + if x_new > x_center: + x_left, y_left = x_center, y_center + x_center, y_center = x_new, y_new + else: + x_right, y_right = x_center, y_center + x_center, y_center = x_new, y_new + + if abs(x_right - x_left) < x_thres: + break + + opt_idx = np.argmin(np.asarray(all_errs)[:,1]) + return all_errs[opt_idx] + + +def standard_grid_search(x_list, get_err): + errors = [] + for x in x_list: + errors.append(get_err(x)) + errors = np.asarray(errors) + xi = errors.argmin() + return xi, errors[xi] + +def adaptative_grid_search(x_list, get_err): + work_list = x_list + errors = [] + direction = None + while True: + errors = list(errors) + for x in work_list: + 
errors.append((x,get_err(x))) + errors = np.array(errors) + ind = np.argsort(errors[:,1])[::-1] + errors = errors[ind] + + current_argmin = errors[-1,0] + new_sigma = None + + if direction is None: + if current_argmin==max(work_list): + direction = 'up' + elif current_argmin==min(work_list): + direction = 'down' + + # at the 1st iteration if is checked twice on purpose + if direction=='up' and current_argmin==max(work_list): + work_list = current_argmin*np.array(defaults.sigmaarr_mult[1:]) + elif direction=='down' and current_argmin==min(work_list): + work_list = current_argmin/np.array(defaults.sigmaarr_mult[1:]) + else: + break + + print('next iteration:', work_list, flush=True) + return errors + +# ##################### +# main functions of the hyperparameter optimisation + + +def kfold_alpha_eval(K_all, y_train, n_splits, eta_list, sparse, parallel = None): + """Module-internal function: optimise alpha (regularisation parameter) of a KRR learning model, using a K-fold validation. + + Args: + K_all: matrix of kernel values (can be n_total*n_total for naive KRR or n_total*n_references for sparse KRR) + y_train: learnable properties for all inputs (n_total-length vector) + n_splits: number of folds for k-fold validation + eta_list: all the values of eta (KRR regularisation parameter) to try (array-like) + sparse: whether the KRR to run is sparse (bool) + parallel: optional joblib.Parallel instance to use to parallelise this function (by default one is constructed) + + Returns: + errors: array of "entries", each with the three values (np.ndarray, shape (len(eta_list),3) ): + mean: mean (over k-folds) validation error for this value of eta + stddev: standard deviation of the same error + eta: the corresponding value of eta + """ + if parallel is None: + parallel = Parallel(n_jobs=-1, return_as="generator_unordered") + kfold = KFold(n_splits=n_splits, shuffle=False) + maes = np.full((n_splits, len(eta_list)), np.inf) + y_train = np.asarray(y_train) + + + def inner_call(fold_i, eta_i, K_all, sparse, y_train, eta, train_idx, test_idx): + y_kf_train, y_kf_test = y_train[train_idx], y_train[test_idx] + + if not sparse: + K_solve = K_all [np.ix_(train_idx,train_idx)] + if np.may_share_memory(K_solve, K_all): + K_solve = K_solve.copy() + K_solve[np.diag_indices_from(K_solve)] += eta + y_solve = y_kf_train + Ks = K_all [np.ix_(test_idx,train_idx)] + else: + K_solve, y_solve = sparse_regression_kernel(K_all[train_idx], y_kf_train, slice(None), eta) + Ks = K_all[test_idx] + + try: + alpha = scipy.linalg.solve(K_solve, y_solve, assume_a='pos', overwrite_a=True) + except scipy.linalg.LinAlgError: + print('singular matrix') + raise + y_kf_predict = np.dot(Ks, alpha) + return fold_i, eta_i, np.mean(np.abs(y_kf_predict-y_kf_test)) + + mae_generator = parallel( + delayed(inner_call)(fold_i, eta_i, K_all, sparse, y_train, eta, t,v) + for eta_i,eta in enumerate(eta_list) + for fold_i,(t,v) in enumerate(kfold.split(y_train)) + ) + for split_i, eta_i, mae in mae_generator: + maes[split_i, eta_i] = mae + + concat_results = np.full((len(eta_list), 3), np.inf) + for eta_i in range(len(eta_list)): + res = maes[:,eta_i] + #res = res[np.isfinite(res)] + concat_results[eta_i,0] = res.mean() + concat_results[eta_i,1] = res.std() + concat_results[eta_i,2] = eta_list[eta_i] + return concat_results + +def search_sigma( + X_train, y_train, splits, + kernel, sigma, eta, + sparse_idx=None, + n_sigma_iter=5, stddev_portion=0.0, + adaptive=False, adaptive_v2=False, + read_kernel=False, printlevel=0, +): + """Search the 
optimal values of sigma and alpha for a KRR model with known representations. + Sigma is the width parameter of the kernel function used, + and alpha is the regularisation parameter of the resulting matrix equation. + + Internally, this can call for either a simple grid search, or be modified as so: + - the grid is adaptative for sigma (adaptive) + - the grid search for sigma becomes a continuous line search (adaptive_v2) + No matter what, the optimisation of alpha is done over a grid, with k-fold validation. + + Args: + X (np.ndarray[n_total,n_features]: feature vectors for the combined train-validation dataset + y (np.ndarray[n_total]): learnable properties for all inputs + sigma (array-like(float)): values of sigma. for `adaptive`, starting values, for `adaptive_v2`, only the first and last values are used, as presumed bounds of the optimal value of sigma + alpha_grid (array-like of floats): values of alpha to try + n_sigma_iter (int): number of iterations for the sigma line-search (if adaptive_v2) + n_splits (int): number of folds for k-fold validation + stddev_portion (float): contribution of the error's standard deviation to compare error distributions + sparse_idx (optional np.ndarray[int, n_references]): selection of reference inputs for sparse KRR. + adaptive (bool): to use the adaptive grid for sigma + adaptive_v2 (bool): to use the line search for sigma + read_kernel (bool): to completely discard sigma, assuming the representation array is a precomputed kernel array + printlevel (int): level of verbosity + + Returns: + sigma (float): optimal value of sigma + alpha (float): optimal value of alpha + costs (np.ndarray[n_splits]): validation error distribution for these values of sigma,alpha + """ + errors = [] + + def get_err(s): + if read_kernel is False: + K_all = kernel(X_train, X_train, 1.0/s) + else: + K_all = X_train + + sparse = sparse_idx != None + if sparse: + K_all = K_all[:,sparse_idx] + + results_per_eta = kfold_alpha_eval( + K_all, y_train, splits, eta, sparse, + parallel = parallel, + ) + for mean,std,e in results_per_eta: + if printlevel>0 : + sys.stderr.flush() + print(s, e, mean, std, flush=True) + errors.append((mean, std, e, s)) + + costs = results_per_eta[:,0] + stddev_portion*results_per_eta[:,1] + return costs.min() + + with Parallel(n_jobs=-1) as parallel: + if adaptive_v2: + assert not adaptive + _, _ = parabolic_search( + np.log(sigma[0]), np.log(sigma[-1]), + lambda log_s: get_err(np.exp(log_s)), + n_iter=n_sigma_iter, x_thres=0.1, y_thres=0.01, + ) + elif adaptive: + _ = adaptative_grid_search(sigma, get_err) + else: + for s in sigma: + get_err(s) + + return np.asarray(errors) def hyperparameters(X, y, - sigma=defaults.sigmaarr, eta=defaults.etaarr, akernel=defaults.kernel, gkernel=defaults.gkernel, gdict=defaults.gdict, + sigma=defaults.sigmaarr, eta=defaults.etaarr, sparse=None, + akernel=defaults.kernel, gkernel=defaults.gkernel, gdict=defaults.gdict, read_kernel=False, test_size=defaults.test_size, splits=defaults.splits, idx_test=None, idx_train=None, - printlevel=0, adaptive=False, read_kernel=False, sparse=None, random_state=defaults.random_state): + stddev_portion=0.0, n_sigma_iters=5, + printlevel=0, adaptive=False, adaptive_v2=False, random_state=defaults.random_state, +): """Perform a Kfold cross-validated hyperparameter optimization (for width of kernel and regularization parameter). Args: @@ -21,17 +335,20 @@ def hyperparameters(X, y, y (numpy.1darray[Nsamples]): Array containing the target property of all Nsamples. 
sigma (list): List of kernel width for the grid search. eta (list): List of regularization strength for the grid search. + sparse (int): The number of reference environnments to consider for sparse regression. akernel (str): Local kernel ('L' for Laplacian, 'G' for Gaussian, 'dot', 'cosine'). gkernel (str): Global kernel (None, 'REM', 'avg'). gdict (dict): Parameters of the global kernels. + read_kernel (bool): If 'X' is a kernel and not an array of representations. test_size (float or int): Test set fraction (or number of samples). splits (int): K number of splits for the Kfold cross-validation. idx_test (numpy.1darray): List of indices for the test set (based on the sequence in X). idx_train (numpy.1darray): List of indices for the training set (based on the sequence in X). + stddev_portion (float): The amount of error standard deviation to add to error means, for error distribution ranking. + n_sigma_iters (int): for adaptive_v2, the number of iterations to run the sigma line search for. printlevel (int): Controls level of output printing. adaptive (bool): To expand the grid search adaptatively. - read_kernel (bool): If 'X' is a kernel and not an array of representations. - sparse (int): The number of reference environnments to consider for sparse regression. + adaptive_v2 (bool): To optimise sigma though line search rather than grid search, using the ends of the `sigma` list as presumed lower/upper bounds for the optimal value. random_state (int): The seed used for random number generator (controls train/test splitting). Returns: @@ -42,46 +359,7 @@ def hyperparameters(X, y, Raises: RuntimeError: If 'X' is a kernel and sparse regression is chosen. """ - def k_fold_opt(K_all, eta): - kfold = KFold(n_splits=splits, shuffle=False) - all_maes = [] - for train_idx, test_idx in kfold.split(X_train): - y_kf_train, y_kf_test = y_train[train_idx], y_train[test_idx] - - if not sparse: - K_solve = np.copy(K_all [np.ix_(train_idx,train_idx)]) - K_solve[np.diag_indices_from(K_solve)] += eta - y_solve = y_kf_train - Ks = K_all [np.ix_(test_idx,train_idx)] - else: - K_solve, y_solve = sparse_regression_kernel(K_all[train_idx], y_kf_train, sparse_idx, eta) - Ks = K_all [np.ix_(test_idx,sparse_idx)] - - try: - alpha = scipy.linalg.solve(K_solve, y_solve, assume_a='pos', overwrite_a=True) - except scipy.linalg.LinAlgError: - print('singular matrix') - all_maes.append(np.nan) - break - y_kf_predict = np.dot(Ks, alpha) - all_maes.append(np.mean(np.abs(y_kf_predict-y_kf_test))) - return np.mean(all_maes), np.std(all_maes) - - def hyper_loop(sigma, eta): - errors = [] - for s in sigma: - if read_kernel is False: - K_all = kernel(X_train, X_train, 1.0/s) - else: - K_all = X_train - - for e in eta: - mean, std = k_fold_opt(K_all, e) - if printlevel>0 : - sys.stderr.flush() - print(s, e, mean, std, flush=True) - errors.append((mean, std, e, s)) - return errors + if gkernel is None: gwrap = None else: @@ -92,55 +370,88 @@ def hyper_loop(sigma, eta): test_size=test_size, random_state=random_state) if read_kernel is False: X_train = X[idx_train] + optimise_sigma = True else: X_train = X[np.ix_(idx_train,idx_train)] - sigma = [np.nan] + optimise_sigma = False if sparse: if read_kernel: raise RuntimeError('Cannot do FPS with kernels') sparse_idx = do_fps(X_train)[0][:sparse] + else: + sparse_idx = None - work_sigma = sigma - errors = [] - direction = None - while True: - errors = list(errors) - errors.extend(hyper_loop(work_sigma, eta)) - errors = np.array(errors) - ind = np.argsort(errors[:,0])[::-1] - errors = 
errors[ind] + with Parallel(n_jobs=1, return_as="generator_unordered") as parallel: + if optimise_sigma: + errors = search_sigma( + X_train, y_train, splits, + kernel, sigma, eta, sparse_idx, + n_sigma_iters, stddev_portion, + adaptive, adaptive_v2, + read_kernel, printlevel, + ) + else: + K_all = X_train + sparse = sparse_idx is not None + if sparse: + K_all = K_all[:, sparse_idx] + sigma = np.nan + partial_errors = kfold_alpha_eval( + K_all, y, splits, eta, + sparse, parallel, + ) + errors = np.ndarray((len(partial_errors),4)) + errors[:,:3] = partial_errors + errors[:,3] = np.nan - if not adaptive: - break + sorter = (errors[:,0] + stddev_portion*errors[:,1]).argsort() + errors = errors[sorter[::-1]] - best_sigma = errors[-1][3] - new_sigma = None + # work_sigma = sigma + # errors = [] + # direction = None + # while True: + # errors = list(errors) + # errors.extend(hyper_loop(work_sigma, eta)) + # errors = np.array(errors) + # ind = np.argsort(errors[:,0])[::-1] + # errors = errors[ind] - if direction is None: - if best_sigma==max(work_sigma): - direction = 'up' - elif best_sigma==min(work_sigma): - direction = 'down' + # if not adaptive: + # break - # at the 1st iteration if is checked twice on purpose - if direction=='up' and best_sigma==max(work_sigma): - new_sigma = best_sigma*np.array(defaults.sigmaarr_mult[1:]) - elif direction=='down' and best_sigma==min(work_sigma): - new_sigma = best_sigma/np.array(defaults.sigmaarr_mult[1:]) + # best_sigma = errors[-1][3] + # new_sigma = None - if new_sigma is None: - break - work_sigma = new_sigma - print('next iteration:', work_sigma, flush=True) + # if direction is None: + # if best_sigma==max(work_sigma): + # direction = 'up' + # elif best_sigma==min(work_sigma): + # direction = 'down' + + # # at the 1st iteration if is checked twice on purpose + # if direction=='up' and best_sigma==max(work_sigma): + # new_sigma = best_sigma*np.array(defaults.sigmaarr_mult[1:]) + # elif direction=='down' and best_sigma==min(work_sigma): + # new_sigma = best_sigma/np.array(defaults.sigmaarr_mult[1:]) + + # if new_sigma is None: + # break + # work_sigma = new_sigma + # print('next iteration:', work_sigma, flush=True) return errors def _get_arg_parser(): """Parse CLI arguments.""" parser = RegressionParser(description='This program finds the optimal hyperparameters.', hyperparameters_set='array') - parser.remove_argument("random_state") parser.remove_argument("train_size") + parser.add_argument('--ada2', action='store_true', dest='adaptive_v2', default=False, help='whether to use a continuous adaptative approach to sigma. 
If so, only the first and last sigma values will be used to start the optimisation.') + parser.add_argument('--stddev-portion', type=float, dest='stddev_portion', default=0.0, help='amount of error standard deviation to add to error means, for error distribution ranking in the output.') + parser.add_argument('--sigma-iterations', type=int, dest='n_sigma_iters', default=5, help='number of iterations for the sigma-optimisation line search') + + return parser @@ -156,10 +467,15 @@ def main(): X = np.load(args.repr) y = np.loadtxt(args.prop) - errors = hyperparameters(X, y, read_kernel=args.readk, sigma=args.sigma, eta=args.eta, - akernel=args.akernel, gkernel=args.gkernel, gdict=args.gdict, - sparse=args.sparse, - test_size=args.test_size, splits=args.splits, printlevel=args.printlevel, adaptive=args.adaptive) + errors = hyperparameters( + X, y, + sigma=args.sigma, eta=args.eta, sparse=args.sparse, + akernel=args.akernel, gkernel=args.gkernel, gdict=args.gdict, read_kernel=args.readk, + test_size=args.test_size, splits=args.splits, idx_test=None, idx_train=None, + stddev_portion=args.stddev_portion, n_sigma_iters=args.n_sigma_iters, + printlevel=args.printlevel, adaptive=args.adaptive, adaptive_v2=args.adaptive_v2, + random_state=args.random_state, + ) errors = np.array(errors) if args.nameout is not None: np.savetxt(args.nameout, errors, header="error stdev eta sigma") diff --git a/qstack/regression/hyperparameters2.py b/qstack/regression/hyperparameters2.py new file mode 100644 index 0000000..7a7147b --- /dev/null +++ b/qstack/regression/hyperparameters2.py @@ -0,0 +1,412 @@ +"""Hyperparameter optimisation using a smoother version of sigma selection, +using a no-gradient line search""" + +import sys, logging +import numpy as np +import scipy +from sklearn.metrics import mean_absolute_error +from sklearn.model_selection import KFold +from sklearn.utils.parallel import Parallel, delayed +from qstack.mathutils.fps import do_fps +from qstack.tools import correct_num_threads +from .kernel_utils import get_kernel, defaults, train_test_split_idx +from .parser import RegressionParser + +logger = logging.getLogger("qstack.regression.hyperparameters2") + +# ##################### +# parabola-based line search + + +def fit_quadratic(x1,x2,x3, y1,y2,y3): + """ + Compute the three coefficients of a quadratic polynomial going through three given points. + Could probably be replaced by `np.polyfit` now that I know about it. Fluff it, we ball. + """ + # we need to change coordinates around for this + + # first, slopes at 0.5(x1+x2) and 0.5(x2+x3) + # the 'meta-slope' allows us to get 2*curvature + slope1 = (y2-y1)/(x2-x1) + slope2 = (y3-y2)/(x3-x2) + curv = (slope2-slope1)/(x3-x1) # the "0.5*" disappears + + # then, remove the curvature from points 1 and 2 to determine 1st-degree term + y1_b = y1 - curv*x1**2 + y2_b = y2 - curv*x2**2 + slope = (y2_b-y1_b)/(x2-x1) + + # finally, the intercept + intercept = y1_b - slope*x1 + return curv, slope, intercept + +def parabolic_search(x_left, x_right, get_err, n_iter=10, x_thres=0.1, y_thres=0.01): + """ + Gradient-less line search of the minimum of `get_err`, supposedly between `x_left` and `x_right`. + Fits quadratic polynomials to perform this search, meaning `get_err` is assumed to be convex. + + Args: + x_left (float): supposed left bound of the minimum of `get_err` + x_right (float): supposed right bound of the minimum of `get_err` + get_err (callable float->float): the function to minimise. 
+ n_iter (int): the number of function calls allowed + x_thres (float): the acceptable error threshold for the the argmin to find + y_thres (float): the acceptable error threshold for the min to find + + Returns: + the (argmin, min) tuple characterising the minimum of the function (2x float) + """ + + y_left = get_err(x_left) + y_right = get_err(x_right) + x_center = 0.5*(x_left+x_right) + y_center = get_err(x_center) + + all_errs = [(x_left,y_left),(x_center,y_center),(x_right,y_right)] + + while y_left < y_center or y_right < y_center: + # while it looks like we need to look elsewhere than in our original bracket + # (because the center isn't closer to the local minimum than the bounds) + if y_left < y_right: + logger.debug('no local minimum in sight, extending to the left...') + x_right, y_right = x_center, y_center + x_center, y_center = x_left, y_left + x_left = 2*x_center - x_right + y_left = get_err(x_left) + all_errs.insert(0, (x_left,y_left)) + else: + logger.debug('no local minimum in sight, extending to the right...') + x_left, y_left = x_center, y_center + x_center, y_center = x_right, y_right + x_right = 2*x_center - x_left + y_right = get_err(x_right) + all_errs.append((x_right,y_right)) + n_iter -= 1 + if n_iter <=0: + break + # after this point, either we are exiting early or we have found the right bounds + all_errs.sort() + logger.debug('local minimum in bounds, proceeding with parabolic search (bounds at: %r)', all_errs) + logger.debug('chosen: %f\\%f/%f', x_left, x_center, x_right) + while n_iter > 0: + a,b,c = fit_quadratic(x_left, x_center, x_right, y_left, y_center, y_right) + if a<=0: # lol no local minimum + logger.debug('no local minimum...') + if x_left < x_right: + x_new = 0.5*(x_left+x_center) + ypred_new = np.nan + else: + x_new = 0.5*(x_right+x_center) + ypred_new = np.nan + else: + x_new = -0.5*b/a + ypred_new = -0.25*b**2/a + c + y_new = get_err(x_new) + n_iter -=1 + logger.debug('from chosen points %f\\%f/%f', x_left, x_center, x_right) + logger.debug('predicted local minimum at %f->%f, true error %f', x_new, ypred_new, y_new) + all_errs.append((x_new, y_new)) ; all_errs.sort() + logger.debug('current data: %r', all_errs) + + if x_new < x_left or x_new > x_right: + logger.debug('predicted local minimum not in immediate bounds, regaining bearings...') + new_index = np.argmin(np.asarray(all_errs)[:,1]) + if new_index in (0, len(all_errs)-1): + raise AssertionError('edges of the search are somehow the minimum in second phase of function') + x_left, y_left = all_errs[new_index-1] + x_right, y_right = all_errs[new_index+1] + x_center, y_center = all_errs[new_index] + + elif max(y_right,y_left, y_new)-min(y_new, y_center) < y_thres: + break + elif y_new > y_center: + if x_new > x_center: + x_right, y_right = x_new, y_new + else: + x_left, y_left = x_new, y_new + else: # if y_new <= y_center + if x_new > x_center: + x_left, y_left = x_center, y_center + x_center, y_center = x_new, y_new + else: + x_right, y_right = x_center, y_center + x_center, y_center = x_new, y_new + + if abs(x_right - x_left) < x_thres: + break + + opt_idx = np.argmin(np.asarray(all_errs)[:,1]) + return all_errs[opt_idx] + + + +def kfold_alpha_eval(K_all, y, n_splits, alpha_grid, parallel=None, on_compute=(lambda eta,err,stderr:None)): + """Module-internal function: optimise alpha (regularisation parameter) of a KRR learning model, using a K-fold validation. 
+ + Args: + K_all: matrix of kernel values (can be n_total*n_total for naive KRR or n_total*n_references for sparse KRR) + y: learnable properties for all inputs (n_total-length vector) + n_splits: number of folds for k-fold validation + alpha_grid: all the values of alpha to try (array-like) + parallel: optional joblib.Parallel instance to use to parallelise this function (by default one is constructed) + on_compute: function to call for the error summaries of each value of alpha + (callable: alpha, error_mean, error_stddev -> None) + + Returns: + - optimal value of alpha + - validation error list for all k-fold evaluations for this value of alpha + """ + if parallel is None: + parallel = Parallel(n_jobs=-1, return_as="generator_unordered") + kfold = KFold(n_splits=n_splits, shuffle=False) + maes = np.full((kfold.get_n_splits(), len(alpha_grid)), np.inf) + y = np.asarray(y) + is_sparse = K_all.shape[0] != K_all.shape[1] + + def inner_loop(split_i, alpha_i, train_idx, val_idx, alpha): + y_val = y[val_idx] + y_train = y[train_idx] + if is_sparse: + K_train = K_all[train_idx, :] + K_val = K_all[val_idx, :] + y_train = K_train.T @ y_train + K_train = K_train.T @ K_train + else: + K_train = K_all[np.ix_(train_idx,train_idx)] + K_val = K_all [np.ix_(val_idx,train_idx)] + if np.may_share_memory(K_train, K_all): + K_train = K_train.copy() + K_train[np.diag_indices_from(K_train)] += alpha + + try: + weights = scipy.linalg.solve(K_train, y_train, assume_a='pos', overwrite_a=True) + except scipy.linalg.LinAlgError: + # bad fit (singular matrix): report an infinite error so this fold is discarded below + return split_i, alpha_i, np.inf + predict = K_val @ weights + return split_i, alpha_i, mean_absolute_error(y_val, predict) + + mae_generator = parallel( + delayed(inner_loop)(s_i,a_i, t,v,a) + for a_i,a in enumerate(alpha_grid) + for s_i,(t,v) in enumerate(kfold.split(y)) + ) + for split_i, alpha_i, mae in mae_generator: + maes[split_i, alpha_i] = mae + + concat_results = np.full((len(alpha_grid), 2), np.inf) + for alpha_i in range(len(alpha_grid)): + if not np.isfinite(maes[:, alpha_i]).any(): + pass + else: + res = maes[:,alpha_i] + res = res[np.isfinite(res)] + concat_results[alpha_i,0] = res.mean() + concat_results[alpha_i,1] = res.std() + on_compute(alpha_grid[alpha_i], *concat_results[alpha_i]) + #print("kfold evaluation for alpha grid",alpha_grid,concat_results) + selected_alpha_i = concat_results[:,0].argmin() + return alpha_grid[selected_alpha_i], maes[:,selected_alpha_i]#, models[:, selected_alpha_i].copy() + + +def search_sigma( + X, y, kernel, + sigma_bounds, alpha_grid, + n_iter, n_splits, + stddev_portion=+1.0, sparse_idx=None, + parallel=None, on_compute=(lambda sigma,alpha,err,stderr:None) +): + """Search the optimal values of sigma and alpha for a KRR model with known representations. + Sigma is the width parameter of the kernel function used, + and alpha is the regularisation parameter of the resulting matrix equation. + + Internally, this calls the line-search routine for sigma, + where the function to minimise performs its own grid-based optimisation of alpha. 
+ + Args: + X (np.ndarray[n_total,n_features]: feature vectors for the combined train-validation dataset + y (np.ndarray[n_total]): learnable properties for all inputs + sigma_bounds (tuple(float,float)): presumed bounds of the optimal value of sigma + alpha_grid (array-like of floats): values of alpha to try + n_iter (int): number of iterations for the sigma line-search + n_splits (int): number of folds for k-fold validation + stddev_portion (float): contribution of the error's standard deviation to compare error distributions + sparse_idx (optional np.ndarray[int, n_references]): selection of reference inputs for sparse KRR. + parallel (optional joblib.Parallel): tool to make the optimisation more parallel. by default, one will be (re)created as often as necessary. + on_compute (callable sigma,alpha,err_mean,err_stddev -> None) + + Returns: + sigma (float): optimal value of sigma + alpha (float): optimal value of alpha + costs (np.ndarray[n_splits]): validation error distribution for these values of sigma,alpha + """ + + sigma_left, sigma_right = sigma_bounds + + err_dict = {} + + def get_err(log_sigma): + sigma = np.exp(log_sigma) + + K_all = kernel(X, X, 1.0/sigma) + if sparse_idx is not None: + K_all = K_all[:, sparse_idx] + + alpha, costs = kfold_alpha_eval( + K_all, y, n_splits, alpha_grid, + parallel=parallel, on_compute=(lambda eta,err,stderr: on_compute(sigma,eta,err,stderr)), + ) + err_dict[log_sigma] = (alpha,costs) + cost_res = costs.mean() + stddev_portion*costs.std() + #print("now eval'ing σ=", sigma, '... α=', alpha, costs.shape, costs.mean(), costs.std()) + return cost_res + + log_sigma_selected, cost_selected = parabolic_search( + np.log(sigma_left), np.log(sigma_right), + get_err, + n_iter=n_iter, x_thres=0.1, y_thres=0.01, + ) + + alpha_selected, costs_selected = err_dict[log_sigma_selected] + sigma = np.exp(log_sigma_selected) + + return sigma, alpha_selected, costs_selected + + + +def hyperparameters(X, y, + sigma_low=defaults.sigmaarr[0], sigma_high=defaults.sigmaarr[-1], eta=defaults.etaarr, + akernel=defaults.kernel, gkernel=defaults.gkernel, gdict=defaults.gdict, read_kernel=False, + test_size=defaults.test_size, splits=defaults.splits, n_sigma_iters=5, idx_test=None, idx_train=None, + printlevel=0, sparse=None, + stddev_portion=+1.0, + random_state=defaults.random_state, +): + """Perform a Kfold cross-validated hyperparameter optimization (for width of kernel and regularization parameter). + + Args: + X (numpy.ndarray[Nsamples,...]): Array containing the representations of all Nsamples. + y (numpy.1darray[Nsamples]): Array containing the target property of all Nsamples. + sigma_low (float): Estimated low bound forthe kernel width. + sigma_high (float): Estimated high bound forthe kernel width. + eta (list): List of regularization strength for the grid search. + akernel (str): Local kernel ('L' for Laplacian, 'G' for Gaussian, 'dot', 'cosine'). + gkernel (str): Global kernel (None, 'REM', 'avg'). + gdict (dict): Parameters of the global kernels. + test_size (float or int): Test set fraction (or number of samples). + splits (int): K number of splits for the Kfold cross-validation. + n_sigma_iters (int): number of iterations for the sigma-optimisation line search + idx_test (numpy.1darray): List of indices for the test set (based on the sequence in X). + idx_train (numpy.1darray): List of indices for the training set (based on the sequence in X). + printlevel (int): Controls level of output printing. 
+ read_kernel (bool): If 'X' is a kernel and not an array of representations (disables sigma optimisation). + sparse (int): The number of reference environnments to consider for sparse regression. + stddev_portion (float): The amount of error standard deviation to add to error means, for error distribution ranking. + random_state (int): The seed used for random number generator (controls train/test splitting). + + Returns: + The results of the grid search as a numpy.2darray [Cx(MAE,std,eta,sigma)], + where C is the number of parameter set and + the array is sorted according to MAEs (last is minimum) + + Raises: + RuntimeError: If 'X' is a kernel and sparse regression is chosen. + """ + + if gkernel is None: + gwrap = None + else: + gwrap = [gkernel, gdict] + kernel = get_kernel(akernel, gwrap) + + idx_train, _, y_train, _ = train_test_split_idx(y=y, idx_test=idx_test, idx_train=idx_train, + test_size=test_size, random_state=random_state) + if read_kernel is False: + X_train = X[idx_train] + optimise_sigma = True + else: + X_train = X[np.ix_(idx_train,idx_train)] + optimise_sigma = False + + if sparse: + if read_kernel: + raise RuntimeError('Cannot do FPS with kernels') + sparse_idx = do_fps(X_train)[0][:sparse] + else: + sparse_idx = np.arange(X_train.shape[0]) + + errors = [] + with Parallel(n_jobs=1, return_as="generator_unordered") as parallel: + if optimise_sigma: + err_append = lambda sigma,alpha,err,stderr: errors.append((err,stderr, alpha,sigma)) + _,_,_ = search_sigma( + X_train, y_train, kernel, (sigma_low, sigma_high), alpha_grid=eta, + parallel=parallel, on_compute=err_append, + n_iter = n_sigma_iters, n_splits=splits, stddev_portion=stddev_portion, + sparse_idx=sparse_idx, + ) + else: + if sparse_idx is not None: + K_all = X_train[:, sparse_idx] + else: + K_all = X_train + sigma = np.nan + err_append = lambda alpha,err,stderr: errors.append((err,stderr, alpha,sigma)) + _,_ = kfold_alpha_eval( + K_all, y, splits, alpha_grid=eta, + parallel=parallel, on_compute=err_append, + ) + + + errors = np.array(errors) + ind = np.argsort(errors[:,0]+stddev_portion*errors[:,1])[::-1] + errors = errors[ind] + return errors + + +def _get_arg_parser(): + """Parse CLI arguments.""" + parser = RegressionParser(description='This program finds the optimal hyperparameters.', hyperparameters_set='array') + parser.remove_argument("train_size") + parser.remove_argument("sigma") + parser.remove_argument("adaptative") + parser.add_argument('--sigma-low', type=float, dest='sigma_low', default=1E-2, help='estimated low bound for sigma') + parser.add_argument('--sigma-high', type=float, dest='sigma_high', default=1E+2, help='estimated high bound for sigma') + parser.add_argument('--stddev-portion', type=float, dest='stddev_portion', default=1, help='amount of error standard deviation to add to error means, for error distribution ranking in the output.') + parser.add_argument('--sigma-iterations', type=int, dest='n_sigma_iters', default=5, help='number of iterations for the sigma-optimisation line search') + + + return parser + + +def main(): + """Command-line entry point for hyperparameter optimization.""" + args = _get_arg_parser().parse_args() + if args.readk: + args.sigma = [np.nan] + if args.ll: + correct_num_threads() + print(vars(args)) + + X = np.load(args.repr) + y = np.loadtxt(args.prop) + + errors = hyperparameters( + X, y, read_kernel=args.readk, + sigma_low=args.sigma_low, sigma_high=args.sigma_high, eta=args.eta, + akernel=args.akernel, gkernel=args.gkernel, gdict=args.gdict, 
sparse=args.sparse, + test_size=args.test_size, splits=args.splits, n_sigma_iters=args.n_sigma_iters, + printlevel=args.printlevel, random_state=args.random_state, stddev_portion=args.stddev_portion, + ) + errors = np.array(errors) + if args.nameout is not None: + np.savetxt(args.nameout, errors, header="error stdev eta sigma") + print() + print('error stdev eta sigma') + for error in errors: + print("{:e} {:e} | {:e} {:e}".format(*tuple(error))) + + +if __name__ == "__main__": + main() diff --git a/qstack/regression/skl_objects.py b/qstack/regression/skl_objects.py new file mode 100644 index 0000000..2be632a --- /dev/null +++ b/qstack/regression/skl_objects.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 + +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.preprocessing._data import KernelCenterer +from sklearn.utils.validation import ( + FLOAT_DTYPES, + _check_sample_weight, + check_is_fitted, + validate_data, +) + +from qstack.qml import b2r2, slatm + +def _restore_from_pickle(objname: str, version: int, hypers: dict, params: dict|None): + pass + + + + + + +class B2R2Representation(TransformerMixin, BaseEstimator): + """Transform reactions into their B2R2 representations + + Reference: + P. van Gerwen, A. Fabrizio, M. D. Wodrich, C. Corminboeuf, + "Physics-based representations for machine learning properties of chemical reactions", + Mach. Learn.: Sci. Technol. 3, 045005 (2022), doi:10.1088/2632-2153/ac8f1a + + This representation can be computed for molecules or for reactions, + by giving to this transformer a list of one or a list of the other. + Note that no fitting is required, and this object is a simple shim best used + in pipeline objects. + + Molecules are ASE molecule objects, or any object with `.numbers` and `.positions` (in Å) properties. + Reactions, however, are ``qstack.qml.b2r2.Reaction objects, + or tuples of two lists of molecules (reactants, products). + + Parameters + ---------- + variant: str, default "l" + B2R2 variant to compute. Options: + - 'l': Local variant with element-resolved skewed Gaussians (default). + - 'a': Agnostic variant with element-pair Gaussians. + - 'n': Nuclear variant with combined skewed Gaussians. + + progress: bool, default False + If True, displays progress bar + + rcut: float, default 3.5 + Cutoff radius for bond detection in Å + + gridspace: float, default 0.03 + Grid spacing for discretization in Å + + + Attributes + ---------- + None + + Examples + -------- + [ fixme ] + """ + + def __init__( + self, + variant='l', + progress=False, + rcut=b2r2.defaults.rcut, + gridspace=b2r2.defaults.gridspace, + ): + """Initialize StandardFlexibleScaler.""" + self.variant = variant + self.progress = progress + self.rcut = rcut + self.gridspace = gridspace + self.elements_ = [] + + def __reduce__(self): + return ( + _restore_from_pickle, + "B2R2", 1, + dict( + variant = self.variant, + progress = self.progress, + rcut = self.rcut, + gridspace = self.gridspace, + ), + {'elements_': self.elements} if self.elements else None, + ) + + def fit(self, X, y=None, sample_weight=None): + """Determine the types of elements to consider, by feeding them from all objects to consider later. + + Parameters + ---------- + X : numpy.ndarray of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + y: None + Ignored. + sample_weight: numpy.ndarray of shape (n_samples,) + Weights for each sample. 
Sample weighting can be used to center + (and scale) data using a weighted mean. Weights are internally + normalized before preprocessing. + + Returns + ------- + self : object + Fitted scaler. + """ + + elems = set() + for obj in X: + if isinstance(obj, tuple) and len(obj) == 2: + reac_mols = obj[0] + obj[1] + elif isinstance(obj, b2r2.Reaction): + reac_mols = obj.reactants + obj.products + elif hasattr(X[0], "numbers") and hasattr(X[0], "positions"): + reac_mols = [obj] + for mol in reac_mols: + elems.update(mol.numbers) + self.elements_ = sorted(elems) + + return self + + def transform(self, X, y=None, copy=None): + """Normalize a vector based on previously computed mean and scaling. + + Parameters + ---------- + X : list of length n_samples, of molecules or list of reactions + The chemical objects to compute representations of. + Please note they should all be of the same type (reactions OR molecules) + y: None + Ignored. + copy : bool, default=None + Ignored + + Returns + ------- + X : {array-like} of shape (n_samples, n_features) + Transformed data. + """ + + if self.variant=='l': + get_b2r2_molecular=b2r2.get_b2r2_l_molecular + combine = lambda r,p: p-r + elif self.variant=='a': + get_b2r2_molecular = b2r2.get_b2r2_a_molecular + combine = lambda r,p: p-r + elif self.variant=='n': + get_b2r2_molecular=b2r2.get_b2r2_n_molecular + combine = lambda r,p: np.hstack((r,p)) + else: + raise RuntimeError(f'Unknown B2R2 {variant=}') + + if isinstance(X[0], tuple) and len(X[0]) == 2: + mode = "reac-2" + first_array = self._get_reac_array(X[0][0], X[0][1], get_b2r2_molecular, combine) + elif isinstance(X[0], b2r2.Reaction): + mode = "reac" + first_array = self._get_reac_array(X[0].reactants, X[0].products, get_b2r2_molecular, combine) + elif hasattr(X[0], "numbers") and hasattr(X[0], "positions"): + mode = "mol" + first_array = self._get_mol_array(X[0], get_b2r2_molecular) + else: + raise ValueError("unknown type of input") + + assert first_array.ndim==1 + full_array = np.empty_like(first_array, shape=(len(X), *first_array.shape)) + full_array[0] = first_array + + for object_i,x in enumerate(X[1:]): + if mode == "reac-2": + full_array[object_i+1] = self._get_reac_array(x[0], x[1], get_b2r2_molecular, combine) + elif mode == "reac": + full_array[object_i+1] = self._get_reac_array(x.reactants, x.products, get_b2r2_molecular, combine) + elif mode == "mol": + full_array[object_i+1] = self._get_mol_array(x, get_b2r2_molecular) + return full_array + + def _get_reac_array(self, reactants, products, mol_rep_func, combine): + reac_repr = self._get_mol_array(reactants[0], mol_rep_func) + for reac in reactants[1:]: + reac_repr += self._get_mol_array(reac, mol_rep_func) + prod_repr = self._get_mol_array(products[0], mol_rep_func) + for prod in products[1:]: + prod_repr += self._get_mol_array(prod, mol_rep_func) + return combine(reac_repr, prod_repr) + + def _get_mol_array(self, mol, mol_rep_func): + return mol_rep_func(mol.numbers, mol.positions, self.rcut, self.gridspace, self.elements_) diff --git a/tests/test_regression.py b/tests/test_regression.py index 9702fa5..a60e606 100755 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -3,6 +3,7 @@ import os import numpy as np import qstack.regression.hyperparameters as hyperparameters +import qstack.regression.hyperparameters2 as hyperparameters2 import qstack.regression.regression as regression import qstack.regression.final_error as final_error import qstack.regression.condition as condition @@ -17,12 +18,43 @@ def test_hyperparameters(): yfile 
= os.path.join(path, 'data/mols/dipole.dat') y = np.loadtxt(yfile) - hyper = hyperparameters.hyperparameters(X, y)[-4:] + hyper = hyperparameters.hyperparameters(X, y, stddev_portion=0)[-4:] true_hyper = [ [5.18303885e-01,3.00507798e-01,1.00000000e-05,3.16227766e+01], [5.18262897e-01,3.00473853e-01,3.16227766e-08,3.16227766e+01], [5.18262767e-01,3.00473746e-01,1.00000000e-10,3.16227766e+01], [5.10592542e-01,3.38247735e-01,1.00000000e+00,3.16227766e+01]] + + assert (np.allclose(hyper, true_hyper)) + +def test_hyperparameters_adapt2(): + #import logging + #logging.basicConfig() + #logging.getLogger("qstack").setLevel('DEBUG') + path = os.path.dirname(os.path.realpath(__file__)) + xfile = os.path.join(path, 'data/mols/X_lb.npy') + X = np.load(xfile) + yfile = os.path.join(path, 'data/mols/dipole.dat') + y = np.loadtxt(yfile) + print(np.isfinite(X).all(), np.isfinite(y).all()) + + hyper = hyperparameters.hyperparameters(X, y, random_state=42, stddev_portion=1.0, adaptive_v2=True)[-4:] + true_hyper = [ + [5.15813198e-01, 2.37774396e-01, 3.16227766e-08, 3.64079252e+04], + [5.15719232e-01, 2.37430538e-01, 1.00000000e-10, 1.00000000e+06], + [5.15657638e-01, 2.37472003e-01, 1.00000000e-10, 3.64079252e+04], + [5.15699162e-01, 2.37420839e-01, 1.00000000e-10, 1.71990639e+05], + ] + + assert (np.allclose(hyper, true_hyper)) + + hyper = hyperparameters.hyperparameters(X, y, random_state=42, stddev_portion=0.0, adaptive_v2=True)[-4:] + true_hyper = [ + [5.10293083e-01, 2.45220151e-01, 1.00000000e-10, 3.25422973e+02], + [5.04772360e-01, 2.54644677e-01, 1.00000000e-05, 1.51009311e+02], + [5.04560208e-01, 2.54267311e-01, 3.16227766e-08, 1.51009311e+02], + [5.04559535e-01, 2.54266113e-01, 1.00000000e-10, 1.51009311e+02], + ] assert (np.allclose(hyper, true_hyper))
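
Usage notes (not part of the diff). Below is a minimal sketch of how the new gradient-free line search behaves, assuming the module layout introduced above (qstack.regression.hyperparameters2); the three quadratic points and the cosh objective are toy illustrations, not data from the package:

    import numpy as np
    from qstack.regression.hyperparameters2 import fit_quadratic, parabolic_search

    # fit_quadratic returns (curvature, slope, intercept) of the parabola through
    # three points; the points below lie on y = x**2 - 4*x + 5
    a, b, c = fit_quadratic(0.0, 1.0, 3.0, 5.0, 2.0, 2.0)
    assert np.allclose([a, b, c], [1.0, -4.0, 5.0])

    # parabolic_search brackets and refines the minimum of a convex scalar function;
    # cosh(x - 1) has its minimum at (1, 1).  In this PR the same routine is applied
    # to log(sigma), with the k-fold validation MAE as the objective.
    x_opt, y_opt = parabolic_search(-2.0, 3.0, lambda x: np.cosh(x - 1.0), n_iter=10)
    assert abs(x_opt - 1.0) < 0.1 and abs(y_opt - 1.0) < 0.01

And a hypothetical fit/transform round trip for the new sklearn-style B2R2Representation transformer (its docstring Examples section is still a fixme). This assumes ASE is installed; per the fit/transform code above, any objects exposing .numbers and .positions (in Å) should work the same way:

    from ase.build import molecule
    from qstack.qml.b2r2 import Reaction
    from qstack.regression.skl_objects import B2R2Representation

    # ethane -> ethylene + H2, built from ASE's g2 geometries
    rxn = Reaction(reactants=[molecule('C2H6')], products=[molecule('C2H4'), molecule('H2')])
    rep = B2R2Representation(variant='l')
    X = rep.fit([rxn]).transform([rxn])   # expected shape: (1, n_features)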