Source code for mcmodels.models.homogeneous.subset_selection

"""
Greedy subset selection for conditioning of input data.
Rank Degeneracy and Least Squares Problems : Golub, Klema, Stewart 1976
"""
# Authors: Joseph Knox <josephk@alleninstitute.org>
# License: Allen Institute Software License

import numpy as np
import numpy.linalg as LA
import scipy.linalg as linalg
from sklearn.utils import check_random_state


[docs]def svd_subset_selection(X, n): """svd subset selection to return n cols that are less linearly dependent. Parameters ---------- X : array, shape (n_samples, n_features) Array whose columns we wish to subset. n : int The number of columns to return. Returns ------- C : array, shape (n_samples, n) Array containing the n ~least dependent columns of X. """ # NOTE: may want sklearn.utils.check_X_y X = np.atleast_2d(X.copy()) if n > X.shape[1]: raise ValueError("n must be less than the number of columns of X") if n < 1: raise ValueError("n must be at least 1") _, _, vh = linalg.svd(X, full_matrices=False) _, _, p = linalg.qr(vh[:n], pivoting=True) return p[:n]
[docs]def backward_subset_selection_conditioning(X, kappa=1000): """Conditioning through subselecting columns of X. Uses svd_subset_selection to greedily remove single columns of X till the desired conditioning is reached. Parameters ---------- X : array, shape (n_samples, n_features) Array whose columns we wish to subset. kappa : float, optional, default: 1000 The maximum condition number desired. Returns ------- C : array, shape (n_samples, ?) Array with condition number < kappa, containing the lesser linearly dependent columns of X. """ n_cols = X.shape[1] columns = np.arange(n_cols) # initialize for first iteration subset = np.arange(n_cols) while LA.cond(X[:, subset]) > kappa and len(columns) > 1: # greedily subset columns of X using svd subset selection subset = svd_subset_selection(X, len(columns) - 1) columns = columns[subset] return X[:, columns], columns
[docs]def forward_subset_selection_conditioning(X, kappa=1000, random_state=None): """Conditioning through subselecting columns of X. Randomly select initial column of X, then greedily add columns that minimially increase the conditioning. Parameters ---------- X : array, shape (n_samples, n_features) Array whose columns we wish to subset. kappa : float, optional, default: 1000 The maximum condition number desired. Returns ------- C : array, shape (n_samples, ?) Array with condition number < kappa, containing the lesser linearly dependent columns of X. """ n_cols = X.shape[1] candidates = list(range(n_cols)) # initial random subset of single column initial = check_random_state(random_state).choice(n_cols) columns = [initial] candidates.remove(initial) while len(columns) < n_cols: # greedily subset columns of X using svd subset selection condition = [LA.cond(X[:, columns + [c]]) for c in candidates] best = candidates[np.argmin(condition)] if LA.cond(np.atleast_2d(X[:, columns + [best]])) > kappa: break # update columns.append(best) candidates.remove(best) columns = np.sort(columns) return X[:, columns], columns