Feature Selection for Regression Data
I recently developed a model to predict ship roll damping using linear regression. I spent quite a lot of time on the feature selection, so here I want to revisit it using a synthetic regression dataset.
I took inspiration from this post: feature-selection-for-regression-data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import altair as alt
X, y, coefarray = make_regression(n_samples=1000, n_features=5, n_informative=2, noise=0, random_state=1, coef=True)
columns = ['x%s' % key for key in range(X.shape[1])]
data = pd.DataFrame(data=X, columns=columns)
data['y']=y
alt.Chart(data).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
).properties(
    width=100,
    height=150
).repeat(
    row=['y'],
    column=columns,
).interactive()
Note that the data looks noisy when plotted in this way, even though it isn't: the apparent scatter in each panel comes from the other informative feature, which also varies. Here are the coefficients of the model that generated this data:
fig,ax=plt.subplots()
ax.bar(x=columns,height=coefarray);
ax.set_title('Coefficients of the real underlying model');
And if we fit a LinearRegression to this data, we get the same coefficients:
model=LinearRegression()
model.fit(X=X,y=y);
fig,ax=plt.subplots()
ax.bar(x=columns,height=model.coef_);
ax.set_title('Coefficients of the regressed model');
And the goodness of fit ($R^2$) is 1, so with no noise we get a perfect model.
model.score(X=X, y=y)
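As a quick sanity check (this comparison is my own addition, not part of the original analysis), we can confirm numerically that the regressed coefficients match the generating ones:
# with zero noise the fitted coefficients should match the generating
# coefficients to within machine precision, and the intercept should be ~0
print(np.allclose(model.coef_, coefarray))
print(model.intercept_)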
But having no noise is not very realistic, so let's add some noise:
X, y, coefarray = make_regression(n_samples=300, n_features=5, n_informative=2, noise=100, random_state=1, coef=True)
columns = ['x%s' % key for key in range(X.shape[1])]
data = pd.DataFrame(data=X, columns=columns)
data['y']=y
alt.Chart(data).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
).properties(
    width=100,
    height=150
).repeat(
    row=['y'],
    column=columns,
).interactive()
fig,ax=plt.subplots()
ax.bar(x=columns,height=coefarray);
ax.set_title('Coefficients of the real underlying model');
As a consequence, LinearRegression can no longer recover the exact underlying model:
model=LinearRegression()
model.fit(X=X,y=y);
fig,ax=plt.subplots()
ax.bar(x=columns,height=model.coef_);
ax.set_title('Coefficients of the regressed model');
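To quantify how far off the fitted coefficients are (a small check of my own, not from the original post), we can tabulate the true and estimated coefficients side by side:
# side-by-side comparison of the generating and regressed coefficients
coef_comparison = pd.DataFrame({'true': coefarray, 'estimated': model.coef_}, index=columns)
coef_comparison['difference'] = coef_comparison['estimated'] - coef_comparison['true']
coef_comparison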
Splitting the data in a training set and a testing set:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
from sklearn.metrics import mean_absolute_error
model.fit(X=X_train, y=y_train);
yhat = model.predict(X=X_test)
mean_absolute_error(y_true=y_test, y_pred=yhat)
model.score(X=X_test, y=y_test)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
F, p_val = f_regression(X,y)
fs = SelectKBest(score_func=f_regression, k='all')
# learn relationship from training data
fs.fit(X_train, y_train)
# transform train input data
X_train_fs = fs.transform(X_train)
# transform test input data
X_test_fs = fs.transform(X_test)
fig,ax = plt.subplots()
ax.bar(x=columns, height=fs.scores_)
ax.set_title('Correlation Feature Importance');
Clearly x3 and x4 are the relevant features.
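This can also be backed up with the F-statistics and p-values from f_regression computed above (the table below is my own addition); the same two features should stand out:
# F-statistics and p-values per feature, sorted by F
feature_scores = pd.DataFrame({'F': F, 'p_value': p_val}, index=columns)
feature_scores.sort_values(by='F', ascending=False)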
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
results = []
num_features = np.arange(1,X.shape[1])
for k in num_features:
    # create pipeline
    model = LinearRegression()
    fs = SelectKBest(score_func=f_regression, k=k)
    pipeline = Pipeline(steps=[('sel', fs), ('lr', model)])
    # evaluate the model
    cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='r2', cv=cv, n_jobs=-1)
    results.append(scores)
fig,ax=plt.subplots()
ax.boxplot(results, labels=num_features, showmeans=True);
ax.set_xlabel('Number of features')
ax.set_ylabel('$R^2$');
ax.set_title('Box and Whisker Plots of $R^2$ for Each Number of Selected Features');
Here is a nice description of the anatomy of the Box and Whisker Plots.
Obviously using only 1 feature is not a good idea, but what about using 2, 3 or 4?
fig,ax=plt.subplots()
ax.boxplot(results[1:], labels=num_features[1:], showmeans=True);
ax.set_xlabel('Number of features')
ax.set_ylabel('$R^2$');
ax.set_title('Box and Whisker Plots of $R^2$ for Each Number of Selected Features');
And if we look at the mean score for each number of features:
fig,ax=plt.subplots()
mean_scores = np.mean(results, axis=1)
ax.bar(x=num_features, height=mean_scores);
ax.set_title('Mean score for number of features from the cross validation')
ax.set_xlabel('Number of features')
ax.set_ylabel('$mean(R^2)$')
It seems that a model with two features does best, even though the difference is small.
As long as there are enough features, the cross-validation score stays good; having too many features in this first-order linear regression does not seem to be a problem.
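To put numbers on this (my own addition), we can print the mean and spread of the cross-validated $R^2$ for each number of selected features:
# mean and standard deviation of the cross-validated R^2 per number of features
for k, scores in zip(num_features, results):
    print('k=%i: mean R^2 = %0.3f (std %0.3f)' % (k, np.mean(scores), np.std(scores)))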
Scikit-learn can also perform this search over the number of features automatically, using GridSearchCV.
from sklearn.model_selection import GridSearchCV
# define the grid
grid = dict()
grid['sel__k'] = [i for i in range(1, X.shape[1] + 1)]  # k must be between 1 and the number of features
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
# define the pipeline to evaluate
model = LinearRegression()
fs = SelectKBest(score_func=f_regression)
pipeline = Pipeline(steps=[('sel',fs), ('lr', model)])
# define the grid search
search = GridSearchCV(estimator=pipeline, param_grid=grid, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
# perform the search
search_result = search.fit(X, y)
search_result.best_estimator_
...and the grid search also gave k=2 as the best.
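The selected value of k and the scores of all candidates can also be read directly from the search result (shown here as an extra check, not part of the original post):
print(search_result.best_params_)
# mean cross-validated negative MAE for each candidate value of k
pd.DataFrame(search_result.cv_results_)[['param_sel__k', 'mean_test_score']]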
from sklearn.preprocessing import PolynomialFeatures
We create some new data by applying the following transformation (plus some Gaussian noise):
$y=x_3^2 + x_3 \cdot x_4 + x_4^2$
def real_model(X, noise_std=0):
    # the "true" underlying model: a second-order polynomial in x3 and x4 plus Gaussian noise
    noise = np.random.normal(loc=0, scale=noise_std, size=X.shape[0])
    x_3 = X[:, 3]
    x_4 = X[:, 4]
    y_polynom = x_3**2 + x_3*x_4 + x_4**2 + noise
    return y_polynom
np.random.seed(0)
noise_std=1
y_polynom = real_model(X=X, noise_std=noise_std)
y_polynom_train = real_model(X=X_train, noise_std=noise_std)
y_polynom_test = real_model(X=X_test, noise_std=noise_std)
data_polynom = pd.DataFrame(data=X, columns=columns)
data_polynom['y']=y_polynom
alt.Chart(data_polynom).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
).properties(
    width=100,
    height=150
).repeat(
    row=['y'],
    column=columns,
).interactive()
Fitting a first-order linear model with LinearRegression no longer works very well:
model = LinearRegression()
model.fit(X=X, y=y_polynom)
model.score(X=X, y=y_polynom)
But if we transform the features to polynomial features, linear regression works again. PolynomialFeatures expands the original features x0, ..., x4 into all polynomial terms up to the chosen degree; with 5 features and degree 2 this gives 21 terms (a bias term, the 5 linear terms and the 15 quadratic terms):
polynomial_features = PolynomialFeatures(degree=2)
polynomial_features.fit(X=X, y=y_polynom)
polynomial_features.get_feature_names()
linear_regression = LinearRegression()
polynomial_features = PolynomialFeatures(degree=2)
pipeline = Pipeline(steps=[('polynomial_features',polynomial_features), ('linear_regression', linear_regression)])
pipeline.fit(X=X, y=y_polynom)
pipeline.score(X=X, y=y_polynom)
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
# define the pipeline to evaluate
select_k_best = SelectKBest(score_func=f_regression)
steps = [
    ('polynomial_features', polynomial_features),
    ('sel', select_k_best),
    ('linear_regression', linear_regression),
]
pipeline = Pipeline(steps=steps)
# define the grid search
search = GridSearchCV(estimator=pipeline, param_grid=grid, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
# perform the search
search_result = search.fit(X, y_polynom)
search_result.best_estimator_
search_result.best_estimator_.score(X=X_test, y=y_polynom_test)
selector_best = search_result.best_estimator_['sel']
selector_best
To see which features have been selected, you can use the support mask:
selector_best.get_support()
feature_names = np.array(polynomial_features.get_feature_names())
feature_names[selector_best.get_support()]
So it seems that we found the original expression.
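We can also check the coefficients that the linear regression assigned to the selected terms; with the generating model $y=x_3^2 + x_3 \cdot x_4 + x_4^2$ and little noise they should all be close to 1 (this check, and the variable names in it, are my own additions):
# coefficients of the selected polynomial terms in the best pipeline
selected_names = feature_names[selector_best.get_support()]
linear_regression_best = search_result.best_estimator_['linear_regression']
pd.Series(data=linear_regression_best.coef_, index=selected_names)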
results = []
num_features = np.arange(1, len(feature_names) + 1)  # k must be at least 1
for k in num_features:
    # create pipeline
    select_k_best = SelectKBest(score_func=f_regression, k=k)
    steps = [
        ('polynomial_features', polynomial_features),
        ('sel', select_k_best),
        ('linear_regression', linear_regression),
    ]
    pipeline = Pipeline(steps=steps)
    scores = cross_val_score(pipeline, X, y_polynom, scoring='r2', cv=cv, n_jobs=-1)
    results.append(scores)
fig,ax=plt.subplots()
ax.boxplot(results, labels=num_features, showmeans=True);
ax.set_xlabel('Number of features')
ax.set_ylabel('$R^2$');
ax.set_title('Box and Whisker Plots of $R^2$ for Each Number of Selected Features');
fig,ax=plt.subplots()
ax.boxplot(results[4:], labels=num_features[4:], showmeans=True);
ax.set_xlabel('Number of features')
ax.set_ylabel('$R^2$');
ax.set_title('Box and Whisker Plots of $R^2$ for Each Number of Selected Features');
So the conclusion is similar here as well: as long as we have a sufficient number of features, the model gives good results.
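As a final check (my own addition, reusing the results from the loop above), the mean cross-validated $R^2$ per number of selected polynomial features can be listed explicitly:
# mean R^2 per number of selected polynomial features
mean_scores_polynom = np.mean(results, axis=1)
pd.Series(data=mean_scores_polynom, index=num_features)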