I took inspiration from this post: feature-selection-for-regression-data.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import altair as alt

Generate Regression Dataset

The scikit-learn library has a function, make_regression, which generates a synthetic regression dataset.

Here is a dataset with 5 features, where only 2 contain any information that influences the target (y).

X, y, coefarray = make_regression(n_samples=1000, n_features=5, n_informative=2, noise=0, random_state=1, coef=True)
columns = ['x%s' % key for key in range(X.shape[1])]
data = pd.DataFrame(data=X, columns=columns)
data['y']=y
alt.Chart(data).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),

).properties(
    width=100,
    height=150
).repeat(
    row=['y'],
    column=columns,
).interactive()

Note that the data looks like it has some noise when plotted in this way, even though it doesn't. Here are the coefficients of the model that generated this data:

fig,ax=plt.subplots()
ax.bar(x=columns,height=coefarray);
ax.set_title('Coefficients of the real underlying model');

And if we fit a LinearRegression to this data, we get the same coefficients:

model=LinearRegression()
model.fit(X=X,y=y);
fig,ax=plt.subplots()
ax.bar(x=columns,height=model.coef_);
ax.set_title('Coefficients of the regressed model');
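As a quick sanity check (a small sketch added here, not part of the original post), the fitted coefficients should match the ones returned by make_regression to within floating-point precision:

# expected to be True: with noise=0 the regression recovers the coefficients exactly
np.allclose(model.coef_, coefarray)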

And the goodness of fit ($R^2$) is 1, so we have no noise and a perfect model.

model.score(X=X, y=y)
1.0

But maybe having no noise is not so realistic, so we add some noise:

X, y, coefarray = make_regression(n_samples=300, n_features=5, n_informative=2, noise=100, random_state=1, coef=True)
columns = ['x%s' % key for key in range(X.shape[1])]
data = pd.DataFrame(data=X, columns=columns)
data['y']=y
alt.Chart(data).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),

).properties(
    width=100,
    height=150
).repeat(
    row=['y'],
    column=columns,
).interactive()
fig,ax=plt.subplots()
ax.bar(x=columns,height=coefarray);
ax.set_title('Coefficients of the real underlying model');

As a consequence, the LinearRegression can no longer recover the exact underlying model:

model=LinearRegression()
model.fit(X=X,y=y);
fig,ax=plt.subplots()
ax.bar(x=columns,height=model.coef_);
ax.set_title('Coefficients of the regressed model');

Splitting the data in a training set and a testing set:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
from sklearn.metrics import mean_absolute_error
model.fit(X=X_train, y=y_train);
yhat = model.predict(X=X_test)
mean_absolute_error(y_true=y_test, y_pred=yhat)
84.98706397499996
model.score(X=X_test, y=y_test)
0.4323407342654969
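For context (a rough back-of-the-envelope check, not from the original post): the noise added above is Gaussian with standard deviation 100, so even the true underlying model would have an expected absolute error of about $100 \cdot \sqrt{2/\pi} \approx 80$. An MAE around 85 is therefore not far from the irreducible error floor.

# expected absolute error of the noise alone: sigma * sqrt(2/pi)
100 * np.sqrt(2 / np.pi)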

Numerical Feature Selection

Correlation Feature Selection

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
# compute the F-statistic and p-value for each feature
F, p_val = f_regression(X, y)
fs = SelectKBest(score_func=f_regression, k='all')
# learn relationship from training data
fs.fit(X_train, y_train)
# transform train input data
X_train_fs = fs.transform(X_train)
# transform test input data
X_test_fs = fs.transform(X_test)
fig,ax = plt.subplots()
ax.bar(x=columns, height=fs.scores_)
ax.set_title('Correlation Feature Importance');

Obviously x3 and x4 are the relevant ones.
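The p-values returned by f_regression earlier (stored in p_val but not used so far) tell the same story; as a small sketch, the informative features should have p-values very close to zero:

# p-values of the univariate F-test for each feature
pd.Series(data=p_val, index=columns)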

Tune the Number of Selected Features

Instead of guessing, we can systematically test a range of different numbers of selected features and discover which results in the best performing model. This is called a grid search, where the k argument to the SelectKBest class can be tuned.

from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
results = []
num_features = np.arange(1,X.shape[1])
for k in num_features:
    # create pipeline
    model = LinearRegression()
    fs = SelectKBest(score_func=f_regression, k=k)
    pipeline = Pipeline(steps=[('sel',fs), ('lr', model)])
    
    # evaluate the model
    cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='r2', cv=cv, n_jobs=-1)
    results.append(scores)
fig,ax=plt.subplots()
ax.boxplot(results, labels=num_features, showmeans=True);
ax.set_xlabel('Number of features')
ax.set_ylabel('$R^2$');
ax.set_title('Box and Whisker Plots of $R^2$ for Each Number of Selected Features');

Here is a nice description of the anatomy of the Box and Whisker Plots.

Obviously, using only 1 feature is not a good idea, but what about using 2, 3 or 4?

fig,ax=plt.subplots()
ax.boxplot(results[1:], labels=num_features[1:], showmeans=True);
ax.set_xlabel('Number of features')
ax.set_ylabel('$R^2$');
ax.set_title('Box and Whisker Plots of $R^2$ for Each Number of Selected Features');

And if we look at the mean score for each number of features:

fig,ax=plt.subplots()
mean_scores = np.mean(results, axis=1)
ax.bar(x=num_features, height=mean_scores);
ax.set_title('Mean score for number of features from the cross validation')
ax.set_xlabel('Number of features')
ax.set_ylabel('$mean(R^2)$');

It seems that a model with two features will do best, even though there is just a small difference.

It seems that as long as there are enough features, the cross validation score is good. Having too many features in the first-order linear regression does not seem to be a problem.

There is a way to perform the grid search automatically in Scikit-learn.

from sklearn.model_selection import GridSearchCV
# define the grid
grid = dict()
grid['sel__k'] = [i for i in range(1, X.shape[1] + 1)]  # every k from 1 up to the number of features
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
# define the pipeline to evaluate
model = LinearRegression()
fs = SelectKBest(score_func=f_regression)
pipeline = Pipeline(steps=[('sel',fs), ('lr', model)])

# define the grid search
search = GridSearchCV(estimator=pipeline, param_grid=grid, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
# perform the search
search_result = search.fit(X, y)
search_result.best_estimator_
Pipeline(steps=[('sel',
                 SelectKBest(k=2,
                             score_func=<function f_regression at 0x132751048>)),
                ('lr', LinearRegression())])

...and the grid search also gave k=2 as the best.
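The chosen k and its cross-validated score can also be read directly from the search result (the exact score will vary with the data and random state):

search_result.best_params_   # e.g. {'sel__k': 2}, matching the best_estimator_ above
search_result.best_score_    # the (negative) mean absolute error, since scoring='neg_mean_absolute_error'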

What about higher-order linear regression?

from sklearn.preprocessing import PolynomialFeatures

We create some new data by applying the following transformation:

$y=x_3^2 + x_3 \cdot x_4 + x_4^2$

def real_model(X, noise_std=0):
    # y depends only on x3 and x4, plus optional Gaussian noise
    noise = np.random.normal(loc=0, scale=noise_std, size=X.shape[0])
    x_3 = X[:, 3]
    x_4 = X[:, 4]
    y_polynom = x_3**2 + x_3*x_4 + x_4**2 + noise
    return y_polynom
    
np.random.seed(0)
noise_std=1
y_polynom = real_model(X=X, noise_std=noise_std)
y_polynom_train = real_model(X=X_train, noise_std=noise_std)
y_polynom_test = real_model(X=X_test, noise_std=noise_std)
data_polynom = pd.DataFrame(data=X, columns=columns)
data_polynom['y']=y_polynom
alt.Chart(data_polynom).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),

).properties(
    width=100,
    height=150
).repeat(
    row=['y'],
    column=columns,
).interactive()

Fitting a linear model to this data no longer works very well:

model = LinearRegression()
model.fit(X=X, y=y_polynom)
model.score(X=X, y=y_polynom)
0.034330021602930305

But if we transform the features to polynomial features, linear regression works again. PolynomialFeatures expands x0, x1, x2, x3, x4 into polynomial features, as shown below.
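As a tiny illustration (a made-up two-feature sample, not part of the dataset), a degree-2 PolynomialFeatures expansion of a sample (a, b) produces the columns [1, a, b, a^2, a*b, b^2]:

# toy example: one sample with features a=2 and b=3
toy = np.array([[2.0, 3.0]])
PolynomialFeatures(degree=2).fit_transform(toy)  # -> [[1., 2., 3., 4., 6., 9.]]

Here are the resulting feature names for our five features: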

polynomial_features = PolynomialFeatures(degree=2)
polynomial_features.fit(X=X, y=y_polynom)
polynomial_features.get_feature_names()
['1',
 'x0',
 'x1',
 'x2',
 'x3',
 'x4',
 'x0^2',
 'x0 x1',
 'x0 x2',
 'x0 x3',
 'x0 x4',
 'x1^2',
 'x1 x2',
 'x1 x3',
 'x1 x4',
 'x2^2',
 'x2 x3',
 'x2 x4',
 'x3^2',
 'x3 x4',
 'x4^2']
linear_regression = LinearRegression()
polynomial_features = PolynomialFeatures(degree=2)
pipeline = Pipeline(steps=[('polynomial_features',polynomial_features), ('linear_regression', linear_regression)])
pipeline.fit(X=X, y=y_polynom)
pipeline.score(X=X, y=y_polynom)
0.8273580623429667
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
# define the pipeline to evaluate
select_k_best = SelectKBest(score_func=f_regression)

steps = [
    ('polynomial_features',polynomial_features),
    ('sel',select_k_best),
    ('linear_regression', linear_regression),
]

pipeline = Pipeline(steps=steps)

# define the grid search
# note: the same sel__k grid defined earlier (1 up to the number of original features) is reused here
search = GridSearchCV(estimator=pipeline, param_grid=grid, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
# perform the search
search_result = search.fit(X, y_polynom)
/Users/martinalexandersson/Dev/blog/venv/lib/python3.6/site-packages/sklearn/feature_selection/_univariate_selection.py:302: RuntimeWarning: divide by zero encountered in true_divide
  corr /= X_norms
/Users/martinalexandersson/Dev/blog/venv/lib/python3.6/site-packages/sklearn/feature_selection/_univariate_selection.py:307: RuntimeWarning: invalid value encountered in true_divide
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
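The divide-by-zero warnings above are most likely caused by the constant bias column ('1') that PolynomialFeatures adds: after centering it has zero variance, so f_regression cannot compute a meaningful correlation for it. Judging by the selected features further below, the warnings appear to be harmless here, but as a sketch, one way to avoid them would be to drop the constant column (the variable name is just for illustration and is not used in the rest of this post):

# alternative transformer without the constant '1' column
polynomial_features_no_bias = PolynomialFeatures(degree=2, include_bias=False)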
search_result.best_estimator_
Pipeline(steps=[('polynomial_features', PolynomialFeatures()),
                ('sel',
                 SelectKBest(k=3,
                             score_func=<function f_regression at 0x132751048>)),
                ('linear_regression', LinearRegression())])
search_result.best_estimator_.score(X=X_test, y=y_polynom_test)
0.6837781841335228
selector_best = search_result.best_estimator_['sel']
selector_best
SelectKBest(k=3, score_func=<function f_regression at 0x132751048>)

To see which features have been selected, you can use the following mask:

selector_best.get_support()
array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True,  True])
feature_names = np.array(polynomial_features.get_feature_names())
feature_names[selector_best.get_support()]
array(['x3^2', 'x3 x4', 'x4^2'], dtype='<U5')

So it seems that we have recovered the original expression.
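Since the generating expression is $y=x_3^2 + x_3 \cdot x_4 + x_4^2$, all three true coefficients are 1, so we would also expect the fitted coefficients of the final regression step to be close to 1 (a small check; the exact values will vary slightly because of the noise):

# coefficients of the linear regression on the three selected polynomial features
search_result.best_estimator_['linear_regression'].coef_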

results = []
num_features = np.arange(1, len(feature_names) + 1)  # k=0 would leave no features for the regression
for k in num_features:
    # create pipeline
    
    select_k_best = SelectKBest(score_func=f_regression, k=k)

    steps = [
        ('polynomial_features',polynomial_features),
        ('sel',select_k_best),
        ('linear_regression', linear_regression),
    ]
    
    pipeline = Pipeline(steps=steps)
    
    scores = cross_val_score(pipeline, X, y_polynom, scoring='r2', cv=cv, n_jobs=-1)
    results.append(scores)
fig,ax=plt.subplots()
ax.boxplot(results, labels=num_features, showmeans=True);
ax.set_xlabel('Number of features')
ax.set_ylabel('$R^2$');
ax.set_title('Box and Whisker Plots of $R^2$ for Each Number of Selected Features');
fig,ax=plt.subplots()
ax.boxplot(results[4:], labels=num_features[4:], showmeans=True);
ax.set_xlabel('Number of features')
ax.set_ylabel('$R^2$');
ax.set_title('Box and Whisker Plots of $R^2$ for Each Number of Selected Features');

So the conclusion is similar here as well: as long as we have a sufficient number of features, the model gives good results.