I am trying to implement my own version of the Super Learner in Python. Here is the code:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn import neighbors
from sklearn import datasets
import matplotlib.pyplot as plt
from scipy import optimize
from pandas.plotting import scatter_matrix
import numpy as np
import pandas as pd
class SuperLearner(BaseEstimator, RegressorMixin):
    def __init__(self, base_estimators):
        self.base_estimators = base_estimators
        self.meta_learner = linear_model.LinearRegression(positive=True)

    def rss(self, weights, X, y):
        # Residual sum of squares of the weighted combination of predictions.
        y_pred = np.dot(X, weights)
        return np.sum((y - y_pred) ** 2)

    def constraint(self, weights):
        # Equality constraint for SLSQP: the weights must sum to 1.
        return np.sum(weights) - 1

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        meta_predictions = np.zeros((X.shape[0], len(self.base_estimators)), dtype=np.float64)
        # TODO: modify the number of folds depending on the number of
        # base estimators and the size of the dataset
        kf = KFold(n_splits=5)
        # Build the out-of-fold prediction matrix, one column per base estimator.
        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            for j, estimator in enumerate(self.base_estimators):
                estimator.fit(X_train, y_train)
                meta_predictions[val_idx, j] = estimator.predict(X_val)
        # Method 1: SLSQP with weights bounded to [0, 1] and summing to 1.
        # Start from uniform weights (np.empty would hand the optimizer
        # uninitialized memory as a starting point).
        guess = np.full(len(self.base_estimators), 1 / len(self.base_estimators))
        bounds = [(0, 1)] * len(self.base_estimators)
        result = optimize.minimize(self.rss, guess, args=(meta_predictions, y),
                                   method='SLSQP', bounds=bounds,
                                   constraints={'type': 'eq', 'fun': self.constraint})
        print(result.x, np.sum(result.x))
        # Method 2: non-negative least squares.
        result = optimize.nnls(meta_predictions, y)
        print(result[0], np.sum(result[0]))
        # Method 3: sklearn's non-negative linear regression.
        self.meta_learner.fit(meta_predictions, y)
        # Fitted attributes use sklearn's trailing-underscore convention
        # so that check_is_fitted works as intended.
        self.weights_ = self.meta_learner.coef_
        self.weights_ = self.weights_ / np.sum(self.weights_)
        print(self.weights_, np.sum(self.weights_))
        # Refit every base estimator on the full training set so that
        # predict() does not rely on models fitted only on the last fold.
        for estimator in self.base_estimators:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        check_is_fitted(self, 'weights_')
        X = check_array(X)
        base_predictions = np.zeros((X.shape[0], len(self.base_estimators)), dtype=np.float64)
        for i, estimator in enumerate(self.base_estimators):
            base_predictions[:, i] = estimator.predict(X)
        return np.dot(base_predictions, self.weights_)

def main():
    np.random.seed(100)
    X, y = datasets.make_friedman1(1000)
    ols = linear_model.LinearRegression()
    elastic = linear_model.ElasticNetCV()
    ridge = linear_model.RidgeCV()
    lars = linear_model.LarsCV()
    lasso = linear_model.LassoCV()
    knn = neighbors.KNeighborsRegressor()
    super_learner = SuperLearner([ols, elastic, ridge, lars, lasso, knn])
    super_learner.fit(X, y)
    y_pred = super_learner.predict(X)
    print("MSE: ", np.mean((y_pred - y) ** 2))

if __name__ == "__main__":
    main()
I use three different methods to estimate the weight each model should get in the final prediction. While scipy.optimize.nnls and the optimization I implemented with scipy.optimize.minimize produce similar results, sklearn's LinearRegression produces completely different ones. I even looked at the LinearRegression source on GitHub, and it appears to call the same scipy function (scipy.optimize.nnls) when the positive parameter is set to True, as it is here. Does anyone know why?
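For reference, the same behaviour shows up outside the class. Below is a stripped-down sketch on synthetic data; the columns of A are just an illustrative stand-in for the out-of-fold prediction matrix, with one deliberately biased "base learner":

import numpy as np
from scipy import optimize
from sklearn import linear_model

rng = np.random.default_rng(0)
y = rng.normal(loc=10.0, scale=2.0, size=500)
# Each column mimics a base learner's out-of-fold predictions of y;
# the second one is systematically biased.
A = np.column_stack([
    y + rng.normal(scale=0.5, size=y.size),
    y - 3.0 + rng.normal(scale=0.5, size=y.size),
    y + rng.normal(scale=2.0, size=y.size),
])

w_nnls, _ = optimize.nnls(A, y)
lr = linear_model.LinearRegression(positive=True).fit(A, y)
print("nnls:   ", w_nnls)
print("sklearn:", lr.coef_, "intercept:", lr.intercept_)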
1 Answer
Try LinearRegression(positive=True, fit_intercept=False), because fit_intercept defaults to True. With an intercept, sklearn centers X and y before handing the problem to scipy.optimize.nnls, so the coefficients it returns solve a different (centered) least-squares problem than a direct nnls call on the raw prediction matrix.
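A minimal sketch to verify this on synthetic data (assuming a recent scikit-learn, where positive=True is solved via scipy.optimize.nnls for dense input):

import numpy as np
from scipy import optimize
from sklearn import linear_model

rng = np.random.default_rng(0)
y = rng.normal(loc=10.0, scale=2.0, size=500)
A = np.column_stack([y + rng.normal(scale=s, size=y.size) for s in (0.5, 1.0, 2.0)])

# Without an intercept, the two solvers agree exactly:
print(optimize.nnls(A, y)[0])
print(linear_model.LinearRegression(positive=True, fit_intercept=False).fit(A, y).coef_)

# With fit_intercept=True (the default), sklearn centers X and y first and
# only then calls nnls, i.e. it solves a different problem:
A_c, y_c = A - A.mean(axis=0), y - y.mean()
print(optimize.nnls(A_c, y_c)[0])
print(linear_model.LinearRegression(positive=True).fit(A, y).coef_)  # same as the line above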