Python:结合分层 K 折交叉验证(StratifiedKFold)进行随机过采样时报错

0s7z1bwu  于 2023-05-27  发布在  Python
关注(0)|答案(2)|浏览(161)

我有一个 DataFrame,如下所示。数据集已用 StandardScaler 做了标准化,并为所有分类变量添加了虚拟变量(dummy variables)。现在它已被拆分为训练集和测试集。

amt    gender   city_pop    birth_year  distance        
153118  -0.786537   0.0    -0.318571    0.913779    -0.400876   
153226  -0.488455   0.0    -0.322397    0.741579     1.384297   
153228  0.437970    0.0    -0.329167    1.774776    -0.658839   
153303  -0.877627   0.0    -0.329656    1.258177    -1.100713   
153313  0.462143    1.0    -0.313817    1.372977     0.038791

我现在正试图用这些数据创建一些模型(如Logistic回归,决策树和随机森林),使用RandomOverSampler和StratifiedKFold交叉验证。这是因为我的目标变量上的少数类只有0.3%。
我已经用不平衡的数据创建了模型,运行正常。但是当我尝试进行过采样时,出现了下面的错误。我的代码如下。

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler

# Stratified 5-fold splitter, built from the target y passed to split().
skf = StratifiedKFold(n_splits=5, random_state=None)

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    # skf.split() yields *positional* row numbers, so rows must be selected by
    # position with .iloc.  The earlier .reindex(index=...) looked the numbers
    # up as index *labels*; when the index is not 0..n-1 (the sample rows are
    # labelled 153118, 153226, ...) every missing label became an all-NaN row,
    # which made fit_resample raise "Input contains NaN".
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    # Oversample the training part of the fold only; the test part is left
    # untouched.
    ROS = RandomOverSampler(sampling_strategy=0.5)
    X_over, y_over = ROS.fit_resample(X_train, y_train)

# Create DataFrame for X_over.
# NOTE: this runs after the loop, so it only wraps the last fold's resample.
X_over = pd.DataFrame(data=X_over, columns=X_train.columns)

我得到了下面的错误。

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-90-372645e869d1> in <module>
      4 oversample = RandomOverSampler(sampling_strategy=1)
      5 # fit and apply the transform
----> 6 X_over, y_over = oversample.fit_resample(X_train, y_train)

~\anaconda3\lib\site-packages\imblearn\base.py in fit_resample(self, X, y)
     73             The corresponding label of `X_resampled`.
     74         """
---> 75         check_classification_targets(y)
     76         arrays_transformer = ArraysTransformer(X, y)
     77         X, y, binarize_y = self._check_X_y(X, y)

~\anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
    178     y : array-like
    179     """
--> 180     y_type = type_of_target(y)
    181     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    182                       'multilabel-indicator', 'multilabel-sequences']:

~\anaconda3\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
    301     if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
    302         # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
--> 303         _assert_all_finite(y)
    304         return 'continuous' + suffix
    305 

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
    104                     msg_err.format
    105                     (type_err,
--> 106                      msg_dtype if msg_dtype is not None else X.dtype)
    107             )
    108     # for object dtype data, we only check for NaNs (GH-13254)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
py49o6xq

py49o6xq1#

看了数据再回答比较好。但我建议在交叉验证步骤之前进行过采样。请试试看。

vbopmzt1

vbopmzt12#

你可以这样做

# Per-fold result holders; this snippet creates them but does not fill them.
auc_scores, best_clf = [], []

# `clf` is the classifier being tuned.
for i, fold_indices in enumerate(cv.split(X_train, y_train)):
    train_index, test_index = fold_indices
    print("TRAIN:", train_index, "TEST:", test_index)

    # Positional selection of this fold's training and held-out rows.
    X_train_cv = X_train.iloc[train_index]
    X_test_cv = X_train.iloc[test_index]
    y_train_cv = y_train.iloc[train_index]
    y_test_cv = y_train.iloc[test_index]

    # Resample the training part of the fold only.
    ros = RandomOverSampler(random_state=42)
    X_train_ros, y_train_ros = ros.fit_resample(X_train_cv, y_train_cv)
    print(X_train_ros.shape, y_train_ros.shape)

    # Grid search over `params`, scored by ROC AUC, on the resampled data.
    grid_clf_acc = GridSearchCV(
        clf,
        param_grid=params,
        scoring='roc_auc',
        verbose=2,
    )
    grid_clf_acc.fit(X_train_ros, y_train_ros)

训练模型

# Per-fold ROC AUC on the held-out split, and the tuned model that produced it.
auc_scores = []
best_clf = []

# The estimator and the search grid are identical for every fold, so they are
# built once up front instead of on each iteration.
clf = LogisticRegression()

# NOTE(review): with the default solver the 'l1' candidates presumably fail to
# fit, and newer scikit-learn releases spell "no penalty" as None rather than
# 'none' -- confirm against the installed version before relying on this grid.
params = {
    'penalty': ['l1', 'l2', 'none'],
    'C': [10, 1.0, 0.1, 0.01],
}

for i, (train_index, test_index) in enumerate(cv.split(X_train, y_train)):

    print("TRAIN:", train_index, "TEST:", test_index)

    # Positional selection of this fold's training and held-out rows.
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    # Oversample the training part only; the held-out part keeps its original
    # class balance.
    ros = RandomOverSampler(random_state=42)
    X_train_ros, y_train_ros = ros.fit_resample(X_train_cv, y_train_cv)
    print(X_train_ros.shape, y_train_ros.shape)

    # Grid search for the best penalty / C on the resampled training data.
    grid_clf_acc = GridSearchCV(clf, param_grid=params, scoring='roc_auc', verbose=2)
    grid_clf_acc.fit(X_train_ros, y_train_ros)

    # ROC AUC is a ranking metric, so score the held-out rows with the
    # positive-class probability rather than the hard labels from predict().
    # Assumes a binary target whose positive class is the second column of
    # predict_proba -- TODO confirm.
    y_pred_prob = grid_clf_acc.best_estimator_.predict_proba(X_test_cv)[:, 1]

    # ROC AUC of this fold's best estimator on the untouched held-out rows.
    acc = roc_auc_score(y_test_cv, y_pred_prob)

    # Record the score and the estimator that achieved it.
    auc_scores.append(acc)
    best_clf.append(grid_clf_acc.best_estimator_)

    # Report this fold's result.
    print(auc_scores[i])
    print(best_clf[i])

相关问题