import pandas as pd
# Number of rows that go to the training portion (70% of the data, truncated to an int)
train_size = int(0.7 * len(df))
# Sample all rows (frac=1) so that every row appears once, in random order
shuffle_df = df.sample(frac=1)
# The leading train_size shuffled rows form the train set; the rest is the test set
train_set, test_set = shuffle_df[:train_size], shuffle_df[train_size:]
import numpy as np
from itertools import chain
def _indexing(x, indices):
"""
:param x: array from which indices has to be fetched
:param indices: indices to be fetched
:return: sub-array from given array and indices
"""
# np array indexing
if hasattr(x, 'shape'):
return x[indices]
# list indexing
return [x[idx] for idx in indices]
def train_test_split(*arrays, test_size=0.25, shufffle=True, random_seed=1, shuffle=None):
    """
    Split each of the given arrays into a train part and a test part.

    All arrays are split with the same indices, so rows that belong together
    in different arrays stay aligned.

    :param arrays: one or more equally long arrays to split in train and test
    :param test_size: fraction of the data used for the test set, in range (0,1);
        the number of test items is rounded up
    :param shufffle: whether to shuffle before splitting (historical, misspelled
        name kept so existing callers keep working)
    :param random_seed: seed for the ``numpy.random.RandomState`` used to shuffle
    :param shuffle: correctly spelled alias of ``shufffle``; when it is not
        ``None`` it takes precedence over ``shufffle``
    :return: list with 2*len(arrays) entries, ordered as
        [train_0, test_0, train_1, test_1, ...]
    :raises AssertionError: if ``test_size`` is outside (0,1), no array is
        given, or the arrays differ in length
    """
    # checks
    assert 0 < test_size < 1, "test_size must lie strictly between 0 and 1"
    assert len(arrays) > 0, "at least one array is required"
    length = len(arrays[0])
    for arr in arrays:
        assert len(arr) == length, "all arrays must have the same length"

    # the correctly spelled keyword wins whenever the caller supplies it
    if shuffle is not None:
        shufffle = shuffle

    n_test = int(np.ceil(length * test_size))
    n_train = length - n_test

    if shufffle:
        # a dedicated RandomState keeps the split reproducible for a given
        # seed without touching numpy's global random state
        perm = np.random.RandomState(random_seed).permutation(length)
        test_indices = perm[:n_test]
        train_indices = perm[n_test:]
    else:
        # without shuffling, the test set is simply the tail of the data
        train_indices = np.arange(n_train)
        test_indices = np.arange(n_train, length)

    return list(chain.from_iterable(
        (_indexing(x, train_indices), _indexing(x, test_indices)) for x in arrays
    ))
import numpy as np
# Number of rows for the training split (80% of the data); converted to int
# because a float cannot be used as a slice bound
num_of_rows = int(len(X_data) * 0.8)
# Work on a copy: np.random.shuffle reorders in place, and `.values` may be a
# view of the frame's data (it can also be read-only under pandas copy-on-write)
values = X_data.values.copy()
np.random.shuffle(values)  # shuffles the rows in place to make the order random
train_data = values[:num_of_rows]  # first num_of_rows shuffled rows: training data
test_data = values[num_of_rows:]  # remaining shuffled rows: test data
5条答案
按热度按时间lndjwyie1#
我知道您的问题是只对
numpy
或scipy
执行train_test_split,但实际上用Pandas有一个非常简单的方法可以做到这一点,适合那些想要快速、简单解决方案的人:
pu3pd22g2#
虽然这是个老问题,但这个答案可能会有所帮助。
这就是sklearn实现
train_test_split
的方式,下面给出的这个方法采用与sklearn类似的参数。当然,sklearn的实现还支持分层K折、Pandas Series的拆分等;而这个方法只适用于拆分列表和numpy数组,我认为这足以满足您的情况。
mqkwyuun3#
这段代码应该可以工作(假设
X_data
是一个PandasDataFrame):希望这对你有帮助!
cunj1qz14#
假设您想要随机分割,我们先建立一个索引列表,其长度与您拥有的数据点数目(即X_data第一个轴的长度)相同。然后,我们将这些索引按随机顺序排列,并仅将这些随机索引的前80%作为训练数据,其余用于测试。
[:num_training_indices]
仅从列表中选择前num_training_indices个索引。之后,您只需使用随机索引列表从数据中提取相应的行,数据就被拆分好了。请记住从X_data中删除价格;如果您希望拆分可重复,请在开始时设置随机种子(np.random.seed(some_integer))。
tnkciper5#
这个解决方案只使用Pandas和numpy