import numpy as np
# Synthetic demo data: 120 samples spaced evenly along a straight segment in
# 3-D, built from one evenly spaced coordinate array per axis.
x = np.linspace(-2, 5, 120)
y = np.linspace(1, 9, 120)
z = np.linspace(-5, 3, 120)
# One row per point, columns are (x, y, z).
data = np.column_stack((x, y, z))
# Perturb every coordinate with Gaussian noise (standard deviation 0.4).
data += np.random.normal(size=data.shape) * 0.4
# The mean of the points, i.e. the 'center' of the cloud; the fitted line
# passes through it.
datamean = data.mean(axis=0)
# SVD of the mean-centered data. full_matrices=False requests the reduced
# decomposition: uu is (120, 3) instead of (120, 120), which avoids building
# a matrix that grows quadratically with the number of samples. vv is the
# same 3x3 matrix either way.
uu, dd, vv = np.linalg.svd(data - datamean, full_matrices=False)
# The rows of vv are the right singular vectors ordered by decreasing
# singular value, so vv[0] is the first principal component: the unit
# direction vector of the best-fit line in the orthogonal least-squares sense.
# A straight line only needs two points for plotting. The parameters -7 and 7
# are used because the spread of the data is roughly 14 and the centered data
# has mean 0.
linepts = vv[0] * np.array([[-7.0], [7.0]])
# Shift by the mean to put the line back where the data actually is.
linepts += datamean
# Verify visually that the fitted line runs through the point cloud.
import matplotlib.pyplot as plt
# Kept for its import side effect: on older Matplotlib releases importing
# mplot3d is what registers the '3d' projection.
import mpl_toolkits.mplot3d as m3d
# Create the 3-D axes through the figure. Constructing Axes3D(figure) directly
# no longer attaches the axes to the figure on current Matplotlib, which
# leaves the window empty.
ax = plt.figure().add_subplot(111, projection='3d')
# Noisy samples as a scatter, fitted line as a two-point segment.
ax.scatter3D(*data.T)
ax.plot3D(*linepts.T)
plt.show()
import numpy as np
# Ten sample points: the running sum (down the rows) of uniform random steps,
# so every coordinate increases from one point to the next.
pts = np.cumsum(np.random.random((10, 3)), axis=0)
x, y, z = pts.T
# Least-squares fit of z = m_xz * x + c_xz: the slope and z-intercept of the
# plane parallel to the y-axis that best fits the data. The column of ones in
# the design matrix estimates the intercept. rcond=None selects the
# machine-precision based cutoff explicitly, which avoids the FutureWarning
# and the version-dependent default of older NumPy releases.
A_xz = np.column_stack((x, np.ones_like(x)))
m_xz, c_xz = np.linalg.lstsq(A_xz, z, rcond=None)[0]
# Same fit for z = m_yz * y + c_yz: the plane parallel to the x-axis.
A_yz = np.column_stack((y, np.ones_like(y)))
m_yz, c_yz = np.linalg.lstsq(A_yz, z, rcond=None)[0]
# the intersection of those two planes and
# the function for the line would be:
# z = m_yz * y + c_yz
# z = m_xz * x + c_xz
# or:
def lin(z):
    """Return the (x, y) coordinates of the fitted line at height ``z``.

    Inverts the two regressions ``z = m_xz * x + c_xz`` and
    ``z = m_yz * y + c_yz``; the line is the intersection of those two
    planes. The coefficients ``m_xz``, ``c_xz``, ``m_yz`` and ``c_yz`` are
    read from module level, so the fits must have been computed first.

    ``z`` may be a scalar or a NumPy array; the arithmetic is elementwise.
    A zero slope is not guarded against and results in a division by zero.
    """
    x = (z - c_xz) / m_xz
    y = (z - c_yz) / m_yz
    return x, y
# Verify visually: plot the sample points together with the fitted line.
# The Axes3D import is kept for its side effect: on older Matplotlib releases
# importing mplot3d is what registers the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
fig = plt.figure()
# Create the 3-D axes through the figure. Axes3D(fig) no longer attaches the
# axes to the figure on current Matplotlib, which leaves the plot (and the
# saved image) empty.
ax = fig.add_subplot(111, projection='3d')
# Evaluate the fitted line at evenly spaced heights between 0 and 5.
zz = np.linspace(0, 5)
xx, yy = lin(zz)
ax.scatter(x, y, z)
ax.plot(xx, yy, zz)
# Save before show(): the figure may be discarded once the window is closed.
plt.savefig('test.png')
plt.show()
2条答案
按热度按时间gmxoilav1#
如果你试图从另外两个值中预测一个值,那么你应该使用 `lstsq`,把 `a` 参数作为自变量(加上一列 1 来估计截距),把 `b` 参数作为因变量。
另一方面,如果你只是想得到数据的最佳拟合线,即:把数据投影到该直线上时,使每个真实点与其投影之间的平方距离之和最小的那条直线,那么你需要的是第一主成分。
定义它的一种方法是:该直线经过数据的均值,其方向向量是协方差矩阵中对应于最大特征值的特征向量。话虽如此,`eig(cov(data))` 是一种非常糟糕的计算方法,因为它做了很多不必要的计算和复制,并且可能不如使用 `svd` 准确。见下文,结果看起来是这样的:
xggvc2p62#
如果你的数据表现得相当好,那么只需对各坐标分量上的距离求最小二乘解就足够了。也就是分别做两次线性回归:一次是 z 关于 x 的回归(不考虑 y),另一次是 z 关于 y 的回归(不考虑 x)。
参照 NumPy 文档(documentation)中的示例:
如果你想最小化三维空间中各点到直线的实际正交距离(即垂直于直线方向的距离;我不确定这是否还能称为线性回归),那么我会构建一个计算 RSS(残差平方和)的函数,并使用 scipy.optimize 中的最小化函数来求解。