R语言 使用基于以前获得的数据的模型将聚类分配给新数据

pu3pd22g  于 2023-03-05  发布在  其他
关注(0)|答案(1)|浏览(196)

在一个实验中,我们通过在线调查获得数据,利用这些数据进行聚类分析(使用library(mclust)库),然后根据我们分析得到的聚类,选择一些人进行进一步的检查。
后来,我们重新开始了在线调查,获得了更多的数据(尽管没有第一轮那么多)。
现在,我想分配新数据(此处:seconddata)使用基于第一数据的模型(此处:* 第一数据 *)。
为了首先进行聚类,我们对 firstdata 进行了缩放,然后进行了聚类分析。现在,为了将聚类分配给新数据,我也对 seconddata 进行了缩放,然后尝试分配聚类。尽管理论上我们从与 firstdata 相同的总体中提取 seconddata,但它们的分布存在细微差异--当然,意料之中。
在我的原始数据中,分配给 seconddata 的聚类似乎没有什么意义,因为我突然有了一些以前最普遍的空聚类。我觉得这可能是因为两个数据集的分布略有不同,但我不确定。
下面是我如何尝试进行分配的一个示例。集群本身没有意义,但对于我的原始数据,过程保持不变。

# library -----------------------------------------------------------------
library(tidyverse)
library(ggplot2)
library(scatterplot3d)
library(hopkins)
library(factoextra)
library(NbClust)
library(mclust)
library(ggpubr)
library(writexl)
library(reshape2)
library(hms)
library("lubridate") 

set.seed(1591593)

# first dataset -----------------------------------------------------------

n1 = 100
#number of observations in first data frame

firstdata <- expand.grid(n1 = 1:n1,
                         x1 = NA,
                         x2 = NA,
                         x3 = NA)

firstdata$x1 <- sample(runif(n = n1, min = 50, max = 100), replace = TRUE)
firstdata$x2 <- sample(runif(n = n1, min = 20, max = 25), replace = TRUE)
firstdata$x3 <- sample(runif(n = n1, min = 30, max = 80), replace = TRUE)
#creating data

firstdata = firstdata %>%
  select(contains("x")) %>%
  mutate_all(scale)
#scale data

par(mfrow = c(1, 1)) # display 1 plot
scatterplot3d(firstdata$x1, firstdata$x2, firstdata$x3)

## model-based clustering ---------------------------------------------

# Fit model
c_model <- Mclust(firstdata)

# Show optimal model
summary(c_model) 

# Plot all models
fviz_mclust(c_model, "BIC", palette = "jco")

# Print cluster sizes
table(c_model$classification)

# Cluster plot
fviz_mclust(c_model, "classification", geom = "point", 
            pointsize = 1.5, palette = "jco")

# Cluster no. (classification) as variable (new dataframe)
firstdata_cluster = firstdata
firstdata_cluster$cluster = as.factor(c_model$classification)

# Compute probability to belong to resp. cluster and not to other cluster
firstdata_cluster$probability = 1-(c_model$uncertainty)

# Cluster label in classifications
clusters = MclustDA(firstdata, class = firstdata_cluster$cluster)
summary(clusters)

# second dataset ----------------------------------------------------------

n2 = 10
#number of observations in second data frame

seconddata <- expand.grid(n2= 1:n2,
                            x1 = NA,
                            x2 = NA,
                            x3 = NA)

seconddata$x1 <- sample(runif(n = n2, min = 50, max = 100), replace = TRUE)
seconddata$x2 <- sample(runif(n = n2, min = 20, max = 25), replace = TRUE)
seconddata$x3 <- sample(runif(n = n2, min = 30, max = 80), replace = TRUE)
# create second data frame

## assigning clusters to the new data ------------------------------------

seconddata = seconddata %>%
  select(contains("x")) %>%
  mutate_all(scale)
# scale second data frame

prediction_seconddata = predict(clusters, seconddata)
round(prediction_seconddata$z, 0)
#assign clusters to the new second data frame/new data using the first model

prediction_seconddata_cluster<-as.data.frame(round(prediction_seconddata$z, 0))
prediction_seconddata_cluster
#assigned clusters of the second data frame

我没有任何聚类数据的经验,所以我有点不确定这种方法是否可以接受。我非常确定我的过程中有什么地方不对...
您对如何将 seconddata 与聚类匹配有什么建议吗?
谢谢!

cvxl0en2

cvxl0en21#

我的问题的答案是,在做聚类之前,我必须根据 * 第一个数据 * 重新调整 * 第二个日期 *。
参见:# rescale seconddata --------

# library -----------------------------------------------------------------
library(tidyverse)
library(ggplot2)
library(scatterplot3d)
library(hopkins)
library(factoextra)
library(NbClust)
library(mclust)
library(ggpubr)
library(writexl)
library(reshape2)
library(hms)
library("lubridate") 

set.seed(1591593)

# first dataset -----------------------------------------------------------

n1 = 100
#number of observations in first data frame

firstdata <- expand.grid(n1 = 1:n1,
                         x1 = NA,
                         x2 = NA,
                         x3 = NA)

firstdata$x1 <- sample(runif(n = n1, min = 50, max = 100), replace = TRUE)
firstdata$x2 <- sample(runif(n = n1, min = 20, max = 25), replace = TRUE)
firstdata$x3 <- sample(runif(n = n1, min = 30, max = 80), replace = TRUE)
#creating data

firstdata = firstdata %>%
  select(contains("x")) %>%
  mutate_all(scale)
#scale data

par(mfrow = c(1, 1)) # display 1 plot
scatterplot3d(firstdata$x1, firstdata$x2, firstdata$x3)

## model-based clustering ---------------------------------------------

# Fit model
c_model <- Mclust(firstdata)

# Show optimal model
summary(c_model) 

# Plot all models
fviz_mclust(c_model, "BIC", palette = "jco")

# Print cluster sizes
table(c_model$classification)

# Cluster plot
fviz_mclust(c_model, "classification", geom = "point", 
            pointsize = 1.5, palette = "jco")

# Cluster no. (classification) as variable (new dataframe)
firstdata_cluster = firstdata
firstdata_cluster$cluster = as.factor(c_model$classification)

# Compute probability to belong to resp. cluster and not to other cluster
firstdata_cluster$probability = 1-(c_model$uncertainty)

# Cluster label in classifications
clusters = MclustDA(firstdata, class = firstdata_cluster$cluster)
summary(clusters)

# second dataset ----------------------------------------------------------

n2 = 10
#number of observations in second data frame

seconddata <- expand.grid(n2= 1:n2,
                          x1 = NA,
                          x2 = NA,
                          x3 = NA)

seconddata$x1 <- sample(runif(n = n2, min = 50, max = 100), replace = TRUE)
seconddata$x2 <- sample(runif(n = n2, min = 20, max = 25), replace = TRUE)
seconddata$x3 <- sample(runif(n = n2, min = 30, max = 80), replace = TRUE)
# create second data frame


# rescale seconddata --------------------------------------------------------

firstdata_x1_mean <- mean(firstdata$x1)
firstdata_x1_sd <- sd(firstdata$x1)
firstdata_x2_mean <- mean(firstdata$x2)
firstdata_x2_sd <- sd(firstdata$x2)
firstdata_x3_mean <- mean(firstdata$x2)
firstdata_x3_sd <- sd(firstdata$x2)
#mean and sd of old data mss

seconddata <- seconddata %>% 
  mutate_at(vars(x1),function(x) (x - firstdata_x1_mean) / firstdata_x1_sd) %>% 
  mutate_at(vars(x2),function(x) (x - firstdata_x2_mean) / firstdata_x2_sd) %>% 
  mutate_at(vars(x3),function(x) (x - firstdata_x3_mean) / firstdata_x3_sd) 
#rescale the new data according to the parameters of the old data

## assigning clusters to the new data ------------------------------------

seconddata = seconddata %>%
  select(contains("x"))

prediction_seconddata = predict(clusters, seconddata)
round(prediction_seconddata$z, 0)
#assign clusters to the new second data frame/new data using the first model

prediction_seconddata_cluster<-as.data.frame(round(prediction_seconddata$z, 0))
prediction_seconddata_cluster
#assigned clusters of the second data frame

现在新的聚类是正确的。
也许这能帮到别人。

相关问题