如何在R中的适当位置修复缺失值?

xa9qqrwz  于 2023-04-09  发布在  其他
关注(0)|答案(1)|浏览(89)

我有一个看起来像这样的数据框...

# Sample data: ten subjects across three sessions. `option` stores, as text,
# the R vector literal of the values that were present for that subject.
dat <- data.frame(
  Subject = seq_len(10),
  Session = rep(c(1, 2, 3), times = c(4, 4, 2)),
  other = 1,
  option = c(
    "c(2,3,4)", "c(1,2,4)", "c(1,2,3,4)", "c(1,2,3)", "c(1,2,3,4)",
    "c(1,2,3,4)", "c(2,3,4)", "c(1,2,3)", "c(1,2,3,4)", "c(1,2,3,4)"
  ),
  stringsAsFactors = FALSE
)

然后我用tidyr把它拆分整理成这样...

library(tidyr)

# Split each "c(...)" string at the commas into four columns, pk1..pk4.
# Strings that list only three values yield three pieces; fill = "right"
# pads the last column with NA explicitly instead of triggering a warning.
# The pieces are placed by position, not by value: "c(2,3,4)" ends up in
# pk1..pk3 with pk4 = NA, which is the misalignment the question is about.
pk_cols <- paste0("pk", 1:4)
dat <- separate(dat, option, sep = ",", into = pk_cols, fill = "right")

# Remove the leading "c(" and the closing ")". The ")" sits in pk3 for
# three-value rows and in pk4 for four-value rows, so every pk column is
# cleaned rather than only pk1 and pk3.
dat[pk_cols] <- lapply(dat[pk_cols], function(x) gsub("[c()]", "", x))

但我想让它看起来像这样……

# Hand-written illustration of the wanted result: each option string names
# all four positions and marks the absent ones with NA.
dat <- data.frame(
  Subject = seq_len(10),
  Session = rep(c(1, 2, 3), times = c(4, 4, 2)),
  other = 1,
  option = c(
    "c(NA,2,3,4)", "c(1,2,NA,4)", "c(1,2,3,4)", "c(1,2,3,NA)", "c(1,2,3,4)",
    "c(1,2,3,4)", "c(NA,2,3,4)", "c(1,2,3,NA)", "c(1,2,3,4)", "c(1,2,3,4)"
  ),
  stringsAsFactors = FALSE
)

# Every string has four comma-separated pieces here, one per pk column.
dat <- separate(dat, option, sep = ",", into = paste0("option.", 1:4))
names(dat) <- c("Subject", "Session", "other", "pk1", "pk2", "pk3", "pk4")

# Only the first piece carries the "c(" prefix and only the last the ")".
dat$pk1 <- gsub("[()]", "", gsub("c", "", dat$pk1))
dat$pk4 <- gsub("[()]", "", dat$pk4)
dat
#    Subject Session other pk1 pk2 pk3 pk4
# 1        1       1     1  NA   2   3   4
# 2        2       1     1   1   2  NA   4
# 3        3       1     1   1   2   3   4
# 4        4       1     1   1   2   3  NA
# 5        5       2     1   1   2   3   4
# 6        6       2     1   1   2   3   4
# 7        7       2     1  NA   2   3   4
# 8        8       2     1   1   2   3  NA
# 9        9       3     1   1   2   3   4
# 10      10       3     1   1   2   3   4

我该如何将缺失值添加到序列中呢?

ars1skjm

ars1skjm1#

首先删除除数字和逗号以外的所有字符,然后用separate_rows()把每个值拆分到单独的行,再用pivot_wider()转换为宽格式。两次arrange()调用分别使列和行的顺序与示例输出一致。

library(tidyr)
library(stringr)
library(dplyr)

# Rebuild one pk column per option value, so that a value absent from a
# row's "c(...)" string shows up as NA in that value's own column.
dat %>% 
  # Keep only digits and commas: "c(2,3,4)" becomes "2,3,4".
  mutate(option = str_remove_all(option, "[^\\d,]")) %>% 
  # One row per listed value.
  separate_rows(option, sep = ",") %>% 
  # pivot_wider() creates columns in order of first appearance, so sort by
  # the numeric value; sorting the strings would put "10" before "2".
  arrange(as.integer(option)) %>% 
  # Each value fills the column named after it; missing combinations are NA.
  pivot_wider(
    names_from = option,
    values_from = option,
    names_prefix = "pk"
  ) %>% 
  # Restore the original row order after the sort above.
  arrange(Subject)
# A tibble: 10 × 7
   Subject Session other pk1   pk2   pk3   pk4  
     <int>   <dbl> <dbl> <chr> <chr> <chr> <chr>
 1       1       1     1 NA    2     3     4    
 2       2       1     1 1     2     NA    4    
 3       3       1     1 1     2     3     4    
 4       4       1     1 1     2     3     NA   
 5       5       2     1 1     2     3     4    
 6       6       2     1 1     2     3     4    
 7       7       2     1 NA    2     3     4    
 8       8       2     1 1     2     3     NA   
 9       9       3     1 1     2     3     4    
10      10       3     1 1     2     3     4

话虽如此,我很好奇dat$X4中的值最初是从哪里来的:如果你原本有一个数值向量的列表,只是在上游某处被转换成了字符串,那么可能会有更好的方法。

相关问题