在保留信息的同时将 Dataframe 折叠为4列

falq053o  于 2023-02-06  发布在  其他
关注(0)|答案(3)|浏览(122)

我有一个数据框(data frame),看起来像这样(但实际数据超过 10k 行):

# Minimal example input: one row per id, one integer column per feature.
df <- data.frame(
  id = c(1L, 2L),
  feature1 = c(1L, 2L),
  feature2 = c(4L, 5L),
  feature3 = c(7L, 8L)
)

并想把它重塑成这样的样子:

# Sample of the desired long layout: feature1 paired with every feature.
# value1 / value2 hold the values of the features named in x / y.
df_goal <- data.frame(
  value1 = rep(c(1, 2), times = 3),
  value2 = c(1, 2, 4, 5, 7, 8),
  x = rep("feature1", times = 6),
  y = rep(paste0("feature", 1:3), each = 2)
)

`df_goal` 只是我希望得到的最终数据框的一个子集;最终我希望得到 feature1、feature2、feature3 的每一种组合以及相应的值。
我想使用生成的df进行ggplot。
我认为应该有一个直接的tidyr解决方案来解决我的问题,但我似乎找不到它。
任何帮助都是非常感谢!

fhg3lkii

fhg3lkii1#

这里有一个起点:先用 `pivot_longer` 把数据集转换为长格式,然后用 `combn` 计算两两组合(`asplit` 和 `unnest_wider` 只是为了最终把结果整理成数据框)。

library(tidyr)
library(dplyr) #requires dplyr 1.1.0

# All unordered pairs of distinct features within each id.
# Both `value` and `feature` are expanded with the same combn() ordering, so
# the i-th value pair lines up with the i-th feature-name pair.
data1 <- df %>%
  pivot_longer(cols = -id, names_to = "feature") %>%
  reframe(
    across(
      c(value, feature),
      # combn() returns one pair per column; transpose and split by row so
      # each list element holds a single pair.
      function(col) asplit(t(combn(col, 2)), 1)
    ),
    .by = id
  ) %>%
  # Spread every pair into two columns: value1/value2 and feature1/feature2.
  unnest_wider(c(value, feature), names_sep = "")

# A tibble: 6 × 5
#     id    value1    value2 feature1  feature2 
#1     1         1         4 feature1  feature2 
#2     1         1         7 feature1  feature3 
#3     1         4         7 feature2  feature3 
#4     2         2         5 feature1  feature2 
#5     2         2         8 feature1  feature3 
#6     2         5         8 feature2  feature3

要获取剩余的行,可以执行以下操作:

# Pairs of each feature with itself: duplicate the name and value columns.
data2 <- df %>%
  pivot_longer(cols = -id, names_to = "feature") %>%
  mutate(
    feature1 = feature,
    feature2 = feature,
    value1 = value,
    value2 = value,
    # Drop the source columns (`feature`, `value`) once they have been copied.
    .keep = "unused"
  )

# A tibble: 6 × 5
#     id feature1 feature2 value1 value2
#1     1 feature1 feature1      1      1
#2     1 feature2 feature2      4      4
#3     1 feature3 feature3      7      7
#4     2 feature1 feature1      2      2
#5     2 feature2 feature2      5      5
#6     2 feature3 feature3      8      8

那么,总而言之:

# Stack the distinct-feature pairs and the self pairs, then sort for reading.
data1 %>%
  bind_rows(data2) %>%
  arrange(id, feature1, feature2)

# A tibble: 12 × 5
      id    value1    value2 feature1  feature2 
 1     1         1         1 feature1  feature1 
 2     1         1         4 feature1  feature2 
 3     1         1         7 feature1  feature3 
 4     1         4         4 feature2  feature2 
 5     1         4         7 feature2  feature3 
 6     1         7         7 feature3  feature3 
 7     2         2         2 feature1  feature1 
 8     2         2         5 feature1  feature2 
 9     2         2         8 feature1  feature3 
10     2         5         5 feature2  feature2 
11     2         5         8 feature2  feature3 
12     2         8         8 feature3  feature3
rjzwgtxy

rjzwgtxy2#

也许可以先把数据框转换为长格式(pivot)、重命名列,再让它与自身做 `full_join`?

# Toy input: one row per id, one integer column per feature.
df <- data.frame(
  id = 1:2,
  feature1 = 1:2,
  feature2 = 4:5,
  feature3 = 7:8
)

library(tidyverse)

# Long format: one row per (id, feature), in columns `name` and `value`.
df_longer <- pivot_longer(df, -id)

# Self-join on `id` so that every feature of an id is paired with every
# feature of the same id (ordered pairs, including a feature with itself).
df_longer |>
  rename(name1 = name, value1 = value) |>
  full_join(
    df_longer |> rename(name2 = name, value2 = value),
    by = "id",
    # Each id matches several rows on both sides by design. Declaring that
    # avoids the many-to-many warning raised by dplyr >= 1.1.0.
    relationship = "many-to-many"
  ) |>
  select(id, value1, value2, name1, name2) |>
  arrange(name1, name2)

#> # A tibble: 18 × 5
#>       id value1 value2 name1    name2   
#>    <int>  <int>  <int> <chr>    <chr>   
#>  1     1      1      1 feature1 feature1
#>  2     2      2      2 feature1 feature1
#>  3     1      1      4 feature1 feature2
#>  4     2      2      5 feature1 feature2
#>  5     1      1      7 feature1 feature3
#>  6     2      2      8 feature1 feature3
#>  7     1      4      1 feature2 feature1
#>  8     2      5      2 feature2 feature1
#>  9     1      4      4 feature2 feature2
#> 10     2      5      5 feature2 feature2
#> 11     1      4      7 feature2 feature3
#> 12     2      5      8 feature2 feature3
#> 13     1      7      1 feature3 feature1
#> 14     2      8      2 feature3 feature1
#> 15     1      7      4 feature3 feature2
#> 16     2      8      5 feature3 feature2
#> 17     1      7      7 feature3 feature3
#> 18     2      8      8 feature3 feature3

结果中包含了您在上面给出的目标数据集的那六行——这是您想要的吗?

uqxowvwt

uqxowvwt3#

或者,请检查以下代码

# Load the packages that provide %>%, mutate(), arrange(), select() and
# pivot_longer(), so the snippet runs on its own.
library(dplyr)
library(tidyr)

# Pair feature1 with every feature: x / value1 describe feature1, while
# y / value2 describe the feature it is paired with.
data.frame(
  id = 1:2,
  feature1 = 1:2,
  feature2 = 4:5,
  feature3 = 7:8
) %>%
  # Take value1 from feature1 itself, not from `id`: the two columns only
  # coincide in this toy data.
  mutate(value1 = feature1, x = "feature1") %>%
  pivot_longer(starts_with("feature"), names_to = "y", values_to = "value2") %>%
  arrange(y) %>%
  # Keep the four requested columns; `id` is no longer needed.
  select(value1, y, value2, x)

创建于2023年2月3日,使用reprex v2.0.2

# A tibble: 6 × 4
  value1 y        value2 x       
   <int> <chr>     <int> <chr>   
1      1 feature1      1 feature1
2      2 feature1      2 feature1
3      1 feature2      4 feature1
4      2 feature2      5 feature1
5      1 feature3      7 feature1
6      2 feature3      8 feature1

相关问题