R语言 如果一个键列的值包含在另一个键列中,则合并两个表

ffx8fchx  于 2023-03-10  发布在  其他
关注(0)|答案(4)|浏览(136)

我有一个包含一组事务的 Dataframe df1

set.seed(99)

df1 <- tibble::tibble(
  ID = 1:10,
  Items = replicate(10, paste0('item-', sample(1:10, sample(3:5)[1]), collapse = ', '))
)

# # A tibble: 10 × 2
#       ID Items                                 
#    <int> <chr>                                 
#  1     1 item-6, item-5, item-3                
#  2     2 item-6, item-4, item-9, item-7        
#  3     3 item-6, item-4, item-5                
#  4     4 item-1, item-7, item-2, item-9        
#  5     5 item-5, item-8, item-7, item-2        
#  6     6 item-10, item-1, item-6, item-4       
#  7     7 item-6, item-7, item-9, item-4, item-5
#  8     8 item-6, item-9, item-1, item-3, item-5
#  9     9 item-6, item-8, item-7, item-3, item-9
# 10    10 item-4, item-7, item-5, item-9

我还有另一个查找表df2,它指示获得优惠券的条件:

df2 <- tibble::tibble(
  Items = c("item-4, item-6", "item-7, item-9"),
  Coupons = c("coupon-1", "coupon-2")
)

# # A tibble: 2 × 2
#   Items          Coupons 
#   <chr>          <chr>   
# 1 item-4, item-6 coupon-1
# 2 item-7, item-9 coupon-2

这意味着,如果有人在一次交易中购买了"item-4""item-6",他/她将获得"coupon-1"
我想合并df1df2,这样我就可以知道一个事务是否满足了获取优惠券的任何条件。

# # A tibble: 12 × 3
#       ID Items                                  Coupons 
#    <int> <chr>                                  <chr>   
#  1     1 item-6, item-5, item-3                 NA      
#  2     2 item-6, item-4, item-9, item-7         coupon-1
#  3     2 item-6, item-4, item-9, item-7         coupon-2
#  4     3 item-6, item-4, item-5                 coupon-1
#  5     4 item-1, item-7, item-2, item-9         coupon-2
#  6     5 item-5, item-8, item-7, item-2         NA      
#  7     6 item-10, item-1, item-6, item-4        coupon-1
#  8     7 item-6, item-7, item-9, item-4, item-5 coupon-1
#  9     7 item-6, item-7, item-9, item-4, item-5 coupon-2
# 10     8 item-6, item-9, item-1, item-3, item-5 NA      
# 11     9 item-6, item-8, item-7, item-3, item-9 coupon-2
# 12    10 item-4, item-7, item-5, item-9         coupon-2

事务ID27重复,因为它们符合其项集中的所有条件。我已经尝试了merge()dplyr::left_join(),但没有成功。希望有人能为这个问题提供一些线索。提前感谢!

oipij1gg

oipij1gg1#

使用greplmerge的 * 基本 * 选项(变体1)。

i <- lapply(strsplit(df2$Items, ", "), function(s) {
    Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
s <- do.call(rbind, Map(function(i, j) cbind(ID = df1$ID[i], Coupons = j),
                        i, df2$Coupons))
merge(df1, s, all.x = TRUE)
#   ID                                  Items  Coupons
#1   1                 item-6, item-5, item-3     <NA>
#2   2         item-6, item-4, item-9, item-7 coupon-1
#3   2         item-6, item-4, item-9, item-7 coupon-2
#4   3                 item-6, item-4, item-5 coupon-1
#5   4         item-1, item-7, item-2, item-9 coupon-2
#6   5         item-5, item-8, item-7, item-2     <NA>
#7   6        item-10, item-1, item-6, item-4 coupon-1
#8   7 item-6, item-7, item-9, item-4, item-5 coupon-1
#9   7 item-6, item-7, item-9, item-4, item-5 coupon-2
#10  8 item-6, item-9, item-1, item-3, item-5     <NA>
#11  9 item-6, item-8, item-7, item-3, item-9 coupon-2
#12 10         item-4, item-7, item-5, item-9 coupon-2

代替使用merge子集化匹配(变体2)。

i <- lapply(strsplit(df2$Items, ", "), function(s) {
    Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })

m <- rbind(cbind(df1[!Reduce(`|`, i),], Coupons = NA), do.call(rbind,
        Map(function(i, j) cbind(df1[i,], Coupons = j), i, df2$Coupons)) )

m[order(m$ID),]
#   ID                                  Items  Coupons
#1   1                 item-6, item-5, item-3     <NA>
#4   2         item-6, item-4, item-9, item-7 coupon-1
#8   2         item-6, item-4, item-9, item-7 coupon-2
#5   3                 item-6, item-4, item-5 coupon-1
#9   4         item-1, item-7, item-2, item-9 coupon-2
#2   5         item-5, item-8, item-7, item-2     <NA>
#6   6        item-10, item-1, item-6, item-4 coupon-1
#7   7 item-6, item-7, item-9, item-4, item-5 coupon-1
#10  7 item-6, item-7, item-9, item-4, item-5 coupon-2
#3   8 item-6, item-9, item-1, item-3, item-5     <NA>
#11  9 item-6, item-8, item-7, item-3, item-9 coupon-2
#12 10         item-4, item-7, item-5, item-9 coupon-2

另一变型(3)

i <- lapply(strsplit(df2$Items, ", "), function(s) {
    Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
i <- c(i, list(!Reduce(`|`, i)))
cbind(df1[unlist(lapply(i, which)),], Coupons = rep(c(df2$Coupons, NA),
                                                    sapply(i, sum)))

另一个变体,仅在第一个字符串命中的情况下测试字符串(变体4)。

i <- lapply(strsplit(df2$Items, ", ", TRUE), function(s) {
  Reduce(function(a, b) a[grep(b, df1$Items[a], fixed=TRUE)],
         s[-1], grep(s[[1]], df1$Items, fixed=TRUE)) })
j <- unique(unlist(i))
i <- if(length(j>0)) c(list(seq_len(nrow(df1))[-j]), i) else c(list(seq_len(nrow(df1))), i)
cbind(df1[unlist(i),], Coupons = rep(c(NA, df2$Coupons), lengths(i)))

基准

bench::mark(check=FALSE,
varaint1 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
    Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
s <- do.call(rbind, Map(function(i, j) cbind(ID = df1$ID[i], Coupons = j),
                        i, df2$Coupons))
    merge(df1, s, all.x = TRUE)},
variant2 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
    Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
rbind(cbind(df1[!Reduce(`|`, i),], Coupons = NA), do.call(rbind,
                                                          Map(function(i, j) cbind(df1[i,], Coupons = j), i, df2$Coupons)) )},
variant3 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
    Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
i <- c(i, list(!Reduce(`|`, i)))
    cbind(df1[unlist(lapply(i, which)),], Coupons = rep(c(df2$Coupons, NA), sapply(i, sum))) },
variant4 = {i <- lapply(strsplit(df2$Items, ", ", TRUE), function(s) {
  Reduce(function(a, b) a[grep(b, df1$Items[a], fixed=TRUE)], s[-1], grep(s[[1]], df1$Items, fixed=TRUE)) })
j <- unique(unlist(i))
i <- if(length(j>0)) c(list(seq_len(nrow(df1))[-j]), i) else c(list(seq_len(nrow(df1))), i)
cbind(df1[unlist(i),], Coupons = rep(c(NA, df2$Coupons), lengths(i))) }
)

结果

expression      min  median itr/s…¹ mem_a…² gc/se…³ n_itr  n_gc total…⁴ result
  <bch:expr> <bch:tm> <bch:t>   <dbl> <bch:b>   <dbl> <int> <dbl> <bch:t> <list>
1 varaint1      435µs   465µs   2131. 17.55KB    21.0  1016    10   477ms <NULL>
2 variant2      703µs   758µs   1322. 16.09KB    21.2   625    10   473ms <NULL>
3 variant3      223µs   241µs   4015.  9.87KB    23.3  1895    11   472ms <NULL>
4 variant4      208µs   224µs   4323. 24.57KB    20.9  2066    10   478ms <NULL>

在本例中,变体4速度较快,变体3使用的内存量最低。
与其他方法比较。

set.seed(99)

df1 <- tibble::tibble(
  ID = 1:10,
  Items = replicate(10, paste0('item-', sample(1:10, sample(3:5)[1]), collapse = ', '))
)

df2 <- tibble::tibble(
  Items = c("item-4, item-6", "item-7, item-9"),
  Coupons = c("coupon-1", "coupon-2")
)

library(dplyr)
library(fuzzyjoin)
library(stringr)
library(data.table)

bench::mark(check=FALSE,
Darren1 = {fuzzy_left_join(df1, rename(df2, key = Items), by = c("Items" = "key"),
                match_fun = Vectorize(\(x, y) all(strsplit(y, ', ')[[1]] %in% strsplit(x, ', ')[[1]]))) %>%
    select(-key)},
Darren2 = {df2_pattern <- df2 %>%
  mutate(key = sapply(str_split(Items, ', '), \(x) str_c("(?=.*", x, ")", collapse = "")), .keep = "unused")
fuzzy_left_join(df1, df2_pattern, by = c("Items" = "key"),
                match_fun = str_detect) %>%
    select(-key) },
arg0naut91A = {df1 %>%
  left_join(
    full_join(df1, df2 %>% rename(CouponItems = Items), by = character()) %>%
      rowwise %>%
      filter(all(unlist(strsplit(CouponItems, ', ')) %in% unlist(strsplit(Items, ', ')))) %>%
      select(-CouponItems), multiple = "all"
  )},
arg0naut91B = {df1 %>%
  left_join(
    cross_join(df1, df2 %>% rename(CouponItems = Items)) %>%
      rowwise %>%
      filter(all(unlist(strsplit(CouponItems, ', ')) %in% unlist(strsplit(Items, ', ')))) %>%
      select(-CouponItems), multiple = "all"
  )},
Thomas = {unique(
  na.omit(
    setDT(df2)[, .(Items = unlist(strsplit(Items, ", "))), Coupons][
      setDT(df1)[, .(Items = unlist(strsplit(Items, ", "))), ID],
      on = "Items"
    ]
  )[
    ,
    .SD[uniqueN(Items) > 1], .(ID, Coupons)
  ][, Items := NULL]
)[df1,
  on = "ID",
  allow.cartesian = TRUE
][
  ,
  .(ID, Items, Coupons)
]},
GKi3 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
    Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
i <- c(i, list(!Reduce(`|`, i)))
cbind(df1[unlist(lapply(i, which)),], Coupons = rep(c(df2$Coupons, NA), sapply(i, sum))) },
GKi4 = {i <- lapply(strsplit(df2$Items, ", ", TRUE), function(s) {
  Reduce(function(a, b) a[grep(b, df1$Items[a], fixed=TRUE)], s[-1], grep(s[[1]], df1$Items, fixed=TRUE)) })
j <- unique(unlist(i))
i <- if(length(j>0)) c(list(seq_len(nrow(df1))[-j]), i) else c(list(seq_len(nrow(df1))), i)
cbind(df1[unlist(i),], Coupons = rep(c(NA, df2$Coupons), lengths(i))) }
)

结果

expression       min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total…¹
  <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl> <bch:t>
1 Darren1       34.6ms  34.61ms      28.5   223.7KB     85.4     3     9   105ms
2 Darren2       32.1ms  33.66ms      29.7   173.8KB     53.5     5     9   168ms
3 arg0naut91A   18.6ms  21.15ms      47.6    95.3KB     36.6    13    10   273ms
4 arg0naut91B   14.8ms  16.73ms      60.9    76.3KB     37.2    18    11   296ms
5 Thomas         3.6ms   4.13ms     222.    688.7KB     18.3    97     8   437ms
6 GKi3         289.1µs 322.51µs    2857.     42.4KB     24.0  1308    11   458ms
7 GKi4         284.4µs 312.12µs    3035.     46.5KB     23.6  1412    11   465ms

在这种情况下,GKi3和GKi4速度最快,使用的内存量最少。

wkyowqbh

wkyowqbh2#

您可以使用fuzzyjoin

library(dplyr)
library(fuzzyjoin)

fuzzy_left_join(df1, rename(df2, key = Items), by = c("Items" = "key"),
                match_fun = Vectorize(\(x, y) all(strsplit(y, ', ')[[1]] %in% strsplit(x, ', ')[[1]]))) %>%
  select(-key)

# # A tibble: 12 × 3
#       ID Items                                  Coupons 
#    <int> <chr>                                  <chr>   
#  1     1 item-6, item-5, item-3                 NA      
#  2     2 item-6, item-4, item-9, item-7         coupon-1
#  3     2 item-6, item-4, item-9, item-7         coupon-2
#  4     3 item-6, item-4, item-5                 coupon-1
#  5     4 item-1, item-7, item-2, item-9         coupon-2
#  6     5 item-5, item-8, item-7, item-2         NA      
#  7     6 item-10, item-1, item-6, item-4        coupon-1
#  8     7 item-6, item-7, item-9, item-4, item-5 coupon-1
#  9     7 item-6, item-7, item-9, item-4, item-5 coupon-2
# 10     8 item-6, item-9, item-1, item-3, item-5 NA      
# 11     9 item-6, item-8, item-7, item-3, item-9 coupon-2
# 12    10 item-4, item-7, item-5, item-9         coupon-2

您还可以使用regex前瞻检测:
x一个一个一个一个x一个一个二个x

pobjuy32

pobjuy323#

dplyr的选项:

library(dplyr)

df1 %>%
  left_join(
    full_join(df1, df2 %>% rename(CouponItems = Items), by = character()) %>%
      rowwise %>%
      filter(all(unlist(strsplit(CouponItems, ', ')) %in% unlist(strsplit(Items, ', ')))) %>%
      select(-CouponItems)
  )

输出:

# A tibble: 12 x 3
      ID Items                                  Coupons 
   <int> <chr>                                  <chr>   
 1     1 item-6, item-5, item-3                 NA      
 2     2 item-6, item-4, item-9, item-7         coupon-1
 3     2 item-6, item-4, item-9, item-7         coupon-2
 4     3 item-6, item-4, item-5                 coupon-1
 5     4 item-1, item-7, item-2, item-9         coupon-2
 6     5 item-5, item-8, item-7, item-2         NA      
 7     6 item-10, item-1, item-6, item-4        coupon-1
 8     7 item-6, item-7, item-9, item-4, item-5 coupon-1
 9     7 item-6, item-7, item-9, item-4, item-5 coupon-2
10     8 item-6, item-9, item-1, item-3, item-5 NA      
11     9 item-6, item-8, item-7, item-3, item-9 coupon-2
12    10 item-4, item-7, item-5, item-9         coupon-2

或最新版本的cross_join1.1.0):

df1 %>%
  left_join(
    cross_join(df1, df2 %>% rename(CouponItems = Items)) %>%
      rowwise %>%
      filter(all(unlist(strsplit(CouponItems, ', ')) %in% unlist(strsplit(Items, ', ')))) %>%
      select(-CouponItems)
  )
gwo2fgha

gwo2fgha4#

一个data.table选项

unique(
  na.omit(
    setDT(df2)[, .(Items = unlist(strsplit(Items, ", "))), Coupons][
      setDT(df1)[, .(Items = unlist(strsplit(Items, ", "))), ID],
      on = "Items"
    ]
  )[
    ,
    .SD[uniqueN(Items) > 1], .(ID, Coupons)
  ][, Items := NULL]
)[df1,
  on = "ID",
  allow.cartesian = TRUE
][
  ,
  .(ID, Items, Coupons)
]

给予

ID                                  Items  Coupons
 1:  1                 item-6, item-5, item-3     <NA>
 2:  2         item-6, item-4, item-9, item-7 coupon-1
 3:  2         item-6, item-4, item-9, item-7 coupon-2
 4:  3                 item-6, item-4, item-5 coupon-1
 5:  4         item-1, item-7, item-2, item-9 coupon-2
 6:  5         item-5, item-8, item-7, item-2     <NA>
 7:  6        item-10, item-1, item-6, item-4 coupon-1
 8:  7 item-6, item-7, item-9, item-4, item-5 coupon-1
 9:  7 item-6, item-7, item-9, item-4, item-5 coupon-2
10:  8 item-6, item-9, item-1, item-3, item-5     <NA>
11:  9 item-6, item-8, item-7, item-3, item-9 coupon-2
12: 10         item-4, item-7, item-5, item-9 coupon-2

相关问题