R语言 通过向量缩短逗号分隔字符串中的元素数

eqfvzcg8  于 2023-05-11  发布在  其他
关注(0)|答案(7)|浏览(82)

我有一些数据:其中一列 Area_bsl 是由逗号分隔的值组成的字符串,另一列 diffr 给出了 Area_bsl 需要*缩短*(即从末尾去掉)的元素个数:

# Example data: one comma-separated string per `id`, plus the number of
# trailing elements (`diffr`) that should be dropped from that string.
df <- data.frame(
  id = seq_len(3),
  Area_bsl = c(
    "155,199,198,195,100,112,177,199,188,144",
    "100,99,98,95,100,112,111,99",
    "131,166,155,111,100,117,166,188,101,101,105,166"
  ),
  diffr = c(3, 0, 6)
)

所以我要做的就是切断...

  • id == 1 时,去掉 Area_bsl 的最后 3 个元素
  • id == 2 时,去掉 Area_bsl 的 0 个元素
  • id == 3 时,去掉 Area_bsl 的最后 6 个元素

我一直尝试用下面的方式来完成这个任务,但最后使用 slice_head 的那一步会抛出错误:

library(tidyverse)
# Attempted solution from the question; the final step fails with the error
# printed right after this snippet.
df %>%
  # Split the comma-separated string into one row per value
  # (no `sep` is passed to `separate_rows()` here):
  separate_rows(Area_bsl) %>%
  # Work per `id` from here on:
  group_by(id) %>%
  # Number the rows within each `id`:
  mutate(rowid = row_number()) %>%
  # Number of rows to keep per `id`: last row number minus `diffr`:
  mutate(cutoff = last(rowid) - diffr) %>%
  # Intended: keep the first `cutoff` rows of each group. This does not work:
  # `slice_head()` reports that `n` must be a constant and that the object
  # `cutoff` cannot be found.
  slice_head(n = cutoff[1])
Error in `slice_head()`:
! `n` must be a constant.
Caused by error in `force()`:
! object 'cutoff' not found

预期结果如下:

id Area_bsl diffr rowid cutoff
   <int> <chr>    <dbl> <int>  <dbl>
 1     1 155          3     1      7
 2     1 199          3     2      7
 3     1 198          3     3      7
 4     1 195          3     4      7
 5     1 100          3     5      7
 6     1 112          3     6      7
 7     1 177          3     7      7
11     2 100          0     1      8
12     2 99           0     2      8
13     2 98           0     3      8
14     2 95           0     4      8
15     2 100          0     5      8
16     2 112          0     6      8
17     2 111          0     7      8
18     2 99           0     8      8
19     3 131          6     1      6
20     3 166          6     2      6
21     3 155          6     3      6
22     3 111          6     4      6
23     3 100          6     5      6
24     3 117          6     6      6
0lvr5msh

0lvr5msh1#

您可以在mapply中使用sub来缩短给定数量的元素,然后使用tidyr::separate_rows

# Drop the last `n` comma-separated elements of each string with one anchored
# regex: "(,[^,]+){n}$" matches exactly `n` trailing ",value" pieces.
# - With n = 0 the pattern matches the empty string at the end, so the input
#   is returned unchanged.
# - If `n` is not smaller than the number of elements, the pattern cannot
#   match (the first element has no leading comma) and the string is returned
#   unchanged as well.
# "[^,]+" is used rather than "\\d+" so that fields such as "1.5" or "-3" are
# trimmed too. USE.NAMES = FALSE stops mapply() from naming the result after
# the input strings, so the column stays a plain, unnamed character vector.
df$Area_bsl <- mapply(
  \(s, n) sub(paste0("(,[^,]+){", n, "}$"), "", s),
  df$Area_bsl, df$diffr,
  USE.NAMES = FALSE
)
df
#  id                    Area_bsl diffr
#1  1 155,199,198,195,100,112,177     3
#2  2 100,99,98,95,100,112,111,99     0
#3  3     131,166,155,111,100,117     6

# One row per remaining element.
tidyr::separate_rows(df, Area_bsl)
## A tibble: 21 × 3
#      id Area_bsl diffr
#   <int> <chr>    <dbl>
# 1     1 155          3
# 2     1 199          3
# 3     1 198          3
# 4     1 195          3
# 5     1 100          3
# 6     1 112          3
# 7     1 177          3
# 8     2 100          0
# 9     2 99           0
#10     2 98           0
## ℹ 11 more rows
## ℹ Use `print(n = ...)` to see more rows
1l5u6lss

1l5u6lss2#

还有另一个选择:

library(dplyr)
library(tidyr)

# Explode the string into one row per element, then keep, within each `id`,
# only the leading rows: the group size minus that group's `diffr`.
df |>
  separate_longer_delim(Area_bsl, delim = ",") |>
  slice(
    head(seq_len(n()), n() - diffr[1]),
    .by = id
  )

   id Area_bsl diffr
1   1      155     3
2   1      199     3
3   1      198     3
4   1      195     3
5   1      100     3
6   1      112     3
7   1      177     3
8   2      100     0
9   2       99     0
10  2       98     0
11  2       95     0
12  2      100     0
13  2      112     0
14  2      111     0
15  2       99     0
16  3      131     6
17  3      166     6
18  3      155     6
19  3      111     6
20  3      100     6
21  3      117     6
vd2z7a6w

vd2z7a6w3#

您可以使用 strsplit 把 Area_bsl 当作列表列来处理,然后再 unnest。之后借助 row_number(),就可以很容易地得到 rowid 和 cutoff。

df %>%
  # Turn each comma-separated string into a character vector (list column).
  mutate(Area_bsl = strsplit(Area_bsl, ",")) %>%
  # Keep the leading elements of each vector, dropping the last `b`.
  # seq_len() is used instead of 1:(length(a) - b): when `b` equals the vector
  # length, 1:0 counts down to c(1, 0) and would keep the first element
  # instead of none. max(..., 0) also covers a `b` larger than the length.
  mutate(
    Area_bsl = Map(
      function(a, b) a[seq_len(max(length(a) - b, 0))],
      Area_bsl, diffr
    )
  ) %>%
  # One row per remaining element.
  unnest(Area_bsl) %>%
  group_by(id) %>%
  # After trimming, the largest row number in a group is its cutoff.
  mutate(rowid = row_number(), cutoff = max(rowid))
#> # A tibble: 21 x 5
#> # Groups:   id [3]
#>       id Area_bsl diffr rowid cutoff
#>    <int> <chr>    <dbl> <int>  <int>
#>  1     1 155          3     1      7
#>  2     1 199          3     2      7
#>  3     1 198          3     3      7
#>  4     1 195          3     4      7
#>  5     1 100          3     5      7
#>  6     1 112          3     6      7
#>  7     1 177          3     7      7
#>  8     2 100          0     1      8
#>  9     2 99           0     2      8
#> 10     2 98           0     3      8
#> # ... with 11 more rows

创建于2023-05-05带有reprex v2.0.2

2cmtqfgy

2cmtqfgy4#

使用strsplit + head

library(dplyr)
library(tidyr)
df %>%
  mutate(
    # Split each string and keep all but its last `y` elements.
    # SIMPLIFY = FALSE makes mapply() always return a list; with the default,
    # results of equal length (e.g. a single-row data frame) are collapsed
    # into a matrix or vector, which is not a valid list column for unnest().
    # max(..., 0) keeps head() from receiving a negative count when `y`
    # exceeds the number of elements.
    Area_bsl = mapply(
      \(x, y) head(x, max(length(x) - y, 0)),
      strsplit(Area_bsl, ","), diffr,
      SIMPLIFY = FALSE
    )
  ) %>%
  # One row per remaining element.
  unnest(Area_bsl)

输出

# A tibble: 21 × 3
      id Area_bsl diffr
   <int> <chr>    <dbl>
 1     1 155          3
 2     1 199          3
 3     1 198          3
 4     1 195          3
 5     1 100          3
 6     1 112          3
 7     1 177          3
 8     2 100          0
 9     2 99           0
10     2 98           0
11     2 95           0
12     2 100          0
13     2 112          0
14     2 111          0
15     2 99           0
16     3 131          6
17     3 166          6
18     3 155          6
19     3 111          6
20     3 100          6
21     3 117          6
pbgvytdp

pbgvytdp5#

**更新:**如果我们想省略rowwise操作:

我们可以分组:

# Group by `id` instead of using rowwise(); this presumably relies on every
# `id` occurring on exactly one row, so that `strsplit(...)[[1]]` and
# `head(..., -diffr)` each see a single string and a single count.
df |>
  mutate(
    Area_bsl = ifelse(
      # `head(x, -0)` is `head(x, 0)` and would return nothing, so a zero
      # `diffr` is handled separately by keeping the string untouched.
      diffr == 0,
      Area_bsl,
      # Otherwise: split, drop the last `diffr` pieces, glue back together.
      paste(head(strsplit(Area_bsl, ",")[[1]], -diffr), collapse = ",")
    ),
    .by = id
  ) |>
  # One row per remaining element.
  separate_rows(Area_bsl, sep = ",")

第一个答案:

首先,我们用 strsplit() 拆分字符串 Area_bsl,并去掉末尾的 n = diffr 个元素,然后用 collapse 把剩余元素重新拼接成一个字符串。最后使用 separate_rows 把它展开成多行。

library(dplyr)
library(tidyr)

# Row-by-row variant: each row's string is trimmed on its own, then the
# result is expanded to one row per element and returned as a data.frame.
df |>
  rowwise() |>
  mutate(
    Area_bsl = ifelse(
      # `head(x, -0)` is `head(x, 0)` and would return nothing, so a zero
      # `diffr` keeps the string untouched.
      diffr == 0,
      Area_bsl,
      # Otherwise: split, drop the last `diffr` pieces, glue back together.
      paste(head(strsplit(Area_bsl, ",")[[1]], -diffr), collapse = ",")
    )
  ) |>
  separate_rows(Area_bsl, sep = ",") |>
  data.frame()

library(dplyr)
library(tidyr)

# Same row-by-row trimming, expanded with separate_longer_delim().
df |>
  rowwise() |>
  mutate(
    Area_bsl = ifelse(
      # `head(x, -0)` is `head(x, 0)` and would return nothing, so a zero
      # `diffr` keeps the string untouched.
      diffr == 0,
      Area_bsl,
      # Otherwise: split, drop the last `diffr` pieces, glue back together.
      paste(head(strsplit(Area_bsl, ",")[[1]], -diffr), collapse = ",")
    )
  ) |>
  separate_longer_delim(Area_bsl, delim = ",")
id Area_bsl diffr
1   1      155     3
2   1      199     3
3   1      198     3
4   1      195     3
5   1      100     3
6   1      112     3
7   1      177     3
8   2      100     0
9   2       99     0
10  2       98     0
11  2       95     0
12  2      100     0
13  2      112     0
14  2      111     0
15  2       99     0
16  3      131     6
17  3      166     6
18  3      155     6
19  3      111     6
20  3      100     6
21  3      117     6
yqhsw0fo

yqhsw0fo6#

我们可以先把 Area_bsl 列展开成多行,然后用 filter 只保留 rowid 在保留范围内的行。

library(tidyverse)

df |>
  # One row per element of the comma-separated string.
  separate_longer_delim(Area_bsl, delim = ",") |>
  # Per `id`: number the rows and compute how many of them to keep.
  mutate(
    rowid = row_number(),
    cutoff = max(rowid) - diffr,
    .by = "id"
  ) |>
  # Keep the leading `cutoff` rows of each `id`. Comparing against the
  # `cutoff` column directly replaces the indirect membership test
  # `rowid %in% (rowid - diffr)`, which only matched for whole-number `diffr`.
  # No `.by` is needed here because `cutoff` is already a per-row column.
  filter(rowid <= cutoff)

   id Area_bsl diffr rowid cutoff
1   1      155     3     1      7
2   1      199     3     2      7
3   1      198     3     3      7
4   1      195     3     4      7
5   1      100     3     5      7
6   1      112     3     6      7
7   1      177     3     7      7
8   2      100     0     1      8
9   2       99     0     2      8
10  2       98     0     3      8
11  2       95     0     4      8
12  2      100     0     5      8
13  2      112     0     6      8
14  2      111     0     7      8
15  2       99     0     8      8
16  3      131     6     1      6
17  3      166     6     2      6
18  3      155     6     3      6
19  3      111     6     4      6
20  3      100     6     5      6
21  3      117     6     6      6
2w3kk1z5

2w3kk1z57#

下面是另一个选项,其中sub + scan用于解析Area_bsl列中所需的数字子集

df |>
    mutate(Area_bsl = Map(
        function(txt, drop_n) {
            # Strip the last `drop_n` ",<digits>" pieces from the string.
            kept <- sub(sprintf("(,\\d+){%s}$", drop_n), "", txt)
            # Parse what is left into a numeric vector.
            scan(text = kept, sep = ",", quiet = TRUE)
        },
        Area_bsl, diffr
    )) |>
    # One row per remaining number.
    unnest(Area_bsl) |>
    # Per `id`: row counter and the number of rows that remain.
    mutate(rowid = row_number(), cutoff = n(), .by = id)

它给出了

# A tibble: 21 × 5
      id Area_bsl diffr rowid cutoff
   <int>    <dbl> <dbl> <int>  <int>
 1     1      155     3     1      7
 2     1      199     3     2      7
 3     1      198     3     3      7
 4     1      195     3     4      7
 5     1      100     3     5      7
 6     1      112     3     6      7
 7     1      177     3     7      7
 8     2      100     0     1      8
 9     2       99     0     2      8
10     2       98     0     3      8
# ℹ 11 more rows
# ℹ Use `print(n = ...)` to see more rows

相关问题