返回R中具有最高、第二高和第三高值的列的名称

qij5mzcb 于 2022-12-20 发布在其他

关注(0)|答案(4)|浏览(180)

我一直在努力解决这个问题，希望有人能帮忙……
我有一个数据集，希望row_max是A:N各列中取值最大的那一列的列名，row_min则是取值第二大（且不为零）的那一列的列名。
我尝试了以下方法来获取row_max：

# Label every row with the name of the column (among columns 5-9) that holds
# its largest value.
tmp$row_max <- names(tmp[, 5:9])[
  apply(tmp[, 5:9], 1, which.max)
]

which.min返回包含0的列

contig  pos ref cov  A  T  C  G N row_max 
1: NW_017095466.1  130   N  41 39  2  0  0 0       A 
2: NW_017095466.1  166   N  48  0 46  2  0 0       T 
3: NW_017095466.1  427   N  52 50  0  0  2 0       A 
4: NW_017095466.1 1736   N  54 44  0 10  0 0       A 
5: NW_017095466.1 1918   N  46  0  0  3 43 0       G 
6: NW_017095466.1 2688   N  52  5  0 47  0 0       C

我可以对列进行排序，然后选择第二高的列，但这会给我值，很难得到要返回的column_name：

# Second-largest value of each row: element 4 of the five counts once they are
# sorted in ascending order. This yields the value, not the column name.
apply(
  tmp[, 5:9],
  1,
  FUN = function(counts) sort(counts)[4]
)

有没有一个简洁的tidyverse解决方案？

dput(tmp)
structure(list(contig = c("NW_017095466.1", "NW_017095466.1", 
"NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"
), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N", 
"N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L
), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L, 
0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L, 
0L), N = c(0L, 0L, 0L, 0L, 0L, 0L), row_max = c("A", "T", "A", 
"A", "G", "C"), row_min = c("C", "A", "T", "T", "A", "T")), row.names = c(NA, 
-6L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x7f94c30168e0>)

来源：https://stackoverflow.com/questions/74841239/return-the-name-of-the-column-with-highest-and-2nd-highest-and-3rd-highest-value

4条答案

按热度按时间

iyr7buue1#

使用order()可以很容易地做到这一点：它返回各元素排序后所对应的原始位置，用这些位置按顺序选取列名，就能得到一个按元素大小排序的名称矩阵。

library(tidyverse)

# Example data: six positions with per-nucleotide counts in columns A, T, C, G
# and N (columns 5-9). Note that the class vector includes "data.table".
tmp <- structure(list(contig = c("NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N", "N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L, 0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L, 0L), N = c(0L, 0L, 0L, 0L, 0L, 0L), row_max = c("A", "T", "A", "A", "G", "C"), row_min = c("C", "A", "T", "T", "A", "T")), row.names = c(NA, -6L), class = c("data.table", "data.frame"))

# Rank the nucleotide columns within each row, from largest count to smallest.
# Columns are selected with `tmp[, 5:9]` rather than `tmp[5:9]`: `tmp` carries
# the "data.table" class, and list-style `[` on a data.table picks rows 5-9
# instead of columns as soon as data.table is attached.
tmp[, 5:9] %>%
  # order() gives ascending positions; rev() flips them to descending names.
  apply(1, \(row) rev(names(.)[order(row)])) %>%
  # apply() returns one column per input row, so transpose back to row-wise.
  t()
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,] "A"  "T"  "N"  "G"  "C" 
#> [2,] "T"  "C"  "N"  "G"  "A" 
#> [3,] "A"  "G"  "N"  "C"  "T" 
#> [4,] "A"  "C"  "N"  "G"  "T" 
#> [5,] "G"  "C"  "N"  "T"  "A" 
#> [6,] "C"  "A"  "N"  "G"  "T"

赞(0）回复(0）举报 2022-12-20

s4n0splo2#

这里有一种在计算最小值时排除0的方法。还要记住，如果数据同时包含numeric和character，apply会把所有值都转换为character，因此显式转换为数值可以避免出现多余的空格。

# Append row_max / row_min columns to `tmp`.
#   row_max: name of the column (among columns 5-8) with the largest count.
#   row_min: name of the column (among columns 5-8) with the smallest count
#            above zero, or NA when the row has no positive count.
data.frame(
  tmp,
  t(apply(tmp, 1, function(x) {
    # apply() hands over a mixed-type row as character, so convert back.
    nuc_num <- as.numeric(x[5:8])
    nuc_names <- colnames(tmp)[5:8]
    # which() drops NA, so positions and values stay aligned below.
    positive <- which(nuc_num > 0)
    row_min <- if (length(positive) > 0) {
      nuc_names[positive[which.min(nuc_num[positive])]]
    } else {
      # Keep the result length fixed so t() still receives a matrix.
      NA_character_
    }
    c(row_max = nuc_names[which.max(nuc_num)], row_min = row_min)
  }))
)
          contig  pos ref cov  A  T  C  G N row_max row_min
1 NW_017095466.1  130   N  41 39  2  0  0 0       A       T
2 NW_017095466.1  166   N  48  0 46  2  0 0       T       C
3 NW_017095466.1  427   N  52 50  0  0  2 0       A       G
4 NW_017095466.1 1736   N  54 44  0 10  0 0       A       C
5 NW_017095466.1 1918   N  46  0  0  3 43 0       G       C
6 NW_017095466.1 2688   N  52  5  0 47  0 0       C       A

使用dplyr

library(dplyr)

# Row by row, add row_max (name of the column among A:G with the largest
# value) and row_min (name of the column among A:G with the smallest value
# that is greater than 0). In the example data A:G spans A, T, C and G, so
# column N is not considered.
tmp %>% 
  rowwise() %>% 
  # nuc and nuc_nzero are temporary helper columns; both are removed again by
  # the `= NULL` assignments at the end of this mutate().
  mutate(nuc = list(names(across(A:G))), 
         row_max = nuc[which.max(across(A:G))], 
         # positions (within A:G) whose value is above 0
         nuc_nzero = list(which(across(A:G) > 0)), 
         row_min = nuc[nuc_nzero[which.min(across(A:G)[nuc_nzero])]], 
         nuc = NULL, nuc_nzero = NULL) %>% 
  ungroup()
# A tibble: 6 × 11
  contig           pos ref     cov     A     T     C     G     N row_max row_min
  <chr>          <int> <chr> <int> <int> <int> <int> <int> <int> <chr>   <chr>
1 NW_017095466.1   130 N        41    39     2     0     0     0 A       T
2 NW_017095466.1   166 N        48     0    46     2     0     0 T       C
3 NW_017095466.1   427 N        52    50     0     0     2     0 A       G
4 NW_017095466.1  1736 N        54    44     0    10     0     0 A       C
5 NW_017095466.1  1918 N        46     0     0     3    43     0 G       C
6 NW_017095466.1  2688 N        52     5     0    47     0     0 C       A

数据

# Example data as a plain data.frame: six positions with per-nucleotide counts
# in columns A, T, C, G and N, without any row_max / row_min columns.
tmp <- structure(list(contig = c("NW_017095466.1", "NW_017095466.1",
"NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"
), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N",
"N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L
), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L,
0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L,
0L), N = c(0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA, -6L), class = "data.frame")

赞(0）回复(0）举报 2022-12-20

zpqajqem3#

下面是另一个解决方案：

library(tidyverse)

# For every row, find the column in A:N with the largest value (`max`) and the
# column with the smallest non-zero value (`min`), then restore the wide layout.
tmp |>
  mutate(row = row_number()) |> # row id, so each original row is one group
  group_by(row) |>
  pivot_longer(cols = A:N) |>
  # which.max() yields a single position even when several columns tie, so the
  # result always recycles cleanly across the group.
  mutate(max = name[which.max(value)]) |>
  # Hide zeros temporarily so they cannot win the minimum.
  mutate(value = ifelse(value == 0, NA, value)) |>
  # Spell out TRUE: `T` is an ordinary, reassignable variable, and this dataset
  # even has a column named T.
  mutate(tm = min(value, na.rm = TRUE)) |>
  mutate(value = ifelse(is.na(value), 0, value)) |>
  # match() picks the first column equal to the minimum and gives NA when no
  # column matches (e.g. a row made up entirely of zeros).
  mutate(min = name[match(tm, value)]) |>
  pivot_wider(names_from = name, values_from = value) |>
  ungroup() |>
  select(-row, -tm) |>
  relocate(c(min, max), .after = last_col())
#> # A tibble: 6 × 11
#>   contig           pos ref     cov     A     T     C     G     N min   max  
#>   <chr>          <int> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 NW_017095466.1   130 N        41    39     2     0     0     0 T     A    
#> 2 NW_017095466.1   166 N        48     0    46     2     0     0 C     T    
#> 3 NW_017095466.1   427 N        52    50     0     0     2     0 G     A    
#> 4 NW_017095466.1  1736 N        54    44     0    10     0     0 C     A    
#> 5 NW_017095466.1  1918 N        46     0     0     3    43     0 C     G    
#> 6 NW_017095466.1  2688 N        52     5     0    47     0     0 A     C

赞(0）回复(0）举报 2022-12-20

aurhwmvo4#

由于您的示例数据集似乎是data.table，这里给出一种data.table的解法

# Build a long-format helper table: melt the columns whose names are a single
# capital letter, sort by value (largest first), keep the first two rows per
# pos, then drop rows whose value is 0 (`!value == 0` parses as `!(value == 0)`).
tempdata <- melt(tmp, measure.vars = patterns("^[A-Z]$"))[order(-value), .SD[1:2], by = .(pos)][!value == 0,]
# Update join on pos: collapse the remaining column names per pos into one
# comma-separated string, split it apart again and assign the pieces by
# reference to the new columns row.max and row.min; the trailing [] prints.
# NOTE(review): assumes pos identifies each row uniquely and that two non-zero
# values remain per pos - confirm before using on real data.
tmp[tempdata[, paste0(variable, collapse = ","), by = .(pos)], 
    c("row.max", "row.min") := tstrsplit(i.V1, ","), on = .(pos)][]
#            contig  pos ref cov  A  T  C  G N row.max row.min
# 1: NW_017095466.1  130   N  41 39  2  0  0 0       A       T
# 2: NW_017095466.1  166   N  48  0 46  2  0 0       T       C
# 3: NW_017095466.1  427   N  52 50  0  0  2 0       A       G
# 4: NW_017095466.1 1736   N  54 44  0 10  0 0       A       C
# 5: NW_017095466.1 1918   N  46  0  0  3 43 0       G       C
# 6: NW_017095466.1 2688   N  52  5  0 47  0 0       C       A

赞(0）回复(0）举报 2022-12-20

我来回答

返回R中具有最高、第二高和第三高值的列的名称

4条答案

数据

相关问题

热门标签

最新问答