如何在 R 中将 3 列合并为一列(不留空白)

n3schb8v  于 2023-10-13  发布在  其他
关注(0)|答案(5)|浏览(116)

我有 3 个单独的列,每一行只有其中一列有数据,如下所示。

| Country | Country 2 | Country 3 |
| ------- | --------- | --------- |
| UK      |           |           |
|         | France    |           |
|         |           | Germany   |
| Italy   |           |           |
| USA     |           |           |
|         | France    |           |
|         |           | Germany   |

我想创建一个新列,把这三列合并在一起,如下所示。

| Final Country |
| ------------- |
| UK            |
| France        |
| Germany       |
| Italy         |
| USA           |
| France        |
| Germany       |

我试过使用合并和粘贴函数,它们似乎对 2 列有效,但对 3 列无效。
注意:这不是整个表,所以我希望能够选择其中的3列

9cbw7uwe

9cbw7uwe1#

在这种情况下,使用基础 R 的一个简单替代方法是按行应用 max(“借用”了 Seth 的数据——谢谢 @Seth!):

# Row-wise max over all columns: "" sorts before any non-empty string, so
# max() returns the filled cell when only one cell per row is non-empty.
df[["finalcountry"]] <- apply(df, MARGIN = 1, FUN = max)

#   country1 country2 country3 finalcountry
# 1       UK                             UK
# 2            France                France
# 3                    Germany      Germany
# 4    Italy                          Italy
# 5      USA                            USA
# 6            France                France
# 7                    Germany      Germany

如果只想对某些列应用它,可以用任意你喜欢的方式指定这些列,例如:

# Pick the columns to combine -- any ONE of the following works.
# By position:
wantcols <- 2:4
# or by name pattern; anchored with ^ so that an existing `finalcountry`
# column (whose name also contains "country") is not picked up:
wantcols <- grep("^country", names(df))
# or by explicit names:
wantcols <- c("country1", "country2", "country3")

# Row-wise max over the selected columns only; "" sorts before any non-empty
# string, so max() returns the filled cell when one cell per row is non-empty.
df$finalcountry <- apply(df[wantcols], 1, max)

#   other_data country1 country2 country3 finalcountry
# 1         88       UK                             UK
# 2         81            France                France
# 3         76                    Germany      Germany
# 4        100    Italy                          Italy
# 5         60      USA                            USA
# 6         70            France                France
# 7         82                    Germany      Germany

如果想把结果放在一个单独的数据框(data frame)中:

data.frame(finalcountry = apply(df[wantcols], 1, max))

#   finalcountry
# 1           UK
# 2       France
# 3      Germany
# 4        Italy
# 5          USA
# 6       France
# 7      Germany

注意,如果数据中是 NA 值而不是空字符串,请添加 na.rm = TRUE:

apply(df[wantcols], 1, max, na.rm = TRUE)

在此转载 @Seth 的数据,以备日后查阅:

# Example data (after @Seth): other_data is a random filler column; each row
# has exactly one non-empty country cell.
df <- data.frame(
  other_data = sample(50:100, size = 7),
  country1 = c("UK", "", "", "Italy", "USA", "", ""),
  country2 = c("", "France", "", "", "", "France", ""),
  country3 = c("", "", "Germany", "", "", "", "Germany")
)
jc3wubiy

jc3wubiy2#

**编辑**:问题已澄清,只需要粘贴其中部分列。已更新列的选择方式。

您可以使用tidyr::unite()粘贴多个列,按名称提供一组指定的列,或使用tidy-select函数,如本例所示:

library(dplyr)
library(tidyr)

# Paste every column whose name starts with "country" into a single
# Final_Country column; the source columns are dropped (unite()'s default).
df %>%
  unite("Final_Country", starts_with("country"), sep = "")
#>   other_data Final_Country
#> 1         83            UK
#> 2         98        France
#> 3         80       Germany
#> 4         59         Italy
#> 5         70           USA
#> 6         63        France
#> 7         72       Germany

数据

structure(list(other_data = c(77L, 63L, 94L, 73L, 82L, 52L, 80L
), country1 = c("UK", "", "", "Italy", "USA", "", ""), country2 = c("", 
"France", "", "", "", "France", ""), country3 = c("", "", "Germany", 
"", "", "", "Germany")), class = "data.frame", row.names = c(NA, 
-7L))
inkz8wg9

inkz8wg93#

另一个基础 R 的替代方案:

> # paste only the country columns; pasting every column of df would also glue other_data onto each value
> data.frame(Final_Country = apply(df[grep("^country", names(df))], 1, paste0, collapse = ""))
  Final_Country
1            UK
2        France
3       Germany
4         Italy
5           USA
6        France
7       Germany

另一个使用coalesce的解决方案是:

> # keep only the country columns first: na_if(., "") and coalesce() expect
> # columns of one common type, so a numeric column such as other_data would
> # break the pipeline
> df %>%
    select(starts_with("country")) %>%
    mutate(across(everything(), ~na_if(., ""))) %>%
    do.call(coalesce, .) %>%
    data.frame(Final_country = .)
  Final_country
1            UK
2        France
3       Germany
4         Italy
5           USA
6        France
7       Germany

我也从@Seth获取了数据。

ej83mcc0

ej83mcc04#

data.table版本:

library(data.table)

# Example data: each row has exactly one non-empty country_* cell.
DF <- data.frame(
  country_1 = c("UK", "", "", "Italy", "USA", "", ""),
  country_2 = c("", "France", "", "", "", "France", ""),
  country_3 = c("", "", "Germany", "", "", "", "Germany")
)

# Convert to a data.table by reference (no copy).
setDT(DF)
# Paste the selected columns element-wise into a new column. Selecting by
# pattern instead of names(DF) keeps unrelated columns out of the result when
# the table has more than these three, and stays correct if the line is re-run
# after `country` has been added.
DF[, country := do.call(paste0, .SD), .SDcols = patterns("^country_")]
print(DF)
lmyy7pcs

lmyy7pcs5#

一种可能的做法是先转置数据,然后取出其中不等于 '' 的元素。

x <- DF[c("country1", "country2", "country3")] #select the 3 columns

x <- t(x)  #transpose the data
x[x != ''] #Get those which are not empty
#x[!is.na(x) & x != ''] #In case there are NA
#[1] "UK"      "France"  "Germany" "Italy"   "USA"     "France"  "Germany"

或者使用max.col

x <- DF[c("country1", "country2", "country3")] #select the 3 columns

# max.col() returns, per row, the column index of the largest value; on the
# logical matrix x != '' that is a column holding a non-empty cell.
# ties.method = "first" replaces the default "random", so a row with several
# non-empty cells yields the same (left-most) one on every run.
x[cbind(seq_len(nrow(x)), max.col(x != '', ties.method = "first"))]
#x[cbind(seq_len(nrow(x)), max.col(!is.na(x) & x != '', ties.method = "first"))]  #In case there are NA
#[1] "UK"      "France"  "Germany" "Italy"   "USA"     "France"  "Germany"

或者在do.call中使用paste0

x <- DF[c("country1", "country2", "country3")] #select the 3 columns

#x[is.na(x)] <- ''  #In case there are NA
do.call(paste0, x)
#[1] "UK"      "France"  "Germany" "Italy"   "USA"     "France"  "Germany"

或者在do.call中使用pmax

x <- DF[c("country1", "country2", "country3")] #select the 3 columns

#x[is.na(x)] <- ''  #In case there are NA
do.call(pmax, x)
#[1] "UK"      "France"  "Germany" "Italy"   "USA"     "France"  "Germany"

数据

# Example data: three sparse country columns flanked by two unrelated
# columns (a, b); each row has exactly one non-empty country cell.
DF <- data.frame(
  a = seq_len(7L),
  country1 = c("UK", "", "", "Italy", "USA", "", ""),
  country2 = c("", "France", "", "", "", "France", ""),
  country3 = c("", "", "Germany", "", "", "", "Germany"),
  b = rev(seq_len(7L))
)

基准

x <- DF[c("country1", "country2", "country3")] #select the 3 columns

# Time each approach on the same three columns. By default bench::mark() also
# checks that all expressions return equal results.
bench::mark(
           max.col = x[cbind(seq_len(nrow(x)), max.col(x != ''))],
           applyPaste = apply(x, 1, paste0, collapse=""),  # @Jilber Urbina
           applyMax = apply(x, 1, max),  # @jpsmith
           t = {y <- t(x); y[y != '']},
           pmax = do.call(pmax, x),
           paste0 = do.call(paste0, x)
       )
#  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
#1 max.col    67.07µs 72.85µs    13211.    2.91KB     32.3  6138    15    464.6ms
#2 applyPaste 48.62µs 53.49µs    18218.      432B     30.1  8484    14    465.7ms
#3 applyMax    41.5µs 44.65µs    21755.      432B     32.7  9985    15      459ms
#4 t          24.06µs 26.39µs    36081.      432B     32.5  9991     9    276.9ms
#5 pmax        5.74µs  6.57µs   147943.        0B     44.4  9997     3     67.6ms
#6 paste0      2.56µs  2.91µs   319035.        0B     31.9  9999     1     31.3ms

相关问题