R语言 如何根据变量的另一个值添加新的变量?

pdtvr36n  于 2023-05-04  发布在  其他
关注(0)|答案(3)|浏览(199)

这是我的dataframe:

或者你可以复制它:

structure(list(A_levels = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    College_Uni = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), CSEs = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0), NVQ_HND_HNC = c(0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0), O_levels_GCSEs = c(1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1), Other_prof_qual = c(0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0), Prefer_not_to_answer = c(0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0), None_of_the_above = c(0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0), id = 5000:5010), row.names = c(NA, -11L), class = c("tbl_df", 
"tbl", "data. Frame")) -> df

我想改变一个新变量的edu_level,它的值如下:

#my vairabe               edu_level
#College_Uni              high
#A_levels                 medium
#O_levels_GCSEs           medium
#CSEs)                    low
#NVQ_HND_HNC              low
#Other_prof_qual          low
#Prefer_not_to_answer     NA
#None_of_the_above        NA

我想可能是因为是选择题,有些人有两个以上的水平,所以我不知道如何指定分类。
我想首先选择其edu_level的优先级为最高学位。例如,如果它同时具有College_Uni和A_levels,则edu_level的值为高。

whlutmcx

whlutmcx1#

我创建了一个不同教育水平的数据框来展示解决方案;

library(dplyr)
library(tidyr)

myvars %>% 
  mutate(num_level = case_when(is.na(edu_level) ~ 0, 
                               edu_level == "low" ~ 1, 
                               edu_level == "medium" ~ 2,
                               edu_level == "high" ~ 3)) -> myvars

df1 %>% 
  pivot_longer(-id, names_to = "degree") %>% 
  filter(value != 0) %>% 
  right_join(., myvars, join_by(degree == variable)) %>% 
  slice(which.max(num_level), .by = id) %>% 
  select(-value, -num_level) %>% ## you can also drop "degree"
  left_join(., df1)
#> Joining with `by = join_by(id)`
#> # A tibble: 11 x 11
#>       id degree  edu_level A_levels College_Uni  CSEs NVQ_HND_HNC O_levels_GCSEs
#>    <int> <chr>   <chr>        <dbl>       <dbl> <dbl>       <dbl>          <dbl>
#>  1  5000 Colleg~ high             0           1     0           0              1
#>  2  5001 A_leve~ medium           1           0     0           0              1
#>  3  5002 A_leve~ medium           1           0     0           0              1
#>  4  5003 O_leve~ medium           0           0     0           1              1
#>  5  5004 Colleg~ high             0           1     0           0              1
#>  6  5005 Other_~ low              0           0     0           0              0
#>  7  5006 Colleg~ high             0           1     0           1              1
#>  8  5007 CSEs    low              0           0     1           0              0
#>  9  5008 Colleg~ high             0           1     0           0              0
#> 10  5009 Prefer~ <NA>             0           0     0           0              0
#> 11  5010 None_o~ <NA>             0           0     0           0              0
#> # i 3 more variables: Other_prof_qual <dbl>, Prefer_not_to_answer <dbl>,
#> #   None_of_the_above <dbl>

数据:

data.frame(A_levels =             c(0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), 
           College_Uni =          c(1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), 
           CSEs =                 c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), 
           NVQ_HND_HNC =          c(0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0), 
           O_levels_GCSEs =       c(1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0), 
           Other_prof_qual =      c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), 
           Prefer_not_to_answer = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), 
           None_of_the_above =    c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), 
           id = 5000:5010) -> df1

read.table(text = "variable              edu_level
                   College_Uni           high
                   A_levels              medium
                   O_levels_GCSEs        medium
                   CSEs                  low
                   NVQ_HND_HNC           low
                   Other_prof_qual       low
                   Prefer_not_to_answer  NA
                   None_of_the_above     NA", header = T, stringsAsFactor = F) -> myvars
6g8kf2rb

6g8kf2rb2#

试试这个,让我知道它是否有帮助。我将您的调查从0和1转换为排名数字,然后从每行中找到最大数字。
可重现数据

dat <- data.frame(A_levels = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
               College_Uni = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
               CSEs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
               NVQ_HND_HNC = c(0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0), 
               O_levels_GCSEs = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
               Other_prof_qual = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
               Prefer_not_to_answer = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
               None_of_the_above = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
               id = 5000:5010)

scoring_df <- data.frame(variable = c("A_levels", "College_Uni", "CSEs", "NVQ_HND_HNC", "O_levels_GCSEs", "Other_prof_qual", "Prefer_not_to_answer","None_of_the_above"),
                         edu_level = c("medium", "high", "low","low","medium","low",NA, NA),
                         edu_score = c(2,3,1,1,2,1,NA,NA))
# I added the "edu_score"

工作

# remove the id column
dat_scores_only <- dat[, -which(colnames(dat) == "id")] 

# fetch the numeric scores for each column
scoring_ordered <- scoring_df$edu_score[match(scoring_df$variable, colnames(dat) )] 

# replace 0s & 1s with scores using multiplication, https://stackoverflow.com/questions/36111444/multiply-columns-in-a-data-frame-by-a-vector
results <- t(t(dat_scores_only)*scoring_ordered) 

# now check for the maximum level per row
# using apply(matrix, by row = 1 and by column = 2, function is max, argument is to ignore NAs)
max_values_by_row <- apply(results, 1, max, na.rm = T)

# and look back up the score in your scoring df to get the words you wanted per row as a new variable "edu_level"
dat$edu_level <- scoring_df[match(max_values_by_row,scoring_df$edu_score), "edu_level"]

结果(见最右栏)

dat
#   A_levels College_Uni CSEs NVQ_HND_HNC O_levels_GCSEs Other_prof_qual Prefer_not_to_answer None_of_the_above   id edu_score
#1         0           1    0           0              1               0                    0                 0 5000      high
#2         0           1    0           0              1               0                    0                 0 5001      high
#3         0           1    0           0              1               0                    0                 0 5002      high
#4         0           1    0           0              1               0                    0                 0 5003      high
#5         0           1    0           0              1               0                    0                 0 5004      high
#6         0           1    0           0              1               0                    0                 0 5005      high
#7         0           1    0           0              1               0                    0                 0 5006      high
#8         0           1    0           0              1               0                    0                 0 5007      high
#9         0           1    0           0              1               0                    0                 0 5008      high
#10        0           1    0           0              1               0                    0                 0 5009      high
#11        0           1    0           0              1               0                    0                 0 5010      high
nxagd54h

nxagd54h3#

这里有一个方法

library(dplyr)
keyval$edu_level <- ordered(keyval$edu_level, levels = c("low", "medium", "high"))
df1 %>% 
  mutate(edu_level = do.call(pmax, c(across(any_of(keyval$variable), 
   ~ case_when(as.logical(.x) ~ keyval$edu_level[match(cur_column(), 
     keyval$variable)])), na.rm = TRUE))
 )
  • 输出
A_levels College_Uni CSEs NVQ_HND_HNC O_levels_GCSEs Other_prof_qual Prefer_not_to_answer None_of_the_above   id edu_level
1         0           1    0           0              1               0                    0                 0 5000      high
2         1           0    0           0              1               0                    0                 0 5001    medium
3         1           0    0           0              1               0                    0                 0 5002    medium
4         0           0    0           1              1               0                    0                 0 5003    medium
5         0           1    0           0              1               0                    0                 0 5004      high
6         0           0    0           0              0               1                    0                 0 5005       low
7         0           1    0           1              1               0                    0                 0 5006      high
8         0           0    1           0              0               0                    0                 0 5007       low
9         0           1    0           0              0               0                    0                 0 5008      high
10        0           0    0           0              0               0                    1                 0 5009      <NA>
11        0           0    0           0              0               0                    0                 1 5010      <NA>

相关问题