R语言 根据分组变量改变权重变量

q9yhzks0  于 2023-01-10  发布在  其他
关注(0)|答案(2)|浏览(186)

我有如下示例数据:

library(diagis) # weighted_se
# Example data: five rows for the year 2006. Each `*_pop_weights` column holds
# one set of weights, `variable` is a factor labelling the row and `value` is
# the quantity to be summarised.
# Factor levels must be unique: the fourth level is "D_ha" (there is a
# D_ha_pop_weights column) instead of a second "C_ha", and the fourth row uses
# code 3L so that it is still labelled "C_ha".
table_selection <- structure(
  list(
    year = c(2006, 2006, 2006, 2006, 2006),
    Totaal_pop_weights = c(12.125, 12.125, 12.125, 12.125, 12.125),
    Y02_pop_weights = c(97, 97, 97, 97, 97),
    Y01_pop_weights = c(12.125, 12.125, 12.125, 12.125, 12.125),
    h10_pop_weights = c(12.125, 12.125, 12.125, 12.125, 12.125),
    A_ha_pop_weights = c(12.125, 12.125, 12.125, 12.125, 12.125),
    B_ha_pop_weights = c(12.125, 12.125, 12.125, 12.125, 12.125),
    C_ha_pop_weights = c(97, 97, 97, 97, 97),
    D_ha_pop_weights = c(12.125, 12.125, 12.125, 12.125, 12.125),
    variable = structure(
      c(2L, 1L, 1L, 3L, 1L),
      levels = c(
        "A_ha", "B_ha", "C_ha", "D_ha", "Y01", "Y02", "Totaal", "X10"
      ),
      class = "factor"
    ),
    value = c(2, 3, 1, 1, 12.9)
  ),
  row.names = c(NA, -5L),
  class = c("data.table", "data.frame")
)

   year Totaal_pop_weights Y02_pop_weights Y01_pop_weights h10_pop_weights A_ha_pop_weights B_ha_pop_weights
1: 2006             12.125              97          12.125          12.125           12.125           12.125
2: 2006             12.125              97          12.125          12.125           12.125           12.125
3: 2006             12.125              97          12.125          12.125           12.125           12.125
4: 2006             12.125              97          12.125          12.125           12.125           12.125
5: 2006             12.125              97          12.125          12.125           12.125           12.125
   C_ha_pop_weights D_ha_pop_weights variable value
1:               97           12.125     B_ha   2.0
2:               97           12.125     A_ha   3.0
3:               97           12.125     A_ha   1.0
4:               97           12.125     C_ha   1.0
5:               97           12.125     A_ha  12.9

我想按如下方式对这些观测值进行加权:

# Keep only the columns whose names contain "weights".
weights_of_interest <- select(table_selection, contains("weights"))

# Per year/variable group: weighted mean and weighted standard error of
# `value`, both computed with Y01_pop_weights as the weight.
table_selection <- summarize(
  group_by(table_selection, year, variable),
  weighted_mean = weighted_mean(value, w = Y01_pop_weights, na.rm = TRUE),
  weighted_se = weighted_se(value, w = Y01_pop_weights, na.rm = TRUE)
)

但是它一直使用相同的权重,我怎么改变权重,使得变量为A_ha的值,使用A_ha_pop_weights作为权重。

0ve6wy6x

0ve6wy6x1#

如果table_selection是一个data.table(如示例数据所示),则可以新建一列wt,根据variable中的值保存对应的总体权重(pop weights)值

# Add a `wt` column by reference: for each row, take the value of the column
# named "<variable>_pop_weights". Grouping by row index makes `.SD` a one-row
# table, so `[1]` picks that row's weight. Only columns matching
# "ha_pop_weights" are made available through `.SD`.
# seq_len() replaces 1:nrow() so that a zero-row table produces an empty
# grouping vector instead of c(1, 0).
table_selection[
  ,
  wt := .SD[[paste0(variable, "_pop_weights")]][1],
  by = seq_len(nrow(table_selection)),
  .SDcols = patterns("ha_pop_weights")
]

下面是使用dplyr的rowwise()和c_across()实现的相同方法

# helper function
# Returns the first element of the "<v>_pop_weights" entry of `d`.
f <- function(d, v) {
  weight_name <- paste0(v, "_pop_weights")
  weight_values <- d[[weight_name]]
  weight_values[1]
}

# names of the columns of table_selection that end in "ha_pop_weights"
ha_wts <- grep("ha_pop_weights$", names(table_selection), value = TRUE)

# Row by row, gather the `ha_wts` columns into a named vector and let `f` pick
# the entry that matches `variable`; the result becomes the `wt` column.
mutate(
  rowwise(table_selection),
  wt = f(setNames(c_across(all_of(ha_wts)), ha_wts), variable)
)

无论使用哪种方法,您都可以在上面对summarize()的调用中使用w=wt

wfsdck30

wfsdck302#

如果你想要一个tidyverse的解决方案,我认为最好的方法是使用tidyr把数据转换成长格式。我的计算机不知道函数'weighted_mean'或'weighted_se',所以我不能100%肯定这会工作。

library(magrittr)
# Stack every column whose name contains "weights" into long format (column
# name in NAMES, its values in pop_values), then compute the weighted mean and
# weighted standard error of `value` for each year/variable/NAMES combination.
table_selection %>%
  tidyr::pivot_longer(
    cols = tidyselect::contains("weights"),
    names_to = "NAMES",
    values_to = "pop_values"
  ) %>%
  dplyr::group_by(year, variable, NAMES) %>%
  dplyr::summarise(
    weighted_mean = weighted_mean(value, w = pop_values, na.rm = TRUE),
    weighted_se = weighted_se(value, w = pop_values, na.rm = TRUE)
  )

但如果改用stats包中的weighted.mean函数...

# Stack every column whose name contains "weights" into long format (column
# name in NAMES, its values in pop_values), then compute the weighted mean of
# `value` for each year/variable/NAMES combination with stats::weighted.mean.
table_selection %>%
  tidyr::pivot_longer(cols = tidyselect::contains("weights"),
                      values_to = "pop_values",
                      names_to = "NAMES") %>%
  dplyr::group_by(year, variable, NAMES) %>%
  dplyr::summarize(
    # The trailing comma is removed and the call is closed explicitly: the
    # closing parenthesis used to sit inside the commented-out line below,
    # which left the expression unterminated.
    weighted_mean = stats::weighted.mean(value, w = pop_values, na.rm = TRUE)
    # weighted_se = weighted_se(value, w = pop_values, na.rm = TRUE)
  )

返回:

# A tibble: 24 x 4
# Groups:   year, variable [3]
    year variable NAMES              weighted_mean
   <dbl> <fct>    <chr>                      <dbl>
 1  2006 A_ha     A_ha_pop_weights            5.63
 2  2006 A_ha     B_ha_pop_weights            5.63
 3  2006 A_ha     C_ha_pop_weights            5.63
 4  2006 A_ha     D_ha_pop_weights            5.63
 5  2006 A_ha     h10_pop_weights             5.63
 6  2006 A_ha     Totaal_pop_weights          5.63
 7  2006 A_ha     Y01_pop_weights             5.63
 8  2006 A_ha     Y02_pop_weights             5.63
 9  2006 B_ha     A_ha_pop_weights            2   
10  2006 B_ha     B_ha_pop_weights            2   
# ... with 14 more rows

相关问题