如何在R中有效地对跨国时间序列数据执行减法?

dddzy1tm  于 2023-03-27  发布在  其他
关注(0)|答案(2)|浏览(94)

我的数据如下所示

structure(list(Country.Name = c("Afghanistan", "Africa Eastern and Southern", 
"Africa Western and Central", "Albania", "Algeria", "American Samoa", 
"Andorra", "Angola", "Antigua and Barbuda", "Arab World"), Indicator.Name.x = c("Employment in agriculture, female (% of female employment) (modeled ILO estimate)", 
"Employment in agriculture, female (% of female employment) (modeled ILO estimate)", 
"Employment in agriculture, female (% of female employment) (modeled ILO estimate)", 
"Employment in agriculture, female (% of female employment) (modeled ILO estimate)", 
"Employment in agriculture, female (% of female employment) (modeled ILO estimate)", 
"Employment in agriculture, female (% of female employment) (modeled ILO estimate)", 
"Employment in agriculture, female (% of female employment) (modeled ILO estimate)", 
"Employment in agriculture, female (% of female employment) (modeled ILO estimate)", 
"Employment in agriculture, female (% of female employment) (modeled ILO estimate)", 
"Employment in agriculture, female (% of female employment) (modeled ILO estimate)"
), X1991_f = c(76.37999725, 70.89825914, 57.87195138, 66.16000366, 
14.88000011, NA, NA, 45.75, NA, 46.76332995), X1992_f = c(76.41000366, 
70.90647329, 57.68949562, 65.09999847, 14.65999985, NA, NA, 46.56999969, 
NA, 46.41667524), X1993_f = c(76.76000214, 70.7319986, 57.41803289, 
64.56999969, 14.55000019, NA, NA, 47.09999847, NA, 44.36386763
), X1994_f = c(76.33999634, 70.56800973, 57.07966283, 63.97000122, 
14.44999981, NA, NA, 47.45999908, NA, 44.80191031), X1995_f = c(76.65000153, 
70.35511259, 56.72241777, 63.27000046, 14.31000042, NA, NA, 46.81999969, 
NA, 43.64430037), X1996_f = c(77.05000305, 70.18675759, 56.29510159, 
64.83999634, 14.02999973, NA, NA, 46.24000168, NA, 42.95114865
), X1997_f = c(77.41000366, 69.97559992, 55.80528904, 65.48999786, 
14.01000023, NA, NA, 45.47000122, NA, 42.12963533), X1998_f = c(77.75, 
69.8989892, 55.26462465, 65.01999664, 13.39000034, NA, NA, 44.41999817, 
NA, 40.20414039), X1999_f = c(78.19000244, 69.8013891, 54.73848758, 
64.75, 13.30000019, NA, NA, 45.24000168, NA, 38.70946622), X2000_f = c(78.79000092, 
70.24647238, 54.24866548, 63.97999954, 13.55000019, NA, NA, 44.93000031, 
NA, 41.05161104), X2001_f = c(79.63999939, 70.42984028, 53.48707555, 
63.20000076, 13.03999996, NA, NA, 44.09999847, NA, 38.05426761
), X2002_f = c(78.72000122, 70.92044757, 52.72674308, 62.18999863, 
12.34000015, NA, NA, 43.99000168, NA, 36.20345416), X2003_f = c(78.38999939, 
70.90105446, 52.08667381, 61.06000137, 11.60999966, NA, NA, 43.97999954, 
NA, 37.82208055), X2004_f = c(78.95999908, 70.67664189, 51.49707855, 
60.34999847, 10.06999969, NA, NA, 44.22999954, NA, 38.9673068
), X2005_f = c(78.08999634, 70.79609724, 50.5431249, 59.65999985, 
8.859999657, NA, NA, 46.08000183, NA, 38.32149263), X2006_f = c(77.63999939, 
70.16321174, 49.64469383, 58.54000092, 7.489999771, NA, NA, 47.77999878, 
NA, 36.74298129), X2007_f = c(76.37999725, 69.54450304, 48.60527577, 
57.72999954, 6.269999981, NA, NA, 49.38000107, NA, 37.2889534
), X2008_f = c(76.16000366, 68.54660244, 47.37431999, 56.88999939, 
5.329999924, NA, NA, 51.13999939, NA, 35.97835105), X2009_f = c(74.22000122, 
68.0518831, 46.19847271, 57.22000122, 4.199999809, NA, NA, 52.75, 
NA, 35.34189842), X2010_f = c(72.90000153, 67.76753275, 44.96248185, 
53.09000015, 3.519999981, NA, NA, 55.47000122, NA, 33.55782243
), X2011_f = c(72.41000366, 66.98750257, 43.8503065, 54.36999893, 
2.960000038, NA, NA, 58.00999832, NA, 32.65826301), X2012_f = c(70.94999695, 
66.02737174, 42.36432472, 57, 3.170000076, NA, NA, 58.45000076, 
NA, 30.48576338), X2013_f = c(69.98999786, 65.45106245, 41.00997613, 
53.16999817, 3.319999933, NA, NA, 58.45999908, NA, 31.3212523
), X2014_f = c(69.18000031, 64.74486799, 39.422334, 49.88000107, 
3.460000038, NA, NA, 58.20000076, NA, 31.1129401), X2015_f = c(68.59999847, 
64.0600908, 37.89897833, 47.68000031, 3.569999933, NA, NA, 57.79999924, 
NA, 29.60966539), X2016_f = c(67.55999756, 63.55588709, 37.23167103, 
45.22999954, 3.700000048, NA, NA, 57.52000046, NA, 28.65057866
), X2017_f = c(66.22000122, 62.95115242, 36.39239843, 42.38999939, 
3.839999914, NA, NA, 57.20000076, NA, 28.24277769), X2018_f = c(65.76999664, 
62.45878849, 35.75188861, 42.27999878, 3.579999924, NA, NA, 57.06000137, 
NA, 23.98984823), X2019_f = c(64.95999908, 61.93481888, 34.98255693, 
41.63000107, 3.380000114, NA, NA, 56.77999878, NA, 23.03898223
), Indicator.Name.y = c("Employment in agriculture, male (% of male employment) (modeled ILO estimate)", 
"Employment in agriculture, male (% of male employment) (modeled ILO estimate)", 
"Employment in agriculture, male (% of male employment) (modeled ILO estimate)", 
"Employment in agriculture, male (% of male employment) (modeled ILO estimate)", 
"Employment in agriculture, male (% of male employment) (modeled ILO estimate)", 
"Employment in agriculture, male (% of male employment) (modeled ILO estimate)", 
"Employment in agriculture, male (% of male employment) (modeled ILO estimate)", 
"Employment in agriculture, male (% of male employment) (modeled ILO estimate)", 
"Employment in agriculture, male (% of male employment) (modeled ILO estimate)", 
"Employment in agriculture, male (% of male employment) (modeled ILO estimate)"
), X1991_m = c(61.11999893, 61.38082542, 60.42776603, 51.61000061, 
26.45999908, NA, NA, 34.15000153, NA, 31.40304127), X1992_m = c(61.40000153, 
61.57731927, 60.35758857, 52.63000107, 26.31999969, NA, NA, 33.54000092, 
NA, 31.06940314), X1993_m = c(62.24000168, 61.6839476, 60.41184424, 
52.13999939, 26.12999916, NA, NA, 33.63999939, NA, 30.46141617
), X1994_m = c(62.36999893, 61.68793227, 60.38606372, 51.59000015, 
25.89999962, NA, NA, 33.02999878, NA, 30.04424893), X1995_m = c(62.15999985, 
61.65570615, 60.32138756, 50.84000015, 25.63999939, NA, NA, 32.58000183, 
NA, 29.61220427), X1996_m = c(62.54999924, 61.54173509, 60.11798216, 
49, 25.36000061, NA, NA, 31.98999977, NA, 28.91751457), X1997_m = c(62.86000061, 
61.66644396, 59.86985078, 48.72000122, 24.96999931, NA, NA, 31.84000015, 
NA, 28.32475036), X1998_m = c(63.08000183, 61.65208625, 59.63380569, 
47.93999863, 24.64999962, NA, NA, 32.06999969, NA, 27.85096791
), X1999_m = c(63.31000137, 61.56719624, 59.46312819, 46.31000137, 
24.31999969, NA, NA, 30.65999985, NA, 27.42025206), X2000_m = c(63.65000153, 
61.63792017, 59.1542363, 44.91999817, 23.70999908, NA, NA, 30.19000053, 
NA, 27.1720503), X2001_m = c(64.01999664, 61.78336159, 58.77138786, 
42.95999908, 23.07999992, NA, NA, 30.44000053, NA, 26.93498439
), X2002_m = c(61.97000122, 61.75729597, 58.04072823, 41.70000076, 
23.21999931, NA, NA, 29.21999931, NA, 26.502578), X2003_m = c(61.15000153, 
61.67732635, 57.43396689, 40.93999863, 23.36000061, NA, NA, 28.97999954, 
NA, 26.1462127), X2004_m = c(60.50999832, 61.38274438, 56.65664598, 
39.77999878, 21.80999947, NA, NA, 28.14999962, NA, 26.00187369
), X2005_m = c(59.29999924, 61.34758091, 56.26699531, 38.56000137, 
20.18000031, NA, NA, 30.14999962, NA, 25.03849467), X2006_m = c(58.38999939, 
60.77753162, 55.81997131, 37.43000031, 18.70999908, NA, NA, 32.63999939, 
NA, 24.39160015), X2007_m = c(57.29000092, 60.37245552, 55.24539063, 
36.68000031, 17.23999977, NA, NA, 35.02000046, NA, 23.4421701
), X2008_m = c(56.36000061, 60.08068911, 54.64176858, 35.75, 
15.90999985, NA, NA, 37.56000137, NA, 22.77119995), X2009_m = c(53.88999939, 
60.04838745, 53.96863359, 35.04999924, 14.61999989, NA, NA, 40.20000076, 
NA, 21.59570526), X2010_m = c(51.38999939, 59.77404798, 53.1852058, 
34.25999832, 13.35999966, NA, NA, 42.54000092, NA, 20.65910673
), X2011_m = c(50.20000076, 59.32684789, 52.3719792, 38.81000137, 
12.30000019, NA, NA, 44.52000046, NA, 20.42663255), X2012_m = c(47.81000137, 
58.86642424, 51.54752003, 37.59000015, 12.28999996, NA, NA, 44.09999847, 
NA, 19.87213684), X2013_m = c(45.88000107, 58.75190065, 50.68928721, 
37.56999969, 12.15999985, NA, NA, 43.99000168, NA, 19.51637588
), X2014_m = c(44.09000015, 58.25105258, 49.53175737, 36.75999832, 
11.94999981, NA, NA, 44.15000153, NA, 19.30844549), X2015_m = c(42.29999924, 
57.82885815, 48.63115767, 36.74000168, 11.77999973, NA, NA, 44.54000092, 
NA, 18.8722441), X2016_m = c(40.65999985, 57.63116957, 48.50541695, 
36.29000092, 11.56000042, NA, NA, 44.79999924, NA, 18.44252983
), X2017_m = c(38.36000061, 57.44003748, 47.97781438, 34.95999908, 
11.35999966, NA, NA, 44.79999924, NA, 17.91710234), X2018_m = c(37.25999832, 
57.10172448, 47.43563728, 33.72000122, 11.05000019, NA, NA, 44.75, 
NA, 17.47135992), X2019_m = c(36.59999847, 56.98322298, 46.98550533, 
32.70999908, 10.76000023, NA, NA, 44.65000153, NA, 16.91990401
)), row.names = c(NA, 10L), class = "data.frame")

_f结尾的变量表示该年女性在农业中的份额,而以_m结尾的变量表示该年男性在农业中的份额。我如何找到以_f_m结尾的相应变量的差异。
例如,1991年女性在农业中的份额由变量X1991_f给出,而1991年男性在农业中的份额为X1991_m。粗略地说,我可以通过减去这两个变量来找到差异,但鉴于我有1991年至2019年的跨国和时间序列数据,我如何有效地为所有数据年份做到这一点?
谢谢你。

mwkjh3gx

mwkjh3gx1#

你可以通过使用grep索引和简单的减法在R中实现这一点。
我将缩短您的数据集,使其成为一个 * 最小 * 可重复的示例(df_small),仅使用三年(1991 - 1993):

df_small

#                 Country.Name  X1991_f  X1992_f  X1993_f  X1991_m  X1992_m  X1993_m
#1                  Afghanistan 76.38000 76.41000 76.76000 61.12000 61.40000 62.24000
#2  Africa Eastern and Southern 70.89826 70.90647 70.73200 61.38083 61.57732 61.68395
#3   Africa Western and Central 57.87195 57.68950 57.41803 60.42777 60.35759 60.41184
#4                      Albania 66.16000 65.10000 64.57000 51.61000 52.63000 52.14000
#5                      Algeria 14.88000 14.66000 14.55000 26.46000 26.32000 26.13000
#6               American Samoa       NA       NA       NA       NA       NA       NA
#7                      Andorra       NA       NA       NA       NA       NA       NA
#8                       Angola 45.75000 46.57000 47.10000 34.15000 33.54000 33.64000
#9          Antigua and Barbuda       NA       NA       NA       NA       NA       NA
#10                  Arab World 46.76333 46.41668 44.36387 31.40304 31.06940 30.46142

同时创建新列和相减的代码是:

df_small[, paste0(1991:1993, "_diff")] <-
  df_small[, grep("_f", colnames(df_small))] - df_small[, grep("_m", colnames(df_small))]

如果你为女性(idx_f)和男性(idx_m)创建索引,可能会更容易看到发生了什么:

idx_f <- grep("_f", colnames(df_small))
idx_m <- grep("_m", colnames(df_small))

df_small[, paste0(1991:1993, "_diff")] <-
  df_small[, idx_f] - df_small[, idx_m]

无论哪种方式,输出都是:

#                  Country.Name  X1991_f  X1992_f  X1993_f  X1991_m  X1992_m  X1993_m  1991_diff  1992_diff  1993_diff
#1                  Afghanistan 76.38000 76.41000 76.76000 61.12000 61.40000 62.24000  15.259998  15.010002  14.520000
#2  Africa Eastern and Southern 70.89826 70.90647 70.73200 61.38083 61.57732 61.68395   9.517434   9.329154   9.048051
#3   Africa Western and Central 57.87195 57.68950 57.41803 60.42777 60.35759 60.41184  -2.555815  -2.668093  -2.993811
#4                      Albania 66.16000 65.10000 64.57000 51.61000 52.63000 52.14000  14.550003  12.469997  12.430000
#5                      Algeria 14.88000 14.66000 14.55000 26.46000 26.32000 26.13000 -11.579999 -11.660000 -11.579999
#6               American Samoa       NA       NA       NA       NA       NA       NA         NA         NA         NA
#7                      Andorra       NA       NA       NA       NA       NA       NA         NA         NA         NA
#8                       Angola 45.75000 46.57000 47.10000 34.15000 33.54000 33.64000  11.599998  13.029999  13.459999
#9          Antigua and Barbuda       NA       NA       NA       NA       NA       NA         NA         NA         NA
#10                  Arab World 46.76333 46.41668 44.36387 31.40304 31.06940 30.46142  15.360289  15.347272  13.902451

数据

# after using OP's dput
df_small <- df[,c(1, grep("1991|1992|1993", colnames(df)))]
vmjh9lq9

vmjh9lq92#

您的数据当前是宽格式,但如果它是长格式,此操作会容易得多。因此,您可以首先使用pivot_longer()dplyr重新调整 Dataframe ,然后进行计算。

require(tidyverse) 

d %>%
    select(!contains("Indicator.Name")) %>%
    pivot_longer(!c(Country.Name), names_sep = "_",
                 names_to = c("year", ".value")) %>%
    mutate(year = str_sub(year, 2, str_length(year)), 
           diff_per_year = f-m)

它将返回如下内容:

# A tibble: 290 × 5
   Country.Name year      f     m diff_per_year
   <chr>        <chr> <dbl> <dbl>         <dbl>
 1 Afghanistan  1991   76.4  61.1          15.3
 2 Afghanistan  1992   76.4  61.4          15.0
 3 Afghanistan  1993   76.8  62.2          14.5
 4 Afghanistan  1994   76.3  62.4          14.0
 5 Afghanistan  1995   76.7  62.2          14.5
 6 Afghanistan  1996   77.1  62.5          14.5
 7 Afghanistan  1997   77.4  62.9          14.6
 8 Afghanistan  1998   77.8  63.1          14.7
 9 Afghanistan  1999   78.2  63.3          14.9
10 Afghanistan  2000   78.8  63.7          15.1
# … with 280 more rows

请注意,这是从删除Indicator.Name列(您有两个)开始的,因为一旦我们重新塑造数据,此信息将分别吸收到fm列中。
此外,现在数据是长格式的,您可能还希望使用lag()lead()函数来查看跨年度差异(例如,在给定的国家内,农业中的男性/女性就业情况或他们之间的差异如何随时间变化)。这些文档是here

相关问题