我如何在r中合并这些不平衡的数据?

rdlzhqv9  于 2022-12-20  发布在  其他
关注(0)|答案(3)|浏览(171)

我有一个数据集,但它有点奇怪。所以我试图修复它的权利。

a_name     b_name         latitude      longitude        min.distance
           kangseo        37.1562        126.9762         73.950928
           kangbook       38.1255        126.1952         128.179185
           jongro         37.8226        127.1724         95.626161
           dongmoon       37.1161        127.1345         12.531519
           jamsil         36.1215        127.9722         175.154161
           nolboo         37.3213        127.3211         85.151616
           ilsan          37.1111        127.1331         16.115151
jongone                   38.1241        126.1215         95.626161
jongtwo                   37.1161        127.1345         12.531519
jongthree                 37.1562        126.9762         73.950928
jongfour                  37.1551        127.6262         17.124115
jongfive                  36.1515        127.9627         175.154161

这个数据被分成两部分,我想把它变成干净的数据。

a_name     b_name         latitude      longitude        min.distance
jongthree  kangseo        37.1562        126.9762         73.950928
           kangbook       38.1255        126.1952         128.179185
jongone    jongro         38.1241        126.1215         95.626161
jongtwo   dongmoon        37.1161        127.1345         12.531519
jongfive   jamsil         36.1515        127.9627         175.154161
           nolboo         37.3213        127.3211         85.151616
jongfour   ilsan          37.1515        127.6262         17.124115

我尝试了left_join和merge,但效果不佳。

left_join(A,B, by="min.distance")

我需要你的帮助。

icnyk63a

icnyk63a1#

您可以在此处使用full_join()

full_join(
  d %>% filter(is.na(b_name)) %>% select(-b_name),
  d %>% filter(is.na(a_name)) %>% select(b_name, min.distance),
) %>% relocate(b_name, .before=latitude)

获得相同结果的另一种方法是将filter()步骤合并到对split()的单个调用中

d=split(d,is.na(d$a_name))
full_join(select(d[[1]], -b_name), select(d[[2]], c(b_name, min.distance)))

输出:

a_name   b_name latitude longitude min.distance
1   jongone   jongro  38.1241  126.1215     95.62616
2   jongtwo dongmoon  37.1161  127.1345     12.53152
3 jongthree  kangseo  37.1562  126.9762     73.95093
4  jongfour     <NA>  37.1551  127.6262     17.12411
5  jongfive   jamsil  36.1515  127.9627    175.15416
6      <NA> kangbook       NA        NA    128.17918
7      <NA>   nolboo       NA        NA     85.15162
8      <NA>    ilsan       NA        NA     16.11515

输入:

d = structure(list(a_name = c(NA, NA, NA, NA, NA, NA, NA, "jongone", 
"jongtwo", "jongthree", "jongfour", "jongfive"), b_name = c("kangseo", 
"kangbook", "jongro", "dongmoon", "jamsil", "nolboo", "ilsan", 
NA, NA, NA, NA, NA), latitude = c(37.1562, 38.1255, 37.8226, 
37.1161, 36.1215, 37.3213, 37.1111, 38.1241, 37.1161, 37.1562, 
37.1551, 36.1515), longitude = c(126.9762, 126.1952, 127.1724, 
127.1345, 127.9722, 127.3211, 127.1331, 126.1215, 127.1345, 126.9762, 
127.6262, 127.9627), min.distance = c(73.950928, 128.179185, 
95.626161, 12.531519, 175.154161, 85.151616, 16.115151, 95.626161, 
12.531519, 73.950928, 17.124115, 175.154161)), class = "data.frame", row.names = c(NA, 
-12L))
b1uwtaje

b1uwtaje2#

我们可以按"min.distance"分组,并将NA元素修改为非NA值,从而获得distinct行,而无需进行任何连接

library(dplyr)
df1 %>% 
   group_by(min.distance) %>%
   mutate(across(c(a_name, b_name), ~ (.x[!is.na(na_if(.x, ""))][1]))) %>% 
   ungroup %>%
   distinct(min.distance, .keep_all = TRUE)
  • 输出
# A tibble: 8 × 5
  a_name    b_name   latitude longitude min.distance
  <chr>     <chr>       <dbl>     <dbl>        <dbl>
1 jongthree kangseo      37.2      127.         74.0
2 <NA>      kangbook     38.1      126.        128. 
3 jongone   jongro       37.8      127.         95.6
4 jongtwo   dongmoon     37.1      127.         12.5
5 jongfive  jamsil       36.1      128.        175. 
6 <NA>      nolboo       37.3      127.         85.2
7 <NA>      ilsan        37.1      127.         16.1
8 jongfour  <NA>         37.2      128.         17.1

数据

df1 <- structure(list(a_name = c(NA, NA, NA, NA, NA, NA, NA, "jongone", 
"jongtwo", "jongthree", "jongfour", "jongfive"), b_name = c("kangseo", 
"kangbook", "jongro", "dongmoon", "jamsil", "nolboo", "ilsan", 
NA, NA, NA, NA, NA), latitude = c(37.1562, 38.1255, 37.8226, 
37.1161, 36.1215, 37.3213, 37.1111, 38.1241, 37.1161, 37.1562, 
37.1551, 36.1515), longitude = c(126.9762, 126.1952, 127.1724, 
127.1345, 127.9722, 127.3211, 127.1331, 126.1215, 127.1345, 126.9762, 
127.6262, 127.9627), min.distance = c(73.950928, 128.179185, 
95.626161, 12.531519, 175.154161, 85.151616, 16.115151, 95.626161, 
12.531519, 73.950928, 17.124115, 175.154161)), class = "data.frame", 
row.names = c(NA, 
-12L))
xn1cxnb4

xn1cxnb43#

如果不想依赖于min.distance值的精确匹配,您可以尝试真正的空间匹配(并对距离差异进行一定容差的过滤),如下所示:

library(sf) ## library for spatial processing

根据a_nameis.na()状态,将df拆分为两个 Dataframe 的列表,并将列表成员命名为A和B:

df_list <- setNames(
  df1 |>  split(is.na(df1$a_name)),
  c('A', 'B')
)

一次性将两个 Dataframe 转换为空间 Dataframe (包含geometry column)(通过Map列表):

df_list <- 
  df_list |> 
  map( ~ st_as_sf(.x, coords = c('latitude', 'longitude'))
)

为A的每一行找到B的最近特征(行),并将该行列绑定到A:

df_list$A |>
  bind_cols(
    df_list$B |>
    slice(st_nearest_feature(df_list$A, df_list$B))
  ) |>
## select (only) and rename desired columns):
select('a_name' = 1, 'b_name' = 6,
       'a_min_dist' = 3, 'b_min_dist' = 7
       ) |>
mutate(mutual_distance = abs(a_min_dist - b_min_dist))

输出

##      a_name   b_name a_min_dist b_min_dist mutual_distance
## 1   jongone kangbook   95.62616  128.17918        32.55302
## 2   jongtwo dongmoon   12.53152   12.53152         0.00000
## 3 jongthree  kangseo   73.95093   73.95093         0.00000
## 4  jongfour   nolboo   17.12411   85.15162        68.02750
## 5  jongfive   jamsil  175.15416  175.15416         0.00000

现在您可以按可接受的mutual_distance进行过滤。

相关问题